diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,8931 +1,76164 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9996064541519087, + "epoch": 0.999953931911365, "eval_steps": 500, - "global_step": 1270, + "global_step": 10853, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0007870916961826052, - "grad_norm": 4.882866791973475, - "learning_rate": 7.8125e-08, - "loss": 0.357, + "epoch": 9.213617727000507e-05, + "grad_norm": 5.052331877744982, + "learning_rate": 9.208103130755065e-09, + "loss": 0.3834, "step": 1 }, { - "epoch": 0.0015741833923652105, - "grad_norm": 4.89981767485179, - "learning_rate": 1.5625e-07, - "loss": 0.3398, + "epoch": 0.00018427235454001014, + "grad_norm": 4.968514973300749, + "learning_rate": 1.841620626151013e-08, + "loss": 0.3996, "step": 2 }, { - "epoch": 0.0023612750885478157, - "grad_norm": 4.908915346462736, - "learning_rate": 2.3437500000000003e-07, - "loss": 0.3326, + "epoch": 0.0002764085318100152, + "grad_norm": 5.173905523123122, + "learning_rate": 2.7624309392265195e-08, + "loss": 0.4053, "step": 3 }, { - "epoch": 0.003148366784730421, - "grad_norm": 4.731649267914947, - "learning_rate": 3.125e-07, - "loss": 0.342, + "epoch": 0.0003685447090800203, + "grad_norm": 4.954247638334854, + "learning_rate": 3.683241252302026e-08, + "loss": 0.3794, "step": 4 }, { - "epoch": 0.003935458480913027, - "grad_norm": 4.769633474207938, - "learning_rate": 3.90625e-07, - "loss": 0.3431, + "epoch": 0.0004606808863500253, + "grad_norm": 4.9278156311152586, + "learning_rate": 4.604051565377533e-08, + "loss": 0.3735, "step": 5 }, { - "epoch": 0.004722550177095631, - "grad_norm": 4.754569879633701, - "learning_rate": 4.6875000000000006e-07, - "loss": 0.3369, + "epoch": 0.0005528170636200304, + "grad_norm": 4.865883765294612, + "learning_rate": 5.524861878453039e-08, + "loss": 0.3882, "step": 6 }, { - "epoch": 0.005509641873278237, - "grad_norm": 4.354074850343827, - "learning_rate": 5.468750000000001e-07, - "loss": 0.3416, + "epoch": 0.0006449532408900354, + "grad_norm": 4.8700873305276255, + "learning_rate": 6.445672191528546e-08, + "loss": 0.3758, "step": 7 }, { - "epoch": 0.006296733569460842, - "grad_norm": 4.51384309102365, - "learning_rate": 6.25e-07, - "loss": 0.3577, + "epoch": 0.0007370894181600406, + "grad_norm": 5.087648327313099, + "learning_rate": 7.366482504604052e-08, + "loss": 0.3758, "step": 8 }, { - "epoch": 0.0070838252656434475, - "grad_norm": 4.127868399899779, - "learning_rate": 7.03125e-07, - "loss": 0.3447, + "epoch": 0.0008292255954300456, + "grad_norm": 5.30641963090733, + "learning_rate": 8.287292817679558e-08, + "loss": 0.4019, "step": 9 }, { - "epoch": 0.007870916961826053, - "grad_norm": 4.268670776824985, - "learning_rate": 7.8125e-07, - "loss": 0.318, + "epoch": 0.0009213617727000506, + "grad_norm": 5.070246570827928, + "learning_rate": 9.208103130755066e-08, + "loss": 0.3827, "step": 10 }, { - "epoch": 0.008658008658008658, - "grad_norm": 3.7083716156028674, - "learning_rate": 8.59375e-07, - "loss": 0.3106, + "epoch": 0.0010134979499700557, + "grad_norm": 5.216031896054235, + "learning_rate": 1.0128913443830572e-07, + "loss": 0.3846, "step": 11 }, { - "epoch": 0.009445100354191263, - "grad_norm": 3.1545864445099263, - "learning_rate": 9.375000000000001e-07, - "loss": 0.3131, + "epoch": 0.0011056341272400608, + "grad_norm": 4.876861133902156, + "learning_rate": 1.1049723756906078e-07, + "loss": 0.3843, "step": 12 }, { - "epoch": 0.01023219205037387, - "grad_norm": 3.020045714405798, - "learning_rate": 1.0156250000000001e-06, - "loss": 0.3215, + "epoch": 0.001197770304510066, + "grad_norm": 5.023561914398882, + "learning_rate": 1.1970534069981586e-07, + "loss": 0.3511, "step": 13 }, { - "epoch": 0.011019283746556474, - "grad_norm": 2.748876831681126, - "learning_rate": 1.0937500000000001e-06, - "loss": 0.3072, + "epoch": 0.0012899064817800709, + "grad_norm": 5.145451738529512, + "learning_rate": 1.2891344383057092e-07, + "loss": 0.3672, "step": 14 }, { - "epoch": 0.011806375442739079, - "grad_norm": 2.2307366833759485, - "learning_rate": 1.1718750000000001e-06, - "loss": 0.2922, + "epoch": 0.001382042659050076, + "grad_norm": 4.792948201942286, + "learning_rate": 1.3812154696132598e-07, + "loss": 0.3796, "step": 15 }, { - "epoch": 0.012593467138921684, - "grad_norm": 2.219422516987874, - "learning_rate": 1.25e-06, - "loss": 0.2842, + "epoch": 0.0014741788363200811, + "grad_norm": 4.551919134485809, + "learning_rate": 1.4732965009208104e-07, + "loss": 0.3803, "step": 16 }, { - "epoch": 0.01338055883510429, - "grad_norm": 2.7073031779339973, - "learning_rate": 1.328125e-06, - "loss": 0.2674, + "epoch": 0.001566315013590086, + "grad_norm": 4.8263836273678145, + "learning_rate": 1.5653775322283613e-07, + "loss": 0.3716, "step": 17 }, { - "epoch": 0.014167650531286895, - "grad_norm": 2.873035911017537, - "learning_rate": 1.40625e-06, - "loss": 0.294, + "epoch": 0.0016584511908600912, + "grad_norm": 4.740504995965325, + "learning_rate": 1.6574585635359117e-07, + "loss": 0.3838, "step": 18 }, { - "epoch": 0.0149547422274695, - "grad_norm": 2.119880363778339, - "learning_rate": 1.484375e-06, - "loss": 0.2693, + "epoch": 0.0017505873681300963, + "grad_norm": 4.3895949096154805, + "learning_rate": 1.7495395948434625e-07, + "loss": 0.3516, "step": 19 }, { - "epoch": 0.015741833923652106, - "grad_norm": 1.7740660860958901, - "learning_rate": 1.5625e-06, - "loss": 0.2607, + "epoch": 0.0018427235454001013, + "grad_norm": 4.730549482054788, + "learning_rate": 1.8416206261510132e-07, + "loss": 0.3971, "step": 20 }, { - "epoch": 0.01652892561983471, - "grad_norm": 1.654838099179133, - "learning_rate": 1.640625e-06, - "loss": 0.2539, + "epoch": 0.0019348597226701064, + "grad_norm": 4.326518829493085, + "learning_rate": 1.9337016574585635e-07, + "loss": 0.3621, "step": 21 }, { - "epoch": 0.017316017316017316, - "grad_norm": 2.1067372096520884, - "learning_rate": 1.71875e-06, - "loss": 0.274, + "epoch": 0.0020269958999401113, + "grad_norm": 4.411579317357037, + "learning_rate": 2.0257826887661144e-07, + "loss": 0.3671, "step": 22 }, { - "epoch": 0.01810310901219992, - "grad_norm": 2.227492365997846, - "learning_rate": 1.796875e-06, - "loss": 0.2667, + "epoch": 0.0021191320772101165, + "grad_norm": 4.406425398983609, + "learning_rate": 2.1178637200736653e-07, + "loss": 0.3586, "step": 23 }, { - "epoch": 0.018890200708382526, - "grad_norm": 1.9818482942437547, - "learning_rate": 1.8750000000000003e-06, - "loss": 0.2507, + "epoch": 0.0022112682544801216, + "grad_norm": 4.250027158629804, + "learning_rate": 2.2099447513812156e-07, + "loss": 0.3351, "step": 24 }, { - "epoch": 0.01967729240456513, - "grad_norm": 1.9916019664977938, - "learning_rate": 1.953125e-06, - "loss": 0.2302, + "epoch": 0.0023034044317501268, + "grad_norm": 4.453036416890763, + "learning_rate": 2.3020257826887662e-07, + "loss": 0.3547, "step": 25 }, { - "epoch": 0.02046438410074774, - "grad_norm": 2.0987871479467533, - "learning_rate": 2.0312500000000002e-06, - "loss": 0.2563, + "epoch": 0.002395540609020132, + "grad_norm": 4.338068169465262, + "learning_rate": 2.394106813996317e-07, + "loss": 0.3643, "step": 26 }, { - "epoch": 0.021251475796930343, - "grad_norm": 1.7851505967742112, - "learning_rate": 2.109375e-06, - "loss": 0.2432, + "epoch": 0.002487676786290137, + "grad_norm": 4.081751082657096, + "learning_rate": 2.486187845303868e-07, + "loss": 0.3735, "step": 27 }, { - "epoch": 0.02203856749311295, - "grad_norm": 1.6067598902195293, - "learning_rate": 2.1875000000000002e-06, - "loss": 0.2466, + "epoch": 0.0025798129635601417, + "grad_norm": 3.759847993257449, + "learning_rate": 2.5782688766114184e-07, + "loss": 0.3538, "step": 28 }, { - "epoch": 0.022825659189295553, - "grad_norm": 1.436243142469347, - "learning_rate": 2.265625e-06, - "loss": 0.2486, + "epoch": 0.002671949140830147, + "grad_norm": 3.656922967111338, + "learning_rate": 2.670349907918969e-07, + "loss": 0.3466, "step": 29 }, { - "epoch": 0.023612750885478158, - "grad_norm": 1.631080710695958, - "learning_rate": 2.3437500000000002e-06, - "loss": 0.2692, + "epoch": 0.002764085318100152, + "grad_norm": 3.5621860682515782, + "learning_rate": 2.7624309392265196e-07, + "loss": 0.3678, "step": 30 }, { - "epoch": 0.024399842581660763, - "grad_norm": 1.42554302342302, - "learning_rate": 2.421875e-06, - "loss": 0.2374, + "epoch": 0.002856221495370157, + "grad_norm": 3.074394037601397, + "learning_rate": 2.85451197053407e-07, + "loss": 0.3472, "step": 31 }, { - "epoch": 0.025186934277843367, - "grad_norm": 1.479794666013743, - "learning_rate": 2.5e-06, - "loss": 0.238, + "epoch": 0.0029483576726401623, + "grad_norm": 3.0932831564716263, + "learning_rate": 2.946593001841621e-07, + "loss": 0.3326, "step": 32 }, { - "epoch": 0.025974025974025976, - "grad_norm": 1.3857185652178832, - "learning_rate": 2.5781250000000004e-06, - "loss": 0.2366, + "epoch": 0.0030404938499101674, + "grad_norm": 3.145618703123189, + "learning_rate": 3.0386740331491715e-07, + "loss": 0.3714, "step": 33 }, { - "epoch": 0.02676111767020858, - "grad_norm": 1.335993998237778, - "learning_rate": 2.65625e-06, - "loss": 0.2251, + "epoch": 0.003132630027180172, + "grad_norm": 2.847899608744511, + "learning_rate": 3.1307550644567226e-07, + "loss": 0.3392, "step": 34 }, { - "epoch": 0.027548209366391185, - "grad_norm": 1.5950255189913525, - "learning_rate": 2.7343750000000004e-06, - "loss": 0.2506, + "epoch": 0.0032247662044501773, + "grad_norm": 2.9243824195879493, + "learning_rate": 3.2228360957642727e-07, + "loss": 0.3501, "step": 35 }, { - "epoch": 0.02833530106257379, - "grad_norm": 1.3773024411686232, - "learning_rate": 2.8125e-06, - "loss": 0.2107, + "epoch": 0.0033169023817201824, + "grad_norm": 2.7832329637023085, + "learning_rate": 3.3149171270718233e-07, + "loss": 0.3466, "step": 36 }, { - "epoch": 0.029122392758756395, - "grad_norm": 1.391558709917223, - "learning_rate": 2.8906250000000004e-06, - "loss": 0.236, + "epoch": 0.0034090385589901876, + "grad_norm": 2.572516529700843, + "learning_rate": 3.4069981583793745e-07, + "loss": 0.3354, "step": 37 }, { - "epoch": 0.029909484454939, - "grad_norm": 1.4317153691023394, - "learning_rate": 2.96875e-06, - "loss": 0.2369, + "epoch": 0.0035011747362601927, + "grad_norm": 2.2478775351473783, + "learning_rate": 3.499079189686925e-07, + "loss": 0.3341, "step": 38 }, { - "epoch": 0.030696576151121605, - "grad_norm": 1.5982850208202695, - "learning_rate": 3.0468750000000004e-06, - "loss": 0.2135, + "epoch": 0.003593310913530198, + "grad_norm": 2.4125693100520214, + "learning_rate": 3.591160220994475e-07, + "loss": 0.3518, "step": 39 }, { - "epoch": 0.03148366784730421, - "grad_norm": 1.2463748947443163, - "learning_rate": 3.125e-06, - "loss": 0.2109, + "epoch": 0.0036854470908002025, + "grad_norm": 2.161167186955303, + "learning_rate": 3.6832412523020263e-07, + "loss": 0.2944, "step": 40 }, { - "epoch": 0.032270759543486814, - "grad_norm": 1.489195263138514, - "learning_rate": 3.2031250000000004e-06, - "loss": 0.2254, + "epoch": 0.0037775832680702077, + "grad_norm": 2.340176936611769, + "learning_rate": 3.775322283609577e-07, + "loss": 0.3516, "step": 41 }, { - "epoch": 0.03305785123966942, - "grad_norm": 1.1952984228039816, - "learning_rate": 3.28125e-06, - "loss": 0.2124, + "epoch": 0.003869719445340213, + "grad_norm": 2.0582341569455407, + "learning_rate": 3.867403314917127e-07, + "loss": 0.2974, "step": 42 }, { - "epoch": 0.033844942935852024, - "grad_norm": 1.3331698750786545, - "learning_rate": 3.3593750000000003e-06, - "loss": 0.2192, + "epoch": 0.003961855622610218, + "grad_norm": 2.1235508876322244, + "learning_rate": 3.959484346224678e-07, + "loss": 0.3482, "step": 43 }, { - "epoch": 0.03463203463203463, - "grad_norm": 1.3944936006633961, - "learning_rate": 3.4375e-06, - "loss": 0.2024, + "epoch": 0.004053991799880223, + "grad_norm": 1.810336528178075, + "learning_rate": 4.051565377532229e-07, + "loss": 0.2894, "step": 44 }, { - "epoch": 0.03541912632821724, - "grad_norm": 1.3992213437238004, - "learning_rate": 3.5156250000000003e-06, - "loss": 0.2274, + "epoch": 0.004146127977150228, + "grad_norm": 1.9229985402496037, + "learning_rate": 4.1436464088397794e-07, + "loss": 0.3062, "step": 45 }, { - "epoch": 0.03620621802439984, - "grad_norm": 1.3664016160053327, - "learning_rate": 3.59375e-06, - "loss": 0.2152, + "epoch": 0.004238264154420233, + "grad_norm": 1.8631489706081992, + "learning_rate": 4.2357274401473305e-07, + "loss": 0.318, "step": 46 }, { - "epoch": 0.03699330972058245, - "grad_norm": 1.4891884814728509, - "learning_rate": 3.6718750000000003e-06, - "loss": 0.2292, + "epoch": 0.0043304003316902385, + "grad_norm": 1.7237134286009292, + "learning_rate": 4.3278084714548806e-07, + "loss": 0.3012, "step": 47 }, { - "epoch": 0.03778040141676505, - "grad_norm": 1.3270512221194979, - "learning_rate": 3.7500000000000005e-06, - "loss": 0.1997, + "epoch": 0.004422536508960243, + "grad_norm": 1.849534050723691, + "learning_rate": 4.419889502762431e-07, + "loss": 0.3069, "step": 48 }, { - "epoch": 0.03856749311294766, - "grad_norm": 1.4002885202427642, - "learning_rate": 3.828125000000001e-06, - "loss": 0.2049, + "epoch": 0.004514672686230248, + "grad_norm": 1.7076022714184271, + "learning_rate": 4.5119705340699824e-07, + "loss": 0.3099, "step": 49 }, { - "epoch": 0.03935458480913026, - "grad_norm": 1.3129491965923514, - "learning_rate": 3.90625e-06, - "loss": 0.2108, + "epoch": 0.0046068088635002535, + "grad_norm": 1.8201092125421363, + "learning_rate": 4.6040515653775325e-07, + "loss": 0.3363, "step": 50 }, { - "epoch": 0.04014167650531287, - "grad_norm": 1.5308223134960726, - "learning_rate": 3.984375e-06, - "loss": 0.2185, + "epoch": 0.004698945040770258, + "grad_norm": 1.5793947312415075, + "learning_rate": 4.696132596685083e-07, + "loss": 0.2906, "step": 51 }, { - "epoch": 0.04092876820149548, - "grad_norm": 1.4637864330743848, - "learning_rate": 4.0625000000000005e-06, - "loss": 0.1959, + "epoch": 0.004791081218040264, + "grad_norm": 1.7063324415343504, + "learning_rate": 4.788213627992634e-07, + "loss": 0.3293, "step": 52 }, { - "epoch": 0.04171585989767808, - "grad_norm": 1.3727993193826824, - "learning_rate": 4.140625000000001e-06, - "loss": 0.2079, + "epoch": 0.0048832173953102685, + "grad_norm": 1.5334969451288736, + "learning_rate": 4.880294659300184e-07, + "loss": 0.2983, "step": 53 }, { - "epoch": 0.04250295159386069, - "grad_norm": 1.324719262777532, - "learning_rate": 4.21875e-06, - "loss": 0.1986, + "epoch": 0.004975353572580274, + "grad_norm": 1.5375570913421324, + "learning_rate": 4.972375690607735e-07, + "loss": 0.3062, "step": 54 }, { - "epoch": 0.04329004329004329, - "grad_norm": 1.5169702547195179, - "learning_rate": 4.296875e-06, - "loss": 0.2233, + "epoch": 0.005067489749850279, + "grad_norm": 1.531795714777244, + "learning_rate": 5.064456721915287e-07, + "loss": 0.2957, "step": 55 }, { - "epoch": 0.0440771349862259, - "grad_norm": 1.2762861570952524, - "learning_rate": 4.3750000000000005e-06, - "loss": 0.1887, + "epoch": 0.0051596259271202835, + "grad_norm": 1.6190246422343229, + "learning_rate": 5.156537753222837e-07, + "loss": 0.3123, "step": 56 }, { - "epoch": 0.0448642266824085, - "grad_norm": 1.3276694806352698, - "learning_rate": 4.453125000000001e-06, - "loss": 0.2039, + "epoch": 0.005251762104390289, + "grad_norm": 1.550801896644375, + "learning_rate": 5.248618784530387e-07, + "loss": 0.3136, "step": 57 }, { - "epoch": 0.045651318378591106, - "grad_norm": 1.3146250055299598, - "learning_rate": 4.53125e-06, - "loss": 0.2142, + "epoch": 0.005343898281660294, + "grad_norm": 1.4783938737129982, + "learning_rate": 5.340699815837938e-07, + "loss": 0.2735, "step": 58 }, { - "epoch": 0.046438410074773714, - "grad_norm": 1.333854988794211, - "learning_rate": 4.609375e-06, - "loss": 0.1999, + "epoch": 0.005436034458930299, + "grad_norm": 1.4621021369750078, + "learning_rate": 5.432780847145488e-07, + "loss": 0.2884, "step": 59 }, { - "epoch": 0.047225501770956316, - "grad_norm": 1.3581177171903593, - "learning_rate": 4.6875000000000004e-06, - "loss": 0.1957, + "epoch": 0.005528170636200304, + "grad_norm": 1.3921579835622648, + "learning_rate": 5.524861878453039e-07, + "loss": 0.2867, "step": 60 }, { - "epoch": 0.048012593467138924, - "grad_norm": 1.3130565355707189, - "learning_rate": 4.765625000000001e-06, - "loss": 0.2122, + "epoch": 0.005620306813470309, + "grad_norm": 1.5262968398070378, + "learning_rate": 5.61694290976059e-07, + "loss": 0.298, "step": 61 }, { - "epoch": 0.048799685163321525, - "grad_norm": 1.4142816132337854, - "learning_rate": 4.84375e-06, - "loss": 0.2147, + "epoch": 0.005712442990740314, + "grad_norm": 1.5915698039983843, + "learning_rate": 5.70902394106814e-07, + "loss": 0.3088, "step": 62 }, { - "epoch": 0.049586776859504134, - "grad_norm": 1.357898232243354, - "learning_rate": 4.921875e-06, - "loss": 0.1877, + "epoch": 0.005804579168010319, + "grad_norm": 1.5397162188270683, + "learning_rate": 5.80110497237569e-07, + "loss": 0.2788, "step": 63 }, { - "epoch": 0.050373868555686735, - "grad_norm": 1.4153342064426397, - "learning_rate": 5e-06, - "loss": 0.1945, + "epoch": 0.005896715345280325, + "grad_norm": 1.4822801901893587, + "learning_rate": 5.893186003683242e-07, + "loss": 0.3007, "step": 64 }, { - "epoch": 0.05116096025186934, - "grad_norm": 1.4495342715013748, - "learning_rate": 4.999991517675219e-06, - "loss": 0.1939, + "epoch": 0.005988851522550329, + "grad_norm": 1.4690507666454256, + "learning_rate": 5.985267034990793e-07, + "loss": 0.2784, "step": 65 }, { - "epoch": 0.05194805194805195, - "grad_norm": 1.1539274129121713, - "learning_rate": 4.999966070758437e-06, - "loss": 0.2003, + "epoch": 0.006080987699820335, + "grad_norm": 1.3696146492094494, + "learning_rate": 6.077348066298343e-07, + "loss": 0.2824, "step": 66 }, { - "epoch": 0.05273514364423455, - "grad_norm": 1.3379283904444008, - "learning_rate": 4.999923659422332e-06, - "loss": 0.2007, + "epoch": 0.00617312387709034, + "grad_norm": 1.4331132908765383, + "learning_rate": 6.169429097605894e-07, + "loss": 0.2809, "step": 67 }, { - "epoch": 0.05352223534041716, - "grad_norm": 1.3492954613335875, - "learning_rate": 4.999864283954702e-06, - "loss": 0.1989, + "epoch": 0.006265260054360344, + "grad_norm": 1.3225884273908008, + "learning_rate": 6.261510128913445e-07, + "loss": 0.2803, "step": 68 }, { - "epoch": 0.05430932703659976, - "grad_norm": 1.1801853129144864, - "learning_rate": 4.99978794475846e-06, - "loss": 0.2114, + "epoch": 0.00635739623163035, + "grad_norm": 1.3651533674561502, + "learning_rate": 6.353591160220995e-07, + "loss": 0.2722, "step": 69 }, { - "epoch": 0.05509641873278237, - "grad_norm": 1.2068999367428581, - "learning_rate": 4.999694642351633e-06, - "loss": 0.2033, + "epoch": 0.0064495324089003546, + "grad_norm": 1.3972206619657161, + "learning_rate": 6.445672191528545e-07, + "loss": 0.2856, "step": 70 }, { - "epoch": 0.05588351042896497, - "grad_norm": 1.2287271472480104, - "learning_rate": 4.999584377367359e-06, - "loss": 0.1895, + "epoch": 0.00654166858617036, + "grad_norm": 1.3786952654094349, + "learning_rate": 6.537753222836097e-07, + "loss": 0.2821, "step": 71 }, { - "epoch": 0.05667060212514758, - "grad_norm": 1.3129837217534652, - "learning_rate": 4.99945715055388e-06, - "loss": 0.1905, + "epoch": 0.006633804763440365, + "grad_norm": 1.3835265992223351, + "learning_rate": 6.629834254143647e-07, + "loss": 0.2859, "step": 72 }, { - "epoch": 0.05745769382133018, - "grad_norm": 1.1734967025843308, - "learning_rate": 4.99931296277454e-06, - "loss": 0.213, + "epoch": 0.0067259409407103695, + "grad_norm": 1.3167354078664357, + "learning_rate": 6.721915285451197e-07, + "loss": 0.2693, "step": 73 }, { - "epoch": 0.05824478551751279, - "grad_norm": 1.3738466570011791, - "learning_rate": 4.999151815007776e-06, - "loss": 0.2214, + "epoch": 0.006818077117980375, + "grad_norm": 1.2157572639965608, + "learning_rate": 6.813996316758749e-07, + "loss": 0.2678, "step": 74 }, { - "epoch": 0.0590318772136954, - "grad_norm": 1.273179655688277, - "learning_rate": 4.9989737083471165e-06, - "loss": 0.1894, + "epoch": 0.00691021329525038, + "grad_norm": 1.4145127549666732, + "learning_rate": 6.906077348066299e-07, + "loss": 0.2752, "step": 75 }, { - "epoch": 0.059818968909878, - "grad_norm": 1.0843431120214646, - "learning_rate": 4.998778644001165e-06, - "loss": 0.1967, + "epoch": 0.007002349472520385, + "grad_norm": 1.3643122772858198, + "learning_rate": 6.99815837937385e-07, + "loss": 0.2893, "step": 76 }, { - "epoch": 0.06060606060606061, - "grad_norm": 1.4896402431576707, - "learning_rate": 4.998566623293603e-06, - "loss": 0.1752, + "epoch": 0.00709448564979039, + "grad_norm": 1.292319390060687, + "learning_rate": 7.0902394106814e-07, + "loss": 0.26, "step": 77 }, { - "epoch": 0.06139315230224321, - "grad_norm": 1.3405458603738243, - "learning_rate": 4.9983376476631725e-06, - "loss": 0.1998, + "epoch": 0.007186621827060396, + "grad_norm": 1.4121119413990915, + "learning_rate": 7.18232044198895e-07, + "loss": 0.2838, "step": 78 }, { - "epoch": 0.06218024399842582, - "grad_norm": 1.3641086369593634, - "learning_rate": 4.998091718663671e-06, - "loss": 0.2047, + "epoch": 0.0072787580043304, + "grad_norm": 1.3573902039549837, + "learning_rate": 7.274401473296501e-07, + "loss": 0.2673, "step": 79 }, { - "epoch": 0.06296733569460843, - "grad_norm": 1.3391162585136267, - "learning_rate": 4.997828837963937e-06, - "loss": 0.181, + "epoch": 0.007370894181600405, + "grad_norm": 1.251616266192662, + "learning_rate": 7.366482504604053e-07, + "loss": 0.2795, "step": 80 }, { - "epoch": 0.06375442739079103, - "grad_norm": 1.1899411991269295, - "learning_rate": 4.997549007347843e-06, - "loss": 0.1946, + "epoch": 0.007463030358870411, + "grad_norm": 1.2676616962987843, + "learning_rate": 7.458563535911603e-07, + "loss": 0.274, "step": 81 }, { - "epoch": 0.06454151908697363, - "grad_norm": 1.3917818646896112, - "learning_rate": 4.997252228714279e-06, - "loss": 0.1919, + "epoch": 0.007555166536140415, + "grad_norm": 1.2779375532844437, + "learning_rate": 7.550644567219154e-07, + "loss": 0.237, "step": 82 }, { - "epoch": 0.06532861078315624, - "grad_norm": 1.2543099071691322, - "learning_rate": 4.996938504077145e-06, - "loss": 0.1948, + "epoch": 0.007647302713410421, + "grad_norm": 1.2823725432016053, + "learning_rate": 7.642725598526704e-07, + "loss": 0.2535, "step": 83 }, { - "epoch": 0.06611570247933884, - "grad_norm": 1.3941008619735185, - "learning_rate": 4.99660783556533e-06, - "loss": 0.1861, + "epoch": 0.007739438890680426, + "grad_norm": 1.2914112825622275, + "learning_rate": 7.734806629834254e-07, + "loss": 0.2579, "step": 84 }, { - "epoch": 0.06690279417552145, - "grad_norm": 1.1765528133487257, - "learning_rate": 4.9962602254227075e-06, - "loss": 0.1817, + "epoch": 0.00783157506795043, + "grad_norm": 1.255061266268493, + "learning_rate": 7.826887661141805e-07, + "loss": 0.2553, "step": 85 }, { - "epoch": 0.06768988587170405, - "grad_norm": 1.223066746932356, - "learning_rate": 4.995895676008109e-06, - "loss": 0.1934, + "epoch": 0.007923711245220436, + "grad_norm": 1.37482617750355, + "learning_rate": 7.918968692449356e-07, + "loss": 0.2552, "step": 86 }, { - "epoch": 0.06847697756788666, - "grad_norm": 1.3140944559909808, - "learning_rate": 4.995514189795316e-06, - "loss": 0.197, + "epoch": 0.008015847422490441, + "grad_norm": 1.1987336596664846, + "learning_rate": 8.011049723756907e-07, + "loss": 0.2544, "step": 87 }, { - "epoch": 0.06926406926406926, - "grad_norm": 1.1819977914205286, - "learning_rate": 4.99511576937304e-06, - "loss": 0.1972, + "epoch": 0.008107983599760445, + "grad_norm": 1.352509072072678, + "learning_rate": 8.103130755064458e-07, + "loss": 0.2697, "step": 88 }, { - "epoch": 0.07005116096025187, - "grad_norm": 1.3152579578345207, - "learning_rate": 4.994700417444907e-06, - "loss": 0.207, + "epoch": 0.008200119777030451, + "grad_norm": 1.2902205773865623, + "learning_rate": 8.195211786372008e-07, + "loss": 0.2477, "step": 89 }, { - "epoch": 0.07083825265643448, - "grad_norm": 1.2064669225701854, - "learning_rate": 4.994268136829438e-06, - "loss": 0.1953, + "epoch": 0.008292255954300456, + "grad_norm": 1.395341337522817, + "learning_rate": 8.287292817679559e-07, + "loss": 0.2627, "step": 90 }, { - "epoch": 0.07162534435261708, - "grad_norm": 1.1619755664518439, - "learning_rate": 4.993818930460026e-06, - "loss": 0.1982, + "epoch": 0.00838439213157046, + "grad_norm": 1.268160686732221, + "learning_rate": 8.379373848987109e-07, + "loss": 0.2851, "step": 91 }, { - "epoch": 0.07241243604879968, - "grad_norm": 1.1792837872493809, - "learning_rate": 4.993352801384924e-06, - "loss": 0.1886, + "epoch": 0.008476528308840466, + "grad_norm": 1.3489234822086935, + "learning_rate": 8.471454880294661e-07, + "loss": 0.2619, "step": 92 }, { - "epoch": 0.07319952774498228, - "grad_norm": 1.097328306217708, - "learning_rate": 4.992869752767218e-06, - "loss": 0.1673, + "epoch": 0.008568664486110471, + "grad_norm": 1.3220216831015386, + "learning_rate": 8.563535911602211e-07, + "loss": 0.2579, "step": 93 }, { - "epoch": 0.0739866194411649, - "grad_norm": 1.2788239338552108, - "learning_rate": 4.992369787884809e-06, - "loss": 0.1972, + "epoch": 0.008660800663380477, + "grad_norm": 1.1760530408428194, + "learning_rate": 8.655616942909761e-07, + "loss": 0.2461, "step": 94 }, { - "epoch": 0.0747737111373475, - "grad_norm": 1.1905278770669998, - "learning_rate": 4.991852910130388e-06, - "loss": 0.1872, + "epoch": 0.008752936840650481, + "grad_norm": 1.5393872750676103, + "learning_rate": 8.747697974217311e-07, + "loss": 0.2838, "step": 95 }, { - "epoch": 0.0755608028335301, - "grad_norm": 1.2133270115400816, - "learning_rate": 4.9913191230114154e-06, - "loss": 0.1748, + "epoch": 0.008845073017920486, + "grad_norm": 1.4358175929752721, + "learning_rate": 8.839779005524863e-07, + "loss": 0.255, "step": 96 }, { - "epoch": 0.07634789452971272, - "grad_norm": 1.2840440499091732, - "learning_rate": 4.990768430150096e-06, - "loss": 0.1942, + "epoch": 0.008937209195190492, + "grad_norm": 1.3687969807695604, + "learning_rate": 8.931860036832413e-07, + "loss": 0.2563, "step": 97 }, { - "epoch": 0.07713498622589532, - "grad_norm": 1.5346248945491554, - "learning_rate": 4.990200835283353e-06, - "loss": 0.1861, + "epoch": 0.009029345372460496, + "grad_norm": 1.3219226525379928, + "learning_rate": 9.023941068139965e-07, + "loss": 0.2482, "step": 98 }, { - "epoch": 0.07792207792207792, - "grad_norm": 1.1936205681426777, - "learning_rate": 4.989616342262807e-06, - "loss": 0.1975, + "epoch": 0.009121481549730501, + "grad_norm": 1.3519450540596627, + "learning_rate": 9.116022099447515e-07, + "loss": 0.2524, "step": 99 }, { - "epoch": 0.07870916961826052, - "grad_norm": 1.2662437794316659, - "learning_rate": 4.989014955054746e-06, - "loss": 0.1853, + "epoch": 0.009213617727000507, + "grad_norm": 1.2749396142008642, + "learning_rate": 9.208103130755065e-07, + "loss": 0.2371, "step": 100 }, { - "epoch": 0.07949626131444314, - "grad_norm": 1.116915072535967, - "learning_rate": 4.988396677740097e-06, - "loss": 0.1738, + "epoch": 0.009305753904270513, + "grad_norm": 1.1980272664855356, + "learning_rate": 9.300184162062616e-07, + "loss": 0.2508, "step": 101 }, { - "epoch": 0.08028335301062574, - "grad_norm": 1.1577366023558335, - "learning_rate": 4.9877615145144055e-06, - "loss": 0.2045, + "epoch": 0.009397890081540516, + "grad_norm": 1.3779116744427602, + "learning_rate": 9.392265193370166e-07, + "loss": 0.2719, "step": 102 }, { - "epoch": 0.08107044470680834, - "grad_norm": 1.1022178093801993, - "learning_rate": 4.9871094696878e-06, - "loss": 0.1814, + "epoch": 0.009490026258810522, + "grad_norm": 1.2481451142639794, + "learning_rate": 9.484346224677716e-07, + "loss": 0.2406, "step": 103 }, { - "epoch": 0.08185753640299095, - "grad_norm": 1.218327314143879, - "learning_rate": 4.986440547684963e-06, - "loss": 0.1822, + "epoch": 0.009582162436080528, + "grad_norm": 1.2456086574919798, + "learning_rate": 9.576427255985269e-07, + "loss": 0.2594, "step": 104 }, { - "epoch": 0.08264462809917356, - "grad_norm": 1.0747362510591434, - "learning_rate": 4.985754753045108e-06, - "loss": 0.1639, + "epoch": 0.009674298613350531, + "grad_norm": 1.3052822180290655, + "learning_rate": 9.66850828729282e-07, + "loss": 0.2578, "step": 105 }, { - "epoch": 0.08343171979535616, - "grad_norm": 1.236686976609853, - "learning_rate": 4.9850520904219406e-06, - "loss": 0.1773, + "epoch": 0.009766434790620537, + "grad_norm": 1.2798135957331098, + "learning_rate": 9.760589318600369e-07, + "loss": 0.2449, "step": 106 }, { - "epoch": 0.08421881149153876, - "grad_norm": 1.2843110878866029, - "learning_rate": 4.98433256458363e-06, - "loss": 0.1931, + "epoch": 0.009858570967890543, + "grad_norm": 1.2233602450508594, + "learning_rate": 9.85267034990792e-07, + "loss": 0.23, "step": 107 }, { - "epoch": 0.08500590318772137, - "grad_norm": 1.2556201190754803, - "learning_rate": 4.983596180412779e-06, - "loss": 0.1891, + "epoch": 0.009950707145160548, + "grad_norm": 1.27138699960983, + "learning_rate": 9.94475138121547e-07, + "loss": 0.2575, "step": 108 }, { - "epoch": 0.08579299488390398, - "grad_norm": 1.1736861180333642, - "learning_rate": 4.982842942906386e-06, - "loss": 0.1932, + "epoch": 0.010042843322430552, + "grad_norm": 1.4590705423131205, + "learning_rate": 1.003683241252302e-06, + "loss": 0.2414, "step": 109 }, { - "epoch": 0.08658008658008658, - "grad_norm": 1.250703274500956, - "learning_rate": 4.982072857175816e-06, - "loss": 0.1979, + "epoch": 0.010134979499700558, + "grad_norm": 1.3032600900132378, + "learning_rate": 1.0128913443830573e-06, + "loss": 0.2422, "step": 110 }, { - "epoch": 0.08736717827626919, - "grad_norm": 1.111910462348759, - "learning_rate": 4.981285928446762e-06, - "loss": 0.1729, + "epoch": 0.010227115676970563, + "grad_norm": 1.246423626921792, + "learning_rate": 1.0220994475138122e-06, + "loss": 0.2504, "step": 111 }, { - "epoch": 0.0881542699724518, - "grad_norm": 1.2267625409230847, - "learning_rate": 4.980482162059214e-06, - "loss": 0.1993, + "epoch": 0.010319251854240567, + "grad_norm": 1.3617978945476827, + "learning_rate": 1.0313075506445673e-06, + "loss": 0.2625, "step": 112 }, { - "epoch": 0.0889413616686344, - "grad_norm": 1.350342930816002, - "learning_rate": 4.979661563467415e-06, - "loss": 0.1914, + "epoch": 0.010411388031510573, + "grad_norm": 1.3138368592325604, + "learning_rate": 1.0405156537753222e-06, + "loss": 0.2717, "step": 113 }, { - "epoch": 0.089728453364817, - "grad_norm": 0.9837790085016399, - "learning_rate": 4.978824138239835e-06, - "loss": 0.1852, + "epoch": 0.010503524208780578, + "grad_norm": 1.45931937030065, + "learning_rate": 1.0497237569060774e-06, + "loss": 0.2338, "step": 114 }, { - "epoch": 0.09051554506099961, - "grad_norm": 1.1412715811918805, - "learning_rate": 4.977969892059123e-06, - "loss": 0.1791, + "epoch": 0.010595660386050582, + "grad_norm": 1.2209072353641341, + "learning_rate": 1.0589318600368325e-06, + "loss": 0.2571, "step": 115 }, { - "epoch": 0.09130263675718221, - "grad_norm": 1.091735318231847, - "learning_rate": 4.977098830722074e-06, - "loss": 0.1879, + "epoch": 0.010687796563320588, + "grad_norm": 1.2221332342582498, + "learning_rate": 1.0681399631675876e-06, + "loss": 0.2181, "step": 116 }, { - "epoch": 0.09208972845336481, - "grad_norm": 1.1356995797773966, - "learning_rate": 4.976210960139587e-06, - "loss": 0.1942, + "epoch": 0.010779932740590593, + "grad_norm": 1.3055782277521266, + "learning_rate": 1.0773480662983427e-06, + "loss": 0.2413, "step": 117 }, { - "epoch": 0.09287682014954743, - "grad_norm": 1.197221158258512, - "learning_rate": 4.975306286336628e-06, - "loss": 0.1822, + "epoch": 0.010872068917860599, + "grad_norm": 1.3001013433954538, + "learning_rate": 1.0865561694290976e-06, + "loss": 0.2561, "step": 118 }, { - "epoch": 0.09366391184573003, - "grad_norm": 1.1622435205009634, - "learning_rate": 4.974384815452187e-06, - "loss": 0.1938, + "epoch": 0.010964205095130602, + "grad_norm": 1.4430759553426427, + "learning_rate": 1.0957642725598527e-06, + "loss": 0.2393, "step": 119 }, { - "epoch": 0.09445100354191263, - "grad_norm": 1.1980574826361372, - "learning_rate": 4.9734465537392365e-06, - "loss": 0.1703, + "epoch": 0.011056341272400608, + "grad_norm": 1.4425457370059072, + "learning_rate": 1.1049723756906078e-06, + "loss": 0.2349, "step": 120 }, { - "epoch": 0.09523809523809523, - "grad_norm": 1.090793092501407, - "learning_rate": 4.972491507564688e-06, - "loss": 0.1681, + "epoch": 0.011148477449670614, + "grad_norm": 1.235681217544338, + "learning_rate": 1.114180478821363e-06, + "loss": 0.2315, "step": 121 }, { - "epoch": 0.09602518693427785, - "grad_norm": 1.2120296842604672, - "learning_rate": 4.9715196834093525e-06, - "loss": 0.1562, + "epoch": 0.011240613626940617, + "grad_norm": 1.291133894680049, + "learning_rate": 1.123388581952118e-06, + "loss": 0.2442, "step": 122 }, { - "epoch": 0.09681227863046045, - "grad_norm": 1.1420618659168036, - "learning_rate": 4.97053108786789e-06, - "loss": 0.1812, + "epoch": 0.011332749804210623, + "grad_norm": 1.2986607434244122, + "learning_rate": 1.132596685082873e-06, + "loss": 0.2427, "step": 123 }, { - "epoch": 0.09759937032664305, - "grad_norm": 1.145370838994205, - "learning_rate": 4.969525727648774e-06, - "loss": 0.1873, + "epoch": 0.011424885981480629, + "grad_norm": 1.3600935260637073, + "learning_rate": 1.141804788213628e-06, + "loss": 0.2507, "step": 124 }, { - "epoch": 0.09838646202282567, - "grad_norm": 1.1676600414602372, - "learning_rate": 4.9685036095742365e-06, - "loss": 0.1972, + "epoch": 0.011517022158750634, + "grad_norm": 1.2882709655715936, + "learning_rate": 1.1510128913443832e-06, + "loss": 0.2491, "step": 125 }, { - "epoch": 0.09917355371900827, - "grad_norm": 1.204479600477317, - "learning_rate": 4.967464740580228e-06, - "loss": 0.1904, + "epoch": 0.011609158336020638, + "grad_norm": 1.236349701513875, + "learning_rate": 1.160220994475138e-06, + "loss": 0.2238, "step": 126 }, { - "epoch": 0.09996064541519087, - "grad_norm": 1.119994012971968, - "learning_rate": 4.9664091277163664e-06, - "loss": 0.1851, + "epoch": 0.011701294513290644, + "grad_norm": 1.277175622784304, + "learning_rate": 1.1694290976058934e-06, + "loss": 0.2306, "step": 127 }, { - "epoch": 0.10074773711137347, - "grad_norm": 1.1043684264734095, - "learning_rate": 4.9653367781458946e-06, - "loss": 0.1926, + "epoch": 0.01179343069056065, + "grad_norm": 1.3466287077359933, + "learning_rate": 1.1786372007366483e-06, + "loss": 0.2529, "step": 128 }, { - "epoch": 0.10153482880755609, - "grad_norm": 1.127680976136701, - "learning_rate": 4.964247699145626e-06, - "loss": 0.1886, + "epoch": 0.011885566867830653, + "grad_norm": 1.2600725855409367, + "learning_rate": 1.1878453038674034e-06, + "loss": 0.2297, "step": 129 }, { - "epoch": 0.10232192050373869, - "grad_norm": 1.1813875803533898, - "learning_rate": 4.963141898105898e-06, - "loss": 0.1858, + "epoch": 0.011977703045100659, + "grad_norm": 1.1909522608327074, + "learning_rate": 1.1970534069981586e-06, + "loss": 0.2428, "step": 130 }, { - "epoch": 0.10310901219992129, - "grad_norm": 1.1497128287458092, - "learning_rate": 4.962019382530521e-06, - "loss": 0.1724, + "epoch": 0.012069839222370664, + "grad_norm": 1.3275342654407982, + "learning_rate": 1.2062615101289135e-06, + "loss": 0.2387, "step": 131 }, { - "epoch": 0.1038961038961039, - "grad_norm": 1.223498886081565, - "learning_rate": 4.960880160036728e-06, - "loss": 0.194, + "epoch": 0.01216197539964067, + "grad_norm": 1.3832794168368345, + "learning_rate": 1.2154696132596686e-06, + "loss": 0.2606, "step": 132 }, { - "epoch": 0.1046831955922865, - "grad_norm": 1.1861652899170938, - "learning_rate": 4.959724238355124e-06, - "loss": 0.1841, + "epoch": 0.012254111576910674, + "grad_norm": 1.4083734454299084, + "learning_rate": 1.2246777163904237e-06, + "loss": 0.2558, "step": 133 }, { - "epoch": 0.1054702872884691, - "grad_norm": 1.0805114288365025, - "learning_rate": 4.958551625329631e-06, - "loss": 0.1646, + "epoch": 0.01234624775418068, + "grad_norm": 1.3604330663851263, + "learning_rate": 1.2338858195211788e-06, + "loss": 0.2131, "step": 134 }, { - "epoch": 0.10625737898465171, - "grad_norm": 1.29808710670669, - "learning_rate": 4.957362328917437e-06, - "loss": 0.1833, + "epoch": 0.012438383931450685, + "grad_norm": 1.368946573958846, + "learning_rate": 1.243093922651934e-06, + "loss": 0.2607, "step": 135 }, { - "epoch": 0.10704447068083432, - "grad_norm": 1.2660501691777906, - "learning_rate": 4.95615635718894e-06, - "loss": 0.1753, + "epoch": 0.012530520108720689, + "grad_norm": 1.4349854840515686, + "learning_rate": 1.252302025782689e-06, + "loss": 0.2543, "step": 136 }, { - "epoch": 0.10783156237701692, - "grad_norm": 1.1429230314494303, - "learning_rate": 4.954933718327697e-06, - "loss": 0.1734, + "epoch": 0.012622656285990694, + "grad_norm": 1.3053177174437076, + "learning_rate": 1.261510128913444e-06, + "loss": 0.2066, "step": 137 }, { - "epoch": 0.10861865407319952, - "grad_norm": 1.114357361335831, - "learning_rate": 4.953694420630361e-06, - "loss": 0.1925, + "epoch": 0.0127147924632607, + "grad_norm": 1.4065693991109225, + "learning_rate": 1.270718232044199e-06, + "loss": 0.2428, "step": 138 }, { - "epoch": 0.10940574576938213, - "grad_norm": 1.1238119767186239, - "learning_rate": 4.952438472506636e-06, - "loss": 0.1805, + "epoch": 0.012806928640530704, + "grad_norm": 1.3060084203827886, + "learning_rate": 1.2799263351749542e-06, + "loss": 0.2452, "step": 139 }, { - "epoch": 0.11019283746556474, - "grad_norm": 1.1524735878912507, - "learning_rate": 4.951165882479206e-06, - "loss": 0.1783, + "epoch": 0.012899064817800709, + "grad_norm": 1.42770860862496, + "learning_rate": 1.289134438305709e-06, + "loss": 0.2375, "step": 140 }, { - "epoch": 0.11097992916174734, - "grad_norm": 1.0546198047284017, - "learning_rate": 4.949876659183693e-06, - "loss": 0.1745, + "epoch": 0.012991200995070715, + "grad_norm": 1.3712130826622553, + "learning_rate": 1.2983425414364642e-06, + "loss": 0.2296, "step": 141 }, { - "epoch": 0.11176702085792994, - "grad_norm": 1.0925714956018635, - "learning_rate": 4.94857081136858e-06, - "loss": 0.1763, + "epoch": 0.01308333717234072, + "grad_norm": 1.2949739115350103, + "learning_rate": 1.3075506445672193e-06, + "loss": 0.2249, "step": 142 }, { - "epoch": 0.11255411255411256, - "grad_norm": 1.1039385653204372, - "learning_rate": 4.947248347895172e-06, - "loss": 0.1777, + "epoch": 0.013175473349610724, + "grad_norm": 1.4444498310803144, + "learning_rate": 1.3167587476979742e-06, + "loss": 0.2336, "step": 143 }, { - "epoch": 0.11334120425029516, - "grad_norm": 1.145622347104172, - "learning_rate": 4.945909277737519e-06, - "loss": 0.1804, + "epoch": 0.01326760952688073, + "grad_norm": 1.327765157794959, + "learning_rate": 1.3259668508287293e-06, + "loss": 0.2305, "step": 144 }, { - "epoch": 0.11412829594647776, - "grad_norm": 1.0810330697861197, - "learning_rate": 4.944553609982363e-06, - "loss": 0.18, + "epoch": 0.013359745704150735, + "grad_norm": 1.49483024693552, + "learning_rate": 1.3351749539594844e-06, + "loss": 0.2524, "step": 145 }, { - "epoch": 0.11491538764266036, - "grad_norm": 1.079722871077113, - "learning_rate": 4.943181353829077e-06, - "loss": 0.1805, + "epoch": 0.013451881881420739, + "grad_norm": 1.4128065918962016, + "learning_rate": 1.3443830570902393e-06, + "loss": 0.2421, "step": 146 }, { - "epoch": 0.11570247933884298, - "grad_norm": 1.2122723148500483, - "learning_rate": 4.941792518589596e-06, - "loss": 0.2113, + "epoch": 0.013544018058690745, + "grad_norm": 1.458256896983337, + "learning_rate": 1.3535911602209945e-06, + "loss": 0.256, "step": 147 }, { - "epoch": 0.11648957103502558, - "grad_norm": 1.1619622709214918, - "learning_rate": 4.940387113688364e-06, - "loss": 0.1714, + "epoch": 0.01363615423596075, + "grad_norm": 1.5761688856396325, + "learning_rate": 1.3627992633517498e-06, + "loss": 0.2283, "step": 148 }, { - "epoch": 0.11727666273120818, - "grad_norm": 1.0508760593348456, - "learning_rate": 4.93896514866226e-06, - "loss": 0.1625, + "epoch": 0.013728290413230756, + "grad_norm": 1.4268159296492195, + "learning_rate": 1.372007366482505e-06, + "loss": 0.231, "step": 149 }, { - "epoch": 0.1180637544273908, - "grad_norm": 1.0710088382142664, - "learning_rate": 4.93752663316054e-06, - "loss": 0.1778, + "epoch": 0.01382042659050076, + "grad_norm": 1.240181839931121, + "learning_rate": 1.3812154696132598e-06, + "loss": 0.2265, "step": 150 }, { - "epoch": 0.1188508461235734, - "grad_norm": 1.0503531295721205, - "learning_rate": 4.936071576944769e-06, - "loss": 0.1726, + "epoch": 0.013912562767770765, + "grad_norm": 1.3560921208474808, + "learning_rate": 1.390423572744015e-06, + "loss": 0.2347, "step": 151 }, { - "epoch": 0.119637937819756, - "grad_norm": 1.0686610020146463, - "learning_rate": 4.934599989888753e-06, - "loss": 0.1769, + "epoch": 0.01400469894504077, + "grad_norm": 1.440218247026957, + "learning_rate": 1.39963167587477e-06, + "loss": 0.2265, "step": 152 }, { - "epoch": 0.1204250295159386, - "grad_norm": 1.072378297090023, - "learning_rate": 4.933111881978478e-06, - "loss": 0.1866, + "epoch": 0.014096835122310775, + "grad_norm": 1.3168656248813988, + "learning_rate": 1.408839779005525e-06, + "loss": 0.2361, "step": 153 }, { - "epoch": 0.12121212121212122, - "grad_norm": 1.2495883030259693, - "learning_rate": 4.931607263312033e-06, - "loss": 0.1998, + "epoch": 0.01418897129958078, + "grad_norm": 1.387358557045741, + "learning_rate": 1.41804788213628e-06, + "loss": 0.2538, "step": 154 }, { - "epoch": 0.12199921290830382, - "grad_norm": 1.109893027407933, - "learning_rate": 4.93008614409955e-06, - "loss": 0.1805, + "epoch": 0.014281107476850786, + "grad_norm": 1.4226212454591165, + "learning_rate": 1.4272559852670352e-06, + "loss": 0.2386, "step": 155 }, { - "epoch": 0.12278630460448642, - "grad_norm": 1.1570851370725408, - "learning_rate": 4.928548534663133e-06, - "loss": 0.1725, + "epoch": 0.014373243654120791, + "grad_norm": 1.4868929751549826, + "learning_rate": 1.43646408839779e-06, + "loss": 0.2503, "step": 156 }, { - "epoch": 0.12357339630066903, - "grad_norm": 1.1758781032456742, - "learning_rate": 4.9269944454367815e-06, - "loss": 0.176, + "epoch": 0.014465379831390795, + "grad_norm": 1.3156667636135637, + "learning_rate": 1.4456721915285452e-06, + "loss": 0.2439, "step": 157 }, { - "epoch": 0.12436048799685163, - "grad_norm": 1.1408455648753233, - "learning_rate": 4.925423886966328e-06, - "loss": 0.1848, + "epoch": 0.0145575160086608, + "grad_norm": 1.3284249384355258, + "learning_rate": 1.4548802946593003e-06, + "loss": 0.2325, "step": 158 }, { - "epoch": 0.12514757969303425, - "grad_norm": 1.1318514267380126, - "learning_rate": 4.923836869909363e-06, - "loss": 0.1764, + "epoch": 0.014649652185930806, + "grad_norm": 1.2878557366716903, + "learning_rate": 1.4640883977900552e-06, + "loss": 0.2056, "step": 159 }, { - "epoch": 0.12593467138921685, - "grad_norm": 1.1451300788977063, - "learning_rate": 4.9222334050351595e-06, - "loss": 0.1756, + "epoch": 0.01474178836320081, + "grad_norm": 1.4793729308344177, + "learning_rate": 1.4732965009208105e-06, + "loss": 0.2571, "step": 160 }, { - "epoch": 0.12672176308539945, - "grad_norm": 1.1117305593028235, - "learning_rate": 4.920613503224608e-06, - "loss": 0.1797, + "epoch": 0.014833924540470816, + "grad_norm": 1.2481891533067875, + "learning_rate": 1.4825046040515656e-06, + "loss": 0.2195, "step": 161 }, { - "epoch": 0.12750885478158205, - "grad_norm": 1.1301581138966732, - "learning_rate": 4.9189771754701335e-06, - "loss": 0.1675, + "epoch": 0.014926060717740821, + "grad_norm": 1.3944875094813025, + "learning_rate": 1.4917127071823205e-06, + "loss": 0.2245, "step": 162 }, { - "epoch": 0.12829594647776466, - "grad_norm": 1.0326917294149387, - "learning_rate": 4.917324432875627e-06, - "loss": 0.1784, + "epoch": 0.015018196895010825, + "grad_norm": 1.37854617862289, + "learning_rate": 1.5009208103130757e-06, + "loss": 0.2298, "step": 163 }, { - "epoch": 0.12908303817394726, - "grad_norm": 1.1983588521884831, - "learning_rate": 4.915655286656368e-06, - "loss": 0.1966, + "epoch": 0.01511033307228083, + "grad_norm": 1.3740996859347074, + "learning_rate": 1.5101289134438308e-06, + "loss": 0.2343, "step": 164 }, { - "epoch": 0.12987012987012986, - "grad_norm": 1.0140424703790007, - "learning_rate": 4.9139697481389505e-06, - "loss": 0.1744, + "epoch": 0.015202469249550836, + "grad_norm": 1.379655917316226, + "learning_rate": 1.5193370165745857e-06, + "loss": 0.2271, "step": 165 }, { - "epoch": 0.1306572215663125, - "grad_norm": 1.223539092779737, - "learning_rate": 4.9122678287612e-06, - "loss": 0.1831, + "epoch": 0.015294605426820842, + "grad_norm": 1.2845573670743051, + "learning_rate": 1.5285451197053408e-06, + "loss": 0.221, "step": 166 }, { - "epoch": 0.1314443132624951, - "grad_norm": 1.0918972348910556, - "learning_rate": 4.910549540072104e-06, - "loss": 0.1843, + "epoch": 0.015386741604090846, + "grad_norm": 1.3382949270875386, + "learning_rate": 1.537753222836096e-06, + "loss": 0.2053, "step": 167 }, { - "epoch": 0.1322314049586777, - "grad_norm": 1.1292739304249166, - "learning_rate": 4.908814893731728e-06, - "loss": 0.1552, + "epoch": 0.015478877781360851, + "grad_norm": 1.2241039135765772, + "learning_rate": 1.5469613259668508e-06, + "loss": 0.2051, "step": 168 }, { - "epoch": 0.1330184966548603, - "grad_norm": 1.1923518362383727, - "learning_rate": 4.9070639015111406e-06, - "loss": 0.1895, + "epoch": 0.015571013958630857, + "grad_norm": 1.353071391505974, + "learning_rate": 1.556169429097606e-06, + "loss": 0.2238, "step": 169 }, { - "epoch": 0.1338055883510429, - "grad_norm": 1.083542335588892, - "learning_rate": 4.905296575292329e-06, - "loss": 0.1745, + "epoch": 0.01566315013590086, + "grad_norm": 1.3108612555966297, + "learning_rate": 1.565377532228361e-06, + "loss": 0.2422, "step": 170 }, { - "epoch": 0.1345926800472255, - "grad_norm": 1.2673623015109376, - "learning_rate": 4.90351292706812e-06, - "loss": 0.1726, + "epoch": 0.015755286313170868, + "grad_norm": 1.3798597771479884, + "learning_rate": 1.574585635359116e-06, + "loss": 0.211, "step": 171 }, { - "epoch": 0.1353797717434081, - "grad_norm": 1.1129476624507257, - "learning_rate": 4.901712968942101e-06, - "loss": 0.1706, + "epoch": 0.015847422490440872, + "grad_norm": 1.2463211759017325, + "learning_rate": 1.5837937384898713e-06, + "loss": 0.2124, "step": 172 }, { - "epoch": 0.13616686343959072, - "grad_norm": 1.1735922432656085, - "learning_rate": 4.899896713128536e-06, - "loss": 0.1741, + "epoch": 0.015939558667710876, + "grad_norm": 1.2337293449366062, + "learning_rate": 1.5930018416206264e-06, + "loss": 0.2183, "step": 173 }, { - "epoch": 0.13695395513577333, - "grad_norm": 1.2331570034422519, - "learning_rate": 4.898064171952281e-06, - "loss": 0.1946, + "epoch": 0.016031694844980883, + "grad_norm": 1.2103763277878807, + "learning_rate": 1.6022099447513815e-06, + "loss": 0.2236, "step": 174 }, { - "epoch": 0.13774104683195593, - "grad_norm": 1.2376618802061816, - "learning_rate": 4.896215357848706e-06, - "loss": 0.1715, + "epoch": 0.016123831022250887, + "grad_norm": 1.220903675064504, + "learning_rate": 1.6114180478821364e-06, + "loss": 0.2141, "step": 175 }, { - "epoch": 0.13852813852813853, - "grad_norm": 1.0860947256302276, - "learning_rate": 4.894350283363603e-06, - "loss": 0.1664, + "epoch": 0.01621596719952089, + "grad_norm": 1.358619080502357, + "learning_rate": 1.6206261510128915e-06, + "loss": 0.2423, "step": 176 }, { - "epoch": 0.13931523022432113, - "grad_norm": 1.1284792933006988, - "learning_rate": 4.892468961153105e-06, - "loss": 0.1721, + "epoch": 0.016308103376790898, + "grad_norm": 1.2782364766180747, + "learning_rate": 1.6298342541436466e-06, + "loss": 0.2346, "step": 177 }, { - "epoch": 0.14010232192050373, - "grad_norm": 1.1811695933066144, - "learning_rate": 4.8905714039836026e-06, - "loss": 0.1768, + "epoch": 0.016400239554060902, + "grad_norm": 1.3105220268621274, + "learning_rate": 1.6390423572744015e-06, + "loss": 0.2506, "step": 178 }, { - "epoch": 0.14088941361668633, - "grad_norm": 1.1690172197627666, - "learning_rate": 4.888657624731652e-06, - "loss": 0.1784, + "epoch": 0.016492375731330906, + "grad_norm": 1.2782540222227745, + "learning_rate": 1.6482504604051566e-06, + "loss": 0.2216, "step": 179 }, { - "epoch": 0.14167650531286896, - "grad_norm": 1.2215187765329307, - "learning_rate": 4.88672763638389e-06, - "loss": 0.1762, + "epoch": 0.016584511908600913, + "grad_norm": 1.5337856840982391, + "learning_rate": 1.6574585635359118e-06, + "loss": 0.2348, "step": 180 }, { - "epoch": 0.14246359700905156, - "grad_norm": 1.1657625904368065, - "learning_rate": 4.884781452036948e-06, - "loss": 0.1754, + "epoch": 0.016676648085870917, + "grad_norm": 1.2855938394022077, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.2321, "step": 181 }, { - "epoch": 0.14325068870523416, - "grad_norm": 1.0812019740421663, - "learning_rate": 4.88281908489736e-06, - "loss": 0.1745, + "epoch": 0.01676878426314092, + "grad_norm": 1.3688482992570172, + "learning_rate": 1.6758747697974218e-06, + "loss": 0.2443, "step": 182 }, { - "epoch": 0.14403778040141677, - "grad_norm": 1.1662972444477193, - "learning_rate": 4.880840548281475e-06, - "loss": 0.1844, + "epoch": 0.016860920440410928, + "grad_norm": 1.3343184731235973, + "learning_rate": 1.685082872928177e-06, + "loss": 0.2121, "step": 183 }, { - "epoch": 0.14482487209759937, - "grad_norm": 1.1318261660435303, - "learning_rate": 4.878845855615364e-06, - "loss": 0.177, + "epoch": 0.016953056617680932, + "grad_norm": 1.225401208028096, + "learning_rate": 1.6942909760589322e-06, + "loss": 0.2287, "step": 184 }, { - "epoch": 0.14561196379378197, - "grad_norm": 1.0454173174935852, - "learning_rate": 4.876835020434733e-06, - "loss": 0.1726, + "epoch": 0.01704519279495094, + "grad_norm": 1.2179622098203036, + "learning_rate": 1.7034990791896871e-06, + "loss": 0.2049, "step": 185 }, { - "epoch": 0.14639905548996457, - "grad_norm": 1.1649728572384528, - "learning_rate": 4.874808056384826e-06, - "loss": 0.1829, + "epoch": 0.017137328972220943, + "grad_norm": 1.5066030755860567, + "learning_rate": 1.7127071823204422e-06, + "loss": 0.2424, "step": 186 }, { - "epoch": 0.1471861471861472, - "grad_norm": 0.9809711097737751, - "learning_rate": 4.8727649772203375e-06, - "loss": 0.1626, + "epoch": 0.017229465149490947, + "grad_norm": 1.4045090484290212, + "learning_rate": 1.7219152854511971e-06, + "loss": 0.2227, "step": 187 }, { - "epoch": 0.1479732388823298, - "grad_norm": 1.0024677570018588, - "learning_rate": 4.8707057968053175e-06, - "loss": 0.1564, + "epoch": 0.017321601326760954, + "grad_norm": 1.272435969600215, + "learning_rate": 1.7311233885819523e-06, + "loss": 0.2431, "step": 188 }, { - "epoch": 0.1487603305785124, - "grad_norm": 1.0801740218719516, - "learning_rate": 4.868630529113075e-06, - "loss": 0.1571, + "epoch": 0.017413737504030958, + "grad_norm": 1.3028523579116038, + "learning_rate": 1.7403314917127074e-06, + "loss": 0.2179, "step": 189 }, { - "epoch": 0.149547422274695, - "grad_norm": 1.0633734918657578, - "learning_rate": 4.866539188226086e-06, - "loss": 0.1558, + "epoch": 0.017505873681300962, + "grad_norm": 1.515833129596805, + "learning_rate": 1.7495395948434623e-06, + "loss": 0.2518, "step": 190 }, { - "epoch": 0.1503345139708776, - "grad_norm": 1.1110942685300096, - "learning_rate": 4.864431788335895e-06, - "loss": 0.1739, + "epoch": 0.01759800985857097, + "grad_norm": 1.3640712213334758, + "learning_rate": 1.7587476979742174e-06, + "loss": 0.2519, "step": 191 }, { - "epoch": 0.1511216056670602, - "grad_norm": 1.088865739839623, - "learning_rate": 4.862308343743024e-06, - "loss": 0.1705, + "epoch": 0.017690146035840973, + "grad_norm": 1.2963567960878155, + "learning_rate": 1.7679558011049725e-06, + "loss": 0.2122, "step": 192 }, { - "epoch": 0.1519086973632428, - "grad_norm": 1.158763785538179, - "learning_rate": 4.86016886885687e-06, - "loss": 0.1754, + "epoch": 0.017782282213110977, + "grad_norm": 1.2385890307787466, + "learning_rate": 1.7771639042357274e-06, + "loss": 0.218, "step": 193 }, { - "epoch": 0.15269578905942544, - "grad_norm": 1.0665033787081621, - "learning_rate": 4.858013378195609e-06, - "loss": 0.1814, + "epoch": 0.017874418390380984, + "grad_norm": 1.2918958910678935, + "learning_rate": 1.7863720073664825e-06, + "loss": 0.2275, "step": 194 }, { - "epoch": 0.15348288075560804, - "grad_norm": 1.0347506383595513, - "learning_rate": 4.855841886386099e-06, - "loss": 0.1659, + "epoch": 0.017966554567650988, + "grad_norm": 1.3240547033002077, + "learning_rate": 1.7955801104972378e-06, + "loss": 0.2265, "step": 195 }, { - "epoch": 0.15426997245179064, - "grad_norm": 1.3652087096932124, - "learning_rate": 4.8536544081637785e-06, - "loss": 0.1693, + "epoch": 0.01805869074492099, + "grad_norm": 1.3215736623947212, + "learning_rate": 1.804788213627993e-06, + "loss": 0.2106, "step": 196 }, { - "epoch": 0.15505706414797324, - "grad_norm": 1.1571980267809596, - "learning_rate": 4.8514509583725685e-06, - "loss": 0.1735, + "epoch": 0.018150826922191, + "grad_norm": 1.3962463357518629, + "learning_rate": 1.8139963167587479e-06, + "loss": 0.2421, "step": 197 }, { - "epoch": 0.15584415584415584, - "grad_norm": 1.1126611625924816, - "learning_rate": 4.849231551964771e-06, - "loss": 0.1878, + "epoch": 0.018242963099461003, + "grad_norm": 1.3634363454930103, + "learning_rate": 1.823204419889503e-06, + "loss": 0.2258, "step": 198 }, { - "epoch": 0.15663124754033844, - "grad_norm": 1.0666827506948415, - "learning_rate": 4.846996204000967e-06, - "loss": 0.1686, + "epoch": 0.018335099276731007, + "grad_norm": 1.3838622302412065, + "learning_rate": 1.832412523020258e-06, + "loss": 0.2086, "step": 199 }, { - "epoch": 0.15741833923652104, - "grad_norm": 1.1408187983192677, - "learning_rate": 4.844744929649912e-06, - "loss": 0.1785, + "epoch": 0.018427235454001014, + "grad_norm": 1.3181256821025102, + "learning_rate": 1.841620626151013e-06, + "loss": 0.2129, "step": 200 }, { - "epoch": 0.15820543093270367, - "grad_norm": 1.1050850982745672, - "learning_rate": 4.842477744188441e-06, - "loss": 0.1663, + "epoch": 0.018519371631271018, + "grad_norm": 1.270539722225883, + "learning_rate": 1.8508287292817681e-06, + "loss": 0.2191, "step": 201 }, { - "epoch": 0.15899252262888627, - "grad_norm": 1.0153624350350885, - "learning_rate": 4.840194663001354e-06, - "loss": 0.1755, + "epoch": 0.018611507808541025, + "grad_norm": 1.265711181176557, + "learning_rate": 1.8600368324125232e-06, + "loss": 0.2061, "step": 202 }, { - "epoch": 0.15977961432506887, - "grad_norm": 1.0251264155888737, - "learning_rate": 4.837895701581322e-06, - "loss": 0.1537, + "epoch": 0.01870364398581103, + "grad_norm": 1.4039473787664178, + "learning_rate": 1.8692449355432781e-06, + "loss": 0.2277, "step": 203 }, { - "epoch": 0.16056670602125148, - "grad_norm": 1.0673153393456505, - "learning_rate": 4.835580875528776e-06, - "loss": 0.1633, + "epoch": 0.018795780163081033, + "grad_norm": 1.345966851950806, + "learning_rate": 1.8784530386740332e-06, + "loss": 0.2065, "step": 204 }, { - "epoch": 0.16135379771743408, - "grad_norm": 1.0273828987011315, - "learning_rate": 4.833250200551798e-06, - "loss": 0.1746, + "epoch": 0.01888791634035104, + "grad_norm": 1.3892265247643658, + "learning_rate": 1.8876611418047884e-06, + "loss": 0.2117, "step": 205 }, { - "epoch": 0.16214088941361668, - "grad_norm": 1.0964068866663357, - "learning_rate": 4.830903692466023e-06, - "loss": 0.1674, + "epoch": 0.018980052517621044, + "grad_norm": 1.3391019958709516, + "learning_rate": 1.8968692449355433e-06, + "loss": 0.2241, "step": 206 }, { - "epoch": 0.16292798110979928, - "grad_norm": 1.1142080493277295, - "learning_rate": 4.828541367194527e-06, - "loss": 0.1828, + "epoch": 0.019072188694891048, + "grad_norm": 1.3767301542758652, + "learning_rate": 1.9060773480662986e-06, + "loss": 0.2394, "step": 207 }, { - "epoch": 0.1637150728059819, - "grad_norm": 1.0617790409690397, - "learning_rate": 4.826163240767717e-06, - "loss": 0.1676, + "epoch": 0.019164324872161055, + "grad_norm": 1.193499504261302, + "learning_rate": 1.9152854511970537e-06, + "loss": 0.2147, "step": 208 }, { - "epoch": 0.1645021645021645, - "grad_norm": 1.2859855971245049, - "learning_rate": 4.8237693293232256e-06, - "loss": 0.1942, + "epoch": 0.01925646104943106, + "grad_norm": 1.42744498061299, + "learning_rate": 1.9244935543278086e-06, + "loss": 0.2454, "step": 209 }, { - "epoch": 0.1652892561983471, - "grad_norm": 1.000840540957111, - "learning_rate": 4.821359649105801e-06, - "loss": 0.1686, + "epoch": 0.019348597226701063, + "grad_norm": 1.2070717468524428, + "learning_rate": 1.933701657458564e-06, + "loss": 0.2281, "step": 210 }, { - "epoch": 0.1660763478945297, - "grad_norm": 1.049595380158752, - "learning_rate": 4.818934216467195e-06, - "loss": 0.1696, + "epoch": 0.01944073340397107, + "grad_norm": 1.2184409700694656, + "learning_rate": 1.942909760589319e-06, + "loss": 0.2011, "step": 211 }, { - "epoch": 0.16686343959071231, - "grad_norm": 1.0218031530162965, - "learning_rate": 4.816493047866053e-06, - "loss": 0.1653, + "epoch": 0.019532869581241074, + "grad_norm": 1.244082773508379, + "learning_rate": 1.9521178637200737e-06, + "loss": 0.2198, "step": 212 }, { - "epoch": 0.16765053128689492, - "grad_norm": 1.0715206508098112, - "learning_rate": 4.8140361598678034e-06, - "loss": 0.1735, + "epoch": 0.019625005758511078, + "grad_norm": 1.1946783073071228, + "learning_rate": 1.961325966850829e-06, + "loss": 0.2174, "step": 213 }, { - "epoch": 0.16843762298307752, - "grad_norm": 1.093161202120212, - "learning_rate": 4.811563569144544e-06, - "loss": 0.1698, + "epoch": 0.019717141935781085, + "grad_norm": 1.2601297485847678, + "learning_rate": 1.970534069981584e-06, + "loss": 0.2225, "step": 214 }, { - "epoch": 0.16922471467926015, - "grad_norm": 1.078958887147992, - "learning_rate": 4.809075292474929e-06, - "loss": 0.1671, + "epoch": 0.01980927811305109, + "grad_norm": 1.3124765793917974, + "learning_rate": 1.979742173112339e-06, + "loss": 0.2267, "step": 215 }, { - "epoch": 0.17001180637544275, - "grad_norm": 1.1213364259804648, - "learning_rate": 4.806571346744053e-06, - "loss": 0.1798, + "epoch": 0.019901414290321096, + "grad_norm": 1.3267678443080182, + "learning_rate": 1.988950276243094e-06, + "loss": 0.2297, "step": 216 }, { - "epoch": 0.17079889807162535, - "grad_norm": 1.102076724202232, - "learning_rate": 4.804051748943343e-06, - "loss": 0.1845, + "epoch": 0.0199935504675911, + "grad_norm": 1.267293008421713, + "learning_rate": 1.998158379373849e-06, + "loss": 0.2181, "step": 217 }, { - "epoch": 0.17158598976780795, - "grad_norm": 1.1103430873095865, - "learning_rate": 4.801516516170437e-06, - "loss": 0.177, + "epoch": 0.020085686644861104, + "grad_norm": 1.2619101408630657, + "learning_rate": 2.007366482504604e-06, + "loss": 0.2397, "step": 218 }, { - "epoch": 0.17237308146399055, - "grad_norm": 1.228711789290585, - "learning_rate": 4.798965665629068e-06, - "loss": 0.1636, + "epoch": 0.02017782282213111, + "grad_norm": 1.3636244878125987, + "learning_rate": 2.0165745856353593e-06, + "loss": 0.2253, "step": 219 }, { - "epoch": 0.17316017316017315, - "grad_norm": 1.1219855198900837, - "learning_rate": 4.796399214628949e-06, - "loss": 0.1802, + "epoch": 0.020269958999401115, + "grad_norm": 1.394830925894432, + "learning_rate": 2.0257826887661147e-06, + "loss": 0.2252, "step": 220 }, { - "epoch": 0.17394726485635575, - "grad_norm": 1.1846418832749555, - "learning_rate": 4.7938171805856596e-06, - "loss": 0.1717, + "epoch": 0.02036209517667112, + "grad_norm": 1.2983165359381221, + "learning_rate": 2.0349907918968696e-06, + "loss": 0.2278, "step": 221 }, { - "epoch": 0.17473435655253838, - "grad_norm": 1.0672386815907553, - "learning_rate": 4.791219581020518e-06, - "loss": 0.1663, + "epoch": 0.020454231353941126, + "grad_norm": 1.2967437740330148, + "learning_rate": 2.0441988950276245e-06, + "loss": 0.2124, "step": 222 }, { - "epoch": 0.17552144824872098, - "grad_norm": 1.0398388591323704, - "learning_rate": 4.788606433560473e-06, - "loss": 0.1593, + "epoch": 0.02054636753121113, + "grad_norm": 1.4482194246277718, + "learning_rate": 2.0534069981583794e-06, + "loss": 0.2216, "step": 223 }, { - "epoch": 0.1763085399449036, - "grad_norm": 1.1402534682960337, - "learning_rate": 4.785977755937977e-06, - "loss": 0.1876, + "epoch": 0.020638503708481134, + "grad_norm": 1.310894495587751, + "learning_rate": 2.0626151012891347e-06, + "loss": 0.222, "step": 224 }, { - "epoch": 0.1770956316410862, - "grad_norm": 1.1260603683997887, - "learning_rate": 4.783333565990865e-06, - "loss": 0.172, + "epoch": 0.02073063988575114, + "grad_norm": 1.2475533975236348, + "learning_rate": 2.0718232044198896e-06, + "loss": 0.2043, "step": 225 }, { - "epoch": 0.1778827233372688, - "grad_norm": 1.062290554096683, - "learning_rate": 4.780673881662242e-06, - "loss": 0.1709, + "epoch": 0.020822776063021145, + "grad_norm": 1.4060174527930498, + "learning_rate": 2.0810313075506445e-06, + "loss": 0.222, "step": 226 }, { - "epoch": 0.1786698150334514, - "grad_norm": 1.0650729387286197, - "learning_rate": 4.777998721000353e-06, - "loss": 0.1614, + "epoch": 0.02091491224029115, + "grad_norm": 1.4368485294275846, + "learning_rate": 2.0902394106814e-06, + "loss": 0.2425, "step": 227 }, { - "epoch": 0.179456906729634, - "grad_norm": 1.0365419204779498, - "learning_rate": 4.775308102158461e-06, - "loss": 0.1605, + "epoch": 0.021007048417561156, + "grad_norm": 1.259305482075362, + "learning_rate": 2.0994475138121547e-06, + "loss": 0.223, "step": 228 }, { - "epoch": 0.18024399842581662, - "grad_norm": 1.1444494636007958, - "learning_rate": 4.772602043394731e-06, - "loss": 0.1867, + "epoch": 0.02109918459483116, + "grad_norm": 1.433635435091614, + "learning_rate": 2.1086556169429096e-06, + "loss": 0.2223, "step": 229 }, { - "epoch": 0.18103109012199922, - "grad_norm": 1.1053808430839196, - "learning_rate": 4.769880563072097e-06, - "loss": 0.1627, + "epoch": 0.021191320772101164, + "grad_norm": 1.3258788470822962, + "learning_rate": 2.117863720073665e-06, + "loss": 0.223, "step": 230 }, { - "epoch": 0.18181818181818182, - "grad_norm": 1.0763207393373317, - "learning_rate": 4.767143679658143e-06, - "loss": 0.1703, + "epoch": 0.02128345694937117, + "grad_norm": 1.3345971348097236, + "learning_rate": 2.1270718232044203e-06, + "loss": 0.2088, "step": 231 }, { - "epoch": 0.18260527351436442, - "grad_norm": 1.1302336936081483, - "learning_rate": 4.764391411724977e-06, - "loss": 0.1697, + "epoch": 0.021375593126641175, + "grad_norm": 1.1506446317260917, + "learning_rate": 2.136279926335175e-06, + "loss": 0.2183, "step": 232 }, { - "epoch": 0.18339236521054703, - "grad_norm": 1.059980991296742, - "learning_rate": 4.7616237779491026e-06, - "loss": 0.1658, + "epoch": 0.021467729303911182, + "grad_norm": 1.2501482254633949, + "learning_rate": 2.14548802946593e-06, + "loss": 0.2218, "step": 233 }, { - "epoch": 0.18417945690672963, - "grad_norm": 1.0952807461742509, - "learning_rate": 4.758840797111295e-06, - "loss": 0.1833, + "epoch": 0.021559865481181186, + "grad_norm": 1.2715617957043448, + "learning_rate": 2.1546961325966854e-06, + "loss": 0.2167, "step": 234 }, { - "epoch": 0.18496654860291223, - "grad_norm": 1.0263555674269131, - "learning_rate": 4.756042488096472e-06, - "loss": 0.1732, + "epoch": 0.02165200165845119, + "grad_norm": 1.4251050947489576, + "learning_rate": 2.1639042357274403e-06, + "loss": 0.2378, "step": 235 }, { - "epoch": 0.18575364029909486, - "grad_norm": 1.088261327233659, - "learning_rate": 4.753228869893566e-06, - "loss": 0.1646, + "epoch": 0.021744137835721197, + "grad_norm": 1.2400115125049491, + "learning_rate": 2.1731123388581952e-06, + "loss": 0.2294, "step": 236 }, { - "epoch": 0.18654073199527746, - "grad_norm": 1.0644325115229099, - "learning_rate": 4.750399961595395e-06, - "loss": 0.1576, + "epoch": 0.0218362740129912, + "grad_norm": 1.3035788835712026, + "learning_rate": 2.1823204419889505e-06, + "loss": 0.1968, "step": 237 }, { - "epoch": 0.18732782369146006, - "grad_norm": 0.9952967090049917, - "learning_rate": 4.747555782398537e-06, - "loss": 0.1598, + "epoch": 0.021928410190261205, + "grad_norm": 1.471849667145631, + "learning_rate": 2.1915285451197054e-06, + "loss": 0.2228, "step": 238 }, { - "epoch": 0.18811491538764266, - "grad_norm": 1.0300249714418026, - "learning_rate": 4.7446963516031904e-06, - "loss": 0.1883, + "epoch": 0.022020546367531212, + "grad_norm": 1.3647414373400866, + "learning_rate": 2.2007366482504604e-06, + "loss": 0.224, "step": 239 }, { - "epoch": 0.18890200708382526, - "grad_norm": 1.0275382678304879, - "learning_rate": 4.741821688613054e-06, - "loss": 0.1704, + "epoch": 0.022112682544801216, + "grad_norm": 1.537121143452077, + "learning_rate": 2.2099447513812157e-06, + "loss": 0.2334, "step": 240 }, { - "epoch": 0.18968909878000786, - "grad_norm": 1.0616733952682182, - "learning_rate": 4.738931812935186e-06, - "loss": 0.1907, + "epoch": 0.02220481872207122, + "grad_norm": 1.2899612307328876, + "learning_rate": 2.2191528545119706e-06, + "loss": 0.207, "step": 241 }, { - "epoch": 0.19047619047619047, - "grad_norm": 1.0103628221724312, - "learning_rate": 4.736026744179878e-06, - "loss": 0.1556, + "epoch": 0.022296954899341227, + "grad_norm": 1.2217522545050996, + "learning_rate": 2.228360957642726e-06, + "loss": 0.2236, "step": 242 }, { - "epoch": 0.1912632821723731, - "grad_norm": 1.0535669337117792, - "learning_rate": 4.73310650206052e-06, - "loss": 0.1809, + "epoch": 0.02238909107661123, + "grad_norm": 1.395775888810296, + "learning_rate": 2.237569060773481e-06, + "loss": 0.2244, "step": 243 }, { - "epoch": 0.1920503738685557, - "grad_norm": 1.0554553563643476, - "learning_rate": 4.730171106393466e-06, - "loss": 0.1675, + "epoch": 0.022481227253881235, + "grad_norm": 1.284804900306348, + "learning_rate": 2.246777163904236e-06, + "loss": 0.2098, "step": 244 }, { - "epoch": 0.1928374655647383, - "grad_norm": 0.9417424551436594, - "learning_rate": 4.7272205770979e-06, - "loss": 0.1438, + "epoch": 0.022573363431151242, + "grad_norm": 1.304346808920648, + "learning_rate": 2.255985267034991e-06, + "loss": 0.2038, "step": 245 }, { - "epoch": 0.1936245572609209, - "grad_norm": 1.1154888244817747, - "learning_rate": 4.724254934195698e-06, - "loss": 0.1765, + "epoch": 0.022665499608421246, + "grad_norm": 1.3086929964965677, + "learning_rate": 2.265193370165746e-06, + "loss": 0.221, "step": 246 }, { - "epoch": 0.1944116489571035, - "grad_norm": 1.1742188581521773, - "learning_rate": 4.721274197811298e-06, - "loss": 0.1711, + "epoch": 0.022757635785691253, + "grad_norm": 1.2371068797697236, + "learning_rate": 2.2744014732965013e-06, + "loss": 0.2059, "step": 247 }, { - "epoch": 0.1951987406532861, - "grad_norm": 1.057640390538921, - "learning_rate": 4.71827838817156e-06, - "loss": 0.1678, + "epoch": 0.022849771962961257, + "grad_norm": 1.237860938668767, + "learning_rate": 2.283609576427256e-06, + "loss": 0.2116, "step": 248 }, { - "epoch": 0.1959858323494687, - "grad_norm": 1.022336905613029, - "learning_rate": 4.715267525605627e-06, - "loss": 0.1552, + "epoch": 0.02294190814023126, + "grad_norm": 1.1873106876741861, + "learning_rate": 2.292817679558011e-06, + "loss": 0.2044, "step": 249 }, { - "epoch": 0.19677292404565133, - "grad_norm": 1.181830506383501, - "learning_rate": 4.712241630544792e-06, - "loss": 0.1765, + "epoch": 0.02303404431750127, + "grad_norm": 1.284075741757394, + "learning_rate": 2.3020257826887664e-06, + "loss": 0.2265, "step": 250 }, { - "epoch": 0.19756001574183393, - "grad_norm": 1.1571296526874602, - "learning_rate": 4.709200723522353e-06, - "loss": 0.1758, + "epoch": 0.023126180494771272, + "grad_norm": 1.2554034425448573, + "learning_rate": 2.3112338858195213e-06, + "loss": 0.2098, "step": 251 }, { - "epoch": 0.19834710743801653, - "grad_norm": 1.082056647389628, - "learning_rate": 4.706144825173481e-06, - "loss": 0.1638, + "epoch": 0.023218316672041276, + "grad_norm": 1.3561997983957859, + "learning_rate": 2.320441988950276e-06, + "loss": 0.2274, "step": 252 }, { - "epoch": 0.19913419913419914, - "grad_norm": 1.0648327864294944, - "learning_rate": 4.703073956235071e-06, - "loss": 0.1747, + "epoch": 0.023310452849311283, + "grad_norm": 1.289899655742179, + "learning_rate": 2.3296500920810315e-06, + "loss": 0.203, "step": 253 }, { - "epoch": 0.19992129083038174, - "grad_norm": 1.1273460773870558, - "learning_rate": 4.6999881375456116e-06, - "loss": 0.1767, + "epoch": 0.023402589026581287, + "grad_norm": 1.207952118169262, + "learning_rate": 2.338858195211787e-06, + "loss": 0.2124, "step": 254 }, { - "epoch": 0.20070838252656434, - "grad_norm": 1.0782376126285664, - "learning_rate": 4.696887390045035e-06, - "loss": 0.169, + "epoch": 0.02349472520385129, + "grad_norm": 1.3805095413072321, + "learning_rate": 2.3480662983425418e-06, + "loss": 0.2284, "step": 255 }, { - "epoch": 0.20149547422274694, - "grad_norm": 1.043398805036875, - "learning_rate": 4.693771734774578e-06, - "loss": 0.1774, + "epoch": 0.0235868613811213, + "grad_norm": 1.3972049227450618, + "learning_rate": 2.3572744014732967e-06, + "loss": 0.2212, "step": 256 }, { - "epoch": 0.20228256591892957, - "grad_norm": 1.067320862475683, - "learning_rate": 4.690641192876643e-06, - "loss": 0.1607, + "epoch": 0.023678997558391302, + "grad_norm": 1.2247515177867434, + "learning_rate": 2.366482504604052e-06, + "loss": 0.2158, "step": 257 }, { - "epoch": 0.20306965761511217, - "grad_norm": 1.1843944163744937, - "learning_rate": 4.687495785594646e-06, - "loss": 0.1633, + "epoch": 0.023771133735661306, + "grad_norm": 1.1692729997781546, + "learning_rate": 2.375690607734807e-06, + "loss": 0.2223, "step": 258 }, { - "epoch": 0.20385674931129477, - "grad_norm": 1.0931562611646284, - "learning_rate": 4.684335534272881e-06, - "loss": 0.1687, + "epoch": 0.023863269912931313, + "grad_norm": 1.4293157160410055, + "learning_rate": 2.384898710865562e-06, + "loss": 0.2355, "step": 259 }, { - "epoch": 0.20464384100747737, - "grad_norm": 1.1204870400497637, - "learning_rate": 4.68116046035637e-06, - "loss": 0.1639, + "epoch": 0.023955406090201317, + "grad_norm": 1.2833231867557153, + "learning_rate": 2.394106813996317e-06, + "loss": 0.2093, "step": 260 }, { - "epoch": 0.20543093270365997, - "grad_norm": 1.2082791443480092, - "learning_rate": 4.6779705853907205e-06, - "loss": 0.1683, + "epoch": 0.02404754226747132, + "grad_norm": 1.430588872964235, + "learning_rate": 2.403314917127072e-06, + "loss": 0.2299, "step": 261 }, { - "epoch": 0.20621802439984258, - "grad_norm": 1.0646518318192153, - "learning_rate": 4.674765931021976e-06, - "loss": 0.1611, + "epoch": 0.02413967844474133, + "grad_norm": 1.3955869018367655, + "learning_rate": 2.412523020257827e-06, + "loss": 0.2235, "step": 262 }, { - "epoch": 0.20700511609602518, - "grad_norm": 1.1268791395123645, - "learning_rate": 4.671546518996473e-06, - "loss": 0.1553, + "epoch": 0.024231814622011332, + "grad_norm": 1.3209105842207622, + "learning_rate": 2.4217311233885823e-06, + "loss": 0.2314, "step": 263 }, { - "epoch": 0.2077922077922078, - "grad_norm": 1.0048534045343525, - "learning_rate": 4.668312371160688e-06, - "loss": 0.1571, + "epoch": 0.02432395079928134, + "grad_norm": 1.3675314084283223, + "learning_rate": 2.430939226519337e-06, + "loss": 0.192, "step": 264 }, { - "epoch": 0.2085792994883904, - "grad_norm": 1.0052893495164037, - "learning_rate": 4.665063509461098e-06, - "loss": 0.1679, + "epoch": 0.024416086976551343, + "grad_norm": 1.3043781570646351, + "learning_rate": 2.440147329650092e-06, + "loss": 0.2218, "step": 265 }, { - "epoch": 0.209366391184573, - "grad_norm": 0.9679422598052939, - "learning_rate": 4.661799955944019e-06, - "loss": 0.1556, + "epoch": 0.024508223153821347, + "grad_norm": 1.2993635785678224, + "learning_rate": 2.4493554327808474e-06, + "loss": 0.2003, "step": 266 }, { - "epoch": 0.2101534828807556, - "grad_norm": 1.0487292157874373, - "learning_rate": 4.658521732755471e-06, - "loss": 0.183, + "epoch": 0.024600359331091354, + "grad_norm": 1.2707418652729778, + "learning_rate": 2.4585635359116027e-06, + "loss": 0.2036, "step": 267 }, { - "epoch": 0.2109405745769382, - "grad_norm": 1.0878511570789495, - "learning_rate": 4.655228862141017e-06, - "loss": 0.1762, + "epoch": 0.02469249550836136, + "grad_norm": 1.2834662882271706, + "learning_rate": 2.4677716390423576e-06, + "loss": 0.2168, "step": 268 }, { - "epoch": 0.2117276662731208, - "grad_norm": 0.9275216638767947, - "learning_rate": 4.651921366445613e-06, - "loss": 0.1483, + "epoch": 0.024784631685631362, + "grad_norm": 1.3433418131656627, + "learning_rate": 2.4769797421731125e-06, + "loss": 0.2164, "step": 269 }, { - "epoch": 0.21251475796930341, - "grad_norm": 1.0291173856009612, - "learning_rate": 4.648599268113464e-06, - "loss": 0.1657, + "epoch": 0.02487676786290137, + "grad_norm": 1.3675989386071368, + "learning_rate": 2.486187845303868e-06, + "loss": 0.2136, "step": 270 }, { - "epoch": 0.21330184966548604, - "grad_norm": 0.9814951923963836, - "learning_rate": 4.645262589687861e-06, - "loss": 0.1737, + "epoch": 0.024968904040171373, + "grad_norm": 1.2819757220021681, + "learning_rate": 2.4953959484346228e-06, + "loss": 0.2217, "step": 271 }, { - "epoch": 0.21408894136166864, - "grad_norm": 0.9574503772544043, - "learning_rate": 4.641911353811038e-06, - "loss": 0.1638, + "epoch": 0.025061040217441377, + "grad_norm": 1.4596357277210503, + "learning_rate": 2.504604051565378e-06, + "loss": 0.2018, "step": 272 }, { - "epoch": 0.21487603305785125, - "grad_norm": 0.9684496500051328, - "learning_rate": 4.638545583224011e-06, - "loss": 0.1649, + "epoch": 0.025153176394711384, + "grad_norm": 1.2597067670175457, + "learning_rate": 2.513812154696133e-06, + "loss": 0.2315, "step": 273 }, { - "epoch": 0.21566312475403385, - "grad_norm": 1.0314787067828541, - "learning_rate": 4.635165300766428e-06, - "loss": 0.1699, + "epoch": 0.025245312571981388, + "grad_norm": 1.2971255477113983, + "learning_rate": 2.523020257826888e-06, + "loss": 0.2087, "step": 274 }, { - "epoch": 0.21645021645021645, - "grad_norm": 1.0287264097080684, - "learning_rate": 4.63177052937641e-06, - "loss": 0.1602, + "epoch": 0.025337448749251392, + "grad_norm": 1.332947894390514, + "learning_rate": 2.5322283609576432e-06, + "loss": 0.2127, "step": 275 }, { - "epoch": 0.21723730814639905, - "grad_norm": 1.1114659065296888, - "learning_rate": 4.628361292090403e-06, - "loss": 0.1783, + "epoch": 0.0254295849265214, + "grad_norm": 1.1260373924980331, + "learning_rate": 2.541436464088398e-06, + "loss": 0.1973, "step": 276 }, { - "epoch": 0.21802439984258165, - "grad_norm": 1.0298788844790752, - "learning_rate": 4.6249376120430115e-06, - "loss": 0.1678, + "epoch": 0.025521721103791403, + "grad_norm": 1.2445947528995862, + "learning_rate": 2.550644567219153e-06, + "loss": 0.2099, "step": 277 }, { - "epoch": 0.21881149153876425, - "grad_norm": 1.0099420287081406, - "learning_rate": 4.621499512466847e-06, - "loss": 0.1672, + "epoch": 0.025613857281061407, + "grad_norm": 1.3694045461593618, + "learning_rate": 2.5598526703499083e-06, + "loss": 0.2243, "step": 278 }, { - "epoch": 0.21959858323494688, - "grad_norm": 0.9892117727941296, - "learning_rate": 4.618047016692374e-06, - "loss": 0.1663, + "epoch": 0.025705993458331414, + "grad_norm": 1.297962915474662, + "learning_rate": 2.5690607734806632e-06, + "loss": 0.2204, "step": 279 }, { - "epoch": 0.22038567493112948, - "grad_norm": 0.9289360238552057, - "learning_rate": 4.614580148147744e-06, - "loss": 0.1563, + "epoch": 0.025798129635601418, + "grad_norm": 1.2909158781778913, + "learning_rate": 2.578268876611418e-06, + "loss": 0.2211, "step": 280 }, { - "epoch": 0.22117276662731208, - "grad_norm": 0.9603340451855991, - "learning_rate": 4.61109893035864e-06, - "loss": 0.1561, + "epoch": 0.025890265812871426, + "grad_norm": 1.320578975065506, + "learning_rate": 2.5874769797421735e-06, + "loss": 0.1965, "step": 281 }, { - "epoch": 0.22195985832349469, - "grad_norm": 1.0449269347565262, - "learning_rate": 4.607603386948119e-06, - "loss": 0.165, + "epoch": 0.02598240199014143, + "grad_norm": 1.2210472413434825, + "learning_rate": 2.5966850828729284e-06, + "loss": 0.2134, "step": 282 }, { - "epoch": 0.2227469500196773, - "grad_norm": 0.990226128298578, - "learning_rate": 4.604093541636448e-06, - "loss": 0.1704, + "epoch": 0.026074538167411433, + "grad_norm": 1.1578898719596564, + "learning_rate": 2.6058931860036833e-06, + "loss": 0.2039, "step": 283 }, { - "epoch": 0.2235340417158599, - "grad_norm": 1.031797952555019, - "learning_rate": 4.600569418240946e-06, - "loss": 0.1677, + "epoch": 0.02616667434468144, + "grad_norm": 1.3398605179539043, + "learning_rate": 2.6151012891344386e-06, + "loss": 0.2091, "step": 284 }, { - "epoch": 0.2243211334120425, - "grad_norm": 1.0506428763431659, - "learning_rate": 4.597031040675819e-06, - "loss": 0.1802, + "epoch": 0.026258810521951444, + "grad_norm": 1.2764074537081127, + "learning_rate": 2.6243093922651935e-06, + "loss": 0.2127, "step": 285 }, { - "epoch": 0.22510822510822512, - "grad_norm": 0.980146123693525, - "learning_rate": 4.593478432952002e-06, - "loss": 0.1656, + "epoch": 0.026350946699221448, + "grad_norm": 1.201092481830308, + "learning_rate": 2.6335174953959484e-06, + "loss": 0.2134, "step": 286 }, { - "epoch": 0.22589531680440772, - "grad_norm": 1.0058178922055618, - "learning_rate": 4.589911619176993e-06, - "loss": 0.1601, + "epoch": 0.026443082876491456, + "grad_norm": 1.3808995093878993, + "learning_rate": 2.6427255985267037e-06, + "loss": 0.2229, "step": 287 }, { - "epoch": 0.22668240850059032, - "grad_norm": 1.1532752501338874, - "learning_rate": 4.586330623554691e-06, - "loss": 0.1707, + "epoch": 0.02653521905376146, + "grad_norm": 1.3001384527969946, + "learning_rate": 2.6519337016574586e-06, + "loss": 0.2292, "step": 288 }, { - "epoch": 0.22746950019677292, - "grad_norm": 0.9925104519486038, - "learning_rate": 4.582735470385229e-06, - "loss": 0.1712, + "epoch": 0.026627355231031463, + "grad_norm": 1.3053774770933373, + "learning_rate": 2.6611418047882135e-06, + "loss": 0.22, "step": 289 }, { - "epoch": 0.22825659189295552, - "grad_norm": 1.1312813134045174, - "learning_rate": 4.579126184064814e-06, - "loss": 0.1607, + "epoch": 0.02671949140830147, + "grad_norm": 1.449134250644644, + "learning_rate": 2.670349907918969e-06, + "loss": 0.2194, "step": 290 }, { - "epoch": 0.22904368358913813, - "grad_norm": 1.2454875330122912, - "learning_rate": 4.575502789085555e-06, - "loss": 0.1656, + "epoch": 0.026811627585571474, + "grad_norm": 1.437368533930005, + "learning_rate": 2.6795580110497238e-06, + "loss": 0.2239, "step": 291 }, { - "epoch": 0.22983077528532073, - "grad_norm": 0.9825183210915687, - "learning_rate": 4.571865310035304e-06, - "loss": 0.1589, + "epoch": 0.026903763762841478, + "grad_norm": 1.2446885130520247, + "learning_rate": 2.6887661141804787e-06, + "loss": 0.2141, "step": 292 }, { - "epoch": 0.23061786698150336, - "grad_norm": 1.0887371255437703, - "learning_rate": 4.568213771597484e-06, - "loss": 0.1585, + "epoch": 0.026995899940111485, + "grad_norm": 1.7040675956383482, + "learning_rate": 2.697974217311234e-06, + "loss": 0.2224, "step": 293 }, { - "epoch": 0.23140495867768596, - "grad_norm": 1.0975434488519114, - "learning_rate": 4.564548198550922e-06, - "loss": 0.1435, + "epoch": 0.02708803611738149, + "grad_norm": 1.332041322961633, + "learning_rate": 2.707182320441989e-06, + "loss": 0.2214, "step": 294 }, { - "epoch": 0.23219205037386856, - "grad_norm": 1.0593259383463134, - "learning_rate": 4.5608686157696844e-06, - "loss": 0.167, + "epoch": 0.027180172294651497, + "grad_norm": 1.2590166578500492, + "learning_rate": 2.716390423572744e-06, + "loss": 0.2179, "step": 295 }, { - "epoch": 0.23297914207005116, - "grad_norm": 1.1536948102561841, - "learning_rate": 4.557175048222901e-06, - "loss": 0.1621, + "epoch": 0.0272723084719215, + "grad_norm": 1.4226861813160818, + "learning_rate": 2.7255985267034996e-06, + "loss": 0.2196, "step": 296 }, { - "epoch": 0.23376623376623376, - "grad_norm": 1.1369019291567328, - "learning_rate": 4.5534675209746076e-06, - "loss": 0.1654, + "epoch": 0.027364444649191504, + "grad_norm": 1.3934346019180117, + "learning_rate": 2.7348066298342545e-06, + "loss": 0.2201, "step": 297 }, { - "epoch": 0.23455332546241636, - "grad_norm": 0.9585590140764199, - "learning_rate": 4.5497460591835615e-06, - "loss": 0.148, + "epoch": 0.02745658082646151, + "grad_norm": 1.2888343424352768, + "learning_rate": 2.74401473296501e-06, + "loss": 0.2109, "step": 298 }, { - "epoch": 0.23534041715859896, - "grad_norm": 1.2337420030262027, - "learning_rate": 4.546010688103082e-06, - "loss": 0.1599, + "epoch": 0.027548717003731515, + "grad_norm": 1.4048452398008997, + "learning_rate": 2.7532228360957647e-06, + "loss": 0.22, "step": 299 }, { - "epoch": 0.2361275088547816, - "grad_norm": 1.1641848426244756, - "learning_rate": 4.542261433080874e-06, - "loss": 0.1641, + "epoch": 0.02764085318100152, + "grad_norm": 1.2759644069246936, + "learning_rate": 2.7624309392265196e-06, + "loss": 0.2002, "step": 300 }, { - "epoch": 0.2369146005509642, - "grad_norm": 0.9715264597638171, - "learning_rate": 4.538498319558854e-06, - "loss": 0.1604, + "epoch": 0.027732989358271527, + "grad_norm": 1.257513125532842, + "learning_rate": 2.771639042357275e-06, + "loss": 0.2016, "step": 301 }, { - "epoch": 0.2377016922471468, - "grad_norm": 1.2043568904283137, - "learning_rate": 4.534721373072986e-06, - "loss": 0.1561, + "epoch": 0.02782512553554153, + "grad_norm": 1.1604132143904993, + "learning_rate": 2.78084714548803e-06, + "loss": 0.2001, "step": 302 }, { - "epoch": 0.2384887839433294, - "grad_norm": 1.087701432883666, - "learning_rate": 4.530930619253097e-06, - "loss": 0.1573, + "epoch": 0.027917261712811534, + "grad_norm": 1.300325855758057, + "learning_rate": 2.7900552486187847e-06, + "loss": 0.2034, "step": 303 }, { - "epoch": 0.239275875639512, - "grad_norm": 1.0432095830081018, - "learning_rate": 4.527126083822713e-06, - "loss": 0.1576, + "epoch": 0.02800939789008154, + "grad_norm": 1.3505529212765077, + "learning_rate": 2.79926335174954e-06, + "loss": 0.2161, "step": 304 }, { - "epoch": 0.2400629673356946, - "grad_norm": 1.1515388977241858, - "learning_rate": 4.523307792598877e-06, - "loss": 0.1836, + "epoch": 0.028101534067351545, + "grad_norm": 1.2614356404469431, + "learning_rate": 2.808471454880295e-06, + "loss": 0.1892, "step": 305 }, { - "epoch": 0.2408500590318772, - "grad_norm": 1.1236907370811289, - "learning_rate": 4.519475771491978e-06, - "loss": 0.1654, + "epoch": 0.02819367024462155, + "grad_norm": 1.461383179054911, + "learning_rate": 2.81767955801105e-06, + "loss": 0.2128, "step": 306 }, { - "epoch": 0.24163715072805983, - "grad_norm": 1.0492490872684002, - "learning_rate": 4.515630046505575e-06, - "loss": 0.1604, + "epoch": 0.028285806421891557, + "grad_norm": 1.25242901824847, + "learning_rate": 2.826887661141805e-06, + "loss": 0.22, "step": 307 }, { - "epoch": 0.24242424242424243, - "grad_norm": 1.0414505694174347, - "learning_rate": 4.511770643736217e-06, - "loss": 0.1587, + "epoch": 0.02837794259916156, + "grad_norm": 1.2270884655785574, + "learning_rate": 2.83609576427256e-06, + "loss": 0.1904, "step": 308 }, { - "epoch": 0.24321133412042503, - "grad_norm": 0.9963463131455829, - "learning_rate": 4.507897589373272e-06, - "loss": 0.1536, + "epoch": 0.028470078776431564, + "grad_norm": 1.23710825518334, + "learning_rate": 2.845303867403315e-06, + "loss": 0.2057, "step": 309 }, { - "epoch": 0.24399842581660763, - "grad_norm": 0.9437267739253786, - "learning_rate": 4.504010909698744e-06, - "loss": 0.1573, + "epoch": 0.02856221495370157, + "grad_norm": 1.2747541107670282, + "learning_rate": 2.8545119705340703e-06, + "loss": 0.2165, "step": 310 }, { - "epoch": 0.24478551751279023, - "grad_norm": 0.9915304289222059, - "learning_rate": 4.500110631087095e-06, - "loss": 0.1519, + "epoch": 0.028654351130971575, + "grad_norm": 1.5134645452681845, + "learning_rate": 2.8637200736648252e-06, + "loss": 0.2277, "step": 311 }, { - "epoch": 0.24557260920897284, - "grad_norm": 0.9782358310573961, - "learning_rate": 4.496196780005069e-06, - "loss": 0.1629, + "epoch": 0.028746487308241583, + "grad_norm": 1.2152059027151885, + "learning_rate": 2.87292817679558e-06, + "loss": 0.2047, "step": 312 }, { - "epoch": 0.24635970090515544, - "grad_norm": 1.0770165377269398, - "learning_rate": 4.492269383011512e-06, - "loss": 0.1623, + "epoch": 0.028838623485511587, + "grad_norm": 1.3277121581950153, + "learning_rate": 2.8821362799263355e-06, + "loss": 0.2256, "step": 313 }, { - "epoch": 0.24714679260133807, - "grad_norm": 1.052396599909024, - "learning_rate": 4.4883284667571894e-06, - "loss": 0.1533, + "epoch": 0.02893075966278159, + "grad_norm": 1.32730086437395, + "learning_rate": 2.8913443830570904e-06, + "loss": 0.2202, "step": 314 }, { - "epoch": 0.24793388429752067, - "grad_norm": 1.0084809840218907, - "learning_rate": 4.4843740579846055e-06, - "loss": 0.1512, + "epoch": 0.029022895840051598, + "grad_norm": 1.363811967060764, + "learning_rate": 2.9005524861878453e-06, + "loss": 0.2197, "step": 315 }, { - "epoch": 0.24872097599370327, - "grad_norm": 1.0756395659672484, - "learning_rate": 4.480406183527823e-06, - "loss": 0.1682, + "epoch": 0.0291150320173216, + "grad_norm": 1.2590769976128895, + "learning_rate": 2.9097605893186006e-06, + "loss": 0.2082, "step": 316 }, { - "epoch": 0.24950806768988587, - "grad_norm": 1.095604151904482, - "learning_rate": 4.476424870312286e-06, - "loss": 0.1588, + "epoch": 0.029207168194591605, + "grad_norm": 1.2277995813074711, + "learning_rate": 2.9189686924493555e-06, + "loss": 0.2061, "step": 317 }, { - "epoch": 0.2502951593860685, - "grad_norm": 1.073871876794014, - "learning_rate": 4.472430145354622e-06, - "loss": 0.1663, + "epoch": 0.029299304371861613, + "grad_norm": 1.3793722738217344, + "learning_rate": 2.9281767955801104e-06, + "loss": 0.1971, "step": 318 }, { - "epoch": 0.2510822510822511, - "grad_norm": 1.00181438336178, - "learning_rate": 4.46842203576248e-06, - "loss": 0.1668, + "epoch": 0.029391440549131616, + "grad_norm": 1.361895665830025, + "learning_rate": 2.937384898710866e-06, + "loss": 0.2427, "step": 319 }, { - "epoch": 0.2518693427784337, - "grad_norm": 1.0179064844212398, - "learning_rate": 4.464400568734327e-06, - "loss": 0.1618, + "epoch": 0.02948357672640162, + "grad_norm": 1.2966036850711782, + "learning_rate": 2.946593001841621e-06, + "loss": 0.2244, "step": 320 }, { - "epoch": 0.2526564344746163, - "grad_norm": 1.1266566093245078, - "learning_rate": 4.460365771559275e-06, - "loss": 0.1726, + "epoch": 0.029575712903671628, + "grad_norm": 1.1971066961262655, + "learning_rate": 2.955801104972376e-06, + "loss": 0.2265, "step": 321 }, { - "epoch": 0.2534435261707989, - "grad_norm": 1.0831980755033608, - "learning_rate": 4.456317671616892e-06, - "loss": 0.1674, + "epoch": 0.02966784908094163, + "grad_norm": 1.2180842224212953, + "learning_rate": 2.9650092081031313e-06, + "loss": 0.2238, "step": 322 }, { - "epoch": 0.2542306178669815, - "grad_norm": 0.9991360442603613, - "learning_rate": 4.452256296377017e-06, - "loss": 0.1534, + "epoch": 0.029759985258211635, + "grad_norm": 1.1589180181774166, + "learning_rate": 2.974217311233886e-06, + "loss": 0.1929, "step": 323 }, { - "epoch": 0.2550177095631641, - "grad_norm": 0.9497710360440503, - "learning_rate": 4.448181673399573e-06, - "loss": 0.1562, + "epoch": 0.029852121435481643, + "grad_norm": 1.2133350157084903, + "learning_rate": 2.983425414364641e-06, + "loss": 0.2134, "step": 324 }, { - "epoch": 0.25580480125934674, - "grad_norm": 1.1113260986403124, - "learning_rate": 4.444093830334381e-06, - "loss": 0.1639, + "epoch": 0.029944257612751646, + "grad_norm": 1.1779599790119306, + "learning_rate": 2.9926335174953964e-06, + "loss": 0.2066, "step": 325 }, { - "epoch": 0.2565918929555293, - "grad_norm": 1.1452949830587935, - "learning_rate": 4.4399927949209685e-06, - "loss": 0.1633, + "epoch": 0.03003639379002165, + "grad_norm": 1.1847663049212864, + "learning_rate": 3.0018416206261513e-06, + "loss": 0.1822, "step": 326 }, { - "epoch": 0.25737898465171194, - "grad_norm": 1.0842379105419755, - "learning_rate": 4.43587859498839e-06, - "loss": 0.1754, + "epoch": 0.030128529967291658, + "grad_norm": 1.2734701566041768, + "learning_rate": 3.0110497237569062e-06, + "loss": 0.2225, "step": 327 }, { - "epoch": 0.2581660763478945, - "grad_norm": 1.0361570331888057, - "learning_rate": 4.431751258455029e-06, - "loss": 0.1629, + "epoch": 0.03022066614456166, + "grad_norm": 1.456216336880786, + "learning_rate": 3.0202578268876615e-06, + "loss": 0.2303, "step": 328 }, { - "epoch": 0.25895316804407714, - "grad_norm": 0.9514704452172565, - "learning_rate": 4.4276108133284115e-06, - "loss": 0.1615, + "epoch": 0.03031280232183167, + "grad_norm": 1.3359492802492923, + "learning_rate": 3.0294659300184164e-06, + "loss": 0.2327, "step": 329 }, { - "epoch": 0.2597402597402597, - "grad_norm": 1.0051943736689641, - "learning_rate": 4.4234572877050175e-06, - "loss": 0.1635, + "epoch": 0.030404938499101673, + "grad_norm": 1.3170633885393337, + "learning_rate": 3.0386740331491713e-06, + "loss": 0.2113, "step": 330 }, { - "epoch": 0.26052735143644234, - "grad_norm": 1.061826511574687, - "learning_rate": 4.419290709770091e-06, - "loss": 0.1572, + "epoch": 0.030497074676371676, + "grad_norm": 1.3663569659678911, + "learning_rate": 3.0478821362799267e-06, + "loss": 0.2292, "step": 331 }, { - "epoch": 0.261314443132625, - "grad_norm": 1.0098180333606226, - "learning_rate": 4.415111107797445e-06, - "loss": 0.1625, + "epoch": 0.030589210853641684, + "grad_norm": 1.3094102769500708, + "learning_rate": 3.0570902394106816e-06, + "loss": 0.2158, "step": 332 }, { - "epoch": 0.26210153482880755, - "grad_norm": 0.9258158779374888, - "learning_rate": 4.4109185101492735e-06, - "loss": 0.163, + "epoch": 0.030681347030911688, + "grad_norm": 1.2497293850188491, + "learning_rate": 3.0662983425414365e-06, + "loss": 0.2161, "step": 333 }, { - "epoch": 0.2628886265249902, - "grad_norm": 1.031959410480149, - "learning_rate": 4.406712945275955e-06, - "loss": 0.1601, + "epoch": 0.03077348320818169, + "grad_norm": 1.1791717194673006, + "learning_rate": 3.075506445672192e-06, + "loss": 0.2042, "step": 334 }, { - "epoch": 0.26367571822117275, - "grad_norm": 1.098174422684468, - "learning_rate": 4.402494441715864e-06, - "loss": 0.1632, + "epoch": 0.0308656193854517, + "grad_norm": 1.2243353185304204, + "learning_rate": 3.0847145488029467e-06, + "loss": 0.2221, "step": 335 }, { - "epoch": 0.2644628099173554, - "grad_norm": 0.9325275936138202, - "learning_rate": 4.398263028095175e-06, - "loss": 0.1568, + "epoch": 0.030957755562721703, + "grad_norm": 1.2144934202478015, + "learning_rate": 3.0939226519337016e-06, + "loss": 0.2091, "step": 336 }, { - "epoch": 0.26524990161353795, - "grad_norm": 0.9452361980478395, - "learning_rate": 4.394018733127667e-06, - "loss": 0.1514, + "epoch": 0.031049891739991706, + "grad_norm": 1.3869201324530631, + "learning_rate": 3.103130755064457e-06, + "loss": 0.22, "step": 337 }, { - "epoch": 0.2660369933097206, - "grad_norm": 0.9440560796701104, - "learning_rate": 4.389761585614531e-06, - "loss": 0.1568, + "epoch": 0.031142027917261714, + "grad_norm": 1.2522412871255026, + "learning_rate": 3.112338858195212e-06, + "loss": 0.2012, "step": 338 }, { - "epoch": 0.2668240850059032, - "grad_norm": 0.9825093172685871, - "learning_rate": 4.3854916144441714e-06, - "loss": 0.1513, + "epoch": 0.031234164094531718, + "grad_norm": 1.223982875197098, + "learning_rate": 3.1215469613259667e-06, + "loss": 0.2362, "step": 339 }, { - "epoch": 0.2676111767020858, - "grad_norm": 0.9909422001877334, - "learning_rate": 4.381208848592017e-06, - "loss": 0.1607, + "epoch": 0.03132630027180172, + "grad_norm": 1.180490704761597, + "learning_rate": 3.130755064456722e-06, + "loss": 0.2006, "step": 340 }, { - "epoch": 0.2683982683982684, - "grad_norm": 1.026772957857381, - "learning_rate": 4.3769133171203146e-06, - "loss": 0.1579, + "epoch": 0.031418436449071725, + "grad_norm": 1.334003641795415, + "learning_rate": 3.139963167587477e-06, + "loss": 0.217, "step": 341 }, { - "epoch": 0.269185360094451, - "grad_norm": 0.9727634660522837, - "learning_rate": 4.372605049177939e-06, - "loss": 0.1611, + "epoch": 0.031510572626341736, + "grad_norm": 1.2176716108720274, + "learning_rate": 3.149171270718232e-06, + "loss": 0.2124, "step": 342 }, { - "epoch": 0.2699724517906336, - "grad_norm": 0.9991705382361779, - "learning_rate": 4.368284074000193e-06, - "loss": 0.1423, + "epoch": 0.03160270880361174, + "grad_norm": 1.3395536252432325, + "learning_rate": 3.1583793738489876e-06, + "loss": 0.2098, "step": 343 }, { - "epoch": 0.2707595434868162, - "grad_norm": 1.0413166825567135, - "learning_rate": 4.363950420908608e-06, - "loss": 0.1531, + "epoch": 0.031694844980881744, + "grad_norm": 1.283429869492175, + "learning_rate": 3.1675874769797425e-06, + "loss": 0.2314, "step": 344 }, { - "epoch": 0.2715466351829988, - "grad_norm": 1.051399331371367, - "learning_rate": 4.3596041193107475e-06, - "loss": 0.1537, + "epoch": 0.03178698115815175, + "grad_norm": 1.228230288659854, + "learning_rate": 3.176795580110498e-06, + "loss": 0.1965, "step": 345 }, { - "epoch": 0.27233372687918145, - "grad_norm": 1.1268002118202416, - "learning_rate": 4.355245198700003e-06, - "loss": 0.1687, + "epoch": 0.03187911733542175, + "grad_norm": 1.2440315402589066, + "learning_rate": 3.1860036832412528e-06, + "loss": 0.2225, "step": 346 }, { - "epoch": 0.273120818575364, - "grad_norm": 1.0579162910588005, - "learning_rate": 4.3508736886554e-06, - "loss": 0.1545, + "epoch": 0.031971253512691755, + "grad_norm": 1.2499654799548787, + "learning_rate": 3.1952117863720077e-06, + "loss": 0.2206, "step": 347 }, { - "epoch": 0.27390791027154665, - "grad_norm": 1.0780531804812832, - "learning_rate": 4.346489618841393e-06, - "loss": 0.1478, + "epoch": 0.032063389689961766, + "grad_norm": 1.2049177369182857, + "learning_rate": 3.204419889502763e-06, + "loss": 0.2035, "step": 348 }, { - "epoch": 0.2746950019677292, - "grad_norm": 1.1629336261073622, - "learning_rate": 4.342093019007664e-06, - "loss": 0.1507, + "epoch": 0.03215552586723177, + "grad_norm": 1.2316769248478436, + "learning_rate": 3.213627992633518e-06, + "loss": 0.2145, "step": 349 }, { - "epoch": 0.27548209366391185, - "grad_norm": 0.9806357134318359, - "learning_rate": 4.337683918988924e-06, - "loss": 0.1605, + "epoch": 0.032247662044501774, + "grad_norm": 1.2321172561939648, + "learning_rate": 3.222836095764273e-06, + "loss": 0.2112, "step": 350 }, { - "epoch": 0.2762691853600944, - "grad_norm": 1.0271547256327147, - "learning_rate": 4.333262348704708e-06, - "loss": 0.1544, + "epoch": 0.03233979822177178, + "grad_norm": 1.1523176756649027, + "learning_rate": 3.232044198895028e-06, + "loss": 0.2056, "step": 351 }, { - "epoch": 0.27705627705627706, - "grad_norm": 1.040963108089893, - "learning_rate": 4.328828338159173e-06, - "loss": 0.1505, + "epoch": 0.03243193439904178, + "grad_norm": 1.1236159577981801, + "learning_rate": 3.241252302025783e-06, + "loss": 0.1955, "step": 352 }, { - "epoch": 0.2778433687524597, - "grad_norm": 1.036202462349552, - "learning_rate": 4.324381917440891e-06, - "loss": 0.1558, + "epoch": 0.03252407057631179, + "grad_norm": 1.2130769573681421, + "learning_rate": 3.250460405156538e-06, + "loss": 0.2054, "step": 353 }, { - "epoch": 0.27863046044864226, - "grad_norm": 0.975994343559266, - "learning_rate": 4.319923116722651e-06, - "loss": 0.1641, + "epoch": 0.032616206753581796, + "grad_norm": 1.1913432800528299, + "learning_rate": 3.2596685082872933e-06, + "loss": 0.1981, "step": 354 }, { - "epoch": 0.2794175521448249, - "grad_norm": 1.039409188253541, - "learning_rate": 4.315451966261248e-06, - "loss": 0.1549, + "epoch": 0.0327083429308518, + "grad_norm": 1.3859609731695497, + "learning_rate": 3.268876611418048e-06, + "loss": 0.2342, "step": 355 }, { - "epoch": 0.28020464384100746, - "grad_norm": 1.047725080130562, - "learning_rate": 4.310968496397284e-06, - "loss": 0.165, + "epoch": 0.032800479108121804, + "grad_norm": 1.3072352238426874, + "learning_rate": 3.278084714548803e-06, + "loss": 0.2183, "step": 356 }, { - "epoch": 0.2809917355371901, - "grad_norm": 1.0011313336241248, - "learning_rate": 4.306472737554957e-06, - "loss": 0.1456, + "epoch": 0.03289261528539181, + "grad_norm": 1.3473692917629814, + "learning_rate": 3.2872928176795584e-06, + "loss": 0.2029, "step": 357 }, { - "epoch": 0.28177882723337266, - "grad_norm": 0.9015679935576075, - "learning_rate": 4.301964720241857e-06, - "loss": 0.1369, + "epoch": 0.03298475146266181, + "grad_norm": 1.2572639062134559, + "learning_rate": 3.2965009208103133e-06, + "loss": 0.2214, "step": 358 }, { - "epoch": 0.2825659189295553, - "grad_norm": 1.049381444767021, - "learning_rate": 4.297444475048755e-06, - "loss": 0.1563, + "epoch": 0.03307688763993182, + "grad_norm": 1.203052241995266, + "learning_rate": 3.305709023941068e-06, + "loss": 0.2026, "step": 359 }, { - "epoch": 0.2833530106257379, - "grad_norm": 1.0194195709152667, - "learning_rate": 4.292912032649403e-06, - "loss": 0.1649, + "epoch": 0.033169023817201826, + "grad_norm": 1.2948231954080816, + "learning_rate": 3.3149171270718235e-06, + "loss": 0.2209, "step": 360 }, { - "epoch": 0.2841401023219205, - "grad_norm": 0.957368492301693, - "learning_rate": 4.2883674238003195e-06, - "loss": 0.1515, + "epoch": 0.03326115999447183, + "grad_norm": 1.1961645679085242, + "learning_rate": 3.3241252302025784e-06, + "loss": 0.2119, "step": 361 }, { - "epoch": 0.2849271940181031, - "grad_norm": 1.1143901057936236, - "learning_rate": 4.2838106793405825e-06, - "loss": 0.1625, + "epoch": 0.033353296171741834, + "grad_norm": 1.2097174219855942, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.2142, "step": 362 }, { - "epoch": 0.2857142857142857, - "grad_norm": 1.0882799366794436, - "learning_rate": 4.2792418301916225e-06, - "loss": 0.153, + "epoch": 0.03344543234901184, + "grad_norm": 1.242640883301389, + "learning_rate": 3.3425414364640887e-06, + "loss": 0.21, "step": 363 }, { - "epoch": 0.2865013774104683, - "grad_norm": 1.0192177035415801, - "learning_rate": 4.274660907357009e-06, - "loss": 0.1645, + "epoch": 0.03353756852628184, + "grad_norm": 1.1964795201693772, + "learning_rate": 3.3517495395948436e-06, + "loss": 0.2179, "step": 364 }, { - "epoch": 0.2872884691066509, - "grad_norm": 1.0205240256871457, - "learning_rate": 4.2700679419222415e-06, - "loss": 0.1459, + "epoch": 0.03362970470355185, + "grad_norm": 1.2315695513233953, + "learning_rate": 3.3609576427255985e-06, + "loss": 0.2066, "step": 365 }, { - "epoch": 0.28807556080283353, - "grad_norm": 1.2141057030738465, - "learning_rate": 4.265462965054539e-06, - "loss": 0.1597, + "epoch": 0.033721840880821856, + "grad_norm": 1.112363729635578, + "learning_rate": 3.370165745856354e-06, + "loss": 0.2008, "step": 366 }, { - "epoch": 0.28886265249901616, - "grad_norm": 1.017121088131926, - "learning_rate": 4.260846008002631e-06, - "loss": 0.1619, + "epoch": 0.03381397705809186, + "grad_norm": 1.2655168156807526, + "learning_rate": 3.379373848987109e-06, + "loss": 0.193, "step": 367 }, { - "epoch": 0.28964974419519873, - "grad_norm": 1.0877328731947116, - "learning_rate": 4.25621710209654e-06, - "loss": 0.1716, + "epoch": 0.033906113235361864, + "grad_norm": 1.1496127915515584, + "learning_rate": 3.3885819521178644e-06, + "loss": 0.1982, "step": 368 }, { - "epoch": 0.29043683589138136, - "grad_norm": 1.1099554764936985, - "learning_rate": 4.251576278747372e-06, - "loss": 0.1599, + "epoch": 0.03399824941263187, + "grad_norm": 1.174710108082871, + "learning_rate": 3.3977900552486193e-06, + "loss": 0.1977, "step": 369 }, { - "epoch": 0.29122392758756394, - "grad_norm": 0.962048395782395, - "learning_rate": 4.246923569447105e-06, - "loss": 0.1465, + "epoch": 0.03409038558990188, + "grad_norm": 1.2707966093570304, + "learning_rate": 3.4069981583793742e-06, + "loss": 0.2018, "step": 370 }, { - "epoch": 0.29201101928374656, - "grad_norm": 1.0782102946203345, - "learning_rate": 4.24225900576837e-06, - "loss": 0.1584, + "epoch": 0.03418252176717188, + "grad_norm": 1.2562422642370359, + "learning_rate": 3.416206261510129e-06, + "loss": 0.2088, "step": 371 }, { - "epoch": 0.29279811097992914, - "grad_norm": 1.0600722154446367, - "learning_rate": 4.237582619364244e-06, - "loss": 0.1518, + "epoch": 0.034274657944441886, + "grad_norm": 1.2280848584726436, + "learning_rate": 3.4254143646408845e-06, + "loss": 0.2105, "step": 372 }, { - "epoch": 0.29358520267611177, - "grad_norm": 1.0154082912245785, - "learning_rate": 4.23289444196803e-06, - "loss": 0.1455, + "epoch": 0.03436679412171189, + "grad_norm": 1.217850057779426, + "learning_rate": 3.4346224677716394e-06, + "loss": 0.2063, "step": 373 }, { - "epoch": 0.2943722943722944, - "grad_norm": 1.1254176051245297, - "learning_rate": 4.228194505393041e-06, - "loss": 0.1544, + "epoch": 0.034458930298981894, + "grad_norm": 1.2091286639825245, + "learning_rate": 3.4438305709023943e-06, + "loss": 0.2128, "step": 374 }, { - "epoch": 0.29515938606847697, - "grad_norm": 1.1003313998342341, - "learning_rate": 4.22348284153239e-06, - "loss": 0.1611, + "epoch": 0.0345510664762519, + "grad_norm": 1.1715733041783976, + "learning_rate": 3.4530386740331496e-06, + "loss": 0.2021, "step": 375 }, { - "epoch": 0.2959464777646596, - "grad_norm": 0.9110264218620379, - "learning_rate": 4.218759482358765e-06, - "loss": 0.1479, + "epoch": 0.03464320265352191, + "grad_norm": 1.2001707885169832, + "learning_rate": 3.4622467771639045e-06, + "loss": 0.2257, "step": 376 }, { - "epoch": 0.2967335694608422, - "grad_norm": 1.0433752096490876, - "learning_rate": 4.214024459924221e-06, - "loss": 0.1561, + "epoch": 0.03473533883079191, + "grad_norm": 1.2368175910547214, + "learning_rate": 3.4714548802946594e-06, + "loss": 0.2145, "step": 377 }, { - "epoch": 0.2975206611570248, - "grad_norm": 0.9985242964251728, - "learning_rate": 4.209277806359956e-06, - "loss": 0.1486, + "epoch": 0.034827475008061916, + "grad_norm": 1.166337000359825, + "learning_rate": 3.4806629834254147e-06, + "loss": 0.2079, "step": 378 }, { - "epoch": 0.2983077528532074, - "grad_norm": 0.9830203630270159, - "learning_rate": 4.204519553876095e-06, - "loss": 0.153, + "epoch": 0.03491961118533192, + "grad_norm": 1.1572816320295447, + "learning_rate": 3.4898710865561696e-06, + "loss": 0.1901, "step": 379 }, { - "epoch": 0.29909484454939, - "grad_norm": 1.0623334389041004, - "learning_rate": 4.199749734761473e-06, - "loss": 0.1584, + "epoch": 0.035011747362601923, + "grad_norm": 1.1237321439729187, + "learning_rate": 3.4990791896869245e-06, + "loss": 0.2007, "step": 380 }, { - "epoch": 0.29988193624557263, - "grad_norm": 1.007050119697646, - "learning_rate": 4.194968381383414e-06, - "loss": 0.162, + "epoch": 0.03510388353987193, + "grad_norm": 1.215928962970433, + "learning_rate": 3.50828729281768e-06, + "loss": 0.2195, "step": 381 }, { - "epoch": 0.3006690279417552, - "grad_norm": 0.9212276043601202, - "learning_rate": 4.1901755261875116e-06, - "loss": 0.1417, + "epoch": 0.03519601971714194, + "grad_norm": 1.251329116724435, + "learning_rate": 3.5174953959484348e-06, + "loss": 0.2066, "step": 382 }, { - "epoch": 0.30145611963793784, - "grad_norm": 1.0195210503773229, - "learning_rate": 4.18537120169741e-06, - "loss": 0.1631, + "epoch": 0.03528815589441194, + "grad_norm": 1.2234531446138686, + "learning_rate": 3.5267034990791897e-06, + "loss": 0.2093, "step": 383 }, { - "epoch": 0.3022432113341204, - "grad_norm": 0.9791393783618954, - "learning_rate": 4.1805554405145805e-06, - "loss": 0.151, + "epoch": 0.035380292071681946, + "grad_norm": 1.2252920368192435, + "learning_rate": 3.535911602209945e-06, + "loss": 0.2202, "step": 384 }, { - "epoch": 0.30303030303030304, - "grad_norm": 0.9560471554995319, - "learning_rate": 4.175728275318105e-06, - "loss": 0.1537, + "epoch": 0.03547242824895195, + "grad_norm": 1.2609463141051451, + "learning_rate": 3.5451197053407e-06, + "loss": 0.2001, "step": 385 }, { - "epoch": 0.3038173947264856, - "grad_norm": 0.9732207377472094, - "learning_rate": 4.170889738864448e-06, - "loss": 0.1541, + "epoch": 0.03556456442622195, + "grad_norm": 1.2927472157292323, + "learning_rate": 3.554327808471455e-06, + "loss": 0.2256, "step": 386 }, { - "epoch": 0.30460448642266824, - "grad_norm": 1.0273971232086052, - "learning_rate": 4.166039863987241e-06, - "loss": 0.1623, + "epoch": 0.035656700603491964, + "grad_norm": 1.155641554981568, + "learning_rate": 3.56353591160221e-06, + "loss": 0.199, "step": 387 }, { - "epoch": 0.30539157811885087, - "grad_norm": 1.0066781633766182, - "learning_rate": 4.161178683597055e-06, - "loss": 0.1623, + "epoch": 0.03574883678076197, + "grad_norm": 1.2654887045193257, + "learning_rate": 3.572744014732965e-06, + "loss": 0.2078, "step": 388 }, { - "epoch": 0.30617866981503344, - "grad_norm": 0.9519906303887405, - "learning_rate": 4.156306230681178e-06, - "loss": 0.1606, + "epoch": 0.03584097295803197, + "grad_norm": 1.2166982932381427, + "learning_rate": 3.58195211786372e-06, + "loss": 0.2103, "step": 389 }, { - "epoch": 0.3069657615112161, - "grad_norm": 1.0274010773396909, - "learning_rate": 4.151422538303393e-06, - "loss": 0.1588, + "epoch": 0.035933109135301976, + "grad_norm": 1.37669075363763, + "learning_rate": 3.5911602209944757e-06, + "loss": 0.2152, "step": 390 }, { - "epoch": 0.30775285320739865, - "grad_norm": 1.0092975668609663, - "learning_rate": 4.1465276396037516e-06, - "loss": 0.1549, + "epoch": 0.03602524531257198, + "grad_norm": 1.3416512571374115, + "learning_rate": 3.6003683241252306e-06, + "loss": 0.1992, "step": 391 }, { - "epoch": 0.3085399449035813, - "grad_norm": 0.9952464194936945, - "learning_rate": 4.141621567798351e-06, - "loss": 0.1468, + "epoch": 0.03611738148984198, + "grad_norm": 1.229234386328588, + "learning_rate": 3.609576427255986e-06, + "loss": 0.2187, "step": 392 }, { - "epoch": 0.30932703659976385, - "grad_norm": 0.9809046515355889, - "learning_rate": 4.136704356179105e-06, - "loss": 0.1509, + "epoch": 0.036209517667111994, + "grad_norm": 1.1832506871580368, + "learning_rate": 3.618784530386741e-06, + "loss": 0.1978, "step": 393 }, { - "epoch": 0.3101141282959465, - "grad_norm": 1.0624718823483572, - "learning_rate": 4.131776038113524e-06, - "loss": 0.1629, + "epoch": 0.036301653844382, + "grad_norm": 1.2383981566876872, + "learning_rate": 3.6279926335174957e-06, + "loss": 0.207, "step": 394 }, { - "epoch": 0.3109012199921291, - "grad_norm": 0.9361042861540975, - "learning_rate": 4.126836647044484e-06, - "loss": 0.1453, + "epoch": 0.036393790021652, + "grad_norm": 1.1413755558262018, + "learning_rate": 3.637200736648251e-06, + "loss": 0.2155, "step": 395 }, { - "epoch": 0.3116883116883117, - "grad_norm": 1.0675702598561039, - "learning_rate": 4.121886216489999e-06, - "loss": 0.1657, + "epoch": 0.036485926198922006, + "grad_norm": 1.2450403165766708, + "learning_rate": 3.646408839779006e-06, + "loss": 0.2142, "step": 396 }, { - "epoch": 0.3124754033844943, - "grad_norm": 1.0221190108601212, - "learning_rate": 4.116924780042997e-06, - "loss": 0.1609, + "epoch": 0.03657806237619201, + "grad_norm": 1.2177024569527306, + "learning_rate": 3.655616942909761e-06, + "loss": 0.2167, "step": 397 }, { - "epoch": 0.3132624950806769, - "grad_norm": 0.98812521742716, - "learning_rate": 4.111952371371091e-06, - "loss": 0.1488, + "epoch": 0.03667019855346201, + "grad_norm": 1.2723658885665146, + "learning_rate": 3.664825046040516e-06, + "loss": 0.2194, "step": 398 }, { - "epoch": 0.3140495867768595, - "grad_norm": 0.9689235987787954, - "learning_rate": 4.106969024216348e-06, - "loss": 0.1547, + "epoch": 0.036762334730732024, + "grad_norm": 1.174670391591689, + "learning_rate": 3.674033149171271e-06, + "loss": 0.2089, "step": 399 }, { - "epoch": 0.3148366784730421, - "grad_norm": 1.0046393279094348, - "learning_rate": 4.101974772395066e-06, - "loss": 0.1467, + "epoch": 0.03685447090800203, + "grad_norm": 1.2221763371090841, + "learning_rate": 3.683241252302026e-06, + "loss": 0.2106, "step": 400 }, { - "epoch": 0.3156237701692247, - "grad_norm": 0.968527185963086, - "learning_rate": 4.096969649797534e-06, - "loss": 0.1432, + "epoch": 0.03694660708527203, + "grad_norm": 1.2965457839204568, + "learning_rate": 3.6924493554327813e-06, + "loss": 0.2099, "step": 401 }, { - "epoch": 0.31641086186540734, - "grad_norm": 1.0188815460176754, - "learning_rate": 4.091953690387815e-06, - "loss": 0.1521, + "epoch": 0.037038743262542036, + "grad_norm": 1.2033987453656643, + "learning_rate": 3.7016574585635362e-06, + "loss": 0.2095, "step": 402 }, { - "epoch": 0.3171979535615899, - "grad_norm": 1.035965071382904, - "learning_rate": 4.086926928203506e-06, - "loss": 0.1575, + "epoch": 0.03713087943981204, + "grad_norm": 1.2068890988997176, + "learning_rate": 3.710865561694291e-06, + "loss": 0.201, "step": 403 }, { - "epoch": 0.31798504525777255, - "grad_norm": 1.0400881738148544, - "learning_rate": 4.081889397355509e-06, - "loss": 0.1646, + "epoch": 0.03722301561708205, + "grad_norm": 1.203262066657531, + "learning_rate": 3.7200736648250464e-06, + "loss": 0.2073, "step": 404 }, { - "epoch": 0.3187721369539551, - "grad_norm": 1.0353365656909388, - "learning_rate": 4.076841132027805e-06, - "loss": 0.1578, + "epoch": 0.037315151794352054, + "grad_norm": 1.2129025553528847, + "learning_rate": 3.7292817679558014e-06, + "loss": 0.2214, "step": 405 }, { - "epoch": 0.31955922865013775, - "grad_norm": 0.9785090873779988, - "learning_rate": 4.071782166477213e-06, - "loss": 0.1485, + "epoch": 0.03740728797162206, + "grad_norm": 1.2850805303315755, + "learning_rate": 3.7384898710865563e-06, + "loss": 0.2202, "step": 406 }, { - "epoch": 0.3203463203463203, - "grad_norm": 1.0365440161437718, - "learning_rate": 4.066712535033164e-06, - "loss": 0.1644, + "epoch": 0.03749942414889206, + "grad_norm": 1.1849304905609503, + "learning_rate": 3.7476979742173116e-06, + "loss": 0.1994, "step": 407 }, { - "epoch": 0.32113341204250295, - "grad_norm": 0.9337858697268638, - "learning_rate": 4.061632272097467e-06, - "loss": 0.1396, + "epoch": 0.037591560326162066, + "grad_norm": 1.1447297755974901, + "learning_rate": 3.7569060773480665e-06, + "loss": 0.1877, "step": 408 }, { - "epoch": 0.3219205037386856, - "grad_norm": 0.9930564105014524, - "learning_rate": 4.056541412144073e-06, - "loss": 0.1466, + "epoch": 0.03768369650343207, + "grad_norm": 1.3731333236915204, + "learning_rate": 3.7661141804788214e-06, + "loss": 0.2068, "step": 409 }, { - "epoch": 0.32270759543486816, - "grad_norm": 1.0123860857315623, - "learning_rate": 4.051439989718845e-06, - "loss": 0.1718, + "epoch": 0.03777583268070208, + "grad_norm": 1.3507563680134274, + "learning_rate": 3.7753222836095767e-06, + "loss": 0.2184, "step": 410 }, { - "epoch": 0.3234946871310508, - "grad_norm": 0.9886983463565112, - "learning_rate": 4.0463280394393216e-06, - "loss": 0.1465, + "epoch": 0.037867968857972084, + "grad_norm": 1.2967176512561887, + "learning_rate": 3.7845303867403316e-06, + "loss": 0.1949, "step": 411 }, { - "epoch": 0.32428177882723336, - "grad_norm": 0.9489896550219313, - "learning_rate": 4.041205595994478e-06, - "loss": 0.1553, + "epoch": 0.03796010503524209, + "grad_norm": 1.1615648678608848, + "learning_rate": 3.7937384898710865e-06, + "loss": 0.1906, "step": 412 }, { - "epoch": 0.325068870523416, - "grad_norm": 0.935055903913981, - "learning_rate": 4.036072694144501e-06, - "loss": 0.1486, + "epoch": 0.03805224121251209, + "grad_norm": 1.1806971928220382, + "learning_rate": 3.802946593001842e-06, + "loss": 0.1814, "step": 413 }, { - "epoch": 0.32585596221959856, - "grad_norm": 1.0109287737515016, - "learning_rate": 4.030929368720539e-06, - "loss": 0.1563, + "epoch": 0.038144377389782096, + "grad_norm": 1.273886277912273, + "learning_rate": 3.812154696132597e-06, + "loss": 0.2248, "step": 414 }, { - "epoch": 0.3266430539157812, - "grad_norm": 0.9682667210224672, - "learning_rate": 4.025775654624481e-06, - "loss": 0.154, + "epoch": 0.038236513567052106, + "grad_norm": 1.1335712217012766, + "learning_rate": 3.8213627992633525e-06, + "loss": 0.2128, "step": 415 }, { - "epoch": 0.3274301456119638, - "grad_norm": 0.9195115794003238, - "learning_rate": 4.020611586828705e-06, - "loss": 0.1433, + "epoch": 0.03832864974432211, + "grad_norm": 1.1793652832224044, + "learning_rate": 3.830570902394107e-06, + "loss": 0.1874, "step": 416 }, { - "epoch": 0.3282172373081464, - "grad_norm": 0.886911970381121, - "learning_rate": 4.015437200375855e-06, - "loss": 0.1374, + "epoch": 0.038420785921592114, + "grad_norm": 1.2334577784320606, + "learning_rate": 3.839779005524862e-06, + "loss": 0.2141, "step": 417 }, { - "epoch": 0.329004329004329, - "grad_norm": 1.021240159520919, - "learning_rate": 4.01025253037859e-06, - "loss": 0.1567, + "epoch": 0.03851292209886212, + "grad_norm": 1.1609662659175002, + "learning_rate": 3.848987108655617e-06, + "loss": 0.2094, "step": 418 }, { - "epoch": 0.3297914207005116, - "grad_norm": 0.9462875663398478, - "learning_rate": 4.005057612019353e-06, - "loss": 0.1516, + "epoch": 0.03860505827613212, + "grad_norm": 1.1745343514754072, + "learning_rate": 3.858195211786372e-06, + "loss": 0.1954, "step": 419 }, { - "epoch": 0.3305785123966942, - "grad_norm": 0.9850150964188347, - "learning_rate": 3.9998524805501335e-06, - "loss": 0.149, + "epoch": 0.038697194453402126, + "grad_norm": 1.2526388364382148, + "learning_rate": 3.867403314917128e-06, + "loss": 0.2275, "step": 420 }, { - "epoch": 0.3313656040928768, - "grad_norm": 1.0466582919958791, - "learning_rate": 3.994637171292223e-06, - "loss": 0.1504, + "epoch": 0.038789330630672136, + "grad_norm": 1.4330497133104598, + "learning_rate": 3.876611418047883e-06, + "loss": 0.2126, "step": 421 }, { - "epoch": 0.3321526957890594, - "grad_norm": 0.9892923577104711, - "learning_rate": 3.989411719635979e-06, - "loss": 0.1465, + "epoch": 0.03888146680794214, + "grad_norm": 1.2562815683824222, + "learning_rate": 3.885819521178638e-06, + "loss": 0.2024, "step": 422 }, { - "epoch": 0.33293978748524206, - "grad_norm": 1.0797299646481497, - "learning_rate": 3.984176161040585e-06, - "loss": 0.1655, + "epoch": 0.038973602985212144, + "grad_norm": 1.4160966441130192, + "learning_rate": 3.8950276243093926e-06, + "loss": 0.2131, "step": 423 }, { - "epoch": 0.33372687918142463, - "grad_norm": 1.0280855278386611, - "learning_rate": 3.978930531033807e-06, - "loss": 0.1614, + "epoch": 0.03906573916248215, + "grad_norm": 1.3816473101884428, + "learning_rate": 3.9042357274401475e-06, + "loss": 0.2113, "step": 424 }, { - "epoch": 0.33451397087760726, - "grad_norm": 1.0013220452351206, - "learning_rate": 3.973674865211754e-06, - "loss": 0.1529, + "epoch": 0.03915787533975215, + "grad_norm": 1.2554377015630513, + "learning_rate": 3.913443830570902e-06, + "loss": 0.2132, "step": 425 }, { - "epoch": 0.33530106257378983, - "grad_norm": 1.0185754306460224, - "learning_rate": 3.968409199238639e-06, - "loss": 0.1535, + "epoch": 0.039250011517022156, + "grad_norm": 1.3398302209339412, + "learning_rate": 3.922651933701658e-06, + "loss": 0.2099, "step": 426 }, { - "epoch": 0.33608815426997246, - "grad_norm": 0.9680713968649773, - "learning_rate": 3.963133568846533e-06, - "loss": 0.1532, + "epoch": 0.039342147694292166, + "grad_norm": 1.3603810885516354, + "learning_rate": 3.931860036832413e-06, + "loss": 0.2125, "step": 427 }, { - "epoch": 0.33687524596615503, - "grad_norm": 1.0465737054070945, - "learning_rate": 3.957848009835125e-06, - "loss": 0.1557, + "epoch": 0.03943428387156217, + "grad_norm": 1.258039947717146, + "learning_rate": 3.941068139963168e-06, + "loss": 0.2057, "step": 428 }, { - "epoch": 0.33766233766233766, - "grad_norm": 1.0072097384887637, - "learning_rate": 3.952552558071475e-06, - "loss": 0.1686, + "epoch": 0.039526420048832174, + "grad_norm": 1.2036522078809784, + "learning_rate": 3.950276243093923e-06, + "loss": 0.1912, "step": 429 }, { - "epoch": 0.3384494293585203, - "grad_norm": 1.0495298691679416, - "learning_rate": 3.947247249489779e-06, - "loss": 0.1487, + "epoch": 0.03961855622610218, + "grad_norm": 1.2675274340086102, + "learning_rate": 3.959484346224678e-06, + "loss": 0.2224, "step": 430 }, { - "epoch": 0.33923652105470287, - "grad_norm": 1.0214586461562896, - "learning_rate": 3.941932120091116e-06, - "loss": 0.1621, + "epoch": 0.03971069240337218, + "grad_norm": 1.3973388775812086, + "learning_rate": 3.968692449355433e-06, + "loss": 0.2296, "step": 431 }, { - "epoch": 0.3400236127508855, - "grad_norm": 1.0494096714602847, - "learning_rate": 3.93660720594321e-06, - "loss": 0.1598, + "epoch": 0.03980282858064219, + "grad_norm": 1.543316930802054, + "learning_rate": 3.977900552486188e-06, + "loss": 0.2062, "step": 432 }, { - "epoch": 0.34081070444706807, - "grad_norm": 1.0334818385570048, - "learning_rate": 3.93127254318018e-06, - "loss": 0.1577, + "epoch": 0.039894964757912196, + "grad_norm": 1.1195071306132687, + "learning_rate": 3.987108655616943e-06, + "loss": 0.2085, "step": 433 }, { - "epoch": 0.3415977961432507, - "grad_norm": 0.9700994625756835, - "learning_rate": 3.925928168002302e-06, - "loss": 0.1526, + "epoch": 0.0399871009351822, + "grad_norm": 1.4042138133782505, + "learning_rate": 3.996316758747698e-06, + "loss": 0.1929, "step": 434 }, { - "epoch": 0.34238488783943327, - "grad_norm": 1.047736033995709, - "learning_rate": 3.920574116675756e-06, - "loss": 0.1581, + "epoch": 0.040079237112452204, + "grad_norm": 1.458286673287262, + "learning_rate": 4.005524861878453e-06, + "loss": 0.2097, "step": 435 }, { - "epoch": 0.3431719795356159, - "grad_norm": 1.0493869403649712, - "learning_rate": 3.915210425532383e-06, - "loss": 0.1495, + "epoch": 0.04017137328972221, + "grad_norm": 1.2963948315513325, + "learning_rate": 4.014732965009208e-06, + "loss": 0.2118, "step": 436 }, { - "epoch": 0.34395907123179853, - "grad_norm": 1.010254528268069, - "learning_rate": 3.90983713096944e-06, - "loss": 0.1539, + "epoch": 0.04026350946699221, + "grad_norm": 1.559050465013681, + "learning_rate": 4.023941068139964e-06, + "loss": 0.2149, "step": 437 }, { - "epoch": 0.3447461629279811, - "grad_norm": 0.9846398029609658, - "learning_rate": 3.9044542694493515e-06, - "loss": 0.1463, + "epoch": 0.04035564564426222, + "grad_norm": 1.2769319583081065, + "learning_rate": 4.033149171270719e-06, + "loss": 0.2219, "step": 438 }, { - "epoch": 0.34553325462416373, - "grad_norm": 1.2083136674514858, - "learning_rate": 3.899061877499461e-06, - "loss": 0.1601, + "epoch": 0.040447781821532226, + "grad_norm": 1.3096848506654442, + "learning_rate": 4.0423572744014736e-06, + "loss": 0.1945, "step": 439 }, { - "epoch": 0.3463203463203463, - "grad_norm": 0.97978554217786, - "learning_rate": 3.893659991711782e-06, - "loss": 0.139, + "epoch": 0.04053991799880223, + "grad_norm": 1.185303690120011, + "learning_rate": 4.051565377532229e-06, + "loss": 0.1891, "step": 440 }, { - "epoch": 0.34710743801652894, - "grad_norm": 1.1022405018344112, - "learning_rate": 3.888248648742756e-06, - "loss": 0.1617, + "epoch": 0.040632054176072234, + "grad_norm": 1.159162633689249, + "learning_rate": 4.060773480662984e-06, + "loss": 0.2175, "step": 441 }, { - "epoch": 0.3478945297127115, - "grad_norm": 1.0077367730076683, - "learning_rate": 3.882827885312999e-06, - "loss": 0.1488, + "epoch": 0.04072419035334224, + "grad_norm": 1.2662580758209543, + "learning_rate": 4.069981583793739e-06, + "loss": 0.2064, "step": 442 }, { - "epoch": 0.34868162140889414, - "grad_norm": 1.0119669193080498, - "learning_rate": 3.877397738207051e-06, - "loss": 0.1433, + "epoch": 0.04081632653061224, + "grad_norm": 1.1246911765480914, + "learning_rate": 4.079189686924494e-06, + "loss": 0.219, "step": 443 }, { - "epoch": 0.34946871310507677, - "grad_norm": 0.9336119872704435, - "learning_rate": 3.8719582442731276e-06, - "loss": 0.1393, + "epoch": 0.04090846270788225, + "grad_norm": 1.1350391555698733, + "learning_rate": 4.088397790055249e-06, + "loss": 0.203, "step": 444 }, { - "epoch": 0.35025580480125934, - "grad_norm": 1.0144372790745282, - "learning_rate": 3.866509440422873e-06, - "loss": 0.1515, + "epoch": 0.041000598885152256, + "grad_norm": 1.4680786044597403, + "learning_rate": 4.097605893186004e-06, + "loss": 0.2131, "step": 445 }, { - "epoch": 0.35104289649744197, - "grad_norm": 1.0618851735205919, - "learning_rate": 3.861051363631107e-06, - "loss": 0.1403, + "epoch": 0.04109273506242226, + "grad_norm": 1.2159762672968413, + "learning_rate": 4.106813996316759e-06, + "loss": 0.2029, "step": 446 }, { - "epoch": 0.35182998819362454, - "grad_norm": 1.0256940692518137, - "learning_rate": 3.855584050935574e-06, - "loss": 0.1533, + "epoch": 0.041184871239692264, + "grad_norm": 1.289465041658357, + "learning_rate": 4.1160220994475145e-06, + "loss": 0.2069, "step": 447 }, { - "epoch": 0.3526170798898072, - "grad_norm": 1.004262449427633, - "learning_rate": 3.85010753943669e-06, - "loss": 0.1437, + "epoch": 0.04127700741696227, + "grad_norm": 1.577269595539687, + "learning_rate": 4.125230202578269e-06, + "loss": 0.2179, "step": 448 }, { - "epoch": 0.35340417158598975, - "grad_norm": 0.9608822952661715, - "learning_rate": 3.844621866297295e-06, - "loss": 0.1374, + "epoch": 0.04136914359423228, + "grad_norm": 1.2817783420567126, + "learning_rate": 4.134438305709024e-06, + "loss": 0.2043, "step": 449 }, { - "epoch": 0.3541912632821724, - "grad_norm": 1.032805552257636, - "learning_rate": 3.839127068742399e-06, - "loss": 0.1612, + "epoch": 0.04146127977150228, + "grad_norm": 1.1310315648514593, + "learning_rate": 4.143646408839779e-06, + "loss": 0.2162, "step": 450 }, { - "epoch": 0.354978354978355, - "grad_norm": 1.089158815357864, - "learning_rate": 3.833623184058926e-06, - "loss": 0.1564, + "epoch": 0.041553415948772286, + "grad_norm": 1.3119107380289001, + "learning_rate": 4.152854511970534e-06, + "loss": 0.2197, "step": 451 }, { - "epoch": 0.3557654466745376, - "grad_norm": 1.0527347217082683, - "learning_rate": 3.8281102495954684e-06, - "loss": 0.1475, + "epoch": 0.04164555212604229, + "grad_norm": 1.3865007259353652, + "learning_rate": 4.162062615101289e-06, + "loss": 0.24, "step": 452 }, { - "epoch": 0.3565525383707202, - "grad_norm": 1.012969201356965, - "learning_rate": 3.8225883027620245e-06, - "loss": 0.1443, + "epoch": 0.041737688303312294, + "grad_norm": 1.1858485712993334, + "learning_rate": 4.171270718232045e-06, + "loss": 0.209, "step": 453 }, { - "epoch": 0.3573396300669028, - "grad_norm": 0.9952397622221426, - "learning_rate": 3.817057381029752e-06, - "loss": 0.1488, + "epoch": 0.0418298244805823, + "grad_norm": 1.1178283987286062, + "learning_rate": 4.1804788213628e-06, + "loss": 0.1983, "step": 454 }, { - "epoch": 0.3581267217630854, - "grad_norm": 0.9773911500192811, - "learning_rate": 3.811517521930711e-06, - "loss": 0.1419, + "epoch": 0.04192196065785231, + "grad_norm": 1.201810219473216, + "learning_rate": 4.1896869244935545e-06, + "loss": 0.2191, "step": 455 }, { - "epoch": 0.358913813459268, - "grad_norm": 1.04344141144674, - "learning_rate": 3.805968763057609e-06, - "loss": 0.1335, + "epoch": 0.04201409683512231, + "grad_norm": 1.2475953832029654, + "learning_rate": 4.1988950276243095e-06, + "loss": 0.2023, "step": 456 }, { - "epoch": 0.3597009051554506, - "grad_norm": 0.9127224357677829, - "learning_rate": 3.8004111420635453e-06, - "loss": 0.1421, + "epoch": 0.042106233012392316, + "grad_norm": 1.1804605088793463, + "learning_rate": 4.208103130755064e-06, + "loss": 0.2032, "step": 457 }, { - "epoch": 0.36048799685163324, - "grad_norm": 0.948335811441799, - "learning_rate": 3.7948446966617568e-06, - "loss": 0.1545, + "epoch": 0.04219836918966232, + "grad_norm": 1.2262939863172024, + "learning_rate": 4.217311233885819e-06, + "loss": 0.2193, "step": 458 }, { - "epoch": 0.3612750885478158, - "grad_norm": 1.054156015531643, - "learning_rate": 3.7892694646253624e-06, - "loss": 0.1462, + "epoch": 0.042290505366932324, + "grad_norm": 1.2144298896507224, + "learning_rate": 4.226519337016575e-06, + "loss": 0.221, "step": 459 }, { - "epoch": 0.36206218024399844, - "grad_norm": 1.0883694334017704, - "learning_rate": 3.783685483787105e-06, - "loss": 0.1469, + "epoch": 0.04238264154420233, + "grad_norm": 1.2934211101242432, + "learning_rate": 4.23572744014733e-06, + "loss": 0.2258, "step": 460 }, { - "epoch": 0.362849271940181, - "grad_norm": 1.0265972829923478, - "learning_rate": 3.7780927920390965e-06, - "loss": 0.1572, + "epoch": 0.04247477772147234, + "grad_norm": 1.1425010577356645, + "learning_rate": 4.244935543278086e-06, + "loss": 0.2027, "step": 461 }, { - "epoch": 0.36363636363636365, - "grad_norm": 0.9351090223515385, - "learning_rate": 3.772491427332557e-06, - "loss": 0.1317, + "epoch": 0.04256691389874234, + "grad_norm": 1.185579804260426, + "learning_rate": 4.2541436464088406e-06, + "loss": 0.2351, "step": 462 }, { - "epoch": 0.3644234553325462, - "grad_norm": 0.96672130032329, - "learning_rate": 3.766881427677563e-06, - "loss": 0.1474, + "epoch": 0.042659050076012346, + "grad_norm": 1.1729170129773308, + "learning_rate": 4.2633517495395955e-06, + "loss": 0.2035, "step": 463 }, { - "epoch": 0.36521054702872885, - "grad_norm": 0.9284227954997755, - "learning_rate": 3.761262831142788e-06, - "loss": 0.144, + "epoch": 0.04275118625328235, + "grad_norm": 1.1888699825576086, + "learning_rate": 4.27255985267035e-06, + "loss": 0.2234, "step": 464 }, { - "epoch": 0.3659976387249115, - "grad_norm": 1.02329013434613, - "learning_rate": 3.755635675855238e-06, - "loss": 0.1459, + "epoch": 0.042843322430552354, + "grad_norm": 1.0956917428584472, + "learning_rate": 4.281767955801105e-06, + "loss": 0.1724, "step": 465 }, { - "epoch": 0.36678473042109405, - "grad_norm": 0.9548918394606087, - "learning_rate": 3.7500000000000005e-06, - "loss": 0.1431, + "epoch": 0.042935458607822365, + "grad_norm": 1.2428999853822662, + "learning_rate": 4.29097605893186e-06, + "loss": 0.2288, "step": 466 }, { - "epoch": 0.3675718221172767, - "grad_norm": 1.0029018534160843, - "learning_rate": 3.744355841819983e-06, - "loss": 0.1551, + "epoch": 0.04302759478509237, + "grad_norm": 1.1530798568369878, + "learning_rate": 4.300184162062616e-06, + "loss": 0.2017, "step": 467 }, { - "epoch": 0.36835891381345925, - "grad_norm": 1.0170466682076178, - "learning_rate": 3.7387032396156497e-06, - "loss": 0.1574, + "epoch": 0.04311973096236237, + "grad_norm": 1.3439487754256692, + "learning_rate": 4.309392265193371e-06, + "loss": 0.2096, "step": 468 }, { - "epoch": 0.3691460055096419, - "grad_norm": 0.950504547373793, - "learning_rate": 3.7330422317447686e-06, - "loss": 0.1413, + "epoch": 0.043211867139632376, + "grad_norm": 1.1683549358961656, + "learning_rate": 4.318600368324126e-06, + "loss": 0.2148, "step": 469 }, { - "epoch": 0.36993309720582446, - "grad_norm": 1.00490564394254, - "learning_rate": 3.7273728566221447e-06, - "loss": 0.1539, + "epoch": 0.04330400331690238, + "grad_norm": 1.2522700216036091, + "learning_rate": 4.327808471454881e-06, + "loss": 0.2164, "step": 470 }, { - "epoch": 0.3707201889020071, - "grad_norm": 1.0241073155182219, - "learning_rate": 3.721695152719364e-06, - "loss": 0.1505, + "epoch": 0.043396139494172384, + "grad_norm": 1.2387500928446844, + "learning_rate": 4.3370165745856355e-06, + "loss": 0.2054, "step": 471 }, { - "epoch": 0.3715072805981897, - "grad_norm": 1.0650129974030413, - "learning_rate": 3.716009158564528e-06, - "loss": 0.1517, + "epoch": 0.043488275671442395, + "grad_norm": 1.1954111876349953, + "learning_rate": 4.3462246777163904e-06, + "loss": 0.2119, "step": 472 }, { - "epoch": 0.3722943722943723, - "grad_norm": 1.0412500508709561, - "learning_rate": 3.710314912741997e-06, - "loss": 0.1447, + "epoch": 0.0435804118487124, + "grad_norm": 1.6231312157976416, + "learning_rate": 4.355432780847146e-06, + "loss": 0.2158, "step": 473 }, { - "epoch": 0.3730814639905549, - "grad_norm": 1.0273490151395026, - "learning_rate": 3.7046124538921237e-06, - "loss": 0.1429, + "epoch": 0.0436725480259824, + "grad_norm": 1.3358159604896864, + "learning_rate": 4.364640883977901e-06, + "loss": 0.2089, "step": 474 }, { - "epoch": 0.3738685556867375, - "grad_norm": 0.9952543111661871, - "learning_rate": 3.698901820710995e-06, - "loss": 0.1418, + "epoch": 0.043764684203252406, + "grad_norm": 1.4873292661852786, + "learning_rate": 4.373848987108656e-06, + "loss": 0.2184, "step": 475 }, { - "epoch": 0.3746556473829201, - "grad_norm": 1.0824700054534682, - "learning_rate": 3.693183051950168e-06, - "loss": 0.1437, + "epoch": 0.04385682038052241, + "grad_norm": 1.5926804326802442, + "learning_rate": 4.383057090239411e-06, + "loss": 0.2198, "step": 476 }, { - "epoch": 0.3754427390791027, - "grad_norm": 1.0142752196453109, - "learning_rate": 3.6874561864164056e-06, - "loss": 0.1435, + "epoch": 0.043948956557792414, + "grad_norm": 1.1544457982412275, + "learning_rate": 4.392265193370166e-06, + "loss": 0.2047, "step": 477 }, { - "epoch": 0.3762298307752853, - "grad_norm": 0.9888106276082754, - "learning_rate": 3.6817212629714135e-06, - "loss": 0.1395, + "epoch": 0.044041092735062425, + "grad_norm": 1.3790644801344767, + "learning_rate": 4.401473296500921e-06, + "loss": 0.2104, "step": 478 }, { - "epoch": 0.37701692247146795, - "grad_norm": 0.9673698851206235, - "learning_rate": 3.675978320531579e-06, - "loss": 0.1425, + "epoch": 0.04413322891233243, + "grad_norm": 1.4144109682196038, + "learning_rate": 4.4106813996316765e-06, + "loss": 0.2169, "step": 479 }, { - "epoch": 0.3778040141676505, - "grad_norm": 1.096283920214562, - "learning_rate": 3.670227398067705e-06, - "loss": 0.1515, + "epoch": 0.04422536508960243, + "grad_norm": 1.235543716580457, + "learning_rate": 4.419889502762431e-06, + "loss": 0.2173, "step": 480 }, { - "epoch": 0.37859110586383315, - "grad_norm": 1.0303413811027284, - "learning_rate": 3.664468534604745e-06, - "loss": 0.1462, + "epoch": 0.044317501266872436, + "grad_norm": 1.2283735168259708, + "learning_rate": 4.429097605893186e-06, + "loss": 0.1962, "step": 481 }, { - "epoch": 0.37937819756001573, - "grad_norm": 0.9550517801003708, - "learning_rate": 3.6587017692215387e-06, - "loss": 0.1483, + "epoch": 0.04440963744414244, + "grad_norm": 1.4283017557499813, + "learning_rate": 4.438305709023941e-06, + "loss": 0.218, "step": 482 }, { - "epoch": 0.38016528925619836, - "grad_norm": 1.0499765951347195, - "learning_rate": 3.6529271410505483e-06, - "loss": 0.1516, + "epoch": 0.04450177362141245, + "grad_norm": 1.4867664072627964, + "learning_rate": 4.447513812154696e-06, + "loss": 0.2183, "step": 483 }, { - "epoch": 0.38095238095238093, - "grad_norm": 1.0612285971687154, - "learning_rate": 3.6471446892775896e-06, - "loss": 0.145, + "epoch": 0.044593909798682455, + "grad_norm": 1.4108802496691264, + "learning_rate": 4.456721915285452e-06, + "loss": 0.2013, "step": 484 }, { - "epoch": 0.38173947264856356, - "grad_norm": 0.9976574649139153, - "learning_rate": 3.6413544531415712e-06, - "loss": 0.1493, + "epoch": 0.04468604597595246, + "grad_norm": 1.166986570586516, + "learning_rate": 4.465930018416207e-06, + "loss": 0.2002, "step": 485 }, { - "epoch": 0.3825265643447462, - "grad_norm": 1.011974051278155, - "learning_rate": 3.635556471934224e-06, - "loss": 0.1557, + "epoch": 0.04477818215322246, + "grad_norm": 1.1648717298249538, + "learning_rate": 4.475138121546962e-06, + "loss": 0.2209, "step": 486 }, { - "epoch": 0.38331365604092876, - "grad_norm": 1.015959048224715, - "learning_rate": 3.629750784999835e-06, - "loss": 0.152, + "epoch": 0.044870318330492466, + "grad_norm": 1.2876822750663734, + "learning_rate": 4.484346224677717e-06, + "loss": 0.2046, "step": 487 }, { - "epoch": 0.3841007477371114, - "grad_norm": 0.9638439392236781, - "learning_rate": 3.623937431734982e-06, - "loss": 0.1464, + "epoch": 0.04496245450776247, + "grad_norm": 1.2801973779891336, + "learning_rate": 4.493554327808472e-06, + "loss": 0.2199, "step": 488 }, { - "epoch": 0.38488783943329397, - "grad_norm": 0.9820530085625633, - "learning_rate": 3.6181164515882663e-06, - "loss": 0.1468, + "epoch": 0.04505459068503248, + "grad_norm": 1.298576742843075, + "learning_rate": 4.502762430939227e-06, + "loss": 0.2117, "step": 489 }, { - "epoch": 0.3856749311294766, - "grad_norm": 0.9281524539517508, - "learning_rate": 3.6122878840600417e-06, - "loss": 0.1451, + "epoch": 0.045146726862302484, + "grad_norm": 1.3131277756849138, + "learning_rate": 4.511970534069982e-06, + "loss": 0.2126, "step": 490 }, { - "epoch": 0.38646202282565917, - "grad_norm": 1.039305922376239, - "learning_rate": 3.606451768702151e-06, - "loss": 0.1486, + "epoch": 0.04523886303957249, + "grad_norm": 1.163802616612496, + "learning_rate": 4.521178637200737e-06, + "loss": 0.2079, "step": 491 }, { - "epoch": 0.3872491145218418, - "grad_norm": 1.026987888426606, - "learning_rate": 3.600608145117656e-06, - "loss": 0.1381, + "epoch": 0.04533099921684249, + "grad_norm": 1.223443299674731, + "learning_rate": 4.530386740331492e-06, + "loss": 0.2047, "step": 492 }, { - "epoch": 0.3880362062180244, - "grad_norm": 1.058827889093346, - "learning_rate": 3.594757052960566e-06, - "loss": 0.1555, + "epoch": 0.045423135394112496, + "grad_norm": 1.183473280850051, + "learning_rate": 4.539594843462248e-06, + "loss": 0.1852, "step": 493 }, { - "epoch": 0.388823297914207, - "grad_norm": 1.0027016575115129, - "learning_rate": 3.588898531935573e-06, - "loss": 0.1413, + "epoch": 0.04551527157138251, + "grad_norm": 1.1651402187049773, + "learning_rate": 4.5488029465930025e-06, + "loss": 0.2038, "step": 494 }, { - "epoch": 0.38961038961038963, - "grad_norm": 1.0766471714794614, - "learning_rate": 3.583032621797778e-06, - "loss": 0.1418, + "epoch": 0.04560740774865251, + "grad_norm": 1.1460140058645047, + "learning_rate": 4.5580110497237574e-06, + "loss": 0.196, "step": 495 }, { - "epoch": 0.3903974813065722, - "grad_norm": 1.0326313481110534, - "learning_rate": 3.5771593623524263e-06, - "loss": 0.1345, + "epoch": 0.045699543925922514, + "grad_norm": 1.1776138222171473, + "learning_rate": 4.567219152854512e-06, + "loss": 0.1995, "step": 496 }, { - "epoch": 0.39118457300275483, - "grad_norm": 0.9649958546178075, - "learning_rate": 3.5712787934546336e-06, - "loss": 0.1397, + "epoch": 0.04579168010319252, + "grad_norm": 1.2204020301333764, + "learning_rate": 4.576427255985267e-06, + "loss": 0.2143, "step": 497 }, { - "epoch": 0.3919716646989374, - "grad_norm": 1.0461258832079, - "learning_rate": 3.5653909550091138e-06, - "loss": 0.16, + "epoch": 0.04588381628046252, + "grad_norm": 1.205241368495962, + "learning_rate": 4.585635359116022e-06, + "loss": 0.1979, "step": 498 }, { - "epoch": 0.39275875639512003, - "grad_norm": 0.9741702004779168, - "learning_rate": 3.559495886969916e-06, - "loss": 0.1366, + "epoch": 0.045975952457732526, + "grad_norm": 1.1734468213424174, + "learning_rate": 4.594843462246777e-06, + "loss": 0.2078, "step": 499 }, { - "epoch": 0.39354584809130266, - "grad_norm": 0.9875184829637668, - "learning_rate": 3.553593629340144e-06, - "loss": 0.1391, + "epoch": 0.04606808863500254, + "grad_norm": 1.1082444575732158, + "learning_rate": 4.604051565377533e-06, + "loss": 0.1961, "step": 500 }, { - "epoch": 0.39354584809130266, - "eval_loss": 0.14773064851760864, - "eval_runtime": 18.0322, - "eval_samples_per_second": 45.585, - "eval_steps_per_second": 5.712, + "epoch": 0.04606808863500254, + "eval_loss": 0.20690900087356567, + "eval_runtime": 299.5863, + "eval_samples_per_second": 23.422, + "eval_steps_per_second": 2.931, "step": 500 }, { - "epoch": 0.39433293978748524, - "grad_norm": 0.9563741831859393, - "learning_rate": 3.5476842221716915e-06, - "loss": 0.1453, + "epoch": 0.04616022481227254, + "grad_norm": 1.2088449274977096, + "learning_rate": 4.613259668508288e-06, + "loss": 0.2047, "step": 501 }, { - "epoch": 0.39512003148366787, - "grad_norm": 0.9839000041167648, - "learning_rate": 3.541767705564967e-06, - "loss": 0.1509, + "epoch": 0.046252360989542544, + "grad_norm": 1.142903312485517, + "learning_rate": 4.622467771639043e-06, + "loss": 0.2019, "step": 502 }, { - "epoch": 0.39590712317985044, - "grad_norm": 0.9666175985112762, - "learning_rate": 3.535844119668622e-06, - "loss": 0.1436, + "epoch": 0.04634449716681255, + "grad_norm": 1.2256105445368588, + "learning_rate": 4.6316758747697975e-06, + "loss": 0.1987, "step": 503 }, { - "epoch": 0.39669421487603307, - "grad_norm": 1.0513295542177603, - "learning_rate": 3.5299135046792816e-06, - "loss": 0.1371, + "epoch": 0.04643663334408255, + "grad_norm": 1.1904513269582884, + "learning_rate": 4.640883977900552e-06, + "loss": 0.2127, "step": 504 }, { - "epoch": 0.39748130657221564, - "grad_norm": 1.0136623338528887, - "learning_rate": 3.5239759008412666e-06, - "loss": 0.1498, + "epoch": 0.046528769521352556, + "grad_norm": 1.1457301085346001, + "learning_rate": 4.650092081031307e-06, + "loss": 0.197, "step": 505 }, { - "epoch": 0.39826839826839827, - "grad_norm": 0.9764920494655156, - "learning_rate": 3.518031348446324e-06, - "loss": 0.1371, + "epoch": 0.04662090569862257, + "grad_norm": 1.3077547506039735, + "learning_rate": 4.659300184162063e-06, + "loss": 0.1919, "step": 506 }, { - "epoch": 0.3990554899645809, - "grad_norm": 1.0113031849627157, - "learning_rate": 3.5120798878333544e-06, - "loss": 0.1453, + "epoch": 0.04671304187589257, + "grad_norm": 1.0715902844691318, + "learning_rate": 4.668508287292818e-06, + "loss": 0.1938, "step": 507 }, { - "epoch": 0.3998425816607635, - "grad_norm": 0.9947509560502654, - "learning_rate": 3.506121559388135e-06, - "loss": 0.1233, + "epoch": 0.046805178053162574, + "grad_norm": 1.1850578820772626, + "learning_rate": 4.677716390423574e-06, + "loss": 0.2025, "step": 508 }, { - "epoch": 0.4006296733569461, - "grad_norm": 1.1135464243984814, - "learning_rate": 3.500156403543046e-06, - "loss": 0.151, + "epoch": 0.04689731423043258, + "grad_norm": 1.4073409971969546, + "learning_rate": 4.686924493554329e-06, + "loss": 0.2144, "step": 509 }, { - "epoch": 0.4014167650531287, - "grad_norm": 1.0687025563863246, - "learning_rate": 3.4941844607768007e-06, - "loss": 0.1384, + "epoch": 0.04698945040770258, + "grad_norm": 1.2622270185118862, + "learning_rate": 4.6961325966850835e-06, + "loss": 0.2194, "step": 510 }, { - "epoch": 0.4022038567493113, - "grad_norm": 0.9654525860741724, - "learning_rate": 3.488205771614164e-06, - "loss": 0.1348, + "epoch": 0.04708158658497259, + "grad_norm": 1.2236474583396226, + "learning_rate": 4.7053406998158384e-06, + "loss": 0.2092, "step": 511 }, { - "epoch": 0.4029909484454939, - "grad_norm": 1.07357744190682, - "learning_rate": 3.4822203766256834e-06, - "loss": 0.1412, + "epoch": 0.0471737227622426, + "grad_norm": 1.1348264980475313, + "learning_rate": 4.714548802946593e-06, + "loss": 0.2052, "step": 512 }, { - "epoch": 0.4037780401416765, - "grad_norm": 1.2491546536330014, - "learning_rate": 3.4762283164274104e-06, - "loss": 0.1523, + "epoch": 0.0472658589395126, + "grad_norm": 1.3406440516081668, + "learning_rate": 4.723756906077348e-06, + "loss": 0.207, "step": 513 }, { - "epoch": 0.40456513183785914, - "grad_norm": 1.0398955239354635, - "learning_rate": 3.4702296316806243e-06, - "loss": 0.1507, + "epoch": 0.047357995116782604, + "grad_norm": 1.446315027401559, + "learning_rate": 4.732965009208104e-06, + "loss": 0.2169, "step": 514 }, { - "epoch": 0.4053522235340417, - "grad_norm": 0.947562520308943, - "learning_rate": 3.4642243630915606e-06, - "loss": 0.1486, + "epoch": 0.04745013129405261, + "grad_norm": 1.2350193328918795, + "learning_rate": 4.742173112338859e-06, + "loss": 0.2123, "step": 515 }, { - "epoch": 0.40613931523022434, - "grad_norm": 0.9405204018759319, - "learning_rate": 3.45821255141113e-06, - "loss": 0.1287, + "epoch": 0.04754226747132261, + "grad_norm": 1.3453002863844115, + "learning_rate": 4.751381215469614e-06, + "loss": 0.2213, "step": 516 }, { - "epoch": 0.4069264069264069, - "grad_norm": 1.01025400774114, - "learning_rate": 3.452194237434642e-06, - "loss": 0.1349, + "epoch": 0.04763440364859262, + "grad_norm": 1.3208799605546395, + "learning_rate": 4.760589318600369e-06, + "loss": 0.2248, "step": 517 }, { - "epoch": 0.40771349862258954, - "grad_norm": 1.0404932578099988, - "learning_rate": 3.446169462001534e-06, - "loss": 0.1508, + "epoch": 0.04772653982586263, + "grad_norm": 1.197417531741685, + "learning_rate": 4.769797421731124e-06, + "loss": 0.2067, "step": 518 }, { - "epoch": 0.4085005903187721, - "grad_norm": 1.029425420995215, - "learning_rate": 3.4401382659950868e-06, - "loss": 0.1362, + "epoch": 0.04781867600313263, + "grad_norm": 1.166778788170299, + "learning_rate": 4.7790055248618785e-06, + "loss": 0.2121, "step": 519 }, { - "epoch": 0.40928768201495475, - "grad_norm": 1.025768159905711, - "learning_rate": 3.4341006903421493e-06, - "loss": 0.1437, + "epoch": 0.047910812180402634, + "grad_norm": 1.2056681536807465, + "learning_rate": 4.788213627992634e-06, + "loss": 0.2048, "step": 520 }, { - "epoch": 0.4100747737111374, - "grad_norm": 0.9507044448226175, - "learning_rate": 3.4280567760128658e-06, - "loss": 0.1393, + "epoch": 0.04800294835767264, + "grad_norm": 1.1862496944445569, + "learning_rate": 4.797421731123389e-06, + "loss": 0.2052, "step": 521 }, { - "epoch": 0.41086186540731995, - "grad_norm": 1.0374082813027519, - "learning_rate": 3.4220065640203916e-06, - "loss": 0.16, + "epoch": 0.04809508453494264, + "grad_norm": 1.1650915820727847, + "learning_rate": 4.806629834254144e-06, + "loss": 0.2037, "step": 522 }, { - "epoch": 0.4116489571035026, - "grad_norm": 0.9378353888939086, - "learning_rate": 3.415950095420616e-06, - "loss": 0.1355, + "epoch": 0.04818722071221265, + "grad_norm": 1.0997406935439868, + "learning_rate": 4.815837937384899e-06, + "loss": 0.2162, "step": 523 }, { - "epoch": 0.41243604879968515, - "grad_norm": 0.924561930587711, - "learning_rate": 3.4098874113118863e-06, - "loss": 0.1452, + "epoch": 0.04827935688948266, + "grad_norm": 1.142599490179962, + "learning_rate": 4.825046040515654e-06, + "loss": 0.1947, "step": 524 }, { - "epoch": 0.4132231404958678, - "grad_norm": 0.9505049489489825, - "learning_rate": 3.403818552834727e-06, - "loss": 0.1448, + "epoch": 0.04837149306675266, + "grad_norm": 1.3182614669715278, + "learning_rate": 4.834254143646409e-06, + "loss": 0.2224, "step": 525 }, { - "epoch": 0.41401023219205035, - "grad_norm": 0.9701870488491394, - "learning_rate": 3.397743561171562e-06, - "loss": 0.1341, + "epoch": 0.048463629244022664, + "grad_norm": 1.1006337099413035, + "learning_rate": 4.8434622467771645e-06, + "loss": 0.2104, "step": 526 }, { - "epoch": 0.414797323888233, - "grad_norm": 0.9122288876708122, - "learning_rate": 3.3916624775464318e-06, - "loss": 0.1291, + "epoch": 0.04855576542129267, + "grad_norm": 1.2851582791992746, + "learning_rate": 4.852670349907919e-06, + "loss": 0.2112, "step": 527 }, { - "epoch": 0.4155844155844156, - "grad_norm": 0.985150804267496, - "learning_rate": 3.385575343224718e-06, - "loss": 0.141, + "epoch": 0.04864790159856268, + "grad_norm": 1.226468392041963, + "learning_rate": 4.861878453038674e-06, + "loss": 0.2075, "step": 528 }, { - "epoch": 0.4163715072805982, - "grad_norm": 0.9910844190276262, - "learning_rate": 3.3794821995128606e-06, - "loss": 0.1473, + "epoch": 0.04874003777583268, + "grad_norm": 1.1605259603239653, + "learning_rate": 4.871086556169429e-06, + "loss": 0.2143, "step": 529 }, { - "epoch": 0.4171585989767808, - "grad_norm": 0.9925292173111532, - "learning_rate": 3.3733830877580796e-06, - "loss": 0.1492, + "epoch": 0.04883217395310269, + "grad_norm": 1.1872303036487082, + "learning_rate": 4.880294659300184e-06, + "loss": 0.2131, "step": 530 }, { - "epoch": 0.4179456906729634, - "grad_norm": 0.9483537804421872, - "learning_rate": 3.3672780493480927e-06, - "loss": 0.1476, + "epoch": 0.04892431013037269, + "grad_norm": 1.2324441915157167, + "learning_rate": 4.889502762430939e-06, + "loss": 0.2198, "step": 531 }, { - "epoch": 0.418732782369146, - "grad_norm": 0.9716970355806354, - "learning_rate": 3.3611671257108323e-06, - "loss": 0.1288, + "epoch": 0.049016446307642694, + "grad_norm": 1.084873626256991, + "learning_rate": 4.898710865561695e-06, + "loss": 0.1981, "step": 532 }, { - "epoch": 0.4195198740653286, - "grad_norm": 1.0525983321400059, - "learning_rate": 3.3550503583141726e-06, - "loss": 0.1541, + "epoch": 0.0491085824849127, + "grad_norm": 1.0866645738744949, + "learning_rate": 4.90791896869245e-06, + "loss": 0.2105, "step": 533 }, { - "epoch": 0.4203069657615112, - "grad_norm": 0.9476841095185634, - "learning_rate": 3.3489277886656373e-06, - "loss": 0.1395, + "epoch": 0.04920071866218271, + "grad_norm": 1.2290547614755443, + "learning_rate": 4.9171270718232054e-06, + "loss": 0.2274, "step": 534 }, { - "epoch": 0.42109405745769385, - "grad_norm": 0.8883884320293254, - "learning_rate": 3.342799458312127e-06, - "loss": 0.1374, + "epoch": 0.04929285483945271, + "grad_norm": 1.1524273585439213, + "learning_rate": 4.92633517495396e-06, + "loss": 0.2047, "step": 535 }, { - "epoch": 0.4218811491538764, - "grad_norm": 1.026818858865084, - "learning_rate": 3.336665408839633e-06, - "loss": 0.1413, + "epoch": 0.04938499101672272, + "grad_norm": 1.061747508568193, + "learning_rate": 4.935543278084715e-06, + "loss": 0.1734, "step": 536 }, { - "epoch": 0.42266824085005905, - "grad_norm": 0.9146805645048051, - "learning_rate": 3.330525681872954e-06, - "loss": 0.1352, + "epoch": 0.04947712719399272, + "grad_norm": 1.2916494032401462, + "learning_rate": 4.94475138121547e-06, + "loss": 0.2168, "step": 537 }, { - "epoch": 0.4234553325462416, - "grad_norm": 1.0439955820386841, - "learning_rate": 3.3243803190754166e-06, - "loss": 0.1482, + "epoch": 0.049569263371262724, + "grad_norm": 1.269578911667256, + "learning_rate": 4.953959484346225e-06, + "loss": 0.2109, "step": 538 }, { - "epoch": 0.42424242424242425, - "grad_norm": 0.9964413472110166, - "learning_rate": 3.3182293621485923e-06, - "loss": 0.1524, + "epoch": 0.04966139954853273, + "grad_norm": 1.378804388434808, + "learning_rate": 4.96316758747698e-06, + "loss": 0.2091, "step": 539 }, { - "epoch": 0.42502951593860683, - "grad_norm": 0.9626977177442709, - "learning_rate": 3.312072852832012e-06, - "loss": 0.1427, + "epoch": 0.04975353572580274, + "grad_norm": 1.1049708603346846, + "learning_rate": 4.972375690607736e-06, + "loss": 0.2072, "step": 540 }, { - "epoch": 0.42581660763478946, - "grad_norm": 0.8939878261316884, - "learning_rate": 3.3059108329028845e-06, - "loss": 0.1283, + "epoch": 0.04984567190307274, + "grad_norm": 1.2404672009645543, + "learning_rate": 4.981583793738491e-06, + "loss": 0.2066, "step": 541 }, { - "epoch": 0.4266036993309721, - "grad_norm": 1.0155176108909485, - "learning_rate": 3.299743344175814e-06, - "loss": 0.1434, + "epoch": 0.04993780808034275, + "grad_norm": 1.1799110644655362, + "learning_rate": 4.9907918968692455e-06, + "loss": 0.2128, "step": 542 }, { - "epoch": 0.42739079102715466, - "grad_norm": 0.983969699589635, - "learning_rate": 3.293570428502515e-06, - "loss": 0.1479, + "epoch": 0.05002994425761275, + "grad_norm": 1.1896138346028056, + "learning_rate": 5e-06, + "loss": 0.2015, "step": 543 }, { - "epoch": 0.4281778827233373, - "grad_norm": 1.006021089515589, - "learning_rate": 3.287392127771526e-06, - "loss": 0.1386, + "epoch": 0.050122080434882754, + "grad_norm": 1.2225805533773542, + "learning_rate": 4.999999883937366e-06, + "loss": 0.213, "step": 544 }, { - "epoch": 0.42896497441951986, - "grad_norm": 0.9594215523929834, - "learning_rate": 3.2812084839079316e-06, - "loss": 0.1326, + "epoch": 0.050214216612152765, + "grad_norm": 1.2114568339356038, + "learning_rate": 4.999999535749473e-06, + "loss": 0.2064, "step": 545 }, { - "epoch": 0.4297520661157025, - "grad_norm": 0.958170693889915, - "learning_rate": 3.275019538873071e-06, - "loss": 0.1418, + "epoch": 0.05030635278942277, + "grad_norm": 1.1217499299052078, + "learning_rate": 4.999998955436354e-06, + "loss": 0.2024, "step": 546 }, { - "epoch": 0.43053915781188506, - "grad_norm": 1.0256772435691563, - "learning_rate": 3.268825334664259e-06, - "loss": 0.1526, + "epoch": 0.05039848896669277, + "grad_norm": 1.133686745541808, + "learning_rate": 4.999998142998064e-06, + "loss": 0.2001, "step": 547 }, { - "epoch": 0.4313262495080677, - "grad_norm": 1.0552921930006323, - "learning_rate": 3.2626259133144955e-06, - "loss": 0.1441, + "epoch": 0.050490625143962777, + "grad_norm": 1.2440473687463243, + "learning_rate": 4.999997098434676e-06, + "loss": 0.2089, "step": 548 }, { - "epoch": 0.43211334120425027, - "grad_norm": 1.0605556822333475, - "learning_rate": 3.2564213168921867e-06, - "loss": 0.1431, + "epoch": 0.05058276132123278, + "grad_norm": 1.1784492874879233, + "learning_rate": 4.999995821746289e-06, + "loss": 0.2084, "step": 549 }, { - "epoch": 0.4329004329004329, - "grad_norm": 0.982864648139079, - "learning_rate": 3.2502115875008523e-06, - "loss": 0.149, + "epoch": 0.050674897498502784, + "grad_norm": 1.188597001170747, + "learning_rate": 4.9999943129330204e-06, + "loss": 0.2187, "step": 550 }, { - "epoch": 0.4336875245966155, - "grad_norm": 1.0049025924744737, - "learning_rate": 3.2439967672788462e-06, - "loss": 0.1334, + "epoch": 0.050767033675772795, + "grad_norm": 1.295042393574803, + "learning_rate": 4.999992571995011e-06, + "loss": 0.1944, "step": 551 }, { - "epoch": 0.4344746162927981, - "grad_norm": 0.9263761254409442, - "learning_rate": 3.2377768983990677e-06, - "loss": 0.1401, + "epoch": 0.0508591698530428, + "grad_norm": 1.0516405339241583, + "learning_rate": 4.999990598932423e-06, + "loss": 0.2001, "step": 552 }, { - "epoch": 0.43526170798898073, - "grad_norm": 1.0390883988019344, - "learning_rate": 3.2315520230686747e-06, - "loss": 0.1493, + "epoch": 0.0509513060303128, + "grad_norm": 1.0917727740604706, + "learning_rate": 4.999988393745438e-06, + "loss": 0.2018, "step": 553 }, { - "epoch": 0.4360487996851633, - "grad_norm": 0.9913895430005143, - "learning_rate": 3.2253221835287984e-06, - "loss": 0.1406, + "epoch": 0.051043442207582806, + "grad_norm": 1.2785884522845907, + "learning_rate": 4.999985956434263e-06, + "loss": 0.2369, "step": 554 }, { - "epoch": 0.43683589138134593, - "grad_norm": 0.9801664753977715, - "learning_rate": 3.2190874220542577e-06, - "loss": 0.1341, + "epoch": 0.05113557838485281, + "grad_norm": 1.1693288663566566, + "learning_rate": 4.999983286999121e-06, + "loss": 0.1985, "step": 555 }, { - "epoch": 0.4376229830775285, - "grad_norm": 0.9519041413125566, - "learning_rate": 3.2128477809532687e-06, - "loss": 0.1469, + "epoch": 0.051227714562122814, + "grad_norm": 1.1206519528655148, + "learning_rate": 4.999980385440262e-06, + "loss": 0.2056, "step": 556 }, { - "epoch": 0.43841007477371113, - "grad_norm": 1.0289764585305627, - "learning_rate": 3.2066033025671612e-06, - "loss": 0.1473, + "epoch": 0.051319850739392825, + "grad_norm": 1.1626004840751911, + "learning_rate": 4.999977251757956e-06, + "loss": 0.2036, "step": 557 }, { - "epoch": 0.43919716646989376, - "grad_norm": 1.0005689404595521, - "learning_rate": 3.200354029270091e-06, - "loss": 0.1477, + "epoch": 0.05141198691666283, + "grad_norm": 1.326436529000634, + "learning_rate": 4.999973885952492e-06, + "loss": 0.2105, "step": 558 }, { - "epoch": 0.43998425816607634, - "grad_norm": 1.056463362355424, - "learning_rate": 3.1941000034687516e-06, - "loss": 0.1488, + "epoch": 0.05150412309393283, + "grad_norm": 1.1994189863044933, + "learning_rate": 4.9999702880241855e-06, + "loss": 0.2136, "step": 559 }, { - "epoch": 0.44077134986225897, - "grad_norm": 0.9574144607007496, - "learning_rate": 3.187841267602084e-06, - "loss": 0.1445, + "epoch": 0.051596259271202836, + "grad_norm": 1.2267888006027625, + "learning_rate": 4.999966457973367e-06, + "loss": 0.2173, "step": 560 }, { - "epoch": 0.44155844155844154, - "grad_norm": 0.9562953543913302, - "learning_rate": 3.1815778641409924e-06, - "loss": 0.1414, + "epoch": 0.05168839544847284, + "grad_norm": 1.1688358053384447, + "learning_rate": 4.999962395800395e-06, + "loss": 0.2334, "step": 561 }, { - "epoch": 0.44234553325462417, - "grad_norm": 0.9444651486667015, - "learning_rate": 3.1753098355880557e-06, - "loss": 0.138, + "epoch": 0.05178053162574285, + "grad_norm": 1.3029073613643016, + "learning_rate": 4.999958101505645e-06, + "loss": 0.2071, "step": 562 }, { - "epoch": 0.44313262495080674, - "grad_norm": 0.9465351053953429, - "learning_rate": 3.169037224477236e-06, - "loss": 0.1437, + "epoch": 0.051872667803012855, + "grad_norm": 1.1677420008318726, + "learning_rate": 4.999953575089516e-06, + "loss": 0.2165, "step": 563 }, { - "epoch": 0.44391971664698937, - "grad_norm": 1.0206836940486426, - "learning_rate": 3.162760073373594e-06, - "loss": 0.1411, + "epoch": 0.05196480398028286, + "grad_norm": 1.116469178905927, + "learning_rate": 4.999948816552429e-06, + "loss": 0.2057, "step": 564 }, { - "epoch": 0.444706808343172, - "grad_norm": 1.0878905236564318, - "learning_rate": 3.1564784248729965e-06, - "loss": 0.1408, + "epoch": 0.05205694015755286, + "grad_norm": 1.3097286700249924, + "learning_rate": 4.999943825894825e-06, + "loss": 0.2254, "step": 565 }, { - "epoch": 0.4454939000393546, - "grad_norm": 1.0130102955883906, - "learning_rate": 3.15019232160183e-06, - "loss": 0.1428, + "epoch": 0.052149076334822866, + "grad_norm": 1.1322032799911372, + "learning_rate": 4.999938603117167e-06, + "loss": 0.2032, "step": 566 }, { - "epoch": 0.4462809917355372, - "grad_norm": 0.980086016231054, - "learning_rate": 3.1439018062167092e-06, - "loss": 0.143, + "epoch": 0.05224121251209287, + "grad_norm": 1.2103922176557846, + "learning_rate": 4.999933148219942e-06, + "loss": 0.2353, "step": 567 }, { - "epoch": 0.4470680834317198, - "grad_norm": 1.0249915137559014, - "learning_rate": 3.1376069214041917e-06, - "loss": 0.1471, + "epoch": 0.05233334868936288, + "grad_norm": 1.3405882785620524, + "learning_rate": 4.999927461203654e-06, + "loss": 0.2122, "step": 568 }, { - "epoch": 0.4478551751279024, - "grad_norm": 1.1016327132095007, - "learning_rate": 3.1313077098804817e-06, - "loss": 0.1606, + "epoch": 0.052425484866632885, + "grad_norm": 1.1357985528866983, + "learning_rate": 4.999921542068833e-06, + "loss": 0.2023, "step": 569 }, { - "epoch": 0.448642266824085, - "grad_norm": 1.0411771801722989, - "learning_rate": 3.1250042143911462e-06, - "loss": 0.1499, + "epoch": 0.05251762104390289, + "grad_norm": 1.1622773497726775, + "learning_rate": 4.9999153908160285e-06, + "loss": 0.1914, "step": 570 }, { - "epoch": 0.4494293585202676, - "grad_norm": 1.0122030093902548, - "learning_rate": 3.118696477710822e-06, - "loss": 0.141, + "epoch": 0.05260975722117289, + "grad_norm": 1.2580747807039376, + "learning_rate": 4.999909007445809e-06, + "loss": 0.2155, "step": 571 }, { - "epoch": 0.45021645021645024, - "grad_norm": 1.0708872672849516, - "learning_rate": 3.1123845426429265e-06, - "loss": 0.128, + "epoch": 0.052701893398442896, + "grad_norm": 1.1608877424155948, + "learning_rate": 4.99990239195877e-06, + "loss": 0.2156, "step": 572 }, { - "epoch": 0.4510035419126328, - "grad_norm": 1.029737403462412, - "learning_rate": 3.106068452019365e-06, - "loss": 0.1383, + "epoch": 0.0527940295757129, + "grad_norm": 1.1581171779291344, + "learning_rate": 4.999895544355525e-06, + "loss": 0.2128, "step": 573 }, { - "epoch": 0.45179063360881544, - "grad_norm": 0.9988296671107193, - "learning_rate": 3.099748248700245e-06, - "loss": 0.1376, + "epoch": 0.05288616575298291, + "grad_norm": 1.153474177296958, + "learning_rate": 4.9998884646367094e-06, + "loss": 0.1973, "step": 574 }, { - "epoch": 0.452577725304998, - "grad_norm": 1.0475513726672416, - "learning_rate": 3.0934239755735782e-06, - "loss": 0.1355, + "epoch": 0.052978301930252915, + "grad_norm": 1.1874129378072187, + "learning_rate": 4.999881152802981e-06, + "loss": 0.2063, "step": 575 }, { - "epoch": 0.45336481700118064, - "grad_norm": 1.0654745191838768, - "learning_rate": 3.0870956755549973e-06, - "loss": 0.143, + "epoch": 0.05307043810752292, + "grad_norm": 1.0913852863675626, + "learning_rate": 4.999873608855019e-06, + "loss": 0.2013, "step": 576 }, { - "epoch": 0.4541519086973632, - "grad_norm": 0.9397526290083124, - "learning_rate": 3.0807633915874585e-06, - "loss": 0.1406, + "epoch": 0.05316257428479292, + "grad_norm": 1.124914712901831, + "learning_rate": 4.999865832793522e-06, + "loss": 0.2111, "step": 577 }, { - "epoch": 0.45493900039354584, - "grad_norm": 1.052837564760308, - "learning_rate": 3.0744271666409526e-06, - "loss": 0.1454, + "epoch": 0.053254710462062926, + "grad_norm": 1.1644687308281916, + "learning_rate": 4.9998578246192155e-06, + "loss": 0.2059, "step": 578 }, { - "epoch": 0.4557260920897285, - "grad_norm": 1.1289865006459998, - "learning_rate": 3.0680870437122145e-06, - "loss": 0.1554, + "epoch": 0.05334684663933294, + "grad_norm": 1.2942615223555596, + "learning_rate": 4.9998495843328385e-06, + "loss": 0.2221, "step": 579 }, { - "epoch": 0.45651318378591105, - "grad_norm": 0.9614320131595296, - "learning_rate": 3.0617430658244295e-06, - "loss": 0.1368, + "epoch": 0.05343898281660294, + "grad_norm": 1.123179821863574, + "learning_rate": 4.9998411119351605e-06, + "loss": 0.2102, "step": 580 }, { - "epoch": 0.4573002754820937, - "grad_norm": 0.9849943444472453, - "learning_rate": 3.0553952760269427e-06, - "loss": 0.1372, + "epoch": 0.053531118993872945, + "grad_norm": 1.2273638033386287, + "learning_rate": 4.999832407426966e-06, + "loss": 0.215, "step": 581 }, { - "epoch": 0.45808736717827625, - "grad_norm": 0.9938446057985301, - "learning_rate": 3.0490437173949656e-06, - "loss": 0.1397, + "epoch": 0.05362325517114295, + "grad_norm": 1.2098380897181231, + "learning_rate": 4.999823470809062e-06, + "loss": 0.2148, "step": 582 }, { - "epoch": 0.4588744588744589, - "grad_norm": 0.9430590894578916, - "learning_rate": 3.0426884330292844e-06, - "loss": 0.1404, + "epoch": 0.05371539134841295, + "grad_norm": 0.9746673941052318, + "learning_rate": 4.999814302082281e-06, + "loss": 0.1878, "step": 583 }, { - "epoch": 0.45966155057064145, - "grad_norm": 0.8998337127762756, - "learning_rate": 3.0363294660559685e-06, - "loss": 0.133, + "epoch": 0.053807527525682956, + "grad_norm": 1.0967724336422364, + "learning_rate": 4.999804901247472e-06, + "loss": 0.2021, "step": 584 }, { - "epoch": 0.4604486422668241, - "grad_norm": 0.9469777276964015, - "learning_rate": 3.0299668596260755e-06, - "loss": 0.1429, + "epoch": 0.05389966370295297, + "grad_norm": 1.2264527641340204, + "learning_rate": 4.99979526830551e-06, + "loss": 0.2083, "step": 585 }, { - "epoch": 0.4612357339630067, - "grad_norm": 0.9961208676961326, - "learning_rate": 3.023600656915362e-06, - "loss": 0.1381, + "epoch": 0.05399179988022297, + "grad_norm": 1.185566661438185, + "learning_rate": 4.999785403257288e-06, + "loss": 0.1993, "step": 586 }, { - "epoch": 0.4620228256591893, - "grad_norm": 0.9726679702119774, - "learning_rate": 3.017230901123985e-06, - "loss": 0.1391, + "epoch": 0.054083936057492975, + "grad_norm": 1.176578670156522, + "learning_rate": 4.9997753061037225e-06, + "loss": 0.1965, "step": 587 }, { - "epoch": 0.4628099173553719, - "grad_norm": 1.016233754336966, - "learning_rate": 3.0108576354762176e-06, - "loss": 0.1464, + "epoch": 0.05417607223476298, + "grad_norm": 1.4360254532578547, + "learning_rate": 4.9997649768457505e-06, + "loss": 0.2219, "step": 588 }, { - "epoch": 0.4635970090515545, - "grad_norm": 0.891890572894692, - "learning_rate": 3.0044809032201448e-06, - "loss": 0.1312, + "epoch": 0.05426820841203298, + "grad_norm": 1.144309223073869, + "learning_rate": 4.999754415484331e-06, + "loss": 0.2147, "step": 589 }, { - "epoch": 0.4643841007477371, - "grad_norm": 0.9300922465018149, - "learning_rate": 2.9981007476273787e-06, - "loss": 0.1272, + "epoch": 0.05436034458930299, + "grad_norm": 1.0619722443303092, + "learning_rate": 4.9997436220204455e-06, + "loss": 0.2046, "step": 590 }, { - "epoch": 0.4651711924439197, - "grad_norm": 1.0381540629264334, - "learning_rate": 2.9917172119927607e-06, - "loss": 0.1479, + "epoch": 0.054452480766573, + "grad_norm": 1.1799893495434046, + "learning_rate": 4.9997325964550945e-06, + "loss": 0.2243, "step": 591 }, { - "epoch": 0.4659582841401023, - "grad_norm": 1.0642195977009175, - "learning_rate": 2.9853303396340695e-06, - "loss": 0.1364, + "epoch": 0.054544616943843, + "grad_norm": 1.1114971013751254, + "learning_rate": 4.999721338789304e-06, + "loss": 0.2069, "step": 592 }, { - "epoch": 0.46674537583628495, - "grad_norm": 0.9295272897205104, - "learning_rate": 2.9789401738917244e-06, - "loss": 0.1249, + "epoch": 0.054636753121113005, + "grad_norm": 1.094133912064876, + "learning_rate": 4.999709849024118e-06, + "loss": 0.2018, "step": 593 }, { - "epoch": 0.4675324675324675, - "grad_norm": 1.0180029223750298, - "learning_rate": 2.9725467581284944e-06, - "loss": 0.1407, + "epoch": 0.05472888929838301, + "grad_norm": 1.0666830212123013, + "learning_rate": 4.999698127160604e-06, + "loss": 0.1898, "step": 594 }, { - "epoch": 0.46831955922865015, - "grad_norm": 1.1385262618991847, - "learning_rate": 2.966150135729203e-06, - "loss": 0.1502, + "epoch": 0.05482102547565301, + "grad_norm": 1.1414568181365667, + "learning_rate": 4.999686173199849e-06, + "loss": 0.2035, "step": 595 }, { - "epoch": 0.4691066509248327, - "grad_norm": 1.0067715931565462, - "learning_rate": 2.9597503501004345e-06, - "loss": 0.1286, + "epoch": 0.05491316165292302, + "grad_norm": 1.1433975732381854, + "learning_rate": 4.999673987142964e-06, + "loss": 0.2044, "step": 596 }, { - "epoch": 0.46989374262101535, - "grad_norm": 0.9465710841629198, - "learning_rate": 2.9533474446702346e-06, - "loss": 0.1358, + "epoch": 0.05500529783019303, + "grad_norm": 1.2889576924059074, + "learning_rate": 4.999661568991081e-06, + "loss": 0.2042, "step": 597 }, { - "epoch": 0.4706808343171979, - "grad_norm": 1.04804051578767, - "learning_rate": 2.946941462887824e-06, - "loss": 0.1333, + "epoch": 0.05509743400746303, + "grad_norm": 1.2353072072103293, + "learning_rate": 4.999648918745352e-06, + "loss": 0.2115, "step": 598 }, { - "epoch": 0.47146792601338056, - "grad_norm": 1.0917713383450702, - "learning_rate": 2.940532448223296e-06, - "loss": 0.1462, + "epoch": 0.055189570184733035, + "grad_norm": 1.2571958587328962, + "learning_rate": 4.999636036406951e-06, + "loss": 0.2169, "step": 599 }, { - "epoch": 0.4722550177095632, - "grad_norm": 0.9580513732250364, - "learning_rate": 2.9341204441673267e-06, - "loss": 0.1321, + "epoch": 0.05528170636200304, + "grad_norm": 1.46152140513451, + "learning_rate": 4.999622921977076e-06, + "loss": 0.2131, "step": 600 }, { - "epoch": 0.47304210940574576, - "grad_norm": 0.9439921102070582, - "learning_rate": 2.927705494230875e-06, - "loss": 0.1441, + "epoch": 0.05537384253927304, + "grad_norm": 1.1820367409058008, + "learning_rate": 4.999609575456944e-06, + "loss": 0.1844, "step": 601 }, { - "epoch": 0.4738292011019284, - "grad_norm": 1.0178216949448748, - "learning_rate": 2.9212876419448943e-06, - "loss": 0.1405, + "epoch": 0.05546597871654305, + "grad_norm": 1.1737151850144656, + "learning_rate": 4.9995959968477926e-06, + "loss": 0.2256, "step": 602 }, { - "epoch": 0.47461629279811096, - "grad_norm": 1.0297426762245179, - "learning_rate": 2.9148669308600298e-06, - "loss": 0.1392, + "epoch": 0.05555811489381306, + "grad_norm": 1.1214004446252206, + "learning_rate": 4.9995821861508844e-06, + "loss": 0.1867, "step": 603 }, { - "epoch": 0.4754033844942936, - "grad_norm": 0.9415986568330708, - "learning_rate": 2.9084434045463255e-06, - "loss": 0.1282, + "epoch": 0.05565025107108306, + "grad_norm": 1.1260505778426881, + "learning_rate": 4.999568143367501e-06, + "loss": 0.1964, "step": 604 }, { - "epoch": 0.47619047619047616, - "grad_norm": 1.0337230890115443, - "learning_rate": 2.9020171065929327e-06, - "loss": 0.1394, + "epoch": 0.055742387248353065, + "grad_norm": 1.167666210113292, + "learning_rate": 4.999553868498948e-06, + "loss": 0.2115, "step": 605 }, { - "epoch": 0.4769775678866588, - "grad_norm": 1.0540052550471415, - "learning_rate": 2.895588080607807e-06, - "loss": 0.1472, + "epoch": 0.05583452342562307, + "grad_norm": 1.1045012954183473, + "learning_rate": 4.999539361546547e-06, + "loss": 0.1758, "step": 606 }, { - "epoch": 0.4777646595828414, - "grad_norm": 1.0081872244466563, - "learning_rate": 2.8891563702174174e-06, - "loss": 0.1372, + "epoch": 0.05592665960289308, + "grad_norm": 1.233219811553698, + "learning_rate": 4.999524622511649e-06, + "loss": 0.2164, "step": 607 }, { - "epoch": 0.478551751279024, - "grad_norm": 1.0145019904402564, - "learning_rate": 2.8827220190664505e-06, - "loss": 0.1399, + "epoch": 0.05601879578016308, + "grad_norm": 1.2706612209541799, + "learning_rate": 4.99950965139562e-06, + "loss": 0.2008, "step": 608 }, { - "epoch": 0.4793388429752066, - "grad_norm": 1.0258604105718838, - "learning_rate": 2.8762850708175098e-06, - "loss": 0.1499, + "epoch": 0.05611093195743309, + "grad_norm": 1.2646084508514779, + "learning_rate": 4.999494448199851e-06, + "loss": 0.2092, "step": 609 }, { - "epoch": 0.4801259346713892, - "grad_norm": 1.0836484331180423, - "learning_rate": 2.869845569150825e-06, - "loss": 0.1388, + "epoch": 0.05620306813470309, + "grad_norm": 1.149501357551309, + "learning_rate": 4.9994790129257535e-06, + "loss": 0.1984, "step": 610 }, { - "epoch": 0.4809130263675718, - "grad_norm": 0.9946389106293178, - "learning_rate": 2.863403557763951e-06, - "loss": 0.1323, + "epoch": 0.056295204311973095, + "grad_norm": 1.3291468516688794, + "learning_rate": 4.999463345574761e-06, + "loss": 0.2162, "step": 611 }, { - "epoch": 0.4817001180637544, - "grad_norm": 0.9968164583365795, - "learning_rate": 2.856959080371474e-06, - "loss": 0.1402, + "epoch": 0.0563873404892431, + "grad_norm": 1.205429054407277, + "learning_rate": 4.999447446148328e-06, + "loss": 0.2137, "step": 612 }, { - "epoch": 0.48248720975993703, - "grad_norm": 1.0526146596249044, - "learning_rate": 2.8505121807047155e-06, - "loss": 0.1342, + "epoch": 0.05647947666651311, + "grad_norm": 1.2381435162688017, + "learning_rate": 4.999431314647929e-06, + "loss": 0.2129, "step": 613 }, { - "epoch": 0.48327430145611966, - "grad_norm": 0.9881771003275511, - "learning_rate": 2.8440629025114308e-06, - "loss": 0.1414, + "epoch": 0.05657161284378311, + "grad_norm": 1.2673565020592805, + "learning_rate": 4.999414951075065e-06, + "loss": 0.2007, "step": 614 }, { - "epoch": 0.48406139315230223, - "grad_norm": 1.0170639400089367, - "learning_rate": 2.8376112895555184e-06, - "loss": 0.1415, + "epoch": 0.05666374902105312, + "grad_norm": 1.1037792471970673, + "learning_rate": 4.999398355431253e-06, + "loss": 0.2108, "step": 615 }, { - "epoch": 0.48484848484848486, - "grad_norm": 0.9618458339894986, - "learning_rate": 2.83115738561672e-06, - "loss": 0.125, + "epoch": 0.05675588519832312, + "grad_norm": 1.16253375115935, + "learning_rate": 4.999381527718036e-06, + "loss": 0.2098, "step": 616 }, { - "epoch": 0.48563557654466744, - "grad_norm": 1.166675709546666, - "learning_rate": 2.8247012344903235e-06, - "loss": 0.1537, + "epoch": 0.056848021375593125, + "grad_norm": 1.1401961391380055, + "learning_rate": 4.999364467936974e-06, + "loss": 0.2076, "step": 617 }, { - "epoch": 0.48642266824085006, - "grad_norm": 1.0308351089525765, - "learning_rate": 2.8182428799868643e-06, - "loss": 0.1435, + "epoch": 0.05694015755286313, + "grad_norm": 1.070625343707576, + "learning_rate": 4.999347176089653e-06, + "loss": 0.1909, "step": 618 }, { - "epoch": 0.48720975993703264, - "grad_norm": 0.9008466844444718, - "learning_rate": 2.811782365931832e-06, - "loss": 0.1255, + "epoch": 0.05703229373013314, + "grad_norm": 1.1323161172935006, + "learning_rate": 4.999329652177677e-06, + "loss": 0.2081, "step": 619 }, { - "epoch": 0.48799685163321527, - "grad_norm": 1.0328591551300574, - "learning_rate": 2.8053197361653684e-06, - "loss": 0.1431, + "epoch": 0.05712442990740314, + "grad_norm": 1.1839375977802664, + "learning_rate": 4.9993118962026735e-06, + "loss": 0.2152, "step": 620 }, { - "epoch": 0.4887839433293979, - "grad_norm": 1.0223227370370647, - "learning_rate": 2.7988550345419733e-06, - "loss": 0.1302, + "epoch": 0.05721656608467315, + "grad_norm": 1.1346559448427023, + "learning_rate": 4.999293908166292e-06, + "loss": 0.1946, "step": 621 }, { - "epoch": 0.48957103502558047, - "grad_norm": 1.0130656273790444, - "learning_rate": 2.792388304930207e-06, - "loss": 0.1413, + "epoch": 0.05730870226194315, + "grad_norm": 1.2069917410518431, + "learning_rate": 4.999275688070202e-06, + "loss": 0.1944, "step": 622 }, { - "epoch": 0.4903581267217631, - "grad_norm": 0.9678629630338841, - "learning_rate": 2.7859195912123875e-06, - "loss": 0.1411, + "epoch": 0.057400838439213155, + "grad_norm": 1.1793130901944762, + "learning_rate": 4.999257235916096e-06, + "loss": 0.2065, "step": 623 }, { - "epoch": 0.4911452184179457, - "grad_norm": 1.0630235458290422, - "learning_rate": 2.779448937284302e-06, - "loss": 0.144, + "epoch": 0.057492974616483165, + "grad_norm": 1.1633227618690698, + "learning_rate": 4.999238551705686e-06, + "loss": 0.1944, "step": 624 }, { - "epoch": 0.4919323101141283, - "grad_norm": 1.1368466359085148, - "learning_rate": 2.772976387054899e-06, - "loss": 0.1603, + "epoch": 0.05758511079375317, + "grad_norm": 1.173343604205386, + "learning_rate": 4.9992196354407075e-06, + "loss": 0.2122, "step": 625 }, { - "epoch": 0.4927194018103109, - "grad_norm": 1.0638972206646764, - "learning_rate": 2.766501984445999e-06, - "loss": 0.1469, + "epoch": 0.05767724697102317, + "grad_norm": 1.1303138827921249, + "learning_rate": 4.999200487122917e-06, + "loss": 0.2187, "step": 626 }, { - "epoch": 0.4935064935064935, - "grad_norm": 0.9878723437777639, - "learning_rate": 2.7600257733919887e-06, - "loss": 0.1347, + "epoch": 0.05776938314829318, + "grad_norm": 1.0398772349211105, + "learning_rate": 4.999181106754093e-06, + "loss": 0.1956, "step": 627 }, { - "epoch": 0.49429358520267613, - "grad_norm": 0.9482438523704221, - "learning_rate": 2.7535477978395295e-06, - "loss": 0.1301, + "epoch": 0.05786151932556318, + "grad_norm": 1.1638532808388222, + "learning_rate": 4.999161494336033e-06, + "loss": 0.1927, "step": 628 }, { - "epoch": 0.4950806768988587, - "grad_norm": 1.0213978192147322, - "learning_rate": 2.7470681017472556e-06, - "loss": 0.1442, + "epoch": 0.057953655502833185, + "grad_norm": 1.108151966330762, + "learning_rate": 4.99914164987056e-06, + "loss": 0.1986, "step": 629 }, { - "epoch": 0.49586776859504134, - "grad_norm": 1.0113916573838844, - "learning_rate": 2.740586729085476e-06, - "loss": 0.1477, + "epoch": 0.058045791680103195, + "grad_norm": 1.183480026573518, + "learning_rate": 4.999121573359516e-06, + "loss": 0.2119, "step": 630 }, { - "epoch": 0.4966548602912239, - "grad_norm": 1.0353820062718653, - "learning_rate": 2.7341037238358774e-06, - "loss": 0.1483, + "epoch": 0.0581379278573732, + "grad_norm": 1.1555341737613782, + "learning_rate": 4.999101264804765e-06, + "loss": 0.1911, "step": 631 }, { - "epoch": 0.49744195198740654, - "grad_norm": 1.0552352024187672, - "learning_rate": 2.727619129991224e-06, - "loss": 0.1328, + "epoch": 0.0582300640346432, + "grad_norm": 1.1980595228638002, + "learning_rate": 4.9990807242081915e-06, + "loss": 0.2053, "step": 632 }, { - "epoch": 0.4982290436835891, - "grad_norm": 0.9937705442973395, - "learning_rate": 2.7211329915550615e-06, - "loss": 0.1409, + "epoch": 0.05832220021191321, + "grad_norm": 1.3207559140555372, + "learning_rate": 4.999059951571705e-06, + "loss": 0.2119, "step": 633 }, { - "epoch": 0.49901613537977174, - "grad_norm": 1.0486309341654392, - "learning_rate": 2.714645352541415e-06, - "loss": 0.15, + "epoch": 0.05841433638918321, + "grad_norm": 1.2652476003839102, + "learning_rate": 4.9990389468972336e-06, + "loss": 0.2239, "step": 634 }, { - "epoch": 0.49980322707595437, - "grad_norm": 1.015369060592149, - "learning_rate": 2.7081562569744948e-06, - "loss": 0.1298, + "epoch": 0.058506472566453215, + "grad_norm": 1.1645579104689652, + "learning_rate": 4.999017710186725e-06, + "loss": 0.2233, "step": 635 }, { - "epoch": 0.500590318772137, - "grad_norm": 1.014091287328762, - "learning_rate": 2.701665748888393e-06, - "loss": 0.139, + "epoch": 0.058598608743723225, + "grad_norm": 1.188157223333697, + "learning_rate": 4.998996241442155e-06, + "loss": 0.2056, "step": 636 }, { - "epoch": 0.5013774104683195, - "grad_norm": 1.010797057516188, - "learning_rate": 2.695173872326788e-06, - "loss": 0.1306, + "epoch": 0.05869074492099323, + "grad_norm": 1.174226549897811, + "learning_rate": 4.998974540665514e-06, + "loss": 0.2078, "step": 637 }, { - "epoch": 0.5021645021645021, - "grad_norm": 0.9886264059190445, - "learning_rate": 2.6886806713426435e-06, - "loss": 0.1493, + "epoch": 0.05878288109826323, + "grad_norm": 1.026345579666321, + "learning_rate": 4.998952607858818e-06, + "loss": 0.1947, "step": 638 }, { - "epoch": 0.5029515938606848, - "grad_norm": 0.9006497838538798, - "learning_rate": 2.6821861899979116e-06, - "loss": 0.127, + "epoch": 0.05887501727553324, + "grad_norm": 1.3408302898393387, + "learning_rate": 4.998930443024103e-06, + "loss": 0.1999, "step": 639 }, { - "epoch": 0.5037386855568674, - "grad_norm": 1.0409028373992908, - "learning_rate": 2.6756904723632325e-06, - "loss": 0.1453, + "epoch": 0.05896715345280324, + "grad_norm": 1.2737103319088758, + "learning_rate": 4.9989080461634285e-06, + "loss": 0.2139, "step": 640 }, { - "epoch": 0.50452577725305, - "grad_norm": 0.9741943151013064, - "learning_rate": 2.6691935625176357e-06, - "loss": 0.1353, + "epoch": 0.05905928963007325, + "grad_norm": 1.013191424805071, + "learning_rate": 4.9988854172788725e-06, + "loss": 0.1945, "step": 641 }, { - "epoch": 0.5053128689492326, - "grad_norm": 0.949636504358609, - "learning_rate": 2.6626955045482405e-06, - "loss": 0.1335, + "epoch": 0.059151425807343255, + "grad_norm": 1.2422989589513913, + "learning_rate": 4.998862556372537e-06, + "loss": 0.2146, "step": 642 }, { - "epoch": 0.5060999606454152, - "grad_norm": 0.9249297082390363, - "learning_rate": 2.6561963425499575e-06, - "loss": 0.1338, + "epoch": 0.05924356198461326, + "grad_norm": 1.0916578038400657, + "learning_rate": 4.998839463446543e-06, + "loss": 0.1872, "step": 643 }, { - "epoch": 0.5068870523415978, - "grad_norm": 1.0151555535359889, - "learning_rate": 2.649696120625188e-06, - "loss": 0.1515, + "epoch": 0.05933569816188326, + "grad_norm": 1.2140922793923727, + "learning_rate": 4.998816138503038e-06, + "loss": 0.2128, "step": 644 }, { - "epoch": 0.5076741440377804, - "grad_norm": 1.039472398997662, - "learning_rate": 2.643194882883528e-06, - "loss": 0.1474, + "epoch": 0.05942783433915327, + "grad_norm": 1.187481640612988, + "learning_rate": 4.9987925815441835e-06, + "loss": 0.186, "step": 645 }, { - "epoch": 0.508461235733963, - "grad_norm": 0.9434610266773801, - "learning_rate": 2.6366926734414648e-06, - "loss": 0.1304, + "epoch": 0.05951997051642327, + "grad_norm": 1.0595639803375902, + "learning_rate": 4.99876879257217e-06, + "loss": 0.1814, "step": 646 }, { - "epoch": 0.5092483274301456, - "grad_norm": 0.8865198426440791, - "learning_rate": 2.6301895364220816e-06, - "loss": 0.1202, + "epoch": 0.05961210669369328, + "grad_norm": 1.1918728406860226, + "learning_rate": 4.9987447715892046e-06, + "loss": 0.2033, "step": 647 }, { - "epoch": 0.5100354191263282, - "grad_norm": 0.9546278944005607, - "learning_rate": 2.6236855159547527e-06, - "loss": 0.1291, + "epoch": 0.059704242870963285, + "grad_norm": 1.1277418699802502, + "learning_rate": 4.998720518597518e-06, + "loss": 0.1976, "step": 648 }, { - "epoch": 0.5108225108225108, - "grad_norm": 0.9693013564144493, - "learning_rate": 2.6171806561748503e-06, - "loss": 0.1339, + "epoch": 0.05979637904823329, + "grad_norm": 1.2389827145068848, + "learning_rate": 4.998696033599363e-06, + "loss": 0.2214, "step": 649 }, { - "epoch": 0.5116096025186935, - "grad_norm": 1.0027100891356027, - "learning_rate": 2.610675001223441e-06, - "loss": 0.1407, + "epoch": 0.05988851522550329, + "grad_norm": 1.1715668898340321, + "learning_rate": 4.998671316597012e-06, + "loss": 0.1957, "step": 650 }, { - "epoch": 0.512396694214876, - "grad_norm": 0.8560729540932264, - "learning_rate": 2.6041685952469877e-06, - "loss": 0.116, + "epoch": 0.0599806514027733, + "grad_norm": 1.1157040510100455, + "learning_rate": 4.998646367592761e-06, + "loss": 0.1977, "step": 651 }, { - "epoch": 0.5131837859110586, - "grad_norm": 0.9257606438562741, - "learning_rate": 2.597661482397049e-06, - "loss": 0.1262, + "epoch": 0.0600727875800433, + "grad_norm": 1.2964622196527131, + "learning_rate": 4.998621186588925e-06, + "loss": 0.2137, "step": 652 }, { - "epoch": 0.5139708776072412, - "grad_norm": 1.0514657045725575, - "learning_rate": 2.5911537068299803e-06, - "loss": 0.1469, + "epoch": 0.06016492375731331, + "grad_norm": 1.1874309992271197, + "learning_rate": 4.998595773587844e-06, + "loss": 0.198, "step": 653 }, { - "epoch": 0.5147579693034239, - "grad_norm": 0.9545058570137028, - "learning_rate": 2.584645312706634e-06, - "loss": 0.1302, + "epoch": 0.060257059934583315, + "grad_norm": 1.0821447628825356, + "learning_rate": 4.998570128591875e-06, + "loss": 0.1983, "step": 654 }, { - "epoch": 0.5155450609996065, - "grad_norm": 0.9392962167917809, - "learning_rate": 2.5781363441920614e-06, - "loss": 0.1335, + "epoch": 0.06034919611185332, + "grad_norm": 1.1274894688033879, + "learning_rate": 4.998544251603402e-06, + "loss": 0.197, "step": 655 }, { - "epoch": 0.516332152695789, - "grad_norm": 0.9496925045032614, - "learning_rate": 2.5716268454552094e-06, - "loss": 0.135, + "epoch": 0.06044133228912332, + "grad_norm": 1.2286272649030554, + "learning_rate": 4.998518142624826e-06, + "loss": 0.2165, "step": 656 }, { - "epoch": 0.5171192443919717, - "grad_norm": 1.002838327785164, - "learning_rate": 2.565116860668625e-06, - "loss": 0.1316, + "epoch": 0.06053346846639333, + "grad_norm": 1.0008382256801454, + "learning_rate": 4.998491801658571e-06, + "loss": 0.1782, "step": 657 }, { - "epoch": 0.5179063360881543, - "grad_norm": 1.053225279922735, - "learning_rate": 2.5586064340081516e-06, - "loss": 0.1512, + "epoch": 0.06062560464366334, + "grad_norm": 1.1573290350734389, + "learning_rate": 4.998465228707084e-06, + "loss": 0.1976, "step": 658 }, { - "epoch": 0.5186934277843369, - "grad_norm": 0.9441130740117648, - "learning_rate": 2.5520956096526323e-06, - "loss": 0.131, + "epoch": 0.06071774082093334, + "grad_norm": 1.1294960460081294, + "learning_rate": 4.998438423772831e-06, + "loss": 0.2195, "step": 659 }, { - "epoch": 0.5194805194805194, - "grad_norm": 0.9889334534500898, - "learning_rate": 2.5455844317836077e-06, - "loss": 0.1331, + "epoch": 0.060809876998203345, + "grad_norm": 1.154289522109689, + "learning_rate": 4.998411386858303e-06, + "loss": 0.2114, "step": 660 }, { - "epoch": 0.5202676111767021, - "grad_norm": 0.9646380257679634, - "learning_rate": 2.53907294458502e-06, - "loss": 0.1291, + "epoch": 0.06090201317547335, + "grad_norm": 1.0669414151184684, + "learning_rate": 4.998384117966007e-06, + "loss": 0.1967, "step": 661 }, { - "epoch": 0.5210547028728847, - "grad_norm": 1.0529655580058879, - "learning_rate": 2.5325611922429074e-06, - "loss": 0.1491, + "epoch": 0.06099414935274335, + "grad_norm": 1.1281826805262178, + "learning_rate": 4.998356617098478e-06, + "loss": 0.2076, "step": 662 }, { - "epoch": 0.5218417945690673, - "grad_norm": 0.9926818678117324, - "learning_rate": 2.5260492189451076e-06, - "loss": 0.1443, + "epoch": 0.06108628553001336, + "grad_norm": 1.114308852904428, + "learning_rate": 4.9983288842582665e-06, + "loss": 0.2047, "step": 663 }, { - "epoch": 0.52262888626525, - "grad_norm": 0.9144135454846201, - "learning_rate": 2.51953706888096e-06, - "loss": 0.1217, + "epoch": 0.06117842170728337, + "grad_norm": 1.2071437766523476, + "learning_rate": 4.9983009194479505e-06, + "loss": 0.2026, "step": 664 }, { - "epoch": 0.5234159779614325, - "grad_norm": 0.921592430215234, - "learning_rate": 2.513024786241001e-06, - "loss": 0.1248, + "epoch": 0.06127055788455337, + "grad_norm": 1.1827937087808504, + "learning_rate": 4.998272722670126e-06, + "loss": 0.2008, "step": 665 }, { - "epoch": 0.5242030696576151, - "grad_norm": 0.9514782593826102, - "learning_rate": 2.5065124152166692e-06, - "loss": 0.1297, + "epoch": 0.061362694061823375, + "grad_norm": 1.0733621691103314, + "learning_rate": 4.998244293927409e-06, + "loss": 0.1813, "step": 666 }, { - "epoch": 0.5249901613537977, - "grad_norm": 1.0117305817250293, - "learning_rate": 2.5e-06, - "loss": 0.1497, + "epoch": 0.06145483023909338, + "grad_norm": 1.1788970733123387, + "learning_rate": 4.998215633222441e-06, + "loss": 0.219, "step": 667 }, { - "epoch": 0.5257772530499804, - "grad_norm": 1.013668842181626, - "learning_rate": 2.4934875847833308e-06, - "loss": 0.1224, + "epoch": 0.06154696641636338, + "grad_norm": 1.2172001325394024, + "learning_rate": 4.998186740557882e-06, + "loss": 0.1977, "step": 668 }, { - "epoch": 0.526564344746163, - "grad_norm": 1.000211936689413, - "learning_rate": 2.4869752137589994e-06, - "loss": 0.1419, + "epoch": 0.061639102593633394, + "grad_norm": 1.1195473427959475, + "learning_rate": 4.998157615936416e-06, + "loss": 0.1914, "step": 669 }, { - "epoch": 0.5273514364423455, - "grad_norm": 0.9733370358487723, - "learning_rate": 2.48046293111904e-06, - "loss": 0.1245, + "epoch": 0.0617312387709034, + "grad_norm": 1.1674007562772488, + "learning_rate": 4.998128259360747e-06, + "loss": 0.2087, "step": 670 }, { - "epoch": 0.5281385281385281, - "grad_norm": 1.0646062724041805, - "learning_rate": 2.473950781054893e-06, - "loss": 0.1383, + "epoch": 0.0618233749481734, + "grad_norm": 1.212028870572152, + "learning_rate": 4.998098670833599e-06, + "loss": 0.211, "step": 671 }, { - "epoch": 0.5289256198347108, - "grad_norm": 0.930785726380819, - "learning_rate": 2.467438807757094e-06, - "loss": 0.1295, + "epoch": 0.061915511125443405, + "grad_norm": 1.1367268757070708, + "learning_rate": 4.998068850357721e-06, + "loss": 0.1912, "step": 672 }, { - "epoch": 0.5297127115308934, - "grad_norm": 0.9786127857256359, - "learning_rate": 2.460927055414981e-06, - "loss": 0.146, + "epoch": 0.06200764730271341, + "grad_norm": 1.08789125897739, + "learning_rate": 4.998038797935882e-06, + "loss": 0.1952, "step": 673 }, { - "epoch": 0.5304998032270759, - "grad_norm": 0.9744140929407867, - "learning_rate": 2.4544155682163922e-06, - "loss": 0.1298, + "epoch": 0.06209978347998341, + "grad_norm": 1.1357554005551076, + "learning_rate": 4.9980085135708715e-06, + "loss": 0.2017, "step": 674 }, { - "epoch": 0.5312868949232585, - "grad_norm": 0.9484782784554407, - "learning_rate": 2.447904390347369e-06, - "loss": 0.1278, + "epoch": 0.062191919657253424, + "grad_norm": 1.14464491950595, + "learning_rate": 4.997977997265501e-06, + "loss": 0.2049, "step": 675 }, { - "epoch": 0.5320739866194412, - "grad_norm": 1.1172109264151044, - "learning_rate": 2.441393565991849e-06, - "loss": 0.146, + "epoch": 0.06228405583452343, + "grad_norm": 1.0226569388533309, + "learning_rate": 4.997947249022605e-06, + "loss": 0.1861, "step": 676 }, { - "epoch": 0.5328610783156238, - "grad_norm": 0.9832334733375834, - "learning_rate": 2.4348831393313763e-06, - "loss": 0.1341, + "epoch": 0.06237619201179343, + "grad_norm": 1.12500761449028, + "learning_rate": 4.997916268845038e-06, + "loss": 0.1935, "step": 677 }, { - "epoch": 0.5336481700118064, - "grad_norm": 0.9993612072993626, - "learning_rate": 2.428373154544791e-06, - "loss": 0.1348, + "epoch": 0.062468328189063435, + "grad_norm": 1.0950565167941166, + "learning_rate": 4.997885056735677e-06, + "loss": 0.1959, "step": 678 }, { - "epoch": 0.5344352617079889, - "grad_norm": 0.8743067419696096, - "learning_rate": 2.42186365580794e-06, - "loss": 0.1127, + "epoch": 0.06256046436633345, + "grad_norm": 1.1475008577190142, + "learning_rate": 4.99785361269742e-06, + "loss": 0.1987, "step": 679 }, { - "epoch": 0.5352223534041716, - "grad_norm": 0.9878917132746777, - "learning_rate": 2.4153546872933667e-06, - "loss": 0.1289, + "epoch": 0.06265260054360344, + "grad_norm": 1.153601965067529, + "learning_rate": 4.9978219367331856e-06, + "loss": 0.2098, "step": 680 }, { - "epoch": 0.5360094451003542, - "grad_norm": 0.9872701224310093, - "learning_rate": 2.4088462931700214e-06, - "loss": 0.1382, + "epoch": 0.06274473672087345, + "grad_norm": 1.1044287968459916, + "learning_rate": 4.997790028845916e-06, + "loss": 0.2078, "step": 681 }, { - "epoch": 0.5367965367965368, - "grad_norm": 1.0291331541759994, - "learning_rate": 2.4023385176029516e-06, - "loss": 0.1398, + "epoch": 0.06283687289814345, + "grad_norm": 1.087079920318585, + "learning_rate": 4.997757889038573e-06, + "loss": 0.2082, "step": 682 }, { - "epoch": 0.5375836284927195, - "grad_norm": 1.0314844938730774, - "learning_rate": 2.3958314047530127e-06, - "loss": 0.1407, + "epoch": 0.06292900907541346, + "grad_norm": 1.163440565892733, + "learning_rate": 4.9977255173141405e-06, + "loss": 0.1917, "step": 683 }, { - "epoch": 0.538370720188902, - "grad_norm": 0.9922009235690711, - "learning_rate": 2.3893249987765598e-06, - "loss": 0.1375, + "epoch": 0.06302114525268347, + "grad_norm": 1.0799809277476804, + "learning_rate": 4.997692913675626e-06, + "loss": 0.2071, "step": 684 }, { - "epoch": 0.5391578118850846, - "grad_norm": 1.0407160996339295, - "learning_rate": 2.3828193438251497e-06, - "loss": 0.1356, + "epoch": 0.06311328142995347, + "grad_norm": 1.0607317616778018, + "learning_rate": 4.997660078126055e-06, + "loss": 0.2097, "step": 685 }, { - "epoch": 0.5399449035812672, - "grad_norm": 0.9860703004700557, - "learning_rate": 2.376314484045248e-06, - "loss": 0.132, + "epoch": 0.06320541760722348, + "grad_norm": 1.079311486129121, + "learning_rate": 4.997627010668477e-06, + "loss": 0.2119, "step": 686 }, { - "epoch": 0.5407319952774499, - "grad_norm": 1.0540933767364977, - "learning_rate": 2.369810463577919e-06, - "loss": 0.1467, + "epoch": 0.06329755378449348, + "grad_norm": 1.0586204104270576, + "learning_rate": 4.997593711305963e-06, + "loss": 0.1931, "step": 687 }, { - "epoch": 0.5415190869736324, - "grad_norm": 1.0135356185084303, - "learning_rate": 2.3633073265585356e-06, - "loss": 0.1381, + "epoch": 0.06338968996176349, + "grad_norm": 1.3372536926610412, + "learning_rate": 4.997560180041604e-06, + "loss": 0.2016, "step": 688 }, { - "epoch": 0.542306178669815, - "grad_norm": 0.9743937278639236, - "learning_rate": 2.3568051171164724e-06, - "loss": 0.1324, + "epoch": 0.0634818261390335, + "grad_norm": 1.026145931825911, + "learning_rate": 4.997526416878513e-06, + "loss": 0.1991, "step": 689 }, { - "epoch": 0.5430932703659976, - "grad_norm": 1.0422560526589146, - "learning_rate": 2.350303879374813e-06, - "loss": 0.136, + "epoch": 0.0635739623163035, + "grad_norm": 1.0825949947992264, + "learning_rate": 4.997492421819825e-06, + "loss": 0.1901, "step": 690 }, { - "epoch": 0.5438803620621803, - "grad_norm": 1.0503391352080245, - "learning_rate": 2.3438036574500434e-06, - "loss": 0.147, + "epoch": 0.0636660984935735, + "grad_norm": 1.1104887396834857, + "learning_rate": 4.997458194868697e-06, + "loss": 0.1964, "step": 691 }, { - "epoch": 0.5446674537583629, - "grad_norm": 0.9557517793781123, - "learning_rate": 2.3373044954517603e-06, - "loss": 0.1216, + "epoch": 0.0637582346708435, + "grad_norm": 1.1414256643477074, + "learning_rate": 4.997423736028308e-06, + "loss": 0.1942, "step": 692 }, { - "epoch": 0.5454545454545454, - "grad_norm": 0.9898057468780994, - "learning_rate": 2.330806437482365e-06, - "loss": 0.1342, + "epoch": 0.06385037084811351, + "grad_norm": 1.0428539165758621, + "learning_rate": 4.997389045301856e-06, + "loss": 0.1803, "step": 693 }, { - "epoch": 0.546241637150728, - "grad_norm": 0.9685373418602369, - "learning_rate": 2.3243095276367687e-06, - "loss": 0.1294, + "epoch": 0.06394250702538351, + "grad_norm": 1.1658483507123416, + "learning_rate": 4.997354122692561e-06, + "loss": 0.2028, "step": 694 }, { - "epoch": 0.5470287288469107, - "grad_norm": 1.0187901801029866, - "learning_rate": 2.317813810002089e-06, - "loss": 0.1366, + "epoch": 0.06403464320265352, + "grad_norm": 0.9640305471456437, + "learning_rate": 4.997318968203668e-06, + "loss": 0.1639, "step": 695 }, { - "epoch": 0.5478158205430933, - "grad_norm": 1.036393473441657, - "learning_rate": 2.3113193286573577e-06, - "loss": 0.1384, + "epoch": 0.06412677937992353, + "grad_norm": 0.9944000405107695, + "learning_rate": 4.99728358183844e-06, + "loss": 0.1952, "step": 696 }, { - "epoch": 0.5486029122392759, - "grad_norm": 0.9735402694275894, - "learning_rate": 2.3048261276732133e-06, - "loss": 0.1325, + "epoch": 0.06421891555719353, + "grad_norm": 1.0588180086567232, + "learning_rate": 4.9972479636001625e-06, + "loss": 0.1902, "step": 697 }, { - "epoch": 0.5493900039354584, - "grad_norm": 0.9435211562075637, - "learning_rate": 2.298334251111607e-06, - "loss": 0.1272, + "epoch": 0.06431105173446354, + "grad_norm": 1.1141307683264974, + "learning_rate": 4.9972121134921435e-06, + "loss": 0.1993, "step": 698 }, { - "epoch": 0.5501770956316411, - "grad_norm": 0.9238771765346788, - "learning_rate": 2.2918437430255056e-06, - "loss": 0.1329, + "epoch": 0.06440318791173354, + "grad_norm": 1.2576518697394798, + "learning_rate": 4.99717603151771e-06, + "loss": 0.1968, "step": 699 }, { - "epoch": 0.5509641873278237, - "grad_norm": 0.9732329075427437, - "learning_rate": 2.285354647458585e-06, - "loss": 0.1316, + "epoch": 0.06449532408900355, + "grad_norm": 1.0725923379796012, + "learning_rate": 4.997139717680214e-06, + "loss": 0.1962, "step": 700 }, { - "epoch": 0.5517512790240063, - "grad_norm": 1.032698839528823, - "learning_rate": 2.2788670084449393e-06, - "loss": 0.1438, + "epoch": 0.06458746026627356, + "grad_norm": 1.1265762810463271, + "learning_rate": 4.9971031719830255e-06, + "loss": 0.2103, "step": 701 }, { - "epoch": 0.5525383707201889, - "grad_norm": 0.9453000245373157, - "learning_rate": 2.2723808700087764e-06, - "loss": 0.1349, + "epoch": 0.06467959644354355, + "grad_norm": 1.0764056898213448, + "learning_rate": 4.9970663944295396e-06, + "loss": 0.1929, "step": 702 }, { - "epoch": 0.5533254624163715, - "grad_norm": 1.017447417352295, - "learning_rate": 2.2658962761641235e-06, - "loss": 0.1346, + "epoch": 0.06477173262081357, + "grad_norm": 1.1443294790794694, + "learning_rate": 4.99702938502317e-06, + "loss": 0.2004, "step": 703 }, { - "epoch": 0.5541125541125541, - "grad_norm": 1.0593240948345142, - "learning_rate": 2.2594132709145245e-06, - "loss": 0.1391, + "epoch": 0.06486386879808356, + "grad_norm": 1.1556168002401073, + "learning_rate": 4.996992143767353e-06, + "loss": 0.1955, "step": 704 }, { - "epoch": 0.5548996458087367, - "grad_norm": 1.0524825811903469, - "learning_rate": 2.2529318982527453e-06, - "loss": 0.1397, + "epoch": 0.06495600497535357, + "grad_norm": 1.1330344957803058, + "learning_rate": 4.996954670665547e-06, + "loss": 0.1971, "step": 705 }, { - "epoch": 0.5556867375049194, - "grad_norm": 0.9994684504324172, - "learning_rate": 2.246452202160471e-06, - "loss": 0.1416, + "epoch": 0.06504814115262358, + "grad_norm": 1.1245253883139057, + "learning_rate": 4.996916965721232e-06, + "loss": 0.2186, "step": 706 }, { - "epoch": 0.5564738292011019, - "grad_norm": 1.033406524556106, - "learning_rate": 2.2399742266080126e-06, - "loss": 0.1269, + "epoch": 0.06514027732989358, + "grad_norm": 1.1056427514136586, + "learning_rate": 4.996879028937906e-06, + "loss": 0.2043, "step": 707 }, { - "epoch": 0.5572609208972845, - "grad_norm": 0.9246218098662428, - "learning_rate": 2.233498015554002e-06, - "loss": 0.1242, + "epoch": 0.06523241350716359, + "grad_norm": 1.1164453535256291, + "learning_rate": 4.996840860319094e-06, + "loss": 0.2042, "step": 708 }, { - "epoch": 0.5580480125934671, - "grad_norm": 0.904021008692359, - "learning_rate": 2.227023612945102e-06, - "loss": 0.1217, + "epoch": 0.06532454968443359, + "grad_norm": 1.0858019482374586, + "learning_rate": 4.996802459868341e-06, + "loss": 0.1927, "step": 709 }, { - "epoch": 0.5588351042896498, - "grad_norm": 0.9921447266760961, - "learning_rate": 2.220551062715699e-06, - "loss": 0.1241, + "epoch": 0.0654166858617036, + "grad_norm": 1.1400184015748671, + "learning_rate": 4.996763827589209e-06, + "loss": 0.2088, "step": 710 }, { - "epoch": 0.5596221959858324, - "grad_norm": 1.0273952935358304, - "learning_rate": 2.2140804087876134e-06, - "loss": 0.1244, + "epoch": 0.0655088220389736, + "grad_norm": 1.2412575997017532, + "learning_rate": 4.996724963485288e-06, + "loss": 0.2067, "step": 711 }, { - "epoch": 0.5604092876820149, - "grad_norm": 0.9958421204937957, - "learning_rate": 2.207611695069794e-06, - "loss": 0.1277, + "epoch": 0.06560095821624361, + "grad_norm": 1.102175865083544, + "learning_rate": 4.996685867560186e-06, + "loss": 0.2011, "step": 712 }, { - "epoch": 0.5611963793781976, - "grad_norm": 1.0226304738126037, - "learning_rate": 2.2011449654580266e-06, - "loss": 0.1319, + "epoch": 0.06569309439351362, + "grad_norm": 1.1335302279554809, + "learning_rate": 4.996646539817531e-06, + "loss": 0.2104, "step": 713 }, { - "epoch": 0.5619834710743802, - "grad_norm": 0.9427241568832295, - "learning_rate": 2.1946802638346324e-06, - "loss": 0.1208, + "epoch": 0.06578523057078361, + "grad_norm": 1.0993877114695203, + "learning_rate": 4.996606980260977e-06, + "loss": 0.2188, "step": 714 }, { - "epoch": 0.5627705627705628, - "grad_norm": 0.9526667511261941, - "learning_rate": 2.1882176340681682e-06, - "loss": 0.1234, + "epoch": 0.06587736674805363, + "grad_norm": 1.0510165045528403, + "learning_rate": 4.9965671888941955e-06, + "loss": 0.1866, "step": 715 }, { - "epoch": 0.5635576544667453, - "grad_norm": 0.9726636294262463, - "learning_rate": 2.181757120013136e-06, - "loss": 0.1241, + "epoch": 0.06596950292532362, + "grad_norm": 1.1947642401183298, + "learning_rate": 4.996527165720882e-06, + "loss": 0.2027, "step": 716 }, { - "epoch": 0.564344746162928, - "grad_norm": 0.9577642489452165, - "learning_rate": 2.1752987655096765e-06, - "loss": 0.1286, + "epoch": 0.06606163910259363, + "grad_norm": 1.1376782085741068, + "learning_rate": 4.996486910744753e-06, + "loss": 0.184, "step": 717 }, { - "epoch": 0.5651318378591106, - "grad_norm": 0.9119267395234483, - "learning_rate": 2.1688426143832804e-06, - "loss": 0.132, + "epoch": 0.06615377527986364, + "grad_norm": 0.9570102832377015, + "learning_rate": 4.996446423969546e-06, + "loss": 0.1791, "step": 718 }, { - "epoch": 0.5659189295552932, - "grad_norm": 0.944139995902989, - "learning_rate": 2.162388710444482e-06, - "loss": 0.1234, + "epoch": 0.06624591145713364, + "grad_norm": 1.1353836050535746, + "learning_rate": 4.9964057053990186e-06, + "loss": 0.2073, "step": 719 }, { - "epoch": 0.5667060212514758, - "grad_norm": 0.9645692490749199, - "learning_rate": 2.155937097488571e-06, - "loss": 0.1251, + "epoch": 0.06633804763440365, + "grad_norm": 1.2154219117874654, + "learning_rate": 4.996364755036954e-06, + "loss": 0.201, "step": 720 }, { - "epoch": 0.5674931129476584, - "grad_norm": 0.9720255939912888, - "learning_rate": 2.1494878192952857e-06, - "loss": 0.1319, + "epoch": 0.06643018381167365, + "grad_norm": 1.0109850083514942, + "learning_rate": 4.996323572887153e-06, + "loss": 0.1882, "step": 721 }, { - "epoch": 0.568280204643841, - "grad_norm": 0.9511775624645177, - "learning_rate": 2.1430409196285268e-06, - "loss": 0.1327, + "epoch": 0.06652231998894366, + "grad_norm": 1.101603698240089, + "learning_rate": 4.996282158953439e-06, + "loss": 0.1845, "step": 722 }, { - "epoch": 0.5690672963400236, - "grad_norm": 0.9008868958605895, - "learning_rate": 2.1365964422360495e-06, - "loss": 0.1257, + "epoch": 0.06661445616621367, + "grad_norm": 1.1990367891628382, + "learning_rate": 4.996240513239658e-06, + "loss": 0.2094, "step": 723 }, { - "epoch": 0.5698543880362062, - "grad_norm": 1.0087528087899673, - "learning_rate": 2.1301544308491755e-06, - "loss": 0.1404, + "epoch": 0.06670659234348367, + "grad_norm": 1.1531297226861148, + "learning_rate": 4.9961986357496775e-06, + "loss": 0.2001, "step": 724 }, { - "epoch": 0.5706414797323889, - "grad_norm": 0.9236247008656706, - "learning_rate": 2.1237149291824906e-06, - "loss": 0.1122, + "epoch": 0.06679872852075368, + "grad_norm": 1.1181593399001286, + "learning_rate": 4.996156526487383e-06, + "loss": 0.1898, "step": 725 }, { - "epoch": 0.5714285714285714, - "grad_norm": 0.9656011748968637, - "learning_rate": 2.11727798093355e-06, - "loss": 0.1238, + "epoch": 0.06689086469802367, + "grad_norm": 1.1830597611536493, + "learning_rate": 4.996114185456688e-06, + "loss": 0.1877, "step": 726 }, { - "epoch": 0.572215663124754, - "grad_norm": 0.9574761410065884, - "learning_rate": 2.110843629782583e-06, - "loss": 0.1205, + "epoch": 0.06698300087529369, + "grad_norm": 1.1232584394160454, + "learning_rate": 4.996071612661523e-06, + "loss": 0.1953, "step": 727 }, { - "epoch": 0.5730027548209367, - "grad_norm": 1.0073901156504852, - "learning_rate": 2.1044119193921935e-06, - "loss": 0.141, + "epoch": 0.06707513705256368, + "grad_norm": 1.0889290193640835, + "learning_rate": 4.996028808105838e-06, + "loss": 0.2108, "step": 728 }, { - "epoch": 0.5737898465171193, - "grad_norm": 1.0135184994615516, - "learning_rate": 2.097982893407068e-06, - "loss": 0.1391, + "epoch": 0.0671672732298337, + "grad_norm": 1.0995182161943409, + "learning_rate": 4.995985771793611e-06, + "loss": 0.1885, "step": 729 }, { - "epoch": 0.5745769382133018, - "grad_norm": 0.9943855979768463, - "learning_rate": 2.0915565954536745e-06, - "loss": 0.1261, + "epoch": 0.0672594094071037, + "grad_norm": 0.991728475400526, + "learning_rate": 4.995942503728837e-06, + "loss": 0.1945, "step": 730 }, { - "epoch": 0.5753640299094844, - "grad_norm": 0.9263403288426786, - "learning_rate": 2.085133069139971e-06, - "loss": 0.1199, + "epoch": 0.0673515455843737, + "grad_norm": 1.1414476841467807, + "learning_rate": 4.995899003915532e-06, + "loss": 0.2073, "step": 731 }, { - "epoch": 0.5761511216056671, - "grad_norm": 0.9250174978891127, - "learning_rate": 2.078712358055106e-06, - "loss": 0.1292, + "epoch": 0.06744368176164371, + "grad_norm": 1.1816046268935128, + "learning_rate": 4.995855272357736e-06, + "loss": 0.2024, "step": 732 }, { - "epoch": 0.5769382133018497, - "grad_norm": 0.9594966083023022, - "learning_rate": 2.0722945057691253e-06, - "loss": 0.13, + "epoch": 0.06753581793891371, + "grad_norm": 1.0793429170100068, + "learning_rate": 4.995811309059509e-06, + "loss": 0.2027, "step": 733 }, { - "epoch": 0.5777253049980323, - "grad_norm": 0.9996158846425939, - "learning_rate": 2.0658795558326745e-06, - "loss": 0.1346, + "epoch": 0.06762795411618372, + "grad_norm": 1.0463184895901732, + "learning_rate": 4.995767114024934e-06, + "loss": 0.1931, "step": 734 }, { - "epoch": 0.5785123966942148, - "grad_norm": 0.9512575411801811, - "learning_rate": 2.059467551776705e-06, - "loss": 0.1277, + "epoch": 0.06772009029345373, + "grad_norm": 1.11156175838415, + "learning_rate": 4.995722687258113e-06, + "loss": 0.2125, "step": 735 }, { - "epoch": 0.5792994883903975, - "grad_norm": 0.9956048932258434, - "learning_rate": 2.053058537112177e-06, - "loss": 0.1364, + "epoch": 0.06781222647072373, + "grad_norm": 1.1620522039851269, + "learning_rate": 4.995678028763172e-06, + "loss": 0.1767, "step": 736 }, { - "epoch": 0.5800865800865801, - "grad_norm": 0.9822563309180476, - "learning_rate": 2.0466525553297666e-06, - "loss": 0.124, + "epoch": 0.06790436264799374, + "grad_norm": 1.1748972006716736, + "learning_rate": 4.995633138544258e-06, + "loss": 0.1896, "step": 737 }, { - "epoch": 0.5808736717827627, - "grad_norm": 0.9983781840041562, - "learning_rate": 2.0402496498995667e-06, - "loss": 0.1347, + "epoch": 0.06799649882526373, + "grad_norm": 1.1248616510945046, + "learning_rate": 4.995588016605539e-06, + "loss": 0.1975, "step": 738 }, { - "epoch": 0.5816607634789452, - "grad_norm": 0.9827604253780587, - "learning_rate": 2.0338498642707977e-06, - "loss": 0.1369, + "epoch": 0.06808863500253375, + "grad_norm": 1.1490092157815965, + "learning_rate": 4.995542662951203e-06, + "loss": 0.2018, "step": 739 }, { - "epoch": 0.5824478551751279, - "grad_norm": 1.0756801683767687, - "learning_rate": 2.027453241871506e-06, - "loss": 0.1323, + "epoch": 0.06818077117980376, + "grad_norm": 1.29389132222856, + "learning_rate": 4.9954970775854626e-06, + "loss": 0.2105, "step": 740 }, { - "epoch": 0.5832349468713105, - "grad_norm": 1.0081971409472221, - "learning_rate": 2.0210598261082764e-06, - "loss": 0.1356, + "epoch": 0.06827290735707375, + "grad_norm": 1.0475002924733825, + "learning_rate": 4.99545126051255e-06, + "loss": 0.1922, "step": 741 }, { - "epoch": 0.5840220385674931, - "grad_norm": 0.9898906943423369, - "learning_rate": 2.014669660365931e-06, - "loss": 0.1368, + "epoch": 0.06836504353434376, + "grad_norm": 1.1500051467045975, + "learning_rate": 4.99540521173672e-06, + "loss": 0.2023, "step": 742 }, { - "epoch": 0.5848091302636758, - "grad_norm": 0.9690524566063999, - "learning_rate": 2.0082827880072393e-06, - "loss": 0.135, + "epoch": 0.06845717971161376, + "grad_norm": 1.15475035543741, + "learning_rate": 4.995358931262246e-06, + "loss": 0.1969, "step": 743 }, { - "epoch": 0.5855962219598583, - "grad_norm": 0.9708412001010785, - "learning_rate": 2.0018992523726217e-06, - "loss": 0.1252, + "epoch": 0.06854931588888377, + "grad_norm": 1.1034626957034204, + "learning_rate": 4.995312419093427e-06, + "loss": 0.2191, "step": 744 }, { - "epoch": 0.5863833136560409, - "grad_norm": 0.9322317029959182, - "learning_rate": 1.995519096779855e-06, - "loss": 0.1205, + "epoch": 0.06864145206615377, + "grad_norm": 1.1053020130496054, + "learning_rate": 4.995265675234583e-06, + "loss": 0.1987, "step": 745 }, { - "epoch": 0.5871704053522235, - "grad_norm": 0.934365579766912, - "learning_rate": 1.9891423645237832e-06, - "loss": 0.1194, + "epoch": 0.06873358824342378, + "grad_norm": 1.1992876381966995, + "learning_rate": 4.995218699690051e-06, + "loss": 0.209, "step": 746 }, { - "epoch": 0.5879574970484062, - "grad_norm": 0.9421279165900748, - "learning_rate": 1.982769098876015e-06, - "loss": 0.1319, + "epoch": 0.06882572442069379, + "grad_norm": 1.082494388880573, + "learning_rate": 4.995171492464195e-06, + "loss": 0.1994, "step": 747 }, { - "epoch": 0.5887445887445888, - "grad_norm": 0.9954570155310445, - "learning_rate": 1.9763993430846394e-06, - "loss": 0.1369, + "epoch": 0.06891786059796379, + "grad_norm": 1.129990460435263, + "learning_rate": 4.995124053561396e-06, + "loss": 0.2027, "step": 748 }, { - "epoch": 0.5895316804407713, - "grad_norm": 0.931791564112829, - "learning_rate": 1.970033140373925e-06, - "loss": 0.1315, + "epoch": 0.0690099967752338, + "grad_norm": 1.0028326159846859, + "learning_rate": 4.9950763829860615e-06, + "loss": 0.1868, "step": 749 }, { - "epoch": 0.5903187721369539, - "grad_norm": 0.9679723780616554, - "learning_rate": 1.9636705339440327e-06, - "loss": 0.1377, + "epoch": 0.0691021329525038, + "grad_norm": 1.0755633729487084, + "learning_rate": 4.995028480742616e-06, + "loss": 0.1966, "step": 750 }, { - "epoch": 0.5911058638331366, - "grad_norm": 0.9863750681505877, - "learning_rate": 1.957311566970716e-06, - "loss": 0.1293, + "epoch": 0.0691942691297738, + "grad_norm": 1.1063468461219594, + "learning_rate": 4.9949803468355075e-06, + "loss": 0.1893, "step": 751 }, { - "epoch": 0.5918929555293192, - "grad_norm": 0.9598202963903522, - "learning_rate": 1.9509562826050353e-06, - "loss": 0.1273, + "epoch": 0.06928640530704382, + "grad_norm": 1.048084914654042, + "learning_rate": 4.994931981269206e-06, + "loss": 0.2112, "step": 752 }, { - "epoch": 0.5926800472255017, - "grad_norm": 0.9800756257622318, - "learning_rate": 1.944604723973058e-06, - "loss": 0.1284, + "epoch": 0.06937854148431381, + "grad_norm": 1.0230459409513677, + "learning_rate": 4.9948833840482e-06, + "loss": 0.1965, "step": 753 }, { - "epoch": 0.5934671389216843, - "grad_norm": 1.0001757874575956, - "learning_rate": 1.938256934175571e-06, - "loss": 0.1303, + "epoch": 0.06947067766158382, + "grad_norm": 1.0632313033802272, + "learning_rate": 4.994834555177004e-06, + "loss": 0.2011, "step": 754 }, { - "epoch": 0.594254230617867, - "grad_norm": 0.9299561635899479, - "learning_rate": 1.9319129562877863e-06, - "loss": 0.1239, + "epoch": 0.06956281383885382, + "grad_norm": 1.0589252280154375, + "learning_rate": 4.994785494660151e-06, + "loss": 0.2018, "step": 755 }, { - "epoch": 0.5950413223140496, - "grad_norm": 1.0345095738407815, - "learning_rate": 1.925572833359048e-06, - "loss": 0.1305, + "epoch": 0.06965495001612383, + "grad_norm": 1.1755938840081506, + "learning_rate": 4.994736202502196e-06, + "loss": 0.2266, "step": 756 }, { - "epoch": 0.5958284140102322, - "grad_norm": 1.0520535233317054, - "learning_rate": 1.9192366084125423e-06, - "loss": 0.1373, + "epoch": 0.06974708619339384, + "grad_norm": 1.0493776400120107, + "learning_rate": 4.994686678707716e-06, + "loss": 0.2197, "step": 757 }, { - "epoch": 0.5966155057064148, - "grad_norm": 1.029018918955376, - "learning_rate": 1.9129043244450027e-06, - "loss": 0.1382, + "epoch": 0.06983922237066384, + "grad_norm": 0.9856328790985965, + "learning_rate": 4.9946369232813104e-06, + "loss": 0.1926, "step": 758 }, { - "epoch": 0.5974025974025974, - "grad_norm": 1.0294584813791954, - "learning_rate": 1.906576024426422e-06, - "loss": 0.1368, + "epoch": 0.06993135854793385, + "grad_norm": 0.9684741371548949, + "learning_rate": 4.994586936227598e-06, + "loss": 0.1856, "step": 759 }, { - "epoch": 0.59818968909878, - "grad_norm": 0.9330122675132353, - "learning_rate": 1.9002517512997555e-06, - "loss": 0.1145, + "epoch": 0.07002349472520385, + "grad_norm": 1.0235610225225114, + "learning_rate": 4.99453671755122e-06, + "loss": 0.1985, "step": 760 }, { - "epoch": 0.5989767807949626, - "grad_norm": 0.9623676868988281, - "learning_rate": 1.8939315479806352e-06, - "loss": 0.1335, + "epoch": 0.07011563090247386, + "grad_norm": 1.0056452861965137, + "learning_rate": 4.994486267256839e-06, + "loss": 0.2043, "step": 761 }, { - "epoch": 0.5997638724911453, - "grad_norm": 0.9245436138689049, - "learning_rate": 1.8876154573570744e-06, - "loss": 0.1307, + "epoch": 0.07020776707974385, + "grad_norm": 0.9480271147994421, + "learning_rate": 4.994435585349139e-06, + "loss": 0.1854, "step": 762 }, { - "epoch": 0.6005509641873278, - "grad_norm": 0.8942520983106202, - "learning_rate": 1.8813035222891785e-06, - "loss": 0.1272, + "epoch": 0.07029990325701387, + "grad_norm": 1.073899838330998, + "learning_rate": 4.994384671832827e-06, + "loss": 0.194, "step": 763 }, { - "epoch": 0.6013380558835104, - "grad_norm": 0.9343347217079488, - "learning_rate": 1.8749957856088546e-06, - "loss": 0.1317, + "epoch": 0.07039203943428388, + "grad_norm": 1.1042378147261156, + "learning_rate": 4.994333526712629e-06, + "loss": 0.2038, "step": 764 }, { - "epoch": 0.602125147579693, - "grad_norm": 0.9393324326245188, - "learning_rate": 1.8686922901195197e-06, - "loss": 0.1313, + "epoch": 0.07048417561155387, + "grad_norm": 1.0929836003453606, + "learning_rate": 4.994282149993296e-06, + "loss": 0.1941, "step": 765 }, { - "epoch": 0.6029122392758757, - "grad_norm": 0.9333264421793994, - "learning_rate": 1.8623930785958092e-06, - "loss": 0.1226, + "epoch": 0.07057631178882388, + "grad_norm": 1.002112688526324, + "learning_rate": 4.994230541679596e-06, + "loss": 0.1952, "step": 766 }, { - "epoch": 0.6036993309720582, - "grad_norm": 0.9718728327996774, - "learning_rate": 1.8560981937832916e-06, - "loss": 0.1314, + "epoch": 0.07066844796609388, + "grad_norm": 1.0405743611774914, + "learning_rate": 4.994178701776322e-06, + "loss": 0.1973, "step": 767 }, { - "epoch": 0.6044864226682408, - "grad_norm": 0.9437466891844623, - "learning_rate": 1.849807678398171e-06, - "loss": 0.1271, + "epoch": 0.07076058414336389, + "grad_norm": 1.0611508046462508, + "learning_rate": 4.994126630288287e-06, + "loss": 0.1998, "step": 768 }, { - "epoch": 0.6052735143644234, - "grad_norm": 0.9433172532376, - "learning_rate": 1.8435215751270048e-06, - "loss": 0.1083, + "epoch": 0.0708527203206339, + "grad_norm": 1.0044887753865614, + "learning_rate": 4.994074327220326e-06, + "loss": 0.1912, "step": 769 }, { - "epoch": 0.6060606060606061, - "grad_norm": 0.9486485907428178, - "learning_rate": 1.8372399266264069e-06, - "loss": 0.1245, + "epoch": 0.0709448564979039, + "grad_norm": 1.0125081975049615, + "learning_rate": 4.994021792577296e-06, + "loss": 0.1996, "step": 770 }, { - "epoch": 0.6068476977567887, - "grad_norm": 0.9345585045873044, - "learning_rate": 1.8309627755227643e-06, - "loss": 0.1205, + "epoch": 0.07103699267517391, + "grad_norm": 1.1171803446967403, + "learning_rate": 4.993969026364074e-06, + "loss": 0.2068, "step": 771 }, { - "epoch": 0.6076347894529712, - "grad_norm": 1.0082946745736912, - "learning_rate": 1.8246901644119447e-06, - "loss": 0.1337, + "epoch": 0.0711291288524439, + "grad_norm": 1.148820539107491, + "learning_rate": 4.993916028585559e-06, + "loss": 0.198, "step": 772 }, { - "epoch": 0.6084218811491539, - "grad_norm": 0.9640602588467792, - "learning_rate": 1.8184221358590078e-06, - "loss": 0.123, + "epoch": 0.07122126502971392, + "grad_norm": 1.0778949515489327, + "learning_rate": 4.993862799246672e-06, + "loss": 0.1984, "step": 773 }, { - "epoch": 0.6092089728453365, - "grad_norm": 1.0256774883323883, - "learning_rate": 1.812158732397917e-06, - "loss": 0.1331, + "epoch": 0.07131340120698393, + "grad_norm": 1.1483231088801946, + "learning_rate": 4.9938093383523565e-06, + "loss": 0.2117, "step": 774 }, { - "epoch": 0.6099960645415191, - "grad_norm": 0.9485492161002549, - "learning_rate": 1.8058999965312484e-06, - "loss": 0.1328, + "epoch": 0.07140553738425393, + "grad_norm": 1.1566007468065098, + "learning_rate": 4.993755645907575e-06, + "loss": 0.2116, "step": 775 }, { - "epoch": 0.6107831562377017, - "grad_norm": 0.9763406590147844, - "learning_rate": 1.799645970729909e-06, - "loss": 0.1309, + "epoch": 0.07149767356152394, + "grad_norm": 1.0319713754984687, + "learning_rate": 4.993701721917314e-06, + "loss": 0.2012, "step": 776 }, { - "epoch": 0.6115702479338843, - "grad_norm": 0.9917227985654803, - "learning_rate": 1.793396697432839e-06, - "loss": 0.1349, + "epoch": 0.07158980973879393, + "grad_norm": 1.077311245060311, + "learning_rate": 4.993647566386579e-06, + "loss": 0.2059, "step": 777 }, { - "epoch": 0.6123573396300669, - "grad_norm": 0.9926597353156553, - "learning_rate": 1.7871522190467327e-06, - "loss": 0.1303, + "epoch": 0.07168194591606394, + "grad_norm": 1.1405163265066733, + "learning_rate": 4.993593179320399e-06, + "loss": 0.1983, "step": 778 }, { - "epoch": 0.6131444313262495, - "grad_norm": 0.9259479446299848, - "learning_rate": 1.7809125779457432e-06, - "loss": 0.1145, + "epoch": 0.07177408209333394, + "grad_norm": 1.0767703872817018, + "learning_rate": 4.993538560723824e-06, + "loss": 0.2079, "step": 779 }, { - "epoch": 0.6139315230224321, - "grad_norm": 0.9291022839595524, - "learning_rate": 1.7746778164712024e-06, - "loss": 0.119, + "epoch": 0.07186621827060395, + "grad_norm": 1.2227974845354599, + "learning_rate": 4.993483710601926e-06, + "loss": 0.2136, "step": 780 }, { - "epoch": 0.6147186147186147, - "grad_norm": 0.951539094911597, - "learning_rate": 1.768447976931326e-06, - "loss": 0.1261, + "epoch": 0.07195835444787396, + "grad_norm": 1.0541963353313086, + "learning_rate": 4.993428628959796e-06, + "loss": 0.1913, "step": 781 }, { - "epoch": 0.6155057064147973, - "grad_norm": 0.993556323836548, - "learning_rate": 1.7622231016009333e-06, - "loss": 0.1297, + "epoch": 0.07205049062514396, + "grad_norm": 1.0513503112741112, + "learning_rate": 4.993373315802551e-06, + "loss": 0.201, "step": 782 }, { - "epoch": 0.6162927981109799, - "grad_norm": 0.9391833540663885, - "learning_rate": 1.7560032327211546e-06, - "loss": 0.124, + "epoch": 0.07214262680241397, + "grad_norm": 1.0201367211302232, + "learning_rate": 4.993317771135324e-06, + "loss": 0.1903, "step": 783 }, { - "epoch": 0.6170798898071626, - "grad_norm": 1.031878772377542, - "learning_rate": 1.7497884124991487e-06, - "loss": 0.1308, + "epoch": 0.07223476297968397, + "grad_norm": 1.1327814545653032, + "learning_rate": 4.993261994963275e-06, + "loss": 0.2212, "step": 784 }, { - "epoch": 0.6178669815033452, - "grad_norm": 0.9834671756142636, - "learning_rate": 1.7435786831078144e-06, - "loss": 0.1303, + "epoch": 0.07232689915695398, + "grad_norm": 1.0740213154007985, + "learning_rate": 4.99320598729158e-06, + "loss": 0.1948, "step": 785 }, { - "epoch": 0.6186540731995277, - "grad_norm": 0.9859388240495401, - "learning_rate": 1.7373740866855043e-06, - "loss": 0.1326, + "epoch": 0.07241903533422399, + "grad_norm": 0.957971255991909, + "learning_rate": 4.993149748125441e-06, + "loss": 0.1928, "step": 786 }, { - "epoch": 0.6194411648957103, - "grad_norm": 1.0156315373671152, - "learning_rate": 1.731174665335742e-06, - "loss": 0.1333, + "epoch": 0.07251117151149399, + "grad_norm": 1.0166766681701447, + "learning_rate": 4.99309327747008e-06, + "loss": 0.1938, "step": 787 }, { - "epoch": 0.620228256591893, - "grad_norm": 0.8457875340285443, - "learning_rate": 1.724980461126929e-06, - "loss": 0.1149, + "epoch": 0.072603307688764, + "grad_norm": 1.0764254895760437, + "learning_rate": 4.99303657533074e-06, + "loss": 0.2087, "step": 788 }, { - "epoch": 0.6210153482880756, - "grad_norm": 0.9812167735229308, - "learning_rate": 1.7187915160920692e-06, - "loss": 0.1341, + "epoch": 0.07269544386603399, + "grad_norm": 1.0328362715853223, + "learning_rate": 4.9929796417126855e-06, + "loss": 0.2004, "step": 789 }, { - "epoch": 0.6218024399842582, - "grad_norm": 0.9479256338770862, - "learning_rate": 1.7126078722284739e-06, - "loss": 0.1171, + "epoch": 0.072787580043304, + "grad_norm": 1.0993910395671507, + "learning_rate": 4.992922476621203e-06, + "loss": 0.1968, "step": 790 }, { - "epoch": 0.6225895316804407, - "grad_norm": 0.9626147311559159, - "learning_rate": 1.706429571497486e-06, - "loss": 0.1195, + "epoch": 0.07287971622057401, + "grad_norm": 1.174010366680361, + "learning_rate": 4.992865080061599e-06, + "loss": 0.2078, "step": 791 }, { - "epoch": 0.6233766233766234, - "grad_norm": 0.996537388600602, - "learning_rate": 1.7002566558241862e-06, - "loss": 0.1347, + "epoch": 0.07297185239784401, + "grad_norm": 1.0829664383846667, + "learning_rate": 4.992807452039206e-06, + "loss": 0.2075, "step": 792 }, { - "epoch": 0.624163715072806, - "grad_norm": 1.0979789094333103, - "learning_rate": 1.694089167097116e-06, - "loss": 0.1442, + "epoch": 0.07306398857511402, + "grad_norm": 1.0362818708953512, + "learning_rate": 4.992749592559372e-06, + "loss": 0.2064, "step": 793 }, { - "epoch": 0.6249508067689886, - "grad_norm": 0.9903394042224888, - "learning_rate": 1.6879271471679887e-06, - "loss": 0.1275, + "epoch": 0.07315612475238402, + "grad_norm": 1.1051992276067801, + "learning_rate": 4.99269150162747e-06, + "loss": 0.2035, "step": 794 }, { - "epoch": 0.6257378984651711, - "grad_norm": 0.9904940377814807, - "learning_rate": 1.681770637851409e-06, - "loss": 0.139, + "epoch": 0.07324826092965403, + "grad_norm": 1.0070264686302375, + "learning_rate": 4.9926331792488935e-06, + "loss": 0.2013, "step": 795 }, { - "epoch": 0.6265249901613538, - "grad_norm": 0.9969630818236452, - "learning_rate": 1.675619680924584e-06, - "loss": 0.1325, + "epoch": 0.07334039710692403, + "grad_norm": 1.0971405705997572, + "learning_rate": 4.992574625429059e-06, + "loss": 0.1991, "step": 796 }, { - "epoch": 0.6273120818575364, - "grad_norm": 1.0558109930918702, - "learning_rate": 1.6694743181270474e-06, - "loss": 0.1448, + "epoch": 0.07343253328419404, + "grad_norm": 1.0608316209147346, + "learning_rate": 4.992515840173401e-06, + "loss": 0.205, "step": 797 }, { - "epoch": 0.628099173553719, - "grad_norm": 0.956496508451797, - "learning_rate": 1.663334591160368e-06, - "loss": 0.1217, + "epoch": 0.07352466946146405, + "grad_norm": 1.0833481979795825, + "learning_rate": 4.992456823487381e-06, + "loss": 0.2123, "step": 798 }, { - "epoch": 0.6288862652499017, - "grad_norm": 0.9677341961932617, - "learning_rate": 1.657200541687874e-06, - "loss": 0.136, + "epoch": 0.07361680563873405, + "grad_norm": 1.130384094246514, + "learning_rate": 4.992397575376474e-06, + "loss": 0.206, "step": 799 }, { - "epoch": 0.6296733569460842, - "grad_norm": 1.0046971327809577, - "learning_rate": 1.6510722113343633e-06, - "loss": 0.1322, + "epoch": 0.07370894181600406, + "grad_norm": 1.032883544230814, + "learning_rate": 4.992338095846185e-06, + "loss": 0.1954, "step": 800 }, { - "epoch": 0.6304604486422668, - "grad_norm": 0.9254874616921521, - "learning_rate": 1.6449496416858285e-06, - "loss": 0.1227, + "epoch": 0.07380107799327405, + "grad_norm": 1.0325683668694534, + "learning_rate": 4.992278384902036e-06, + "loss": 0.1823, "step": 801 }, { - "epoch": 0.6312475403384494, - "grad_norm": 1.128513971443689, - "learning_rate": 1.6388328742891679e-06, - "loss": 0.1357, + "epoch": 0.07389321417054406, + "grad_norm": 1.0856330855658867, + "learning_rate": 4.992218442549571e-06, + "loss": 0.1972, "step": 802 }, { - "epoch": 0.6320346320346321, - "grad_norm": 1.0186779597478501, - "learning_rate": 1.6327219506519082e-06, - "loss": 0.1369, + "epoch": 0.07398535034781407, + "grad_norm": 0.9419401027800425, + "learning_rate": 4.992158268794355e-06, + "loss": 0.174, "step": 803 }, { - "epoch": 0.6328217237308147, - "grad_norm": 0.9605839825909683, - "learning_rate": 1.6266169122419208e-06, - "loss": 0.1222, + "epoch": 0.07407748652508407, + "grad_norm": 0.9955805004494609, + "learning_rate": 4.992097863641975e-06, + "loss": 0.1921, "step": 804 }, { - "epoch": 0.6336088154269972, - "grad_norm": 1.0294396317293524, - "learning_rate": 1.6205178004871392e-06, - "loss": 0.1265, + "epoch": 0.07416962270235408, + "grad_norm": 1.0997153728401974, + "learning_rate": 4.992037227098041e-06, + "loss": 0.2003, "step": 805 }, { - "epoch": 0.6343959071231798, - "grad_norm": 0.9797067002404048, - "learning_rate": 1.6144246567752831e-06, - "loss": 0.1298, + "epoch": 0.07426175887962408, + "grad_norm": 1.1910036898293976, + "learning_rate": 4.991976359168182e-06, + "loss": 0.2154, "step": 806 }, { - "epoch": 0.6351829988193625, - "grad_norm": 0.9531255383226177, - "learning_rate": 1.6083375224535689e-06, - "loss": 0.1204, + "epoch": 0.07435389505689409, + "grad_norm": 1.1056827367049222, + "learning_rate": 4.99191525985805e-06, + "loss": 0.2022, "step": 807 }, { - "epoch": 0.6359700905155451, - "grad_norm": 0.9228494475342526, - "learning_rate": 1.6022564388284391e-06, - "loss": 0.1122, + "epoch": 0.0744460312341641, + "grad_norm": 1.042144172015194, + "learning_rate": 4.991853929173318e-06, + "loss": 0.1988, "step": 808 }, { - "epoch": 0.6367571822117276, - "grad_norm": 0.9854787445128979, - "learning_rate": 1.596181447165273e-06, - "loss": 0.1287, + "epoch": 0.0745381674114341, + "grad_norm": 1.01022506354823, + "learning_rate": 4.99179236711968e-06, + "loss": 0.1876, "step": 809 }, { - "epoch": 0.6375442739079102, - "grad_norm": 0.9205768495534565, - "learning_rate": 1.5901125886881147e-06, - "loss": 0.1206, + "epoch": 0.07463030358870411, + "grad_norm": 1.0524961526541383, + "learning_rate": 4.991730573702852e-06, + "loss": 0.1909, "step": 810 }, { - "epoch": 0.6383313656040929, - "grad_norm": 0.9798975631304712, - "learning_rate": 1.5840499045793845e-06, - "loss": 0.1231, + "epoch": 0.0747224397659741, + "grad_norm": 1.1702706877003066, + "learning_rate": 4.991668548928573e-06, + "loss": 0.2195, "step": 811 }, { - "epoch": 0.6391184573002755, - "grad_norm": 0.9296415144186752, - "learning_rate": 1.5779934359796095e-06, - "loss": 0.1202, + "epoch": 0.07481457594324412, + "grad_norm": 1.0918293776434618, + "learning_rate": 4.991606292802601e-06, + "loss": 0.2199, "step": 812 }, { - "epoch": 0.6399055489964581, - "grad_norm": 0.9814672778856722, - "learning_rate": 1.5719432239871347e-06, - "loss": 0.1211, + "epoch": 0.07490671212051413, + "grad_norm": 1.0721208493063734, + "learning_rate": 4.991543805330716e-06, + "loss": 0.2144, "step": 813 }, { - "epoch": 0.6406926406926406, - "grad_norm": 0.9650667010737961, - "learning_rate": 1.5658993096578512e-06, - "loss": 0.123, + "epoch": 0.07499884829778412, + "grad_norm": 1.0564964966450854, + "learning_rate": 4.991481086518721e-06, + "loss": 0.1924, "step": 814 }, { - "epoch": 0.6414797323888233, - "grad_norm": 0.9606428490346777, - "learning_rate": 1.5598617340049145e-06, - "loss": 0.1196, + "epoch": 0.07509098447505413, + "grad_norm": 1.028522715058719, + "learning_rate": 4.9914181363724394e-06, + "loss": 0.1979, "step": 815 }, { - "epoch": 0.6422668240850059, - "grad_norm": 0.9865842262641049, - "learning_rate": 1.5538305379984661e-06, - "loss": 0.1414, + "epoch": 0.07518312065232413, + "grad_norm": 1.0724064172531385, + "learning_rate": 4.991354954897715e-06, + "loss": 0.2131, "step": 816 }, { - "epoch": 0.6430539157811885, - "grad_norm": 0.9734870716677574, - "learning_rate": 1.547805762565358e-06, - "loss": 0.1286, + "epoch": 0.07527525682959414, + "grad_norm": 1.0614207713972115, + "learning_rate": 4.991291542100416e-06, + "loss": 0.207, "step": 817 }, { - "epoch": 0.6438410074773712, - "grad_norm": 0.9439567875437019, - "learning_rate": 1.5417874485888706e-06, - "loss": 0.1109, + "epoch": 0.07536739300686414, + "grad_norm": 1.0449337855654208, + "learning_rate": 4.991227897986428e-06, + "loss": 0.1917, "step": 818 }, { - "epoch": 0.6446280991735537, - "grad_norm": 1.0235998100882107, - "learning_rate": 1.5357756369084398e-06, - "loss": 0.123, + "epoch": 0.07545952918413415, + "grad_norm": 1.1357051354151935, + "learning_rate": 4.991164022561662e-06, + "loss": 0.2016, "step": 819 }, { - "epoch": 0.6454151908697363, - "grad_norm": 1.0269524388931728, - "learning_rate": 1.5297703683193755e-06, - "loss": 0.1324, + "epoch": 0.07555166536140416, + "grad_norm": 0.9585027095704082, + "learning_rate": 4.991099915832048e-06, + "loss": 0.1846, "step": 820 }, { - "epoch": 0.6462022825659189, - "grad_norm": 0.9493642789135233, - "learning_rate": 1.5237716835725907e-06, - "loss": 0.1125, + "epoch": 0.07564380153867416, + "grad_norm": 1.1083315681597967, + "learning_rate": 4.9910355778035394e-06, + "loss": 0.2127, "step": 821 }, { - "epoch": 0.6469893742621016, - "grad_norm": 1.162601877497598, - "learning_rate": 1.5177796233743174e-06, - "loss": 0.1249, + "epoch": 0.07573593771594417, + "grad_norm": 1.1383747660532335, + "learning_rate": 4.990971008482109e-06, + "loss": 0.1968, "step": 822 }, { - "epoch": 0.6477764659582841, - "grad_norm": 1.034038388079516, - "learning_rate": 1.511794228385837e-06, - "loss": 0.1217, + "epoch": 0.07582807389321417, + "grad_norm": 1.104749280816824, + "learning_rate": 4.990906207873753e-06, + "loss": 0.1904, "step": 823 }, { - "epoch": 0.6485635576544667, - "grad_norm": 0.9672639615152381, - "learning_rate": 1.5058155392232004e-06, - "loss": 0.1208, + "epoch": 0.07592021007048418, + "grad_norm": 1.0561974215226457, + "learning_rate": 4.990841175984486e-06, + "loss": 0.2, "step": 824 }, { - "epoch": 0.6493506493506493, - "grad_norm": 1.080256521732267, - "learning_rate": 1.4998435964569552e-06, - "loss": 0.1279, + "epoch": 0.07601234624775419, + "grad_norm": 1.0340894305919612, + "learning_rate": 4.9907759128203485e-06, + "loss": 0.1958, "step": 825 }, { - "epoch": 0.650137741046832, - "grad_norm": 0.9417495839242918, - "learning_rate": 1.4938784406118663e-06, - "loss": 0.1249, + "epoch": 0.07610448242502418, + "grad_norm": 1.020276399091769, + "learning_rate": 4.9907104183874e-06, + "loss": 0.1939, "step": 826 }, { - "epoch": 0.6509248327430146, - "grad_norm": 1.006350286001005, - "learning_rate": 1.4879201121666466e-06, - "loss": 0.1251, + "epoch": 0.0761966186022942, + "grad_norm": 1.074585805099351, + "learning_rate": 4.990644692691721e-06, + "loss": 0.1983, "step": 827 }, { - "epoch": 0.6517119244391971, - "grad_norm": 0.968507626389286, - "learning_rate": 1.4819686515536763e-06, - "loss": 0.1203, + "epoch": 0.07628875477956419, + "grad_norm": 0.9840992754978488, + "learning_rate": 4.990578735739413e-06, + "loss": 0.1936, "step": 828 }, { - "epoch": 0.6524990161353798, - "grad_norm": 0.979256644659201, - "learning_rate": 1.4760240991587338e-06, - "loss": 0.1309, + "epoch": 0.0763808909568342, + "grad_norm": 1.0576207176961585, + "learning_rate": 4.990512547536602e-06, + "loss": 0.196, "step": 829 }, { - "epoch": 0.6532861078315624, - "grad_norm": 1.041802414674734, - "learning_rate": 1.4700864953207192e-06, - "loss": 0.124, + "epoch": 0.07647302713410421, + "grad_norm": 1.054963331859563, + "learning_rate": 4.990446128089434e-06, + "loss": 0.1939, "step": 830 }, { - "epoch": 0.654073199527745, - "grad_norm": 0.9623673717149763, - "learning_rate": 1.4641558803313783e-06, - "loss": 0.1153, + "epoch": 0.07656516331137421, + "grad_norm": 1.0430434986538253, + "learning_rate": 4.990379477404073e-06, + "loss": 0.1973, "step": 831 }, { - "epoch": 0.6548602912239276, - "grad_norm": 1.0049463554640272, - "learning_rate": 1.4582322944350335e-06, - "loss": 0.123, + "epoch": 0.07665729948864422, + "grad_norm": 1.0974937731254035, + "learning_rate": 4.9903125954867114e-06, + "loss": 0.2066, "step": 832 }, { - "epoch": 0.6556473829201102, - "grad_norm": 0.9822560730942449, - "learning_rate": 1.4523157778283082e-06, - "loss": 0.1253, + "epoch": 0.07674943566591422, + "grad_norm": 1.0481385062164033, + "learning_rate": 4.990245482343556e-06, + "loss": 0.1749, "step": 833 }, { - "epoch": 0.6564344746162928, - "grad_norm": 1.0300014906979744, - "learning_rate": 1.4464063706598563e-06, - "loss": 0.121, + "epoch": 0.07684157184318423, + "grad_norm": 1.1591353967604399, + "learning_rate": 4.990178137980841e-06, + "loss": 0.2042, "step": 834 }, { - "epoch": 0.6572215663124754, - "grad_norm": 0.9605069437184749, - "learning_rate": 1.440504113030084e-06, - "loss": 0.1303, + "epoch": 0.07693370802045423, + "grad_norm": 1.020946978751914, + "learning_rate": 4.990110562404817e-06, + "loss": 0.1887, "step": 835 }, { - "epoch": 0.658008658008658, - "grad_norm": 1.0062748427154549, - "learning_rate": 1.4346090449908862e-06, - "loss": 0.1254, + "epoch": 0.07702584419772424, + "grad_norm": 1.0521526449822267, + "learning_rate": 4.990042755621759e-06, + "loss": 0.1925, "step": 836 }, { - "epoch": 0.6587957497048406, - "grad_norm": 1.003505120930448, - "learning_rate": 1.4287212065453681e-06, - "loss": 0.1293, + "epoch": 0.07711798037499425, + "grad_norm": 1.08541448464305, + "learning_rate": 4.989974717637963e-06, + "loss": 0.1917, "step": 837 }, { - "epoch": 0.6595828414010232, - "grad_norm": 0.9215109848797975, - "learning_rate": 1.4228406376475741e-06, - "loss": 0.1156, + "epoch": 0.07721011655226424, + "grad_norm": 1.044048482126402, + "learning_rate": 4.989906448459748e-06, + "loss": 0.2083, "step": 838 }, { - "epoch": 0.6603699330972058, - "grad_norm": 1.0375359512611602, - "learning_rate": 1.4169673782022232e-06, - "loss": 0.1251, + "epoch": 0.07730225272953425, + "grad_norm": 1.0120160811235484, + "learning_rate": 4.98983794809345e-06, + "loss": 0.1941, "step": 839 }, { - "epoch": 0.6611570247933884, - "grad_norm": 1.0075633482471045, - "learning_rate": 1.411101468064429e-06, - "loss": 0.1273, + "epoch": 0.07739438890680425, + "grad_norm": 1.102690087406534, + "learning_rate": 4.989769216545431e-06, + "loss": 0.1952, "step": 840 }, { - "epoch": 0.6619441164895711, - "grad_norm": 1.0079245494150497, - "learning_rate": 1.4052429470394353e-06, - "loss": 0.1302, + "epoch": 0.07748652508407426, + "grad_norm": 1.0589664371165017, + "learning_rate": 4.9897002538220715e-06, + "loss": 0.1889, "step": 841 }, { - "epoch": 0.6627312081857536, - "grad_norm": 0.9589739631373009, - "learning_rate": 1.3993918548823453e-06, - "loss": 0.1219, + "epoch": 0.07757866126134427, + "grad_norm": 1.0781668139280136, + "learning_rate": 4.989631059929777e-06, + "loss": 0.2183, "step": 842 }, { - "epoch": 0.6635182998819362, - "grad_norm": 0.9854619269672102, - "learning_rate": 1.3935482312978494e-06, - "loss": 0.1264, + "epoch": 0.07767079743861427, + "grad_norm": 1.0272497094730775, + "learning_rate": 4.989561634874969e-06, + "loss": 0.1995, "step": 843 }, { - "epoch": 0.6643053915781189, - "grad_norm": 1.0139593156707545, - "learning_rate": 1.3877121159399587e-06, - "loss": 0.1352, + "epoch": 0.07776293361588428, + "grad_norm": 1.1132914451731781, + "learning_rate": 4.9894919786640964e-06, + "loss": 0.2053, "step": 844 }, { - "epoch": 0.6650924832743015, - "grad_norm": 0.9879913850797528, - "learning_rate": 1.381883548411735e-06, - "loss": 0.1252, + "epoch": 0.07785506979315428, + "grad_norm": 1.1159855286710003, + "learning_rate": 4.989422091303625e-06, + "loss": 0.1962, "step": 845 }, { - "epoch": 0.6658795749704841, - "grad_norm": 0.9828821822604814, - "learning_rate": 1.376062568265018e-06, - "loss": 0.1262, + "epoch": 0.07794720597042429, + "grad_norm": 1.164205836975356, + "learning_rate": 4.989351972800045e-06, + "loss": 0.2039, "step": 846 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.9902383754663022, - "learning_rate": 1.370249215000166e-06, - "loss": 0.1339, + "epoch": 0.0780393421476943, + "grad_norm": 1.0153737620570222, + "learning_rate": 4.989281623159866e-06, + "loss": 0.205, "step": 847 }, { - "epoch": 0.6674537583628493, - "grad_norm": 1.0169925787410046, - "learning_rate": 1.3644435280657765e-06, - "loss": 0.1325, + "epoch": 0.0781314783249643, + "grad_norm": 1.1936452816832441, + "learning_rate": 4.98921104238962e-06, + "loss": 0.2094, "step": 848 }, { - "epoch": 0.6682408500590319, - "grad_norm": 0.9802382914836032, - "learning_rate": 1.3586455468584292e-06, - "loss": 0.1294, + "epoch": 0.0782236145022343, + "grad_norm": 1.0872486263629397, + "learning_rate": 4.989140230495862e-06, + "loss": 0.2017, "step": 849 }, { - "epoch": 0.6690279417552145, - "grad_norm": 0.9103087080426163, - "learning_rate": 1.3528553107224108e-06, - "loss": 0.1132, + "epoch": 0.0783157506795043, + "grad_norm": 1.0430936509820625, + "learning_rate": 4.989069187485165e-06, + "loss": 0.1959, "step": 850 }, { - "epoch": 0.669815033451397, - "grad_norm": 1.0322697690605673, - "learning_rate": 1.347072858949453e-06, - "loss": 0.1326, + "epoch": 0.07840788685677431, + "grad_norm": 1.043073581421324, + "learning_rate": 4.988997913364126e-06, + "loss": 0.1943, "step": 851 }, { - "epoch": 0.6706021251475797, - "grad_norm": 0.940497609406273, - "learning_rate": 1.3412982307784617e-06, - "loss": 0.1142, + "epoch": 0.07850002303404431, + "grad_norm": 1.142893762388807, + "learning_rate": 4.988926408139363e-06, + "loss": 0.2164, "step": 852 }, { - "epoch": 0.6713892168437623, - "grad_norm": 0.9651333506256994, - "learning_rate": 1.3355314653952555e-06, - "loss": 0.12, + "epoch": 0.07859215921131432, + "grad_norm": 1.1122064536056118, + "learning_rate": 4.988854671817516e-06, + "loss": 0.2032, "step": 853 }, { - "epoch": 0.6721763085399449, - "grad_norm": 0.8974492403550183, - "learning_rate": 1.3297726019322948e-06, - "loss": 0.1252, + "epoch": 0.07868429538858433, + "grad_norm": 1.0929495216494962, + "learning_rate": 4.988782704405244e-06, + "loss": 0.1949, "step": 854 }, { - "epoch": 0.6729634002361276, - "grad_norm": 0.9779192150286001, - "learning_rate": 1.3240216794684212e-06, - "loss": 0.1265, + "epoch": 0.07877643156585433, + "grad_norm": 1.2464150623403418, + "learning_rate": 4.98871050590923e-06, + "loss": 0.2328, "step": 855 }, { - "epoch": 0.6737504919323101, - "grad_norm": 1.0060169889058102, - "learning_rate": 1.3182787370285865e-06, - "loss": 0.1305, + "epoch": 0.07886856774312434, + "grad_norm": 1.0842337596440934, + "learning_rate": 4.988638076336178e-06, + "loss": 0.2011, "step": 856 }, { - "epoch": 0.6745375836284927, - "grad_norm": 0.9623311050243877, - "learning_rate": 1.3125438135835955e-06, - "loss": 0.114, + "epoch": 0.07896070392039434, + "grad_norm": 1.0893100625521832, + "learning_rate": 4.988565415692812e-06, + "loss": 0.2067, "step": 857 }, { - "epoch": 0.6753246753246753, - "grad_norm": 1.005880860747008, - "learning_rate": 1.3068169480498333e-06, - "loss": 0.1237, + "epoch": 0.07905284009766435, + "grad_norm": 0.9557182611203138, + "learning_rate": 4.988492523985881e-06, + "loss": 0.1788, "step": 858 }, { - "epoch": 0.676111767020858, - "grad_norm": 1.0295442665880505, - "learning_rate": 1.3010981792890053e-06, - "loss": 0.141, + "epoch": 0.07914497627493436, + "grad_norm": 1.0570627168797462, + "learning_rate": 4.9884194012221496e-06, + "loss": 0.2019, "step": 859 }, { - "epoch": 0.6768988587170406, - "grad_norm": 0.9746775819035803, - "learning_rate": 1.2953875461078777e-06, - "loss": 0.1174, + "epoch": 0.07923711245220436, + "grad_norm": 1.0179565473648933, + "learning_rate": 4.98834604740841e-06, + "loss": 0.2019, "step": 860 }, { - "epoch": 0.6776859504132231, - "grad_norm": 0.9651023742880912, - "learning_rate": 1.289685087258004e-06, - "loss": 0.1179, + "epoch": 0.07932924862947437, + "grad_norm": 1.0519805334151477, + "learning_rate": 4.988272462551471e-06, + "loss": 0.204, "step": 861 }, { - "epoch": 0.6784730421094057, - "grad_norm": 0.9778504990448126, - "learning_rate": 1.283990841435473e-06, - "loss": 0.1232, + "epoch": 0.07942138480674436, + "grad_norm": 0.9369015774026637, + "learning_rate": 4.988198646658167e-06, + "loss": 0.1809, "step": 862 }, { - "epoch": 0.6792601338055884, - "grad_norm": 0.9823411560425596, - "learning_rate": 1.2783048472806364e-06, - "loss": 0.1214, + "epoch": 0.07951352098401437, + "grad_norm": 1.1315810175239667, + "learning_rate": 4.988124599735351e-06, + "loss": 0.2129, "step": 863 }, { - "epoch": 0.680047225501771, - "grad_norm": 0.9509119170509043, - "learning_rate": 1.2726271433778559e-06, - "loss": 0.1331, + "epoch": 0.07960565716128438, + "grad_norm": 1.073675633216396, + "learning_rate": 4.988050321789898e-06, + "loss": 0.1985, "step": 864 }, { - "epoch": 0.6808343171979535, - "grad_norm": 0.9637465369074552, - "learning_rate": 1.266957768255232e-06, - "loss": 0.1221, + "epoch": 0.07969779333855438, + "grad_norm": 0.9730213494505685, + "learning_rate": 4.987975812828704e-06, + "loss": 0.1924, "step": 865 }, { - "epoch": 0.6816214088941361, - "grad_norm": 1.0309739334485784, - "learning_rate": 1.2612967603843512e-06, - "loss": 0.1337, + "epoch": 0.07978992951582439, + "grad_norm": 1.0859711498805535, + "learning_rate": 4.987901072858689e-06, + "loss": 0.1946, "step": 866 }, { - "epoch": 0.6824085005903188, - "grad_norm": 0.9227141127754309, - "learning_rate": 1.2556441581800182e-06, - "loss": 0.1118, + "epoch": 0.07988206569309439, + "grad_norm": 1.10664046990388, + "learning_rate": 4.9878261018867915e-06, + "loss": 0.2062, "step": 867 }, { - "epoch": 0.6831955922865014, - "grad_norm": 0.983027599423059, - "learning_rate": 1.2500000000000007e-06, - "loss": 0.1201, + "epoch": 0.0799742018703644, + "grad_norm": 1.1293880063200286, + "learning_rate": 4.9877508999199724e-06, + "loss": 0.215, "step": 868 }, { - "epoch": 0.683982683982684, - "grad_norm": 0.969869074022873, - "learning_rate": 1.2443643241447629e-06, - "loss": 0.1205, + "epoch": 0.0800663380476344, + "grad_norm": 1.0544035282612745, + "learning_rate": 4.987675466965215e-06, + "loss": 0.1872, "step": 869 }, { - "epoch": 0.6847697756788665, - "grad_norm": 0.9626068462653994, - "learning_rate": 1.2387371688572133e-06, - "loss": 0.1294, + "epoch": 0.08015847422490441, + "grad_norm": 1.081822317114401, + "learning_rate": 4.987599803029522e-06, + "loss": 0.2051, "step": 870 }, { - "epoch": 0.6855568673750492, - "grad_norm": 0.9924688128052054, - "learning_rate": 1.233118572322437e-06, - "loss": 0.1193, + "epoch": 0.08025061040217442, + "grad_norm": 1.0983981922980988, + "learning_rate": 4.98752390811992e-06, + "loss": 0.1988, "step": 871 }, { - "epoch": 0.6863439590712318, - "grad_norm": 0.9409212105627156, - "learning_rate": 1.2275085726674442e-06, - "loss": 0.1186, + "epoch": 0.08034274657944442, + "grad_norm": 1.0687075549507865, + "learning_rate": 4.987447782243456e-06, + "loss": 0.2103, "step": 872 }, { - "epoch": 0.6871310507674144, - "grad_norm": 0.9321864217317675, - "learning_rate": 1.2219072079609046e-06, - "loss": 0.118, + "epoch": 0.08043488275671443, + "grad_norm": 1.129069762940162, + "learning_rate": 4.9873714254071966e-06, + "loss": 0.2106, "step": 873 }, { - "epoch": 0.6879181424635971, - "grad_norm": 0.8802354237634122, - "learning_rate": 1.2163145162128948e-06, - "loss": 0.1092, + "epoch": 0.08052701893398442, + "grad_norm": 1.0695215954521389, + "learning_rate": 4.987294837618233e-06, + "loss": 0.2056, "step": 874 }, { - "epoch": 0.6887052341597796, - "grad_norm": 0.9820858832906886, - "learning_rate": 1.2107305353746376e-06, - "loss": 0.1261, + "epoch": 0.08061915511125443, + "grad_norm": 1.006571818809714, + "learning_rate": 4.987218018883676e-06, + "loss": 0.2022, "step": 875 }, { - "epoch": 0.6894923258559622, - "grad_norm": 1.0214787998802317, - "learning_rate": 1.2051553033382426e-06, - "loss": 0.121, + "epoch": 0.08071129128852444, + "grad_norm": 1.1207288976979883, + "learning_rate": 4.987140969210659e-06, + "loss": 0.2053, "step": 876 }, { - "epoch": 0.6902794175521448, - "grad_norm": 0.9157258726824631, - "learning_rate": 1.1995888579364551e-06, - "loss": 0.1189, + "epoch": 0.08080342746579444, + "grad_norm": 1.1091698949899884, + "learning_rate": 4.987063688606335e-06, + "loss": 0.2054, "step": 877 }, { - "epoch": 0.6910665092483275, - "grad_norm": 0.9531462191249618, - "learning_rate": 1.1940312369423919e-06, - "loss": 0.1184, + "epoch": 0.08089556364306445, + "grad_norm": 0.9936982863486912, + "learning_rate": 4.98698617707788e-06, + "loss": 0.1949, "step": 878 }, { - "epoch": 0.69185360094451, - "grad_norm": 0.9748879770068989, - "learning_rate": 1.18848247806929e-06, - "loss": 0.1201, + "epoch": 0.08098769982033445, + "grad_norm": 1.0218531560234463, + "learning_rate": 4.98690843463249e-06, + "loss": 0.183, "step": 879 }, { - "epoch": 0.6926406926406926, - "grad_norm": 0.9952760658770881, - "learning_rate": 1.1829426189702487e-06, - "loss": 0.1211, + "epoch": 0.08107983599760446, + "grad_norm": 1.0482785566667265, + "learning_rate": 4.986830461277384e-06, + "loss": 0.1962, "step": 880 }, { - "epoch": 0.6934277843368752, - "grad_norm": 0.9561514586133496, - "learning_rate": 1.177411697237977e-06, - "loss": 0.1208, + "epoch": 0.08117197217487447, + "grad_norm": 1.0487002736259132, + "learning_rate": 4.986752257019804e-06, + "loss": 0.2063, "step": 881 }, { - "epoch": 0.6942148760330579, - "grad_norm": 1.0289787958991654, - "learning_rate": 1.1718897504045328e-06, - "loss": 0.1329, + "epoch": 0.08126410835214447, + "grad_norm": 1.0225704173071417, + "learning_rate": 4.9866738218670075e-06, + "loss": 0.1873, "step": 882 }, { - "epoch": 0.6950019677292405, - "grad_norm": 1.012367533381528, - "learning_rate": 1.1663768159410748e-06, - "loss": 0.1286, + "epoch": 0.08135624452941448, + "grad_norm": 0.9922761716304604, + "learning_rate": 4.986595155826279e-06, + "loss": 0.1932, "step": 883 }, { - "epoch": 0.695789059425423, - "grad_norm": 0.9932326189371155, - "learning_rate": 1.160872931257602e-06, - "loss": 0.1207, + "epoch": 0.08144838070668448, + "grad_norm": 1.1203291309724843, + "learning_rate": 4.986516258904923e-06, + "loss": 0.2085, "step": 884 }, { - "epoch": 0.6965761511216056, - "grad_norm": 0.9375475650331836, - "learning_rate": 1.1553781337027061e-06, - "loss": 0.1162, + "epoch": 0.08154051688395449, + "grad_norm": 1.0137824728220541, + "learning_rate": 4.986437131110265e-06, + "loss": 0.1957, "step": 885 }, { - "epoch": 0.6973632428177883, - "grad_norm": 1.0035582921316957, - "learning_rate": 1.149892460563311e-06, - "loss": 0.1272, + "epoch": 0.08163265306122448, + "grad_norm": 1.078763682867198, + "learning_rate": 4.986357772449652e-06, + "loss": 0.2051, "step": 886 }, { - "epoch": 0.6981503345139709, - "grad_norm": 0.969216495536807, - "learning_rate": 1.1444159490644278e-06, - "loss": 0.1322, + "epoch": 0.0817247892384945, + "grad_norm": 1.1459363420840565, + "learning_rate": 4.986278182930452e-06, + "loss": 0.2071, "step": 887 }, { - "epoch": 0.6989374262101535, - "grad_norm": 0.9727140149487835, - "learning_rate": 1.1389486363688935e-06, - "loss": 0.1109, + "epoch": 0.0818169254157645, + "grad_norm": 1.0489858592922743, + "learning_rate": 4.986198362560055e-06, + "loss": 0.2049, "step": 888 }, { - "epoch": 0.699724517906336, - "grad_norm": 1.035921852021017, - "learning_rate": 1.1334905595771274e-06, - "loss": 0.125, + "epoch": 0.0819090615930345, + "grad_norm": 1.1867738602471791, + "learning_rate": 4.986118311345873e-06, + "loss": 0.1922, "step": 889 }, { - "epoch": 0.7005116096025187, - "grad_norm": 0.9817389469807767, - "learning_rate": 1.1280417557268735e-06, - "loss": 0.1263, + "epoch": 0.08200119777030451, + "grad_norm": 1.0816042554701828, + "learning_rate": 4.9860380292953375e-06, + "loss": 0.1961, "step": 890 }, { - "epoch": 0.7012987012987013, - "grad_norm": 0.941993125359632, - "learning_rate": 1.12260226179295e-06, - "loss": 0.1204, + "epoch": 0.08209333394757451, + "grad_norm": 1.0776867351148292, + "learning_rate": 4.985957516415903e-06, + "loss": 0.2077, "step": 891 }, { - "epoch": 0.7020857929948839, - "grad_norm": 0.9776393790876531, - "learning_rate": 1.1171721146870015e-06, - "loss": 0.1351, + "epoch": 0.08218547012484452, + "grad_norm": 1.0266838629289732, + "learning_rate": 4.985876772715047e-06, + "loss": 0.1845, "step": 892 }, { - "epoch": 0.7028728846910665, - "grad_norm": 1.0272253940679958, - "learning_rate": 1.1117513512572436e-06, - "loss": 0.1297, + "epoch": 0.08227760630211453, + "grad_norm": 1.0714186396487406, + "learning_rate": 4.985795798200265e-06, + "loss": 0.1991, "step": 893 }, { - "epoch": 0.7036599763872491, - "grad_norm": 0.9825257474446853, - "learning_rate": 1.1063400082882188e-06, - "loss": 0.1089, + "epoch": 0.08236974247938453, + "grad_norm": 1.1565923591963108, + "learning_rate": 4.9857145928790745e-06, + "loss": 0.2053, "step": 894 }, { - "epoch": 0.7044470680834317, - "grad_norm": 0.9941185616779367, - "learning_rate": 1.10093812250054e-06, - "loss": 0.1182, + "epoch": 0.08246187865665454, + "grad_norm": 1.0961658661114462, + "learning_rate": 4.9856331567590175e-06, + "loss": 0.2075, "step": 895 }, { - "epoch": 0.7052341597796143, - "grad_norm": 0.9809962090348159, - "learning_rate": 1.095545730550649e-06, - "loss": 0.1221, + "epoch": 0.08255401483392454, + "grad_norm": 0.9307410162608049, + "learning_rate": 4.985551489847654e-06, + "loss": 0.1784, "step": 896 }, { - "epoch": 0.706021251475797, - "grad_norm": 0.9359419759382669, - "learning_rate": 1.0901628690305593e-06, - "loss": 0.1175, + "epoch": 0.08264615101119455, + "grad_norm": 1.0401378502092977, + "learning_rate": 4.985469592152567e-06, + "loss": 0.1867, "step": 897 }, { - "epoch": 0.7068083431719795, - "grad_norm": 1.0254153011332428, - "learning_rate": 1.0847895744676173e-06, - "loss": 0.1364, + "epoch": 0.08273828718846456, + "grad_norm": 1.0339494204766255, + "learning_rate": 4.985387463681361e-06, + "loss": 0.1986, "step": 898 }, { - "epoch": 0.7075954348681621, - "grad_norm": 1.0451822058149052, - "learning_rate": 1.0794258833242452e-06, - "loss": 0.1341, + "epoch": 0.08283042336573455, + "grad_norm": 1.1980946192748525, + "learning_rate": 4.985305104441661e-06, + "loss": 0.2064, "step": 899 }, { - "epoch": 0.7083825265643447, - "grad_norm": 1.0267091614696302, - "learning_rate": 1.0740718319976992e-06, - "loss": 0.1284, + "epoch": 0.08292255954300456, + "grad_norm": 1.0323921168593682, + "learning_rate": 4.9852225144411156e-06, + "loss": 0.2084, "step": 900 }, { - "epoch": 0.7091696182605274, - "grad_norm": 0.8928053655240218, - "learning_rate": 1.0687274568198208e-06, - "loss": 0.1009, + "epoch": 0.08301469572027456, + "grad_norm": 1.0136930453828628, + "learning_rate": 4.985139693687392e-06, + "loss": 0.1888, "step": 901 }, { - "epoch": 0.70995670995671, - "grad_norm": 1.057234091878292, - "learning_rate": 1.063392794056792e-06, - "loss": 0.1346, + "epoch": 0.08310683189754457, + "grad_norm": 0.9971484787535447, + "learning_rate": 4.985056642188179e-06, + "loss": 0.2017, "step": 902 }, { - "epoch": 0.7107438016528925, - "grad_norm": 0.9612239375437197, - "learning_rate": 1.0580678799088847e-06, - "loss": 0.1158, + "epoch": 0.08319896807481457, + "grad_norm": 0.9555914001671217, + "learning_rate": 4.984973359951192e-06, + "loss": 0.1815, "step": 903 }, { - "epoch": 0.7115308933490752, - "grad_norm": 0.9876590104136502, - "learning_rate": 1.0527527505102213e-06, - "loss": 0.1193, + "epoch": 0.08329110425208458, + "grad_norm": 1.0364717190048256, + "learning_rate": 4.984889846984159e-06, + "loss": 0.188, "step": 904 }, { - "epoch": 0.7123179850452578, - "grad_norm": 1.0155629806285287, - "learning_rate": 1.0474474419285255e-06, - "loss": 0.1206, + "epoch": 0.08338324042935459, + "grad_norm": 1.005245975401244, + "learning_rate": 4.984806103294837e-06, + "loss": 0.1874, "step": 905 }, { - "epoch": 0.7131050767414404, - "grad_norm": 1.108914897353474, - "learning_rate": 1.0421519901648759e-06, - "loss": 0.1244, + "epoch": 0.08347537660662459, + "grad_norm": 1.0742626967198226, + "learning_rate": 4.9847221288910004e-06, + "loss": 0.2091, "step": 906 }, { - "epoch": 0.7138921684376229, - "grad_norm": 0.9624208122062576, - "learning_rate": 1.0368664311534674e-06, - "loss": 0.122, + "epoch": 0.0835675127838946, + "grad_norm": 1.0572271589829996, + "learning_rate": 4.984637923780448e-06, + "loss": 0.1896, "step": 907 }, { - "epoch": 0.7146792601338056, - "grad_norm": 0.9051835119610858, - "learning_rate": 1.031590800761361e-06, - "loss": 0.1115, + "epoch": 0.0836596489611646, + "grad_norm": 1.072817626504438, + "learning_rate": 4.984553487970995e-06, + "loss": 0.2027, "step": 908 }, { - "epoch": 0.7154663518299882, - "grad_norm": 1.001641822545354, - "learning_rate": 1.0263251347882467e-06, - "loss": 0.1205, + "epoch": 0.0837517851384346, + "grad_norm": 1.1697331427088027, + "learning_rate": 4.984468821470485e-06, + "loss": 0.2009, "step": 909 }, { - "epoch": 0.7162534435261708, - "grad_norm": 0.9587134043689033, - "learning_rate": 1.021069468966194e-06, - "loss": 0.114, + "epoch": 0.08384392131570462, + "grad_norm": 1.099615722951779, + "learning_rate": 4.984383924286776e-06, + "loss": 0.2073, "step": 910 }, { - "epoch": 0.7170405352223534, - "grad_norm": 0.9734138315261187, - "learning_rate": 1.0158238389594164e-06, - "loss": 0.1237, + "epoch": 0.08393605749297461, + "grad_norm": 1.1067206700378336, + "learning_rate": 4.984298796427754e-06, + "loss": 0.2053, "step": 911 }, { - "epoch": 0.717827626918536, - "grad_norm": 0.9654730718585164, - "learning_rate": 1.0105882803640215e-06, - "loss": 0.1241, + "epoch": 0.08402819367024462, + "grad_norm": 1.1386230456096778, + "learning_rate": 4.984213437901321e-06, + "loss": 0.2053, "step": 912 }, { - "epoch": 0.7186147186147186, - "grad_norm": 1.0069324283880368, - "learning_rate": 1.0053628287077782e-06, - "loss": 0.129, + "epoch": 0.08412032984751462, + "grad_norm": 1.051467856985353, + "learning_rate": 4.984127848715402e-06, + "loss": 0.2002, "step": 913 }, { - "epoch": 0.7194018103109012, - "grad_norm": 0.9724112904663149, - "learning_rate": 1.000147519449867e-06, - "loss": 0.1217, + "epoch": 0.08421246602478463, + "grad_norm": 1.0159847224874938, + "learning_rate": 4.984042028877945e-06, + "loss": 0.1739, "step": 914 }, { - "epoch": 0.7201889020070839, - "grad_norm": 0.9009157337976961, - "learning_rate": 9.94942387980648e-07, - "loss": 0.1215, + "epoch": 0.08430460220205464, + "grad_norm": 1.1102387510829792, + "learning_rate": 4.983955978396919e-06, + "loss": 0.1952, "step": 915 }, { - "epoch": 0.7209759937032665, - "grad_norm": 0.9903710831464596, - "learning_rate": 9.89747469621411e-07, - "loss": 0.1247, + "epoch": 0.08439673837932464, + "grad_norm": 1.044485002079122, + "learning_rate": 4.983869697280312e-06, + "loss": 0.2054, "step": 916 }, { - "epoch": 0.721763085399449, - "grad_norm": 0.9627081908787005, - "learning_rate": 9.845627996241459e-07, - "loss": 0.1235, + "epoch": 0.08448887455659465, + "grad_norm": 1.1181813544003023, + "learning_rate": 4.983783185536137e-06, + "loss": 0.1931, "step": 917 }, { - "epoch": 0.7225501770956316, - "grad_norm": 0.9666668030573422, - "learning_rate": 9.793884131712943e-07, - "loss": 0.123, + "epoch": 0.08458101073386465, + "grad_norm": 1.0630599308615696, + "learning_rate": 4.983696443172426e-06, + "loss": 0.1876, "step": 918 }, { - "epoch": 0.7233372687918143, - "grad_norm": 0.9644911943474369, - "learning_rate": 9.742243453755202e-07, - "loss": 0.1142, + "epoch": 0.08467314691113466, + "grad_norm": 0.9944252347497624, + "learning_rate": 4.983609470197233e-06, + "loss": 0.1866, "step": 919 }, { - "epoch": 0.7241243604879969, - "grad_norm": 0.9984844200263358, - "learning_rate": 9.690706312794618e-07, - "loss": 0.1251, + "epoch": 0.08476528308840466, + "grad_norm": 1.0413027140377702, + "learning_rate": 4.983522266618633e-06, + "loss": 0.196, "step": 920 }, { - "epoch": 0.7249114521841794, - "grad_norm": 0.988262708447867, - "learning_rate": 9.639273058555004e-07, - "loss": 0.1233, + "epoch": 0.08485741926567467, + "grad_norm": 1.1229738346239124, + "learning_rate": 4.983434832444724e-06, + "loss": 0.1916, "step": 921 }, { - "epoch": 0.725698543880362, - "grad_norm": 0.9062607929130434, - "learning_rate": 9.587944040055225e-07, - "loss": 0.1116, + "epoch": 0.08494955544294468, + "grad_norm": 1.033570992977324, + "learning_rate": 4.983347167683623e-06, + "loss": 0.1942, "step": 922 }, { - "epoch": 0.7264856355765447, - "grad_norm": 1.0132516720132552, - "learning_rate": 9.536719605606795e-07, - "loss": 0.1314, + "epoch": 0.08504169162021467, + "grad_norm": 1.0615629937133617, + "learning_rate": 4.98325927234347e-06, + "loss": 0.1998, "step": 923 }, { - "epoch": 0.7272727272727273, - "grad_norm": 0.9210291352044477, - "learning_rate": 9.485600102811556e-07, - "loss": 0.108, + "epoch": 0.08513382779748468, + "grad_norm": 1.0603013903796452, + "learning_rate": 4.983171146432427e-06, + "loss": 0.1958, "step": 924 }, { - "epoch": 0.7280598189689099, - "grad_norm": 1.0099118734494892, - "learning_rate": 9.434585878559277e-07, - "loss": 0.1172, + "epoch": 0.08522596397475468, + "grad_norm": 1.0836485017893724, + "learning_rate": 4.983082789958675e-06, + "loss": 0.1969, "step": 925 }, { - "epoch": 0.7288469106650924, - "grad_norm": 1.0237482529235973, - "learning_rate": 9.383677279025347e-07, - "loss": 0.1186, + "epoch": 0.08531810015202469, + "grad_norm": 1.1918839586269157, + "learning_rate": 4.9829942029304194e-06, + "loss": 0.1979, "step": 926 }, { - "epoch": 0.7296340023612751, - "grad_norm": 0.9855331385764105, - "learning_rate": 9.332874649668369e-07, - "loss": 0.1185, + "epoch": 0.0854102363292947, + "grad_norm": 1.0560845818273896, + "learning_rate": 4.982905385355885e-06, + "loss": 0.1971, "step": 927 }, { - "epoch": 0.7304210940574577, - "grad_norm": 0.9369233888911801, - "learning_rate": 9.282178335227885e-07, - "loss": 0.1067, + "epoch": 0.0855023725065647, + "grad_norm": 1.0371941917693073, + "learning_rate": 4.982816337243318e-06, + "loss": 0.199, "step": 928 }, { - "epoch": 0.7312081857536403, - "grad_norm": 1.025834900254658, - "learning_rate": 9.231588679721956e-07, - "loss": 0.1256, + "epoch": 0.08559450868383471, + "grad_norm": 1.0181772522742907, + "learning_rate": 4.982727058600987e-06, + "loss": 0.1991, "step": 929 }, { - "epoch": 0.731995277449823, - "grad_norm": 1.0004815551544541, - "learning_rate": 9.181106026444913e-07, - "loss": 0.1171, + "epoch": 0.08568664486110471, + "grad_norm": 0.9936553320104157, + "learning_rate": 4.98263754943718e-06, + "loss": 0.1841, "step": 930 }, { - "epoch": 0.7327823691460055, - "grad_norm": 0.9247417584553485, - "learning_rate": 9.130730717964948e-07, - "loss": 0.1132, + "epoch": 0.08577878103837472, + "grad_norm": 1.151859083100634, + "learning_rate": 4.9825478097602115e-06, + "loss": 0.19, "step": 931 }, { - "epoch": 0.7335694608421881, - "grad_norm": 0.9769073592720867, - "learning_rate": 9.08046309612185e-07, - "loss": 0.1242, + "epoch": 0.08587091721564473, + "grad_norm": 1.1087091001896687, + "learning_rate": 4.982457839578411e-06, + "loss": 0.1975, "step": 932 }, { - "epoch": 0.7343565525383707, - "grad_norm": 0.96681906386633, - "learning_rate": 9.030303502024662e-07, - "loss": 0.1179, + "epoch": 0.08596305339291473, + "grad_norm": 1.0020521314066806, + "learning_rate": 4.982367638900132e-06, + "loss": 0.206, "step": 933 }, { - "epoch": 0.7351436442345534, - "grad_norm": 1.021595769957744, - "learning_rate": 8.980252276049345e-07, - "loss": 0.1161, + "epoch": 0.08605518957018474, + "grad_norm": 1.0393467185829126, + "learning_rate": 4.982277207733751e-06, + "loss": 0.1917, "step": 934 }, { - "epoch": 0.7359307359307359, - "grad_norm": 0.9231065432942811, - "learning_rate": 8.930309757836517e-07, - "loss": 0.1149, + "epoch": 0.08614732574745473, + "grad_norm": 1.3665091289700992, + "learning_rate": 4.982186546087665e-06, + "loss": 0.2101, "step": 935 }, { - "epoch": 0.7367178276269185, - "grad_norm": 1.026367432921577, - "learning_rate": 8.880476286289091e-07, - "loss": 0.1284, + "epoch": 0.08623946192472474, + "grad_norm": 0.970728944149346, + "learning_rate": 4.98209565397029e-06, + "loss": 0.1853, "step": 936 }, { - "epoch": 0.7375049193231011, - "grad_norm": 0.981215601065822, - "learning_rate": 8.830752199570033e-07, - "loss": 0.1133, + "epoch": 0.08633159810199474, + "grad_norm": 1.0188718196066726, + "learning_rate": 4.9820045313900675e-06, + "loss": 0.1909, "step": 937 }, { - "epoch": 0.7382920110192838, - "grad_norm": 0.9212608185738064, - "learning_rate": 8.781137835100021e-07, - "loss": 0.1077, + "epoch": 0.08642373427926475, + "grad_norm": 1.0079807293569125, + "learning_rate": 4.981913178355456e-06, + "loss": 0.1798, "step": 938 }, { - "epoch": 0.7390791027154664, - "grad_norm": 0.9833427367903659, - "learning_rate": 8.731633529555167e-07, - "loss": 0.1164, + "epoch": 0.08651587045653476, + "grad_norm": 0.9823783578124373, + "learning_rate": 4.981821594874939e-06, + "loss": 0.1792, "step": 939 }, { - "epoch": 0.7398661944116489, - "grad_norm": 0.9854894539977124, - "learning_rate": 8.682239618864763e-07, - "loss": 0.1155, + "epoch": 0.08660800663380476, + "grad_norm": 1.035277129236254, + "learning_rate": 4.981729780957021e-06, + "loss": 0.1908, "step": 940 }, { - "epoch": 0.7406532861078315, - "grad_norm": 0.9551803394241506, - "learning_rate": 8.632956438208962e-07, - "loss": 0.1162, + "epoch": 0.08670014281107477, + "grad_norm": 1.0201631403526028, + "learning_rate": 4.981637736610224e-06, + "loss": 0.182, "step": 941 }, { - "epoch": 0.7414403778040142, - "grad_norm": 0.9042419017178762, - "learning_rate": 8.583784322016503e-07, - "loss": 0.109, + "epoch": 0.08679227898834477, + "grad_norm": 1.0623794729008158, + "learning_rate": 4.981545461843098e-06, + "loss": 0.1962, "step": 942 }, { - "epoch": 0.7422274695001968, - "grad_norm": 0.9609816099291726, - "learning_rate": 8.534723603962497e-07, - "loss": 0.1191, + "epoch": 0.08688441516561478, + "grad_norm": 1.0952542593720789, + "learning_rate": 4.9814529566642065e-06, + "loss": 0.1876, "step": 943 }, { - "epoch": 0.7430145611963794, - "grad_norm": 1.0149972325544658, - "learning_rate": 8.48577461696608e-07, - "loss": 0.1192, + "epoch": 0.08697655134288479, + "grad_norm": 1.0851540110165558, + "learning_rate": 4.981360221082143e-06, + "loss": 0.1981, "step": 944 }, { - "epoch": 0.743801652892562, - "grad_norm": 0.9584184891745349, - "learning_rate": 8.436937693188232e-07, - "loss": 0.1267, + "epoch": 0.08706868752015479, + "grad_norm": 1.117127612131708, + "learning_rate": 4.9812672551055144e-06, + "loss": 0.2034, "step": 945 }, { - "epoch": 0.7445887445887446, - "grad_norm": 0.9986011121611049, - "learning_rate": 8.38821316402946e-07, - "loss": 0.1177, + "epoch": 0.0871608236974248, + "grad_norm": 1.210471734588479, + "learning_rate": 4.981174058742955e-06, + "loss": 0.201, "step": 946 }, { - "epoch": 0.7453758362849272, - "grad_norm": 0.962256278467975, - "learning_rate": 8.339601360127592e-07, - "loss": 0.1131, + "epoch": 0.0872529598746948, + "grad_norm": 1.1075330267974028, + "learning_rate": 4.981080632003117e-06, + "loss": 0.211, "step": 947 }, { - "epoch": 0.7461629279811098, - "grad_norm": 0.9419406227649391, - "learning_rate": 8.291102611355526e-07, - "loss": 0.1123, + "epoch": 0.0873450960519648, + "grad_norm": 1.070657470857975, + "learning_rate": 4.980986974894676e-06, + "loss": 0.1781, "step": 948 }, { - "epoch": 0.7469500196772924, - "grad_norm": 0.9728190857016107, - "learning_rate": 8.242717246818957e-07, - "loss": 0.1197, + "epoch": 0.08743723222923482, + "grad_norm": 1.0137187662237797, + "learning_rate": 4.980893087426326e-06, + "loss": 0.1832, "step": 949 }, { - "epoch": 0.747737111373475, - "grad_norm": 1.0169044023539633, - "learning_rate": 8.1944455948542e-07, - "loss": 0.1219, + "epoch": 0.08752936840650481, + "grad_norm": 1.2310842945020835, + "learning_rate": 4.980798969606787e-06, + "loss": 0.2071, "step": 950 }, { - "epoch": 0.7485242030696576, - "grad_norm": 0.9972018368498321, - "learning_rate": 8.146287983025902e-07, - "loss": 0.1241, + "epoch": 0.08762150458377482, + "grad_norm": 0.9978182904900013, + "learning_rate": 4.980704621444797e-06, + "loss": 0.1889, "step": 951 }, { - "epoch": 0.7493112947658402, - "grad_norm": 1.040910663691627, - "learning_rate": 8.098244738124888e-07, - "loss": 0.1138, + "epoch": 0.08771364076104482, + "grad_norm": 1.1657157637314783, + "learning_rate": 4.980610042949115e-06, + "loss": 0.2151, "step": 952 }, { - "epoch": 0.7500983864620229, - "grad_norm": 1.0438538265069202, - "learning_rate": 8.050316186165862e-07, - "loss": 0.134, + "epoch": 0.08780577693831483, + "grad_norm": 1.0615920322109136, + "learning_rate": 4.980515234128522e-06, + "loss": 0.1894, "step": 953 }, { - "epoch": 0.7508854781582054, - "grad_norm": 0.9793759854817412, - "learning_rate": 8.002502652385278e-07, - "loss": 0.1241, + "epoch": 0.08789791311558483, + "grad_norm": 0.9965168434781553, + "learning_rate": 4.980420194991826e-06, + "loss": 0.1723, "step": 954 }, { - "epoch": 0.751672569854388, - "grad_norm": 0.9636283038275181, - "learning_rate": 7.954804461239054e-07, - "loss": 0.1171, + "epoch": 0.08799004929285484, + "grad_norm": 1.0223984489495057, + "learning_rate": 4.980324925547845e-06, + "loss": 0.2016, "step": 955 }, { - "epoch": 0.7524596615505706, - "grad_norm": 0.9416057200961391, - "learning_rate": 7.907221936400452e-07, - "loss": 0.1194, + "epoch": 0.08808218547012485, + "grad_norm": 1.0267736631959716, + "learning_rate": 4.980229425805429e-06, + "loss": 0.1948, "step": 956 }, { - "epoch": 0.7532467532467533, - "grad_norm": 0.9258555685816136, - "learning_rate": 7.859755400757793e-07, - "loss": 0.1199, + "epoch": 0.08817432164739485, + "grad_norm": 1.0487644444256707, + "learning_rate": 4.9801336957734435e-06, + "loss": 0.1976, "step": 957 }, { - "epoch": 0.7540338449429359, - "grad_norm": 0.9697084160189383, - "learning_rate": 7.812405176412354e-07, - "loss": 0.1206, + "epoch": 0.08826645782466486, + "grad_norm": 0.9800925451113316, + "learning_rate": 4.980037735460778e-06, + "loss": 0.1884, "step": 958 }, { - "epoch": 0.7548209366391184, - "grad_norm": 1.008937777573116, - "learning_rate": 7.76517158467611e-07, - "loss": 0.1238, + "epoch": 0.08835859400193485, + "grad_norm": 1.0512460138994213, + "learning_rate": 4.9799415448763414e-06, + "loss": 0.1905, "step": 959 }, { - "epoch": 0.755608028335301, - "grad_norm": 0.987888023607684, - "learning_rate": 7.718054946069589e-07, - "loss": 0.1246, + "epoch": 0.08845073017920486, + "grad_norm": 1.0438604822998108, + "learning_rate": 4.979845124029066e-06, + "loss": 0.1997, "step": 960 }, { - "epoch": 0.7563951200314837, - "grad_norm": 0.9699505992391279, - "learning_rate": 7.671055580319706e-07, - "loss": 0.1203, + "epoch": 0.08854286635647488, + "grad_norm": 1.1256703426732806, + "learning_rate": 4.979748472927903e-06, + "loss": 0.1826, "step": 961 }, { - "epoch": 0.7571822117276663, - "grad_norm": 0.9382257403962697, - "learning_rate": 7.62417380635756e-07, - "loss": 0.1151, + "epoch": 0.08863500253374487, + "grad_norm": 1.1642895647997922, + "learning_rate": 4.979651591581829e-06, + "loss": 0.1938, "step": 962 }, { - "epoch": 0.7579693034238488, - "grad_norm": 0.9437354430265479, - "learning_rate": 7.577409942316305e-07, - "loss": 0.1163, + "epoch": 0.08872713871101488, + "grad_norm": 1.061865516308078, + "learning_rate": 4.979554479999836e-06, + "loss": 0.1979, "step": 963 }, { - "epoch": 0.7587563951200315, - "grad_norm": 0.9503047394426882, - "learning_rate": 7.530764305528959e-07, - "loss": 0.1211, + "epoch": 0.08881927488828488, + "grad_norm": 1.0511995629982513, + "learning_rate": 4.979457138190944e-06, + "loss": 0.1991, "step": 964 }, { - "epoch": 0.7595434868162141, - "grad_norm": 1.0356071156065598, - "learning_rate": 7.484237212526288e-07, - "loss": 0.1273, + "epoch": 0.08891141106555489, + "grad_norm": 1.0627898550832569, + "learning_rate": 4.979359566164189e-06, + "loss": 0.1892, "step": 965 }, { - "epoch": 0.7603305785123967, - "grad_norm": 0.9856511020736725, - "learning_rate": 7.437828979034606e-07, - "loss": 0.1315, + "epoch": 0.0890035472428249, + "grad_norm": 1.1520194609491567, + "learning_rate": 4.979261763928632e-06, + "loss": 0.2088, "step": 966 }, { - "epoch": 0.7611176702085793, - "grad_norm": 0.9629717208752256, - "learning_rate": 7.391539919973698e-07, - "loss": 0.1062, + "epoch": 0.0890956834200949, + "grad_norm": 1.0750536562889166, + "learning_rate": 4.979163731493354e-06, + "loss": 0.2057, "step": 967 }, { - "epoch": 0.7619047619047619, - "grad_norm": 0.9607163301231785, - "learning_rate": 7.345370349454611e-07, - "loss": 0.1189, + "epoch": 0.08918781959736491, + "grad_norm": 1.0233899360449537, + "learning_rate": 4.979065468867456e-06, + "loss": 0.1966, "step": 968 }, { - "epoch": 0.7626918536009445, - "grad_norm": 0.9578086152431808, - "learning_rate": 7.2993205807776e-07, - "loss": 0.1183, + "epoch": 0.0892799557746349, + "grad_norm": 1.0189667297065197, + "learning_rate": 4.978966976060062e-06, + "loss": 0.1893, "step": 969 }, { - "epoch": 0.7634789452971271, - "grad_norm": 0.9162347277154375, - "learning_rate": 7.253390926429918e-07, - "loss": 0.1104, + "epoch": 0.08937209195190492, + "grad_norm": 0.9615841945851383, + "learning_rate": 4.978868253080318e-06, + "loss": 0.1795, "step": 970 }, { - "epoch": 0.7642660369933097, - "grad_norm": 0.9822027407988481, - "learning_rate": 7.207581698083782e-07, - "loss": 0.1304, + "epoch": 0.08946422812917491, + "grad_norm": 1.0374752006939945, + "learning_rate": 4.9787692999373895e-06, + "loss": 0.1927, "step": 971 }, { - "epoch": 0.7650531286894924, - "grad_norm": 0.940452384125095, - "learning_rate": 7.161893206594175e-07, - "loss": 0.1168, + "epoch": 0.08955636430644492, + "grad_norm": 0.9781931331030623, + "learning_rate": 4.978670116640465e-06, + "loss": 0.1886, "step": 972 }, { - "epoch": 0.7658402203856749, - "grad_norm": 0.9588362662800347, - "learning_rate": 7.116325761996818e-07, - "loss": 0.1206, + "epoch": 0.08964850048371494, + "grad_norm": 1.0482569650890077, + "learning_rate": 4.978570703198754e-06, + "loss": 0.2073, "step": 973 }, { - "epoch": 0.7666273120818575, - "grad_norm": 1.011535036970359, - "learning_rate": 7.070879673505976e-07, - "loss": 0.1141, + "epoch": 0.08974063666098493, + "grad_norm": 1.0307954204033831, + "learning_rate": 4.978471059621486e-06, + "loss": 0.2001, "step": 974 }, { - "epoch": 0.7674144037780402, - "grad_norm": 0.9688050929102817, - "learning_rate": 7.025555249512461e-07, - "loss": 0.1134, + "epoch": 0.08983277283825494, + "grad_norm": 0.9504730932711084, + "learning_rate": 4.978371185917913e-06, + "loss": 0.1871, "step": 975 }, { - "epoch": 0.7682014954742228, - "grad_norm": 0.9177610587932681, - "learning_rate": 6.980352797581438e-07, - "loss": 0.1089, + "epoch": 0.08992490901552494, + "grad_norm": 1.0212992592289591, + "learning_rate": 4.978271082097309e-06, + "loss": 0.1865, "step": 976 }, { - "epoch": 0.7689885871704053, - "grad_norm": 1.0472410615763514, - "learning_rate": 6.935272624450432e-07, - "loss": 0.1249, + "epoch": 0.09001704519279495, + "grad_norm": 1.0584402344944974, + "learning_rate": 4.978170748168968e-06, + "loss": 0.1827, "step": 977 }, { - "epoch": 0.7697756788665879, - "grad_norm": 0.9636506719030409, - "learning_rate": 6.890315036027156e-07, - "loss": 0.1166, + "epoch": 0.09010918137006496, + "grad_norm": 1.0265243323297528, + "learning_rate": 4.978070184142207e-06, + "loss": 0.1955, "step": 978 }, { - "epoch": 0.7705627705627706, - "grad_norm": 0.9412495575321557, - "learning_rate": 6.845480337387525e-07, - "loss": 0.1195, + "epoch": 0.09020131754733496, + "grad_norm": 0.99918479647745, + "learning_rate": 4.977969390026362e-06, + "loss": 0.1902, "step": 979 }, { - "epoch": 0.7713498622589532, - "grad_norm": 0.9930839323289444, - "learning_rate": 6.800768832773505e-07, - "loss": 0.1267, + "epoch": 0.09029345372460497, + "grad_norm": 1.0545620541518919, + "learning_rate": 4.9778683658307925e-06, + "loss": 0.1904, "step": 980 }, { - "epoch": 0.7721369539551358, - "grad_norm": 0.9929049734904327, - "learning_rate": 6.756180825591099e-07, - "loss": 0.1199, + "epoch": 0.09038558990187497, + "grad_norm": 1.0303710328312456, + "learning_rate": 4.977767111564879e-06, + "loss": 0.1922, "step": 981 }, { - "epoch": 0.7729240456513183, - "grad_norm": 0.9842816070024283, - "learning_rate": 6.711716618408282e-07, - "loss": 0.1179, + "epoch": 0.09047772607914498, + "grad_norm": 1.0446625626228556, + "learning_rate": 4.977665627238023e-06, + "loss": 0.1855, "step": 982 }, { - "epoch": 0.773711137347501, - "grad_norm": 0.9944580491304532, - "learning_rate": 6.66737651295292e-07, - "loss": 0.1198, + "epoch": 0.09056986225641499, + "grad_norm": 1.0552315927395985, + "learning_rate": 4.977563912859645e-06, + "loss": 0.1869, "step": 983 }, { - "epoch": 0.7744982290436836, - "grad_norm": 0.9839708888147434, - "learning_rate": 6.623160810110765e-07, - "loss": 0.1193, + "epoch": 0.09066199843368498, + "grad_norm": 1.003974927091642, + "learning_rate": 4.977461968439193e-06, + "loss": 0.1923, "step": 984 }, { - "epoch": 0.7752853207398662, - "grad_norm": 0.9098159637038072, - "learning_rate": 6.579069809923367e-07, - "loss": 0.1123, + "epoch": 0.090754134610955, + "grad_norm": 1.0230489571997252, + "learning_rate": 4.9773597939861294e-06, + "loss": 0.1856, "step": 985 }, { - "epoch": 0.7760724124360489, - "grad_norm": 0.9606605502093222, - "learning_rate": 6.535103811586085e-07, - "loss": 0.1174, + "epoch": 0.09084627078822499, + "grad_norm": 1.0105592868572502, + "learning_rate": 4.977257389509943e-06, + "loss": 0.1929, "step": 986 }, { - "epoch": 0.7768595041322314, - "grad_norm": 1.0270282467875798, - "learning_rate": 6.491263113446005e-07, - "loss": 0.1287, + "epoch": 0.090938406965495, + "grad_norm": 1.0725121055688818, + "learning_rate": 4.9771547550201414e-06, + "loss": 0.1856, "step": 987 }, { - "epoch": 0.777646595828414, - "grad_norm": 0.9366844684145114, - "learning_rate": 6.44754801299998e-07, - "loss": 0.1158, + "epoch": 0.09103054314276501, + "grad_norm": 1.0409324997207798, + "learning_rate": 4.977051890526254e-06, + "loss": 0.1922, "step": 988 }, { - "epoch": 0.7784336875245966, - "grad_norm": 0.9037352215899601, - "learning_rate": 6.403958806892535e-07, - "loss": 0.1053, + "epoch": 0.09112267932003501, + "grad_norm": 1.0338303485068927, + "learning_rate": 4.976948796037831e-06, + "loss": 0.194, "step": 989 }, { - "epoch": 0.7792207792207793, - "grad_norm": 0.9440225724017625, - "learning_rate": 6.360495790913926e-07, - "loss": 0.114, + "epoch": 0.09121481549730502, + "grad_norm": 1.0689018179069636, + "learning_rate": 4.976845471564447e-06, + "loss": 0.1924, "step": 990 }, { - "epoch": 0.7800078709169618, - "grad_norm": 0.9351662026461205, - "learning_rate": 6.317159259998074e-07, - "loss": 0.113, + "epoch": 0.09130695167457502, + "grad_norm": 1.0394659919045186, + "learning_rate": 4.976741917115695e-06, + "loss": 0.1917, "step": 991 }, { - "epoch": 0.7807949626131444, - "grad_norm": 0.9872272779542443, - "learning_rate": 6.273949508220612e-07, - "loss": 0.1217, + "epoch": 0.09139908785184503, + "grad_norm": 1.0150076898279992, + "learning_rate": 4.976638132701188e-06, + "loss": 0.1842, "step": 992 }, { - "epoch": 0.781582054309327, - "grad_norm": 1.0021043961378415, - "learning_rate": 6.23086682879686e-07, - "loss": 0.1194, + "epoch": 0.09149122402911503, + "grad_norm": 1.0841944088787114, + "learning_rate": 4.976534118330565e-06, + "loss": 0.1788, "step": 993 }, { - "epoch": 0.7823691460055097, - "grad_norm": 0.9797645648660196, - "learning_rate": 6.187911514079834e-07, - "loss": 0.1294, + "epoch": 0.09158336020638504, + "grad_norm": 1.0401076161846878, + "learning_rate": 4.9764298740134814e-06, + "loss": 0.1901, "step": 994 }, { - "epoch": 0.7831562377016923, - "grad_norm": 1.0054784443943467, - "learning_rate": 6.14508385555829e-07, - "loss": 0.1236, + "epoch": 0.09167549638365505, + "grad_norm": 0.9971280819093569, + "learning_rate": 4.976325399759619e-06, + "loss": 0.1951, "step": 995 }, { - "epoch": 0.7839433293978748, - "grad_norm": 0.9433076242026539, - "learning_rate": 6.102384143854698e-07, - "loss": 0.1147, + "epoch": 0.09176763256092504, + "grad_norm": 0.9976774105341277, + "learning_rate": 4.976220695578675e-06, + "loss": 0.1741, "step": 996 }, { - "epoch": 0.7847304210940574, - "grad_norm": 0.9383907844400864, - "learning_rate": 6.059812668723336e-07, - "loss": 0.115, + "epoch": 0.09185976873819506, + "grad_norm": 1.0794336104421778, + "learning_rate": 4.976115761480373e-06, + "loss": 0.2019, "step": 997 }, { - "epoch": 0.7855175127902401, - "grad_norm": 0.9452315722932242, - "learning_rate": 6.017369719048255e-07, - "loss": 0.1154, + "epoch": 0.09195190491546505, + "grad_norm": 1.1589141007240227, + "learning_rate": 4.9760105974744576e-06, + "loss": 0.2021, "step": 998 }, { - "epoch": 0.7863046044864227, - "grad_norm": 0.9247930090252802, - "learning_rate": 5.975055582841358e-07, - "loss": 0.1127, + "epoch": 0.09204404109273506, + "grad_norm": 1.0959230462918457, + "learning_rate": 4.97590520357069e-06, + "loss": 0.1871, "step": 999 }, { - "epoch": 0.7870916961826053, - "grad_norm": 1.0061872579787852, - "learning_rate": 5.932870547240455e-07, - "loss": 0.1183, + "epoch": 0.09213617727000507, + "grad_norm": 1.0170637360982258, + "learning_rate": 4.97579957977886e-06, + "loss": 0.192, "step": 1000 }, { - "epoch": 0.7870916961826053, - "eval_loss": 0.11849173903465271, - "eval_runtime": 18.0453, - "eval_samples_per_second": 45.552, - "eval_steps_per_second": 5.708, + "epoch": 0.09213617727000507, + "eval_loss": 0.19303320348262787, + "eval_runtime": 299.1988, + "eval_samples_per_second": 23.453, + "eval_steps_per_second": 2.935, "step": 1000 }, { - "epoch": 0.7878787878787878, - "grad_norm": 0.9623831636196449, - "learning_rate": 5.890814898507277e-07, - "loss": 0.1201, + "epoch": 0.09222831344727507, + "grad_norm": 1.1568554165939138, + "learning_rate": 4.97569372610877e-06, + "loss": 0.1831, "step": 1001 }, { - "epoch": 0.7886658795749705, - "grad_norm": 1.0229456536544794, - "learning_rate": 5.848888922025553e-07, - "loss": 0.1223, + "epoch": 0.09232044962454508, + "grad_norm": 1.1944389893604717, + "learning_rate": 4.975587642570252e-06, + "loss": 0.1828, "step": 1002 }, { - "epoch": 0.7894529712711531, - "grad_norm": 1.0277580850565635, - "learning_rate": 5.8070929022991e-07, - "loss": 0.1178, + "epoch": 0.09241258580181508, + "grad_norm": 1.0170283022489994, + "learning_rate": 4.975481329173156e-06, + "loss": 0.1856, "step": 1003 }, { - "epoch": 0.7902400629673357, - "grad_norm": 1.0160977169162413, - "learning_rate": 5.76542712294983e-07, - "loss": 0.1216, + "epoch": 0.09250472197908509, + "grad_norm": 1.0558442749265609, + "learning_rate": 4.975374785927351e-06, + "loss": 0.1847, "step": 1004 }, { - "epoch": 0.7910271546635183, - "grad_norm": 0.9181674404236817, - "learning_rate": 5.723891866715899e-07, - "loss": 0.1118, + "epoch": 0.0925968581563551, + "grad_norm": 1.0094563700785, + "learning_rate": 4.975268012842732e-06, + "loss": 0.1876, "step": 1005 }, { - "epoch": 0.7918142463597009, - "grad_norm": 0.9799718055295829, - "learning_rate": 5.682487415449719e-07, - "loss": 0.1217, + "epoch": 0.0926889943336251, + "grad_norm": 1.1605189632893436, + "learning_rate": 4.97516100992921e-06, + "loss": 0.1964, "step": 1006 }, { - "epoch": 0.7926013380558835, - "grad_norm": 0.980465660075739, - "learning_rate": 5.641214050116098e-07, - "loss": 0.1252, + "epoch": 0.09278113051089511, + "grad_norm": 1.0938956132780517, + "learning_rate": 4.975053777196723e-06, + "loss": 0.2036, "step": 1007 }, { - "epoch": 0.7933884297520661, - "grad_norm": 0.9234542096536653, - "learning_rate": 5.600072050790317e-07, - "loss": 0.1096, + "epoch": 0.0928732666881651, + "grad_norm": 1.0767350046907365, + "learning_rate": 4.974946314655226e-06, + "loss": 0.2035, "step": 1008 }, { - "epoch": 0.7941755214482488, - "grad_norm": 0.8947896913580902, - "learning_rate": 5.559061696656199e-07, - "loss": 0.1075, + "epoch": 0.09296540286543511, + "grad_norm": 0.9971359022353502, + "learning_rate": 4.974838622314698e-06, + "loss": 0.1969, "step": 1009 }, { - "epoch": 0.7949626131444313, - "grad_norm": 0.9790572754851533, - "learning_rate": 5.518183266004276e-07, - "loss": 0.1171, + "epoch": 0.09305753904270511, + "grad_norm": 1.093729265610002, + "learning_rate": 4.974730700185136e-06, + "loss": 0.2025, "step": 1010 }, { - "epoch": 0.7957497048406139, - "grad_norm": 0.9667752845159187, - "learning_rate": 5.477437036229832e-07, - "loss": 0.1098, + "epoch": 0.09314967521997512, + "grad_norm": 1.0774952559409026, + "learning_rate": 4.974622548276564e-06, + "loss": 0.2024, "step": 1011 }, { - "epoch": 0.7965367965367965, - "grad_norm": 1.1238301396219903, - "learning_rate": 5.436823283831083e-07, - "loss": 0.1373, + "epoch": 0.09324181139724513, + "grad_norm": 1.009403051152341, + "learning_rate": 4.974514166599021e-06, + "loss": 0.1936, "step": 1012 }, { - "epoch": 0.7973238882329792, - "grad_norm": 0.9927017197297952, - "learning_rate": 5.396342284407252e-07, - "loss": 0.1188, + "epoch": 0.09333394757451513, + "grad_norm": 1.0763173811540299, + "learning_rate": 4.974405555162571e-06, + "loss": 0.1912, "step": 1013 }, { - "epoch": 0.7981109799291618, - "grad_norm": 0.9934845633471078, - "learning_rate": 5.355994312656734e-07, - "loss": 0.1142, + "epoch": 0.09342608375178514, + "grad_norm": 1.1012649720307428, + "learning_rate": 4.9742967139773e-06, + "loss": 0.2018, "step": 1014 }, { - "epoch": 0.7988980716253443, - "grad_norm": 0.9432843712008361, - "learning_rate": 5.315779642375199e-07, - "loss": 0.1158, + "epoch": 0.09351821992905514, + "grad_norm": 0.9708640152172979, + "learning_rate": 4.974187643053312e-06, + "loss": 0.1711, "step": 1015 }, { - "epoch": 0.799685163321527, - "grad_norm": 1.0251992909650254, - "learning_rate": 5.275698546453775e-07, - "loss": 0.1175, + "epoch": 0.09361035610632515, + "grad_norm": 1.109835431173934, + "learning_rate": 4.9740783424007355e-06, + "loss": 0.1957, "step": 1016 }, { - "epoch": 0.8004722550177096, - "grad_norm": 1.010003247709555, - "learning_rate": 5.235751296877148e-07, - "loss": 0.1223, + "epoch": 0.09370249228359516, + "grad_norm": 1.1759709250141979, + "learning_rate": 4.973968812029718e-06, + "loss": 0.21, "step": 1017 }, { - "epoch": 0.8012593467138922, - "grad_norm": 1.0290265946769084, - "learning_rate": 5.195938164721767e-07, - "loss": 0.1213, + "epoch": 0.09379462846086516, + "grad_norm": 0.9865822952697535, + "learning_rate": 4.973859051950431e-06, + "loss": 0.175, "step": 1018 }, { - "epoch": 0.8020464384100747, - "grad_norm": 1.0065180201235937, - "learning_rate": 5.156259420153962e-07, - "loss": 0.1238, + "epoch": 0.09388676463813517, + "grad_norm": 1.1361881263400175, + "learning_rate": 4.973749062173065e-06, + "loss": 0.1994, "step": 1019 }, { - "epoch": 0.8028335301062574, - "grad_norm": 0.9673994358176322, - "learning_rate": 5.116715332428118e-07, - "loss": 0.1106, + "epoch": 0.09397890081540516, + "grad_norm": 1.109853232173025, + "learning_rate": 4.973638842707831e-06, + "loss": 0.1969, "step": 1020 }, { - "epoch": 0.80362062180244, - "grad_norm": 0.9855226309577549, - "learning_rate": 5.077306169884888e-07, - "loss": 0.1107, + "epoch": 0.09407103699267517, + "grad_norm": 0.9731737615905076, + "learning_rate": 4.973528393564965e-06, + "loss": 0.1752, "step": 1021 }, { - "epoch": 0.8044077134986226, - "grad_norm": 0.9719491378095487, - "learning_rate": 5.038032199949313e-07, - "loss": 0.1241, + "epoch": 0.09416317316994519, + "grad_norm": 1.074193893659733, + "learning_rate": 4.973417714754721e-06, + "loss": 0.194, "step": 1022 }, { - "epoch": 0.8051948051948052, - "grad_norm": 0.9767976318784359, - "learning_rate": 4.998893689129061e-07, - "loss": 0.1165, + "epoch": 0.09425530934721518, + "grad_norm": 1.024455833636712, + "learning_rate": 4.973306806287376e-06, + "loss": 0.1903, "step": 1023 }, { - "epoch": 0.8059818968909878, - "grad_norm": 0.9563874747183178, - "learning_rate": 4.959890903012568e-07, - "loss": 0.1084, + "epoch": 0.0943474455244852, + "grad_norm": 0.9573557205309052, + "learning_rate": 4.9731956681732284e-06, + "loss": 0.1828, "step": 1024 }, { - "epoch": 0.8067689885871704, - "grad_norm": 0.9724824504692731, - "learning_rate": 4.921024106267283e-07, - "loss": 0.1199, + "epoch": 0.09443958170175519, + "grad_norm": 1.0124926306215474, + "learning_rate": 4.973084300422597e-06, + "loss": 0.1931, "step": 1025 }, { - "epoch": 0.807556080283353, - "grad_norm": 0.9939320935755256, - "learning_rate": 4.882293562637827e-07, - "loss": 0.1221, + "epoch": 0.0945317178790252, + "grad_norm": 1.0815661147002018, + "learning_rate": 4.972972703045822e-06, + "loss": 0.195, "step": 1026 }, { - "epoch": 0.8083431719795356, - "grad_norm": 0.9407936084194475, - "learning_rate": 4.843699534944258e-07, - "loss": 0.114, + "epoch": 0.0946238540562952, + "grad_norm": 1.110883335805394, + "learning_rate": 4.972860876053265e-06, + "loss": 0.1982, "step": 1027 }, { - "epoch": 0.8091302636757183, - "grad_norm": 1.00154528232707, - "learning_rate": 4.805242285080222e-07, - "loss": 0.116, + "epoch": 0.09471599023356521, + "grad_norm": 1.0693046793652852, + "learning_rate": 4.97274881945531e-06, + "loss": 0.1811, "step": 1028 }, { - "epoch": 0.8099173553719008, - "grad_norm": 0.9808095624927836, - "learning_rate": 4.7669220740112376e-07, - "loss": 0.1166, + "epoch": 0.09480812641083522, + "grad_norm": 1.1919130340115107, + "learning_rate": 4.97263653326236e-06, + "loss": 0.2073, "step": 1029 }, { - "epoch": 0.8107044470680834, - "grad_norm": 0.9094800308143445, - "learning_rate": 4.728739161772874e-07, - "loss": 0.1091, + "epoch": 0.09490026258810522, + "grad_norm": 1.0577602885838477, + "learning_rate": 4.972524017484842e-06, + "loss": 0.1841, "step": 1030 }, { - "epoch": 0.811491538764266, - "grad_norm": 0.8938814168806941, - "learning_rate": 4.690693807469035e-07, - "loss": 0.1032, + "epoch": 0.09499239876537523, + "grad_norm": 1.1057953679804238, + "learning_rate": 4.972411272133204e-06, + "loss": 0.1848, "step": 1031 }, { - "epoch": 0.8122786304604487, - "grad_norm": 0.9946966466940443, - "learning_rate": 4.6527862692701487e-07, - "loss": 0.1158, + "epoch": 0.09508453494264522, + "grad_norm": 1.1472304844236627, + "learning_rate": 4.972298297217913e-06, + "loss": 0.1851, "step": 1032 }, { - "epoch": 0.8130657221566312, - "grad_norm": 0.9504344369313477, - "learning_rate": 4.615016804411465e-07, - "loss": 0.1114, + "epoch": 0.09517667111991523, + "grad_norm": 1.0139962897450747, + "learning_rate": 4.972185092749458e-06, + "loss": 0.1888, "step": 1033 }, { - "epoch": 0.8138528138528138, - "grad_norm": 0.9453878426287087, - "learning_rate": 4.5773856691912726e-07, - "loss": 0.1208, + "epoch": 0.09526880729718525, + "grad_norm": 1.0237739212232981, + "learning_rate": 4.972071658738352e-06, + "loss": 0.1961, "step": 1034 }, { - "epoch": 0.8146399055489965, - "grad_norm": 1.0239509589587361, - "learning_rate": 4.53989311896918e-07, - "loss": 0.1186, + "epoch": 0.09536094347445524, + "grad_norm": 1.1226050341955258, + "learning_rate": 4.971957995195126e-06, + "loss": 0.1919, "step": 1035 }, { - "epoch": 0.8154269972451791, - "grad_norm": 0.915087834897205, - "learning_rate": 4.502539408164386e-07, - "loss": 0.106, + "epoch": 0.09545307965172525, + "grad_norm": 1.0615783114613073, + "learning_rate": 4.971844102130334e-06, + "loss": 0.1929, "step": 1036 }, { - "epoch": 0.8162140889413617, - "grad_norm": 0.936045456280838, - "learning_rate": 4.465324790253922e-07, - "loss": 0.1104, + "epoch": 0.09554521582899525, + "grad_norm": 1.016886377753397, + "learning_rate": 4.971729979554551e-06, + "loss": 0.1956, "step": 1037 }, { - "epoch": 0.8170011806375442, - "grad_norm": 0.986165452237028, - "learning_rate": 4.428249517770986e-07, - "loss": 0.1137, + "epoch": 0.09563735200626526, + "grad_norm": 1.0576392129215906, + "learning_rate": 4.9716156274783746e-06, + "loss": 0.186, "step": 1038 }, { - "epoch": 0.8177882723337269, - "grad_norm": 0.9641211589952483, - "learning_rate": 4.391313842303166e-07, - "loss": 0.1164, + "epoch": 0.09572948818353527, + "grad_norm": 1.0326102477262193, + "learning_rate": 4.9715010459124205e-06, + "loss": 0.2068, "step": 1039 }, { - "epoch": 0.8185753640299095, - "grad_norm": 0.9334697738707801, - "learning_rate": 4.3545180144907857e-07, - "loss": 0.113, + "epoch": 0.09582162436080527, + "grad_norm": 0.9670626996840229, + "learning_rate": 4.971386234867328e-06, + "loss": 0.197, "step": 1040 }, { - "epoch": 0.8193624557260921, - "grad_norm": 0.9871068120631671, - "learning_rate": 4.3178622840251647e-07, - "loss": 0.1241, + "epoch": 0.09591376053807528, + "grad_norm": 1.0570562814072233, + "learning_rate": 4.971271194353757e-06, + "loss": 0.1895, "step": 1041 }, { - "epoch": 0.8201495474222747, - "grad_norm": 0.9529229208831895, - "learning_rate": 4.2813468996469654e-07, - "loss": 0.121, + "epoch": 0.09600589671534528, + "grad_norm": 1.0598769897437053, + "learning_rate": 4.971155924382392e-06, + "loss": 0.1856, "step": 1042 }, { - "epoch": 0.8209366391184573, - "grad_norm": 0.9336358260352323, - "learning_rate": 4.2449721091444545e-07, - "loss": 0.1134, + "epoch": 0.09609803289261529, + "grad_norm": 0.9717398940768603, + "learning_rate": 4.971040424963931e-06, + "loss": 0.1694, "step": 1043 }, { - "epoch": 0.8217237308146399, - "grad_norm": 1.0307131336178375, - "learning_rate": 4.2087381593518716e-07, - "loss": 0.1274, + "epoch": 0.09619016906988528, + "grad_norm": 1.0484387235197232, + "learning_rate": 4.970924696109102e-06, + "loss": 0.1974, "step": 1044 }, { - "epoch": 0.8225108225108225, - "grad_norm": 0.9433490812621332, - "learning_rate": 4.1726452961477147e-07, - "loss": 0.116, + "epoch": 0.0962823052471553, + "grad_norm": 1.0682781178742418, + "learning_rate": 4.970808737828648e-06, + "loss": 0.214, "step": 1045 }, { - "epoch": 0.8232979142070052, - "grad_norm": 0.9751019494649681, - "learning_rate": 4.136693764453101e-07, - "loss": 0.1129, + "epoch": 0.0963744414244253, + "grad_norm": 1.0480298626312177, + "learning_rate": 4.970692550133337e-06, + "loss": 0.1959, "step": 1046 }, { - "epoch": 0.8240850059031877, - "grad_norm": 1.056328093801445, - "learning_rate": 4.1008838082300743e-07, - "loss": 0.1168, + "epoch": 0.0964665776016953, + "grad_norm": 1.1064752815320091, + "learning_rate": 4.970576133033958e-06, + "loss": 0.1924, "step": 1047 }, { - "epoch": 0.8248720975993703, - "grad_norm": 1.0079092402082175, - "learning_rate": 4.065215670479991e-07, - "loss": 0.1258, + "epoch": 0.09655871377896531, + "grad_norm": 1.0614078272511498, + "learning_rate": 4.970459486541318e-06, + "loss": 0.2013, "step": 1048 }, { - "epoch": 0.8256591892955529, - "grad_norm": 0.9852819205932097, - "learning_rate": 4.02968959324182e-07, - "loss": 0.1161, + "epoch": 0.09665084995623531, + "grad_norm": 1.064795035168974, + "learning_rate": 4.970342610666249e-06, + "loss": 0.1947, "step": 1049 }, { - "epoch": 0.8264462809917356, - "grad_norm": 0.9840178810234324, - "learning_rate": 3.9943058175905493e-07, - "loss": 0.1184, + "epoch": 0.09674298613350532, + "grad_norm": 0.9799709467241581, + "learning_rate": 4.970225505419602e-06, + "loss": 0.1769, "step": 1050 }, { - "epoch": 0.8272333726879182, - "grad_norm": 1.0203308364665442, - "learning_rate": 3.9590645836355275e-07, - "loss": 0.1232, + "epoch": 0.09683512231077533, + "grad_norm": 1.043741540422472, + "learning_rate": 4.970108170812252e-06, + "loss": 0.1953, "step": 1051 }, { - "epoch": 0.8280204643841007, - "grad_norm": 0.967005062015959, - "learning_rate": 3.923966130518814e-07, - "loss": 0.1209, + "epoch": 0.09692725848804533, + "grad_norm": 1.061623477697688, + "learning_rate": 4.969990606855093e-06, + "loss": 0.2071, "step": 1052 }, { - "epoch": 0.8288075560802833, - "grad_norm": 0.9434419280443521, - "learning_rate": 3.889010696413606e-07, - "loss": 0.1211, + "epoch": 0.09701939466531534, + "grad_norm": 1.1020868177462027, + "learning_rate": 4.969872813559039e-06, + "loss": 0.1821, "step": 1053 }, { - "epoch": 0.829594647776466, - "grad_norm": 0.9442684717641329, - "learning_rate": 3.8541985185225645e-07, - "loss": 0.1078, + "epoch": 0.09711153084258534, + "grad_norm": 0.988163345783669, + "learning_rate": 4.9697547909350295e-06, + "loss": 0.1987, "step": 1054 }, { - "epoch": 0.8303817394726486, - "grad_norm": 0.9927726007062886, - "learning_rate": 3.819529833076263e-07, - "loss": 0.1214, + "epoch": 0.09720366701985535, + "grad_norm": 1.0267991125472582, + "learning_rate": 4.969636538994021e-06, + "loss": 0.1918, "step": 1055 }, { - "epoch": 0.8311688311688312, - "grad_norm": 0.9486732370194126, - "learning_rate": 3.7850048753315274e-07, - "loss": 0.1087, + "epoch": 0.09729580319712536, + "grad_norm": 1.070032876028796, + "learning_rate": 4.969518057746995e-06, + "loss": 0.2002, "step": 1056 }, { - "epoch": 0.8319559228650137, - "grad_norm": 0.9803070634979109, - "learning_rate": 3.750623879569895e-07, - "loss": 0.1181, + "epoch": 0.09738793937439535, + "grad_norm": 0.954895748932761, + "learning_rate": 4.969399347204951e-06, + "loss": 0.1829, "step": 1057 }, { - "epoch": 0.8327430145611964, - "grad_norm": 1.0010555667628969, - "learning_rate": 3.716387079095973e-07, - "loss": 0.1172, + "epoch": 0.09748007555166537, + "grad_norm": 1.0721201059228953, + "learning_rate": 4.969280407378912e-06, + "loss": 0.192, "step": 1058 }, { - "epoch": 0.833530106257379, - "grad_norm": 0.9548728126471466, - "learning_rate": 3.6822947062359004e-07, - "loss": 0.1125, + "epoch": 0.09757221172893536, + "grad_norm": 1.0107730979473246, + "learning_rate": 4.9691612382799215e-06, + "loss": 0.194, "step": 1059 }, { - "epoch": 0.8343171979535616, - "grad_norm": 1.0333251380116057, - "learning_rate": 3.6483469923357327e-07, - "loss": 0.1119, + "epoch": 0.09766434790620537, + "grad_norm": 0.9854433143433244, + "learning_rate": 4.969041839919044e-06, + "loss": 0.1909, "step": 1060 }, { - "epoch": 0.8351042896497441, - "grad_norm": 0.9473784570136893, - "learning_rate": 3.614544167759901e-07, - "loss": 0.1136, + "epoch": 0.09775648408347537, + "grad_norm": 1.0348738031722355, + "learning_rate": 4.968922212307367e-06, + "loss": 0.1922, "step": 1061 }, { - "epoch": 0.8358913813459268, - "grad_norm": 0.9283902827668026, - "learning_rate": 3.5808864618896295e-07, - "loss": 0.1004, + "epoch": 0.09784862026074538, + "grad_norm": 1.0040960545465387, + "learning_rate": 4.968802355455995e-06, + "loss": 0.1923, "step": 1062 }, { - "epoch": 0.8366784730421094, - "grad_norm": 0.9856546672764643, - "learning_rate": 3.5473741031213983e-07, - "loss": 0.1136, + "epoch": 0.09794075643801539, + "grad_norm": 1.0094386883868456, + "learning_rate": 4.96868226937606e-06, + "loss": 0.1751, "step": 1063 }, { - "epoch": 0.837465564738292, - "grad_norm": 0.955024488651013, - "learning_rate": 3.51400731886537e-07, - "loss": 0.1199, + "epoch": 0.09803289261528539, + "grad_norm": 1.073963995133156, + "learning_rate": 4.96856195407871e-06, + "loss": 0.1931, "step": 1064 }, { - "epoch": 0.8382526564344747, - "grad_norm": 0.9527578800496054, - "learning_rate": 3.4807863355438703e-07, - "loss": 0.1178, + "epoch": 0.0981250287925554, + "grad_norm": 0.9703948692708834, + "learning_rate": 4.968441409575117e-06, + "loss": 0.1906, "step": 1065 }, { - "epoch": 0.8390397481306572, - "grad_norm": 0.9748866277343534, - "learning_rate": 3.447711378589841e-07, - "loss": 0.1126, + "epoch": 0.0982171649698254, + "grad_norm": 1.06228397745977, + "learning_rate": 4.968320635876473e-06, + "loss": 0.1857, "step": 1066 }, { - "epoch": 0.8398268398268398, - "grad_norm": 0.9722315190803439, - "learning_rate": 3.414782672445291e-07, - "loss": 0.1143, + "epoch": 0.09830930114709541, + "grad_norm": 1.0662602435123827, + "learning_rate": 4.968199632993994e-06, + "loss": 0.1943, "step": 1067 }, { - "epoch": 0.8406139315230224, - "grad_norm": 1.0223007557494088, - "learning_rate": 3.3820004405598157e-07, - "loss": 0.1141, + "epoch": 0.09840143732436542, + "grad_norm": 1.030389208026544, + "learning_rate": 4.968078400938912e-06, + "loss": 0.1981, "step": 1068 }, { - "epoch": 0.8414010232192051, - "grad_norm": 0.969999076611352, - "learning_rate": 3.3493649053890325e-07, - "loss": 0.1161, + "epoch": 0.09849357350163541, + "grad_norm": 1.0075981112842045, + "learning_rate": 4.967956939722485e-06, + "loss": 0.1969, "step": 1069 }, { - "epoch": 0.8421881149153877, - "grad_norm": 0.9878840165050939, - "learning_rate": 3.3168762883931256e-07, - "loss": 0.1164, + "epoch": 0.09858570967890543, + "grad_norm": 0.958118698153524, + "learning_rate": 4.967835249355991e-06, + "loss": 0.1858, "step": 1070 }, { - "epoch": 0.8429752066115702, - "grad_norm": 1.030579753477139, - "learning_rate": 3.284534810035278e-07, - "loss": 0.1258, + "epoch": 0.09867784585617542, + "grad_norm": 0.9474495525140502, + "learning_rate": 4.967713329850728e-06, + "loss": 0.1859, "step": 1071 }, { - "epoch": 0.8437622983077528, - "grad_norm": 0.987395029549749, - "learning_rate": 3.252340689780245e-07, - "loss": 0.1219, + "epoch": 0.09876998203344543, + "grad_norm": 1.0554983849327597, + "learning_rate": 4.967591181218017e-06, + "loss": 0.192, "step": 1072 }, { - "epoch": 0.8445493900039355, - "grad_norm": 0.9750378310364627, - "learning_rate": 3.2202941460927977e-07, - "loss": 0.1275, + "epoch": 0.09886211821071544, + "grad_norm": 1.0152034702976793, + "learning_rate": 4.967468803469199e-06, + "loss": 0.195, "step": 1073 }, { - "epoch": 0.8453364817001181, - "grad_norm": 0.9446071297273908, - "learning_rate": 3.1883953964363057e-07, - "loss": 0.1177, + "epoch": 0.09895425438798544, + "grad_norm": 1.0168864653893954, + "learning_rate": 4.967346196615638e-06, + "loss": 0.1767, "step": 1074 }, { - "epoch": 0.8461235733963006, - "grad_norm": 0.9469617356782836, - "learning_rate": 3.156644657271196e-07, - "loss": 0.1128, + "epoch": 0.09904639056525545, + "grad_norm": 1.043221440739377, + "learning_rate": 4.967223360668716e-06, + "loss": 0.1846, "step": 1075 }, { - "epoch": 0.8469106650924833, - "grad_norm": 0.951509014508041, - "learning_rate": 3.12504214405355e-07, - "loss": 0.108, + "epoch": 0.09913852674252545, + "grad_norm": 1.0048446057039784, + "learning_rate": 4.9671002956398395e-06, + "loss": 0.1973, "step": 1076 }, { - "epoch": 0.8476977567886659, - "grad_norm": 0.9472516964934904, - "learning_rate": 3.093588071233578e-07, - "loss": 0.1141, + "epoch": 0.09923066291979546, + "grad_norm": 0.9969031964290073, + "learning_rate": 4.966977001540436e-06, + "loss": 0.1926, "step": 1077 }, { - "epoch": 0.8484848484848485, - "grad_norm": 0.9084748554063148, - "learning_rate": 3.06228265225422e-07, - "loss": 0.1116, + "epoch": 0.09932279909706546, + "grad_norm": 1.052777627221389, + "learning_rate": 4.966853478381951e-06, + "loss": 0.1995, "step": 1078 }, { - "epoch": 0.8492719401810311, - "grad_norm": 0.9337280963011981, - "learning_rate": 3.031126099549653e-07, - "loss": 0.1119, + "epoch": 0.09941493527433547, + "grad_norm": 1.068967031109921, + "learning_rate": 4.966729726175857e-06, + "loss": 0.1848, "step": 1079 }, { - "epoch": 0.8500590318772137, - "grad_norm": 0.9417103223229273, - "learning_rate": 3.000118624543888e-07, - "loss": 0.1117, + "epoch": 0.09950707145160548, + "grad_norm": 1.160503081694701, + "learning_rate": 4.96660574493364e-06, + "loss": 0.1954, "step": 1080 }, { - "epoch": 0.8508461235733963, - "grad_norm": 0.9259525236801444, - "learning_rate": 2.9692604376492935e-07, - "loss": 0.1067, + "epoch": 0.09959920762887547, + "grad_norm": 1.0796967030397735, + "learning_rate": 4.9664815346668165e-06, + "loss": 0.2055, "step": 1081 }, { - "epoch": 0.8516332152695789, - "grad_norm": 0.98856338356212, - "learning_rate": 2.9385517482651974e-07, - "loss": 0.1218, + "epoch": 0.09969134380614549, + "grad_norm": 1.005831162809125, + "learning_rate": 4.966357095386915e-06, + "loss": 0.1972, "step": 1082 }, { - "epoch": 0.8524203069657615, - "grad_norm": 0.9208981942587281, - "learning_rate": 2.907992764776471e-07, - "loss": 0.1078, + "epoch": 0.09978347998341548, + "grad_norm": 1.0052790015292061, + "learning_rate": 4.966232427105493e-06, + "loss": 0.1825, "step": 1083 }, { - "epoch": 0.8532073986619442, - "grad_norm": 1.0070559943311361, - "learning_rate": 2.877583694552083e-07, - "loss": 0.1236, + "epoch": 0.0998756161606855, + "grad_norm": 1.0082971570804145, + "learning_rate": 4.9661075298341245e-06, + "loss": 0.1679, "step": 1084 }, { - "epoch": 0.8539944903581267, - "grad_norm": 1.0056266955151931, - "learning_rate": 2.847324743943733e-07, - "loss": 0.1168, + "epoch": 0.0999677523379555, + "grad_norm": 1.002477114385746, + "learning_rate": 4.965982403584406e-06, + "loss": 0.1978, "step": 1085 }, { - "epoch": 0.8547815820543093, - "grad_norm": 0.9843297655404193, - "learning_rate": 2.8172161182844076e-07, - "loss": 0.1179, + "epoch": 0.1000598885152255, + "grad_norm": 1.0561649001397835, + "learning_rate": 4.965857048367956e-06, + "loss": 0.2016, "step": 1086 }, { - "epoch": 0.855568673750492, - "grad_norm": 0.9794092387444499, - "learning_rate": 2.7872580218870293e-07, - "loss": 0.1143, + "epoch": 0.10015202469249551, + "grad_norm": 1.090128521697667, + "learning_rate": 4.965731464196415e-06, + "loss": 0.1981, "step": 1087 }, { - "epoch": 0.8563557654466746, - "grad_norm": 0.9193678895589191, - "learning_rate": 2.757450658043029e-07, - "loss": 0.1033, + "epoch": 0.10024416086976551, + "grad_norm": 1.1605860564434374, + "learning_rate": 4.96560565108144e-06, + "loss": 0.2093, "step": 1088 }, { - "epoch": 0.8571428571428571, - "grad_norm": 0.943695053791707, - "learning_rate": 2.7277942290210105e-07, - "loss": 0.1197, + "epoch": 0.10033629704703552, + "grad_norm": 0.9908203729796794, + "learning_rate": 4.965479609034717e-06, + "loss": 0.1761, "step": 1089 }, { - "epoch": 0.8579299488390397, - "grad_norm": 0.9996493471626344, - "learning_rate": 2.698288936065338e-07, - "loss": 0.1121, + "epoch": 0.10042843322430553, + "grad_norm": 1.0689295025072343, + "learning_rate": 4.9653533380679455e-06, + "loss": 0.2124, "step": 1090 }, { - "epoch": 0.8587170405352224, - "grad_norm": 0.9457549280655658, - "learning_rate": 2.6689349793947993e-07, - "loss": 0.1135, + "epoch": 0.10052056940157553, + "grad_norm": 0.9557530326333923, + "learning_rate": 4.965226838192852e-06, + "loss": 0.1835, "step": 1091 }, { - "epoch": 0.859504132231405, - "grad_norm": 0.9370356637509019, - "learning_rate": 2.639732558201219e-07, - "loss": 0.1142, + "epoch": 0.10061270557884554, + "grad_norm": 0.9607802521798345, + "learning_rate": 4.965100109421182e-06, + "loss": 0.1779, "step": 1092 }, { - "epoch": 0.8602912239275876, - "grad_norm": 0.9355392394517238, - "learning_rate": 2.610681870648149e-07, - "loss": 0.1101, + "epoch": 0.10070484175611553, + "grad_norm": 1.0016875203479627, + "learning_rate": 4.9649731517647e-06, + "loss": 0.1769, "step": 1093 }, { - "epoch": 0.8610783156237701, - "grad_norm": 0.9395826896807081, - "learning_rate": 2.5817831138694685e-07, - "loss": 0.1143, + "epoch": 0.10079697793338555, + "grad_norm": 1.0440688028642757, + "learning_rate": 4.964845965235196e-06, + "loss": 0.1934, "step": 1094 }, { - "epoch": 0.8618654073199528, - "grad_norm": 0.9169613408255519, - "learning_rate": 2.553036483968094e-07, - "loss": 0.1125, + "epoch": 0.10088911411065554, + "grad_norm": 1.099885377144038, + "learning_rate": 4.964718549844479e-06, + "loss": 0.2077, "step": 1095 }, { - "epoch": 0.8626524990161354, - "grad_norm": 0.9214498686393413, - "learning_rate": 2.5244421760146354e-07, - "loss": 0.1061, + "epoch": 0.10098125028792555, + "grad_norm": 1.0364884967207673, + "learning_rate": 4.964590905604379e-06, + "loss": 0.1839, "step": 1096 }, { - "epoch": 0.863439590712318, - "grad_norm": 0.9759019789855492, - "learning_rate": 2.496000384046046e-07, - "loss": 0.1134, + "epoch": 0.10107338646519556, + "grad_norm": 1.119047007151761, + "learning_rate": 4.964463032526749e-06, + "loss": 0.1921, "step": 1097 }, { - "epoch": 0.8642266824085005, - "grad_norm": 1.002942267675624, - "learning_rate": 2.467711301064349e-07, - "loss": 0.1249, + "epoch": 0.10116552264246556, + "grad_norm": 1.0229016109535547, + "learning_rate": 4.9643349306234615e-06, + "loss": 0.1873, "step": 1098 }, { - "epoch": 0.8650137741046832, - "grad_norm": 0.9774180094397517, - "learning_rate": 2.4395751190352924e-07, - "loss": 0.1192, + "epoch": 0.10125765881973557, + "grad_norm": 1.0007547076017496, + "learning_rate": 4.96420659990641e-06, + "loss": 0.1809, "step": 1099 }, { - "epoch": 0.8658008658008658, - "grad_norm": 0.9977028529540362, - "learning_rate": 2.411592028887058e-07, - "loss": 0.1189, + "epoch": 0.10134979499700557, + "grad_norm": 1.1215031197209377, + "learning_rate": 4.9640780403875095e-06, + "loss": 0.1995, "step": 1100 }, { - "epoch": 0.8665879574970484, - "grad_norm": 0.9904872465257003, - "learning_rate": 2.383762220508984e-07, - "loss": 0.1183, + "epoch": 0.10144193117427558, + "grad_norm": 1.0890006150478866, + "learning_rate": 4.963949252078698e-06, + "loss": 0.1965, "step": 1101 }, { - "epoch": 0.867375049193231, - "grad_norm": 0.9148412123541501, - "learning_rate": 2.356085882750242e-07, - "loss": 0.1078, + "epoch": 0.10153406735154559, + "grad_norm": 1.0198293652323223, + "learning_rate": 4.963820234991934e-06, + "loss": 0.2028, "step": 1102 }, { - "epoch": 0.8681621408894136, - "grad_norm": 1.0117176122067204, - "learning_rate": 2.328563203418574e-07, - "loss": 0.1217, + "epoch": 0.10162620352881559, + "grad_norm": 0.9681566672517501, + "learning_rate": 4.963690989139196e-06, + "loss": 0.1775, "step": 1103 }, { - "epoch": 0.8689492325855962, - "grad_norm": 0.9872081797420905, - "learning_rate": 2.3011943692790389e-07, - "loss": 0.117, + "epoch": 0.1017183397060856, + "grad_norm": 0.9842979036405417, + "learning_rate": 4.963561514532485e-06, + "loss": 0.174, "step": 1104 }, { - "epoch": 0.8697363242817788, - "grad_norm": 0.9476791655485511, - "learning_rate": 2.2739795660526948e-07, - "loss": 0.1157, + "epoch": 0.1018104758833556, + "grad_norm": 1.056487078327593, + "learning_rate": 4.963431811183821e-06, + "loss": 0.1923, "step": 1105 }, { - "epoch": 0.8705234159779615, - "grad_norm": 0.9183530530163464, - "learning_rate": 2.246918978415394e-07, - "loss": 0.1108, + "epoch": 0.1019026120606256, + "grad_norm": 0.9681514651053884, + "learning_rate": 4.963301879105249e-06, + "loss": 0.1735, "step": 1106 }, { - "epoch": 0.8713105076741441, - "grad_norm": 0.9622583408924335, - "learning_rate": 2.2200127899964786e-07, - "loss": 0.1188, + "epoch": 0.10199474823789562, + "grad_norm": 1.0279411063186674, + "learning_rate": 4.963171718308833e-06, + "loss": 0.1939, "step": 1107 }, { - "epoch": 0.8720975993703266, - "grad_norm": 0.9915067004751748, - "learning_rate": 2.1932611833775846e-07, - "loss": 0.1151, + "epoch": 0.10208688441516561, + "grad_norm": 1.0199638016460721, + "learning_rate": 4.963041328806656e-06, + "loss": 0.184, "step": 1108 }, { - "epoch": 0.8728846910665092, - "grad_norm": 0.9404810815181894, - "learning_rate": 2.1666643400913512e-07, - "loss": 0.1133, + "epoch": 0.10217902059243562, + "grad_norm": 1.0831824990470151, + "learning_rate": 4.962910710610827e-06, + "loss": 0.1919, "step": 1109 }, { - "epoch": 0.8736717827626919, - "grad_norm": 0.9750904254975121, - "learning_rate": 2.1402224406202377e-07, - "loss": 0.1187, + "epoch": 0.10227115676970562, + "grad_norm": 1.0908326206048449, + "learning_rate": 4.962779863733475e-06, + "loss": 0.1765, "step": 1110 }, { - "epoch": 0.8744588744588745, - "grad_norm": 0.942666742797311, - "learning_rate": 2.1139356643952667e-07, - "loss": 0.1133, + "epoch": 0.10236329294697563, + "grad_norm": 1.0045235303450732, + "learning_rate": 4.962648788186747e-06, + "loss": 0.1892, "step": 1111 }, { - "epoch": 0.875245966155057, - "grad_norm": 0.9261400322366565, - "learning_rate": 2.0878041897948121e-07, - "loss": 0.1095, + "epoch": 0.10245542912424563, + "grad_norm": 0.9971996009560316, + "learning_rate": 4.9625174839828135e-06, + "loss": 0.1818, "step": 1112 }, { - "epoch": 0.8760330578512396, - "grad_norm": 1.0714254257987408, - "learning_rate": 2.0618281941434058e-07, - "loss": 0.1197, + "epoch": 0.10254756530151564, + "grad_norm": 1.1008529906643778, + "learning_rate": 4.9623859511338664e-06, + "loss": 0.1859, "step": 1113 }, { - "epoch": 0.8768201495474223, - "grad_norm": 0.9322738584358286, - "learning_rate": 2.036007853710503e-07, - "loss": 0.114, + "epoch": 0.10263970147878565, + "grad_norm": 1.0675327190930683, + "learning_rate": 4.962254189652119e-06, + "loss": 0.1938, "step": 1114 }, { - "epoch": 0.8776072412436049, - "grad_norm": 0.9346649367642453, - "learning_rate": 2.0103433437093256e-07, - "loss": 0.1027, + "epoch": 0.10273183765605565, + "grad_norm": 0.9678039471099696, + "learning_rate": 4.962122199549806e-06, + "loss": 0.1842, "step": 1115 }, { - "epoch": 0.8783943329397875, - "grad_norm": 0.9499461297298013, - "learning_rate": 1.9848348382956294e-07, - "loss": 0.1228, + "epoch": 0.10282397383332566, + "grad_norm": 0.9783428439299713, + "learning_rate": 4.96198998083918e-06, + "loss": 0.1868, "step": 1116 }, { - "epoch": 0.87918142463597, - "grad_norm": 0.9811562591520676, - "learning_rate": 1.9594825105665654e-07, - "loss": 0.1168, + "epoch": 0.10291611001059565, + "grad_norm": 1.0795078906373854, + "learning_rate": 4.961857533532521e-06, + "loss": 0.2017, "step": 1117 }, { - "epoch": 0.8799685163321527, - "grad_norm": 0.901833893893408, - "learning_rate": 1.934286532559468e-07, - "loss": 0.0992, + "epoch": 0.10300824618786567, + "grad_norm": 0.9862093313250959, + "learning_rate": 4.961724857642125e-06, + "loss": 0.188, "step": 1118 }, { - "epoch": 0.8807556080283353, - "grad_norm": 0.9566664879297264, - "learning_rate": 1.9092470752507225e-07, - "loss": 0.1114, + "epoch": 0.10310038236513568, + "grad_norm": 1.023997592371275, + "learning_rate": 4.96159195318031e-06, + "loss": 0.1807, "step": 1119 }, { - "epoch": 0.8815426997245179, - "grad_norm": 0.8992036132523128, - "learning_rate": 1.8843643085545677e-07, - "loss": 0.1113, + "epoch": 0.10319251854240567, + "grad_norm": 1.1350867035477064, + "learning_rate": 4.9614588201594175e-06, + "loss": 0.1962, "step": 1120 }, { - "epoch": 0.8823297914207006, - "grad_norm": 1.031563578089281, - "learning_rate": 1.8596384013219726e-07, - "loss": 0.1168, + "epoch": 0.10328465471967568, + "grad_norm": 1.0486867617781612, + "learning_rate": 4.961325458591809e-06, + "loss": 0.1967, "step": 1121 }, { - "epoch": 0.8831168831168831, - "grad_norm": 0.942026348911581, - "learning_rate": 1.8350695213394777e-07, - "loss": 0.1206, + "epoch": 0.10337679089694568, + "grad_norm": 0.9979044530364211, + "learning_rate": 4.961191868489866e-06, + "loss": 0.1847, "step": 1122 }, { - "epoch": 0.8839039748130657, - "grad_norm": 0.9439190891698341, - "learning_rate": 1.8106578353280585e-07, - "loss": 0.1138, + "epoch": 0.10346892707421569, + "grad_norm": 1.0898276661037767, + "learning_rate": 4.961058049865994e-06, + "loss": 0.1841, "step": 1123 }, { - "epoch": 0.8846910665092483, - "grad_norm": 0.9261746330624396, - "learning_rate": 1.7864035089419973e-07, - "loss": 0.1118, + "epoch": 0.1035610632514857, + "grad_norm": 1.0539187928170928, + "learning_rate": 4.960924002732616e-06, + "loss": 0.2036, "step": 1124 }, { - "epoch": 0.885478158205431, - "grad_norm": 0.9958265328548447, - "learning_rate": 1.7623067067677467e-07, - "loss": 0.1235, + "epoch": 0.1036531994287557, + "grad_norm": 0.973520042589487, + "learning_rate": 4.9607897271021815e-06, + "loss": 0.1765, "step": 1125 }, { - "epoch": 0.8862652499016135, - "grad_norm": 1.0264812939307284, - "learning_rate": 1.7383675923228372e-07, - "loss": 0.1221, + "epoch": 0.10374533560602571, + "grad_norm": 1.0595893348731948, + "learning_rate": 4.960655222987155e-06, + "loss": 0.2013, "step": 1126 }, { - "epoch": 0.8870523415977961, - "grad_norm": 1.0457105898882355, - "learning_rate": 1.7145863280547348e-07, - "loss": 0.1183, + "epoch": 0.1038374717832957, + "grad_norm": 1.1099133446922225, + "learning_rate": 4.960520490400026e-06, + "loss": 0.1849, "step": 1127 }, { - "epoch": 0.8878394332939787, - "grad_norm": 0.9616200935637597, - "learning_rate": 1.6909630753397716e-07, - "loss": 0.1055, + "epoch": 0.10392960796056572, + "grad_norm": 0.9847206344296402, + "learning_rate": 4.9603855293533045e-06, + "loss": 0.1859, "step": 1128 }, { - "epoch": 0.8886265249901614, - "grad_norm": 1.0114965743550393, - "learning_rate": 1.6674979944820258e-07, - "loss": 0.1247, + "epoch": 0.10402174413783571, + "grad_norm": 1.0085016434462313, + "learning_rate": 4.960250339859523e-06, + "loss": 0.1922, "step": 1129 }, { - "epoch": 0.889413616686344, - "grad_norm": 1.0002918205099012, - "learning_rate": 1.644191244712251e-07, - "loss": 0.1245, + "epoch": 0.10411388031510573, + "grad_norm": 1.0132928372271228, + "learning_rate": 4.960114921931231e-06, + "loss": 0.1816, "step": 1130 }, { - "epoch": 0.8902007083825265, - "grad_norm": 1.0106011414793612, - "learning_rate": 1.621042984186777e-07, - "loss": 0.1222, + "epoch": 0.10420601649237574, + "grad_norm": 0.945813808487549, + "learning_rate": 4.959979275581005e-06, + "loss": 0.1926, "step": 1131 }, { - "epoch": 0.8909878000787091, - "grad_norm": 0.9663301334412446, - "learning_rate": 1.598053369986463e-07, - "loss": 0.1194, + "epoch": 0.10429815266964573, + "grad_norm": 1.091489002731477, + "learning_rate": 4.959843400821438e-06, + "loss": 0.187, "step": 1132 }, { - "epoch": 0.8917748917748918, - "grad_norm": 0.9465191160189073, - "learning_rate": 1.5752225581155995e-07, - "loss": 0.1085, + "epoch": 0.10439028884691574, + "grad_norm": 0.9602509824379453, + "learning_rate": 4.959707297665146e-06, + "loss": 0.1666, "step": 1133 }, { - "epoch": 0.8925619834710744, - "grad_norm": 1.0466368501224192, - "learning_rate": 1.5525507035008852e-07, - "loss": 0.1306, + "epoch": 0.10448242502418574, + "grad_norm": 1.070831104626253, + "learning_rate": 4.959570966124768e-06, + "loss": 0.1983, "step": 1134 }, { - "epoch": 0.893349075167257, - "grad_norm": 0.9494535695924311, - "learning_rate": 1.5300379599903408e-07, - "loss": 0.1172, + "epoch": 0.10457456120145575, + "grad_norm": 1.0644935824954898, + "learning_rate": 4.959434406212959e-06, + "loss": 0.187, "step": 1135 }, { - "epoch": 0.8941361668634396, - "grad_norm": 0.9062893885912572, - "learning_rate": 1.507684480352292e-07, - "loss": 0.109, + "epoch": 0.10466669737872576, + "grad_norm": 1.18188816476157, + "learning_rate": 4.959297617942403e-06, + "loss": 0.2021, "step": 1136 }, { - "epoch": 0.8949232585596222, - "grad_norm": 0.9371709714503786, - "learning_rate": 1.4854904162743127e-07, - "loss": 0.1047, + "epoch": 0.10475883355599576, + "grad_norm": 1.0758197586073297, + "learning_rate": 4.959160601325797e-06, + "loss": 0.1852, "step": 1137 }, { - "epoch": 0.8957103502558048, - "grad_norm": 1.0110155118454318, - "learning_rate": 1.4634559183622193e-07, - "loss": 0.126, + "epoch": 0.10485096973326577, + "grad_norm": 1.0990404429156002, + "learning_rate": 4.959023356375866e-06, + "loss": 0.1921, "step": 1138 }, { - "epoch": 0.8964974419519874, - "grad_norm": 1.0367843431150414, - "learning_rate": 1.4415811361390142e-07, - "loss": 0.1251, + "epoch": 0.10494310591053577, + "grad_norm": 1.028267704366153, + "learning_rate": 4.9588858831053495e-06, + "loss": 0.1953, "step": 1139 }, { - "epoch": 0.89728453364817, - "grad_norm": 0.9836901063346848, - "learning_rate": 1.4198662180439166e-07, - "loss": 0.1225, + "epoch": 0.10503524208780578, + "grad_norm": 0.9471378455619729, + "learning_rate": 4.958748181527016e-06, + "loss": 0.1834, "step": 1140 }, { - "epoch": 0.8980716253443526, - "grad_norm": 1.0729047805019176, - "learning_rate": 1.3983113114313078e-07, - "loss": 0.1321, + "epoch": 0.10512737826507579, + "grad_norm": 1.0244112736454591, + "learning_rate": 4.958610251653649e-06, + "loss": 0.1766, "step": 1141 }, { - "epoch": 0.8988587170405352, - "grad_norm": 0.9172308694693998, - "learning_rate": 1.3769165625697633e-07, - "loss": 0.1094, + "epoch": 0.10521951444234579, + "grad_norm": 1.0541172839830792, + "learning_rate": 4.958472093498055e-06, + "loss": 0.1942, "step": 1142 }, { - "epoch": 0.8996458087367178, - "grad_norm": 0.9246235655173746, - "learning_rate": 1.355682116641052e-07, - "loss": 0.1098, + "epoch": 0.1053116506196158, + "grad_norm": 0.9910686182968134, + "learning_rate": 4.9583337070730625e-06, + "loss": 0.1839, "step": 1143 }, { - "epoch": 0.9004329004329005, - "grad_norm": 0.9671203173685715, - "learning_rate": 1.3346081177391474e-07, - "loss": 0.1084, + "epoch": 0.10540378679688579, + "grad_norm": 0.9317050369769572, + "learning_rate": 4.958195092391521e-06, + "loss": 0.1908, "step": 1144 }, { - "epoch": 0.901219992129083, - "grad_norm": 0.9683781341379243, - "learning_rate": 1.3136947088692537e-07, - "loss": 0.119, + "epoch": 0.1054959229741558, + "grad_norm": 1.01507009801172, + "learning_rate": 4.958056249466301e-06, + "loss": 0.1772, "step": 1145 }, { - "epoch": 0.9020070838252656, - "grad_norm": 0.9395787949515245, - "learning_rate": 1.2929420319468254e-07, - "loss": 0.1135, + "epoch": 0.1055880591514258, + "grad_norm": 0.9887305632286719, + "learning_rate": 4.957917178310293e-06, + "loss": 0.1931, "step": 1146 }, { - "epoch": 0.9027941755214482, - "grad_norm": 0.9442679409050593, - "learning_rate": 1.272350227796626e-07, - "loss": 0.1215, + "epoch": 0.10568019532869581, + "grad_norm": 1.0393372989945, + "learning_rate": 4.957777878936411e-06, + "loss": 0.1966, "step": 1147 }, { - "epoch": 0.9035812672176309, - "grad_norm": 0.9549776744266577, - "learning_rate": 1.2519194361517468e-07, - "loss": 0.1065, + "epoch": 0.10577233150596582, + "grad_norm": 0.9892641791079512, + "learning_rate": 4.957638351357587e-06, + "loss": 0.1931, "step": 1148 }, { - "epoch": 0.9043683589138135, - "grad_norm": 0.867426121426782, - "learning_rate": 1.231649795652684e-07, - "loss": 0.0945, + "epoch": 0.10586446768323582, + "grad_norm": 0.9559643694345603, + "learning_rate": 4.957498595586779e-06, + "loss": 0.1823, "step": 1149 }, { - "epoch": 0.905155450609996, - "grad_norm": 0.9448412697453261, - "learning_rate": 1.2115414438463646e-07, - "loss": 0.1101, + "epoch": 0.10595660386050583, + "grad_norm": 0.979023807959273, + "learning_rate": 4.957358611636962e-06, + "loss": 0.178, "step": 1150 }, { - "epoch": 0.9059425423061787, - "grad_norm": 0.9917169805460462, - "learning_rate": 1.1915945171852572e-07, - "loss": 0.1163, + "epoch": 0.10604874003777583, + "grad_norm": 1.0774638507040097, + "learning_rate": 4.957218399521133e-06, + "loss": 0.1908, "step": 1151 }, { - "epoch": 0.9067296340023613, - "grad_norm": 0.943816804931634, - "learning_rate": 1.171809151026404e-07, - "loss": 0.1079, + "epoch": 0.10614087621504584, + "grad_norm": 0.9904806549507087, + "learning_rate": 4.957077959252311e-06, + "loss": 0.1955, "step": 1152 }, { - "epoch": 0.9075167256985439, - "grad_norm": 0.9275143855618218, - "learning_rate": 1.1521854796305243e-07, - "loss": 0.1075, + "epoch": 0.10623301239231585, + "grad_norm": 1.0784191373101655, + "learning_rate": 4.956937290843537e-06, + "loss": 0.1904, "step": 1153 }, { - "epoch": 0.9083038173947264, - "grad_norm": 1.0135210788160407, - "learning_rate": 1.1327236361611066e-07, - "loss": 0.1267, + "epoch": 0.10632514856958585, + "grad_norm": 1.1522667775208266, + "learning_rate": 4.95679639430787e-06, + "loss": 0.2227, "step": 1154 }, { - "epoch": 0.9090909090909091, - "grad_norm": 0.918433915733085, - "learning_rate": 1.1134237526834901e-07, - "loss": 0.1134, + "epoch": 0.10641728474685586, + "grad_norm": 0.9904608761960886, + "learning_rate": 4.956655269658393e-06, + "loss": 0.1822, "step": 1155 }, { - "epoch": 0.9098780007870917, - "grad_norm": 1.0485642131970048, - "learning_rate": 1.0942859601639793e-07, - "loss": 0.1181, + "epoch": 0.10650942092412585, + "grad_norm": 1.0503960405331592, + "learning_rate": 4.956513916908211e-06, + "loss": 0.1937, "step": 1156 }, { - "epoch": 0.9106650924832743, - "grad_norm": 0.9666449795771067, - "learning_rate": 1.0753103884689503e-07, - "loss": 0.1142, + "epoch": 0.10660155710139586, + "grad_norm": 1.0262513807644829, + "learning_rate": 4.956372336070448e-06, + "loss": 0.1808, "step": 1157 }, { - "epoch": 0.911452184179457, - "grad_norm": 0.9155262976479461, - "learning_rate": 1.0564971663639761e-07, - "loss": 0.1079, + "epoch": 0.10669369327866587, + "grad_norm": 1.0342363298395292, + "learning_rate": 4.956230527158248e-06, + "loss": 0.1749, "step": 1158 }, { - "epoch": 0.9122392758756395, - "grad_norm": 0.9136247354670238, - "learning_rate": 1.0378464215129419e-07, - "loss": 0.1131, + "epoch": 0.10678582945593587, + "grad_norm": 0.9789103538040284, + "learning_rate": 4.95608849018478e-06, + "loss": 0.1849, "step": 1159 }, { - "epoch": 0.9130263675718221, - "grad_norm": 0.9434235785738653, - "learning_rate": 1.0193582804771868e-07, - "loss": 0.1126, + "epoch": 0.10687796563320588, + "grad_norm": 1.0614565305811643, + "learning_rate": 4.95594622516323e-06, + "loss": 0.2029, "step": 1160 }, { - "epoch": 0.9138134592680047, - "grad_norm": 0.9320439540121583, - "learning_rate": 1.0010328687146464e-07, - "loss": 0.1128, + "epoch": 0.10697010181047588, + "grad_norm": 1.0560558999226575, + "learning_rate": 4.95580373210681e-06, + "loss": 0.2107, "step": 1161 }, { - "epoch": 0.9146005509641874, - "grad_norm": 0.9883313764295104, - "learning_rate": 9.828703105789983e-08, - "loss": 0.1189, + "epoch": 0.10706223798774589, + "grad_norm": 1.0797810477588987, + "learning_rate": 4.955661011028748e-06, + "loss": 0.2075, "step": 1162 }, { - "epoch": 0.91538764266037, - "grad_norm": 0.962978283773575, - "learning_rate": 9.648707293188092e-08, - "loss": 0.1181, + "epoch": 0.1071543741650159, + "grad_norm": 1.0029566214938326, + "learning_rate": 4.955518061942298e-06, + "loss": 0.1868, "step": 1163 }, { - "epoch": 0.9161747343565525, - "grad_norm": 0.9154295867986278, - "learning_rate": 9.470342470767197e-08, - "loss": 0.1077, + "epoch": 0.1072465103422859, + "grad_norm": 1.0588961555129832, + "learning_rate": 4.955374884860731e-06, + "loss": 0.2038, "step": 1164 }, { - "epoch": 0.9169618260527351, - "grad_norm": 0.9190343113758656, - "learning_rate": 9.293609848885971e-08, - "loss": 0.1101, + "epoch": 0.10733864651955591, + "grad_norm": 1.00256844787753, + "learning_rate": 4.9552314797973426e-06, + "loss": 0.2008, "step": 1165 }, { - "epoch": 0.9177489177489178, - "grad_norm": 0.8935174011429071, - "learning_rate": 9.118510626827198e-08, - "loss": 0.1112, + "epoch": 0.1074307826968259, + "grad_norm": 1.0284873277323123, + "learning_rate": 4.955087846765446e-06, + "loss": 0.1886, "step": 1166 }, { - "epoch": 0.9185360094451004, - "grad_norm": 0.9304289493526803, - "learning_rate": 8.945045992789669e-08, - "loss": 0.1037, + "epoch": 0.10752291887409592, + "grad_norm": 0.9660868860677466, + "learning_rate": 4.954943985778379e-06, + "loss": 0.1911, "step": 1167 }, { - "epoch": 0.9193231011412829, - "grad_norm": 0.9708158046423621, - "learning_rate": 8.773217123880074e-08, - "loss": 0.1255, + "epoch": 0.10761505505136591, + "grad_norm": 1.006615188078031, + "learning_rate": 4.954799896849499e-06, + "loss": 0.1988, "step": 1168 }, { - "epoch": 0.9201101928374655, - "grad_norm": 0.95525649253936, - "learning_rate": 8.603025186105064e-08, - "loss": 0.1119, + "epoch": 0.10770719122863592, + "grad_norm": 1.0946500653930293, + "learning_rate": 4.954655579992184e-06, + "loss": 0.2008, "step": 1169 }, { - "epoch": 0.9208972845336482, - "grad_norm": 1.0094573892269945, - "learning_rate": 8.434471334363204e-08, - "loss": 0.1201, + "epoch": 0.10779932740590593, + "grad_norm": 1.0096850735445058, + "learning_rate": 4.954511035219835e-06, + "loss": 0.1905, "step": 1170 }, { - "epoch": 0.9216843762298308, - "grad_norm": 0.9367157782024292, - "learning_rate": 8.267556712437342e-08, - "loss": 0.1057, + "epoch": 0.10789146358317593, + "grad_norm": 0.9542605339416056, + "learning_rate": 4.954366262545871e-06, + "loss": 0.1893, "step": 1171 }, { - "epoch": 0.9224714679260134, - "grad_norm": 0.9747763894177717, - "learning_rate": 8.102282452986693e-08, - "loss": 0.1098, + "epoch": 0.10798359976044594, + "grad_norm": 0.9358990406040504, + "learning_rate": 4.954221261983736e-06, + "loss": 0.1841, "step": 1172 }, { - "epoch": 0.9232585596221959, - "grad_norm": 0.9372662016679384, - "learning_rate": 7.938649677539268e-08, - "loss": 0.1081, + "epoch": 0.10807573593771594, + "grad_norm": 1.0250006987149098, + "learning_rate": 4.954076033546892e-06, + "loss": 0.1942, "step": 1173 }, { - "epoch": 0.9240456513183786, - "grad_norm": 0.9887803764792047, - "learning_rate": 7.77665949648404e-08, - "loss": 0.1199, + "epoch": 0.10816787211498595, + "grad_norm": 1.02972348562299, + "learning_rate": 4.953930577248825e-06, + "loss": 0.1924, "step": 1174 }, { - "epoch": 0.9248327430145612, - "grad_norm": 0.948252366615409, - "learning_rate": 7.616313009063791e-08, - "loss": 0.1064, + "epoch": 0.10826000829225596, + "grad_norm": 0.9230680708989243, + "learning_rate": 4.95378489310304e-06, + "loss": 0.1795, "step": 1175 }, { - "epoch": 0.9256198347107438, - "grad_norm": 0.948272373380358, - "learning_rate": 7.457611303367196e-08, - "loss": 0.1153, + "epoch": 0.10835214446952596, + "grad_norm": 1.006934556434401, + "learning_rate": 4.953638981123063e-06, + "loss": 0.1859, "step": 1176 }, { - "epoch": 0.9264069264069265, - "grad_norm": 0.951666041729817, - "learning_rate": 7.300555456321884e-08, - "loss": 0.1175, + "epoch": 0.10844428064679597, + "grad_norm": 0.9479125968654736, + "learning_rate": 4.9534928413224424e-06, + "loss": 0.1685, "step": 1177 }, { - "epoch": 0.927194018103109, - "grad_norm": 0.982566934759444, - "learning_rate": 7.145146533686725e-08, - "loss": 0.1164, + "epoch": 0.10853641682406596, + "grad_norm": 1.002904725939237, + "learning_rate": 4.953346473714748e-06, + "loss": 0.1972, "step": 1178 }, { - "epoch": 0.9279811097992916, - "grad_norm": 0.9553483812186222, - "learning_rate": 6.991385590044947e-08, - "loss": 0.1169, + "epoch": 0.10862855300133598, + "grad_norm": 0.9705753499726706, + "learning_rate": 4.953199878313569e-06, + "loss": 0.1833, "step": 1179 }, { - "epoch": 0.9287682014954742, - "grad_norm": 0.9724385774768447, - "learning_rate": 6.839273668796747e-08, - "loss": 0.1078, + "epoch": 0.10872068917860599, + "grad_norm": 0.9865043838278399, + "learning_rate": 4.953053055132518e-06, + "loss": 0.1868, "step": 1180 }, { - "epoch": 0.9295552931916569, - "grad_norm": 0.9324047746069145, - "learning_rate": 6.688811802152279e-08, - "loss": 0.1162, + "epoch": 0.10881282535587598, + "grad_norm": 0.9364729281823607, + "learning_rate": 4.9529060041852264e-06, + "loss": 0.1877, "step": 1181 }, { - "epoch": 0.9303423848878394, - "grad_norm": 0.9711188611376046, - "learning_rate": 6.540001011124703e-08, - "loss": 0.1089, + "epoch": 0.108904961533146, + "grad_norm": 0.9197551228545804, + "learning_rate": 4.9527587254853485e-06, + "loss": 0.1765, "step": 1182 }, { - "epoch": 0.931129476584022, - "grad_norm": 1.0007682860058293, - "learning_rate": 6.392842305523172e-08, - "loss": 0.1225, + "epoch": 0.10899709771041599, + "grad_norm": 0.9930734696539932, + "learning_rate": 4.952611219046559e-06, + "loss": 0.1975, "step": 1183 }, { - "epoch": 0.9319165682802046, - "grad_norm": 0.9074164360304593, - "learning_rate": 6.247336683946031e-08, - "loss": 0.1086, + "epoch": 0.109089233887686, + "grad_norm": 0.9438054002784088, + "learning_rate": 4.952463484882553e-06, + "loss": 0.1746, "step": 1184 }, { - "epoch": 0.9327036599763873, - "grad_norm": 0.9132051814101239, - "learning_rate": 6.103485133774039e-08, - "loss": 0.1168, + "epoch": 0.109181370064956, + "grad_norm": 1.0275596921449845, + "learning_rate": 4.9523155230070495e-06, + "loss": 0.1882, "step": 1185 }, { - "epoch": 0.9334907516725699, - "grad_norm": 0.9362633318018305, - "learning_rate": 5.961288631163687e-08, - "loss": 0.1162, + "epoch": 0.10927350624222601, + "grad_norm": 0.9827711445828464, + "learning_rate": 4.952167333433785e-06, + "loss": 0.1813, "step": 1186 }, { - "epoch": 0.9342778433687524, - "grad_norm": 1.0037784789548483, - "learning_rate": 5.820748141040444e-08, - "loss": 0.1246, + "epoch": 0.10936564241949602, + "grad_norm": 1.0596005745086683, + "learning_rate": 4.952018916176521e-06, + "loss": 0.1867, "step": 1187 }, { - "epoch": 0.935064935064935, - "grad_norm": 0.9109713869964553, - "learning_rate": 5.681864617092414e-08, - "loss": 0.1062, + "epoch": 0.10945777859676602, + "grad_norm": 0.9426441584350082, + "learning_rate": 4.9518702712490355e-06, + "loss": 0.1697, "step": 1188 }, { - "epoch": 0.9358520267611177, - "grad_norm": 0.9353212505070928, - "learning_rate": 5.544639001763719e-08, - "loss": 0.1116, + "epoch": 0.10954991477403603, + "grad_norm": 1.0978130786232543, + "learning_rate": 4.951721398665131e-06, + "loss": 0.195, "step": 1189 }, { - "epoch": 0.9366391184573003, - "grad_norm": 0.9763975248080838, - "learning_rate": 5.4090722262481463e-08, - "loss": 0.1183, + "epoch": 0.10964205095130602, + "grad_norm": 1.054271925935406, + "learning_rate": 4.951572298438632e-06, + "loss": 0.1778, "step": 1190 }, { - "epoch": 0.9374262101534829, - "grad_norm": 0.9389308712439575, - "learning_rate": 5.2751652104828245e-08, - "loss": 0.1125, + "epoch": 0.10973418712857604, + "grad_norm": 0.9363792710466154, + "learning_rate": 4.95142297058338e-06, + "loss": 0.182, "step": 1191 }, { - "epoch": 0.9382133018496654, - "grad_norm": 0.9219409357377748, - "learning_rate": 5.142918863141999e-08, - "loss": 0.1045, + "epoch": 0.10982632330584605, + "grad_norm": 0.9582824053873974, + "learning_rate": 4.951273415113243e-06, + "loss": 0.191, "step": 1192 }, { - "epoch": 0.9390003935458481, - "grad_norm": 0.9112274236581333, - "learning_rate": 5.012334081630821e-08, - "loss": 0.1154, + "epoch": 0.10991845948311604, + "grad_norm": 1.04768214799217, + "learning_rate": 4.951123632042104e-06, + "loss": 0.1876, "step": 1193 }, { - "epoch": 0.9397874852420307, - "grad_norm": 1.0413638466805462, - "learning_rate": 4.8834117520793754e-08, - "loss": 0.1235, + "epoch": 0.11001059566038605, + "grad_norm": 0.9511803599003008, + "learning_rate": 4.950973621383873e-06, + "loss": 0.1682, "step": 1194 }, { - "epoch": 0.9405745769382133, - "grad_norm": 0.929473983088696, - "learning_rate": 4.756152749336468e-08, - "loss": 0.1216, + "epoch": 0.11010273183765605, + "grad_norm": 1.0673589766836193, + "learning_rate": 4.950823383152478e-06, + "loss": 0.2048, "step": 1195 }, { - "epoch": 0.9413616686343959, - "grad_norm": 0.9499785152690334, - "learning_rate": 4.6305579369638474e-08, - "loss": 0.119, + "epoch": 0.11019486801492606, + "grad_norm": 1.0445742808478182, + "learning_rate": 4.9506729173618675e-06, + "loss": 0.1819, "step": 1196 }, { - "epoch": 0.9421487603305785, - "grad_norm": 0.9512339302739883, - "learning_rate": 4.506628167230326e-08, - "loss": 0.1128, + "epoch": 0.11028700419219607, + "grad_norm": 0.8806288475750527, + "learning_rate": 4.950522224026012e-06, + "loss": 0.1729, "step": 1197 }, { - "epoch": 0.9429358520267611, - "grad_norm": 0.9789643064479855, - "learning_rate": 4.384364281105974e-08, - "loss": 0.1156, + "epoch": 0.11037914036946607, + "grad_norm": 1.076709708388022, + "learning_rate": 4.950371303158905e-06, + "loss": 0.1789, "step": 1198 }, { - "epoch": 0.9437229437229437, - "grad_norm": 0.9338458352272411, - "learning_rate": 4.2637671082563225e-08, - "loss": 0.1097, + "epoch": 0.11047127654673608, + "grad_norm": 0.9229384393059257, + "learning_rate": 4.950220154774559e-06, + "loss": 0.1733, "step": 1199 }, { - "epoch": 0.9445100354191264, - "grad_norm": 0.9477658752462017, - "learning_rate": 4.144837467036922e-08, - "loss": 0.1062, + "epoch": 0.11056341272400608, + "grad_norm": 0.9186878701388156, + "learning_rate": 4.950068778887007e-06, + "loss": 0.1753, "step": 1200 }, { - "epoch": 0.9452971271153089, - "grad_norm": 0.9108658408708349, - "learning_rate": 4.0275761644876785e-08, - "loss": 0.113, + "epoch": 0.11065554890127609, + "grad_norm": 0.9700339527796721, + "learning_rate": 4.949917175510307e-06, + "loss": 0.1912, "step": 1201 }, { - "epoch": 0.9460842188114915, - "grad_norm": 0.9670323890311822, - "learning_rate": 3.911983996327251e-08, - "loss": 0.1159, + "epoch": 0.11074768507854608, + "grad_norm": 0.9264578935919071, + "learning_rate": 4.949765344658532e-06, + "loss": 0.1807, "step": 1202 }, { - "epoch": 0.9468713105076741, - "grad_norm": 0.9508320183305409, - "learning_rate": 3.798061746947995e-08, - "loss": 0.1183, + "epoch": 0.1108398212558161, + "grad_norm": 0.9953730707901259, + "learning_rate": 4.949613286345781e-06, + "loss": 0.1897, "step": 1203 }, { - "epoch": 0.9476584022038568, - "grad_norm": 0.9446280692518585, - "learning_rate": 3.6858101894102774e-08, - "loss": 0.1039, + "epoch": 0.1109319574330861, + "grad_norm": 0.9958814097903571, + "learning_rate": 4.9494610005861745e-06, + "loss": 0.1855, "step": 1204 }, { - "epoch": 0.9484454939000394, - "grad_norm": 0.8915603321077954, - "learning_rate": 3.575230085437448e-08, - "loss": 0.1131, + "epoch": 0.1110240936103561, + "grad_norm": 0.9974041687614713, + "learning_rate": 4.949308487393849e-06, + "loss": 0.1887, "step": 1205 }, { - "epoch": 0.9492325855962219, - "grad_norm": 1.0049928450920236, - "learning_rate": 3.466322185410542e-08, - "loss": 0.1075, + "epoch": 0.11111622978762611, + "grad_norm": 1.049073106143341, + "learning_rate": 4.949155746782966e-06, + "loss": 0.2123, "step": 1206 }, { - "epoch": 0.9500196772924046, - "grad_norm": 0.9377285133019151, - "learning_rate": 3.3590872283633944e-08, - "loss": 0.1047, + "epoch": 0.11120836596489611, + "grad_norm": 0.9442037997859811, + "learning_rate": 4.94900277876771e-06, + "loss": 0.1698, "step": 1207 }, { - "epoch": 0.9508067689885872, - "grad_norm": 1.0283573568735918, - "learning_rate": 3.253525941977309e-08, - "loss": 0.1207, + "epoch": 0.11130050214216612, + "grad_norm": 1.0030492263525004, + "learning_rate": 4.948849583362282e-06, + "loss": 0.1939, "step": 1208 }, { - "epoch": 0.9515938606847698, - "grad_norm": 0.8716127646526632, - "learning_rate": 3.1496390425764246e-08, - "loss": 0.1034, + "epoch": 0.11139263831943613, + "grad_norm": 1.0918873927109156, + "learning_rate": 4.948696160580907e-06, + "loss": 0.2061, "step": 1209 }, { - "epoch": 0.9523809523809523, - "grad_norm": 0.9413176172911034, - "learning_rate": 3.047427235122663e-08, - "loss": 0.1094, + "epoch": 0.11148477449670613, + "grad_norm": 0.9850049521735987, + "learning_rate": 4.948542510437829e-06, + "loss": 0.1791, "step": 1210 }, { - "epoch": 0.953168044077135, - "grad_norm": 0.9235158713031617, - "learning_rate": 2.9468912132110117e-08, - "loss": 0.1112, + "epoch": 0.11157691067397614, + "grad_norm": 0.8914351041716434, + "learning_rate": 4.948388632947316e-06, + "loss": 0.1618, "step": 1211 }, { - "epoch": 0.9539551357733176, - "grad_norm": 1.0172961607730988, - "learning_rate": 2.8480316590648315e-08, - "loss": 0.1174, + "epoch": 0.11166904685124614, + "grad_norm": 0.9481190935623166, + "learning_rate": 4.948234528123655e-06, + "loss": 0.1926, "step": 1212 }, { - "epoch": 0.9547422274695002, - "grad_norm": 0.9633734296197954, - "learning_rate": 2.750849243531223e-08, - "loss": 0.119, + "epoch": 0.11176118302851615, + "grad_norm": 1.0067784660769212, + "learning_rate": 4.948080195981154e-06, + "loss": 0.1871, "step": 1213 }, { - "epoch": 0.9555293191656828, - "grad_norm": 0.8730009976362983, - "learning_rate": 2.655344626076417e-08, - "loss": 0.1012, + "epoch": 0.11185331920578616, + "grad_norm": 0.962562766383318, + "learning_rate": 4.947925636534144e-06, + "loss": 0.1781, "step": 1214 }, { - "epoch": 0.9563164108618654, - "grad_norm": 0.9472387317654097, - "learning_rate": 2.5615184547813364e-08, - "loss": 0.1169, + "epoch": 0.11194545538305616, + "grad_norm": 1.0027653781996462, + "learning_rate": 4.947770849796975e-06, + "loss": 0.1888, "step": 1215 }, { - "epoch": 0.957103502558048, - "grad_norm": 1.009282211648514, - "learning_rate": 2.4693713663372643e-08, - "loss": 0.1193, + "epoch": 0.11203759156032617, + "grad_norm": 0.9736579394329236, + "learning_rate": 4.9476158357840194e-06, + "loss": 0.1795, "step": 1216 }, { - "epoch": 0.9578905942542306, - "grad_norm": 0.9320555090575322, - "learning_rate": 2.378903986041403e-08, - "loss": 0.1134, + "epoch": 0.11212972773759616, + "grad_norm": 1.0500111563474652, + "learning_rate": 4.9474605945096695e-06, + "loss": 0.2044, "step": 1217 }, { - "epoch": 0.9586776859504132, - "grad_norm": 0.9964824551488268, - "learning_rate": 2.2901169277927126e-08, - "loss": 0.123, + "epoch": 0.11222186391486617, + "grad_norm": 1.0505396188236562, + "learning_rate": 4.94730512598834e-06, + "loss": 0.1849, "step": 1218 }, { - "epoch": 0.9594647776465959, - "grad_norm": 0.9252548740156445, - "learning_rate": 2.2030107940877733e-08, - "loss": 0.1145, + "epoch": 0.11231400009213617, + "grad_norm": 0.9764803395123072, + "learning_rate": 4.947149430234467e-06, + "loss": 0.1906, "step": 1219 }, { - "epoch": 0.9602518693427784, - "grad_norm": 1.0057993388132023, - "learning_rate": 2.117586176016512e-08, - "loss": 0.1246, + "epoch": 0.11240613626940618, + "grad_norm": 1.0016094631221018, + "learning_rate": 4.946993507262505e-06, + "loss": 0.1858, "step": 1220 }, { - "epoch": 0.961038961038961, - "grad_norm": 0.953596307625609, - "learning_rate": 2.0338436532584826e-08, - "loss": 0.1113, + "epoch": 0.11249827244667619, + "grad_norm": 0.9867478645758117, + "learning_rate": 4.946837357086933e-06, + "loss": 0.1871, "step": 1221 }, { - "epoch": 0.9618260527351437, - "grad_norm": 0.8951377954636907, - "learning_rate": 1.9517837940786767e-08, - "loss": 0.1033, + "epoch": 0.11259040862394619, + "grad_norm": 1.0536305452882067, + "learning_rate": 4.946680979722249e-06, + "loss": 0.2072, "step": 1222 }, { - "epoch": 0.9626131444313263, - "grad_norm": 1.0221216536124687, - "learning_rate": 1.8714071553238012e-08, - "loss": 0.125, + "epoch": 0.1126825448012162, + "grad_norm": 1.0001369286623907, + "learning_rate": 4.946524375182973e-06, + "loss": 0.1849, "step": 1223 }, { - "epoch": 0.9634002361275088, - "grad_norm": 0.9387870706305922, - "learning_rate": 1.7927142824184784e-08, - "loss": 0.1144, + "epoch": 0.1127746809784862, + "grad_norm": 0.9590582772180609, + "learning_rate": 4.946367543483645e-06, + "loss": 0.1948, "step": 1224 }, { - "epoch": 0.9641873278236914, - "grad_norm": 0.9321486745496109, - "learning_rate": 1.7157057093614704e-08, - "loss": 0.1094, + "epoch": 0.11286681715575621, + "grad_norm": 1.0317577767091315, + "learning_rate": 4.946210484638827e-06, + "loss": 0.1842, "step": 1225 }, { - "epoch": 0.9649744195198741, - "grad_norm": 1.0128840378759991, - "learning_rate": 1.6403819587221814e-08, - "loss": 0.1217, + "epoch": 0.11295895333302622, + "grad_norm": 0.9817570764988467, + "learning_rate": 4.946053198663103e-06, + "loss": 0.1647, "step": 1226 }, { - "epoch": 0.9657615112160567, - "grad_norm": 0.924640460993744, - "learning_rate": 1.5667435416370226e-08, - "loss": 0.1115, + "epoch": 0.11305108951029622, + "grad_norm": 1.0826573918808229, + "learning_rate": 4.945895685571076e-06, + "loss": 0.2078, "step": 1227 }, { - "epoch": 0.9665486029122393, - "grad_norm": 0.9932790569946806, - "learning_rate": 1.494790957805997e-08, - "loss": 0.115, + "epoch": 0.11314322568756623, + "grad_norm": 0.9688980456746177, + "learning_rate": 4.945737945377372e-06, + "loss": 0.1812, "step": 1228 }, { - "epoch": 0.9673356946084218, - "grad_norm": 0.9381951620042324, - "learning_rate": 1.4245246954892323e-08, - "loss": 0.1096, + "epoch": 0.11323536186483622, + "grad_norm": 0.992886307038925, + "learning_rate": 4.945579978096635e-06, + "loss": 0.1841, "step": 1229 }, { - "epoch": 0.9681227863046045, - "grad_norm": 0.9070754098607924, - "learning_rate": 1.3559452315037025e-08, - "loss": 0.107, + "epoch": 0.11332749804210623, + "grad_norm": 1.0512449225985512, + "learning_rate": 4.945421783743535e-06, + "loss": 0.1818, "step": 1230 }, { - "epoch": 0.9689098780007871, - "grad_norm": 0.9138545833001099, - "learning_rate": 1.2890530312200944e-08, - "loss": 0.1027, + "epoch": 0.11341963421937624, + "grad_norm": 1.0098209511209224, + "learning_rate": 4.945263362332759e-06, + "loss": 0.1857, "step": 1231 }, { - "epoch": 0.9696969696969697, - "grad_norm": 0.959458803400461, - "learning_rate": 1.2238485485594753e-08, - "loss": 0.1163, + "epoch": 0.11351177039664624, + "grad_norm": 1.1196581888871462, + "learning_rate": 4.945104713879017e-06, + "loss": 0.1891, "step": 1232 }, { - "epoch": 0.9704840613931524, - "grad_norm": 0.9577614004721761, - "learning_rate": 1.160332225990296e-08, - "loss": 0.1148, + "epoch": 0.11360390657391625, + "grad_norm": 0.933560568750918, + "learning_rate": 4.9449458383970386e-06, + "loss": 0.1791, "step": 1233 }, { - "epoch": 0.9712711530893349, - "grad_norm": 0.9713054396060389, - "learning_rate": 1.0985044945254763e-08, - "loss": 0.1249, + "epoch": 0.11369604275118625, + "grad_norm": 1.0192174357061985, + "learning_rate": 4.944786735901576e-06, + "loss": 0.1794, "step": 1234 }, { - "epoch": 0.9720582447855175, - "grad_norm": 0.9727071823355634, - "learning_rate": 1.0383657737192964e-08, - "loss": 0.1188, + "epoch": 0.11378817892845626, + "grad_norm": 1.0521293689124385, + "learning_rate": 4.944627406407401e-06, + "loss": 0.1932, "step": 1235 }, { - "epoch": 0.9728453364817001, - "grad_norm": 0.9525541993565831, - "learning_rate": 9.79916471664677e-09, - "loss": 0.1129, + "epoch": 0.11388031510572626, + "grad_norm": 0.9583785459922292, + "learning_rate": 4.94446784992931e-06, + "loss": 0.17, "step": 1236 }, { - "epoch": 0.9736324281778828, - "grad_norm": 0.9792192000262094, - "learning_rate": 9.231569849904309e-09, - "loss": 0.124, + "epoch": 0.11397245128299627, + "grad_norm": 1.050868615441966, + "learning_rate": 4.9443080664821156e-06, + "loss": 0.1939, "step": 1237 }, { - "epoch": 0.9744195198740653, - "grad_norm": 0.9711922853486228, - "learning_rate": 8.680876988584607e-09, - "loss": 0.1114, + "epoch": 0.11406458746026628, + "grad_norm": 1.0764625733452038, + "learning_rate": 4.944148056080654e-06, + "loss": 0.1994, "step": 1238 }, { - "epoch": 0.9752066115702479, - "grad_norm": 0.9623623423156289, - "learning_rate": 8.147089869612045e-09, - "loss": 0.1064, + "epoch": 0.11415672363753628, + "grad_norm": 1.0277252761190465, + "learning_rate": 4.943987818739782e-06, + "loss": 0.1876, "step": 1239 }, { - "epoch": 0.9759937032664305, - "grad_norm": 0.9583105118384698, - "learning_rate": 7.630212115191381e-09, - "loss": 0.1128, + "epoch": 0.11424885981480629, + "grad_norm": 1.0350109470595392, + "learning_rate": 4.943827354474378e-06, + "loss": 0.1894, "step": 1240 }, { - "epoch": 0.9767807949626132, - "grad_norm": 0.958416495157565, - "learning_rate": 7.130247232782217e-09, - "loss": 0.1124, + "epoch": 0.11434099599207628, + "grad_norm": 1.0146682594155343, + "learning_rate": 4.943666663299341e-06, + "loss": 0.1729, "step": 1241 }, { - "epoch": 0.9775678866587958, - "grad_norm": 0.9739671638616083, - "learning_rate": 6.647198615076789e-09, - "loss": 0.1239, + "epoch": 0.1144331321693463, + "grad_norm": 1.028187292870586, + "learning_rate": 4.943505745229592e-06, + "loss": 0.1855, "step": 1242 }, { - "epoch": 0.9783549783549783, - "grad_norm": 0.9831503405039121, - "learning_rate": 6.181069539974716e-09, - "loss": 0.1152, + "epoch": 0.1145252683466163, + "grad_norm": 1.099924483199594, + "learning_rate": 4.943344600280071e-06, + "loss": 0.2035, "step": 1243 }, { - "epoch": 0.9791420700511609, - "grad_norm": 0.9837400272891019, - "learning_rate": 5.7318631705630126e-09, - "loss": 0.1196, + "epoch": 0.1146174045238863, + "grad_norm": 1.1454243464465912, + "learning_rate": 4.943183228465742e-06, + "loss": 0.1993, "step": 1244 }, { - "epoch": 0.9799291617473436, - "grad_norm": 0.9663876709057511, - "learning_rate": 5.299582555093052e-09, - "loss": 0.1174, + "epoch": 0.11470954070115631, + "grad_norm": 0.9485855285325462, + "learning_rate": 4.943021629801586e-06, + "loss": 0.1691, "step": 1245 }, { - "epoch": 0.9807162534435262, - "grad_norm": 0.9077428753737183, - "learning_rate": 4.884230626960307e-09, - "loss": 0.1046, + "epoch": 0.11480167687842631, + "grad_norm": 1.0626072276859084, + "learning_rate": 4.9428598043026085e-06, + "loss": 0.1846, "step": 1246 }, { - "epoch": 0.9815033451397088, - "grad_norm": 0.9624717672564354, - "learning_rate": 4.485810204684638e-09, - "loss": 0.1143, + "epoch": 0.11489381305569632, + "grad_norm": 0.9269346524433504, + "learning_rate": 4.942697751983837e-06, + "loss": 0.1773, "step": 1247 }, { - "epoch": 0.9822904368358913, - "grad_norm": 0.9768731662603329, - "learning_rate": 4.104323991891424e-09, - "loss": 0.1111, + "epoch": 0.11498594923296633, + "grad_norm": 0.9535100536348952, + "learning_rate": 4.942535472860315e-06, + "loss": 0.1787, "step": 1248 }, { - "epoch": 0.983077528532074, - "grad_norm": 0.9918364976987204, - "learning_rate": 3.739774577292688e-09, - "loss": 0.1146, + "epoch": 0.11507808541023633, + "grad_norm": 1.0023957002528299, + "learning_rate": 4.942372966947112e-06, + "loss": 0.1866, "step": 1249 }, { - "epoch": 0.9838646202282566, - "grad_norm": 0.9235361594154657, - "learning_rate": 3.392164434669609e-09, - "loss": 0.115, + "epoch": 0.11517022158750634, + "grad_norm": 0.994322779502017, + "learning_rate": 4.942210234259316e-06, + "loss": 0.1778, "step": 1250 }, { - "epoch": 0.9846517119244392, - "grad_norm": 0.9407569711042593, - "learning_rate": 3.0614959228558728e-09, - "loss": 0.1048, + "epoch": 0.11526235776477634, + "grad_norm": 0.9575100289627279, + "learning_rate": 4.9420472748120365e-06, + "loss": 0.1941, "step": 1251 }, { - "epoch": 0.9854388036206218, - "grad_norm": 0.9503971642912823, - "learning_rate": 2.7477712857215676e-09, - "loss": 0.1118, + "epoch": 0.11535449394204635, + "grad_norm": 0.9484403113363623, + "learning_rate": 4.941884088620405e-06, + "loss": 0.1731, "step": 1252 }, { - "epoch": 0.9862258953168044, - "grad_norm": 0.9423931683522886, - "learning_rate": 2.450992652157924e-09, - "loss": 0.1052, + "epoch": 0.11544663011931634, + "grad_norm": 1.0017289164577379, + "learning_rate": 4.941720675699573e-06, + "loss": 0.1819, "step": 1253 }, { - "epoch": 0.987012987012987, - "grad_norm": 0.9692730394836648, - "learning_rate": 2.1711620360634344e-09, - "loss": 0.1134, + "epoch": 0.11553876629658635, + "grad_norm": 0.9694215635786955, + "learning_rate": 4.941557036064714e-06, + "loss": 0.1838, "step": 1254 }, { - "epoch": 0.9878000787091696, - "grad_norm": 0.8578557269739953, - "learning_rate": 1.9082813363294205e-09, - "loss": 0.1062, + "epoch": 0.11563090247385636, + "grad_norm": 1.021592492535489, + "learning_rate": 4.9413931697310215e-06, + "loss": 0.197, "step": 1255 }, { - "epoch": 0.9885871704053523, - "grad_norm": 0.9417059944579995, - "learning_rate": 1.662352336827544e-09, - "loss": 0.1198, + "epoch": 0.11572303865112636, + "grad_norm": 1.0982720789156573, + "learning_rate": 4.941229076713709e-06, + "loss": 0.1999, "step": 1256 }, { - "epoch": 0.9893742621015348, - "grad_norm": 1.0199292836158018, - "learning_rate": 1.4333767063973159e-09, - "loss": 0.1085, + "epoch": 0.11581517482839637, + "grad_norm": 0.9286929084767928, + "learning_rate": 4.9410647570280156e-06, + "loss": 0.1699, "step": 1257 }, { - "epoch": 0.9901613537977174, - "grad_norm": 0.9283942300248242, - "learning_rate": 1.221355998835272e-09, - "loss": 0.1069, + "epoch": 0.11590731100566637, + "grad_norm": 1.0284887867762864, + "learning_rate": 4.940900210689196e-06, + "loss": 0.1965, "step": 1258 }, { - "epoch": 0.9909484454939, - "grad_norm": 0.9097063653552931, - "learning_rate": 1.0262916528841483e-09, - "loss": 0.1074, + "epoch": 0.11599944718293638, + "grad_norm": 1.0434528337767535, + "learning_rate": 4.94073543771253e-06, + "loss": 0.1971, "step": 1259 }, { - "epoch": 0.9917355371900827, - "grad_norm": 0.9806238353833303, - "learning_rate": 8.481849922237217e-10, - "loss": 0.1136, + "epoch": 0.11609158336020639, + "grad_norm": 0.9721395364025261, + "learning_rate": 4.940570438113315e-06, + "loss": 0.1916, "step": 1260 }, { - "epoch": 0.9925226288862653, - "grad_norm": 0.9354892517698297, - "learning_rate": 6.870372254602631e-10, - "loss": 0.1048, + "epoch": 0.11618371953747639, + "grad_norm": 1.026835672769356, + "learning_rate": 4.940405211906872e-06, + "loss": 0.19, "step": 1261 }, { - "epoch": 0.9933097205824478, - "grad_norm": 0.9045069809298675, - "learning_rate": 5.428494461201527e-10, - "loss": 0.109, + "epoch": 0.1162758557147464, + "grad_norm": 0.983921489168916, + "learning_rate": 4.9402397591085435e-06, + "loss": 0.1901, "step": 1262 }, { - "epoch": 0.9940968122786304, - "grad_norm": 0.9462573748322036, - "learning_rate": 4.156226326415547e-10, - "loss": 0.1156, + "epoch": 0.1163679918920164, + "grad_norm": 1.006030490851194, + "learning_rate": 4.94007407973369e-06, + "loss": 0.2099, "step": 1263 }, { - "epoch": 0.9948839039748131, - "grad_norm": 0.9123815541723352, - "learning_rate": 3.0535764836747696e-10, - "loss": 0.1078, + "epoch": 0.1164601280692864, + "grad_norm": 0.946916078921123, + "learning_rate": 4.939908173797696e-06, + "loss": 0.1796, "step": 1264 }, { - "epoch": 0.9956709956709957, - "grad_norm": 0.960544516493779, - "learning_rate": 2.1205524154105372e-10, - "loss": 0.1088, + "epoch": 0.11655226424655642, + "grad_norm": 0.9186429153859957, + "learning_rate": 4.939742041315964e-06, + "loss": 0.1764, "step": 1265 }, { - "epoch": 0.9964580873671782, - "grad_norm": 0.8887242195384208, - "learning_rate": 1.357160452988837e-10, - "loss": 0.1031, + "epoch": 0.11664440042382641, + "grad_norm": 1.0162264460794295, + "learning_rate": 4.939575682303923e-06, + "loss": 0.1946, "step": 1266 }, { - "epoch": 0.9972451790633609, - "grad_norm": 0.9502744942086568, - "learning_rate": 7.63405776685322e-11, - "loss": 0.1124, + "epoch": 0.11673653660109642, + "grad_norm": 0.939647214901781, + "learning_rate": 4.939409096777017e-06, + "loss": 0.1816, "step": 1267 }, { - "epoch": 0.9980322707595435, - "grad_norm": 0.8901953170723566, - "learning_rate": 3.3929241563535056e-11, - "loss": 0.1001, + "epoch": 0.11682867277836642, + "grad_norm": 1.0127923569806176, + "learning_rate": 4.939242284750712e-06, + "loss": 0.1787, "step": 1268 }, { - "epoch": 0.9988193624557261, - "grad_norm": 0.9602428624879035, - "learning_rate": 8.482324780900718e-12, - "loss": 0.1116, + "epoch": 0.11692080895563643, + "grad_norm": 0.9696949793271437, + "learning_rate": 4.9390752462405e-06, + "loss": 0.182, "step": 1269 }, { - "epoch": 0.9996064541519087, - "grad_norm": 0.9779040145681426, - "learning_rate": 0.0, - "loss": 0.119, + "epoch": 0.11701294513290643, + "grad_norm": 1.0188460893399238, + "learning_rate": 4.938907981261889e-06, + "loss": 0.1925, "step": 1270 }, { - "epoch": 0.9996064541519087, - "step": 1270, - "total_flos": 223330201436160.0, - "train_loss": 0.14531472616308316, - "train_runtime": 7100.3282, - "train_samples_per_second": 11.452, - "train_steps_per_second": 0.179 + "epoch": 0.11710508131017644, + "grad_norm": 1.047535048711083, + "learning_rate": 4.938740489830409e-06, + "loss": 0.1858, + "step": 1271 + }, + { + "epoch": 0.11719721748744645, + "grad_norm": 0.9839924846788682, + "learning_rate": 4.938572771961612e-06, + "loss": 0.193, + "step": 1272 + }, + { + "epoch": 0.11728935366471645, + "grad_norm": 1.0763836514639684, + "learning_rate": 4.93840482767107e-06, + "loss": 0.1942, + "step": 1273 + }, + { + "epoch": 0.11738148984198646, + "grad_norm": 0.9660646463485411, + "learning_rate": 4.938236656974378e-06, + "loss": 0.1703, + "step": 1274 + }, + { + "epoch": 0.11747362601925646, + "grad_norm": 0.9776665253257636, + "learning_rate": 4.9380682598871505e-06, + "loss": 0.1821, + "step": 1275 + }, + { + "epoch": 0.11756576219652647, + "grad_norm": 1.002267232847209, + "learning_rate": 4.937899636425022e-06, + "loss": 0.2, + "step": 1276 + }, + { + "epoch": 0.11765789837379648, + "grad_norm": 0.9378913381320999, + "learning_rate": 4.9377307866036506e-06, + "loss": 0.1895, + "step": 1277 + }, + { + "epoch": 0.11775003455106647, + "grad_norm": 0.9960120707110922, + "learning_rate": 4.9375617104387124e-06, + "loss": 0.1887, + "step": 1278 + }, + { + "epoch": 0.11784217072833648, + "grad_norm": 1.0253843069745778, + "learning_rate": 4.9373924079459076e-06, + "loss": 0.1895, + "step": 1279 + }, + { + "epoch": 0.11793430690560648, + "grad_norm": 0.9694205674452752, + "learning_rate": 4.937222879140955e-06, + "loss": 0.1948, + "step": 1280 + }, + { + "epoch": 0.11802644308287649, + "grad_norm": 0.9340661317572511, + "learning_rate": 4.937053124039597e-06, + "loss": 0.1793, + "step": 1281 + }, + { + "epoch": 0.1181185792601465, + "grad_norm": 1.0174735224747125, + "learning_rate": 4.9368831426575925e-06, + "loss": 0.182, + "step": 1282 + }, + { + "epoch": 0.1182107154374165, + "grad_norm": 0.9767629316407481, + "learning_rate": 4.9367129350107265e-06, + "loss": 0.1837, + "step": 1283 + }, + { + "epoch": 0.11830285161468651, + "grad_norm": 1.054908188076334, + "learning_rate": 4.936542501114803e-06, + "loss": 0.1852, + "step": 1284 + }, + { + "epoch": 0.11839498779195651, + "grad_norm": 0.9595958456149299, + "learning_rate": 4.936371840985645e-06, + "loss": 0.17, + "step": 1285 + }, + { + "epoch": 0.11848712396922652, + "grad_norm": 1.0916556011220866, + "learning_rate": 4.9362009546391e-06, + "loss": 0.1717, + "step": 1286 + }, + { + "epoch": 0.11857926014649652, + "grad_norm": 1.0532332905680384, + "learning_rate": 4.9360298420910335e-06, + "loss": 0.1647, + "step": 1287 + }, + { + "epoch": 0.11867139632376653, + "grad_norm": 1.069006714331314, + "learning_rate": 4.935858503357335e-06, + "loss": 0.1872, + "step": 1288 + }, + { + "epoch": 0.11876353250103654, + "grad_norm": 1.0146730910464414, + "learning_rate": 4.935686938453912e-06, + "loss": 0.19, + "step": 1289 + }, + { + "epoch": 0.11885566867830653, + "grad_norm": 1.1013136967926964, + "learning_rate": 4.935515147396695e-06, + "loss": 0.1942, + "step": 1290 + }, + { + "epoch": 0.11894780485557654, + "grad_norm": 1.022490231526319, + "learning_rate": 4.935343130201633e-06, + "loss": 0.1719, + "step": 1291 + }, + { + "epoch": 0.11903994103284654, + "grad_norm": 1.0736875280791962, + "learning_rate": 4.935170886884701e-06, + "loss": 0.1981, + "step": 1292 + }, + { + "epoch": 0.11913207721011655, + "grad_norm": 1.0171104588512216, + "learning_rate": 4.934998417461888e-06, + "loss": 0.1832, + "step": 1293 + }, + { + "epoch": 0.11922421338738656, + "grad_norm": 0.984348887268898, + "learning_rate": 4.9348257219492116e-06, + "loss": 0.1683, + "step": 1294 + }, + { + "epoch": 0.11931634956465656, + "grad_norm": 1.0769453299087037, + "learning_rate": 4.934652800362704e-06, + "loss": 0.2061, + "step": 1295 + }, + { + "epoch": 0.11940848574192657, + "grad_norm": 1.0409966331483476, + "learning_rate": 4.934479652718422e-06, + "loss": 0.1865, + "step": 1296 + }, + { + "epoch": 0.11950062191919657, + "grad_norm": 1.046150491665991, + "learning_rate": 4.934306279032442e-06, + "loss": 0.1836, + "step": 1297 + }, + { + "epoch": 0.11959275809646658, + "grad_norm": 1.0300935796451935, + "learning_rate": 4.934132679320863e-06, + "loss": 0.1818, + "step": 1298 + }, + { + "epoch": 0.11968489427373659, + "grad_norm": 1.0223005476376819, + "learning_rate": 4.933958853599803e-06, + "loss": 0.2019, + "step": 1299 + }, + { + "epoch": 0.11977703045100659, + "grad_norm": 0.9953052620305401, + "learning_rate": 4.9337848018854005e-06, + "loss": 0.191, + "step": 1300 + }, + { + "epoch": 0.1198691666282766, + "grad_norm": 0.9167793138450512, + "learning_rate": 4.933610524193817e-06, + "loss": 0.1575, + "step": 1301 + }, + { + "epoch": 0.1199613028055466, + "grad_norm": 0.9501468070448875, + "learning_rate": 4.933436020541235e-06, + "loss": 0.1935, + "step": 1302 + }, + { + "epoch": 0.1200534389828166, + "grad_norm": 0.9908154738642798, + "learning_rate": 4.933261290943856e-06, + "loss": 0.1634, + "step": 1303 + }, + { + "epoch": 0.1201455751600866, + "grad_norm": 1.0252708980694958, + "learning_rate": 4.933086335417905e-06, + "loss": 0.1912, + "step": 1304 + }, + { + "epoch": 0.12023771133735661, + "grad_norm": 0.964216175527814, + "learning_rate": 4.932911153979626e-06, + "loss": 0.1788, + "step": 1305 + }, + { + "epoch": 0.12032984751462662, + "grad_norm": 0.9947350149300663, + "learning_rate": 4.932735746645284e-06, + "loss": 0.1872, + "step": 1306 + }, + { + "epoch": 0.12042198369189662, + "grad_norm": 1.0222442690970204, + "learning_rate": 4.9325601134311665e-06, + "loss": 0.1716, + "step": 1307 + }, + { + "epoch": 0.12051411986916663, + "grad_norm": 0.9714762563929501, + "learning_rate": 4.932384254353581e-06, + "loss": 0.2076, + "step": 1308 + }, + { + "epoch": 0.12060625604643663, + "grad_norm": 1.0023546871765083, + "learning_rate": 4.932208169428855e-06, + "loss": 0.1822, + "step": 1309 + }, + { + "epoch": 0.12069839222370664, + "grad_norm": 0.9454582559482887, + "learning_rate": 4.932031858673338e-06, + "loss": 0.1814, + "step": 1310 + }, + { + "epoch": 0.12079052840097665, + "grad_norm": 0.9685169957367546, + "learning_rate": 4.931855322103403e-06, + "loss": 0.1932, + "step": 1311 + }, + { + "epoch": 0.12088266457824665, + "grad_norm": 0.9805185074459721, + "learning_rate": 4.9316785597354385e-06, + "loss": 0.1805, + "step": 1312 + }, + { + "epoch": 0.12097480075551666, + "grad_norm": 0.9804392769335055, + "learning_rate": 4.931501571585858e-06, + "loss": 0.1845, + "step": 1313 + }, + { + "epoch": 0.12106693693278665, + "grad_norm": 1.00322430997519, + "learning_rate": 4.931324357671095e-06, + "loss": 0.1851, + "step": 1314 + }, + { + "epoch": 0.12115907311005666, + "grad_norm": 0.9328218448764897, + "learning_rate": 4.931146918007604e-06, + "loss": 0.1836, + "step": 1315 + }, + { + "epoch": 0.12125120928732668, + "grad_norm": 0.9179723457446634, + "learning_rate": 4.93096925261186e-06, + "loss": 0.1784, + "step": 1316 + }, + { + "epoch": 0.12134334546459667, + "grad_norm": 1.0730735318120903, + "learning_rate": 4.930791361500359e-06, + "loss": 0.1995, + "step": 1317 + }, + { + "epoch": 0.12143548164186668, + "grad_norm": 1.0073445260759302, + "learning_rate": 4.930613244689618e-06, + "loss": 0.1876, + "step": 1318 + }, + { + "epoch": 0.12152761781913668, + "grad_norm": 0.9666047869313055, + "learning_rate": 4.930434902196177e-06, + "loss": 0.1844, + "step": 1319 + }, + { + "epoch": 0.12161975399640669, + "grad_norm": 0.9566473238654624, + "learning_rate": 4.930256334036593e-06, + "loss": 0.1834, + "step": 1320 + }, + { + "epoch": 0.12171189017367669, + "grad_norm": 1.0697107399386463, + "learning_rate": 4.930077540227447e-06, + "loss": 0.1792, + "step": 1321 + }, + { + "epoch": 0.1218040263509467, + "grad_norm": 0.9209255293099364, + "learning_rate": 4.92989852078534e-06, + "loss": 0.1636, + "step": 1322 + }, + { + "epoch": 0.12189616252821671, + "grad_norm": 1.1125165716627952, + "learning_rate": 4.929719275726893e-06, + "loss": 0.1838, + "step": 1323 + }, + { + "epoch": 0.1219882987054867, + "grad_norm": 0.9940504687221988, + "learning_rate": 4.9295398050687505e-06, + "loss": 0.1737, + "step": 1324 + }, + { + "epoch": 0.12208043488275672, + "grad_norm": 1.0103196274970314, + "learning_rate": 4.929360108827575e-06, + "loss": 0.1867, + "step": 1325 + }, + { + "epoch": 0.12217257106002671, + "grad_norm": 1.0723418698938951, + "learning_rate": 4.929180187020053e-06, + "loss": 0.1873, + "step": 1326 + }, + { + "epoch": 0.12226470723729672, + "grad_norm": 0.9596676997934364, + "learning_rate": 4.9290000396628875e-06, + "loss": 0.1845, + "step": 1327 + }, + { + "epoch": 0.12235684341456673, + "grad_norm": 0.9962848010523647, + "learning_rate": 4.928819666772808e-06, + "loss": 0.1789, + "step": 1328 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 1.0363419257999569, + "learning_rate": 4.9286390683665615e-06, + "loss": 0.1886, + "step": 1329 + }, + { + "epoch": 0.12254111576910674, + "grad_norm": 0.9278551228569101, + "learning_rate": 4.9284582444609156e-06, + "loss": 0.1816, + "step": 1330 + }, + { + "epoch": 0.12263325194637674, + "grad_norm": 1.0265744616787111, + "learning_rate": 4.9282771950726605e-06, + "loss": 0.1864, + "step": 1331 + }, + { + "epoch": 0.12272538812364675, + "grad_norm": 0.9152490785424434, + "learning_rate": 4.928095920218606e-06, + "loss": 0.1797, + "step": 1332 + }, + { + "epoch": 0.12281752430091676, + "grad_norm": 0.8831438931297253, + "learning_rate": 4.927914419915585e-06, + "loss": 0.1858, + "step": 1333 + }, + { + "epoch": 0.12290966047818676, + "grad_norm": 0.9599196224749852, + "learning_rate": 4.927732694180448e-06, + "loss": 0.1894, + "step": 1334 + }, + { + "epoch": 0.12300179665545677, + "grad_norm": 0.9345601055819366, + "learning_rate": 4.9275507430300694e-06, + "loss": 0.19, + "step": 1335 + }, + { + "epoch": 0.12309393283272677, + "grad_norm": 0.9531446635108866, + "learning_rate": 4.927368566481343e-06, + "loss": 0.1768, + "step": 1336 + }, + { + "epoch": 0.12318606900999678, + "grad_norm": 0.8954454298056077, + "learning_rate": 4.927186164551184e-06, + "loss": 0.1661, + "step": 1337 + }, + { + "epoch": 0.12327820518726679, + "grad_norm": 1.0406211772874898, + "learning_rate": 4.927003537256528e-06, + "loss": 0.1896, + "step": 1338 + }, + { + "epoch": 0.12337034136453678, + "grad_norm": 0.9427072884318984, + "learning_rate": 4.926820684614333e-06, + "loss": 0.1783, + "step": 1339 + }, + { + "epoch": 0.1234624775418068, + "grad_norm": 1.063398218693465, + "learning_rate": 4.9266376066415764e-06, + "loss": 0.2, + "step": 1340 + }, + { + "epoch": 0.12355461371907679, + "grad_norm": 1.014467337241986, + "learning_rate": 4.926454303355256e-06, + "loss": 0.1778, + "step": 1341 + }, + { + "epoch": 0.1236467498963468, + "grad_norm": 0.9867211771794864, + "learning_rate": 4.926270774772392e-06, + "loss": 0.1868, + "step": 1342 + }, + { + "epoch": 0.1237388860736168, + "grad_norm": 0.9314537671161427, + "learning_rate": 4.926087020910027e-06, + "loss": 0.1689, + "step": 1343 + }, + { + "epoch": 0.12383102225088681, + "grad_norm": 0.9945656614235373, + "learning_rate": 4.925903041785221e-06, + "loss": 0.1915, + "step": 1344 + }, + { + "epoch": 0.12392315842815682, + "grad_norm": 1.054070147484718, + "learning_rate": 4.925718837415055e-06, + "loss": 0.1874, + "step": 1345 + }, + { + "epoch": 0.12401529460542682, + "grad_norm": 1.003037762311901, + "learning_rate": 4.925534407816634e-06, + "loss": 0.1769, + "step": 1346 + }, + { + "epoch": 0.12410743078269683, + "grad_norm": 0.9398314757426698, + "learning_rate": 4.925349753007083e-06, + "loss": 0.1737, + "step": 1347 + }, + { + "epoch": 0.12419956695996683, + "grad_norm": 1.0109341517521457, + "learning_rate": 4.925164873003546e-06, + "loss": 0.184, + "step": 1348 + }, + { + "epoch": 0.12429170313723684, + "grad_norm": 1.045989030595084, + "learning_rate": 4.92497976782319e-06, + "loss": 0.1924, + "step": 1349 + }, + { + "epoch": 0.12438383931450685, + "grad_norm": 0.9613180803344651, + "learning_rate": 4.924794437483202e-06, + "loss": 0.1723, + "step": 1350 + }, + { + "epoch": 0.12447597549177684, + "grad_norm": 1.0326785619529568, + "learning_rate": 4.924608882000789e-06, + "loss": 0.1987, + "step": 1351 + }, + { + "epoch": 0.12456811166904685, + "grad_norm": 0.9158713928345703, + "learning_rate": 4.92442310139318e-06, + "loss": 0.1765, + "step": 1352 + }, + { + "epoch": 0.12466024784631685, + "grad_norm": 0.9681906696968335, + "learning_rate": 4.924237095677625e-06, + "loss": 0.1783, + "step": 1353 + }, + { + "epoch": 0.12475238402358686, + "grad_norm": 1.0316433171773627, + "learning_rate": 4.924050864871396e-06, + "loss": 0.191, + "step": 1354 + }, + { + "epoch": 0.12484452020085687, + "grad_norm": 1.0732753608787855, + "learning_rate": 4.923864408991782e-06, + "loss": 0.1873, + "step": 1355 + }, + { + "epoch": 0.12493665637812687, + "grad_norm": 1.0115088181506764, + "learning_rate": 4.923677728056098e-06, + "loss": 0.184, + "step": 1356 + }, + { + "epoch": 0.12502879255539687, + "grad_norm": 0.9831921194489737, + "learning_rate": 4.923490822081675e-06, + "loss": 0.1831, + "step": 1357 + }, + { + "epoch": 0.1251209287326669, + "grad_norm": 0.9368075813037634, + "learning_rate": 4.923303691085869e-06, + "loss": 0.1716, + "step": 1358 + }, + { + "epoch": 0.1252130649099369, + "grad_norm": 0.9158976927466699, + "learning_rate": 4.9231163350860535e-06, + "loss": 0.1734, + "step": 1359 + }, + { + "epoch": 0.12530520108720689, + "grad_norm": 0.9316616396852874, + "learning_rate": 4.922928754099626e-06, + "loss": 0.1706, + "step": 1360 + }, + { + "epoch": 0.1253973372644769, + "grad_norm": 1.1250088621018242, + "learning_rate": 4.9227409481440034e-06, + "loss": 0.2038, + "step": 1361 + }, + { + "epoch": 0.1254894734417469, + "grad_norm": 0.9675671851618894, + "learning_rate": 4.922552917236622e-06, + "loss": 0.1746, + "step": 1362 + }, + { + "epoch": 0.1255816096190169, + "grad_norm": 0.9821306778029524, + "learning_rate": 4.922364661394943e-06, + "loss": 0.1879, + "step": 1363 + }, + { + "epoch": 0.1256737457962869, + "grad_norm": 0.896728621906593, + "learning_rate": 4.922176180636443e-06, + "loss": 0.1632, + "step": 1364 + }, + { + "epoch": 0.12576588197355693, + "grad_norm": 0.9697621684195716, + "learning_rate": 4.921987474978626e-06, + "loss": 0.1888, + "step": 1365 + }, + { + "epoch": 0.12585801815082692, + "grad_norm": 0.9522967075080769, + "learning_rate": 4.921798544439009e-06, + "loss": 0.1875, + "step": 1366 + }, + { + "epoch": 0.12595015432809692, + "grad_norm": 1.051166642356011, + "learning_rate": 4.921609389035138e-06, + "loss": 0.1997, + "step": 1367 + }, + { + "epoch": 0.12604229050536694, + "grad_norm": 1.0744627934977156, + "learning_rate": 4.921420008784573e-06, + "loss": 0.1905, + "step": 1368 + }, + { + "epoch": 0.12613442668263694, + "grad_norm": 0.9960260593456565, + "learning_rate": 4.9212304037049015e-06, + "loss": 0.1829, + "step": 1369 + }, + { + "epoch": 0.12622656285990694, + "grad_norm": 1.0202692128949833, + "learning_rate": 4.921040573813726e-06, + "loss": 0.1732, + "step": 1370 + }, + { + "epoch": 0.12631869903717693, + "grad_norm": 1.0365842879051643, + "learning_rate": 4.9208505191286714e-06, + "loss": 0.1855, + "step": 1371 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 1.1591917314275566, + "learning_rate": 4.920660239667387e-06, + "loss": 0.209, + "step": 1372 + }, + { + "epoch": 0.12650297139171696, + "grad_norm": 0.9993800116845046, + "learning_rate": 4.920469735447538e-06, + "loss": 0.1883, + "step": 1373 + }, + { + "epoch": 0.12659510756898695, + "grad_norm": 0.9608088730057035, + "learning_rate": 4.920279006486815e-06, + "loss": 0.1812, + "step": 1374 + }, + { + "epoch": 0.12668724374625698, + "grad_norm": 0.9592626621524494, + "learning_rate": 4.920088052802924e-06, + "loss": 0.1908, + "step": 1375 + }, + { + "epoch": 0.12677937992352697, + "grad_norm": 0.9979910451476817, + "learning_rate": 4.919896874413597e-06, + "loss": 0.1756, + "step": 1376 + }, + { + "epoch": 0.12687151610079697, + "grad_norm": 0.9705279536103502, + "learning_rate": 4.919705471336585e-06, + "loss": 0.1696, + "step": 1377 + }, + { + "epoch": 0.126963652278067, + "grad_norm": 1.02529637426661, + "learning_rate": 4.919513843589661e-06, + "loss": 0.1979, + "step": 1378 + }, + { + "epoch": 0.127055788455337, + "grad_norm": 0.9709462058488644, + "learning_rate": 4.919321991190614e-06, + "loss": 0.1803, + "step": 1379 + }, + { + "epoch": 0.127147924632607, + "grad_norm": 1.072594825084783, + "learning_rate": 4.919129914157261e-06, + "loss": 0.1933, + "step": 1380 + }, + { + "epoch": 0.127240060809877, + "grad_norm": 1.1100361520927327, + "learning_rate": 4.918937612507435e-06, + "loss": 0.1769, + "step": 1381 + }, + { + "epoch": 0.127332196987147, + "grad_norm": 0.9545515898210492, + "learning_rate": 4.918745086258992e-06, + "loss": 0.1717, + "step": 1382 + }, + { + "epoch": 0.127424333164417, + "grad_norm": 0.9027879802968097, + "learning_rate": 4.918552335429806e-06, + "loss": 0.1644, + "step": 1383 + }, + { + "epoch": 0.127516469341687, + "grad_norm": 0.9607434821952252, + "learning_rate": 4.918359360037776e-06, + "loss": 0.1759, + "step": 1384 + }, + { + "epoch": 0.12760860551895703, + "grad_norm": 1.0212850726584626, + "learning_rate": 4.918166160100819e-06, + "loss": 0.1868, + "step": 1385 + }, + { + "epoch": 0.12770074169622703, + "grad_norm": 0.9851648999704009, + "learning_rate": 4.917972735636875e-06, + "loss": 0.1792, + "step": 1386 + }, + { + "epoch": 0.12779287787349702, + "grad_norm": 0.9860306797823638, + "learning_rate": 4.9177790866639005e-06, + "loss": 0.1576, + "step": 1387 + }, + { + "epoch": 0.12788501405076702, + "grad_norm": 0.9616676315179897, + "learning_rate": 4.917585213199878e-06, + "loss": 0.1748, + "step": 1388 + }, + { + "epoch": 0.12797715022803705, + "grad_norm": 1.0519830739673464, + "learning_rate": 4.9173911152628095e-06, + "loss": 0.202, + "step": 1389 + }, + { + "epoch": 0.12806928640530704, + "grad_norm": 1.0777062892635907, + "learning_rate": 4.917196792870715e-06, + "loss": 0.1892, + "step": 1390 + }, + { + "epoch": 0.12816142258257704, + "grad_norm": 0.9991283645825584, + "learning_rate": 4.917002246041638e-06, + "loss": 0.2017, + "step": 1391 + }, + { + "epoch": 0.12825355875984706, + "grad_norm": 1.0345531811177093, + "learning_rate": 4.916807474793643e-06, + "loss": 0.2083, + "step": 1392 + }, + { + "epoch": 0.12834569493711706, + "grad_norm": 0.9962307459016241, + "learning_rate": 4.916612479144812e-06, + "loss": 0.1898, + "step": 1393 + }, + { + "epoch": 0.12843783111438706, + "grad_norm": 0.9750941523903573, + "learning_rate": 4.916417259113254e-06, + "loss": 0.186, + "step": 1394 + }, + { + "epoch": 0.12852996729165708, + "grad_norm": 1.0734279770420394, + "learning_rate": 4.916221814717092e-06, + "loss": 0.1941, + "step": 1395 + }, + { + "epoch": 0.12862210346892708, + "grad_norm": 1.1087277465598613, + "learning_rate": 4.916026145974476e-06, + "loss": 0.1863, + "step": 1396 + }, + { + "epoch": 0.12871423964619708, + "grad_norm": 0.970503983339328, + "learning_rate": 4.915830252903572e-06, + "loss": 0.1809, + "step": 1397 + }, + { + "epoch": 0.12880637582346707, + "grad_norm": 1.0270549073779653, + "learning_rate": 4.915634135522569e-06, + "loss": 0.1737, + "step": 1398 + }, + { + "epoch": 0.1288985120007371, + "grad_norm": 1.017718130375212, + "learning_rate": 4.915437793849676e-06, + "loss": 0.1936, + "step": 1399 + }, + { + "epoch": 0.1289906481780071, + "grad_norm": 0.9258354769873366, + "learning_rate": 4.915241227903125e-06, + "loss": 0.176, + "step": 1400 + }, + { + "epoch": 0.1290827843552771, + "grad_norm": 1.1632022523711172, + "learning_rate": 4.915044437701165e-06, + "loss": 0.193, + "step": 1401 + }, + { + "epoch": 0.12917492053254712, + "grad_norm": 0.9909882624273187, + "learning_rate": 4.914847423262069e-06, + "loss": 0.1772, + "step": 1402 + }, + { + "epoch": 0.1292670567098171, + "grad_norm": 1.0227967032568668, + "learning_rate": 4.9146501846041304e-06, + "loss": 0.1813, + "step": 1403 + }, + { + "epoch": 0.1293591928870871, + "grad_norm": 1.023350428596177, + "learning_rate": 4.914452721745662e-06, + "loss": 0.1751, + "step": 1404 + }, + { + "epoch": 0.1294513290643571, + "grad_norm": 1.0953277903910312, + "learning_rate": 4.914255034704998e-06, + "loss": 0.2063, + "step": 1405 + }, + { + "epoch": 0.12954346524162713, + "grad_norm": 1.0076641007495757, + "learning_rate": 4.914057123500495e-06, + "loss": 0.1864, + "step": 1406 + }, + { + "epoch": 0.12963560141889713, + "grad_norm": 1.0569146161299305, + "learning_rate": 4.913858988150528e-06, + "loss": 0.1817, + "step": 1407 + }, + { + "epoch": 0.12972773759616713, + "grad_norm": 1.0438485163527682, + "learning_rate": 4.9136606286734945e-06, + "loss": 0.1638, + "step": 1408 + }, + { + "epoch": 0.12981987377343715, + "grad_norm": 0.9678728108018867, + "learning_rate": 4.913462045087811e-06, + "loss": 0.1735, + "step": 1409 + }, + { + "epoch": 0.12991200995070715, + "grad_norm": 1.0572497449704699, + "learning_rate": 4.9132632374119185e-06, + "loss": 0.1859, + "step": 1410 + }, + { + "epoch": 0.13000414612797714, + "grad_norm": 1.1213963454815326, + "learning_rate": 4.913064205664273e-06, + "loss": 0.18, + "step": 1411 + }, + { + "epoch": 0.13009628230524717, + "grad_norm": 1.0531509919261324, + "learning_rate": 4.912864949863358e-06, + "loss": 0.1701, + "step": 1412 + }, + { + "epoch": 0.13018841848251717, + "grad_norm": 1.0597007287157376, + "learning_rate": 4.912665470027671e-06, + "loss": 0.1975, + "step": 1413 + }, + { + "epoch": 0.13028055465978716, + "grad_norm": 1.0209805064275213, + "learning_rate": 4.912465766175736e-06, + "loss": 0.1686, + "step": 1414 + }, + { + "epoch": 0.13037269083705716, + "grad_norm": 0.9354404787118893, + "learning_rate": 4.912265838326095e-06, + "loss": 0.1649, + "step": 1415 + }, + { + "epoch": 0.13046482701432718, + "grad_norm": 0.9685000508425466, + "learning_rate": 4.912065686497312e-06, + "loss": 0.1792, + "step": 1416 + }, + { + "epoch": 0.13055696319159718, + "grad_norm": 0.9793173080999195, + "learning_rate": 4.91186531070797e-06, + "loss": 0.1733, + "step": 1417 + }, + { + "epoch": 0.13064909936886718, + "grad_norm": 1.0267034852588974, + "learning_rate": 4.911664710976674e-06, + "loss": 0.1911, + "step": 1418 + }, + { + "epoch": 0.1307412355461372, + "grad_norm": 1.0679568316502386, + "learning_rate": 4.91146388732205e-06, + "loss": 0.1959, + "step": 1419 + }, + { + "epoch": 0.1308333717234072, + "grad_norm": 1.043916463767155, + "learning_rate": 4.911262839762745e-06, + "loss": 0.1844, + "step": 1420 + }, + { + "epoch": 0.1309255079006772, + "grad_norm": 1.1426541467661104, + "learning_rate": 4.911061568317425e-06, + "loss": 0.1839, + "step": 1421 + }, + { + "epoch": 0.1310176440779472, + "grad_norm": 1.0670712794431116, + "learning_rate": 4.910860073004779e-06, + "loss": 0.1893, + "step": 1422 + }, + { + "epoch": 0.13110978025521722, + "grad_norm": 1.0217657650798537, + "learning_rate": 4.910658353843517e-06, + "loss": 0.1757, + "step": 1423 + }, + { + "epoch": 0.13120191643248721, + "grad_norm": 1.042824076262155, + "learning_rate": 4.910456410852367e-06, + "loss": 0.163, + "step": 1424 + }, + { + "epoch": 0.1312940526097572, + "grad_norm": 0.956885833442343, + "learning_rate": 4.91025424405008e-06, + "loss": 0.1723, + "step": 1425 + }, + { + "epoch": 0.13138618878702724, + "grad_norm": 1.1449966036167125, + "learning_rate": 4.910051853455426e-06, + "loss": 0.1911, + "step": 1426 + }, + { + "epoch": 0.13147832496429723, + "grad_norm": 1.036287110180777, + "learning_rate": 4.909849239087199e-06, + "loss": 0.1988, + "step": 1427 + }, + { + "epoch": 0.13157046114156723, + "grad_norm": 0.9948309049567811, + "learning_rate": 4.90964640096421e-06, + "loss": 0.1951, + "step": 1428 + }, + { + "epoch": 0.13166259731883725, + "grad_norm": 1.0844421392628185, + "learning_rate": 4.9094433391052935e-06, + "loss": 0.1812, + "step": 1429 + }, + { + "epoch": 0.13175473349610725, + "grad_norm": 1.0495914214075481, + "learning_rate": 4.909240053529304e-06, + "loss": 0.1985, + "step": 1430 + }, + { + "epoch": 0.13184686967337725, + "grad_norm": 0.9651180837150052, + "learning_rate": 4.909036544255116e-06, + "loss": 0.1702, + "step": 1431 + }, + { + "epoch": 0.13193900585064725, + "grad_norm": 0.9955966957325579, + "learning_rate": 4.908832811301626e-06, + "loss": 0.1781, + "step": 1432 + }, + { + "epoch": 0.13203114202791727, + "grad_norm": 1.0035279832126653, + "learning_rate": 4.90862885468775e-06, + "loss": 0.1743, + "step": 1433 + }, + { + "epoch": 0.13212327820518727, + "grad_norm": 1.0358847851511948, + "learning_rate": 4.908424674432425e-06, + "loss": 0.1895, + "step": 1434 + }, + { + "epoch": 0.13221541438245726, + "grad_norm": 0.9875222025539104, + "learning_rate": 4.908220270554611e-06, + "loss": 0.1809, + "step": 1435 + }, + { + "epoch": 0.1323075505597273, + "grad_norm": 1.0278508432913862, + "learning_rate": 4.908015643073285e-06, + "loss": 0.1833, + "step": 1436 + }, + { + "epoch": 0.13239968673699729, + "grad_norm": 1.0273581077755105, + "learning_rate": 4.907810792007447e-06, + "loss": 0.1984, + "step": 1437 + }, + { + "epoch": 0.13249182291426728, + "grad_norm": 1.0560009720633898, + "learning_rate": 4.907605717376118e-06, + "loss": 0.1864, + "step": 1438 + }, + { + "epoch": 0.13258395909153728, + "grad_norm": 0.9626044030041234, + "learning_rate": 4.90740041919834e-06, + "loss": 0.1758, + "step": 1439 + }, + { + "epoch": 0.1326760952688073, + "grad_norm": 0.9672689247849761, + "learning_rate": 4.907194897493173e-06, + "loss": 0.1771, + "step": 1440 + }, + { + "epoch": 0.1327682314460773, + "grad_norm": 1.0450504004609606, + "learning_rate": 4.906989152279701e-06, + "loss": 0.1905, + "step": 1441 + }, + { + "epoch": 0.1328603676233473, + "grad_norm": 0.9482587626895994, + "learning_rate": 4.9067831835770275e-06, + "loss": 0.1738, + "step": 1442 + }, + { + "epoch": 0.13295250380061732, + "grad_norm": 0.9463239494076845, + "learning_rate": 4.906576991404276e-06, + "loss": 0.1854, + "step": 1443 + }, + { + "epoch": 0.13304463997788732, + "grad_norm": 1.0061874594493472, + "learning_rate": 4.9063705757805915e-06, + "loss": 0.1985, + "step": 1444 + }, + { + "epoch": 0.13313677615515732, + "grad_norm": 0.9068357394587006, + "learning_rate": 4.906163936725141e-06, + "loss": 0.1595, + "step": 1445 + }, + { + "epoch": 0.13322891233242734, + "grad_norm": 1.011124096617946, + "learning_rate": 4.905957074257109e-06, + "loss": 0.1716, + "step": 1446 + }, + { + "epoch": 0.13332104850969734, + "grad_norm": 0.9635712912469494, + "learning_rate": 4.905749988395704e-06, + "loss": 0.1686, + "step": 1447 + }, + { + "epoch": 0.13341318468696733, + "grad_norm": 0.9726084759736766, + "learning_rate": 4.905542679160155e-06, + "loss": 0.1682, + "step": 1448 + }, + { + "epoch": 0.13350532086423733, + "grad_norm": 0.9982505363951202, + "learning_rate": 4.905335146569707e-06, + "loss": 0.1972, + "step": 1449 + }, + { + "epoch": 0.13359745704150736, + "grad_norm": 1.0309683229900235, + "learning_rate": 4.9051273906436335e-06, + "loss": 0.1786, + "step": 1450 + }, + { + "epoch": 0.13368959321877735, + "grad_norm": 1.014497711913621, + "learning_rate": 4.904919411401222e-06, + "loss": 0.1753, + "step": 1451 + }, + { + "epoch": 0.13378172939604735, + "grad_norm": 0.9494034749604296, + "learning_rate": 4.9047112088617855e-06, + "loss": 0.1685, + "step": 1452 + }, + { + "epoch": 0.13387386557331737, + "grad_norm": 0.9533159259651824, + "learning_rate": 4.904502783044654e-06, + "loss": 0.183, + "step": 1453 + }, + { + "epoch": 0.13396600175058737, + "grad_norm": 1.0198352992862363, + "learning_rate": 4.90429413396918e-06, + "loss": 0.1997, + "step": 1454 + }, + { + "epoch": 0.13405813792785737, + "grad_norm": 1.028575773022091, + "learning_rate": 4.904085261654736e-06, + "loss": 0.1874, + "step": 1455 + }, + { + "epoch": 0.13415027410512737, + "grad_norm": 0.9643745703997726, + "learning_rate": 4.903876166120718e-06, + "loss": 0.186, + "step": 1456 + }, + { + "epoch": 0.1342424102823974, + "grad_norm": 0.9918455781983645, + "learning_rate": 4.903666847386539e-06, + "loss": 0.1926, + "step": 1457 + }, + { + "epoch": 0.1343345464596674, + "grad_norm": 1.0310036777983294, + "learning_rate": 4.903457305471635e-06, + "loss": 0.1888, + "step": 1458 + }, + { + "epoch": 0.13442668263693738, + "grad_norm": 0.9804790007901808, + "learning_rate": 4.90324754039546e-06, + "loss": 0.1899, + "step": 1459 + }, + { + "epoch": 0.1345188188142074, + "grad_norm": 0.873722915764326, + "learning_rate": 4.903037552177494e-06, + "loss": 0.1743, + "step": 1460 + }, + { + "epoch": 0.1346109549914774, + "grad_norm": 0.9519299025740255, + "learning_rate": 4.9028273408372315e-06, + "loss": 0.1809, + "step": 1461 + }, + { + "epoch": 0.1347030911687474, + "grad_norm": 0.9612320511182931, + "learning_rate": 4.902616906394193e-06, + "loss": 0.1636, + "step": 1462 + }, + { + "epoch": 0.13479522734601743, + "grad_norm": 0.9937787642498676, + "learning_rate": 4.9024062488679145e-06, + "loss": 0.1743, + "step": 1463 + }, + { + "epoch": 0.13488736352328742, + "grad_norm": 0.8938554298239793, + "learning_rate": 4.9021953682779585e-06, + "loss": 0.1561, + "step": 1464 + }, + { + "epoch": 0.13497949970055742, + "grad_norm": 1.0398410959428166, + "learning_rate": 4.901984264643904e-06, + "loss": 0.1925, + "step": 1465 + }, + { + "epoch": 0.13507163587782742, + "grad_norm": 1.0310209853854573, + "learning_rate": 4.9017729379853515e-06, + "loss": 0.1992, + "step": 1466 + }, + { + "epoch": 0.13516377205509744, + "grad_norm": 1.0207343294570042, + "learning_rate": 4.901561388321923e-06, + "loss": 0.2045, + "step": 1467 + }, + { + "epoch": 0.13525590823236744, + "grad_norm": 0.87864572441413, + "learning_rate": 4.901349615673262e-06, + "loss": 0.1572, + "step": 1468 + }, + { + "epoch": 0.13534804440963744, + "grad_norm": 0.9196847849987159, + "learning_rate": 4.90113762005903e-06, + "loss": 0.1802, + "step": 1469 + }, + { + "epoch": 0.13544018058690746, + "grad_norm": 0.9578454364771498, + "learning_rate": 4.900925401498912e-06, + "loss": 0.1858, + "step": 1470 + }, + { + "epoch": 0.13553231676417746, + "grad_norm": 0.9488263656936775, + "learning_rate": 4.900712960012612e-06, + "loss": 0.1801, + "step": 1471 + }, + { + "epoch": 0.13562445294144745, + "grad_norm": 0.9320142456071285, + "learning_rate": 4.900500295619855e-06, + "loss": 0.1808, + "step": 1472 + }, + { + "epoch": 0.13571658911871745, + "grad_norm": 0.8749696112915272, + "learning_rate": 4.900287408340387e-06, + "loss": 0.1707, + "step": 1473 + }, + { + "epoch": 0.13580872529598748, + "grad_norm": 0.9555507551898182, + "learning_rate": 4.900074298193976e-06, + "loss": 0.1826, + "step": 1474 + }, + { + "epoch": 0.13590086147325747, + "grad_norm": 1.0200858753072042, + "learning_rate": 4.899860965200407e-06, + "loss": 0.1936, + "step": 1475 + }, + { + "epoch": 0.13599299765052747, + "grad_norm": 1.046105205148264, + "learning_rate": 4.89964740937949e-06, + "loss": 0.1949, + "step": 1476 + }, + { + "epoch": 0.1360851338277975, + "grad_norm": 0.9703684004608017, + "learning_rate": 4.899433630751052e-06, + "loss": 0.1812, + "step": 1477 + }, + { + "epoch": 0.1361772700050675, + "grad_norm": 0.9358632265946936, + "learning_rate": 4.8992196293349435e-06, + "loss": 0.1692, + "step": 1478 + }, + { + "epoch": 0.1362694061823375, + "grad_norm": 0.9421548138334379, + "learning_rate": 4.899005405151034e-06, + "loss": 0.173, + "step": 1479 + }, + { + "epoch": 0.1363615423596075, + "grad_norm": 0.9874060354260804, + "learning_rate": 4.898790958219215e-06, + "loss": 0.1917, + "step": 1480 + }, + { + "epoch": 0.1364536785368775, + "grad_norm": 1.0241768866081442, + "learning_rate": 4.898576288559396e-06, + "loss": 0.194, + "step": 1481 + }, + { + "epoch": 0.1365458147141475, + "grad_norm": 0.967676077385443, + "learning_rate": 4.898361396191512e-06, + "loss": 0.1869, + "step": 1482 + }, + { + "epoch": 0.1366379508914175, + "grad_norm": 1.063157649624893, + "learning_rate": 4.898146281135514e-06, + "loss": 0.1878, + "step": 1483 + }, + { + "epoch": 0.13673008706868753, + "grad_norm": 1.0213338083271282, + "learning_rate": 4.8979309434113745e-06, + "loss": 0.171, + "step": 1484 + }, + { + "epoch": 0.13682222324595752, + "grad_norm": 0.9752201110126685, + "learning_rate": 4.89771538303909e-06, + "loss": 0.176, + "step": 1485 + }, + { + "epoch": 0.13691435942322752, + "grad_norm": 0.9992722204337339, + "learning_rate": 4.897499600038673e-06, + "loss": 0.1763, + "step": 1486 + }, + { + "epoch": 0.13700649560049755, + "grad_norm": 0.9673495125040701, + "learning_rate": 4.8972835944301615e-06, + "loss": 0.181, + "step": 1487 + }, + { + "epoch": 0.13709863177776754, + "grad_norm": 0.9172227987836397, + "learning_rate": 4.89706736623361e-06, + "loss": 0.1687, + "step": 1488 + }, + { + "epoch": 0.13719076795503754, + "grad_norm": 0.9179738386385932, + "learning_rate": 4.896850915469095e-06, + "loss": 0.1708, + "step": 1489 + }, + { + "epoch": 0.13728290413230754, + "grad_norm": 1.0866865421153868, + "learning_rate": 4.896634242156715e-06, + "loss": 0.1981, + "step": 1490 + }, + { + "epoch": 0.13737504030957756, + "grad_norm": 1.1181657985966733, + "learning_rate": 4.896417346316587e-06, + "loss": 0.2006, + "step": 1491 + }, + { + "epoch": 0.13746717648684756, + "grad_norm": 0.958948999492823, + "learning_rate": 4.8962002279688514e-06, + "loss": 0.176, + "step": 1492 + }, + { + "epoch": 0.13755931266411756, + "grad_norm": 1.0453082102751459, + "learning_rate": 4.8959828871336665e-06, + "loss": 0.1824, + "step": 1493 + }, + { + "epoch": 0.13765144884138758, + "grad_norm": 1.0704601468240302, + "learning_rate": 4.895765323831212e-06, + "loss": 0.1798, + "step": 1494 + }, + { + "epoch": 0.13774358501865758, + "grad_norm": 1.046104125736917, + "learning_rate": 4.895547538081691e-06, + "loss": 0.1865, + "step": 1495 + }, + { + "epoch": 0.13783572119592757, + "grad_norm": 0.9953218833475491, + "learning_rate": 4.895329529905322e-06, + "loss": 0.1798, + "step": 1496 + }, + { + "epoch": 0.1379278573731976, + "grad_norm": 0.9993516892546119, + "learning_rate": 4.895111299322348e-06, + "loss": 0.1779, + "step": 1497 + }, + { + "epoch": 0.1380199935504676, + "grad_norm": 1.1037371027487326, + "learning_rate": 4.894892846353032e-06, + "loss": 0.1718, + "step": 1498 + }, + { + "epoch": 0.1381121297277376, + "grad_norm": 1.1042776255985305, + "learning_rate": 4.8946741710176584e-06, + "loss": 0.1769, + "step": 1499 + }, + { + "epoch": 0.1382042659050076, + "grad_norm": 1.0159415033155752, + "learning_rate": 4.894455273336531e-06, + "loss": 0.1963, + "step": 1500 + }, + { + "epoch": 0.1382042659050076, + "eval_loss": 0.1832522302865982, + "eval_runtime": 300.5966, + "eval_samples_per_second": 23.344, + "eval_steps_per_second": 2.921, + "step": 1500 + }, + { + "epoch": 0.13829640208227761, + "grad_norm": 1.0590835289628433, + "learning_rate": 4.894236153329972e-06, + "loss": 0.1821, + "step": 1501 + }, + { + "epoch": 0.1383885382595476, + "grad_norm": 1.0055100832100328, + "learning_rate": 4.894016811018329e-06, + "loss": 0.164, + "step": 1502 + }, + { + "epoch": 0.1384806744368176, + "grad_norm": 1.031947823782199, + "learning_rate": 4.893797246421968e-06, + "loss": 0.1967, + "step": 1503 + }, + { + "epoch": 0.13857281061408763, + "grad_norm": 0.9897293422122585, + "learning_rate": 4.893577459561274e-06, + "loss": 0.1844, + "step": 1504 + }, + { + "epoch": 0.13866494679135763, + "grad_norm": 1.0213676397867222, + "learning_rate": 4.893357450456657e-06, + "loss": 0.1896, + "step": 1505 + }, + { + "epoch": 0.13875708296862763, + "grad_norm": 1.0049304145032214, + "learning_rate": 4.893137219128542e-06, + "loss": 0.1719, + "step": 1506 + }, + { + "epoch": 0.13884921914589762, + "grad_norm": 0.9605931957652998, + "learning_rate": 4.892916765597378e-06, + "loss": 0.1735, + "step": 1507 + }, + { + "epoch": 0.13894135532316765, + "grad_norm": 1.09891001647981, + "learning_rate": 4.892696089883636e-06, + "loss": 0.2017, + "step": 1508 + }, + { + "epoch": 0.13903349150043764, + "grad_norm": 1.0245227870445939, + "learning_rate": 4.8924751920078045e-06, + "loss": 0.1845, + "step": 1509 + }, + { + "epoch": 0.13912562767770764, + "grad_norm": 0.9744899746698248, + "learning_rate": 4.892254071990393e-06, + "loss": 0.1673, + "step": 1510 + }, + { + "epoch": 0.13921776385497767, + "grad_norm": 1.007033072682871, + "learning_rate": 4.892032729851934e-06, + "loss": 0.1638, + "step": 1511 + }, + { + "epoch": 0.13930990003224766, + "grad_norm": 1.0239978553664408, + "learning_rate": 4.891811165612979e-06, + "loss": 0.2006, + "step": 1512 + }, + { + "epoch": 0.13940203620951766, + "grad_norm": 0.9688980878806478, + "learning_rate": 4.8915893792941e-06, + "loss": 0.1741, + "step": 1513 + }, + { + "epoch": 0.13949417238678768, + "grad_norm": 1.0124497462752957, + "learning_rate": 4.891367370915889e-06, + "loss": 0.1853, + "step": 1514 + }, + { + "epoch": 0.13958630856405768, + "grad_norm": 0.9443507545978798, + "learning_rate": 4.89114514049896e-06, + "loss": 0.1853, + "step": 1515 + }, + { + "epoch": 0.13967844474132768, + "grad_norm": 0.8944876191974487, + "learning_rate": 4.890922688063949e-06, + "loss": 0.1713, + "step": 1516 + }, + { + "epoch": 0.13977058091859768, + "grad_norm": 0.9278038889701954, + "learning_rate": 4.8907000136315075e-06, + "loss": 0.159, + "step": 1517 + }, + { + "epoch": 0.1398627170958677, + "grad_norm": 0.9544401348206267, + "learning_rate": 4.890477117222313e-06, + "loss": 0.1646, + "step": 1518 + }, + { + "epoch": 0.1399548532731377, + "grad_norm": 0.993320836333389, + "learning_rate": 4.890253998857061e-06, + "loss": 0.1712, + "step": 1519 + }, + { + "epoch": 0.1400469894504077, + "grad_norm": 0.9672368448981169, + "learning_rate": 4.890030658556467e-06, + "loss": 0.1763, + "step": 1520 + }, + { + "epoch": 0.14013912562767772, + "grad_norm": 1.0007096297433211, + "learning_rate": 4.88980709634127e-06, + "loss": 0.1778, + "step": 1521 + }, + { + "epoch": 0.14023126180494772, + "grad_norm": 1.0380589027334104, + "learning_rate": 4.889583312232227e-06, + "loss": 0.2014, + "step": 1522 + }, + { + "epoch": 0.1403233979822177, + "grad_norm": 1.0112883370951264, + "learning_rate": 4.889359306250117e-06, + "loss": 0.173, + "step": 1523 + }, + { + "epoch": 0.1404155341594877, + "grad_norm": 0.9158439308113651, + "learning_rate": 4.889135078415736e-06, + "loss": 0.1703, + "step": 1524 + }, + { + "epoch": 0.14050767033675773, + "grad_norm": 0.971656273295498, + "learning_rate": 4.888910628749908e-06, + "loss": 0.2035, + "step": 1525 + }, + { + "epoch": 0.14059980651402773, + "grad_norm": 1.028981671131926, + "learning_rate": 4.88868595727347e-06, + "loss": 0.1804, + "step": 1526 + }, + { + "epoch": 0.14069194269129773, + "grad_norm": 0.9897401268515336, + "learning_rate": 4.888461064007284e-06, + "loss": 0.1767, + "step": 1527 + }, + { + "epoch": 0.14078407886856775, + "grad_norm": 0.9659819889789746, + "learning_rate": 4.888235948972232e-06, + "loss": 0.1853, + "step": 1528 + }, + { + "epoch": 0.14087621504583775, + "grad_norm": 0.9421444160040869, + "learning_rate": 4.888010612189213e-06, + "loss": 0.1643, + "step": 1529 + }, + { + "epoch": 0.14096835122310775, + "grad_norm": 1.0162138437056278, + "learning_rate": 4.8877850536791535e-06, + "loss": 0.191, + "step": 1530 + }, + { + "epoch": 0.14106048740037777, + "grad_norm": 1.074012467266447, + "learning_rate": 4.887559273462994e-06, + "loss": 0.1941, + "step": 1531 + }, + { + "epoch": 0.14115262357764777, + "grad_norm": 0.9711479225310997, + "learning_rate": 4.8873332715617e-06, + "loss": 0.1845, + "step": 1532 + }, + { + "epoch": 0.14124475975491776, + "grad_norm": 1.0096948886121502, + "learning_rate": 4.887107047996253e-06, + "loss": 0.1911, + "step": 1533 + }, + { + "epoch": 0.14133689593218776, + "grad_norm": 0.9516594505504503, + "learning_rate": 4.886880602787661e-06, + "loss": 0.1763, + "step": 1534 + }, + { + "epoch": 0.1414290321094578, + "grad_norm": 0.9870978246474578, + "learning_rate": 4.886653935956949e-06, + "loss": 0.172, + "step": 1535 + }, + { + "epoch": 0.14152116828672778, + "grad_norm": 0.9766472869579116, + "learning_rate": 4.88642704752516e-06, + "loss": 0.1664, + "step": 1536 + }, + { + "epoch": 0.14161330446399778, + "grad_norm": 0.8996626934665198, + "learning_rate": 4.886199937513365e-06, + "loss": 0.1725, + "step": 1537 + }, + { + "epoch": 0.1417054406412678, + "grad_norm": 1.1275854369973244, + "learning_rate": 4.885972605942647e-06, + "loss": 0.1811, + "step": 1538 + }, + { + "epoch": 0.1417975768185378, + "grad_norm": 0.9686314114387246, + "learning_rate": 4.8857450528341166e-06, + "loss": 0.1725, + "step": 1539 + }, + { + "epoch": 0.1418897129958078, + "grad_norm": 0.9318170462863459, + "learning_rate": 4.8855172782089015e-06, + "loss": 0.1632, + "step": 1540 + }, + { + "epoch": 0.1419818491730778, + "grad_norm": 0.9707586029191749, + "learning_rate": 4.88528928208815e-06, + "loss": 0.1759, + "step": 1541 + }, + { + "epoch": 0.14207398535034782, + "grad_norm": 1.0456594221249906, + "learning_rate": 4.885061064493033e-06, + "loss": 0.1786, + "step": 1542 + }, + { + "epoch": 0.14216612152761782, + "grad_norm": 0.9707787392031242, + "learning_rate": 4.884832625444738e-06, + "loss": 0.1732, + "step": 1543 + }, + { + "epoch": 0.1422582577048878, + "grad_norm": 0.9532507142755162, + "learning_rate": 4.8846039649644785e-06, + "loss": 0.1662, + "step": 1544 + }, + { + "epoch": 0.14235039388215784, + "grad_norm": 0.9159276555645566, + "learning_rate": 4.884375083073483e-06, + "loss": 0.1715, + "step": 1545 + }, + { + "epoch": 0.14244253005942784, + "grad_norm": 1.0166336431813243, + "learning_rate": 4.8841459797930045e-06, + "loss": 0.1841, + "step": 1546 + }, + { + "epoch": 0.14253466623669783, + "grad_norm": 0.9938953514211266, + "learning_rate": 4.8839166551443165e-06, + "loss": 0.1917, + "step": 1547 + }, + { + "epoch": 0.14262680241396786, + "grad_norm": 0.9985060362734772, + "learning_rate": 4.883687109148709e-06, + "loss": 0.1909, + "step": 1548 + }, + { + "epoch": 0.14271893859123785, + "grad_norm": 0.9480753155210944, + "learning_rate": 4.883457341827498e-06, + "loss": 0.1634, + "step": 1549 + }, + { + "epoch": 0.14281107476850785, + "grad_norm": 1.044475869424449, + "learning_rate": 4.883227353202016e-06, + "loss": 0.1905, + "step": 1550 + }, + { + "epoch": 0.14290321094577785, + "grad_norm": 0.9751713676071151, + "learning_rate": 4.882997143293617e-06, + "loss": 0.1766, + "step": 1551 + }, + { + "epoch": 0.14299534712304787, + "grad_norm": 1.0616405307473622, + "learning_rate": 4.882766712123677e-06, + "loss": 0.177, + "step": 1552 + }, + { + "epoch": 0.14308748330031787, + "grad_norm": 0.9891993285446687, + "learning_rate": 4.882536059713592e-06, + "loss": 0.1902, + "step": 1553 + }, + { + "epoch": 0.14317961947758787, + "grad_norm": 0.9812316312540941, + "learning_rate": 4.882305186084777e-06, + "loss": 0.1828, + "step": 1554 + }, + { + "epoch": 0.1432717556548579, + "grad_norm": 1.0198101696782014, + "learning_rate": 4.88207409125867e-06, + "loss": 0.1784, + "step": 1555 + }, + { + "epoch": 0.1433638918321279, + "grad_norm": 0.9787051486382536, + "learning_rate": 4.881842775256726e-06, + "loss": 0.1802, + "step": 1556 + }, + { + "epoch": 0.14345602800939788, + "grad_norm": 0.9757898719198366, + "learning_rate": 4.8816112381004245e-06, + "loss": 0.1722, + "step": 1557 + }, + { + "epoch": 0.14354816418666788, + "grad_norm": 0.994552243999101, + "learning_rate": 4.881379479811263e-06, + "loss": 0.183, + "step": 1558 + }, + { + "epoch": 0.1436403003639379, + "grad_norm": 0.9879186122580522, + "learning_rate": 4.881147500410761e-06, + "loss": 0.1727, + "step": 1559 + }, + { + "epoch": 0.1437324365412079, + "grad_norm": 0.9389846554223322, + "learning_rate": 4.880915299920457e-06, + "loss": 0.183, + "step": 1560 + }, + { + "epoch": 0.1438245727184779, + "grad_norm": 0.9176926081504023, + "learning_rate": 4.8806828783619106e-06, + "loss": 0.1648, + "step": 1561 + }, + { + "epoch": 0.14391670889574792, + "grad_norm": 1.0177887444327505, + "learning_rate": 4.880450235756704e-06, + "loss": 0.1858, + "step": 1562 + }, + { + "epoch": 0.14400884507301792, + "grad_norm": 0.8823529515319729, + "learning_rate": 4.880217372126436e-06, + "loss": 0.1709, + "step": 1563 + }, + { + "epoch": 0.14410098125028792, + "grad_norm": 0.9499161520652284, + "learning_rate": 4.8799842874927285e-06, + "loss": 0.1833, + "step": 1564 + }, + { + "epoch": 0.14419311742755794, + "grad_norm": 0.9768941581657758, + "learning_rate": 4.879750981877224e-06, + "loss": 0.1751, + "step": 1565 + }, + { + "epoch": 0.14428525360482794, + "grad_norm": 1.0382432327832614, + "learning_rate": 4.879517455301585e-06, + "loss": 0.1696, + "step": 1566 + }, + { + "epoch": 0.14437738978209794, + "grad_norm": 1.0226195818276123, + "learning_rate": 4.8792837077874945e-06, + "loss": 0.1897, + "step": 1567 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 0.9536502285629268, + "learning_rate": 4.8790497393566546e-06, + "loss": 0.1905, + "step": 1568 + }, + { + "epoch": 0.14456166213663796, + "grad_norm": 0.9933028554235266, + "learning_rate": 4.878815550030792e-06, + "loss": 0.1772, + "step": 1569 + }, + { + "epoch": 0.14465379831390796, + "grad_norm": 0.94927888214594, + "learning_rate": 4.878581139831649e-06, + "loss": 0.174, + "step": 1570 + }, + { + "epoch": 0.14474593449117795, + "grad_norm": 0.9734027162264917, + "learning_rate": 4.87834650878099e-06, + "loss": 0.1878, + "step": 1571 + }, + { + "epoch": 0.14483807066844798, + "grad_norm": 0.9927226017155047, + "learning_rate": 4.8781116569006026e-06, + "loss": 0.1783, + "step": 1572 + }, + { + "epoch": 0.14493020684571797, + "grad_norm": 0.8755924113255371, + "learning_rate": 4.877876584212292e-06, + "loss": 0.1789, + "step": 1573 + }, + { + "epoch": 0.14502234302298797, + "grad_norm": 0.8908355930191686, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.1659, + "step": 1574 + }, + { + "epoch": 0.14511447920025797, + "grad_norm": 0.9964829044139177, + "learning_rate": 4.8774057764992275e-06, + "loss": 0.1903, + "step": 1575 + }, + { + "epoch": 0.145206615377528, + "grad_norm": 1.0087199814403942, + "learning_rate": 4.877170041518187e-06, + "loss": 0.1846, + "step": 1576 + }, + { + "epoch": 0.145298751554798, + "grad_norm": 0.9427168363296254, + "learning_rate": 4.876934085816654e-06, + "loss": 0.1805, + "step": 1577 + }, + { + "epoch": 0.14539088773206799, + "grad_norm": 1.0114845794628744, + "learning_rate": 4.8766979094165346e-06, + "loss": 0.1767, + "step": 1578 + }, + { + "epoch": 0.145483023909338, + "grad_norm": 0.9339557450945638, + "learning_rate": 4.8764615123397584e-06, + "loss": 0.1773, + "step": 1579 + }, + { + "epoch": 0.145575160086608, + "grad_norm": 1.0022634915094149, + "learning_rate": 4.876224894608275e-06, + "loss": 0.1836, + "step": 1580 + }, + { + "epoch": 0.145667296263878, + "grad_norm": 1.0173079709967716, + "learning_rate": 4.875988056244055e-06, + "loss": 0.1951, + "step": 1581 + }, + { + "epoch": 0.14575943244114803, + "grad_norm": 0.9166566621649066, + "learning_rate": 4.875750997269088e-06, + "loss": 0.1713, + "step": 1582 + }, + { + "epoch": 0.14585156861841803, + "grad_norm": 0.9612055526441895, + "learning_rate": 4.875513717705385e-06, + "loss": 0.1803, + "step": 1583 + }, + { + "epoch": 0.14594370479568802, + "grad_norm": 0.9483501489894158, + "learning_rate": 4.875276217574978e-06, + "loss": 0.1804, + "step": 1584 + }, + { + "epoch": 0.14603584097295802, + "grad_norm": 0.9754513204683902, + "learning_rate": 4.875038496899919e-06, + "loss": 0.1776, + "step": 1585 + }, + { + "epoch": 0.14612797715022804, + "grad_norm": 0.9887240406962738, + "learning_rate": 4.874800555702278e-06, + "loss": 0.1859, + "step": 1586 + }, + { + "epoch": 0.14622011332749804, + "grad_norm": 0.9269334563736277, + "learning_rate": 4.874562394004152e-06, + "loss": 0.1852, + "step": 1587 + }, + { + "epoch": 0.14631224950476804, + "grad_norm": 0.9301846330078121, + "learning_rate": 4.874324011827651e-06, + "loss": 0.1743, + "step": 1588 + }, + { + "epoch": 0.14640438568203806, + "grad_norm": 1.0003398362769642, + "learning_rate": 4.874085409194911e-06, + "loss": 0.1961, + "step": 1589 + }, + { + "epoch": 0.14649652185930806, + "grad_norm": 0.9124366367357205, + "learning_rate": 4.873846586128083e-06, + "loss": 0.1683, + "step": 1590 + }, + { + "epoch": 0.14658865803657806, + "grad_norm": 0.9337326845444719, + "learning_rate": 4.873607542649347e-06, + "loss": 0.1814, + "step": 1591 + }, + { + "epoch": 0.14668079421384805, + "grad_norm": 0.9707004361697791, + "learning_rate": 4.873368278780893e-06, + "loss": 0.1835, + "step": 1592 + }, + { + "epoch": 0.14677293039111808, + "grad_norm": 0.9567094098967206, + "learning_rate": 4.87312879454494e-06, + "loss": 0.1687, + "step": 1593 + }, + { + "epoch": 0.14686506656838808, + "grad_norm": 1.0388349235769987, + "learning_rate": 4.872889089963723e-06, + "loss": 0.1999, + "step": 1594 + }, + { + "epoch": 0.14695720274565807, + "grad_norm": 1.0055808616791115, + "learning_rate": 4.872649165059497e-06, + "loss": 0.196, + "step": 1595 + }, + { + "epoch": 0.1470493389229281, + "grad_norm": 0.9344422199590832, + "learning_rate": 4.872409019854543e-06, + "loss": 0.1674, + "step": 1596 + }, + { + "epoch": 0.1471414751001981, + "grad_norm": 0.9358621830893651, + "learning_rate": 4.872168654371155e-06, + "loss": 0.1735, + "step": 1597 + }, + { + "epoch": 0.1472336112774681, + "grad_norm": 0.9787417589126662, + "learning_rate": 4.8719280686316524e-06, + "loss": 0.1662, + "step": 1598 + }, + { + "epoch": 0.14732574745473812, + "grad_norm": 0.969455208958826, + "learning_rate": 4.871687262658373e-06, + "loss": 0.1887, + "step": 1599 + }, + { + "epoch": 0.1474178836320081, + "grad_norm": 0.9719521093017446, + "learning_rate": 4.871446236473676e-06, + "loss": 0.1807, + "step": 1600 + }, + { + "epoch": 0.1475100198092781, + "grad_norm": 0.967409307452388, + "learning_rate": 4.871204990099941e-06, + "loss": 0.1689, + "step": 1601 + }, + { + "epoch": 0.1476021559865481, + "grad_norm": 1.0137520395644053, + "learning_rate": 4.870963523559567e-06, + "loss": 0.1768, + "step": 1602 + }, + { + "epoch": 0.14769429216381813, + "grad_norm": 1.0032661099951032, + "learning_rate": 4.8707218368749755e-06, + "loss": 0.2019, + "step": 1603 + }, + { + "epoch": 0.14778642834108813, + "grad_norm": 0.994483855416401, + "learning_rate": 4.870479930068607e-06, + "loss": 0.1846, + "step": 1604 + }, + { + "epoch": 0.14787856451835812, + "grad_norm": 0.9323151361272326, + "learning_rate": 4.8702378031629204e-06, + "loss": 0.1703, + "step": 1605 + }, + { + "epoch": 0.14797070069562815, + "grad_norm": 0.9392494774290172, + "learning_rate": 4.869995456180399e-06, + "loss": 0.1724, + "step": 1606 + }, + { + "epoch": 0.14806283687289815, + "grad_norm": 0.9840474087404582, + "learning_rate": 4.869752889143544e-06, + "loss": 0.1831, + "step": 1607 + }, + { + "epoch": 0.14815497305016814, + "grad_norm": 0.9347889910562127, + "learning_rate": 4.8695101020748796e-06, + "loss": 0.1707, + "step": 1608 + }, + { + "epoch": 0.14824710922743817, + "grad_norm": 0.9683681475620743, + "learning_rate": 4.869267094996946e-06, + "loss": 0.1821, + "step": 1609 + }, + { + "epoch": 0.14833924540470816, + "grad_norm": 0.951231654892894, + "learning_rate": 4.869023867932309e-06, + "loss": 0.173, + "step": 1610 + }, + { + "epoch": 0.14843138158197816, + "grad_norm": 0.9654930888260544, + "learning_rate": 4.868780420903549e-06, + "loss": 0.1802, + "step": 1611 + }, + { + "epoch": 0.14852351775924816, + "grad_norm": 1.0105698517481636, + "learning_rate": 4.868536753933273e-06, + "loss": 0.1892, + "step": 1612 + }, + { + "epoch": 0.14861565393651818, + "grad_norm": 0.9740568467052928, + "learning_rate": 4.868292867044104e-06, + "loss": 0.1802, + "step": 1613 + }, + { + "epoch": 0.14870779011378818, + "grad_norm": 1.0226350980028478, + "learning_rate": 4.868048760258688e-06, + "loss": 0.1893, + "step": 1614 + }, + { + "epoch": 0.14879992629105818, + "grad_norm": 0.952735105485474, + "learning_rate": 4.86780443359969e-06, + "loss": 0.1895, + "step": 1615 + }, + { + "epoch": 0.1488920624683282, + "grad_norm": 0.9784818281571142, + "learning_rate": 4.8675598870897945e-06, + "loss": 0.1775, + "step": 1616 + }, + { + "epoch": 0.1489841986455982, + "grad_norm": 0.9473189174777866, + "learning_rate": 4.86731512075171e-06, + "loss": 0.1766, + "step": 1617 + }, + { + "epoch": 0.1490763348228682, + "grad_norm": 1.0432332046226518, + "learning_rate": 4.86707013460816e-06, + "loss": 0.1785, + "step": 1618 + }, + { + "epoch": 0.1491684710001382, + "grad_norm": 1.036504291535368, + "learning_rate": 4.866824928681895e-06, + "loss": 0.1896, + "step": 1619 + }, + { + "epoch": 0.14926060717740822, + "grad_norm": 1.0094182586165745, + "learning_rate": 4.86657950299568e-06, + "loss": 0.1908, + "step": 1620 + }, + { + "epoch": 0.1493527433546782, + "grad_norm": 0.9844012875372663, + "learning_rate": 4.866333857572303e-06, + "loss": 0.1854, + "step": 1621 + }, + { + "epoch": 0.1494448795319482, + "grad_norm": 0.9374114123884193, + "learning_rate": 4.866087992434573e-06, + "loss": 0.157, + "step": 1622 + }, + { + "epoch": 0.14953701570921824, + "grad_norm": 1.006479066028266, + "learning_rate": 4.865841907605319e-06, + "loss": 0.1824, + "step": 1623 + }, + { + "epoch": 0.14962915188648823, + "grad_norm": 0.9427763406675067, + "learning_rate": 4.865595603107388e-06, + "loss": 0.1777, + "step": 1624 + }, + { + "epoch": 0.14972128806375823, + "grad_norm": 1.0306848153573909, + "learning_rate": 4.865349078963652e-06, + "loss": 0.1883, + "step": 1625 + }, + { + "epoch": 0.14981342424102825, + "grad_norm": 1.0514858020888211, + "learning_rate": 4.865102335196999e-06, + "loss": 0.1851, + "step": 1626 + }, + { + "epoch": 0.14990556041829825, + "grad_norm": 0.9753039893941937, + "learning_rate": 4.8648553718303386e-06, + "loss": 0.1826, + "step": 1627 + }, + { + "epoch": 0.14999769659556825, + "grad_norm": 1.0084449553216943, + "learning_rate": 4.864608188886603e-06, + "loss": 0.1822, + "step": 1628 + }, + { + "epoch": 0.15008983277283824, + "grad_norm": 1.006986138104111, + "learning_rate": 4.8643607863887435e-06, + "loss": 0.1828, + "step": 1629 + }, + { + "epoch": 0.15018196895010827, + "grad_norm": 1.0297786557131936, + "learning_rate": 4.8641131643597294e-06, + "loss": 0.2041, + "step": 1630 + }, + { + "epoch": 0.15027410512737827, + "grad_norm": 1.0037871234057218, + "learning_rate": 4.863865322822553e-06, + "loss": 0.1904, + "step": 1631 + }, + { + "epoch": 0.15036624130464826, + "grad_norm": 0.9211265793923229, + "learning_rate": 4.863617261800229e-06, + "loss": 0.1853, + "step": 1632 + }, + { + "epoch": 0.1504583774819183, + "grad_norm": 0.9938628142088426, + "learning_rate": 4.863368981315786e-06, + "loss": 0.1869, + "step": 1633 + }, + { + "epoch": 0.15055051365918828, + "grad_norm": 1.017354171445112, + "learning_rate": 4.86312048139228e-06, + "loss": 0.1867, + "step": 1634 + }, + { + "epoch": 0.15064264983645828, + "grad_norm": 0.9333495575889209, + "learning_rate": 4.862871762052782e-06, + "loss": 0.1667, + "step": 1635 + }, + { + "epoch": 0.15073478601372828, + "grad_norm": 0.9518180727935168, + "learning_rate": 4.862622823320388e-06, + "loss": 0.1788, + "step": 1636 + }, + { + "epoch": 0.1508269221909983, + "grad_norm": 1.009626987099678, + "learning_rate": 4.862373665218209e-06, + "loss": 0.1648, + "step": 1637 + }, + { + "epoch": 0.1509190583682683, + "grad_norm": 0.9908827014313232, + "learning_rate": 4.862124287769382e-06, + "loss": 0.1888, + "step": 1638 + }, + { + "epoch": 0.1510111945455383, + "grad_norm": 0.9522038223620974, + "learning_rate": 4.86187469099706e-06, + "loss": 0.1736, + "step": 1639 + }, + { + "epoch": 0.15110333072280832, + "grad_norm": 1.0418156159165777, + "learning_rate": 4.861624874924419e-06, + "loss": 0.1875, + "step": 1640 + }, + { + "epoch": 0.15119546690007832, + "grad_norm": 0.9646383512472281, + "learning_rate": 4.861374839574654e-06, + "loss": 0.1712, + "step": 1641 + }, + { + "epoch": 0.15128760307734831, + "grad_norm": 1.0021890657671846, + "learning_rate": 4.861124584970981e-06, + "loss": 0.193, + "step": 1642 + }, + { + "epoch": 0.15137973925461834, + "grad_norm": 1.0054203275956135, + "learning_rate": 4.860874111136637e-06, + "loss": 0.1775, + "step": 1643 + }, + { + "epoch": 0.15147187543188834, + "grad_norm": 0.9488906602949967, + "learning_rate": 4.860623418094877e-06, + "loss": 0.1907, + "step": 1644 + }, + { + "epoch": 0.15156401160915833, + "grad_norm": 1.0123184700678247, + "learning_rate": 4.8603725058689785e-06, + "loss": 0.1903, + "step": 1645 + }, + { + "epoch": 0.15165614778642833, + "grad_norm": 1.0996320649763636, + "learning_rate": 4.860121374482239e-06, + "loss": 0.1722, + "step": 1646 + }, + { + "epoch": 0.15174828396369835, + "grad_norm": 0.9861341443618606, + "learning_rate": 4.859870023957976e-06, + "loss": 0.1835, + "step": 1647 + }, + { + "epoch": 0.15184042014096835, + "grad_norm": 0.9267396471067681, + "learning_rate": 4.8596184543195265e-06, + "loss": 0.1555, + "step": 1648 + }, + { + "epoch": 0.15193255631823835, + "grad_norm": 1.2004716263814705, + "learning_rate": 4.859366665590251e-06, + "loss": 0.1959, + "step": 1649 + }, + { + "epoch": 0.15202469249550837, + "grad_norm": 0.968519280731306, + "learning_rate": 4.859114657793526e-06, + "loss": 0.1843, + "step": 1650 + }, + { + "epoch": 0.15211682867277837, + "grad_norm": 0.9437339279271343, + "learning_rate": 4.858862430952751e-06, + "loss": 0.1683, + "step": 1651 + }, + { + "epoch": 0.15220896485004837, + "grad_norm": 1.1495872644096086, + "learning_rate": 4.858609985091345e-06, + "loss": 0.188, + "step": 1652 + }, + { + "epoch": 0.15230110102731836, + "grad_norm": 0.9283484607685915, + "learning_rate": 4.858357320232749e-06, + "loss": 0.1684, + "step": 1653 + }, + { + "epoch": 0.1523932372045884, + "grad_norm": 0.9333600768277015, + "learning_rate": 4.858104436400422e-06, + "loss": 0.1672, + "step": 1654 + }, + { + "epoch": 0.15248537338185839, + "grad_norm": 1.0603988968811429, + "learning_rate": 4.857851333617844e-06, + "loss": 0.1833, + "step": 1655 + }, + { + "epoch": 0.15257750955912838, + "grad_norm": 0.8944080350738115, + "learning_rate": 4.857598011908515e-06, + "loss": 0.1706, + "step": 1656 + }, + { + "epoch": 0.1526696457363984, + "grad_norm": 1.0031502555495742, + "learning_rate": 4.857344471295958e-06, + "loss": 0.186, + "step": 1657 + }, + { + "epoch": 0.1527617819136684, + "grad_norm": 1.0782002303208784, + "learning_rate": 4.857090711803713e-06, + "loss": 0.1812, + "step": 1658 + }, + { + "epoch": 0.1528539180909384, + "grad_norm": 0.9937919230014062, + "learning_rate": 4.856836733455341e-06, + "loss": 0.1638, + "step": 1659 + }, + { + "epoch": 0.15294605426820843, + "grad_norm": 0.9962883028859182, + "learning_rate": 4.8565825362744255e-06, + "loss": 0.1753, + "step": 1660 + }, + { + "epoch": 0.15303819044547842, + "grad_norm": 0.8589383960068907, + "learning_rate": 4.8563281202845666e-06, + "loss": 0.1608, + "step": 1661 + }, + { + "epoch": 0.15313032662274842, + "grad_norm": 0.9811901617708566, + "learning_rate": 4.85607348550939e-06, + "loss": 0.1871, + "step": 1662 + }, + { + "epoch": 0.15322246280001842, + "grad_norm": 0.9399582788710602, + "learning_rate": 4.855818631972535e-06, + "loss": 0.1832, + "step": 1663 + }, + { + "epoch": 0.15331459897728844, + "grad_norm": 0.9372479897951055, + "learning_rate": 4.855563559697668e-06, + "loss": 0.1684, + "step": 1664 + }, + { + "epoch": 0.15340673515455844, + "grad_norm": 0.9427635954759986, + "learning_rate": 4.855308268708469e-06, + "loss": 0.1647, + "step": 1665 + }, + { + "epoch": 0.15349887133182843, + "grad_norm": 0.9627536164774635, + "learning_rate": 4.8550527590286455e-06, + "loss": 0.1803, + "step": 1666 + }, + { + "epoch": 0.15359100750909846, + "grad_norm": 0.9739001116786681, + "learning_rate": 4.85479703068192e-06, + "loss": 0.168, + "step": 1667 + }, + { + "epoch": 0.15368314368636846, + "grad_norm": 0.9356987155049376, + "learning_rate": 4.854541083692036e-06, + "loss": 0.1746, + "step": 1668 + }, + { + "epoch": 0.15377527986363845, + "grad_norm": 1.0169478043112947, + "learning_rate": 4.854284918082759e-06, + "loss": 0.1695, + "step": 1669 + }, + { + "epoch": 0.15386741604090845, + "grad_norm": 0.977947892285413, + "learning_rate": 4.854028533877874e-06, + "loss": 0.1782, + "step": 1670 + }, + { + "epoch": 0.15395955221817847, + "grad_norm": 0.9697331096055015, + "learning_rate": 4.8537719311011865e-06, + "loss": 0.1767, + "step": 1671 + }, + { + "epoch": 0.15405168839544847, + "grad_norm": 0.9882267562435134, + "learning_rate": 4.853515109776522e-06, + "loss": 0.1781, + "step": 1672 + }, + { + "epoch": 0.15414382457271847, + "grad_norm": 0.9743762721224799, + "learning_rate": 4.8532580699277256e-06, + "loss": 0.1655, + "step": 1673 + }, + { + "epoch": 0.1542359607499885, + "grad_norm": 0.8948799120507229, + "learning_rate": 4.853000811578665e-06, + "loss": 0.1713, + "step": 1674 + }, + { + "epoch": 0.1543280969272585, + "grad_norm": 0.9685729986401584, + "learning_rate": 4.852743334753226e-06, + "loss": 0.1772, + "step": 1675 + }, + { + "epoch": 0.1544202331045285, + "grad_norm": 1.0010420081338955, + "learning_rate": 4.852485639475314e-06, + "loss": 0.1827, + "step": 1676 + }, + { + "epoch": 0.1545123692817985, + "grad_norm": 0.9126092389102316, + "learning_rate": 4.852227725768857e-06, + "loss": 0.1753, + "step": 1677 + }, + { + "epoch": 0.1546045054590685, + "grad_norm": 0.9552177894021503, + "learning_rate": 4.8519695936578045e-06, + "loss": 0.1828, + "step": 1678 + }, + { + "epoch": 0.1546966416363385, + "grad_norm": 0.9534993774757963, + "learning_rate": 4.851711243166121e-06, + "loss": 0.1808, + "step": 1679 + }, + { + "epoch": 0.1547887778136085, + "grad_norm": 0.9956457154947068, + "learning_rate": 4.851452674317795e-06, + "loss": 0.1898, + "step": 1680 + }, + { + "epoch": 0.15488091399087853, + "grad_norm": 0.9164339311946711, + "learning_rate": 4.851193887136835e-06, + "loss": 0.1635, + "step": 1681 + }, + { + "epoch": 0.15497305016814852, + "grad_norm": 0.95908455887589, + "learning_rate": 4.850934881647271e-06, + "loss": 0.1802, + "step": 1682 + }, + { + "epoch": 0.15506518634541852, + "grad_norm": 1.0414535544577295, + "learning_rate": 4.850675657873149e-06, + "loss": 0.2002, + "step": 1683 + }, + { + "epoch": 0.15515732252268855, + "grad_norm": 0.9468141742151894, + "learning_rate": 4.850416215838539e-06, + "loss": 0.1796, + "step": 1684 + }, + { + "epoch": 0.15524945869995854, + "grad_norm": 1.0035960741903245, + "learning_rate": 4.850156555567531e-06, + "loss": 0.1758, + "step": 1685 + }, + { + "epoch": 0.15534159487722854, + "grad_norm": 0.9767439731982273, + "learning_rate": 4.849896677084234e-06, + "loss": 0.1774, + "step": 1686 + }, + { + "epoch": 0.15543373105449854, + "grad_norm": 1.034221041451684, + "learning_rate": 4.849636580412778e-06, + "loss": 0.2034, + "step": 1687 + }, + { + "epoch": 0.15552586723176856, + "grad_norm": 0.9576078272674569, + "learning_rate": 4.849376265577312e-06, + "loss": 0.1867, + "step": 1688 + }, + { + "epoch": 0.15561800340903856, + "grad_norm": 1.0442517742292943, + "learning_rate": 4.849115732602006e-06, + "loss": 0.2, + "step": 1689 + }, + { + "epoch": 0.15571013958630855, + "grad_norm": 0.8993671722695789, + "learning_rate": 4.848854981511053e-06, + "loss": 0.1634, + "step": 1690 + }, + { + "epoch": 0.15580227576357858, + "grad_norm": 0.9595537498820411, + "learning_rate": 4.848594012328661e-06, + "loss": 0.1763, + "step": 1691 + }, + { + "epoch": 0.15589441194084858, + "grad_norm": 0.9627114516540672, + "learning_rate": 4.848332825079063e-06, + "loss": 0.1853, + "step": 1692 + }, + { + "epoch": 0.15598654811811857, + "grad_norm": 0.9229982001738194, + "learning_rate": 4.848071419786509e-06, + "loss": 0.1715, + "step": 1693 + }, + { + "epoch": 0.1560786842953886, + "grad_norm": 0.9070563011756574, + "learning_rate": 4.847809796475271e-06, + "loss": 0.1743, + "step": 1694 + }, + { + "epoch": 0.1561708204726586, + "grad_norm": 0.912107518148019, + "learning_rate": 4.8475479551696405e-06, + "loss": 0.1714, + "step": 1695 + }, + { + "epoch": 0.1562629566499286, + "grad_norm": 1.031131238472556, + "learning_rate": 4.847285895893931e-06, + "loss": 0.1881, + "step": 1696 + }, + { + "epoch": 0.1563550928271986, + "grad_norm": 0.8837742757425879, + "learning_rate": 4.847023618672472e-06, + "loss": 0.1628, + "step": 1697 + }, + { + "epoch": 0.1564472290044686, + "grad_norm": 0.8733263411942864, + "learning_rate": 4.846761123529618e-06, + "loss": 0.1644, + "step": 1698 + }, + { + "epoch": 0.1565393651817386, + "grad_norm": 0.9097551228910571, + "learning_rate": 4.846498410489741e-06, + "loss": 0.1682, + "step": 1699 + }, + { + "epoch": 0.1566315013590086, + "grad_norm": 0.9915359550647295, + "learning_rate": 4.846235479577234e-06, + "loss": 0.1853, + "step": 1700 + }, + { + "epoch": 0.15672363753627863, + "grad_norm": 1.030370760761187, + "learning_rate": 4.845972330816511e-06, + "loss": 0.1927, + "step": 1701 + }, + { + "epoch": 0.15681577371354863, + "grad_norm": 1.0241222156942211, + "learning_rate": 4.845708964232003e-06, + "loss": 0.1904, + "step": 1702 + }, + { + "epoch": 0.15690790989081863, + "grad_norm": 0.9367671000205519, + "learning_rate": 4.845445379848167e-06, + "loss": 0.1812, + "step": 1703 + }, + { + "epoch": 0.15700004606808862, + "grad_norm": 0.969147645704287, + "learning_rate": 4.845181577689474e-06, + "loss": 0.1826, + "step": 1704 + }, + { + "epoch": 0.15709218224535865, + "grad_norm": 0.914293608169453, + "learning_rate": 4.844917557780419e-06, + "loss": 0.1794, + "step": 1705 + }, + { + "epoch": 0.15718431842262864, + "grad_norm": 1.007827722207317, + "learning_rate": 4.844653320145517e-06, + "loss": 0.2025, + "step": 1706 + }, + { + "epoch": 0.15727645459989864, + "grad_norm": 0.9844002604666592, + "learning_rate": 4.844388864809302e-06, + "loss": 0.1754, + "step": 1707 + }, + { + "epoch": 0.15736859077716867, + "grad_norm": 1.0507807427487061, + "learning_rate": 4.844124191796328e-06, + "loss": 0.1871, + "step": 1708 + }, + { + "epoch": 0.15746072695443866, + "grad_norm": 1.0395668195003018, + "learning_rate": 4.843859301131171e-06, + "loss": 0.2118, + "step": 1709 + }, + { + "epoch": 0.15755286313170866, + "grad_norm": 1.0186142609766382, + "learning_rate": 4.843594192838425e-06, + "loss": 0.1991, + "step": 1710 + }, + { + "epoch": 0.15764499930897868, + "grad_norm": 0.9191028208060649, + "learning_rate": 4.8433288669427055e-06, + "loss": 0.1673, + "step": 1711 + }, + { + "epoch": 0.15773713548624868, + "grad_norm": 1.0107944706696486, + "learning_rate": 4.84306332346865e-06, + "loss": 0.1813, + "step": 1712 + }, + { + "epoch": 0.15782927166351868, + "grad_norm": 0.8938007261528311, + "learning_rate": 4.842797562440913e-06, + "loss": 0.1716, + "step": 1713 + }, + { + "epoch": 0.15792140784078867, + "grad_norm": 0.9842512427881709, + "learning_rate": 4.842531583884168e-06, + "loss": 0.1797, + "step": 1714 + }, + { + "epoch": 0.1580135440180587, + "grad_norm": 1.0168585547681084, + "learning_rate": 4.842265387823115e-06, + "loss": 0.1949, + "step": 1715 + }, + { + "epoch": 0.1581056801953287, + "grad_norm": 0.9344893607216574, + "learning_rate": 4.841998974282469e-06, + "loss": 0.1687, + "step": 1716 + }, + { + "epoch": 0.1581978163725987, + "grad_norm": 0.9569816615949233, + "learning_rate": 4.841732343286965e-06, + "loss": 0.1737, + "step": 1717 + }, + { + "epoch": 0.15828995254986872, + "grad_norm": 0.937606627263079, + "learning_rate": 4.841465494861362e-06, + "loss": 0.1811, + "step": 1718 + }, + { + "epoch": 0.15838208872713871, + "grad_norm": 1.0238800385610571, + "learning_rate": 4.841198429030435e-06, + "loss": 0.1739, + "step": 1719 + }, + { + "epoch": 0.1584742249044087, + "grad_norm": 0.9619074678640244, + "learning_rate": 4.840931145818982e-06, + "loss": 0.1632, + "step": 1720 + }, + { + "epoch": 0.1585663610816787, + "grad_norm": 0.9303762249991079, + "learning_rate": 4.84066364525182e-06, + "loss": 0.1721, + "step": 1721 + }, + { + "epoch": 0.15865849725894873, + "grad_norm": 0.9418596387167747, + "learning_rate": 4.8403959273537875e-06, + "loss": 0.1739, + "step": 1722 + }, + { + "epoch": 0.15875063343621873, + "grad_norm": 1.087199652929513, + "learning_rate": 4.8401279921497405e-06, + "loss": 0.1902, + "step": 1723 + }, + { + "epoch": 0.15884276961348873, + "grad_norm": 0.9637578112894941, + "learning_rate": 4.839859839664557e-06, + "loss": 0.198, + "step": 1724 + }, + { + "epoch": 0.15893490579075875, + "grad_norm": 0.8742090828607239, + "learning_rate": 4.839591469923137e-06, + "loss": 0.1676, + "step": 1725 + }, + { + "epoch": 0.15902704196802875, + "grad_norm": 0.9493609600429447, + "learning_rate": 4.8393228829503966e-06, + "loss": 0.1838, + "step": 1726 + }, + { + "epoch": 0.15911917814529875, + "grad_norm": 0.9210297680349159, + "learning_rate": 4.839054078771275e-06, + "loss": 0.1838, + "step": 1727 + }, + { + "epoch": 0.15921131432256877, + "grad_norm": 0.9447500968465715, + "learning_rate": 4.83878505741073e-06, + "loss": 0.1835, + "step": 1728 + }, + { + "epoch": 0.15930345049983877, + "grad_norm": 0.9540678329702622, + "learning_rate": 4.838515818893741e-06, + "loss": 0.172, + "step": 1729 + }, + { + "epoch": 0.15939558667710876, + "grad_norm": 1.0123215650229256, + "learning_rate": 4.838246363245306e-06, + "loss": 0.1641, + "step": 1730 + }, + { + "epoch": 0.15948772285437876, + "grad_norm": 0.9966400481188536, + "learning_rate": 4.837976690490445e-06, + "loss": 0.1905, + "step": 1731 + }, + { + "epoch": 0.15957985903164879, + "grad_norm": 0.9095553032746777, + "learning_rate": 4.837706800654197e-06, + "loss": 0.1769, + "step": 1732 + }, + { + "epoch": 0.15967199520891878, + "grad_norm": 1.0303333847098155, + "learning_rate": 4.83743669376162e-06, + "loss": 0.1803, + "step": 1733 + }, + { + "epoch": 0.15976413138618878, + "grad_norm": 0.963888249747459, + "learning_rate": 4.8371663698377955e-06, + "loss": 0.1683, + "step": 1734 + }, + { + "epoch": 0.1598562675634588, + "grad_norm": 0.9576505684732173, + "learning_rate": 4.836895828907822e-06, + "loss": 0.1851, + "step": 1735 + }, + { + "epoch": 0.1599484037407288, + "grad_norm": 1.0554891537306332, + "learning_rate": 4.836625070996818e-06, + "loss": 0.188, + "step": 1736 + }, + { + "epoch": 0.1600405399179988, + "grad_norm": 1.030216874614439, + "learning_rate": 4.836354096129926e-06, + "loss": 0.1659, + "step": 1737 + }, + { + "epoch": 0.1601326760952688, + "grad_norm": 1.0900760942160297, + "learning_rate": 4.8360829043323046e-06, + "loss": 0.1832, + "step": 1738 + }, + { + "epoch": 0.16022481227253882, + "grad_norm": 0.9371181926316801, + "learning_rate": 4.835811495629134e-06, + "loss": 0.175, + "step": 1739 + }, + { + "epoch": 0.16031694844980882, + "grad_norm": 0.9495594727001155, + "learning_rate": 4.835539870045613e-06, + "loss": 0.1992, + "step": 1740 + }, + { + "epoch": 0.1604090846270788, + "grad_norm": 0.917342942507763, + "learning_rate": 4.8352680276069654e-06, + "loss": 0.1653, + "step": 1741 + }, + { + "epoch": 0.16050122080434884, + "grad_norm": 0.90525029954931, + "learning_rate": 4.83499596833843e-06, + "loss": 0.1795, + "step": 1742 + }, + { + "epoch": 0.16059335698161883, + "grad_norm": 1.0455965458196492, + "learning_rate": 4.834723692265268e-06, + "loss": 0.1817, + "step": 1743 + }, + { + "epoch": 0.16068549315888883, + "grad_norm": 0.959693505057054, + "learning_rate": 4.834451199412759e-06, + "loss": 0.1781, + "step": 1744 + }, + { + "epoch": 0.16077762933615886, + "grad_norm": 0.9527369490107168, + "learning_rate": 4.8341784898062056e-06, + "loss": 0.1819, + "step": 1745 + }, + { + "epoch": 0.16086976551342885, + "grad_norm": 0.9431052982892322, + "learning_rate": 4.833905563470928e-06, + "loss": 0.1706, + "step": 1746 + }, + { + "epoch": 0.16096190169069885, + "grad_norm": 1.0048907347623248, + "learning_rate": 4.833632420432267e-06, + "loss": 0.1831, + "step": 1747 + }, + { + "epoch": 0.16105403786796885, + "grad_norm": 1.0442032356650304, + "learning_rate": 4.833359060715586e-06, + "loss": 0.19, + "step": 1748 + }, + { + "epoch": 0.16114617404523887, + "grad_norm": 1.037538480498732, + "learning_rate": 4.8330854843462635e-06, + "loss": 0.1804, + "step": 1749 + }, + { + "epoch": 0.16123831022250887, + "grad_norm": 0.9676875781513236, + "learning_rate": 4.832811691349703e-06, + "loss": 0.1804, + "step": 1750 + }, + { + "epoch": 0.16133044639977887, + "grad_norm": 1.0135987525084371, + "learning_rate": 4.832537681751327e-06, + "loss": 0.1827, + "step": 1751 + }, + { + "epoch": 0.1614225825770489, + "grad_norm": 1.018385777602988, + "learning_rate": 4.832263455576576e-06, + "loss": 0.1972, + "step": 1752 + }, + { + "epoch": 0.1615147187543189, + "grad_norm": 1.1980934939018493, + "learning_rate": 4.8319890128509115e-06, + "loss": 0.1822, + "step": 1753 + }, + { + "epoch": 0.16160685493158888, + "grad_norm": 1.0080490206517214, + "learning_rate": 4.831714353599817e-06, + "loss": 0.1886, + "step": 1754 + }, + { + "epoch": 0.16169899110885888, + "grad_norm": 0.941493167404903, + "learning_rate": 4.831439477848793e-06, + "loss": 0.1725, + "step": 1755 + }, + { + "epoch": 0.1617911272861289, + "grad_norm": 0.9039272326667056, + "learning_rate": 4.831164385623362e-06, + "loss": 0.1589, + "step": 1756 + }, + { + "epoch": 0.1618832634633989, + "grad_norm": 0.9530095888259483, + "learning_rate": 4.830889076949069e-06, + "loss": 0.1746, + "step": 1757 + }, + { + "epoch": 0.1619753996406689, + "grad_norm": 0.905148587751177, + "learning_rate": 4.830613551851473e-06, + "loss": 0.1681, + "step": 1758 + }, + { + "epoch": 0.16206753581793892, + "grad_norm": 0.9308074744746171, + "learning_rate": 4.830337810356157e-06, + "loss": 0.1776, + "step": 1759 + }, + { + "epoch": 0.16215967199520892, + "grad_norm": 0.9875824921474106, + "learning_rate": 4.830061852488726e-06, + "loss": 0.1828, + "step": 1760 + }, + { + "epoch": 0.16225180817247892, + "grad_norm": 0.9813374242149387, + "learning_rate": 4.829785678274801e-06, + "loss": 0.1839, + "step": 1761 + }, + { + "epoch": 0.16234394434974894, + "grad_norm": 0.931234653866775, + "learning_rate": 4.829509287740024e-06, + "loss": 0.1788, + "step": 1762 + }, + { + "epoch": 0.16243608052701894, + "grad_norm": 0.9585368043301006, + "learning_rate": 4.82923268091006e-06, + "loss": 0.188, + "step": 1763 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 1.0074529442860354, + "learning_rate": 4.828955857810591e-06, + "loss": 0.191, + "step": 1764 + }, + { + "epoch": 0.16262035288155893, + "grad_norm": 0.9410361170473834, + "learning_rate": 4.828678818467319e-06, + "loss": 0.1763, + "step": 1765 + }, + { + "epoch": 0.16271248905882896, + "grad_norm": 1.0208109082183012, + "learning_rate": 4.828401562905969e-06, + "loss": 0.2075, + "step": 1766 + }, + { + "epoch": 0.16280462523609895, + "grad_norm": 0.9269680529094783, + "learning_rate": 4.828124091152283e-06, + "loss": 0.1737, + "step": 1767 + }, + { + "epoch": 0.16289676141336895, + "grad_norm": 1.0100855715032826, + "learning_rate": 4.827846403232024e-06, + "loss": 0.1928, + "step": 1768 + }, + { + "epoch": 0.16298889759063898, + "grad_norm": 0.9905609107994238, + "learning_rate": 4.827568499170977e-06, + "loss": 0.1722, + "step": 1769 + }, + { + "epoch": 0.16308103376790897, + "grad_norm": 1.0136821739673834, + "learning_rate": 4.8272903789949435e-06, + "loss": 0.1732, + "step": 1770 + }, + { + "epoch": 0.16317316994517897, + "grad_norm": 1.014178191785531, + "learning_rate": 4.8270120427297485e-06, + "loss": 0.1892, + "step": 1771 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.9882369844302558, + "learning_rate": 4.8267334904012345e-06, + "loss": 0.1798, + "step": 1772 + }, + { + "epoch": 0.163357442299719, + "grad_norm": 0.9940919783308403, + "learning_rate": 4.8264547220352655e-06, + "loss": 0.1862, + "step": 1773 + }, + { + "epoch": 0.163449578476989, + "grad_norm": 1.0582977739314734, + "learning_rate": 4.826175737657725e-06, + "loss": 0.1913, + "step": 1774 + }, + { + "epoch": 0.16354171465425899, + "grad_norm": 1.0787650948609633, + "learning_rate": 4.825896537294518e-06, + "loss": 0.1854, + "step": 1775 + }, + { + "epoch": 0.163633850831529, + "grad_norm": 1.0181516343182249, + "learning_rate": 4.825617120971566e-06, + "loss": 0.1747, + "step": 1776 + }, + { + "epoch": 0.163725987008799, + "grad_norm": 0.9365780887109146, + "learning_rate": 4.825337488714814e-06, + "loss": 0.185, + "step": 1777 + }, + { + "epoch": 0.163818123186069, + "grad_norm": 1.034805261001223, + "learning_rate": 4.825057640550226e-06, + "loss": 0.1875, + "step": 1778 + }, + { + "epoch": 0.16391025936333903, + "grad_norm": 1.0004955506710502, + "learning_rate": 4.824777576503786e-06, + "loss": 0.1832, + "step": 1779 + }, + { + "epoch": 0.16400239554060903, + "grad_norm": 1.045133789322791, + "learning_rate": 4.824497296601499e-06, + "loss": 0.1743, + "step": 1780 + }, + { + "epoch": 0.16409453171787902, + "grad_norm": 0.929674001723364, + "learning_rate": 4.8242168008693864e-06, + "loss": 0.1688, + "step": 1781 + }, + { + "epoch": 0.16418666789514902, + "grad_norm": 0.8909382328694191, + "learning_rate": 4.823936089333494e-06, + "loss": 0.1679, + "step": 1782 + }, + { + "epoch": 0.16427880407241904, + "grad_norm": 0.8714078813260372, + "learning_rate": 4.823655162019886e-06, + "loss": 0.1616, + "step": 1783 + }, + { + "epoch": 0.16437094024968904, + "grad_norm": 0.9117516027805643, + "learning_rate": 4.823374018954646e-06, + "loss": 0.1665, + "step": 1784 + }, + { + "epoch": 0.16446307642695904, + "grad_norm": 0.8647610420636894, + "learning_rate": 4.823092660163878e-06, + "loss": 0.1528, + "step": 1785 + }, + { + "epoch": 0.16455521260422906, + "grad_norm": 0.9530561108679019, + "learning_rate": 4.822811085673706e-06, + "loss": 0.1691, + "step": 1786 + }, + { + "epoch": 0.16464734878149906, + "grad_norm": 0.8638732798559001, + "learning_rate": 4.822529295510276e-06, + "loss": 0.1568, + "step": 1787 + }, + { + "epoch": 0.16473948495876906, + "grad_norm": 0.9641560836422756, + "learning_rate": 4.82224728969975e-06, + "loss": 0.1783, + "step": 1788 + }, + { + "epoch": 0.16483162113603905, + "grad_norm": 0.924759701849841, + "learning_rate": 4.821965068268314e-06, + "loss": 0.1727, + "step": 1789 + }, + { + "epoch": 0.16492375731330908, + "grad_norm": 0.9362299471705925, + "learning_rate": 4.82168263124217e-06, + "loss": 0.1741, + "step": 1790 + }, + { + "epoch": 0.16501589349057907, + "grad_norm": 0.981951133307446, + "learning_rate": 4.8213999786475455e-06, + "loss": 0.186, + "step": 1791 + }, + { + "epoch": 0.16510802966784907, + "grad_norm": 0.8704969480803476, + "learning_rate": 4.821117110510683e-06, + "loss": 0.1634, + "step": 1792 + }, + { + "epoch": 0.1652001658451191, + "grad_norm": 0.9063406667410021, + "learning_rate": 4.820834026857846e-06, + "loss": 0.1793, + "step": 1793 + }, + { + "epoch": 0.1652923020223891, + "grad_norm": 1.0657161087119753, + "learning_rate": 4.820550727715321e-06, + "loss": 0.1889, + "step": 1794 + }, + { + "epoch": 0.1653844381996591, + "grad_norm": 0.9127685674712928, + "learning_rate": 4.820267213109409e-06, + "loss": 0.1724, + "step": 1795 + }, + { + "epoch": 0.16547657437692911, + "grad_norm": 0.9727997916206534, + "learning_rate": 4.8199834830664395e-06, + "loss": 0.1777, + "step": 1796 + }, + { + "epoch": 0.1655687105541991, + "grad_norm": 0.9452854756210244, + "learning_rate": 4.819699537612752e-06, + "loss": 0.1615, + "step": 1797 + }, + { + "epoch": 0.1656608467314691, + "grad_norm": 0.9578005282418585, + "learning_rate": 4.819415376774714e-06, + "loss": 0.1827, + "step": 1798 + }, + { + "epoch": 0.1657529829087391, + "grad_norm": 1.0597663881266928, + "learning_rate": 4.819131000578707e-06, + "loss": 0.1843, + "step": 1799 + }, + { + "epoch": 0.16584511908600913, + "grad_norm": 0.9342840629471308, + "learning_rate": 4.818846409051139e-06, + "loss": 0.1813, + "step": 1800 + }, + { + "epoch": 0.16593725526327913, + "grad_norm": 0.98252920230672, + "learning_rate": 4.818561602218431e-06, + "loss": 0.1838, + "step": 1801 + }, + { + "epoch": 0.16602939144054912, + "grad_norm": 0.9424563329013992, + "learning_rate": 4.818276580107029e-06, + "loss": 0.1702, + "step": 1802 + }, + { + "epoch": 0.16612152761781915, + "grad_norm": 0.9202413050049445, + "learning_rate": 4.817991342743396e-06, + "loss": 0.1735, + "step": 1803 + }, + { + "epoch": 0.16621366379508914, + "grad_norm": 0.9549194113487504, + "learning_rate": 4.81770589015402e-06, + "loss": 0.1796, + "step": 1804 + }, + { + "epoch": 0.16630579997235914, + "grad_norm": 0.9465694214625567, + "learning_rate": 4.8174202223654e-06, + "loss": 0.1843, + "step": 1805 + }, + { + "epoch": 0.16639793614962914, + "grad_norm": 0.9349905458192107, + "learning_rate": 4.8171343394040645e-06, + "loss": 0.1745, + "step": 1806 + }, + { + "epoch": 0.16649007232689916, + "grad_norm": 0.8796702727017657, + "learning_rate": 4.816848241296556e-06, + "loss": 0.1695, + "step": 1807 + }, + { + "epoch": 0.16658220850416916, + "grad_norm": 0.9207017502379715, + "learning_rate": 4.816561928069439e-06, + "loss": 0.1825, + "step": 1808 + }, + { + "epoch": 0.16667434468143916, + "grad_norm": 0.8798529220695297, + "learning_rate": 4.8162753997492965e-06, + "loss": 0.1758, + "step": 1809 + }, + { + "epoch": 0.16676648085870918, + "grad_norm": 1.010096798746228, + "learning_rate": 4.815988656362735e-06, + "loss": 0.1908, + "step": 1810 + }, + { + "epoch": 0.16685861703597918, + "grad_norm": 0.906477990452234, + "learning_rate": 4.815701697936377e-06, + "loss": 0.1817, + "step": 1811 + }, + { + "epoch": 0.16695075321324918, + "grad_norm": 0.9584264334027195, + "learning_rate": 4.815414524496867e-06, + "loss": 0.1689, + "step": 1812 + }, + { + "epoch": 0.1670428893905192, + "grad_norm": 1.0568204132693753, + "learning_rate": 4.8151271360708704e-06, + "loss": 0.1817, + "step": 1813 + }, + { + "epoch": 0.1671350255677892, + "grad_norm": 1.0170922070435067, + "learning_rate": 4.814839532685069e-06, + "loss": 0.1794, + "step": 1814 + }, + { + "epoch": 0.1672271617450592, + "grad_norm": 0.9452343091927007, + "learning_rate": 4.814551714366168e-06, + "loss": 0.1775, + "step": 1815 + }, + { + "epoch": 0.1673192979223292, + "grad_norm": 1.0103963129300657, + "learning_rate": 4.814263681140892e-06, + "loss": 0.1789, + "step": 1816 + }, + { + "epoch": 0.16741143409959922, + "grad_norm": 1.057193287404099, + "learning_rate": 4.813975433035984e-06, + "loss": 0.18, + "step": 1817 + }, + { + "epoch": 0.1675035702768692, + "grad_norm": 1.0599542152721877, + "learning_rate": 4.813686970078207e-06, + "loss": 0.1861, + "step": 1818 + }, + { + "epoch": 0.1675957064541392, + "grad_norm": 0.9410257979008102, + "learning_rate": 4.813398292294345e-06, + "loss": 0.1782, + "step": 1819 + }, + { + "epoch": 0.16768784263140923, + "grad_norm": 0.9106844898045353, + "learning_rate": 4.813109399711204e-06, + "loss": 0.1605, + "step": 1820 + }, + { + "epoch": 0.16777997880867923, + "grad_norm": 0.9510723876689734, + "learning_rate": 4.812820292355607e-06, + "loss": 0.1768, + "step": 1821 + }, + { + "epoch": 0.16787211498594923, + "grad_norm": 0.9713976398794669, + "learning_rate": 4.812530970254396e-06, + "loss": 0.1768, + "step": 1822 + }, + { + "epoch": 0.16796425116321922, + "grad_norm": 0.9726574938485535, + "learning_rate": 4.812241433434436e-06, + "loss": 0.188, + "step": 1823 + }, + { + "epoch": 0.16805638734048925, + "grad_norm": 1.0053761517023432, + "learning_rate": 4.81195168192261e-06, + "loss": 0.1749, + "step": 1824 + }, + { + "epoch": 0.16814852351775925, + "grad_norm": 1.0033991069529102, + "learning_rate": 4.81166171574582e-06, + "loss": 0.1782, + "step": 1825 + }, + { + "epoch": 0.16824065969502924, + "grad_norm": 0.9968269303113185, + "learning_rate": 4.811371534930993e-06, + "loss": 0.1805, + "step": 1826 + }, + { + "epoch": 0.16833279587229927, + "grad_norm": 0.9615227789782784, + "learning_rate": 4.8110811395050695e-06, + "loss": 0.1696, + "step": 1827 + }, + { + "epoch": 0.16842493204956926, + "grad_norm": 0.9821799299380174, + "learning_rate": 4.810790529495013e-06, + "loss": 0.1791, + "step": 1828 + }, + { + "epoch": 0.16851706822683926, + "grad_norm": 0.9811162841683847, + "learning_rate": 4.810499704927808e-06, + "loss": 0.1723, + "step": 1829 + }, + { + "epoch": 0.1686092044041093, + "grad_norm": 1.1244318132152042, + "learning_rate": 4.810208665830456e-06, + "loss": 0.1907, + "step": 1830 + }, + { + "epoch": 0.16870134058137928, + "grad_norm": 0.9817177234015824, + "learning_rate": 4.809917412229981e-06, + "loss": 0.1788, + "step": 1831 + }, + { + "epoch": 0.16879347675864928, + "grad_norm": 0.9184917085261609, + "learning_rate": 4.809625944153425e-06, + "loss": 0.1617, + "step": 1832 + }, + { + "epoch": 0.16888561293591928, + "grad_norm": 0.9234623678622588, + "learning_rate": 4.8093342616278525e-06, + "loss": 0.1809, + "step": 1833 + }, + { + "epoch": 0.1689777491131893, + "grad_norm": 0.9581615643494954, + "learning_rate": 4.809042364680345e-06, + "loss": 0.1634, + "step": 1834 + }, + { + "epoch": 0.1690698852904593, + "grad_norm": 0.9697985187154081, + "learning_rate": 4.808750253338006e-06, + "loss": 0.1871, + "step": 1835 + }, + { + "epoch": 0.1691620214677293, + "grad_norm": 0.9221846846576924, + "learning_rate": 4.8084579276279565e-06, + "loss": 0.176, + "step": 1836 + }, + { + "epoch": 0.16925415764499932, + "grad_norm": 0.962272543721862, + "learning_rate": 4.80816538757734e-06, + "loss": 0.1954, + "step": 1837 + }, + { + "epoch": 0.16934629382226932, + "grad_norm": 0.8740315446603265, + "learning_rate": 4.80787263321332e-06, + "loss": 0.1638, + "step": 1838 + }, + { + "epoch": 0.16943842999953931, + "grad_norm": 0.9401843869180659, + "learning_rate": 4.8075796645630764e-06, + "loss": 0.1839, + "step": 1839 + }, + { + "epoch": 0.1695305661768093, + "grad_norm": 0.9919651044339668, + "learning_rate": 4.807286481653813e-06, + "loss": 0.1719, + "step": 1840 + }, + { + "epoch": 0.16962270235407934, + "grad_norm": 0.931631600254304, + "learning_rate": 4.806993084512752e-06, + "loss": 0.1786, + "step": 1841 + }, + { + "epoch": 0.16971483853134933, + "grad_norm": 1.0388517287914885, + "learning_rate": 4.806699473167134e-06, + "loss": 0.1916, + "step": 1842 + }, + { + "epoch": 0.16980697470861933, + "grad_norm": 0.8913690534162603, + "learning_rate": 4.806405647644222e-06, + "loss": 0.1826, + "step": 1843 + }, + { + "epoch": 0.16989911088588935, + "grad_norm": 1.0257086366954042, + "learning_rate": 4.806111607971298e-06, + "loss": 0.1751, + "step": 1844 + }, + { + "epoch": 0.16999124706315935, + "grad_norm": 0.9335569005047046, + "learning_rate": 4.805817354175663e-06, + "loss": 0.1715, + "step": 1845 + }, + { + "epoch": 0.17008338324042935, + "grad_norm": 0.9491430087260864, + "learning_rate": 4.805522886284637e-06, + "loss": 0.1731, + "step": 1846 + }, + { + "epoch": 0.17017551941769937, + "grad_norm": 0.8936870287779184, + "learning_rate": 4.8052282043255635e-06, + "loss": 0.1777, + "step": 1847 + }, + { + "epoch": 0.17026765559496937, + "grad_norm": 0.8902565793090995, + "learning_rate": 4.804933308325804e-06, + "loss": 0.1694, + "step": 1848 + }, + { + "epoch": 0.17035979177223937, + "grad_norm": 0.9985363911822385, + "learning_rate": 4.8046381983127385e-06, + "loss": 0.2018, + "step": 1849 + }, + { + "epoch": 0.17045192794950936, + "grad_norm": 0.9426301604024907, + "learning_rate": 4.8043428743137675e-06, + "loss": 0.1867, + "step": 1850 + }, + { + "epoch": 0.1705440641267794, + "grad_norm": 0.8550027014618022, + "learning_rate": 4.8040473363563136e-06, + "loss": 0.1637, + "step": 1851 + }, + { + "epoch": 0.17063620030404938, + "grad_norm": 0.9734804002388993, + "learning_rate": 4.8037515844678165e-06, + "loss": 0.1866, + "step": 1852 + }, + { + "epoch": 0.17072833648131938, + "grad_norm": 0.9928693625462343, + "learning_rate": 4.803455618675736e-06, + "loss": 0.1784, + "step": 1853 + }, + { + "epoch": 0.1708204726585894, + "grad_norm": 0.9402222477752512, + "learning_rate": 4.803159439007554e-06, + "loss": 0.1678, + "step": 1854 + }, + { + "epoch": 0.1709126088358594, + "grad_norm": 0.9737747885881045, + "learning_rate": 4.80286304549077e-06, + "loss": 0.1844, + "step": 1855 + }, + { + "epoch": 0.1710047450131294, + "grad_norm": 0.8979782554294682, + "learning_rate": 4.802566438152904e-06, + "loss": 0.1707, + "step": 1856 + }, + { + "epoch": 0.1710968811903994, + "grad_norm": 1.0231193949257447, + "learning_rate": 4.802269617021497e-06, + "loss": 0.1965, + "step": 1857 + }, + { + "epoch": 0.17118901736766942, + "grad_norm": 0.9882485601160279, + "learning_rate": 4.801972582124108e-06, + "loss": 0.1855, + "step": 1858 + }, + { + "epoch": 0.17128115354493942, + "grad_norm": 0.9804779701977544, + "learning_rate": 4.801675333488317e-06, + "loss": 0.1846, + "step": 1859 + }, + { + "epoch": 0.17137328972220942, + "grad_norm": 0.9887115862803393, + "learning_rate": 4.801377871141723e-06, + "loss": 0.1818, + "step": 1860 + }, + { + "epoch": 0.17146542589947944, + "grad_norm": 0.9908343960616961, + "learning_rate": 4.801080195111948e-06, + "loss": 0.1728, + "step": 1861 + }, + { + "epoch": 0.17155756207674944, + "grad_norm": 0.9121926413556142, + "learning_rate": 4.800782305426628e-06, + "loss": 0.1736, + "step": 1862 + }, + { + "epoch": 0.17164969825401943, + "grad_norm": 1.0130289652715774, + "learning_rate": 4.800484202113423e-06, + "loss": 0.1725, + "step": 1863 + }, + { + "epoch": 0.17174183443128946, + "grad_norm": 0.9997676957196446, + "learning_rate": 4.800185885200013e-06, + "loss": 0.1723, + "step": 1864 + }, + { + "epoch": 0.17183397060855946, + "grad_norm": 0.9695560315706072, + "learning_rate": 4.7998873547140954e-06, + "loss": 0.1721, + "step": 1865 + }, + { + "epoch": 0.17192610678582945, + "grad_norm": 0.9735293306486302, + "learning_rate": 4.799588610683389e-06, + "loss": 0.1845, + "step": 1866 + }, + { + "epoch": 0.17201824296309945, + "grad_norm": 0.9944144850116116, + "learning_rate": 4.799289653135633e-06, + "loss": 0.1756, + "step": 1867 + }, + { + "epoch": 0.17211037914036947, + "grad_norm": 0.9316812929228392, + "learning_rate": 4.7989904820985854e-06, + "loss": 0.1747, + "step": 1868 + }, + { + "epoch": 0.17220251531763947, + "grad_norm": 0.9486629348319279, + "learning_rate": 4.798691097600024e-06, + "loss": 0.1783, + "step": 1869 + }, + { + "epoch": 0.17229465149490947, + "grad_norm": 0.9921250810385875, + "learning_rate": 4.798391499667747e-06, + "loss": 0.1806, + "step": 1870 + }, + { + "epoch": 0.1723867876721795, + "grad_norm": 0.9467281601207117, + "learning_rate": 4.798091688329572e-06, + "loss": 0.1792, + "step": 1871 + }, + { + "epoch": 0.1724789238494495, + "grad_norm": 0.8938437016708146, + "learning_rate": 4.7977916636133365e-06, + "loss": 0.166, + "step": 1872 + }, + { + "epoch": 0.17257106002671949, + "grad_norm": 0.9033608634661399, + "learning_rate": 4.797491425546898e-06, + "loss": 0.1624, + "step": 1873 + }, + { + "epoch": 0.17266319620398948, + "grad_norm": 1.0093173143492151, + "learning_rate": 4.797190974158133e-06, + "loss": 0.1804, + "step": 1874 + }, + { + "epoch": 0.1727553323812595, + "grad_norm": 0.9269759911665011, + "learning_rate": 4.796890309474938e-06, + "loss": 0.1924, + "step": 1875 + }, + { + "epoch": 0.1728474685585295, + "grad_norm": 0.9511087442600202, + "learning_rate": 4.796589431525232e-06, + "loss": 0.1717, + "step": 1876 + }, + { + "epoch": 0.1729396047357995, + "grad_norm": 0.9727630535856038, + "learning_rate": 4.796288340336949e-06, + "loss": 0.1824, + "step": 1877 + }, + { + "epoch": 0.17303174091306953, + "grad_norm": 0.9098476882737225, + "learning_rate": 4.795987035938047e-06, + "loss": 0.1598, + "step": 1878 + }, + { + "epoch": 0.17312387709033952, + "grad_norm": 0.9586208598334143, + "learning_rate": 4.795685518356501e-06, + "loss": 0.176, + "step": 1879 + }, + { + "epoch": 0.17321601326760952, + "grad_norm": 0.9342244991941684, + "learning_rate": 4.795383787620308e-06, + "loss": 0.1622, + "step": 1880 + }, + { + "epoch": 0.17330814944487954, + "grad_norm": 0.9062253340794538, + "learning_rate": 4.795081843757483e-06, + "loss": 0.1579, + "step": 1881 + }, + { + "epoch": 0.17340028562214954, + "grad_norm": 0.93803547801233, + "learning_rate": 4.794779686796062e-06, + "loss": 0.166, + "step": 1882 + }, + { + "epoch": 0.17349242179941954, + "grad_norm": 0.9996096252341287, + "learning_rate": 4.794477316764101e-06, + "loss": 0.1842, + "step": 1883 + }, + { + "epoch": 0.17358455797668954, + "grad_norm": 0.9514554221599457, + "learning_rate": 4.794174733689672e-06, + "loss": 0.1672, + "step": 1884 + }, + { + "epoch": 0.17367669415395956, + "grad_norm": 0.9379721576841644, + "learning_rate": 4.793871937600874e-06, + "loss": 0.1689, + "step": 1885 + }, + { + "epoch": 0.17376883033122956, + "grad_norm": 0.9887485087873079, + "learning_rate": 4.7935689285258195e-06, + "loss": 0.1695, + "step": 1886 + }, + { + "epoch": 0.17386096650849955, + "grad_norm": 0.9185524648048645, + "learning_rate": 4.793265706492643e-06, + "loss": 0.175, + "step": 1887 + }, + { + "epoch": 0.17395310268576958, + "grad_norm": 0.944094137645397, + "learning_rate": 4.792962271529499e-06, + "loss": 0.1694, + "step": 1888 + }, + { + "epoch": 0.17404523886303958, + "grad_norm": 0.9907587903074953, + "learning_rate": 4.792658623664561e-06, + "loss": 0.1788, + "step": 1889 + }, + { + "epoch": 0.17413737504030957, + "grad_norm": 0.8874671631140197, + "learning_rate": 4.792354762926023e-06, + "loss": 0.1604, + "step": 1890 + }, + { + "epoch": 0.17422951121757957, + "grad_norm": 0.917699450885086, + "learning_rate": 4.792050689342098e-06, + "loss": 0.1882, + "step": 1891 + }, + { + "epoch": 0.1743216473948496, + "grad_norm": 0.8728825937831589, + "learning_rate": 4.791746402941021e-06, + "loss": 0.1583, + "step": 1892 + }, + { + "epoch": 0.1744137835721196, + "grad_norm": 0.999026083350806, + "learning_rate": 4.791441903751043e-06, + "loss": 0.1835, + "step": 1893 + }, + { + "epoch": 0.1745059197493896, + "grad_norm": 0.9922845901509115, + "learning_rate": 4.791137191800438e-06, + "loss": 0.1817, + "step": 1894 + }, + { + "epoch": 0.1745980559266596, + "grad_norm": 0.9063701103169883, + "learning_rate": 4.790832267117498e-06, + "loss": 0.1633, + "step": 1895 + }, + { + "epoch": 0.1746901921039296, + "grad_norm": 0.9812700186466582, + "learning_rate": 4.790527129730536e-06, + "loss": 0.1825, + "step": 1896 + }, + { + "epoch": 0.1747823282811996, + "grad_norm": 0.9791016126703898, + "learning_rate": 4.790221779667883e-06, + "loss": 0.1834, + "step": 1897 + }, + { + "epoch": 0.17487446445846963, + "grad_norm": 1.019954906964735, + "learning_rate": 4.789916216957892e-06, + "loss": 0.1793, + "step": 1898 + }, + { + "epoch": 0.17496660063573963, + "grad_norm": 1.0658787511106087, + "learning_rate": 4.789610441628932e-06, + "loss": 0.1805, + "step": 1899 + }, + { + "epoch": 0.17505873681300962, + "grad_norm": 0.9131631203861305, + "learning_rate": 4.789304453709398e-06, + "loss": 0.1692, + "step": 1900 + }, + { + "epoch": 0.17515087299027962, + "grad_norm": 0.9370079179863117, + "learning_rate": 4.788998253227698e-06, + "loss": 0.1774, + "step": 1901 + }, + { + "epoch": 0.17524300916754965, + "grad_norm": 0.9204477318927515, + "learning_rate": 4.788691840212264e-06, + "loss": 0.1696, + "step": 1902 + }, + { + "epoch": 0.17533514534481964, + "grad_norm": 0.9702024390215709, + "learning_rate": 4.788385214691546e-06, + "loss": 0.1758, + "step": 1903 + }, + { + "epoch": 0.17542728152208964, + "grad_norm": 1.0381178605677073, + "learning_rate": 4.788078376694017e-06, + "loss": 0.18, + "step": 1904 + }, + { + "epoch": 0.17551941769935966, + "grad_norm": 0.9955510617085207, + "learning_rate": 4.787771326248162e-06, + "loss": 0.1738, + "step": 1905 + }, + { + "epoch": 0.17561155387662966, + "grad_norm": 0.8831789806878049, + "learning_rate": 4.787464063382493e-06, + "loss": 0.157, + "step": 1906 + }, + { + "epoch": 0.17570369005389966, + "grad_norm": 0.9652670549320401, + "learning_rate": 4.787156588125541e-06, + "loss": 0.1738, + "step": 1907 + }, + { + "epoch": 0.17579582623116966, + "grad_norm": 1.016867250112242, + "learning_rate": 4.786848900505852e-06, + "loss": 0.197, + "step": 1908 + }, + { + "epoch": 0.17588796240843968, + "grad_norm": 0.8937479093266593, + "learning_rate": 4.786541000551997e-06, + "loss": 0.1709, + "step": 1909 + }, + { + "epoch": 0.17598009858570968, + "grad_norm": 0.9348137203113728, + "learning_rate": 4.786232888292564e-06, + "loss": 0.1696, + "step": 1910 + }, + { + "epoch": 0.17607223476297967, + "grad_norm": 0.9606862462398517, + "learning_rate": 4.785924563756162e-06, + "loss": 0.1778, + "step": 1911 + }, + { + "epoch": 0.1761643709402497, + "grad_norm": 0.9522177279868732, + "learning_rate": 4.785616026971418e-06, + "loss": 0.1802, + "step": 1912 + }, + { + "epoch": 0.1762565071175197, + "grad_norm": 0.9383905997510276, + "learning_rate": 4.78530727796698e-06, + "loss": 0.1735, + "step": 1913 + }, + { + "epoch": 0.1763486432947897, + "grad_norm": 0.9510793409227706, + "learning_rate": 4.784998316771515e-06, + "loss": 0.1739, + "step": 1914 + }, + { + "epoch": 0.17644077947205972, + "grad_norm": 1.0062514891320191, + "learning_rate": 4.784689143413711e-06, + "loss": 0.1829, + "step": 1915 + }, + { + "epoch": 0.1765329156493297, + "grad_norm": 0.906545128875752, + "learning_rate": 4.784379757922273e-06, + "loss": 0.1647, + "step": 1916 + }, + { + "epoch": 0.1766250518265997, + "grad_norm": 0.9973752258860914, + "learning_rate": 4.78407016032593e-06, + "loss": 0.183, + "step": 1917 + }, + { + "epoch": 0.1767171880038697, + "grad_norm": 0.9141803308976617, + "learning_rate": 4.783760350653426e-06, + "loss": 0.1575, + "step": 1918 + }, + { + "epoch": 0.17680932418113973, + "grad_norm": 0.9502312399313518, + "learning_rate": 4.783450328933527e-06, + "loss": 0.1746, + "step": 1919 + }, + { + "epoch": 0.17690146035840973, + "grad_norm": 1.0723880151854668, + "learning_rate": 4.78314009519502e-06, + "loss": 0.1945, + "step": 1920 + }, + { + "epoch": 0.17699359653567973, + "grad_norm": 0.8964554327793061, + "learning_rate": 4.782829649466709e-06, + "loss": 0.1613, + "step": 1921 + }, + { + "epoch": 0.17708573271294975, + "grad_norm": 0.9319472469096941, + "learning_rate": 4.78251899177742e-06, + "loss": 0.1661, + "step": 1922 + }, + { + "epoch": 0.17717786889021975, + "grad_norm": 1.0237495405647998, + "learning_rate": 4.7822081221559965e-06, + "loss": 0.1866, + "step": 1923 + }, + { + "epoch": 0.17727000506748974, + "grad_norm": 0.8369732203496116, + "learning_rate": 4.781897040631304e-06, + "loss": 0.1652, + "step": 1924 + }, + { + "epoch": 0.17736214124475974, + "grad_norm": 1.0220020925106441, + "learning_rate": 4.781585747232224e-06, + "loss": 0.1963, + "step": 1925 + }, + { + "epoch": 0.17745427742202977, + "grad_norm": 0.9377498492935152, + "learning_rate": 4.781274241987664e-06, + "loss": 0.1687, + "step": 1926 + }, + { + "epoch": 0.17754641359929976, + "grad_norm": 0.9221667519044533, + "learning_rate": 4.7809625249265436e-06, + "loss": 0.1633, + "step": 1927 + }, + { + "epoch": 0.17763854977656976, + "grad_norm": 1.096594760082437, + "learning_rate": 4.780650596077808e-06, + "loss": 0.1839, + "step": 1928 + }, + { + "epoch": 0.17773068595383978, + "grad_norm": 1.0274699250823436, + "learning_rate": 4.780338455470419e-06, + "loss": 0.1784, + "step": 1929 + }, + { + "epoch": 0.17782282213110978, + "grad_norm": 0.9173298175308037, + "learning_rate": 4.780026103133358e-06, + "loss": 0.1671, + "step": 1930 + }, + { + "epoch": 0.17791495830837978, + "grad_norm": 1.0241515678846906, + "learning_rate": 4.7797135390956294e-06, + "loss": 0.1667, + "step": 1931 + }, + { + "epoch": 0.1780070944856498, + "grad_norm": 1.0382108734004492, + "learning_rate": 4.779400763386253e-06, + "loss": 0.1912, + "step": 1932 + }, + { + "epoch": 0.1780992306629198, + "grad_norm": 0.9695327271031154, + "learning_rate": 4.77908777603427e-06, + "loss": 0.1753, + "step": 1933 + }, + { + "epoch": 0.1781913668401898, + "grad_norm": 1.038422251537585, + "learning_rate": 4.778774577068741e-06, + "loss": 0.1856, + "step": 1934 + }, + { + "epoch": 0.1782835030174598, + "grad_norm": 0.8941524559172731, + "learning_rate": 4.778461166518748e-06, + "loss": 0.162, + "step": 1935 + }, + { + "epoch": 0.17837563919472982, + "grad_norm": 0.9313317705606107, + "learning_rate": 4.778147544413392e-06, + "loss": 0.1728, + "step": 1936 + }, + { + "epoch": 0.17846777537199982, + "grad_norm": 1.0519701170000495, + "learning_rate": 4.777833710781789e-06, + "loss": 0.1957, + "step": 1937 + }, + { + "epoch": 0.1785599115492698, + "grad_norm": 1.0387981318355688, + "learning_rate": 4.777519665653082e-06, + "loss": 0.1878, + "step": 1938 + }, + { + "epoch": 0.17865204772653984, + "grad_norm": 0.9444838158399083, + "learning_rate": 4.777205409056429e-06, + "loss": 0.1779, + "step": 1939 + }, + { + "epoch": 0.17874418390380983, + "grad_norm": 0.9575307026752148, + "learning_rate": 4.776890941021008e-06, + "loss": 0.2017, + "step": 1940 + }, + { + "epoch": 0.17883632008107983, + "grad_norm": 1.0234915939858733, + "learning_rate": 4.776576261576018e-06, + "loss": 0.1722, + "step": 1941 + }, + { + "epoch": 0.17892845625834983, + "grad_norm": 0.9803379275090085, + "learning_rate": 4.776261370750678e-06, + "loss": 0.1706, + "step": 1942 + }, + { + "epoch": 0.17902059243561985, + "grad_norm": 0.9358718176258201, + "learning_rate": 4.775946268574224e-06, + "loss": 0.1763, + "step": 1943 + }, + { + "epoch": 0.17911272861288985, + "grad_norm": 0.9786414947095523, + "learning_rate": 4.775630955075915e-06, + "loss": 0.1992, + "step": 1944 + }, + { + "epoch": 0.17920486479015985, + "grad_norm": 1.0318275409794424, + "learning_rate": 4.775315430285026e-06, + "loss": 0.1981, + "step": 1945 + }, + { + "epoch": 0.17929700096742987, + "grad_norm": 0.9631276712451919, + "learning_rate": 4.7749996942308546e-06, + "loss": 0.1888, + "step": 1946 + }, + { + "epoch": 0.17938913714469987, + "grad_norm": 1.0003266569020262, + "learning_rate": 4.774683746942717e-06, + "loss": 0.1842, + "step": 1947 + }, + { + "epoch": 0.17948127332196986, + "grad_norm": 0.9265546188995342, + "learning_rate": 4.774367588449948e-06, + "loss": 0.1578, + "step": 1948 + }, + { + "epoch": 0.1795734094992399, + "grad_norm": 0.9009805935055492, + "learning_rate": 4.774051218781904e-06, + "loss": 0.1632, + "step": 1949 + }, + { + "epoch": 0.17966554567650989, + "grad_norm": 0.9809819232416425, + "learning_rate": 4.77373463796796e-06, + "loss": 0.1786, + "step": 1950 + }, + { + "epoch": 0.17975768185377988, + "grad_norm": 1.0260467345453894, + "learning_rate": 4.7734178460375105e-06, + "loss": 0.1657, + "step": 1951 + }, + { + "epoch": 0.17984981803104988, + "grad_norm": 0.9280186899004385, + "learning_rate": 4.773100843019969e-06, + "loss": 0.1662, + "step": 1952 + }, + { + "epoch": 0.1799419542083199, + "grad_norm": 0.9861324889794633, + "learning_rate": 4.7727836289447685e-06, + "loss": 0.1815, + "step": 1953 + }, + { + "epoch": 0.1800340903855899, + "grad_norm": 0.9184406579084488, + "learning_rate": 4.7724662038413646e-06, + "loss": 0.1805, + "step": 1954 + }, + { + "epoch": 0.1801262265628599, + "grad_norm": 1.065223211061827, + "learning_rate": 4.772148567739229e-06, + "loss": 0.172, + "step": 1955 + }, + { + "epoch": 0.18021836274012992, + "grad_norm": 0.9539061206964681, + "learning_rate": 4.7718307206678535e-06, + "loss": 0.1828, + "step": 1956 + }, + { + "epoch": 0.18031049891739992, + "grad_norm": 0.9148835147838171, + "learning_rate": 4.7715126626567525e-06, + "loss": 0.1541, + "step": 1957 + }, + { + "epoch": 0.18040263509466992, + "grad_norm": 0.9232041932852285, + "learning_rate": 4.7711943937354555e-06, + "loss": 0.1751, + "step": 1958 + }, + { + "epoch": 0.18049477127193994, + "grad_norm": 0.9216980915355115, + "learning_rate": 4.770875913933515e-06, + "loss": 0.1851, + "step": 1959 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 0.9540739614506976, + "learning_rate": 4.770557223280501e-06, + "loss": 0.1919, + "step": 1960 + }, + { + "epoch": 0.18067904362647993, + "grad_norm": 0.9591787025956806, + "learning_rate": 4.7702383218060044e-06, + "loss": 0.1772, + "step": 1961 + }, + { + "epoch": 0.18077117980374993, + "grad_norm": 1.059574276391285, + "learning_rate": 4.769919209539635e-06, + "loss": 0.2, + "step": 1962 + }, + { + "epoch": 0.18086331598101996, + "grad_norm": 1.0242113087334594, + "learning_rate": 4.769599886511024e-06, + "loss": 0.1847, + "step": 1963 + }, + { + "epoch": 0.18095545215828995, + "grad_norm": 0.8678928771391421, + "learning_rate": 4.769280352749817e-06, + "loss": 0.1636, + "step": 1964 + }, + { + "epoch": 0.18104758833555995, + "grad_norm": 0.9217600598060104, + "learning_rate": 4.768960608285688e-06, + "loss": 0.1729, + "step": 1965 + }, + { + "epoch": 0.18113972451282997, + "grad_norm": 0.8802794485229976, + "learning_rate": 4.76864065314832e-06, + "loss": 0.1748, + "step": 1966 + }, + { + "epoch": 0.18123186069009997, + "grad_norm": 0.9870852789685146, + "learning_rate": 4.768320487367424e-06, + "loss": 0.1683, + "step": 1967 + }, + { + "epoch": 0.18132399686736997, + "grad_norm": 0.9296981723017914, + "learning_rate": 4.768000110972727e-06, + "loss": 0.1689, + "step": 1968 + }, + { + "epoch": 0.18141613304463997, + "grad_norm": 0.9677522771888776, + "learning_rate": 4.767679523993976e-06, + "loss": 0.1883, + "step": 1969 + }, + { + "epoch": 0.18150826922191, + "grad_norm": 0.9016253481332323, + "learning_rate": 4.767358726460936e-06, + "loss": 0.1605, + "step": 1970 + }, + { + "epoch": 0.18160040539918, + "grad_norm": 0.9058389324448571, + "learning_rate": 4.7670377184033944e-06, + "loss": 0.1687, + "step": 1971 + }, + { + "epoch": 0.18169254157644998, + "grad_norm": 1.017821513904742, + "learning_rate": 4.7667164998511574e-06, + "loss": 0.1911, + "step": 1972 + }, + { + "epoch": 0.18178467775372, + "grad_norm": 0.9469304159614529, + "learning_rate": 4.766395070834049e-06, + "loss": 0.1783, + "step": 1973 + }, + { + "epoch": 0.18187681393099, + "grad_norm": 0.9290326425937189, + "learning_rate": 4.7660734313819135e-06, + "loss": 0.1666, + "step": 1974 + }, + { + "epoch": 0.18196895010826, + "grad_norm": 0.907021416145737, + "learning_rate": 4.765751581524617e-06, + "loss": 0.1551, + "step": 1975 + }, + { + "epoch": 0.18206108628553003, + "grad_norm": 0.9225218369066349, + "learning_rate": 4.765429521292042e-06, + "loss": 0.1812, + "step": 1976 + }, + { + "epoch": 0.18215322246280002, + "grad_norm": 0.9316138857926441, + "learning_rate": 4.765107250714093e-06, + "loss": 0.17, + "step": 1977 + }, + { + "epoch": 0.18224535864007002, + "grad_norm": 0.9764313721990456, + "learning_rate": 4.764784769820691e-06, + "loss": 0.1843, + "step": 1978 + }, + { + "epoch": 0.18233749481734002, + "grad_norm": 0.9237308847508986, + "learning_rate": 4.76446207864178e-06, + "loss": 0.1797, + "step": 1979 + }, + { + "epoch": 0.18242963099461004, + "grad_norm": 0.9644001296709762, + "learning_rate": 4.764139177207321e-06, + "loss": 0.187, + "step": 1980 + }, + { + "epoch": 0.18252176717188004, + "grad_norm": 0.8821755436379798, + "learning_rate": 4.763816065547295e-06, + "loss": 0.1686, + "step": 1981 + }, + { + "epoch": 0.18261390334915004, + "grad_norm": 0.9503521212580078, + "learning_rate": 4.763492743691705e-06, + "loss": 0.1591, + "step": 1982 + }, + { + "epoch": 0.18270603952642006, + "grad_norm": 0.9082913689491118, + "learning_rate": 4.7631692116705695e-06, + "loss": 0.176, + "step": 1983 + }, + { + "epoch": 0.18279817570369006, + "grad_norm": 0.9100042108564763, + "learning_rate": 4.76284546951393e-06, + "loss": 0.1638, + "step": 1984 + }, + { + "epoch": 0.18289031188096005, + "grad_norm": 1.0130194245152968, + "learning_rate": 4.762521517251844e-06, + "loss": 0.1889, + "step": 1985 + }, + { + "epoch": 0.18298244805823005, + "grad_norm": 0.9483601310990681, + "learning_rate": 4.762197354914391e-06, + "loss": 0.1789, + "step": 1986 + }, + { + "epoch": 0.18307458423550008, + "grad_norm": 0.9978614650039507, + "learning_rate": 4.761872982531671e-06, + "loss": 0.188, + "step": 1987 + }, + { + "epoch": 0.18316672041277007, + "grad_norm": 1.0247173947884123, + "learning_rate": 4.761548400133801e-06, + "loss": 0.1727, + "step": 1988 + }, + { + "epoch": 0.18325885659004007, + "grad_norm": 1.043154599032093, + "learning_rate": 4.761223607750919e-06, + "loss": 0.1821, + "step": 1989 + }, + { + "epoch": 0.1833509927673101, + "grad_norm": 1.069427452926741, + "learning_rate": 4.760898605413182e-06, + "loss": 0.1953, + "step": 1990 + }, + { + "epoch": 0.1834431289445801, + "grad_norm": 0.9171673945079573, + "learning_rate": 4.760573393150766e-06, + "loss": 0.1682, + "step": 1991 + }, + { + "epoch": 0.1835352651218501, + "grad_norm": 0.9707474375781835, + "learning_rate": 4.760247970993867e-06, + "loss": 0.187, + "step": 1992 + }, + { + "epoch": 0.1836274012991201, + "grad_norm": 0.9115866782437404, + "learning_rate": 4.7599223389727e-06, + "loss": 0.166, + "step": 1993 + }, + { + "epoch": 0.1837195374763901, + "grad_norm": 0.9093587060651217, + "learning_rate": 4.759596497117501e-06, + "loss": 0.1621, + "step": 1994 + }, + { + "epoch": 0.1838116736536601, + "grad_norm": 0.939985698078961, + "learning_rate": 4.759270445458524e-06, + "loss": 0.186, + "step": 1995 + }, + { + "epoch": 0.1839038098309301, + "grad_norm": 0.9930713127947033, + "learning_rate": 4.758944184026043e-06, + "loss": 0.1755, + "step": 1996 + }, + { + "epoch": 0.18399594600820013, + "grad_norm": 0.9060670257127519, + "learning_rate": 4.758617712850352e-06, + "loss": 0.1656, + "step": 1997 + }, + { + "epoch": 0.18408808218547013, + "grad_norm": 0.9596014968387875, + "learning_rate": 4.758291031961763e-06, + "loss": 0.1774, + "step": 1998 + }, + { + "epoch": 0.18418021836274012, + "grad_norm": 0.9071035450847232, + "learning_rate": 4.757964141390609e-06, + "loss": 0.1779, + "step": 1999 + }, + { + "epoch": 0.18427235454001015, + "grad_norm": 0.916038591656752, + "learning_rate": 4.75763704116724e-06, + "loss": 0.1701, + "step": 2000 + }, + { + "epoch": 0.18427235454001015, + "eval_loss": 0.1747845858335495, + "eval_runtime": 299.1226, + "eval_samples_per_second": 23.459, + "eval_steps_per_second": 2.935, + "step": 2000 + }, + { + "epoch": 0.18436449071728014, + "grad_norm": 0.9620227340872637, + "learning_rate": 4.757309731322029e-06, + "loss": 0.1766, + "step": 2001 + }, + { + "epoch": 0.18445662689455014, + "grad_norm": 0.9936552481938895, + "learning_rate": 4.756982211885368e-06, + "loss": 0.185, + "step": 2002 + }, + { + "epoch": 0.18454876307182014, + "grad_norm": 0.9243700651653634, + "learning_rate": 4.756654482887665e-06, + "loss": 0.1629, + "step": 2003 + }, + { + "epoch": 0.18464089924909016, + "grad_norm": 0.9388336836899591, + "learning_rate": 4.756326544359351e-06, + "loss": 0.1677, + "step": 2004 + }, + { + "epoch": 0.18473303542636016, + "grad_norm": 0.9691896762005906, + "learning_rate": 4.7559983963308735e-06, + "loss": 0.159, + "step": 2005 + }, + { + "epoch": 0.18482517160363016, + "grad_norm": 0.91871871300768, + "learning_rate": 4.755670038832703e-06, + "loss": 0.1657, + "step": 2006 + }, + { + "epoch": 0.18491730778090018, + "grad_norm": 0.959152484145046, + "learning_rate": 4.755341471895325e-06, + "loss": 0.1813, + "step": 2007 + }, + { + "epoch": 0.18500944395817018, + "grad_norm": 0.9236567392647953, + "learning_rate": 4.75501269554925e-06, + "loss": 0.1655, + "step": 2008 + }, + { + "epoch": 0.18510158013544017, + "grad_norm": 0.917760585288594, + "learning_rate": 4.754683709825003e-06, + "loss": 0.1762, + "step": 2009 + }, + { + "epoch": 0.1851937163127102, + "grad_norm": 0.8881508952042282, + "learning_rate": 4.7543545147531314e-06, + "loss": 0.1677, + "step": 2010 + }, + { + "epoch": 0.1852858524899802, + "grad_norm": 0.9048943834112922, + "learning_rate": 4.754025110364201e-06, + "loss": 0.1648, + "step": 2011 + }, + { + "epoch": 0.1853779886672502, + "grad_norm": 0.9174376650297095, + "learning_rate": 4.753695496688795e-06, + "loss": 0.1641, + "step": 2012 + }, + { + "epoch": 0.1854701248445202, + "grad_norm": 0.9517016421197806, + "learning_rate": 4.753365673757521e-06, + "loss": 0.1783, + "step": 2013 + }, + { + "epoch": 0.18556226102179021, + "grad_norm": 0.9098170442182718, + "learning_rate": 4.7530356416010004e-06, + "loss": 0.1584, + "step": 2014 + }, + { + "epoch": 0.1856543971990602, + "grad_norm": 0.9253465519598166, + "learning_rate": 4.7527054002498785e-06, + "loss": 0.1692, + "step": 2015 + }, + { + "epoch": 0.1857465333763302, + "grad_norm": 0.9304349614009866, + "learning_rate": 4.752374949734818e-06, + "loss": 0.1764, + "step": 2016 + }, + { + "epoch": 0.18583866955360023, + "grad_norm": 0.9380871995269432, + "learning_rate": 4.752044290086501e-06, + "loss": 0.174, + "step": 2017 + }, + { + "epoch": 0.18593080573087023, + "grad_norm": 0.9037755594294037, + "learning_rate": 4.75171342133563e-06, + "loss": 0.1584, + "step": 2018 + }, + { + "epoch": 0.18602294190814023, + "grad_norm": 0.9530499133921239, + "learning_rate": 4.751382343512924e-06, + "loss": 0.1765, + "step": 2019 + }, + { + "epoch": 0.18611507808541022, + "grad_norm": 0.9837126600118509, + "learning_rate": 4.751051056649126e-06, + "loss": 0.1754, + "step": 2020 + }, + { + "epoch": 0.18620721426268025, + "grad_norm": 0.9610159521960262, + "learning_rate": 4.750719560774994e-06, + "loss": 0.1713, + "step": 2021 + }, + { + "epoch": 0.18629935043995025, + "grad_norm": 0.9192661443054622, + "learning_rate": 4.75038785592131e-06, + "loss": 0.1624, + "step": 2022 + }, + { + "epoch": 0.18639148661722024, + "grad_norm": 0.9674669683439728, + "learning_rate": 4.750055942118871e-06, + "loss": 0.1772, + "step": 2023 + }, + { + "epoch": 0.18648362279449027, + "grad_norm": 0.9778177795518106, + "learning_rate": 4.749723819398496e-06, + "loss": 0.1693, + "step": 2024 + }, + { + "epoch": 0.18657575897176026, + "grad_norm": 0.9154309692494186, + "learning_rate": 4.749391487791021e-06, + "loss": 0.167, + "step": 2025 + }, + { + "epoch": 0.18666789514903026, + "grad_norm": 0.9903773650928431, + "learning_rate": 4.749058947327306e-06, + "loss": 0.1675, + "step": 2026 + }, + { + "epoch": 0.18676003132630029, + "grad_norm": 0.9298684751001484, + "learning_rate": 4.7487261980382235e-06, + "loss": 0.1686, + "step": 2027 + }, + { + "epoch": 0.18685216750357028, + "grad_norm": 0.9751804305795381, + "learning_rate": 4.748393239954674e-06, + "loss": 0.1811, + "step": 2028 + }, + { + "epoch": 0.18694430368084028, + "grad_norm": 0.916861137524208, + "learning_rate": 4.748060073107568e-06, + "loss": 0.1852, + "step": 2029 + }, + { + "epoch": 0.18703643985811028, + "grad_norm": 0.9351359370165698, + "learning_rate": 4.747726697527844e-06, + "loss": 0.1744, + "step": 2030 + }, + { + "epoch": 0.1871285760353803, + "grad_norm": 0.9126688237293801, + "learning_rate": 4.747393113246453e-06, + "loss": 0.1643, + "step": 2031 + }, + { + "epoch": 0.1872207122126503, + "grad_norm": 0.9693557147267255, + "learning_rate": 4.74705932029437e-06, + "loss": 0.1742, + "step": 2032 + }, + { + "epoch": 0.1873128483899203, + "grad_norm": 0.9757807922463323, + "learning_rate": 4.746725318702587e-06, + "loss": 0.166, + "step": 2033 + }, + { + "epoch": 0.18740498456719032, + "grad_norm": 1.0355600426168787, + "learning_rate": 4.746391108502116e-06, + "loss": 0.1829, + "step": 2034 + }, + { + "epoch": 0.18749712074446032, + "grad_norm": 0.9040436676391728, + "learning_rate": 4.7460566897239905e-06, + "loss": 0.1662, + "step": 2035 + }, + { + "epoch": 0.1875892569217303, + "grad_norm": 1.0315787072138687, + "learning_rate": 4.745722062399258e-06, + "loss": 0.1904, + "step": 2036 + }, + { + "epoch": 0.1876813930990003, + "grad_norm": 0.8845840894873757, + "learning_rate": 4.745387226558991e-06, + "loss": 0.1578, + "step": 2037 + }, + { + "epoch": 0.18777352927627033, + "grad_norm": 0.9747265772515474, + "learning_rate": 4.745052182234278e-06, + "loss": 0.1845, + "step": 2038 + }, + { + "epoch": 0.18786566545354033, + "grad_norm": 0.8418652499136062, + "learning_rate": 4.744716929456229e-06, + "loss": 0.1648, + "step": 2039 + }, + { + "epoch": 0.18795780163081033, + "grad_norm": 0.9120095457085243, + "learning_rate": 4.744381468255971e-06, + "loss": 0.1719, + "step": 2040 + }, + { + "epoch": 0.18804993780808035, + "grad_norm": 0.9253441991466294, + "learning_rate": 4.7440457986646525e-06, + "loss": 0.1741, + "step": 2041 + }, + { + "epoch": 0.18814207398535035, + "grad_norm": 0.892395716534565, + "learning_rate": 4.743709920713439e-06, + "loss": 0.1623, + "step": 2042 + }, + { + "epoch": 0.18823421016262035, + "grad_norm": 0.9443349478550868, + "learning_rate": 4.743373834433519e-06, + "loss": 0.1722, + "step": 2043 + }, + { + "epoch": 0.18832634633989037, + "grad_norm": 0.9663983479939164, + "learning_rate": 4.743037539856097e-06, + "loss": 0.1874, + "step": 2044 + }, + { + "epoch": 0.18841848251716037, + "grad_norm": 0.8949159341792653, + "learning_rate": 4.742701037012397e-06, + "loss": 0.1707, + "step": 2045 + }, + { + "epoch": 0.18851061869443037, + "grad_norm": 0.9418778718283833, + "learning_rate": 4.7423643259336656e-06, + "loss": 0.1754, + "step": 2046 + }, + { + "epoch": 0.18860275487170036, + "grad_norm": 0.9752328192488944, + "learning_rate": 4.742027406651164e-06, + "loss": 0.1647, + "step": 2047 + }, + { + "epoch": 0.1886948910489704, + "grad_norm": 1.100330463505503, + "learning_rate": 4.741690279196178e-06, + "loss": 0.1866, + "step": 2048 + }, + { + "epoch": 0.18878702722624038, + "grad_norm": 0.9272295304956722, + "learning_rate": 4.741352943600007e-06, + "loss": 0.1817, + "step": 2049 + }, + { + "epoch": 0.18887916340351038, + "grad_norm": 0.9627565386987668, + "learning_rate": 4.741015399893974e-06, + "loss": 0.176, + "step": 2050 + }, + { + "epoch": 0.1889712995807804, + "grad_norm": 1.00925483113557, + "learning_rate": 4.740677648109421e-06, + "loss": 0.1825, + "step": 2051 + }, + { + "epoch": 0.1890634357580504, + "grad_norm": 0.8733725473410048, + "learning_rate": 4.740339688277707e-06, + "loss": 0.1708, + "step": 2052 + }, + { + "epoch": 0.1891555719353204, + "grad_norm": 0.8996489754962257, + "learning_rate": 4.7400015204302105e-06, + "loss": 0.1527, + "step": 2053 + }, + { + "epoch": 0.1892477081125904, + "grad_norm": 1.215286111800239, + "learning_rate": 4.739663144598333e-06, + "loss": 0.1734, + "step": 2054 + }, + { + "epoch": 0.18933984428986042, + "grad_norm": 0.9496525339998125, + "learning_rate": 4.739324560813491e-06, + "loss": 0.1641, + "step": 2055 + }, + { + "epoch": 0.18943198046713042, + "grad_norm": 1.02856117189786, + "learning_rate": 4.738985769107123e-06, + "loss": 0.2055, + "step": 2056 + }, + { + "epoch": 0.18952411664440041, + "grad_norm": 0.9598011558340099, + "learning_rate": 4.738646769510685e-06, + "loss": 0.1707, + "step": 2057 + }, + { + "epoch": 0.18961625282167044, + "grad_norm": 1.006354046062315, + "learning_rate": 4.738307562055653e-06, + "loss": 0.1777, + "step": 2058 + }, + { + "epoch": 0.18970838899894044, + "grad_norm": 0.9327924509299893, + "learning_rate": 4.737968146773524e-06, + "loss": 0.1703, + "step": 2059 + }, + { + "epoch": 0.18980052517621043, + "grad_norm": 0.9457591674178287, + "learning_rate": 4.737628523695811e-06, + "loss": 0.1727, + "step": 2060 + }, + { + "epoch": 0.18989266135348046, + "grad_norm": 0.9430303988478719, + "learning_rate": 4.737288692854049e-06, + "loss": 0.1813, + "step": 2061 + }, + { + "epoch": 0.18998479753075045, + "grad_norm": 0.9261016048878836, + "learning_rate": 4.736948654279791e-06, + "loss": 0.1772, + "step": 2062 + }, + { + "epoch": 0.19007693370802045, + "grad_norm": 0.9157174430306279, + "learning_rate": 4.73660840800461e-06, + "loss": 0.1589, + "step": 2063 + }, + { + "epoch": 0.19016906988529045, + "grad_norm": 0.9654944454044344, + "learning_rate": 4.736267954060097e-06, + "loss": 0.1712, + "step": 2064 + }, + { + "epoch": 0.19026120606256047, + "grad_norm": 0.9651362786809399, + "learning_rate": 4.735927292477864e-06, + "loss": 0.1814, + "step": 2065 + }, + { + "epoch": 0.19035334223983047, + "grad_norm": 0.9044877928218676, + "learning_rate": 4.735586423289542e-06, + "loss": 0.168, + "step": 2066 + }, + { + "epoch": 0.19044547841710047, + "grad_norm": 0.9307633438458481, + "learning_rate": 4.735245346526779e-06, + "loss": 0.1755, + "step": 2067 + }, + { + "epoch": 0.1905376145943705, + "grad_norm": 0.9901411171327417, + "learning_rate": 4.734904062221246e-06, + "loss": 0.1798, + "step": 2068 + }, + { + "epoch": 0.1906297507716405, + "grad_norm": 0.9888693596561464, + "learning_rate": 4.734562570404629e-06, + "loss": 0.1725, + "step": 2069 + }, + { + "epoch": 0.19072188694891049, + "grad_norm": 0.9139611246565976, + "learning_rate": 4.734220871108638e-06, + "loss": 0.1639, + "step": 2070 + }, + { + "epoch": 0.19081402312618048, + "grad_norm": 0.9049028935278389, + "learning_rate": 4.733878964364998e-06, + "loss": 0.1762, + "step": 2071 + }, + { + "epoch": 0.1909061593034505, + "grad_norm": 0.931420719373214, + "learning_rate": 4.7335368502054564e-06, + "loss": 0.1787, + "step": 2072 + }, + { + "epoch": 0.1909982954807205, + "grad_norm": 0.9522269497391289, + "learning_rate": 4.733194528661778e-06, + "loss": 0.1751, + "step": 2073 + }, + { + "epoch": 0.1910904316579905, + "grad_norm": 0.9549558070156487, + "learning_rate": 4.732851999765747e-06, + "loss": 0.1684, + "step": 2074 + }, + { + "epoch": 0.19118256783526053, + "grad_norm": 0.9653026251701582, + "learning_rate": 4.732509263549167e-06, + "loss": 0.1713, + "step": 2075 + }, + { + "epoch": 0.19127470401253052, + "grad_norm": 0.9395360707584086, + "learning_rate": 4.732166320043862e-06, + "loss": 0.1595, + "step": 2076 + }, + { + "epoch": 0.19136684018980052, + "grad_norm": 0.9011041007700445, + "learning_rate": 4.731823169281674e-06, + "loss": 0.1726, + "step": 2077 + }, + { + "epoch": 0.19145897636707054, + "grad_norm": 0.9982033798259724, + "learning_rate": 4.731479811294464e-06, + "loss": 0.1802, + "step": 2078 + }, + { + "epoch": 0.19155111254434054, + "grad_norm": 0.9653945847759827, + "learning_rate": 4.731136246114114e-06, + "loss": 0.1786, + "step": 2079 + }, + { + "epoch": 0.19164324872161054, + "grad_norm": 0.9572793258606881, + "learning_rate": 4.730792473772523e-06, + "loss": 0.17, + "step": 2080 + }, + { + "epoch": 0.19173538489888053, + "grad_norm": 1.017565135723274, + "learning_rate": 4.730448494301612e-06, + "loss": 0.174, + "step": 2081 + }, + { + "epoch": 0.19182752107615056, + "grad_norm": 0.9507693595346857, + "learning_rate": 4.7301043077333165e-06, + "loss": 0.1805, + "step": 2082 + }, + { + "epoch": 0.19191965725342056, + "grad_norm": 0.9061693205241929, + "learning_rate": 4.729759914099597e-06, + "loss": 0.1636, + "step": 2083 + }, + { + "epoch": 0.19201179343069055, + "grad_norm": 0.9924987247261973, + "learning_rate": 4.729415313432429e-06, + "loss": 0.17, + "step": 2084 + }, + { + "epoch": 0.19210392960796058, + "grad_norm": 0.9801999793423163, + "learning_rate": 4.729070505763809e-06, + "loss": 0.1798, + "step": 2085 + }, + { + "epoch": 0.19219606578523057, + "grad_norm": 0.9452879034672409, + "learning_rate": 4.728725491125753e-06, + "loss": 0.1726, + "step": 2086 + }, + { + "epoch": 0.19228820196250057, + "grad_norm": 0.9947615608484313, + "learning_rate": 4.728380269550296e-06, + "loss": 0.1876, + "step": 2087 + }, + { + "epoch": 0.19238033813977057, + "grad_norm": 1.0462479131548408, + "learning_rate": 4.7280348410694905e-06, + "loss": 0.1865, + "step": 2088 + }, + { + "epoch": 0.1924724743170406, + "grad_norm": 0.9309592228183027, + "learning_rate": 4.72768920571541e-06, + "loss": 0.1876, + "step": 2089 + }, + { + "epoch": 0.1925646104943106, + "grad_norm": 0.9042067215573306, + "learning_rate": 4.727343363520147e-06, + "loss": 0.1715, + "step": 2090 + }, + { + "epoch": 0.1926567466715806, + "grad_norm": 0.9590467825615356, + "learning_rate": 4.7269973145158134e-06, + "loss": 0.168, + "step": 2091 + }, + { + "epoch": 0.1927488828488506, + "grad_norm": 0.9612689987095125, + "learning_rate": 4.7266510587345395e-06, + "loss": 0.1712, + "step": 2092 + }, + { + "epoch": 0.1928410190261206, + "grad_norm": 0.9089807471667561, + "learning_rate": 4.726304596208475e-06, + "loss": 0.1624, + "step": 2093 + }, + { + "epoch": 0.1929331552033906, + "grad_norm": 0.8967926180786936, + "learning_rate": 4.725957926969789e-06, + "loss": 0.1564, + "step": 2094 + }, + { + "epoch": 0.19302529138066063, + "grad_norm": 0.9384858000388413, + "learning_rate": 4.72561105105067e-06, + "loss": 0.169, + "step": 2095 + }, + { + "epoch": 0.19311742755793063, + "grad_norm": 0.9604763558849385, + "learning_rate": 4.7252639684833255e-06, + "loss": 0.1687, + "step": 2096 + }, + { + "epoch": 0.19320956373520062, + "grad_norm": 0.9183646669661314, + "learning_rate": 4.724916679299982e-06, + "loss": 0.1664, + "step": 2097 + }, + { + "epoch": 0.19330169991247062, + "grad_norm": 0.917100174808335, + "learning_rate": 4.7245691835328855e-06, + "loss": 0.1676, + "step": 2098 + }, + { + "epoch": 0.19339383608974064, + "grad_norm": 0.9365272475470837, + "learning_rate": 4.724221481214301e-06, + "loss": 0.1673, + "step": 2099 + }, + { + "epoch": 0.19348597226701064, + "grad_norm": 1.0141662835470497, + "learning_rate": 4.723873572376512e-06, + "loss": 0.185, + "step": 2100 + }, + { + "epoch": 0.19357810844428064, + "grad_norm": 0.9558839300005915, + "learning_rate": 4.723525457051823e-06, + "loss": 0.1795, + "step": 2101 + }, + { + "epoch": 0.19367024462155066, + "grad_norm": 0.9411376137788801, + "learning_rate": 4.723177135272556e-06, + "loss": 0.1752, + "step": 2102 + }, + { + "epoch": 0.19376238079882066, + "grad_norm": 0.8955626892165673, + "learning_rate": 4.7228286070710525e-06, + "loss": 0.1726, + "step": 2103 + }, + { + "epoch": 0.19385451697609066, + "grad_norm": 0.9072698897943006, + "learning_rate": 4.722479872479674e-06, + "loss": 0.1588, + "step": 2104 + }, + { + "epoch": 0.19394665315336065, + "grad_norm": 0.921780763256833, + "learning_rate": 4.7221309315308e-06, + "loss": 0.1765, + "step": 2105 + }, + { + "epoch": 0.19403878933063068, + "grad_norm": 0.8677649999621102, + "learning_rate": 4.721781784256829e-06, + "loss": 0.1535, + "step": 2106 + }, + { + "epoch": 0.19413092550790068, + "grad_norm": 1.0361240936326264, + "learning_rate": 4.721432430690181e-06, + "loss": 0.18, + "step": 2107 + }, + { + "epoch": 0.19422306168517067, + "grad_norm": 1.0506692873586285, + "learning_rate": 4.721082870863293e-06, + "loss": 0.1744, + "step": 2108 + }, + { + "epoch": 0.1943151978624407, + "grad_norm": 0.9632736944905299, + "learning_rate": 4.720733104808621e-06, + "loss": 0.1563, + "step": 2109 + }, + { + "epoch": 0.1944073340397107, + "grad_norm": 0.9477773210722018, + "learning_rate": 4.720383132558641e-06, + "loss": 0.174, + "step": 2110 + }, + { + "epoch": 0.1944994702169807, + "grad_norm": 1.105717006171067, + "learning_rate": 4.720032954145849e-06, + "loss": 0.1774, + "step": 2111 + }, + { + "epoch": 0.19459160639425072, + "grad_norm": 0.9737476883844909, + "learning_rate": 4.719682569602757e-06, + "loss": 0.1692, + "step": 2112 + }, + { + "epoch": 0.1946837425715207, + "grad_norm": 1.0352161292591884, + "learning_rate": 4.7193319789619e-06, + "loss": 0.1729, + "step": 2113 + }, + { + "epoch": 0.1947758787487907, + "grad_norm": 0.893049222834855, + "learning_rate": 4.718981182255831e-06, + "loss": 0.167, + "step": 2114 + }, + { + "epoch": 0.1948680149260607, + "grad_norm": 0.9409597426616632, + "learning_rate": 4.71863017951712e-06, + "loss": 0.1851, + "step": 2115 + }, + { + "epoch": 0.19496015110333073, + "grad_norm": 0.932584725623311, + "learning_rate": 4.718278970778357e-06, + "loss": 0.1704, + "step": 2116 + }, + { + "epoch": 0.19505228728060073, + "grad_norm": 0.9747757753227256, + "learning_rate": 4.717927556072153e-06, + "loss": 0.175, + "step": 2117 + }, + { + "epoch": 0.19514442345787072, + "grad_norm": 0.964675319367432, + "learning_rate": 4.717575935431138e-06, + "loss": 0.1741, + "step": 2118 + }, + { + "epoch": 0.19523655963514075, + "grad_norm": 1.0099187231751314, + "learning_rate": 4.7172241088879575e-06, + "loss": 0.1854, + "step": 2119 + }, + { + "epoch": 0.19532869581241075, + "grad_norm": 0.9059186999651861, + "learning_rate": 4.716872076475281e-06, + "loss": 0.1786, + "step": 2120 + }, + { + "epoch": 0.19542083198968074, + "grad_norm": 0.9383878112214665, + "learning_rate": 4.7165198382257926e-06, + "loss": 0.1691, + "step": 2121 + }, + { + "epoch": 0.19551296816695074, + "grad_norm": 0.9390251394741452, + "learning_rate": 4.716167394172198e-06, + "loss": 0.1747, + "step": 2122 + }, + { + "epoch": 0.19560510434422076, + "grad_norm": 0.9179061143840483, + "learning_rate": 4.715814744347224e-06, + "loss": 0.1699, + "step": 2123 + }, + { + "epoch": 0.19569724052149076, + "grad_norm": 0.8956384202715454, + "learning_rate": 4.715461888783612e-06, + "loss": 0.1581, + "step": 2124 + }, + { + "epoch": 0.19578937669876076, + "grad_norm": 0.9922412568775244, + "learning_rate": 4.715108827514125e-06, + "loss": 0.1852, + "step": 2125 + }, + { + "epoch": 0.19588151287603078, + "grad_norm": 0.9201841509112736, + "learning_rate": 4.714755560571545e-06, + "loss": 0.1806, + "step": 2126 + }, + { + "epoch": 0.19597364905330078, + "grad_norm": 0.9291530212197076, + "learning_rate": 4.7144020879886736e-06, + "loss": 0.1678, + "step": 2127 + }, + { + "epoch": 0.19606578523057078, + "grad_norm": 0.8892646385384433, + "learning_rate": 4.714048409798328e-06, + "loss": 0.1588, + "step": 2128 + }, + { + "epoch": 0.1961579214078408, + "grad_norm": 0.9578647584230859, + "learning_rate": 4.713694526033351e-06, + "loss": 0.1696, + "step": 2129 + }, + { + "epoch": 0.1962500575851108, + "grad_norm": 0.9303263756088295, + "learning_rate": 4.713340436726599e-06, + "loss": 0.1876, + "step": 2130 + }, + { + "epoch": 0.1963421937623808, + "grad_norm": 1.0143043159123855, + "learning_rate": 4.712986141910948e-06, + "loss": 0.1833, + "step": 2131 + }, + { + "epoch": 0.1964343299396508, + "grad_norm": 1.0257791241751693, + "learning_rate": 4.712631641619297e-06, + "loss": 0.1813, + "step": 2132 + }, + { + "epoch": 0.19652646611692082, + "grad_norm": 0.9326155949771587, + "learning_rate": 4.7122769358845595e-06, + "loss": 0.1834, + "step": 2133 + }, + { + "epoch": 0.19661860229419081, + "grad_norm": 0.9501425332398825, + "learning_rate": 4.71192202473967e-06, + "loss": 0.1883, + "step": 2134 + }, + { + "epoch": 0.1967107384714608, + "grad_norm": 0.92519734244431, + "learning_rate": 4.711566908217583e-06, + "loss": 0.1733, + "step": 2135 + }, + { + "epoch": 0.19680287464873084, + "grad_norm": 0.9141559769713349, + "learning_rate": 4.71121158635127e-06, + "loss": 0.1693, + "step": 2136 + }, + { + "epoch": 0.19689501082600083, + "grad_norm": 0.8635507856697013, + "learning_rate": 4.710856059173723e-06, + "loss": 0.1597, + "step": 2137 + }, + { + "epoch": 0.19698714700327083, + "grad_norm": 0.9169944069705963, + "learning_rate": 4.710500326717954e-06, + "loss": 0.1789, + "step": 2138 + }, + { + "epoch": 0.19707928318054083, + "grad_norm": 0.9519571879140262, + "learning_rate": 4.7101443890169915e-06, + "loss": 0.1681, + "step": 2139 + }, + { + "epoch": 0.19717141935781085, + "grad_norm": 0.9514507211318625, + "learning_rate": 4.7097882461038845e-06, + "loss": 0.1725, + "step": 2140 + }, + { + "epoch": 0.19726355553508085, + "grad_norm": 0.9761404726228229, + "learning_rate": 4.7094318980117005e-06, + "loss": 0.1759, + "step": 2141 + }, + { + "epoch": 0.19735569171235084, + "grad_norm": 0.9768124223352279, + "learning_rate": 4.709075344773527e-06, + "loss": 0.179, + "step": 2142 + }, + { + "epoch": 0.19744782788962087, + "grad_norm": 0.9305036698633852, + "learning_rate": 4.70871858642247e-06, + "loss": 0.1669, + "step": 2143 + }, + { + "epoch": 0.19753996406689087, + "grad_norm": 0.9332637277566961, + "learning_rate": 4.708361622991656e-06, + "loss": 0.164, + "step": 2144 + }, + { + "epoch": 0.19763210024416086, + "grad_norm": 0.9666366370430729, + "learning_rate": 4.708004454514226e-06, + "loss": 0.183, + "step": 2145 + }, + { + "epoch": 0.1977242364214309, + "grad_norm": 0.9087507122980967, + "learning_rate": 4.7076470810233455e-06, + "loss": 0.1715, + "step": 2146 + }, + { + "epoch": 0.19781637259870088, + "grad_norm": 0.9415921197844276, + "learning_rate": 4.707289502552196e-06, + "loss": 0.1791, + "step": 2147 + }, + { + "epoch": 0.19790850877597088, + "grad_norm": 0.8989464774107121, + "learning_rate": 4.706931719133978e-06, + "loss": 0.1655, + "step": 2148 + }, + { + "epoch": 0.19800064495324088, + "grad_norm": 1.0465683898896436, + "learning_rate": 4.706573730801913e-06, + "loss": 0.189, + "step": 2149 + }, + { + "epoch": 0.1980927811305109, + "grad_norm": 0.920378805791493, + "learning_rate": 4.706215537589239e-06, + "loss": 0.1758, + "step": 2150 + }, + { + "epoch": 0.1981849173077809, + "grad_norm": 0.8983349357724887, + "learning_rate": 4.705857139529215e-06, + "loss": 0.1497, + "step": 2151 + }, + { + "epoch": 0.1982770534850509, + "grad_norm": 0.9580161374361805, + "learning_rate": 4.705498536655119e-06, + "loss": 0.1821, + "step": 2152 + }, + { + "epoch": 0.19836918966232092, + "grad_norm": 0.9231662092252367, + "learning_rate": 4.705139729000246e-06, + "loss": 0.1692, + "step": 2153 + }, + { + "epoch": 0.19846132583959092, + "grad_norm": 0.9511535860655734, + "learning_rate": 4.704780716597912e-06, + "loss": 0.177, + "step": 2154 + }, + { + "epoch": 0.19855346201686092, + "grad_norm": 0.9592527774843944, + "learning_rate": 4.7044214994814505e-06, + "loss": 0.1872, + "step": 2155 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 0.941224979580375, + "learning_rate": 4.704062077684216e-06, + "loss": 0.1692, + "step": 2156 + }, + { + "epoch": 0.19873773437140094, + "grad_norm": 0.8864677413252431, + "learning_rate": 4.703702451239582e-06, + "loss": 0.1711, + "step": 2157 + }, + { + "epoch": 0.19882987054867093, + "grad_norm": 0.9760104907836626, + "learning_rate": 4.703342620180936e-06, + "loss": 0.1891, + "step": 2158 + }, + { + "epoch": 0.19892200672594093, + "grad_norm": 0.925158598268592, + "learning_rate": 4.702982584541691e-06, + "loss": 0.1695, + "step": 2159 + }, + { + "epoch": 0.19901414290321096, + "grad_norm": 0.969472216082636, + "learning_rate": 4.702622344355276e-06, + "loss": 0.1771, + "step": 2160 + }, + { + "epoch": 0.19910627908048095, + "grad_norm": 1.0508981404224167, + "learning_rate": 4.702261899655139e-06, + "loss": 0.177, + "step": 2161 + }, + { + "epoch": 0.19919841525775095, + "grad_norm": 0.9847930314760027, + "learning_rate": 4.701901250474748e-06, + "loss": 0.1827, + "step": 2162 + }, + { + "epoch": 0.19929055143502097, + "grad_norm": 0.966509026140173, + "learning_rate": 4.70154039684759e-06, + "loss": 0.1706, + "step": 2163 + }, + { + "epoch": 0.19938268761229097, + "grad_norm": 1.0693603050159732, + "learning_rate": 4.701179338807168e-06, + "loss": 0.1866, + "step": 2164 + }, + { + "epoch": 0.19947482378956097, + "grad_norm": 0.9131224498772273, + "learning_rate": 4.7008180763870075e-06, + "loss": 0.1569, + "step": 2165 + }, + { + "epoch": 0.19956695996683096, + "grad_norm": 0.9114405522217581, + "learning_rate": 4.700456609620652e-06, + "loss": 0.1687, + "step": 2166 + }, + { + "epoch": 0.199659096144101, + "grad_norm": 0.9712061010659172, + "learning_rate": 4.700094938541664e-06, + "loss": 0.169, + "step": 2167 + }, + { + "epoch": 0.199751232321371, + "grad_norm": 0.9870353066400023, + "learning_rate": 4.6997330631836235e-06, + "loss": 0.1736, + "step": 2168 + }, + { + "epoch": 0.19984336849864098, + "grad_norm": 1.0457732545812704, + "learning_rate": 4.699370983580132e-06, + "loss": 0.1676, + "step": 2169 + }, + { + "epoch": 0.199935504675911, + "grad_norm": 0.9582776467618426, + "learning_rate": 4.699008699764807e-06, + "loss": 0.1622, + "step": 2170 + }, + { + "epoch": 0.200027640853181, + "grad_norm": 0.924560477144988, + "learning_rate": 4.698646211771287e-06, + "loss": 0.1778, + "step": 2171 + }, + { + "epoch": 0.200119777030451, + "grad_norm": 0.9603418732544574, + "learning_rate": 4.698283519633231e-06, + "loss": 0.173, + "step": 2172 + }, + { + "epoch": 0.200211913207721, + "grad_norm": 0.9318117654352273, + "learning_rate": 4.6979206233843136e-06, + "loss": 0.1763, + "step": 2173 + }, + { + "epoch": 0.20030404938499102, + "grad_norm": 0.8500574809360554, + "learning_rate": 4.697557523058229e-06, + "loss": 0.1612, + "step": 2174 + }, + { + "epoch": 0.20039618556226102, + "grad_norm": 0.9321952438861371, + "learning_rate": 4.6971942186886925e-06, + "loss": 0.1657, + "step": 2175 + }, + { + "epoch": 0.20048832173953102, + "grad_norm": 0.8878808978445922, + "learning_rate": 4.696830710309437e-06, + "loss": 0.1669, + "step": 2176 + }, + { + "epoch": 0.20058045791680104, + "grad_norm": 0.979230897783306, + "learning_rate": 4.696466997954212e-06, + "loss": 0.1746, + "step": 2177 + }, + { + "epoch": 0.20067259409407104, + "grad_norm": 0.9049009072181234, + "learning_rate": 4.696103081656791e-06, + "loss": 0.1701, + "step": 2178 + }, + { + "epoch": 0.20076473027134104, + "grad_norm": 0.9108225412943807, + "learning_rate": 4.695738961450962e-06, + "loss": 0.1588, + "step": 2179 + }, + { + "epoch": 0.20085686644861106, + "grad_norm": 0.9517811181905188, + "learning_rate": 4.695374637370534e-06, + "loss": 0.173, + "step": 2180 + }, + { + "epoch": 0.20094900262588106, + "grad_norm": 0.9484256391007594, + "learning_rate": 4.695010109449335e-06, + "loss": 0.174, + "step": 2181 + }, + { + "epoch": 0.20104113880315105, + "grad_norm": 0.9833242975609692, + "learning_rate": 4.694645377721211e-06, + "loss": 0.1824, + "step": 2182 + }, + { + "epoch": 0.20113327498042105, + "grad_norm": 0.9384489452108485, + "learning_rate": 4.694280442220027e-06, + "loss": 0.1606, + "step": 2183 + }, + { + "epoch": 0.20122541115769108, + "grad_norm": 0.9458788028655759, + "learning_rate": 4.693915302979669e-06, + "loss": 0.1865, + "step": 2184 + }, + { + "epoch": 0.20131754733496107, + "grad_norm": 0.9715149119245432, + "learning_rate": 4.693549960034038e-06, + "loss": 0.1758, + "step": 2185 + }, + { + "epoch": 0.20140968351223107, + "grad_norm": 0.8895745071235284, + "learning_rate": 4.693184413417058e-06, + "loss": 0.1652, + "step": 2186 + }, + { + "epoch": 0.2015018196895011, + "grad_norm": 0.9179333420101142, + "learning_rate": 4.692818663162668e-06, + "loss": 0.1668, + "step": 2187 + }, + { + "epoch": 0.2015939558667711, + "grad_norm": 1.0524495392370385, + "learning_rate": 4.69245270930483e-06, + "loss": 0.1626, + "step": 2188 + }, + { + "epoch": 0.2016860920440411, + "grad_norm": 0.9301977089994403, + "learning_rate": 4.6920865518775214e-06, + "loss": 0.161, + "step": 2189 + }, + { + "epoch": 0.20177822822131108, + "grad_norm": 0.9799561176585696, + "learning_rate": 4.6917201909147415e-06, + "loss": 0.1838, + "step": 2190 + }, + { + "epoch": 0.2018703643985811, + "grad_norm": 0.9996105143573751, + "learning_rate": 4.691353626450505e-06, + "loss": 0.1726, + "step": 2191 + }, + { + "epoch": 0.2019625005758511, + "grad_norm": 1.025059301876579, + "learning_rate": 4.690986858518849e-06, + "loss": 0.1822, + "step": 2192 + }, + { + "epoch": 0.2020546367531211, + "grad_norm": 0.9165762821501996, + "learning_rate": 4.6906198871538265e-06, + "loss": 0.1639, + "step": 2193 + }, + { + "epoch": 0.20214677293039113, + "grad_norm": 1.0481638600088068, + "learning_rate": 4.690252712389513e-06, + "loss": 0.1855, + "step": 2194 + }, + { + "epoch": 0.20223890910766112, + "grad_norm": 0.9492628877870042, + "learning_rate": 4.6898853342599994e-06, + "loss": 0.1567, + "step": 2195 + }, + { + "epoch": 0.20233104528493112, + "grad_norm": 1.02721472974448, + "learning_rate": 4.689517752799396e-06, + "loss": 0.1904, + "step": 2196 + }, + { + "epoch": 0.20242318146220115, + "grad_norm": 0.9749242814804707, + "learning_rate": 4.689149968041834e-06, + "loss": 0.1844, + "step": 2197 + }, + { + "epoch": 0.20251531763947114, + "grad_norm": 1.0038104998977666, + "learning_rate": 4.6887819800214615e-06, + "loss": 0.1743, + "step": 2198 + }, + { + "epoch": 0.20260745381674114, + "grad_norm": 1.005188089752521, + "learning_rate": 4.688413788772447e-06, + "loss": 0.1788, + "step": 2199 + }, + { + "epoch": 0.20269958999401114, + "grad_norm": 0.9150762628236749, + "learning_rate": 4.688045394328976e-06, + "loss": 0.1737, + "step": 2200 + }, + { + "epoch": 0.20279172617128116, + "grad_norm": 0.9398977054241704, + "learning_rate": 4.687676796725256e-06, + "loss": 0.1634, + "step": 2201 + }, + { + "epoch": 0.20288386234855116, + "grad_norm": 0.9427101062525798, + "learning_rate": 4.687307995995509e-06, + "loss": 0.1661, + "step": 2202 + }, + { + "epoch": 0.20297599852582116, + "grad_norm": 0.9673031921019366, + "learning_rate": 4.68693899217398e-06, + "loss": 0.1687, + "step": 2203 + }, + { + "epoch": 0.20306813470309118, + "grad_norm": 0.9799184390341139, + "learning_rate": 4.6865697852949285e-06, + "loss": 0.1661, + "step": 2204 + }, + { + "epoch": 0.20316027088036118, + "grad_norm": 0.9799685128302213, + "learning_rate": 4.686200375392639e-06, + "loss": 0.182, + "step": 2205 + }, + { + "epoch": 0.20325240705763117, + "grad_norm": 0.9174324923130279, + "learning_rate": 4.6858307625014084e-06, + "loss": 0.1579, + "step": 2206 + }, + { + "epoch": 0.20334454323490117, + "grad_norm": 0.9515024330765529, + "learning_rate": 4.685460946655556e-06, + "loss": 0.1784, + "step": 2207 + }, + { + "epoch": 0.2034366794121712, + "grad_norm": 1.0194734725774568, + "learning_rate": 4.68509092788942e-06, + "loss": 0.1754, + "step": 2208 + }, + { + "epoch": 0.2035288155894412, + "grad_norm": 0.9333275447322845, + "learning_rate": 4.684720706237356e-06, + "loss": 0.1666, + "step": 2209 + }, + { + "epoch": 0.2036209517667112, + "grad_norm": 0.90507569208577, + "learning_rate": 4.68435028173374e-06, + "loss": 0.1643, + "step": 2210 + }, + { + "epoch": 0.2037130879439812, + "grad_norm": 1.047547987412547, + "learning_rate": 4.683979654412965e-06, + "loss": 0.1965, + "step": 2211 + }, + { + "epoch": 0.2038052241212512, + "grad_norm": 0.8708624834050619, + "learning_rate": 4.683608824309443e-06, + "loss": 0.1751, + "step": 2212 + }, + { + "epoch": 0.2038973602985212, + "grad_norm": 0.9282192492999448, + "learning_rate": 4.683237791457608e-06, + "loss": 0.1772, + "step": 2213 + }, + { + "epoch": 0.20398949647579123, + "grad_norm": 0.9310633455335762, + "learning_rate": 4.682866555891908e-06, + "loss": 0.1808, + "step": 2214 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.8971755268118722, + "learning_rate": 4.6824951176468134e-06, + "loss": 0.1652, + "step": 2215 + }, + { + "epoch": 0.20417376883033123, + "grad_norm": 0.9276435656284576, + "learning_rate": 4.682123476756813e-06, + "loss": 0.1685, + "step": 2216 + }, + { + "epoch": 0.20426590500760122, + "grad_norm": 0.8795940253862768, + "learning_rate": 4.681751633256413e-06, + "loss": 0.1668, + "step": 2217 + }, + { + "epoch": 0.20435804118487125, + "grad_norm": 0.9207697016900108, + "learning_rate": 4.681379587180138e-06, + "loss": 0.175, + "step": 2218 + }, + { + "epoch": 0.20445017736214124, + "grad_norm": 0.9294762082235091, + "learning_rate": 4.681007338562535e-06, + "loss": 0.1796, + "step": 2219 + }, + { + "epoch": 0.20454231353941124, + "grad_norm": 0.9734872584164359, + "learning_rate": 4.680634887438165e-06, + "loss": 0.1733, + "step": 2220 + }, + { + "epoch": 0.20463444971668127, + "grad_norm": 0.924133430170834, + "learning_rate": 4.6802622338416115e-06, + "loss": 0.161, + "step": 2221 + }, + { + "epoch": 0.20472658589395126, + "grad_norm": 0.8720069613724715, + "learning_rate": 4.679889377807475e-06, + "loss": 0.1514, + "step": 2222 + }, + { + "epoch": 0.20481872207122126, + "grad_norm": 0.8737950896382805, + "learning_rate": 4.679516319370374e-06, + "loss": 0.1527, + "step": 2223 + }, + { + "epoch": 0.20491085824849126, + "grad_norm": 0.9553532936958365, + "learning_rate": 4.679143058564949e-06, + "loss": 0.1844, + "step": 2224 + }, + { + "epoch": 0.20500299442576128, + "grad_norm": 1.0052618454260092, + "learning_rate": 4.678769595425856e-06, + "loss": 0.1941, + "step": 2225 + }, + { + "epoch": 0.20509513060303128, + "grad_norm": 0.9698993550808772, + "learning_rate": 4.6783959299877725e-06, + "loss": 0.1606, + "step": 2226 + }, + { + "epoch": 0.20518726678030128, + "grad_norm": 0.9874980391024155, + "learning_rate": 4.678022062285392e-06, + "loss": 0.1753, + "step": 2227 + }, + { + "epoch": 0.2052794029575713, + "grad_norm": 0.9071206997289588, + "learning_rate": 4.677647992353428e-06, + "loss": 0.1654, + "step": 2228 + }, + { + "epoch": 0.2053715391348413, + "grad_norm": 0.9335882793666567, + "learning_rate": 4.677273720226615e-06, + "loss": 0.1701, + "step": 2229 + }, + { + "epoch": 0.2054636753121113, + "grad_norm": 0.8977947704353159, + "learning_rate": 4.6768992459397015e-06, + "loss": 0.1505, + "step": 2230 + }, + { + "epoch": 0.20555581148938132, + "grad_norm": 0.9250319869015167, + "learning_rate": 4.67652456952746e-06, + "loss": 0.1564, + "step": 2231 + }, + { + "epoch": 0.20564794766665132, + "grad_norm": 0.9457739881833873, + "learning_rate": 4.6761496910246766e-06, + "loss": 0.1708, + "step": 2232 + }, + { + "epoch": 0.2057400838439213, + "grad_norm": 0.8952312643772637, + "learning_rate": 4.6757746104661606e-06, + "loss": 0.159, + "step": 2233 + }, + { + "epoch": 0.2058322200211913, + "grad_norm": 0.9126755394162147, + "learning_rate": 4.675399327886738e-06, + "loss": 0.1598, + "step": 2234 + }, + { + "epoch": 0.20592435619846133, + "grad_norm": 0.9376971680289053, + "learning_rate": 4.675023843321254e-06, + "loss": 0.1645, + "step": 2235 + }, + { + "epoch": 0.20601649237573133, + "grad_norm": 0.9812020676495905, + "learning_rate": 4.674648156804571e-06, + "loss": 0.1508, + "step": 2236 + }, + { + "epoch": 0.20610862855300133, + "grad_norm": 1.0616534309734076, + "learning_rate": 4.674272268371574e-06, + "loss": 0.177, + "step": 2237 + }, + { + "epoch": 0.20620076473027135, + "grad_norm": 0.9902707787686815, + "learning_rate": 4.673896178057162e-06, + "loss": 0.1725, + "step": 2238 + }, + { + "epoch": 0.20629290090754135, + "grad_norm": 0.9329391926904818, + "learning_rate": 4.673519885896256e-06, + "loss": 0.182, + "step": 2239 + }, + { + "epoch": 0.20638503708481135, + "grad_norm": 0.9761864416718115, + "learning_rate": 4.673143391923794e-06, + "loss": 0.1788, + "step": 2240 + }, + { + "epoch": 0.20647717326208134, + "grad_norm": 0.9435942541496123, + "learning_rate": 4.672766696174736e-06, + "loss": 0.1664, + "step": 2241 + }, + { + "epoch": 0.20656930943935137, + "grad_norm": 0.8962267290534224, + "learning_rate": 4.672389798684055e-06, + "loss": 0.173, + "step": 2242 + }, + { + "epoch": 0.20666144561662136, + "grad_norm": 0.9259279692548622, + "learning_rate": 4.672012699486748e-06, + "loss": 0.1722, + "step": 2243 + }, + { + "epoch": 0.20675358179389136, + "grad_norm": 0.9124221318809105, + "learning_rate": 4.671635398617828e-06, + "loss": 0.1616, + "step": 2244 + }, + { + "epoch": 0.20684571797116139, + "grad_norm": 0.8905141403926174, + "learning_rate": 4.671257896112327e-06, + "loss": 0.1604, + "step": 2245 + }, + { + "epoch": 0.20693785414843138, + "grad_norm": 0.9232672218716768, + "learning_rate": 4.670880192005298e-06, + "loss": 0.165, + "step": 2246 + }, + { + "epoch": 0.20702999032570138, + "grad_norm": 0.9323052407767077, + "learning_rate": 4.670502286331809e-06, + "loss": 0.1805, + "step": 2247 + }, + { + "epoch": 0.2071221265029714, + "grad_norm": 0.9634973629389704, + "learning_rate": 4.670124179126948e-06, + "loss": 0.1761, + "step": 2248 + }, + { + "epoch": 0.2072142626802414, + "grad_norm": 0.9278126931365306, + "learning_rate": 4.669745870425824e-06, + "loss": 0.172, + "step": 2249 + }, + { + "epoch": 0.2073063988575114, + "grad_norm": 0.849220352694767, + "learning_rate": 4.669367360263563e-06, + "loss": 0.1517, + "step": 2250 + }, + { + "epoch": 0.2073985350347814, + "grad_norm": 0.9132996602721539, + "learning_rate": 4.668988648675309e-06, + "loss": 0.164, + "step": 2251 + }, + { + "epoch": 0.20749067121205142, + "grad_norm": 0.8700626009502423, + "learning_rate": 4.668609735696225e-06, + "loss": 0.1628, + "step": 2252 + }, + { + "epoch": 0.20758280738932142, + "grad_norm": 0.9505619550445175, + "learning_rate": 4.668230621361494e-06, + "loss": 0.1642, + "step": 2253 + }, + { + "epoch": 0.2076749435665914, + "grad_norm": 0.9806376626717213, + "learning_rate": 4.667851305706316e-06, + "loss": 0.1875, + "step": 2254 + }, + { + "epoch": 0.20776707974386144, + "grad_norm": 0.952808410032759, + "learning_rate": 4.667471788765911e-06, + "loss": 0.169, + "step": 2255 + }, + { + "epoch": 0.20785921592113143, + "grad_norm": 1.0121652933247183, + "learning_rate": 4.667092070575518e-06, + "loss": 0.1769, + "step": 2256 + }, + { + "epoch": 0.20795135209840143, + "grad_norm": 0.8796668160626014, + "learning_rate": 4.666712151170392e-06, + "loss": 0.151, + "step": 2257 + }, + { + "epoch": 0.20804348827567143, + "grad_norm": 0.9275469318790555, + "learning_rate": 4.6663320305858106e-06, + "loss": 0.1674, + "step": 2258 + }, + { + "epoch": 0.20813562445294145, + "grad_norm": 0.9732098865299389, + "learning_rate": 4.665951708857066e-06, + "loss": 0.1701, + "step": 2259 + }, + { + "epoch": 0.20822776063021145, + "grad_norm": 0.9026738441040866, + "learning_rate": 4.665571186019473e-06, + "loss": 0.159, + "step": 2260 + }, + { + "epoch": 0.20831989680748145, + "grad_norm": 0.9593697452130802, + "learning_rate": 4.665190462108362e-06, + "loss": 0.1625, + "step": 2261 + }, + { + "epoch": 0.20841203298475147, + "grad_norm": 1.0499408869674476, + "learning_rate": 4.664809537159084e-06, + "loss": 0.179, + "step": 2262 + }, + { + "epoch": 0.20850416916202147, + "grad_norm": 0.9468448196539467, + "learning_rate": 4.664428411207007e-06, + "loss": 0.1548, + "step": 2263 + }, + { + "epoch": 0.20859630533929147, + "grad_norm": 0.9702256795456764, + "learning_rate": 4.664047084287518e-06, + "loss": 0.171, + "step": 2264 + }, + { + "epoch": 0.2086884415165615, + "grad_norm": 0.98262891832084, + "learning_rate": 4.663665556436025e-06, + "loss": 0.1658, + "step": 2265 + }, + { + "epoch": 0.2087805776938315, + "grad_norm": 0.9745838646564671, + "learning_rate": 4.663283827687953e-06, + "loss": 0.1773, + "step": 2266 + }, + { + "epoch": 0.20887271387110148, + "grad_norm": 0.9604527581018515, + "learning_rate": 4.662901898078746e-06, + "loss": 0.1737, + "step": 2267 + }, + { + "epoch": 0.20896485004837148, + "grad_norm": 0.9844126821220404, + "learning_rate": 4.662519767643863e-06, + "loss": 0.1678, + "step": 2268 + }, + { + "epoch": 0.2090569862256415, + "grad_norm": 0.957428150635439, + "learning_rate": 4.662137436418786e-06, + "loss": 0.1804, + "step": 2269 + }, + { + "epoch": 0.2091491224029115, + "grad_norm": 0.9349317076208526, + "learning_rate": 4.661754904439018e-06, + "loss": 0.177, + "step": 2270 + }, + { + "epoch": 0.2092412585801815, + "grad_norm": 0.9298674765899402, + "learning_rate": 4.661372171740073e-06, + "loss": 0.1702, + "step": 2271 + }, + { + "epoch": 0.20933339475745152, + "grad_norm": 0.9608961443235141, + "learning_rate": 4.660989238357489e-06, + "loss": 0.1725, + "step": 2272 + }, + { + "epoch": 0.20942553093472152, + "grad_norm": 0.9192686971637103, + "learning_rate": 4.660606104326822e-06, + "loss": 0.1676, + "step": 2273 + }, + { + "epoch": 0.20951766711199152, + "grad_norm": 0.9484587340929831, + "learning_rate": 4.660222769683645e-06, + "loss": 0.1689, + "step": 2274 + }, + { + "epoch": 0.20960980328926151, + "grad_norm": 0.9166918556649847, + "learning_rate": 4.659839234463552e-06, + "loss": 0.1666, + "step": 2275 + }, + { + "epoch": 0.20970193946653154, + "grad_norm": 0.9401238964664081, + "learning_rate": 4.659455498702154e-06, + "loss": 0.1676, + "step": 2276 + }, + { + "epoch": 0.20979407564380154, + "grad_norm": 0.9401025662602998, + "learning_rate": 4.65907156243508e-06, + "loss": 0.1778, + "step": 2277 + }, + { + "epoch": 0.20988621182107153, + "grad_norm": 0.9752374373591519, + "learning_rate": 4.65868742569798e-06, + "loss": 0.1672, + "step": 2278 + }, + { + "epoch": 0.20997834799834156, + "grad_norm": 0.9809510171831634, + "learning_rate": 4.658303088526519e-06, + "loss": 0.184, + "step": 2279 + }, + { + "epoch": 0.21007048417561155, + "grad_norm": 0.8925811679962771, + "learning_rate": 4.657918550956384e-06, + "loss": 0.1697, + "step": 2280 + }, + { + "epoch": 0.21016262035288155, + "grad_norm": 0.9315080343767318, + "learning_rate": 4.65753381302328e-06, + "loss": 0.1737, + "step": 2281 + }, + { + "epoch": 0.21025475653015158, + "grad_norm": 0.8436564944976962, + "learning_rate": 4.657148874762929e-06, + "loss": 0.1501, + "step": 2282 + }, + { + "epoch": 0.21034689270742157, + "grad_norm": 0.9222611884806714, + "learning_rate": 4.656763736211073e-06, + "loss": 0.17, + "step": 2283 + }, + { + "epoch": 0.21043902888469157, + "grad_norm": 0.9276655656851056, + "learning_rate": 4.656378397403472e-06, + "loss": 0.1628, + "step": 2284 + }, + { + "epoch": 0.21053116506196157, + "grad_norm": 0.9146059613952714, + "learning_rate": 4.655992858375904e-06, + "loss": 0.17, + "step": 2285 + }, + { + "epoch": 0.2106233012392316, + "grad_norm": 0.9490196294298086, + "learning_rate": 4.655607119164168e-06, + "loss": 0.1624, + "step": 2286 + }, + { + "epoch": 0.2107154374165016, + "grad_norm": 0.9463987771147194, + "learning_rate": 4.655221179804078e-06, + "loss": 0.1729, + "step": 2287 + }, + { + "epoch": 0.21080757359377159, + "grad_norm": 0.9384821744611376, + "learning_rate": 4.65483504033147e-06, + "loss": 0.1709, + "step": 2288 + }, + { + "epoch": 0.2108997097710416, + "grad_norm": 0.9991342293889612, + "learning_rate": 4.654448700782197e-06, + "loss": 0.173, + "step": 2289 + }, + { + "epoch": 0.2109918459483116, + "grad_norm": 0.9473308951074153, + "learning_rate": 4.65406216119213e-06, + "loss": 0.1816, + "step": 2290 + }, + { + "epoch": 0.2110839821255816, + "grad_norm": 0.9682808078755373, + "learning_rate": 4.653675421597159e-06, + "loss": 0.1666, + "step": 2291 + }, + { + "epoch": 0.2111761183028516, + "grad_norm": 0.9236695196106048, + "learning_rate": 4.653288482033194e-06, + "loss": 0.1725, + "step": 2292 + }, + { + "epoch": 0.21126825448012163, + "grad_norm": 0.9706887089132648, + "learning_rate": 4.652901342536162e-06, + "loss": 0.1893, + "step": 2293 + }, + { + "epoch": 0.21136039065739162, + "grad_norm": 1.0069353159222993, + "learning_rate": 4.652514003142008e-06, + "loss": 0.1593, + "step": 2294 + }, + { + "epoch": 0.21145252683466162, + "grad_norm": 1.0053797728020337, + "learning_rate": 4.652126463886697e-06, + "loss": 0.1876, + "step": 2295 + }, + { + "epoch": 0.21154466301193164, + "grad_norm": 0.9440259393861917, + "learning_rate": 4.651738724806213e-06, + "loss": 0.1547, + "step": 2296 + }, + { + "epoch": 0.21163679918920164, + "grad_norm": 0.8471903564795712, + "learning_rate": 4.651350785936556e-06, + "loss": 0.1456, + "step": 2297 + }, + { + "epoch": 0.21172893536647164, + "grad_norm": 0.9864966435131894, + "learning_rate": 4.650962647313747e-06, + "loss": 0.1737, + "step": 2298 + }, + { + "epoch": 0.21182107154374166, + "grad_norm": 1.018468284517782, + "learning_rate": 4.650574308973826e-06, + "loss": 0.1832, + "step": 2299 + }, + { + "epoch": 0.21191320772101166, + "grad_norm": 1.0098138817405005, + "learning_rate": 4.6501857709528475e-06, + "loss": 0.1904, + "step": 2300 + }, + { + "epoch": 0.21200534389828166, + "grad_norm": 1.0461555531592517, + "learning_rate": 4.649797033286889e-06, + "loss": 0.1821, + "step": 2301 + }, + { + "epoch": 0.21209748007555165, + "grad_norm": 0.9609524234503527, + "learning_rate": 4.6494080960120444e-06, + "loss": 0.1708, + "step": 2302 + }, + { + "epoch": 0.21218961625282168, + "grad_norm": 0.8956399373758597, + "learning_rate": 4.6490189591644274e-06, + "loss": 0.1596, + "step": 2303 + }, + { + "epoch": 0.21228175243009167, + "grad_norm": 0.9086186604621006, + "learning_rate": 4.648629622780169e-06, + "loss": 0.1742, + "step": 2304 + }, + { + "epoch": 0.21237388860736167, + "grad_norm": 0.9054650267010966, + "learning_rate": 4.648240086895418e-06, + "loss": 0.1585, + "step": 2305 + }, + { + "epoch": 0.2124660247846317, + "grad_norm": 0.973003929436934, + "learning_rate": 4.647850351546345e-06, + "loss": 0.1628, + "step": 2306 + }, + { + "epoch": 0.2125581609619017, + "grad_norm": 0.920435551963916, + "learning_rate": 4.647460416769134e-06, + "loss": 0.1639, + "step": 2307 + }, + { + "epoch": 0.2126502971391717, + "grad_norm": 1.010065351946995, + "learning_rate": 4.647070282599994e-06, + "loss": 0.171, + "step": 2308 + }, + { + "epoch": 0.2127424333164417, + "grad_norm": 1.0122412359159743, + "learning_rate": 4.646679949075146e-06, + "loss": 0.1932, + "step": 2309 + }, + { + "epoch": 0.2128345694937117, + "grad_norm": 0.9360254209298183, + "learning_rate": 4.646289416230834e-06, + "loss": 0.1694, + "step": 2310 + }, + { + "epoch": 0.2129267056709817, + "grad_norm": 1.0354873116708483, + "learning_rate": 4.645898684103318e-06, + "loss": 0.1771, + "step": 2311 + }, + { + "epoch": 0.2130188418482517, + "grad_norm": 0.9613647007080832, + "learning_rate": 4.6455077527288795e-06, + "loss": 0.18, + "step": 2312 + }, + { + "epoch": 0.21311097802552173, + "grad_norm": 0.9226544949065173, + "learning_rate": 4.6451166221438145e-06, + "loss": 0.1657, + "step": 2313 + }, + { + "epoch": 0.21320311420279173, + "grad_norm": 1.0212738272454622, + "learning_rate": 4.644725292384441e-06, + "loss": 0.1681, + "step": 2314 + }, + { + "epoch": 0.21329525038006172, + "grad_norm": 0.9275529095731044, + "learning_rate": 4.6443337634870926e-06, + "loss": 0.1597, + "step": 2315 + }, + { + "epoch": 0.21338738655733175, + "grad_norm": 0.9251203990872005, + "learning_rate": 4.643942035488123e-06, + "loss": 0.1665, + "step": 2316 + }, + { + "epoch": 0.21347952273460175, + "grad_norm": 0.8799100754109026, + "learning_rate": 4.643550108423905e-06, + "loss": 0.1609, + "step": 2317 + }, + { + "epoch": 0.21357165891187174, + "grad_norm": 0.9864751850142733, + "learning_rate": 4.64315798233083e-06, + "loss": 0.1724, + "step": 2318 + }, + { + "epoch": 0.21366379508914174, + "grad_norm": 0.9305526319557794, + "learning_rate": 4.642765657245304e-06, + "loss": 0.1703, + "step": 2319 + }, + { + "epoch": 0.21375593126641176, + "grad_norm": 0.9165264859623368, + "learning_rate": 4.642373133203757e-06, + "loss": 0.1597, + "step": 2320 + }, + { + "epoch": 0.21384806744368176, + "grad_norm": 0.9499805101389946, + "learning_rate": 4.641980410242634e-06, + "loss": 0.1678, + "step": 2321 + }, + { + "epoch": 0.21394020362095176, + "grad_norm": 0.9481987039539749, + "learning_rate": 4.6415874883983995e-06, + "loss": 0.1672, + "step": 2322 + }, + { + "epoch": 0.21403233979822178, + "grad_norm": 0.8697561079038395, + "learning_rate": 4.641194367707535e-06, + "loss": 0.1636, + "step": 2323 + }, + { + "epoch": 0.21412447597549178, + "grad_norm": 0.9583502223016455, + "learning_rate": 4.640801048206545e-06, + "loss": 0.1702, + "step": 2324 + }, + { + "epoch": 0.21421661215276178, + "grad_norm": 0.9973479221639489, + "learning_rate": 4.6404075299319465e-06, + "loss": 0.1651, + "step": 2325 + }, + { + "epoch": 0.2143087483300318, + "grad_norm": 0.8790869012185121, + "learning_rate": 4.640013812920278e-06, + "loss": 0.1584, + "step": 2326 + }, + { + "epoch": 0.2144008845073018, + "grad_norm": 0.8205862861509154, + "learning_rate": 4.639619897208097e-06, + "loss": 0.1542, + "step": 2327 + }, + { + "epoch": 0.2144930206845718, + "grad_norm": 0.970252112224293, + "learning_rate": 4.639225782831978e-06, + "loss": 0.1679, + "step": 2328 + }, + { + "epoch": 0.2145851568618418, + "grad_norm": 0.9452446931266928, + "learning_rate": 4.638831469828515e-06, + "loss": 0.1701, + "step": 2329 + }, + { + "epoch": 0.21467729303911182, + "grad_norm": 0.9891248909941764, + "learning_rate": 4.638436958234321e-06, + "loss": 0.1851, + "step": 2330 + }, + { + "epoch": 0.2147694292163818, + "grad_norm": 0.8998482758735131, + "learning_rate": 4.638042248086023e-06, + "loss": 0.1676, + "step": 2331 + }, + { + "epoch": 0.2148615653936518, + "grad_norm": 1.0211991710865993, + "learning_rate": 4.637647339420273e-06, + "loss": 0.172, + "step": 2332 + }, + { + "epoch": 0.21495370157092183, + "grad_norm": 1.0033330908470734, + "learning_rate": 4.637252232273738e-06, + "loss": 0.1764, + "step": 2333 + }, + { + "epoch": 0.21504583774819183, + "grad_norm": 0.8913172960585651, + "learning_rate": 4.6368569266831035e-06, + "loss": 0.1557, + "step": 2334 + }, + { + "epoch": 0.21513797392546183, + "grad_norm": 1.043087204088054, + "learning_rate": 4.636461422685072e-06, + "loss": 0.1745, + "step": 2335 + }, + { + "epoch": 0.21523011010273183, + "grad_norm": 0.8909211705779044, + "learning_rate": 4.63606572031637e-06, + "loss": 0.1649, + "step": 2336 + }, + { + "epoch": 0.21532224628000185, + "grad_norm": 0.9538343349636392, + "learning_rate": 4.635669819613734e-06, + "loss": 0.1642, + "step": 2337 + }, + { + "epoch": 0.21541438245727185, + "grad_norm": 0.9715472692776371, + "learning_rate": 4.635273720613925e-06, + "loss": 0.1712, + "step": 2338 + }, + { + "epoch": 0.21550651863454184, + "grad_norm": 0.9349140770076221, + "learning_rate": 4.634877423353723e-06, + "loss": 0.1721, + "step": 2339 + }, + { + "epoch": 0.21559865481181187, + "grad_norm": 0.9606087264919402, + "learning_rate": 4.634480927869921e-06, + "loss": 0.1668, + "step": 2340 + }, + { + "epoch": 0.21569079098908187, + "grad_norm": 0.8532265613158703, + "learning_rate": 4.634084234199335e-06, + "loss": 0.1369, + "step": 2341 + }, + { + "epoch": 0.21578292716635186, + "grad_norm": 0.8989400773969781, + "learning_rate": 4.633687342378799e-06, + "loss": 0.1698, + "step": 2342 + }, + { + "epoch": 0.2158750633436219, + "grad_norm": 1.0121332804629664, + "learning_rate": 4.633290252445164e-06, + "loss": 0.1878, + "step": 2343 + }, + { + "epoch": 0.21596719952089188, + "grad_norm": 1.030553980973197, + "learning_rate": 4.632892964435299e-06, + "loss": 0.1805, + "step": 2344 + }, + { + "epoch": 0.21605933569816188, + "grad_norm": 0.9629513385761489, + "learning_rate": 4.632495478386092e-06, + "loss": 0.1634, + "step": 2345 + }, + { + "epoch": 0.21615147187543188, + "grad_norm": 0.906324183769925, + "learning_rate": 4.632097794334451e-06, + "loss": 0.172, + "step": 2346 + }, + { + "epoch": 0.2162436080527019, + "grad_norm": 0.9551279217162751, + "learning_rate": 4.631699912317301e-06, + "loss": 0.1656, + "step": 2347 + }, + { + "epoch": 0.2163357442299719, + "grad_norm": 0.9365362458955379, + "learning_rate": 4.631301832371584e-06, + "loss": 0.1759, + "step": 2348 + }, + { + "epoch": 0.2164278804072419, + "grad_norm": 0.9881321567680756, + "learning_rate": 4.630903554534262e-06, + "loss": 0.1685, + "step": 2349 + }, + { + "epoch": 0.21652001658451192, + "grad_norm": 0.9674707035004548, + "learning_rate": 4.630505078842317e-06, + "loss": 0.1823, + "step": 2350 + }, + { + "epoch": 0.21661215276178192, + "grad_norm": 0.9793487730071272, + "learning_rate": 4.630106405332745e-06, + "loss": 0.1827, + "step": 2351 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 0.9409326842175851, + "learning_rate": 4.629707534042564e-06, + "loss": 0.1639, + "step": 2352 + }, + { + "epoch": 0.2167964251163219, + "grad_norm": 1.0087420208183764, + "learning_rate": 4.6293084650088095e-06, + "loss": 0.1795, + "step": 2353 + }, + { + "epoch": 0.21688856129359194, + "grad_norm": 1.044876791324977, + "learning_rate": 4.628909198268534e-06, + "loss": 0.169, + "step": 2354 + }, + { + "epoch": 0.21698069747086193, + "grad_norm": 0.9389750409437341, + "learning_rate": 4.628509733858813e-06, + "loss": 0.1679, + "step": 2355 + }, + { + "epoch": 0.21707283364813193, + "grad_norm": 0.8676673622848394, + "learning_rate": 4.628110071816732e-06, + "loss": 0.1624, + "step": 2356 + }, + { + "epoch": 0.21716496982540195, + "grad_norm": 0.9154277930953563, + "learning_rate": 4.6277102121794015e-06, + "loss": 0.1751, + "step": 2357 + }, + { + "epoch": 0.21725710600267195, + "grad_norm": 0.9619309892433496, + "learning_rate": 4.62731015498395e-06, + "loss": 0.1738, + "step": 2358 + }, + { + "epoch": 0.21734924217994195, + "grad_norm": 0.926667275696638, + "learning_rate": 4.626909900267521e-06, + "loss": 0.1743, + "step": 2359 + }, + { + "epoch": 0.21744137835721197, + "grad_norm": 0.9403910037068345, + "learning_rate": 4.626509448067279e-06, + "loss": 0.1655, + "step": 2360 + }, + { + "epoch": 0.21753351453448197, + "grad_norm": 0.8818296140485228, + "learning_rate": 4.626108798420406e-06, + "loss": 0.1574, + "step": 2361 + }, + { + "epoch": 0.21762565071175197, + "grad_norm": 0.9323796067026165, + "learning_rate": 4.625707951364102e-06, + "loss": 0.1781, + "step": 2362 + }, + { + "epoch": 0.21771778688902196, + "grad_norm": 0.9243899591524739, + "learning_rate": 4.625306906935586e-06, + "loss": 0.1743, + "step": 2363 + }, + { + "epoch": 0.217809923066292, + "grad_norm": 0.9527407527487539, + "learning_rate": 4.624905665172095e-06, + "loss": 0.1642, + "step": 2364 + }, + { + "epoch": 0.21790205924356199, + "grad_norm": 0.8815844393739036, + "learning_rate": 4.6245042261108845e-06, + "loss": 0.1668, + "step": 2365 + }, + { + "epoch": 0.21799419542083198, + "grad_norm": 0.9479977436616651, + "learning_rate": 4.6241025897892275e-06, + "loss": 0.1842, + "step": 2366 + }, + { + "epoch": 0.218086331598102, + "grad_norm": 1.0207347307047026, + "learning_rate": 4.623700756244417e-06, + "loss": 0.1859, + "step": 2367 + }, + { + "epoch": 0.218178467775372, + "grad_norm": 0.9197498111886677, + "learning_rate": 4.6232987255137625e-06, + "loss": 0.1515, + "step": 2368 + }, + { + "epoch": 0.218270603952642, + "grad_norm": 0.8838719986716687, + "learning_rate": 4.622896497634593e-06, + "loss": 0.1483, + "step": 2369 + }, + { + "epoch": 0.218362740129912, + "grad_norm": 0.900451505897105, + "learning_rate": 4.622494072644255e-06, + "loss": 0.1735, + "step": 2370 + }, + { + "epoch": 0.21845487630718202, + "grad_norm": 0.8934791164057562, + "learning_rate": 4.622091450580114e-06, + "loss": 0.1656, + "step": 2371 + }, + { + "epoch": 0.21854701248445202, + "grad_norm": 0.895686185724979, + "learning_rate": 4.621688631479554e-06, + "loss": 0.1653, + "step": 2372 + }, + { + "epoch": 0.21863914866172202, + "grad_norm": 0.9833309677439741, + "learning_rate": 4.621285615379976e-06, + "loss": 0.1743, + "step": 2373 + }, + { + "epoch": 0.21873128483899204, + "grad_norm": 0.9372570832200562, + "learning_rate": 4.620882402318799e-06, + "loss": 0.1736, + "step": 2374 + }, + { + "epoch": 0.21882342101626204, + "grad_norm": 0.8761533867954667, + "learning_rate": 4.620478992333463e-06, + "loss": 0.1618, + "step": 2375 + }, + { + "epoch": 0.21891555719353203, + "grad_norm": 0.8919996604272982, + "learning_rate": 4.620075385461426e-06, + "loss": 0.1608, + "step": 2376 + }, + { + "epoch": 0.21900769337080206, + "grad_norm": 0.8657870181470666, + "learning_rate": 4.61967158174016e-06, + "loss": 0.1508, + "step": 2377 + }, + { + "epoch": 0.21909982954807206, + "grad_norm": 0.923022145836738, + "learning_rate": 4.61926758120716e-06, + "loss": 0.1579, + "step": 2378 + }, + { + "epoch": 0.21919196572534205, + "grad_norm": 0.9272554038487848, + "learning_rate": 4.618863383899937e-06, + "loss": 0.1696, + "step": 2379 + }, + { + "epoch": 0.21928410190261205, + "grad_norm": 0.9473433534596402, + "learning_rate": 4.618458989856021e-06, + "loss": 0.1765, + "step": 2380 + }, + { + "epoch": 0.21937623807988207, + "grad_norm": 0.923103183354492, + "learning_rate": 4.618054399112959e-06, + "loss": 0.1675, + "step": 2381 + }, + { + "epoch": 0.21946837425715207, + "grad_norm": 0.9369984252173658, + "learning_rate": 4.617649611708318e-06, + "loss": 0.1711, + "step": 2382 + }, + { + "epoch": 0.21956051043442207, + "grad_norm": 1.0737046620535462, + "learning_rate": 4.617244627679684e-06, + "loss": 0.1829, + "step": 2383 + }, + { + "epoch": 0.2196526466116921, + "grad_norm": 0.9481793986325134, + "learning_rate": 4.6168394470646575e-06, + "loss": 0.1639, + "step": 2384 + }, + { + "epoch": 0.2197447827889621, + "grad_norm": 1.0068030260607288, + "learning_rate": 4.61643406990086e-06, + "loss": 0.1714, + "step": 2385 + }, + { + "epoch": 0.2198369189662321, + "grad_norm": 0.9234873321825561, + "learning_rate": 4.616028496225933e-06, + "loss": 0.165, + "step": 2386 + }, + { + "epoch": 0.21992905514350208, + "grad_norm": 0.9145551409494252, + "learning_rate": 4.6156227260775314e-06, + "loss": 0.1637, + "step": 2387 + }, + { + "epoch": 0.2200211913207721, + "grad_norm": 0.9326334048386388, + "learning_rate": 4.615216759493332e-06, + "loss": 0.1835, + "step": 2388 + }, + { + "epoch": 0.2201133274980421, + "grad_norm": 0.8781396547544478, + "learning_rate": 4.614810596511028e-06, + "loss": 0.1571, + "step": 2389 + }, + { + "epoch": 0.2202054636753121, + "grad_norm": 0.9321770521056157, + "learning_rate": 4.614404237168334e-06, + "loss": 0.1606, + "step": 2390 + }, + { + "epoch": 0.22029759985258213, + "grad_norm": 0.9109442841057521, + "learning_rate": 4.613997681502977e-06, + "loss": 0.1523, + "step": 2391 + }, + { + "epoch": 0.22038973602985212, + "grad_norm": 0.9073470892509928, + "learning_rate": 4.61359092955271e-06, + "loss": 0.1544, + "step": 2392 + }, + { + "epoch": 0.22048187220712212, + "grad_norm": 0.9727998985444354, + "learning_rate": 4.613183981355297e-06, + "loss": 0.1606, + "step": 2393 + }, + { + "epoch": 0.22057400838439215, + "grad_norm": 0.9071330785105896, + "learning_rate": 4.612776836948524e-06, + "loss": 0.1543, + "step": 2394 + }, + { + "epoch": 0.22066614456166214, + "grad_norm": 0.9097721335699841, + "learning_rate": 4.612369496370194e-06, + "loss": 0.1574, + "step": 2395 + }, + { + "epoch": 0.22075828073893214, + "grad_norm": 0.9304001670063471, + "learning_rate": 4.611961959658129e-06, + "loss": 0.1568, + "step": 2396 + }, + { + "epoch": 0.22085041691620214, + "grad_norm": 0.9445814202450755, + "learning_rate": 4.611554226850168e-06, + "loss": 0.1821, + "step": 2397 + }, + { + "epoch": 0.22094255309347216, + "grad_norm": 0.9995514602468918, + "learning_rate": 4.6111462979841704e-06, + "loss": 0.1621, + "step": 2398 + }, + { + "epoch": 0.22103468927074216, + "grad_norm": 0.9832850713868987, + "learning_rate": 4.610738173098012e-06, + "loss": 0.1711, + "step": 2399 + }, + { + "epoch": 0.22112682544801215, + "grad_norm": 0.9329304727726307, + "learning_rate": 4.610329852229587e-06, + "loss": 0.1566, + "step": 2400 + }, + { + "epoch": 0.22121896162528218, + "grad_norm": 0.9107929098618456, + "learning_rate": 4.6099213354168085e-06, + "loss": 0.1735, + "step": 2401 + }, + { + "epoch": 0.22131109780255218, + "grad_norm": 0.9163785528280652, + "learning_rate": 4.609512622697606e-06, + "loss": 0.1564, + "step": 2402 + }, + { + "epoch": 0.22140323397982217, + "grad_norm": 0.9260260556809167, + "learning_rate": 4.609103714109931e-06, + "loss": 0.1583, + "step": 2403 + }, + { + "epoch": 0.22149537015709217, + "grad_norm": 0.995726300308974, + "learning_rate": 4.608694609691747e-06, + "loss": 0.1728, + "step": 2404 + }, + { + "epoch": 0.2215875063343622, + "grad_norm": 0.9277977487816504, + "learning_rate": 4.608285309481043e-06, + "loss": 0.1636, + "step": 2405 + }, + { + "epoch": 0.2216796425116322, + "grad_norm": 0.9227743080386674, + "learning_rate": 4.607875813515821e-06, + "loss": 0.1707, + "step": 2406 + }, + { + "epoch": 0.2217717786889022, + "grad_norm": 0.8860722109582515, + "learning_rate": 4.607466121834103e-06, + "loss": 0.1579, + "step": 2407 + }, + { + "epoch": 0.2218639148661722, + "grad_norm": 0.920265036722145, + "learning_rate": 4.607056234473928e-06, + "loss": 0.1578, + "step": 2408 + }, + { + "epoch": 0.2219560510434422, + "grad_norm": 0.8811025661892827, + "learning_rate": 4.606646151473355e-06, + "loss": 0.1675, + "step": 2409 + }, + { + "epoch": 0.2220481872207122, + "grad_norm": 0.8916127939474685, + "learning_rate": 4.606235872870461e-06, + "loss": 0.1773, + "step": 2410 + }, + { + "epoch": 0.22214032339798223, + "grad_norm": 0.9900333498792695, + "learning_rate": 4.605825398703339e-06, + "loss": 0.1751, + "step": 2411 + }, + { + "epoch": 0.22223245957525223, + "grad_norm": 0.9706997511826468, + "learning_rate": 4.605414729010102e-06, + "loss": 0.1808, + "step": 2412 + }, + { + "epoch": 0.22232459575252222, + "grad_norm": 0.9164289847295664, + "learning_rate": 4.605003863828881e-06, + "loss": 0.1609, + "step": 2413 + }, + { + "epoch": 0.22241673192979222, + "grad_norm": 0.9547324103869823, + "learning_rate": 4.604592803197825e-06, + "loss": 0.1822, + "step": 2414 + }, + { + "epoch": 0.22250886810706225, + "grad_norm": 0.8917918711512808, + "learning_rate": 4.6041815471551e-06, + "loss": 0.1579, + "step": 2415 + }, + { + "epoch": 0.22260100428433224, + "grad_norm": 0.9874142973301107, + "learning_rate": 4.603770095738892e-06, + "loss": 0.1781, + "step": 2416 + }, + { + "epoch": 0.22269314046160224, + "grad_norm": 0.9212348176350641, + "learning_rate": 4.603358448987405e-06, + "loss": 0.1673, + "step": 2417 + }, + { + "epoch": 0.22278527663887226, + "grad_norm": 0.9412577615662902, + "learning_rate": 4.602946606938858e-06, + "loss": 0.1729, + "step": 2418 + }, + { + "epoch": 0.22287741281614226, + "grad_norm": 0.8402429294824661, + "learning_rate": 4.6025345696314935e-06, + "loss": 0.1637, + "step": 2419 + }, + { + "epoch": 0.22296954899341226, + "grad_norm": 1.0231878663886147, + "learning_rate": 4.602122337103568e-06, + "loss": 0.1861, + "step": 2420 + }, + { + "epoch": 0.22306168517068226, + "grad_norm": 0.9267704717175042, + "learning_rate": 4.601709909393357e-06, + "loss": 0.1789, + "step": 2421 + }, + { + "epoch": 0.22315382134795228, + "grad_norm": 0.9402886812499314, + "learning_rate": 4.601297286539155e-06, + "loss": 0.1844, + "step": 2422 + }, + { + "epoch": 0.22324595752522228, + "grad_norm": 0.9232675247411815, + "learning_rate": 4.600884468579273e-06, + "loss": 0.1718, + "step": 2423 + }, + { + "epoch": 0.22333809370249227, + "grad_norm": 0.8850107261659447, + "learning_rate": 4.600471455552043e-06, + "loss": 0.1545, + "step": 2424 + }, + { + "epoch": 0.2234302298797623, + "grad_norm": 0.9001963815880184, + "learning_rate": 4.600058247495812e-06, + "loss": 0.168, + "step": 2425 + }, + { + "epoch": 0.2235223660570323, + "grad_norm": 1.0125379821849876, + "learning_rate": 4.599644844448946e-06, + "loss": 0.175, + "step": 2426 + }, + { + "epoch": 0.2236145022343023, + "grad_norm": 0.9715366729797272, + "learning_rate": 4.599231246449831e-06, + "loss": 0.1705, + "step": 2427 + }, + { + "epoch": 0.22370663841157232, + "grad_norm": 0.983021655050111, + "learning_rate": 4.5988174535368686e-06, + "loss": 0.1568, + "step": 2428 + }, + { + "epoch": 0.22379877458884231, + "grad_norm": 0.9512948661524263, + "learning_rate": 4.59840346574848e-06, + "loss": 0.1809, + "step": 2429 + }, + { + "epoch": 0.2238909107661123, + "grad_norm": 0.9968867206661762, + "learning_rate": 4.597989283123104e-06, + "loss": 0.1843, + "step": 2430 + }, + { + "epoch": 0.2239830469433823, + "grad_norm": 1.0375629522642, + "learning_rate": 4.597574905699196e-06, + "loss": 0.1685, + "step": 2431 + }, + { + "epoch": 0.22407518312065233, + "grad_norm": 1.0263084195890293, + "learning_rate": 4.597160333515233e-06, + "loss": 0.1767, + "step": 2432 + }, + { + "epoch": 0.22416731929792233, + "grad_norm": 0.9303738801673548, + "learning_rate": 4.596745566609707e-06, + "loss": 0.1598, + "step": 2433 + }, + { + "epoch": 0.22425945547519233, + "grad_norm": 0.927679806741521, + "learning_rate": 4.5963306050211296e-06, + "loss": 0.1649, + "step": 2434 + }, + { + "epoch": 0.22435159165246235, + "grad_norm": 0.9658005546537741, + "learning_rate": 4.595915448788031e-06, + "loss": 0.1818, + "step": 2435 + }, + { + "epoch": 0.22444372782973235, + "grad_norm": 0.9499725030180026, + "learning_rate": 4.5955000979489565e-06, + "loss": 0.1766, + "step": 2436 + }, + { + "epoch": 0.22453586400700234, + "grad_norm": 0.9462439750542999, + "learning_rate": 4.595084552542472e-06, + "loss": 0.1728, + "step": 2437 + }, + { + "epoch": 0.22462800018427234, + "grad_norm": 0.9720158256266674, + "learning_rate": 4.594668812607162e-06, + "loss": 0.1667, + "step": 2438 + }, + { + "epoch": 0.22472013636154237, + "grad_norm": 0.9173112612757395, + "learning_rate": 4.594252878181627e-06, + "loss": 0.163, + "step": 2439 + }, + { + "epoch": 0.22481227253881236, + "grad_norm": 0.940867147194557, + "learning_rate": 4.593836749304487e-06, + "loss": 0.1511, + "step": 2440 + }, + { + "epoch": 0.22490440871608236, + "grad_norm": 0.9835880434614503, + "learning_rate": 4.59342042601438e-06, + "loss": 0.1839, + "step": 2441 + }, + { + "epoch": 0.22499654489335238, + "grad_norm": 0.9879031852310044, + "learning_rate": 4.59300390834996e-06, + "loss": 0.178, + "step": 2442 + }, + { + "epoch": 0.22508868107062238, + "grad_norm": 0.9081333169346187, + "learning_rate": 4.592587196349902e-06, + "loss": 0.1798, + "step": 2443 + }, + { + "epoch": 0.22518081724789238, + "grad_norm": 0.8798711443957609, + "learning_rate": 4.592170290052898e-06, + "loss": 0.1654, + "step": 2444 + }, + { + "epoch": 0.2252729534251624, + "grad_norm": 0.9326071476847042, + "learning_rate": 4.591753189497658e-06, + "loss": 0.1544, + "step": 2445 + }, + { + "epoch": 0.2253650896024324, + "grad_norm": 1.1534455607712155, + "learning_rate": 4.591335894722909e-06, + "loss": 0.1937, + "step": 2446 + }, + { + "epoch": 0.2254572257797024, + "grad_norm": 0.9028689022204759, + "learning_rate": 4.5909184057673976e-06, + "loss": 0.1604, + "step": 2447 + }, + { + "epoch": 0.2255493619569724, + "grad_norm": 0.8946199825952447, + "learning_rate": 4.590500722669886e-06, + "loss": 0.1697, + "step": 2448 + }, + { + "epoch": 0.22564149813424242, + "grad_norm": 0.9077763847398155, + "learning_rate": 4.590082845469158e-06, + "loss": 0.1507, + "step": 2449 + }, + { + "epoch": 0.22573363431151242, + "grad_norm": 0.9592178561443785, + "learning_rate": 4.589664774204013e-06, + "loss": 0.1696, + "step": 2450 + }, + { + "epoch": 0.2258257704887824, + "grad_norm": 0.9613902738050768, + "learning_rate": 4.589246508913267e-06, + "loss": 0.1774, + "step": 2451 + }, + { + "epoch": 0.22591790666605244, + "grad_norm": 0.8930021527771865, + "learning_rate": 4.58882804963576e-06, + "loss": 0.1676, + "step": 2452 + }, + { + "epoch": 0.22601004284332243, + "grad_norm": 0.9254850657625637, + "learning_rate": 4.588409396410342e-06, + "loss": 0.1784, + "step": 2453 + }, + { + "epoch": 0.22610217902059243, + "grad_norm": 0.8648068592964746, + "learning_rate": 4.587990549275889e-06, + "loss": 0.1647, + "step": 2454 + }, + { + "epoch": 0.22619431519786243, + "grad_norm": 0.9218315395309605, + "learning_rate": 4.587571508271288e-06, + "loss": 0.1643, + "step": 2455 + }, + { + "epoch": 0.22628645137513245, + "grad_norm": 0.9359752443930277, + "learning_rate": 4.587152273435447e-06, + "loss": 0.166, + "step": 2456 + }, + { + "epoch": 0.22637858755240245, + "grad_norm": 0.9467255151495074, + "learning_rate": 4.586732844807293e-06, + "loss": 0.1695, + "step": 2457 + }, + { + "epoch": 0.22647072372967245, + "grad_norm": 0.9775875127483362, + "learning_rate": 4.58631322242577e-06, + "loss": 0.1672, + "step": 2458 + }, + { + "epoch": 0.22656285990694247, + "grad_norm": 1.0104603751151366, + "learning_rate": 4.58589340632984e-06, + "loss": 0.1704, + "step": 2459 + }, + { + "epoch": 0.22665499608421247, + "grad_norm": 0.9989237155219942, + "learning_rate": 4.585473396558482e-06, + "loss": 0.1896, + "step": 2460 + }, + { + "epoch": 0.22674713226148246, + "grad_norm": 0.9295942764578942, + "learning_rate": 4.585053193150695e-06, + "loss": 0.148, + "step": 2461 + }, + { + "epoch": 0.2268392684387525, + "grad_norm": 0.9990328294585157, + "learning_rate": 4.584632796145495e-06, + "loss": 0.1617, + "step": 2462 + }, + { + "epoch": 0.2269314046160225, + "grad_norm": 1.0342567607730049, + "learning_rate": 4.584212205581915e-06, + "loss": 0.1736, + "step": 2463 + }, + { + "epoch": 0.22702354079329248, + "grad_norm": 0.9424632951659273, + "learning_rate": 4.5837914214990085e-06, + "loss": 0.1685, + "step": 2464 + }, + { + "epoch": 0.22711567697056248, + "grad_norm": 0.8664763361776535, + "learning_rate": 4.583370443935843e-06, + "loss": 0.1574, + "step": 2465 + }, + { + "epoch": 0.2272078131478325, + "grad_norm": 0.9720244211223175, + "learning_rate": 4.582949272931508e-06, + "loss": 0.1708, + "step": 2466 + }, + { + "epoch": 0.2272999493251025, + "grad_norm": 0.9624471682686935, + "learning_rate": 4.582527908525109e-06, + "loss": 0.1701, + "step": 2467 + }, + { + "epoch": 0.2273920855023725, + "grad_norm": 0.9668421091692699, + "learning_rate": 4.5821063507557695e-06, + "loss": 0.1771, + "step": 2468 + }, + { + "epoch": 0.22748422167964252, + "grad_norm": 0.9317703673514961, + "learning_rate": 4.581684599662632e-06, + "loss": 0.1706, + "step": 2469 + }, + { + "epoch": 0.22757635785691252, + "grad_norm": 0.9286142161019658, + "learning_rate": 4.581262655284854e-06, + "loss": 0.1661, + "step": 2470 + }, + { + "epoch": 0.22766849403418252, + "grad_norm": 0.9792223907797291, + "learning_rate": 4.580840517661615e-06, + "loss": 0.1513, + "step": 2471 + }, + { + "epoch": 0.2277606302114525, + "grad_norm": 0.8942474980522034, + "learning_rate": 4.58041818683211e-06, + "loss": 0.1607, + "step": 2472 + }, + { + "epoch": 0.22785276638872254, + "grad_norm": 0.9241481692834103, + "learning_rate": 4.579995662835552e-06, + "loss": 0.1693, + "step": 2473 + }, + { + "epoch": 0.22794490256599254, + "grad_norm": 0.9423013205176731, + "learning_rate": 4.579572945711174e-06, + "loss": 0.1548, + "step": 2474 + }, + { + "epoch": 0.22803703874326253, + "grad_norm": 1.0187746405002311, + "learning_rate": 4.579150035498223e-06, + "loss": 0.1702, + "step": 2475 + }, + { + "epoch": 0.22812917492053256, + "grad_norm": 1.0143035138735677, + "learning_rate": 4.578726932235969e-06, + "loss": 0.1638, + "step": 2476 + }, + { + "epoch": 0.22822131109780255, + "grad_norm": 0.9930147973024229, + "learning_rate": 4.5783036359636935e-06, + "loss": 0.1677, + "step": 2477 + }, + { + "epoch": 0.22831344727507255, + "grad_norm": 0.9625773296101545, + "learning_rate": 4.5778801467207035e-06, + "loss": 0.1699, + "step": 2478 + }, + { + "epoch": 0.22840558345234258, + "grad_norm": 0.9811159556227943, + "learning_rate": 4.577456464546317e-06, + "loss": 0.1745, + "step": 2479 + }, + { + "epoch": 0.22849771962961257, + "grad_norm": 0.9546391442575929, + "learning_rate": 4.5770325894798754e-06, + "loss": 0.1701, + "step": 2480 + }, + { + "epoch": 0.22858985580688257, + "grad_norm": 0.8864094301933726, + "learning_rate": 4.5766085215607335e-06, + "loss": 0.1704, + "step": 2481 + }, + { + "epoch": 0.22868199198415257, + "grad_norm": 0.9331663904936708, + "learning_rate": 4.576184260828267e-06, + "loss": 0.1603, + "step": 2482 + }, + { + "epoch": 0.2287741281614226, + "grad_norm": 1.0563673952258763, + "learning_rate": 4.575759807321869e-06, + "loss": 0.179, + "step": 2483 + }, + { + "epoch": 0.2288662643386926, + "grad_norm": 0.9305700277198957, + "learning_rate": 4.575335161080948e-06, + "loss": 0.1595, + "step": 2484 + }, + { + "epoch": 0.22895840051596258, + "grad_norm": 0.9856789510726491, + "learning_rate": 4.574910322144935e-06, + "loss": 0.1785, + "step": 2485 + }, + { + "epoch": 0.2290505366932326, + "grad_norm": 0.9650074532401066, + "learning_rate": 4.574485290553276e-06, + "loss": 0.1671, + "step": 2486 + }, + { + "epoch": 0.2291426728705026, + "grad_norm": 0.9194999600270781, + "learning_rate": 4.574060066345434e-06, + "loss": 0.1794, + "step": 2487 + }, + { + "epoch": 0.2292348090477726, + "grad_norm": 0.9499206998494362, + "learning_rate": 4.573634649560891e-06, + "loss": 0.1651, + "step": 2488 + }, + { + "epoch": 0.2293269452250426, + "grad_norm": 0.9880827662100659, + "learning_rate": 4.573209040239148e-06, + "loss": 0.172, + "step": 2489 + }, + { + "epoch": 0.22941908140231262, + "grad_norm": 0.9704437917207153, + "learning_rate": 4.572783238419723e-06, + "loss": 0.165, + "step": 2490 + }, + { + "epoch": 0.22951121757958262, + "grad_norm": 0.9473886009713728, + "learning_rate": 4.572357244142151e-06, + "loss": 0.1687, + "step": 2491 + }, + { + "epoch": 0.22960335375685262, + "grad_norm": 1.0115985599601431, + "learning_rate": 4.5719310574459846e-06, + "loss": 0.1661, + "step": 2492 + }, + { + "epoch": 0.22969548993412264, + "grad_norm": 1.0523002183614862, + "learning_rate": 4.5715046783707976e-06, + "loss": 0.161, + "step": 2493 + }, + { + "epoch": 0.22978762611139264, + "grad_norm": 0.9012474784949154, + "learning_rate": 4.571078106956178e-06, + "loss": 0.1588, + "step": 2494 + }, + { + "epoch": 0.22987976228866264, + "grad_norm": 1.0733611855009282, + "learning_rate": 4.570651343241733e-06, + "loss": 0.1731, + "step": 2495 + }, + { + "epoch": 0.22997189846593266, + "grad_norm": 1.0563609033763628, + "learning_rate": 4.570224387267089e-06, + "loss": 0.179, + "step": 2496 + }, + { + "epoch": 0.23006403464320266, + "grad_norm": 0.9549484688378221, + "learning_rate": 4.569797239071887e-06, + "loss": 0.1831, + "step": 2497 + }, + { + "epoch": 0.23015617082047266, + "grad_norm": 1.0449068654366036, + "learning_rate": 4.569369898695789e-06, + "loss": 0.1741, + "step": 2498 + }, + { + "epoch": 0.23024830699774265, + "grad_norm": 0.9602792119218708, + "learning_rate": 4.568942366178473e-06, + "loss": 0.1623, + "step": 2499 + }, + { + "epoch": 0.23034044317501268, + "grad_norm": 0.9618534029768782, + "learning_rate": 4.568514641559636e-06, + "loss": 0.1647, + "step": 2500 + }, + { + "epoch": 0.23034044317501268, + "eval_loss": 0.16871164739131927, + "eval_runtime": 300.164, + "eval_samples_per_second": 23.377, + "eval_steps_per_second": 2.925, + "step": 2500 + }, + { + "epoch": 0.23043257935228267, + "grad_norm": 0.9533742219678426, + "learning_rate": 4.5680867248789916e-06, + "loss": 0.1741, + "step": 2501 + }, + { + "epoch": 0.23052471552955267, + "grad_norm": 0.9298655405790685, + "learning_rate": 4.567658616176273e-06, + "loss": 0.1708, + "step": 2502 + }, + { + "epoch": 0.2306168517068227, + "grad_norm": 0.9479029081415975, + "learning_rate": 4.5672303154912275e-06, + "loss": 0.1682, + "step": 2503 + }, + { + "epoch": 0.2307089878840927, + "grad_norm": 0.8851274933549781, + "learning_rate": 4.566801822863626e-06, + "loss": 0.1666, + "step": 2504 + }, + { + "epoch": 0.2308011240613627, + "grad_norm": 0.8924494397459016, + "learning_rate": 4.566373138333253e-06, + "loss": 0.1619, + "step": 2505 + }, + { + "epoch": 0.23089326023863269, + "grad_norm": 0.9435939583116043, + "learning_rate": 4.565944261939911e-06, + "loss": 0.163, + "step": 2506 + }, + { + "epoch": 0.2309853964159027, + "grad_norm": 0.8879874757562365, + "learning_rate": 4.565515193723423e-06, + "loss": 0.1756, + "step": 2507 + }, + { + "epoch": 0.2310775325931727, + "grad_norm": 0.9376920867552287, + "learning_rate": 4.5650859337236256e-06, + "loss": 0.1614, + "step": 2508 + }, + { + "epoch": 0.2311696687704427, + "grad_norm": 0.9673814720139658, + "learning_rate": 4.564656481980378e-06, + "loss": 0.1733, + "step": 2509 + }, + { + "epoch": 0.23126180494771273, + "grad_norm": 0.9152447120818502, + "learning_rate": 4.564226838533553e-06, + "loss": 0.1586, + "step": 2510 + }, + { + "epoch": 0.23135394112498273, + "grad_norm": 0.8820356484085558, + "learning_rate": 4.563797003423045e-06, + "loss": 0.1673, + "step": 2511 + }, + { + "epoch": 0.23144607730225272, + "grad_norm": 0.9391183631130411, + "learning_rate": 4.563366976688762e-06, + "loss": 0.1761, + "step": 2512 + }, + { + "epoch": 0.23153821347952275, + "grad_norm": 0.9636950958740574, + "learning_rate": 4.562936758370634e-06, + "loss": 0.1637, + "step": 2513 + }, + { + "epoch": 0.23163034965679274, + "grad_norm": 1.037525619091228, + "learning_rate": 4.5625063485086065e-06, + "loss": 0.1925, + "step": 2514 + }, + { + "epoch": 0.23172248583406274, + "grad_norm": 0.94368533310796, + "learning_rate": 4.562075747142641e-06, + "loss": 0.1921, + "step": 2515 + }, + { + "epoch": 0.23181462201133274, + "grad_norm": 0.9306148789692448, + "learning_rate": 4.561644954312721e-06, + "loss": 0.1609, + "step": 2516 + }, + { + "epoch": 0.23190675818860276, + "grad_norm": 0.8962214850100089, + "learning_rate": 4.561213970058845e-06, + "loss": 0.1568, + "step": 2517 + }, + { + "epoch": 0.23199889436587276, + "grad_norm": 0.9376759104456283, + "learning_rate": 4.560782794421031e-06, + "loss": 0.1733, + "step": 2518 + }, + { + "epoch": 0.23209103054314276, + "grad_norm": 0.9774780989550023, + "learning_rate": 4.5603514274393125e-06, + "loss": 0.1688, + "step": 2519 + }, + { + "epoch": 0.23218316672041278, + "grad_norm": 0.8467491530853, + "learning_rate": 4.559919869153742e-06, + "loss": 0.1526, + "step": 2520 + }, + { + "epoch": 0.23227530289768278, + "grad_norm": 0.9208731668470556, + "learning_rate": 4.559488119604389e-06, + "loss": 0.1625, + "step": 2521 + }, + { + "epoch": 0.23236743907495278, + "grad_norm": 1.0214193792829334, + "learning_rate": 4.5590561788313435e-06, + "loss": 0.1769, + "step": 2522 + }, + { + "epoch": 0.23245957525222277, + "grad_norm": 0.9530626148431194, + "learning_rate": 4.55862404687471e-06, + "loss": 0.1768, + "step": 2523 + }, + { + "epoch": 0.2325517114294928, + "grad_norm": 0.9441286615058059, + "learning_rate": 4.558191723774612e-06, + "loss": 0.1662, + "step": 2524 + }, + { + "epoch": 0.2326438476067628, + "grad_norm": 0.9633992603927569, + "learning_rate": 4.557759209571191e-06, + "loss": 0.1649, + "step": 2525 + }, + { + "epoch": 0.2327359837840328, + "grad_norm": 0.8883571013742829, + "learning_rate": 4.557326504304606e-06, + "loss": 0.1669, + "step": 2526 + }, + { + "epoch": 0.23282811996130282, + "grad_norm": 0.9452980355449644, + "learning_rate": 4.556893608015034e-06, + "loss": 0.1695, + "step": 2527 + }, + { + "epoch": 0.2329202561385728, + "grad_norm": 0.9022590301416014, + "learning_rate": 4.556460520742669e-06, + "loss": 0.1571, + "step": 2528 + }, + { + "epoch": 0.2330123923158428, + "grad_norm": 0.929852406290282, + "learning_rate": 4.556027242527723e-06, + "loss": 0.1743, + "step": 2529 + }, + { + "epoch": 0.23310452849311283, + "grad_norm": 0.966516869274674, + "learning_rate": 4.555593773410426e-06, + "loss": 0.1702, + "step": 2530 + }, + { + "epoch": 0.23319666467038283, + "grad_norm": 0.8832043346632404, + "learning_rate": 4.555160113431027e-06, + "loss": 0.1653, + "step": 2531 + }, + { + "epoch": 0.23328880084765283, + "grad_norm": 0.9671050400227129, + "learning_rate": 4.554726262629789e-06, + "loss": 0.1752, + "step": 2532 + }, + { + "epoch": 0.23338093702492282, + "grad_norm": 0.8813498900554017, + "learning_rate": 4.554292221046997e-06, + "loss": 0.1535, + "step": 2533 + }, + { + "epoch": 0.23347307320219285, + "grad_norm": 0.928958558583173, + "learning_rate": 4.553857988722951e-06, + "loss": 0.1611, + "step": 2534 + }, + { + "epoch": 0.23356520937946285, + "grad_norm": 0.9717573709876711, + "learning_rate": 4.55342356569797e-06, + "loss": 0.1683, + "step": 2535 + }, + { + "epoch": 0.23365734555673284, + "grad_norm": 0.9072668433737212, + "learning_rate": 4.5529889520123896e-06, + "loss": 0.17, + "step": 2536 + }, + { + "epoch": 0.23374948173400287, + "grad_norm": 0.9099867455316708, + "learning_rate": 4.5525541477065644e-06, + "loss": 0.1746, + "step": 2537 + }, + { + "epoch": 0.23384161791127286, + "grad_norm": 0.9050044764166337, + "learning_rate": 4.552119152820866e-06, + "loss": 0.1606, + "step": 2538 + }, + { + "epoch": 0.23393375408854286, + "grad_norm": 1.0216026559059102, + "learning_rate": 4.551683967395683e-06, + "loss": 0.1692, + "step": 2539 + }, + { + "epoch": 0.23402589026581286, + "grad_norm": 0.9592363893092558, + "learning_rate": 4.5512485914714225e-06, + "loss": 0.1649, + "step": 2540 + }, + { + "epoch": 0.23411802644308288, + "grad_norm": 0.9168069315815013, + "learning_rate": 4.55081302508851e-06, + "loss": 0.165, + "step": 2541 + }, + { + "epoch": 0.23421016262035288, + "grad_norm": 0.918494290815671, + "learning_rate": 4.550377268287387e-06, + "loss": 0.1698, + "step": 2542 + }, + { + "epoch": 0.23430229879762288, + "grad_norm": 0.9922874023864924, + "learning_rate": 4.549941321108514e-06, + "loss": 0.1714, + "step": 2543 + }, + { + "epoch": 0.2343944349748929, + "grad_norm": 1.0043624274520886, + "learning_rate": 4.549505183592368e-06, + "loss": 0.1849, + "step": 2544 + }, + { + "epoch": 0.2344865711521629, + "grad_norm": 0.9095985103827297, + "learning_rate": 4.549068855779447e-06, + "loss": 0.1526, + "step": 2545 + }, + { + "epoch": 0.2345787073294329, + "grad_norm": 0.9878365543468702, + "learning_rate": 4.5486323377102615e-06, + "loss": 0.1765, + "step": 2546 + }, + { + "epoch": 0.23467084350670292, + "grad_norm": 0.8155329876941606, + "learning_rate": 4.548195629425343e-06, + "loss": 0.1394, + "step": 2547 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 0.9100479491854085, + "learning_rate": 4.547758730965239e-06, + "loss": 0.1598, + "step": 2548 + }, + { + "epoch": 0.2348551158612429, + "grad_norm": 0.9992825507444053, + "learning_rate": 4.547321642370517e-06, + "loss": 0.1826, + "step": 2549 + }, + { + "epoch": 0.2349472520385129, + "grad_norm": 0.9061188471199144, + "learning_rate": 4.5468843636817605e-06, + "loss": 0.1576, + "step": 2550 + }, + { + "epoch": 0.23503938821578294, + "grad_norm": 0.9626546650825835, + "learning_rate": 4.54644689493957e-06, + "loss": 0.1722, + "step": 2551 + }, + { + "epoch": 0.23513152439305293, + "grad_norm": 0.9163551094540457, + "learning_rate": 4.546009236184565e-06, + "loss": 0.1463, + "step": 2552 + }, + { + "epoch": 0.23522366057032293, + "grad_norm": 0.9758440630401635, + "learning_rate": 4.545571387457382e-06, + "loss": 0.1688, + "step": 2553 + }, + { + "epoch": 0.23531579674759295, + "grad_norm": 0.9592381292109499, + "learning_rate": 4.545133348798677e-06, + "loss": 0.1758, + "step": 2554 + }, + { + "epoch": 0.23540793292486295, + "grad_norm": 1.0261072582249307, + "learning_rate": 4.54469512024912e-06, + "loss": 0.1713, + "step": 2555 + }, + { + "epoch": 0.23550006910213295, + "grad_norm": 0.9247050273249168, + "learning_rate": 4.5442567018494e-06, + "loss": 0.1625, + "step": 2556 + }, + { + "epoch": 0.23559220527940294, + "grad_norm": 0.8985845157903278, + "learning_rate": 4.543818093640226e-06, + "loss": 0.174, + "step": 2557 + }, + { + "epoch": 0.23568434145667297, + "grad_norm": 0.9681388653177704, + "learning_rate": 4.543379295662322e-06, + "loss": 0.1713, + "step": 2558 + }, + { + "epoch": 0.23577647763394297, + "grad_norm": 0.9448366597258528, + "learning_rate": 4.54294030795643e-06, + "loss": 0.1573, + "step": 2559 + }, + { + "epoch": 0.23586861381121296, + "grad_norm": 0.9314244476538863, + "learning_rate": 4.5425011305633106e-06, + "loss": 0.1792, + "step": 2560 + }, + { + "epoch": 0.235960749988483, + "grad_norm": 1.0225904621056787, + "learning_rate": 4.542061763523742e-06, + "loss": 0.1756, + "step": 2561 + }, + { + "epoch": 0.23605288616575298, + "grad_norm": 0.9305300082933172, + "learning_rate": 4.541622206878519e-06, + "loss": 0.1614, + "step": 2562 + }, + { + "epoch": 0.23614502234302298, + "grad_norm": 0.9248622005950687, + "learning_rate": 4.541182460668453e-06, + "loss": 0.1735, + "step": 2563 + }, + { + "epoch": 0.236237158520293, + "grad_norm": 0.9311672044363726, + "learning_rate": 4.540742524934377e-06, + "loss": 0.1728, + "step": 2564 + }, + { + "epoch": 0.236329294697563, + "grad_norm": 0.9839928437689336, + "learning_rate": 4.540302399717138e-06, + "loss": 0.1706, + "step": 2565 + }, + { + "epoch": 0.236421430874833, + "grad_norm": 0.9426141389876795, + "learning_rate": 4.5398620850576016e-06, + "loss": 0.162, + "step": 2566 + }, + { + "epoch": 0.236513567052103, + "grad_norm": 0.8881256101693608, + "learning_rate": 4.53942158099665e-06, + "loss": 0.1562, + "step": 2567 + }, + { + "epoch": 0.23660570322937302, + "grad_norm": 0.9105265463755824, + "learning_rate": 4.538980887575187e-06, + "loss": 0.1598, + "step": 2568 + }, + { + "epoch": 0.23669783940664302, + "grad_norm": 0.8733143352802549, + "learning_rate": 4.538540004834128e-06, + "loss": 0.155, + "step": 2569 + }, + { + "epoch": 0.23678997558391301, + "grad_norm": 0.885143091633039, + "learning_rate": 4.53809893281441e-06, + "loss": 0.1525, + "step": 2570 + }, + { + "epoch": 0.23688211176118304, + "grad_norm": 0.9585313460757532, + "learning_rate": 4.537657671556987e-06, + "loss": 0.1646, + "step": 2571 + }, + { + "epoch": 0.23697424793845304, + "grad_norm": 0.9948560820566639, + "learning_rate": 4.5372162211028305e-06, + "loss": 0.1807, + "step": 2572 + }, + { + "epoch": 0.23706638411572303, + "grad_norm": 0.9080172248460118, + "learning_rate": 4.536774581492928e-06, + "loss": 0.1494, + "step": 2573 + }, + { + "epoch": 0.23715852029299303, + "grad_norm": 0.9556030654445338, + "learning_rate": 4.5363327527682855e-06, + "loss": 0.1593, + "step": 2574 + }, + { + "epoch": 0.23725065647026305, + "grad_norm": 0.919600867467294, + "learning_rate": 4.535890734969929e-06, + "loss": 0.1704, + "step": 2575 + }, + { + "epoch": 0.23734279264753305, + "grad_norm": 1.0317406838521088, + "learning_rate": 4.535448528138899e-06, + "loss": 0.176, + "step": 2576 + }, + { + "epoch": 0.23743492882480305, + "grad_norm": 0.943160391590626, + "learning_rate": 4.535006132316253e-06, + "loss": 0.1911, + "step": 2577 + }, + { + "epoch": 0.23752706500207307, + "grad_norm": 0.8917538056406878, + "learning_rate": 4.534563547543069e-06, + "loss": 0.1606, + "step": 2578 + }, + { + "epoch": 0.23761920117934307, + "grad_norm": 0.9851079183758169, + "learning_rate": 4.53412077386044e-06, + "loss": 0.1817, + "step": 2579 + }, + { + "epoch": 0.23771133735661307, + "grad_norm": 0.9027548501120674, + "learning_rate": 4.533677811309479e-06, + "loss": 0.1672, + "step": 2580 + }, + { + "epoch": 0.2378034735338831, + "grad_norm": 0.945909808985898, + "learning_rate": 4.533234659931315e-06, + "loss": 0.1845, + "step": 2581 + }, + { + "epoch": 0.2378956097111531, + "grad_norm": 0.8521446168221071, + "learning_rate": 4.532791319767093e-06, + "loss": 0.1434, + "step": 2582 + }, + { + "epoch": 0.23798774588842309, + "grad_norm": 0.9503670223175602, + "learning_rate": 4.532347790857978e-06, + "loss": 0.1808, + "step": 2583 + }, + { + "epoch": 0.23807988206569308, + "grad_norm": 0.9063415363527835, + "learning_rate": 4.531904073245152e-06, + "loss": 0.1701, + "step": 2584 + }, + { + "epoch": 0.2381720182429631, + "grad_norm": 0.8789860283816323, + "learning_rate": 4.531460166969816e-06, + "loss": 0.1668, + "step": 2585 + }, + { + "epoch": 0.2382641544202331, + "grad_norm": 0.9391306489506216, + "learning_rate": 4.531016072073182e-06, + "loss": 0.1697, + "step": 2586 + }, + { + "epoch": 0.2383562905975031, + "grad_norm": 0.9586292371328816, + "learning_rate": 4.530571788596489e-06, + "loss": 0.166, + "step": 2587 + }, + { + "epoch": 0.23844842677477313, + "grad_norm": 0.9708360436328097, + "learning_rate": 4.530127316580986e-06, + "loss": 0.1775, + "step": 2588 + }, + { + "epoch": 0.23854056295204312, + "grad_norm": 0.8840283994456286, + "learning_rate": 4.5296826560679445e-06, + "loss": 0.1493, + "step": 2589 + }, + { + "epoch": 0.23863269912931312, + "grad_norm": 0.9292858225886818, + "learning_rate": 4.529237807098649e-06, + "loss": 0.1625, + "step": 2590 + }, + { + "epoch": 0.23872483530658312, + "grad_norm": 0.8977136056150155, + "learning_rate": 4.528792769714404e-06, + "loss": 0.1758, + "step": 2591 + }, + { + "epoch": 0.23881697148385314, + "grad_norm": 1.0151817811358843, + "learning_rate": 4.528347543956533e-06, + "loss": 0.1596, + "step": 2592 + }, + { + "epoch": 0.23890910766112314, + "grad_norm": 0.9335523901294361, + "learning_rate": 4.527902129866374e-06, + "loss": 0.1732, + "step": 2593 + }, + { + "epoch": 0.23900124383839313, + "grad_norm": 0.9256864277979158, + "learning_rate": 4.527456527485284e-06, + "loss": 0.1683, + "step": 2594 + }, + { + "epoch": 0.23909338001566316, + "grad_norm": 1.0210336286294275, + "learning_rate": 4.527010736854637e-06, + "loss": 0.1601, + "step": 2595 + }, + { + "epoch": 0.23918551619293316, + "grad_norm": 0.960562630209394, + "learning_rate": 4.526564758015825e-06, + "loss": 0.1597, + "step": 2596 + }, + { + "epoch": 0.23927765237020315, + "grad_norm": 0.9923091348049548, + "learning_rate": 4.5261185910102575e-06, + "loss": 0.1749, + "step": 2597 + }, + { + "epoch": 0.23936978854747318, + "grad_norm": 0.9185996987364619, + "learning_rate": 4.525672235879361e-06, + "loss": 0.1757, + "step": 2598 + }, + { + "epoch": 0.23946192472474317, + "grad_norm": 0.9504821525147511, + "learning_rate": 4.5252256926645786e-06, + "loss": 0.1733, + "step": 2599 + }, + { + "epoch": 0.23955406090201317, + "grad_norm": 0.8817193190901266, + "learning_rate": 4.5247789614073725e-06, + "loss": 0.1518, + "step": 2600 + }, + { + "epoch": 0.23964619707928317, + "grad_norm": 0.8998627255833563, + "learning_rate": 4.524332042149223e-06, + "loss": 0.1598, + "step": 2601 + }, + { + "epoch": 0.2397383332565532, + "grad_norm": 0.8916195806919052, + "learning_rate": 4.523884934931624e-06, + "loss": 0.1536, + "step": 2602 + }, + { + "epoch": 0.2398304694338232, + "grad_norm": 0.8969910004039, + "learning_rate": 4.523437639796092e-06, + "loss": 0.1558, + "step": 2603 + }, + { + "epoch": 0.2399226056110932, + "grad_norm": 0.9579370144538515, + "learning_rate": 4.522990156784157e-06, + "loss": 0.1696, + "step": 2604 + }, + { + "epoch": 0.2400147417883632, + "grad_norm": 0.9256485173070821, + "learning_rate": 4.522542485937369e-06, + "loss": 0.165, + "step": 2605 + }, + { + "epoch": 0.2401068779656332, + "grad_norm": 0.8763097473674882, + "learning_rate": 4.522094627297293e-06, + "loss": 0.1518, + "step": 2606 + }, + { + "epoch": 0.2401990141429032, + "grad_norm": 0.9590186614529309, + "learning_rate": 4.521646580905513e-06, + "loss": 0.1663, + "step": 2607 + }, + { + "epoch": 0.2402911503201732, + "grad_norm": 0.8953321886812519, + "learning_rate": 4.521198346803631e-06, + "loss": 0.156, + "step": 2608 + }, + { + "epoch": 0.24038328649744323, + "grad_norm": 0.9207768164702115, + "learning_rate": 4.520749925033264e-06, + "loss": 0.1607, + "step": 2609 + }, + { + "epoch": 0.24047542267471322, + "grad_norm": 0.919939995459796, + "learning_rate": 4.52030131563605e-06, + "loss": 0.1725, + "step": 2610 + }, + { + "epoch": 0.24056755885198322, + "grad_norm": 0.9646077134116254, + "learning_rate": 4.519852518653641e-06, + "loss": 0.167, + "step": 2611 + }, + { + "epoch": 0.24065969502925325, + "grad_norm": 0.905875090197065, + "learning_rate": 4.519403534127709e-06, + "loss": 0.1651, + "step": 2612 + }, + { + "epoch": 0.24075183120652324, + "grad_norm": 0.9733708732781958, + "learning_rate": 4.51895436209994e-06, + "loss": 0.1682, + "step": 2613 + }, + { + "epoch": 0.24084396738379324, + "grad_norm": 0.9138941832811526, + "learning_rate": 4.5185050026120425e-06, + "loss": 0.1581, + "step": 2614 + }, + { + "epoch": 0.24093610356106326, + "grad_norm": 0.8714836624140244, + "learning_rate": 4.5180554557057376e-06, + "loss": 0.1607, + "step": 2615 + }, + { + "epoch": 0.24102823973833326, + "grad_norm": 0.896212162592746, + "learning_rate": 4.5176057214227665e-06, + "loss": 0.1557, + "step": 2616 + }, + { + "epoch": 0.24112037591560326, + "grad_norm": 0.9795900304013812, + "learning_rate": 4.517155799804888e-06, + "loss": 0.1594, + "step": 2617 + }, + { + "epoch": 0.24121251209287325, + "grad_norm": 0.9620751253748585, + "learning_rate": 4.516705690893874e-06, + "loss": 0.1746, + "step": 2618 + }, + { + "epoch": 0.24130464827014328, + "grad_norm": 0.9766714887522009, + "learning_rate": 4.516255394731522e-06, + "loss": 0.1655, + "step": 2619 + }, + { + "epoch": 0.24139678444741328, + "grad_norm": 0.9644944957480073, + "learning_rate": 4.515804911359639e-06, + "loss": 0.1604, + "step": 2620 + }, + { + "epoch": 0.24148892062468327, + "grad_norm": 1.0028858940407497, + "learning_rate": 4.5153542408200524e-06, + "loss": 0.1666, + "step": 2621 + }, + { + "epoch": 0.2415810568019533, + "grad_norm": 0.9909913147953266, + "learning_rate": 4.514903383154608e-06, + "loss": 0.1715, + "step": 2622 + }, + { + "epoch": 0.2416731929792233, + "grad_norm": 0.9720807397909752, + "learning_rate": 4.5144523384051675e-06, + "loss": 0.1704, + "step": 2623 + }, + { + "epoch": 0.2417653291564933, + "grad_norm": 0.9983520531859589, + "learning_rate": 4.514001106613611e-06, + "loss": 0.1673, + "step": 2624 + }, + { + "epoch": 0.2418574653337633, + "grad_norm": 0.871023507107817, + "learning_rate": 4.513549687821834e-06, + "loss": 0.1598, + "step": 2625 + }, + { + "epoch": 0.2419496015110333, + "grad_norm": 0.875727688350462, + "learning_rate": 4.513098082071753e-06, + "loss": 0.1796, + "step": 2626 + }, + { + "epoch": 0.2420417376883033, + "grad_norm": 0.9984200375498948, + "learning_rate": 4.512646289405298e-06, + "loss": 0.1737, + "step": 2627 + }, + { + "epoch": 0.2421338738655733, + "grad_norm": 0.9219824808671676, + "learning_rate": 4.5121943098644185e-06, + "loss": 0.1727, + "step": 2628 + }, + { + "epoch": 0.24222601004284333, + "grad_norm": 0.903236560497671, + "learning_rate": 4.5117421434910805e-06, + "loss": 0.1563, + "step": 2629 + }, + { + "epoch": 0.24231814622011333, + "grad_norm": 1.1088373746993974, + "learning_rate": 4.511289790327268e-06, + "loss": 0.1795, + "step": 2630 + }, + { + "epoch": 0.24241028239738333, + "grad_norm": 0.9099632964748022, + "learning_rate": 4.510837250414982e-06, + "loss": 0.1802, + "step": 2631 + }, + { + "epoch": 0.24250241857465335, + "grad_norm": 0.9625764621719629, + "learning_rate": 4.5103845237962405e-06, + "loss": 0.1824, + "step": 2632 + }, + { + "epoch": 0.24259455475192335, + "grad_norm": 0.9045197623683646, + "learning_rate": 4.509931610513081e-06, + "loss": 0.1535, + "step": 2633 + }, + { + "epoch": 0.24268669092919334, + "grad_norm": 0.8883996221321223, + "learning_rate": 4.509478510607553e-06, + "loss": 0.162, + "step": 2634 + }, + { + "epoch": 0.24277882710646334, + "grad_norm": 0.8733565291886327, + "learning_rate": 4.509025224121732e-06, + "loss": 0.1488, + "step": 2635 + }, + { + "epoch": 0.24287096328373337, + "grad_norm": 0.9323593147387864, + "learning_rate": 4.5085717510977e-06, + "loss": 0.1832, + "step": 2636 + }, + { + "epoch": 0.24296309946100336, + "grad_norm": 0.8330195920047772, + "learning_rate": 4.508118091577566e-06, + "loss": 0.1578, + "step": 2637 + }, + { + "epoch": 0.24305523563827336, + "grad_norm": 1.0178111918491635, + "learning_rate": 4.507664245603451e-06, + "loss": 0.1661, + "step": 2638 + }, + { + "epoch": 0.24314737181554338, + "grad_norm": 0.9034888786182202, + "learning_rate": 4.507210213217495e-06, + "loss": 0.1695, + "step": 2639 + }, + { + "epoch": 0.24323950799281338, + "grad_norm": 0.9159922991117291, + "learning_rate": 4.506755994461853e-06, + "loss": 0.1803, + "step": 2640 + }, + { + "epoch": 0.24333164417008338, + "grad_norm": 0.9496166120744165, + "learning_rate": 4.506301589378703e-06, + "loss": 0.1754, + "step": 2641 + }, + { + "epoch": 0.24342378034735337, + "grad_norm": 0.927288722206701, + "learning_rate": 4.5058469980102336e-06, + "loss": 0.1608, + "step": 2642 + }, + { + "epoch": 0.2435159165246234, + "grad_norm": 0.9357101673472532, + "learning_rate": 4.505392220398655e-06, + "loss": 0.1743, + "step": 2643 + }, + { + "epoch": 0.2436080527018934, + "grad_norm": 0.8967894186666624, + "learning_rate": 4.504937256586192e-06, + "loss": 0.1614, + "step": 2644 + }, + { + "epoch": 0.2437001888791634, + "grad_norm": 1.008390819829681, + "learning_rate": 4.50448210661509e-06, + "loss": 0.18, + "step": 2645 + }, + { + "epoch": 0.24379232505643342, + "grad_norm": 0.8959777511685124, + "learning_rate": 4.504026770527607e-06, + "loss": 0.1625, + "step": 2646 + }, + { + "epoch": 0.24388446123370341, + "grad_norm": 0.883975734164676, + "learning_rate": 4.503571248366024e-06, + "loss": 0.1644, + "step": 2647 + }, + { + "epoch": 0.2439765974109734, + "grad_norm": 0.9447762260150692, + "learning_rate": 4.503115540172636e-06, + "loss": 0.1766, + "step": 2648 + }, + { + "epoch": 0.24406873358824344, + "grad_norm": 0.9023969503174291, + "learning_rate": 4.502659645989753e-06, + "loss": 0.1517, + "step": 2649 + }, + { + "epoch": 0.24416086976551343, + "grad_norm": 0.8786423359540836, + "learning_rate": 4.502203565859706e-06, + "loss": 0.1548, + "step": 2650 + }, + { + "epoch": 0.24425300594278343, + "grad_norm": 0.9129953675126846, + "learning_rate": 4.501747299824843e-06, + "loss": 0.1769, + "step": 2651 + }, + { + "epoch": 0.24434514212005343, + "grad_norm": 0.8685998545587346, + "learning_rate": 4.501290847927529e-06, + "loss": 0.158, + "step": 2652 + }, + { + "epoch": 0.24443727829732345, + "grad_norm": 0.9169127256515894, + "learning_rate": 4.500834210210143e-06, + "loss": 0.1536, + "step": 2653 + }, + { + "epoch": 0.24452941447459345, + "grad_norm": 0.9004372536575974, + "learning_rate": 4.500377386715086e-06, + "loss": 0.1561, + "step": 2654 + }, + { + "epoch": 0.24462155065186345, + "grad_norm": 0.9171923195813083, + "learning_rate": 4.499920377484772e-06, + "loss": 0.1561, + "step": 2655 + }, + { + "epoch": 0.24471368682913347, + "grad_norm": 0.9970316417826179, + "learning_rate": 4.499463182561637e-06, + "loss": 0.1781, + "step": 2656 + }, + { + "epoch": 0.24480582300640347, + "grad_norm": 0.8986690107368408, + "learning_rate": 4.49900580198813e-06, + "loss": 0.163, + "step": 2657 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.8609935269764124, + "learning_rate": 4.498548235806719e-06, + "loss": 0.1552, + "step": 2658 + }, + { + "epoch": 0.24499009536094346, + "grad_norm": 0.8736993468630774, + "learning_rate": 4.4980904840598894e-06, + "loss": 0.1552, + "step": 2659 + }, + { + "epoch": 0.24508223153821349, + "grad_norm": 0.9081273596019058, + "learning_rate": 4.497632546790143e-06, + "loss": 0.1512, + "step": 2660 + }, + { + "epoch": 0.24517436771548348, + "grad_norm": 0.9102031151269082, + "learning_rate": 4.49717442404e-06, + "loss": 0.1595, + "step": 2661 + }, + { + "epoch": 0.24526650389275348, + "grad_norm": 0.9021316326942003, + "learning_rate": 4.496716115851996e-06, + "loss": 0.1717, + "step": 2662 + }, + { + "epoch": 0.2453586400700235, + "grad_norm": 0.9719049140168042, + "learning_rate": 4.496257622268687e-06, + "loss": 0.1615, + "step": 2663 + }, + { + "epoch": 0.2454507762472935, + "grad_norm": 0.9727092079229075, + "learning_rate": 4.495798943332642e-06, + "loss": 0.1579, + "step": 2664 + }, + { + "epoch": 0.2455429124245635, + "grad_norm": 0.8986708563300652, + "learning_rate": 4.495340079086451e-06, + "loss": 0.169, + "step": 2665 + }, + { + "epoch": 0.24563504860183352, + "grad_norm": 0.949471301620958, + "learning_rate": 4.494881029572718e-06, + "loss": 0.1704, + "step": 2666 + }, + { + "epoch": 0.24572718477910352, + "grad_norm": 0.9152647484984884, + "learning_rate": 4.494421794834068e-06, + "loss": 0.1605, + "step": 2667 + }, + { + "epoch": 0.24581932095637352, + "grad_norm": 1.0020699225934224, + "learning_rate": 4.4939623749131385e-06, + "loss": 0.1787, + "step": 2668 + }, + { + "epoch": 0.2459114571336435, + "grad_norm": 0.9225724208528915, + "learning_rate": 4.493502769852589e-06, + "loss": 0.1645, + "step": 2669 + }, + { + "epoch": 0.24600359331091354, + "grad_norm": 0.9282588859001433, + "learning_rate": 4.493042979695092e-06, + "loss": 0.1706, + "step": 2670 + }, + { + "epoch": 0.24609572948818353, + "grad_norm": 0.950681578689861, + "learning_rate": 4.4925830044833405e-06, + "loss": 0.1632, + "step": 2671 + }, + { + "epoch": 0.24618786566545353, + "grad_norm": 0.9578740886589785, + "learning_rate": 4.492122844260042e-06, + "loss": 0.1645, + "step": 2672 + }, + { + "epoch": 0.24628000184272356, + "grad_norm": 0.9183491188538061, + "learning_rate": 4.491662499067923e-06, + "loss": 0.1684, + "step": 2673 + }, + { + "epoch": 0.24637213801999355, + "grad_norm": 0.886920316373167, + "learning_rate": 4.491201968949726e-06, + "loss": 0.1425, + "step": 2674 + }, + { + "epoch": 0.24646427419726355, + "grad_norm": 0.9515433294511995, + "learning_rate": 4.490741253948213e-06, + "loss": 0.1617, + "step": 2675 + }, + { + "epoch": 0.24655641037453357, + "grad_norm": 0.9513213792540711, + "learning_rate": 4.49028035410616e-06, + "loss": 0.161, + "step": 2676 + }, + { + "epoch": 0.24664854655180357, + "grad_norm": 0.9553852848159456, + "learning_rate": 4.489819269466362e-06, + "loss": 0.1646, + "step": 2677 + }, + { + "epoch": 0.24674068272907357, + "grad_norm": 1.0489521515871678, + "learning_rate": 4.489358000071631e-06, + "loss": 0.1591, + "step": 2678 + }, + { + "epoch": 0.24683281890634357, + "grad_norm": 0.9143172794273362, + "learning_rate": 4.488896545964795e-06, + "loss": 0.1642, + "step": 2679 + }, + { + "epoch": 0.2469249550836136, + "grad_norm": 0.8760022568204034, + "learning_rate": 4.4884349071887e-06, + "loss": 0.1506, + "step": 2680 + }, + { + "epoch": 0.2470170912608836, + "grad_norm": 0.9208733869067626, + "learning_rate": 4.487973083786211e-06, + "loss": 0.1668, + "step": 2681 + }, + { + "epoch": 0.24710922743815358, + "grad_norm": 0.940764657196804, + "learning_rate": 4.4875110758002076e-06, + "loss": 0.1731, + "step": 2682 + }, + { + "epoch": 0.2472013636154236, + "grad_norm": 0.975836350447163, + "learning_rate": 4.487048883273586e-06, + "loss": 0.162, + "step": 2683 + }, + { + "epoch": 0.2472934997926936, + "grad_norm": 0.9104989847581315, + "learning_rate": 4.486586506249262e-06, + "loss": 0.1656, + "step": 2684 + }, + { + "epoch": 0.2473856359699636, + "grad_norm": 0.8994565355652719, + "learning_rate": 4.486123944770166e-06, + "loss": 0.1649, + "step": 2685 + }, + { + "epoch": 0.2474777721472336, + "grad_norm": 0.9958969115514426, + "learning_rate": 4.48566119887925e-06, + "loss": 0.1773, + "step": 2686 + }, + { + "epoch": 0.24756990832450362, + "grad_norm": 0.8853726143372311, + "learning_rate": 4.4851982686194775e-06, + "loss": 0.1567, + "step": 2687 + }, + { + "epoch": 0.24766204450177362, + "grad_norm": 0.9261982366343027, + "learning_rate": 4.484735154033831e-06, + "loss": 0.1593, + "step": 2688 + }, + { + "epoch": 0.24775418067904362, + "grad_norm": 0.9154568975953823, + "learning_rate": 4.484271855165312e-06, + "loss": 0.1694, + "step": 2689 + }, + { + "epoch": 0.24784631685631364, + "grad_norm": 0.9484967646834224, + "learning_rate": 4.483808372056939e-06, + "loss": 0.1714, + "step": 2690 + }, + { + "epoch": 0.24793845303358364, + "grad_norm": 0.9634197695354942, + "learning_rate": 4.483344704751745e-06, + "loss": 0.1791, + "step": 2691 + }, + { + "epoch": 0.24803058921085364, + "grad_norm": 0.9700386725862131, + "learning_rate": 4.48288085329278e-06, + "loss": 0.1642, + "step": 2692 + }, + { + "epoch": 0.24812272538812366, + "grad_norm": 0.8622018190901424, + "learning_rate": 4.482416817723115e-06, + "loss": 0.1619, + "step": 2693 + }, + { + "epoch": 0.24821486156539366, + "grad_norm": 0.9328281803476557, + "learning_rate": 4.481952598085836e-06, + "loss": 0.1816, + "step": 2694 + }, + { + "epoch": 0.24830699774266365, + "grad_norm": 0.8818479936219914, + "learning_rate": 4.481488194424044e-06, + "loss": 0.1502, + "step": 2695 + }, + { + "epoch": 0.24839913391993365, + "grad_norm": 0.9726880839105281, + "learning_rate": 4.481023606780861e-06, + "loss": 0.1681, + "step": 2696 + }, + { + "epoch": 0.24849127009720368, + "grad_norm": 0.9194992416431299, + "learning_rate": 4.480558835199422e-06, + "loss": 0.1611, + "step": 2697 + }, + { + "epoch": 0.24858340627447367, + "grad_norm": 0.8483395943971769, + "learning_rate": 4.4800938797228825e-06, + "loss": 0.1508, + "step": 2698 + }, + { + "epoch": 0.24867554245174367, + "grad_norm": 0.863379180784911, + "learning_rate": 4.479628740394412e-06, + "loss": 0.1424, + "step": 2699 + }, + { + "epoch": 0.2487676786290137, + "grad_norm": 0.918067359903417, + "learning_rate": 4.4791634172572015e-06, + "loss": 0.1557, + "step": 2700 + }, + { + "epoch": 0.2488598148062837, + "grad_norm": 0.9248335136551753, + "learning_rate": 4.478697910354455e-06, + "loss": 0.1647, + "step": 2701 + }, + { + "epoch": 0.2489519509835537, + "grad_norm": 0.9227062178559122, + "learning_rate": 4.4782322197293935e-06, + "loss": 0.1508, + "step": 2702 + }, + { + "epoch": 0.24904408716082369, + "grad_norm": 0.9355481138879411, + "learning_rate": 4.477766345425257e-06, + "loss": 0.1669, + "step": 2703 + }, + { + "epoch": 0.2491362233380937, + "grad_norm": 0.9262215971301258, + "learning_rate": 4.4773002874853035e-06, + "loss": 0.166, + "step": 2704 + }, + { + "epoch": 0.2492283595153637, + "grad_norm": 0.939773739473119, + "learning_rate": 4.476834045952805e-06, + "loss": 0.1692, + "step": 2705 + }, + { + "epoch": 0.2493204956926337, + "grad_norm": 0.9233835051003554, + "learning_rate": 4.476367620871053e-06, + "loss": 0.1678, + "step": 2706 + }, + { + "epoch": 0.24941263186990373, + "grad_norm": 0.9959179911656888, + "learning_rate": 4.475901012283354e-06, + "loss": 0.1719, + "step": 2707 + }, + { + "epoch": 0.24950476804717373, + "grad_norm": 0.9906403922433501, + "learning_rate": 4.475434220233034e-06, + "loss": 0.171, + "step": 2708 + }, + { + "epoch": 0.24959690422444372, + "grad_norm": 0.9654043551251009, + "learning_rate": 4.474967244763434e-06, + "loss": 0.1524, + "step": 2709 + }, + { + "epoch": 0.24968904040171375, + "grad_norm": 0.8859687185368176, + "learning_rate": 4.474500085917912e-06, + "loss": 0.1626, + "step": 2710 + }, + { + "epoch": 0.24978117657898374, + "grad_norm": 0.9742371995625955, + "learning_rate": 4.474032743739846e-06, + "loss": 0.1681, + "step": 2711 + }, + { + "epoch": 0.24987331275625374, + "grad_norm": 0.8800994362044134, + "learning_rate": 4.4735652182726265e-06, + "loss": 0.1534, + "step": 2712 + }, + { + "epoch": 0.24996544893352374, + "grad_norm": 0.9751280188862264, + "learning_rate": 4.473097509559664e-06, + "loss": 0.1629, + "step": 2713 + }, + { + "epoch": 0.25005758511079373, + "grad_norm": 0.9597390984328364, + "learning_rate": 4.472629617644385e-06, + "loss": 0.1634, + "step": 2714 + }, + { + "epoch": 0.25014972128806373, + "grad_norm": 0.8782469623492604, + "learning_rate": 4.472161542570234e-06, + "loss": 0.1465, + "step": 2715 + }, + { + "epoch": 0.2502418574653338, + "grad_norm": 1.0126355329165513, + "learning_rate": 4.4716932843806715e-06, + "loss": 0.1885, + "step": 2716 + }, + { + "epoch": 0.2503339936426038, + "grad_norm": 0.9689860924375248, + "learning_rate": 4.471224843119176e-06, + "loss": 0.1622, + "step": 2717 + }, + { + "epoch": 0.2504261298198738, + "grad_norm": 1.0502120884934425, + "learning_rate": 4.470756218829241e-06, + "loss": 0.1806, + "step": 2718 + }, + { + "epoch": 0.2505182659971438, + "grad_norm": 0.8987777249287304, + "learning_rate": 4.470287411554379e-06, + "loss": 0.1517, + "step": 2719 + }, + { + "epoch": 0.25061040217441377, + "grad_norm": 1.0238603313699401, + "learning_rate": 4.469818421338119e-06, + "loss": 0.1562, + "step": 2720 + }, + { + "epoch": 0.25070253835168377, + "grad_norm": 1.0334880871837124, + "learning_rate": 4.469349248224007e-06, + "loss": 0.1943, + "step": 2721 + }, + { + "epoch": 0.2507946745289538, + "grad_norm": 0.9433556577855376, + "learning_rate": 4.468879892255604e-06, + "loss": 0.155, + "step": 2722 + }, + { + "epoch": 0.2508868107062238, + "grad_norm": 0.9291327476122262, + "learning_rate": 4.4684103534764925e-06, + "loss": 0.1684, + "step": 2723 + }, + { + "epoch": 0.2509789468834938, + "grad_norm": 0.9294984576948387, + "learning_rate": 4.467940631930267e-06, + "loss": 0.1643, + "step": 2724 + }, + { + "epoch": 0.2510710830607638, + "grad_norm": 0.8568728555781507, + "learning_rate": 4.467470727660543e-06, + "loss": 0.161, + "step": 2725 + }, + { + "epoch": 0.2511632192380338, + "grad_norm": 0.9282762970010767, + "learning_rate": 4.467000640710949e-06, + "loss": 0.1657, + "step": 2726 + }, + { + "epoch": 0.2512553554153038, + "grad_norm": 0.9137764648161802, + "learning_rate": 4.466530371125135e-06, + "loss": 0.1627, + "step": 2727 + }, + { + "epoch": 0.2513474915925738, + "grad_norm": 0.9720124135320236, + "learning_rate": 4.4660599189467634e-06, + "loss": 0.1748, + "step": 2728 + }, + { + "epoch": 0.25143962776984385, + "grad_norm": 1.0387321685221882, + "learning_rate": 4.465589284219517e-06, + "loss": 0.1823, + "step": 2729 + }, + { + "epoch": 0.25153176394711385, + "grad_norm": 0.953514571101409, + "learning_rate": 4.465118466987094e-06, + "loss": 0.1749, + "step": 2730 + }, + { + "epoch": 0.25162390012438385, + "grad_norm": 0.9653536317377608, + "learning_rate": 4.4646474672932105e-06, + "loss": 0.1688, + "step": 2731 + }, + { + "epoch": 0.25171603630165384, + "grad_norm": 0.9214114453191518, + "learning_rate": 4.464176285181597e-06, + "loss": 0.1707, + "step": 2732 + }, + { + "epoch": 0.25180817247892384, + "grad_norm": 0.9310991093879136, + "learning_rate": 4.4637049206960055e-06, + "loss": 0.1664, + "step": 2733 + }, + { + "epoch": 0.25190030865619384, + "grad_norm": 0.8764637830155427, + "learning_rate": 4.4632333738802e-06, + "loss": 0.1636, + "step": 2734 + }, + { + "epoch": 0.25199244483346384, + "grad_norm": 0.8625927177967262, + "learning_rate": 4.462761644777964e-06, + "loss": 0.1619, + "step": 2735 + }, + { + "epoch": 0.2520845810107339, + "grad_norm": 0.90143610722121, + "learning_rate": 4.4622897334330985e-06, + "loss": 0.1643, + "step": 2736 + }, + { + "epoch": 0.2521767171880039, + "grad_norm": 0.9695051236534774, + "learning_rate": 4.4618176398894205e-06, + "loss": 0.1621, + "step": 2737 + }, + { + "epoch": 0.2522688533652739, + "grad_norm": 0.9623621029230007, + "learning_rate": 4.4613453641907634e-06, + "loss": 0.169, + "step": 2738 + }, + { + "epoch": 0.2523609895425439, + "grad_norm": 0.9804724158084682, + "learning_rate": 4.460872906380977e-06, + "loss": 0.1667, + "step": 2739 + }, + { + "epoch": 0.2524531257198139, + "grad_norm": 0.9440309923131297, + "learning_rate": 4.460400266503932e-06, + "loss": 0.1788, + "step": 2740 + }, + { + "epoch": 0.2525452618970839, + "grad_norm": 0.8943689360394186, + "learning_rate": 4.4599274446035104e-06, + "loss": 0.1634, + "step": 2741 + }, + { + "epoch": 0.25263739807435387, + "grad_norm": 0.9078445150328539, + "learning_rate": 4.459454440723614e-06, + "loss": 0.1674, + "step": 2742 + }, + { + "epoch": 0.2527295342516239, + "grad_norm": 1.0362519468921287, + "learning_rate": 4.4589812549081624e-06, + "loss": 0.1754, + "step": 2743 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 0.9085550271912034, + "learning_rate": 4.458507887201091e-06, + "loss": 0.1622, + "step": 2744 + }, + { + "epoch": 0.2529138066061639, + "grad_norm": 0.9723393303756631, + "learning_rate": 4.458034337646351e-06, + "loss": 0.1717, + "step": 2745 + }, + { + "epoch": 0.2530059427834339, + "grad_norm": 0.8992954032616152, + "learning_rate": 4.4575606062879115e-06, + "loss": 0.1591, + "step": 2746 + }, + { + "epoch": 0.2530980789607039, + "grad_norm": 0.94354265969286, + "learning_rate": 4.45708669316976e-06, + "loss": 0.167, + "step": 2747 + }, + { + "epoch": 0.2531902151379739, + "grad_norm": 0.9230334147360164, + "learning_rate": 4.456612598335898e-06, + "loss": 0.1684, + "step": 2748 + }, + { + "epoch": 0.2532823513152439, + "grad_norm": 0.8605421993979968, + "learning_rate": 4.4561383218303455e-06, + "loss": 0.1448, + "step": 2749 + }, + { + "epoch": 0.25337448749251396, + "grad_norm": 0.9734131686816277, + "learning_rate": 4.45566386369714e-06, + "loss": 0.1796, + "step": 2750 + }, + { + "epoch": 0.25346662366978395, + "grad_norm": 0.9752093359656303, + "learning_rate": 4.455189223980333e-06, + "loss": 0.1666, + "step": 2751 + }, + { + "epoch": 0.25355875984705395, + "grad_norm": 0.9671629645895793, + "learning_rate": 4.454714402723997e-06, + "loss": 0.1672, + "step": 2752 + }, + { + "epoch": 0.25365089602432395, + "grad_norm": 0.9912354137997301, + "learning_rate": 4.4542393999722184e-06, + "loss": 0.1749, + "step": 2753 + }, + { + "epoch": 0.25374303220159394, + "grad_norm": 0.9629544614386797, + "learning_rate": 4.453764215769101e-06, + "loss": 0.1617, + "step": 2754 + }, + { + "epoch": 0.25383516837886394, + "grad_norm": 0.903837560697107, + "learning_rate": 4.4532888501587655e-06, + "loss": 0.1691, + "step": 2755 + }, + { + "epoch": 0.253927304556134, + "grad_norm": 0.9369456487961056, + "learning_rate": 4.452813303185351e-06, + "loss": 0.1628, + "step": 2756 + }, + { + "epoch": 0.254019440733404, + "grad_norm": 0.9460665893734913, + "learning_rate": 4.452337574893011e-06, + "loss": 0.1602, + "step": 2757 + }, + { + "epoch": 0.254111576910674, + "grad_norm": 0.8699779142398438, + "learning_rate": 4.451861665325916e-06, + "loss": 0.1473, + "step": 2758 + }, + { + "epoch": 0.254203713087944, + "grad_norm": 0.9002005591816477, + "learning_rate": 4.451385574528256e-06, + "loss": 0.1511, + "step": 2759 + }, + { + "epoch": 0.254295849265214, + "grad_norm": 0.9945308777773249, + "learning_rate": 4.450909302544235e-06, + "loss": 0.1645, + "step": 2760 + }, + { + "epoch": 0.254387985442484, + "grad_norm": 0.9477308554562681, + "learning_rate": 4.450432849418076e-06, + "loss": 0.1698, + "step": 2761 + }, + { + "epoch": 0.254480121619754, + "grad_norm": 0.975609887655622, + "learning_rate": 4.449956215194017e-06, + "loss": 0.1733, + "step": 2762 + }, + { + "epoch": 0.254572257797024, + "grad_norm": 1.045564596726802, + "learning_rate": 4.4494793999163125e-06, + "loss": 0.1485, + "step": 2763 + }, + { + "epoch": 0.254664393974294, + "grad_norm": 0.9691806406972214, + "learning_rate": 4.449002403629237e-06, + "loss": 0.1715, + "step": 2764 + }, + { + "epoch": 0.254756530151564, + "grad_norm": 0.8882406299274819, + "learning_rate": 4.448525226377078e-06, + "loss": 0.1599, + "step": 2765 + }, + { + "epoch": 0.254848666328834, + "grad_norm": 1.0071041342757574, + "learning_rate": 4.448047868204143e-06, + "loss": 0.1663, + "step": 2766 + }, + { + "epoch": 0.254940802506104, + "grad_norm": 0.9867383224082461, + "learning_rate": 4.447570329154752e-06, + "loss": 0.159, + "step": 2767 + }, + { + "epoch": 0.255032938683374, + "grad_norm": 0.9594025105717243, + "learning_rate": 4.447092609273248e-06, + "loss": 0.1672, + "step": 2768 + }, + { + "epoch": 0.255125074860644, + "grad_norm": 0.9711693026778082, + "learning_rate": 4.446614708603985e-06, + "loss": 0.175, + "step": 2769 + }, + { + "epoch": 0.25521721103791406, + "grad_norm": 0.9495706212576188, + "learning_rate": 4.446136627191337e-06, + "loss": 0.1688, + "step": 2770 + }, + { + "epoch": 0.25530934721518406, + "grad_norm": 0.8719052281328445, + "learning_rate": 4.445658365079693e-06, + "loss": 0.1548, + "step": 2771 + }, + { + "epoch": 0.25540148339245405, + "grad_norm": 0.9026874716661581, + "learning_rate": 4.4451799223134615e-06, + "loss": 0.1588, + "step": 2772 + }, + { + "epoch": 0.25549361956972405, + "grad_norm": 0.9205413159506892, + "learning_rate": 4.444701298937064e-06, + "loss": 0.1647, + "step": 2773 + }, + { + "epoch": 0.25558575574699405, + "grad_norm": 0.8853201950547462, + "learning_rate": 4.444222494994942e-06, + "loss": 0.1607, + "step": 2774 + }, + { + "epoch": 0.25567789192426404, + "grad_norm": 0.9112603810645689, + "learning_rate": 4.443743510531552e-06, + "loss": 0.1581, + "step": 2775 + }, + { + "epoch": 0.25577002810153404, + "grad_norm": 0.9430372314730556, + "learning_rate": 4.443264345591368e-06, + "loss": 0.1602, + "step": 2776 + }, + { + "epoch": 0.2558621642788041, + "grad_norm": 0.9940607095386159, + "learning_rate": 4.442785000218881e-06, + "loss": 0.1669, + "step": 2777 + }, + { + "epoch": 0.2559543004560741, + "grad_norm": 0.874978128077645, + "learning_rate": 4.442305474458596e-06, + "loss": 0.1598, + "step": 2778 + }, + { + "epoch": 0.2560464366333441, + "grad_norm": 0.870158754636262, + "learning_rate": 4.4418257683550405e-06, + "loss": 0.1537, + "step": 2779 + }, + { + "epoch": 0.2561385728106141, + "grad_norm": 0.9790466686822947, + "learning_rate": 4.441345881952752e-06, + "loss": 0.169, + "step": 2780 + }, + { + "epoch": 0.2562307089878841, + "grad_norm": 1.0139628897985031, + "learning_rate": 4.44086581529629e-06, + "loss": 0.1769, + "step": 2781 + }, + { + "epoch": 0.2563228451651541, + "grad_norm": 0.9463222270409243, + "learning_rate": 4.440385568430228e-06, + "loss": 0.1544, + "step": 2782 + }, + { + "epoch": 0.2564149813424241, + "grad_norm": 0.9747904707531557, + "learning_rate": 4.439905141399157e-06, + "loss": 0.1797, + "step": 2783 + }, + { + "epoch": 0.25650711751969413, + "grad_norm": 0.9977583516525407, + "learning_rate": 4.439424534247686e-06, + "loss": 0.162, + "step": 2784 + }, + { + "epoch": 0.2565992536969641, + "grad_norm": 1.009039762592302, + "learning_rate": 4.438943747020437e-06, + "loss": 0.1606, + "step": 2785 + }, + { + "epoch": 0.2566913898742341, + "grad_norm": 0.917807551416993, + "learning_rate": 4.438462779762052e-06, + "loss": 0.1589, + "step": 2786 + }, + { + "epoch": 0.2567835260515041, + "grad_norm": 1.0105522924525019, + "learning_rate": 4.437981632517191e-06, + "loss": 0.1691, + "step": 2787 + }, + { + "epoch": 0.2568756622287741, + "grad_norm": 0.8741885257091299, + "learning_rate": 4.437500305330526e-06, + "loss": 0.1627, + "step": 2788 + }, + { + "epoch": 0.2569677984060441, + "grad_norm": 0.9656983545728868, + "learning_rate": 4.437018798246749e-06, + "loss": 0.1533, + "step": 2789 + }, + { + "epoch": 0.25705993458331416, + "grad_norm": 0.9585997069738847, + "learning_rate": 4.436537111310568e-06, + "loss": 0.1806, + "step": 2790 + }, + { + "epoch": 0.25715207076058416, + "grad_norm": 0.891640608831986, + "learning_rate": 4.436055244566708e-06, + "loss": 0.1585, + "step": 2791 + }, + { + "epoch": 0.25724420693785416, + "grad_norm": 0.9842157286945457, + "learning_rate": 4.4355731980599105e-06, + "loss": 0.1579, + "step": 2792 + }, + { + "epoch": 0.25733634311512416, + "grad_norm": 1.0030576180364315, + "learning_rate": 4.435090971834933e-06, + "loss": 0.1709, + "step": 2793 + }, + { + "epoch": 0.25742847929239415, + "grad_norm": 0.9517782001478897, + "learning_rate": 4.43460856593655e-06, + "loss": 0.1718, + "step": 2794 + }, + { + "epoch": 0.25752061546966415, + "grad_norm": 0.8794981685031267, + "learning_rate": 4.434125980409553e-06, + "loss": 0.1562, + "step": 2795 + }, + { + "epoch": 0.25761275164693415, + "grad_norm": 0.8812764377019595, + "learning_rate": 4.433643215298753e-06, + "loss": 0.1577, + "step": 2796 + }, + { + "epoch": 0.2577048878242042, + "grad_norm": 0.8945685024582577, + "learning_rate": 4.433160270648971e-06, + "loss": 0.1613, + "step": 2797 + }, + { + "epoch": 0.2577970240014742, + "grad_norm": 0.8853629308772607, + "learning_rate": 4.432677146505049e-06, + "loss": 0.1643, + "step": 2798 + }, + { + "epoch": 0.2578891601787442, + "grad_norm": 0.9325081834935269, + "learning_rate": 4.432193842911846e-06, + "loss": 0.1807, + "step": 2799 + }, + { + "epoch": 0.2579812963560142, + "grad_norm": 0.9434955095871754, + "learning_rate": 4.431710359914238e-06, + "loss": 0.1653, + "step": 2800 + }, + { + "epoch": 0.2580734325332842, + "grad_norm": 0.9005501179330212, + "learning_rate": 4.4312266975571145e-06, + "loss": 0.1608, + "step": 2801 + }, + { + "epoch": 0.2581655687105542, + "grad_norm": 0.9051611437276726, + "learning_rate": 4.430742855885384e-06, + "loss": 0.1595, + "step": 2802 + }, + { + "epoch": 0.2582577048878242, + "grad_norm": 0.8541816832770728, + "learning_rate": 4.430258834943973e-06, + "loss": 0.1435, + "step": 2803 + }, + { + "epoch": 0.25834984106509423, + "grad_norm": 0.8819731313821468, + "learning_rate": 4.429774634777819e-06, + "loss": 0.1708, + "step": 2804 + }, + { + "epoch": 0.25844197724236423, + "grad_norm": 0.8660324638072526, + "learning_rate": 4.429290255431884e-06, + "loss": 0.1548, + "step": 2805 + }, + { + "epoch": 0.2585341134196342, + "grad_norm": 0.9404639070221896, + "learning_rate": 4.428805696951141e-06, + "loss": 0.1691, + "step": 2806 + }, + { + "epoch": 0.2586262495969042, + "grad_norm": 0.8740371398676113, + "learning_rate": 4.428320959380581e-06, + "loss": 0.1684, + "step": 2807 + }, + { + "epoch": 0.2587183857741742, + "grad_norm": 0.8545366276085776, + "learning_rate": 4.427836042765213e-06, + "loss": 0.1448, + "step": 2808 + }, + { + "epoch": 0.2588105219514442, + "grad_norm": 0.8662838141494884, + "learning_rate": 4.4273509471500606e-06, + "loss": 0.148, + "step": 2809 + }, + { + "epoch": 0.2589026581287142, + "grad_norm": 0.9313287920052562, + "learning_rate": 4.426865672580166e-06, + "loss": 0.1506, + "step": 2810 + }, + { + "epoch": 0.25899479430598427, + "grad_norm": 0.8542806501069565, + "learning_rate": 4.426380219100585e-06, + "loss": 0.1531, + "step": 2811 + }, + { + "epoch": 0.25908693048325426, + "grad_norm": 1.0367999062687907, + "learning_rate": 4.425894586756394e-06, + "loss": 0.196, + "step": 2812 + }, + { + "epoch": 0.25917906666052426, + "grad_norm": 0.9720196319247114, + "learning_rate": 4.425408775592684e-06, + "loss": 0.1574, + "step": 2813 + }, + { + "epoch": 0.25927120283779426, + "grad_norm": 0.8971796959776257, + "learning_rate": 4.424922785654561e-06, + "loss": 0.1557, + "step": 2814 + }, + { + "epoch": 0.25936333901506425, + "grad_norm": 0.8716438967042835, + "learning_rate": 4.424436616987151e-06, + "loss": 0.1534, + "step": 2815 + }, + { + "epoch": 0.25945547519233425, + "grad_norm": 0.9366795506332969, + "learning_rate": 4.423950269635594e-06, + "loss": 0.1722, + "step": 2816 + }, + { + "epoch": 0.25954761136960425, + "grad_norm": 0.8750053562323363, + "learning_rate": 4.4234637436450465e-06, + "loss": 0.1667, + "step": 2817 + }, + { + "epoch": 0.2596397475468743, + "grad_norm": 0.8688375268172122, + "learning_rate": 4.422977039060684e-06, + "loss": 0.1551, + "step": 2818 + }, + { + "epoch": 0.2597318837241443, + "grad_norm": 0.9164926259231396, + "learning_rate": 4.422490155927696e-06, + "loss": 0.1547, + "step": 2819 + }, + { + "epoch": 0.2598240199014143, + "grad_norm": 0.8784819223059159, + "learning_rate": 4.422003094291291e-06, + "loss": 0.1553, + "step": 2820 + }, + { + "epoch": 0.2599161560786843, + "grad_norm": 0.8480220147126285, + "learning_rate": 4.421515854196692e-06, + "loss": 0.1421, + "step": 2821 + }, + { + "epoch": 0.2600082922559543, + "grad_norm": 0.9812963150064519, + "learning_rate": 4.421028435689138e-06, + "loss": 0.1739, + "step": 2822 + }, + { + "epoch": 0.2601004284332243, + "grad_norm": 0.966210027789589, + "learning_rate": 4.420540838813887e-06, + "loss": 0.1726, + "step": 2823 + }, + { + "epoch": 0.26019256461049434, + "grad_norm": 0.8640742533975352, + "learning_rate": 4.420053063616214e-06, + "loss": 0.1597, + "step": 2824 + }, + { + "epoch": 0.26028470078776433, + "grad_norm": 0.9021044826230841, + "learning_rate": 4.419565110141406e-06, + "loss": 0.1669, + "step": 2825 + }, + { + "epoch": 0.26037683696503433, + "grad_norm": 0.8744446068310874, + "learning_rate": 4.419076978434771e-06, + "loss": 0.1587, + "step": 2826 + }, + { + "epoch": 0.2604689731423043, + "grad_norm": 0.9002597276951575, + "learning_rate": 4.418588668541632e-06, + "loss": 0.1496, + "step": 2827 + }, + { + "epoch": 0.2605611093195743, + "grad_norm": 0.8705763804623955, + "learning_rate": 4.41810018050733e-06, + "loss": 0.1612, + "step": 2828 + }, + { + "epoch": 0.2606532454968443, + "grad_norm": 0.8590574476522304, + "learning_rate": 4.417611514377218e-06, + "loss": 0.1541, + "step": 2829 + }, + { + "epoch": 0.2607453816741143, + "grad_norm": 0.9420489301767392, + "learning_rate": 4.417122670196672e-06, + "loss": 0.1662, + "step": 2830 + }, + { + "epoch": 0.26083751785138437, + "grad_norm": 0.9855215157889966, + "learning_rate": 4.416633648011079e-06, + "loss": 0.1719, + "step": 2831 + }, + { + "epoch": 0.26092965402865437, + "grad_norm": 0.9015482825979495, + "learning_rate": 4.416144447865845e-06, + "loss": 0.1594, + "step": 2832 + }, + { + "epoch": 0.26102179020592436, + "grad_norm": 0.9186528341173594, + "learning_rate": 4.4156550698063935e-06, + "loss": 0.1503, + "step": 2833 + }, + { + "epoch": 0.26111392638319436, + "grad_norm": 0.888306584206573, + "learning_rate": 4.4151655138781625e-06, + "loss": 0.1477, + "step": 2834 + }, + { + "epoch": 0.26120606256046436, + "grad_norm": 1.0019447633808933, + "learning_rate": 4.414675780126607e-06, + "loss": 0.1655, + "step": 2835 + }, + { + "epoch": 0.26129819873773436, + "grad_norm": 0.9867728137209706, + "learning_rate": 4.4141858685972e-06, + "loss": 0.1709, + "step": 2836 + }, + { + "epoch": 0.26139033491500435, + "grad_norm": 0.9522319296236781, + "learning_rate": 4.413695779335428e-06, + "loss": 0.1604, + "step": 2837 + }, + { + "epoch": 0.2614824710922744, + "grad_norm": 0.9150332712571578, + "learning_rate": 4.413205512386798e-06, + "loss": 0.1683, + "step": 2838 + }, + { + "epoch": 0.2615746072695444, + "grad_norm": 0.9783738508165877, + "learning_rate": 4.41271506779683e-06, + "loss": 0.1781, + "step": 2839 + }, + { + "epoch": 0.2616667434468144, + "grad_norm": 0.8955606749537997, + "learning_rate": 4.412224445611062e-06, + "loss": 0.1636, + "step": 2840 + }, + { + "epoch": 0.2617588796240844, + "grad_norm": 0.8984379268924941, + "learning_rate": 4.411733645875048e-06, + "loss": 0.1586, + "step": 2841 + }, + { + "epoch": 0.2618510158013544, + "grad_norm": 0.9029965233138934, + "learning_rate": 4.41124266863436e-06, + "loss": 0.1666, + "step": 2842 + }, + { + "epoch": 0.2619431519786244, + "grad_norm": 0.9174022488679581, + "learning_rate": 4.410751513934585e-06, + "loss": 0.1725, + "step": 2843 + }, + { + "epoch": 0.2620352881558944, + "grad_norm": 0.9756876452431863, + "learning_rate": 4.410260181821325e-06, + "loss": 0.1763, + "step": 2844 + }, + { + "epoch": 0.26212742433316444, + "grad_norm": 0.9244170802721333, + "learning_rate": 4.409768672340202e-06, + "loss": 0.1463, + "step": 2845 + }, + { + "epoch": 0.26221956051043444, + "grad_norm": 0.9041971028491017, + "learning_rate": 4.409276985536852e-06, + "loss": 0.173, + "step": 2846 + }, + { + "epoch": 0.26231169668770443, + "grad_norm": 0.9359224044219522, + "learning_rate": 4.408785121456929e-06, + "loss": 0.1613, + "step": 2847 + }, + { + "epoch": 0.26240383286497443, + "grad_norm": 0.8730622181165766, + "learning_rate": 4.408293080146101e-06, + "loss": 0.1474, + "step": 2848 + }, + { + "epoch": 0.2624959690422444, + "grad_norm": 0.9145537165320866, + "learning_rate": 4.407800861650056e-06, + "loss": 0.1586, + "step": 2849 + }, + { + "epoch": 0.2625881052195144, + "grad_norm": 0.8977352205218089, + "learning_rate": 4.407308466014496e-06, + "loss": 0.1486, + "step": 2850 + }, + { + "epoch": 0.2626802413967845, + "grad_norm": 0.9106247178852294, + "learning_rate": 4.406815893285139e-06, + "loss": 0.1653, + "step": 2851 + }, + { + "epoch": 0.26277237757405447, + "grad_norm": 1.0115187545849416, + "learning_rate": 4.406323143507721e-06, + "loss": 0.1612, + "step": 2852 + }, + { + "epoch": 0.26286451375132447, + "grad_norm": 0.9432111102878478, + "learning_rate": 4.405830216727995e-06, + "loss": 0.1501, + "step": 2853 + }, + { + "epoch": 0.26295664992859447, + "grad_norm": 0.8641598652236985, + "learning_rate": 4.405337112991728e-06, + "loss": 0.1483, + "step": 2854 + }, + { + "epoch": 0.26304878610586446, + "grad_norm": 0.8641392406260407, + "learning_rate": 4.404843832344704e-06, + "loss": 0.1653, + "step": 2855 + }, + { + "epoch": 0.26314092228313446, + "grad_norm": 0.9564993972527854, + "learning_rate": 4.404350374832725e-06, + "loss": 0.1611, + "step": 2856 + }, + { + "epoch": 0.26323305846040446, + "grad_norm": 0.8752998536245241, + "learning_rate": 4.40385674050161e-06, + "loss": 0.1537, + "step": 2857 + }, + { + "epoch": 0.2633251946376745, + "grad_norm": 0.8546718345682361, + "learning_rate": 4.403362929397191e-06, + "loss": 0.1559, + "step": 2858 + }, + { + "epoch": 0.2634173308149445, + "grad_norm": 0.8856269547457143, + "learning_rate": 4.40286894156532e-06, + "loss": 0.1531, + "step": 2859 + }, + { + "epoch": 0.2635094669922145, + "grad_norm": 0.8966875055653404, + "learning_rate": 4.402374777051862e-06, + "loss": 0.1676, + "step": 2860 + }, + { + "epoch": 0.2636016031694845, + "grad_norm": 0.9282672278947397, + "learning_rate": 4.401880435902701e-06, + "loss": 0.1676, + "step": 2861 + }, + { + "epoch": 0.2636937393467545, + "grad_norm": 1.0165990167745647, + "learning_rate": 4.401385918163737e-06, + "loss": 0.1859, + "step": 2862 + }, + { + "epoch": 0.2637858755240245, + "grad_norm": 0.8764188958015945, + "learning_rate": 4.400891223880888e-06, + "loss": 0.1488, + "step": 2863 + }, + { + "epoch": 0.2638780117012945, + "grad_norm": 0.8798735349276867, + "learning_rate": 4.400396353100081e-06, + "loss": 0.1599, + "step": 2864 + }, + { + "epoch": 0.26397014787856454, + "grad_norm": 0.9295437632531321, + "learning_rate": 4.39990130586727e-06, + "loss": 0.1597, + "step": 2865 + }, + { + "epoch": 0.26406228405583454, + "grad_norm": 0.8596446043652288, + "learning_rate": 4.399406082228418e-06, + "loss": 0.1533, + "step": 2866 + }, + { + "epoch": 0.26415442023310454, + "grad_norm": 0.9534196281125018, + "learning_rate": 4.398910682229507e-06, + "loss": 0.1718, + "step": 2867 + }, + { + "epoch": 0.26424655641037453, + "grad_norm": 0.9279560321709486, + "learning_rate": 4.398415105916535e-06, + "loss": 0.1646, + "step": 2868 + }, + { + "epoch": 0.26433869258764453, + "grad_norm": 0.8749195173976764, + "learning_rate": 4.397919353335516e-06, + "loss": 0.1513, + "step": 2869 + }, + { + "epoch": 0.2644308287649145, + "grad_norm": 0.9097910090711311, + "learning_rate": 4.3974234245324795e-06, + "loss": 0.1705, + "step": 2870 + }, + { + "epoch": 0.2645229649421845, + "grad_norm": 0.9171732739684699, + "learning_rate": 4.396927319553476e-06, + "loss": 0.1718, + "step": 2871 + }, + { + "epoch": 0.2646151011194546, + "grad_norm": 0.8748961263126276, + "learning_rate": 4.396431038444565e-06, + "loss": 0.1576, + "step": 2872 + }, + { + "epoch": 0.2647072372967246, + "grad_norm": 0.9224132153233574, + "learning_rate": 4.3959345812518285e-06, + "loss": 0.1666, + "step": 2873 + }, + { + "epoch": 0.26479937347399457, + "grad_norm": 0.943886286209004, + "learning_rate": 4.395437948021362e-06, + "loss": 0.1623, + "step": 2874 + }, + { + "epoch": 0.26489150965126457, + "grad_norm": 0.9233832263520497, + "learning_rate": 4.394941138799278e-06, + "loss": 0.1654, + "step": 2875 + }, + { + "epoch": 0.26498364582853456, + "grad_norm": 0.8726674273082234, + "learning_rate": 4.3944441536317055e-06, + "loss": 0.1425, + "step": 2876 + }, + { + "epoch": 0.26507578200580456, + "grad_norm": 1.01831125875492, + "learning_rate": 4.3939469925647895e-06, + "loss": 0.1705, + "step": 2877 + }, + { + "epoch": 0.26516791818307456, + "grad_norm": 0.8873960103462912, + "learning_rate": 4.3934496556446916e-06, + "loss": 0.1475, + "step": 2878 + }, + { + "epoch": 0.2652600543603446, + "grad_norm": 0.890705716689305, + "learning_rate": 4.3929521429175895e-06, + "loss": 0.1523, + "step": 2879 + }, + { + "epoch": 0.2653521905376146, + "grad_norm": 0.8998449017070324, + "learning_rate": 4.392454454429676e-06, + "loss": 0.1703, + "step": 2880 + }, + { + "epoch": 0.2654443267148846, + "grad_norm": 0.8815761058067845, + "learning_rate": 4.391956590227164e-06, + "loss": 0.1603, + "step": 2881 + }, + { + "epoch": 0.2655364628921546, + "grad_norm": 0.9434933743462255, + "learning_rate": 4.391458550356278e-06, + "loss": 0.1619, + "step": 2882 + }, + { + "epoch": 0.2656285990694246, + "grad_norm": 0.9342716887434068, + "learning_rate": 4.390960334863263e-06, + "loss": 0.1533, + "step": 2883 + }, + { + "epoch": 0.2657207352466946, + "grad_norm": 0.9839182352177902, + "learning_rate": 4.390461943794377e-06, + "loss": 0.1775, + "step": 2884 + }, + { + "epoch": 0.26581287142396465, + "grad_norm": 0.8677512839404379, + "learning_rate": 4.389963377195896e-06, + "loss": 0.162, + "step": 2885 + }, + { + "epoch": 0.26590500760123464, + "grad_norm": 0.886479980018512, + "learning_rate": 4.389464635114112e-06, + "loss": 0.1563, + "step": 2886 + }, + { + "epoch": 0.26599714377850464, + "grad_norm": 0.9815790472166931, + "learning_rate": 4.388965717595334e-06, + "loss": 0.1867, + "step": 2887 + }, + { + "epoch": 0.26608927995577464, + "grad_norm": 0.8801062088344992, + "learning_rate": 4.3884666246858846e-06, + "loss": 0.1565, + "step": 2888 + }, + { + "epoch": 0.26618141613304463, + "grad_norm": 0.9784498336027216, + "learning_rate": 4.387967356432107e-06, + "loss": 0.17, + "step": 2889 + }, + { + "epoch": 0.26627355231031463, + "grad_norm": 0.9091918449537576, + "learning_rate": 4.3874679128803565e-06, + "loss": 0.1535, + "step": 2890 + }, + { + "epoch": 0.26636568848758463, + "grad_norm": 0.9058091255127974, + "learning_rate": 4.386968294077007e-06, + "loss": 0.1599, + "step": 2891 + }, + { + "epoch": 0.2664578246648547, + "grad_norm": 0.9314719924346809, + "learning_rate": 4.386468500068449e-06, + "loss": 0.1663, + "step": 2892 + }, + { + "epoch": 0.2665499608421247, + "grad_norm": 0.9439398416546473, + "learning_rate": 4.385968530901087e-06, + "loss": 0.1737, + "step": 2893 + }, + { + "epoch": 0.2666420970193947, + "grad_norm": 0.9627391950050143, + "learning_rate": 4.3854683866213445e-06, + "loss": 0.1613, + "step": 2894 + }, + { + "epoch": 0.26673423319666467, + "grad_norm": 0.9709489598335683, + "learning_rate": 4.384968067275659e-06, + "loss": 0.1749, + "step": 2895 + }, + { + "epoch": 0.26682636937393467, + "grad_norm": 0.9876304597584598, + "learning_rate": 4.384467572910486e-06, + "loss": 0.1744, + "step": 2896 + }, + { + "epoch": 0.26691850555120467, + "grad_norm": 0.9806142595268034, + "learning_rate": 4.383966903572295e-06, + "loss": 0.1669, + "step": 2897 + }, + { + "epoch": 0.26701064172847466, + "grad_norm": 0.8390793688951929, + "learning_rate": 4.383466059307576e-06, + "loss": 0.1529, + "step": 2898 + }, + { + "epoch": 0.2671027779057447, + "grad_norm": 0.9378584287300362, + "learning_rate": 4.382965040162829e-06, + "loss": 0.1678, + "step": 2899 + }, + { + "epoch": 0.2671949140830147, + "grad_norm": 0.9185936272777794, + "learning_rate": 4.3824638461845764e-06, + "loss": 0.1624, + "step": 2900 + }, + { + "epoch": 0.2672870502602847, + "grad_norm": 0.8849129207015696, + "learning_rate": 4.381962477419352e-06, + "loss": 0.1621, + "step": 2901 + }, + { + "epoch": 0.2673791864375547, + "grad_norm": 1.020300634639128, + "learning_rate": 4.3814609339137105e-06, + "loss": 0.1697, + "step": 2902 + }, + { + "epoch": 0.2674713226148247, + "grad_norm": 0.9478899087466803, + "learning_rate": 4.380959215714218e-06, + "loss": 0.1625, + "step": 2903 + }, + { + "epoch": 0.2675634587920947, + "grad_norm": 1.0037603195239564, + "learning_rate": 4.380457322867461e-06, + "loss": 0.1615, + "step": 2904 + }, + { + "epoch": 0.2676555949693647, + "grad_norm": 0.9466541604314458, + "learning_rate": 4.379955255420037e-06, + "loss": 0.1636, + "step": 2905 + }, + { + "epoch": 0.26774773114663475, + "grad_norm": 0.8951466466126106, + "learning_rate": 4.379453013418567e-06, + "loss": 0.1722, + "step": 2906 + }, + { + "epoch": 0.26783986732390475, + "grad_norm": 0.9318669615219027, + "learning_rate": 4.378950596909683e-06, + "loss": 0.1661, + "step": 2907 + }, + { + "epoch": 0.26793200350117474, + "grad_norm": 1.0399320939535317, + "learning_rate": 4.378448005940031e-06, + "loss": 0.1661, + "step": 2908 + }, + { + "epoch": 0.26802413967844474, + "grad_norm": 1.0157712583133751, + "learning_rate": 4.377945240556282e-06, + "loss": 0.1779, + "step": 2909 + }, + { + "epoch": 0.26811627585571474, + "grad_norm": 0.9301603867530266, + "learning_rate": 4.3774423008051145e-06, + "loss": 0.1634, + "step": 2910 + }, + { + "epoch": 0.26820841203298473, + "grad_norm": 0.9296110650376944, + "learning_rate": 4.376939186733227e-06, + "loss": 0.1516, + "step": 2911 + }, + { + "epoch": 0.26830054821025473, + "grad_norm": 1.0043841351727716, + "learning_rate": 4.376435898387334e-06, + "loss": 0.1632, + "step": 2912 + }, + { + "epoch": 0.2683926843875248, + "grad_norm": 0.9963071461738237, + "learning_rate": 4.375932435814167e-06, + "loss": 0.1846, + "step": 2913 + }, + { + "epoch": 0.2684848205647948, + "grad_norm": 0.9703723999360877, + "learning_rate": 4.37542879906047e-06, + "loss": 0.1675, + "step": 2914 + }, + { + "epoch": 0.2685769567420648, + "grad_norm": 0.8830939679657934, + "learning_rate": 4.374924988173008e-06, + "loss": 0.1608, + "step": 2915 + }, + { + "epoch": 0.2686690929193348, + "grad_norm": 0.9540176107967079, + "learning_rate": 4.374421003198559e-06, + "loss": 0.1575, + "step": 2916 + }, + { + "epoch": 0.26876122909660477, + "grad_norm": 0.9928441377656393, + "learning_rate": 4.373916844183918e-06, + "loss": 0.1587, + "step": 2917 + }, + { + "epoch": 0.26885336527387477, + "grad_norm": 0.9455759756236314, + "learning_rate": 4.373412511175897e-06, + "loss": 0.1575, + "step": 2918 + }, + { + "epoch": 0.2689455014511448, + "grad_norm": 0.9637716298737328, + "learning_rate": 4.372908004221322e-06, + "loss": 0.1687, + "step": 2919 + }, + { + "epoch": 0.2690376376284148, + "grad_norm": 0.9425788092717383, + "learning_rate": 4.372403323367037e-06, + "loss": 0.1538, + "step": 2920 + }, + { + "epoch": 0.2691297738056848, + "grad_norm": 0.935187533150534, + "learning_rate": 4.371898468659903e-06, + "loss": 0.1701, + "step": 2921 + }, + { + "epoch": 0.2692219099829548, + "grad_norm": 0.9553637112591826, + "learning_rate": 4.371393440146794e-06, + "loss": 0.1654, + "step": 2922 + }, + { + "epoch": 0.2693140461602248, + "grad_norm": 0.9022261913400997, + "learning_rate": 4.370888237874602e-06, + "loss": 0.1494, + "step": 2923 + }, + { + "epoch": 0.2694061823374948, + "grad_norm": 0.8851623605601451, + "learning_rate": 4.370382861890237e-06, + "loss": 0.1459, + "step": 2924 + }, + { + "epoch": 0.2694983185147648, + "grad_norm": 0.9899899922542382, + "learning_rate": 4.369877312240621e-06, + "loss": 0.17, + "step": 2925 + }, + { + "epoch": 0.26959045469203485, + "grad_norm": 0.8982399301119234, + "learning_rate": 4.369371588972696e-06, + "loss": 0.1581, + "step": 2926 + }, + { + "epoch": 0.26968259086930485, + "grad_norm": 0.9199727306968912, + "learning_rate": 4.368865692133417e-06, + "loss": 0.1628, + "step": 2927 + }, + { + "epoch": 0.26977472704657485, + "grad_norm": 0.916780158787217, + "learning_rate": 4.3683596217697585e-06, + "loss": 0.1578, + "step": 2928 + }, + { + "epoch": 0.26986686322384484, + "grad_norm": 1.0499551006536163, + "learning_rate": 4.367853377928707e-06, + "loss": 0.1777, + "step": 2929 + }, + { + "epoch": 0.26995899940111484, + "grad_norm": 0.9431552116102009, + "learning_rate": 4.367346960657269e-06, + "loss": 0.1523, + "step": 2930 + }, + { + "epoch": 0.27005113557838484, + "grad_norm": 0.9014934644247001, + "learning_rate": 4.366840370002465e-06, + "loss": 0.1602, + "step": 2931 + }, + { + "epoch": 0.27014327175565483, + "grad_norm": 1.0011043047400634, + "learning_rate": 4.366333606011331e-06, + "loss": 0.1674, + "step": 2932 + }, + { + "epoch": 0.2702354079329249, + "grad_norm": 1.0340166384513643, + "learning_rate": 4.365826668730921e-06, + "loss": 0.1662, + "step": 2933 + }, + { + "epoch": 0.2703275441101949, + "grad_norm": 0.9098261155448165, + "learning_rate": 4.365319558208304e-06, + "loss": 0.1656, + "step": 2934 + }, + { + "epoch": 0.2704196802874649, + "grad_norm": 1.0370324645257154, + "learning_rate": 4.3648122744905654e-06, + "loss": 0.1918, + "step": 2935 + }, + { + "epoch": 0.2705118164647349, + "grad_norm": 0.9814751176480726, + "learning_rate": 4.364304817624806e-06, + "loss": 0.1629, + "step": 2936 + }, + { + "epoch": 0.2706039526420049, + "grad_norm": 0.8429499591444676, + "learning_rate": 4.363797187658144e-06, + "loss": 0.1524, + "step": 2937 + }, + { + "epoch": 0.27069608881927487, + "grad_norm": 1.0208847925208175, + "learning_rate": 4.363289384637713e-06, + "loss": 0.1677, + "step": 2938 + }, + { + "epoch": 0.27078822499654487, + "grad_norm": 0.9106234440617202, + "learning_rate": 4.362781408610662e-06, + "loss": 0.1596, + "step": 2939 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 0.8962143929787811, + "learning_rate": 4.362273259624156e-06, + "loss": 0.1587, + "step": 2940 + }, + { + "epoch": 0.2709724973510849, + "grad_norm": 0.9695209952030241, + "learning_rate": 4.3617649377253775e-06, + "loss": 0.1713, + "step": 2941 + }, + { + "epoch": 0.2710646335283549, + "grad_norm": 0.9189259739380802, + "learning_rate": 4.361256442961524e-06, + "loss": 0.1703, + "step": 2942 + }, + { + "epoch": 0.2711567697056249, + "grad_norm": 1.001396554375545, + "learning_rate": 4.360747775379811e-06, + "loss": 0.1704, + "step": 2943 + }, + { + "epoch": 0.2712489058828949, + "grad_norm": 0.9123763202978219, + "learning_rate": 4.3602389350274656e-06, + "loss": 0.1591, + "step": 2944 + }, + { + "epoch": 0.2713410420601649, + "grad_norm": 0.9308820506862105, + "learning_rate": 4.359729921951735e-06, + "loss": 0.1668, + "step": 2945 + }, + { + "epoch": 0.2714331782374349, + "grad_norm": 0.9350597956598543, + "learning_rate": 4.3592207361998815e-06, + "loss": 0.1692, + "step": 2946 + }, + { + "epoch": 0.27152531441470495, + "grad_norm": 0.9473956643289153, + "learning_rate": 4.358711377819181e-06, + "loss": 0.1562, + "step": 2947 + }, + { + "epoch": 0.27161745059197495, + "grad_norm": 0.9222103923820115, + "learning_rate": 4.358201846856931e-06, + "loss": 0.1578, + "step": 2948 + }, + { + "epoch": 0.27170958676924495, + "grad_norm": 0.8718693024219112, + "learning_rate": 4.357692143360438e-06, + "loss": 0.1602, + "step": 2949 + }, + { + "epoch": 0.27180172294651495, + "grad_norm": 0.8969897252603595, + "learning_rate": 4.35718226737703e-06, + "loss": 0.1607, + "step": 2950 + }, + { + "epoch": 0.27189385912378494, + "grad_norm": 0.9386779288297119, + "learning_rate": 4.35667221895405e-06, + "loss": 0.1724, + "step": 2951 + }, + { + "epoch": 0.27198599530105494, + "grad_norm": 0.9094775211356984, + "learning_rate": 4.356161998138853e-06, + "loss": 0.1724, + "step": 2952 + }, + { + "epoch": 0.272078131478325, + "grad_norm": 0.8950162107788308, + "learning_rate": 4.355651604978815e-06, + "loss": 0.1597, + "step": 2953 + }, + { + "epoch": 0.272170267655595, + "grad_norm": 0.8761295316756618, + "learning_rate": 4.355141039521325e-06, + "loss": 0.16, + "step": 2954 + }, + { + "epoch": 0.272262403832865, + "grad_norm": 0.862109488387714, + "learning_rate": 4.3546303018137915e-06, + "loss": 0.1512, + "step": 2955 + }, + { + "epoch": 0.272354540010135, + "grad_norm": 0.9616828736913641, + "learning_rate": 4.354119391903634e-06, + "loss": 0.1654, + "step": 2956 + }, + { + "epoch": 0.272446676187405, + "grad_norm": 0.8860353296993886, + "learning_rate": 4.353608309838292e-06, + "loss": 0.1403, + "step": 2957 + }, + { + "epoch": 0.272538812364675, + "grad_norm": 0.9628950022603051, + "learning_rate": 4.353097055665219e-06, + "loss": 0.1566, + "step": 2958 + }, + { + "epoch": 0.272630948541945, + "grad_norm": 0.9596948029087774, + "learning_rate": 4.352585629431883e-06, + "loss": 0.165, + "step": 2959 + }, + { + "epoch": 0.272723084719215, + "grad_norm": 0.8557258049846965, + "learning_rate": 4.352074031185774e-06, + "loss": 0.1573, + "step": 2960 + }, + { + "epoch": 0.272815220896485, + "grad_norm": 0.9269594433765076, + "learning_rate": 4.351562260974391e-06, + "loss": 0.1749, + "step": 2961 + }, + { + "epoch": 0.272907357073755, + "grad_norm": 0.9173990269761251, + "learning_rate": 4.3510503188452535e-06, + "loss": 0.1693, + "step": 2962 + }, + { + "epoch": 0.272999493251025, + "grad_norm": 0.9427467660265008, + "learning_rate": 4.350538204845895e-06, + "loss": 0.1641, + "step": 2963 + }, + { + "epoch": 0.273091629428295, + "grad_norm": 0.9381162953466216, + "learning_rate": 4.350025919023864e-06, + "loss": 0.1714, + "step": 2964 + }, + { + "epoch": 0.273183765605565, + "grad_norm": 0.8947538202851707, + "learning_rate": 4.349513461426728e-06, + "loss": 0.1777, + "step": 2965 + }, + { + "epoch": 0.273275901782835, + "grad_norm": 0.9430666551395096, + "learning_rate": 4.349000832102067e-06, + "loss": 0.1606, + "step": 2966 + }, + { + "epoch": 0.27336803796010506, + "grad_norm": 0.9077354832576249, + "learning_rate": 4.348488031097481e-06, + "loss": 0.1603, + "step": 2967 + }, + { + "epoch": 0.27346017413737506, + "grad_norm": 0.9182053591196929, + "learning_rate": 4.3479750584605814e-06, + "loss": 0.16, + "step": 2968 + }, + { + "epoch": 0.27355231031464505, + "grad_norm": 0.9057263697989837, + "learning_rate": 4.347461914238999e-06, + "loss": 0.1662, + "step": 2969 + }, + { + "epoch": 0.27364444649191505, + "grad_norm": 0.9164232624325025, + "learning_rate": 4.34694859848038e-06, + "loss": 0.162, + "step": 2970 + }, + { + "epoch": 0.27373658266918505, + "grad_norm": 0.9369492838326873, + "learning_rate": 4.346435111232383e-06, + "loss": 0.1673, + "step": 2971 + }, + { + "epoch": 0.27382871884645504, + "grad_norm": 0.9215110313361893, + "learning_rate": 4.345921452542689e-06, + "loss": 0.1578, + "step": 2972 + }, + { + "epoch": 0.27392085502372504, + "grad_norm": 0.9152849266863357, + "learning_rate": 4.345407622458988e-06, + "loss": 0.1673, + "step": 2973 + }, + { + "epoch": 0.2740129912009951, + "grad_norm": 0.9041598610869491, + "learning_rate": 4.3448936210289916e-06, + "loss": 0.1457, + "step": 2974 + }, + { + "epoch": 0.2741051273782651, + "grad_norm": 0.9426336536924195, + "learning_rate": 4.344379448300423e-06, + "loss": 0.1704, + "step": 2975 + }, + { + "epoch": 0.2741972635555351, + "grad_norm": 0.8841261534075182, + "learning_rate": 4.343865104321026e-06, + "loss": 0.1546, + "step": 2976 + }, + { + "epoch": 0.2742893997328051, + "grad_norm": 0.8945634429252811, + "learning_rate": 4.3433505891385534e-06, + "loss": 0.1668, + "step": 2977 + }, + { + "epoch": 0.2743815359100751, + "grad_norm": 0.8960031810119188, + "learning_rate": 4.342835902800782e-06, + "loss": 0.1692, + "step": 2978 + }, + { + "epoch": 0.2744736720873451, + "grad_norm": 0.9376536707630264, + "learning_rate": 4.342321045355498e-06, + "loss": 0.1679, + "step": 2979 + }, + { + "epoch": 0.2745658082646151, + "grad_norm": 0.9040938912188998, + "learning_rate": 4.341806016850506e-06, + "loss": 0.1564, + "step": 2980 + }, + { + "epoch": 0.2746579444418851, + "grad_norm": 0.9501565919833976, + "learning_rate": 4.341290817333628e-06, + "loss": 0.173, + "step": 2981 + }, + { + "epoch": 0.2747500806191551, + "grad_norm": 0.8828775263021356, + "learning_rate": 4.340775446852699e-06, + "loss": 0.1559, + "step": 2982 + }, + { + "epoch": 0.2748422167964251, + "grad_norm": 0.850067548106956, + "learning_rate": 4.340259905455572e-06, + "loss": 0.1516, + "step": 2983 + }, + { + "epoch": 0.2749343529736951, + "grad_norm": 0.9603013308750544, + "learning_rate": 4.339744193190114e-06, + "loss": 0.1713, + "step": 2984 + }, + { + "epoch": 0.2750264891509651, + "grad_norm": 1.0113076687699456, + "learning_rate": 4.339228310104211e-06, + "loss": 0.1827, + "step": 2985 + }, + { + "epoch": 0.2751186253282351, + "grad_norm": 0.9572529546575894, + "learning_rate": 4.338712256245761e-06, + "loss": 0.1586, + "step": 2986 + }, + { + "epoch": 0.27521076150550516, + "grad_norm": 0.9569799508875555, + "learning_rate": 4.3381960316626795e-06, + "loss": 0.1591, + "step": 2987 + }, + { + "epoch": 0.27530289768277516, + "grad_norm": 0.8482125697273889, + "learning_rate": 4.337679636402898e-06, + "loss": 0.1595, + "step": 2988 + }, + { + "epoch": 0.27539503386004516, + "grad_norm": 0.8838971352649014, + "learning_rate": 4.3371630705143665e-06, + "loss": 0.155, + "step": 2989 + }, + { + "epoch": 0.27548717003731515, + "grad_norm": 0.9940918802102938, + "learning_rate": 4.336646334045045e-06, + "loss": 0.1565, + "step": 2990 + }, + { + "epoch": 0.27557930621458515, + "grad_norm": 0.8826585591020982, + "learning_rate": 4.336129427042913e-06, + "loss": 0.1486, + "step": 2991 + }, + { + "epoch": 0.27567144239185515, + "grad_norm": 0.9645792000767957, + "learning_rate": 4.335612349555967e-06, + "loss": 0.1706, + "step": 2992 + }, + { + "epoch": 0.27576357856912515, + "grad_norm": 0.9157431148804065, + "learning_rate": 4.335095101632217e-06, + "loss": 0.1703, + "step": 2993 + }, + { + "epoch": 0.2758557147463952, + "grad_norm": 0.9536633356430607, + "learning_rate": 4.334577683319689e-06, + "loss": 0.1515, + "step": 2994 + }, + { + "epoch": 0.2759478509236652, + "grad_norm": 0.9246001577573807, + "learning_rate": 4.334060094666426e-06, + "loss": 0.1623, + "step": 2995 + }, + { + "epoch": 0.2760399871009352, + "grad_norm": 0.9440481150724693, + "learning_rate": 4.333542335720485e-06, + "loss": 0.1736, + "step": 2996 + }, + { + "epoch": 0.2761321232782052, + "grad_norm": 1.1704161241743252, + "learning_rate": 4.3330244065299424e-06, + "loss": 0.1663, + "step": 2997 + }, + { + "epoch": 0.2762242594554752, + "grad_norm": 0.9360542946940662, + "learning_rate": 4.332506307142885e-06, + "loss": 0.1557, + "step": 2998 + }, + { + "epoch": 0.2763163956327452, + "grad_norm": 0.8877005689033058, + "learning_rate": 4.33198803760742e-06, + "loss": 0.1623, + "step": 2999 + }, + { + "epoch": 0.2764085318100152, + "grad_norm": 0.8917013234852214, + "learning_rate": 4.3314695979716684e-06, + "loss": 0.1507, + "step": 3000 + }, + { + "epoch": 0.2764085318100152, + "eval_loss": 0.1629796177148819, + "eval_runtime": 299.43, + "eval_samples_per_second": 23.435, + "eval_steps_per_second": 2.932, + "step": 3000 + }, + { + "epoch": 0.27650066798728523, + "grad_norm": 0.9388370081009483, + "learning_rate": 4.330950988283767e-06, + "loss": 0.1647, + "step": 3001 + }, + { + "epoch": 0.27659280416455523, + "grad_norm": 0.8715944561126756, + "learning_rate": 4.330432208591871e-06, + "loss": 0.1517, + "step": 3002 + }, + { + "epoch": 0.2766849403418252, + "grad_norm": 0.9239641362222594, + "learning_rate": 4.329913258944146e-06, + "loss": 0.1629, + "step": 3003 + }, + { + "epoch": 0.2767770765190952, + "grad_norm": 0.9607187053118423, + "learning_rate": 4.329394139388779e-06, + "loss": 0.1547, + "step": 3004 + }, + { + "epoch": 0.2768692126963652, + "grad_norm": 0.9877687003566187, + "learning_rate": 4.328874849973968e-06, + "loss": 0.166, + "step": 3005 + }, + { + "epoch": 0.2769613488736352, + "grad_norm": 0.9366916037002174, + "learning_rate": 4.328355390747931e-06, + "loss": 0.1685, + "step": 3006 + }, + { + "epoch": 0.2770534850509052, + "grad_norm": 0.9333814868699946, + "learning_rate": 4.3278357617589e-06, + "loss": 0.1578, + "step": 3007 + }, + { + "epoch": 0.27714562122817527, + "grad_norm": 0.967607732134371, + "learning_rate": 4.327315963055121e-06, + "loss": 0.1584, + "step": 3008 + }, + { + "epoch": 0.27723775740544526, + "grad_norm": 0.9058729702202627, + "learning_rate": 4.326795994684858e-06, + "loss": 0.1629, + "step": 3009 + }, + { + "epoch": 0.27732989358271526, + "grad_norm": 0.8914686442256584, + "learning_rate": 4.326275856696391e-06, + "loss": 0.1462, + "step": 3010 + }, + { + "epoch": 0.27742202975998526, + "grad_norm": 0.9350169815870337, + "learning_rate": 4.325755549138014e-06, + "loss": 0.1626, + "step": 3011 + }, + { + "epoch": 0.27751416593725525, + "grad_norm": 0.986767040362785, + "learning_rate": 4.325235072058037e-06, + "loss": 0.1682, + "step": 3012 + }, + { + "epoch": 0.27760630211452525, + "grad_norm": 0.9304736612317988, + "learning_rate": 4.324714425504788e-06, + "loss": 0.1681, + "step": 3013 + }, + { + "epoch": 0.27769843829179525, + "grad_norm": 1.0173816582289463, + "learning_rate": 4.324193609526607e-06, + "loss": 0.1493, + "step": 3014 + }, + { + "epoch": 0.2777905744690653, + "grad_norm": 0.9479003398421398, + "learning_rate": 4.323672624171854e-06, + "loss": 0.1716, + "step": 3015 + }, + { + "epoch": 0.2778827106463353, + "grad_norm": 0.9340243186194851, + "learning_rate": 4.323151469488902e-06, + "loss": 0.1596, + "step": 3016 + }, + { + "epoch": 0.2779748468236053, + "grad_norm": 0.8575012092454355, + "learning_rate": 4.322630145526139e-06, + "loss": 0.1603, + "step": 3017 + }, + { + "epoch": 0.2780669830008753, + "grad_norm": 0.8844930689599675, + "learning_rate": 4.322108652331971e-06, + "loss": 0.147, + "step": 3018 + }, + { + "epoch": 0.2781591191781453, + "grad_norm": 0.9506160437467802, + "learning_rate": 4.321586989954819e-06, + "loss": 0.152, + "step": 3019 + }, + { + "epoch": 0.2782512553554153, + "grad_norm": 0.9913035500170787, + "learning_rate": 4.3210651584431186e-06, + "loss": 0.1708, + "step": 3020 + }, + { + "epoch": 0.27834339153268534, + "grad_norm": 0.9403771168318222, + "learning_rate": 4.320543157845321e-06, + "loss": 0.1664, + "step": 3021 + }, + { + "epoch": 0.27843552770995533, + "grad_norm": 0.9888529408708953, + "learning_rate": 4.320020988209898e-06, + "loss": 0.1524, + "step": 3022 + }, + { + "epoch": 0.27852766388722533, + "grad_norm": 1.0197212346707907, + "learning_rate": 4.319498649585329e-06, + "loss": 0.1837, + "step": 3023 + }, + { + "epoch": 0.2786198000644953, + "grad_norm": 1.041097294154436, + "learning_rate": 4.318976142020113e-06, + "loss": 0.1815, + "step": 3024 + }, + { + "epoch": 0.2787119362417653, + "grad_norm": 0.9489619575370245, + "learning_rate": 4.318453465562768e-06, + "loss": 0.1648, + "step": 3025 + }, + { + "epoch": 0.2788040724190353, + "grad_norm": 0.8724471409134508, + "learning_rate": 4.317930620261823e-06, + "loss": 0.1541, + "step": 3026 + }, + { + "epoch": 0.2788962085963053, + "grad_norm": 0.9104960457396443, + "learning_rate": 4.317407606165825e-06, + "loss": 0.1693, + "step": 3027 + }, + { + "epoch": 0.27898834477357537, + "grad_norm": 0.9707721567063771, + "learning_rate": 4.3168844233233345e-06, + "loss": 0.1672, + "step": 3028 + }, + { + "epoch": 0.27908048095084537, + "grad_norm": 0.9680840756993969, + "learning_rate": 4.316361071782929e-06, + "loss": 0.1674, + "step": 3029 + }, + { + "epoch": 0.27917261712811536, + "grad_norm": 0.9071358782871782, + "learning_rate": 4.315837551593203e-06, + "loss": 0.1566, + "step": 3030 + }, + { + "epoch": 0.27926475330538536, + "grad_norm": 0.8903474335652056, + "learning_rate": 4.315313862802766e-06, + "loss": 0.156, + "step": 3031 + }, + { + "epoch": 0.27935688948265536, + "grad_norm": 0.9724942706639386, + "learning_rate": 4.31479000546024e-06, + "loss": 0.1767, + "step": 3032 + }, + { + "epoch": 0.27944902565992535, + "grad_norm": 0.9528363083561447, + "learning_rate": 4.314265979614267e-06, + "loss": 0.1708, + "step": 3033 + }, + { + "epoch": 0.27954116183719535, + "grad_norm": 0.8441521393036625, + "learning_rate": 4.313741785313503e-06, + "loss": 0.1554, + "step": 3034 + }, + { + "epoch": 0.2796332980144654, + "grad_norm": 0.8782369763374204, + "learning_rate": 4.313217422606618e-06, + "loss": 0.1569, + "step": 3035 + }, + { + "epoch": 0.2797254341917354, + "grad_norm": 0.9718559574608084, + "learning_rate": 4.312692891542302e-06, + "loss": 0.1661, + "step": 3036 + }, + { + "epoch": 0.2798175703690054, + "grad_norm": 0.9446994124364056, + "learning_rate": 4.312168192169254e-06, + "loss": 0.1603, + "step": 3037 + }, + { + "epoch": 0.2799097065462754, + "grad_norm": 0.8951970904849386, + "learning_rate": 4.311643324536195e-06, + "loss": 0.1624, + "step": 3038 + }, + { + "epoch": 0.2800018427235454, + "grad_norm": 0.9198959941328185, + "learning_rate": 4.311118288691859e-06, + "loss": 0.1684, + "step": 3039 + }, + { + "epoch": 0.2800939789008154, + "grad_norm": 0.9587061003460235, + "learning_rate": 4.3105930846849945e-06, + "loss": 0.1714, + "step": 3040 + }, + { + "epoch": 0.2801861150780854, + "grad_norm": 0.8302605952313523, + "learning_rate": 4.310067712564367e-06, + "loss": 0.1448, + "step": 3041 + }, + { + "epoch": 0.28027825125535544, + "grad_norm": 0.9283440649700089, + "learning_rate": 4.3095421723787585e-06, + "loss": 0.1672, + "step": 3042 + }, + { + "epoch": 0.28037038743262543, + "grad_norm": 0.9372916319568307, + "learning_rate": 4.309016464176964e-06, + "loss": 0.1653, + "step": 3043 + }, + { + "epoch": 0.28046252360989543, + "grad_norm": 0.9278315387270265, + "learning_rate": 4.308490588007796e-06, + "loss": 0.1634, + "step": 3044 + }, + { + "epoch": 0.28055465978716543, + "grad_norm": 0.9396311360757985, + "learning_rate": 4.307964543920083e-06, + "loss": 0.1662, + "step": 3045 + }, + { + "epoch": 0.2806467959644354, + "grad_norm": 0.9113997709540158, + "learning_rate": 4.3074383319626655e-06, + "loss": 0.1594, + "step": 3046 + }, + { + "epoch": 0.2807389321417054, + "grad_norm": 0.8784488154165262, + "learning_rate": 4.306911952184406e-06, + "loss": 0.1487, + "step": 3047 + }, + { + "epoch": 0.2808310683189754, + "grad_norm": 0.9661023236004905, + "learning_rate": 4.306385404634177e-06, + "loss": 0.1786, + "step": 3048 + }, + { + "epoch": 0.28092320449624547, + "grad_norm": 0.9097267950273097, + "learning_rate": 4.305858689360869e-06, + "loss": 0.1492, + "step": 3049 + }, + { + "epoch": 0.28101534067351547, + "grad_norm": 0.9366899670263596, + "learning_rate": 4.3053318064133864e-06, + "loss": 0.1637, + "step": 3050 + }, + { + "epoch": 0.28110747685078546, + "grad_norm": 0.9021081218041649, + "learning_rate": 4.3048047558406525e-06, + "loss": 0.159, + "step": 3051 + }, + { + "epoch": 0.28119961302805546, + "grad_norm": 0.9412954829333593, + "learning_rate": 4.304277537691602e-06, + "loss": 0.1589, + "step": 3052 + }, + { + "epoch": 0.28129174920532546, + "grad_norm": 0.9059656008131649, + "learning_rate": 4.303750152015188e-06, + "loss": 0.1675, + "step": 3053 + }, + { + "epoch": 0.28138388538259546, + "grad_norm": 0.8794175338311734, + "learning_rate": 4.3032225988603786e-06, + "loss": 0.1537, + "step": 3054 + }, + { + "epoch": 0.2814760215598655, + "grad_norm": 0.913108805336839, + "learning_rate": 4.302694878276157e-06, + "loss": 0.1679, + "step": 3055 + }, + { + "epoch": 0.2815681577371355, + "grad_norm": 0.9989015886002992, + "learning_rate": 4.302166990311522e-06, + "loss": 0.1825, + "step": 3056 + }, + { + "epoch": 0.2816602939144055, + "grad_norm": 0.9152409790134869, + "learning_rate": 4.301638935015487e-06, + "loss": 0.1456, + "step": 3057 + }, + { + "epoch": 0.2817524300916755, + "grad_norm": 0.8561207996336198, + "learning_rate": 4.3011107124370835e-06, + "loss": 0.1498, + "step": 3058 + }, + { + "epoch": 0.2818445662689455, + "grad_norm": 0.9062836968107347, + "learning_rate": 4.300582322625356e-06, + "loss": 0.1524, + "step": 3059 + }, + { + "epoch": 0.2819367024462155, + "grad_norm": 1.0121945627292734, + "learning_rate": 4.300053765629367e-06, + "loss": 0.1708, + "step": 3060 + }, + { + "epoch": 0.2820288386234855, + "grad_norm": 0.8853915734328843, + "learning_rate": 4.299525041498192e-06, + "loss": 0.149, + "step": 3061 + }, + { + "epoch": 0.28212097480075554, + "grad_norm": 0.8736829341581167, + "learning_rate": 4.298996150280923e-06, + "loss": 0.1581, + "step": 3062 + }, + { + "epoch": 0.28221311097802554, + "grad_norm": 1.031079870711207, + "learning_rate": 4.298467092026668e-06, + "loss": 0.1677, + "step": 3063 + }, + { + "epoch": 0.28230524715529554, + "grad_norm": 0.9303524529887373, + "learning_rate": 4.29793786678455e-06, + "loss": 0.1517, + "step": 3064 + }, + { + "epoch": 0.28239738333256553, + "grad_norm": 1.0035979953985423, + "learning_rate": 4.297408474603707e-06, + "loss": 0.1738, + "step": 3065 + }, + { + "epoch": 0.28248951950983553, + "grad_norm": 0.9516958507458324, + "learning_rate": 4.296878915533294e-06, + "loss": 0.1703, + "step": 3066 + }, + { + "epoch": 0.2825816556871055, + "grad_norm": 0.8632532319262257, + "learning_rate": 4.2963491896224806e-06, + "loss": 0.1541, + "step": 3067 + }, + { + "epoch": 0.2826737918643755, + "grad_norm": 0.9264065893787056, + "learning_rate": 4.295819296920451e-06, + "loss": 0.1602, + "step": 3068 + }, + { + "epoch": 0.2827659280416456, + "grad_norm": 0.9289191168966096, + "learning_rate": 4.295289237476407e-06, + "loss": 0.1623, + "step": 3069 + }, + { + "epoch": 0.2828580642189156, + "grad_norm": 0.8903370320980405, + "learning_rate": 4.294759011339564e-06, + "loss": 0.1574, + "step": 3070 + }, + { + "epoch": 0.28295020039618557, + "grad_norm": 0.9863779141764555, + "learning_rate": 4.294228618559153e-06, + "loss": 0.178, + "step": 3071 + }, + { + "epoch": 0.28304233657345557, + "grad_norm": 0.8660438063688297, + "learning_rate": 4.293698059184423e-06, + "loss": 0.1549, + "step": 3072 + }, + { + "epoch": 0.28313447275072556, + "grad_norm": 0.8464393507295984, + "learning_rate": 4.293167333264634e-06, + "loss": 0.1516, + "step": 3073 + }, + { + "epoch": 0.28322660892799556, + "grad_norm": 0.911983641200305, + "learning_rate": 4.292636440849065e-06, + "loss": 0.1559, + "step": 3074 + }, + { + "epoch": 0.28331874510526556, + "grad_norm": 0.8761141663534089, + "learning_rate": 4.292105381987011e-06, + "loss": 0.1531, + "step": 3075 + }, + { + "epoch": 0.2834108812825356, + "grad_norm": 0.9819629945606868, + "learning_rate": 4.291574156727778e-06, + "loss": 0.1823, + "step": 3076 + }, + { + "epoch": 0.2835030174598056, + "grad_norm": 0.92229535525268, + "learning_rate": 4.291042765120693e-06, + "loss": 0.1551, + "step": 3077 + }, + { + "epoch": 0.2835951536370756, + "grad_norm": 0.8734506454229878, + "learning_rate": 4.290511207215093e-06, + "loss": 0.1562, + "step": 3078 + }, + { + "epoch": 0.2836872898143456, + "grad_norm": 0.9258218141455302, + "learning_rate": 4.289979483060336e-06, + "loss": 0.1643, + "step": 3079 + }, + { + "epoch": 0.2837794259916156, + "grad_norm": 0.8863624060949952, + "learning_rate": 4.289447592705791e-06, + "loss": 0.151, + "step": 3080 + }, + { + "epoch": 0.2838715621688856, + "grad_norm": 0.9360893187784172, + "learning_rate": 4.2889155362008435e-06, + "loss": 0.1724, + "step": 3081 + }, + { + "epoch": 0.2839636983461556, + "grad_norm": 0.9114271743045955, + "learning_rate": 4.288383313594897e-06, + "loss": 0.1571, + "step": 3082 + }, + { + "epoch": 0.28405583452342564, + "grad_norm": 0.9140713488133063, + "learning_rate": 4.287850924937367e-06, + "loss": 0.1625, + "step": 3083 + }, + { + "epoch": 0.28414797070069564, + "grad_norm": 0.8729642931002742, + "learning_rate": 4.287318370277686e-06, + "loss": 0.1598, + "step": 3084 + }, + { + "epoch": 0.28424010687796564, + "grad_norm": 0.8690224870912999, + "learning_rate": 4.286785649665302e-06, + "loss": 0.1428, + "step": 3085 + }, + { + "epoch": 0.28433224305523563, + "grad_norm": 0.942851685881234, + "learning_rate": 4.286252763149679e-06, + "loss": 0.1726, + "step": 3086 + }, + { + "epoch": 0.28442437923250563, + "grad_norm": 1.0015144979953934, + "learning_rate": 4.2857197107802936e-06, + "loss": 0.1628, + "step": 3087 + }, + { + "epoch": 0.2845165154097756, + "grad_norm": 0.8880299301894348, + "learning_rate": 4.285186492606641e-06, + "loss": 0.1553, + "step": 3088 + }, + { + "epoch": 0.2846086515870457, + "grad_norm": 0.8486996924762646, + "learning_rate": 4.2846531086782315e-06, + "loss": 0.1475, + "step": 3089 + }, + { + "epoch": 0.2847007877643157, + "grad_norm": 0.9489188020398197, + "learning_rate": 4.2841195590445875e-06, + "loss": 0.1622, + "step": 3090 + }, + { + "epoch": 0.2847929239415857, + "grad_norm": 1.0150637526631434, + "learning_rate": 4.283585843755251e-06, + "loss": 0.1593, + "step": 3091 + }, + { + "epoch": 0.28488506011885567, + "grad_norm": 0.9502841585717497, + "learning_rate": 4.283051962859776e-06, + "loss": 0.1587, + "step": 3092 + }, + { + "epoch": 0.28497719629612567, + "grad_norm": 0.9644289682928954, + "learning_rate": 4.2825179164077365e-06, + "loss": 0.1807, + "step": 3093 + }, + { + "epoch": 0.28506933247339566, + "grad_norm": 0.893244872345484, + "learning_rate": 4.281983704448715e-06, + "loss": 0.1543, + "step": 3094 + }, + { + "epoch": 0.28516146865066566, + "grad_norm": 0.8971291446766364, + "learning_rate": 4.281449327032315e-06, + "loss": 0.1634, + "step": 3095 + }, + { + "epoch": 0.2852536048279357, + "grad_norm": 0.8839743026411959, + "learning_rate": 4.2809147842081535e-06, + "loss": 0.1497, + "step": 3096 + }, + { + "epoch": 0.2853457410052057, + "grad_norm": 0.8814626684254067, + "learning_rate": 4.280380076025863e-06, + "loss": 0.1553, + "step": 3097 + }, + { + "epoch": 0.2854378771824757, + "grad_norm": 0.9336844802468615, + "learning_rate": 4.27984520253509e-06, + "loss": 0.1746, + "step": 3098 + }, + { + "epoch": 0.2855300133597457, + "grad_norm": 0.8813308242067851, + "learning_rate": 4.279310163785499e-06, + "loss": 0.1672, + "step": 3099 + }, + { + "epoch": 0.2856221495370157, + "grad_norm": 0.92626645246821, + "learning_rate": 4.278774959826768e-06, + "loss": 0.154, + "step": 3100 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.0123665912926623, + "learning_rate": 4.2782395907085894e-06, + "loss": 0.1978, + "step": 3101 + }, + { + "epoch": 0.2858064218915557, + "grad_norm": 0.9242315379627385, + "learning_rate": 4.277704056480674e-06, + "loss": 0.1711, + "step": 3102 + }, + { + "epoch": 0.28589855806882575, + "grad_norm": 0.8996561918227259, + "learning_rate": 4.2771683571927455e-06, + "loss": 0.1785, + "step": 3103 + }, + { + "epoch": 0.28599069424609574, + "grad_norm": 0.9052116172117433, + "learning_rate": 4.276632492894544e-06, + "loss": 0.1484, + "step": 3104 + }, + { + "epoch": 0.28608283042336574, + "grad_norm": 0.9657865678405, + "learning_rate": 4.276096463635825e-06, + "loss": 0.1653, + "step": 3105 + }, + { + "epoch": 0.28617496660063574, + "grad_norm": 0.93706574283875, + "learning_rate": 4.275560269466358e-06, + "loss": 0.1673, + "step": 3106 + }, + { + "epoch": 0.28626710277790574, + "grad_norm": 0.9397267517974345, + "learning_rate": 4.275023910435928e-06, + "loss": 0.1537, + "step": 3107 + }, + { + "epoch": 0.28635923895517573, + "grad_norm": 0.9004009633111321, + "learning_rate": 4.274487386594338e-06, + "loss": 0.1538, + "step": 3108 + }, + { + "epoch": 0.28645137513244573, + "grad_norm": 0.9613421096528934, + "learning_rate": 4.273950697991402e-06, + "loss": 0.162, + "step": 3109 + }, + { + "epoch": 0.2865435113097158, + "grad_norm": 0.8638174797582878, + "learning_rate": 4.273413844676953e-06, + "loss": 0.1519, + "step": 3110 + }, + { + "epoch": 0.2866356474869858, + "grad_norm": 0.9219609665483445, + "learning_rate": 4.272876826700838e-06, + "loss": 0.1581, + "step": 3111 + }, + { + "epoch": 0.2867277836642558, + "grad_norm": 0.9123932775435808, + "learning_rate": 4.27233964411292e-06, + "loss": 0.1677, + "step": 3112 + }, + { + "epoch": 0.28681991984152577, + "grad_norm": 0.8596332032462187, + "learning_rate": 4.271802296963073e-06, + "loss": 0.1476, + "step": 3113 + }, + { + "epoch": 0.28691205601879577, + "grad_norm": 0.9236043959851093, + "learning_rate": 4.271264785301194e-06, + "loss": 0.1564, + "step": 3114 + }, + { + "epoch": 0.28700419219606577, + "grad_norm": 0.9562016916199808, + "learning_rate": 4.270727109177188e-06, + "loss": 0.17, + "step": 3115 + }, + { + "epoch": 0.28709632837333576, + "grad_norm": 0.9001979247167449, + "learning_rate": 4.270189268640979e-06, + "loss": 0.1626, + "step": 3116 + }, + { + "epoch": 0.2871884645506058, + "grad_norm": 0.8700019211927976, + "learning_rate": 4.269651263742507e-06, + "loss": 0.1634, + "step": 3117 + }, + { + "epoch": 0.2872806007278758, + "grad_norm": 0.9996977490972816, + "learning_rate": 4.269113094531724e-06, + "loss": 0.1679, + "step": 3118 + }, + { + "epoch": 0.2873727369051458, + "grad_norm": 0.9392105076354225, + "learning_rate": 4.268574761058601e-06, + "loss": 0.1763, + "step": 3119 + }, + { + "epoch": 0.2874648730824158, + "grad_norm": 0.9496915781676928, + "learning_rate": 4.26803626337312e-06, + "loss": 0.1546, + "step": 3120 + }, + { + "epoch": 0.2875570092596858, + "grad_norm": 0.9427266315727539, + "learning_rate": 4.267497601525281e-06, + "loss": 0.1687, + "step": 3121 + }, + { + "epoch": 0.2876491454369558, + "grad_norm": 0.8473345537965931, + "learning_rate": 4.266958775565101e-06, + "loss": 0.1401, + "step": 3122 + }, + { + "epoch": 0.28774128161422585, + "grad_norm": 0.9460675493514902, + "learning_rate": 4.266419785542607e-06, + "loss": 0.1626, + "step": 3123 + }, + { + "epoch": 0.28783341779149585, + "grad_norm": 0.986061711219792, + "learning_rate": 4.265880631507847e-06, + "loss": 0.162, + "step": 3124 + }, + { + "epoch": 0.28792555396876585, + "grad_norm": 0.9533196068906457, + "learning_rate": 4.265341313510879e-06, + "loss": 0.163, + "step": 3125 + }, + { + "epoch": 0.28801769014603584, + "grad_norm": 0.9354306059737347, + "learning_rate": 4.264801831601781e-06, + "loss": 0.17, + "step": 3126 + }, + { + "epoch": 0.28810982632330584, + "grad_norm": 0.8667926960766736, + "learning_rate": 4.264262185830643e-06, + "loss": 0.1498, + "step": 3127 + }, + { + "epoch": 0.28820196250057584, + "grad_norm": 1.0641447059766413, + "learning_rate": 4.263722376247571e-06, + "loss": 0.1683, + "step": 3128 + }, + { + "epoch": 0.28829409867784583, + "grad_norm": 0.9032416434200741, + "learning_rate": 4.263182402902687e-06, + "loss": 0.154, + "step": 3129 + }, + { + "epoch": 0.2883862348551159, + "grad_norm": 0.8697261860817109, + "learning_rate": 4.262642265846127e-06, + "loss": 0.1479, + "step": 3130 + }, + { + "epoch": 0.2884783710323859, + "grad_norm": 0.9237533351121093, + "learning_rate": 4.262101965128042e-06, + "loss": 0.159, + "step": 3131 + }, + { + "epoch": 0.2885705072096559, + "grad_norm": 0.9107264611849066, + "learning_rate": 4.261561500798601e-06, + "loss": 0.1612, + "step": 3132 + }, + { + "epoch": 0.2886626433869259, + "grad_norm": 0.8700605038024919, + "learning_rate": 4.261020872907985e-06, + "loss": 0.1582, + "step": 3133 + }, + { + "epoch": 0.2887547795641959, + "grad_norm": 0.9432745501419812, + "learning_rate": 4.26048008150639e-06, + "loss": 0.164, + "step": 3134 + }, + { + "epoch": 0.28884691574146587, + "grad_norm": 0.9282434523299004, + "learning_rate": 4.259939126644032e-06, + "loss": 0.1679, + "step": 3135 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 0.9423611364408566, + "learning_rate": 4.259398008371136e-06, + "loss": 0.1676, + "step": 3136 + }, + { + "epoch": 0.2890311880960059, + "grad_norm": 0.8685127726333524, + "learning_rate": 4.258856726737945e-06, + "loss": 0.1492, + "step": 3137 + }, + { + "epoch": 0.2891233242732759, + "grad_norm": 0.9771353056369224, + "learning_rate": 4.258315281794718e-06, + "loss": 0.1631, + "step": 3138 + }, + { + "epoch": 0.2892154604505459, + "grad_norm": 0.9624194484481019, + "learning_rate": 4.257773673591728e-06, + "loss": 0.1498, + "step": 3139 + }, + { + "epoch": 0.2893075966278159, + "grad_norm": 0.9092065792466625, + "learning_rate": 4.257231902179263e-06, + "loss": 0.1504, + "step": 3140 + }, + { + "epoch": 0.2893997328050859, + "grad_norm": 0.956521824952388, + "learning_rate": 4.256689967607627e-06, + "loss": 0.1725, + "step": 3141 + }, + { + "epoch": 0.2894918689823559, + "grad_norm": 0.9338296990138528, + "learning_rate": 4.256147869927137e-06, + "loss": 0.1581, + "step": 3142 + }, + { + "epoch": 0.2895840051596259, + "grad_norm": 0.888802037271971, + "learning_rate": 4.25560560918813e-06, + "loss": 0.1602, + "step": 3143 + }, + { + "epoch": 0.28967614133689595, + "grad_norm": 0.9319211913936077, + "learning_rate": 4.255063185440953e-06, + "loss": 0.1654, + "step": 3144 + }, + { + "epoch": 0.28976827751416595, + "grad_norm": 0.9786469213747607, + "learning_rate": 4.254520598735971e-06, + "loss": 0.1824, + "step": 3145 + }, + { + "epoch": 0.28986041369143595, + "grad_norm": 0.9157519191307902, + "learning_rate": 4.253977849123561e-06, + "loss": 0.1612, + "step": 3146 + }, + { + "epoch": 0.28995254986870594, + "grad_norm": 0.9298000384411869, + "learning_rate": 4.25343493665412e-06, + "loss": 0.15, + "step": 3147 + }, + { + "epoch": 0.29004468604597594, + "grad_norm": 0.9180849563596113, + "learning_rate": 4.252891861378056e-06, + "loss": 0.1682, + "step": 3148 + }, + { + "epoch": 0.29013682222324594, + "grad_norm": 0.970214702091066, + "learning_rate": 4.252348623345794e-06, + "loss": 0.1724, + "step": 3149 + }, + { + "epoch": 0.29022895840051594, + "grad_norm": 0.8712777245705906, + "learning_rate": 4.2518052226077734e-06, + "loss": 0.1471, + "step": 3150 + }, + { + "epoch": 0.290321094577786, + "grad_norm": 0.8772532104660963, + "learning_rate": 4.25126165921445e-06, + "loss": 0.1586, + "step": 3151 + }, + { + "epoch": 0.290413230755056, + "grad_norm": 0.8603518107957989, + "learning_rate": 4.250717933216293e-06, + "loss": 0.1485, + "step": 3152 + }, + { + "epoch": 0.290505366932326, + "grad_norm": 0.9476717820121503, + "learning_rate": 4.250174044663787e-06, + "loss": 0.1641, + "step": 3153 + }, + { + "epoch": 0.290597503109596, + "grad_norm": 0.9518921904154757, + "learning_rate": 4.249629993607433e-06, + "loss": 0.1602, + "step": 3154 + }, + { + "epoch": 0.290689639286866, + "grad_norm": 0.8289642643179239, + "learning_rate": 4.249085780097746e-06, + "loss": 0.1506, + "step": 3155 + }, + { + "epoch": 0.29078177546413597, + "grad_norm": 0.9148601434226283, + "learning_rate": 4.248541404185255e-06, + "loss": 0.1575, + "step": 3156 + }, + { + "epoch": 0.290873911641406, + "grad_norm": 0.944322099578078, + "learning_rate": 4.247996865920509e-06, + "loss": 0.1676, + "step": 3157 + }, + { + "epoch": 0.290966047818676, + "grad_norm": 1.0149429041739264, + "learning_rate": 4.247452165354064e-06, + "loss": 0.1757, + "step": 3158 + }, + { + "epoch": 0.291058183995946, + "grad_norm": 0.8897165675585696, + "learning_rate": 4.246907302536497e-06, + "loss": 0.1503, + "step": 3159 + }, + { + "epoch": 0.291150320173216, + "grad_norm": 0.9069588057960449, + "learning_rate": 4.246362277518399e-06, + "loss": 0.1633, + "step": 3160 + }, + { + "epoch": 0.291242456350486, + "grad_norm": 0.8724677368674314, + "learning_rate": 4.245817090350377e-06, + "loss": 0.1507, + "step": 3161 + }, + { + "epoch": 0.291334592527756, + "grad_norm": 0.8859929836695598, + "learning_rate": 4.245271741083049e-06, + "loss": 0.1669, + "step": 3162 + }, + { + "epoch": 0.291426728705026, + "grad_norm": 0.990522122817011, + "learning_rate": 4.244726229767052e-06, + "loss": 0.1826, + "step": 3163 + }, + { + "epoch": 0.29151886488229606, + "grad_norm": 0.8546328526513989, + "learning_rate": 4.2441805564530366e-06, + "loss": 0.1501, + "step": 3164 + }, + { + "epoch": 0.29161100105956606, + "grad_norm": 0.9211953614605264, + "learning_rate": 4.2436347211916695e-06, + "loss": 0.1639, + "step": 3165 + }, + { + "epoch": 0.29170313723683605, + "grad_norm": 1.1441021991526923, + "learning_rate": 4.243088724033632e-06, + "loss": 0.1586, + "step": 3166 + }, + { + "epoch": 0.29179527341410605, + "grad_norm": 0.9121448781527541, + "learning_rate": 4.242542565029617e-06, + "loss": 0.1676, + "step": 3167 + }, + { + "epoch": 0.29188740959137605, + "grad_norm": 0.8570822660987969, + "learning_rate": 4.241996244230338e-06, + "loss": 0.1572, + "step": 3168 + }, + { + "epoch": 0.29197954576864604, + "grad_norm": 0.8798918108299641, + "learning_rate": 4.24144976168652e-06, + "loss": 0.1552, + "step": 3169 + }, + { + "epoch": 0.29207168194591604, + "grad_norm": 0.9012805667976503, + "learning_rate": 4.240903117448904e-06, + "loss": 0.1608, + "step": 3170 + }, + { + "epoch": 0.2921638181231861, + "grad_norm": 0.8268739629189876, + "learning_rate": 4.240356311568247e-06, + "loss": 0.1454, + "step": 3171 + }, + { + "epoch": 0.2922559543004561, + "grad_norm": 0.9519215570676918, + "learning_rate": 4.239809344095319e-06, + "loss": 0.1645, + "step": 3172 + }, + { + "epoch": 0.2923480904777261, + "grad_norm": 0.9360762926839049, + "learning_rate": 4.239262215080906e-06, + "loss": 0.1584, + "step": 3173 + }, + { + "epoch": 0.2924402266549961, + "grad_norm": 0.9743162418622031, + "learning_rate": 4.238714924575809e-06, + "loss": 0.185, + "step": 3174 + }, + { + "epoch": 0.2925323628322661, + "grad_norm": 0.8853535313691572, + "learning_rate": 4.238167472630844e-06, + "loss": 0.1475, + "step": 3175 + }, + { + "epoch": 0.2926244990095361, + "grad_norm": 0.9429373674217792, + "learning_rate": 4.237619859296842e-06, + "loss": 0.1615, + "step": 3176 + }, + { + "epoch": 0.2927166351868061, + "grad_norm": 0.8754838330157808, + "learning_rate": 4.237072084624649e-06, + "loss": 0.141, + "step": 3177 + }, + { + "epoch": 0.2928087713640761, + "grad_norm": 0.9265393674777754, + "learning_rate": 4.2365241486651275e-06, + "loss": 0.1543, + "step": 3178 + }, + { + "epoch": 0.2929009075413461, + "grad_norm": 0.892789764988484, + "learning_rate": 4.235976051469151e-06, + "loss": 0.1626, + "step": 3179 + }, + { + "epoch": 0.2929930437186161, + "grad_norm": 0.8348631663386511, + "learning_rate": 4.23542779308761e-06, + "loss": 0.1429, + "step": 3180 + }, + { + "epoch": 0.2930851798958861, + "grad_norm": 0.8821394531522185, + "learning_rate": 4.234879373571413e-06, + "loss": 0.162, + "step": 3181 + }, + { + "epoch": 0.2931773160731561, + "grad_norm": 0.8806914298120246, + "learning_rate": 4.234330792971479e-06, + "loss": 0.1632, + "step": 3182 + }, + { + "epoch": 0.2932694522504261, + "grad_norm": 0.8918627316336875, + "learning_rate": 4.233782051338745e-06, + "loss": 0.1617, + "step": 3183 + }, + { + "epoch": 0.2933615884276961, + "grad_norm": 0.8678930306994892, + "learning_rate": 4.23323314872416e-06, + "loss": 0.1533, + "step": 3184 + }, + { + "epoch": 0.29345372460496616, + "grad_norm": 0.9178630543984329, + "learning_rate": 4.232684085178691e-06, + "loss": 0.1649, + "step": 3185 + }, + { + "epoch": 0.29354586078223616, + "grad_norm": 0.8753634315475634, + "learning_rate": 4.232134860753318e-06, + "loss": 0.1673, + "step": 3186 + }, + { + "epoch": 0.29363799695950615, + "grad_norm": 0.8870097305860352, + "learning_rate": 4.231585475499037e-06, + "loss": 0.1448, + "step": 3187 + }, + { + "epoch": 0.29373013313677615, + "grad_norm": 0.9544469402594513, + "learning_rate": 4.231035929466858e-06, + "loss": 0.1595, + "step": 3188 + }, + { + "epoch": 0.29382226931404615, + "grad_norm": 0.9241039149441995, + "learning_rate": 4.230486222707807e-06, + "loss": 0.1527, + "step": 3189 + }, + { + "epoch": 0.29391440549131614, + "grad_norm": 0.8843626519655292, + "learning_rate": 4.229936355272924e-06, + "loss": 0.1617, + "step": 3190 + }, + { + "epoch": 0.2940065416685862, + "grad_norm": 0.9271307349122019, + "learning_rate": 4.229386327213264e-06, + "loss": 0.1611, + "step": 3191 + }, + { + "epoch": 0.2940986778458562, + "grad_norm": 0.9492531025198703, + "learning_rate": 4.228836138579897e-06, + "loss": 0.1729, + "step": 3192 + }, + { + "epoch": 0.2941908140231262, + "grad_norm": 0.9295851214577007, + "learning_rate": 4.2282857894239085e-06, + "loss": 0.1687, + "step": 3193 + }, + { + "epoch": 0.2942829502003962, + "grad_norm": 0.977256783995922, + "learning_rate": 4.227735279796399e-06, + "loss": 0.1628, + "step": 3194 + }, + { + "epoch": 0.2943750863776662, + "grad_norm": 0.8930681892980293, + "learning_rate": 4.227184609748483e-06, + "loss": 0.1693, + "step": 3195 + }, + { + "epoch": 0.2944672225549362, + "grad_norm": 0.8933894768496947, + "learning_rate": 4.226633779331289e-06, + "loss": 0.1508, + "step": 3196 + }, + { + "epoch": 0.2945593587322062, + "grad_norm": 0.8927683830142663, + "learning_rate": 4.226082788595965e-06, + "loss": 0.1453, + "step": 3197 + }, + { + "epoch": 0.29465149490947623, + "grad_norm": 0.9104305630689763, + "learning_rate": 4.225531637593666e-06, + "loss": 0.1563, + "step": 3198 + }, + { + "epoch": 0.2947436310867462, + "grad_norm": 0.9241605921153313, + "learning_rate": 4.2249803263755695e-06, + "loss": 0.1743, + "step": 3199 + }, + { + "epoch": 0.2948357672640162, + "grad_norm": 0.9296113117121186, + "learning_rate": 4.2244288549928645e-06, + "loss": 0.1516, + "step": 3200 + }, + { + "epoch": 0.2949279034412862, + "grad_norm": 0.9199239821431868, + "learning_rate": 4.223877223496754e-06, + "loss": 0.1663, + "step": 3201 + }, + { + "epoch": 0.2950200396185562, + "grad_norm": 0.9000094471075423, + "learning_rate": 4.223325431938459e-06, + "loss": 0.1564, + "step": 3202 + }, + { + "epoch": 0.2951121757958262, + "grad_norm": 0.9194190986912426, + "learning_rate": 4.2227734803692115e-06, + "loss": 0.1555, + "step": 3203 + }, + { + "epoch": 0.2952043119730962, + "grad_norm": 0.8767195760502394, + "learning_rate": 4.2222213688402605e-06, + "loss": 0.1386, + "step": 3204 + }, + { + "epoch": 0.29529644815036626, + "grad_norm": 0.8871918600848787, + "learning_rate": 4.22166909740287e-06, + "loss": 0.144, + "step": 3205 + }, + { + "epoch": 0.29538858432763626, + "grad_norm": 1.0100555949711532, + "learning_rate": 4.221116666108319e-06, + "loss": 0.171, + "step": 3206 + }, + { + "epoch": 0.29548072050490626, + "grad_norm": 0.907837346659232, + "learning_rate": 4.2205640750079e-06, + "loss": 0.1585, + "step": 3207 + }, + { + "epoch": 0.29557285668217625, + "grad_norm": 0.9685782224087519, + "learning_rate": 4.220011324152922e-06, + "loss": 0.1694, + "step": 3208 + }, + { + "epoch": 0.29566499285944625, + "grad_norm": 0.9318424056577996, + "learning_rate": 4.219458413594707e-06, + "loss": 0.1661, + "step": 3209 + }, + { + "epoch": 0.29575712903671625, + "grad_norm": 0.975106382055604, + "learning_rate": 4.218905343384593e-06, + "loss": 0.1648, + "step": 3210 + }, + { + "epoch": 0.29584926521398625, + "grad_norm": 0.8829421015276901, + "learning_rate": 4.218352113573933e-06, + "loss": 0.161, + "step": 3211 + }, + { + "epoch": 0.2959414013912563, + "grad_norm": 0.9457196742302185, + "learning_rate": 4.217798724214094e-06, + "loss": 0.176, + "step": 3212 + }, + { + "epoch": 0.2960335375685263, + "grad_norm": 0.9877734213208268, + "learning_rate": 4.21724517535646e-06, + "loss": 0.161, + "step": 3213 + }, + { + "epoch": 0.2961256737457963, + "grad_norm": 0.8864723853789074, + "learning_rate": 4.216691467052426e-06, + "loss": 0.1501, + "step": 3214 + }, + { + "epoch": 0.2962178099230663, + "grad_norm": 0.8725650110211445, + "learning_rate": 4.216137599353404e-06, + "loss": 0.149, + "step": 3215 + }, + { + "epoch": 0.2963099461003363, + "grad_norm": 0.9069598790303386, + "learning_rate": 4.215583572310821e-06, + "loss": 0.1522, + "step": 3216 + }, + { + "epoch": 0.2964020822776063, + "grad_norm": 0.9359483520411078, + "learning_rate": 4.2150293859761196e-06, + "loss": 0.1575, + "step": 3217 + }, + { + "epoch": 0.29649421845487633, + "grad_norm": 0.9376534373883524, + "learning_rate": 4.214475040400755e-06, + "loss": 0.1693, + "step": 3218 + }, + { + "epoch": 0.29658635463214633, + "grad_norm": 0.9605366150327874, + "learning_rate": 4.213920535636198e-06, + "loss": 0.1555, + "step": 3219 + }, + { + "epoch": 0.29667849080941633, + "grad_norm": 0.9239353702038833, + "learning_rate": 4.213365871733934e-06, + "loss": 0.1589, + "step": 3220 + }, + { + "epoch": 0.2967706269866863, + "grad_norm": 0.9265223393518568, + "learning_rate": 4.212811048745467e-06, + "loss": 0.1625, + "step": 3221 + }, + { + "epoch": 0.2968627631639563, + "grad_norm": 0.9788953150847244, + "learning_rate": 4.212256066722307e-06, + "loss": 0.1648, + "step": 3222 + }, + { + "epoch": 0.2969548993412263, + "grad_norm": 0.8587737017236943, + "learning_rate": 4.211700925715988e-06, + "loss": 0.1434, + "step": 3223 + }, + { + "epoch": 0.2970470355184963, + "grad_norm": 0.9571360236107946, + "learning_rate": 4.211145625778054e-06, + "loss": 0.1718, + "step": 3224 + }, + { + "epoch": 0.29713917169576637, + "grad_norm": 0.8948875668309092, + "learning_rate": 4.2105901669600645e-06, + "loss": 0.1493, + "step": 3225 + }, + { + "epoch": 0.29723130787303637, + "grad_norm": 0.8698739984411084, + "learning_rate": 4.210034549313594e-06, + "loss": 0.1537, + "step": 3226 + }, + { + "epoch": 0.29732344405030636, + "grad_norm": 0.9127957190567569, + "learning_rate": 4.2094787728902305e-06, + "loss": 0.154, + "step": 3227 + }, + { + "epoch": 0.29741558022757636, + "grad_norm": 0.9743876075133446, + "learning_rate": 4.20892283774158e-06, + "loss": 0.1653, + "step": 3228 + }, + { + "epoch": 0.29750771640484636, + "grad_norm": 0.9697707594140141, + "learning_rate": 4.20836674391926e-06, + "loss": 0.1628, + "step": 3229 + }, + { + "epoch": 0.29759985258211635, + "grad_norm": 0.9921447822593994, + "learning_rate": 4.207810491474904e-06, + "loss": 0.1741, + "step": 3230 + }, + { + "epoch": 0.29769198875938635, + "grad_norm": 1.0175502207785552, + "learning_rate": 4.207254080460161e-06, + "loss": 0.1759, + "step": 3231 + }, + { + "epoch": 0.2977841249366564, + "grad_norm": 0.8958978146833146, + "learning_rate": 4.206697510926691e-06, + "loss": 0.1538, + "step": 3232 + }, + { + "epoch": 0.2978762611139264, + "grad_norm": 0.950880039603536, + "learning_rate": 4.206140782926174e-06, + "loss": 0.1721, + "step": 3233 + }, + { + "epoch": 0.2979683972911964, + "grad_norm": 0.925327176872321, + "learning_rate": 4.205583896510303e-06, + "loss": 0.1595, + "step": 3234 + }, + { + "epoch": 0.2980605334684664, + "grad_norm": 0.9532665759717548, + "learning_rate": 4.2050268517307816e-06, + "loss": 0.1639, + "step": 3235 + }, + { + "epoch": 0.2981526696457364, + "grad_norm": 0.9291134573772569, + "learning_rate": 4.204469648639335e-06, + "loss": 0.1715, + "step": 3236 + }, + { + "epoch": 0.2982448058230064, + "grad_norm": 0.8797274289945238, + "learning_rate": 4.203912287287697e-06, + "loss": 0.1604, + "step": 3237 + }, + { + "epoch": 0.2983369420002764, + "grad_norm": 0.9040168151293431, + "learning_rate": 4.203354767727621e-06, + "loss": 0.1658, + "step": 3238 + }, + { + "epoch": 0.29842907817754644, + "grad_norm": 0.985469121655086, + "learning_rate": 4.202797090010871e-06, + "loss": 0.1692, + "step": 3239 + }, + { + "epoch": 0.29852121435481643, + "grad_norm": 0.8991733097245737, + "learning_rate": 4.202239254189228e-06, + "loss": 0.1527, + "step": 3240 + }, + { + "epoch": 0.29861335053208643, + "grad_norm": 0.9968726368094237, + "learning_rate": 4.2016812603144865e-06, + "loss": 0.1768, + "step": 3241 + }, + { + "epoch": 0.2987054867093564, + "grad_norm": 0.9261811611053946, + "learning_rate": 4.201123108438457e-06, + "loss": 0.1609, + "step": 3242 + }, + { + "epoch": 0.2987976228866264, + "grad_norm": 0.9805890401028196, + "learning_rate": 4.2005647986129635e-06, + "loss": 0.163, + "step": 3243 + }, + { + "epoch": 0.2988897590638964, + "grad_norm": 0.9154281842963401, + "learning_rate": 4.2000063308898466e-06, + "loss": 0.154, + "step": 3244 + }, + { + "epoch": 0.2989818952411664, + "grad_norm": 0.9263190062990657, + "learning_rate": 4.199447705320958e-06, + "loss": 0.159, + "step": 3245 + }, + { + "epoch": 0.29907403141843647, + "grad_norm": 0.9684841984778471, + "learning_rate": 4.1988889219581676e-06, + "loss": 0.1809, + "step": 3246 + }, + { + "epoch": 0.29916616759570647, + "grad_norm": 0.8439439005356527, + "learning_rate": 4.198329980853357e-06, + "loss": 0.1386, + "step": 3247 + }, + { + "epoch": 0.29925830377297646, + "grad_norm": 0.8805133731749528, + "learning_rate": 4.1977708820584265e-06, + "loss": 0.1609, + "step": 3248 + }, + { + "epoch": 0.29935043995024646, + "grad_norm": 0.9299348501988777, + "learning_rate": 4.197211625625285e-06, + "loss": 0.1614, + "step": 3249 + }, + { + "epoch": 0.29944257612751646, + "grad_norm": 0.953423157440807, + "learning_rate": 4.196652211605863e-06, + "loss": 0.1753, + "step": 3250 + }, + { + "epoch": 0.29953471230478645, + "grad_norm": 0.9650855610206487, + "learning_rate": 4.196092640052099e-06, + "loss": 0.1743, + "step": 3251 + }, + { + "epoch": 0.2996268484820565, + "grad_norm": 0.8158109293633011, + "learning_rate": 4.195532911015952e-06, + "loss": 0.149, + "step": 3252 + }, + { + "epoch": 0.2997189846593265, + "grad_norm": 1.0066905218223139, + "learning_rate": 4.1949730245493915e-06, + "loss": 0.166, + "step": 3253 + }, + { + "epoch": 0.2998111208365965, + "grad_norm": 0.981891494249524, + "learning_rate": 4.194412980704403e-06, + "loss": 0.1683, + "step": 3254 + }, + { + "epoch": 0.2999032570138665, + "grad_norm": 0.9388757636396333, + "learning_rate": 4.1938527795329875e-06, + "loss": 0.1695, + "step": 3255 + }, + { + "epoch": 0.2999953931911365, + "grad_norm": 0.861978941124501, + "learning_rate": 4.1932924210871585e-06, + "loss": 0.1584, + "step": 3256 + }, + { + "epoch": 0.3000875293684065, + "grad_norm": 0.9093301324713368, + "learning_rate": 4.192731905418947e-06, + "loss": 0.1612, + "step": 3257 + }, + { + "epoch": 0.3001796655456765, + "grad_norm": 0.9074202593094116, + "learning_rate": 4.192171232580395e-06, + "loss": 0.1613, + "step": 3258 + }, + { + "epoch": 0.30027180172294654, + "grad_norm": 0.8515153187239818, + "learning_rate": 4.191610402623561e-06, + "loss": 0.1501, + "step": 3259 + }, + { + "epoch": 0.30036393790021654, + "grad_norm": 0.9327983127436237, + "learning_rate": 4.191049415600521e-06, + "loss": 0.1732, + "step": 3260 + }, + { + "epoch": 0.30045607407748653, + "grad_norm": 0.96624826925591, + "learning_rate": 4.19048827156336e-06, + "loss": 0.1695, + "step": 3261 + }, + { + "epoch": 0.30054821025475653, + "grad_norm": 0.9123411497937238, + "learning_rate": 4.189926970564181e-06, + "loss": 0.1404, + "step": 3262 + }, + { + "epoch": 0.30064034643202653, + "grad_norm": 0.9116714759394909, + "learning_rate": 4.189365512655101e-06, + "loss": 0.1605, + "step": 3263 + }, + { + "epoch": 0.3007324826092965, + "grad_norm": 0.9543230905818623, + "learning_rate": 4.188803897888251e-06, + "loss": 0.1489, + "step": 3264 + }, + { + "epoch": 0.3008246187865665, + "grad_norm": 0.946421886980175, + "learning_rate": 4.188242126315778e-06, + "loss": 0.1636, + "step": 3265 + }, + { + "epoch": 0.3009167549638366, + "grad_norm": 0.9416980687774688, + "learning_rate": 4.187680197989841e-06, + "loss": 0.1693, + "step": 3266 + }, + { + "epoch": 0.30100889114110657, + "grad_norm": 0.8628506075375213, + "learning_rate": 4.187118112962616e-06, + "loss": 0.1471, + "step": 3267 + }, + { + "epoch": 0.30110102731837657, + "grad_norm": 0.9110568486745181, + "learning_rate": 4.186555871286293e-06, + "loss": 0.1405, + "step": 3268 + }, + { + "epoch": 0.30119316349564657, + "grad_norm": 0.8937401299625113, + "learning_rate": 4.185993473013076e-06, + "loss": 0.1494, + "step": 3269 + }, + { + "epoch": 0.30128529967291656, + "grad_norm": 0.9063871070755407, + "learning_rate": 4.185430918195184e-06, + "loss": 0.1538, + "step": 3270 + }, + { + "epoch": 0.30137743585018656, + "grad_norm": 0.912457399295779, + "learning_rate": 4.184868206884849e-06, + "loss": 0.1485, + "step": 3271 + }, + { + "epoch": 0.30146957202745656, + "grad_norm": 0.9949777663011297, + "learning_rate": 4.18430533913432e-06, + "loss": 0.1771, + "step": 3272 + }, + { + "epoch": 0.3015617082047266, + "grad_norm": 0.9784479373923843, + "learning_rate": 4.183742314995859e-06, + "loss": 0.1618, + "step": 3273 + }, + { + "epoch": 0.3016538443819966, + "grad_norm": 0.9112650889954245, + "learning_rate": 4.183179134521743e-06, + "loss": 0.1513, + "step": 3274 + }, + { + "epoch": 0.3017459805592666, + "grad_norm": 0.9355584324086121, + "learning_rate": 4.1826157977642634e-06, + "loss": 0.1484, + "step": 3275 + }, + { + "epoch": 0.3018381167365366, + "grad_norm": 0.8931369194300064, + "learning_rate": 4.1820523047757246e-06, + "loss": 0.1656, + "step": 3276 + }, + { + "epoch": 0.3019302529138066, + "grad_norm": 0.9956521555709148, + "learning_rate": 4.18148865560845e-06, + "loss": 0.1626, + "step": 3277 + }, + { + "epoch": 0.3020223890910766, + "grad_norm": 0.8874050966915844, + "learning_rate": 4.180924850314771e-06, + "loss": 0.1575, + "step": 3278 + }, + { + "epoch": 0.3021145252683466, + "grad_norm": 0.9523402862573914, + "learning_rate": 4.180360888947041e-06, + "loss": 0.1703, + "step": 3279 + }, + { + "epoch": 0.30220666144561664, + "grad_norm": 0.9287874514545038, + "learning_rate": 4.179796771557619e-06, + "loss": 0.1669, + "step": 3280 + }, + { + "epoch": 0.30229879762288664, + "grad_norm": 0.8673579238655271, + "learning_rate": 4.179232498198888e-06, + "loss": 0.1503, + "step": 3281 + }, + { + "epoch": 0.30239093380015664, + "grad_norm": 0.8794126748973937, + "learning_rate": 4.178668068923238e-06, + "loss": 0.1578, + "step": 3282 + }, + { + "epoch": 0.30248306997742663, + "grad_norm": 0.9275772205249678, + "learning_rate": 4.178103483783077e-06, + "loss": 0.1525, + "step": 3283 + }, + { + "epoch": 0.30257520615469663, + "grad_norm": 0.8968088357348379, + "learning_rate": 4.177538742830828e-06, + "loss": 0.1547, + "step": 3284 + }, + { + "epoch": 0.3026673423319666, + "grad_norm": 0.8928338403213077, + "learning_rate": 4.1769738461189245e-06, + "loss": 0.1653, + "step": 3285 + }, + { + "epoch": 0.3027594785092367, + "grad_norm": 0.9542122073006188, + "learning_rate": 4.176408793699821e-06, + "loss": 0.1528, + "step": 3286 + }, + { + "epoch": 0.3028516146865067, + "grad_norm": 0.9825041852507498, + "learning_rate": 4.1758435856259784e-06, + "loss": 0.1642, + "step": 3287 + }, + { + "epoch": 0.3029437508637767, + "grad_norm": 0.9378090275370146, + "learning_rate": 4.17527822194988e-06, + "loss": 0.1532, + "step": 3288 + }, + { + "epoch": 0.30303588704104667, + "grad_norm": 0.925622043346032, + "learning_rate": 4.174712702724017e-06, + "loss": 0.1638, + "step": 3289 + }, + { + "epoch": 0.30312802321831667, + "grad_norm": 0.8605309241655659, + "learning_rate": 4.174147028000901e-06, + "loss": 0.1538, + "step": 3290 + }, + { + "epoch": 0.30322015939558666, + "grad_norm": 0.9165943989447812, + "learning_rate": 4.173581197833052e-06, + "loss": 0.1482, + "step": 3291 + }, + { + "epoch": 0.30331229557285666, + "grad_norm": 0.8668274344509658, + "learning_rate": 4.173015212273009e-06, + "loss": 0.1509, + "step": 3292 + }, + { + "epoch": 0.3034044317501267, + "grad_norm": 0.9491082207852997, + "learning_rate": 4.1724490713733246e-06, + "loss": 0.1514, + "step": 3293 + }, + { + "epoch": 0.3034965679273967, + "grad_norm": 0.9456871597563634, + "learning_rate": 4.171882775186563e-06, + "loss": 0.1619, + "step": 3294 + }, + { + "epoch": 0.3035887041046667, + "grad_norm": 0.9315199674334298, + "learning_rate": 4.1713163237653055e-06, + "loss": 0.1582, + "step": 3295 + }, + { + "epoch": 0.3036808402819367, + "grad_norm": 0.9989797250234418, + "learning_rate": 4.170749717162148e-06, + "loss": 0.1632, + "step": 3296 + }, + { + "epoch": 0.3037729764592067, + "grad_norm": 0.875493273649281, + "learning_rate": 4.170182955429699e-06, + "loss": 0.1616, + "step": 3297 + }, + { + "epoch": 0.3038651126364767, + "grad_norm": 0.9281100714565997, + "learning_rate": 4.169616038620583e-06, + "loss": 0.1456, + "step": 3298 + }, + { + "epoch": 0.3039572488137467, + "grad_norm": 0.8493906414552408, + "learning_rate": 4.169048966787438e-06, + "loss": 0.1478, + "step": 3299 + }, + { + "epoch": 0.30404938499101675, + "grad_norm": 0.8952927753283536, + "learning_rate": 4.168481739982917e-06, + "loss": 0.1636, + "step": 3300 + }, + { + "epoch": 0.30414152116828674, + "grad_norm": 0.8655868115020314, + "learning_rate": 4.167914358259687e-06, + "loss": 0.1548, + "step": 3301 + }, + { + "epoch": 0.30423365734555674, + "grad_norm": 0.9919065938123784, + "learning_rate": 4.167346821670429e-06, + "loss": 0.1751, + "step": 3302 + }, + { + "epoch": 0.30432579352282674, + "grad_norm": 0.9493609688016562, + "learning_rate": 4.166779130267839e-06, + "loss": 0.1665, + "step": 3303 + }, + { + "epoch": 0.30441792970009673, + "grad_norm": 0.8983868029734609, + "learning_rate": 4.166211284104629e-06, + "loss": 0.1572, + "step": 3304 + }, + { + "epoch": 0.30451006587736673, + "grad_norm": 0.9007939752199932, + "learning_rate": 4.16564328323352e-06, + "loss": 0.1698, + "step": 3305 + }, + { + "epoch": 0.30460220205463673, + "grad_norm": 0.8913045218728333, + "learning_rate": 4.165075127707254e-06, + "loss": 0.1603, + "step": 3306 + }, + { + "epoch": 0.3046943382319068, + "grad_norm": 0.8821994473863105, + "learning_rate": 4.164506817578582e-06, + "loss": 0.1536, + "step": 3307 + }, + { + "epoch": 0.3047864744091768, + "grad_norm": 0.8662864009408308, + "learning_rate": 4.163938352900274e-06, + "loss": 0.1631, + "step": 3308 + }, + { + "epoch": 0.3048786105864468, + "grad_norm": 0.9322076782697044, + "learning_rate": 4.16336973372511e-06, + "loss": 0.1707, + "step": 3309 + }, + { + "epoch": 0.30497074676371677, + "grad_norm": 0.8791872015663708, + "learning_rate": 4.162800960105889e-06, + "loss": 0.1453, + "step": 3310 + }, + { + "epoch": 0.30506288294098677, + "grad_norm": 0.9335680774636453, + "learning_rate": 4.162232032095418e-06, + "loss": 0.1639, + "step": 3311 + }, + { + "epoch": 0.30515501911825677, + "grad_norm": 0.938972469326503, + "learning_rate": 4.1616629497465245e-06, + "loss": 0.1542, + "step": 3312 + }, + { + "epoch": 0.30524715529552676, + "grad_norm": 0.9437698259785928, + "learning_rate": 4.1610937131120474e-06, + "loss": 0.1788, + "step": 3313 + }, + { + "epoch": 0.3053392914727968, + "grad_norm": 0.8838873717685708, + "learning_rate": 4.16052432224484e-06, + "loss": 0.1577, + "step": 3314 + }, + { + "epoch": 0.3054314276500668, + "grad_norm": 0.9391124996158003, + "learning_rate": 4.159954777197771e-06, + "loss": 0.1574, + "step": 3315 + }, + { + "epoch": 0.3055235638273368, + "grad_norm": 0.9675529515690404, + "learning_rate": 4.159385078023722e-06, + "loss": 0.1664, + "step": 3316 + }, + { + "epoch": 0.3056157000046068, + "grad_norm": 0.9691361853056687, + "learning_rate": 4.15881522477559e-06, + "loss": 0.1581, + "step": 3317 + }, + { + "epoch": 0.3057078361818768, + "grad_norm": 0.9497540299432615, + "learning_rate": 4.1582452175062854e-06, + "loss": 0.1766, + "step": 3318 + }, + { + "epoch": 0.3057999723591468, + "grad_norm": 0.9331780873537096, + "learning_rate": 4.157675056268735e-06, + "loss": 0.1581, + "step": 3319 + }, + { + "epoch": 0.30589210853641685, + "grad_norm": 0.9282528349029902, + "learning_rate": 4.157104741115876e-06, + "loss": 0.1542, + "step": 3320 + }, + { + "epoch": 0.30598424471368685, + "grad_norm": 0.936802223907317, + "learning_rate": 4.156534272100664e-06, + "loss": 0.1827, + "step": 3321 + }, + { + "epoch": 0.30607638089095685, + "grad_norm": 0.9002015504623913, + "learning_rate": 4.155963649276066e-06, + "loss": 0.1593, + "step": 3322 + }, + { + "epoch": 0.30616851706822684, + "grad_norm": 0.8840297918677589, + "learning_rate": 4.155392872695066e-06, + "loss": 0.1578, + "step": 3323 + }, + { + "epoch": 0.30626065324549684, + "grad_norm": 0.8926966395195797, + "learning_rate": 4.154821942410659e-06, + "loss": 0.1528, + "step": 3324 + }, + { + "epoch": 0.30635278942276684, + "grad_norm": 0.8927711731854681, + "learning_rate": 4.154250858475857e-06, + "loss": 0.1653, + "step": 3325 + }, + { + "epoch": 0.30644492560003683, + "grad_norm": 0.9700319830098186, + "learning_rate": 4.1536796209436835e-06, + "loss": 0.1659, + "step": 3326 + }, + { + "epoch": 0.3065370617773069, + "grad_norm": 0.9483174494393913, + "learning_rate": 4.153108229867181e-06, + "loss": 0.1665, + "step": 3327 + }, + { + "epoch": 0.3066291979545769, + "grad_norm": 0.8748630059230683, + "learning_rate": 4.1525366852994e-06, + "loss": 0.1554, + "step": 3328 + }, + { + "epoch": 0.3067213341318469, + "grad_norm": 0.8582455329358799, + "learning_rate": 4.151964987293411e-06, + "loss": 0.143, + "step": 3329 + }, + { + "epoch": 0.3068134703091169, + "grad_norm": 0.8715858478170836, + "learning_rate": 4.151393135902294e-06, + "loss": 0.1583, + "step": 3330 + }, + { + "epoch": 0.3069056064863869, + "grad_norm": 0.9516812753474873, + "learning_rate": 4.150821131179148e-06, + "loss": 0.1645, + "step": 3331 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 0.9084710244319857, + "learning_rate": 4.150248973177081e-06, + "loss": 0.1507, + "step": 3332 + }, + { + "epoch": 0.30708987884092687, + "grad_norm": 0.9165140829229909, + "learning_rate": 4.14967666194922e-06, + "loss": 0.1578, + "step": 3333 + }, + { + "epoch": 0.3071820150181969, + "grad_norm": 0.8704866370447997, + "learning_rate": 4.149104197548703e-06, + "loss": 0.1517, + "step": 3334 + }, + { + "epoch": 0.3072741511954669, + "grad_norm": 0.8936771673676237, + "learning_rate": 4.148531580028685e-06, + "loss": 0.1527, + "step": 3335 + }, + { + "epoch": 0.3073662873727369, + "grad_norm": 0.9689189902531499, + "learning_rate": 4.147958809442331e-06, + "loss": 0.1379, + "step": 3336 + }, + { + "epoch": 0.3074584235500069, + "grad_norm": 0.9222655925574594, + "learning_rate": 4.147385885842824e-06, + "loss": 0.1536, + "step": 3337 + }, + { + "epoch": 0.3075505597272769, + "grad_norm": 0.9481217418968965, + "learning_rate": 4.146812809283361e-06, + "loss": 0.1663, + "step": 3338 + }, + { + "epoch": 0.3076426959045469, + "grad_norm": 0.9071299429191669, + "learning_rate": 4.14623957981715e-06, + "loss": 0.1547, + "step": 3339 + }, + { + "epoch": 0.3077348320818169, + "grad_norm": 0.9272233789397594, + "learning_rate": 4.1456661974974185e-06, + "loss": 0.1385, + "step": 3340 + }, + { + "epoch": 0.30782696825908695, + "grad_norm": 0.9741525591963978, + "learning_rate": 4.145092662377403e-06, + "loss": 0.1641, + "step": 3341 + }, + { + "epoch": 0.30791910443635695, + "grad_norm": 0.9253540203356433, + "learning_rate": 4.144518974510358e-06, + "loss": 0.17, + "step": 3342 + }, + { + "epoch": 0.30801124061362695, + "grad_norm": 0.8975724542139679, + "learning_rate": 4.143945133949547e-06, + "loss": 0.1479, + "step": 3343 + }, + { + "epoch": 0.30810337679089694, + "grad_norm": 0.9632358843689789, + "learning_rate": 4.1433711407482544e-06, + "loss": 0.1731, + "step": 3344 + }, + { + "epoch": 0.30819551296816694, + "grad_norm": 1.0076660275039135, + "learning_rate": 4.142796994959775e-06, + "loss": 0.1857, + "step": 3345 + }, + { + "epoch": 0.30828764914543694, + "grad_norm": 0.9007248222676459, + "learning_rate": 4.142222696637417e-06, + "loss": 0.1653, + "step": 3346 + }, + { + "epoch": 0.30837978532270693, + "grad_norm": 0.8293657995382075, + "learning_rate": 4.141648245834505e-06, + "loss": 0.1557, + "step": 3347 + }, + { + "epoch": 0.308471921499977, + "grad_norm": 0.9018556933312604, + "learning_rate": 4.141073642604377e-06, + "loss": 0.1507, + "step": 3348 + }, + { + "epoch": 0.308564057677247, + "grad_norm": 1.0206437869118763, + "learning_rate": 4.140498887000385e-06, + "loss": 0.1612, + "step": 3349 + }, + { + "epoch": 0.308656193854517, + "grad_norm": 0.8628938425077515, + "learning_rate": 4.139923979075894e-06, + "loss": 0.1537, + "step": 3350 + }, + { + "epoch": 0.308748330031787, + "grad_norm": 0.9050313968713789, + "learning_rate": 4.139348918884285e-06, + "loss": 0.1655, + "step": 3351 + }, + { + "epoch": 0.308840466209057, + "grad_norm": 0.9103712066570067, + "learning_rate": 4.138773706478953e-06, + "loss": 0.151, + "step": 3352 + }, + { + "epoch": 0.30893260238632697, + "grad_norm": 0.92340284255135, + "learning_rate": 4.138198341913305e-06, + "loss": 0.1493, + "step": 3353 + }, + { + "epoch": 0.309024738563597, + "grad_norm": 0.8688924319396153, + "learning_rate": 4.137622825240767e-06, + "loss": 0.1574, + "step": 3354 + }, + { + "epoch": 0.309116874740867, + "grad_norm": 0.8993317455096554, + "learning_rate": 4.1370471565147715e-06, + "loss": 0.1575, + "step": 3355 + }, + { + "epoch": 0.309209010918137, + "grad_norm": 0.928602372486308, + "learning_rate": 4.1364713357887715e-06, + "loss": 0.1543, + "step": 3356 + }, + { + "epoch": 0.309301147095407, + "grad_norm": 1.061155027790231, + "learning_rate": 4.1358953631162314e-06, + "loss": 0.1598, + "step": 3357 + }, + { + "epoch": 0.309393283272677, + "grad_norm": 0.9084105597716158, + "learning_rate": 4.135319238550632e-06, + "loss": 0.1559, + "step": 3358 + }, + { + "epoch": 0.309485419449947, + "grad_norm": 0.9097245918542545, + "learning_rate": 4.1347429621454645e-06, + "loss": 0.1491, + "step": 3359 + }, + { + "epoch": 0.309577555627217, + "grad_norm": 1.0176830209153604, + "learning_rate": 4.134166533954238e-06, + "loss": 0.1677, + "step": 3360 + }, + { + "epoch": 0.30966969180448706, + "grad_norm": 0.9599751677181546, + "learning_rate": 4.1335899540304715e-06, + "loss": 0.1648, + "step": 3361 + }, + { + "epoch": 0.30976182798175705, + "grad_norm": 0.9575989184802846, + "learning_rate": 4.133013222427703e-06, + "loss": 0.1663, + "step": 3362 + }, + { + "epoch": 0.30985396415902705, + "grad_norm": 0.9533677266914239, + "learning_rate": 4.132436339199481e-06, + "loss": 0.1527, + "step": 3363 + }, + { + "epoch": 0.30994610033629705, + "grad_norm": 0.943371411328731, + "learning_rate": 4.131859304399368e-06, + "loss": 0.1645, + "step": 3364 + }, + { + "epoch": 0.31003823651356704, + "grad_norm": 0.9108660115026544, + "learning_rate": 4.1312821180809445e-06, + "loss": 0.1672, + "step": 3365 + }, + { + "epoch": 0.31013037269083704, + "grad_norm": 0.930016137545348, + "learning_rate": 4.130704780297801e-06, + "loss": 0.1498, + "step": 3366 + }, + { + "epoch": 0.31022250886810704, + "grad_norm": 0.9522554481405724, + "learning_rate": 4.130127291103542e-06, + "loss": 0.1644, + "step": 3367 + }, + { + "epoch": 0.3103146450453771, + "grad_norm": 0.8842450520060663, + "learning_rate": 4.129549650551788e-06, + "loss": 0.1453, + "step": 3368 + }, + { + "epoch": 0.3104067812226471, + "grad_norm": 0.9676290587085921, + "learning_rate": 4.1289718586961755e-06, + "loss": 0.1627, + "step": 3369 + }, + { + "epoch": 0.3104989173999171, + "grad_norm": 0.8760194541205237, + "learning_rate": 4.12839391559035e-06, + "loss": 0.1573, + "step": 3370 + }, + { + "epoch": 0.3105910535771871, + "grad_norm": 1.0120308223548502, + "learning_rate": 4.127815821287973e-06, + "loss": 0.1691, + "step": 3371 + }, + { + "epoch": 0.3106831897544571, + "grad_norm": 1.0230226420993533, + "learning_rate": 4.127237575842723e-06, + "loss": 0.1727, + "step": 3372 + }, + { + "epoch": 0.3107753259317271, + "grad_norm": 0.9527995171561136, + "learning_rate": 4.126659179308289e-06, + "loss": 0.167, + "step": 3373 + }, + { + "epoch": 0.3108674621089971, + "grad_norm": 0.8802401223513894, + "learning_rate": 4.126080631738374e-06, + "loss": 0.1577, + "step": 3374 + }, + { + "epoch": 0.3109595982862671, + "grad_norm": 1.0054768017771973, + "learning_rate": 4.125501933186699e-06, + "loss": 0.152, + "step": 3375 + }, + { + "epoch": 0.3110517344635371, + "grad_norm": 0.9339483620066793, + "learning_rate": 4.124923083706993e-06, + "loss": 0.169, + "step": 3376 + }, + { + "epoch": 0.3111438706408071, + "grad_norm": 0.891809746192897, + "learning_rate": 4.124344083353005e-06, + "loss": 0.1604, + "step": 3377 + }, + { + "epoch": 0.3112360068180771, + "grad_norm": 0.9405125779039759, + "learning_rate": 4.123764932178492e-06, + "loss": 0.1537, + "step": 3378 + }, + { + "epoch": 0.3113281429953471, + "grad_norm": 0.9114646595770788, + "learning_rate": 4.123185630237233e-06, + "loss": 0.1631, + "step": 3379 + }, + { + "epoch": 0.3114202791726171, + "grad_norm": 0.9156889815669623, + "learning_rate": 4.122606177583012e-06, + "loss": 0.1571, + "step": 3380 + }, + { + "epoch": 0.3115124153498871, + "grad_norm": 0.9232508366180471, + "learning_rate": 4.122026574269633e-06, + "loss": 0.1623, + "step": 3381 + }, + { + "epoch": 0.31160455152715716, + "grad_norm": 0.9173313455540943, + "learning_rate": 4.121446820350911e-06, + "loss": 0.153, + "step": 3382 + }, + { + "epoch": 0.31169668770442716, + "grad_norm": 0.9332944353788419, + "learning_rate": 4.12086691588068e-06, + "loss": 0.1565, + "step": 3383 + }, + { + "epoch": 0.31178882388169715, + "grad_norm": 0.867962487245611, + "learning_rate": 4.120286860912779e-06, + "loss": 0.1477, + "step": 3384 + }, + { + "epoch": 0.31188096005896715, + "grad_norm": 0.9925637374051337, + "learning_rate": 4.11970665550107e-06, + "loss": 0.1599, + "step": 3385 + }, + { + "epoch": 0.31197309623623715, + "grad_norm": 0.9567564373610871, + "learning_rate": 4.119126299699422e-06, + "loss": 0.1695, + "step": 3386 + }, + { + "epoch": 0.31206523241350714, + "grad_norm": 0.8915249939767821, + "learning_rate": 4.118545793561724e-06, + "loss": 0.1473, + "step": 3387 + }, + { + "epoch": 0.3121573685907772, + "grad_norm": 0.9671789476742289, + "learning_rate": 4.117965137141875e-06, + "loss": 0.1586, + "step": 3388 + }, + { + "epoch": 0.3122495047680472, + "grad_norm": 0.9078171347172409, + "learning_rate": 4.117384330493789e-06, + "loss": 0.143, + "step": 3389 + }, + { + "epoch": 0.3123416409453172, + "grad_norm": 0.9564013420176967, + "learning_rate": 4.1168033736713934e-06, + "loss": 0.1657, + "step": 3390 + }, + { + "epoch": 0.3124337771225872, + "grad_norm": 0.9466706325040476, + "learning_rate": 4.116222266728631e-06, + "loss": 0.1646, + "step": 3391 + }, + { + "epoch": 0.3125259132998572, + "grad_norm": 0.9132831841251625, + "learning_rate": 4.115641009719456e-06, + "loss": 0.1468, + "step": 3392 + }, + { + "epoch": 0.3126180494771272, + "grad_norm": 0.8936981888983082, + "learning_rate": 4.11505960269784e-06, + "loss": 0.1555, + "step": 3393 + }, + { + "epoch": 0.3127101856543972, + "grad_norm": 0.9496755272541821, + "learning_rate": 4.114478045717767e-06, + "loss": 0.1644, + "step": 3394 + }, + { + "epoch": 0.31280232183166723, + "grad_norm": 0.9503775314773306, + "learning_rate": 4.113896338833233e-06, + "loss": 0.1553, + "step": 3395 + }, + { + "epoch": 0.3128944580089372, + "grad_norm": 1.0006972046347853, + "learning_rate": 4.11331448209825e-06, + "loss": 0.1795, + "step": 3396 + }, + { + "epoch": 0.3129865941862072, + "grad_norm": 0.8495640118383059, + "learning_rate": 4.112732475566844e-06, + "loss": 0.1525, + "step": 3397 + }, + { + "epoch": 0.3130787303634772, + "grad_norm": 0.9164288932535483, + "learning_rate": 4.112150319293055e-06, + "loss": 0.1616, + "step": 3398 + }, + { + "epoch": 0.3131708665407472, + "grad_norm": 0.9069820705502305, + "learning_rate": 4.111568013330933e-06, + "loss": 0.1549, + "step": 3399 + }, + { + "epoch": 0.3132630027180172, + "grad_norm": 0.8560780037635382, + "learning_rate": 4.110985557734549e-06, + "loss": 0.1411, + "step": 3400 + }, + { + "epoch": 0.3133551388952872, + "grad_norm": 0.9304730799382744, + "learning_rate": 4.110402952557982e-06, + "loss": 0.1589, + "step": 3401 + }, + { + "epoch": 0.31344727507255726, + "grad_norm": 0.9225084800079449, + "learning_rate": 4.109820197855329e-06, + "loss": 0.1565, + "step": 3402 + }, + { + "epoch": 0.31353941124982726, + "grad_norm": 0.9449480858494125, + "learning_rate": 4.109237293680697e-06, + "loss": 0.1578, + "step": 3403 + }, + { + "epoch": 0.31363154742709726, + "grad_norm": 0.7922549635079016, + "learning_rate": 4.108654240088208e-06, + "loss": 0.1331, + "step": 3404 + }, + { + "epoch": 0.31372368360436725, + "grad_norm": 0.9651881798215124, + "learning_rate": 4.1080710371319995e-06, + "loss": 0.1516, + "step": 3405 + }, + { + "epoch": 0.31381581978163725, + "grad_norm": 0.9823047227787114, + "learning_rate": 4.107487684866224e-06, + "loss": 0.1681, + "step": 3406 + }, + { + "epoch": 0.31390795595890725, + "grad_norm": 0.8984657664496137, + "learning_rate": 4.106904183345042e-06, + "loss": 0.1454, + "step": 3407 + }, + { + "epoch": 0.31400009213617724, + "grad_norm": 1.0948214742805147, + "learning_rate": 4.106320532622635e-06, + "loss": 0.1789, + "step": 3408 + }, + { + "epoch": 0.3140922283134473, + "grad_norm": 1.0380192093032585, + "learning_rate": 4.105736732753193e-06, + "loss": 0.176, + "step": 3409 + }, + { + "epoch": 0.3141843644907173, + "grad_norm": 0.8916083716066878, + "learning_rate": 4.1051527837909225e-06, + "loss": 0.1372, + "step": 3410 + }, + { + "epoch": 0.3142765006679873, + "grad_norm": 0.9012931626518442, + "learning_rate": 4.104568685790043e-06, + "loss": 0.1487, + "step": 3411 + }, + { + "epoch": 0.3143686368452573, + "grad_norm": 0.9137654798095866, + "learning_rate": 4.103984438804789e-06, + "loss": 0.1538, + "step": 3412 + }, + { + "epoch": 0.3144607730225273, + "grad_norm": 0.9481939846524343, + "learning_rate": 4.103400042889407e-06, + "loss": 0.1637, + "step": 3413 + }, + { + "epoch": 0.3145529091997973, + "grad_norm": 0.9071124252098365, + "learning_rate": 4.102815498098159e-06, + "loss": 0.1578, + "step": 3414 + }, + { + "epoch": 0.3146450453770673, + "grad_norm": 0.8270852439467992, + "learning_rate": 4.102230804485318e-06, + "loss": 0.1495, + "step": 3415 + }, + { + "epoch": 0.31473718155433733, + "grad_norm": 0.9502681634882719, + "learning_rate": 4.101645962105176e-06, + "loss": 0.163, + "step": 3416 + }, + { + "epoch": 0.3148293177316073, + "grad_norm": 0.951552464274314, + "learning_rate": 4.101060971012033e-06, + "loss": 0.1591, + "step": 3417 + }, + { + "epoch": 0.3149214539088773, + "grad_norm": 0.8949602924889337, + "learning_rate": 4.100475831260208e-06, + "loss": 0.1444, + "step": 3418 + }, + { + "epoch": 0.3150135900861473, + "grad_norm": 0.9491012105424428, + "learning_rate": 4.099890542904028e-06, + "loss": 0.1576, + "step": 3419 + }, + { + "epoch": 0.3151057262634173, + "grad_norm": 0.9076824336858172, + "learning_rate": 4.0993051059978405e-06, + "loss": 0.1549, + "step": 3420 + }, + { + "epoch": 0.3151978624406873, + "grad_norm": 0.9139002533602671, + "learning_rate": 4.098719520596e-06, + "loss": 0.1417, + "step": 3421 + }, + { + "epoch": 0.31528999861795737, + "grad_norm": 0.9239054987071181, + "learning_rate": 4.098133786752881e-06, + "loss": 0.1456, + "step": 3422 + }, + { + "epoch": 0.31538213479522736, + "grad_norm": 0.9467114150066259, + "learning_rate": 4.097547904522869e-06, + "loss": 0.1505, + "step": 3423 + }, + { + "epoch": 0.31547427097249736, + "grad_norm": 0.9140901271401887, + "learning_rate": 4.09696187396036e-06, + "loss": 0.1493, + "step": 3424 + }, + { + "epoch": 0.31556640714976736, + "grad_norm": 0.9424231683243252, + "learning_rate": 4.0963756951197695e-06, + "loss": 0.1606, + "step": 3425 + }, + { + "epoch": 0.31565854332703736, + "grad_norm": 0.9403766923503943, + "learning_rate": 4.095789368055525e-06, + "loss": 0.1664, + "step": 3426 + }, + { + "epoch": 0.31575067950430735, + "grad_norm": 0.9218638685999803, + "learning_rate": 4.095202892822066e-06, + "loss": 0.1499, + "step": 3427 + }, + { + "epoch": 0.31584281568157735, + "grad_norm": 0.9485215914433966, + "learning_rate": 4.094616269473846e-06, + "loss": 0.1698, + "step": 3428 + }, + { + "epoch": 0.3159349518588474, + "grad_norm": 0.9196371332910769, + "learning_rate": 4.0940294980653335e-06, + "loss": 0.1596, + "step": 3429 + }, + { + "epoch": 0.3160270880361174, + "grad_norm": 0.886507100543908, + "learning_rate": 4.093442578651011e-06, + "loss": 0.151, + "step": 3430 + }, + { + "epoch": 0.3161192242133874, + "grad_norm": 0.8484028923939285, + "learning_rate": 4.092855511285373e-06, + "loss": 0.1416, + "step": 3431 + }, + { + "epoch": 0.3162113603906574, + "grad_norm": 0.9073737296535767, + "learning_rate": 4.09226829602293e-06, + "loss": 0.1493, + "step": 3432 + }, + { + "epoch": 0.3163034965679274, + "grad_norm": 0.967529401555553, + "learning_rate": 4.091680932918205e-06, + "loss": 0.1633, + "step": 3433 + }, + { + "epoch": 0.3163956327451974, + "grad_norm": 0.9725392063430839, + "learning_rate": 4.091093422025733e-06, + "loss": 0.1687, + "step": 3434 + }, + { + "epoch": 0.3164877689224674, + "grad_norm": 0.9128546072500832, + "learning_rate": 4.090505763400065e-06, + "loss": 0.1624, + "step": 3435 + }, + { + "epoch": 0.31657990509973744, + "grad_norm": 0.8497060725564581, + "learning_rate": 4.089917957095767e-06, + "loss": 0.1473, + "step": 3436 + }, + { + "epoch": 0.31667204127700743, + "grad_norm": 0.936515788653949, + "learning_rate": 4.089330003167416e-06, + "loss": 0.1532, + "step": 3437 + }, + { + "epoch": 0.31676417745427743, + "grad_norm": 0.9506803605861734, + "learning_rate": 4.088741901669601e-06, + "loss": 0.1418, + "step": 3438 + }, + { + "epoch": 0.3168563136315474, + "grad_norm": 0.9006173689048483, + "learning_rate": 4.088153652656932e-06, + "loss": 0.1606, + "step": 3439 + }, + { + "epoch": 0.3169484498088174, + "grad_norm": 0.9289187640854032, + "learning_rate": 4.087565256184024e-06, + "loss": 0.1566, + "step": 3440 + }, + { + "epoch": 0.3170405859860874, + "grad_norm": 1.0591258145699214, + "learning_rate": 4.086976712305511e-06, + "loss": 0.1799, + "step": 3441 + }, + { + "epoch": 0.3171327221633574, + "grad_norm": 0.9400847693025363, + "learning_rate": 4.08638802107604e-06, + "loss": 0.1616, + "step": 3442 + }, + { + "epoch": 0.31722485834062747, + "grad_norm": 0.9298557372660742, + "learning_rate": 4.0857991825502696e-06, + "loss": 0.1676, + "step": 3443 + }, + { + "epoch": 0.31731699451789747, + "grad_norm": 0.9800661183069009, + "learning_rate": 4.085210196782875e-06, + "loss": 0.1604, + "step": 3444 + }, + { + "epoch": 0.31740913069516746, + "grad_norm": 1.0754863588778008, + "learning_rate": 4.084621063828544e-06, + "loss": 0.1738, + "step": 3445 + }, + { + "epoch": 0.31750126687243746, + "grad_norm": 0.9468907496454307, + "learning_rate": 4.0840317837419754e-06, + "loss": 0.1716, + "step": 3446 + }, + { + "epoch": 0.31759340304970746, + "grad_norm": 0.9500032382526354, + "learning_rate": 4.083442356577886e-06, + "loss": 0.1568, + "step": 3447 + }, + { + "epoch": 0.31768553922697745, + "grad_norm": 1.03361821973418, + "learning_rate": 4.082852782391003e-06, + "loss": 0.1646, + "step": 3448 + }, + { + "epoch": 0.31777767540424745, + "grad_norm": 0.9627491594944207, + "learning_rate": 4.0822630612360685e-06, + "loss": 0.159, + "step": 3449 + }, + { + "epoch": 0.3178698115815175, + "grad_norm": 0.8662665250817492, + "learning_rate": 4.081673193167839e-06, + "loss": 0.1522, + "step": 3450 + }, + { + "epoch": 0.3179619477587875, + "grad_norm": 0.8937920972998911, + "learning_rate": 4.081083178241083e-06, + "loss": 0.1493, + "step": 3451 + }, + { + "epoch": 0.3180540839360575, + "grad_norm": 0.9479929284339497, + "learning_rate": 4.080493016510583e-06, + "loss": 0.1639, + "step": 3452 + }, + { + "epoch": 0.3181462201133275, + "grad_norm": 0.9187532135497326, + "learning_rate": 4.079902708031137e-06, + "loss": 0.1601, + "step": 3453 + }, + { + "epoch": 0.3182383562905975, + "grad_norm": 0.9144021033769467, + "learning_rate": 4.079312252857556e-06, + "loss": 0.1541, + "step": 3454 + }, + { + "epoch": 0.3183304924678675, + "grad_norm": 0.9191213571527707, + "learning_rate": 4.07872165104466e-06, + "loss": 0.1623, + "step": 3455 + }, + { + "epoch": 0.31842262864513754, + "grad_norm": 0.9511674669274025, + "learning_rate": 4.07813090264729e-06, + "loss": 0.1597, + "step": 3456 + }, + { + "epoch": 0.31851476482240754, + "grad_norm": 0.9798337339733897, + "learning_rate": 4.077540007720295e-06, + "loss": 0.1722, + "step": 3457 + }, + { + "epoch": 0.31860690099967753, + "grad_norm": 0.8879213314696275, + "learning_rate": 4.076948966318542e-06, + "loss": 0.1491, + "step": 3458 + }, + { + "epoch": 0.31869903717694753, + "grad_norm": 0.8721348850025189, + "learning_rate": 4.076357778496906e-06, + "loss": 0.1519, + "step": 3459 + }, + { + "epoch": 0.3187911733542175, + "grad_norm": 0.9450063837612857, + "learning_rate": 4.075766444310282e-06, + "loss": 0.1484, + "step": 3460 + }, + { + "epoch": 0.3188833095314875, + "grad_norm": 0.9015677020824875, + "learning_rate": 4.075174963813574e-06, + "loss": 0.1551, + "step": 3461 + }, + { + "epoch": 0.3189754457087575, + "grad_norm": 0.9398397674223026, + "learning_rate": 4.0745833370617e-06, + "loss": 0.1598, + "step": 3462 + }, + { + "epoch": 0.3190675818860276, + "grad_norm": 0.9545054887697545, + "learning_rate": 4.073991564109595e-06, + "loss": 0.1535, + "step": 3463 + }, + { + "epoch": 0.31915971806329757, + "grad_norm": 0.935516966266871, + "learning_rate": 4.073399645012203e-06, + "loss": 0.1534, + "step": 3464 + }, + { + "epoch": 0.31925185424056757, + "grad_norm": 0.9004407095982404, + "learning_rate": 4.072807579824485e-06, + "loss": 0.1729, + "step": 3465 + }, + { + "epoch": 0.31934399041783756, + "grad_norm": 0.9059623238664691, + "learning_rate": 4.072215368601414e-06, + "loss": 0.1619, + "step": 3466 + }, + { + "epoch": 0.31943612659510756, + "grad_norm": 0.8742229100315807, + "learning_rate": 4.0716230113979766e-06, + "loss": 0.1502, + "step": 3467 + }, + { + "epoch": 0.31952826277237756, + "grad_norm": 1.0188211123923578, + "learning_rate": 4.071030508269173e-06, + "loss": 0.1699, + "step": 3468 + }, + { + "epoch": 0.31962039894964756, + "grad_norm": 0.8903732181525056, + "learning_rate": 4.070437859270019e-06, + "loss": 0.1637, + "step": 3469 + }, + { + "epoch": 0.3197125351269176, + "grad_norm": 0.9856901177717559, + "learning_rate": 4.06984506445554e-06, + "loss": 0.1581, + "step": 3470 + }, + { + "epoch": 0.3198046713041876, + "grad_norm": 0.9279233569958244, + "learning_rate": 4.069252123880777e-06, + "loss": 0.1619, + "step": 3471 + }, + { + "epoch": 0.3198968074814576, + "grad_norm": 0.9172143149312972, + "learning_rate": 4.068659037600786e-06, + "loss": 0.1542, + "step": 3472 + }, + { + "epoch": 0.3199889436587276, + "grad_norm": 0.8131584393868659, + "learning_rate": 4.068065805670635e-06, + "loss": 0.1417, + "step": 3473 + }, + { + "epoch": 0.3200810798359976, + "grad_norm": 0.9034854285537024, + "learning_rate": 4.067472428145405e-06, + "loss": 0.1494, + "step": 3474 + }, + { + "epoch": 0.3201732160132676, + "grad_norm": 0.8541692198778182, + "learning_rate": 4.066878905080191e-06, + "loss": 0.1396, + "step": 3475 + }, + { + "epoch": 0.3202653521905376, + "grad_norm": 0.9365299991439768, + "learning_rate": 4.066285236530103e-06, + "loss": 0.1597, + "step": 3476 + }, + { + "epoch": 0.32035748836780764, + "grad_norm": 0.9592597644111814, + "learning_rate": 4.065691422550261e-06, + "loss": 0.1512, + "step": 3477 + }, + { + "epoch": 0.32044962454507764, + "grad_norm": 0.8863394888331791, + "learning_rate": 4.065097463195803e-06, + "loss": 0.1592, + "step": 3478 + }, + { + "epoch": 0.32054176072234764, + "grad_norm": 0.8543865524815278, + "learning_rate": 4.064503358521876e-06, + "loss": 0.1459, + "step": 3479 + }, + { + "epoch": 0.32063389689961763, + "grad_norm": 0.8962676770429008, + "learning_rate": 4.063909108583644e-06, + "loss": 0.1553, + "step": 3480 + }, + { + "epoch": 0.32072603307688763, + "grad_norm": 0.871977731951131, + "learning_rate": 4.063314713436283e-06, + "loss": 0.1656, + "step": 3481 + }, + { + "epoch": 0.3208181692541576, + "grad_norm": 1.001308123062294, + "learning_rate": 4.062720173134983e-06, + "loss": 0.1709, + "step": 3482 + }, + { + "epoch": 0.3209103054314276, + "grad_norm": 0.8730340921167583, + "learning_rate": 4.062125487734947e-06, + "loss": 0.151, + "step": 3483 + }, + { + "epoch": 0.3210024416086977, + "grad_norm": 0.9399620155764555, + "learning_rate": 4.06153065729139e-06, + "loss": 0.1477, + "step": 3484 + }, + { + "epoch": 0.32109457778596767, + "grad_norm": 0.9154550750631788, + "learning_rate": 4.060935681859545e-06, + "loss": 0.1509, + "step": 3485 + }, + { + "epoch": 0.32118671396323767, + "grad_norm": 0.9360146752265455, + "learning_rate": 4.060340561494654e-06, + "loss": 0.1606, + "step": 3486 + }, + { + "epoch": 0.32127885014050767, + "grad_norm": 0.9870281583727557, + "learning_rate": 4.059745296251972e-06, + "loss": 0.1612, + "step": 3487 + }, + { + "epoch": 0.32137098631777766, + "grad_norm": 0.9446020683476672, + "learning_rate": 4.059149886186773e-06, + "loss": 0.1586, + "step": 3488 + }, + { + "epoch": 0.32146312249504766, + "grad_norm": 0.9370826914836032, + "learning_rate": 4.058554331354339e-06, + "loss": 0.1608, + "step": 3489 + }, + { + "epoch": 0.3215552586723177, + "grad_norm": 0.9222591973137391, + "learning_rate": 4.057958631809967e-06, + "loss": 0.1549, + "step": 3490 + }, + { + "epoch": 0.3216473948495877, + "grad_norm": 0.915092079185541, + "learning_rate": 4.057362787608969e-06, + "loss": 0.1546, + "step": 3491 + }, + { + "epoch": 0.3217395310268577, + "grad_norm": 0.9413157010417782, + "learning_rate": 4.056766798806668e-06, + "loss": 0.1484, + "step": 3492 + }, + { + "epoch": 0.3218316672041277, + "grad_norm": 0.9019348756364096, + "learning_rate": 4.056170665458403e-06, + "loss": 0.1511, + "step": 3493 + }, + { + "epoch": 0.3219238033813977, + "grad_norm": 0.9600914185028901, + "learning_rate": 4.055574387619524e-06, + "loss": 0.1733, + "step": 3494 + }, + { + "epoch": 0.3220159395586677, + "grad_norm": 0.911767587313382, + "learning_rate": 4.054977965345396e-06, + "loss": 0.1658, + "step": 3495 + }, + { + "epoch": 0.3221080757359377, + "grad_norm": 0.9238148568525669, + "learning_rate": 4.054381398691396e-06, + "loss": 0.1651, + "step": 3496 + }, + { + "epoch": 0.32220021191320775, + "grad_norm": 0.8606339124055112, + "learning_rate": 4.053784687712916e-06, + "loss": 0.1521, + "step": 3497 + }, + { + "epoch": 0.32229234809047774, + "grad_norm": 0.8899508622289252, + "learning_rate": 4.05318783246536e-06, + "loss": 0.1452, + "step": 3498 + }, + { + "epoch": 0.32238448426774774, + "grad_norm": 0.8955622026127069, + "learning_rate": 4.052590833004147e-06, + "loss": 0.1554, + "step": 3499 + }, + { + "epoch": 0.32247662044501774, + "grad_norm": 0.824248867010417, + "learning_rate": 4.051993689384709e-06, + "loss": 0.1421, + "step": 3500 + }, + { + "epoch": 0.32247662044501774, + "eval_loss": 0.15785863995552063, + "eval_runtime": 300.4481, + "eval_samples_per_second": 23.355, + "eval_steps_per_second": 2.922, + "step": 3500 + }, + { + "epoch": 0.32256875662228773, + "grad_norm": 0.8684284205940074, + "learning_rate": 4.051396401662489e-06, + "loss": 0.1431, + "step": 3501 + }, + { + "epoch": 0.32266089279955773, + "grad_norm": 1.0345536582306027, + "learning_rate": 4.050798969892946e-06, + "loss": 0.1672, + "step": 3502 + }, + { + "epoch": 0.3227530289768277, + "grad_norm": 0.8593899874719152, + "learning_rate": 4.050201394131551e-06, + "loss": 0.1409, + "step": 3503 + }, + { + "epoch": 0.3228451651540978, + "grad_norm": 0.9223597678224997, + "learning_rate": 4.049603674433791e-06, + "loss": 0.168, + "step": 3504 + }, + { + "epoch": 0.3229373013313678, + "grad_norm": 0.888252713703813, + "learning_rate": 4.049005810855163e-06, + "loss": 0.1679, + "step": 3505 + }, + { + "epoch": 0.3230294375086378, + "grad_norm": 0.9718784879317928, + "learning_rate": 4.048407803451178e-06, + "loss": 0.1591, + "step": 3506 + }, + { + "epoch": 0.32312157368590777, + "grad_norm": 0.8838193993880871, + "learning_rate": 4.047809652277362e-06, + "loss": 0.1505, + "step": 3507 + }, + { + "epoch": 0.32321370986317777, + "grad_norm": 0.8915524847095565, + "learning_rate": 4.047211357389254e-06, + "loss": 0.1685, + "step": 3508 + }, + { + "epoch": 0.32330584604044776, + "grad_norm": 0.873930483845192, + "learning_rate": 4.046612918842405e-06, + "loss": 0.1543, + "step": 3509 + }, + { + "epoch": 0.32339798221771776, + "grad_norm": 0.8735696475644171, + "learning_rate": 4.0460143366923785e-06, + "loss": 0.1578, + "step": 3510 + }, + { + "epoch": 0.3234901183949878, + "grad_norm": 0.8588041851689829, + "learning_rate": 4.045415610994755e-06, + "loss": 0.157, + "step": 3511 + }, + { + "epoch": 0.3235822545722578, + "grad_norm": 0.8657789207118362, + "learning_rate": 4.044816741805127e-06, + "loss": 0.1471, + "step": 3512 + }, + { + "epoch": 0.3236743907495278, + "grad_norm": 0.859079362850816, + "learning_rate": 4.044217729179097e-06, + "loss": 0.1583, + "step": 3513 + }, + { + "epoch": 0.3237665269267978, + "grad_norm": 0.9071782136735049, + "learning_rate": 4.043618573172286e-06, + "loss": 0.1645, + "step": 3514 + }, + { + "epoch": 0.3238586631040678, + "grad_norm": 0.9391842378302793, + "learning_rate": 4.043019273840323e-06, + "loss": 0.1599, + "step": 3515 + }, + { + "epoch": 0.3239507992813378, + "grad_norm": 0.9179852158772025, + "learning_rate": 4.042419831238855e-06, + "loss": 0.1671, + "step": 3516 + }, + { + "epoch": 0.3240429354586078, + "grad_norm": 0.936687179289209, + "learning_rate": 4.041820245423539e-06, + "loss": 0.1489, + "step": 3517 + }, + { + "epoch": 0.32413507163587785, + "grad_norm": 0.8751747372246792, + "learning_rate": 4.041220516450048e-06, + "loss": 0.1429, + "step": 3518 + }, + { + "epoch": 0.32422720781314784, + "grad_norm": 0.9622948464619779, + "learning_rate": 4.040620644374066e-06, + "loss": 0.1714, + "step": 3519 + }, + { + "epoch": 0.32431934399041784, + "grad_norm": 0.8881873414992707, + "learning_rate": 4.0400206292512914e-06, + "loss": 0.1358, + "step": 3520 + }, + { + "epoch": 0.32441148016768784, + "grad_norm": 0.9365575801168411, + "learning_rate": 4.039420471137435e-06, + "loss": 0.1687, + "step": 3521 + }, + { + "epoch": 0.32450361634495783, + "grad_norm": 0.9525223912847359, + "learning_rate": 4.038820170088223e-06, + "loss": 0.162, + "step": 3522 + }, + { + "epoch": 0.32459575252222783, + "grad_norm": 0.8896341990118014, + "learning_rate": 4.0382197261593925e-06, + "loss": 0.1672, + "step": 3523 + }, + { + "epoch": 0.3246878886994979, + "grad_norm": 1.0189292647998855, + "learning_rate": 4.037619139406695e-06, + "loss": 0.1676, + "step": 3524 + }, + { + "epoch": 0.3247800248767679, + "grad_norm": 0.902643694703534, + "learning_rate": 4.037018409885894e-06, + "loss": 0.1641, + "step": 3525 + }, + { + "epoch": 0.3248721610540379, + "grad_norm": 0.867488131643773, + "learning_rate": 4.036417537652769e-06, + "loss": 0.1431, + "step": 3526 + }, + { + "epoch": 0.3249642972313079, + "grad_norm": 0.9114760774953333, + "learning_rate": 4.03581652276311e-06, + "loss": 0.1563, + "step": 3527 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 0.847393565850545, + "learning_rate": 4.035215365272722e-06, + "loss": 0.1403, + "step": 3528 + }, + { + "epoch": 0.32514856958584787, + "grad_norm": 0.9601802498635071, + "learning_rate": 4.034614065237421e-06, + "loss": 0.1717, + "step": 3529 + }, + { + "epoch": 0.32524070576311787, + "grad_norm": 0.8451113466394516, + "learning_rate": 4.034012622713041e-06, + "loss": 0.1323, + "step": 3530 + }, + { + "epoch": 0.3253328419403879, + "grad_norm": 0.9540096973545658, + "learning_rate": 4.033411037755422e-06, + "loss": 0.163, + "step": 3531 + }, + { + "epoch": 0.3254249781176579, + "grad_norm": 0.9029760011942667, + "learning_rate": 4.032809310420424e-06, + "loss": 0.1638, + "step": 3532 + }, + { + "epoch": 0.3255171142949279, + "grad_norm": 0.8653493696897545, + "learning_rate": 4.032207440763915e-06, + "loss": 0.1482, + "step": 3533 + }, + { + "epoch": 0.3256092504721979, + "grad_norm": 0.9002695038758083, + "learning_rate": 4.0316054288417825e-06, + "loss": 0.1626, + "step": 3534 + }, + { + "epoch": 0.3257013866494679, + "grad_norm": 0.9292035928119098, + "learning_rate": 4.031003274709919e-06, + "loss": 0.1565, + "step": 3535 + }, + { + "epoch": 0.3257935228267379, + "grad_norm": 0.9373344296369921, + "learning_rate": 4.0304009784242385e-06, + "loss": 0.1767, + "step": 3536 + }, + { + "epoch": 0.3258856590040079, + "grad_norm": 0.8566406083660746, + "learning_rate": 4.029798540040661e-06, + "loss": 0.1393, + "step": 3537 + }, + { + "epoch": 0.32597779518127795, + "grad_norm": 0.9507352638450343, + "learning_rate": 4.029195959615125e-06, + "loss": 0.1465, + "step": 3538 + }, + { + "epoch": 0.32606993135854795, + "grad_norm": 0.907988452302607, + "learning_rate": 4.02859323720358e-06, + "loss": 0.1587, + "step": 3539 + }, + { + "epoch": 0.32616206753581795, + "grad_norm": 0.9271425625270764, + "learning_rate": 4.027990372861989e-06, + "loss": 0.1569, + "step": 3540 + }, + { + "epoch": 0.32625420371308794, + "grad_norm": 0.9450417681972197, + "learning_rate": 4.027387366646326e-06, + "loss": 0.1594, + "step": 3541 + }, + { + "epoch": 0.32634633989035794, + "grad_norm": 0.9513022280524657, + "learning_rate": 4.026784218612581e-06, + "loss": 0.1675, + "step": 3542 + }, + { + "epoch": 0.32643847606762794, + "grad_norm": 0.9200859147183892, + "learning_rate": 4.026180928816759e-06, + "loss": 0.1609, + "step": 3543 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 1.0053375898691645, + "learning_rate": 4.0255774973148735e-06, + "loss": 0.1638, + "step": 3544 + }, + { + "epoch": 0.326622748422168, + "grad_norm": 0.9025225354859596, + "learning_rate": 4.024973924162952e-06, + "loss": 0.1462, + "step": 3545 + }, + { + "epoch": 0.326714884599438, + "grad_norm": 0.9576990817667488, + "learning_rate": 4.024370209417037e-06, + "loss": 0.1593, + "step": 3546 + }, + { + "epoch": 0.326807020776708, + "grad_norm": 0.9525658664676164, + "learning_rate": 4.0237663531331855e-06, + "loss": 0.1598, + "step": 3547 + }, + { + "epoch": 0.326899156953978, + "grad_norm": 0.8995010015032531, + "learning_rate": 4.023162355367464e-06, + "loss": 0.1483, + "step": 3548 + }, + { + "epoch": 0.326991293131248, + "grad_norm": 0.9037322000936574, + "learning_rate": 4.022558216175953e-06, + "loss": 0.1507, + "step": 3549 + }, + { + "epoch": 0.32708342930851797, + "grad_norm": 0.8937200904467155, + "learning_rate": 4.021953935614748e-06, + "loss": 0.1442, + "step": 3550 + }, + { + "epoch": 0.32717556548578797, + "grad_norm": 0.9984501569219848, + "learning_rate": 4.021349513739956e-06, + "loss": 0.1755, + "step": 3551 + }, + { + "epoch": 0.327267701663058, + "grad_norm": 0.8965440594336742, + "learning_rate": 4.020744950607699e-06, + "loss": 0.1573, + "step": 3552 + }, + { + "epoch": 0.327359837840328, + "grad_norm": 0.9156102418861106, + "learning_rate": 4.020140246274109e-06, + "loss": 0.1599, + "step": 3553 + }, + { + "epoch": 0.327451974017598, + "grad_norm": 0.9645827093836922, + "learning_rate": 4.019535400795333e-06, + "loss": 0.1746, + "step": 3554 + }, + { + "epoch": 0.327544110194868, + "grad_norm": 0.8814855920188206, + "learning_rate": 4.018930414227533e-06, + "loss": 0.1558, + "step": 3555 + }, + { + "epoch": 0.327636246372138, + "grad_norm": 0.983414207184116, + "learning_rate": 4.018325286626879e-06, + "loss": 0.1592, + "step": 3556 + }, + { + "epoch": 0.327728382549408, + "grad_norm": 0.919976809129956, + "learning_rate": 4.017720018049559e-06, + "loss": 0.1532, + "step": 3557 + }, + { + "epoch": 0.32782051872667806, + "grad_norm": 0.9291452608705151, + "learning_rate": 4.017114608551772e-06, + "loss": 0.1601, + "step": 3558 + }, + { + "epoch": 0.32791265490394805, + "grad_norm": 0.8852683619080132, + "learning_rate": 4.016509058189731e-06, + "loss": 0.1505, + "step": 3559 + }, + { + "epoch": 0.32800479108121805, + "grad_norm": 0.9186159403047259, + "learning_rate": 4.0159033670196605e-06, + "loss": 0.1629, + "step": 3560 + }, + { + "epoch": 0.32809692725848805, + "grad_norm": 0.9844743262655814, + "learning_rate": 4.0152975350978e-06, + "loss": 0.1588, + "step": 3561 + }, + { + "epoch": 0.32818906343575804, + "grad_norm": 0.8595561775845129, + "learning_rate": 4.0146915624803985e-06, + "loss": 0.1556, + "step": 3562 + }, + { + "epoch": 0.32828119961302804, + "grad_norm": 0.9446841396028918, + "learning_rate": 4.014085449223724e-06, + "loss": 0.1581, + "step": 3563 + }, + { + "epoch": 0.32837333579029804, + "grad_norm": 1.0023786637300742, + "learning_rate": 4.013479195384051e-06, + "loss": 0.1686, + "step": 3564 + }, + { + "epoch": 0.3284654719675681, + "grad_norm": 0.9068328433386368, + "learning_rate": 4.012872801017673e-06, + "loss": 0.156, + "step": 3565 + }, + { + "epoch": 0.3285576081448381, + "grad_norm": 0.9627651441762631, + "learning_rate": 4.012266266180892e-06, + "loss": 0.1477, + "step": 3566 + }, + { + "epoch": 0.3286497443221081, + "grad_norm": 0.8659019527581846, + "learning_rate": 4.011659590930026e-06, + "loss": 0.1515, + "step": 3567 + }, + { + "epoch": 0.3287418804993781, + "grad_norm": 0.8270948116690996, + "learning_rate": 4.011052775321405e-06, + "loss": 0.1422, + "step": 3568 + }, + { + "epoch": 0.3288340166766481, + "grad_norm": 0.9782404984989339, + "learning_rate": 4.010445819411369e-06, + "loss": 0.1684, + "step": 3569 + }, + { + "epoch": 0.3289261528539181, + "grad_norm": 0.8829362934235169, + "learning_rate": 4.009838723256278e-06, + "loss": 0.1522, + "step": 3570 + }, + { + "epoch": 0.32901828903118807, + "grad_norm": 0.9407625480923458, + "learning_rate": 4.009231486912498e-06, + "loss": 0.1656, + "step": 3571 + }, + { + "epoch": 0.3291104252084581, + "grad_norm": 0.8848548864415823, + "learning_rate": 4.008624110436413e-06, + "loss": 0.1527, + "step": 3572 + }, + { + "epoch": 0.3292025613857281, + "grad_norm": 0.9301909339470641, + "learning_rate": 4.008016593884416e-06, + "loss": 0.1546, + "step": 3573 + }, + { + "epoch": 0.3292946975629981, + "grad_norm": 0.9320406389327086, + "learning_rate": 4.0074089373129165e-06, + "loss": 0.1522, + "step": 3574 + }, + { + "epoch": 0.3293868337402681, + "grad_norm": 0.8886539369086722, + "learning_rate": 4.006801140778335e-06, + "loss": 0.1472, + "step": 3575 + }, + { + "epoch": 0.3294789699175381, + "grad_norm": 0.9537007448167414, + "learning_rate": 4.006193204337106e-06, + "loss": 0.1604, + "step": 3576 + }, + { + "epoch": 0.3295711060948081, + "grad_norm": 0.8619622908907076, + "learning_rate": 4.005585128045675e-06, + "loss": 0.1431, + "step": 3577 + }, + { + "epoch": 0.3296632422720781, + "grad_norm": 0.9035882099898597, + "learning_rate": 4.004976911960503e-06, + "loss": 0.1629, + "step": 3578 + }, + { + "epoch": 0.32975537844934816, + "grad_norm": 0.9564187518876925, + "learning_rate": 4.004368556138062e-06, + "loss": 0.1551, + "step": 3579 + }, + { + "epoch": 0.32984751462661815, + "grad_norm": 0.8805677607577047, + "learning_rate": 4.003760060634839e-06, + "loss": 0.1552, + "step": 3580 + }, + { + "epoch": 0.32993965080388815, + "grad_norm": 0.8140917034555333, + "learning_rate": 4.003151425507333e-06, + "loss": 0.1515, + "step": 3581 + }, + { + "epoch": 0.33003178698115815, + "grad_norm": 0.9419332145704289, + "learning_rate": 4.002542650812056e-06, + "loss": 0.1705, + "step": 3582 + }, + { + "epoch": 0.33012392315842815, + "grad_norm": 0.8431186709355069, + "learning_rate": 4.001933736605531e-06, + "loss": 0.1577, + "step": 3583 + }, + { + "epoch": 0.33021605933569814, + "grad_norm": 0.9288861117347793, + "learning_rate": 4.001324682944297e-06, + "loss": 0.1623, + "step": 3584 + }, + { + "epoch": 0.3303081955129682, + "grad_norm": 0.9538522526825776, + "learning_rate": 4.000715489884906e-06, + "loss": 0.1561, + "step": 3585 + }, + { + "epoch": 0.3304003316902382, + "grad_norm": 0.8871395638894523, + "learning_rate": 4.000106157483919e-06, + "loss": 0.1475, + "step": 3586 + }, + { + "epoch": 0.3304924678675082, + "grad_norm": 0.9324362611728906, + "learning_rate": 3.999496685797914e-06, + "loss": 0.1643, + "step": 3587 + }, + { + "epoch": 0.3305846040447782, + "grad_norm": 0.9712778225713625, + "learning_rate": 3.998887074883481e-06, + "loss": 0.16, + "step": 3588 + }, + { + "epoch": 0.3306767402220482, + "grad_norm": 0.9202172732232993, + "learning_rate": 3.9982773247972204e-06, + "loss": 0.1718, + "step": 3589 + }, + { + "epoch": 0.3307688763993182, + "grad_norm": 0.9304776376651178, + "learning_rate": 3.99766743559575e-06, + "loss": 0.1617, + "step": 3590 + }, + { + "epoch": 0.3308610125765882, + "grad_norm": 0.8970303287240191, + "learning_rate": 3.997057407335697e-06, + "loss": 0.1537, + "step": 3591 + }, + { + "epoch": 0.33095314875385823, + "grad_norm": 0.9275835584300335, + "learning_rate": 3.996447240073702e-06, + "loss": 0.1617, + "step": 3592 + }, + { + "epoch": 0.3310452849311282, + "grad_norm": 0.917024390620923, + "learning_rate": 3.995836933866421e-06, + "loss": 0.1493, + "step": 3593 + }, + { + "epoch": 0.3311374211083982, + "grad_norm": 0.8871456134644057, + "learning_rate": 3.995226488770519e-06, + "loss": 0.1603, + "step": 3594 + }, + { + "epoch": 0.3312295572856682, + "grad_norm": 0.9237380887364842, + "learning_rate": 3.994615904842676e-06, + "loss": 0.1612, + "step": 3595 + }, + { + "epoch": 0.3313216934629382, + "grad_norm": 0.990513202871709, + "learning_rate": 3.994005182139586e-06, + "loss": 0.1813, + "step": 3596 + }, + { + "epoch": 0.3314138296402082, + "grad_norm": 0.8947980698760637, + "learning_rate": 3.993394320717952e-06, + "loss": 0.1597, + "step": 3597 + }, + { + "epoch": 0.3315059658174782, + "grad_norm": 0.8966130531365468, + "learning_rate": 3.992783320634498e-06, + "loss": 0.1464, + "step": 3598 + }, + { + "epoch": 0.33159810199474826, + "grad_norm": 0.9026903822003577, + "learning_rate": 3.992172181945951e-06, + "loss": 0.1658, + "step": 3599 + }, + { + "epoch": 0.33169023817201826, + "grad_norm": 0.9037038636134237, + "learning_rate": 3.991560904709055e-06, + "loss": 0.166, + "step": 3600 + }, + { + "epoch": 0.33178237434928826, + "grad_norm": 0.9647233382895593, + "learning_rate": 3.990949488980569e-06, + "loss": 0.161, + "step": 3601 + }, + { + "epoch": 0.33187451052655825, + "grad_norm": 0.9559015439299133, + "learning_rate": 3.990337934817263e-06, + "loss": 0.1685, + "step": 3602 + }, + { + "epoch": 0.33196664670382825, + "grad_norm": 0.8896214610619697, + "learning_rate": 3.989726242275919e-06, + "loss": 0.1489, + "step": 3603 + }, + { + "epoch": 0.33205878288109825, + "grad_norm": 0.9415012706281953, + "learning_rate": 3.989114411413333e-06, + "loss": 0.1636, + "step": 3604 + }, + { + "epoch": 0.33215091905836824, + "grad_norm": 0.9134402537640711, + "learning_rate": 3.988502442286314e-06, + "loss": 0.1496, + "step": 3605 + }, + { + "epoch": 0.3322430552356383, + "grad_norm": 0.8943155472774135, + "learning_rate": 3.987890334951683e-06, + "loss": 0.1521, + "step": 3606 + }, + { + "epoch": 0.3323351914129083, + "grad_norm": 0.9865044679709729, + "learning_rate": 3.987278089466274e-06, + "loss": 0.1754, + "step": 3607 + }, + { + "epoch": 0.3324273275901783, + "grad_norm": 0.9439032580162637, + "learning_rate": 3.986665705886934e-06, + "loss": 0.1557, + "step": 3608 + }, + { + "epoch": 0.3325194637674483, + "grad_norm": 0.8531927782870514, + "learning_rate": 3.986053184270524e-06, + "loss": 0.1541, + "step": 3609 + }, + { + "epoch": 0.3326115999447183, + "grad_norm": 0.9242320549076447, + "learning_rate": 3.9854405246739155e-06, + "loss": 0.1632, + "step": 3610 + }, + { + "epoch": 0.3327037361219883, + "grad_norm": 0.9688725514867598, + "learning_rate": 3.984827727153995e-06, + "loss": 0.1606, + "step": 3611 + }, + { + "epoch": 0.3327958722992583, + "grad_norm": 0.9171240245963733, + "learning_rate": 3.984214791767659e-06, + "loss": 0.1548, + "step": 3612 + }, + { + "epoch": 0.33288800847652833, + "grad_norm": 0.9001926653232694, + "learning_rate": 3.983601718571821e-06, + "loss": 0.1577, + "step": 3613 + }, + { + "epoch": 0.3329801446537983, + "grad_norm": 0.8394494549911041, + "learning_rate": 3.982988507623403e-06, + "loss": 0.1532, + "step": 3614 + }, + { + "epoch": 0.3330722808310683, + "grad_norm": 0.8980142878215236, + "learning_rate": 3.982375158979344e-06, + "loss": 0.1573, + "step": 3615 + }, + { + "epoch": 0.3331644170083383, + "grad_norm": 0.883954929348229, + "learning_rate": 3.98176167269659e-06, + "loss": 0.1479, + "step": 3616 + }, + { + "epoch": 0.3332565531856083, + "grad_norm": 0.9567135747589431, + "learning_rate": 3.981148048832106e-06, + "loss": 0.1608, + "step": 3617 + }, + { + "epoch": 0.3333486893628783, + "grad_norm": 0.8848169559034494, + "learning_rate": 3.980534287442866e-06, + "loss": 0.1521, + "step": 3618 + }, + { + "epoch": 0.33344082554014837, + "grad_norm": 0.9362944149112743, + "learning_rate": 3.9799203885858584e-06, + "loss": 0.1519, + "step": 3619 + }, + { + "epoch": 0.33353296171741836, + "grad_norm": 0.8514694979616128, + "learning_rate": 3.979306352318083e-06, + "loss": 0.1425, + "step": 3620 + }, + { + "epoch": 0.33362509789468836, + "grad_norm": 0.8970641312249314, + "learning_rate": 3.978692178696555e-06, + "loss": 0.1494, + "step": 3621 + }, + { + "epoch": 0.33371723407195836, + "grad_norm": 0.9766601140568355, + "learning_rate": 3.9780778677782974e-06, + "loss": 0.1709, + "step": 3622 + }, + { + "epoch": 0.33380937024922835, + "grad_norm": 0.9966149520552534, + "learning_rate": 3.977463419620352e-06, + "loss": 0.1779, + "step": 3623 + }, + { + "epoch": 0.33390150642649835, + "grad_norm": 0.9568555752316749, + "learning_rate": 3.976848834279767e-06, + "loss": 0.1615, + "step": 3624 + }, + { + "epoch": 0.33399364260376835, + "grad_norm": 0.919492432719039, + "learning_rate": 3.976234111813611e-06, + "loss": 0.1568, + "step": 3625 + }, + { + "epoch": 0.3340857787810384, + "grad_norm": 0.9378481548085038, + "learning_rate": 3.975619252278958e-06, + "loss": 0.1648, + "step": 3626 + }, + { + "epoch": 0.3341779149583084, + "grad_norm": 0.9358170612115819, + "learning_rate": 3.9750042557328986e-06, + "loss": 0.1494, + "step": 3627 + }, + { + "epoch": 0.3342700511355784, + "grad_norm": 0.892577374929836, + "learning_rate": 3.974389122232536e-06, + "loss": 0.1505, + "step": 3628 + }, + { + "epoch": 0.3343621873128484, + "grad_norm": 0.8907012935380049, + "learning_rate": 3.973773851834983e-06, + "loss": 0.1441, + "step": 3629 + }, + { + "epoch": 0.3344543234901184, + "grad_norm": 0.9104396913688365, + "learning_rate": 3.973158444597371e-06, + "loss": 0.1566, + "step": 3630 + }, + { + "epoch": 0.3345464596673884, + "grad_norm": 1.0018321492907019, + "learning_rate": 3.972542900576838e-06, + "loss": 0.1642, + "step": 3631 + }, + { + "epoch": 0.3346385958446584, + "grad_norm": 1.0043463523252036, + "learning_rate": 3.9719272198305385e-06, + "loss": 0.1546, + "step": 3632 + }, + { + "epoch": 0.33473073202192843, + "grad_norm": 0.9422176961049771, + "learning_rate": 3.971311402415638e-06, + "loss": 0.154, + "step": 3633 + }, + { + "epoch": 0.33482286819919843, + "grad_norm": 0.9556246249227124, + "learning_rate": 3.970695448389315e-06, + "loss": 0.1566, + "step": 3634 + }, + { + "epoch": 0.33491500437646843, + "grad_norm": 0.9922729925416031, + "learning_rate": 3.970079357808763e-06, + "loss": 0.1522, + "step": 3635 + }, + { + "epoch": 0.3350071405537384, + "grad_norm": 0.9364461746995512, + "learning_rate": 3.969463130731183e-06, + "loss": 0.1658, + "step": 3636 + }, + { + "epoch": 0.3350992767310084, + "grad_norm": 0.9077754495843189, + "learning_rate": 3.968846767213794e-06, + "loss": 0.1638, + "step": 3637 + }, + { + "epoch": 0.3351914129082784, + "grad_norm": 0.9642592983646728, + "learning_rate": 3.968230267313824e-06, + "loss": 0.1607, + "step": 3638 + }, + { + "epoch": 0.3352835490855484, + "grad_norm": 0.9523862140574059, + "learning_rate": 3.967613631088516e-06, + "loss": 0.1602, + "step": 3639 + }, + { + "epoch": 0.33537568526281847, + "grad_norm": 0.853738339970035, + "learning_rate": 3.966996858595123e-06, + "loss": 0.1621, + "step": 3640 + }, + { + "epoch": 0.33546782144008847, + "grad_norm": 0.9435129135901393, + "learning_rate": 3.966379949890916e-06, + "loss": 0.1479, + "step": 3641 + }, + { + "epoch": 0.33555995761735846, + "grad_norm": 1.0237409152349508, + "learning_rate": 3.965762905033171e-06, + "loss": 0.1442, + "step": 3642 + }, + { + "epoch": 0.33565209379462846, + "grad_norm": 1.004827274729739, + "learning_rate": 3.965145724079184e-06, + "loss": 0.1727, + "step": 3643 + }, + { + "epoch": 0.33574422997189846, + "grad_norm": 0.9012970937284178, + "learning_rate": 3.964528407086259e-06, + "loss": 0.1582, + "step": 3644 + }, + { + "epoch": 0.33583636614916845, + "grad_norm": 0.9084801893441767, + "learning_rate": 3.963910954111712e-06, + "loss": 0.1427, + "step": 3645 + }, + { + "epoch": 0.33592850232643845, + "grad_norm": 1.0458487380125516, + "learning_rate": 3.9632933652128765e-06, + "loss": 0.1634, + "step": 3646 + }, + { + "epoch": 0.3360206385037085, + "grad_norm": 0.9627427187542494, + "learning_rate": 3.962675640447094e-06, + "loss": 0.1768, + "step": 3647 + }, + { + "epoch": 0.3361127746809785, + "grad_norm": 0.942436602646475, + "learning_rate": 3.962057779871722e-06, + "loss": 0.169, + "step": 3648 + }, + { + "epoch": 0.3362049108582485, + "grad_norm": 1.0233983022133986, + "learning_rate": 3.961439783544126e-06, + "loss": 0.1548, + "step": 3649 + }, + { + "epoch": 0.3362970470355185, + "grad_norm": 0.8673192470735941, + "learning_rate": 3.960821651521691e-06, + "loss": 0.1531, + "step": 3650 + }, + { + "epoch": 0.3363891832127885, + "grad_norm": 0.9154833282149483, + "learning_rate": 3.960203383861807e-06, + "loss": 0.1662, + "step": 3651 + }, + { + "epoch": 0.3364813193900585, + "grad_norm": 0.9145509274938834, + "learning_rate": 3.959584980621883e-06, + "loss": 0.1519, + "step": 3652 + }, + { + "epoch": 0.33657345556732854, + "grad_norm": 0.9274327627736129, + "learning_rate": 3.958966441859335e-06, + "loss": 0.161, + "step": 3653 + }, + { + "epoch": 0.33666559174459854, + "grad_norm": 0.93938164864867, + "learning_rate": 3.958347767631595e-06, + "loss": 0.1478, + "step": 3654 + }, + { + "epoch": 0.33675772792186853, + "grad_norm": 1.0533654788708426, + "learning_rate": 3.95772895799611e-06, + "loss": 0.1577, + "step": 3655 + }, + { + "epoch": 0.33684986409913853, + "grad_norm": 0.8832765620148048, + "learning_rate": 3.957110013010333e-06, + "loss": 0.1574, + "step": 3656 + }, + { + "epoch": 0.3369420002764085, + "grad_norm": 0.9261190045148706, + "learning_rate": 3.9564909327317355e-06, + "loss": 0.1531, + "step": 3657 + }, + { + "epoch": 0.3370341364536785, + "grad_norm": 0.9237710676896893, + "learning_rate": 3.955871717217797e-06, + "loss": 0.1433, + "step": 3658 + }, + { + "epoch": 0.3371262726309485, + "grad_norm": 0.9487715568740481, + "learning_rate": 3.955252366526014e-06, + "loss": 0.1626, + "step": 3659 + }, + { + "epoch": 0.3372184088082186, + "grad_norm": 1.0072562563839196, + "learning_rate": 3.954632880713891e-06, + "loss": 0.1569, + "step": 3660 + }, + { + "epoch": 0.33731054498548857, + "grad_norm": 0.9446053903813388, + "learning_rate": 3.954013259838949e-06, + "loss": 0.1612, + "step": 3661 + }, + { + "epoch": 0.33740268116275857, + "grad_norm": 0.8729387006885364, + "learning_rate": 3.95339350395872e-06, + "loss": 0.1407, + "step": 3662 + }, + { + "epoch": 0.33749481734002856, + "grad_norm": 0.9221098538932074, + "learning_rate": 3.952773613130747e-06, + "loss": 0.177, + "step": 3663 + }, + { + "epoch": 0.33758695351729856, + "grad_norm": 0.86712001263126, + "learning_rate": 3.9521535874125875e-06, + "loss": 0.1529, + "step": 3664 + }, + { + "epoch": 0.33767908969456856, + "grad_norm": 0.8922847012391321, + "learning_rate": 3.951533426861812e-06, + "loss": 0.1519, + "step": 3665 + }, + { + "epoch": 0.33777122587183855, + "grad_norm": 0.940770601309715, + "learning_rate": 3.950913131536001e-06, + "loss": 0.1431, + "step": 3666 + }, + { + "epoch": 0.3378633620491086, + "grad_norm": 0.9514263978222713, + "learning_rate": 3.950292701492749e-06, + "loss": 0.1693, + "step": 3667 + }, + { + "epoch": 0.3379554982263786, + "grad_norm": 0.8291431585281795, + "learning_rate": 3.949672136789665e-06, + "loss": 0.138, + "step": 3668 + }, + { + "epoch": 0.3380476344036486, + "grad_norm": 0.9627031886750439, + "learning_rate": 3.949051437484367e-06, + "loss": 0.1486, + "step": 3669 + }, + { + "epoch": 0.3381397705809186, + "grad_norm": 0.9203209058165884, + "learning_rate": 3.948430603634486e-06, + "loss": 0.1483, + "step": 3670 + }, + { + "epoch": 0.3382319067581886, + "grad_norm": 0.8850216723638966, + "learning_rate": 3.947809635297668e-06, + "loss": 0.1517, + "step": 3671 + }, + { + "epoch": 0.3383240429354586, + "grad_norm": 0.9675759754223425, + "learning_rate": 3.9471885325315695e-06, + "loss": 0.1727, + "step": 3672 + }, + { + "epoch": 0.3384161791127286, + "grad_norm": 0.9573188172538333, + "learning_rate": 3.94656729539386e-06, + "loss": 0.1555, + "step": 3673 + }, + { + "epoch": 0.33850831528999864, + "grad_norm": 0.9600208385443318, + "learning_rate": 3.945945923942221e-06, + "loss": 0.1635, + "step": 3674 + }, + { + "epoch": 0.33860045146726864, + "grad_norm": 0.9380222087962892, + "learning_rate": 3.945324418234349e-06, + "loss": 0.1637, + "step": 3675 + }, + { + "epoch": 0.33869258764453863, + "grad_norm": 0.9400532586305578, + "learning_rate": 3.944702778327948e-06, + "loss": 0.1479, + "step": 3676 + }, + { + "epoch": 0.33878472382180863, + "grad_norm": 0.8849537883410138, + "learning_rate": 3.944081004280738e-06, + "loss": 0.1577, + "step": 3677 + }, + { + "epoch": 0.33887685999907863, + "grad_norm": 0.893123818162334, + "learning_rate": 3.943459096150452e-06, + "loss": 0.1505, + "step": 3678 + }, + { + "epoch": 0.3389689961763486, + "grad_norm": 0.902068247100491, + "learning_rate": 3.942837053994834e-06, + "loss": 0.1561, + "step": 3679 + }, + { + "epoch": 0.3390611323536186, + "grad_norm": 0.9182592324604583, + "learning_rate": 3.942214877871639e-06, + "loss": 0.1631, + "step": 3680 + }, + { + "epoch": 0.3391532685308887, + "grad_norm": 0.9270539702602028, + "learning_rate": 3.941592567838638e-06, + "loss": 0.1599, + "step": 3681 + }, + { + "epoch": 0.33924540470815867, + "grad_norm": 0.869777337642264, + "learning_rate": 3.940970123953613e-06, + "loss": 0.1481, + "step": 3682 + }, + { + "epoch": 0.33933754088542867, + "grad_norm": 0.86276183059958, + "learning_rate": 3.940347546274355e-06, + "loss": 0.156, + "step": 3683 + }, + { + "epoch": 0.33942967706269866, + "grad_norm": 0.9137393832409828, + "learning_rate": 3.9397248348586735e-06, + "loss": 0.1548, + "step": 3684 + }, + { + "epoch": 0.33952181323996866, + "grad_norm": 0.8612576797475836, + "learning_rate": 3.939101989764386e-06, + "loss": 0.1437, + "step": 3685 + }, + { + "epoch": 0.33961394941723866, + "grad_norm": 0.8495987603787247, + "learning_rate": 3.938479011049324e-06, + "loss": 0.1368, + "step": 3686 + }, + { + "epoch": 0.3397060855945087, + "grad_norm": 0.9270752934332105, + "learning_rate": 3.937855898771331e-06, + "loss": 0.1525, + "step": 3687 + }, + { + "epoch": 0.3397982217717787, + "grad_norm": 0.9485150781196504, + "learning_rate": 3.9372326529882635e-06, + "loss": 0.172, + "step": 3688 + }, + { + "epoch": 0.3398903579490487, + "grad_norm": 0.9116673543666898, + "learning_rate": 3.936609273757988e-06, + "loss": 0.1527, + "step": 3689 + }, + { + "epoch": 0.3399824941263187, + "grad_norm": 0.9229394432273127, + "learning_rate": 3.935985761138388e-06, + "loss": 0.1564, + "step": 3690 + }, + { + "epoch": 0.3400746303035887, + "grad_norm": 0.8605203140126699, + "learning_rate": 3.935362115187356e-06, + "loss": 0.1446, + "step": 3691 + }, + { + "epoch": 0.3401667664808587, + "grad_norm": 0.9058126161505262, + "learning_rate": 3.934738335962796e-06, + "loss": 0.1689, + "step": 3692 + }, + { + "epoch": 0.3402589026581287, + "grad_norm": 0.8737393124424059, + "learning_rate": 3.934114423522627e-06, + "loss": 0.1464, + "step": 3693 + }, + { + "epoch": 0.34035103883539874, + "grad_norm": 0.9203947578164606, + "learning_rate": 3.93349037792478e-06, + "loss": 0.1603, + "step": 3694 + }, + { + "epoch": 0.34044317501266874, + "grad_norm": 0.871739564957307, + "learning_rate": 3.932866199227196e-06, + "loss": 0.1527, + "step": 3695 + }, + { + "epoch": 0.34053531118993874, + "grad_norm": 0.8450165346429883, + "learning_rate": 3.932241887487834e-06, + "loss": 0.1608, + "step": 3696 + }, + { + "epoch": 0.34062744736720874, + "grad_norm": 0.9199114792500082, + "learning_rate": 3.931617442764656e-06, + "loss": 0.1627, + "step": 3697 + }, + { + "epoch": 0.34071958354447873, + "grad_norm": 0.9050247845325442, + "learning_rate": 3.930992865115645e-06, + "loss": 0.1604, + "step": 3698 + }, + { + "epoch": 0.34081171972174873, + "grad_norm": 0.9139956710219851, + "learning_rate": 3.930368154598793e-06, + "loss": 0.1572, + "step": 3699 + }, + { + "epoch": 0.3409038558990187, + "grad_norm": 0.8631013927561422, + "learning_rate": 3.929743311272104e-06, + "loss": 0.1598, + "step": 3700 + }, + { + "epoch": 0.3409959920762888, + "grad_norm": 0.8943273656977337, + "learning_rate": 3.929118335193594e-06, + "loss": 0.1538, + "step": 3701 + }, + { + "epoch": 0.3410881282535588, + "grad_norm": 0.8850875675008057, + "learning_rate": 3.9284932264212925e-06, + "loss": 0.1595, + "step": 3702 + }, + { + "epoch": 0.3411802644308288, + "grad_norm": 0.9126308299525047, + "learning_rate": 3.927867985013242e-06, + "loss": 0.1596, + "step": 3703 + }, + { + "epoch": 0.34127240060809877, + "grad_norm": 0.9408328545751462, + "learning_rate": 3.9272426110274955e-06, + "loss": 0.1655, + "step": 3704 + }, + { + "epoch": 0.34136453678536877, + "grad_norm": 0.9617011692840643, + "learning_rate": 3.926617104522118e-06, + "loss": 0.1766, + "step": 3705 + }, + { + "epoch": 0.34145667296263876, + "grad_norm": 0.9237258704007577, + "learning_rate": 3.92599146555519e-06, + "loss": 0.1627, + "step": 3706 + }, + { + "epoch": 0.34154880913990876, + "grad_norm": 0.9690704139681786, + "learning_rate": 3.9253656941848e-06, + "loss": 0.1639, + "step": 3707 + }, + { + "epoch": 0.3416409453171788, + "grad_norm": 0.9982076700789548, + "learning_rate": 3.9247397904690526e-06, + "loss": 0.1669, + "step": 3708 + }, + { + "epoch": 0.3417330814944488, + "grad_norm": 0.9471486226369275, + "learning_rate": 3.924113754466062e-06, + "loss": 0.1528, + "step": 3709 + }, + { + "epoch": 0.3418252176717188, + "grad_norm": 0.9104829825095512, + "learning_rate": 3.923487586233956e-06, + "loss": 0.153, + "step": 3710 + }, + { + "epoch": 0.3419173538489888, + "grad_norm": 0.956036911754247, + "learning_rate": 3.922861285830874e-06, + "loss": 0.1599, + "step": 3711 + }, + { + "epoch": 0.3420094900262588, + "grad_norm": 0.8919804779273773, + "learning_rate": 3.922234853314969e-06, + "loss": 0.1563, + "step": 3712 + }, + { + "epoch": 0.3421016262035288, + "grad_norm": 0.8444794902726142, + "learning_rate": 3.921608288744405e-06, + "loss": 0.1571, + "step": 3713 + }, + { + "epoch": 0.3421937623807988, + "grad_norm": 0.8359544466312437, + "learning_rate": 3.920981592177358e-06, + "loss": 0.1327, + "step": 3714 + }, + { + "epoch": 0.34228589855806885, + "grad_norm": 0.9597616019869197, + "learning_rate": 3.920354763672017e-06, + "loss": 0.1589, + "step": 3715 + }, + { + "epoch": 0.34237803473533884, + "grad_norm": 0.838642030459525, + "learning_rate": 3.9197278032865835e-06, + "loss": 0.146, + "step": 3716 + }, + { + "epoch": 0.34247017091260884, + "grad_norm": 0.9531398921954629, + "learning_rate": 3.919100711079271e-06, + "loss": 0.1544, + "step": 3717 + }, + { + "epoch": 0.34256230708987884, + "grad_norm": 0.9489530931832708, + "learning_rate": 3.918473487108305e-06, + "loss": 0.1521, + "step": 3718 + }, + { + "epoch": 0.34265444326714883, + "grad_norm": 0.9305730738295115, + "learning_rate": 3.917846131431923e-06, + "loss": 0.1532, + "step": 3719 + }, + { + "epoch": 0.34274657944441883, + "grad_norm": 0.9741820408835091, + "learning_rate": 3.917218644108375e-06, + "loss": 0.1548, + "step": 3720 + }, + { + "epoch": 0.3428387156216889, + "grad_norm": 1.0039496017353446, + "learning_rate": 3.916591025195923e-06, + "loss": 0.1618, + "step": 3721 + }, + { + "epoch": 0.3429308517989589, + "grad_norm": 0.975277791866181, + "learning_rate": 3.915963274752842e-06, + "loss": 0.1598, + "step": 3722 + }, + { + "epoch": 0.3430229879762289, + "grad_norm": 0.9510130783589238, + "learning_rate": 3.915335392837418e-06, + "loss": 0.1608, + "step": 3723 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 0.9307546556982514, + "learning_rate": 3.914707379507952e-06, + "loss": 0.1429, + "step": 3724 + }, + { + "epoch": 0.34320726033076887, + "grad_norm": 0.9390570411218914, + "learning_rate": 3.914079234822752e-06, + "loss": 0.1562, + "step": 3725 + }, + { + "epoch": 0.34329939650803887, + "grad_norm": 0.8986706044646279, + "learning_rate": 3.913450958840144e-06, + "loss": 0.1487, + "step": 3726 + }, + { + "epoch": 0.34339153268530886, + "grad_norm": 0.9437748108420285, + "learning_rate": 3.912822551618461e-06, + "loss": 0.1618, + "step": 3727 + }, + { + "epoch": 0.3434836688625789, + "grad_norm": 0.8769176358148374, + "learning_rate": 3.912194013216053e-06, + "loss": 0.1582, + "step": 3728 + }, + { + "epoch": 0.3435758050398489, + "grad_norm": 0.8809803011658778, + "learning_rate": 3.911565343691279e-06, + "loss": 0.1441, + "step": 3729 + }, + { + "epoch": 0.3436679412171189, + "grad_norm": 1.0717667235658699, + "learning_rate": 3.910936543102511e-06, + "loss": 0.1624, + "step": 3730 + }, + { + "epoch": 0.3437600773943889, + "grad_norm": 0.8995225605847589, + "learning_rate": 3.910307611508133e-06, + "loss": 0.1491, + "step": 3731 + }, + { + "epoch": 0.3438522135716589, + "grad_norm": 0.9128137084403737, + "learning_rate": 3.9096785489665405e-06, + "loss": 0.146, + "step": 3732 + }, + { + "epoch": 0.3439443497489289, + "grad_norm": 0.9476732140767965, + "learning_rate": 3.9090493555361445e-06, + "loss": 0.1623, + "step": 3733 + }, + { + "epoch": 0.3440364859261989, + "grad_norm": 0.8873546249523853, + "learning_rate": 3.908420031275363e-06, + "loss": 0.1448, + "step": 3734 + }, + { + "epoch": 0.34412862210346895, + "grad_norm": 0.9694870317753858, + "learning_rate": 3.907790576242631e-06, + "loss": 0.159, + "step": 3735 + }, + { + "epoch": 0.34422075828073895, + "grad_norm": 0.8987721407984873, + "learning_rate": 3.907160990496392e-06, + "loss": 0.1518, + "step": 3736 + }, + { + "epoch": 0.34431289445800894, + "grad_norm": 0.8835955419714732, + "learning_rate": 3.9065312740951035e-06, + "loss": 0.1551, + "step": 3737 + }, + { + "epoch": 0.34440503063527894, + "grad_norm": 0.9036042800205629, + "learning_rate": 3.905901427097235e-06, + "loss": 0.1389, + "step": 3738 + }, + { + "epoch": 0.34449716681254894, + "grad_norm": 0.9288485278514401, + "learning_rate": 3.9052714495612675e-06, + "loss": 0.1479, + "step": 3739 + }, + { + "epoch": 0.34458930298981894, + "grad_norm": 0.832981619207698, + "learning_rate": 3.904641341545694e-06, + "loss": 0.143, + "step": 3740 + }, + { + "epoch": 0.34468143916708893, + "grad_norm": 0.8967549193356065, + "learning_rate": 3.904011103109022e-06, + "loss": 0.1571, + "step": 3741 + }, + { + "epoch": 0.344773575344359, + "grad_norm": 1.0107971066325179, + "learning_rate": 3.903380734309767e-06, + "loss": 0.1667, + "step": 3742 + }, + { + "epoch": 0.344865711521629, + "grad_norm": 0.9379380617255512, + "learning_rate": 3.90275023520646e-06, + "loss": 0.1539, + "step": 3743 + }, + { + "epoch": 0.344957847698899, + "grad_norm": 0.8811406590696291, + "learning_rate": 3.902119605857644e-06, + "loss": 0.1546, + "step": 3744 + }, + { + "epoch": 0.345049983876169, + "grad_norm": 0.8930279758698314, + "learning_rate": 3.90148884632187e-06, + "loss": 0.135, + "step": 3745 + }, + { + "epoch": 0.34514212005343897, + "grad_norm": 0.9641434925117012, + "learning_rate": 3.900857956657707e-06, + "loss": 0.1506, + "step": 3746 + }, + { + "epoch": 0.34523425623070897, + "grad_norm": 0.9518783618290042, + "learning_rate": 3.900226936923731e-06, + "loss": 0.1406, + "step": 3747 + }, + { + "epoch": 0.34532639240797897, + "grad_norm": 0.9648681621137352, + "learning_rate": 3.899595787178534e-06, + "loss": 0.1551, + "step": 3748 + }, + { + "epoch": 0.345418528585249, + "grad_norm": 0.92389069120726, + "learning_rate": 3.898964507480717e-06, + "loss": 0.147, + "step": 3749 + }, + { + "epoch": 0.345510664762519, + "grad_norm": 0.9264829787786033, + "learning_rate": 3.8983330978888955e-06, + "loss": 0.1523, + "step": 3750 + }, + { + "epoch": 0.345602800939789, + "grad_norm": 0.9856484497779876, + "learning_rate": 3.897701558461695e-06, + "loss": 0.1709, + "step": 3751 + }, + { + "epoch": 0.345694937117059, + "grad_norm": 0.9104132120981927, + "learning_rate": 3.897069889257754e-06, + "loss": 0.1615, + "step": 3752 + }, + { + "epoch": 0.345787073294329, + "grad_norm": 0.8928954674177243, + "learning_rate": 3.8964380903357244e-06, + "loss": 0.1491, + "step": 3753 + }, + { + "epoch": 0.345879209471599, + "grad_norm": 0.9773042752590833, + "learning_rate": 3.895806161754267e-06, + "loss": 0.1555, + "step": 3754 + }, + { + "epoch": 0.34597134564886906, + "grad_norm": 0.9934249525249464, + "learning_rate": 3.895174103572057e-06, + "loss": 0.1739, + "step": 3755 + }, + { + "epoch": 0.34606348182613905, + "grad_norm": 0.9725688234311249, + "learning_rate": 3.894541915847783e-06, + "loss": 0.1661, + "step": 3756 + }, + { + "epoch": 0.34615561800340905, + "grad_norm": 0.9447443585635159, + "learning_rate": 3.89390959864014e-06, + "loss": 0.159, + "step": 3757 + }, + { + "epoch": 0.34624775418067905, + "grad_norm": 0.8845525119997683, + "learning_rate": 3.893277152007842e-06, + "loss": 0.1549, + "step": 3758 + }, + { + "epoch": 0.34633989035794904, + "grad_norm": 0.8523390340352258, + "learning_rate": 3.89264457600961e-06, + "loss": 0.1441, + "step": 3759 + }, + { + "epoch": 0.34643202653521904, + "grad_norm": 0.9244438703386956, + "learning_rate": 3.892011870704179e-06, + "loss": 0.1581, + "step": 3760 + }, + { + "epoch": 0.34652416271248904, + "grad_norm": 0.9038773135362822, + "learning_rate": 3.891379036150297e-06, + "loss": 0.1408, + "step": 3761 + }, + { + "epoch": 0.3466162988897591, + "grad_norm": 0.8333273764873563, + "learning_rate": 3.89074607240672e-06, + "loss": 0.1503, + "step": 3762 + }, + { + "epoch": 0.3467084350670291, + "grad_norm": 0.9480337430008207, + "learning_rate": 3.890112979532222e-06, + "loss": 0.1541, + "step": 3763 + }, + { + "epoch": 0.3468005712442991, + "grad_norm": 0.9599410814646354, + "learning_rate": 3.889479757585584e-06, + "loss": 0.1665, + "step": 3764 + }, + { + "epoch": 0.3468927074215691, + "grad_norm": 0.9433260473515176, + "learning_rate": 3.888846406625601e-06, + "loss": 0.1454, + "step": 3765 + }, + { + "epoch": 0.3469848435988391, + "grad_norm": 0.9380561623033904, + "learning_rate": 3.888212926711079e-06, + "loss": 0.1573, + "step": 3766 + }, + { + "epoch": 0.3470769797761091, + "grad_norm": 1.0103890768413186, + "learning_rate": 3.887579317900838e-06, + "loss": 0.1512, + "step": 3767 + }, + { + "epoch": 0.34716911595337907, + "grad_norm": 0.9545274750885263, + "learning_rate": 3.886945580253708e-06, + "loss": 0.1507, + "step": 3768 + }, + { + "epoch": 0.3472612521306491, + "grad_norm": 0.8545779025060927, + "learning_rate": 3.886311713828531e-06, + "loss": 0.134, + "step": 3769 + }, + { + "epoch": 0.3473533883079191, + "grad_norm": 0.9256233241238948, + "learning_rate": 3.885677718684163e-06, + "loss": 0.1652, + "step": 3770 + }, + { + "epoch": 0.3474455244851891, + "grad_norm": 0.8881952279984489, + "learning_rate": 3.885043594879469e-06, + "loss": 0.1515, + "step": 3771 + }, + { + "epoch": 0.3475376606624591, + "grad_norm": 0.9239248201411169, + "learning_rate": 3.884409342473329e-06, + "loss": 0.1722, + "step": 3772 + }, + { + "epoch": 0.3476297968397291, + "grad_norm": 0.9022748249991301, + "learning_rate": 3.883774961524632e-06, + "loss": 0.1591, + "step": 3773 + }, + { + "epoch": 0.3477219330169991, + "grad_norm": 0.9727583904335542, + "learning_rate": 3.88314045209228e-06, + "loss": 0.1586, + "step": 3774 + }, + { + "epoch": 0.3478140691942691, + "grad_norm": 0.8946050022186862, + "learning_rate": 3.8825058142351895e-06, + "loss": 0.1474, + "step": 3775 + }, + { + "epoch": 0.34790620537153916, + "grad_norm": 0.9516275063497269, + "learning_rate": 3.881871048012285e-06, + "loss": 0.1534, + "step": 3776 + }, + { + "epoch": 0.34799834154880915, + "grad_norm": 0.9403615984819307, + "learning_rate": 3.881236153482505e-06, + "loss": 0.1684, + "step": 3777 + }, + { + "epoch": 0.34809047772607915, + "grad_norm": 0.9788364338685362, + "learning_rate": 3.880601130704799e-06, + "loss": 0.1594, + "step": 3778 + }, + { + "epoch": 0.34818261390334915, + "grad_norm": 0.8769684675894291, + "learning_rate": 3.87996597973813e-06, + "loss": 0.1494, + "step": 3779 + }, + { + "epoch": 0.34827475008061914, + "grad_norm": 0.9241894076679832, + "learning_rate": 3.879330700641471e-06, + "loss": 0.1516, + "step": 3780 + }, + { + "epoch": 0.34836688625788914, + "grad_norm": 1.010356419388737, + "learning_rate": 3.878695293473809e-06, + "loss": 0.1786, + "step": 3781 + }, + { + "epoch": 0.34845902243515914, + "grad_norm": 0.8481571310124365, + "learning_rate": 3.878059758294139e-06, + "loss": 0.1566, + "step": 3782 + }, + { + "epoch": 0.3485511586124292, + "grad_norm": 0.9237936486021854, + "learning_rate": 3.877424095161473e-06, + "loss": 0.1643, + "step": 3783 + }, + { + "epoch": 0.3486432947896992, + "grad_norm": 0.9049124285799792, + "learning_rate": 3.8767883041348305e-06, + "loss": 0.1524, + "step": 3784 + }, + { + "epoch": 0.3487354309669692, + "grad_norm": 0.8886044487962574, + "learning_rate": 3.8761523852732475e-06, + "loss": 0.1486, + "step": 3785 + }, + { + "epoch": 0.3488275671442392, + "grad_norm": 0.8609799128855903, + "learning_rate": 3.875516338635766e-06, + "loss": 0.1456, + "step": 3786 + }, + { + "epoch": 0.3489197033215092, + "grad_norm": 0.8265506676375848, + "learning_rate": 3.874880164281446e-06, + "loss": 0.1403, + "step": 3787 + }, + { + "epoch": 0.3490118394987792, + "grad_norm": 0.8677006052855488, + "learning_rate": 3.874243862269353e-06, + "loss": 0.1505, + "step": 3788 + }, + { + "epoch": 0.3491039756760492, + "grad_norm": 0.8970711140727137, + "learning_rate": 3.87360743265857e-06, + "loss": 0.1628, + "step": 3789 + }, + { + "epoch": 0.3491961118533192, + "grad_norm": 1.0145950719528463, + "learning_rate": 3.87297087550819e-06, + "loss": 0.1624, + "step": 3790 + }, + { + "epoch": 0.3492882480305892, + "grad_norm": 0.888008623820799, + "learning_rate": 3.872334190877316e-06, + "loss": 0.1459, + "step": 3791 + }, + { + "epoch": 0.3493803842078592, + "grad_norm": 0.8672174025069913, + "learning_rate": 3.8716973788250645e-06, + "loss": 0.1444, + "step": 3792 + }, + { + "epoch": 0.3494725203851292, + "grad_norm": 0.9186929663280052, + "learning_rate": 3.871060439410563e-06, + "loss": 0.1463, + "step": 3793 + }, + { + "epoch": 0.3495646565623992, + "grad_norm": 0.8865568286318863, + "learning_rate": 3.870423372692953e-06, + "loss": 0.147, + "step": 3794 + }, + { + "epoch": 0.3496567927396692, + "grad_norm": 0.8920318935417838, + "learning_rate": 3.869786178731386e-06, + "loss": 0.1471, + "step": 3795 + }, + { + "epoch": 0.34974892891693926, + "grad_norm": 1.0001696409280874, + "learning_rate": 3.869148857585024e-06, + "loss": 0.1567, + "step": 3796 + }, + { + "epoch": 0.34984106509420926, + "grad_norm": 0.9044038444981554, + "learning_rate": 3.8685114093130436e-06, + "loss": 0.1605, + "step": 3797 + }, + { + "epoch": 0.34993320127147926, + "grad_norm": 0.9419109233515364, + "learning_rate": 3.867873833974631e-06, + "loss": 0.1628, + "step": 3798 + }, + { + "epoch": 0.35002533744874925, + "grad_norm": 0.9098145882514128, + "learning_rate": 3.867236131628985e-06, + "loss": 0.147, + "step": 3799 + }, + { + "epoch": 0.35011747362601925, + "grad_norm": 0.9675498143428517, + "learning_rate": 3.8665983023353195e-06, + "loss": 0.1613, + "step": 3800 + }, + { + "epoch": 0.35020960980328925, + "grad_norm": 0.9152259025758308, + "learning_rate": 3.865960346152853e-06, + "loss": 0.1575, + "step": 3801 + }, + { + "epoch": 0.35030174598055924, + "grad_norm": 0.9096318752456122, + "learning_rate": 3.865322263140821e-06, + "loss": 0.1603, + "step": 3802 + }, + { + "epoch": 0.3503938821578293, + "grad_norm": 0.9042773380336094, + "learning_rate": 3.86468405335847e-06, + "loss": 0.1593, + "step": 3803 + }, + { + "epoch": 0.3504860183350993, + "grad_norm": 0.8810883239456601, + "learning_rate": 3.864045716865059e-06, + "loss": 0.1564, + "step": 3804 + }, + { + "epoch": 0.3505781545123693, + "grad_norm": 0.8969771666725758, + "learning_rate": 3.863407253719855e-06, + "loss": 0.1658, + "step": 3805 + }, + { + "epoch": 0.3506702906896393, + "grad_norm": 0.8859714505584699, + "learning_rate": 3.8627686639821415e-06, + "loss": 0.1524, + "step": 3806 + }, + { + "epoch": 0.3507624268669093, + "grad_norm": 0.9396107844353678, + "learning_rate": 3.8621299477112105e-06, + "loss": 0.1594, + "step": 3807 + }, + { + "epoch": 0.3508545630441793, + "grad_norm": 0.9568604332583132, + "learning_rate": 3.861491104966368e-06, + "loss": 0.143, + "step": 3808 + }, + { + "epoch": 0.3509466992214493, + "grad_norm": 0.9548184489453653, + "learning_rate": 3.860852135806929e-06, + "loss": 0.1612, + "step": 3809 + }, + { + "epoch": 0.35103883539871933, + "grad_norm": 0.9162268999810167, + "learning_rate": 3.860213040292224e-06, + "loss": 0.1516, + "step": 3810 + }, + { + "epoch": 0.3511309715759893, + "grad_norm": 0.965092172228159, + "learning_rate": 3.85957381848159e-06, + "loss": 0.1462, + "step": 3811 + }, + { + "epoch": 0.3512231077532593, + "grad_norm": 0.9578818766366897, + "learning_rate": 3.858934470434381e-06, + "loss": 0.1489, + "step": 3812 + }, + { + "epoch": 0.3513152439305293, + "grad_norm": 0.9123089429516482, + "learning_rate": 3.858294996209961e-06, + "loss": 0.1563, + "step": 3813 + }, + { + "epoch": 0.3514073801077993, + "grad_norm": 0.9948072100858789, + "learning_rate": 3.857655395867704e-06, + "loss": 0.1571, + "step": 3814 + }, + { + "epoch": 0.3514995162850693, + "grad_norm": 0.9216205578818107, + "learning_rate": 3.857015669466998e-06, + "loss": 0.157, + "step": 3815 + }, + { + "epoch": 0.3515916524623393, + "grad_norm": 0.9864385412189361, + "learning_rate": 3.856375817067241e-06, + "loss": 0.1627, + "step": 3816 + }, + { + "epoch": 0.35168378863960936, + "grad_norm": 0.8407221015580862, + "learning_rate": 3.855735838727842e-06, + "loss": 0.1417, + "step": 3817 + }, + { + "epoch": 0.35177592481687936, + "grad_norm": 0.8902927685530272, + "learning_rate": 3.855095734508225e-06, + "loss": 0.1379, + "step": 3818 + }, + { + "epoch": 0.35186806099414936, + "grad_norm": 0.9232807543583181, + "learning_rate": 3.854455504467824e-06, + "loss": 0.1494, + "step": 3819 + }, + { + "epoch": 0.35196019717141935, + "grad_norm": 0.8747114191669552, + "learning_rate": 3.853815148666084e-06, + "loss": 0.1392, + "step": 3820 + }, + { + "epoch": 0.35205233334868935, + "grad_norm": 0.9007112346102228, + "learning_rate": 3.85317466716246e-06, + "loss": 0.1459, + "step": 3821 + }, + { + "epoch": 0.35214446952595935, + "grad_norm": 0.9703594906332523, + "learning_rate": 3.852534060016424e-06, + "loss": 0.1514, + "step": 3822 + }, + { + "epoch": 0.3522366057032294, + "grad_norm": 0.9876709700858648, + "learning_rate": 3.8518933272874546e-06, + "loss": 0.1556, + "step": 3823 + }, + { + "epoch": 0.3523287418804994, + "grad_norm": 0.9085359041715336, + "learning_rate": 3.851252469035044e-06, + "loss": 0.1647, + "step": 3824 + }, + { + "epoch": 0.3524208780577694, + "grad_norm": 0.8843618900771493, + "learning_rate": 3.850611485318696e-06, + "loss": 0.1526, + "step": 3825 + }, + { + "epoch": 0.3525130142350394, + "grad_norm": 0.9384375911161207, + "learning_rate": 3.8499703761979276e-06, + "loss": 0.1504, + "step": 3826 + }, + { + "epoch": 0.3526051504123094, + "grad_norm": 0.8748805623679292, + "learning_rate": 3.849329141732263e-06, + "loss": 0.1539, + "step": 3827 + }, + { + "epoch": 0.3526972865895794, + "grad_norm": 0.8627817633209965, + "learning_rate": 3.848687781981243e-06, + "loss": 0.1552, + "step": 3828 + }, + { + "epoch": 0.3527894227668494, + "grad_norm": 0.9059527689927666, + "learning_rate": 3.848046297004417e-06, + "loss": 0.1447, + "step": 3829 + }, + { + "epoch": 0.35288155894411943, + "grad_norm": 0.8805324891957399, + "learning_rate": 3.847404686861348e-06, + "loss": 0.1439, + "step": 3830 + }, + { + "epoch": 0.35297369512138943, + "grad_norm": 0.9344084024801386, + "learning_rate": 3.846762951611608e-06, + "loss": 0.1552, + "step": 3831 + }, + { + "epoch": 0.3530658312986594, + "grad_norm": 0.9363214449601064, + "learning_rate": 3.846121091314783e-06, + "loss": 0.1545, + "step": 3832 + }, + { + "epoch": 0.3531579674759294, + "grad_norm": 0.8539977014512892, + "learning_rate": 3.84547910603047e-06, + "loss": 0.1405, + "step": 3833 + }, + { + "epoch": 0.3532501036531994, + "grad_norm": 0.9995946565004439, + "learning_rate": 3.8448369958182775e-06, + "loss": 0.1757, + "step": 3834 + }, + { + "epoch": 0.3533422398304694, + "grad_norm": 0.9308525526025416, + "learning_rate": 3.844194760737825e-06, + "loss": 0.163, + "step": 3835 + }, + { + "epoch": 0.3534343760077394, + "grad_norm": 0.8951960585707758, + "learning_rate": 3.843552400848744e-06, + "loss": 0.1492, + "step": 3836 + }, + { + "epoch": 0.35352651218500947, + "grad_norm": 0.8647013473773321, + "learning_rate": 3.842909916210678e-06, + "loss": 0.143, + "step": 3837 + }, + { + "epoch": 0.35361864836227946, + "grad_norm": 0.9713686917245324, + "learning_rate": 3.842267306883283e-06, + "loss": 0.1589, + "step": 3838 + }, + { + "epoch": 0.35371078453954946, + "grad_norm": 0.8925126993308895, + "learning_rate": 3.8416245729262225e-06, + "loss": 0.1415, + "step": 3839 + }, + { + "epoch": 0.35380292071681946, + "grad_norm": 1.0428020798287154, + "learning_rate": 3.840981714399177e-06, + "loss": 0.1517, + "step": 3840 + }, + { + "epoch": 0.35389505689408945, + "grad_norm": 0.9157111788231413, + "learning_rate": 3.840338731361834e-06, + "loss": 0.1494, + "step": 3841 + }, + { + "epoch": 0.35398719307135945, + "grad_norm": 0.9316243217330057, + "learning_rate": 3.839695623873896e-06, + "loss": 0.1558, + "step": 3842 + }, + { + "epoch": 0.35407932924862945, + "grad_norm": 0.9232650712470144, + "learning_rate": 3.839052391995076e-06, + "loss": 0.1542, + "step": 3843 + }, + { + "epoch": 0.3541714654258995, + "grad_norm": 0.8754883534686446, + "learning_rate": 3.8384090357850964e-06, + "loss": 0.1544, + "step": 3844 + }, + { + "epoch": 0.3542636016031695, + "grad_norm": 0.9115494168855587, + "learning_rate": 3.837765555303694e-06, + "loss": 0.1538, + "step": 3845 + }, + { + "epoch": 0.3543557377804395, + "grad_norm": 0.8999621586897121, + "learning_rate": 3.837121950610616e-06, + "loss": 0.1479, + "step": 3846 + }, + { + "epoch": 0.3544478739577095, + "grad_norm": 0.8276305518580209, + "learning_rate": 3.8364782217656205e-06, + "loss": 0.1324, + "step": 3847 + }, + { + "epoch": 0.3545400101349795, + "grad_norm": 0.9024826007607681, + "learning_rate": 3.835834368828479e-06, + "loss": 0.1557, + "step": 3848 + }, + { + "epoch": 0.3546321463122495, + "grad_norm": 0.9065730873524084, + "learning_rate": 3.835190391858972e-06, + "loss": 0.1607, + "step": 3849 + }, + { + "epoch": 0.3547242824895195, + "grad_norm": 0.8661100603350929, + "learning_rate": 3.834546290916893e-06, + "loss": 0.1565, + "step": 3850 + }, + { + "epoch": 0.35481641866678953, + "grad_norm": 0.8664911463390288, + "learning_rate": 3.833902066062049e-06, + "loss": 0.14, + "step": 3851 + }, + { + "epoch": 0.35490855484405953, + "grad_norm": 0.9297572513132248, + "learning_rate": 3.833257717354253e-06, + "loss": 0.162, + "step": 3852 + }, + { + "epoch": 0.35500069102132953, + "grad_norm": 0.9334632150840317, + "learning_rate": 3.832613244853335e-06, + "loss": 0.1549, + "step": 3853 + }, + { + "epoch": 0.3550928271985995, + "grad_norm": 0.9117192881469401, + "learning_rate": 3.831968648619133e-06, + "loss": 0.1643, + "step": 3854 + }, + { + "epoch": 0.3551849633758695, + "grad_norm": 0.8679690659313138, + "learning_rate": 3.8313239287115e-06, + "loss": 0.148, + "step": 3855 + }, + { + "epoch": 0.3552770995531395, + "grad_norm": 0.9554686130719982, + "learning_rate": 3.830679085190296e-06, + "loss": 0.1585, + "step": 3856 + }, + { + "epoch": 0.35536923573040957, + "grad_norm": 0.9674925680074199, + "learning_rate": 3.830034118115396e-06, + "loss": 0.1582, + "step": 3857 + }, + { + "epoch": 0.35546137190767957, + "grad_norm": 0.8961305393810277, + "learning_rate": 3.829389027546685e-06, + "loss": 0.1437, + "step": 3858 + }, + { + "epoch": 0.35555350808494957, + "grad_norm": 0.8747439392004103, + "learning_rate": 3.828743813544059e-06, + "loss": 0.1432, + "step": 3859 + }, + { + "epoch": 0.35564564426221956, + "grad_norm": 0.9511837265747041, + "learning_rate": 3.8280984761674286e-06, + "loss": 0.1554, + "step": 3860 + }, + { + "epoch": 0.35573778043948956, + "grad_norm": 0.9271027409618041, + "learning_rate": 3.82745301547671e-06, + "loss": 0.1604, + "step": 3861 + }, + { + "epoch": 0.35582991661675956, + "grad_norm": 0.8952219446892987, + "learning_rate": 3.8268074315318375e-06, + "loss": 0.1585, + "step": 3862 + }, + { + "epoch": 0.35592205279402955, + "grad_norm": 0.8927757851183054, + "learning_rate": 3.826161724392751e-06, + "loss": 0.142, + "step": 3863 + }, + { + "epoch": 0.3560141889712996, + "grad_norm": 0.8424179319437527, + "learning_rate": 3.8255158941194066e-06, + "loss": 0.1413, + "step": 3864 + }, + { + "epoch": 0.3561063251485696, + "grad_norm": 0.8740217468806033, + "learning_rate": 3.824869940771768e-06, + "loss": 0.1448, + "step": 3865 + }, + { + "epoch": 0.3561984613258396, + "grad_norm": 0.8540961364919858, + "learning_rate": 3.824223864409813e-06, + "loss": 0.1508, + "step": 3866 + }, + { + "epoch": 0.3562905975031096, + "grad_norm": 0.883177257480543, + "learning_rate": 3.823577665093529e-06, + "loss": 0.1583, + "step": 3867 + }, + { + "epoch": 0.3563827336803796, + "grad_norm": 0.8970386523940295, + "learning_rate": 3.822931342882918e-06, + "loss": 0.154, + "step": 3868 + }, + { + "epoch": 0.3564748698576496, + "grad_norm": 0.9148582364371604, + "learning_rate": 3.822284897837989e-06, + "loss": 0.1574, + "step": 3869 + }, + { + "epoch": 0.3565670060349196, + "grad_norm": 0.9835888081454008, + "learning_rate": 3.821638330018764e-06, + "loss": 0.1613, + "step": 3870 + }, + { + "epoch": 0.35665914221218964, + "grad_norm": 0.9487618778033893, + "learning_rate": 3.820991639485279e-06, + "loss": 0.15, + "step": 3871 + }, + { + "epoch": 0.35675127838945964, + "grad_norm": 0.9581745160471137, + "learning_rate": 3.820344826297577e-06, + "loss": 0.1482, + "step": 3872 + }, + { + "epoch": 0.35684341456672963, + "grad_norm": 0.9939597183184489, + "learning_rate": 3.819697890515717e-06, + "loss": 0.1741, + "step": 3873 + }, + { + "epoch": 0.35693555074399963, + "grad_norm": 0.9341386337680994, + "learning_rate": 3.819050832199766e-06, + "loss": 0.1534, + "step": 3874 + }, + { + "epoch": 0.3570276869212696, + "grad_norm": 0.9627433592601693, + "learning_rate": 3.818403651409801e-06, + "loss": 0.166, + "step": 3875 + }, + { + "epoch": 0.3571198230985396, + "grad_norm": 0.8966016704437211, + "learning_rate": 3.817756348205917e-06, + "loss": 0.1534, + "step": 3876 + }, + { + "epoch": 0.3572119592758096, + "grad_norm": 0.831950783650535, + "learning_rate": 3.817108922648214e-06, + "loss": 0.1479, + "step": 3877 + }, + { + "epoch": 0.3573040954530797, + "grad_norm": 0.9242438801830429, + "learning_rate": 3.816461374796805e-06, + "loss": 0.1581, + "step": 3878 + }, + { + "epoch": 0.35739623163034967, + "grad_norm": 0.9027636442769159, + "learning_rate": 3.815813704711816e-06, + "loss": 0.1535, + "step": 3879 + }, + { + "epoch": 0.35748836780761967, + "grad_norm": 0.796082355005034, + "learning_rate": 3.815165912453383e-06, + "loss": 0.1377, + "step": 3880 + }, + { + "epoch": 0.35758050398488966, + "grad_norm": 0.9768643954402325, + "learning_rate": 3.814517998081654e-06, + "loss": 0.1672, + "step": 3881 + }, + { + "epoch": 0.35767264016215966, + "grad_norm": 0.924729896863386, + "learning_rate": 3.8138699616567875e-06, + "loss": 0.1619, + "step": 3882 + }, + { + "epoch": 0.35776477633942966, + "grad_norm": 0.8615406672854165, + "learning_rate": 3.8132218032389524e-06, + "loss": 0.1438, + "step": 3883 + }, + { + "epoch": 0.35785691251669965, + "grad_norm": 0.8715405718747457, + "learning_rate": 3.812573522888332e-06, + "loss": 0.1516, + "step": 3884 + }, + { + "epoch": 0.3579490486939697, + "grad_norm": 0.9561787493207595, + "learning_rate": 3.81192512066512e-06, + "loss": 0.1721, + "step": 3885 + }, + { + "epoch": 0.3580411848712397, + "grad_norm": 0.8752491679156089, + "learning_rate": 3.811276596629518e-06, + "loss": 0.1502, + "step": 3886 + }, + { + "epoch": 0.3581333210485097, + "grad_norm": 0.9660899512247874, + "learning_rate": 3.810627950841743e-06, + "loss": 0.1586, + "step": 3887 + }, + { + "epoch": 0.3582254572257797, + "grad_norm": 0.9448658827634945, + "learning_rate": 3.8099791833620214e-06, + "loss": 0.1533, + "step": 3888 + }, + { + "epoch": 0.3583175934030497, + "grad_norm": 0.9004121487474659, + "learning_rate": 3.8093302942505935e-06, + "loss": 0.1585, + "step": 3889 + }, + { + "epoch": 0.3584097295803197, + "grad_norm": 0.906632158147621, + "learning_rate": 3.8086812835677044e-06, + "loss": 0.1624, + "step": 3890 + }, + { + "epoch": 0.35850186575758974, + "grad_norm": 0.9183195616043317, + "learning_rate": 3.808032151373619e-06, + "loss": 0.1618, + "step": 3891 + }, + { + "epoch": 0.35859400193485974, + "grad_norm": 0.8553552558472457, + "learning_rate": 3.807382897728607e-06, + "loss": 0.1444, + "step": 3892 + }, + { + "epoch": 0.35868613811212974, + "grad_norm": 0.902571763244677, + "learning_rate": 3.8067335226929523e-06, + "loss": 0.1554, + "step": 3893 + }, + { + "epoch": 0.35877827428939973, + "grad_norm": 0.8831422504068266, + "learning_rate": 3.8060840263269494e-06, + "loss": 0.1553, + "step": 3894 + }, + { + "epoch": 0.35887041046666973, + "grad_norm": 0.8695419644636682, + "learning_rate": 3.8054344086909043e-06, + "loss": 0.1539, + "step": 3895 + }, + { + "epoch": 0.35896254664393973, + "grad_norm": 0.8634127120614579, + "learning_rate": 3.804784669845133e-06, + "loss": 0.1438, + "step": 3896 + }, + { + "epoch": 0.3590546828212097, + "grad_norm": 0.8608209569312256, + "learning_rate": 3.8041348098499655e-06, + "loss": 0.1397, + "step": 3897 + }, + { + "epoch": 0.3591468189984798, + "grad_norm": 0.9012937821304821, + "learning_rate": 3.8034848287657403e-06, + "loss": 0.153, + "step": 3898 + }, + { + "epoch": 0.3592389551757498, + "grad_norm": 0.913635814588998, + "learning_rate": 3.802834726652809e-06, + "loss": 0.1583, + "step": 3899 + }, + { + "epoch": 0.35933109135301977, + "grad_norm": 0.9246153282899159, + "learning_rate": 3.802184503571532e-06, + "loss": 0.1508, + "step": 3900 + }, + { + "epoch": 0.35942322753028977, + "grad_norm": 0.9563799822615789, + "learning_rate": 3.801534159582285e-06, + "loss": 0.1728, + "step": 3901 + }, + { + "epoch": 0.35951536370755977, + "grad_norm": 0.8893729611997498, + "learning_rate": 3.80088369474545e-06, + "loss": 0.1571, + "step": 3902 + }, + { + "epoch": 0.35960749988482976, + "grad_norm": 0.8977935109197466, + "learning_rate": 3.800233109121425e-06, + "loss": 0.1395, + "step": 3903 + }, + { + "epoch": 0.35969963606209976, + "grad_norm": 0.9174627801219775, + "learning_rate": 3.7995824027706152e-06, + "loss": 0.1475, + "step": 3904 + }, + { + "epoch": 0.3597917722393698, + "grad_norm": 0.9695866259137063, + "learning_rate": 3.7989315757534397e-06, + "loss": 0.1549, + "step": 3905 + }, + { + "epoch": 0.3598839084166398, + "grad_norm": 0.9840473979899322, + "learning_rate": 3.7982806281303276e-06, + "loss": 0.1672, + "step": 3906 + }, + { + "epoch": 0.3599760445939098, + "grad_norm": 0.8725606551036027, + "learning_rate": 3.797629559961719e-06, + "loss": 0.1436, + "step": 3907 + }, + { + "epoch": 0.3600681807711798, + "grad_norm": 0.8950596885234003, + "learning_rate": 3.7969783713080665e-06, + "loss": 0.1595, + "step": 3908 + }, + { + "epoch": 0.3601603169484498, + "grad_norm": 0.8905525328504984, + "learning_rate": 3.796327062229833e-06, + "loss": 0.1572, + "step": 3909 + }, + { + "epoch": 0.3602524531257198, + "grad_norm": 0.8923575257402443, + "learning_rate": 3.7956756327874912e-06, + "loss": 0.1623, + "step": 3910 + }, + { + "epoch": 0.3603445893029898, + "grad_norm": 0.9554499080768074, + "learning_rate": 3.7950240830415286e-06, + "loss": 0.1594, + "step": 3911 + }, + { + "epoch": 0.36043672548025985, + "grad_norm": 0.9192802900436988, + "learning_rate": 3.79437241305244e-06, + "loss": 0.157, + "step": 3912 + }, + { + "epoch": 0.36052886165752984, + "grad_norm": 0.8913340818064148, + "learning_rate": 3.7937206228807333e-06, + "loss": 0.1491, + "step": 3913 + }, + { + "epoch": 0.36062099783479984, + "grad_norm": 0.8862949193565367, + "learning_rate": 3.793068712586928e-06, + "loss": 0.1456, + "step": 3914 + }, + { + "epoch": 0.36071313401206984, + "grad_norm": 0.9419844735143882, + "learning_rate": 3.7924166822315535e-06, + "loss": 0.1605, + "step": 3915 + }, + { + "epoch": 0.36080527018933983, + "grad_norm": 0.9360333851589002, + "learning_rate": 3.791764531875151e-06, + "loss": 0.1638, + "step": 3916 + }, + { + "epoch": 0.36089740636660983, + "grad_norm": 0.8885011416795211, + "learning_rate": 3.7911122615782727e-06, + "loss": 0.1557, + "step": 3917 + }, + { + "epoch": 0.3609895425438799, + "grad_norm": 0.890939735479181, + "learning_rate": 3.790459871401482e-06, + "loss": 0.1624, + "step": 3918 + }, + { + "epoch": 0.3610816787211499, + "grad_norm": 0.8438507583767467, + "learning_rate": 3.7898073614053527e-06, + "loss": 0.1455, + "step": 3919 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 0.9064039983475835, + "learning_rate": 3.7891547316504716e-06, + "loss": 0.146, + "step": 3920 + }, + { + "epoch": 0.3612659510756899, + "grad_norm": 0.8475645530494015, + "learning_rate": 3.788501982197435e-06, + "loss": 0.1392, + "step": 3921 + }, + { + "epoch": 0.36135808725295987, + "grad_norm": 0.9057176590949076, + "learning_rate": 3.787849113106851e-06, + "loss": 0.159, + "step": 3922 + }, + { + "epoch": 0.36145022343022987, + "grad_norm": 0.9600989104067679, + "learning_rate": 3.787196124439337e-06, + "loss": 0.1694, + "step": 3923 + }, + { + "epoch": 0.36154235960749986, + "grad_norm": 0.9044900537237868, + "learning_rate": 3.7865430162555255e-06, + "loss": 0.1639, + "step": 3924 + }, + { + "epoch": 0.3616344957847699, + "grad_norm": 0.8508738273375758, + "learning_rate": 3.7858897886160562e-06, + "loss": 0.1453, + "step": 3925 + }, + { + "epoch": 0.3617266319620399, + "grad_norm": 0.9620587536792758, + "learning_rate": 3.785236441581581e-06, + "loss": 0.1674, + "step": 3926 + }, + { + "epoch": 0.3618187681393099, + "grad_norm": 0.8942864413844239, + "learning_rate": 3.784582975212765e-06, + "loss": 0.1456, + "step": 3927 + }, + { + "epoch": 0.3619109043165799, + "grad_norm": 0.840135666464938, + "learning_rate": 3.783929389570281e-06, + "loss": 0.1455, + "step": 3928 + }, + { + "epoch": 0.3620030404938499, + "grad_norm": 0.9114590346869866, + "learning_rate": 3.7832756847148146e-06, + "loss": 0.1616, + "step": 3929 + }, + { + "epoch": 0.3620951766711199, + "grad_norm": 0.983234958010798, + "learning_rate": 3.782621860707063e-06, + "loss": 0.1725, + "step": 3930 + }, + { + "epoch": 0.3621873128483899, + "grad_norm": 0.8225486926457173, + "learning_rate": 3.781967917607734e-06, + "loss": 0.1351, + "step": 3931 + }, + { + "epoch": 0.36227944902565995, + "grad_norm": 0.8813300984579409, + "learning_rate": 3.7813138554775454e-06, + "loss": 0.1477, + "step": 3932 + }, + { + "epoch": 0.36237158520292995, + "grad_norm": 0.8821804198238675, + "learning_rate": 3.780659674377227e-06, + "loss": 0.1596, + "step": 3933 + }, + { + "epoch": 0.36246372138019994, + "grad_norm": 0.8722528057807815, + "learning_rate": 3.7800053743675213e-06, + "loss": 0.1419, + "step": 3934 + }, + { + "epoch": 0.36255585755746994, + "grad_norm": 0.9010612330891158, + "learning_rate": 3.779350955509178e-06, + "loss": 0.1565, + "step": 3935 + }, + { + "epoch": 0.36264799373473994, + "grad_norm": 0.8503750455245771, + "learning_rate": 3.7786964178629613e-06, + "loss": 0.1398, + "step": 3936 + }, + { + "epoch": 0.36274012991200993, + "grad_norm": 0.8822053261660605, + "learning_rate": 3.7780417614896438e-06, + "loss": 0.1573, + "step": 3937 + }, + { + "epoch": 0.36283226608927993, + "grad_norm": 0.895341082099772, + "learning_rate": 3.777386986450012e-06, + "loss": 0.1549, + "step": 3938 + }, + { + "epoch": 0.36292440226655, + "grad_norm": 0.8502920032613639, + "learning_rate": 3.77673209280486e-06, + "loss": 0.1537, + "step": 3939 + }, + { + "epoch": 0.36301653844382, + "grad_norm": 0.904481467548156, + "learning_rate": 3.776077080614997e-06, + "loss": 0.14, + "step": 3940 + }, + { + "epoch": 0.36310867462109, + "grad_norm": 0.850345613001194, + "learning_rate": 3.7754219499412393e-06, + "loss": 0.1416, + "step": 3941 + }, + { + "epoch": 0.36320081079836, + "grad_norm": 0.896357927903997, + "learning_rate": 3.7747667008444154e-06, + "loss": 0.1442, + "step": 3942 + }, + { + "epoch": 0.36329294697562997, + "grad_norm": 0.9476496642343656, + "learning_rate": 3.7741113333853673e-06, + "loss": 0.1625, + "step": 3943 + }, + { + "epoch": 0.36338508315289997, + "grad_norm": 0.9442071990268597, + "learning_rate": 3.773455847624944e-06, + "loss": 0.1602, + "step": 3944 + }, + { + "epoch": 0.36347721933016996, + "grad_norm": 0.8869444492762905, + "learning_rate": 3.7728002436240086e-06, + "loss": 0.1449, + "step": 3945 + }, + { + "epoch": 0.36356935550744, + "grad_norm": 0.8691908645382854, + "learning_rate": 3.772144521443434e-06, + "loss": 0.1466, + "step": 3946 + }, + { + "epoch": 0.36366149168471, + "grad_norm": 0.8722709796332478, + "learning_rate": 3.7714886811441033e-06, + "loss": 0.1544, + "step": 3947 + }, + { + "epoch": 0.36375362786198, + "grad_norm": 0.9781950813418121, + "learning_rate": 3.7708327227869113e-06, + "loss": 0.1619, + "step": 3948 + }, + { + "epoch": 0.36384576403925, + "grad_norm": 0.9741482528206916, + "learning_rate": 3.770176646432765e-06, + "loss": 0.1611, + "step": 3949 + }, + { + "epoch": 0.36393790021652, + "grad_norm": 0.9059945224642333, + "learning_rate": 3.76952045214258e-06, + "loss": 0.1474, + "step": 3950 + }, + { + "epoch": 0.36403003639379, + "grad_norm": 0.8904357001863795, + "learning_rate": 3.7688641399772842e-06, + "loss": 0.1556, + "step": 3951 + }, + { + "epoch": 0.36412217257106005, + "grad_norm": 0.9321752585797721, + "learning_rate": 3.7682077099978163e-06, + "loss": 0.1646, + "step": 3952 + }, + { + "epoch": 0.36421430874833005, + "grad_norm": 0.9378488103126844, + "learning_rate": 3.767551162265126e-06, + "loss": 0.1535, + "step": 3953 + }, + { + "epoch": 0.36430644492560005, + "grad_norm": 0.9011582010590111, + "learning_rate": 3.7668944968401743e-06, + "loss": 0.1544, + "step": 3954 + }, + { + "epoch": 0.36439858110287005, + "grad_norm": 0.8790970485023327, + "learning_rate": 3.7662377137839323e-06, + "loss": 0.1442, + "step": 3955 + }, + { + "epoch": 0.36449071728014004, + "grad_norm": 0.908621835636859, + "learning_rate": 3.7655808131573823e-06, + "loss": 0.1511, + "step": 3956 + }, + { + "epoch": 0.36458285345741004, + "grad_norm": 0.8903756339669455, + "learning_rate": 3.7649237950215178e-06, + "loss": 0.1387, + "step": 3957 + }, + { + "epoch": 0.36467498963468004, + "grad_norm": 1.0374094432276606, + "learning_rate": 3.764266659437342e-06, + "loss": 0.1752, + "step": 3958 + }, + { + "epoch": 0.3647671258119501, + "grad_norm": 0.8597914758339518, + "learning_rate": 3.763609406465872e-06, + "loss": 0.1442, + "step": 3959 + }, + { + "epoch": 0.3648592619892201, + "grad_norm": 0.9149822828048013, + "learning_rate": 3.7629520361681317e-06, + "loss": 0.1613, + "step": 3960 + }, + { + "epoch": 0.3649513981664901, + "grad_norm": 0.9028396433291438, + "learning_rate": 3.7622945486051585e-06, + "loss": 0.1412, + "step": 3961 + }, + { + "epoch": 0.3650435343437601, + "grad_norm": 0.9537831433092164, + "learning_rate": 3.7616369438380014e-06, + "loss": 0.1592, + "step": 3962 + }, + { + "epoch": 0.3651356705210301, + "grad_norm": 0.9595063973112449, + "learning_rate": 3.760979221927718e-06, + "loss": 0.1501, + "step": 3963 + }, + { + "epoch": 0.3652278066983001, + "grad_norm": 0.9440368618491344, + "learning_rate": 3.760321382935378e-06, + "loss": 0.1633, + "step": 3964 + }, + { + "epoch": 0.36531994287557007, + "grad_norm": 0.8716304492059451, + "learning_rate": 3.759663426922062e-06, + "loss": 0.1406, + "step": 3965 + }, + { + "epoch": 0.3654120790528401, + "grad_norm": 0.9317503191937407, + "learning_rate": 3.7590053539488613e-06, + "loss": 0.1573, + "step": 3966 + }, + { + "epoch": 0.3655042152301101, + "grad_norm": 0.8833226489364178, + "learning_rate": 3.758347164076879e-06, + "loss": 0.1561, + "step": 3967 + }, + { + "epoch": 0.3655963514073801, + "grad_norm": 0.9745417396103699, + "learning_rate": 3.7576888573672254e-06, + "loss": 0.1592, + "step": 3968 + }, + { + "epoch": 0.3656884875846501, + "grad_norm": 0.9530059653507331, + "learning_rate": 3.757030433881027e-06, + "loss": 0.1468, + "step": 3969 + }, + { + "epoch": 0.3657806237619201, + "grad_norm": 0.9366752553028688, + "learning_rate": 3.7563718936794176e-06, + "loss": 0.1567, + "step": 3970 + }, + { + "epoch": 0.3658727599391901, + "grad_norm": 0.9622765132112847, + "learning_rate": 3.755713236823542e-06, + "loss": 0.1589, + "step": 3971 + }, + { + "epoch": 0.3659648961164601, + "grad_norm": 0.8902165328288483, + "learning_rate": 3.755054463374558e-06, + "loss": 0.1525, + "step": 3972 + }, + { + "epoch": 0.36605703229373016, + "grad_norm": 0.9688432772053249, + "learning_rate": 3.754395573393631e-06, + "loss": 0.1728, + "step": 3973 + }, + { + "epoch": 0.36614916847100015, + "grad_norm": 0.9026315201949281, + "learning_rate": 3.7537365669419413e-06, + "loss": 0.1518, + "step": 3974 + }, + { + "epoch": 0.36624130464827015, + "grad_norm": 0.8789867884729393, + "learning_rate": 3.7530774440806757e-06, + "loss": 0.1473, + "step": 3975 + }, + { + "epoch": 0.36633344082554015, + "grad_norm": 0.8515286072012973, + "learning_rate": 3.7524182048710343e-06, + "loss": 0.1439, + "step": 3976 + }, + { + "epoch": 0.36642557700281014, + "grad_norm": 0.8828903306684278, + "learning_rate": 3.751758849374228e-06, + "loss": 0.1413, + "step": 3977 + }, + { + "epoch": 0.36651771318008014, + "grad_norm": 0.884530916429444, + "learning_rate": 3.7510993776514786e-06, + "loss": 0.1513, + "step": 3978 + }, + { + "epoch": 0.36660984935735014, + "grad_norm": 0.8848921511295754, + "learning_rate": 3.7504397897640165e-06, + "loss": 0.1499, + "step": 3979 + }, + { + "epoch": 0.3667019855346202, + "grad_norm": 0.9206901760475932, + "learning_rate": 3.7497800857730854e-06, + "loss": 0.1526, + "step": 3980 + }, + { + "epoch": 0.3667941217118902, + "grad_norm": 0.8989347990602119, + "learning_rate": 3.749120265739939e-06, + "loss": 0.1416, + "step": 3981 + }, + { + "epoch": 0.3668862578891602, + "grad_norm": 0.9478265392367967, + "learning_rate": 3.7484603297258413e-06, + "loss": 0.1526, + "step": 3982 + }, + { + "epoch": 0.3669783940664302, + "grad_norm": 0.9804580570371507, + "learning_rate": 3.747800277792068e-06, + "loss": 0.1547, + "step": 3983 + }, + { + "epoch": 0.3670705302437002, + "grad_norm": 0.9347412913063416, + "learning_rate": 3.7471401099999044e-06, + "loss": 0.152, + "step": 3984 + }, + { + "epoch": 0.3671626664209702, + "grad_norm": 0.9714621448950228, + "learning_rate": 3.7464798264106474e-06, + "loss": 0.1546, + "step": 3985 + }, + { + "epoch": 0.3672548025982402, + "grad_norm": 0.9361573736104657, + "learning_rate": 3.7458194270856046e-06, + "loss": 0.1496, + "step": 3986 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 0.9435489839258917, + "learning_rate": 3.745158912086093e-06, + "loss": 0.166, + "step": 3987 + }, + { + "epoch": 0.3674390749527802, + "grad_norm": 0.9511965958836167, + "learning_rate": 3.744498281473443e-06, + "loss": 0.1558, + "step": 3988 + }, + { + "epoch": 0.3675312111300502, + "grad_norm": 0.8979980364324908, + "learning_rate": 3.743837535308994e-06, + "loss": 0.1401, + "step": 3989 + }, + { + "epoch": 0.3676233473073202, + "grad_norm": 0.9236641015351937, + "learning_rate": 3.7431766736540958e-06, + "loss": 0.1482, + "step": 3990 + }, + { + "epoch": 0.3677154834845902, + "grad_norm": 0.9291040410569116, + "learning_rate": 3.74251569657011e-06, + "loss": 0.1511, + "step": 3991 + }, + { + "epoch": 0.3678076196618602, + "grad_norm": 1.014189124142894, + "learning_rate": 3.7418546041184074e-06, + "loss": 0.1467, + "step": 3992 + }, + { + "epoch": 0.36789975583913026, + "grad_norm": 0.9383118379868695, + "learning_rate": 3.7411933963603706e-06, + "loss": 0.1655, + "step": 3993 + }, + { + "epoch": 0.36799189201640026, + "grad_norm": 0.8876646385129865, + "learning_rate": 3.7405320733573948e-06, + "loss": 0.1433, + "step": 3994 + }, + { + "epoch": 0.36808402819367025, + "grad_norm": 0.9951541805502094, + "learning_rate": 3.739870635170881e-06, + "loss": 0.1674, + "step": 3995 + }, + { + "epoch": 0.36817616437094025, + "grad_norm": 0.9346916827824021, + "learning_rate": 3.739209081862247e-06, + "loss": 0.1408, + "step": 3996 + }, + { + "epoch": 0.36826830054821025, + "grad_norm": 0.9003306272523681, + "learning_rate": 3.738547413492916e-06, + "loss": 0.1445, + "step": 3997 + }, + { + "epoch": 0.36836043672548024, + "grad_norm": 0.9747753339711037, + "learning_rate": 3.7378856301243233e-06, + "loss": 0.1558, + "step": 3998 + }, + { + "epoch": 0.36845257290275024, + "grad_norm": 0.9296953912478665, + "learning_rate": 3.7372237318179172e-06, + "loss": 0.1533, + "step": 3999 + }, + { + "epoch": 0.3685447090800203, + "grad_norm": 0.886098729035054, + "learning_rate": 3.7365617186351538e-06, + "loss": 0.1403, + "step": 4000 + }, + { + "epoch": 0.3685447090800203, + "eval_loss": 0.15275675058364868, + "eval_runtime": 299.8799, + "eval_samples_per_second": 23.399, + "eval_steps_per_second": 2.928, + "step": 4000 + }, + { + "epoch": 0.3686368452572903, + "grad_norm": 0.9026680591810774, + "learning_rate": 3.735899590637503e-06, + "loss": 0.1374, + "step": 4001 + }, + { + "epoch": 0.3687289814345603, + "grad_norm": 0.9006265105747577, + "learning_rate": 3.735237347886441e-06, + "loss": 0.141, + "step": 4002 + }, + { + "epoch": 0.3688211176118303, + "grad_norm": 0.9632222935393651, + "learning_rate": 3.7345749904434593e-06, + "loss": 0.1463, + "step": 4003 + }, + { + "epoch": 0.3689132537891003, + "grad_norm": 0.8573673467809124, + "learning_rate": 3.733912518370056e-06, + "loss": 0.1416, + "step": 4004 + }, + { + "epoch": 0.3690053899663703, + "grad_norm": 0.9167873217872394, + "learning_rate": 3.7332499317277432e-06, + "loss": 0.1514, + "step": 4005 + }, + { + "epoch": 0.3690975261436403, + "grad_norm": 0.9053492707952349, + "learning_rate": 3.732587230578041e-06, + "loss": 0.1498, + "step": 4006 + }, + { + "epoch": 0.36918966232091033, + "grad_norm": 0.9514143436948048, + "learning_rate": 3.7319244149824825e-06, + "loss": 0.1502, + "step": 4007 + }, + { + "epoch": 0.3692817984981803, + "grad_norm": 0.9243371986561314, + "learning_rate": 3.7312614850026086e-06, + "loss": 0.1647, + "step": 4008 + }, + { + "epoch": 0.3693739346754503, + "grad_norm": 0.9274365702949716, + "learning_rate": 3.730598440699974e-06, + "loss": 0.1508, + "step": 4009 + }, + { + "epoch": 0.3694660708527203, + "grad_norm": 0.8933690616443185, + "learning_rate": 3.729935282136142e-06, + "loss": 0.1527, + "step": 4010 + }, + { + "epoch": 0.3695582070299903, + "grad_norm": 0.9779518506144268, + "learning_rate": 3.729272009372686e-06, + "loss": 0.1528, + "step": 4011 + }, + { + "epoch": 0.3696503432072603, + "grad_norm": 0.9370281031601337, + "learning_rate": 3.7286086224711916e-06, + "loss": 0.1455, + "step": 4012 + }, + { + "epoch": 0.3697424793845303, + "grad_norm": 0.8777378271793664, + "learning_rate": 3.727945121493255e-06, + "loss": 0.1496, + "step": 4013 + }, + { + "epoch": 0.36983461556180036, + "grad_norm": 0.9393712387806363, + "learning_rate": 3.7272815065004808e-06, + "loss": 0.1641, + "step": 4014 + }, + { + "epoch": 0.36992675173907036, + "grad_norm": 1.0039688517917074, + "learning_rate": 3.7266177775544877e-06, + "loss": 0.1511, + "step": 4015 + }, + { + "epoch": 0.37001888791634036, + "grad_norm": 0.8739356202824567, + "learning_rate": 3.7259539347169015e-06, + "loss": 0.1371, + "step": 4016 + }, + { + "epoch": 0.37011102409361035, + "grad_norm": 0.888356334870733, + "learning_rate": 3.72528997804936e-06, + "loss": 0.1519, + "step": 4017 + }, + { + "epoch": 0.37020316027088035, + "grad_norm": 0.993257143098972, + "learning_rate": 3.724625907613513e-06, + "loss": 0.1565, + "step": 4018 + }, + { + "epoch": 0.37029529644815035, + "grad_norm": 0.9754754956762344, + "learning_rate": 3.7239617234710185e-06, + "loss": 0.1413, + "step": 4019 + }, + { + "epoch": 0.3703874326254204, + "grad_norm": 0.843187808737617, + "learning_rate": 3.7232974256835457e-06, + "loss": 0.1433, + "step": 4020 + }, + { + "epoch": 0.3704795688026904, + "grad_norm": 0.8997900277623461, + "learning_rate": 3.7226330143127765e-06, + "loss": 0.1468, + "step": 4021 + }, + { + "epoch": 0.3705717049799604, + "grad_norm": 0.9570942103840279, + "learning_rate": 3.721968489420399e-06, + "loss": 0.1358, + "step": 4022 + }, + { + "epoch": 0.3706638411572304, + "grad_norm": 0.9574979970822548, + "learning_rate": 3.721303851068116e-06, + "loss": 0.1602, + "step": 4023 + }, + { + "epoch": 0.3707559773345004, + "grad_norm": 0.8884651421400183, + "learning_rate": 3.7206390993176395e-06, + "loss": 0.1414, + "step": 4024 + }, + { + "epoch": 0.3708481135117704, + "grad_norm": 0.9861288907351328, + "learning_rate": 3.719974234230691e-06, + "loss": 0.1542, + "step": 4025 + }, + { + "epoch": 0.3709402496890404, + "grad_norm": 1.0008506973241864, + "learning_rate": 3.7193092558690036e-06, + "loss": 0.1479, + "step": 4026 + }, + { + "epoch": 0.37103238586631043, + "grad_norm": 0.9941006684988769, + "learning_rate": 3.7186441642943206e-06, + "loss": 0.1483, + "step": 4027 + }, + { + "epoch": 0.37112452204358043, + "grad_norm": 0.9438400436931483, + "learning_rate": 3.7179789595683954e-06, + "loss": 0.1474, + "step": 4028 + }, + { + "epoch": 0.3712166582208504, + "grad_norm": 0.9935959286704832, + "learning_rate": 3.717313641752993e-06, + "loss": 0.147, + "step": 4029 + }, + { + "epoch": 0.3713087943981204, + "grad_norm": 1.011789007873907, + "learning_rate": 3.7166482109098878e-06, + "loss": 0.1558, + "step": 4030 + }, + { + "epoch": 0.3714009305753904, + "grad_norm": 0.9405959335230354, + "learning_rate": 3.715982667100866e-06, + "loss": 0.1637, + "step": 4031 + }, + { + "epoch": 0.3714930667526604, + "grad_norm": 0.909937263307173, + "learning_rate": 3.7153170103877216e-06, + "loss": 0.1431, + "step": 4032 + }, + { + "epoch": 0.3715852029299304, + "grad_norm": 0.9554612213688228, + "learning_rate": 3.7146512408322623e-06, + "loss": 0.1652, + "step": 4033 + }, + { + "epoch": 0.37167733910720047, + "grad_norm": 1.0065983816320472, + "learning_rate": 3.7139853584963054e-06, + "loss": 0.1552, + "step": 4034 + }, + { + "epoch": 0.37176947528447046, + "grad_norm": 0.9020436912692089, + "learning_rate": 3.7133193634416766e-06, + "loss": 0.148, + "step": 4035 + }, + { + "epoch": 0.37186161146174046, + "grad_norm": 0.9803335179141888, + "learning_rate": 3.7126532557302144e-06, + "loss": 0.1667, + "step": 4036 + }, + { + "epoch": 0.37195374763901046, + "grad_norm": 0.9175684244894553, + "learning_rate": 3.711987035423767e-06, + "loss": 0.1566, + "step": 4037 + }, + { + "epoch": 0.37204588381628045, + "grad_norm": 0.9262653613986682, + "learning_rate": 3.711320702584193e-06, + "loss": 0.154, + "step": 4038 + }, + { + "epoch": 0.37213801999355045, + "grad_norm": 0.9086129584913225, + "learning_rate": 3.710654257273361e-06, + "loss": 0.1507, + "step": 4039 + }, + { + "epoch": 0.37223015617082045, + "grad_norm": 1.0044929480816431, + "learning_rate": 3.7099876995531515e-06, + "loss": 0.1568, + "step": 4040 + }, + { + "epoch": 0.3723222923480905, + "grad_norm": 0.9511895454298042, + "learning_rate": 3.709321029485453e-06, + "loss": 0.1596, + "step": 4041 + }, + { + "epoch": 0.3724144285253605, + "grad_norm": 0.9457430568964129, + "learning_rate": 3.708654247132168e-06, + "loss": 0.1473, + "step": 4042 + }, + { + "epoch": 0.3725065647026305, + "grad_norm": 0.9216559389726534, + "learning_rate": 3.7079873525552053e-06, + "loss": 0.1471, + "step": 4043 + }, + { + "epoch": 0.3725987008799005, + "grad_norm": 0.9511420633348692, + "learning_rate": 3.707320345816487e-06, + "loss": 0.151, + "step": 4044 + }, + { + "epoch": 0.3726908370571705, + "grad_norm": 0.9025153243738641, + "learning_rate": 3.7066532269779444e-06, + "loss": 0.142, + "step": 4045 + }, + { + "epoch": 0.3727829732344405, + "grad_norm": 0.9011326279630237, + "learning_rate": 3.7059859961015205e-06, + "loss": 0.1437, + "step": 4046 + }, + { + "epoch": 0.3728751094117105, + "grad_norm": 0.964381694542699, + "learning_rate": 3.705318653249166e-06, + "loss": 0.1624, + "step": 4047 + }, + { + "epoch": 0.37296724558898053, + "grad_norm": 0.9491833954146554, + "learning_rate": 3.704651198482846e-06, + "loss": 0.151, + "step": 4048 + }, + { + "epoch": 0.37305938176625053, + "grad_norm": 0.9783666245098148, + "learning_rate": 3.703983631864532e-06, + "loss": 0.1651, + "step": 4049 + }, + { + "epoch": 0.3731515179435205, + "grad_norm": 0.9566763043838267, + "learning_rate": 3.703315953456208e-06, + "loss": 0.1457, + "step": 4050 + }, + { + "epoch": 0.3732436541207905, + "grad_norm": 0.97352017386297, + "learning_rate": 3.7026481633198687e-06, + "loss": 0.1631, + "step": 4051 + }, + { + "epoch": 0.3733357902980605, + "grad_norm": 0.9425864394639262, + "learning_rate": 3.701980261517518e-06, + "loss": 0.1661, + "step": 4052 + }, + { + "epoch": 0.3734279264753305, + "grad_norm": 0.9059232717573347, + "learning_rate": 3.70131224811117e-06, + "loss": 0.1579, + "step": 4053 + }, + { + "epoch": 0.37352006265260057, + "grad_norm": 0.9281068034699287, + "learning_rate": 3.7006441231628517e-06, + "loss": 0.1474, + "step": 4054 + }, + { + "epoch": 0.37361219882987057, + "grad_norm": 0.9670493816154251, + "learning_rate": 3.699975886734596e-06, + "loss": 0.1638, + "step": 4055 + }, + { + "epoch": 0.37370433500714056, + "grad_norm": 0.8970057217264027, + "learning_rate": 3.6993075388884507e-06, + "loss": 0.1477, + "step": 4056 + }, + { + "epoch": 0.37379647118441056, + "grad_norm": 0.9142284746518392, + "learning_rate": 3.698639079686471e-06, + "loss": 0.1577, + "step": 4057 + }, + { + "epoch": 0.37388860736168056, + "grad_norm": 0.9045178818037406, + "learning_rate": 3.6979705091907244e-06, + "loss": 0.158, + "step": 4058 + }, + { + "epoch": 0.37398074353895056, + "grad_norm": 0.8711812616573087, + "learning_rate": 3.6973018274632865e-06, + "loss": 0.1529, + "step": 4059 + }, + { + "epoch": 0.37407287971622055, + "grad_norm": 0.8729978347669862, + "learning_rate": 3.696633034566245e-06, + "loss": 0.157, + "step": 4060 + }, + { + "epoch": 0.3741650158934906, + "grad_norm": 0.8990542116703788, + "learning_rate": 3.6959641305616984e-06, + "loss": 0.1503, + "step": 4061 + }, + { + "epoch": 0.3742571520707606, + "grad_norm": 0.8904444740765883, + "learning_rate": 3.695295115511752e-06, + "loss": 0.1516, + "step": 4062 + }, + { + "epoch": 0.3743492882480306, + "grad_norm": 0.9753397041967755, + "learning_rate": 3.694625989478527e-06, + "loss": 0.1631, + "step": 4063 + }, + { + "epoch": 0.3744414244253006, + "grad_norm": 0.9049757714025393, + "learning_rate": 3.69395675252415e-06, + "loss": 0.1423, + "step": 4064 + }, + { + "epoch": 0.3745335606025706, + "grad_norm": 0.8526472535969689, + "learning_rate": 3.6932874047107597e-06, + "loss": 0.1411, + "step": 4065 + }, + { + "epoch": 0.3746256967798406, + "grad_norm": 0.9655334358530389, + "learning_rate": 3.6926179461005056e-06, + "loss": 0.1569, + "step": 4066 + }, + { + "epoch": 0.3747178329571106, + "grad_norm": 0.9250687407747031, + "learning_rate": 3.691948376755547e-06, + "loss": 0.1572, + "step": 4067 + }, + { + "epoch": 0.37480996913438064, + "grad_norm": 0.9047372516849579, + "learning_rate": 3.6912786967380528e-06, + "loss": 0.1562, + "step": 4068 + }, + { + "epoch": 0.37490210531165064, + "grad_norm": 0.859181189210115, + "learning_rate": 3.6906089061102043e-06, + "loss": 0.1413, + "step": 4069 + }, + { + "epoch": 0.37499424148892063, + "grad_norm": 0.9373737963751739, + "learning_rate": 3.6899390049341893e-06, + "loss": 0.1587, + "step": 4070 + }, + { + "epoch": 0.37508637766619063, + "grad_norm": 0.9447926915063538, + "learning_rate": 3.68926899327221e-06, + "loss": 0.1514, + "step": 4071 + }, + { + "epoch": 0.3751785138434606, + "grad_norm": 0.9193281338428294, + "learning_rate": 3.6885988711864777e-06, + "loss": 0.1583, + "step": 4072 + }, + { + "epoch": 0.3752706500207306, + "grad_norm": 0.9827375591842401, + "learning_rate": 3.6879286387392122e-06, + "loss": 0.1512, + "step": 4073 + }, + { + "epoch": 0.3753627861980006, + "grad_norm": 0.8550091267938758, + "learning_rate": 3.687258295992644e-06, + "loss": 0.1334, + "step": 4074 + }, + { + "epoch": 0.37545492237527067, + "grad_norm": 0.9004154950704388, + "learning_rate": 3.686587843009016e-06, + "loss": 0.1479, + "step": 4075 + }, + { + "epoch": 0.37554705855254067, + "grad_norm": 0.9743962905621874, + "learning_rate": 3.685917279850578e-06, + "loss": 0.1606, + "step": 4076 + }, + { + "epoch": 0.37563919472981067, + "grad_norm": 0.9150974523990754, + "learning_rate": 3.685246606579594e-06, + "loss": 0.1479, + "step": 4077 + }, + { + "epoch": 0.37573133090708066, + "grad_norm": 0.8927057804303226, + "learning_rate": 3.684575823258334e-06, + "loss": 0.1553, + "step": 4078 + }, + { + "epoch": 0.37582346708435066, + "grad_norm": 0.8912269318519523, + "learning_rate": 3.683904929949082e-06, + "loss": 0.1606, + "step": 4079 + }, + { + "epoch": 0.37591560326162066, + "grad_norm": 0.8714988941859493, + "learning_rate": 3.68323392671413e-06, + "loss": 0.1512, + "step": 4080 + }, + { + "epoch": 0.37600773943889065, + "grad_norm": 0.9066092812200909, + "learning_rate": 3.6825628136157805e-06, + "loss": 0.1645, + "step": 4081 + }, + { + "epoch": 0.3760998756161607, + "grad_norm": 0.9353430946436311, + "learning_rate": 3.6818915907163456e-06, + "loss": 0.1546, + "step": 4082 + }, + { + "epoch": 0.3761920117934307, + "grad_norm": 0.824207106952847, + "learning_rate": 3.6812202580781507e-06, + "loss": 0.1357, + "step": 4083 + }, + { + "epoch": 0.3762841479707007, + "grad_norm": 0.9127969613382955, + "learning_rate": 3.680548815763527e-06, + "loss": 0.1486, + "step": 4084 + }, + { + "epoch": 0.3763762841479707, + "grad_norm": 0.9223650712476852, + "learning_rate": 3.6798772638348186e-06, + "loss": 0.1452, + "step": 4085 + }, + { + "epoch": 0.3764684203252407, + "grad_norm": 0.9504996208651127, + "learning_rate": 3.679205602354379e-06, + "loss": 0.1593, + "step": 4086 + }, + { + "epoch": 0.3765605565025107, + "grad_norm": 0.9697927009717019, + "learning_rate": 3.6785338313845725e-06, + "loss": 0.17, + "step": 4087 + }, + { + "epoch": 0.37665269267978074, + "grad_norm": 0.9399208083149008, + "learning_rate": 3.677861950987773e-06, + "loss": 0.1476, + "step": 4088 + }, + { + "epoch": 0.37674482885705074, + "grad_norm": 0.8869065172245069, + "learning_rate": 3.677189961226365e-06, + "loss": 0.1477, + "step": 4089 + }, + { + "epoch": 0.37683696503432074, + "grad_norm": 0.9534644978626714, + "learning_rate": 3.6765178621627418e-06, + "loss": 0.1616, + "step": 4090 + }, + { + "epoch": 0.37692910121159073, + "grad_norm": 0.85557009388475, + "learning_rate": 3.675845653859309e-06, + "loss": 0.1403, + "step": 4091 + }, + { + "epoch": 0.37702123738886073, + "grad_norm": 0.9242645580752875, + "learning_rate": 3.6751733363784804e-06, + "loss": 0.1523, + "step": 4092 + }, + { + "epoch": 0.3771133735661307, + "grad_norm": 0.8425672883949519, + "learning_rate": 3.6745009097826813e-06, + "loss": 0.1412, + "step": 4093 + }, + { + "epoch": 0.3772055097434007, + "grad_norm": 0.8877254768400884, + "learning_rate": 3.6738283741343463e-06, + "loss": 0.1518, + "step": 4094 + }, + { + "epoch": 0.3772976459206708, + "grad_norm": 0.9544665857522291, + "learning_rate": 3.6731557294959196e-06, + "loss": 0.156, + "step": 4095 + }, + { + "epoch": 0.3773897820979408, + "grad_norm": 0.8985137299961812, + "learning_rate": 3.6724829759298585e-06, + "loss": 0.1567, + "step": 4096 + }, + { + "epoch": 0.37748191827521077, + "grad_norm": 0.9207976569709534, + "learning_rate": 3.671810113498626e-06, + "loss": 0.1514, + "step": 4097 + }, + { + "epoch": 0.37757405445248077, + "grad_norm": 0.9250476073445184, + "learning_rate": 3.6711371422646984e-06, + "loss": 0.1529, + "step": 4098 + }, + { + "epoch": 0.37766619062975076, + "grad_norm": 0.9270432357066251, + "learning_rate": 3.6704640622905617e-06, + "loss": 0.1632, + "step": 4099 + }, + { + "epoch": 0.37775832680702076, + "grad_norm": 0.9510100858087815, + "learning_rate": 3.6697908736387105e-06, + "loss": 0.1664, + "step": 4100 + }, + { + "epoch": 0.37785046298429076, + "grad_norm": 0.8715401290229666, + "learning_rate": 3.669117576371651e-06, + "loss": 0.1355, + "step": 4101 + }, + { + "epoch": 0.3779425991615608, + "grad_norm": 0.9679288812456934, + "learning_rate": 3.668444170551898e-06, + "loss": 0.1607, + "step": 4102 + }, + { + "epoch": 0.3780347353388308, + "grad_norm": 0.9808453390014398, + "learning_rate": 3.6677706562419784e-06, + "loss": 0.1641, + "step": 4103 + }, + { + "epoch": 0.3781268715161008, + "grad_norm": 0.9312962652606193, + "learning_rate": 3.667097033504428e-06, + "loss": 0.1692, + "step": 4104 + }, + { + "epoch": 0.3782190076933708, + "grad_norm": 0.9276051374101398, + "learning_rate": 3.666423302401792e-06, + "loss": 0.1667, + "step": 4105 + }, + { + "epoch": 0.3783111438706408, + "grad_norm": 0.8760055981989053, + "learning_rate": 3.6657494629966274e-06, + "loss": 0.144, + "step": 4106 + }, + { + "epoch": 0.3784032800479108, + "grad_norm": 0.8706088602855692, + "learning_rate": 3.6650755153514993e-06, + "loss": 0.1451, + "step": 4107 + }, + { + "epoch": 0.3784954162251808, + "grad_norm": 0.9489960109369865, + "learning_rate": 3.664401459528984e-06, + "loss": 0.1522, + "step": 4108 + }, + { + "epoch": 0.37858755240245084, + "grad_norm": 0.9511363844631635, + "learning_rate": 3.663727295591668e-06, + "loss": 0.1603, + "step": 4109 + }, + { + "epoch": 0.37867968857972084, + "grad_norm": 0.984138360362065, + "learning_rate": 3.6630530236021478e-06, + "loss": 0.1527, + "step": 4110 + }, + { + "epoch": 0.37877182475699084, + "grad_norm": 0.9177359028717674, + "learning_rate": 3.6623786436230287e-06, + "loss": 0.1487, + "step": 4111 + }, + { + "epoch": 0.37886396093426083, + "grad_norm": 0.9387063111522116, + "learning_rate": 3.6617041557169282e-06, + "loss": 0.1429, + "step": 4112 + }, + { + "epoch": 0.37895609711153083, + "grad_norm": 0.9434138295220491, + "learning_rate": 3.6610295599464707e-06, + "loss": 0.1474, + "step": 4113 + }, + { + "epoch": 0.37904823328880083, + "grad_norm": 1.0168889669868741, + "learning_rate": 3.660354856374294e-06, + "loss": 0.1462, + "step": 4114 + }, + { + "epoch": 0.3791403694660708, + "grad_norm": 0.9435621753939302, + "learning_rate": 3.6596800450630445e-06, + "loss": 0.1488, + "step": 4115 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 1.0039934347610908, + "learning_rate": 3.659005126075377e-06, + "loss": 0.1606, + "step": 4116 + }, + { + "epoch": 0.3793246418206109, + "grad_norm": 0.9742949418469122, + "learning_rate": 3.65833009947396e-06, + "loss": 0.1618, + "step": 4117 + }, + { + "epoch": 0.37941677799788087, + "grad_norm": 0.97062878459915, + "learning_rate": 3.657654965321468e-06, + "loss": 0.151, + "step": 4118 + }, + { + "epoch": 0.37950891417515087, + "grad_norm": 0.9727659249762055, + "learning_rate": 3.6569797236805877e-06, + "loss": 0.1531, + "step": 4119 + }, + { + "epoch": 0.37960105035242087, + "grad_norm": 0.9530211060763032, + "learning_rate": 3.656304374614016e-06, + "loss": 0.1608, + "step": 4120 + }, + { + "epoch": 0.37969318652969086, + "grad_norm": 0.9892002262495758, + "learning_rate": 3.6556289181844582e-06, + "loss": 0.1614, + "step": 4121 + }, + { + "epoch": 0.3797853227069609, + "grad_norm": 0.9853737375495374, + "learning_rate": 3.654953354454631e-06, + "loss": 0.1658, + "step": 4122 + }, + { + "epoch": 0.3798774588842309, + "grad_norm": 0.8190640589995611, + "learning_rate": 3.654277683487261e-06, + "loss": 0.1318, + "step": 4123 + }, + { + "epoch": 0.3799695950615009, + "grad_norm": 0.9400423177734195, + "learning_rate": 3.6536019053450834e-06, + "loss": 0.161, + "step": 4124 + }, + { + "epoch": 0.3800617312387709, + "grad_norm": 0.9517573566439758, + "learning_rate": 3.652926020090845e-06, + "loss": 0.1518, + "step": 4125 + }, + { + "epoch": 0.3801538674160409, + "grad_norm": 0.926377462587662, + "learning_rate": 3.6522500277873017e-06, + "loss": 0.1413, + "step": 4126 + }, + { + "epoch": 0.3802460035933109, + "grad_norm": 0.8943962669159516, + "learning_rate": 3.651573928497219e-06, + "loss": 0.1545, + "step": 4127 + }, + { + "epoch": 0.3803381397705809, + "grad_norm": 0.9300678578368041, + "learning_rate": 3.6508977222833737e-06, + "loss": 0.1546, + "step": 4128 + }, + { + "epoch": 0.38043027594785095, + "grad_norm": 0.9643638642908029, + "learning_rate": 3.6502214092085504e-06, + "loss": 0.1529, + "step": 4129 + }, + { + "epoch": 0.38052241212512095, + "grad_norm": 0.9714822619040409, + "learning_rate": 3.649544989335545e-06, + "loss": 0.1542, + "step": 4130 + }, + { + "epoch": 0.38061454830239094, + "grad_norm": 0.9773020057604235, + "learning_rate": 3.648868462727165e-06, + "loss": 0.1575, + "step": 4131 + }, + { + "epoch": 0.38070668447966094, + "grad_norm": 0.9255103386023259, + "learning_rate": 3.6481918294462237e-06, + "loss": 0.1533, + "step": 4132 + }, + { + "epoch": 0.38079882065693094, + "grad_norm": 0.9723738172144644, + "learning_rate": 3.647515089555548e-06, + "loss": 0.1587, + "step": 4133 + }, + { + "epoch": 0.38089095683420093, + "grad_norm": 0.9349434163742851, + "learning_rate": 3.6468382431179717e-06, + "loss": 0.1435, + "step": 4134 + }, + { + "epoch": 0.38098309301147093, + "grad_norm": 0.9636701484737411, + "learning_rate": 3.646161290196342e-06, + "loss": 0.1529, + "step": 4135 + }, + { + "epoch": 0.381075229188741, + "grad_norm": 0.9754765196777119, + "learning_rate": 3.645484230853513e-06, + "loss": 0.1707, + "step": 4136 + }, + { + "epoch": 0.381167365366011, + "grad_norm": 0.9895673826813174, + "learning_rate": 3.64480706515235e-06, + "loss": 0.1673, + "step": 4137 + }, + { + "epoch": 0.381259501543281, + "grad_norm": 0.9674314673920184, + "learning_rate": 3.6441297931557274e-06, + "loss": 0.1552, + "step": 4138 + }, + { + "epoch": 0.381351637720551, + "grad_norm": 0.9225542238948127, + "learning_rate": 3.643452414926531e-06, + "loss": 0.1594, + "step": 4139 + }, + { + "epoch": 0.38144377389782097, + "grad_norm": 0.8649506707868612, + "learning_rate": 3.6427749305276537e-06, + "loss": 0.1415, + "step": 4140 + }, + { + "epoch": 0.38153591007509097, + "grad_norm": 0.8950199000553921, + "learning_rate": 3.6420973400220016e-06, + "loss": 0.16, + "step": 4141 + }, + { + "epoch": 0.38162804625236096, + "grad_norm": 0.9225797065582534, + "learning_rate": 3.641419643472489e-06, + "loss": 0.1576, + "step": 4142 + }, + { + "epoch": 0.381720182429631, + "grad_norm": 0.9392880937976605, + "learning_rate": 3.640741840942039e-06, + "loss": 0.1504, + "step": 4143 + }, + { + "epoch": 0.381812318606901, + "grad_norm": 0.8928198640604731, + "learning_rate": 3.640063932493588e-06, + "loss": 0.1525, + "step": 4144 + }, + { + "epoch": 0.381904454784171, + "grad_norm": 0.899053773871494, + "learning_rate": 3.639385918190076e-06, + "loss": 0.1527, + "step": 4145 + }, + { + "epoch": 0.381996590961441, + "grad_norm": 0.9202410829435287, + "learning_rate": 3.6387077980944595e-06, + "loss": 0.1621, + "step": 4146 + }, + { + "epoch": 0.382088727138711, + "grad_norm": 1.0028767320926162, + "learning_rate": 3.6380295722697023e-06, + "loss": 0.1505, + "step": 4147 + }, + { + "epoch": 0.382180863315981, + "grad_norm": 0.8458601915921796, + "learning_rate": 3.637351240778776e-06, + "loss": 0.1354, + "step": 4148 + }, + { + "epoch": 0.382272999493251, + "grad_norm": 0.8827937855379273, + "learning_rate": 3.6366728036846647e-06, + "loss": 0.1546, + "step": 4149 + }, + { + "epoch": 0.38236513567052105, + "grad_norm": 0.8949424057254047, + "learning_rate": 3.635994261050362e-06, + "loss": 0.1544, + "step": 4150 + }, + { + "epoch": 0.38245727184779105, + "grad_norm": 0.9282648917790803, + "learning_rate": 3.6353156129388683e-06, + "loss": 0.1522, + "step": 4151 + }, + { + "epoch": 0.38254940802506104, + "grad_norm": 0.8870218985004921, + "learning_rate": 3.634636859413199e-06, + "loss": 0.1393, + "step": 4152 + }, + { + "epoch": 0.38264154420233104, + "grad_norm": 0.9425406478284127, + "learning_rate": 3.633958000536375e-06, + "loss": 0.1567, + "step": 4153 + }, + { + "epoch": 0.38273368037960104, + "grad_norm": 0.9264991414341093, + "learning_rate": 3.633279036371429e-06, + "loss": 0.1542, + "step": 4154 + }, + { + "epoch": 0.38282581655687103, + "grad_norm": 0.9064377119313413, + "learning_rate": 3.6325999669814014e-06, + "loss": 0.1477, + "step": 4155 + }, + { + "epoch": 0.3829179527341411, + "grad_norm": 0.9228052083944193, + "learning_rate": 3.631920792429346e-06, + "loss": 0.1534, + "step": 4156 + }, + { + "epoch": 0.3830100889114111, + "grad_norm": 0.8711501223021223, + "learning_rate": 3.6312415127783228e-06, + "loss": 0.1436, + "step": 4157 + }, + { + "epoch": 0.3831022250886811, + "grad_norm": 0.9402770645056095, + "learning_rate": 3.630562128091403e-06, + "loss": 0.1546, + "step": 4158 + }, + { + "epoch": 0.3831943612659511, + "grad_norm": 0.9619983038446867, + "learning_rate": 3.6298826384316684e-06, + "loss": 0.1514, + "step": 4159 + }, + { + "epoch": 0.3832864974432211, + "grad_norm": 0.8653963516789092, + "learning_rate": 3.6292030438622093e-06, + "loss": 0.1424, + "step": 4160 + }, + { + "epoch": 0.38337863362049107, + "grad_norm": 0.938124421714208, + "learning_rate": 3.6285233444461255e-06, + "loss": 0.1644, + "step": 4161 + }, + { + "epoch": 0.38347076979776107, + "grad_norm": 0.8478622536698076, + "learning_rate": 3.6278435402465283e-06, + "loss": 0.1357, + "step": 4162 + }, + { + "epoch": 0.3835629059750311, + "grad_norm": 0.9507324930194543, + "learning_rate": 3.6271636313265368e-06, + "loss": 0.1697, + "step": 4163 + }, + { + "epoch": 0.3836550421523011, + "grad_norm": 0.9054939912598036, + "learning_rate": 3.6264836177492812e-06, + "loss": 0.1469, + "step": 4164 + }, + { + "epoch": 0.3837471783295711, + "grad_norm": 0.8788456687040019, + "learning_rate": 3.6258034995778994e-06, + "loss": 0.1342, + "step": 4165 + }, + { + "epoch": 0.3838393145068411, + "grad_norm": 0.9015870187596965, + "learning_rate": 3.6251232768755428e-06, + "loss": 0.154, + "step": 4166 + }, + { + "epoch": 0.3839314506841111, + "grad_norm": 0.8909983172995234, + "learning_rate": 3.6244429497053678e-06, + "loss": 0.1507, + "step": 4167 + }, + { + "epoch": 0.3840235868613811, + "grad_norm": 0.8913316867597605, + "learning_rate": 3.623762518130545e-06, + "loss": 0.1474, + "step": 4168 + }, + { + "epoch": 0.3841157230386511, + "grad_norm": 0.9361153707666292, + "learning_rate": 3.6230819822142504e-06, + "loss": 0.1416, + "step": 4169 + }, + { + "epoch": 0.38420785921592115, + "grad_norm": 0.9455562022520799, + "learning_rate": 3.6224013420196734e-06, + "loss": 0.1586, + "step": 4170 + }, + { + "epoch": 0.38429999539319115, + "grad_norm": 0.8593562197121624, + "learning_rate": 3.621720597610011e-06, + "loss": 0.1431, + "step": 4171 + }, + { + "epoch": 0.38439213157046115, + "grad_norm": 0.8832913395812277, + "learning_rate": 3.62103974904847e-06, + "loss": 0.1383, + "step": 4172 + }, + { + "epoch": 0.38448426774773115, + "grad_norm": 0.8483230097860568, + "learning_rate": 3.620358796398268e-06, + "loss": 0.1438, + "step": 4173 + }, + { + "epoch": 0.38457640392500114, + "grad_norm": 0.9683562509779495, + "learning_rate": 3.6196777397226314e-06, + "loss": 0.1439, + "step": 4174 + }, + { + "epoch": 0.38466854010227114, + "grad_norm": 0.9822738998325394, + "learning_rate": 3.618996579084796e-06, + "loss": 0.1706, + "step": 4175 + }, + { + "epoch": 0.38476067627954114, + "grad_norm": 0.9318590340850513, + "learning_rate": 3.6183153145480075e-06, + "loss": 0.1502, + "step": 4176 + }, + { + "epoch": 0.3848528124568112, + "grad_norm": 0.91227748896253, + "learning_rate": 3.6176339461755217e-06, + "loss": 0.1544, + "step": 4177 + }, + { + "epoch": 0.3849449486340812, + "grad_norm": 0.849625877446819, + "learning_rate": 3.6169524740306038e-06, + "loss": 0.1269, + "step": 4178 + }, + { + "epoch": 0.3850370848113512, + "grad_norm": 0.8533351529502364, + "learning_rate": 3.6162708981765294e-06, + "loss": 0.1392, + "step": 4179 + }, + { + "epoch": 0.3851292209886212, + "grad_norm": 0.9224771053382539, + "learning_rate": 3.6155892186765805e-06, + "loss": 0.1477, + "step": 4180 + }, + { + "epoch": 0.3852213571658912, + "grad_norm": 0.8713099758237273, + "learning_rate": 3.6149074355940533e-06, + "loss": 0.1398, + "step": 4181 + }, + { + "epoch": 0.3853134933431612, + "grad_norm": 0.9272503453478166, + "learning_rate": 3.614225548992251e-06, + "loss": 0.1531, + "step": 4182 + }, + { + "epoch": 0.38540562952043117, + "grad_norm": 0.9314307699112695, + "learning_rate": 3.6135435589344857e-06, + "loss": 0.1451, + "step": 4183 + }, + { + "epoch": 0.3854977656977012, + "grad_norm": 0.9222458730386164, + "learning_rate": 3.612861465484082e-06, + "loss": 0.1572, + "step": 4184 + }, + { + "epoch": 0.3855899018749712, + "grad_norm": 0.9132259187920421, + "learning_rate": 3.612179268704371e-06, + "loss": 0.1673, + "step": 4185 + }, + { + "epoch": 0.3856820380522412, + "grad_norm": 0.8399624159184574, + "learning_rate": 3.611496968658695e-06, + "loss": 0.1397, + "step": 4186 + }, + { + "epoch": 0.3857741742295112, + "grad_norm": 0.8945593972634557, + "learning_rate": 3.6108145654104065e-06, + "loss": 0.1539, + "step": 4187 + }, + { + "epoch": 0.3858663104067812, + "grad_norm": 0.8969467662175643, + "learning_rate": 3.610132059022865e-06, + "loss": 0.151, + "step": 4188 + }, + { + "epoch": 0.3859584465840512, + "grad_norm": 0.8663822959838602, + "learning_rate": 3.6094494495594435e-06, + "loss": 0.1457, + "step": 4189 + }, + { + "epoch": 0.38605058276132126, + "grad_norm": 0.9056864522351286, + "learning_rate": 3.6087667370835213e-06, + "loss": 0.1533, + "step": 4190 + }, + { + "epoch": 0.38614271893859126, + "grad_norm": 0.9201003973845128, + "learning_rate": 3.6080839216584875e-06, + "loss": 0.1463, + "step": 4191 + }, + { + "epoch": 0.38623485511586125, + "grad_norm": 0.9124350794797036, + "learning_rate": 3.6074010033477425e-06, + "loss": 0.1473, + "step": 4192 + }, + { + "epoch": 0.38632699129313125, + "grad_norm": 0.9708223579259208, + "learning_rate": 3.606717982214695e-06, + "loss": 0.1629, + "step": 4193 + }, + { + "epoch": 0.38641912747040125, + "grad_norm": 0.869245957927959, + "learning_rate": 3.6060348583227635e-06, + "loss": 0.1282, + "step": 4194 + }, + { + "epoch": 0.38651126364767124, + "grad_norm": 0.9028555853054538, + "learning_rate": 3.6053516317353777e-06, + "loss": 0.1499, + "step": 4195 + }, + { + "epoch": 0.38660339982494124, + "grad_norm": 0.8716710099401325, + "learning_rate": 3.6046683025159722e-06, + "loss": 0.1315, + "step": 4196 + }, + { + "epoch": 0.3866955360022113, + "grad_norm": 0.995012950602247, + "learning_rate": 3.6039848707279965e-06, + "loss": 0.1675, + "step": 4197 + }, + { + "epoch": 0.3867876721794813, + "grad_norm": 0.8635770358105515, + "learning_rate": 3.6033013364349074e-06, + "loss": 0.1433, + "step": 4198 + }, + { + "epoch": 0.3868798083567513, + "grad_norm": 0.9387478709393704, + "learning_rate": 3.60261769970017e-06, + "loss": 0.1602, + "step": 4199 + }, + { + "epoch": 0.3869719445340213, + "grad_norm": 0.9106186310143041, + "learning_rate": 3.6019339605872604e-06, + "loss": 0.1423, + "step": 4200 + }, + { + "epoch": 0.3870640807112913, + "grad_norm": 0.829738795239621, + "learning_rate": 3.6012501191596637e-06, + "loss": 0.129, + "step": 4201 + }, + { + "epoch": 0.3871562168885613, + "grad_norm": 0.8274379621149167, + "learning_rate": 3.6005661754808755e-06, + "loss": 0.1306, + "step": 4202 + }, + { + "epoch": 0.3872483530658313, + "grad_norm": 1.0033242130316309, + "learning_rate": 3.5998821296143995e-06, + "loss": 0.1647, + "step": 4203 + }, + { + "epoch": 0.3873404892431013, + "grad_norm": 0.8796576048924489, + "learning_rate": 3.5991979816237495e-06, + "loss": 0.1398, + "step": 4204 + }, + { + "epoch": 0.3874326254203713, + "grad_norm": 0.9147076443755038, + "learning_rate": 3.5985137315724476e-06, + "loss": 0.1467, + "step": 4205 + }, + { + "epoch": 0.3875247615976413, + "grad_norm": 0.9113754827450924, + "learning_rate": 3.597829379524029e-06, + "loss": 0.1527, + "step": 4206 + }, + { + "epoch": 0.3876168977749113, + "grad_norm": 0.8575385170167867, + "learning_rate": 3.5971449255420334e-06, + "loss": 0.1429, + "step": 4207 + }, + { + "epoch": 0.3877090339521813, + "grad_norm": 0.971025350124052, + "learning_rate": 3.5964603696900137e-06, + "loss": 0.1767, + "step": 4208 + }, + { + "epoch": 0.3878011701294513, + "grad_norm": 0.8814985588682502, + "learning_rate": 3.59577571203153e-06, + "loss": 0.1477, + "step": 4209 + }, + { + "epoch": 0.3878933063067213, + "grad_norm": 0.885856410302397, + "learning_rate": 3.5950909526301543e-06, + "loss": 0.1435, + "step": 4210 + }, + { + "epoch": 0.38798544248399136, + "grad_norm": 0.8978747474508465, + "learning_rate": 3.5944060915494656e-06, + "loss": 0.1496, + "step": 4211 + }, + { + "epoch": 0.38807757866126136, + "grad_norm": 0.8340715669280159, + "learning_rate": 3.5937211288530536e-06, + "loss": 0.1414, + "step": 4212 + }, + { + "epoch": 0.38816971483853135, + "grad_norm": 0.9322207725295413, + "learning_rate": 3.5930360646045165e-06, + "loss": 0.144, + "step": 4213 + }, + { + "epoch": 0.38826185101580135, + "grad_norm": 0.9764968494745287, + "learning_rate": 3.5923508988674643e-06, + "loss": 0.1531, + "step": 4214 + }, + { + "epoch": 0.38835398719307135, + "grad_norm": 0.929979392627363, + "learning_rate": 3.591665631705512e-06, + "loss": 0.1583, + "step": 4215 + }, + { + "epoch": 0.38844612337034135, + "grad_norm": 0.8855391143671449, + "learning_rate": 3.59098026318229e-06, + "loss": 0.1392, + "step": 4216 + }, + { + "epoch": 0.38853825954761134, + "grad_norm": 0.8787069684232223, + "learning_rate": 3.5902947933614317e-06, + "loss": 0.1394, + "step": 4217 + }, + { + "epoch": 0.3886303957248814, + "grad_norm": 0.9634827487742722, + "learning_rate": 3.5896092223065854e-06, + "loss": 0.1541, + "step": 4218 + }, + { + "epoch": 0.3887225319021514, + "grad_norm": 0.9325862623957322, + "learning_rate": 3.5889235500814055e-06, + "loss": 0.1542, + "step": 4219 + }, + { + "epoch": 0.3888146680794214, + "grad_norm": 0.9023366000754964, + "learning_rate": 3.588237776749557e-06, + "loss": 0.1501, + "step": 4220 + }, + { + "epoch": 0.3889068042566914, + "grad_norm": 0.8986206366257142, + "learning_rate": 3.5875519023747125e-06, + "loss": 0.1453, + "step": 4221 + }, + { + "epoch": 0.3889989404339614, + "grad_norm": 0.917397921943575, + "learning_rate": 3.5868659270205584e-06, + "loss": 0.1493, + "step": 4222 + }, + { + "epoch": 0.3890910766112314, + "grad_norm": 0.9143535754483454, + "learning_rate": 3.586179850750785e-06, + "loss": 0.1544, + "step": 4223 + }, + { + "epoch": 0.38918321278850143, + "grad_norm": 0.9044475607995964, + "learning_rate": 3.5854936736290956e-06, + "loss": 0.1495, + "step": 4224 + }, + { + "epoch": 0.38927534896577143, + "grad_norm": 0.8377084041288603, + "learning_rate": 3.584807395719202e-06, + "loss": 0.1316, + "step": 4225 + }, + { + "epoch": 0.3893674851430414, + "grad_norm": 0.8703025793104591, + "learning_rate": 3.584121017084825e-06, + "loss": 0.162, + "step": 4226 + }, + { + "epoch": 0.3894596213203114, + "grad_norm": 0.8891808936863865, + "learning_rate": 3.5834345377896953e-06, + "loss": 0.1534, + "step": 4227 + }, + { + "epoch": 0.3895517574975814, + "grad_norm": 0.8490427643718715, + "learning_rate": 3.5827479578975523e-06, + "loss": 0.1485, + "step": 4228 + }, + { + "epoch": 0.3896438936748514, + "grad_norm": 0.9253620813691368, + "learning_rate": 3.582061277472144e-06, + "loss": 0.142, + "step": 4229 + }, + { + "epoch": 0.3897360298521214, + "grad_norm": 0.9100820857060963, + "learning_rate": 3.5813744965772296e-06, + "loss": 0.1496, + "step": 4230 + }, + { + "epoch": 0.38982816602939147, + "grad_norm": 0.8307256013094806, + "learning_rate": 3.580687615276577e-06, + "loss": 0.1349, + "step": 4231 + }, + { + "epoch": 0.38992030220666146, + "grad_norm": 0.9088847447095758, + "learning_rate": 3.580000633633963e-06, + "loss": 0.1495, + "step": 4232 + }, + { + "epoch": 0.39001243838393146, + "grad_norm": 1.1234634530976173, + "learning_rate": 3.579313551713175e-06, + "loss": 0.1616, + "step": 4233 + }, + { + "epoch": 0.39010457456120146, + "grad_norm": 0.9343623593094748, + "learning_rate": 3.578626369578006e-06, + "loss": 0.145, + "step": 4234 + }, + { + "epoch": 0.39019671073847145, + "grad_norm": 0.8849146327309522, + "learning_rate": 3.5779390872922637e-06, + "loss": 0.1542, + "step": 4235 + }, + { + "epoch": 0.39028884691574145, + "grad_norm": 0.9700332080648529, + "learning_rate": 3.5772517049197602e-06, + "loss": 0.1668, + "step": 4236 + }, + { + "epoch": 0.39038098309301145, + "grad_norm": 0.9073077760008486, + "learning_rate": 3.5765642225243204e-06, + "loss": 0.1501, + "step": 4237 + }, + { + "epoch": 0.3904731192702815, + "grad_norm": 0.8560186434769101, + "learning_rate": 3.575876640169777e-06, + "loss": 0.1526, + "step": 4238 + }, + { + "epoch": 0.3905652554475515, + "grad_norm": 0.9149195393802323, + "learning_rate": 3.5751889579199715e-06, + "loss": 0.1519, + "step": 4239 + }, + { + "epoch": 0.3906573916248215, + "grad_norm": 0.9313907437239937, + "learning_rate": 3.574501175838755e-06, + "loss": 0.147, + "step": 4240 + }, + { + "epoch": 0.3907495278020915, + "grad_norm": 0.9774674402082485, + "learning_rate": 3.5738132939899895e-06, + "loss": 0.1593, + "step": 4241 + }, + { + "epoch": 0.3908416639793615, + "grad_norm": 0.9249259871090791, + "learning_rate": 3.573125312437544e-06, + "loss": 0.1571, + "step": 4242 + }, + { + "epoch": 0.3909338001566315, + "grad_norm": 0.8517631758062792, + "learning_rate": 3.572437231245297e-06, + "loss": 0.1469, + "step": 4243 + }, + { + "epoch": 0.3910259363339015, + "grad_norm": 0.9075554502422322, + "learning_rate": 3.5717490504771386e-06, + "loss": 0.1549, + "step": 4244 + }, + { + "epoch": 0.39111807251117153, + "grad_norm": 0.8860237957842017, + "learning_rate": 3.571060770196965e-06, + "loss": 0.1426, + "step": 4245 + }, + { + "epoch": 0.39121020868844153, + "grad_norm": 0.8585811165821868, + "learning_rate": 3.570372390468684e-06, + "loss": 0.1373, + "step": 4246 + }, + { + "epoch": 0.3913023448657115, + "grad_norm": 0.9061281656675514, + "learning_rate": 3.569683911356211e-06, + "loss": 0.147, + "step": 4247 + }, + { + "epoch": 0.3913944810429815, + "grad_norm": 0.9101609658494008, + "learning_rate": 3.568995332923472e-06, + "loss": 0.1557, + "step": 4248 + }, + { + "epoch": 0.3914866172202515, + "grad_norm": 1.0288124302622148, + "learning_rate": 3.568306655234401e-06, + "loss": 0.1653, + "step": 4249 + }, + { + "epoch": 0.3915787533975215, + "grad_norm": 0.9003424685543512, + "learning_rate": 3.567617878352942e-06, + "loss": 0.1434, + "step": 4250 + }, + { + "epoch": 0.3916708895747915, + "grad_norm": 0.8803026788568897, + "learning_rate": 3.566929002343048e-06, + "loss": 0.145, + "step": 4251 + }, + { + "epoch": 0.39176302575206157, + "grad_norm": 0.9300702343272207, + "learning_rate": 3.5662400272686813e-06, + "loss": 0.1505, + "step": 4252 + }, + { + "epoch": 0.39185516192933156, + "grad_norm": 0.9166481799372976, + "learning_rate": 3.5655509531938143e-06, + "loss": 0.1491, + "step": 4253 + }, + { + "epoch": 0.39194729810660156, + "grad_norm": 0.8898318583085889, + "learning_rate": 3.5648617801824257e-06, + "loss": 0.1474, + "step": 4254 + }, + { + "epoch": 0.39203943428387156, + "grad_norm": 0.8754326393842209, + "learning_rate": 3.5641725082985066e-06, + "loss": 0.1385, + "step": 4255 + }, + { + "epoch": 0.39213157046114155, + "grad_norm": 0.8837064377392709, + "learning_rate": 3.5634831376060554e-06, + "loss": 0.1483, + "step": 4256 + }, + { + "epoch": 0.39222370663841155, + "grad_norm": 0.82973390186192, + "learning_rate": 3.5627936681690804e-06, + "loss": 0.1385, + "step": 4257 + }, + { + "epoch": 0.3923158428156816, + "grad_norm": 0.8980341352139759, + "learning_rate": 3.562104100051599e-06, + "loss": 0.1502, + "step": 4258 + }, + { + "epoch": 0.3924079789929516, + "grad_norm": 0.8826296993941916, + "learning_rate": 3.561414433317637e-06, + "loss": 0.144, + "step": 4259 + }, + { + "epoch": 0.3925001151702216, + "grad_norm": 0.8825113800199493, + "learning_rate": 3.560724668031231e-06, + "loss": 0.1535, + "step": 4260 + }, + { + "epoch": 0.3925922513474916, + "grad_norm": 0.9439974689177584, + "learning_rate": 3.560034804256426e-06, + "loss": 0.1575, + "step": 4261 + }, + { + "epoch": 0.3926843875247616, + "grad_norm": 0.8261955307735191, + "learning_rate": 3.5593448420572753e-06, + "loss": 0.1352, + "step": 4262 + }, + { + "epoch": 0.3927765237020316, + "grad_norm": 0.9212672496243489, + "learning_rate": 3.558654781497841e-06, + "loss": 0.1351, + "step": 4263 + }, + { + "epoch": 0.3928686598793016, + "grad_norm": 0.9109157673678041, + "learning_rate": 3.557964622642197e-06, + "loss": 0.1488, + "step": 4264 + }, + { + "epoch": 0.39296079605657164, + "grad_norm": 0.9448156747304166, + "learning_rate": 3.557274365554424e-06, + "loss": 0.1651, + "step": 4265 + }, + { + "epoch": 0.39305293223384163, + "grad_norm": 0.8808161080550717, + "learning_rate": 3.5565840102986128e-06, + "loss": 0.1377, + "step": 4266 + }, + { + "epoch": 0.39314506841111163, + "grad_norm": 0.8536571695891909, + "learning_rate": 3.555893556938862e-06, + "loss": 0.1401, + "step": 4267 + }, + { + "epoch": 0.39323720458838163, + "grad_norm": 0.8420334900764369, + "learning_rate": 3.5552030055392805e-06, + "loss": 0.1333, + "step": 4268 + }, + { + "epoch": 0.3933293407656516, + "grad_norm": 0.9069556452168558, + "learning_rate": 3.554512356163986e-06, + "loss": 0.152, + "step": 4269 + }, + { + "epoch": 0.3934214769429216, + "grad_norm": 0.8525624922991532, + "learning_rate": 3.553821608877107e-06, + "loss": 0.1355, + "step": 4270 + }, + { + "epoch": 0.3935136131201916, + "grad_norm": 0.8460506248401927, + "learning_rate": 3.5531307637427774e-06, + "loss": 0.1461, + "step": 4271 + }, + { + "epoch": 0.39360574929746167, + "grad_norm": 0.866204338149706, + "learning_rate": 3.552439820825143e-06, + "loss": 0.1475, + "step": 4272 + }, + { + "epoch": 0.39369788547473167, + "grad_norm": 0.9409368306978433, + "learning_rate": 3.5517487801883587e-06, + "loss": 0.1617, + "step": 4273 + }, + { + "epoch": 0.39379002165200166, + "grad_norm": 0.8316147873982872, + "learning_rate": 3.5510576418965862e-06, + "loss": 0.1383, + "step": 4274 + }, + { + "epoch": 0.39388215782927166, + "grad_norm": 0.8920309395123601, + "learning_rate": 3.5503664060139987e-06, + "loss": 0.1514, + "step": 4275 + }, + { + "epoch": 0.39397429400654166, + "grad_norm": 0.8663059385692813, + "learning_rate": 3.549675072604778e-06, + "loss": 0.1327, + "step": 4276 + }, + { + "epoch": 0.39406643018381166, + "grad_norm": 0.908796183232886, + "learning_rate": 3.548983641733113e-06, + "loss": 0.1525, + "step": 4277 + }, + { + "epoch": 0.39415856636108165, + "grad_norm": 0.9156866300447943, + "learning_rate": 3.5482921134632043e-06, + "loss": 0.1509, + "step": 4278 + }, + { + "epoch": 0.3942507025383517, + "grad_norm": 0.885423965354652, + "learning_rate": 3.54760048785926e-06, + "loss": 0.1454, + "step": 4279 + }, + { + "epoch": 0.3943428387156217, + "grad_norm": 0.9269220799512766, + "learning_rate": 3.546908764985498e-06, + "loss": 0.1604, + "step": 4280 + }, + { + "epoch": 0.3944349748928917, + "grad_norm": 0.968797661863933, + "learning_rate": 3.5462169449061445e-06, + "loss": 0.1433, + "step": 4281 + }, + { + "epoch": 0.3945271110701617, + "grad_norm": 0.9613953673270564, + "learning_rate": 3.5455250276854348e-06, + "loss": 0.1445, + "step": 4282 + }, + { + "epoch": 0.3946192472474317, + "grad_norm": 0.8605829725128497, + "learning_rate": 3.544833013387613e-06, + "loss": 0.1269, + "step": 4283 + }, + { + "epoch": 0.3947113834247017, + "grad_norm": 0.9087732510897352, + "learning_rate": 3.5441409020769347e-06, + "loss": 0.1496, + "step": 4284 + }, + { + "epoch": 0.39480351960197174, + "grad_norm": 0.9422059956909788, + "learning_rate": 3.5434486938176606e-06, + "loss": 0.1407, + "step": 4285 + }, + { + "epoch": 0.39489565577924174, + "grad_norm": 0.9356627540900108, + "learning_rate": 3.5427563886740633e-06, + "loss": 0.1519, + "step": 4286 + }, + { + "epoch": 0.39498779195651174, + "grad_norm": 0.9822939505174967, + "learning_rate": 3.542063986710423e-06, + "loss": 0.1582, + "step": 4287 + }, + { + "epoch": 0.39507992813378173, + "grad_norm": 0.9727146799740334, + "learning_rate": 3.5413714879910287e-06, + "loss": 0.1435, + "step": 4288 + }, + { + "epoch": 0.39517206431105173, + "grad_norm": 0.9789445530371959, + "learning_rate": 3.540678892580181e-06, + "loss": 0.146, + "step": 4289 + }, + { + "epoch": 0.3952642004883217, + "grad_norm": 0.9347182617511673, + "learning_rate": 3.539986200542185e-06, + "loss": 0.148, + "step": 4290 + }, + { + "epoch": 0.3953563366655917, + "grad_norm": 0.8953433756495065, + "learning_rate": 3.539293411941359e-06, + "loss": 0.1486, + "step": 4291 + }, + { + "epoch": 0.3954484728428618, + "grad_norm": 0.8971317416449771, + "learning_rate": 3.5386005268420277e-06, + "loss": 0.1507, + "step": 4292 + }, + { + "epoch": 0.3955406090201318, + "grad_norm": 0.9427770172715076, + "learning_rate": 3.5379075453085256e-06, + "loss": 0.1572, + "step": 4293 + }, + { + "epoch": 0.39563274519740177, + "grad_norm": 0.8767799263466138, + "learning_rate": 3.5372144674051963e-06, + "loss": 0.1366, + "step": 4294 + }, + { + "epoch": 0.39572488137467177, + "grad_norm": 0.9997094521941999, + "learning_rate": 3.536521293196392e-06, + "loss": 0.1654, + "step": 4295 + }, + { + "epoch": 0.39581701755194176, + "grad_norm": 0.9105426819568583, + "learning_rate": 3.5358280227464735e-06, + "loss": 0.1624, + "step": 4296 + }, + { + "epoch": 0.39590915372921176, + "grad_norm": 0.8461159177852656, + "learning_rate": 3.535134656119813e-06, + "loss": 0.1367, + "step": 4297 + }, + { + "epoch": 0.39600128990648176, + "grad_norm": 0.8914245299115292, + "learning_rate": 3.534441193380787e-06, + "loss": 0.142, + "step": 4298 + }, + { + "epoch": 0.3960934260837518, + "grad_norm": 0.894727223944378, + "learning_rate": 3.5337476345937853e-06, + "loss": 0.1462, + "step": 4299 + }, + { + "epoch": 0.3961855622610218, + "grad_norm": 0.9194710796710017, + "learning_rate": 3.5330539798232044e-06, + "loss": 0.1333, + "step": 4300 + }, + { + "epoch": 0.3962776984382918, + "grad_norm": 0.9067339161473813, + "learning_rate": 3.5323602291334508e-06, + "loss": 0.1525, + "step": 4301 + }, + { + "epoch": 0.3963698346155618, + "grad_norm": 0.9207199566595172, + "learning_rate": 3.5316663825889384e-06, + "loss": 0.1462, + "step": 4302 + }, + { + "epoch": 0.3964619707928318, + "grad_norm": 0.893790793559201, + "learning_rate": 3.530972440254092e-06, + "loss": 0.1465, + "step": 4303 + }, + { + "epoch": 0.3965541069701018, + "grad_norm": 0.9323229957009004, + "learning_rate": 3.530278402193342e-06, + "loss": 0.1501, + "step": 4304 + }, + { + "epoch": 0.3966462431473718, + "grad_norm": 0.93288274935138, + "learning_rate": 3.5295842684711334e-06, + "loss": 0.1487, + "step": 4305 + }, + { + "epoch": 0.39673837932464184, + "grad_norm": 0.8904180096398162, + "learning_rate": 3.528890039151913e-06, + "loss": 0.1374, + "step": 4306 + }, + { + "epoch": 0.39683051550191184, + "grad_norm": 0.9609819910977235, + "learning_rate": 3.5281957143001426e-06, + "loss": 0.165, + "step": 4307 + }, + { + "epoch": 0.39692265167918184, + "grad_norm": 0.9827893129703009, + "learning_rate": 3.5275012939802895e-06, + "loss": 0.1546, + "step": 4308 + }, + { + "epoch": 0.39701478785645183, + "grad_norm": 0.9057704871681684, + "learning_rate": 3.5268067782568306e-06, + "loss": 0.1433, + "step": 4309 + }, + { + "epoch": 0.39710692403372183, + "grad_norm": 0.9032061609195176, + "learning_rate": 3.5261121671942515e-06, + "loss": 0.1475, + "step": 4310 + }, + { + "epoch": 0.39719906021099183, + "grad_norm": 0.9355446728513368, + "learning_rate": 3.525417460857048e-06, + "loss": 0.1468, + "step": 4311 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 0.9825199046051186, + "learning_rate": 3.524722659309722e-06, + "loss": 0.1642, + "step": 4312 + }, + { + "epoch": 0.3973833325655319, + "grad_norm": 0.915612075494406, + "learning_rate": 3.5240277626167875e-06, + "loss": 0.1434, + "step": 4313 + }, + { + "epoch": 0.3974754687428019, + "grad_norm": 0.8754977403678704, + "learning_rate": 3.5233327708427638e-06, + "loss": 0.1382, + "step": 4314 + }, + { + "epoch": 0.39756760492007187, + "grad_norm": 0.8479881511273041, + "learning_rate": 3.522637684052184e-06, + "loss": 0.1358, + "step": 4315 + }, + { + "epoch": 0.39765974109734187, + "grad_norm": 0.9382658935005335, + "learning_rate": 3.5219425023095837e-06, + "loss": 0.1547, + "step": 4316 + }, + { + "epoch": 0.39775187727461186, + "grad_norm": 0.8891053914875621, + "learning_rate": 3.5212472256795122e-06, + "loss": 0.151, + "step": 4317 + }, + { + "epoch": 0.39784401345188186, + "grad_norm": 0.8883971554082362, + "learning_rate": 3.5205518542265265e-06, + "loss": 0.1536, + "step": 4318 + }, + { + "epoch": 0.3979361496291519, + "grad_norm": 1.0232040373202294, + "learning_rate": 3.5198563880151913e-06, + "loss": 0.1609, + "step": 4319 + }, + { + "epoch": 0.3980282858064219, + "grad_norm": 0.9193288491755802, + "learning_rate": 3.519160827110081e-06, + "loss": 0.1391, + "step": 4320 + }, + { + "epoch": 0.3981204219836919, + "grad_norm": 0.9096900817977849, + "learning_rate": 3.5184651715757772e-06, + "loss": 0.148, + "step": 4321 + }, + { + "epoch": 0.3982125581609619, + "grad_norm": 0.8684172311676668, + "learning_rate": 3.517769421476873e-06, + "loss": 0.1324, + "step": 4322 + }, + { + "epoch": 0.3983046943382319, + "grad_norm": 0.924708159192272, + "learning_rate": 3.5170735768779683e-06, + "loss": 0.1576, + "step": 4323 + }, + { + "epoch": 0.3983968305155019, + "grad_norm": 0.8836737240145939, + "learning_rate": 3.5163776378436736e-06, + "loss": 0.1509, + "step": 4324 + }, + { + "epoch": 0.3984889666927719, + "grad_norm": 0.92612250459839, + "learning_rate": 3.515681604438605e-06, + "loss": 0.1584, + "step": 4325 + }, + { + "epoch": 0.39858110287004195, + "grad_norm": 0.9261909259373187, + "learning_rate": 3.5149854767273904e-06, + "loss": 0.163, + "step": 4326 + }, + { + "epoch": 0.39867323904731194, + "grad_norm": 0.8606719880644568, + "learning_rate": 3.5142892547746647e-06, + "loss": 0.1507, + "step": 4327 + }, + { + "epoch": 0.39876537522458194, + "grad_norm": 0.9505649437739544, + "learning_rate": 3.513592938645073e-06, + "loss": 0.1598, + "step": 4328 + }, + { + "epoch": 0.39885751140185194, + "grad_norm": 0.8572409396930928, + "learning_rate": 3.5128965284032677e-06, + "loss": 0.1378, + "step": 4329 + }, + { + "epoch": 0.39894964757912194, + "grad_norm": 0.9370424084273961, + "learning_rate": 3.512200024113911e-06, + "loss": 0.1586, + "step": 4330 + }, + { + "epoch": 0.39904178375639193, + "grad_norm": 0.9345899795920014, + "learning_rate": 3.511503425841672e-06, + "loss": 0.158, + "step": 4331 + }, + { + "epoch": 0.39913391993366193, + "grad_norm": 0.8384552894192423, + "learning_rate": 3.5108067336512325e-06, + "loss": 0.1327, + "step": 4332 + }, + { + "epoch": 0.399226056110932, + "grad_norm": 0.9193822845561663, + "learning_rate": 3.5101099476072776e-06, + "loss": 0.1561, + "step": 4333 + }, + { + "epoch": 0.399318192288202, + "grad_norm": 0.9288838613780442, + "learning_rate": 3.5094130677745065e-06, + "loss": 0.1564, + "step": 4334 + }, + { + "epoch": 0.399410328465472, + "grad_norm": 0.9365564041095639, + "learning_rate": 3.5087160942176228e-06, + "loss": 0.1551, + "step": 4335 + }, + { + "epoch": 0.399502464642742, + "grad_norm": 0.9228486860598516, + "learning_rate": 3.5080190270013415e-06, + "loss": 0.149, + "step": 4336 + }, + { + "epoch": 0.39959460082001197, + "grad_norm": 0.9585047222693666, + "learning_rate": 3.5073218661903852e-06, + "loss": 0.1491, + "step": 4337 + }, + { + "epoch": 0.39968673699728197, + "grad_norm": 0.9608521329567735, + "learning_rate": 3.5066246118494847e-06, + "loss": 0.1651, + "step": 4338 + }, + { + "epoch": 0.39977887317455196, + "grad_norm": 0.9241350643720966, + "learning_rate": 3.5059272640433808e-06, + "loss": 0.1455, + "step": 4339 + }, + { + "epoch": 0.399871009351822, + "grad_norm": 0.9336561932328415, + "learning_rate": 3.5052298228368227e-06, + "loss": 0.1585, + "step": 4340 + }, + { + "epoch": 0.399963145529092, + "grad_norm": 0.9287712735257532, + "learning_rate": 3.5045322882945666e-06, + "loss": 0.1511, + "step": 4341 + }, + { + "epoch": 0.400055281706362, + "grad_norm": 0.8132984018015555, + "learning_rate": 3.5038346604813796e-06, + "loss": 0.1208, + "step": 4342 + }, + { + "epoch": 0.400147417883632, + "grad_norm": 0.9102128798105348, + "learning_rate": 3.5031369394620364e-06, + "loss": 0.1461, + "step": 4343 + }, + { + "epoch": 0.400239554060902, + "grad_norm": 0.8849265666537798, + "learning_rate": 3.5024391253013206e-06, + "loss": 0.1492, + "step": 4344 + }, + { + "epoch": 0.400331690238172, + "grad_norm": 0.9810479880919115, + "learning_rate": 3.5017412180640243e-06, + "loss": 0.1521, + "step": 4345 + }, + { + "epoch": 0.400423826415442, + "grad_norm": 0.956181986827905, + "learning_rate": 3.5010432178149473e-06, + "loss": 0.1609, + "step": 4346 + }, + { + "epoch": 0.40051596259271205, + "grad_norm": 0.9029488074220976, + "learning_rate": 3.5003451246189003e-06, + "loss": 0.1482, + "step": 4347 + }, + { + "epoch": 0.40060809876998205, + "grad_norm": 0.8824563585798078, + "learning_rate": 3.499646938540701e-06, + "loss": 0.146, + "step": 4348 + }, + { + "epoch": 0.40070023494725204, + "grad_norm": 0.9269831174400178, + "learning_rate": 3.498948659645176e-06, + "loss": 0.1484, + "step": 4349 + }, + { + "epoch": 0.40079237112452204, + "grad_norm": 0.9216669167926768, + "learning_rate": 3.4982502879971596e-06, + "loss": 0.1429, + "step": 4350 + }, + { + "epoch": 0.40088450730179204, + "grad_norm": 0.9119424481041382, + "learning_rate": 3.497551823661498e-06, + "loss": 0.1438, + "step": 4351 + }, + { + "epoch": 0.40097664347906203, + "grad_norm": 0.9521206551490597, + "learning_rate": 3.4968532667030408e-06, + "loss": 0.1684, + "step": 4352 + }, + { + "epoch": 0.4010687796563321, + "grad_norm": 0.9169723849076772, + "learning_rate": 3.496154617186651e-06, + "loss": 0.1452, + "step": 4353 + }, + { + "epoch": 0.4011609158336021, + "grad_norm": 0.9816498659771387, + "learning_rate": 3.4954558751771976e-06, + "loss": 0.1611, + "step": 4354 + }, + { + "epoch": 0.4012530520108721, + "grad_norm": 1.0011798661122229, + "learning_rate": 3.4947570407395593e-06, + "loss": 0.1568, + "step": 4355 + }, + { + "epoch": 0.4013451881881421, + "grad_norm": 0.8784800818378974, + "learning_rate": 3.494058113938623e-06, + "loss": 0.1454, + "step": 4356 + }, + { + "epoch": 0.4014373243654121, + "grad_norm": 0.9091950278863203, + "learning_rate": 3.493359094839284e-06, + "loss": 0.1454, + "step": 4357 + }, + { + "epoch": 0.40152946054268207, + "grad_norm": 1.003839121809296, + "learning_rate": 3.4926599835064446e-06, + "loss": 0.1527, + "step": 4358 + }, + { + "epoch": 0.40162159671995207, + "grad_norm": 0.8735840233392277, + "learning_rate": 3.491960780005021e-06, + "loss": 0.1488, + "step": 4359 + }, + { + "epoch": 0.4017137328972221, + "grad_norm": 0.9279416109721896, + "learning_rate": 3.4912614843999304e-06, + "loss": 0.1519, + "step": 4360 + }, + { + "epoch": 0.4018058690744921, + "grad_norm": 0.9292349005371296, + "learning_rate": 3.490562096756105e-06, + "loss": 0.145, + "step": 4361 + }, + { + "epoch": 0.4018980052517621, + "grad_norm": 0.8176160899793758, + "learning_rate": 3.4898626171384823e-06, + "loss": 0.1319, + "step": 4362 + }, + { + "epoch": 0.4019901414290321, + "grad_norm": 0.8663661790382975, + "learning_rate": 3.4891630456120098e-06, + "loss": 0.1478, + "step": 4363 + }, + { + "epoch": 0.4020822776063021, + "grad_norm": 0.8951011796955761, + "learning_rate": 3.4884633822416412e-06, + "loss": 0.1411, + "step": 4364 + }, + { + "epoch": 0.4021744137835721, + "grad_norm": 0.9513840457401349, + "learning_rate": 3.4877636270923416e-06, + "loss": 0.1557, + "step": 4365 + }, + { + "epoch": 0.4022665499608421, + "grad_norm": 0.9631571432658622, + "learning_rate": 3.4870637802290817e-06, + "loss": 0.1528, + "step": 4366 + }, + { + "epoch": 0.40235868613811215, + "grad_norm": 0.9009246331376555, + "learning_rate": 3.4863638417168455e-06, + "loss": 0.1406, + "step": 4367 + }, + { + "epoch": 0.40245082231538215, + "grad_norm": 0.9418153307876809, + "learning_rate": 3.4856638116206194e-06, + "loss": 0.1522, + "step": 4368 + }, + { + "epoch": 0.40254295849265215, + "grad_norm": 0.938253466177827, + "learning_rate": 3.4849636900054023e-06, + "loss": 0.1414, + "step": 4369 + }, + { + "epoch": 0.40263509466992214, + "grad_norm": 0.949524744273461, + "learning_rate": 3.484263476936201e-06, + "loss": 0.1527, + "step": 4370 + }, + { + "epoch": 0.40272723084719214, + "grad_norm": 0.9037729435498303, + "learning_rate": 3.4835631724780296e-06, + "loss": 0.1445, + "step": 4371 + }, + { + "epoch": 0.40281936702446214, + "grad_norm": 0.8961162386122397, + "learning_rate": 3.4828627766959123e-06, + "loss": 0.1491, + "step": 4372 + }, + { + "epoch": 0.40291150320173214, + "grad_norm": 0.875416342667484, + "learning_rate": 3.4821622896548795e-06, + "loss": 0.1483, + "step": 4373 + }, + { + "epoch": 0.4030036393790022, + "grad_norm": 0.8912047991694245, + "learning_rate": 3.4814617114199722e-06, + "loss": 0.1514, + "step": 4374 + }, + { + "epoch": 0.4030957755562722, + "grad_norm": 0.9112833813015218, + "learning_rate": 3.4807610420562406e-06, + "loss": 0.1599, + "step": 4375 + }, + { + "epoch": 0.4031879117335422, + "grad_norm": 0.8839605396294251, + "learning_rate": 3.48006028162874e-06, + "loss": 0.1532, + "step": 4376 + }, + { + "epoch": 0.4032800479108122, + "grad_norm": 0.8948899894612451, + "learning_rate": 3.4793594302025367e-06, + "loss": 0.1465, + "step": 4377 + }, + { + "epoch": 0.4033721840880822, + "grad_norm": 0.851447023415438, + "learning_rate": 3.4786584878427056e-06, + "loss": 0.134, + "step": 4378 + }, + { + "epoch": 0.40346432026535217, + "grad_norm": 0.9114250139558066, + "learning_rate": 3.4779574546143276e-06, + "loss": 0.1539, + "step": 4379 + }, + { + "epoch": 0.40355645644262217, + "grad_norm": 0.9361638989038511, + "learning_rate": 3.4772563305824956e-06, + "loss": 0.1612, + "step": 4380 + }, + { + "epoch": 0.4036485926198922, + "grad_norm": 0.8195140088476734, + "learning_rate": 3.4765551158123074e-06, + "loss": 0.1333, + "step": 4381 + }, + { + "epoch": 0.4037407287971622, + "grad_norm": 0.9008523426752787, + "learning_rate": 3.4758538103688723e-06, + "loss": 0.1525, + "step": 4382 + }, + { + "epoch": 0.4038328649744322, + "grad_norm": 0.957477825346113, + "learning_rate": 3.4751524143173055e-06, + "loss": 0.1651, + "step": 4383 + }, + { + "epoch": 0.4039250011517022, + "grad_norm": 0.9227344073131626, + "learning_rate": 3.4744509277227316e-06, + "loss": 0.1493, + "step": 4384 + }, + { + "epoch": 0.4040171373289722, + "grad_norm": 0.9151830123149463, + "learning_rate": 3.473749350650285e-06, + "loss": 0.1554, + "step": 4385 + }, + { + "epoch": 0.4041092735062422, + "grad_norm": 0.8878099053567197, + "learning_rate": 3.473047683165106e-06, + "loss": 0.1552, + "step": 4386 + }, + { + "epoch": 0.40420140968351226, + "grad_norm": 0.9120949620840199, + "learning_rate": 3.472345925332344e-06, + "loss": 0.156, + "step": 4387 + }, + { + "epoch": 0.40429354586078226, + "grad_norm": 0.9151073635872946, + "learning_rate": 3.47164407721716e-06, + "loss": 0.1601, + "step": 4388 + }, + { + "epoch": 0.40438568203805225, + "grad_norm": 0.9144746721555486, + "learning_rate": 3.4709421388847177e-06, + "loss": 0.1442, + "step": 4389 + }, + { + "epoch": 0.40447781821532225, + "grad_norm": 0.9910249337465405, + "learning_rate": 3.4702401104001937e-06, + "loss": 0.161, + "step": 4390 + }, + { + "epoch": 0.40456995439259225, + "grad_norm": 0.905580616645535, + "learning_rate": 3.4695379918287708e-06, + "loss": 0.147, + "step": 4391 + }, + { + "epoch": 0.40466209056986224, + "grad_norm": 0.9682851249530802, + "learning_rate": 3.468835783235641e-06, + "loss": 0.1664, + "step": 4392 + }, + { + "epoch": 0.40475422674713224, + "grad_norm": 1.019950194661881, + "learning_rate": 3.468133484686005e-06, + "loss": 0.1581, + "step": 4393 + }, + { + "epoch": 0.4048463629244023, + "grad_norm": 0.9054834581346294, + "learning_rate": 3.467431096245071e-06, + "loss": 0.1458, + "step": 4394 + }, + { + "epoch": 0.4049384991016723, + "grad_norm": 0.9394518511371741, + "learning_rate": 3.466728617978054e-06, + "loss": 0.1465, + "step": 4395 + }, + { + "epoch": 0.4050306352789423, + "grad_norm": 0.9276116238870109, + "learning_rate": 3.466026049950182e-06, + "loss": 0.1533, + "step": 4396 + }, + { + "epoch": 0.4051227714562123, + "grad_norm": 0.8814615109668135, + "learning_rate": 3.465323392226687e-06, + "loss": 0.1474, + "step": 4397 + }, + { + "epoch": 0.4052149076334823, + "grad_norm": 0.9276110676799734, + "learning_rate": 3.4646206448728113e-06, + "loss": 0.1471, + "step": 4398 + }, + { + "epoch": 0.4053070438107523, + "grad_norm": 0.9740379558096653, + "learning_rate": 3.463917807953805e-06, + "loss": 0.1514, + "step": 4399 + }, + { + "epoch": 0.4053991799880223, + "grad_norm": 0.8572241682747315, + "learning_rate": 3.4632148815349265e-06, + "loss": 0.1411, + "step": 4400 + }, + { + "epoch": 0.4054913161652923, + "grad_norm": 0.8856261129316616, + "learning_rate": 3.4625118656814414e-06, + "loss": 0.1531, + "step": 4401 + }, + { + "epoch": 0.4055834523425623, + "grad_norm": 0.8319574623205851, + "learning_rate": 3.4618087604586277e-06, + "loss": 0.1476, + "step": 4402 + }, + { + "epoch": 0.4056755885198323, + "grad_norm": 0.851179009626978, + "learning_rate": 3.4611055659317663e-06, + "loss": 0.1403, + "step": 4403 + }, + { + "epoch": 0.4057677246971023, + "grad_norm": 0.7983809793208019, + "learning_rate": 3.4604022821661493e-06, + "loss": 0.1217, + "step": 4404 + }, + { + "epoch": 0.4058598608743723, + "grad_norm": 0.8594086174519592, + "learning_rate": 3.459698909227078e-06, + "loss": 0.15, + "step": 4405 + }, + { + "epoch": 0.4059519970516423, + "grad_norm": 0.8403974045516667, + "learning_rate": 3.458995447179858e-06, + "loss": 0.1442, + "step": 4406 + }, + { + "epoch": 0.4060441332289123, + "grad_norm": 0.9268270027862243, + "learning_rate": 3.4582918960898094e-06, + "loss": 0.1556, + "step": 4407 + }, + { + "epoch": 0.40613626940618236, + "grad_norm": 0.8732579314464122, + "learning_rate": 3.457588256022254e-06, + "loss": 0.1425, + "step": 4408 + }, + { + "epoch": 0.40622840558345236, + "grad_norm": 0.9393071006353455, + "learning_rate": 3.4568845270425268e-06, + "loss": 0.1402, + "step": 4409 + }, + { + "epoch": 0.40632054176072235, + "grad_norm": 0.8936419289776788, + "learning_rate": 3.456180709215968e-06, + "loss": 0.1391, + "step": 4410 + }, + { + "epoch": 0.40641267793799235, + "grad_norm": 0.9028553321289632, + "learning_rate": 3.455476802607927e-06, + "loss": 0.1455, + "step": 4411 + }, + { + "epoch": 0.40650481411526235, + "grad_norm": 0.9629848751950774, + "learning_rate": 3.454772807283763e-06, + "loss": 0.1595, + "step": 4412 + }, + { + "epoch": 0.40659695029253234, + "grad_norm": 0.8973627311061653, + "learning_rate": 3.45406872330884e-06, + "loss": 0.1451, + "step": 4413 + }, + { + "epoch": 0.40668908646980234, + "grad_norm": 0.8543835552665785, + "learning_rate": 3.453364550748533e-06, + "loss": 0.1496, + "step": 4414 + }, + { + "epoch": 0.4067812226470724, + "grad_norm": 0.8201019616645914, + "learning_rate": 3.4526602896682267e-06, + "loss": 0.1435, + "step": 4415 + }, + { + "epoch": 0.4068733588243424, + "grad_norm": 0.8978747617563513, + "learning_rate": 3.451955940133308e-06, + "loss": 0.1436, + "step": 4416 + }, + { + "epoch": 0.4069654950016124, + "grad_norm": 0.9475916174468934, + "learning_rate": 3.451251502209179e-06, + "loss": 0.1572, + "step": 4417 + }, + { + "epoch": 0.4070576311788824, + "grad_norm": 0.9162427080905498, + "learning_rate": 3.4505469759612453e-06, + "loss": 0.1454, + "step": 4418 + }, + { + "epoch": 0.4071497673561524, + "grad_norm": 0.8802358936759178, + "learning_rate": 3.4498423614549226e-06, + "loss": 0.1437, + "step": 4419 + }, + { + "epoch": 0.4072419035334224, + "grad_norm": 0.9874591988178819, + "learning_rate": 3.449137658755635e-06, + "loss": 0.1558, + "step": 4420 + }, + { + "epoch": 0.40733403971069243, + "grad_norm": 0.9103269014973717, + "learning_rate": 3.4484328679288133e-06, + "loss": 0.1512, + "step": 4421 + }, + { + "epoch": 0.4074261758879624, + "grad_norm": 0.8962116901628482, + "learning_rate": 3.4477279890398968e-06, + "loss": 0.1472, + "step": 4422 + }, + { + "epoch": 0.4075183120652324, + "grad_norm": 0.9979364839432481, + "learning_rate": 3.4470230221543362e-06, + "loss": 0.1431, + "step": 4423 + }, + { + "epoch": 0.4076104482425024, + "grad_norm": 0.9124653449987579, + "learning_rate": 3.4463179673375846e-06, + "loss": 0.1426, + "step": 4424 + }, + { + "epoch": 0.4077025844197724, + "grad_norm": 0.9408756063038355, + "learning_rate": 3.445612824655108e-06, + "loss": 0.1414, + "step": 4425 + }, + { + "epoch": 0.4077947205970424, + "grad_norm": 0.9395683151940247, + "learning_rate": 3.4449075941723797e-06, + "loss": 0.1427, + "step": 4426 + }, + { + "epoch": 0.4078868567743124, + "grad_norm": 0.9611998547934455, + "learning_rate": 3.444202275954879e-06, + "loss": 0.1412, + "step": 4427 + }, + { + "epoch": 0.40797899295158246, + "grad_norm": 0.93129468781063, + "learning_rate": 3.443496870068096e-06, + "loss": 0.1483, + "step": 4428 + }, + { + "epoch": 0.40807112912885246, + "grad_norm": 0.9047030358220689, + "learning_rate": 3.442791376577527e-06, + "loss": 0.1372, + "step": 4429 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.9545577553610516, + "learning_rate": 3.4420857955486756e-06, + "loss": 0.1442, + "step": 4430 + }, + { + "epoch": 0.40825540148339245, + "grad_norm": 0.8843025095837417, + "learning_rate": 3.441380127047058e-06, + "loss": 0.1345, + "step": 4431 + }, + { + "epoch": 0.40834753766066245, + "grad_norm": 0.8466673145673613, + "learning_rate": 3.4406743711381945e-06, + "loss": 0.1435, + "step": 4432 + }, + { + "epoch": 0.40843967383793245, + "grad_norm": 0.8921580068825665, + "learning_rate": 3.439968527887614e-06, + "loss": 0.1462, + "step": 4433 + }, + { + "epoch": 0.40853181001520245, + "grad_norm": 0.8755904986298998, + "learning_rate": 3.439262597360855e-06, + "loss": 0.152, + "step": 4434 + }, + { + "epoch": 0.4086239461924725, + "grad_norm": 0.9056162255854322, + "learning_rate": 3.438556579623462e-06, + "loss": 0.1412, + "step": 4435 + }, + { + "epoch": 0.4087160823697425, + "grad_norm": 0.9685429925466471, + "learning_rate": 3.43785047474099e-06, + "loss": 0.1574, + "step": 4436 + }, + { + "epoch": 0.4088082185470125, + "grad_norm": 0.850888377979816, + "learning_rate": 3.437144282779e-06, + "loss": 0.1449, + "step": 4437 + }, + { + "epoch": 0.4089003547242825, + "grad_norm": 0.9337752457104304, + "learning_rate": 3.4364380038030636e-06, + "loss": 0.1647, + "step": 4438 + }, + { + "epoch": 0.4089924909015525, + "grad_norm": 0.8772877594659604, + "learning_rate": 3.435731637878757e-06, + "loss": 0.146, + "step": 4439 + }, + { + "epoch": 0.4090846270788225, + "grad_norm": 0.9383038547513723, + "learning_rate": 3.435025185071668e-06, + "loss": 0.1494, + "step": 4440 + }, + { + "epoch": 0.4091767632560925, + "grad_norm": 0.8908104687401064, + "learning_rate": 3.434318645447388e-06, + "loss": 0.1344, + "step": 4441 + }, + { + "epoch": 0.40926889943336253, + "grad_norm": 0.9462178584246064, + "learning_rate": 3.433612019071523e-06, + "loss": 0.1481, + "step": 4442 + }, + { + "epoch": 0.40936103561063253, + "grad_norm": 1.0324931287459957, + "learning_rate": 3.4329053060096805e-06, + "loss": 0.1564, + "step": 4443 + }, + { + "epoch": 0.4094531717879025, + "grad_norm": 0.9704275908210812, + "learning_rate": 3.4321985063274805e-06, + "loss": 0.1566, + "step": 4444 + }, + { + "epoch": 0.4095453079651725, + "grad_norm": 0.9285469120115527, + "learning_rate": 3.431491620090549e-06, + "loss": 0.1529, + "step": 4445 + }, + { + "epoch": 0.4096374441424425, + "grad_norm": 0.8357706208744242, + "learning_rate": 3.43078464736452e-06, + "loss": 0.1382, + "step": 4446 + }, + { + "epoch": 0.4097295803197125, + "grad_norm": 0.979590666946963, + "learning_rate": 3.4300775882150367e-06, + "loss": 0.1642, + "step": 4447 + }, + { + "epoch": 0.4098217164969825, + "grad_norm": 0.9152729280726183, + "learning_rate": 3.429370442707749e-06, + "loss": 0.1474, + "step": 4448 + }, + { + "epoch": 0.40991385267425257, + "grad_norm": 0.9441617628306267, + "learning_rate": 3.428663210908315e-06, + "loss": 0.1565, + "step": 4449 + }, + { + "epoch": 0.41000598885152256, + "grad_norm": 0.9176715113996435, + "learning_rate": 3.427955892882403e-06, + "loss": 0.1561, + "step": 4450 + }, + { + "epoch": 0.41009812502879256, + "grad_norm": 0.9473640183674535, + "learning_rate": 3.4272484886956856e-06, + "loss": 0.1532, + "step": 4451 + }, + { + "epoch": 0.41019026120606256, + "grad_norm": 0.9388563681988958, + "learning_rate": 3.4265409984138463e-06, + "loss": 0.1552, + "step": 4452 + }, + { + "epoch": 0.41028239738333255, + "grad_norm": 0.8830180332465869, + "learning_rate": 3.4258334221025763e-06, + "loss": 0.1392, + "step": 4453 + }, + { + "epoch": 0.41037453356060255, + "grad_norm": 0.8404506052238401, + "learning_rate": 3.425125759827573e-06, + "loss": 0.1408, + "step": 4454 + }, + { + "epoch": 0.4104666697378726, + "grad_norm": 0.8937046949022973, + "learning_rate": 3.4244180116545434e-06, + "loss": 0.1493, + "step": 4455 + }, + { + "epoch": 0.4105588059151426, + "grad_norm": 0.9179506656858187, + "learning_rate": 3.423710177649202e-06, + "loss": 0.1469, + "step": 4456 + }, + { + "epoch": 0.4106509420924126, + "grad_norm": 0.9052018029466072, + "learning_rate": 3.423002257877271e-06, + "loss": 0.1524, + "step": 4457 + }, + { + "epoch": 0.4107430782696826, + "grad_norm": 0.9531593496243919, + "learning_rate": 3.4222942524044817e-06, + "loss": 0.1635, + "step": 4458 + }, + { + "epoch": 0.4108352144469526, + "grad_norm": 0.8613976632136475, + "learning_rate": 3.4215861612965705e-06, + "loss": 0.1364, + "step": 4459 + }, + { + "epoch": 0.4109273506242226, + "grad_norm": 0.9203983911114743, + "learning_rate": 3.4208779846192856e-06, + "loss": 0.1471, + "step": 4460 + }, + { + "epoch": 0.4110194868014926, + "grad_norm": 0.8893575267649316, + "learning_rate": 3.420169722438381e-06, + "loss": 0.1481, + "step": 4461 + }, + { + "epoch": 0.41111162297876264, + "grad_norm": 0.9481408074859154, + "learning_rate": 3.419461374819618e-06, + "loss": 0.153, + "step": 4462 + }, + { + "epoch": 0.41120375915603263, + "grad_norm": 0.9102576968282057, + "learning_rate": 3.418752941828769e-06, + "loss": 0.1479, + "step": 4463 + }, + { + "epoch": 0.41129589533330263, + "grad_norm": 0.9170697810535563, + "learning_rate": 3.418044423531609e-06, + "loss": 0.1493, + "step": 4464 + }, + { + "epoch": 0.4113880315105726, + "grad_norm": 0.9107203490066708, + "learning_rate": 3.4173358199939253e-06, + "loss": 0.1544, + "step": 4465 + }, + { + "epoch": 0.4114801676878426, + "grad_norm": 0.8864919007877409, + "learning_rate": 3.416627131281513e-06, + "loss": 0.1501, + "step": 4466 + }, + { + "epoch": 0.4115723038651126, + "grad_norm": 0.8923614673943928, + "learning_rate": 3.415918357460173e-06, + "loss": 0.1529, + "step": 4467 + }, + { + "epoch": 0.4116644400423826, + "grad_norm": 0.8625947768008402, + "learning_rate": 3.4152094985957135e-06, + "loss": 0.1537, + "step": 4468 + }, + { + "epoch": 0.41175657621965267, + "grad_norm": 0.8763271138448714, + "learning_rate": 3.4145005547539552e-06, + "loss": 0.1422, + "step": 4469 + }, + { + "epoch": 0.41184871239692267, + "grad_norm": 0.845461883705554, + "learning_rate": 3.413791526000721e-06, + "loss": 0.137, + "step": 4470 + }, + { + "epoch": 0.41194084857419266, + "grad_norm": 0.8670354139637978, + "learning_rate": 3.4130824124018453e-06, + "loss": 0.1487, + "step": 4471 + }, + { + "epoch": 0.41203298475146266, + "grad_norm": 0.9156206700607552, + "learning_rate": 3.4123732140231695e-06, + "loss": 0.1505, + "step": 4472 + }, + { + "epoch": 0.41212512092873266, + "grad_norm": 0.9246364937441267, + "learning_rate": 3.411663930930543e-06, + "loss": 0.1559, + "step": 4473 + }, + { + "epoch": 0.41221725710600265, + "grad_norm": 0.9060481038174606, + "learning_rate": 3.4109545631898223e-06, + "loss": 0.1559, + "step": 4474 + }, + { + "epoch": 0.41230939328327265, + "grad_norm": 0.9347223663635646, + "learning_rate": 3.410245110866872e-06, + "loss": 0.1598, + "step": 4475 + }, + { + "epoch": 0.4124015294605427, + "grad_norm": 0.9055809419343559, + "learning_rate": 3.409535574027565e-06, + "loss": 0.1373, + "step": 4476 + }, + { + "epoch": 0.4124936656378127, + "grad_norm": 0.8832161886506429, + "learning_rate": 3.4088259527377826e-06, + "loss": 0.1453, + "step": 4477 + }, + { + "epoch": 0.4125858018150827, + "grad_norm": 0.9766041924808401, + "learning_rate": 3.408116247063412e-06, + "loss": 0.1624, + "step": 4478 + }, + { + "epoch": 0.4126779379923527, + "grad_norm": 0.9355812719201331, + "learning_rate": 3.407406457070351e-06, + "loss": 0.1439, + "step": 4479 + }, + { + "epoch": 0.4127700741696227, + "grad_norm": 0.9492137205341494, + "learning_rate": 3.4066965828245023e-06, + "loss": 0.1495, + "step": 4480 + }, + { + "epoch": 0.4128622103468927, + "grad_norm": 0.9537352797092123, + "learning_rate": 3.4059866243917784e-06, + "loss": 0.1587, + "step": 4481 + }, + { + "epoch": 0.4129543465241627, + "grad_norm": 0.9084473898134254, + "learning_rate": 3.4052765818380988e-06, + "loss": 0.1528, + "step": 4482 + }, + { + "epoch": 0.41304648270143274, + "grad_norm": 0.8738894354219552, + "learning_rate": 3.4045664552293913e-06, + "loss": 0.1439, + "step": 4483 + }, + { + "epoch": 0.41313861887870273, + "grad_norm": 0.955205886413484, + "learning_rate": 3.4038562446315908e-06, + "loss": 0.1621, + "step": 4484 + }, + { + "epoch": 0.41323075505597273, + "grad_norm": 0.9384210771953125, + "learning_rate": 3.4031459501106412e-06, + "loss": 0.1402, + "step": 4485 + }, + { + "epoch": 0.41332289123324273, + "grad_norm": 0.9346661196851742, + "learning_rate": 3.4024355717324927e-06, + "loss": 0.1599, + "step": 4486 + }, + { + "epoch": 0.4134150274105127, + "grad_norm": 0.8793320839571395, + "learning_rate": 3.4017251095631044e-06, + "loss": 0.15, + "step": 4487 + }, + { + "epoch": 0.4135071635877827, + "grad_norm": 0.976370275498756, + "learning_rate": 3.401014563668442e-06, + "loss": 0.1457, + "step": 4488 + }, + { + "epoch": 0.4135992997650528, + "grad_norm": 0.8577892362812106, + "learning_rate": 3.4003039341144807e-06, + "loss": 0.1381, + "step": 4489 + }, + { + "epoch": 0.41369143594232277, + "grad_norm": 0.8677820161364236, + "learning_rate": 3.3995932209672028e-06, + "loss": 0.1457, + "step": 4490 + }, + { + "epoch": 0.41378357211959277, + "grad_norm": 0.9488229913826759, + "learning_rate": 3.3988824242925965e-06, + "loss": 0.1516, + "step": 4491 + }, + { + "epoch": 0.41387570829686277, + "grad_norm": 0.8646831069607954, + "learning_rate": 3.398171544156661e-06, + "loss": 0.1294, + "step": 4492 + }, + { + "epoch": 0.41396784447413276, + "grad_norm": 1.0109574950040325, + "learning_rate": 3.3974605806254015e-06, + "loss": 0.1455, + "step": 4493 + }, + { + "epoch": 0.41405998065140276, + "grad_norm": 0.862225186168334, + "learning_rate": 3.3967495337648297e-06, + "loss": 0.1471, + "step": 4494 + }, + { + "epoch": 0.41415211682867276, + "grad_norm": 0.9281967815655426, + "learning_rate": 3.396038403640968e-06, + "loss": 0.1588, + "step": 4495 + }, + { + "epoch": 0.4142442530059428, + "grad_norm": 0.9404582510399649, + "learning_rate": 3.395327190319843e-06, + "loss": 0.1556, + "step": 4496 + }, + { + "epoch": 0.4143363891832128, + "grad_norm": 0.871442207254464, + "learning_rate": 3.394615893867492e-06, + "loss": 0.1425, + "step": 4497 + }, + { + "epoch": 0.4144285253604828, + "grad_norm": 0.9006939834301403, + "learning_rate": 3.3939045143499604e-06, + "loss": 0.1392, + "step": 4498 + }, + { + "epoch": 0.4145206615377528, + "grad_norm": 0.8677132626890294, + "learning_rate": 3.393193051833297e-06, + "loss": 0.146, + "step": 4499 + }, + { + "epoch": 0.4146127977150228, + "grad_norm": 0.8750036614339674, + "learning_rate": 3.392481506383563e-06, + "loss": 0.1557, + "step": 4500 + }, + { + "epoch": 0.4146127977150228, + "eval_loss": 0.14849522709846497, + "eval_runtime": 299.3682, + "eval_samples_per_second": 23.439, + "eval_steps_per_second": 2.933, + "step": 4500 + }, + { + "epoch": 0.4147049338922928, + "grad_norm": 0.9083364571977722, + "learning_rate": 3.391769878066825e-06, + "loss": 0.1554, + "step": 4501 + }, + { + "epoch": 0.4147970700695628, + "grad_norm": 0.9037418164648761, + "learning_rate": 3.391058166949159e-06, + "loss": 0.1483, + "step": 4502 + }, + { + "epoch": 0.41488920624683284, + "grad_norm": 0.9011614056787858, + "learning_rate": 3.390346373096645e-06, + "loss": 0.1622, + "step": 4503 + }, + { + "epoch": 0.41498134242410284, + "grad_norm": 0.935184827582457, + "learning_rate": 3.3896344965753746e-06, + "loss": 0.1542, + "step": 4504 + }, + { + "epoch": 0.41507347860137284, + "grad_norm": 0.9172362705852074, + "learning_rate": 3.3889225374514455e-06, + "loss": 0.1522, + "step": 4505 + }, + { + "epoch": 0.41516561477864283, + "grad_norm": 0.8595824614921564, + "learning_rate": 3.388210495790964e-06, + "loss": 0.1447, + "step": 4506 + }, + { + "epoch": 0.41525775095591283, + "grad_norm": 0.8785044781998784, + "learning_rate": 3.3874983716600414e-06, + "loss": 0.1459, + "step": 4507 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 0.9537125508997492, + "learning_rate": 3.3867861651247997e-06, + "loss": 0.1605, + "step": 4508 + }, + { + "epoch": 0.4154420233104528, + "grad_norm": 0.89864935320374, + "learning_rate": 3.3860738762513674e-06, + "loss": 0.1539, + "step": 4509 + }, + { + "epoch": 0.4155341594877229, + "grad_norm": 0.9531899881558427, + "learning_rate": 3.3853615051058798e-06, + "loss": 0.1476, + "step": 4510 + }, + { + "epoch": 0.4156262956649929, + "grad_norm": 0.8988810611214069, + "learning_rate": 3.384649051754481e-06, + "loss": 0.1453, + "step": 4511 + }, + { + "epoch": 0.41571843184226287, + "grad_norm": 0.9566810993912842, + "learning_rate": 3.3839365162633237e-06, + "loss": 0.1554, + "step": 4512 + }, + { + "epoch": 0.41581056801953287, + "grad_norm": 0.9267067824355266, + "learning_rate": 3.3832238986985643e-06, + "loss": 0.1581, + "step": 4513 + }, + { + "epoch": 0.41590270419680286, + "grad_norm": 0.9276426644752526, + "learning_rate": 3.382511199126372e-06, + "loss": 0.1353, + "step": 4514 + }, + { + "epoch": 0.41599484037407286, + "grad_norm": 0.9891793471815831, + "learning_rate": 3.3817984176129194e-06, + "loss": 0.1456, + "step": 4515 + }, + { + "epoch": 0.41608697655134286, + "grad_norm": 0.8778307632415944, + "learning_rate": 3.3810855542243892e-06, + "loss": 0.1417, + "step": 4516 + }, + { + "epoch": 0.4161791127286129, + "grad_norm": 0.8922929124760213, + "learning_rate": 3.38037260902697e-06, + "loss": 0.146, + "step": 4517 + }, + { + "epoch": 0.4162712489058829, + "grad_norm": 0.8832533852460722, + "learning_rate": 3.3796595820868596e-06, + "loss": 0.1403, + "step": 4518 + }, + { + "epoch": 0.4163633850831529, + "grad_norm": 0.9314412639880251, + "learning_rate": 3.378946473470262e-06, + "loss": 0.1535, + "step": 4519 + }, + { + "epoch": 0.4164555212604229, + "grad_norm": 0.8383119728511532, + "learning_rate": 3.37823328324339e-06, + "loss": 0.1357, + "step": 4520 + }, + { + "epoch": 0.4165476574376929, + "grad_norm": 0.9162340271668836, + "learning_rate": 3.3775200114724632e-06, + "loss": 0.1479, + "step": 4521 + }, + { + "epoch": 0.4166397936149629, + "grad_norm": 0.9381510485047666, + "learning_rate": 3.3768066582237084e-06, + "loss": 0.162, + "step": 4522 + }, + { + "epoch": 0.41673192979223295, + "grad_norm": 0.903114085112653, + "learning_rate": 3.3760932235633614e-06, + "loss": 0.1555, + "step": 4523 + }, + { + "epoch": 0.41682406596950294, + "grad_norm": 0.9275753356424354, + "learning_rate": 3.3753797075576646e-06, + "loss": 0.1515, + "step": 4524 + }, + { + "epoch": 0.41691620214677294, + "grad_norm": 0.896898839228486, + "learning_rate": 3.374666110272868e-06, + "loss": 0.137, + "step": 4525 + }, + { + "epoch": 0.41700833832404294, + "grad_norm": 0.8418394217228213, + "learning_rate": 3.3739524317752276e-06, + "loss": 0.1297, + "step": 4526 + }, + { + "epoch": 0.41710047450131293, + "grad_norm": 0.8178077340704089, + "learning_rate": 3.373238672131011e-06, + "loss": 0.1383, + "step": 4527 + }, + { + "epoch": 0.41719261067858293, + "grad_norm": 0.9015807010863438, + "learning_rate": 3.372524831406489e-06, + "loss": 0.1405, + "step": 4528 + }, + { + "epoch": 0.41728474685585293, + "grad_norm": 0.9091807204242067, + "learning_rate": 3.371810909667943e-06, + "loss": 0.1425, + "step": 4529 + }, + { + "epoch": 0.417376883033123, + "grad_norm": 0.8591056164974364, + "learning_rate": 3.37109690698166e-06, + "loss": 0.1298, + "step": 4530 + }, + { + "epoch": 0.417469019210393, + "grad_norm": 0.8970907735401513, + "learning_rate": 3.3703828234139357e-06, + "loss": 0.1335, + "step": 4531 + }, + { + "epoch": 0.417561155387663, + "grad_norm": 0.9376572782577746, + "learning_rate": 3.369668659031072e-06, + "loss": 0.1542, + "step": 4532 + }, + { + "epoch": 0.41765329156493297, + "grad_norm": 0.9497750415555174, + "learning_rate": 3.368954413899381e-06, + "loss": 0.1557, + "step": 4533 + }, + { + "epoch": 0.41774542774220297, + "grad_norm": 0.7958487365461181, + "learning_rate": 3.368240088085177e-06, + "loss": 0.1361, + "step": 4534 + }, + { + "epoch": 0.41783756391947297, + "grad_norm": 0.9245976345181074, + "learning_rate": 3.367525681654789e-06, + "loss": 0.1636, + "step": 4535 + }, + { + "epoch": 0.41792970009674296, + "grad_norm": 0.8430081265658356, + "learning_rate": 3.366811194674548e-06, + "loss": 0.1275, + "step": 4536 + }, + { + "epoch": 0.418021836274013, + "grad_norm": 0.9432573855683106, + "learning_rate": 3.3660966272107943e-06, + "loss": 0.1401, + "step": 4537 + }, + { + "epoch": 0.418113972451283, + "grad_norm": 0.878180759208677, + "learning_rate": 3.365381979329875e-06, + "loss": 0.1425, + "step": 4538 + }, + { + "epoch": 0.418206108628553, + "grad_norm": 0.8842514952556141, + "learning_rate": 3.3646672510981458e-06, + "loss": 0.1518, + "step": 4539 + }, + { + "epoch": 0.418298244805823, + "grad_norm": 0.890925588999716, + "learning_rate": 3.363952442581969e-06, + "loss": 0.1453, + "step": 4540 + }, + { + "epoch": 0.418390380983093, + "grad_norm": 0.8994367794628658, + "learning_rate": 3.3632375538477165e-06, + "loss": 0.1379, + "step": 4541 + }, + { + "epoch": 0.418482517160363, + "grad_norm": 1.0133084304207123, + "learning_rate": 3.3625225849617625e-06, + "loss": 0.1578, + "step": 4542 + }, + { + "epoch": 0.418574653337633, + "grad_norm": 0.9206787724545995, + "learning_rate": 3.3618075359904946e-06, + "loss": 0.1376, + "step": 4543 + }, + { + "epoch": 0.41866678951490305, + "grad_norm": 0.9561681753325468, + "learning_rate": 3.361092407000304e-06, + "loss": 0.1585, + "step": 4544 + }, + { + "epoch": 0.41875892569217305, + "grad_norm": 0.8866968591572271, + "learning_rate": 3.3603771980575907e-06, + "loss": 0.1444, + "step": 4545 + }, + { + "epoch": 0.41885106186944304, + "grad_norm": 0.9398479480714228, + "learning_rate": 3.359661909228762e-06, + "loss": 0.159, + "step": 4546 + }, + { + "epoch": 0.41894319804671304, + "grad_norm": 0.9582293954402347, + "learning_rate": 3.3589465405802324e-06, + "loss": 0.1441, + "step": 4547 + }, + { + "epoch": 0.41903533422398304, + "grad_norm": 0.9766029174637952, + "learning_rate": 3.358231092178424e-06, + "loss": 0.1474, + "step": 4548 + }, + { + "epoch": 0.41912747040125303, + "grad_norm": 0.8689780855332151, + "learning_rate": 3.3575155640897666e-06, + "loss": 0.1414, + "step": 4549 + }, + { + "epoch": 0.41921960657852303, + "grad_norm": 0.897722415255773, + "learning_rate": 3.356799956380697e-06, + "loss": 0.1402, + "step": 4550 + }, + { + "epoch": 0.4193117427557931, + "grad_norm": 0.8923317845839843, + "learning_rate": 3.3560842691176583e-06, + "loss": 0.1449, + "step": 4551 + }, + { + "epoch": 0.4194038789330631, + "grad_norm": 0.9314612085458115, + "learning_rate": 3.355368502367104e-06, + "loss": 0.1467, + "step": 4552 + }, + { + "epoch": 0.4194960151103331, + "grad_norm": 0.8821129071299318, + "learning_rate": 3.354652656195492e-06, + "loss": 0.1522, + "step": 4553 + }, + { + "epoch": 0.4195881512876031, + "grad_norm": 0.9219280388280229, + "learning_rate": 3.3539367306692884e-06, + "loss": 0.1479, + "step": 4554 + }, + { + "epoch": 0.41968028746487307, + "grad_norm": 0.894711504158373, + "learning_rate": 3.3532207258549676e-06, + "loss": 0.1422, + "step": 4555 + }, + { + "epoch": 0.41977242364214307, + "grad_norm": 0.8375395309893726, + "learning_rate": 3.352504641819011e-06, + "loss": 0.1351, + "step": 4556 + }, + { + "epoch": 0.4198645598194131, + "grad_norm": 0.9230302453034883, + "learning_rate": 3.3517884786279065e-06, + "loss": 0.1455, + "step": 4557 + }, + { + "epoch": 0.4199566959966831, + "grad_norm": 0.8746154702964645, + "learning_rate": 3.3510722363481505e-06, + "loss": 0.1318, + "step": 4558 + }, + { + "epoch": 0.4200488321739531, + "grad_norm": 0.992793698546288, + "learning_rate": 3.350355915046245e-06, + "loss": 0.1579, + "step": 4559 + }, + { + "epoch": 0.4201409683512231, + "grad_norm": 0.9214331623204591, + "learning_rate": 3.3496395147887017e-06, + "loss": 0.1434, + "step": 4560 + }, + { + "epoch": 0.4202331045284931, + "grad_norm": 0.9457222587582186, + "learning_rate": 3.348923035642038e-06, + "loss": 0.1534, + "step": 4561 + }, + { + "epoch": 0.4203252407057631, + "grad_norm": 0.956493474906948, + "learning_rate": 3.3482064776727784e-06, + "loss": 0.1621, + "step": 4562 + }, + { + "epoch": 0.4204173768830331, + "grad_norm": 0.8780357825071199, + "learning_rate": 3.3474898409474573e-06, + "loss": 0.1429, + "step": 4563 + }, + { + "epoch": 0.42050951306030315, + "grad_norm": 1.0473299857234213, + "learning_rate": 3.3467731255326123e-06, + "loss": 0.1698, + "step": 4564 + }, + { + "epoch": 0.42060164923757315, + "grad_norm": 0.9885364033480389, + "learning_rate": 3.346056331494792e-06, + "loss": 0.1444, + "step": 4565 + }, + { + "epoch": 0.42069378541484315, + "grad_norm": 0.8701414593375438, + "learning_rate": 3.34533945890055e-06, + "loss": 0.1327, + "step": 4566 + }, + { + "epoch": 0.42078592159211314, + "grad_norm": 0.7992360530702873, + "learning_rate": 3.344622507816448e-06, + "loss": 0.1254, + "step": 4567 + }, + { + "epoch": 0.42087805776938314, + "grad_norm": 0.8571525200902486, + "learning_rate": 3.343905478309056e-06, + "loss": 0.1392, + "step": 4568 + }, + { + "epoch": 0.42097019394665314, + "grad_norm": 1.1164661745110709, + "learning_rate": 3.3431883704449485e-06, + "loss": 0.165, + "step": 4569 + }, + { + "epoch": 0.42106233012392313, + "grad_norm": 1.0685661318077153, + "learning_rate": 3.342471184290711e-06, + "loss": 0.1468, + "step": 4570 + }, + { + "epoch": 0.4211544663011932, + "grad_norm": 1.0295348039132994, + "learning_rate": 3.3417539199129327e-06, + "loss": 0.1755, + "step": 4571 + }, + { + "epoch": 0.4212466024784632, + "grad_norm": 0.8852635534731436, + "learning_rate": 3.341036577378213e-06, + "loss": 0.141, + "step": 4572 + }, + { + "epoch": 0.4213387386557332, + "grad_norm": 1.0333297539743211, + "learning_rate": 3.3403191567531563e-06, + "loss": 0.1612, + "step": 4573 + }, + { + "epoch": 0.4214308748330032, + "grad_norm": 0.9589308822078715, + "learning_rate": 3.3396016581043757e-06, + "loss": 0.1416, + "step": 4574 + }, + { + "epoch": 0.4215230110102732, + "grad_norm": 1.0447741475735821, + "learning_rate": 3.3388840814984896e-06, + "loss": 0.1674, + "step": 4575 + }, + { + "epoch": 0.42161514718754317, + "grad_norm": 0.9598745446739234, + "learning_rate": 3.3381664270021273e-06, + "loss": 0.1526, + "step": 4576 + }, + { + "epoch": 0.42170728336481317, + "grad_norm": 0.8744813212867445, + "learning_rate": 3.337448694681922e-06, + "loss": 0.1466, + "step": 4577 + }, + { + "epoch": 0.4217994195420832, + "grad_norm": 0.8700235852673424, + "learning_rate": 3.3367308846045155e-06, + "loss": 0.1504, + "step": 4578 + }, + { + "epoch": 0.4218915557193532, + "grad_norm": 0.9462197033773357, + "learning_rate": 3.3360129968365556e-06, + "loss": 0.1571, + "step": 4579 + }, + { + "epoch": 0.4219836918966232, + "grad_norm": 1.016930154691944, + "learning_rate": 3.335295031444699e-06, + "loss": 0.1627, + "step": 4580 + }, + { + "epoch": 0.4220758280738932, + "grad_norm": 1.0256579540504431, + "learning_rate": 3.3345769884956097e-06, + "loss": 0.1592, + "step": 4581 + }, + { + "epoch": 0.4221679642511632, + "grad_norm": 0.8357570594016366, + "learning_rate": 3.3338588680559565e-06, + "loss": 0.1362, + "step": 4582 + }, + { + "epoch": 0.4222601004284332, + "grad_norm": 0.8566130957892835, + "learning_rate": 3.3331406701924173e-06, + "loss": 0.1412, + "step": 4583 + }, + { + "epoch": 0.4223522366057032, + "grad_norm": 1.0122900400853339, + "learning_rate": 3.3324223949716783e-06, + "loss": 0.1658, + "step": 4584 + }, + { + "epoch": 0.42244437278297325, + "grad_norm": 1.026159322299466, + "learning_rate": 3.3317040424604296e-06, + "loss": 0.1389, + "step": 4585 + }, + { + "epoch": 0.42253650896024325, + "grad_norm": 0.9442753705533472, + "learning_rate": 3.330985612725371e-06, + "loss": 0.1301, + "step": 4586 + }, + { + "epoch": 0.42262864513751325, + "grad_norm": 0.9828708879208898, + "learning_rate": 3.330267105833209e-06, + "loss": 0.156, + "step": 4587 + }, + { + "epoch": 0.42272078131478324, + "grad_norm": 0.9244268518849134, + "learning_rate": 3.3295485218506568e-06, + "loss": 0.1427, + "step": 4588 + }, + { + "epoch": 0.42281291749205324, + "grad_norm": 0.9784915828014263, + "learning_rate": 3.328829860844435e-06, + "loss": 0.1543, + "step": 4589 + }, + { + "epoch": 0.42290505366932324, + "grad_norm": 0.9435802145767377, + "learning_rate": 3.328111122881272e-06, + "loss": 0.1499, + "step": 4590 + }, + { + "epoch": 0.4229971898465933, + "grad_norm": 0.9158661147876648, + "learning_rate": 3.327392308027902e-06, + "loss": 0.1486, + "step": 4591 + }, + { + "epoch": 0.4230893260238633, + "grad_norm": 0.9137787014812229, + "learning_rate": 3.3266734163510668e-06, + "loss": 0.1423, + "step": 4592 + }, + { + "epoch": 0.4231814622011333, + "grad_norm": 0.9229747858346737, + "learning_rate": 3.325954447917516e-06, + "loss": 0.145, + "step": 4593 + }, + { + "epoch": 0.4232735983784033, + "grad_norm": 0.8730894128266261, + "learning_rate": 3.3252354027940055e-06, + "loss": 0.149, + "step": 4594 + }, + { + "epoch": 0.4233657345556733, + "grad_norm": 0.9057946535086321, + "learning_rate": 3.3245162810472998e-06, + "loss": 0.155, + "step": 4595 + }, + { + "epoch": 0.4234578707329433, + "grad_norm": 0.8590449088625499, + "learning_rate": 3.323797082744168e-06, + "loss": 0.1412, + "step": 4596 + }, + { + "epoch": 0.4235500069102133, + "grad_norm": 0.8387324971373283, + "learning_rate": 3.3230778079513883e-06, + "loss": 0.1345, + "step": 4597 + }, + { + "epoch": 0.4236421430874833, + "grad_norm": 0.7968624023810995, + "learning_rate": 3.3223584567357458e-06, + "loss": 0.1398, + "step": 4598 + }, + { + "epoch": 0.4237342792647533, + "grad_norm": 0.9163589605700169, + "learning_rate": 3.3216390291640327e-06, + "loss": 0.1479, + "step": 4599 + }, + { + "epoch": 0.4238264154420233, + "grad_norm": 0.9194471612561067, + "learning_rate": 3.320919525303047e-06, + "loss": 0.1504, + "step": 4600 + }, + { + "epoch": 0.4239185516192933, + "grad_norm": 0.8551752072019035, + "learning_rate": 3.3201999452195942e-06, + "loss": 0.1497, + "step": 4601 + }, + { + "epoch": 0.4240106877965633, + "grad_norm": 0.8814315296349494, + "learning_rate": 3.3194802889804887e-06, + "loss": 0.1511, + "step": 4602 + }, + { + "epoch": 0.4241028239738333, + "grad_norm": 0.8960260768130908, + "learning_rate": 3.318760556652551e-06, + "loss": 0.1413, + "step": 4603 + }, + { + "epoch": 0.4241949601511033, + "grad_norm": 0.9430328687544396, + "learning_rate": 3.318040748302606e-06, + "loss": 0.1566, + "step": 4604 + }, + { + "epoch": 0.42428709632837336, + "grad_norm": 0.8505985044861099, + "learning_rate": 3.317320863997491e-06, + "loss": 0.1383, + "step": 4605 + }, + { + "epoch": 0.42437923250564336, + "grad_norm": 0.9368280598249041, + "learning_rate": 3.316600903804045e-06, + "loss": 0.1509, + "step": 4606 + }, + { + "epoch": 0.42447136868291335, + "grad_norm": 0.9237871325274064, + "learning_rate": 3.3158808677891167e-06, + "loss": 0.1571, + "step": 4607 + }, + { + "epoch": 0.42456350486018335, + "grad_norm": 0.9214272756644586, + "learning_rate": 3.315160756019563e-06, + "loss": 0.1448, + "step": 4608 + }, + { + "epoch": 0.42465564103745335, + "grad_norm": 0.9531573937406549, + "learning_rate": 3.314440568562245e-06, + "loss": 0.149, + "step": 4609 + }, + { + "epoch": 0.42474777721472334, + "grad_norm": 0.8698615665529474, + "learning_rate": 3.3137203054840323e-06, + "loss": 0.1444, + "step": 4610 + }, + { + "epoch": 0.42483991339199334, + "grad_norm": 0.8876415171072912, + "learning_rate": 3.312999966851802e-06, + "loss": 0.1431, + "step": 4611 + }, + { + "epoch": 0.4249320495692634, + "grad_norm": 0.9031438952079939, + "learning_rate": 3.3122795527324374e-06, + "loss": 0.1436, + "step": 4612 + }, + { + "epoch": 0.4250241857465334, + "grad_norm": 0.9283680531824011, + "learning_rate": 3.3115590631928284e-06, + "loss": 0.1478, + "step": 4613 + }, + { + "epoch": 0.4251163219238034, + "grad_norm": 0.949901482292331, + "learning_rate": 3.3108384982998736e-06, + "loss": 0.1534, + "step": 4614 + }, + { + "epoch": 0.4252084581010734, + "grad_norm": 0.8647134104622481, + "learning_rate": 3.310117858120476e-06, + "loss": 0.144, + "step": 4615 + }, + { + "epoch": 0.4253005942783434, + "grad_norm": 0.9460010594500424, + "learning_rate": 3.3093971427215497e-06, + "loss": 0.1514, + "step": 4616 + }, + { + "epoch": 0.4253927304556134, + "grad_norm": 0.8916111121637049, + "learning_rate": 3.3086763521700105e-06, + "loss": 0.1418, + "step": 4617 + }, + { + "epoch": 0.4254848666328834, + "grad_norm": 0.9277173612977886, + "learning_rate": 3.307955486532785e-06, + "loss": 0.1511, + "step": 4618 + }, + { + "epoch": 0.4255770028101534, + "grad_norm": 0.9048532366246484, + "learning_rate": 3.3072345458768063e-06, + "loss": 0.1489, + "step": 4619 + }, + { + "epoch": 0.4256691389874234, + "grad_norm": 0.9137996003679597, + "learning_rate": 3.306513530269012e-06, + "loss": 0.1414, + "step": 4620 + }, + { + "epoch": 0.4257612751646934, + "grad_norm": 0.890587578081525, + "learning_rate": 3.30579243977635e-06, + "loss": 0.147, + "step": 4621 + }, + { + "epoch": 0.4258534113419634, + "grad_norm": 0.8890025500786466, + "learning_rate": 3.305071274465774e-06, + "loss": 0.1409, + "step": 4622 + }, + { + "epoch": 0.4259455475192334, + "grad_norm": 0.9500786995375841, + "learning_rate": 3.304350034404243e-06, + "loss": 0.146, + "step": 4623 + }, + { + "epoch": 0.4260376836965034, + "grad_norm": 0.8989819546643587, + "learning_rate": 3.3036287196587245e-06, + "loss": 0.1493, + "step": 4624 + }, + { + "epoch": 0.42612981987377346, + "grad_norm": 0.8978782164819534, + "learning_rate": 3.3029073302961933e-06, + "loss": 0.1507, + "step": 4625 + }, + { + "epoch": 0.42622195605104346, + "grad_norm": 0.8881037216614471, + "learning_rate": 3.3021858663836302e-06, + "loss": 0.1488, + "step": 4626 + }, + { + "epoch": 0.42631409222831346, + "grad_norm": 0.9523384549504138, + "learning_rate": 3.301464327988023e-06, + "loss": 0.1527, + "step": 4627 + }, + { + "epoch": 0.42640622840558345, + "grad_norm": 0.9354390642807312, + "learning_rate": 3.300742715176366e-06, + "loss": 0.1607, + "step": 4628 + }, + { + "epoch": 0.42649836458285345, + "grad_norm": 0.9184053064450814, + "learning_rate": 3.300021028015662e-06, + "loss": 0.163, + "step": 4629 + }, + { + "epoch": 0.42659050076012345, + "grad_norm": 1.0349141173183476, + "learning_rate": 3.29929926657292e-06, + "loss": 0.1473, + "step": 4630 + }, + { + "epoch": 0.42668263693739344, + "grad_norm": 0.9429341556872202, + "learning_rate": 3.298577430915155e-06, + "loss": 0.1457, + "step": 4631 + }, + { + "epoch": 0.4267747731146635, + "grad_norm": 0.9133407174487953, + "learning_rate": 3.297855521109389e-06, + "loss": 0.1428, + "step": 4632 + }, + { + "epoch": 0.4268669092919335, + "grad_norm": 0.9300103407525209, + "learning_rate": 3.297133537222652e-06, + "loss": 0.1494, + "step": 4633 + }, + { + "epoch": 0.4269590454692035, + "grad_norm": 1.0055639970528747, + "learning_rate": 3.2964114793219802e-06, + "loss": 0.1581, + "step": 4634 + }, + { + "epoch": 0.4270511816464735, + "grad_norm": 0.923588698817885, + "learning_rate": 3.2956893474744177e-06, + "loss": 0.1634, + "step": 4635 + }, + { + "epoch": 0.4271433178237435, + "grad_norm": 0.965170088385289, + "learning_rate": 3.294967141747013e-06, + "loss": 0.157, + "step": 4636 + }, + { + "epoch": 0.4272354540010135, + "grad_norm": 0.9313997319447985, + "learning_rate": 3.294244862206824e-06, + "loss": 0.147, + "step": 4637 + }, + { + "epoch": 0.4273275901782835, + "grad_norm": 0.9533289369081304, + "learning_rate": 3.293522508920914e-06, + "loss": 0.15, + "step": 4638 + }, + { + "epoch": 0.42741972635555353, + "grad_norm": 0.9066220272117559, + "learning_rate": 3.292800081956354e-06, + "loss": 0.1534, + "step": 4639 + }, + { + "epoch": 0.42751186253282353, + "grad_norm": 0.9284221041524333, + "learning_rate": 3.29207758138022e-06, + "loss": 0.1683, + "step": 4640 + }, + { + "epoch": 0.4276039987100935, + "grad_norm": 0.9581504482495421, + "learning_rate": 3.2913550072595986e-06, + "loss": 0.152, + "step": 4641 + }, + { + "epoch": 0.4276961348873635, + "grad_norm": 0.9272473768699457, + "learning_rate": 3.290632359661578e-06, + "loss": 0.1514, + "step": 4642 + }, + { + "epoch": 0.4277882710646335, + "grad_norm": 0.91905411911038, + "learning_rate": 3.289909638653259e-06, + "loss": 0.155, + "step": 4643 + }, + { + "epoch": 0.4278804072419035, + "grad_norm": 0.8779240052127671, + "learning_rate": 3.289186844301745e-06, + "loss": 0.1467, + "step": 4644 + }, + { + "epoch": 0.4279725434191735, + "grad_norm": 0.92752988914161, + "learning_rate": 3.2884639766741473e-06, + "loss": 0.1503, + "step": 4645 + }, + { + "epoch": 0.42806467959644356, + "grad_norm": 0.9044292622735791, + "learning_rate": 3.2877410358375845e-06, + "loss": 0.1484, + "step": 4646 + }, + { + "epoch": 0.42815681577371356, + "grad_norm": 0.98783505304058, + "learning_rate": 3.287018021859182e-06, + "loss": 0.1379, + "step": 4647 + }, + { + "epoch": 0.42824895195098356, + "grad_norm": 0.9153357355574386, + "learning_rate": 3.2862949348060707e-06, + "loss": 0.1485, + "step": 4648 + }, + { + "epoch": 0.42834108812825356, + "grad_norm": 0.8891298445580663, + "learning_rate": 3.285571774745391e-06, + "loss": 0.145, + "step": 4649 + }, + { + "epoch": 0.42843322430552355, + "grad_norm": 0.9539432888562989, + "learning_rate": 3.2848485417442867e-06, + "loss": 0.143, + "step": 4650 + }, + { + "epoch": 0.42852536048279355, + "grad_norm": 0.9597047185512093, + "learning_rate": 3.2841252358699115e-06, + "loss": 0.1428, + "step": 4651 + }, + { + "epoch": 0.4286174966600636, + "grad_norm": 0.9896270645503378, + "learning_rate": 3.2834018571894233e-06, + "loss": 0.1447, + "step": 4652 + }, + { + "epoch": 0.4287096328373336, + "grad_norm": 1.0332260681499266, + "learning_rate": 3.282678405769988e-06, + "loss": 0.1542, + "step": 4653 + }, + { + "epoch": 0.4288017690146036, + "grad_norm": 0.9539150898547636, + "learning_rate": 3.2819548816787794e-06, + "loss": 0.1526, + "step": 4654 + }, + { + "epoch": 0.4288939051918736, + "grad_norm": 0.8810375273548426, + "learning_rate": 3.2812312849829754e-06, + "loss": 0.1399, + "step": 4655 + }, + { + "epoch": 0.4289860413691436, + "grad_norm": 0.8775173004031491, + "learning_rate": 3.280507615749763e-06, + "loss": 0.1401, + "step": 4656 + }, + { + "epoch": 0.4290781775464136, + "grad_norm": 0.9351979870510768, + "learning_rate": 3.279783874046334e-06, + "loss": 0.1516, + "step": 4657 + }, + { + "epoch": 0.4291703137236836, + "grad_norm": 0.9308439149797066, + "learning_rate": 3.2790600599398882e-06, + "loss": 0.1416, + "step": 4658 + }, + { + "epoch": 0.42926244990095364, + "grad_norm": 0.9622148973196127, + "learning_rate": 3.2783361734976325e-06, + "loss": 0.1585, + "step": 4659 + }, + { + "epoch": 0.42935458607822363, + "grad_norm": 0.8675676981158611, + "learning_rate": 3.2776122147867782e-06, + "loss": 0.1388, + "step": 4660 + }, + { + "epoch": 0.42944672225549363, + "grad_norm": 0.8296889413789598, + "learning_rate": 3.276888183874547e-06, + "loss": 0.1348, + "step": 4661 + }, + { + "epoch": 0.4295388584327636, + "grad_norm": 0.8858689329829845, + "learning_rate": 3.2761640808281647e-06, + "loss": 0.1405, + "step": 4662 + }, + { + "epoch": 0.4296309946100336, + "grad_norm": 0.9024337133238689, + "learning_rate": 3.275439905714863e-06, + "loss": 0.1478, + "step": 4663 + }, + { + "epoch": 0.4297231307873036, + "grad_norm": 0.9342100871010554, + "learning_rate": 3.274715658601883e-06, + "loss": 0.1545, + "step": 4664 + }, + { + "epoch": 0.4298152669645736, + "grad_norm": 0.906760939816061, + "learning_rate": 3.273991339556471e-06, + "loss": 0.144, + "step": 4665 + }, + { + "epoch": 0.42990740314184367, + "grad_norm": 0.9287086900859022, + "learning_rate": 3.2732669486458796e-06, + "loss": 0.1617, + "step": 4666 + }, + { + "epoch": 0.42999953931911367, + "grad_norm": 0.8811760608349966, + "learning_rate": 3.272542485937369e-06, + "loss": 0.1321, + "step": 4667 + }, + { + "epoch": 0.43009167549638366, + "grad_norm": 0.904797164755038, + "learning_rate": 3.271817951498205e-06, + "loss": 0.1404, + "step": 4668 + }, + { + "epoch": 0.43018381167365366, + "grad_norm": 0.8849305530273994, + "learning_rate": 3.271093345395661e-06, + "loss": 0.146, + "step": 4669 + }, + { + "epoch": 0.43027594785092366, + "grad_norm": 0.8885015854103672, + "learning_rate": 3.270368667697018e-06, + "loss": 0.145, + "step": 4670 + }, + { + "epoch": 0.43036808402819365, + "grad_norm": 0.8259412462687333, + "learning_rate": 3.2696439184695606e-06, + "loss": 0.124, + "step": 4671 + }, + { + "epoch": 0.43046022020546365, + "grad_norm": 0.8918841353411899, + "learning_rate": 3.2689190977805822e-06, + "loss": 0.1293, + "step": 4672 + }, + { + "epoch": 0.4305523563827337, + "grad_norm": 0.9323176495034798, + "learning_rate": 3.2681942056973838e-06, + "loss": 0.141, + "step": 4673 + }, + { + "epoch": 0.4306444925600037, + "grad_norm": 0.9326715643680737, + "learning_rate": 3.26746924228727e-06, + "loss": 0.1574, + "step": 4674 + }, + { + "epoch": 0.4307366287372737, + "grad_norm": 0.8857621090497365, + "learning_rate": 3.2667442076175543e-06, + "loss": 0.1416, + "step": 4675 + }, + { + "epoch": 0.4308287649145437, + "grad_norm": 0.9038051971376051, + "learning_rate": 3.2660191017555567e-06, + "loss": 0.1414, + "step": 4676 + }, + { + "epoch": 0.4309209010918137, + "grad_norm": 0.9216378597983858, + "learning_rate": 3.2652939247686027e-06, + "loss": 0.1366, + "step": 4677 + }, + { + "epoch": 0.4310130372690837, + "grad_norm": 0.9440044237173141, + "learning_rate": 3.2645686767240263e-06, + "loss": 0.1478, + "step": 4678 + }, + { + "epoch": 0.4311051734463537, + "grad_norm": 0.9933274011071618, + "learning_rate": 3.2638433576891647e-06, + "loss": 0.1602, + "step": 4679 + }, + { + "epoch": 0.43119730962362374, + "grad_norm": 0.9203155714472351, + "learning_rate": 3.263117967731366e-06, + "loss": 0.1558, + "step": 4680 + }, + { + "epoch": 0.43128944580089373, + "grad_norm": 0.8740835069784506, + "learning_rate": 3.2623925069179817e-06, + "loss": 0.1335, + "step": 4681 + }, + { + "epoch": 0.43138158197816373, + "grad_norm": 1.0122592203806835, + "learning_rate": 3.2616669753163717e-06, + "loss": 0.1554, + "step": 4682 + }, + { + "epoch": 0.4314737181554337, + "grad_norm": 0.9154881263246533, + "learning_rate": 3.2609413729939005e-06, + "loss": 0.1444, + "step": 4683 + }, + { + "epoch": 0.4315658543327037, + "grad_norm": 0.892742980949353, + "learning_rate": 3.260215700017941e-06, + "loss": 0.1378, + "step": 4684 + }, + { + "epoch": 0.4316579905099737, + "grad_norm": 0.9415049276203106, + "learning_rate": 3.2594899564558713e-06, + "loss": 0.1585, + "step": 4685 + }, + { + "epoch": 0.4317501266872438, + "grad_norm": 0.8706508361164794, + "learning_rate": 3.2587641423750782e-06, + "loss": 0.1431, + "step": 4686 + }, + { + "epoch": 0.43184226286451377, + "grad_norm": 0.8445511287505905, + "learning_rate": 3.2580382578429525e-06, + "loss": 0.1384, + "step": 4687 + }, + { + "epoch": 0.43193439904178377, + "grad_norm": 0.9137170420709978, + "learning_rate": 3.2573123029268926e-06, + "loss": 0.1468, + "step": 4688 + }, + { + "epoch": 0.43202653521905376, + "grad_norm": 0.9192552269512315, + "learning_rate": 3.256586277694305e-06, + "loss": 0.1407, + "step": 4689 + }, + { + "epoch": 0.43211867139632376, + "grad_norm": 0.9422981334954762, + "learning_rate": 3.255860182212599e-06, + "loss": 0.159, + "step": 4690 + }, + { + "epoch": 0.43221080757359376, + "grad_norm": 0.888256356837777, + "learning_rate": 3.2551340165491947e-06, + "loss": 0.148, + "step": 4691 + }, + { + "epoch": 0.43230294375086376, + "grad_norm": 0.9066577043703513, + "learning_rate": 3.254407780771515e-06, + "loss": 0.1446, + "step": 4692 + }, + { + "epoch": 0.4323950799281338, + "grad_norm": 0.8204021095990877, + "learning_rate": 3.2536814749469915e-06, + "loss": 0.1287, + "step": 4693 + }, + { + "epoch": 0.4324872161054038, + "grad_norm": 0.8675428336875458, + "learning_rate": 3.252955099143062e-06, + "loss": 0.1402, + "step": 4694 + }, + { + "epoch": 0.4325793522826738, + "grad_norm": 0.8491928019041444, + "learning_rate": 3.2522286534271706e-06, + "loss": 0.1403, + "step": 4695 + }, + { + "epoch": 0.4326714884599438, + "grad_norm": 0.8968385179030194, + "learning_rate": 3.2515021378667677e-06, + "loss": 0.1456, + "step": 4696 + }, + { + "epoch": 0.4327636246372138, + "grad_norm": 0.8221696054310007, + "learning_rate": 3.250775552529312e-06, + "loss": 0.1304, + "step": 4697 + }, + { + "epoch": 0.4328557608144838, + "grad_norm": 0.924831124311071, + "learning_rate": 3.250048897482263e-06, + "loss": 0.1527, + "step": 4698 + }, + { + "epoch": 0.4329478969917538, + "grad_norm": 0.9196754936163174, + "learning_rate": 3.2493221727930947e-06, + "loss": 0.1568, + "step": 4699 + }, + { + "epoch": 0.43304003316902384, + "grad_norm": 0.9493069882745526, + "learning_rate": 3.2485953785292813e-06, + "loss": 0.1467, + "step": 4700 + }, + { + "epoch": 0.43313216934629384, + "grad_norm": 0.8814778987108999, + "learning_rate": 3.247868514758307e-06, + "loss": 0.1507, + "step": 4701 + }, + { + "epoch": 0.43322430552356384, + "grad_norm": 0.9240643672635029, + "learning_rate": 3.2471415815476603e-06, + "loss": 0.1526, + "step": 4702 + }, + { + "epoch": 0.43331644170083383, + "grad_norm": 0.8567708852826385, + "learning_rate": 3.246414578964837e-06, + "loss": 0.1372, + "step": 4703 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 0.8995443727689885, + "learning_rate": 3.24568750707734e-06, + "loss": 0.1518, + "step": 4704 + }, + { + "epoch": 0.4335007140553738, + "grad_norm": 0.8848310081209736, + "learning_rate": 3.2449603659526787e-06, + "loss": 0.1414, + "step": 4705 + }, + { + "epoch": 0.4335928502326438, + "grad_norm": 0.9253510253751343, + "learning_rate": 3.244233155658365e-06, + "loss": 0.1561, + "step": 4706 + }, + { + "epoch": 0.4336849864099139, + "grad_norm": 0.8987080572532566, + "learning_rate": 3.2435058762619243e-06, + "loss": 0.1459, + "step": 4707 + }, + { + "epoch": 0.43377712258718387, + "grad_norm": 0.8418375834879664, + "learning_rate": 3.2427785278308832e-06, + "loss": 0.1374, + "step": 4708 + }, + { + "epoch": 0.43386925876445387, + "grad_norm": 0.8605526697968859, + "learning_rate": 3.242051110432775e-06, + "loss": 0.1493, + "step": 4709 + }, + { + "epoch": 0.43396139494172387, + "grad_norm": 0.8897085366064001, + "learning_rate": 3.241323624135142e-06, + "loss": 0.1448, + "step": 4710 + }, + { + "epoch": 0.43405353111899386, + "grad_norm": 0.8611729087577313, + "learning_rate": 3.2405960690055307e-06, + "loss": 0.137, + "step": 4711 + }, + { + "epoch": 0.43414566729626386, + "grad_norm": 0.8688996045321737, + "learning_rate": 3.2398684451114936e-06, + "loss": 0.1367, + "step": 4712 + }, + { + "epoch": 0.43423780347353386, + "grad_norm": 0.9014621771431871, + "learning_rate": 3.2391407525205933e-06, + "loss": 0.1479, + "step": 4713 + }, + { + "epoch": 0.4343299396508039, + "grad_norm": 0.9021014682601557, + "learning_rate": 3.2384129913003935e-06, + "loss": 0.1499, + "step": 4714 + }, + { + "epoch": 0.4344220758280739, + "grad_norm": 0.9191301374460689, + "learning_rate": 3.237685161518468e-06, + "loss": 0.1513, + "step": 4715 + }, + { + "epoch": 0.4345142120053439, + "grad_norm": 0.9119658593871829, + "learning_rate": 3.236957263242396e-06, + "loss": 0.1439, + "step": 4716 + }, + { + "epoch": 0.4346063481826139, + "grad_norm": 0.9280932838157593, + "learning_rate": 3.2362292965397633e-06, + "loss": 0.1528, + "step": 4717 + }, + { + "epoch": 0.4346984843598839, + "grad_norm": 0.9312157213794583, + "learning_rate": 3.235501261478161e-06, + "loss": 0.146, + "step": 4718 + }, + { + "epoch": 0.4347906205371539, + "grad_norm": 0.9302842434648845, + "learning_rate": 3.2347731581251866e-06, + "loss": 0.1393, + "step": 4719 + }, + { + "epoch": 0.43488275671442395, + "grad_norm": 0.8610238509695106, + "learning_rate": 3.2340449865484464e-06, + "loss": 0.1412, + "step": 4720 + }, + { + "epoch": 0.43497489289169394, + "grad_norm": 0.8708422464114209, + "learning_rate": 3.23331674681555e-06, + "loss": 0.127, + "step": 4721 + }, + { + "epoch": 0.43506702906896394, + "grad_norm": 0.9170158188594641, + "learning_rate": 3.2325884389941147e-06, + "loss": 0.1388, + "step": 4722 + }, + { + "epoch": 0.43515916524623394, + "grad_norm": 0.9055547940614805, + "learning_rate": 3.2318600631517637e-06, + "loss": 0.1438, + "step": 4723 + }, + { + "epoch": 0.43525130142350393, + "grad_norm": 0.937925363757348, + "learning_rate": 3.2311316193561277e-06, + "loss": 0.155, + "step": 4724 + }, + { + "epoch": 0.43534343760077393, + "grad_norm": 0.8980494734303124, + "learning_rate": 3.230403107674841e-06, + "loss": 0.1511, + "step": 4725 + }, + { + "epoch": 0.4354355737780439, + "grad_norm": 0.9131532114439331, + "learning_rate": 3.2296745281755485e-06, + "loss": 0.146, + "step": 4726 + }, + { + "epoch": 0.435527709955314, + "grad_norm": 0.9058295332114711, + "learning_rate": 3.2289458809258965e-06, + "loss": 0.1552, + "step": 4727 + }, + { + "epoch": 0.435619846132584, + "grad_norm": 0.8947703410791472, + "learning_rate": 3.2282171659935415e-06, + "loss": 0.1388, + "step": 4728 + }, + { + "epoch": 0.435711982309854, + "grad_norm": 0.8926286281907707, + "learning_rate": 3.2274883834461444e-06, + "loss": 0.1344, + "step": 4729 + }, + { + "epoch": 0.43580411848712397, + "grad_norm": 0.9531048239256399, + "learning_rate": 3.2267595333513724e-06, + "loss": 0.1592, + "step": 4730 + }, + { + "epoch": 0.43589625466439397, + "grad_norm": 0.9201501804582292, + "learning_rate": 3.2260306157768994e-06, + "loss": 0.148, + "step": 4731 + }, + { + "epoch": 0.43598839084166396, + "grad_norm": 0.9174290109231636, + "learning_rate": 3.2253016307904063e-06, + "loss": 0.1385, + "step": 4732 + }, + { + "epoch": 0.43608052701893396, + "grad_norm": 0.8375669138427223, + "learning_rate": 3.224572578459577e-06, + "loss": 0.1278, + "step": 4733 + }, + { + "epoch": 0.436172663196204, + "grad_norm": 0.963997166950795, + "learning_rate": 3.2238434588521078e-06, + "loss": 0.1501, + "step": 4734 + }, + { + "epoch": 0.436264799373474, + "grad_norm": 0.9336340407484712, + "learning_rate": 3.2231142720356946e-06, + "loss": 0.1528, + "step": 4735 + }, + { + "epoch": 0.436356935550744, + "grad_norm": 0.9042393512372882, + "learning_rate": 3.222385018078043e-06, + "loss": 0.1368, + "step": 4736 + }, + { + "epoch": 0.436449071728014, + "grad_norm": 0.9438201754765096, + "learning_rate": 3.2216556970468656e-06, + "loss": 0.1562, + "step": 4737 + }, + { + "epoch": 0.436541207905284, + "grad_norm": 0.938414838398955, + "learning_rate": 3.2209263090098785e-06, + "loss": 0.1526, + "step": 4738 + }, + { + "epoch": 0.436633344082554, + "grad_norm": 0.9018511969382467, + "learning_rate": 3.220196854034806e-06, + "loss": 0.138, + "step": 4739 + }, + { + "epoch": 0.436725480259824, + "grad_norm": 0.8535484944038092, + "learning_rate": 3.2194673321893787e-06, + "loss": 0.1288, + "step": 4740 + }, + { + "epoch": 0.43681761643709405, + "grad_norm": 0.8802381462413789, + "learning_rate": 3.2187377435413316e-06, + "loss": 0.1487, + "step": 4741 + }, + { + "epoch": 0.43690975261436404, + "grad_norm": 0.9534514302434024, + "learning_rate": 3.2180080881584075e-06, + "loss": 0.1572, + "step": 4742 + }, + { + "epoch": 0.43700188879163404, + "grad_norm": 0.8807997475524247, + "learning_rate": 3.2172783661083556e-06, + "loss": 0.1514, + "step": 4743 + }, + { + "epoch": 0.43709402496890404, + "grad_norm": 0.9205359815568849, + "learning_rate": 3.21654857745893e-06, + "loss": 0.1507, + "step": 4744 + }, + { + "epoch": 0.43718616114617403, + "grad_norm": 0.8949200471013621, + "learning_rate": 3.2158187222778926e-06, + "loss": 0.1548, + "step": 4745 + }, + { + "epoch": 0.43727829732344403, + "grad_norm": 0.893054846565139, + "learning_rate": 3.215088800633009e-06, + "loss": 0.1368, + "step": 4746 + }, + { + "epoch": 0.43737043350071403, + "grad_norm": 0.8943073971027454, + "learning_rate": 3.214358812592053e-06, + "loss": 0.135, + "step": 4747 + }, + { + "epoch": 0.4374625696779841, + "grad_norm": 0.9181687265142016, + "learning_rate": 3.2136287582228048e-06, + "loss": 0.1328, + "step": 4748 + }, + { + "epoch": 0.4375547058552541, + "grad_norm": 0.9251073922168981, + "learning_rate": 3.2128986375930495e-06, + "loss": 0.1437, + "step": 4749 + }, + { + "epoch": 0.4376468420325241, + "grad_norm": 0.8926775990316347, + "learning_rate": 3.212168450770579e-06, + "loss": 0.1378, + "step": 4750 + }, + { + "epoch": 0.43773897820979407, + "grad_norm": 0.853248400512252, + "learning_rate": 3.2114381978231918e-06, + "loss": 0.1443, + "step": 4751 + }, + { + "epoch": 0.43783111438706407, + "grad_norm": 0.9583420162746287, + "learning_rate": 3.21070787881869e-06, + "loss": 0.1488, + "step": 4752 + }, + { + "epoch": 0.43792325056433407, + "grad_norm": 0.8855991731771236, + "learning_rate": 3.2099774938248866e-06, + "loss": 0.1349, + "step": 4753 + }, + { + "epoch": 0.4380153867416041, + "grad_norm": 0.8946899043733684, + "learning_rate": 3.2092470429095955e-06, + "loss": 0.1519, + "step": 4754 + }, + { + "epoch": 0.4381075229188741, + "grad_norm": 0.8759680421106206, + "learning_rate": 3.208516526140641e-06, + "loss": 0.14, + "step": 4755 + }, + { + "epoch": 0.4381996590961441, + "grad_norm": 0.9408448869394528, + "learning_rate": 3.2077859435858503e-06, + "loss": 0.1601, + "step": 4756 + }, + { + "epoch": 0.4382917952734141, + "grad_norm": 0.9347952147345089, + "learning_rate": 3.2070552953130586e-06, + "loss": 0.1589, + "step": 4757 + }, + { + "epoch": 0.4383839314506841, + "grad_norm": 0.9081980861793578, + "learning_rate": 3.2063245813901068e-06, + "loss": 0.1582, + "step": 4758 + }, + { + "epoch": 0.4384760676279541, + "grad_norm": 0.8939823014855057, + "learning_rate": 3.2055938018848417e-06, + "loss": 0.154, + "step": 4759 + }, + { + "epoch": 0.4385682038052241, + "grad_norm": 0.9056882583574686, + "learning_rate": 3.2048629568651153e-06, + "loss": 0.1539, + "step": 4760 + }, + { + "epoch": 0.43866033998249415, + "grad_norm": 0.8749131675102494, + "learning_rate": 3.2041320463987886e-06, + "loss": 0.1444, + "step": 4761 + }, + { + "epoch": 0.43875247615976415, + "grad_norm": 0.8808614298731526, + "learning_rate": 3.2034010705537245e-06, + "loss": 0.1415, + "step": 4762 + }, + { + "epoch": 0.43884461233703415, + "grad_norm": 0.9542017201745561, + "learning_rate": 3.202670029397796e-06, + "loss": 0.1473, + "step": 4763 + }, + { + "epoch": 0.43893674851430414, + "grad_norm": 0.9145181955112874, + "learning_rate": 3.2019389229988794e-06, + "loss": 0.1527, + "step": 4764 + }, + { + "epoch": 0.43902888469157414, + "grad_norm": 0.9118897020011315, + "learning_rate": 3.2012077514248592e-06, + "loss": 0.1365, + "step": 4765 + }, + { + "epoch": 0.43912102086884414, + "grad_norm": 0.8890964393279647, + "learning_rate": 3.2004765147436228e-06, + "loss": 0.1555, + "step": 4766 + }, + { + "epoch": 0.43921315704611413, + "grad_norm": 0.9537052872925784, + "learning_rate": 3.1997452130230664e-06, + "loss": 0.1395, + "step": 4767 + }, + { + "epoch": 0.4393052932233842, + "grad_norm": 0.924180348352055, + "learning_rate": 3.1990138463310923e-06, + "loss": 0.1405, + "step": 4768 + }, + { + "epoch": 0.4393974294006542, + "grad_norm": 0.8637655086910015, + "learning_rate": 3.1982824147356078e-06, + "loss": 0.1447, + "step": 4769 + }, + { + "epoch": 0.4394895655779242, + "grad_norm": 0.912924057763308, + "learning_rate": 3.197550918304525e-06, + "loss": 0.141, + "step": 4770 + }, + { + "epoch": 0.4395817017551942, + "grad_norm": 0.9110661693707881, + "learning_rate": 3.196819357105764e-06, + "loss": 0.1416, + "step": 4771 + }, + { + "epoch": 0.4396738379324642, + "grad_norm": 0.9416488247361345, + "learning_rate": 3.196087731207252e-06, + "loss": 0.1539, + "step": 4772 + }, + { + "epoch": 0.43976597410973417, + "grad_norm": 0.9280205052202724, + "learning_rate": 3.1953560406769184e-06, + "loss": 0.15, + "step": 4773 + }, + { + "epoch": 0.43985811028700417, + "grad_norm": 0.9515080388192296, + "learning_rate": 3.194624285582702e-06, + "loss": 0.1563, + "step": 4774 + }, + { + "epoch": 0.4399502464642742, + "grad_norm": 0.8522352854395673, + "learning_rate": 3.1938924659925457e-06, + "loss": 0.1416, + "step": 4775 + }, + { + "epoch": 0.4400423826415442, + "grad_norm": 0.8685099112900193, + "learning_rate": 3.193160581974399e-06, + "loss": 0.1275, + "step": 4776 + }, + { + "epoch": 0.4401345188188142, + "grad_norm": 0.8909613526044037, + "learning_rate": 3.1924286335962177e-06, + "loss": 0.1475, + "step": 4777 + }, + { + "epoch": 0.4402266549960842, + "grad_norm": 0.9496888743162736, + "learning_rate": 3.1916966209259636e-06, + "loss": 0.1506, + "step": 4778 + }, + { + "epoch": 0.4403187911733542, + "grad_norm": 0.8581608533036216, + "learning_rate": 3.1909645440316034e-06, + "loss": 0.1322, + "step": 4779 + }, + { + "epoch": 0.4404109273506242, + "grad_norm": 0.9290846371768269, + "learning_rate": 3.1902324029811115e-06, + "loss": 0.1433, + "step": 4780 + }, + { + "epoch": 0.4405030635278942, + "grad_norm": 0.8837081497834979, + "learning_rate": 3.1895001978424665e-06, + "loss": 0.148, + "step": 4781 + }, + { + "epoch": 0.44059519970516425, + "grad_norm": 0.8818053795949022, + "learning_rate": 3.188767928683654e-06, + "loss": 0.1572, + "step": 4782 + }, + { + "epoch": 0.44068733588243425, + "grad_norm": 0.9566123724363703, + "learning_rate": 3.188035595572665e-06, + "loss": 0.1468, + "step": 4783 + }, + { + "epoch": 0.44077947205970425, + "grad_norm": 0.9126945997475431, + "learning_rate": 3.1873031985774972e-06, + "loss": 0.1417, + "step": 4784 + }, + { + "epoch": 0.44087160823697424, + "grad_norm": 0.9319946243034919, + "learning_rate": 3.186570737766153e-06, + "loss": 0.1524, + "step": 4785 + }, + { + "epoch": 0.44096374441424424, + "grad_norm": 0.8594125708493436, + "learning_rate": 3.1858382132066422e-06, + "loss": 0.1428, + "step": 4786 + }, + { + "epoch": 0.44105588059151424, + "grad_norm": 0.8475132117275923, + "learning_rate": 3.1851056249669786e-06, + "loss": 0.1396, + "step": 4787 + }, + { + "epoch": 0.4411480167687843, + "grad_norm": 0.9114714437921265, + "learning_rate": 3.1843729731151855e-06, + "loss": 0.1506, + "step": 4788 + }, + { + "epoch": 0.4412401529460543, + "grad_norm": 0.8478111369854033, + "learning_rate": 3.183640257719287e-06, + "loss": 0.1399, + "step": 4789 + }, + { + "epoch": 0.4413322891233243, + "grad_norm": 0.9881708242449926, + "learning_rate": 3.182907478847318e-06, + "loss": 0.1628, + "step": 4790 + }, + { + "epoch": 0.4414244253005943, + "grad_norm": 0.8973574240687511, + "learning_rate": 3.1821746365673157e-06, + "loss": 0.1507, + "step": 4791 + }, + { + "epoch": 0.4415165614778643, + "grad_norm": 0.9318494415821815, + "learning_rate": 3.1814417309473243e-06, + "loss": 0.1401, + "step": 4792 + }, + { + "epoch": 0.4416086976551343, + "grad_norm": 0.9456870027866897, + "learning_rate": 3.1807087620553957e-06, + "loss": 0.1535, + "step": 4793 + }, + { + "epoch": 0.44170083383240427, + "grad_norm": 0.8681572085432423, + "learning_rate": 3.179975729959585e-06, + "loss": 0.1532, + "step": 4794 + }, + { + "epoch": 0.4417929700096743, + "grad_norm": 0.931784470899894, + "learning_rate": 3.1792426347279544e-06, + "loss": 0.1557, + "step": 4795 + }, + { + "epoch": 0.4418851061869443, + "grad_norm": 0.8708926614080035, + "learning_rate": 3.178509476428573e-06, + "loss": 0.1389, + "step": 4796 + }, + { + "epoch": 0.4419772423642143, + "grad_norm": 0.9255219971385017, + "learning_rate": 3.177776255129512e-06, + "loss": 0.1564, + "step": 4797 + }, + { + "epoch": 0.4420693785414843, + "grad_norm": 0.9397331600366731, + "learning_rate": 3.1770429708988536e-06, + "loss": 0.1665, + "step": 4798 + }, + { + "epoch": 0.4421615147187543, + "grad_norm": 0.8823264473223045, + "learning_rate": 3.1763096238046833e-06, + "loss": 0.1448, + "step": 4799 + }, + { + "epoch": 0.4422536508960243, + "grad_norm": 0.9225948201087502, + "learning_rate": 3.1755762139150905e-06, + "loss": 0.1496, + "step": 4800 + }, + { + "epoch": 0.4423457870732943, + "grad_norm": 0.8726837869356154, + "learning_rate": 3.1748427412981742e-06, + "loss": 0.1418, + "step": 4801 + }, + { + "epoch": 0.44243792325056436, + "grad_norm": 0.8557987601165721, + "learning_rate": 3.1741092060220364e-06, + "loss": 0.1345, + "step": 4802 + }, + { + "epoch": 0.44253005942783435, + "grad_norm": 0.8580919943355668, + "learning_rate": 3.1733756081547864e-06, + "loss": 0.1394, + "step": 4803 + }, + { + "epoch": 0.44262219560510435, + "grad_norm": 0.9176851413304004, + "learning_rate": 3.172641947764539e-06, + "loss": 0.1449, + "step": 4804 + }, + { + "epoch": 0.44271433178237435, + "grad_norm": 0.9095773510548247, + "learning_rate": 3.1719082249194134e-06, + "loss": 0.1456, + "step": 4805 + }, + { + "epoch": 0.44280646795964435, + "grad_norm": 0.8799083657238806, + "learning_rate": 3.171174439687538e-06, + "loss": 0.1377, + "step": 4806 + }, + { + "epoch": 0.44289860413691434, + "grad_norm": 0.8851747834590045, + "learning_rate": 3.1704405921370428e-06, + "loss": 0.1313, + "step": 4807 + }, + { + "epoch": 0.44299074031418434, + "grad_norm": 0.878962379862866, + "learning_rate": 3.169706682336066e-06, + "loss": 0.1395, + "step": 4808 + }, + { + "epoch": 0.4430828764914544, + "grad_norm": 0.8508108772869166, + "learning_rate": 3.1689727103527536e-06, + "loss": 0.1408, + "step": 4809 + }, + { + "epoch": 0.4431750126687244, + "grad_norm": 0.8878317199533106, + "learning_rate": 3.168238676255251e-06, + "loss": 0.1433, + "step": 4810 + }, + { + "epoch": 0.4432671488459944, + "grad_norm": 0.9226903423145814, + "learning_rate": 3.1675045801117167e-06, + "loss": 0.1492, + "step": 4811 + }, + { + "epoch": 0.4433592850232644, + "grad_norm": 0.9388424210883725, + "learning_rate": 3.1667704219903095e-06, + "loss": 0.1575, + "step": 4812 + }, + { + "epoch": 0.4434514212005344, + "grad_norm": 0.8534203981677879, + "learning_rate": 3.1660362019591972e-06, + "loss": 0.1377, + "step": 4813 + }, + { + "epoch": 0.4435435573778044, + "grad_norm": 0.9133209220134044, + "learning_rate": 3.1653019200865513e-06, + "loss": 0.1437, + "step": 4814 + }, + { + "epoch": 0.4436356935550744, + "grad_norm": 0.9172019407225023, + "learning_rate": 3.164567576440552e-06, + "loss": 0.1507, + "step": 4815 + }, + { + "epoch": 0.4437278297323444, + "grad_norm": 0.9953185567517315, + "learning_rate": 3.1638331710893804e-06, + "loss": 0.1562, + "step": 4816 + }, + { + "epoch": 0.4438199659096144, + "grad_norm": 0.9378368946596357, + "learning_rate": 3.163098704101228e-06, + "loss": 0.1554, + "step": 4817 + }, + { + "epoch": 0.4439121020868844, + "grad_norm": 0.9135001066867825, + "learning_rate": 3.162364175544289e-06, + "loss": 0.1472, + "step": 4818 + }, + { + "epoch": 0.4440042382641544, + "grad_norm": 0.9971829365126571, + "learning_rate": 3.161629585486766e-06, + "loss": 0.1409, + "step": 4819 + }, + { + "epoch": 0.4440963744414244, + "grad_norm": 0.9040841107610292, + "learning_rate": 3.160894933996864e-06, + "loss": 0.149, + "step": 4820 + }, + { + "epoch": 0.4441885106186944, + "grad_norm": 0.9206094051297338, + "learning_rate": 3.160160221142797e-06, + "loss": 0.1364, + "step": 4821 + }, + { + "epoch": 0.44428064679596446, + "grad_norm": 0.9379688154656526, + "learning_rate": 3.159425446992781e-06, + "loss": 0.143, + "step": 4822 + }, + { + "epoch": 0.44437278297323446, + "grad_norm": 0.9567257382459619, + "learning_rate": 3.1586906116150428e-06, + "loss": 0.1482, + "step": 4823 + }, + { + "epoch": 0.44446491915050446, + "grad_norm": 1.005451648614117, + "learning_rate": 3.1579557150778094e-06, + "loss": 0.1601, + "step": 4824 + }, + { + "epoch": 0.44455705532777445, + "grad_norm": 0.8817243195267939, + "learning_rate": 3.1572207574493174e-06, + "loss": 0.1428, + "step": 4825 + }, + { + "epoch": 0.44464919150504445, + "grad_norm": 0.849646786373053, + "learning_rate": 3.1564857387978075e-06, + "loss": 0.1456, + "step": 4826 + }, + { + "epoch": 0.44474132768231445, + "grad_norm": 0.9519644437243897, + "learning_rate": 3.155750659191526e-06, + "loss": 0.1594, + "step": 4827 + }, + { + "epoch": 0.44483346385958444, + "grad_norm": 0.8774384982653047, + "learning_rate": 3.155015518698725e-06, + "loss": 0.1443, + "step": 4828 + }, + { + "epoch": 0.4449256000368545, + "grad_norm": 0.9294480749474386, + "learning_rate": 3.154280317387663e-06, + "loss": 0.152, + "step": 4829 + }, + { + "epoch": 0.4450177362141245, + "grad_norm": 0.8812827080037829, + "learning_rate": 3.1535450553266024e-06, + "loss": 0.1467, + "step": 4830 + }, + { + "epoch": 0.4451098723913945, + "grad_norm": 0.9149468689160323, + "learning_rate": 3.1528097325838143e-06, + "loss": 0.1496, + "step": 4831 + }, + { + "epoch": 0.4452020085686645, + "grad_norm": 0.886095744307431, + "learning_rate": 3.1520743492275714e-06, + "loss": 0.1504, + "step": 4832 + }, + { + "epoch": 0.4452941447459345, + "grad_norm": 0.9507047582370619, + "learning_rate": 3.151338905326155e-06, + "loss": 0.1536, + "step": 4833 + }, + { + "epoch": 0.4453862809232045, + "grad_norm": 0.9475520516133387, + "learning_rate": 3.1506034009478515e-06, + "loss": 0.1513, + "step": 4834 + }, + { + "epoch": 0.4454784171004745, + "grad_norm": 0.8963368746247009, + "learning_rate": 3.1498678361609514e-06, + "loss": 0.1379, + "step": 4835 + }, + { + "epoch": 0.44557055327774453, + "grad_norm": 0.9466097663899271, + "learning_rate": 3.149132211033754e-06, + "loss": 0.1352, + "step": 4836 + }, + { + "epoch": 0.4456626894550145, + "grad_norm": 0.9237287141376792, + "learning_rate": 3.1483965256345596e-06, + "loss": 0.1409, + "step": 4837 + }, + { + "epoch": 0.4457548256322845, + "grad_norm": 1.0177207633781526, + "learning_rate": 3.147660780031679e-06, + "loss": 0.1584, + "step": 4838 + }, + { + "epoch": 0.4458469618095545, + "grad_norm": 0.9370371601534141, + "learning_rate": 3.146924974293425e-06, + "loss": 0.1532, + "step": 4839 + }, + { + "epoch": 0.4459390979868245, + "grad_norm": 0.9331012417513412, + "learning_rate": 3.1461891084881175e-06, + "loss": 0.1604, + "step": 4840 + }, + { + "epoch": 0.4460312341640945, + "grad_norm": 0.9580514103078052, + "learning_rate": 3.1454531826840816e-06, + "loss": 0.1515, + "step": 4841 + }, + { + "epoch": 0.4461233703413645, + "grad_norm": 0.931795200004456, + "learning_rate": 3.1447171969496487e-06, + "loss": 0.1515, + "step": 4842 + }, + { + "epoch": 0.44621550651863456, + "grad_norm": 0.826891990289607, + "learning_rate": 3.1439811513531537e-06, + "loss": 0.1302, + "step": 4843 + }, + { + "epoch": 0.44630764269590456, + "grad_norm": 0.8025586612837762, + "learning_rate": 3.143245045962941e-06, + "loss": 0.134, + "step": 4844 + }, + { + "epoch": 0.44639977887317456, + "grad_norm": 0.9430651189859448, + "learning_rate": 3.142508880847355e-06, + "loss": 0.1572, + "step": 4845 + }, + { + "epoch": 0.44649191505044455, + "grad_norm": 0.8750199948395706, + "learning_rate": 3.1417726560747507e-06, + "loss": 0.1414, + "step": 4846 + }, + { + "epoch": 0.44658405122771455, + "grad_norm": 0.8970428891745201, + "learning_rate": 3.1410363717134868e-06, + "loss": 0.1362, + "step": 4847 + }, + { + "epoch": 0.44667618740498455, + "grad_norm": 0.8970399368797006, + "learning_rate": 3.140300027831927e-06, + "loss": 0.156, + "step": 4848 + }, + { + "epoch": 0.44676832358225455, + "grad_norm": 0.900467919696107, + "learning_rate": 3.1395636244984397e-06, + "loss": 0.1441, + "step": 4849 + }, + { + "epoch": 0.4468604597595246, + "grad_norm": 0.8944755285685609, + "learning_rate": 3.1388271617814015e-06, + "loss": 0.1348, + "step": 4850 + }, + { + "epoch": 0.4469525959367946, + "grad_norm": 0.83068628823159, + "learning_rate": 3.1380906397491923e-06, + "loss": 0.1314, + "step": 4851 + }, + { + "epoch": 0.4470447321140646, + "grad_norm": 0.9164112054172598, + "learning_rate": 3.1373540584701997e-06, + "loss": 0.1537, + "step": 4852 + }, + { + "epoch": 0.4471368682913346, + "grad_norm": 0.8992847662913391, + "learning_rate": 3.1366174180128127e-06, + "loss": 0.1364, + "step": 4853 + }, + { + "epoch": 0.4472290044686046, + "grad_norm": 0.9212563327688689, + "learning_rate": 3.1358807184454305e-06, + "loss": 0.1493, + "step": 4854 + }, + { + "epoch": 0.4473211406458746, + "grad_norm": 0.924510733072517, + "learning_rate": 3.1351439598364554e-06, + "loss": 0.1481, + "step": 4855 + }, + { + "epoch": 0.44741327682314463, + "grad_norm": 0.8533748975648047, + "learning_rate": 3.134407142254295e-06, + "loss": 0.1257, + "step": 4856 + }, + { + "epoch": 0.44750541300041463, + "grad_norm": 0.9120449629078842, + "learning_rate": 3.1336702657673625e-06, + "loss": 0.1446, + "step": 4857 + }, + { + "epoch": 0.44759754917768463, + "grad_norm": 0.8955446860187469, + "learning_rate": 3.132933330444079e-06, + "loss": 0.1375, + "step": 4858 + }, + { + "epoch": 0.4476896853549546, + "grad_norm": 0.9088059249145077, + "learning_rate": 3.132196336352867e-06, + "loss": 0.1317, + "step": 4859 + }, + { + "epoch": 0.4477818215322246, + "grad_norm": 0.9870459351359203, + "learning_rate": 3.131459283562157e-06, + "loss": 0.1494, + "step": 4860 + }, + { + "epoch": 0.4478739577094946, + "grad_norm": 0.8765641433359809, + "learning_rate": 3.1307221721403846e-06, + "loss": 0.1465, + "step": 4861 + }, + { + "epoch": 0.4479660938867646, + "grad_norm": 0.8894333905339528, + "learning_rate": 3.129985002155991e-06, + "loss": 0.1375, + "step": 4862 + }, + { + "epoch": 0.44805823006403467, + "grad_norm": 0.9046224882892272, + "learning_rate": 3.129247773677422e-06, + "loss": 0.1459, + "step": 4863 + }, + { + "epoch": 0.44815036624130467, + "grad_norm": 0.8885653224314798, + "learning_rate": 3.128510486773129e-06, + "loss": 0.1435, + "step": 4864 + }, + { + "epoch": 0.44824250241857466, + "grad_norm": 0.9047398877430426, + "learning_rate": 3.1277731415115696e-06, + "loss": 0.1521, + "step": 4865 + }, + { + "epoch": 0.44833463859584466, + "grad_norm": 0.9101582197651139, + "learning_rate": 3.127035737961207e-06, + "loss": 0.1475, + "step": 4866 + }, + { + "epoch": 0.44842677477311466, + "grad_norm": 0.9211512953806295, + "learning_rate": 3.1262982761905084e-06, + "loss": 0.1483, + "step": 4867 + }, + { + "epoch": 0.44851891095038465, + "grad_norm": 0.9168309363122416, + "learning_rate": 3.125560756267948e-06, + "loss": 0.1431, + "step": 4868 + }, + { + "epoch": 0.44861104712765465, + "grad_norm": 0.882017959352168, + "learning_rate": 3.1248231782620035e-06, + "loss": 0.1291, + "step": 4869 + }, + { + "epoch": 0.4487031833049247, + "grad_norm": 0.9737260719423534, + "learning_rate": 3.1240855422411593e-06, + "loss": 0.1535, + "step": 4870 + }, + { + "epoch": 0.4487953194821947, + "grad_norm": 0.8840045486699224, + "learning_rate": 3.1233478482739065e-06, + "loss": 0.1509, + "step": 4871 + }, + { + "epoch": 0.4488874556594647, + "grad_norm": 0.9149749621641127, + "learning_rate": 3.1226100964287378e-06, + "loss": 0.1387, + "step": 4872 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 0.9192452144221734, + "learning_rate": 3.1218722867741553e-06, + "loss": 0.1547, + "step": 4873 + }, + { + "epoch": 0.4490717280140047, + "grad_norm": 0.8908377156409427, + "learning_rate": 3.1211344193786636e-06, + "loss": 0.1432, + "step": 4874 + }, + { + "epoch": 0.4491638641912747, + "grad_norm": 0.8925399884941798, + "learning_rate": 3.1203964943107747e-06, + "loss": 0.1533, + "step": 4875 + }, + { + "epoch": 0.4492560003685447, + "grad_norm": 0.861973775471057, + "learning_rate": 3.1196585116390045e-06, + "loss": 0.1282, + "step": 4876 + }, + { + "epoch": 0.44934813654581474, + "grad_norm": 0.8880068164435665, + "learning_rate": 3.1189204714318743e-06, + "loss": 0.1355, + "step": 4877 + }, + { + "epoch": 0.44944027272308473, + "grad_norm": 1.0015594851191234, + "learning_rate": 3.1181823737579115e-06, + "loss": 0.1649, + "step": 4878 + }, + { + "epoch": 0.44953240890035473, + "grad_norm": 0.9213880892562584, + "learning_rate": 3.11744421868565e-06, + "loss": 0.1458, + "step": 4879 + }, + { + "epoch": 0.4496245450776247, + "grad_norm": 0.865422722909716, + "learning_rate": 3.1167060062836253e-06, + "loss": 0.148, + "step": 4880 + }, + { + "epoch": 0.4497166812548947, + "grad_norm": 0.877456464715817, + "learning_rate": 3.1159677366203815e-06, + "loss": 0.1443, + "step": 4881 + }, + { + "epoch": 0.4498088174321647, + "grad_norm": 0.9847277495406819, + "learning_rate": 3.1152294097644677e-06, + "loss": 0.1631, + "step": 4882 + }, + { + "epoch": 0.4499009536094347, + "grad_norm": 0.987382375783057, + "learning_rate": 3.1144910257844367e-06, + "loss": 0.1562, + "step": 4883 + }, + { + "epoch": 0.44999308978670477, + "grad_norm": 0.8920639965187964, + "learning_rate": 3.113752584748848e-06, + "loss": 0.1413, + "step": 4884 + }, + { + "epoch": 0.45008522596397477, + "grad_norm": 0.8848619030803502, + "learning_rate": 3.1130140867262653e-06, + "loss": 0.1435, + "step": 4885 + }, + { + "epoch": 0.45017736214124476, + "grad_norm": 0.9631374264594577, + "learning_rate": 3.112275531785259e-06, + "loss": 0.1502, + "step": 4886 + }, + { + "epoch": 0.45026949831851476, + "grad_norm": 0.8783560040106241, + "learning_rate": 3.111536919994404e-06, + "loss": 0.1384, + "step": 4887 + }, + { + "epoch": 0.45036163449578476, + "grad_norm": 0.8626103552628747, + "learning_rate": 3.110798251422279e-06, + "loss": 0.1376, + "step": 4888 + }, + { + "epoch": 0.45045377067305475, + "grad_norm": 0.9437049556206974, + "learning_rate": 3.1100595261374718e-06, + "loss": 0.1393, + "step": 4889 + }, + { + "epoch": 0.4505459068503248, + "grad_norm": 0.9041755349011702, + "learning_rate": 3.1093207442085716e-06, + "loss": 0.1404, + "step": 4890 + }, + { + "epoch": 0.4506380430275948, + "grad_norm": 0.8609050559264265, + "learning_rate": 3.108581905704175e-06, + "loss": 0.1401, + "step": 4891 + }, + { + "epoch": 0.4507301792048648, + "grad_norm": 0.931689125980329, + "learning_rate": 3.107843010692882e-06, + "loss": 0.1522, + "step": 4892 + }, + { + "epoch": 0.4508223153821348, + "grad_norm": 0.9401339014710594, + "learning_rate": 3.1071040592433003e-06, + "loss": 0.1625, + "step": 4893 + }, + { + "epoch": 0.4509144515594048, + "grad_norm": 0.9329946448016365, + "learning_rate": 3.1063650514240425e-06, + "loss": 0.141, + "step": 4894 + }, + { + "epoch": 0.4510065877366748, + "grad_norm": 0.8452855653517855, + "learning_rate": 3.105625987303723e-06, + "loss": 0.1403, + "step": 4895 + }, + { + "epoch": 0.4510987239139448, + "grad_norm": 0.8710890044558494, + "learning_rate": 3.104886866950966e-06, + "loss": 0.1351, + "step": 4896 + }, + { + "epoch": 0.45119086009121484, + "grad_norm": 0.9314641306578018, + "learning_rate": 3.104147690434398e-06, + "loss": 0.148, + "step": 4897 + }, + { + "epoch": 0.45128299626848484, + "grad_norm": 0.972321592820565, + "learning_rate": 3.103408457822653e-06, + "loss": 0.1458, + "step": 4898 + }, + { + "epoch": 0.45137513244575483, + "grad_norm": 0.9440018331535416, + "learning_rate": 3.1026691691843667e-06, + "loss": 0.1448, + "step": 4899 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 0.8171835747175893, + "learning_rate": 3.1019298245881836e-06, + "loss": 0.1261, + "step": 4900 + }, + { + "epoch": 0.45155940480029483, + "grad_norm": 0.9060060164646293, + "learning_rate": 3.101190424102752e-06, + "loss": 0.1444, + "step": 4901 + }, + { + "epoch": 0.4516515409775648, + "grad_norm": 0.9156139073129704, + "learning_rate": 3.100450967796724e-06, + "loss": 0.1435, + "step": 4902 + }, + { + "epoch": 0.4517436771548348, + "grad_norm": 0.9480421762707635, + "learning_rate": 3.099711455738759e-06, + "loss": 0.1498, + "step": 4903 + }, + { + "epoch": 0.4518358133321049, + "grad_norm": 0.8572388070239273, + "learning_rate": 3.0989718879975216e-06, + "loss": 0.1261, + "step": 4904 + }, + { + "epoch": 0.45192794950937487, + "grad_norm": 0.9440417055921769, + "learning_rate": 3.098232264641679e-06, + "loss": 0.1539, + "step": 4905 + }, + { + "epoch": 0.45202008568664487, + "grad_norm": 0.9367708656728884, + "learning_rate": 3.0974925857399067e-06, + "loss": 0.1469, + "step": 4906 + }, + { + "epoch": 0.45211222186391486, + "grad_norm": 0.9228480233084598, + "learning_rate": 3.0967528513608834e-06, + "loss": 0.1505, + "step": 4907 + }, + { + "epoch": 0.45220435804118486, + "grad_norm": 0.9142943272446579, + "learning_rate": 3.0960130615732934e-06, + "loss": 0.1359, + "step": 4908 + }, + { + "epoch": 0.45229649421845486, + "grad_norm": 0.9470937423096474, + "learning_rate": 3.095273216445827e-06, + "loss": 0.1545, + "step": 4909 + }, + { + "epoch": 0.45238863039572486, + "grad_norm": 0.8542008317546738, + "learning_rate": 3.0945333160471784e-06, + "loss": 0.1419, + "step": 4910 + }, + { + "epoch": 0.4524807665729949, + "grad_norm": 0.913915263073126, + "learning_rate": 3.0937933604460475e-06, + "loss": 0.1411, + "step": 4911 + }, + { + "epoch": 0.4525729027502649, + "grad_norm": 0.8900641602854975, + "learning_rate": 3.0930533497111385e-06, + "loss": 0.1452, + "step": 4912 + }, + { + "epoch": 0.4526650389275349, + "grad_norm": 0.8682569105387904, + "learning_rate": 3.0923132839111623e-06, + "loss": 0.1308, + "step": 4913 + }, + { + "epoch": 0.4527571751048049, + "grad_norm": 0.9630318001076058, + "learning_rate": 3.0915731631148347e-06, + "loss": 0.1394, + "step": 4914 + }, + { + "epoch": 0.4528493112820749, + "grad_norm": 0.9096845912213878, + "learning_rate": 3.0908329873908744e-06, + "loss": 0.1442, + "step": 4915 + }, + { + "epoch": 0.4529414474593449, + "grad_norm": 0.898608735328125, + "learning_rate": 3.0900927568080074e-06, + "loss": 0.1393, + "step": 4916 + }, + { + "epoch": 0.4530335836366149, + "grad_norm": 0.917045289108983, + "learning_rate": 3.0893524714349655e-06, + "loss": 0.1449, + "step": 4917 + }, + { + "epoch": 0.45312571981388494, + "grad_norm": 0.9167944285700524, + "learning_rate": 3.0886121313404827e-06, + "loss": 0.1475, + "step": 4918 + }, + { + "epoch": 0.45321785599115494, + "grad_norm": 0.9361875808177101, + "learning_rate": 3.0878717365933005e-06, + "loss": 0.1521, + "step": 4919 + }, + { + "epoch": 0.45330999216842494, + "grad_norm": 0.872165934115353, + "learning_rate": 3.087131287262163e-06, + "loss": 0.1435, + "step": 4920 + }, + { + "epoch": 0.45340212834569493, + "grad_norm": 0.9511119704159243, + "learning_rate": 3.0863907834158236e-06, + "loss": 0.1489, + "step": 4921 + }, + { + "epoch": 0.45349426452296493, + "grad_norm": 0.878621950916722, + "learning_rate": 3.0856502251230363e-06, + "loss": 0.1406, + "step": 4922 + }, + { + "epoch": 0.4535864007002349, + "grad_norm": 0.8888612621302028, + "learning_rate": 3.084909612452563e-06, + "loss": 0.14, + "step": 4923 + }, + { + "epoch": 0.453678536877505, + "grad_norm": 0.9063848035951376, + "learning_rate": 3.0841689454731686e-06, + "loss": 0.1543, + "step": 4924 + }, + { + "epoch": 0.453770673054775, + "grad_norm": 0.9055802766662286, + "learning_rate": 3.0834282242536253e-06, + "loss": 0.1433, + "step": 4925 + }, + { + "epoch": 0.453862809232045, + "grad_norm": 0.8804890898499717, + "learning_rate": 3.082687448862708e-06, + "loss": 0.1415, + "step": 4926 + }, + { + "epoch": 0.45395494540931497, + "grad_norm": 0.8951461422226128, + "learning_rate": 3.0819466193691995e-06, + "loss": 0.1366, + "step": 4927 + }, + { + "epoch": 0.45404708158658497, + "grad_norm": 0.9641931901566596, + "learning_rate": 3.0812057358418834e-06, + "loss": 0.1516, + "step": 4928 + }, + { + "epoch": 0.45413921776385496, + "grad_norm": 0.8913322240282533, + "learning_rate": 3.0804647983495527e-06, + "loss": 0.144, + "step": 4929 + }, + { + "epoch": 0.45423135394112496, + "grad_norm": 0.8853943694518898, + "learning_rate": 3.079723806961003e-06, + "loss": 0.1415, + "step": 4930 + }, + { + "epoch": 0.454323490118395, + "grad_norm": 0.892217812255629, + "learning_rate": 3.078982761745036e-06, + "loss": 0.1356, + "step": 4931 + }, + { + "epoch": 0.454415626295665, + "grad_norm": 0.9055251034056351, + "learning_rate": 3.078241662770456e-06, + "loss": 0.1459, + "step": 4932 + }, + { + "epoch": 0.454507762472935, + "grad_norm": 0.87958798551727, + "learning_rate": 3.0775005101060766e-06, + "loss": 0.1334, + "step": 4933 + }, + { + "epoch": 0.454599898650205, + "grad_norm": 0.9294900159804047, + "learning_rate": 3.076759303820712e-06, + "loss": 0.1483, + "step": 4934 + }, + { + "epoch": 0.454692034827475, + "grad_norm": 0.9137965516586513, + "learning_rate": 3.0760180439831844e-06, + "loss": 0.1461, + "step": 4935 + }, + { + "epoch": 0.454784171004745, + "grad_norm": 0.9170055144005241, + "learning_rate": 3.0752767306623193e-06, + "loss": 0.1549, + "step": 4936 + }, + { + "epoch": 0.454876307182015, + "grad_norm": 0.9215013771632808, + "learning_rate": 3.0745353639269476e-06, + "loss": 0.1468, + "step": 4937 + }, + { + "epoch": 0.45496844335928505, + "grad_norm": 0.87370944363821, + "learning_rate": 3.073793943845906e-06, + "loss": 0.1312, + "step": 4938 + }, + { + "epoch": 0.45506057953655504, + "grad_norm": 0.8529167887808932, + "learning_rate": 3.073052470488035e-06, + "loss": 0.1368, + "step": 4939 + }, + { + "epoch": 0.45515271571382504, + "grad_norm": 0.8213407441730175, + "learning_rate": 3.0723109439221794e-06, + "loss": 0.1219, + "step": 4940 + }, + { + "epoch": 0.45524485189109504, + "grad_norm": 0.9499098561290193, + "learning_rate": 3.071569364217192e-06, + "loss": 0.1488, + "step": 4941 + }, + { + "epoch": 0.45533698806836503, + "grad_norm": 0.9412672079828817, + "learning_rate": 3.070827731441927e-06, + "loss": 0.145, + "step": 4942 + }, + { + "epoch": 0.45542912424563503, + "grad_norm": 0.9449970015438249, + "learning_rate": 3.0700860456652467e-06, + "loss": 0.1527, + "step": 4943 + }, + { + "epoch": 0.455521260422905, + "grad_norm": 1.023026610833486, + "learning_rate": 3.0693443069560147e-06, + "loss": 0.1645, + "step": 4944 + }, + { + "epoch": 0.4556133966001751, + "grad_norm": 0.9322705358338801, + "learning_rate": 3.0686025153831033e-06, + "loss": 0.14, + "step": 4945 + }, + { + "epoch": 0.4557055327774451, + "grad_norm": 0.9495432808167561, + "learning_rate": 3.067860671015387e-06, + "loss": 0.1393, + "step": 4946 + }, + { + "epoch": 0.4557976689547151, + "grad_norm": 0.940822123502306, + "learning_rate": 3.0671187739217455e-06, + "loss": 0.1453, + "step": 4947 + }, + { + "epoch": 0.45588980513198507, + "grad_norm": 0.9230469467919934, + "learning_rate": 3.0663768241710653e-06, + "loss": 0.1404, + "step": 4948 + }, + { + "epoch": 0.45598194130925507, + "grad_norm": 0.8879484675632415, + "learning_rate": 3.065634821832237e-06, + "loss": 0.1453, + "step": 4949 + }, + { + "epoch": 0.45607407748652506, + "grad_norm": 0.8834701175995728, + "learning_rate": 3.064892766974153e-06, + "loss": 0.1337, + "step": 4950 + }, + { + "epoch": 0.45616621366379506, + "grad_norm": 0.9122572961159483, + "learning_rate": 3.0641506596657155e-06, + "loss": 0.1525, + "step": 4951 + }, + { + "epoch": 0.4562583498410651, + "grad_norm": 0.9907773312470673, + "learning_rate": 3.0634084999758283e-06, + "loss": 0.1481, + "step": 4952 + }, + { + "epoch": 0.4563504860183351, + "grad_norm": 1.0032234693838713, + "learning_rate": 3.0626662879734015e-06, + "loss": 0.1542, + "step": 4953 + }, + { + "epoch": 0.4564426221956051, + "grad_norm": 0.9644592767333385, + "learning_rate": 3.0619240237273496e-06, + "loss": 0.1467, + "step": 4954 + }, + { + "epoch": 0.4565347583728751, + "grad_norm": 0.9045737461881386, + "learning_rate": 3.0611817073065906e-06, + "loss": 0.1411, + "step": 4955 + }, + { + "epoch": 0.4566268945501451, + "grad_norm": 0.8749374497112531, + "learning_rate": 3.0604393387800506e-06, + "loss": 0.133, + "step": 4956 + }, + { + "epoch": 0.4567190307274151, + "grad_norm": 0.9062323098590219, + "learning_rate": 3.059696918216658e-06, + "loss": 0.1517, + "step": 4957 + }, + { + "epoch": 0.45681116690468515, + "grad_norm": 0.8681354325932334, + "learning_rate": 3.058954445685346e-06, + "loss": 0.139, + "step": 4958 + }, + { + "epoch": 0.45690330308195515, + "grad_norm": 0.9817471157907012, + "learning_rate": 3.058211921255053e-06, + "loss": 0.1503, + "step": 4959 + }, + { + "epoch": 0.45699543925922514, + "grad_norm": 0.8802616285300711, + "learning_rate": 3.0574693449947234e-06, + "loss": 0.1369, + "step": 4960 + }, + { + "epoch": 0.45708757543649514, + "grad_norm": 0.910891593640118, + "learning_rate": 3.056726716973305e-06, + "loss": 0.1402, + "step": 4961 + }, + { + "epoch": 0.45717971161376514, + "grad_norm": 1.030761021092152, + "learning_rate": 3.0559840372597516e-06, + "loss": 0.158, + "step": 4962 + }, + { + "epoch": 0.45727184779103514, + "grad_norm": 0.8832204519175471, + "learning_rate": 3.0552413059230196e-06, + "loss": 0.1497, + "step": 4963 + }, + { + "epoch": 0.45736398396830513, + "grad_norm": 0.932715520563677, + "learning_rate": 3.054498523032073e-06, + "loss": 0.1606, + "step": 4964 + }, + { + "epoch": 0.4574561201455752, + "grad_norm": 0.9295316822032308, + "learning_rate": 3.053755688655879e-06, + "loss": 0.1419, + "step": 4965 + }, + { + "epoch": 0.4575482563228452, + "grad_norm": 0.9730620058415319, + "learning_rate": 3.05301280286341e-06, + "loss": 0.1379, + "step": 4966 + }, + { + "epoch": 0.4576403925001152, + "grad_norm": 0.8698683882590288, + "learning_rate": 3.0522698657236417e-06, + "loss": 0.1364, + "step": 4967 + }, + { + "epoch": 0.4577325286773852, + "grad_norm": 0.9725906398499707, + "learning_rate": 3.0515268773055577e-06, + "loss": 0.1574, + "step": 4968 + }, + { + "epoch": 0.4578246648546552, + "grad_norm": 0.973396363810178, + "learning_rate": 3.0507838376781433e-06, + "loss": 0.1599, + "step": 4969 + }, + { + "epoch": 0.45791680103192517, + "grad_norm": 1.007119621555577, + "learning_rate": 3.050040746910391e-06, + "loss": 0.1615, + "step": 4970 + }, + { + "epoch": 0.45800893720919517, + "grad_norm": 0.9464800328407788, + "learning_rate": 3.049297605071296e-06, + "loss": 0.1472, + "step": 4971 + }, + { + "epoch": 0.4581010733864652, + "grad_norm": 0.970285334538535, + "learning_rate": 3.0485544122298586e-06, + "loss": 0.1539, + "step": 4972 + }, + { + "epoch": 0.4581932095637352, + "grad_norm": 0.8753266986448403, + "learning_rate": 3.0478111684550855e-06, + "loss": 0.1349, + "step": 4973 + }, + { + "epoch": 0.4582853457410052, + "grad_norm": 0.8632450928210212, + "learning_rate": 3.0470678738159865e-06, + "loss": 0.1393, + "step": 4974 + }, + { + "epoch": 0.4583774819182752, + "grad_norm": 0.9481887029950181, + "learning_rate": 3.046324528381576e-06, + "loss": 0.1544, + "step": 4975 + }, + { + "epoch": 0.4584696180955452, + "grad_norm": 0.9528130537970406, + "learning_rate": 3.045581132220875e-06, + "loss": 0.144, + "step": 4976 + }, + { + "epoch": 0.4585617542728152, + "grad_norm": 0.958356098741581, + "learning_rate": 3.0448376854029067e-06, + "loss": 0.158, + "step": 4977 + }, + { + "epoch": 0.4586538904500852, + "grad_norm": 0.886430427120259, + "learning_rate": 3.0440941879967007e-06, + "loss": 0.1349, + "step": 4978 + }, + { + "epoch": 0.45874602662735525, + "grad_norm": 0.9492295330617945, + "learning_rate": 3.043350640071291e-06, + "loss": 0.1498, + "step": 4979 + }, + { + "epoch": 0.45883816280462525, + "grad_norm": 0.8711956527213836, + "learning_rate": 3.0426070416957155e-06, + "loss": 0.1367, + "step": 4980 + }, + { + "epoch": 0.45893029898189525, + "grad_norm": 0.8805743996456511, + "learning_rate": 3.0418633929390184e-06, + "loss": 0.133, + "step": 4981 + }, + { + "epoch": 0.45902243515916524, + "grad_norm": 0.939114838941981, + "learning_rate": 3.0411196938702465e-06, + "loss": 0.1491, + "step": 4982 + }, + { + "epoch": 0.45911457133643524, + "grad_norm": 0.8942524010998183, + "learning_rate": 3.040375944558453e-06, + "loss": 0.1487, + "step": 4983 + }, + { + "epoch": 0.45920670751370524, + "grad_norm": 0.9381437330304268, + "learning_rate": 3.0396321450726946e-06, + "loss": 0.1454, + "step": 4984 + }, + { + "epoch": 0.4592988436909753, + "grad_norm": 0.8871246016080946, + "learning_rate": 3.0388882954820336e-06, + "loss": 0.1364, + "step": 4985 + }, + { + "epoch": 0.4593909798682453, + "grad_norm": 0.8884750944032963, + "learning_rate": 3.0381443958555367e-06, + "loss": 0.139, + "step": 4986 + }, + { + "epoch": 0.4594831160455153, + "grad_norm": 0.9264575790757549, + "learning_rate": 3.037400446262274e-06, + "loss": 0.1471, + "step": 4987 + }, + { + "epoch": 0.4595752522227853, + "grad_norm": 0.9376702144868666, + "learning_rate": 3.036656446771322e-06, + "loss": 0.1486, + "step": 4988 + }, + { + "epoch": 0.4596673884000553, + "grad_norm": 0.9416874908664606, + "learning_rate": 3.035912397451763e-06, + "loss": 0.1412, + "step": 4989 + }, + { + "epoch": 0.4597595245773253, + "grad_norm": 0.918365816769307, + "learning_rate": 3.035168298372678e-06, + "loss": 0.1395, + "step": 4990 + }, + { + "epoch": 0.45985166075459527, + "grad_norm": 0.9667651339344426, + "learning_rate": 3.0344241496031602e-06, + "loss": 0.1467, + "step": 4991 + }, + { + "epoch": 0.4599437969318653, + "grad_norm": 0.9702613519979137, + "learning_rate": 3.0336799512123017e-06, + "loss": 0.15, + "step": 4992 + }, + { + "epoch": 0.4600359331091353, + "grad_norm": 0.9617046493464246, + "learning_rate": 3.032935703269203e-06, + "loss": 0.145, + "step": 4993 + }, + { + "epoch": 0.4601280692864053, + "grad_norm": 1.021218623960953, + "learning_rate": 3.0321914058429668e-06, + "loss": 0.1517, + "step": 4994 + }, + { + "epoch": 0.4602202054636753, + "grad_norm": 0.9376645972483496, + "learning_rate": 3.0314470590027012e-06, + "loss": 0.1586, + "step": 4995 + }, + { + "epoch": 0.4603123416409453, + "grad_norm": 0.9052458305949739, + "learning_rate": 3.0307026628175183e-06, + "loss": 0.1454, + "step": 4996 + }, + { + "epoch": 0.4604044778182153, + "grad_norm": 0.8892358271557455, + "learning_rate": 3.029958217356537e-06, + "loss": 0.1423, + "step": 4997 + }, + { + "epoch": 0.4604966139954853, + "grad_norm": 0.8721683270176317, + "learning_rate": 3.029213722688878e-06, + "loss": 0.1422, + "step": 4998 + }, + { + "epoch": 0.46058875017275536, + "grad_norm": 0.9118092519515993, + "learning_rate": 3.0284691788836672e-06, + "loss": 0.1535, + "step": 4999 + }, + { + "epoch": 0.46068088635002535, + "grad_norm": 0.9566666504443163, + "learning_rate": 3.027724586010037e-06, + "loss": 0.1536, + "step": 5000 + }, + { + "epoch": 0.46068088635002535, + "eval_loss": 0.14408743381500244, + "eval_runtime": 299.7274, + "eval_samples_per_second": 23.411, + "eval_steps_per_second": 2.929, + "step": 5000 + }, + { + "epoch": 0.46077302252729535, + "grad_norm": 0.944085745803305, + "learning_rate": 3.0269799441371224e-06, + "loss": 0.1453, + "step": 5001 + }, + { + "epoch": 0.46086515870456535, + "grad_norm": 0.8901400825031632, + "learning_rate": 3.026235253334063e-06, + "loss": 0.1445, + "step": 5002 + }, + { + "epoch": 0.46095729488183534, + "grad_norm": 0.9088254286241351, + "learning_rate": 3.0254905136700038e-06, + "loss": 0.1415, + "step": 5003 + }, + { + "epoch": 0.46104943105910534, + "grad_norm": 0.9018385475977528, + "learning_rate": 3.024745725214093e-06, + "loss": 0.1404, + "step": 5004 + }, + { + "epoch": 0.46114156723637534, + "grad_norm": 0.8610078807583995, + "learning_rate": 3.024000888035486e-06, + "loss": 0.1327, + "step": 5005 + }, + { + "epoch": 0.4612337034136454, + "grad_norm": 0.9052272195057318, + "learning_rate": 3.0232560022033398e-06, + "loss": 0.1389, + "step": 5006 + }, + { + "epoch": 0.4613258395909154, + "grad_norm": 0.9070172834425693, + "learning_rate": 3.022511067786817e-06, + "loss": 0.1444, + "step": 5007 + }, + { + "epoch": 0.4614179757681854, + "grad_norm": 1.0011454075311015, + "learning_rate": 3.0217660848550863e-06, + "loss": 0.1616, + "step": 5008 + }, + { + "epoch": 0.4615101119454554, + "grad_norm": 0.8636917231521298, + "learning_rate": 3.0210210534773175e-06, + "loss": 0.1461, + "step": 5009 + }, + { + "epoch": 0.4616022481227254, + "grad_norm": 0.887277083871904, + "learning_rate": 3.020275973722688e-06, + "loss": 0.1413, + "step": 5010 + }, + { + "epoch": 0.4616943842999954, + "grad_norm": 0.904649471086909, + "learning_rate": 3.0195308456603795e-06, + "loss": 0.1444, + "step": 5011 + }, + { + "epoch": 0.46178652047726537, + "grad_norm": 0.922525202971339, + "learning_rate": 3.018785669359575e-06, + "loss": 0.1349, + "step": 5012 + }, + { + "epoch": 0.4618786566545354, + "grad_norm": 0.8871262781575864, + "learning_rate": 3.018040444889466e-06, + "loss": 0.1357, + "step": 5013 + }, + { + "epoch": 0.4619707928318054, + "grad_norm": 0.8900463787551096, + "learning_rate": 3.0172951723192456e-06, + "loss": 0.1374, + "step": 5014 + }, + { + "epoch": 0.4620629290090754, + "grad_norm": 0.9042316250161178, + "learning_rate": 3.016549851718112e-06, + "loss": 0.1456, + "step": 5015 + }, + { + "epoch": 0.4621550651863454, + "grad_norm": 0.9786460909009905, + "learning_rate": 3.0158044831552703e-06, + "loss": 0.1508, + "step": 5016 + }, + { + "epoch": 0.4622472013636154, + "grad_norm": 0.8686153084102173, + "learning_rate": 3.015059066699926e-06, + "loss": 0.1343, + "step": 5017 + }, + { + "epoch": 0.4623393375408854, + "grad_norm": 0.8651676610958235, + "learning_rate": 3.0143136024212923e-06, + "loss": 0.1362, + "step": 5018 + }, + { + "epoch": 0.46243147371815546, + "grad_norm": 0.9019007066819323, + "learning_rate": 3.013568090388585e-06, + "loss": 0.145, + "step": 5019 + }, + { + "epoch": 0.46252360989542546, + "grad_norm": 0.9500449930037843, + "learning_rate": 3.012822530671026e-06, + "loss": 0.1437, + "step": 5020 + }, + { + "epoch": 0.46261574607269546, + "grad_norm": 0.9361307249362852, + "learning_rate": 3.012076923337839e-06, + "loss": 0.1569, + "step": 5021 + }, + { + "epoch": 0.46270788224996545, + "grad_norm": 0.8733335938562423, + "learning_rate": 3.011331268458255e-06, + "loss": 0.1424, + "step": 5022 + }, + { + "epoch": 0.46280001842723545, + "grad_norm": 0.8812589568625355, + "learning_rate": 3.010585566101507e-06, + "loss": 0.1437, + "step": 5023 + }, + { + "epoch": 0.46289215460450545, + "grad_norm": 0.8940904754428848, + "learning_rate": 3.0098398163368353e-06, + "loss": 0.1385, + "step": 5024 + }, + { + "epoch": 0.46298429078177544, + "grad_norm": 0.8489133395575272, + "learning_rate": 3.0090940192334805e-06, + "loss": 0.1351, + "step": 5025 + }, + { + "epoch": 0.4630764269590455, + "grad_norm": 0.905730134591134, + "learning_rate": 3.0083481748606923e-06, + "loss": 0.1499, + "step": 5026 + }, + { + "epoch": 0.4631685631363155, + "grad_norm": 0.8927244938308796, + "learning_rate": 3.007602283287721e-06, + "loss": 0.1435, + "step": 5027 + }, + { + "epoch": 0.4632606993135855, + "grad_norm": 0.8957615740961147, + "learning_rate": 3.0068563445838234e-06, + "loss": 0.1496, + "step": 5028 + }, + { + "epoch": 0.4633528354908555, + "grad_norm": 0.9292440088049803, + "learning_rate": 3.0061103588182592e-06, + "loss": 0.1532, + "step": 5029 + }, + { + "epoch": 0.4634449716681255, + "grad_norm": 0.8525916459383819, + "learning_rate": 3.005364326060294e-06, + "loss": 0.1267, + "step": 5030 + }, + { + "epoch": 0.4635371078453955, + "grad_norm": 0.9596307408249468, + "learning_rate": 3.0046182463791962e-06, + "loss": 0.1395, + "step": 5031 + }, + { + "epoch": 0.4636292440226655, + "grad_norm": 0.9635650608907829, + "learning_rate": 3.0038721198442406e-06, + "loss": 0.1495, + "step": 5032 + }, + { + "epoch": 0.46372138019993553, + "grad_norm": 0.8783556628180413, + "learning_rate": 3.003125946524704e-06, + "loss": 0.1374, + "step": 5033 + }, + { + "epoch": 0.4638135163772055, + "grad_norm": 0.942047255532235, + "learning_rate": 3.002379726489869e-06, + "loss": 0.1443, + "step": 5034 + }, + { + "epoch": 0.4639056525544755, + "grad_norm": 1.029948109716625, + "learning_rate": 3.001633459809023e-06, + "loss": 0.1544, + "step": 5035 + }, + { + "epoch": 0.4639977887317455, + "grad_norm": 0.9450058200355388, + "learning_rate": 3.000887146551455e-06, + "loss": 0.1447, + "step": 5036 + }, + { + "epoch": 0.4640899249090155, + "grad_norm": 0.8932809463577716, + "learning_rate": 3.000140786786463e-06, + "loss": 0.1351, + "step": 5037 + }, + { + "epoch": 0.4641820610862855, + "grad_norm": 0.9064007681388029, + "learning_rate": 2.9993943805833444e-06, + "loss": 0.1449, + "step": 5038 + }, + { + "epoch": 0.4642741972635555, + "grad_norm": 0.8988360819697974, + "learning_rate": 2.998647928011404e-06, + "loss": 0.1423, + "step": 5039 + }, + { + "epoch": 0.46436633344082556, + "grad_norm": 0.8884133863776579, + "learning_rate": 2.9979014291399495e-06, + "loss": 0.1381, + "step": 5040 + }, + { + "epoch": 0.46445846961809556, + "grad_norm": 0.9578344478960423, + "learning_rate": 2.997154884038294e-06, + "loss": 0.1457, + "step": 5041 + }, + { + "epoch": 0.46455060579536556, + "grad_norm": 0.8889333116943653, + "learning_rate": 2.9964082927757537e-06, + "loss": 0.1434, + "step": 5042 + }, + { + "epoch": 0.46464274197263555, + "grad_norm": 0.8880790358302674, + "learning_rate": 2.995661655421651e-06, + "loss": 0.1422, + "step": 5043 + }, + { + "epoch": 0.46473487814990555, + "grad_norm": 0.9310009728143278, + "learning_rate": 2.994914972045309e-06, + "loss": 0.1519, + "step": 5044 + }, + { + "epoch": 0.46482701432717555, + "grad_norm": 0.9135511073129388, + "learning_rate": 2.994168242716059e-06, + "loss": 0.1439, + "step": 5045 + }, + { + "epoch": 0.46491915050444554, + "grad_norm": 0.850859033376075, + "learning_rate": 2.9934214675032346e-06, + "loss": 0.1444, + "step": 5046 + }, + { + "epoch": 0.4650112866817156, + "grad_norm": 0.9225464686632786, + "learning_rate": 2.9926746464761743e-06, + "loss": 0.1519, + "step": 5047 + }, + { + "epoch": 0.4651034228589856, + "grad_norm": 0.9176808967329263, + "learning_rate": 2.9919277797042196e-06, + "loss": 0.1358, + "step": 5048 + }, + { + "epoch": 0.4651955590362556, + "grad_norm": 0.8571010707868579, + "learning_rate": 2.991180867256718e-06, + "loss": 0.1421, + "step": 5049 + }, + { + "epoch": 0.4652876952135256, + "grad_norm": 0.8862854880044667, + "learning_rate": 2.990433909203019e-06, + "loss": 0.1419, + "step": 5050 + }, + { + "epoch": 0.4653798313907956, + "grad_norm": 0.8790782944491246, + "learning_rate": 2.9896869056124795e-06, + "loss": 0.1408, + "step": 5051 + }, + { + "epoch": 0.4654719675680656, + "grad_norm": 0.9012749764460423, + "learning_rate": 2.9889398565544576e-06, + "loss": 0.1476, + "step": 5052 + }, + { + "epoch": 0.46556410374533563, + "grad_norm": 0.8814257961671169, + "learning_rate": 2.9881927620983175e-06, + "loss": 0.1455, + "step": 5053 + }, + { + "epoch": 0.46565623992260563, + "grad_norm": 0.8888919896330232, + "learning_rate": 2.9874456223134273e-06, + "loss": 0.1395, + "step": 5054 + }, + { + "epoch": 0.4657483760998756, + "grad_norm": 0.8966843988002527, + "learning_rate": 2.9866984372691586e-06, + "loss": 0.1461, + "step": 5055 + }, + { + "epoch": 0.4658405122771456, + "grad_norm": 0.9811961006056726, + "learning_rate": 2.985951207034888e-06, + "loss": 0.1566, + "step": 5056 + }, + { + "epoch": 0.4659326484544156, + "grad_norm": 0.9175985358748722, + "learning_rate": 2.985203931679995e-06, + "loss": 0.1524, + "step": 5057 + }, + { + "epoch": 0.4660247846316856, + "grad_norm": 0.9465765486071972, + "learning_rate": 2.984456611273864e-06, + "loss": 0.1522, + "step": 5058 + }, + { + "epoch": 0.4661169208089556, + "grad_norm": 0.9159762833014369, + "learning_rate": 2.9837092458858862e-06, + "loss": 0.1425, + "step": 5059 + }, + { + "epoch": 0.46620905698622567, + "grad_norm": 0.9303292659055917, + "learning_rate": 2.982961835585451e-06, + "loss": 0.1508, + "step": 5060 + }, + { + "epoch": 0.46630119316349566, + "grad_norm": 0.9815182981934475, + "learning_rate": 2.9822143804419586e-06, + "loss": 0.1443, + "step": 5061 + }, + { + "epoch": 0.46639332934076566, + "grad_norm": 0.9247239725037197, + "learning_rate": 2.981466880524809e-06, + "loss": 0.1619, + "step": 5062 + }, + { + "epoch": 0.46648546551803566, + "grad_norm": 0.9203976339424482, + "learning_rate": 2.9807193359034077e-06, + "loss": 0.1437, + "step": 5063 + }, + { + "epoch": 0.46657760169530565, + "grad_norm": 0.9511622917745709, + "learning_rate": 2.979971746647164e-06, + "loss": 0.1503, + "step": 5064 + }, + { + "epoch": 0.46666973787257565, + "grad_norm": 0.9639638100811875, + "learning_rate": 2.9792241128254916e-06, + "loss": 0.1526, + "step": 5065 + }, + { + "epoch": 0.46676187404984565, + "grad_norm": 0.8636214111011123, + "learning_rate": 2.978476434507809e-06, + "loss": 0.1406, + "step": 5066 + }, + { + "epoch": 0.4668540102271157, + "grad_norm": 0.9786702258760724, + "learning_rate": 2.9777287117635387e-06, + "loss": 0.1501, + "step": 5067 + }, + { + "epoch": 0.4669461464043857, + "grad_norm": 0.8728745748666031, + "learning_rate": 2.9769809446621057e-06, + "loss": 0.1398, + "step": 5068 + }, + { + "epoch": 0.4670382825816557, + "grad_norm": 0.9069597536608185, + "learning_rate": 2.9762331332729405e-06, + "loss": 0.1374, + "step": 5069 + }, + { + "epoch": 0.4671304187589257, + "grad_norm": 0.9322944650625257, + "learning_rate": 2.975485277665478e-06, + "loss": 0.1469, + "step": 5070 + }, + { + "epoch": 0.4672225549361957, + "grad_norm": 0.9255137327705216, + "learning_rate": 2.9747373779091552e-06, + "loss": 0.1442, + "step": 5071 + }, + { + "epoch": 0.4673146911134657, + "grad_norm": 0.9013228649521703, + "learning_rate": 2.9739894340734177e-06, + "loss": 0.1485, + "step": 5072 + }, + { + "epoch": 0.4674068272907357, + "grad_norm": 0.9255808656859331, + "learning_rate": 2.9732414462277083e-06, + "loss": 0.1361, + "step": 5073 + }, + { + "epoch": 0.46749896346800573, + "grad_norm": 0.8959121976406828, + "learning_rate": 2.9724934144414807e-06, + "loss": 0.1392, + "step": 5074 + }, + { + "epoch": 0.46759109964527573, + "grad_norm": 0.9969189706936202, + "learning_rate": 2.9717453387841884e-06, + "loss": 0.1477, + "step": 5075 + }, + { + "epoch": 0.46768323582254573, + "grad_norm": 0.8902239293901296, + "learning_rate": 2.9709972193252905e-06, + "loss": 0.1448, + "step": 5076 + }, + { + "epoch": 0.4677753719998157, + "grad_norm": 0.8644170281111183, + "learning_rate": 2.9702490561342505e-06, + "loss": 0.1384, + "step": 5077 + }, + { + "epoch": 0.4678675081770857, + "grad_norm": 0.8383712450753181, + "learning_rate": 2.969500849280535e-06, + "loss": 0.1306, + "step": 5078 + }, + { + "epoch": 0.4679596443543557, + "grad_norm": 0.8679667354263388, + "learning_rate": 2.9687525988336147e-06, + "loss": 0.144, + "step": 5079 + }, + { + "epoch": 0.4680517805316257, + "grad_norm": 0.9255121092467719, + "learning_rate": 2.968004304862966e-06, + "loss": 0.1363, + "step": 5080 + }, + { + "epoch": 0.46814391670889577, + "grad_norm": 0.9135538111005072, + "learning_rate": 2.9672559674380664e-06, + "loss": 0.149, + "step": 5081 + }, + { + "epoch": 0.46823605288616577, + "grad_norm": 0.8891994213746041, + "learning_rate": 2.9665075866284e-06, + "loss": 0.1455, + "step": 5082 + }, + { + "epoch": 0.46832818906343576, + "grad_norm": 0.906236563353806, + "learning_rate": 2.9657591625034543e-06, + "loss": 0.1342, + "step": 5083 + }, + { + "epoch": 0.46842032524070576, + "grad_norm": 0.8876771489888291, + "learning_rate": 2.9650106951327202e-06, + "loss": 0.1346, + "step": 5084 + }, + { + "epoch": 0.46851246141797576, + "grad_norm": 0.9419083874001088, + "learning_rate": 2.964262184585692e-06, + "loss": 0.1434, + "step": 5085 + }, + { + "epoch": 0.46860459759524575, + "grad_norm": 0.9289938357240787, + "learning_rate": 2.963513630931872e-06, + "loss": 0.1576, + "step": 5086 + }, + { + "epoch": 0.4686967337725158, + "grad_norm": 0.9715748809553193, + "learning_rate": 2.96276503424076e-06, + "loss": 0.1542, + "step": 5087 + }, + { + "epoch": 0.4687888699497858, + "grad_norm": 0.8970855643969083, + "learning_rate": 2.9620163945818648e-06, + "loss": 0.1441, + "step": 5088 + }, + { + "epoch": 0.4688810061270558, + "grad_norm": 0.999806972740666, + "learning_rate": 2.961267712024698e-06, + "loss": 0.1608, + "step": 5089 + }, + { + "epoch": 0.4689731423043258, + "grad_norm": 0.9615698363397197, + "learning_rate": 2.9605189866387746e-06, + "loss": 0.15, + "step": 5090 + }, + { + "epoch": 0.4690652784815958, + "grad_norm": 0.9082739679420999, + "learning_rate": 2.9597702184936137e-06, + "loss": 0.1494, + "step": 5091 + }, + { + "epoch": 0.4691574146588658, + "grad_norm": 0.8927931890323686, + "learning_rate": 2.9590214076587386e-06, + "loss": 0.1425, + "step": 5092 + }, + { + "epoch": 0.4692495508361358, + "grad_norm": 0.8850697632312967, + "learning_rate": 2.958272554203676e-06, + "loss": 0.1422, + "step": 5093 + }, + { + "epoch": 0.46934168701340584, + "grad_norm": 0.9147171490433019, + "learning_rate": 2.9575236581979576e-06, + "loss": 0.1467, + "step": 5094 + }, + { + "epoch": 0.46943382319067584, + "grad_norm": 0.864702255400151, + "learning_rate": 2.9567747197111186e-06, + "loss": 0.1387, + "step": 5095 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 0.8767863278849377, + "learning_rate": 2.9560257388126973e-06, + "loss": 0.1473, + "step": 5096 + }, + { + "epoch": 0.46961809554521583, + "grad_norm": 0.9674640608974738, + "learning_rate": 2.9552767155722375e-06, + "loss": 0.132, + "step": 5097 + }, + { + "epoch": 0.4697102317224858, + "grad_norm": 0.9065230519661844, + "learning_rate": 2.954527650059285e-06, + "loss": 0.1394, + "step": 5098 + }, + { + "epoch": 0.4698023678997558, + "grad_norm": 0.8556130271531583, + "learning_rate": 2.9537785423433925e-06, + "loss": 0.1301, + "step": 5099 + }, + { + "epoch": 0.4698945040770258, + "grad_norm": 0.9112839599118244, + "learning_rate": 2.9530293924941123e-06, + "loss": 0.1542, + "step": 5100 + }, + { + "epoch": 0.4699866402542959, + "grad_norm": 0.9598181354372883, + "learning_rate": 2.9522802005810043e-06, + "loss": 0.1526, + "step": 5101 + }, + { + "epoch": 0.47007877643156587, + "grad_norm": 0.8624365229094141, + "learning_rate": 2.9515309666736312e-06, + "loss": 0.1359, + "step": 5102 + }, + { + "epoch": 0.47017091260883587, + "grad_norm": 0.8809972735025539, + "learning_rate": 2.95078169084156e-06, + "loss": 0.1363, + "step": 5103 + }, + { + "epoch": 0.47026304878610586, + "grad_norm": 0.8561841482860453, + "learning_rate": 2.9500323731543596e-06, + "loss": 0.1252, + "step": 5104 + }, + { + "epoch": 0.47035518496337586, + "grad_norm": 0.8809602128506461, + "learning_rate": 2.9492830136816053e-06, + "loss": 0.1407, + "step": 5105 + }, + { + "epoch": 0.47044732114064586, + "grad_norm": 0.9372906064000932, + "learning_rate": 2.948533612492874e-06, + "loss": 0.1543, + "step": 5106 + }, + { + "epoch": 0.47053945731791585, + "grad_norm": 0.8940500798202332, + "learning_rate": 2.947784169657749e-06, + "loss": 0.1342, + "step": 5107 + }, + { + "epoch": 0.4706315934951859, + "grad_norm": 0.9499099523288487, + "learning_rate": 2.947034685245816e-06, + "loss": 0.1528, + "step": 5108 + }, + { + "epoch": 0.4707237296724559, + "grad_norm": 0.9332993914331615, + "learning_rate": 2.946285159326664e-06, + "loss": 0.146, + "step": 5109 + }, + { + "epoch": 0.4708158658497259, + "grad_norm": 0.9145503369594333, + "learning_rate": 2.945535591969887e-06, + "loss": 0.1366, + "step": 5110 + }, + { + "epoch": 0.4709080020269959, + "grad_norm": 0.87163080765042, + "learning_rate": 2.944785983245082e-06, + "loss": 0.1327, + "step": 5111 + }, + { + "epoch": 0.4710001382042659, + "grad_norm": 0.9165609316092446, + "learning_rate": 2.944036333221851e-06, + "loss": 0.1538, + "step": 5112 + }, + { + "epoch": 0.4710922743815359, + "grad_norm": 0.8710335528196103, + "learning_rate": 2.9432866419697993e-06, + "loss": 0.1289, + "step": 5113 + }, + { + "epoch": 0.4711844105588059, + "grad_norm": 0.8827968344332082, + "learning_rate": 2.9425369095585337e-06, + "loss": 0.133, + "step": 5114 + }, + { + "epoch": 0.47127654673607594, + "grad_norm": 0.8897129402540811, + "learning_rate": 2.94178713605767e-06, + "loss": 0.1358, + "step": 5115 + }, + { + "epoch": 0.47136868291334594, + "grad_norm": 0.9143135967958729, + "learning_rate": 2.9410373215368216e-06, + "loss": 0.1402, + "step": 5116 + }, + { + "epoch": 0.47146081909061593, + "grad_norm": 0.8708685814829983, + "learning_rate": 2.9402874660656113e-06, + "loss": 0.141, + "step": 5117 + }, + { + "epoch": 0.47155295526788593, + "grad_norm": 0.9041670120185827, + "learning_rate": 2.9395375697136623e-06, + "loss": 0.1388, + "step": 5118 + }, + { + "epoch": 0.47164509144515593, + "grad_norm": 0.9052941120311353, + "learning_rate": 2.9387876325506025e-06, + "loss": 0.14, + "step": 5119 + }, + { + "epoch": 0.4717372276224259, + "grad_norm": 0.8817598067857801, + "learning_rate": 2.9380376546460643e-06, + "loss": 0.1344, + "step": 5120 + }, + { + "epoch": 0.471829363799696, + "grad_norm": 0.9163393295135425, + "learning_rate": 2.9372876360696823e-06, + "loss": 0.1398, + "step": 5121 + }, + { + "epoch": 0.471921499976966, + "grad_norm": 0.8989709981272607, + "learning_rate": 2.9365375768910957e-06, + "loss": 0.1398, + "step": 5122 + }, + { + "epoch": 0.47201363615423597, + "grad_norm": 0.8474850729012813, + "learning_rate": 2.935787477179949e-06, + "loss": 0.128, + "step": 5123 + }, + { + "epoch": 0.47210577233150597, + "grad_norm": 0.9426786416617355, + "learning_rate": 2.9350373370058882e-06, + "loss": 0.1482, + "step": 5124 + }, + { + "epoch": 0.47219790850877597, + "grad_norm": 0.9157568621182558, + "learning_rate": 2.9342871564385627e-06, + "loss": 0.1445, + "step": 5125 + }, + { + "epoch": 0.47229004468604596, + "grad_norm": 0.9060909697019053, + "learning_rate": 2.9335369355476295e-06, + "loss": 0.14, + "step": 5126 + }, + { + "epoch": 0.47238218086331596, + "grad_norm": 0.9206297306151924, + "learning_rate": 2.932786674402744e-06, + "loss": 0.1467, + "step": 5127 + }, + { + "epoch": 0.472474317040586, + "grad_norm": 0.9762708271384426, + "learning_rate": 2.9320363730735696e-06, + "loss": 0.1528, + "step": 5128 + }, + { + "epoch": 0.472566453217856, + "grad_norm": 0.9265016675178621, + "learning_rate": 2.9312860316297716e-06, + "loss": 0.1349, + "step": 5129 + }, + { + "epoch": 0.472658589395126, + "grad_norm": 0.9251132679088837, + "learning_rate": 2.930535650141019e-06, + "loss": 0.147, + "step": 5130 + }, + { + "epoch": 0.472750725572396, + "grad_norm": 0.9300808982858864, + "learning_rate": 2.9297852286769852e-06, + "loss": 0.1452, + "step": 5131 + }, + { + "epoch": 0.472842861749666, + "grad_norm": 0.8853462094377297, + "learning_rate": 2.9290347673073466e-06, + "loss": 0.1353, + "step": 5132 + }, + { + "epoch": 0.472934997926936, + "grad_norm": 0.8979270084102228, + "learning_rate": 2.928284266101783e-06, + "loss": 0.1379, + "step": 5133 + }, + { + "epoch": 0.473027134104206, + "grad_norm": 0.858106147363866, + "learning_rate": 2.9275337251299808e-06, + "loss": 0.1373, + "step": 5134 + }, + { + "epoch": 0.47311927028147605, + "grad_norm": 0.9349995659851857, + "learning_rate": 2.9267831444616244e-06, + "loss": 0.1358, + "step": 5135 + }, + { + "epoch": 0.47321140645874604, + "grad_norm": 0.9034520136048325, + "learning_rate": 2.926032524166408e-06, + "loss": 0.1382, + "step": 5136 + }, + { + "epoch": 0.47330354263601604, + "grad_norm": 0.9136257429700642, + "learning_rate": 2.9252818643140256e-06, + "loss": 0.143, + "step": 5137 + }, + { + "epoch": 0.47339567881328604, + "grad_norm": 0.9468303486280631, + "learning_rate": 2.9245311649741765e-06, + "loss": 0.1354, + "step": 5138 + }, + { + "epoch": 0.47348781499055603, + "grad_norm": 0.9333786953701937, + "learning_rate": 2.9237804262165632e-06, + "loss": 0.1494, + "step": 5139 + }, + { + "epoch": 0.47357995116782603, + "grad_norm": 0.8913158148329773, + "learning_rate": 2.9230296481108916e-06, + "loss": 0.1491, + "step": 5140 + }, + { + "epoch": 0.473672087345096, + "grad_norm": 0.9347408552372286, + "learning_rate": 2.922278830726871e-06, + "loss": 0.1469, + "step": 5141 + }, + { + "epoch": 0.4737642235223661, + "grad_norm": 0.9046310230891329, + "learning_rate": 2.9215279741342165e-06, + "loss": 0.1248, + "step": 5142 + }, + { + "epoch": 0.4738563596996361, + "grad_norm": 0.8910835401059762, + "learning_rate": 2.9207770784026436e-06, + "loss": 0.1375, + "step": 5143 + }, + { + "epoch": 0.4739484958769061, + "grad_norm": 0.8892182784728998, + "learning_rate": 2.920026143601874e-06, + "loss": 0.1277, + "step": 5144 + }, + { + "epoch": 0.47404063205417607, + "grad_norm": 0.8774140482608112, + "learning_rate": 2.9192751698016317e-06, + "loss": 0.1368, + "step": 5145 + }, + { + "epoch": 0.47413276823144607, + "grad_norm": 0.9889873174330852, + "learning_rate": 2.918524157071645e-06, + "loss": 0.1509, + "step": 5146 + }, + { + "epoch": 0.47422490440871606, + "grad_norm": 0.9436506649824882, + "learning_rate": 2.917773105481645e-06, + "loss": 0.1535, + "step": 5147 + }, + { + "epoch": 0.47431704058598606, + "grad_norm": 0.8873798622275758, + "learning_rate": 2.917022015101367e-06, + "loss": 0.1312, + "step": 5148 + }, + { + "epoch": 0.4744091767632561, + "grad_norm": 0.9843878112858804, + "learning_rate": 2.91627088600055e-06, + "loss": 0.1525, + "step": 5149 + }, + { + "epoch": 0.4745013129405261, + "grad_norm": 0.9278713582496736, + "learning_rate": 2.9155197182489375e-06, + "loss": 0.133, + "step": 5150 + }, + { + "epoch": 0.4745934491177961, + "grad_norm": 0.8901035694884752, + "learning_rate": 2.9147685119162735e-06, + "loss": 0.1194, + "step": 5151 + }, + { + "epoch": 0.4746855852950661, + "grad_norm": 0.8786311039803401, + "learning_rate": 2.9140172670723083e-06, + "loss": 0.1364, + "step": 5152 + }, + { + "epoch": 0.4747777214723361, + "grad_norm": 0.9353843520819218, + "learning_rate": 2.913265983786796e-06, + "loss": 0.1367, + "step": 5153 + }, + { + "epoch": 0.4748698576496061, + "grad_norm": 0.9136948540094968, + "learning_rate": 2.9125146621294915e-06, + "loss": 0.1585, + "step": 5154 + }, + { + "epoch": 0.47496199382687615, + "grad_norm": 0.9192694976349985, + "learning_rate": 2.9117633021701574e-06, + "loss": 0.1482, + "step": 5155 + }, + { + "epoch": 0.47505413000414615, + "grad_norm": 0.9747913478550096, + "learning_rate": 2.9110119039785554e-06, + "loss": 0.1425, + "step": 5156 + }, + { + "epoch": 0.47514626618141614, + "grad_norm": 0.9813043098360802, + "learning_rate": 2.910260467624455e-06, + "loss": 0.153, + "step": 5157 + }, + { + "epoch": 0.47523840235868614, + "grad_norm": 0.8829325066846414, + "learning_rate": 2.9095089931776255e-06, + "loss": 0.1476, + "step": 5158 + }, + { + "epoch": 0.47533053853595614, + "grad_norm": 0.9194266463326177, + "learning_rate": 2.908757480707842e-06, + "loss": 0.1553, + "step": 5159 + }, + { + "epoch": 0.47542267471322613, + "grad_norm": 0.9373661605075402, + "learning_rate": 2.908005930284882e-06, + "loss": 0.1523, + "step": 5160 + }, + { + "epoch": 0.47551481089049613, + "grad_norm": 0.9409623196439526, + "learning_rate": 2.907254341978528e-06, + "loss": 0.1479, + "step": 5161 + }, + { + "epoch": 0.4756069470677662, + "grad_norm": 0.9489593698972468, + "learning_rate": 2.906502715858564e-06, + "loss": 0.1532, + "step": 5162 + }, + { + "epoch": 0.4756990832450362, + "grad_norm": 0.8697876898890173, + "learning_rate": 2.9057510519947794e-06, + "loss": 0.1359, + "step": 5163 + }, + { + "epoch": 0.4757912194223062, + "grad_norm": 0.9932677774158268, + "learning_rate": 2.9049993504569663e-06, + "loss": 0.1465, + "step": 5164 + }, + { + "epoch": 0.4758833555995762, + "grad_norm": 0.8608562365126148, + "learning_rate": 2.9042476113149193e-06, + "loss": 0.136, + "step": 5165 + }, + { + "epoch": 0.47597549177684617, + "grad_norm": 0.9028113269728798, + "learning_rate": 2.9034958346384385e-06, + "loss": 0.1507, + "step": 5166 + }, + { + "epoch": 0.47606762795411617, + "grad_norm": 0.9736011569329539, + "learning_rate": 2.9027440204973263e-06, + "loss": 0.1503, + "step": 5167 + }, + { + "epoch": 0.47615976413138617, + "grad_norm": 0.8603637223144222, + "learning_rate": 2.9019921689613874e-06, + "loss": 0.1339, + "step": 5168 + }, + { + "epoch": 0.4762519003086562, + "grad_norm": 0.8692468134048886, + "learning_rate": 2.9012402801004334e-06, + "loss": 0.1389, + "step": 5169 + }, + { + "epoch": 0.4763440364859262, + "grad_norm": 0.9180819372587773, + "learning_rate": 2.9004883539842756e-06, + "loss": 0.1456, + "step": 5170 + }, + { + "epoch": 0.4764361726631962, + "grad_norm": 0.9080609642454912, + "learning_rate": 2.8997363906827315e-06, + "loss": 0.1365, + "step": 5171 + }, + { + "epoch": 0.4765283088404662, + "grad_norm": 0.8854688032810417, + "learning_rate": 2.8989843902656202e-06, + "loss": 0.1291, + "step": 5172 + }, + { + "epoch": 0.4766204450177362, + "grad_norm": 0.9528179450062821, + "learning_rate": 2.898232352802765e-06, + "loss": 0.1453, + "step": 5173 + }, + { + "epoch": 0.4767125811950062, + "grad_norm": 0.8690123948765117, + "learning_rate": 2.8974802783639934e-06, + "loss": 0.133, + "step": 5174 + }, + { + "epoch": 0.4768047173722762, + "grad_norm": 0.943895660424614, + "learning_rate": 2.8967281670191357e-06, + "loss": 0.146, + "step": 5175 + }, + { + "epoch": 0.47689685354954625, + "grad_norm": 0.922222468563535, + "learning_rate": 2.895976018838024e-06, + "loss": 0.1397, + "step": 5176 + }, + { + "epoch": 0.47698898972681625, + "grad_norm": 0.8783345231451642, + "learning_rate": 2.895223833890497e-06, + "loss": 0.1368, + "step": 5177 + }, + { + "epoch": 0.47708112590408625, + "grad_norm": 0.8963037452817142, + "learning_rate": 2.8944716122463933e-06, + "loss": 0.127, + "step": 5178 + }, + { + "epoch": 0.47717326208135624, + "grad_norm": 0.9540068837330717, + "learning_rate": 2.8937193539755593e-06, + "loss": 0.133, + "step": 5179 + }, + { + "epoch": 0.47726539825862624, + "grad_norm": 0.926155655361845, + "learning_rate": 2.8929670591478404e-06, + "loss": 0.1597, + "step": 5180 + }, + { + "epoch": 0.47735753443589624, + "grad_norm": 0.9621119246946411, + "learning_rate": 2.8922147278330876e-06, + "loss": 0.1544, + "step": 5181 + }, + { + "epoch": 0.47744967061316623, + "grad_norm": 0.957134161209612, + "learning_rate": 2.891462360101156e-06, + "loss": 0.1517, + "step": 5182 + }, + { + "epoch": 0.4775418067904363, + "grad_norm": 0.9236587620190673, + "learning_rate": 2.890709956021901e-06, + "loss": 0.14, + "step": 5183 + }, + { + "epoch": 0.4776339429677063, + "grad_norm": 0.9000620955817382, + "learning_rate": 2.8899575156651847e-06, + "loss": 0.1374, + "step": 5184 + }, + { + "epoch": 0.4777260791449763, + "grad_norm": 0.919864564287617, + "learning_rate": 2.889205039100872e-06, + "loss": 0.1415, + "step": 5185 + }, + { + "epoch": 0.4778182153222463, + "grad_norm": 0.8402420054493916, + "learning_rate": 2.8884525263988288e-06, + "loss": 0.1265, + "step": 5186 + }, + { + "epoch": 0.4779103514995163, + "grad_norm": 0.8852975503830686, + "learning_rate": 2.887699977628927e-06, + "loss": 0.1262, + "step": 5187 + }, + { + "epoch": 0.47800248767678627, + "grad_norm": 0.8630089162205361, + "learning_rate": 2.886947392861041e-06, + "loss": 0.1349, + "step": 5188 + }, + { + "epoch": 0.4780946238540563, + "grad_norm": 0.8956680318013008, + "learning_rate": 2.886194772165046e-06, + "loss": 0.1479, + "step": 5189 + }, + { + "epoch": 0.4781867600313263, + "grad_norm": 0.9614757825575214, + "learning_rate": 2.8854421156108276e-06, + "loss": 0.15, + "step": 5190 + }, + { + "epoch": 0.4782788962085963, + "grad_norm": 0.9044999706904249, + "learning_rate": 2.8846894232682654e-06, + "loss": 0.1476, + "step": 5191 + }, + { + "epoch": 0.4783710323858663, + "grad_norm": 0.8821383917105962, + "learning_rate": 2.883936695207249e-06, + "loss": 0.147, + "step": 5192 + }, + { + "epoch": 0.4784631685631363, + "grad_norm": 0.9243868354852021, + "learning_rate": 2.8831839314976696e-06, + "loss": 0.1506, + "step": 5193 + }, + { + "epoch": 0.4785553047404063, + "grad_norm": 0.8698264824416307, + "learning_rate": 2.8824311322094213e-06, + "loss": 0.137, + "step": 5194 + }, + { + "epoch": 0.4786474409176763, + "grad_norm": 0.9675043619811561, + "learning_rate": 2.8816782974124007e-06, + "loss": 0.1586, + "step": 5195 + }, + { + "epoch": 0.47873957709494636, + "grad_norm": 0.8938915507442141, + "learning_rate": 2.880925427176509e-06, + "loss": 0.1361, + "step": 5196 + }, + { + "epoch": 0.47883171327221635, + "grad_norm": 0.9334479510460674, + "learning_rate": 2.8801725215716504e-06, + "loss": 0.1474, + "step": 5197 + }, + { + "epoch": 0.47892384944948635, + "grad_norm": 0.8647314780366868, + "learning_rate": 2.879419580667733e-06, + "loss": 0.1216, + "step": 5198 + }, + { + "epoch": 0.47901598562675635, + "grad_norm": 0.8678424881431106, + "learning_rate": 2.878666604534665e-06, + "loss": 0.1295, + "step": 5199 + }, + { + "epoch": 0.47910812180402634, + "grad_norm": 0.8752317443418055, + "learning_rate": 2.8779135932423633e-06, + "loss": 0.1377, + "step": 5200 + }, + { + "epoch": 0.47920025798129634, + "grad_norm": 0.9457831979152571, + "learning_rate": 2.877160546860744e-06, + "loss": 0.1505, + "step": 5201 + }, + { + "epoch": 0.47929239415856634, + "grad_norm": 0.870921425375068, + "learning_rate": 2.8764074654597267e-06, + "loss": 0.1312, + "step": 5202 + }, + { + "epoch": 0.4793845303358364, + "grad_norm": 0.9157472068782744, + "learning_rate": 2.8756543491092352e-06, + "loss": 0.1472, + "step": 5203 + }, + { + "epoch": 0.4794766665131064, + "grad_norm": 0.9300844720183196, + "learning_rate": 2.8749011978791984e-06, + "loss": 0.1528, + "step": 5204 + }, + { + "epoch": 0.4795688026903764, + "grad_norm": 0.8245854441755501, + "learning_rate": 2.8741480118395443e-06, + "loss": 0.1167, + "step": 5205 + }, + { + "epoch": 0.4796609388676464, + "grad_norm": 0.9984410331300712, + "learning_rate": 2.873394791060207e-06, + "loss": 0.1477, + "step": 5206 + }, + { + "epoch": 0.4797530750449164, + "grad_norm": 0.9181133933506418, + "learning_rate": 2.872641535611123e-06, + "loss": 0.1526, + "step": 5207 + }, + { + "epoch": 0.4798452112221864, + "grad_norm": 0.9508564827737102, + "learning_rate": 2.8718882455622334e-06, + "loss": 0.1596, + "step": 5208 + }, + { + "epoch": 0.47993734739945637, + "grad_norm": 0.8843054175098252, + "learning_rate": 2.871134920983479e-06, + "loss": 0.1333, + "step": 5209 + }, + { + "epoch": 0.4800294835767264, + "grad_norm": 0.9349077875143057, + "learning_rate": 2.8703815619448072e-06, + "loss": 0.1379, + "step": 5210 + }, + { + "epoch": 0.4801216197539964, + "grad_norm": 0.8935820005729005, + "learning_rate": 2.8696281685161676e-06, + "loss": 0.1366, + "step": 5211 + }, + { + "epoch": 0.4802137559312664, + "grad_norm": 0.889136554484447, + "learning_rate": 2.868874740767513e-06, + "loss": 0.1359, + "step": 5212 + }, + { + "epoch": 0.4803058921085364, + "grad_norm": 0.9768791414763527, + "learning_rate": 2.8681212787687997e-06, + "loss": 0.1468, + "step": 5213 + }, + { + "epoch": 0.4803980282858064, + "grad_norm": 0.9558517666869947, + "learning_rate": 2.8673677825899852e-06, + "loss": 0.156, + "step": 5214 + }, + { + "epoch": 0.4804901644630764, + "grad_norm": 0.9018021401267595, + "learning_rate": 2.866614252301033e-06, + "loss": 0.1298, + "step": 5215 + }, + { + "epoch": 0.4805823006403464, + "grad_norm": 0.9408168488926255, + "learning_rate": 2.865860687971907e-06, + "loss": 0.1504, + "step": 5216 + }, + { + "epoch": 0.48067443681761646, + "grad_norm": 0.9242266840610662, + "learning_rate": 2.8651070896725786e-06, + "loss": 0.1438, + "step": 5217 + }, + { + "epoch": 0.48076657299488645, + "grad_norm": 0.8848173378094824, + "learning_rate": 2.864353457473016e-06, + "loss": 0.1456, + "step": 5218 + }, + { + "epoch": 0.48085870917215645, + "grad_norm": 0.9226387337834759, + "learning_rate": 2.863599791443196e-06, + "loss": 0.1452, + "step": 5219 + }, + { + "epoch": 0.48095084534942645, + "grad_norm": 0.9215882804526133, + "learning_rate": 2.8628460916530967e-06, + "loss": 0.1484, + "step": 5220 + }, + { + "epoch": 0.48104298152669644, + "grad_norm": 0.9025065525594986, + "learning_rate": 2.8620923581726983e-06, + "loss": 0.1481, + "step": 5221 + }, + { + "epoch": 0.48113511770396644, + "grad_norm": 0.9097430267440212, + "learning_rate": 2.861338591071986e-06, + "loss": 0.1434, + "step": 5222 + }, + { + "epoch": 0.4812272538812365, + "grad_norm": 0.8802740153350054, + "learning_rate": 2.860584790420946e-06, + "loss": 0.1362, + "step": 5223 + }, + { + "epoch": 0.4813193900585065, + "grad_norm": 0.9819178496384163, + "learning_rate": 2.85983095628957e-06, + "loss": 0.1554, + "step": 5224 + }, + { + "epoch": 0.4814115262357765, + "grad_norm": 0.8843967606324523, + "learning_rate": 2.8590770887478507e-06, + "loss": 0.1425, + "step": 5225 + }, + { + "epoch": 0.4815036624130465, + "grad_norm": 0.8725658702095903, + "learning_rate": 2.8583231878657847e-06, + "loss": 0.1329, + "step": 5226 + }, + { + "epoch": 0.4815957985903165, + "grad_norm": 0.8909883448928446, + "learning_rate": 2.8575692537133726e-06, + "loss": 0.1499, + "step": 5227 + }, + { + "epoch": 0.4816879347675865, + "grad_norm": 0.9430742708330203, + "learning_rate": 2.8568152863606167e-06, + "loss": 0.1433, + "step": 5228 + }, + { + "epoch": 0.4817800709448565, + "grad_norm": 0.8869863335741072, + "learning_rate": 2.8560612858775233e-06, + "loss": 0.1303, + "step": 5229 + }, + { + "epoch": 0.48187220712212653, + "grad_norm": 0.9377415356064038, + "learning_rate": 2.8553072523341008e-06, + "loss": 0.1498, + "step": 5230 + }, + { + "epoch": 0.4819643432993965, + "grad_norm": 0.8758553521749834, + "learning_rate": 2.8545531858003623e-06, + "loss": 0.1238, + "step": 5231 + }, + { + "epoch": 0.4820564794766665, + "grad_norm": 0.8988921073626308, + "learning_rate": 2.8537990863463212e-06, + "loss": 0.1298, + "step": 5232 + }, + { + "epoch": 0.4821486156539365, + "grad_norm": 0.8798732879738537, + "learning_rate": 2.853044954041998e-06, + "loss": 0.1454, + "step": 5233 + }, + { + "epoch": 0.4822407518312065, + "grad_norm": 0.9246980033341549, + "learning_rate": 2.8522907889574117e-06, + "loss": 0.1524, + "step": 5234 + }, + { + "epoch": 0.4823328880084765, + "grad_norm": 0.9174157745994747, + "learning_rate": 2.851536591162589e-06, + "loss": 0.1529, + "step": 5235 + }, + { + "epoch": 0.4824250241857465, + "grad_norm": 0.9424499320791001, + "learning_rate": 2.8507823607275554e-06, + "loss": 0.1484, + "step": 5236 + }, + { + "epoch": 0.48251716036301656, + "grad_norm": 0.8256974300101093, + "learning_rate": 2.8500280977223416e-06, + "loss": 0.1247, + "step": 5237 + }, + { + "epoch": 0.48260929654028656, + "grad_norm": 0.8940530112241213, + "learning_rate": 2.8492738022169815e-06, + "loss": 0.1393, + "step": 5238 + }, + { + "epoch": 0.48270143271755656, + "grad_norm": 0.8472711799346075, + "learning_rate": 2.848519474281511e-06, + "loss": 0.1298, + "step": 5239 + }, + { + "epoch": 0.48279356889482655, + "grad_norm": 0.9181352849217045, + "learning_rate": 2.84776511398597e-06, + "loss": 0.1498, + "step": 5240 + }, + { + "epoch": 0.48288570507209655, + "grad_norm": 0.9012991958361424, + "learning_rate": 2.847010721400401e-06, + "loss": 0.1477, + "step": 5241 + }, + { + "epoch": 0.48297784124936655, + "grad_norm": 0.814508085647066, + "learning_rate": 2.8462562965948493e-06, + "loss": 0.1316, + "step": 5242 + }, + { + "epoch": 0.48306997742663654, + "grad_norm": 0.8971639247190899, + "learning_rate": 2.8455018396393618e-06, + "loss": 0.1386, + "step": 5243 + }, + { + "epoch": 0.4831621136039066, + "grad_norm": 0.8670252001320244, + "learning_rate": 2.8447473506039934e-06, + "loss": 0.1274, + "step": 5244 + }, + { + "epoch": 0.4832542497811766, + "grad_norm": 0.9182328079782047, + "learning_rate": 2.8439928295587948e-06, + "loss": 0.1402, + "step": 5245 + }, + { + "epoch": 0.4833463859584466, + "grad_norm": 0.8738254881249767, + "learning_rate": 2.843238276573826e-06, + "loss": 0.1395, + "step": 5246 + }, + { + "epoch": 0.4834385221357166, + "grad_norm": 0.8751207383004573, + "learning_rate": 2.8424836917191455e-06, + "loss": 0.1397, + "step": 5247 + }, + { + "epoch": 0.4835306583129866, + "grad_norm": 0.8559763699901859, + "learning_rate": 2.841729075064818e-06, + "loss": 0.1326, + "step": 5248 + }, + { + "epoch": 0.4836227944902566, + "grad_norm": 0.9330748310081932, + "learning_rate": 2.840974426680909e-06, + "loss": 0.1308, + "step": 5249 + }, + { + "epoch": 0.4837149306675266, + "grad_norm": 0.9467378848060349, + "learning_rate": 2.840219746637487e-06, + "loss": 0.1499, + "step": 5250 + }, + { + "epoch": 0.48380706684479663, + "grad_norm": 0.9756656663187364, + "learning_rate": 2.8394650350046256e-06, + "loss": 0.1326, + "step": 5251 + }, + { + "epoch": 0.4838992030220666, + "grad_norm": 0.9544148043520337, + "learning_rate": 2.8387102918523995e-06, + "loss": 0.1516, + "step": 5252 + }, + { + "epoch": 0.4839913391993366, + "grad_norm": 0.8967080887994231, + "learning_rate": 2.8379555172508853e-06, + "loss": 0.125, + "step": 5253 + }, + { + "epoch": 0.4840834753766066, + "grad_norm": 0.9096511642692993, + "learning_rate": 2.8372007112701657e-06, + "loss": 0.1415, + "step": 5254 + }, + { + "epoch": 0.4841756115538766, + "grad_norm": 0.9212905745476281, + "learning_rate": 2.8364458739803237e-06, + "loss": 0.1394, + "step": 5255 + }, + { + "epoch": 0.4842677477311466, + "grad_norm": 0.9188703117763569, + "learning_rate": 2.835691005451446e-06, + "loss": 0.1497, + "step": 5256 + }, + { + "epoch": 0.48435988390841667, + "grad_norm": 0.9223024413019026, + "learning_rate": 2.8349361057536223e-06, + "loss": 0.1438, + "step": 5257 + }, + { + "epoch": 0.48445202008568666, + "grad_norm": 0.9040847454967237, + "learning_rate": 2.8341811749569452e-06, + "loss": 0.1391, + "step": 5258 + }, + { + "epoch": 0.48454415626295666, + "grad_norm": 0.9561157127435418, + "learning_rate": 2.8334262131315094e-06, + "loss": 0.1476, + "step": 5259 + }, + { + "epoch": 0.48463629244022666, + "grad_norm": 0.8481512102632566, + "learning_rate": 2.832671220347415e-06, + "loss": 0.1265, + "step": 5260 + }, + { + "epoch": 0.48472842861749665, + "grad_norm": 0.9162494132871907, + "learning_rate": 2.831916196674761e-06, + "loss": 0.1506, + "step": 5261 + }, + { + "epoch": 0.48482056479476665, + "grad_norm": 0.9285013656763865, + "learning_rate": 2.831161142183653e-06, + "loss": 0.1442, + "step": 5262 + }, + { + "epoch": 0.48491270097203665, + "grad_norm": 0.9376551897657546, + "learning_rate": 2.830406056944197e-06, + "loss": 0.1395, + "step": 5263 + }, + { + "epoch": 0.4850048371493067, + "grad_norm": 0.9226971589582109, + "learning_rate": 2.8296509410265032e-06, + "loss": 0.1478, + "step": 5264 + }, + { + "epoch": 0.4850969733265767, + "grad_norm": 0.9000083531197767, + "learning_rate": 2.8288957945006845e-06, + "loss": 0.1462, + "step": 5265 + }, + { + "epoch": 0.4851891095038467, + "grad_norm": 0.89690674045041, + "learning_rate": 2.8281406174368555e-06, + "loss": 0.1419, + "step": 5266 + }, + { + "epoch": 0.4852812456811167, + "grad_norm": 0.9345061065274617, + "learning_rate": 2.827385409905134e-06, + "loss": 0.1488, + "step": 5267 + }, + { + "epoch": 0.4853733818583867, + "grad_norm": 0.9064937256577421, + "learning_rate": 2.8266301719756427e-06, + "loss": 0.1428, + "step": 5268 + }, + { + "epoch": 0.4854655180356567, + "grad_norm": 0.9088525413468661, + "learning_rate": 2.825874903718505e-06, + "loss": 0.1478, + "step": 5269 + }, + { + "epoch": 0.4855576542129267, + "grad_norm": 0.9271486145199646, + "learning_rate": 2.8251196052038475e-06, + "loss": 0.1551, + "step": 5270 + }, + { + "epoch": 0.48564979039019673, + "grad_norm": 0.9522761407530571, + "learning_rate": 2.8243642765017993e-06, + "loss": 0.1507, + "step": 5271 + }, + { + "epoch": 0.48574192656746673, + "grad_norm": 0.8485048602690964, + "learning_rate": 2.8236089176824926e-06, + "loss": 0.123, + "step": 5272 + }, + { + "epoch": 0.4858340627447367, + "grad_norm": 0.8780735623699752, + "learning_rate": 2.8228535288160647e-06, + "loss": 0.1285, + "step": 5273 + }, + { + "epoch": 0.4859261989220067, + "grad_norm": 0.9869769002349372, + "learning_rate": 2.8220981099726503e-06, + "loss": 0.1501, + "step": 5274 + }, + { + "epoch": 0.4860183350992767, + "grad_norm": 0.9459927723090031, + "learning_rate": 2.821342661222392e-06, + "loss": 0.1459, + "step": 5275 + }, + { + "epoch": 0.4861104712765467, + "grad_norm": 0.943149589932418, + "learning_rate": 2.8205871826354336e-06, + "loss": 0.1447, + "step": 5276 + }, + { + "epoch": 0.4862026074538167, + "grad_norm": 0.8907007660895891, + "learning_rate": 2.819831674281921e-06, + "loss": 0.1372, + "step": 5277 + }, + { + "epoch": 0.48629474363108677, + "grad_norm": 0.9140557800767286, + "learning_rate": 2.819076136232002e-06, + "loss": 0.1374, + "step": 5278 + }, + { + "epoch": 0.48638687980835676, + "grad_norm": 0.963337859306478, + "learning_rate": 2.81832056855583e-06, + "loss": 0.1486, + "step": 5279 + }, + { + "epoch": 0.48647901598562676, + "grad_norm": 0.9274489957087415, + "learning_rate": 2.8175649713235586e-06, + "loss": 0.1447, + "step": 5280 + }, + { + "epoch": 0.48657115216289676, + "grad_norm": 0.8655898357592946, + "learning_rate": 2.8168093446053455e-06, + "loss": 0.1321, + "step": 5281 + }, + { + "epoch": 0.48666328834016676, + "grad_norm": 0.9215423663303324, + "learning_rate": 2.816053688471351e-06, + "loss": 0.1351, + "step": 5282 + }, + { + "epoch": 0.48675542451743675, + "grad_norm": 0.9621694893535245, + "learning_rate": 2.815298002991738e-06, + "loss": 0.1382, + "step": 5283 + }, + { + "epoch": 0.48684756069470675, + "grad_norm": 0.8585957646539193, + "learning_rate": 2.8145422882366707e-06, + "loss": 0.1251, + "step": 5284 + }, + { + "epoch": 0.4869396968719768, + "grad_norm": 0.9610427782236858, + "learning_rate": 2.8137865442763186e-06, + "loss": 0.155, + "step": 5285 + }, + { + "epoch": 0.4870318330492468, + "grad_norm": 0.8803502360568106, + "learning_rate": 2.813030771180851e-06, + "loss": 0.1389, + "step": 5286 + }, + { + "epoch": 0.4871239692265168, + "grad_norm": 0.8809740678094535, + "learning_rate": 2.8122749690204443e-06, + "loss": 0.1375, + "step": 5287 + }, + { + "epoch": 0.4872161054037868, + "grad_norm": 0.8903435006680406, + "learning_rate": 2.8115191378652716e-06, + "loss": 0.1402, + "step": 5288 + }, + { + "epoch": 0.4873082415810568, + "grad_norm": 0.8398996275510962, + "learning_rate": 2.810763277785514e-06, + "loss": 0.1345, + "step": 5289 + }, + { + "epoch": 0.4874003777583268, + "grad_norm": 0.9572207221710832, + "learning_rate": 2.810007388851353e-06, + "loss": 0.1616, + "step": 5290 + }, + { + "epoch": 0.48749251393559684, + "grad_norm": 0.913103201905886, + "learning_rate": 2.809251471132972e-06, + "loss": 0.1513, + "step": 5291 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 0.8682324842644803, + "learning_rate": 2.808495524700559e-06, + "loss": 0.1291, + "step": 5292 + }, + { + "epoch": 0.48767678629013683, + "grad_norm": 0.8997230016251552, + "learning_rate": 2.807739549624303e-06, + "loss": 0.1389, + "step": 5293 + }, + { + "epoch": 0.48776892246740683, + "grad_norm": 0.8509844715151446, + "learning_rate": 2.8069835459743965e-06, + "loss": 0.1393, + "step": 5294 + }, + { + "epoch": 0.4878610586446768, + "grad_norm": 0.865759399554568, + "learning_rate": 2.8062275138210355e-06, + "loss": 0.1351, + "step": 5295 + }, + { + "epoch": 0.4879531948219468, + "grad_norm": 0.8590645407142179, + "learning_rate": 2.805471453234416e-06, + "loss": 0.1308, + "step": 5296 + }, + { + "epoch": 0.4880453309992168, + "grad_norm": 0.9560127081452957, + "learning_rate": 2.80471536428474e-06, + "loss": 0.1431, + "step": 5297 + }, + { + "epoch": 0.48813746717648687, + "grad_norm": 0.9540596366204239, + "learning_rate": 2.8039592470422096e-06, + "loss": 0.147, + "step": 5298 + }, + { + "epoch": 0.48822960335375687, + "grad_norm": 0.9183606271614978, + "learning_rate": 2.8032031015770296e-06, + "loss": 0.1493, + "step": 5299 + }, + { + "epoch": 0.48832173953102687, + "grad_norm": 0.9200811671231739, + "learning_rate": 2.8024469279594102e-06, + "loss": 0.1383, + "step": 5300 + }, + { + "epoch": 0.48841387570829686, + "grad_norm": 0.8694060487879737, + "learning_rate": 2.80169072625956e-06, + "loss": 0.1333, + "step": 5301 + }, + { + "epoch": 0.48850601188556686, + "grad_norm": 0.9202762624654115, + "learning_rate": 2.8009344965476935e-06, + "loss": 0.134, + "step": 5302 + }, + { + "epoch": 0.48859814806283686, + "grad_norm": 0.9367392274537912, + "learning_rate": 2.8001782388940267e-06, + "loss": 0.1547, + "step": 5303 + }, + { + "epoch": 0.48869028424010685, + "grad_norm": 0.9219003180099276, + "learning_rate": 2.7994219533687784e-06, + "loss": 0.1417, + "step": 5304 + }, + { + "epoch": 0.4887824204173769, + "grad_norm": 0.8394096894153247, + "learning_rate": 2.79866564004217e-06, + "loss": 0.1372, + "step": 5305 + }, + { + "epoch": 0.4888745565946469, + "grad_norm": 0.9632709174947548, + "learning_rate": 2.797909298984424e-06, + "loss": 0.1501, + "step": 5306 + }, + { + "epoch": 0.4889666927719169, + "grad_norm": 0.8824075293556416, + "learning_rate": 2.797152930265767e-06, + "loss": 0.143, + "step": 5307 + }, + { + "epoch": 0.4890588289491869, + "grad_norm": 0.8817934891698063, + "learning_rate": 2.796396533956429e-06, + "loss": 0.1418, + "step": 5308 + }, + { + "epoch": 0.4891509651264569, + "grad_norm": 0.9544326229762716, + "learning_rate": 2.7956401101266407e-06, + "loss": 0.1504, + "step": 5309 + }, + { + "epoch": 0.4892431013037269, + "grad_norm": 0.9322273063288705, + "learning_rate": 2.7948836588466373e-06, + "loss": 0.1456, + "step": 5310 + }, + { + "epoch": 0.4893352374809969, + "grad_norm": 0.909857805157454, + "learning_rate": 2.794127180186653e-06, + "loss": 0.1401, + "step": 5311 + }, + { + "epoch": 0.48942737365826694, + "grad_norm": 0.939370763815054, + "learning_rate": 2.7933706742169297e-06, + "loss": 0.1482, + "step": 5312 + }, + { + "epoch": 0.48951950983553694, + "grad_norm": 0.8992145813911244, + "learning_rate": 2.792614141007707e-06, + "loss": 0.1437, + "step": 5313 + }, + { + "epoch": 0.48961164601280693, + "grad_norm": 0.9533663660430065, + "learning_rate": 2.7918575806292305e-06, + "loss": 0.1432, + "step": 5314 + }, + { + "epoch": 0.48970378219007693, + "grad_norm": 0.9175012649462685, + "learning_rate": 2.791100993151745e-06, + "loss": 0.1449, + "step": 5315 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.9633249103576214, + "learning_rate": 2.790344378645502e-06, + "loss": 0.1503, + "step": 5316 + }, + { + "epoch": 0.4898880545446169, + "grad_norm": 0.9468216343031558, + "learning_rate": 2.7895877371807516e-06, + "loss": 0.1546, + "step": 5317 + }, + { + "epoch": 0.4899801907218869, + "grad_norm": 0.90993861662428, + "learning_rate": 2.7888310688277493e-06, + "loss": 0.1412, + "step": 5318 + }, + { + "epoch": 0.490072326899157, + "grad_norm": 0.9040365876464947, + "learning_rate": 2.7880743736567505e-06, + "loss": 0.141, + "step": 5319 + }, + { + "epoch": 0.49016446307642697, + "grad_norm": 0.9620046505695853, + "learning_rate": 2.7873176517380157e-06, + "loss": 0.1545, + "step": 5320 + }, + { + "epoch": 0.49025659925369697, + "grad_norm": 0.9103509663251281, + "learning_rate": 2.786560903141805e-06, + "loss": 0.1348, + "step": 5321 + }, + { + "epoch": 0.49034873543096696, + "grad_norm": 0.8917965010613744, + "learning_rate": 2.7858041279383854e-06, + "loss": 0.1457, + "step": 5322 + }, + { + "epoch": 0.49044087160823696, + "grad_norm": 0.8894071380853192, + "learning_rate": 2.7850473261980197e-06, + "loss": 0.1396, + "step": 5323 + }, + { + "epoch": 0.49053300778550696, + "grad_norm": 0.9274368507998124, + "learning_rate": 2.78429049799098e-06, + "loss": 0.1548, + "step": 5324 + }, + { + "epoch": 0.490625143962777, + "grad_norm": 0.9052156307937325, + "learning_rate": 2.783533643387537e-06, + "loss": 0.1438, + "step": 5325 + }, + { + "epoch": 0.490717280140047, + "grad_norm": 0.8956506738203168, + "learning_rate": 2.7827767624579645e-06, + "loss": 0.1307, + "step": 5326 + }, + { + "epoch": 0.490809416317317, + "grad_norm": 0.9182979070301321, + "learning_rate": 2.7820198552725404e-06, + "loss": 0.1478, + "step": 5327 + }, + { + "epoch": 0.490901552494587, + "grad_norm": 0.9067870769765347, + "learning_rate": 2.781262921901541e-06, + "loss": 0.1323, + "step": 5328 + }, + { + "epoch": 0.490993688671857, + "grad_norm": 0.9321816561616926, + "learning_rate": 2.780505962415249e-06, + "loss": 0.1403, + "step": 5329 + }, + { + "epoch": 0.491085824849127, + "grad_norm": 0.9245267858958562, + "learning_rate": 2.779748976883949e-06, + "loss": 0.1419, + "step": 5330 + }, + { + "epoch": 0.491177961026397, + "grad_norm": 0.9301677422865269, + "learning_rate": 2.7789919653779257e-06, + "loss": 0.1374, + "step": 5331 + }, + { + "epoch": 0.49127009720366704, + "grad_norm": 0.9786411388579338, + "learning_rate": 2.7782349279674684e-06, + "loss": 0.1587, + "step": 5332 + }, + { + "epoch": 0.49136223338093704, + "grad_norm": 0.912963264042218, + "learning_rate": 2.7774778647228688e-06, + "loss": 0.1406, + "step": 5333 + }, + { + "epoch": 0.49145436955820704, + "grad_norm": 0.9085498490897894, + "learning_rate": 2.7767207757144186e-06, + "loss": 0.1417, + "step": 5334 + }, + { + "epoch": 0.49154650573547704, + "grad_norm": 0.9226144813877271, + "learning_rate": 2.7759636610124158e-06, + "loss": 0.142, + "step": 5335 + }, + { + "epoch": 0.49163864191274703, + "grad_norm": 0.9713959796758398, + "learning_rate": 2.7752065206871564e-06, + "loss": 0.1375, + "step": 5336 + }, + { + "epoch": 0.49173077809001703, + "grad_norm": 0.8991677667705583, + "learning_rate": 2.7744493548089425e-06, + "loss": 0.1304, + "step": 5337 + }, + { + "epoch": 0.491822914267287, + "grad_norm": 0.8722949087858429, + "learning_rate": 2.773692163448076e-06, + "loss": 0.1346, + "step": 5338 + }, + { + "epoch": 0.4919150504445571, + "grad_norm": 0.9623034124111783, + "learning_rate": 2.7729349466748634e-06, + "loss": 0.1347, + "step": 5339 + }, + { + "epoch": 0.4920071866218271, + "grad_norm": 1.0356559269275247, + "learning_rate": 2.772177704559611e-06, + "loss": 0.1553, + "step": 5340 + }, + { + "epoch": 0.49209932279909707, + "grad_norm": 0.9648096203571851, + "learning_rate": 2.7714204371726293e-06, + "loss": 0.148, + "step": 5341 + }, + { + "epoch": 0.49219145897636707, + "grad_norm": 0.8771413421110204, + "learning_rate": 2.770663144584231e-06, + "loss": 0.1315, + "step": 5342 + }, + { + "epoch": 0.49228359515363707, + "grad_norm": 0.9609389946251562, + "learning_rate": 2.769905826864731e-06, + "loss": 0.1541, + "step": 5343 + }, + { + "epoch": 0.49237573133090706, + "grad_norm": 0.9202480450196757, + "learning_rate": 2.769148484084445e-06, + "loss": 0.1323, + "step": 5344 + }, + { + "epoch": 0.49246786750817706, + "grad_norm": 0.9168884464799304, + "learning_rate": 2.7683911163136944e-06, + "loss": 0.1356, + "step": 5345 + }, + { + "epoch": 0.4925600036854471, + "grad_norm": 0.8905102892909667, + "learning_rate": 2.767633723622799e-06, + "loss": 0.1327, + "step": 5346 + }, + { + "epoch": 0.4926521398627171, + "grad_norm": 0.9283258835996472, + "learning_rate": 2.7668763060820842e-06, + "loss": 0.1397, + "step": 5347 + }, + { + "epoch": 0.4927442760399871, + "grad_norm": 0.9031577778408963, + "learning_rate": 2.7661188637618752e-06, + "loss": 0.1432, + "step": 5348 + }, + { + "epoch": 0.4928364122172571, + "grad_norm": 0.9704106362718894, + "learning_rate": 2.7653613967325018e-06, + "loss": 0.1452, + "step": 5349 + }, + { + "epoch": 0.4929285483945271, + "grad_norm": 0.9047274748741216, + "learning_rate": 2.7646039050642926e-06, + "loss": 0.141, + "step": 5350 + }, + { + "epoch": 0.4930206845717971, + "grad_norm": 0.9080009969739212, + "learning_rate": 2.763846388827584e-06, + "loss": 0.1323, + "step": 5351 + }, + { + "epoch": 0.49311282074906715, + "grad_norm": 0.9190765010543278, + "learning_rate": 2.7630888480927082e-06, + "loss": 0.143, + "step": 5352 + }, + { + "epoch": 0.49320495692633715, + "grad_norm": 0.836165641654129, + "learning_rate": 2.7623312829300053e-06, + "loss": 0.1317, + "step": 5353 + }, + { + "epoch": 0.49329709310360714, + "grad_norm": 1.0026628826121744, + "learning_rate": 2.7615736934098146e-06, + "loss": 0.1598, + "step": 5354 + }, + { + "epoch": 0.49338922928087714, + "grad_norm": 0.9572287593009317, + "learning_rate": 2.760816079602478e-06, + "loss": 0.146, + "step": 5355 + }, + { + "epoch": 0.49348136545814714, + "grad_norm": 0.8912204631678248, + "learning_rate": 2.760058441578341e-06, + "loss": 0.1339, + "step": 5356 + }, + { + "epoch": 0.49357350163541713, + "grad_norm": 0.914931483053279, + "learning_rate": 2.7593007794077493e-06, + "loss": 0.1279, + "step": 5357 + }, + { + "epoch": 0.49366563781268713, + "grad_norm": 0.9112520500430483, + "learning_rate": 2.7585430931610526e-06, + "loss": 0.1336, + "step": 5358 + }, + { + "epoch": 0.4937577739899572, + "grad_norm": 0.8758937123292749, + "learning_rate": 2.7577853829086014e-06, + "loss": 0.133, + "step": 5359 + }, + { + "epoch": 0.4938499101672272, + "grad_norm": 0.9703690055488118, + "learning_rate": 2.7570276487207504e-06, + "loss": 0.1546, + "step": 5360 + }, + { + "epoch": 0.4939420463444972, + "grad_norm": 0.9070344715122353, + "learning_rate": 2.7562698906678537e-06, + "loss": 0.1281, + "step": 5361 + }, + { + "epoch": 0.4940341825217672, + "grad_norm": 1.0743999343067259, + "learning_rate": 2.755512108820271e-06, + "loss": 0.1499, + "step": 5362 + }, + { + "epoch": 0.49412631869903717, + "grad_norm": 0.8768062588403824, + "learning_rate": 2.7547543032483604e-06, + "loss": 0.1268, + "step": 5363 + }, + { + "epoch": 0.49421845487630717, + "grad_norm": 0.9074114494072715, + "learning_rate": 2.753996474022486e-06, + "loss": 0.1355, + "step": 5364 + }, + { + "epoch": 0.49431059105357716, + "grad_norm": 0.9424744962467466, + "learning_rate": 2.753238621213012e-06, + "loss": 0.1412, + "step": 5365 + }, + { + "epoch": 0.4944027272308472, + "grad_norm": 0.9539914855888854, + "learning_rate": 2.752480744890304e-06, + "loss": 0.1555, + "step": 5366 + }, + { + "epoch": 0.4944948634081172, + "grad_norm": 0.8823231270895644, + "learning_rate": 2.751722845124732e-06, + "loss": 0.1392, + "step": 5367 + }, + { + "epoch": 0.4945869995853872, + "grad_norm": 0.9403023365444029, + "learning_rate": 2.750964921986667e-06, + "loss": 0.1488, + "step": 5368 + }, + { + "epoch": 0.4946791357626572, + "grad_norm": 0.8975289021275779, + "learning_rate": 2.750206975546481e-06, + "loss": 0.1361, + "step": 5369 + }, + { + "epoch": 0.4947712719399272, + "grad_norm": 0.9253417573353605, + "learning_rate": 2.7494490058745514e-06, + "loss": 0.1425, + "step": 5370 + }, + { + "epoch": 0.4948634081171972, + "grad_norm": 0.9328707319829574, + "learning_rate": 2.7486910130412543e-06, + "loss": 0.1505, + "step": 5371 + }, + { + "epoch": 0.4949555442944672, + "grad_norm": 0.8941550491967376, + "learning_rate": 2.74793299711697e-06, + "loss": 0.1312, + "step": 5372 + }, + { + "epoch": 0.49504768047173725, + "grad_norm": 0.9099114610645433, + "learning_rate": 2.747174958172081e-06, + "loss": 0.1349, + "step": 5373 + }, + { + "epoch": 0.49513981664900725, + "grad_norm": 0.8779917259058921, + "learning_rate": 2.7464168962769696e-06, + "loss": 0.1314, + "step": 5374 + }, + { + "epoch": 0.49523195282627724, + "grad_norm": 0.9125090915870446, + "learning_rate": 2.745658811502023e-06, + "loss": 0.1363, + "step": 5375 + }, + { + "epoch": 0.49532408900354724, + "grad_norm": 0.9775013337024041, + "learning_rate": 2.7449007039176296e-06, + "loss": 0.1473, + "step": 5376 + }, + { + "epoch": 0.49541622518081724, + "grad_norm": 0.8985672527780988, + "learning_rate": 2.7441425735941787e-06, + "loss": 0.1443, + "step": 5377 + }, + { + "epoch": 0.49550836135808723, + "grad_norm": 0.9152198455379926, + "learning_rate": 2.7433844206020643e-06, + "loss": 0.1395, + "step": 5378 + }, + { + "epoch": 0.49560049753535723, + "grad_norm": 0.8692587899638442, + "learning_rate": 2.7426262450116798e-06, + "loss": 0.1301, + "step": 5379 + }, + { + "epoch": 0.4956926337126273, + "grad_norm": 0.968301271071014, + "learning_rate": 2.7418680468934227e-06, + "loss": 0.141, + "step": 5380 + }, + { + "epoch": 0.4957847698898973, + "grad_norm": 0.9688367958386958, + "learning_rate": 2.7411098263176917e-06, + "loss": 0.1474, + "step": 5381 + }, + { + "epoch": 0.4958769060671673, + "grad_norm": 0.9636259047294249, + "learning_rate": 2.740351583354886e-06, + "loss": 0.1466, + "step": 5382 + }, + { + "epoch": 0.4959690422444373, + "grad_norm": 0.9710179385144946, + "learning_rate": 2.739593318075412e-06, + "loss": 0.1511, + "step": 5383 + }, + { + "epoch": 0.49606117842170727, + "grad_norm": 0.9860533062959879, + "learning_rate": 2.7388350305496708e-06, + "loss": 0.1531, + "step": 5384 + }, + { + "epoch": 0.49615331459897727, + "grad_norm": 0.8845136650001137, + "learning_rate": 2.7380767208480726e-06, + "loss": 0.1335, + "step": 5385 + }, + { + "epoch": 0.4962454507762473, + "grad_norm": 0.857431418417288, + "learning_rate": 2.7373183890410245e-06, + "loss": 0.1266, + "step": 5386 + }, + { + "epoch": 0.4963375869535173, + "grad_norm": 0.8725792617659032, + "learning_rate": 2.7365600351989386e-06, + "loss": 0.139, + "step": 5387 + }, + { + "epoch": 0.4964297231307873, + "grad_norm": 0.9327533333797368, + "learning_rate": 2.7358016593922283e-06, + "loss": 0.147, + "step": 5388 + }, + { + "epoch": 0.4965218593080573, + "grad_norm": 0.8981246518761035, + "learning_rate": 2.7350432616913083e-06, + "loss": 0.1345, + "step": 5389 + }, + { + "epoch": 0.4966139954853273, + "grad_norm": 0.9240269830117442, + "learning_rate": 2.734284842166596e-06, + "loss": 0.1558, + "step": 5390 + }, + { + "epoch": 0.4967061316625973, + "grad_norm": 0.9839833653231123, + "learning_rate": 2.733526400888511e-06, + "loss": 0.1562, + "step": 5391 + }, + { + "epoch": 0.4967982678398673, + "grad_norm": 0.9071812673870605, + "learning_rate": 2.732767937927474e-06, + "loss": 0.1365, + "step": 5392 + }, + { + "epoch": 0.49689040401713735, + "grad_norm": 0.869434467607522, + "learning_rate": 2.73200945335391e-06, + "loss": 0.1372, + "step": 5393 + }, + { + "epoch": 0.49698254019440735, + "grad_norm": 0.9389875824273619, + "learning_rate": 2.7312509472382425e-06, + "loss": 0.151, + "step": 5394 + }, + { + "epoch": 0.49707467637167735, + "grad_norm": 0.8639226483522556, + "learning_rate": 2.7304924196509004e-06, + "loss": 0.125, + "step": 5395 + }, + { + "epoch": 0.49716681254894735, + "grad_norm": 0.9154900592463313, + "learning_rate": 2.7297338706623113e-06, + "loss": 0.1392, + "step": 5396 + }, + { + "epoch": 0.49725894872621734, + "grad_norm": 0.9468057879828546, + "learning_rate": 2.728975300342909e-06, + "loss": 0.1349, + "step": 5397 + }, + { + "epoch": 0.49735108490348734, + "grad_norm": 0.942572127556101, + "learning_rate": 2.7282167087631234e-06, + "loss": 0.1487, + "step": 5398 + }, + { + "epoch": 0.49744322108075734, + "grad_norm": 0.9139656094622787, + "learning_rate": 2.7274580959933933e-06, + "loss": 0.1452, + "step": 5399 + }, + { + "epoch": 0.4975353572580274, + "grad_norm": 0.8875985109012006, + "learning_rate": 2.726699462104154e-06, + "loss": 0.1327, + "step": 5400 + }, + { + "epoch": 0.4976274934352974, + "grad_norm": 0.9556085304450759, + "learning_rate": 2.7259408071658456e-06, + "loss": 0.1439, + "step": 5401 + }, + { + "epoch": 0.4977196296125674, + "grad_norm": 1.0220365803003981, + "learning_rate": 2.725182131248909e-06, + "loss": 0.1409, + "step": 5402 + }, + { + "epoch": 0.4978117657898374, + "grad_norm": 0.9580616109925933, + "learning_rate": 2.724423434423787e-06, + "loss": 0.1477, + "step": 5403 + }, + { + "epoch": 0.4979039019671074, + "grad_norm": 0.9150371096328926, + "learning_rate": 2.7236647167609246e-06, + "loss": 0.1345, + "step": 5404 + }, + { + "epoch": 0.4979960381443774, + "grad_norm": 0.9365443118478644, + "learning_rate": 2.7229059783307703e-06, + "loss": 0.1501, + "step": 5405 + }, + { + "epoch": 0.49808817432164737, + "grad_norm": 0.9815020710994572, + "learning_rate": 2.7221472192037707e-06, + "loss": 0.1408, + "step": 5406 + }, + { + "epoch": 0.4981803104989174, + "grad_norm": 0.980382203078079, + "learning_rate": 2.721388439450379e-06, + "loss": 0.1447, + "step": 5407 + }, + { + "epoch": 0.4982724466761874, + "grad_norm": 0.9089049802957376, + "learning_rate": 2.7206296391410457e-06, + "loss": 0.1448, + "step": 5408 + }, + { + "epoch": 0.4983645828534574, + "grad_norm": 0.946696566658947, + "learning_rate": 2.7198708183462275e-06, + "loss": 0.1367, + "step": 5409 + }, + { + "epoch": 0.4984567190307274, + "grad_norm": 0.9457376021726533, + "learning_rate": 2.71911197713638e-06, + "loss": 0.1422, + "step": 5410 + }, + { + "epoch": 0.4985488552079974, + "grad_norm": 0.8824387290721871, + "learning_rate": 2.7183531155819607e-06, + "loss": 0.1421, + "step": 5411 + }, + { + "epoch": 0.4986409913852674, + "grad_norm": 0.8761674752566213, + "learning_rate": 2.7175942337534326e-06, + "loss": 0.1242, + "step": 5412 + }, + { + "epoch": 0.4987331275625374, + "grad_norm": 0.9330486674418532, + "learning_rate": 2.7168353317212565e-06, + "loss": 0.1435, + "step": 5413 + }, + { + "epoch": 0.49882526373980746, + "grad_norm": 0.9235257314844412, + "learning_rate": 2.7160764095558954e-06, + "loss": 0.1385, + "step": 5414 + }, + { + "epoch": 0.49891739991707745, + "grad_norm": 0.9117658297669242, + "learning_rate": 2.7153174673278174e-06, + "loss": 0.139, + "step": 5415 + }, + { + "epoch": 0.49900953609434745, + "grad_norm": 0.898645968059442, + "learning_rate": 2.7145585051074893e-06, + "loss": 0.1388, + "step": 5416 + }, + { + "epoch": 0.49910167227161745, + "grad_norm": 0.961300262470349, + "learning_rate": 2.7137995229653803e-06, + "loss": 0.1457, + "step": 5417 + }, + { + "epoch": 0.49919380844888744, + "grad_norm": 0.9290136686056124, + "learning_rate": 2.7130405209719637e-06, + "loss": 0.1307, + "step": 5418 + }, + { + "epoch": 0.49928594462615744, + "grad_norm": 0.9459474842848344, + "learning_rate": 2.7122814991977104e-06, + "loss": 0.1405, + "step": 5419 + }, + { + "epoch": 0.4993780808034275, + "grad_norm": 0.9241431470133946, + "learning_rate": 2.711522457713098e-06, + "loss": 0.1334, + "step": 5420 + }, + { + "epoch": 0.4994702169806975, + "grad_norm": 0.9390340034093618, + "learning_rate": 2.710763396588602e-06, + "loss": 0.1307, + "step": 5421 + }, + { + "epoch": 0.4995623531579675, + "grad_norm": 0.9981828709590618, + "learning_rate": 2.7100043158947027e-06, + "loss": 0.1405, + "step": 5422 + }, + { + "epoch": 0.4996544893352375, + "grad_norm": 1.0367977847526075, + "learning_rate": 2.7092452157018795e-06, + "loss": 0.1536, + "step": 5423 + }, + { + "epoch": 0.4997466255125075, + "grad_norm": 0.9040763895456477, + "learning_rate": 2.708486096080616e-06, + "loss": 0.1362, + "step": 5424 + }, + { + "epoch": 0.4998387616897775, + "grad_norm": 0.9262236757461823, + "learning_rate": 2.7077269571013947e-06, + "loss": 0.1311, + "step": 5425 + }, + { + "epoch": 0.4999308978670475, + "grad_norm": 0.9351274103164497, + "learning_rate": 2.7069677988347048e-06, + "loss": 0.1466, + "step": 5426 + }, + { + "epoch": 0.5000230340443175, + "grad_norm": 1.010791283106135, + "learning_rate": 2.7062086213510315e-06, + "loss": 0.1392, + "step": 5427 + }, + { + "epoch": 0.5001151702215875, + "grad_norm": 0.9800680940213662, + "learning_rate": 2.705449424720866e-06, + "loss": 0.1393, + "step": 5428 + }, + { + "epoch": 0.5002073063988575, + "grad_norm": 0.9090025369824177, + "learning_rate": 2.7046902090146986e-06, + "loss": 0.1332, + "step": 5429 + }, + { + "epoch": 0.5002994425761275, + "grad_norm": 0.8947906760810447, + "learning_rate": 2.703930974303024e-06, + "loss": 0.1257, + "step": 5430 + }, + { + "epoch": 0.5003915787533976, + "grad_norm": 0.918143860684268, + "learning_rate": 2.703171720656336e-06, + "loss": 0.1377, + "step": 5431 + }, + { + "epoch": 0.5004837149306676, + "grad_norm": 0.8565108609127393, + "learning_rate": 2.7024124481451323e-06, + "loss": 0.1342, + "step": 5432 + }, + { + "epoch": 0.5005758511079376, + "grad_norm": 0.9133626021685404, + "learning_rate": 2.701653156839911e-06, + "loss": 0.1407, + "step": 5433 + }, + { + "epoch": 0.5006679872852076, + "grad_norm": 0.909655782017589, + "learning_rate": 2.700893846811172e-06, + "loss": 0.1474, + "step": 5434 + }, + { + "epoch": 0.5007601234624776, + "grad_norm": 0.9613093139686264, + "learning_rate": 2.700134518129418e-06, + "loss": 0.1483, + "step": 5435 + }, + { + "epoch": 0.5008522596397476, + "grad_norm": 0.922464421682086, + "learning_rate": 2.699375170865152e-06, + "loss": 0.1331, + "step": 5436 + }, + { + "epoch": 0.5009443958170176, + "grad_norm": 0.8953135114057215, + "learning_rate": 2.6986158050888804e-06, + "loss": 0.1182, + "step": 5437 + }, + { + "epoch": 0.5010365319942875, + "grad_norm": 0.9488693208071233, + "learning_rate": 2.6978564208711098e-06, + "loss": 0.138, + "step": 5438 + }, + { + "epoch": 0.5011286681715575, + "grad_norm": 0.9225635631092897, + "learning_rate": 2.697097018282349e-06, + "loss": 0.1225, + "step": 5439 + }, + { + "epoch": 0.5012208043488275, + "grad_norm": 0.9820006390170077, + "learning_rate": 2.6963375973931095e-06, + "loss": 0.1456, + "step": 5440 + }, + { + "epoch": 0.5013129405260975, + "grad_norm": 0.9329638194227488, + "learning_rate": 2.6955781582739028e-06, + "loss": 0.1421, + "step": 5441 + }, + { + "epoch": 0.5014050767033675, + "grad_norm": 0.9709775019424565, + "learning_rate": 2.6948187009952426e-06, + "loss": 0.1469, + "step": 5442 + }, + { + "epoch": 0.5014972128806375, + "grad_norm": 0.8940728365805736, + "learning_rate": 2.6940592256276455e-06, + "loss": 0.1338, + "step": 5443 + }, + { + "epoch": 0.5015893490579076, + "grad_norm": 0.9582589895606664, + "learning_rate": 2.6932997322416276e-06, + "loss": 0.1418, + "step": 5444 + }, + { + "epoch": 0.5016814852351776, + "grad_norm": 0.9555495425316477, + "learning_rate": 2.6925402209077096e-06, + "loss": 0.1493, + "step": 5445 + }, + { + "epoch": 0.5017736214124476, + "grad_norm": 0.8761008247398887, + "learning_rate": 2.6917806916964107e-06, + "loss": 0.133, + "step": 5446 + }, + { + "epoch": 0.5018657575897176, + "grad_norm": 0.8917513046762765, + "learning_rate": 2.691021144678254e-06, + "loss": 0.139, + "step": 5447 + }, + { + "epoch": 0.5019578937669876, + "grad_norm": 0.9523258198573107, + "learning_rate": 2.690261579923764e-06, + "loss": 0.1487, + "step": 5448 + }, + { + "epoch": 0.5020500299442576, + "grad_norm": 0.8774773911026075, + "learning_rate": 2.689501997503466e-06, + "loss": 0.138, + "step": 5449 + }, + { + "epoch": 0.5021421661215276, + "grad_norm": 0.9107629385972631, + "learning_rate": 2.688742397487887e-06, + "loss": 0.1475, + "step": 5450 + }, + { + "epoch": 0.5022343022987976, + "grad_norm": 0.8954104101874873, + "learning_rate": 2.6879827799475557e-06, + "loss": 0.1432, + "step": 5451 + }, + { + "epoch": 0.5023264384760676, + "grad_norm": 0.951129195820262, + "learning_rate": 2.6872231449530027e-06, + "loss": 0.1456, + "step": 5452 + }, + { + "epoch": 0.5024185746533376, + "grad_norm": 0.9334140124428768, + "learning_rate": 2.686463492574761e-06, + "loss": 0.1537, + "step": 5453 + }, + { + "epoch": 0.5025107108306076, + "grad_norm": 0.9532927507320598, + "learning_rate": 2.6857038228833644e-06, + "loss": 0.1485, + "step": 5454 + }, + { + "epoch": 0.5026028470078776, + "grad_norm": 0.8330878352738564, + "learning_rate": 2.6849441359493474e-06, + "loss": 0.1158, + "step": 5455 + }, + { + "epoch": 0.5026949831851476, + "grad_norm": 0.9220034819099294, + "learning_rate": 2.6841844318432476e-06, + "loss": 0.1396, + "step": 5456 + }, + { + "epoch": 0.5027871193624176, + "grad_norm": 0.9118685665157755, + "learning_rate": 2.683424710635603e-06, + "loss": 0.1284, + "step": 5457 + }, + { + "epoch": 0.5028792555396877, + "grad_norm": 0.8654389397910528, + "learning_rate": 2.682664972396955e-06, + "loss": 0.1283, + "step": 5458 + }, + { + "epoch": 0.5029713917169577, + "grad_norm": 0.884857390795016, + "learning_rate": 2.6819052171978443e-06, + "loss": 0.1394, + "step": 5459 + }, + { + "epoch": 0.5030635278942277, + "grad_norm": 0.894611570661682, + "learning_rate": 2.681145445108814e-06, + "loss": 0.1299, + "step": 5460 + }, + { + "epoch": 0.5031556640714977, + "grad_norm": 0.9548521712233243, + "learning_rate": 2.6803856562004112e-06, + "loss": 0.1446, + "step": 5461 + }, + { + "epoch": 0.5032478002487677, + "grad_norm": 0.9100534643805418, + "learning_rate": 2.6796258505431786e-06, + "loss": 0.1287, + "step": 5462 + }, + { + "epoch": 0.5033399364260377, + "grad_norm": 0.9261093041169923, + "learning_rate": 2.6788660282076682e-06, + "loss": 0.1397, + "step": 5463 + }, + { + "epoch": 0.5034320726033077, + "grad_norm": 0.9255465069936254, + "learning_rate": 2.6781061892644273e-06, + "loss": 0.1333, + "step": 5464 + }, + { + "epoch": 0.5035242087805777, + "grad_norm": 0.8713776146436004, + "learning_rate": 2.6773463337840078e-06, + "loss": 0.1392, + "step": 5465 + }, + { + "epoch": 0.5036163449578477, + "grad_norm": 0.9341165883169124, + "learning_rate": 2.676586461836962e-06, + "loss": 0.1554, + "step": 5466 + }, + { + "epoch": 0.5037084811351177, + "grad_norm": 0.8604949495271219, + "learning_rate": 2.6758265734938444e-06, + "loss": 0.1443, + "step": 5467 + }, + { + "epoch": 0.5038006173123877, + "grad_norm": 0.8940670948046049, + "learning_rate": 2.67506666882521e-06, + "loss": 0.1383, + "step": 5468 + }, + { + "epoch": 0.5038927534896577, + "grad_norm": 0.9117729021006208, + "learning_rate": 2.6743067479016166e-06, + "loss": 0.147, + "step": 5469 + }, + { + "epoch": 0.5039848896669277, + "grad_norm": 0.909075371011717, + "learning_rate": 2.673546810793623e-06, + "loss": 0.1298, + "step": 5470 + }, + { + "epoch": 0.5040770258441978, + "grad_norm": 0.9179808077227184, + "learning_rate": 2.6727868575717893e-06, + "loss": 0.1472, + "step": 5471 + }, + { + "epoch": 0.5041691620214678, + "grad_norm": 0.9233613284382322, + "learning_rate": 2.6720268883066773e-06, + "loss": 0.1291, + "step": 5472 + }, + { + "epoch": 0.5042612981987378, + "grad_norm": 0.9615650138923706, + "learning_rate": 2.6712669030688503e-06, + "loss": 0.1502, + "step": 5473 + }, + { + "epoch": 0.5043534343760078, + "grad_norm": 0.9397692153613146, + "learning_rate": 2.6705069019288733e-06, + "loss": 0.1463, + "step": 5474 + }, + { + "epoch": 0.5044455705532778, + "grad_norm": 1.0194186132210752, + "learning_rate": 2.6697468849573114e-06, + "loss": 0.1598, + "step": 5475 + }, + { + "epoch": 0.5045377067305478, + "grad_norm": 0.9924246077269206, + "learning_rate": 2.6689868522247334e-06, + "loss": 0.1521, + "step": 5476 + }, + { + "epoch": 0.5046298429078178, + "grad_norm": 0.9255855941944566, + "learning_rate": 2.668226803801708e-06, + "loss": 0.1385, + "step": 5477 + }, + { + "epoch": 0.5047219790850878, + "grad_norm": 0.8860193614954378, + "learning_rate": 2.6674667397588056e-06, + "loss": 0.1335, + "step": 5478 + }, + { + "epoch": 0.5048141152623578, + "grad_norm": 0.954126434296226, + "learning_rate": 2.666706660166598e-06, + "loss": 0.1539, + "step": 5479 + }, + { + "epoch": 0.5049062514396278, + "grad_norm": 0.9327823851336733, + "learning_rate": 2.66594656509566e-06, + "loss": 0.1436, + "step": 5480 + }, + { + "epoch": 0.5049983876168977, + "grad_norm": 0.9487423920612162, + "learning_rate": 2.665186454616565e-06, + "loss": 0.131, + "step": 5481 + }, + { + "epoch": 0.5050905237941677, + "grad_norm": 0.9131599113448318, + "learning_rate": 2.66442632879989e-06, + "loss": 0.1465, + "step": 5482 + }, + { + "epoch": 0.5051826599714377, + "grad_norm": 0.8724986437513693, + "learning_rate": 2.663666187716213e-06, + "loss": 0.1257, + "step": 5483 + }, + { + "epoch": 0.5052747961487077, + "grad_norm": 0.910959624134114, + "learning_rate": 2.662906031436112e-06, + "loss": 0.1379, + "step": 5484 + }, + { + "epoch": 0.5053669323259778, + "grad_norm": 0.9769423030222774, + "learning_rate": 2.662145860030169e-06, + "loss": 0.1457, + "step": 5485 + }, + { + "epoch": 0.5054590685032478, + "grad_norm": 0.9195321853537671, + "learning_rate": 2.6613856735689656e-06, + "loss": 0.1315, + "step": 5486 + }, + { + "epoch": 0.5055512046805178, + "grad_norm": 0.8813823710242042, + "learning_rate": 2.6606254721230845e-06, + "loss": 0.1325, + "step": 5487 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 0.9379783302984157, + "learning_rate": 2.6598652557631123e-06, + "loss": 0.1525, + "step": 5488 + }, + { + "epoch": 0.5057354770350578, + "grad_norm": 0.9048060270273836, + "learning_rate": 2.6591050245596322e-06, + "loss": 0.1298, + "step": 5489 + }, + { + "epoch": 0.5058276132123278, + "grad_norm": 0.9650276864334607, + "learning_rate": 2.6583447785832343e-06, + "loss": 0.1389, + "step": 5490 + }, + { + "epoch": 0.5059197493895978, + "grad_norm": 0.8938180533740254, + "learning_rate": 2.657584517904507e-06, + "loss": 0.13, + "step": 5491 + }, + { + "epoch": 0.5060118855668678, + "grad_norm": 0.9550953969484592, + "learning_rate": 2.65682424259404e-06, + "loss": 0.1443, + "step": 5492 + }, + { + "epoch": 0.5061040217441378, + "grad_norm": 0.9048846266087659, + "learning_rate": 2.6560639527224246e-06, + "loss": 0.1368, + "step": 5493 + }, + { + "epoch": 0.5061961579214078, + "grad_norm": 0.9388284521232664, + "learning_rate": 2.6553036483602553e-06, + "loss": 0.1528, + "step": 5494 + }, + { + "epoch": 0.5062882940986778, + "grad_norm": 0.9094867358427684, + "learning_rate": 2.6545433295781242e-06, + "loss": 0.1417, + "step": 5495 + }, + { + "epoch": 0.5063804302759478, + "grad_norm": 0.917149402691518, + "learning_rate": 2.65378299644663e-06, + "loss": 0.1406, + "step": 5496 + }, + { + "epoch": 0.5064725664532178, + "grad_norm": 0.8829207817590017, + "learning_rate": 2.653022649036367e-06, + "loss": 0.1236, + "step": 5497 + }, + { + "epoch": 0.5065647026304878, + "grad_norm": 0.8893997459270351, + "learning_rate": 2.652262287417935e-06, + "loss": 0.1372, + "step": 5498 + }, + { + "epoch": 0.5066568388077579, + "grad_norm": 0.9035837066275488, + "learning_rate": 2.6515019116619327e-06, + "loss": 0.1374, + "step": 5499 + }, + { + "epoch": 0.5067489749850279, + "grad_norm": 0.8749623890705591, + "learning_rate": 2.6507415218389616e-06, + "loss": 0.1344, + "step": 5500 + }, + { + "epoch": 0.5067489749850279, + "eval_loss": 0.13985012471675873, + "eval_runtime": 299.8724, + "eval_samples_per_second": 23.4, + "eval_steps_per_second": 2.928, + "step": 5500 + }, + { + "epoch": 0.5068411111622979, + "grad_norm": 0.8972053443275787, + "learning_rate": 2.649981118019625e-06, + "loss": 0.1325, + "step": 5501 + }, + { + "epoch": 0.5069332473395679, + "grad_norm": 0.9610764579191868, + "learning_rate": 2.649220700274524e-06, + "loss": 0.1421, + "step": 5502 + }, + { + "epoch": 0.5070253835168379, + "grad_norm": 0.9421384788983665, + "learning_rate": 2.648460268674266e-06, + "loss": 0.1502, + "step": 5503 + }, + { + "epoch": 0.5071175196941079, + "grad_norm": 0.8800435296498718, + "learning_rate": 2.6476998232894557e-06, + "loss": 0.1352, + "step": 5504 + }, + { + "epoch": 0.5072096558713779, + "grad_norm": 0.917670590492245, + "learning_rate": 2.646939364190701e-06, + "loss": 0.1425, + "step": 5505 + }, + { + "epoch": 0.5073017920486479, + "grad_norm": 0.9647345651378058, + "learning_rate": 2.6461788914486098e-06, + "loss": 0.1458, + "step": 5506 + }, + { + "epoch": 0.5073939282259179, + "grad_norm": 0.9404490001911155, + "learning_rate": 2.6454184051337946e-06, + "loss": 0.1581, + "step": 5507 + }, + { + "epoch": 0.5074860644031879, + "grad_norm": 0.8676278066012653, + "learning_rate": 2.644657905316863e-06, + "loss": 0.1316, + "step": 5508 + }, + { + "epoch": 0.5075782005804579, + "grad_norm": 0.9111223393555133, + "learning_rate": 2.64389739206843e-06, + "loss": 0.1482, + "step": 5509 + }, + { + "epoch": 0.5076703367577279, + "grad_norm": 0.9475807706333695, + "learning_rate": 2.6431368654591087e-06, + "loss": 0.1519, + "step": 5510 + }, + { + "epoch": 0.5077624729349979, + "grad_norm": 0.9054563411368267, + "learning_rate": 2.6423763255595143e-06, + "loss": 0.146, + "step": 5511 + }, + { + "epoch": 0.507854609112268, + "grad_norm": 0.9368488337401683, + "learning_rate": 2.6416157724402626e-06, + "loss": 0.1381, + "step": 5512 + }, + { + "epoch": 0.507946745289538, + "grad_norm": 0.9040531100606465, + "learning_rate": 2.640855206171971e-06, + "loss": 0.1371, + "step": 5513 + }, + { + "epoch": 0.508038881466808, + "grad_norm": 0.9249124192962886, + "learning_rate": 2.640094626825258e-06, + "loss": 0.1355, + "step": 5514 + }, + { + "epoch": 0.508131017644078, + "grad_norm": 0.9403553647015074, + "learning_rate": 2.6393340344707448e-06, + "loss": 0.1459, + "step": 5515 + }, + { + "epoch": 0.508223153821348, + "grad_norm": 0.9221283008538266, + "learning_rate": 2.6385734291790506e-06, + "loss": 0.1307, + "step": 5516 + }, + { + "epoch": 0.508315289998618, + "grad_norm": 0.9181336545301192, + "learning_rate": 2.637812811020799e-06, + "loss": 0.1428, + "step": 5517 + }, + { + "epoch": 0.508407426175888, + "grad_norm": 0.9060067197014477, + "learning_rate": 2.6370521800666127e-06, + "loss": 0.1323, + "step": 5518 + }, + { + "epoch": 0.508499562353158, + "grad_norm": 0.8997448706666332, + "learning_rate": 2.636291536387117e-06, + "loss": 0.136, + "step": 5519 + }, + { + "epoch": 0.508591698530428, + "grad_norm": 0.905826181470098, + "learning_rate": 2.6355308800529377e-06, + "loss": 0.1356, + "step": 5520 + }, + { + "epoch": 0.508683834707698, + "grad_norm": 0.9173879086665507, + "learning_rate": 2.6347702111347017e-06, + "loss": 0.144, + "step": 5521 + }, + { + "epoch": 0.508775970884968, + "grad_norm": 0.9378280218523944, + "learning_rate": 2.634009529703036e-06, + "loss": 0.1344, + "step": 5522 + }, + { + "epoch": 0.508868107062238, + "grad_norm": 0.8970030378074605, + "learning_rate": 2.633248835828573e-06, + "loss": 0.1177, + "step": 5523 + }, + { + "epoch": 0.508960243239508, + "grad_norm": 0.9873589104671638, + "learning_rate": 2.6324881295819394e-06, + "loss": 0.1525, + "step": 5524 + }, + { + "epoch": 0.509052379416778, + "grad_norm": 0.8669878479759109, + "learning_rate": 2.6317274110337692e-06, + "loss": 0.1277, + "step": 5525 + }, + { + "epoch": 0.509144515594048, + "grad_norm": 0.9081281065004965, + "learning_rate": 2.6309666802546953e-06, + "loss": 0.1294, + "step": 5526 + }, + { + "epoch": 0.509236651771318, + "grad_norm": 0.9014846717124748, + "learning_rate": 2.63020593731535e-06, + "loss": 0.1291, + "step": 5527 + }, + { + "epoch": 0.509328787948588, + "grad_norm": 0.9802281228815567, + "learning_rate": 2.629445182286371e-06, + "loss": 0.145, + "step": 5528 + }, + { + "epoch": 0.509420924125858, + "grad_norm": 0.9134326612787356, + "learning_rate": 2.6286844152383913e-06, + "loss": 0.1384, + "step": 5529 + }, + { + "epoch": 0.509513060303128, + "grad_norm": 0.9077152752905923, + "learning_rate": 2.6279236362420503e-06, + "loss": 0.1403, + "step": 5530 + }, + { + "epoch": 0.509605196480398, + "grad_norm": 0.8849225997974401, + "learning_rate": 2.6271628453679865e-06, + "loss": 0.1391, + "step": 5531 + }, + { + "epoch": 0.509697332657668, + "grad_norm": 0.9200720102028345, + "learning_rate": 2.6264020426868382e-06, + "loss": 0.1477, + "step": 5532 + }, + { + "epoch": 0.509789468834938, + "grad_norm": 0.8476479866854897, + "learning_rate": 2.6256412282692467e-06, + "loss": 0.1334, + "step": 5533 + }, + { + "epoch": 0.509881605012208, + "grad_norm": 0.8497457959986047, + "learning_rate": 2.6248804021858538e-06, + "loss": 0.1312, + "step": 5534 + }, + { + "epoch": 0.509973741189478, + "grad_norm": 0.9003692531477517, + "learning_rate": 2.624119564507302e-06, + "loss": 0.139, + "step": 5535 + }, + { + "epoch": 0.510065877366748, + "grad_norm": 0.8630292481575238, + "learning_rate": 2.6233587153042356e-06, + "loss": 0.1279, + "step": 5536 + }, + { + "epoch": 0.510158013544018, + "grad_norm": 0.9052917352915406, + "learning_rate": 2.6225978546472985e-06, + "loss": 0.1267, + "step": 5537 + }, + { + "epoch": 0.510250149721288, + "grad_norm": 0.8627747624985834, + "learning_rate": 2.621836982607138e-06, + "loss": 0.133, + "step": 5538 + }, + { + "epoch": 0.5103422858985581, + "grad_norm": 0.9549906006503575, + "learning_rate": 2.6210760992544004e-06, + "loss": 0.1357, + "step": 5539 + }, + { + "epoch": 0.5104344220758281, + "grad_norm": 0.9526451014635038, + "learning_rate": 2.6203152046597343e-06, + "loss": 0.1423, + "step": 5540 + }, + { + "epoch": 0.5105265582530981, + "grad_norm": 0.861004252901416, + "learning_rate": 2.6195542988937882e-06, + "loss": 0.1395, + "step": 5541 + }, + { + "epoch": 0.5106186944303681, + "grad_norm": 0.9027357450630178, + "learning_rate": 2.6187933820272128e-06, + "loss": 0.142, + "step": 5542 + }, + { + "epoch": 0.5107108306076381, + "grad_norm": 0.891785231864971, + "learning_rate": 2.6180324541306577e-06, + "loss": 0.1385, + "step": 5543 + }, + { + "epoch": 0.5108029667849081, + "grad_norm": 0.8647280432780181, + "learning_rate": 2.6172715152747784e-06, + "loss": 0.129, + "step": 5544 + }, + { + "epoch": 0.5108951029621781, + "grad_norm": 0.8875513460819612, + "learning_rate": 2.6165105655302252e-06, + "loss": 0.1359, + "step": 5545 + }, + { + "epoch": 0.5109872391394481, + "grad_norm": 0.9032613573115922, + "learning_rate": 2.615749604967654e-06, + "loss": 0.1406, + "step": 5546 + }, + { + "epoch": 0.5110793753167181, + "grad_norm": 0.8882411960902156, + "learning_rate": 2.614988633657719e-06, + "loss": 0.1361, + "step": 5547 + }, + { + "epoch": 0.5111715114939881, + "grad_norm": 0.867215069051396, + "learning_rate": 2.614227651671078e-06, + "loss": 0.1335, + "step": 5548 + }, + { + "epoch": 0.5112636476712581, + "grad_norm": 0.8963000008178789, + "learning_rate": 2.6134666590783856e-06, + "loss": 0.1453, + "step": 5549 + }, + { + "epoch": 0.5113557838485281, + "grad_norm": 0.9627517202967072, + "learning_rate": 2.6127056559503035e-06, + "loss": 0.1392, + "step": 5550 + }, + { + "epoch": 0.5114479200257981, + "grad_norm": 0.898582511766749, + "learning_rate": 2.611944642357488e-06, + "loss": 0.1307, + "step": 5551 + }, + { + "epoch": 0.5115400562030681, + "grad_norm": 0.8994720606560878, + "learning_rate": 2.611183618370601e-06, + "loss": 0.1424, + "step": 5552 + }, + { + "epoch": 0.5116321923803382, + "grad_norm": 0.9189882349593465, + "learning_rate": 2.6104225840603026e-06, + "loss": 0.1447, + "step": 5553 + }, + { + "epoch": 0.5117243285576082, + "grad_norm": 0.8508772095700861, + "learning_rate": 2.609661539497255e-06, + "loss": 0.1294, + "step": 5554 + }, + { + "epoch": 0.5118164647348782, + "grad_norm": 0.9372166101365486, + "learning_rate": 2.6089004847521227e-06, + "loss": 0.1384, + "step": 5555 + }, + { + "epoch": 0.5119086009121482, + "grad_norm": 0.9239599944745616, + "learning_rate": 2.608139419895568e-06, + "loss": 0.1349, + "step": 5556 + }, + { + "epoch": 0.5120007370894182, + "grad_norm": 0.9349790401903082, + "learning_rate": 2.6073783449982563e-06, + "loss": 0.138, + "step": 5557 + }, + { + "epoch": 0.5120928732666882, + "grad_norm": 0.8800466157716698, + "learning_rate": 2.6066172601308544e-06, + "loss": 0.1302, + "step": 5558 + }, + { + "epoch": 0.5121850094439582, + "grad_norm": 0.876448123482639, + "learning_rate": 2.605856165364028e-06, + "loss": 0.1362, + "step": 5559 + }, + { + "epoch": 0.5122771456212282, + "grad_norm": 0.8427611964166464, + "learning_rate": 2.6050950607684454e-06, + "loss": 0.1289, + "step": 5560 + }, + { + "epoch": 0.5123692817984982, + "grad_norm": 0.9047573143154314, + "learning_rate": 2.6043339464147754e-06, + "loss": 0.1369, + "step": 5561 + }, + { + "epoch": 0.5124614179757682, + "grad_norm": 0.9495280124543655, + "learning_rate": 2.603572822373686e-06, + "loss": 0.1531, + "step": 5562 + }, + { + "epoch": 0.5125535541530382, + "grad_norm": 0.9881092237552044, + "learning_rate": 2.6028116887158503e-06, + "loss": 0.1504, + "step": 5563 + }, + { + "epoch": 0.5126456903303082, + "grad_norm": 0.8214010445612134, + "learning_rate": 2.6020505455119375e-06, + "loss": 0.1276, + "step": 5564 + }, + { + "epoch": 0.5127378265075782, + "grad_norm": 0.8555404360624906, + "learning_rate": 2.601289392832621e-06, + "loss": 0.1335, + "step": 5565 + }, + { + "epoch": 0.5128299626848482, + "grad_norm": 0.8853452441226398, + "learning_rate": 2.6005282307485735e-06, + "loss": 0.1249, + "step": 5566 + }, + { + "epoch": 0.5129220988621183, + "grad_norm": 0.8881684098601418, + "learning_rate": 2.5997670593304688e-06, + "loss": 0.1383, + "step": 5567 + }, + { + "epoch": 0.5130142350393883, + "grad_norm": 0.9486217169656922, + "learning_rate": 2.5990058786489818e-06, + "loss": 0.1339, + "step": 5568 + }, + { + "epoch": 0.5131063712166583, + "grad_norm": 0.8598186957614298, + "learning_rate": 2.5982446887747885e-06, + "loss": 0.125, + "step": 5569 + }, + { + "epoch": 0.5131985073939282, + "grad_norm": 0.9382037989178497, + "learning_rate": 2.5974834897785646e-06, + "loss": 0.141, + "step": 5570 + }, + { + "epoch": 0.5132906435711982, + "grad_norm": 0.8555476200376073, + "learning_rate": 2.5967222817309893e-06, + "loss": 0.1276, + "step": 5571 + }, + { + "epoch": 0.5133827797484682, + "grad_norm": 0.928756828249199, + "learning_rate": 2.5959610647027388e-06, + "loss": 0.1456, + "step": 5572 + }, + { + "epoch": 0.5134749159257382, + "grad_norm": 0.9146043332944963, + "learning_rate": 2.595199838764493e-06, + "loss": 0.1424, + "step": 5573 + }, + { + "epoch": 0.5135670521030082, + "grad_norm": 0.8719255281170171, + "learning_rate": 2.5944386039869328e-06, + "loss": 0.127, + "step": 5574 + }, + { + "epoch": 0.5136591882802782, + "grad_norm": 0.9891327414194487, + "learning_rate": 2.593677360440738e-06, + "loss": 0.1535, + "step": 5575 + }, + { + "epoch": 0.5137513244575482, + "grad_norm": 0.8941008360456992, + "learning_rate": 2.5929161081965898e-06, + "loss": 0.1252, + "step": 5576 + }, + { + "epoch": 0.5138434606348182, + "grad_norm": 0.9390926757674907, + "learning_rate": 2.592154847325171e-06, + "loss": 0.1411, + "step": 5577 + }, + { + "epoch": 0.5139355968120882, + "grad_norm": 0.8699348061966624, + "learning_rate": 2.5913935778971644e-06, + "loss": 0.1363, + "step": 5578 + }, + { + "epoch": 0.5140277329893582, + "grad_norm": 0.9583723014793004, + "learning_rate": 2.590632299983255e-06, + "loss": 0.1401, + "step": 5579 + }, + { + "epoch": 0.5141198691666283, + "grad_norm": 0.9182617054191736, + "learning_rate": 2.589871013654126e-06, + "loss": 0.134, + "step": 5580 + }, + { + "epoch": 0.5142120053438983, + "grad_norm": 0.9611058899311491, + "learning_rate": 2.589109718980464e-06, + "loss": 0.1469, + "step": 5581 + }, + { + "epoch": 0.5143041415211683, + "grad_norm": 0.8705141047969384, + "learning_rate": 2.5883484160329552e-06, + "loss": 0.1255, + "step": 5582 + }, + { + "epoch": 0.5143962776984383, + "grad_norm": 0.8586967447912096, + "learning_rate": 2.587587104882286e-06, + "loss": 0.14, + "step": 5583 + }, + { + "epoch": 0.5144884138757083, + "grad_norm": 0.9372494805713448, + "learning_rate": 2.586825785599145e-06, + "loss": 0.1277, + "step": 5584 + }, + { + "epoch": 0.5145805500529783, + "grad_norm": 0.9599321610073678, + "learning_rate": 2.5860644582542206e-06, + "loss": 0.1567, + "step": 5585 + }, + { + "epoch": 0.5146726862302483, + "grad_norm": 0.9015636760266527, + "learning_rate": 2.5853031229182017e-06, + "loss": 0.1218, + "step": 5586 + }, + { + "epoch": 0.5147648224075183, + "grad_norm": 0.8916433289597714, + "learning_rate": 2.584541779661779e-06, + "loss": 0.1325, + "step": 5587 + }, + { + "epoch": 0.5148569585847883, + "grad_norm": 0.8773087815609789, + "learning_rate": 2.583780428555643e-06, + "loss": 0.1309, + "step": 5588 + }, + { + "epoch": 0.5149490947620583, + "grad_norm": 0.8559951677078586, + "learning_rate": 2.5830190696704843e-06, + "loss": 0.1328, + "step": 5589 + }, + { + "epoch": 0.5150412309393283, + "grad_norm": 0.919260554053468, + "learning_rate": 2.5822577030769972e-06, + "loss": 0.1353, + "step": 5590 + }, + { + "epoch": 0.5151333671165983, + "grad_norm": 0.9129424240959548, + "learning_rate": 2.581496328845873e-06, + "loss": 0.1382, + "step": 5591 + }, + { + "epoch": 0.5152255032938683, + "grad_norm": 0.9485496218586936, + "learning_rate": 2.580734947047806e-06, + "loss": 0.1294, + "step": 5592 + }, + { + "epoch": 0.5153176394711383, + "grad_norm": 1.0463560758101833, + "learning_rate": 2.57997355775349e-06, + "loss": 0.1582, + "step": 5593 + }, + { + "epoch": 0.5154097756484084, + "grad_norm": 0.9828298858039358, + "learning_rate": 2.5792121610336215e-06, + "loss": 0.138, + "step": 5594 + }, + { + "epoch": 0.5155019118256784, + "grad_norm": 0.8865690987054157, + "learning_rate": 2.5784507569588947e-06, + "loss": 0.133, + "step": 5595 + }, + { + "epoch": 0.5155940480029484, + "grad_norm": 0.948753054611248, + "learning_rate": 2.577689345600007e-06, + "loss": 0.1398, + "step": 5596 + }, + { + "epoch": 0.5156861841802184, + "grad_norm": 0.8963248656506673, + "learning_rate": 2.5769279270276544e-06, + "loss": 0.1359, + "step": 5597 + }, + { + "epoch": 0.5157783203574884, + "grad_norm": 0.8633984849791445, + "learning_rate": 2.5761665013125364e-06, + "loss": 0.1266, + "step": 5598 + }, + { + "epoch": 0.5158704565347584, + "grad_norm": 0.9111635283757924, + "learning_rate": 2.5754050685253503e-06, + "loss": 0.1495, + "step": 5599 + }, + { + "epoch": 0.5159625927120284, + "grad_norm": 0.8888987005242416, + "learning_rate": 2.5746436287367956e-06, + "loss": 0.1411, + "step": 5600 + }, + { + "epoch": 0.5160547288892984, + "grad_norm": 0.9027874762523549, + "learning_rate": 2.5738821820175713e-06, + "loss": 0.1462, + "step": 5601 + }, + { + "epoch": 0.5161468650665684, + "grad_norm": 0.8936351680734343, + "learning_rate": 2.573120728438379e-06, + "loss": 0.1567, + "step": 5602 + }, + { + "epoch": 0.5162390012438384, + "grad_norm": 0.823163896670656, + "learning_rate": 2.5723592680699194e-06, + "loss": 0.1141, + "step": 5603 + }, + { + "epoch": 0.5163311374211084, + "grad_norm": 0.8578357763989864, + "learning_rate": 2.5715978009828934e-06, + "loss": 0.1252, + "step": 5604 + }, + { + "epoch": 0.5164232735983784, + "grad_norm": 0.9347199730746312, + "learning_rate": 2.5708363272480034e-06, + "loss": 0.1471, + "step": 5605 + }, + { + "epoch": 0.5165154097756484, + "grad_norm": 0.9230560472785323, + "learning_rate": 2.5700748469359542e-06, + "loss": 0.1462, + "step": 5606 + }, + { + "epoch": 0.5166075459529185, + "grad_norm": 0.9146428934847072, + "learning_rate": 2.569313360117447e-06, + "loss": 0.1435, + "step": 5607 + }, + { + "epoch": 0.5166996821301885, + "grad_norm": 0.8883033669285219, + "learning_rate": 2.568551866863187e-06, + "loss": 0.1342, + "step": 5608 + }, + { + "epoch": 0.5167918183074585, + "grad_norm": 0.8902566289498707, + "learning_rate": 2.567790367243879e-06, + "loss": 0.1358, + "step": 5609 + }, + { + "epoch": 0.5168839544847285, + "grad_norm": 0.884923794423157, + "learning_rate": 2.5670288613302278e-06, + "loss": 0.1363, + "step": 5610 + }, + { + "epoch": 0.5169760906619985, + "grad_norm": 0.8595484693799914, + "learning_rate": 2.56626734919294e-06, + "loss": 0.131, + "step": 5611 + }, + { + "epoch": 0.5170682268392685, + "grad_norm": 0.8881898408502089, + "learning_rate": 2.5655058309027216e-06, + "loss": 0.133, + "step": 5612 + }, + { + "epoch": 0.5171603630165384, + "grad_norm": 0.9055171174486542, + "learning_rate": 2.5647443065302797e-06, + "loss": 0.1337, + "step": 5613 + }, + { + "epoch": 0.5172524991938084, + "grad_norm": 0.8982698558938885, + "learning_rate": 2.5639827761463217e-06, + "loss": 0.1385, + "step": 5614 + }, + { + "epoch": 0.5173446353710784, + "grad_norm": 0.8439364034167672, + "learning_rate": 2.5632212398215563e-06, + "loss": 0.1277, + "step": 5615 + }, + { + "epoch": 0.5174367715483484, + "grad_norm": 0.9053369863573947, + "learning_rate": 2.562459697626692e-06, + "loss": 0.1322, + "step": 5616 + }, + { + "epoch": 0.5175289077256184, + "grad_norm": 0.9414286983524403, + "learning_rate": 2.561698149632438e-06, + "loss": 0.1425, + "step": 5617 + }, + { + "epoch": 0.5176210439028884, + "grad_norm": 0.8600323617972152, + "learning_rate": 2.560936595909504e-06, + "loss": 0.1314, + "step": 5618 + }, + { + "epoch": 0.5177131800801584, + "grad_norm": 0.9022890230620475, + "learning_rate": 2.560175036528601e-06, + "loss": 0.13, + "step": 5619 + }, + { + "epoch": 0.5178053162574284, + "grad_norm": 0.8822823466002423, + "learning_rate": 2.5594134715604384e-06, + "loss": 0.136, + "step": 5620 + }, + { + "epoch": 0.5178974524346985, + "grad_norm": 0.8770079594534396, + "learning_rate": 2.5586519010757295e-06, + "loss": 0.1416, + "step": 5621 + }, + { + "epoch": 0.5179895886119685, + "grad_norm": 0.9227396154729606, + "learning_rate": 2.557890325145185e-06, + "loss": 0.1449, + "step": 5622 + }, + { + "epoch": 0.5180817247892385, + "grad_norm": 0.8965913173018901, + "learning_rate": 2.5571287438395175e-06, + "loss": 0.1402, + "step": 5623 + }, + { + "epoch": 0.5181738609665085, + "grad_norm": 0.8834294157700694, + "learning_rate": 2.5563671572294396e-06, + "loss": 0.1331, + "step": 5624 + }, + { + "epoch": 0.5182659971437785, + "grad_norm": 0.898106091348643, + "learning_rate": 2.5556055653856667e-06, + "loss": 0.1401, + "step": 5625 + }, + { + "epoch": 0.5183581333210485, + "grad_norm": 0.8886607871965505, + "learning_rate": 2.554843968378909e-06, + "loss": 0.1356, + "step": 5626 + }, + { + "epoch": 0.5184502694983185, + "grad_norm": 0.87036466401323, + "learning_rate": 2.5540823662798843e-06, + "loss": 0.1291, + "step": 5627 + }, + { + "epoch": 0.5185424056755885, + "grad_norm": 0.996280569292839, + "learning_rate": 2.553320759159305e-06, + "loss": 0.1457, + "step": 5628 + }, + { + "epoch": 0.5186345418528585, + "grad_norm": 0.8837610420412856, + "learning_rate": 2.5525591470878886e-06, + "loss": 0.1322, + "step": 5629 + }, + { + "epoch": 0.5187266780301285, + "grad_norm": 0.8816532288664364, + "learning_rate": 2.551797530136349e-06, + "loss": 0.1291, + "step": 5630 + }, + { + "epoch": 0.5188188142073985, + "grad_norm": 1.01580389465096, + "learning_rate": 2.5510359083754038e-06, + "loss": 0.138, + "step": 5631 + }, + { + "epoch": 0.5189109503846685, + "grad_norm": 0.9453562490091288, + "learning_rate": 2.5502742818757683e-06, + "loss": 0.1348, + "step": 5632 + }, + { + "epoch": 0.5190030865619385, + "grad_norm": 0.9264318284476458, + "learning_rate": 2.549512650708161e-06, + "loss": 0.1373, + "step": 5633 + }, + { + "epoch": 0.5190952227392085, + "grad_norm": 0.9397210232263981, + "learning_rate": 2.5487510149432974e-06, + "loss": 0.1372, + "step": 5634 + }, + { + "epoch": 0.5191873589164786, + "grad_norm": 0.917269244252231, + "learning_rate": 2.547989374651898e-06, + "loss": 0.1362, + "step": 5635 + }, + { + "epoch": 0.5192794950937486, + "grad_norm": 0.9624687169062707, + "learning_rate": 2.54722772990468e-06, + "loss": 0.1312, + "step": 5636 + }, + { + "epoch": 0.5193716312710186, + "grad_norm": 0.8989884615305049, + "learning_rate": 2.546466080772362e-06, + "loss": 0.1393, + "step": 5637 + }, + { + "epoch": 0.5194637674482886, + "grad_norm": 0.8504854986989404, + "learning_rate": 2.5457044273256635e-06, + "loss": 0.1419, + "step": 5638 + }, + { + "epoch": 0.5195559036255586, + "grad_norm": 0.9181894072822718, + "learning_rate": 2.544942769635304e-06, + "loss": 0.1371, + "step": 5639 + }, + { + "epoch": 0.5196480398028286, + "grad_norm": 0.9676061229686893, + "learning_rate": 2.544181107772003e-06, + "loss": 0.1416, + "step": 5640 + }, + { + "epoch": 0.5197401759800986, + "grad_norm": 0.8791670761476063, + "learning_rate": 2.543419441806482e-06, + "loss": 0.1418, + "step": 5641 + }, + { + "epoch": 0.5198323121573686, + "grad_norm": 0.8491594494899254, + "learning_rate": 2.5426577718094607e-06, + "loss": 0.1266, + "step": 5642 + }, + { + "epoch": 0.5199244483346386, + "grad_norm": 1.0354163929555338, + "learning_rate": 2.541896097851661e-06, + "loss": 0.1409, + "step": 5643 + }, + { + "epoch": 0.5200165845119086, + "grad_norm": 0.9635321671316454, + "learning_rate": 2.541134420003804e-06, + "loss": 0.1365, + "step": 5644 + }, + { + "epoch": 0.5201087206891786, + "grad_norm": 1.031766808189202, + "learning_rate": 2.5403727383366116e-06, + "loss": 0.1383, + "step": 5645 + }, + { + "epoch": 0.5202008568664486, + "grad_norm": 0.8389665943046382, + "learning_rate": 2.5396110529208066e-06, + "loss": 0.1141, + "step": 5646 + }, + { + "epoch": 0.5202929930437186, + "grad_norm": 0.9395740207321802, + "learning_rate": 2.53884936382711e-06, + "loss": 0.1369, + "step": 5647 + }, + { + "epoch": 0.5203851292209887, + "grad_norm": 0.9084013073081592, + "learning_rate": 2.538087671126247e-06, + "loss": 0.1406, + "step": 5648 + }, + { + "epoch": 0.5204772653982587, + "grad_norm": 0.9266640118622809, + "learning_rate": 2.537325974888939e-06, + "loss": 0.1386, + "step": 5649 + }, + { + "epoch": 0.5205694015755287, + "grad_norm": 1.0315770750585445, + "learning_rate": 2.5365642751859103e-06, + "loss": 0.1348, + "step": 5650 + }, + { + "epoch": 0.5206615377527987, + "grad_norm": 0.9054680007028475, + "learning_rate": 2.5358025720878847e-06, + "loss": 0.135, + "step": 5651 + }, + { + "epoch": 0.5207536739300687, + "grad_norm": 0.9740550288932809, + "learning_rate": 2.535040865665587e-06, + "loss": 0.1449, + "step": 5652 + }, + { + "epoch": 0.5208458101073387, + "grad_norm": 0.8951997050564433, + "learning_rate": 2.53427915598974e-06, + "loss": 0.1393, + "step": 5653 + }, + { + "epoch": 0.5209379462846087, + "grad_norm": 0.8907261864130697, + "learning_rate": 2.5335174431310705e-06, + "loss": 0.1392, + "step": 5654 + }, + { + "epoch": 0.5210300824618787, + "grad_norm": 0.9414523754070911, + "learning_rate": 2.532755727160302e-06, + "loss": 0.1455, + "step": 5655 + }, + { + "epoch": 0.5211222186391486, + "grad_norm": 0.9540517476228871, + "learning_rate": 2.5319940081481612e-06, + "loss": 0.1235, + "step": 5656 + }, + { + "epoch": 0.5212143548164186, + "grad_norm": 0.9954834340666702, + "learning_rate": 2.531232286165374e-06, + "loss": 0.1438, + "step": 5657 + }, + { + "epoch": 0.5213064909936886, + "grad_norm": 0.8970096073321792, + "learning_rate": 2.530470561282665e-06, + "loss": 0.1308, + "step": 5658 + }, + { + "epoch": 0.5213986271709586, + "grad_norm": 0.9058477790180682, + "learning_rate": 2.5297088335707607e-06, + "loss": 0.1461, + "step": 5659 + }, + { + "epoch": 0.5214907633482286, + "grad_norm": 0.9711778108276775, + "learning_rate": 2.5289471031003894e-06, + "loss": 0.1396, + "step": 5660 + }, + { + "epoch": 0.5215828995254986, + "grad_norm": 0.8543618205002567, + "learning_rate": 2.528185369942275e-06, + "loss": 0.1323, + "step": 5661 + }, + { + "epoch": 0.5216750357027687, + "grad_norm": 0.9505400443417231, + "learning_rate": 2.5274236341671464e-06, + "loss": 0.1437, + "step": 5662 + }, + { + "epoch": 0.5217671718800387, + "grad_norm": 0.9705759551536418, + "learning_rate": 2.5266618958457305e-06, + "loss": 0.1501, + "step": 5663 + }, + { + "epoch": 0.5218593080573087, + "grad_norm": 0.9286699162670864, + "learning_rate": 2.525900155048755e-06, + "loss": 0.1351, + "step": 5664 + }, + { + "epoch": 0.5219514442345787, + "grad_norm": 0.9460951212975731, + "learning_rate": 2.525138411846947e-06, + "loss": 0.155, + "step": 5665 + }, + { + "epoch": 0.5220435804118487, + "grad_norm": 0.9521125076330914, + "learning_rate": 2.524376666311035e-06, + "loss": 0.1552, + "step": 5666 + }, + { + "epoch": 0.5221357165891187, + "grad_norm": 0.9052726980091685, + "learning_rate": 2.523614918511746e-06, + "loss": 0.1253, + "step": 5667 + }, + { + "epoch": 0.5222278527663887, + "grad_norm": 0.9223530508604543, + "learning_rate": 2.5228531685198105e-06, + "loss": 0.1265, + "step": 5668 + }, + { + "epoch": 0.5223199889436587, + "grad_norm": 0.9534027425880114, + "learning_rate": 2.522091416405955e-06, + "loss": 0.1339, + "step": 5669 + }, + { + "epoch": 0.5224121251209287, + "grad_norm": 0.9757261885463333, + "learning_rate": 2.5213296622409094e-06, + "loss": 0.1418, + "step": 5670 + }, + { + "epoch": 0.5225042612981987, + "grad_norm": 0.948753953135261, + "learning_rate": 2.5205679060954025e-06, + "loss": 0.1439, + "step": 5671 + }, + { + "epoch": 0.5225963974754687, + "grad_norm": 0.9326818741941446, + "learning_rate": 2.5198061480401623e-06, + "loss": 0.1449, + "step": 5672 + }, + { + "epoch": 0.5226885336527387, + "grad_norm": 0.9363994286992702, + "learning_rate": 2.519044388145921e-06, + "loss": 0.1326, + "step": 5673 + }, + { + "epoch": 0.5227806698300087, + "grad_norm": 0.9856542056580295, + "learning_rate": 2.5182826264834046e-06, + "loss": 0.1359, + "step": 5674 + }, + { + "epoch": 0.5228728060072788, + "grad_norm": 1.034901023201578, + "learning_rate": 2.5175208631233454e-06, + "loss": 0.145, + "step": 5675 + }, + { + "epoch": 0.5229649421845488, + "grad_norm": 0.9040002234718671, + "learning_rate": 2.516759098136472e-06, + "loss": 0.1362, + "step": 5676 + }, + { + "epoch": 0.5230570783618188, + "grad_norm": 0.9295191125870893, + "learning_rate": 2.515997331593514e-06, + "loss": 0.1458, + "step": 5677 + }, + { + "epoch": 0.5231492145390888, + "grad_norm": 0.9049169701368106, + "learning_rate": 2.5152355635652027e-06, + "loss": 0.137, + "step": 5678 + }, + { + "epoch": 0.5232413507163588, + "grad_norm": 0.9557392345266481, + "learning_rate": 2.5144737941222673e-06, + "loss": 0.1471, + "step": 5679 + }, + { + "epoch": 0.5233334868936288, + "grad_norm": 0.929502828589841, + "learning_rate": 2.513712023335438e-06, + "loss": 0.1413, + "step": 5680 + }, + { + "epoch": 0.5234256230708988, + "grad_norm": 0.9097049347259758, + "learning_rate": 2.512950251275447e-06, + "loss": 0.1347, + "step": 5681 + }, + { + "epoch": 0.5235177592481688, + "grad_norm": 0.9202841127962849, + "learning_rate": 2.512188478013023e-06, + "loss": 0.1366, + "step": 5682 + }, + { + "epoch": 0.5236098954254388, + "grad_norm": 0.8929983406529515, + "learning_rate": 2.5114267036188975e-06, + "loss": 0.13, + "step": 5683 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 0.9686069240868677, + "learning_rate": 2.510664928163802e-06, + "loss": 0.1517, + "step": 5684 + }, + { + "epoch": 0.5237941677799788, + "grad_norm": 0.8428080692138286, + "learning_rate": 2.5099031517184665e-06, + "loss": 0.1235, + "step": 5685 + }, + { + "epoch": 0.5238863039572488, + "grad_norm": 0.8988271287258455, + "learning_rate": 2.509141374353622e-06, + "loss": 0.1339, + "step": 5686 + }, + { + "epoch": 0.5239784401345188, + "grad_norm": 0.9611827066606157, + "learning_rate": 2.50837959614e-06, + "loss": 0.1489, + "step": 5687 + }, + { + "epoch": 0.5240705763117888, + "grad_norm": 0.9468591677252746, + "learning_rate": 2.5076178171483312e-06, + "loss": 0.1415, + "step": 5688 + }, + { + "epoch": 0.5241627124890589, + "grad_norm": 0.9603892814379588, + "learning_rate": 2.506856037449348e-06, + "loss": 0.14, + "step": 5689 + }, + { + "epoch": 0.5242548486663289, + "grad_norm": 0.9746388238990209, + "learning_rate": 2.50609425711378e-06, + "loss": 0.1549, + "step": 5690 + }, + { + "epoch": 0.5243469848435989, + "grad_norm": 0.9676253072065032, + "learning_rate": 2.505332476212361e-06, + "loss": 0.1547, + "step": 5691 + }, + { + "epoch": 0.5244391210208689, + "grad_norm": 0.9193704129647952, + "learning_rate": 2.50457069481582e-06, + "loss": 0.1412, + "step": 5692 + }, + { + "epoch": 0.5245312571981389, + "grad_norm": 0.8835165046226858, + "learning_rate": 2.50380891299489e-06, + "loss": 0.14, + "step": 5693 + }, + { + "epoch": 0.5246233933754089, + "grad_norm": 0.8897949239988316, + "learning_rate": 2.503047130820302e-06, + "loss": 0.1305, + "step": 5694 + }, + { + "epoch": 0.5247155295526789, + "grad_norm": 0.8582482232047172, + "learning_rate": 2.5022853483627876e-06, + "loss": 0.1291, + "step": 5695 + }, + { + "epoch": 0.5248076657299489, + "grad_norm": 0.9377699199181725, + "learning_rate": 2.5015235656930774e-06, + "loss": 0.1468, + "step": 5696 + }, + { + "epoch": 0.5248998019072189, + "grad_norm": 0.9298891282148648, + "learning_rate": 2.500761782881905e-06, + "loss": 0.1402, + "step": 5697 + }, + { + "epoch": 0.5249919380844889, + "grad_norm": 0.8586736938601531, + "learning_rate": 2.5e-06, + "loss": 0.1319, + "step": 5698 + }, + { + "epoch": 0.5250840742617588, + "grad_norm": 0.856179840256379, + "learning_rate": 2.499238217118095e-06, + "loss": 0.1341, + "step": 5699 + }, + { + "epoch": 0.5251762104390288, + "grad_norm": 0.918400983382606, + "learning_rate": 2.498476434306923e-06, + "loss": 0.1414, + "step": 5700 + }, + { + "epoch": 0.5252683466162988, + "grad_norm": 0.8617014174835537, + "learning_rate": 2.4977146516372137e-06, + "loss": 0.135, + "step": 5701 + }, + { + "epoch": 0.525360482793569, + "grad_norm": 0.9095148426558132, + "learning_rate": 2.496952869179699e-06, + "loss": 0.1414, + "step": 5702 + }, + { + "epoch": 0.525452618970839, + "grad_norm": 0.9003434006030184, + "learning_rate": 2.4961910870051105e-06, + "loss": 0.1304, + "step": 5703 + }, + { + "epoch": 0.5255447551481089, + "grad_norm": 0.9417961533337451, + "learning_rate": 2.49542930518418e-06, + "loss": 0.1421, + "step": 5704 + }, + { + "epoch": 0.5256368913253789, + "grad_norm": 0.9417914777947627, + "learning_rate": 2.49466752378764e-06, + "loss": 0.1425, + "step": 5705 + }, + { + "epoch": 0.5257290275026489, + "grad_norm": 0.8897487906201332, + "learning_rate": 2.4939057428862203e-06, + "loss": 0.1284, + "step": 5706 + }, + { + "epoch": 0.5258211636799189, + "grad_norm": 0.8457401884584623, + "learning_rate": 2.4931439625506522e-06, + "loss": 0.1276, + "step": 5707 + }, + { + "epoch": 0.5259132998571889, + "grad_norm": 0.9013953147336925, + "learning_rate": 2.4923821828516688e-06, + "loss": 0.1426, + "step": 5708 + }, + { + "epoch": 0.5260054360344589, + "grad_norm": 0.8852735984465513, + "learning_rate": 2.491620403860001e-06, + "loss": 0.1377, + "step": 5709 + }, + { + "epoch": 0.5260975722117289, + "grad_norm": 0.9107949064957632, + "learning_rate": 2.4908586256463788e-06, + "loss": 0.1482, + "step": 5710 + }, + { + "epoch": 0.5261897083889989, + "grad_norm": 0.8745259986185256, + "learning_rate": 2.4900968482815344e-06, + "loss": 0.1391, + "step": 5711 + }, + { + "epoch": 0.5262818445662689, + "grad_norm": 0.8978816508895493, + "learning_rate": 2.4893350718361984e-06, + "loss": 0.1323, + "step": 5712 + }, + { + "epoch": 0.5263739807435389, + "grad_norm": 0.9371818854858208, + "learning_rate": 2.488573296381103e-06, + "loss": 0.1392, + "step": 5713 + }, + { + "epoch": 0.5264661169208089, + "grad_norm": 0.8768828375830369, + "learning_rate": 2.487811521986978e-06, + "loss": 0.1366, + "step": 5714 + }, + { + "epoch": 0.5265582530980789, + "grad_norm": 0.9975475708282623, + "learning_rate": 2.4870497487245534e-06, + "loss": 0.1469, + "step": 5715 + }, + { + "epoch": 0.526650389275349, + "grad_norm": 0.9416496571228281, + "learning_rate": 2.486287976664562e-06, + "loss": 0.1516, + "step": 5716 + }, + { + "epoch": 0.526742525452619, + "grad_norm": 0.8976130137898136, + "learning_rate": 2.485526205877734e-06, + "loss": 0.1329, + "step": 5717 + }, + { + "epoch": 0.526834661629889, + "grad_norm": 0.9229030102050523, + "learning_rate": 2.484764436434798e-06, + "loss": 0.1435, + "step": 5718 + }, + { + "epoch": 0.526926797807159, + "grad_norm": 0.848994540276747, + "learning_rate": 2.4840026684064867e-06, + "loss": 0.1249, + "step": 5719 + }, + { + "epoch": 0.527018933984429, + "grad_norm": 0.9202350631334453, + "learning_rate": 2.4832409018635283e-06, + "loss": 0.1428, + "step": 5720 + }, + { + "epoch": 0.527111070161699, + "grad_norm": 0.938200837146183, + "learning_rate": 2.4824791368766555e-06, + "loss": 0.1412, + "step": 5721 + }, + { + "epoch": 0.527203206338969, + "grad_norm": 0.9183623201693331, + "learning_rate": 2.4817173735165958e-06, + "loss": 0.1406, + "step": 5722 + }, + { + "epoch": 0.527295342516239, + "grad_norm": 0.9091326879662819, + "learning_rate": 2.4809556118540795e-06, + "loss": 0.1333, + "step": 5723 + }, + { + "epoch": 0.527387478693509, + "grad_norm": 0.8675851743483034, + "learning_rate": 2.4801938519598372e-06, + "loss": 0.1201, + "step": 5724 + }, + { + "epoch": 0.527479614870779, + "grad_norm": 0.9000877408453869, + "learning_rate": 2.4794320939045988e-06, + "loss": 0.1418, + "step": 5725 + }, + { + "epoch": 0.527571751048049, + "grad_norm": 0.8474851554049467, + "learning_rate": 2.4786703377590914e-06, + "loss": 0.1314, + "step": 5726 + }, + { + "epoch": 0.527663887225319, + "grad_norm": 0.9530774213615841, + "learning_rate": 2.4779085835940457e-06, + "loss": 0.1501, + "step": 5727 + }, + { + "epoch": 0.527756023402589, + "grad_norm": 0.8946042244381394, + "learning_rate": 2.47714683148019e-06, + "loss": 0.1368, + "step": 5728 + }, + { + "epoch": 0.527848159579859, + "grad_norm": 0.9488184152086155, + "learning_rate": 2.476385081488254e-06, + "loss": 0.1416, + "step": 5729 + }, + { + "epoch": 0.5279402957571291, + "grad_norm": 0.8769562906327312, + "learning_rate": 2.4756233336889663e-06, + "loss": 0.1331, + "step": 5730 + }, + { + "epoch": 0.5280324319343991, + "grad_norm": 0.960488041104247, + "learning_rate": 2.474861588153054e-06, + "loss": 0.1434, + "step": 5731 + }, + { + "epoch": 0.5281245681116691, + "grad_norm": 0.9442539249540435, + "learning_rate": 2.4740998449512456e-06, + "loss": 0.1405, + "step": 5732 + }, + { + "epoch": 0.5282167042889391, + "grad_norm": 1.017322220614333, + "learning_rate": 2.4733381041542695e-06, + "loss": 0.1442, + "step": 5733 + }, + { + "epoch": 0.5283088404662091, + "grad_norm": 0.9558560440845575, + "learning_rate": 2.4725763658328544e-06, + "loss": 0.128, + "step": 5734 + }, + { + "epoch": 0.5284009766434791, + "grad_norm": 0.9552439366518181, + "learning_rate": 2.471814630057726e-06, + "loss": 0.1368, + "step": 5735 + }, + { + "epoch": 0.5284931128207491, + "grad_norm": 0.9475504293116053, + "learning_rate": 2.4710528968996114e-06, + "loss": 0.149, + "step": 5736 + }, + { + "epoch": 0.5285852489980191, + "grad_norm": 0.9358738909776354, + "learning_rate": 2.4702911664292397e-06, + "loss": 0.1365, + "step": 5737 + }, + { + "epoch": 0.5286773851752891, + "grad_norm": 0.9334724145108257, + "learning_rate": 2.469529438717336e-06, + "loss": 0.1373, + "step": 5738 + }, + { + "epoch": 0.5287695213525591, + "grad_norm": 0.8697872053181739, + "learning_rate": 2.4687677138346265e-06, + "loss": 0.137, + "step": 5739 + }, + { + "epoch": 0.528861657529829, + "grad_norm": 1.0230475394744452, + "learning_rate": 2.468005991851839e-06, + "loss": 0.1547, + "step": 5740 + }, + { + "epoch": 0.528953793707099, + "grad_norm": 0.9182793634858509, + "learning_rate": 2.467244272839698e-06, + "loss": 0.1402, + "step": 5741 + }, + { + "epoch": 0.529045929884369, + "grad_norm": 0.8635610908929247, + "learning_rate": 2.4664825568689303e-06, + "loss": 0.1316, + "step": 5742 + }, + { + "epoch": 0.5291380660616392, + "grad_norm": 0.9238816088041331, + "learning_rate": 2.4657208440102607e-06, + "loss": 0.1346, + "step": 5743 + }, + { + "epoch": 0.5292302022389092, + "grad_norm": 0.9603841203290647, + "learning_rate": 2.464959134334414e-06, + "loss": 0.1398, + "step": 5744 + }, + { + "epoch": 0.5293223384161792, + "grad_norm": 0.9691785358952262, + "learning_rate": 2.4641974279121157e-06, + "loss": 0.1338, + "step": 5745 + }, + { + "epoch": 0.5294144745934491, + "grad_norm": 0.9811825017453772, + "learning_rate": 2.463435724814091e-06, + "loss": 0.1363, + "step": 5746 + }, + { + "epoch": 0.5295066107707191, + "grad_norm": 0.9089320567911033, + "learning_rate": 2.4626740251110615e-06, + "loss": 0.1451, + "step": 5747 + }, + { + "epoch": 0.5295987469479891, + "grad_norm": 0.9108691062317888, + "learning_rate": 2.461912328873754e-06, + "loss": 0.1357, + "step": 5748 + }, + { + "epoch": 0.5296908831252591, + "grad_norm": 0.9259190835186841, + "learning_rate": 2.46115063617289e-06, + "loss": 0.1492, + "step": 5749 + }, + { + "epoch": 0.5297830193025291, + "grad_norm": 0.9615169324748283, + "learning_rate": 2.4603889470791946e-06, + "loss": 0.128, + "step": 5750 + }, + { + "epoch": 0.5298751554797991, + "grad_norm": 0.9546828640622951, + "learning_rate": 2.4596272616633892e-06, + "loss": 0.1339, + "step": 5751 + }, + { + "epoch": 0.5299672916570691, + "grad_norm": 0.9448443537367963, + "learning_rate": 2.4588655799961968e-06, + "loss": 0.1365, + "step": 5752 + }, + { + "epoch": 0.5300594278343391, + "grad_norm": 0.9490782504336478, + "learning_rate": 2.45810390214834e-06, + "loss": 0.1388, + "step": 5753 + }, + { + "epoch": 0.5301515640116091, + "grad_norm": 0.9427353748495464, + "learning_rate": 2.4573422281905405e-06, + "loss": 0.1451, + "step": 5754 + }, + { + "epoch": 0.5302437001888791, + "grad_norm": 0.9010459704299345, + "learning_rate": 2.456580558193519e-06, + "loss": 0.1378, + "step": 5755 + }, + { + "epoch": 0.5303358363661491, + "grad_norm": 0.9586692997882068, + "learning_rate": 2.4558188922279977e-06, + "loss": 0.147, + "step": 5756 + }, + { + "epoch": 0.5304279725434192, + "grad_norm": 1.0038167732871943, + "learning_rate": 2.4550572303646965e-06, + "loss": 0.155, + "step": 5757 + }, + { + "epoch": 0.5305201087206892, + "grad_norm": 0.9211631756539981, + "learning_rate": 2.454295572674337e-06, + "loss": 0.14, + "step": 5758 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 1.0000059466636613, + "learning_rate": 2.453533919227639e-06, + "loss": 0.1378, + "step": 5759 + }, + { + "epoch": 0.5307043810752292, + "grad_norm": 0.8958358231178939, + "learning_rate": 2.4527722700953205e-06, + "loss": 0.1348, + "step": 5760 + }, + { + "epoch": 0.5307965172524992, + "grad_norm": 0.9202617314939519, + "learning_rate": 2.4520106253481025e-06, + "loss": 0.1408, + "step": 5761 + }, + { + "epoch": 0.5308886534297692, + "grad_norm": 0.9861823309901614, + "learning_rate": 2.451248985056702e-06, + "loss": 0.1464, + "step": 5762 + }, + { + "epoch": 0.5309807896070392, + "grad_norm": 0.9958770704868112, + "learning_rate": 2.4504873492918404e-06, + "loss": 0.1252, + "step": 5763 + }, + { + "epoch": 0.5310729257843092, + "grad_norm": 0.9492656613133602, + "learning_rate": 2.449725718124233e-06, + "loss": 0.1401, + "step": 5764 + }, + { + "epoch": 0.5311650619615792, + "grad_norm": 0.9305337760930398, + "learning_rate": 2.448964091624597e-06, + "loss": 0.1383, + "step": 5765 + }, + { + "epoch": 0.5312571981388492, + "grad_norm": 0.9418538276706124, + "learning_rate": 2.4482024698636514e-06, + "loss": 0.1334, + "step": 5766 + }, + { + "epoch": 0.5313493343161192, + "grad_norm": 0.9056941479460245, + "learning_rate": 2.4474408529121126e-06, + "loss": 0.1379, + "step": 5767 + }, + { + "epoch": 0.5314414704933892, + "grad_norm": 0.823476911768923, + "learning_rate": 2.4466792408406953e-06, + "loss": 0.1158, + "step": 5768 + }, + { + "epoch": 0.5315336066706592, + "grad_norm": 0.8974601975603193, + "learning_rate": 2.445917633720117e-06, + "loss": 0.1366, + "step": 5769 + }, + { + "epoch": 0.5316257428479293, + "grad_norm": 0.8852553284312088, + "learning_rate": 2.4451560316210913e-06, + "loss": 0.1298, + "step": 5770 + }, + { + "epoch": 0.5317178790251993, + "grad_norm": 0.9577532072576866, + "learning_rate": 2.444394434614335e-06, + "loss": 0.1448, + "step": 5771 + }, + { + "epoch": 0.5318100152024693, + "grad_norm": 0.9122078585444239, + "learning_rate": 2.4436328427705612e-06, + "loss": 0.1484, + "step": 5772 + }, + { + "epoch": 0.5319021513797393, + "grad_norm": 0.9341072336908377, + "learning_rate": 2.442871256160483e-06, + "loss": 0.1463, + "step": 5773 + }, + { + "epoch": 0.5319942875570093, + "grad_norm": 0.8852758499988086, + "learning_rate": 2.442109674854815e-06, + "loss": 0.1435, + "step": 5774 + }, + { + "epoch": 0.5320864237342793, + "grad_norm": 0.8580463168445764, + "learning_rate": 2.4413480989242718e-06, + "loss": 0.1247, + "step": 5775 + }, + { + "epoch": 0.5321785599115493, + "grad_norm": 0.9335327520464461, + "learning_rate": 2.440586528439562e-06, + "loss": 0.1404, + "step": 5776 + }, + { + "epoch": 0.5322706960888193, + "grad_norm": 0.8745349837111808, + "learning_rate": 2.4398249634713996e-06, + "loss": 0.1332, + "step": 5777 + }, + { + "epoch": 0.5323628322660893, + "grad_norm": 0.8252889666092439, + "learning_rate": 2.4390634040904965e-06, + "loss": 0.1182, + "step": 5778 + }, + { + "epoch": 0.5324549684433593, + "grad_norm": 0.9157329631624775, + "learning_rate": 2.4383018503675633e-06, + "loss": 0.1345, + "step": 5779 + }, + { + "epoch": 0.5325471046206293, + "grad_norm": 0.8603655816644842, + "learning_rate": 2.437540302373309e-06, + "loss": 0.125, + "step": 5780 + }, + { + "epoch": 0.5326392407978993, + "grad_norm": 0.9038892694652924, + "learning_rate": 2.4367787601784446e-06, + "loss": 0.1371, + "step": 5781 + }, + { + "epoch": 0.5327313769751693, + "grad_norm": 0.92831194618531, + "learning_rate": 2.4360172238536787e-06, + "loss": 0.1364, + "step": 5782 + }, + { + "epoch": 0.5328235131524393, + "grad_norm": 0.9846067383086416, + "learning_rate": 2.435255693469721e-06, + "loss": 0.1423, + "step": 5783 + }, + { + "epoch": 0.5329156493297094, + "grad_norm": 0.990724526005928, + "learning_rate": 2.4344941690972797e-06, + "loss": 0.1498, + "step": 5784 + }, + { + "epoch": 0.5330077855069794, + "grad_norm": 0.9493770573022826, + "learning_rate": 2.4337326508070604e-06, + "loss": 0.139, + "step": 5785 + }, + { + "epoch": 0.5330999216842494, + "grad_norm": 0.9201553217830986, + "learning_rate": 2.4329711386697726e-06, + "loss": 0.1298, + "step": 5786 + }, + { + "epoch": 0.5331920578615194, + "grad_norm": 0.8880673033056966, + "learning_rate": 2.432209632756121e-06, + "loss": 0.1489, + "step": 5787 + }, + { + "epoch": 0.5332841940387893, + "grad_norm": 0.9510070687868549, + "learning_rate": 2.4314481331368133e-06, + "loss": 0.135, + "step": 5788 + }, + { + "epoch": 0.5333763302160593, + "grad_norm": 0.8694595494903921, + "learning_rate": 2.430686639882554e-06, + "loss": 0.132, + "step": 5789 + }, + { + "epoch": 0.5334684663933293, + "grad_norm": 0.881862571605303, + "learning_rate": 2.429925153064046e-06, + "loss": 0.1237, + "step": 5790 + }, + { + "epoch": 0.5335606025705993, + "grad_norm": 0.9202894290372889, + "learning_rate": 2.4291636727519966e-06, + "loss": 0.1287, + "step": 5791 + }, + { + "epoch": 0.5336527387478693, + "grad_norm": 0.8795110496314976, + "learning_rate": 2.428402199017108e-06, + "loss": 0.1265, + "step": 5792 + }, + { + "epoch": 0.5337448749251393, + "grad_norm": 0.973557854750666, + "learning_rate": 2.4276407319300815e-06, + "loss": 0.1515, + "step": 5793 + }, + { + "epoch": 0.5338370111024093, + "grad_norm": 0.9241824971544353, + "learning_rate": 2.4268792715616217e-06, + "loss": 0.1386, + "step": 5794 + }, + { + "epoch": 0.5339291472796793, + "grad_norm": 0.9019945526996254, + "learning_rate": 2.4261178179824287e-06, + "loss": 0.1312, + "step": 5795 + }, + { + "epoch": 0.5340212834569493, + "grad_norm": 0.8982120470146486, + "learning_rate": 2.4253563712632057e-06, + "loss": 0.1329, + "step": 5796 + }, + { + "epoch": 0.5341134196342193, + "grad_norm": 0.8751533441546493, + "learning_rate": 2.4245949314746506e-06, + "loss": 0.1367, + "step": 5797 + }, + { + "epoch": 0.5342055558114894, + "grad_norm": 0.9530832980103735, + "learning_rate": 2.423833498687464e-06, + "loss": 0.1374, + "step": 5798 + }, + { + "epoch": 0.5342976919887594, + "grad_norm": 0.9553902470626439, + "learning_rate": 2.423072072972346e-06, + "loss": 0.1346, + "step": 5799 + }, + { + "epoch": 0.5343898281660294, + "grad_norm": 0.9146688205607726, + "learning_rate": 2.4223106543999943e-06, + "loss": 0.1378, + "step": 5800 + }, + { + "epoch": 0.5344819643432994, + "grad_norm": 0.8880858853753428, + "learning_rate": 2.4215492430411057e-06, + "loss": 0.1275, + "step": 5801 + }, + { + "epoch": 0.5345741005205694, + "grad_norm": 0.9299530782474574, + "learning_rate": 2.4207878389663794e-06, + "loss": 0.1372, + "step": 5802 + }, + { + "epoch": 0.5346662366978394, + "grad_norm": 1.0145172996220377, + "learning_rate": 2.4200264422465096e-06, + "loss": 0.1415, + "step": 5803 + }, + { + "epoch": 0.5347583728751094, + "grad_norm": 0.9577851127728939, + "learning_rate": 2.4192650529521948e-06, + "loss": 0.131, + "step": 5804 + }, + { + "epoch": 0.5348505090523794, + "grad_norm": 0.8902962613744783, + "learning_rate": 2.418503671154128e-06, + "loss": 0.1416, + "step": 5805 + }, + { + "epoch": 0.5349426452296494, + "grad_norm": 0.8831433355746883, + "learning_rate": 2.417742296923003e-06, + "loss": 0.1298, + "step": 5806 + }, + { + "epoch": 0.5350347814069194, + "grad_norm": 0.9091113886860835, + "learning_rate": 2.4169809303295157e-06, + "loss": 0.1372, + "step": 5807 + }, + { + "epoch": 0.5351269175841894, + "grad_norm": 0.920230718279897, + "learning_rate": 2.4162195714443584e-06, + "loss": 0.1336, + "step": 5808 + }, + { + "epoch": 0.5352190537614594, + "grad_norm": 0.9225664409925763, + "learning_rate": 2.4154582203382216e-06, + "loss": 0.1377, + "step": 5809 + }, + { + "epoch": 0.5353111899387294, + "grad_norm": 0.8214010998632829, + "learning_rate": 2.4146968770817988e-06, + "loss": 0.1252, + "step": 5810 + }, + { + "epoch": 0.5354033261159995, + "grad_norm": 0.9037345408786586, + "learning_rate": 2.41393554174578e-06, + "loss": 0.1499, + "step": 5811 + }, + { + "epoch": 0.5354954622932695, + "grad_norm": 0.8879279953804324, + "learning_rate": 2.4131742144008557e-06, + "loss": 0.1366, + "step": 5812 + }, + { + "epoch": 0.5355875984705395, + "grad_norm": 0.8643469307323431, + "learning_rate": 2.4124128951177146e-06, + "loss": 0.119, + "step": 5813 + }, + { + "epoch": 0.5356797346478095, + "grad_norm": 0.9000012611078515, + "learning_rate": 2.4116515839670456e-06, + "loss": 0.1358, + "step": 5814 + }, + { + "epoch": 0.5357718708250795, + "grad_norm": 0.9397044907955544, + "learning_rate": 2.4108902810195367e-06, + "loss": 0.1468, + "step": 5815 + }, + { + "epoch": 0.5358640070023495, + "grad_norm": 0.915825164357773, + "learning_rate": 2.4101289863458744e-06, + "loss": 0.1255, + "step": 5816 + }, + { + "epoch": 0.5359561431796195, + "grad_norm": 0.8662962491574562, + "learning_rate": 2.409367700016746e-06, + "loss": 0.1344, + "step": 5817 + }, + { + "epoch": 0.5360482793568895, + "grad_norm": 0.9247751195485957, + "learning_rate": 2.4086064221028365e-06, + "loss": 0.1302, + "step": 5818 + }, + { + "epoch": 0.5361404155341595, + "grad_norm": 0.9472673515439451, + "learning_rate": 2.40784515267483e-06, + "loss": 0.1407, + "step": 5819 + }, + { + "epoch": 0.5362325517114295, + "grad_norm": 0.9411408748162816, + "learning_rate": 2.407083891803411e-06, + "loss": 0.1394, + "step": 5820 + }, + { + "epoch": 0.5363246878886995, + "grad_norm": 0.9255154027937263, + "learning_rate": 2.4063226395592635e-06, + "loss": 0.1358, + "step": 5821 + }, + { + "epoch": 0.5364168240659695, + "grad_norm": 0.9180421838847347, + "learning_rate": 2.4055613960130676e-06, + "loss": 0.1376, + "step": 5822 + }, + { + "epoch": 0.5365089602432395, + "grad_norm": 0.8784816828161529, + "learning_rate": 2.4048001612355072e-06, + "loss": 0.1375, + "step": 5823 + }, + { + "epoch": 0.5366010964205095, + "grad_norm": 0.854986124528438, + "learning_rate": 2.4040389352972616e-06, + "loss": 0.1345, + "step": 5824 + }, + { + "epoch": 0.5366932325977796, + "grad_norm": 0.8877816767972053, + "learning_rate": 2.403277718269012e-06, + "loss": 0.1287, + "step": 5825 + }, + { + "epoch": 0.5367853687750496, + "grad_norm": 0.8756257399903816, + "learning_rate": 2.4025165102214363e-06, + "loss": 0.1274, + "step": 5826 + }, + { + "epoch": 0.5368775049523196, + "grad_norm": 0.9513688328099061, + "learning_rate": 2.4017553112252123e-06, + "loss": 0.1427, + "step": 5827 + }, + { + "epoch": 0.5369696411295896, + "grad_norm": 0.9443193310474314, + "learning_rate": 2.400994121351019e-06, + "loss": 0.1422, + "step": 5828 + }, + { + "epoch": 0.5370617773068596, + "grad_norm": 0.8714929349419318, + "learning_rate": 2.4002329406695325e-06, + "loss": 0.135, + "step": 5829 + }, + { + "epoch": 0.5371539134841296, + "grad_norm": 0.9829188571248488, + "learning_rate": 2.3994717692514274e-06, + "loss": 0.1479, + "step": 5830 + }, + { + "epoch": 0.5372460496613995, + "grad_norm": 0.8778193951704442, + "learning_rate": 2.3987106071673797e-06, + "loss": 0.1339, + "step": 5831 + }, + { + "epoch": 0.5373381858386695, + "grad_norm": 0.8295131686996187, + "learning_rate": 2.3979494544880625e-06, + "loss": 0.117, + "step": 5832 + }, + { + "epoch": 0.5374303220159395, + "grad_norm": 0.9603954748107639, + "learning_rate": 2.3971883112841505e-06, + "loss": 0.1481, + "step": 5833 + }, + { + "epoch": 0.5375224581932095, + "grad_norm": 0.9293362890021563, + "learning_rate": 2.3964271776263146e-06, + "loss": 0.1448, + "step": 5834 + }, + { + "epoch": 0.5376145943704795, + "grad_norm": 0.9414302795244884, + "learning_rate": 2.3956660535852254e-06, + "loss": 0.1403, + "step": 5835 + }, + { + "epoch": 0.5377067305477495, + "grad_norm": 0.9002162943398508, + "learning_rate": 2.3949049392315555e-06, + "loss": 0.1313, + "step": 5836 + }, + { + "epoch": 0.5377988667250195, + "grad_norm": 0.8896376778803587, + "learning_rate": 2.394143834635973e-06, + "loss": 0.132, + "step": 5837 + }, + { + "epoch": 0.5378910029022896, + "grad_norm": 0.9191022468212096, + "learning_rate": 2.3933827398691464e-06, + "loss": 0.1291, + "step": 5838 + }, + { + "epoch": 0.5379831390795596, + "grad_norm": 0.9652932862586016, + "learning_rate": 2.3926216550017445e-06, + "loss": 0.1402, + "step": 5839 + }, + { + "epoch": 0.5380752752568296, + "grad_norm": 0.8928366891316858, + "learning_rate": 2.3918605801044325e-06, + "loss": 0.1403, + "step": 5840 + }, + { + "epoch": 0.5381674114340996, + "grad_norm": 0.9072163582373372, + "learning_rate": 2.3910995152478786e-06, + "loss": 0.1399, + "step": 5841 + }, + { + "epoch": 0.5382595476113696, + "grad_norm": 1.5408775770789918, + "learning_rate": 2.3903384605027462e-06, + "loss": 0.1339, + "step": 5842 + }, + { + "epoch": 0.5383516837886396, + "grad_norm": 0.9626327273310918, + "learning_rate": 2.3895774159396982e-06, + "loss": 0.1425, + "step": 5843 + }, + { + "epoch": 0.5384438199659096, + "grad_norm": 0.8853401330370692, + "learning_rate": 2.3888163816294e-06, + "loss": 0.1345, + "step": 5844 + }, + { + "epoch": 0.5385359561431796, + "grad_norm": 0.8572199289738917, + "learning_rate": 2.3880553576425124e-06, + "loss": 0.1313, + "step": 5845 + }, + { + "epoch": 0.5386280923204496, + "grad_norm": 0.9431335769657753, + "learning_rate": 2.3872943440496978e-06, + "loss": 0.1472, + "step": 5846 + }, + { + "epoch": 0.5387202284977196, + "grad_norm": 0.8765177735234764, + "learning_rate": 2.386533340921615e-06, + "loss": 0.1413, + "step": 5847 + }, + { + "epoch": 0.5388123646749896, + "grad_norm": 1.0028906351410283, + "learning_rate": 2.385772348328923e-06, + "loss": 0.1584, + "step": 5848 + }, + { + "epoch": 0.5389045008522596, + "grad_norm": 0.9002792229676244, + "learning_rate": 2.385011366342281e-06, + "loss": 0.1407, + "step": 5849 + }, + { + "epoch": 0.5389966370295296, + "grad_norm": 0.8770503850906126, + "learning_rate": 2.3842503950323473e-06, + "loss": 0.136, + "step": 5850 + }, + { + "epoch": 0.5390887732067996, + "grad_norm": 0.9336646528001852, + "learning_rate": 2.383489434469775e-06, + "loss": 0.1426, + "step": 5851 + }, + { + "epoch": 0.5391809093840697, + "grad_norm": 0.9010134456089555, + "learning_rate": 2.382728484725222e-06, + "loss": 0.1341, + "step": 5852 + }, + { + "epoch": 0.5392730455613397, + "grad_norm": 0.8942998501108559, + "learning_rate": 2.3819675458693422e-06, + "loss": 0.1409, + "step": 5853 + }, + { + "epoch": 0.5393651817386097, + "grad_norm": 0.9340316091553998, + "learning_rate": 2.381206617972789e-06, + "loss": 0.1444, + "step": 5854 + }, + { + "epoch": 0.5394573179158797, + "grad_norm": 0.8876144334710793, + "learning_rate": 2.3804457011062126e-06, + "loss": 0.1398, + "step": 5855 + }, + { + "epoch": 0.5395494540931497, + "grad_norm": 0.9016840240138915, + "learning_rate": 2.3796847953402665e-06, + "loss": 0.1225, + "step": 5856 + }, + { + "epoch": 0.5396415902704197, + "grad_norm": 0.9181794340809388, + "learning_rate": 2.3789239007455996e-06, + "loss": 0.1405, + "step": 5857 + }, + { + "epoch": 0.5397337264476897, + "grad_norm": 0.8881973608279808, + "learning_rate": 2.3781630173928627e-06, + "loss": 0.1347, + "step": 5858 + }, + { + "epoch": 0.5398258626249597, + "grad_norm": 0.856762225089732, + "learning_rate": 2.3774021453527023e-06, + "loss": 0.1193, + "step": 5859 + }, + { + "epoch": 0.5399179988022297, + "grad_norm": 0.9519632501690944, + "learning_rate": 2.3766412846957652e-06, + "loss": 0.1485, + "step": 5860 + }, + { + "epoch": 0.5400101349794997, + "grad_norm": 0.9612225735542497, + "learning_rate": 2.3758804354926986e-06, + "loss": 0.1391, + "step": 5861 + }, + { + "epoch": 0.5401022711567697, + "grad_norm": 0.934863489073019, + "learning_rate": 2.375119597814147e-06, + "loss": 0.1455, + "step": 5862 + }, + { + "epoch": 0.5401944073340397, + "grad_norm": 0.8937423146747298, + "learning_rate": 2.374358771730754e-06, + "loss": 0.1285, + "step": 5863 + }, + { + "epoch": 0.5402865435113097, + "grad_norm": 0.9940701675808745, + "learning_rate": 2.3735979573131626e-06, + "loss": 0.147, + "step": 5864 + }, + { + "epoch": 0.5403786796885797, + "grad_norm": 1.0060354786289987, + "learning_rate": 2.372837154632014e-06, + "loss": 0.1289, + "step": 5865 + }, + { + "epoch": 0.5404708158658498, + "grad_norm": 0.9563270691350768, + "learning_rate": 2.37207636375795e-06, + "loss": 0.132, + "step": 5866 + }, + { + "epoch": 0.5405629520431198, + "grad_norm": 0.9225113435435278, + "learning_rate": 2.3713155847616095e-06, + "loss": 0.1444, + "step": 5867 + }, + { + "epoch": 0.5406550882203898, + "grad_norm": 0.9051134560491042, + "learning_rate": 2.37055481771363e-06, + "loss": 0.1257, + "step": 5868 + }, + { + "epoch": 0.5407472243976598, + "grad_norm": 0.9125435043547829, + "learning_rate": 2.3697940626846504e-06, + "loss": 0.1366, + "step": 5869 + }, + { + "epoch": 0.5408393605749298, + "grad_norm": 0.8827037590314208, + "learning_rate": 2.369033319745306e-06, + "loss": 0.1278, + "step": 5870 + }, + { + "epoch": 0.5409314967521998, + "grad_norm": 0.9542353104247997, + "learning_rate": 2.3682725889662316e-06, + "loss": 0.1316, + "step": 5871 + }, + { + "epoch": 0.5410236329294698, + "grad_norm": 0.9436897926471467, + "learning_rate": 2.3675118704180614e-06, + "loss": 0.1375, + "step": 5872 + }, + { + "epoch": 0.5411157691067398, + "grad_norm": 0.9053198809341689, + "learning_rate": 2.366751164171428e-06, + "loss": 0.1433, + "step": 5873 + }, + { + "epoch": 0.5412079052840097, + "grad_norm": 0.8602873340218734, + "learning_rate": 2.3659904702969636e-06, + "loss": 0.1339, + "step": 5874 + }, + { + "epoch": 0.5413000414612797, + "grad_norm": 0.8998889070188997, + "learning_rate": 2.3652297888653e-06, + "loss": 0.1409, + "step": 5875 + }, + { + "epoch": 0.5413921776385497, + "grad_norm": 0.9105525894251323, + "learning_rate": 2.3644691199470628e-06, + "loss": 0.1369, + "step": 5876 + }, + { + "epoch": 0.5414843138158197, + "grad_norm": 0.9362291238439949, + "learning_rate": 2.3637084636128836e-06, + "loss": 0.1356, + "step": 5877 + }, + { + "epoch": 0.5415764499930897, + "grad_norm": 0.8885859902523415, + "learning_rate": 2.3629478199333873e-06, + "loss": 0.1328, + "step": 5878 + }, + { + "epoch": 0.5416685861703598, + "grad_norm": 0.9125123891199016, + "learning_rate": 2.362187188979202e-06, + "loss": 0.1368, + "step": 5879 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 0.8937998412152254, + "learning_rate": 2.3614265708209503e-06, + "loss": 0.1324, + "step": 5880 + }, + { + "epoch": 0.5418528585248998, + "grad_norm": 0.8888623370293548, + "learning_rate": 2.360665965529256e-06, + "loss": 0.1336, + "step": 5881 + }, + { + "epoch": 0.5419449947021698, + "grad_norm": 0.9001269534353986, + "learning_rate": 2.3599053731747424e-06, + "loss": 0.1396, + "step": 5882 + }, + { + "epoch": 0.5420371308794398, + "grad_norm": 0.9073225423258994, + "learning_rate": 2.3591447938280304e-06, + "loss": 0.1349, + "step": 5883 + }, + { + "epoch": 0.5421292670567098, + "grad_norm": 0.9664393080376217, + "learning_rate": 2.3583842275597382e-06, + "loss": 0.1362, + "step": 5884 + }, + { + "epoch": 0.5422214032339798, + "grad_norm": 0.9234357650375972, + "learning_rate": 2.3576236744404866e-06, + "loss": 0.1383, + "step": 5885 + }, + { + "epoch": 0.5423135394112498, + "grad_norm": 0.8729256712212028, + "learning_rate": 2.3568631345408912e-06, + "loss": 0.1326, + "step": 5886 + }, + { + "epoch": 0.5424056755885198, + "grad_norm": 0.9081475409204117, + "learning_rate": 2.3561026079315707e-06, + "loss": 0.1271, + "step": 5887 + }, + { + "epoch": 0.5424978117657898, + "grad_norm": 0.9919399955510334, + "learning_rate": 2.3553420946831377e-06, + "loss": 0.159, + "step": 5888 + }, + { + "epoch": 0.5425899479430598, + "grad_norm": 0.8465349151729697, + "learning_rate": 2.3545815948662066e-06, + "loss": 0.1246, + "step": 5889 + }, + { + "epoch": 0.5426820841203298, + "grad_norm": 0.8814514667720759, + "learning_rate": 2.3538211085513902e-06, + "loss": 0.1258, + "step": 5890 + }, + { + "epoch": 0.5427742202975998, + "grad_norm": 0.9320930510441189, + "learning_rate": 2.3530606358093e-06, + "loss": 0.1336, + "step": 5891 + }, + { + "epoch": 0.5428663564748698, + "grad_norm": 0.8891625231779033, + "learning_rate": 2.352300176710545e-06, + "loss": 0.1367, + "step": 5892 + }, + { + "epoch": 0.5429584926521399, + "grad_norm": 0.9387961274368829, + "learning_rate": 2.351539731325735e-06, + "loss": 0.1397, + "step": 5893 + }, + { + "epoch": 0.5430506288294099, + "grad_norm": 0.8696939578101172, + "learning_rate": 2.350779299725476e-06, + "loss": 0.1364, + "step": 5894 + }, + { + "epoch": 0.5431427650066799, + "grad_norm": 0.9477061229980223, + "learning_rate": 2.3500188819803764e-06, + "loss": 0.1488, + "step": 5895 + }, + { + "epoch": 0.5432349011839499, + "grad_norm": 0.8300016678907538, + "learning_rate": 2.3492584781610392e-06, + "loss": 0.1199, + "step": 5896 + }, + { + "epoch": 0.5433270373612199, + "grad_norm": 0.9676273385906484, + "learning_rate": 2.3484980883380677e-06, + "loss": 0.1389, + "step": 5897 + }, + { + "epoch": 0.5434191735384899, + "grad_norm": 0.9146011666455297, + "learning_rate": 2.347737712582066e-06, + "loss": 0.1288, + "step": 5898 + }, + { + "epoch": 0.5435113097157599, + "grad_norm": 0.8746026606193679, + "learning_rate": 2.3469773509636346e-06, + "loss": 0.1253, + "step": 5899 + }, + { + "epoch": 0.5436034458930299, + "grad_norm": 0.9163120972409077, + "learning_rate": 2.3462170035533713e-06, + "loss": 0.1383, + "step": 5900 + }, + { + "epoch": 0.5436955820702999, + "grad_norm": 0.9377392397159249, + "learning_rate": 2.345456670421876e-06, + "loss": 0.1444, + "step": 5901 + }, + { + "epoch": 0.5437877182475699, + "grad_norm": 0.9564357005506579, + "learning_rate": 2.3446963516397455e-06, + "loss": 0.1432, + "step": 5902 + }, + { + "epoch": 0.5438798544248399, + "grad_norm": 0.9273350082364941, + "learning_rate": 2.3439360472775758e-06, + "loss": 0.1398, + "step": 5903 + }, + { + "epoch": 0.5439719906021099, + "grad_norm": 0.8985150989161702, + "learning_rate": 2.3431757574059616e-06, + "loss": 0.1324, + "step": 5904 + }, + { + "epoch": 0.5440641267793799, + "grad_norm": 0.9057262190585165, + "learning_rate": 2.342415482095494e-06, + "loss": 0.1322, + "step": 5905 + }, + { + "epoch": 0.54415626295665, + "grad_norm": 0.9487898862954096, + "learning_rate": 2.341655221416766e-06, + "loss": 0.1367, + "step": 5906 + }, + { + "epoch": 0.54424839913392, + "grad_norm": 0.8907811006669535, + "learning_rate": 2.3408949754403678e-06, + "loss": 0.1257, + "step": 5907 + }, + { + "epoch": 0.54434053531119, + "grad_norm": 0.8690356403332792, + "learning_rate": 2.340134744236889e-06, + "loss": 0.1194, + "step": 5908 + }, + { + "epoch": 0.54443267148846, + "grad_norm": 0.9402466917106319, + "learning_rate": 2.3393745278769163e-06, + "loss": 0.1393, + "step": 5909 + }, + { + "epoch": 0.54452480766573, + "grad_norm": 0.9079624440722254, + "learning_rate": 2.3386143264310348e-06, + "loss": 0.1338, + "step": 5910 + }, + { + "epoch": 0.544616943843, + "grad_norm": 0.9249329243264749, + "learning_rate": 2.3378541399698314e-06, + "loss": 0.1367, + "step": 5911 + }, + { + "epoch": 0.54470908002027, + "grad_norm": 0.9118627413475253, + "learning_rate": 2.337093968563889e-06, + "loss": 0.1267, + "step": 5912 + }, + { + "epoch": 0.54480121619754, + "grad_norm": 0.8645824091337139, + "learning_rate": 2.336333812283788e-06, + "loss": 0.1214, + "step": 5913 + }, + { + "epoch": 0.54489335237481, + "grad_norm": 0.8851023831702883, + "learning_rate": 2.3355736712001107e-06, + "loss": 0.1425, + "step": 5914 + }, + { + "epoch": 0.54498548855208, + "grad_norm": 0.9789065324109237, + "learning_rate": 2.3348135453834353e-06, + "loss": 0.1477, + "step": 5915 + }, + { + "epoch": 0.54507762472935, + "grad_norm": 0.8929699059776705, + "learning_rate": 2.3340534349043407e-06, + "loss": 0.1324, + "step": 5916 + }, + { + "epoch": 0.54516976090662, + "grad_norm": 0.8525535061935612, + "learning_rate": 2.3332933398334028e-06, + "loss": 0.1348, + "step": 5917 + }, + { + "epoch": 0.54526189708389, + "grad_norm": 0.9265971942479982, + "learning_rate": 2.332533260241195e-06, + "loss": 0.146, + "step": 5918 + }, + { + "epoch": 0.5453540332611599, + "grad_norm": 0.8893191415358593, + "learning_rate": 2.3317731961982926e-06, + "loss": 0.1422, + "step": 5919 + }, + { + "epoch": 0.54544616943843, + "grad_norm": 0.8715215907778809, + "learning_rate": 2.331013147775268e-06, + "loss": 0.1322, + "step": 5920 + }, + { + "epoch": 0.5455383056157, + "grad_norm": 0.9232093592061419, + "learning_rate": 2.3302531150426894e-06, + "loss": 0.1514, + "step": 5921 + }, + { + "epoch": 0.54563044179297, + "grad_norm": 0.9221326425086337, + "learning_rate": 2.329493098071128e-06, + "loss": 0.1377, + "step": 5922 + }, + { + "epoch": 0.54572257797024, + "grad_norm": 0.8629100101924353, + "learning_rate": 2.32873309693115e-06, + "loss": 0.1292, + "step": 5923 + }, + { + "epoch": 0.54581471414751, + "grad_norm": 0.8764601317643181, + "learning_rate": 2.3279731116933235e-06, + "loss": 0.1382, + "step": 5924 + }, + { + "epoch": 0.54590685032478, + "grad_norm": 0.9527189795795429, + "learning_rate": 2.327213142428212e-06, + "loss": 0.1412, + "step": 5925 + }, + { + "epoch": 0.54599898650205, + "grad_norm": 0.8514752428752124, + "learning_rate": 2.326453189206378e-06, + "loss": 0.137, + "step": 5926 + }, + { + "epoch": 0.54609112267932, + "grad_norm": 0.8631535978765967, + "learning_rate": 2.325693252098384e-06, + "loss": 0.1302, + "step": 5927 + }, + { + "epoch": 0.54618325885659, + "grad_norm": 0.9256537186435602, + "learning_rate": 2.324933331174792e-06, + "loss": 0.1399, + "step": 5928 + }, + { + "epoch": 0.54627539503386, + "grad_norm": 0.9165258915845617, + "learning_rate": 2.3241734265061573e-06, + "loss": 0.134, + "step": 5929 + }, + { + "epoch": 0.54636753121113, + "grad_norm": 0.9783289918459979, + "learning_rate": 2.323413538163039e-06, + "loss": 0.137, + "step": 5930 + }, + { + "epoch": 0.5464596673884, + "grad_norm": 0.952969188227047, + "learning_rate": 2.322653666215993e-06, + "loss": 0.1499, + "step": 5931 + }, + { + "epoch": 0.54655180356567, + "grad_norm": 0.8964130572884526, + "learning_rate": 2.3218938107355727e-06, + "loss": 0.1367, + "step": 5932 + }, + { + "epoch": 0.54664393974294, + "grad_norm": 0.8954517651328167, + "learning_rate": 2.3211339717923326e-06, + "loss": 0.1226, + "step": 5933 + }, + { + "epoch": 0.5467360759202101, + "grad_norm": 0.9005748837624358, + "learning_rate": 2.320374149456822e-06, + "loss": 0.1346, + "step": 5934 + }, + { + "epoch": 0.5468282120974801, + "grad_norm": 0.9435439221997367, + "learning_rate": 2.31961434379959e-06, + "loss": 0.1442, + "step": 5935 + }, + { + "epoch": 0.5469203482747501, + "grad_norm": 0.8486317533267757, + "learning_rate": 2.3188545548911863e-06, + "loss": 0.1176, + "step": 5936 + }, + { + "epoch": 0.5470124844520201, + "grad_norm": 0.8973418783092466, + "learning_rate": 2.3180947828021574e-06, + "loss": 0.1255, + "step": 5937 + }, + { + "epoch": 0.5471046206292901, + "grad_norm": 0.975856203440726, + "learning_rate": 2.317335027603046e-06, + "loss": 0.1382, + "step": 5938 + }, + { + "epoch": 0.5471967568065601, + "grad_norm": 0.8448417831997888, + "learning_rate": 2.3165752893643974e-06, + "loss": 0.1218, + "step": 5939 + }, + { + "epoch": 0.5472888929838301, + "grad_norm": 0.8631379001410238, + "learning_rate": 2.315815568156753e-06, + "loss": 0.1274, + "step": 5940 + }, + { + "epoch": 0.5473810291611001, + "grad_norm": 0.8739460925822028, + "learning_rate": 2.315055864050654e-06, + "loss": 0.1343, + "step": 5941 + }, + { + "epoch": 0.5474731653383701, + "grad_norm": 0.9056750460446128, + "learning_rate": 2.314296177116637e-06, + "loss": 0.1456, + "step": 5942 + }, + { + "epoch": 0.5475653015156401, + "grad_norm": 0.8597999521946855, + "learning_rate": 2.3135365074252393e-06, + "loss": 0.1229, + "step": 5943 + }, + { + "epoch": 0.5476574376929101, + "grad_norm": 0.9429380689039017, + "learning_rate": 2.3127768550469977e-06, + "loss": 0.1517, + "step": 5944 + }, + { + "epoch": 0.5477495738701801, + "grad_norm": 0.9326983964366256, + "learning_rate": 2.3120172200524456e-06, + "loss": 0.1409, + "step": 5945 + }, + { + "epoch": 0.5478417100474501, + "grad_norm": 0.9029183579785335, + "learning_rate": 2.311257602512114e-06, + "loss": 0.1375, + "step": 5946 + }, + { + "epoch": 0.5479338462247202, + "grad_norm": 0.8405914319952673, + "learning_rate": 2.310498002496535e-06, + "loss": 0.125, + "step": 5947 + }, + { + "epoch": 0.5480259824019902, + "grad_norm": 0.8963143530428416, + "learning_rate": 2.309738420076236e-06, + "loss": 0.1362, + "step": 5948 + }, + { + "epoch": 0.5481181185792602, + "grad_norm": 0.9373725224516829, + "learning_rate": 2.308978855321746e-06, + "loss": 0.1381, + "step": 5949 + }, + { + "epoch": 0.5482102547565302, + "grad_norm": 0.904479495310587, + "learning_rate": 2.30821930830359e-06, + "loss": 0.1344, + "step": 5950 + }, + { + "epoch": 0.5483023909338002, + "grad_norm": 0.947211874283797, + "learning_rate": 2.307459779092291e-06, + "loss": 0.1391, + "step": 5951 + }, + { + "epoch": 0.5483945271110702, + "grad_norm": 0.9137280527447117, + "learning_rate": 2.306700267758373e-06, + "loss": 0.1244, + "step": 5952 + }, + { + "epoch": 0.5484866632883402, + "grad_norm": 0.887794582900808, + "learning_rate": 2.3059407743723562e-06, + "loss": 0.1236, + "step": 5953 + }, + { + "epoch": 0.5485787994656102, + "grad_norm": 0.8957336178387945, + "learning_rate": 2.305181299004758e-06, + "loss": 0.1319, + "step": 5954 + }, + { + "epoch": 0.5486709356428802, + "grad_norm": 0.9285491033008205, + "learning_rate": 2.304421841726098e-06, + "loss": 0.1375, + "step": 5955 + }, + { + "epoch": 0.5487630718201502, + "grad_norm": 0.9000401889585357, + "learning_rate": 2.303662402606891e-06, + "loss": 0.1394, + "step": 5956 + }, + { + "epoch": 0.5488552079974202, + "grad_norm": 0.8887392546394776, + "learning_rate": 2.3029029817176513e-06, + "loss": 0.1241, + "step": 5957 + }, + { + "epoch": 0.5489473441746902, + "grad_norm": 0.9063498591456991, + "learning_rate": 2.302143579128891e-06, + "loss": 0.1452, + "step": 5958 + }, + { + "epoch": 0.5490394803519602, + "grad_norm": 0.9179438604712449, + "learning_rate": 2.30138419491112e-06, + "loss": 0.1393, + "step": 5959 + }, + { + "epoch": 0.5491316165292301, + "grad_norm": 0.9440096748055181, + "learning_rate": 2.3006248291348483e-06, + "loss": 0.1458, + "step": 5960 + }, + { + "epoch": 0.5492237527065003, + "grad_norm": 0.8571329043484666, + "learning_rate": 2.2998654818705824e-06, + "loss": 0.1258, + "step": 5961 + }, + { + "epoch": 0.5493158888837703, + "grad_norm": 0.974799513318454, + "learning_rate": 2.2991061531888285e-06, + "loss": 0.146, + "step": 5962 + }, + { + "epoch": 0.5494080250610403, + "grad_norm": 0.9535479404809656, + "learning_rate": 2.29834684316009e-06, + "loss": 0.1387, + "step": 5963 + }, + { + "epoch": 0.5495001612383102, + "grad_norm": 0.9528230339650521, + "learning_rate": 2.297587551854868e-06, + "loss": 0.1313, + "step": 5964 + }, + { + "epoch": 0.5495922974155802, + "grad_norm": 0.9235402560277223, + "learning_rate": 2.296828279343664e-06, + "loss": 0.1291, + "step": 5965 + }, + { + "epoch": 0.5496844335928502, + "grad_norm": 0.858766032670027, + "learning_rate": 2.2960690256969774e-06, + "loss": 0.1261, + "step": 5966 + }, + { + "epoch": 0.5497765697701202, + "grad_norm": 1.0036102049390694, + "learning_rate": 2.2953097909853018e-06, + "loss": 0.15, + "step": 5967 + }, + { + "epoch": 0.5498687059473902, + "grad_norm": 0.9197290368123798, + "learning_rate": 2.294550575279135e-06, + "loss": 0.1263, + "step": 5968 + }, + { + "epoch": 0.5499608421246602, + "grad_norm": 0.9410944923888054, + "learning_rate": 2.293791378648969e-06, + "loss": 0.1267, + "step": 5969 + }, + { + "epoch": 0.5500529783019302, + "grad_norm": 0.9187152074045628, + "learning_rate": 2.2930322011652965e-06, + "loss": 0.1332, + "step": 5970 + }, + { + "epoch": 0.5501451144792002, + "grad_norm": 0.877662807921679, + "learning_rate": 2.2922730428986057e-06, + "loss": 0.1323, + "step": 5971 + }, + { + "epoch": 0.5502372506564702, + "grad_norm": 0.9595621542293774, + "learning_rate": 2.291513903919385e-06, + "loss": 0.1587, + "step": 5972 + }, + { + "epoch": 0.5503293868337402, + "grad_norm": 0.9391446872559864, + "learning_rate": 2.2907547842981213e-06, + "loss": 0.1322, + "step": 5973 + }, + { + "epoch": 0.5504215230110103, + "grad_norm": 0.9539499349532112, + "learning_rate": 2.289995684105299e-06, + "loss": 0.1416, + "step": 5974 + }, + { + "epoch": 0.5505136591882803, + "grad_norm": 0.9231926460585064, + "learning_rate": 2.2892366034113988e-06, + "loss": 0.1305, + "step": 5975 + }, + { + "epoch": 0.5506057953655503, + "grad_norm": 0.9016548074632982, + "learning_rate": 2.288477542286903e-06, + "loss": 0.1392, + "step": 5976 + }, + { + "epoch": 0.5506979315428203, + "grad_norm": 0.942515622309317, + "learning_rate": 2.2877185008022896e-06, + "loss": 0.143, + "step": 5977 + }, + { + "epoch": 0.5507900677200903, + "grad_norm": 0.9349815447766366, + "learning_rate": 2.2869594790280376e-06, + "loss": 0.1456, + "step": 5978 + }, + { + "epoch": 0.5508822038973603, + "grad_norm": 0.9109243804576193, + "learning_rate": 2.2862004770346205e-06, + "loss": 0.1407, + "step": 5979 + }, + { + "epoch": 0.5509743400746303, + "grad_norm": 0.8404738991971081, + "learning_rate": 2.285441494892511e-06, + "loss": 0.1245, + "step": 5980 + }, + { + "epoch": 0.5510664762519003, + "grad_norm": 0.9338626947780788, + "learning_rate": 2.284682532672183e-06, + "loss": 0.1445, + "step": 5981 + }, + { + "epoch": 0.5511586124291703, + "grad_norm": 0.9020789497422819, + "learning_rate": 2.2839235904441054e-06, + "loss": 0.1344, + "step": 5982 + }, + { + "epoch": 0.5512507486064403, + "grad_norm": 0.8908428591725923, + "learning_rate": 2.2831646682787443e-06, + "loss": 0.1343, + "step": 5983 + }, + { + "epoch": 0.5513428847837103, + "grad_norm": 0.9079304182626624, + "learning_rate": 2.282405766246568e-06, + "loss": 0.1287, + "step": 5984 + }, + { + "epoch": 0.5514350209609803, + "grad_norm": 0.8329410231373364, + "learning_rate": 2.281646884418039e-06, + "loss": 0.1135, + "step": 5985 + }, + { + "epoch": 0.5515271571382503, + "grad_norm": 0.9381099953259999, + "learning_rate": 2.280888022863621e-06, + "loss": 0.1405, + "step": 5986 + }, + { + "epoch": 0.5516192933155203, + "grad_norm": 0.8513722262685598, + "learning_rate": 2.2801291816537738e-06, + "loss": 0.1265, + "step": 5987 + }, + { + "epoch": 0.5517114294927904, + "grad_norm": 0.8820089761459324, + "learning_rate": 2.2793703608589547e-06, + "loss": 0.1432, + "step": 5988 + }, + { + "epoch": 0.5518035656700604, + "grad_norm": 0.9175146678877896, + "learning_rate": 2.2786115605496224e-06, + "loss": 0.1372, + "step": 5989 + }, + { + "epoch": 0.5518957018473304, + "grad_norm": 0.8941107121696663, + "learning_rate": 2.2778527807962297e-06, + "loss": 0.1289, + "step": 5990 + }, + { + "epoch": 0.5519878380246004, + "grad_norm": 0.9554274388492524, + "learning_rate": 2.277094021669231e-06, + "loss": 0.1451, + "step": 5991 + }, + { + "epoch": 0.5520799742018704, + "grad_norm": 0.8202855336317009, + "learning_rate": 2.2763352832390762e-06, + "loss": 0.1209, + "step": 5992 + }, + { + "epoch": 0.5521721103791404, + "grad_norm": 0.8890984039392089, + "learning_rate": 2.2755765655762135e-06, + "loss": 0.1273, + "step": 5993 + }, + { + "epoch": 0.5522642465564104, + "grad_norm": 0.9178667132404417, + "learning_rate": 2.2748178687510915e-06, + "loss": 0.1426, + "step": 5994 + }, + { + "epoch": 0.5523563827336804, + "grad_norm": 0.8546557060383881, + "learning_rate": 2.2740591928341552e-06, + "loss": 0.1335, + "step": 5995 + }, + { + "epoch": 0.5524485189109504, + "grad_norm": 0.9204965701548999, + "learning_rate": 2.2733005378958462e-06, + "loss": 0.1343, + "step": 5996 + }, + { + "epoch": 0.5525406550882204, + "grad_norm": 0.9234067877974934, + "learning_rate": 2.2725419040066075e-06, + "loss": 0.1346, + "step": 5997 + }, + { + "epoch": 0.5526327912654904, + "grad_norm": 0.9161899474482855, + "learning_rate": 2.2717832912368766e-06, + "loss": 0.133, + "step": 5998 + }, + { + "epoch": 0.5527249274427604, + "grad_norm": 0.8984198063815197, + "learning_rate": 2.271024699657093e-06, + "loss": 0.1397, + "step": 5999 + }, + { + "epoch": 0.5528170636200304, + "grad_norm": 0.8464187626712723, + "learning_rate": 2.2702661293376895e-06, + "loss": 0.1195, + "step": 6000 + }, + { + "epoch": 0.5528170636200304, + "eval_loss": 0.13545145094394684, + "eval_runtime": 299.9711, + "eval_samples_per_second": 23.392, + "eval_steps_per_second": 2.927, + "step": 6000 + }, + { + "epoch": 0.5529091997973004, + "grad_norm": 0.8673690915958489, + "learning_rate": 2.269507580349101e-06, + "loss": 0.1426, + "step": 6001 + }, + { + "epoch": 0.5530013359745705, + "grad_norm": 0.8595959230817573, + "learning_rate": 2.2687490527617575e-06, + "loss": 0.1308, + "step": 6002 + }, + { + "epoch": 0.5530934721518405, + "grad_norm": 0.8980906183910887, + "learning_rate": 2.2679905466460917e-06, + "loss": 0.1319, + "step": 6003 + }, + { + "epoch": 0.5531856083291105, + "grad_norm": 0.9618260702341198, + "learning_rate": 2.2672320620725265e-06, + "loss": 0.143, + "step": 6004 + }, + { + "epoch": 0.5532777445063805, + "grad_norm": 0.8827191753043445, + "learning_rate": 2.2664735991114893e-06, + "loss": 0.1325, + "step": 6005 + }, + { + "epoch": 0.5533698806836505, + "grad_norm": 0.9068695564847284, + "learning_rate": 2.2657151578334046e-06, + "loss": 0.131, + "step": 6006 + }, + { + "epoch": 0.5534620168609204, + "grad_norm": 0.895570377791969, + "learning_rate": 2.264956738308693e-06, + "loss": 0.1286, + "step": 6007 + }, + { + "epoch": 0.5535541530381904, + "grad_norm": 0.8658652374176646, + "learning_rate": 2.2641983406077726e-06, + "loss": 0.1287, + "step": 6008 + }, + { + "epoch": 0.5536462892154604, + "grad_norm": 0.9105693225657342, + "learning_rate": 2.2634399648010623e-06, + "loss": 0.1358, + "step": 6009 + }, + { + "epoch": 0.5537384253927304, + "grad_norm": 0.9705522735493408, + "learning_rate": 2.262681610958976e-06, + "loss": 0.1461, + "step": 6010 + }, + { + "epoch": 0.5538305615700004, + "grad_norm": 0.9386721602193085, + "learning_rate": 2.2619232791519287e-06, + "loss": 0.1358, + "step": 6011 + }, + { + "epoch": 0.5539226977472704, + "grad_norm": 0.8948899091915336, + "learning_rate": 2.26116496945033e-06, + "loss": 0.1298, + "step": 6012 + }, + { + "epoch": 0.5540148339245404, + "grad_norm": 0.9849000608057639, + "learning_rate": 2.260406681924589e-06, + "loss": 0.1398, + "step": 6013 + }, + { + "epoch": 0.5541069701018104, + "grad_norm": 0.9402527148448475, + "learning_rate": 2.2596484166451136e-06, + "loss": 0.1393, + "step": 6014 + }, + { + "epoch": 0.5541991062790805, + "grad_norm": 0.8512564356361474, + "learning_rate": 2.2588901736823087e-06, + "loss": 0.1195, + "step": 6015 + }, + { + "epoch": 0.5542912424563505, + "grad_norm": 0.9600645116664145, + "learning_rate": 2.2581319531065777e-06, + "loss": 0.1411, + "step": 6016 + }, + { + "epoch": 0.5543833786336205, + "grad_norm": 0.9064131823119506, + "learning_rate": 2.257373754988321e-06, + "loss": 0.1265, + "step": 6017 + }, + { + "epoch": 0.5544755148108905, + "grad_norm": 0.9335295798432109, + "learning_rate": 2.256615579397936e-06, + "loss": 0.1339, + "step": 6018 + }, + { + "epoch": 0.5545676509881605, + "grad_norm": 0.9618056064055769, + "learning_rate": 2.2558574264058218e-06, + "loss": 0.145, + "step": 6019 + }, + { + "epoch": 0.5546597871654305, + "grad_norm": 0.8809660515422949, + "learning_rate": 2.255099296082372e-06, + "loss": 0.1253, + "step": 6020 + }, + { + "epoch": 0.5547519233427005, + "grad_norm": 0.8893338573120851, + "learning_rate": 2.2543411884979775e-06, + "loss": 0.131, + "step": 6021 + }, + { + "epoch": 0.5548440595199705, + "grad_norm": 0.8911158292533096, + "learning_rate": 2.2535831037230313e-06, + "loss": 0.129, + "step": 6022 + }, + { + "epoch": 0.5549361956972405, + "grad_norm": 0.931882441099175, + "learning_rate": 2.2528250418279196e-06, + "loss": 0.1421, + "step": 6023 + }, + { + "epoch": 0.5550283318745105, + "grad_norm": 0.9395929443411103, + "learning_rate": 2.2520670028830305e-06, + "loss": 0.1337, + "step": 6024 + }, + { + "epoch": 0.5551204680517805, + "grad_norm": 0.9529079984593413, + "learning_rate": 2.251308986958746e-06, + "loss": 0.1342, + "step": 6025 + }, + { + "epoch": 0.5552126042290505, + "grad_norm": 0.9295137492491802, + "learning_rate": 2.250550994125449e-06, + "loss": 0.1377, + "step": 6026 + }, + { + "epoch": 0.5553047404063205, + "grad_norm": 0.9283395008536116, + "learning_rate": 2.249793024453519e-06, + "loss": 0.1243, + "step": 6027 + }, + { + "epoch": 0.5553968765835905, + "grad_norm": 0.9643312312736612, + "learning_rate": 2.2490350780133344e-06, + "loss": 0.1447, + "step": 6028 + }, + { + "epoch": 0.5554890127608606, + "grad_norm": 0.9532369619786419, + "learning_rate": 2.2482771548752684e-06, + "loss": 0.137, + "step": 6029 + }, + { + "epoch": 0.5555811489381306, + "grad_norm": 0.9746110125718069, + "learning_rate": 2.247519255109697e-06, + "loss": 0.1369, + "step": 6030 + }, + { + "epoch": 0.5556732851154006, + "grad_norm": 0.9429182322625936, + "learning_rate": 2.2467613787869886e-06, + "loss": 0.1387, + "step": 6031 + }, + { + "epoch": 0.5557654212926706, + "grad_norm": 0.8952924660189465, + "learning_rate": 2.2460035259775147e-06, + "loss": 0.1373, + "step": 6032 + }, + { + "epoch": 0.5558575574699406, + "grad_norm": 0.8812945184571047, + "learning_rate": 2.2452456967516404e-06, + "loss": 0.1349, + "step": 6033 + }, + { + "epoch": 0.5559496936472106, + "grad_norm": 0.9285111333188203, + "learning_rate": 2.2444878911797295e-06, + "loss": 0.1378, + "step": 6034 + }, + { + "epoch": 0.5560418298244806, + "grad_norm": 0.9269768243193539, + "learning_rate": 2.2437301093321467e-06, + "loss": 0.149, + "step": 6035 + }, + { + "epoch": 0.5561339660017506, + "grad_norm": 0.9081816960352027, + "learning_rate": 2.242972351279251e-06, + "loss": 0.133, + "step": 6036 + }, + { + "epoch": 0.5562261021790206, + "grad_norm": 0.9203965936383334, + "learning_rate": 2.242214617091399e-06, + "loss": 0.1269, + "step": 6037 + }, + { + "epoch": 0.5563182383562906, + "grad_norm": 0.8587985077915162, + "learning_rate": 2.241456906838948e-06, + "loss": 0.1248, + "step": 6038 + }, + { + "epoch": 0.5564103745335606, + "grad_norm": 0.9433740268698593, + "learning_rate": 2.2406992205922506e-06, + "loss": 0.1332, + "step": 6039 + }, + { + "epoch": 0.5565025107108306, + "grad_norm": 0.8494531923033956, + "learning_rate": 2.2399415584216595e-06, + "loss": 0.1199, + "step": 6040 + }, + { + "epoch": 0.5565946468881006, + "grad_norm": 0.8689474784825143, + "learning_rate": 2.2391839203975225e-06, + "loss": 0.1291, + "step": 6041 + }, + { + "epoch": 0.5566867830653707, + "grad_norm": 0.9208796138126546, + "learning_rate": 2.238426306590186e-06, + "loss": 0.1504, + "step": 6042 + }, + { + "epoch": 0.5567789192426407, + "grad_norm": 0.9175005106338083, + "learning_rate": 2.237668717069995e-06, + "loss": 0.1338, + "step": 6043 + }, + { + "epoch": 0.5568710554199107, + "grad_norm": 0.9590343105192461, + "learning_rate": 2.2369111519072917e-06, + "loss": 0.1402, + "step": 6044 + }, + { + "epoch": 0.5569631915971807, + "grad_norm": 0.9352154416418151, + "learning_rate": 2.2361536111724176e-06, + "loss": 0.1285, + "step": 6045 + }, + { + "epoch": 0.5570553277744507, + "grad_norm": 0.9108642890336781, + "learning_rate": 2.2353960949357082e-06, + "loss": 0.1327, + "step": 6046 + }, + { + "epoch": 0.5571474639517207, + "grad_norm": 0.9125190233288996, + "learning_rate": 2.2346386032674995e-06, + "loss": 0.137, + "step": 6047 + }, + { + "epoch": 0.5572396001289907, + "grad_norm": 0.9423488359590013, + "learning_rate": 2.2338811362381256e-06, + "loss": 0.1324, + "step": 6048 + }, + { + "epoch": 0.5573317363062606, + "grad_norm": 0.9392634101242259, + "learning_rate": 2.233123693917917e-06, + "loss": 0.1322, + "step": 6049 + }, + { + "epoch": 0.5574238724835306, + "grad_norm": 0.908005657369545, + "learning_rate": 2.232366276377201e-06, + "loss": 0.1258, + "step": 6050 + }, + { + "epoch": 0.5575160086608006, + "grad_norm": 0.8968615049336631, + "learning_rate": 2.2316088836863064e-06, + "loss": 0.1243, + "step": 6051 + }, + { + "epoch": 0.5576081448380706, + "grad_norm": 0.9926119163177404, + "learning_rate": 2.2308515159155546e-06, + "loss": 0.1417, + "step": 6052 + }, + { + "epoch": 0.5577002810153406, + "grad_norm": 0.9075124724568028, + "learning_rate": 2.23009417313527e-06, + "loss": 0.1259, + "step": 6053 + }, + { + "epoch": 0.5577924171926106, + "grad_norm": 0.9618556843783079, + "learning_rate": 2.2293368554157695e-06, + "loss": 0.1311, + "step": 6054 + }, + { + "epoch": 0.5578845533698806, + "grad_norm": 1.0245024369165074, + "learning_rate": 2.228579562827371e-06, + "loss": 0.1452, + "step": 6055 + }, + { + "epoch": 0.5579766895471507, + "grad_norm": 0.9542187116021007, + "learning_rate": 2.2278222954403895e-06, + "loss": 0.1345, + "step": 6056 + }, + { + "epoch": 0.5580688257244207, + "grad_norm": 1.026635884315615, + "learning_rate": 2.2270650533251383e-06, + "loss": 0.1531, + "step": 6057 + }, + { + "epoch": 0.5581609619016907, + "grad_norm": 0.9103746701579992, + "learning_rate": 2.2263078365519244e-06, + "loss": 0.1315, + "step": 6058 + }, + { + "epoch": 0.5582530980789607, + "grad_norm": 0.8750502768735856, + "learning_rate": 2.2255506451910584e-06, + "loss": 0.1389, + "step": 6059 + }, + { + "epoch": 0.5583452342562307, + "grad_norm": 1.0348797840452786, + "learning_rate": 2.2247934793128436e-06, + "loss": 0.1507, + "step": 6060 + }, + { + "epoch": 0.5584373704335007, + "grad_norm": 0.9387319109433172, + "learning_rate": 2.224036338987585e-06, + "loss": 0.1416, + "step": 6061 + }, + { + "epoch": 0.5585295066107707, + "grad_norm": 0.9648857566576942, + "learning_rate": 2.223279224285582e-06, + "loss": 0.1465, + "step": 6062 + }, + { + "epoch": 0.5586216427880407, + "grad_norm": 0.9512060091262989, + "learning_rate": 2.2225221352771316e-06, + "loss": 0.1333, + "step": 6063 + }, + { + "epoch": 0.5587137789653107, + "grad_norm": 0.8818452555644942, + "learning_rate": 2.221765072032532e-06, + "loss": 0.1354, + "step": 6064 + }, + { + "epoch": 0.5588059151425807, + "grad_norm": 0.9595230313609375, + "learning_rate": 2.2210080346220755e-06, + "loss": 0.1495, + "step": 6065 + }, + { + "epoch": 0.5588980513198507, + "grad_norm": 0.9406934954772517, + "learning_rate": 2.220251023116052e-06, + "loss": 0.1395, + "step": 6066 + }, + { + "epoch": 0.5589901874971207, + "grad_norm": 1.0184402402535877, + "learning_rate": 2.2194940375847517e-06, + "loss": 0.1491, + "step": 6067 + }, + { + "epoch": 0.5590823236743907, + "grad_norm": 0.9912445760663547, + "learning_rate": 2.2187370780984596e-06, + "loss": 0.1489, + "step": 6068 + }, + { + "epoch": 0.5591744598516608, + "grad_norm": 0.94668807549237, + "learning_rate": 2.2179801447274613e-06, + "loss": 0.1385, + "step": 6069 + }, + { + "epoch": 0.5592665960289308, + "grad_norm": 0.9082934245577071, + "learning_rate": 2.217223237542036e-06, + "loss": 0.1235, + "step": 6070 + }, + { + "epoch": 0.5593587322062008, + "grad_norm": 0.9420787600329253, + "learning_rate": 2.2164663566124635e-06, + "loss": 0.1376, + "step": 6071 + }, + { + "epoch": 0.5594508683834708, + "grad_norm": 0.8741240273373798, + "learning_rate": 2.2157095020090207e-06, + "loss": 0.1316, + "step": 6072 + }, + { + "epoch": 0.5595430045607408, + "grad_norm": 0.8657043268109199, + "learning_rate": 2.2149526738019802e-06, + "loss": 0.1233, + "step": 6073 + }, + { + "epoch": 0.5596351407380108, + "grad_norm": 0.8732124884259539, + "learning_rate": 2.2141958720616163e-06, + "loss": 0.1279, + "step": 6074 + }, + { + "epoch": 0.5597272769152808, + "grad_norm": 0.8525074108717983, + "learning_rate": 2.2134390968581958e-06, + "loss": 0.1243, + "step": 6075 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 0.8487356309861768, + "learning_rate": 2.212682348261985e-06, + "loss": 0.1242, + "step": 6076 + }, + { + "epoch": 0.5599115492698208, + "grad_norm": 0.9289207090322201, + "learning_rate": 2.21192562634325e-06, + "loss": 0.1336, + "step": 6077 + }, + { + "epoch": 0.5600036854470908, + "grad_norm": 0.8841161908632112, + "learning_rate": 2.2111689311722524e-06, + "loss": 0.1327, + "step": 6078 + }, + { + "epoch": 0.5600958216243608, + "grad_norm": 0.9007258520902498, + "learning_rate": 2.210412262819249e-06, + "loss": 0.1314, + "step": 6079 + }, + { + "epoch": 0.5601879578016308, + "grad_norm": 0.9034049217075238, + "learning_rate": 2.209655621354499e-06, + "loss": 0.1288, + "step": 6080 + }, + { + "epoch": 0.5602800939789008, + "grad_norm": 1.0561927888352458, + "learning_rate": 2.2088990068482554e-06, + "loss": 0.1638, + "step": 6081 + }, + { + "epoch": 0.5603722301561708, + "grad_norm": 0.9263479169980393, + "learning_rate": 2.208142419370771e-06, + "loss": 0.1286, + "step": 6082 + }, + { + "epoch": 0.5604643663334409, + "grad_norm": 0.876129774276463, + "learning_rate": 2.207385858992294e-06, + "loss": 0.1303, + "step": 6083 + }, + { + "epoch": 0.5605565025107109, + "grad_norm": 0.8577719677238493, + "learning_rate": 2.206629325783071e-06, + "loss": 0.1278, + "step": 6084 + }, + { + "epoch": 0.5606486386879809, + "grad_norm": 0.9145085290177857, + "learning_rate": 2.2058728198133466e-06, + "loss": 0.1449, + "step": 6085 + }, + { + "epoch": 0.5607407748652509, + "grad_norm": 0.9125478368838313, + "learning_rate": 2.2051163411533644e-06, + "loss": 0.1308, + "step": 6086 + }, + { + "epoch": 0.5608329110425209, + "grad_norm": 0.8282140216339626, + "learning_rate": 2.2043598898733597e-06, + "loss": 0.1243, + "step": 6087 + }, + { + "epoch": 0.5609250472197909, + "grad_norm": 0.8843568534788926, + "learning_rate": 2.2036034660435714e-06, + "loss": 0.1404, + "step": 6088 + }, + { + "epoch": 0.5610171833970609, + "grad_norm": 0.9683614071935065, + "learning_rate": 2.2028470697342334e-06, + "loss": 0.1459, + "step": 6089 + }, + { + "epoch": 0.5611093195743309, + "grad_norm": 0.9027082869152945, + "learning_rate": 2.2020907010155775e-06, + "loss": 0.136, + "step": 6090 + }, + { + "epoch": 0.5612014557516009, + "grad_norm": 0.9025715692301286, + "learning_rate": 2.2013343599578314e-06, + "loss": 0.1334, + "step": 6091 + }, + { + "epoch": 0.5612935919288708, + "grad_norm": 1.0019084022146867, + "learning_rate": 2.2005780466312224e-06, + "loss": 0.1453, + "step": 6092 + }, + { + "epoch": 0.5613857281061408, + "grad_norm": 0.8716930577679224, + "learning_rate": 2.1998217611059733e-06, + "loss": 0.118, + "step": 6093 + }, + { + "epoch": 0.5614778642834108, + "grad_norm": 1.0009481851360602, + "learning_rate": 2.1990655034523073e-06, + "loss": 0.1483, + "step": 6094 + }, + { + "epoch": 0.5615700004606808, + "grad_norm": 0.900153344517744, + "learning_rate": 2.198309273740441e-06, + "loss": 0.1184, + "step": 6095 + }, + { + "epoch": 0.5616621366379508, + "grad_norm": 0.9337923240361191, + "learning_rate": 2.1975530720405906e-06, + "loss": 0.1316, + "step": 6096 + }, + { + "epoch": 0.561754272815221, + "grad_norm": 0.8799886004445121, + "learning_rate": 2.1967968984229704e-06, + "loss": 0.131, + "step": 6097 + }, + { + "epoch": 0.5618464089924909, + "grad_norm": 0.9528519591932912, + "learning_rate": 2.1960407529577917e-06, + "loss": 0.1323, + "step": 6098 + }, + { + "epoch": 0.5619385451697609, + "grad_norm": 0.9216207218465383, + "learning_rate": 2.1952846357152603e-06, + "loss": 0.1332, + "step": 6099 + }, + { + "epoch": 0.5620306813470309, + "grad_norm": 0.902665622003663, + "learning_rate": 2.1945285467655843e-06, + "loss": 0.1312, + "step": 6100 + }, + { + "epoch": 0.5621228175243009, + "grad_norm": 0.9401051874207889, + "learning_rate": 2.1937724861789645e-06, + "loss": 0.1443, + "step": 6101 + }, + { + "epoch": 0.5622149537015709, + "grad_norm": 0.9345134562655586, + "learning_rate": 2.1930164540256035e-06, + "loss": 0.1343, + "step": 6102 + }, + { + "epoch": 0.5623070898788409, + "grad_norm": 0.8892763251463518, + "learning_rate": 2.1922604503756977e-06, + "loss": 0.1312, + "step": 6103 + }, + { + "epoch": 0.5623992260561109, + "grad_norm": 0.8633253656360871, + "learning_rate": 2.1915044752994417e-06, + "loss": 0.1254, + "step": 6104 + }, + { + "epoch": 0.5624913622333809, + "grad_norm": 0.9320587345393514, + "learning_rate": 2.1907485288670288e-06, + "loss": 0.1367, + "step": 6105 + }, + { + "epoch": 0.5625834984106509, + "grad_norm": 0.889817249819364, + "learning_rate": 2.1899926111486473e-06, + "loss": 0.1374, + "step": 6106 + }, + { + "epoch": 0.5626756345879209, + "grad_norm": 0.9262828568720447, + "learning_rate": 2.1892367222144863e-06, + "loss": 0.1449, + "step": 6107 + }, + { + "epoch": 0.5627677707651909, + "grad_norm": 0.8659566791382182, + "learning_rate": 2.1884808621347288e-06, + "loss": 0.1249, + "step": 6108 + }, + { + "epoch": 0.5628599069424609, + "grad_norm": 0.9488609398698363, + "learning_rate": 2.1877250309795565e-06, + "loss": 0.1458, + "step": 6109 + }, + { + "epoch": 0.562952043119731, + "grad_norm": 0.9129507239437383, + "learning_rate": 2.186969228819149e-06, + "loss": 0.1356, + "step": 6110 + }, + { + "epoch": 0.563044179297001, + "grad_norm": 0.9234955497308438, + "learning_rate": 2.1862134557236826e-06, + "loss": 0.1358, + "step": 6111 + }, + { + "epoch": 0.563136315474271, + "grad_norm": 0.9315935822477422, + "learning_rate": 2.1854577117633297e-06, + "loss": 0.1459, + "step": 6112 + }, + { + "epoch": 0.563228451651541, + "grad_norm": 0.961404001994605, + "learning_rate": 2.1847019970082628e-06, + "loss": 0.1366, + "step": 6113 + }, + { + "epoch": 0.563320587828811, + "grad_norm": 0.8541689771391625, + "learning_rate": 2.1839463115286484e-06, + "loss": 0.1188, + "step": 6114 + }, + { + "epoch": 0.563412724006081, + "grad_norm": 0.9470634734872366, + "learning_rate": 2.183190655394655e-06, + "loss": 0.1404, + "step": 6115 + }, + { + "epoch": 0.563504860183351, + "grad_norm": 0.9911048928912876, + "learning_rate": 2.182435028676442e-06, + "loss": 0.1413, + "step": 6116 + }, + { + "epoch": 0.563596996360621, + "grad_norm": 0.9182342845541454, + "learning_rate": 2.1816794314441704e-06, + "loss": 0.1234, + "step": 6117 + }, + { + "epoch": 0.563689132537891, + "grad_norm": 0.9403718290802439, + "learning_rate": 2.1809238637679984e-06, + "loss": 0.1295, + "step": 6118 + }, + { + "epoch": 0.563781268715161, + "grad_norm": 0.9045271472960527, + "learning_rate": 2.1801683257180807e-06, + "loss": 0.1291, + "step": 6119 + }, + { + "epoch": 0.563873404892431, + "grad_norm": 0.9327840459682971, + "learning_rate": 2.179412817364567e-06, + "loss": 0.1305, + "step": 6120 + }, + { + "epoch": 0.563965541069701, + "grad_norm": 0.9333718027134292, + "learning_rate": 2.1786573387776085e-06, + "loss": 0.1385, + "step": 6121 + }, + { + "epoch": 0.564057677246971, + "grad_norm": 0.9100700904648433, + "learning_rate": 2.17790189002735e-06, + "loss": 0.1284, + "step": 6122 + }, + { + "epoch": 0.564149813424241, + "grad_norm": 0.9593641910292168, + "learning_rate": 2.177146471183937e-06, + "loss": 0.1342, + "step": 6123 + }, + { + "epoch": 0.5642419496015111, + "grad_norm": 0.968082559377105, + "learning_rate": 2.176391082317508e-06, + "loss": 0.141, + "step": 6124 + }, + { + "epoch": 0.5643340857787811, + "grad_norm": 0.9149483271324426, + "learning_rate": 2.175635723498201e-06, + "loss": 0.1378, + "step": 6125 + }, + { + "epoch": 0.5644262219560511, + "grad_norm": 0.8897063217512015, + "learning_rate": 2.1748803947961533e-06, + "loss": 0.1264, + "step": 6126 + }, + { + "epoch": 0.5645183581333211, + "grad_norm": 0.8974960547750435, + "learning_rate": 2.174125096281496e-06, + "loss": 0.1409, + "step": 6127 + }, + { + "epoch": 0.5646104943105911, + "grad_norm": 0.9182853643197407, + "learning_rate": 2.1733698280243578e-06, + "loss": 0.1307, + "step": 6128 + }, + { + "epoch": 0.5647026304878611, + "grad_norm": 0.8913059045600129, + "learning_rate": 2.1726145900948664e-06, + "loss": 0.1326, + "step": 6129 + }, + { + "epoch": 0.5647947666651311, + "grad_norm": 0.9202971875767432, + "learning_rate": 2.1718593825631454e-06, + "loss": 0.1407, + "step": 6130 + }, + { + "epoch": 0.5648869028424011, + "grad_norm": 0.9154564708205672, + "learning_rate": 2.1711042054993164e-06, + "loss": 0.144, + "step": 6131 + }, + { + "epoch": 0.5649790390196711, + "grad_norm": 0.9188105344844977, + "learning_rate": 2.1703490589734976e-06, + "loss": 0.1406, + "step": 6132 + }, + { + "epoch": 0.5650711751969411, + "grad_norm": 0.9213698353945419, + "learning_rate": 2.1695939430558035e-06, + "loss": 0.1337, + "step": 6133 + }, + { + "epoch": 0.565163311374211, + "grad_norm": 0.9162759583166316, + "learning_rate": 2.1688388578163476e-06, + "loss": 0.1334, + "step": 6134 + }, + { + "epoch": 0.565255447551481, + "grad_norm": 0.941951590936026, + "learning_rate": 2.168083803325239e-06, + "loss": 0.1333, + "step": 6135 + }, + { + "epoch": 0.565347583728751, + "grad_norm": 0.9398691081517228, + "learning_rate": 2.167328779652586e-06, + "loss": 0.134, + "step": 6136 + }, + { + "epoch": 0.5654397199060212, + "grad_norm": 0.9567818118407774, + "learning_rate": 2.166573786868491e-06, + "loss": 0.1385, + "step": 6137 + }, + { + "epoch": 0.5655318560832912, + "grad_norm": 0.9417036150379874, + "learning_rate": 2.1658188250430556e-06, + "loss": 0.1428, + "step": 6138 + }, + { + "epoch": 0.5656239922605611, + "grad_norm": 0.8769247131311982, + "learning_rate": 2.1650638942463785e-06, + "loss": 0.131, + "step": 6139 + }, + { + "epoch": 0.5657161284378311, + "grad_norm": 0.9078529735616421, + "learning_rate": 2.1643089945485555e-06, + "loss": 0.1335, + "step": 6140 + }, + { + "epoch": 0.5658082646151011, + "grad_norm": 0.9324124879098438, + "learning_rate": 2.163554126019677e-06, + "loss": 0.143, + "step": 6141 + }, + { + "epoch": 0.5659004007923711, + "grad_norm": 0.9048706928934882, + "learning_rate": 2.162799288729835e-06, + "loss": 0.1359, + "step": 6142 + }, + { + "epoch": 0.5659925369696411, + "grad_norm": 0.9352246348847684, + "learning_rate": 2.162044482749115e-06, + "loss": 0.142, + "step": 6143 + }, + { + "epoch": 0.5660846731469111, + "grad_norm": 0.911385629437897, + "learning_rate": 2.161289708147602e-06, + "loss": 0.1347, + "step": 6144 + }, + { + "epoch": 0.5661768093241811, + "grad_norm": 1.011126666761643, + "learning_rate": 2.1605349649953756e-06, + "loss": 0.1568, + "step": 6145 + }, + { + "epoch": 0.5662689455014511, + "grad_norm": 0.8963212952587003, + "learning_rate": 2.1597802533625135e-06, + "loss": 0.126, + "step": 6146 + }, + { + "epoch": 0.5663610816787211, + "grad_norm": 0.8806494895413713, + "learning_rate": 2.159025573319092e-06, + "loss": 0.129, + "step": 6147 + }, + { + "epoch": 0.5664532178559911, + "grad_norm": 0.9027048432779949, + "learning_rate": 2.1582709249351834e-06, + "loss": 0.1256, + "step": 6148 + }, + { + "epoch": 0.5665453540332611, + "grad_norm": 0.926098276509541, + "learning_rate": 2.157516308280855e-06, + "loss": 0.1377, + "step": 6149 + }, + { + "epoch": 0.5666374902105311, + "grad_norm": 0.8869498269399431, + "learning_rate": 2.156761723426175e-06, + "loss": 0.1191, + "step": 6150 + }, + { + "epoch": 0.5667296263878012, + "grad_norm": 0.9599949611293014, + "learning_rate": 2.1560071704412052e-06, + "loss": 0.1386, + "step": 6151 + }, + { + "epoch": 0.5668217625650712, + "grad_norm": 0.9464471559188455, + "learning_rate": 2.155252649396008e-06, + "loss": 0.124, + "step": 6152 + }, + { + "epoch": 0.5669138987423412, + "grad_norm": 0.9102320016506573, + "learning_rate": 2.1544981603606386e-06, + "loss": 0.131, + "step": 6153 + }, + { + "epoch": 0.5670060349196112, + "grad_norm": 0.90157281915478, + "learning_rate": 2.1537437034051516e-06, + "loss": 0.1221, + "step": 6154 + }, + { + "epoch": 0.5670981710968812, + "grad_norm": 0.9285510101392009, + "learning_rate": 2.1529892785995996e-06, + "loss": 0.1279, + "step": 6155 + }, + { + "epoch": 0.5671903072741512, + "grad_norm": 0.9514959766388984, + "learning_rate": 2.152234886014031e-06, + "loss": 0.1314, + "step": 6156 + }, + { + "epoch": 0.5672824434514212, + "grad_norm": 0.9429262054723736, + "learning_rate": 2.1514805257184894e-06, + "loss": 0.1375, + "step": 6157 + }, + { + "epoch": 0.5673745796286912, + "grad_norm": 0.9573607033483201, + "learning_rate": 2.1507261977830198e-06, + "loss": 0.1345, + "step": 6158 + }, + { + "epoch": 0.5674667158059612, + "grad_norm": 0.9221981689933934, + "learning_rate": 2.1499719022776588e-06, + "loss": 0.1385, + "step": 6159 + }, + { + "epoch": 0.5675588519832312, + "grad_norm": 0.9011655547573784, + "learning_rate": 2.149217639272445e-06, + "loss": 0.1306, + "step": 6160 + }, + { + "epoch": 0.5676509881605012, + "grad_norm": 0.9171495889253741, + "learning_rate": 2.1484634088374124e-06, + "loss": 0.1327, + "step": 6161 + }, + { + "epoch": 0.5677431243377712, + "grad_norm": 1.0023115963468265, + "learning_rate": 2.1477092110425887e-06, + "loss": 0.1435, + "step": 6162 + }, + { + "epoch": 0.5678352605150412, + "grad_norm": 0.9694155127507335, + "learning_rate": 2.1469550459580025e-06, + "loss": 0.1493, + "step": 6163 + }, + { + "epoch": 0.5679273966923112, + "grad_norm": 0.8953821703774014, + "learning_rate": 2.1462009136536787e-06, + "loss": 0.1302, + "step": 6164 + }, + { + "epoch": 0.5680195328695813, + "grad_norm": 0.9109471052698274, + "learning_rate": 2.145446814199639e-06, + "loss": 0.1317, + "step": 6165 + }, + { + "epoch": 0.5681116690468513, + "grad_norm": 0.9592739524875322, + "learning_rate": 2.1446927476658996e-06, + "loss": 0.1398, + "step": 6166 + }, + { + "epoch": 0.5682038052241213, + "grad_norm": 0.8768594453644337, + "learning_rate": 2.1439387141224775e-06, + "loss": 0.1232, + "step": 6167 + }, + { + "epoch": 0.5682959414013913, + "grad_norm": 0.894972250282082, + "learning_rate": 2.1431847136393832e-06, + "loss": 0.1311, + "step": 6168 + }, + { + "epoch": 0.5683880775786613, + "grad_norm": 0.8918939489151908, + "learning_rate": 2.1424307462866283e-06, + "loss": 0.1292, + "step": 6169 + }, + { + "epoch": 0.5684802137559313, + "grad_norm": 0.902243814029342, + "learning_rate": 2.141676812134216e-06, + "loss": 0.1217, + "step": 6170 + }, + { + "epoch": 0.5685723499332013, + "grad_norm": 0.971252940336675, + "learning_rate": 2.1409229112521498e-06, + "loss": 0.1394, + "step": 6171 + }, + { + "epoch": 0.5686644861104713, + "grad_norm": 0.9021673999338276, + "learning_rate": 2.1401690437104306e-06, + "loss": 0.1227, + "step": 6172 + }, + { + "epoch": 0.5687566222877413, + "grad_norm": 0.9182448929575698, + "learning_rate": 2.139415209579055e-06, + "loss": 0.1361, + "step": 6173 + }, + { + "epoch": 0.5688487584650113, + "grad_norm": 0.9163926727615889, + "learning_rate": 2.1386614089280145e-06, + "loss": 0.1446, + "step": 6174 + }, + { + "epoch": 0.5689408946422813, + "grad_norm": 0.9611220324654529, + "learning_rate": 2.137907641827302e-06, + "loss": 0.1437, + "step": 6175 + }, + { + "epoch": 0.5690330308195513, + "grad_norm": 0.918237967741333, + "learning_rate": 2.1371539083469033e-06, + "loss": 0.1397, + "step": 6176 + }, + { + "epoch": 0.5691251669968213, + "grad_norm": 0.9040728280725138, + "learning_rate": 2.1364002085568046e-06, + "loss": 0.1355, + "step": 6177 + }, + { + "epoch": 0.5692173031740914, + "grad_norm": 0.9140522866500976, + "learning_rate": 2.135646542526985e-06, + "loss": 0.1288, + "step": 6178 + }, + { + "epoch": 0.5693094393513614, + "grad_norm": 0.9138310043305649, + "learning_rate": 2.1348929103274223e-06, + "loss": 0.1296, + "step": 6179 + }, + { + "epoch": 0.5694015755286314, + "grad_norm": 0.8987271086082552, + "learning_rate": 2.134139312028093e-06, + "loss": 0.1366, + "step": 6180 + }, + { + "epoch": 0.5694937117059014, + "grad_norm": 0.8915096965436777, + "learning_rate": 2.1333857476989685e-06, + "loss": 0.1348, + "step": 6181 + }, + { + "epoch": 0.5695858478831713, + "grad_norm": 0.9142618530436438, + "learning_rate": 2.1326322174100156e-06, + "loss": 0.1265, + "step": 6182 + }, + { + "epoch": 0.5696779840604413, + "grad_norm": 1.0020211952626925, + "learning_rate": 2.1318787212312015e-06, + "loss": 0.14, + "step": 6183 + }, + { + "epoch": 0.5697701202377113, + "grad_norm": 0.9206368798131661, + "learning_rate": 2.131125259232487e-06, + "loss": 0.1319, + "step": 6184 + }, + { + "epoch": 0.5698622564149813, + "grad_norm": 0.867245016991194, + "learning_rate": 2.130371831483833e-06, + "loss": 0.1226, + "step": 6185 + }, + { + "epoch": 0.5699543925922513, + "grad_norm": 0.8823565460940279, + "learning_rate": 2.1296184380551936e-06, + "loss": 0.128, + "step": 6186 + }, + { + "epoch": 0.5700465287695213, + "grad_norm": 0.9646875119052738, + "learning_rate": 2.128865079016522e-06, + "loss": 0.1402, + "step": 6187 + }, + { + "epoch": 0.5701386649467913, + "grad_norm": 0.9309522537125285, + "learning_rate": 2.128111754437768e-06, + "loss": 0.1391, + "step": 6188 + }, + { + "epoch": 0.5702308011240613, + "grad_norm": 0.895035105648232, + "learning_rate": 2.127358464388877e-06, + "loss": 0.1366, + "step": 6189 + }, + { + "epoch": 0.5703229373013313, + "grad_norm": 0.9393248599902015, + "learning_rate": 2.1266052089397936e-06, + "loss": 0.1467, + "step": 6190 + }, + { + "epoch": 0.5704150734786013, + "grad_norm": 0.9666709102936543, + "learning_rate": 2.1258519881604566e-06, + "loss": 0.1287, + "step": 6191 + }, + { + "epoch": 0.5705072096558714, + "grad_norm": 0.8937479680491346, + "learning_rate": 2.125098802120802e-06, + "loss": 0.1304, + "step": 6192 + }, + { + "epoch": 0.5705993458331414, + "grad_norm": 0.9700220829306405, + "learning_rate": 2.1243456508907643e-06, + "loss": 0.1394, + "step": 6193 + }, + { + "epoch": 0.5706914820104114, + "grad_norm": 0.9037958372928121, + "learning_rate": 2.1235925345402746e-06, + "loss": 0.1232, + "step": 6194 + }, + { + "epoch": 0.5707836181876814, + "grad_norm": 0.8889183496728102, + "learning_rate": 2.122839453139257e-06, + "loss": 0.1274, + "step": 6195 + }, + { + "epoch": 0.5708757543649514, + "grad_norm": 0.8926523541631516, + "learning_rate": 2.122086406757637e-06, + "loss": 0.1307, + "step": 6196 + }, + { + "epoch": 0.5709678905422214, + "grad_norm": 0.9152895066179517, + "learning_rate": 2.121333395465335e-06, + "loss": 0.1343, + "step": 6197 + }, + { + "epoch": 0.5710600267194914, + "grad_norm": 0.9557095594049324, + "learning_rate": 2.1205804193322685e-06, + "loss": 0.138, + "step": 6198 + }, + { + "epoch": 0.5711521628967614, + "grad_norm": 0.8443583063364482, + "learning_rate": 2.119827478428351e-06, + "loss": 0.1314, + "step": 6199 + }, + { + "epoch": 0.5712442990740314, + "grad_norm": 0.9528291650337537, + "learning_rate": 2.1190745728234916e-06, + "loss": 0.1361, + "step": 6200 + }, + { + "epoch": 0.5713364352513014, + "grad_norm": 0.895194785522947, + "learning_rate": 2.1183217025876e-06, + "loss": 0.1381, + "step": 6201 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.9073430870330993, + "learning_rate": 2.1175688677905804e-06, + "loss": 0.1211, + "step": 6202 + }, + { + "epoch": 0.5715207076058414, + "grad_norm": 0.9412625238043502, + "learning_rate": 2.116816068502331e-06, + "loss": 0.1392, + "step": 6203 + }, + { + "epoch": 0.5716128437831114, + "grad_norm": 0.8922180197402308, + "learning_rate": 2.1160633047927515e-06, + "loss": 0.1346, + "step": 6204 + }, + { + "epoch": 0.5717049799603815, + "grad_norm": 0.9184043552563133, + "learning_rate": 2.115310576731735e-06, + "loss": 0.132, + "step": 6205 + }, + { + "epoch": 0.5717971161376515, + "grad_norm": 0.8905117266588061, + "learning_rate": 2.114557884389174e-06, + "loss": 0.1338, + "step": 6206 + }, + { + "epoch": 0.5718892523149215, + "grad_norm": 1.0157498803550593, + "learning_rate": 2.1138052278349543e-06, + "loss": 0.1359, + "step": 6207 + }, + { + "epoch": 0.5719813884921915, + "grad_norm": 0.91678227280953, + "learning_rate": 2.1130526071389603e-06, + "loss": 0.1155, + "step": 6208 + }, + { + "epoch": 0.5720735246694615, + "grad_norm": 0.9086101316942187, + "learning_rate": 2.1123000223710737e-06, + "loss": 0.1303, + "step": 6209 + }, + { + "epoch": 0.5721656608467315, + "grad_norm": 0.9277009997023078, + "learning_rate": 2.1115474736011725e-06, + "loss": 0.1321, + "step": 6210 + }, + { + "epoch": 0.5722577970240015, + "grad_norm": 0.9127183367904651, + "learning_rate": 2.110794960899129e-06, + "loss": 0.1383, + "step": 6211 + }, + { + "epoch": 0.5723499332012715, + "grad_norm": 0.9983831214605733, + "learning_rate": 2.1100424843348157e-06, + "loss": 0.1328, + "step": 6212 + }, + { + "epoch": 0.5724420693785415, + "grad_norm": 0.9561191380206018, + "learning_rate": 2.1092900439780993e-06, + "loss": 0.1393, + "step": 6213 + }, + { + "epoch": 0.5725342055558115, + "grad_norm": 0.9076953689928847, + "learning_rate": 2.108537639898845e-06, + "loss": 0.1343, + "step": 6214 + }, + { + "epoch": 0.5726263417330815, + "grad_norm": 0.9230865489132, + "learning_rate": 2.1077852721669132e-06, + "loss": 0.125, + "step": 6215 + }, + { + "epoch": 0.5727184779103515, + "grad_norm": 0.8781994139944165, + "learning_rate": 2.10703294085216e-06, + "loss": 0.1222, + "step": 6216 + }, + { + "epoch": 0.5728106140876215, + "grad_norm": 0.9418352709870952, + "learning_rate": 2.1062806460244415e-06, + "loss": 0.1351, + "step": 6217 + }, + { + "epoch": 0.5729027502648915, + "grad_norm": 0.9333115918970857, + "learning_rate": 2.1055283877536066e-06, + "loss": 0.1376, + "step": 6218 + }, + { + "epoch": 0.5729948864421616, + "grad_norm": 0.8909079069488656, + "learning_rate": 2.1047761661095043e-06, + "loss": 0.134, + "step": 6219 + }, + { + "epoch": 0.5730870226194316, + "grad_norm": 0.923389929285855, + "learning_rate": 2.1040239811619774e-06, + "loss": 0.1382, + "step": 6220 + }, + { + "epoch": 0.5731791587967016, + "grad_norm": 0.900590742643775, + "learning_rate": 2.1032718329808656e-06, + "loss": 0.135, + "step": 6221 + }, + { + "epoch": 0.5732712949739716, + "grad_norm": 0.8999258835799233, + "learning_rate": 2.102519721636007e-06, + "loss": 0.1269, + "step": 6222 + }, + { + "epoch": 0.5733634311512416, + "grad_norm": 0.9115861566104987, + "learning_rate": 2.1017676471972363e-06, + "loss": 0.1194, + "step": 6223 + }, + { + "epoch": 0.5734555673285116, + "grad_norm": 0.9547979195731437, + "learning_rate": 2.101015609734381e-06, + "loss": 0.1358, + "step": 6224 + }, + { + "epoch": 0.5735477035057815, + "grad_norm": 0.8987983423045988, + "learning_rate": 2.1002636093172694e-06, + "loss": 0.1404, + "step": 6225 + }, + { + "epoch": 0.5736398396830515, + "grad_norm": 0.9396026936959258, + "learning_rate": 2.099511646015725e-06, + "loss": 0.1358, + "step": 6226 + }, + { + "epoch": 0.5737319758603215, + "grad_norm": 0.9380954721223328, + "learning_rate": 2.098759719899568e-06, + "loss": 0.1365, + "step": 6227 + }, + { + "epoch": 0.5738241120375915, + "grad_norm": 0.9456480775172186, + "learning_rate": 2.0980078310386135e-06, + "loss": 0.1368, + "step": 6228 + }, + { + "epoch": 0.5739162482148615, + "grad_norm": 0.9404251140274369, + "learning_rate": 2.097255979502675e-06, + "loss": 0.1374, + "step": 6229 + }, + { + "epoch": 0.5740083843921315, + "grad_norm": 0.9329114486300538, + "learning_rate": 2.096504165361562e-06, + "loss": 0.1503, + "step": 6230 + }, + { + "epoch": 0.5741005205694015, + "grad_norm": 0.9589004596645393, + "learning_rate": 2.0957523886850815e-06, + "loss": 0.142, + "step": 6231 + }, + { + "epoch": 0.5741926567466715, + "grad_norm": 0.9469984435269726, + "learning_rate": 2.095000649543035e-06, + "loss": 0.1287, + "step": 6232 + }, + { + "epoch": 0.5742847929239416, + "grad_norm": 0.8447650874717392, + "learning_rate": 2.0942489480052214e-06, + "loss": 0.1242, + "step": 6233 + }, + { + "epoch": 0.5743769291012116, + "grad_norm": 0.8887316814688837, + "learning_rate": 2.093497284141436e-06, + "loss": 0.1393, + "step": 6234 + }, + { + "epoch": 0.5744690652784816, + "grad_norm": 0.9034629564353456, + "learning_rate": 2.0927456580214733e-06, + "loss": 0.1421, + "step": 6235 + }, + { + "epoch": 0.5745612014557516, + "grad_norm": 0.8562329900422688, + "learning_rate": 2.091994069715119e-06, + "loss": 0.1242, + "step": 6236 + }, + { + "epoch": 0.5746533376330216, + "grad_norm": 0.9299994710734435, + "learning_rate": 2.0912425192921588e-06, + "loss": 0.1312, + "step": 6237 + }, + { + "epoch": 0.5747454738102916, + "grad_norm": 0.9446203251305053, + "learning_rate": 2.0904910068223745e-06, + "loss": 0.147, + "step": 6238 + }, + { + "epoch": 0.5748376099875616, + "grad_norm": 0.8745483561720717, + "learning_rate": 2.0897395323755464e-06, + "loss": 0.115, + "step": 6239 + }, + { + "epoch": 0.5749297461648316, + "grad_norm": 0.863217780958388, + "learning_rate": 2.088988096021445e-06, + "loss": 0.1237, + "step": 6240 + }, + { + "epoch": 0.5750218823421016, + "grad_norm": 0.9183400050395388, + "learning_rate": 2.088236697829843e-06, + "loss": 0.1366, + "step": 6241 + }, + { + "epoch": 0.5751140185193716, + "grad_norm": 0.8436149595259076, + "learning_rate": 2.0874853378705085e-06, + "loss": 0.1229, + "step": 6242 + }, + { + "epoch": 0.5752061546966416, + "grad_norm": 0.9275652931055757, + "learning_rate": 2.0867340162132054e-06, + "loss": 0.1303, + "step": 6243 + }, + { + "epoch": 0.5752982908739116, + "grad_norm": 0.9302318599693131, + "learning_rate": 2.0859827329276926e-06, + "loss": 0.1418, + "step": 6244 + }, + { + "epoch": 0.5753904270511816, + "grad_norm": 0.95815229518161, + "learning_rate": 2.0852314880837278e-06, + "loss": 0.1397, + "step": 6245 + }, + { + "epoch": 0.5754825632284517, + "grad_norm": 0.8904343622778238, + "learning_rate": 2.0844802817510633e-06, + "loss": 0.1341, + "step": 6246 + }, + { + "epoch": 0.5755746994057217, + "grad_norm": 0.862398919434429, + "learning_rate": 2.08372911399945e-06, + "loss": 0.1185, + "step": 6247 + }, + { + "epoch": 0.5756668355829917, + "grad_norm": 0.9395104140482388, + "learning_rate": 2.0829779848986337e-06, + "loss": 0.1445, + "step": 6248 + }, + { + "epoch": 0.5757589717602617, + "grad_norm": 0.8877225576451748, + "learning_rate": 2.0822268945183555e-06, + "loss": 0.1301, + "step": 6249 + }, + { + "epoch": 0.5758511079375317, + "grad_norm": 0.8670131634340984, + "learning_rate": 2.081475842928356e-06, + "loss": 0.1242, + "step": 6250 + }, + { + "epoch": 0.5759432441148017, + "grad_norm": 0.965400954698164, + "learning_rate": 2.0807248301983682e-06, + "loss": 0.1479, + "step": 6251 + }, + { + "epoch": 0.5760353802920717, + "grad_norm": 0.8849794715504932, + "learning_rate": 2.0799738563981263e-06, + "loss": 0.1287, + "step": 6252 + }, + { + "epoch": 0.5761275164693417, + "grad_norm": 0.905858445926939, + "learning_rate": 2.079222921597357e-06, + "loss": 0.1386, + "step": 6253 + }, + { + "epoch": 0.5762196526466117, + "grad_norm": 0.9180158841915589, + "learning_rate": 2.078472025865784e-06, + "loss": 0.1375, + "step": 6254 + }, + { + "epoch": 0.5763117888238817, + "grad_norm": 0.8919740094962243, + "learning_rate": 2.077721169273129e-06, + "loss": 0.1344, + "step": 6255 + }, + { + "epoch": 0.5764039250011517, + "grad_norm": 0.9292425036753654, + "learning_rate": 2.0769703518891096e-06, + "loss": 0.1339, + "step": 6256 + }, + { + "epoch": 0.5764960611784217, + "grad_norm": 0.9122724926415321, + "learning_rate": 2.076219573783437e-06, + "loss": 0.1295, + "step": 6257 + }, + { + "epoch": 0.5765881973556917, + "grad_norm": 0.9458670285472852, + "learning_rate": 2.075468835025824e-06, + "loss": 0.1332, + "step": 6258 + }, + { + "epoch": 0.5766803335329617, + "grad_norm": 1.0076418370512317, + "learning_rate": 2.0747181356859743e-06, + "loss": 0.1383, + "step": 6259 + }, + { + "epoch": 0.5767724697102318, + "grad_norm": 0.8925007436761428, + "learning_rate": 2.073967475833593e-06, + "loss": 0.1301, + "step": 6260 + }, + { + "epoch": 0.5768646058875018, + "grad_norm": 0.9879169780568002, + "learning_rate": 2.0732168555383764e-06, + "loss": 0.1358, + "step": 6261 + }, + { + "epoch": 0.5769567420647718, + "grad_norm": 0.942015182599193, + "learning_rate": 2.0724662748700205e-06, + "loss": 0.1328, + "step": 6262 + }, + { + "epoch": 0.5770488782420418, + "grad_norm": 0.910961493062362, + "learning_rate": 2.0717157338982172e-06, + "loss": 0.1328, + "step": 6263 + }, + { + "epoch": 0.5771410144193118, + "grad_norm": 0.9031369142798781, + "learning_rate": 2.0709652326926547e-06, + "loss": 0.1309, + "step": 6264 + }, + { + "epoch": 0.5772331505965818, + "grad_norm": 1.0202082284494964, + "learning_rate": 2.070214771323015e-06, + "loss": 0.1547, + "step": 6265 + }, + { + "epoch": 0.5773252867738518, + "grad_norm": 0.8730129929135148, + "learning_rate": 2.0694643498589816e-06, + "loss": 0.1291, + "step": 6266 + }, + { + "epoch": 0.5774174229511218, + "grad_norm": 0.8652049603519832, + "learning_rate": 2.0687139683702284e-06, + "loss": 0.1235, + "step": 6267 + }, + { + "epoch": 0.5775095591283917, + "grad_norm": 0.9304371949873086, + "learning_rate": 2.067963626926431e-06, + "loss": 0.1375, + "step": 6268 + }, + { + "epoch": 0.5776016953056617, + "grad_norm": 0.932762906253494, + "learning_rate": 2.0672133255972567e-06, + "loss": 0.1354, + "step": 6269 + }, + { + "epoch": 0.5776938314829317, + "grad_norm": 0.9551092367759105, + "learning_rate": 2.066463064452371e-06, + "loss": 0.1378, + "step": 6270 + }, + { + "epoch": 0.5777859676602017, + "grad_norm": 0.9374294366319679, + "learning_rate": 2.0657128435614372e-06, + "loss": 0.1426, + "step": 6271 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 0.9424685529587546, + "learning_rate": 2.0649626629941134e-06, + "loss": 0.1383, + "step": 6272 + }, + { + "epoch": 0.5779702400147418, + "grad_norm": 0.8798221517310623, + "learning_rate": 2.0642125228200515e-06, + "loss": 0.1231, + "step": 6273 + }, + { + "epoch": 0.5780623761920118, + "grad_norm": 0.8718920010933575, + "learning_rate": 2.0634624231089047e-06, + "loss": 0.1317, + "step": 6274 + }, + { + "epoch": 0.5781545123692818, + "grad_norm": 0.8900509963420861, + "learning_rate": 2.062712363930318e-06, + "loss": 0.1306, + "step": 6275 + }, + { + "epoch": 0.5782466485465518, + "grad_norm": 0.9602083244915917, + "learning_rate": 2.0619623453539365e-06, + "loss": 0.1311, + "step": 6276 + }, + { + "epoch": 0.5783387847238218, + "grad_norm": 0.9409875054246267, + "learning_rate": 2.0612123674493983e-06, + "loss": 0.1349, + "step": 6277 + }, + { + "epoch": 0.5784309209010918, + "grad_norm": 0.954883571528099, + "learning_rate": 2.060462430286338e-06, + "loss": 0.1488, + "step": 6278 + }, + { + "epoch": 0.5785230570783618, + "grad_norm": 0.924233430850071, + "learning_rate": 2.059712533934389e-06, + "loss": 0.1353, + "step": 6279 + }, + { + "epoch": 0.5786151932556318, + "grad_norm": 0.9041734680736925, + "learning_rate": 2.0589626784631784e-06, + "loss": 0.1323, + "step": 6280 + }, + { + "epoch": 0.5787073294329018, + "grad_norm": 0.9837493412140981, + "learning_rate": 2.0582128639423316e-06, + "loss": 0.1536, + "step": 6281 + }, + { + "epoch": 0.5787994656101718, + "grad_norm": 0.974634864030492, + "learning_rate": 2.057463090441467e-06, + "loss": 0.1436, + "step": 6282 + }, + { + "epoch": 0.5788916017874418, + "grad_norm": 0.9432227471990523, + "learning_rate": 2.056713358030202e-06, + "loss": 0.1441, + "step": 6283 + }, + { + "epoch": 0.5789837379647118, + "grad_norm": 0.8964335561210266, + "learning_rate": 2.0559636667781493e-06, + "loss": 0.1318, + "step": 6284 + }, + { + "epoch": 0.5790758741419818, + "grad_norm": 0.8913206385209099, + "learning_rate": 2.055214016754919e-06, + "loss": 0.1303, + "step": 6285 + }, + { + "epoch": 0.5791680103192518, + "grad_norm": 0.9267354488882299, + "learning_rate": 2.0544644080301138e-06, + "loss": 0.1401, + "step": 6286 + }, + { + "epoch": 0.5792601464965219, + "grad_norm": 0.9305789259939854, + "learning_rate": 2.053714840673337e-06, + "loss": 0.1358, + "step": 6287 + }, + { + "epoch": 0.5793522826737919, + "grad_norm": 0.9084611989875776, + "learning_rate": 2.0529653147541844e-06, + "loss": 0.1357, + "step": 6288 + }, + { + "epoch": 0.5794444188510619, + "grad_norm": 0.9851294864819644, + "learning_rate": 2.0522158303422518e-06, + "loss": 0.1441, + "step": 6289 + }, + { + "epoch": 0.5795365550283319, + "grad_norm": 0.9148519072416933, + "learning_rate": 2.051466387507127e-06, + "loss": 0.1353, + "step": 6290 + }, + { + "epoch": 0.5796286912056019, + "grad_norm": 0.8849124093932106, + "learning_rate": 2.0507169863183956e-06, + "loss": 0.1207, + "step": 6291 + }, + { + "epoch": 0.5797208273828719, + "grad_norm": 0.9565706832013744, + "learning_rate": 2.0499676268456412e-06, + "loss": 0.1329, + "step": 6292 + }, + { + "epoch": 0.5798129635601419, + "grad_norm": 0.9551972700761554, + "learning_rate": 2.0492183091584414e-06, + "loss": 0.1381, + "step": 6293 + }, + { + "epoch": 0.5799050997374119, + "grad_norm": 0.8916816492853256, + "learning_rate": 2.048469033326369e-06, + "loss": 0.139, + "step": 6294 + }, + { + "epoch": 0.5799972359146819, + "grad_norm": 0.9837000019154031, + "learning_rate": 2.047719799418996e-06, + "loss": 0.1408, + "step": 6295 + }, + { + "epoch": 0.5800893720919519, + "grad_norm": 0.9296646554316655, + "learning_rate": 2.046970607505888e-06, + "loss": 0.1238, + "step": 6296 + }, + { + "epoch": 0.5801815082692219, + "grad_norm": 0.9916491510040498, + "learning_rate": 2.046221457656609e-06, + "loss": 0.1467, + "step": 6297 + }, + { + "epoch": 0.5802736444464919, + "grad_norm": 1.0222186289839617, + "learning_rate": 2.0454723499407158e-06, + "loss": 0.142, + "step": 6298 + }, + { + "epoch": 0.5803657806237619, + "grad_norm": 0.9199317685695529, + "learning_rate": 2.044723284427763e-06, + "loss": 0.1337, + "step": 6299 + }, + { + "epoch": 0.5804579168010319, + "grad_norm": 0.9431538247478978, + "learning_rate": 2.043974261187303e-06, + "loss": 0.1311, + "step": 6300 + }, + { + "epoch": 0.580550052978302, + "grad_norm": 0.935052064731194, + "learning_rate": 2.0432252802888827e-06, + "loss": 0.1319, + "step": 6301 + }, + { + "epoch": 0.580642189155572, + "grad_norm": 0.9305806123779443, + "learning_rate": 2.042476341802043e-06, + "loss": 0.1321, + "step": 6302 + }, + { + "epoch": 0.580734325332842, + "grad_norm": 0.9340840745588856, + "learning_rate": 2.0417274457963247e-06, + "loss": 0.1351, + "step": 6303 + }, + { + "epoch": 0.580826461510112, + "grad_norm": 0.8279176393267009, + "learning_rate": 2.040978592341262e-06, + "loss": 0.1174, + "step": 6304 + }, + { + "epoch": 0.580918597687382, + "grad_norm": 0.8983819696246862, + "learning_rate": 2.0402297815063867e-06, + "loss": 0.1301, + "step": 6305 + }, + { + "epoch": 0.581010733864652, + "grad_norm": 0.9312038343939995, + "learning_rate": 2.0394810133612263e-06, + "loss": 0.1365, + "step": 6306 + }, + { + "epoch": 0.581102870041922, + "grad_norm": 0.8673030702969664, + "learning_rate": 2.0387322879753025e-06, + "loss": 0.1295, + "step": 6307 + }, + { + "epoch": 0.581195006219192, + "grad_norm": 0.9257544184093895, + "learning_rate": 2.0379836054181356e-06, + "loss": 0.1348, + "step": 6308 + }, + { + "epoch": 0.581287142396462, + "grad_norm": 0.9720242295003335, + "learning_rate": 2.0372349657592404e-06, + "loss": 0.1422, + "step": 6309 + }, + { + "epoch": 0.581379278573732, + "grad_norm": 0.9263012800097226, + "learning_rate": 2.0364863690681293e-06, + "loss": 0.1291, + "step": 6310 + }, + { + "epoch": 0.581471414751002, + "grad_norm": 0.8925339009739691, + "learning_rate": 2.0357378154143083e-06, + "loss": 0.1305, + "step": 6311 + }, + { + "epoch": 0.5815635509282719, + "grad_norm": 0.8770195000598113, + "learning_rate": 2.0349893048672806e-06, + "loss": 0.1249, + "step": 6312 + }, + { + "epoch": 0.5816556871055419, + "grad_norm": 0.9175165472111774, + "learning_rate": 2.0342408374965457e-06, + "loss": 0.136, + "step": 6313 + }, + { + "epoch": 0.581747823282812, + "grad_norm": 0.9407699994116291, + "learning_rate": 2.033492413371601e-06, + "loss": 0.137, + "step": 6314 + }, + { + "epoch": 0.581839959460082, + "grad_norm": 0.9116296301855412, + "learning_rate": 2.0327440325619345e-06, + "loss": 0.1299, + "step": 6315 + }, + { + "epoch": 0.581932095637352, + "grad_norm": 0.9369925685666635, + "learning_rate": 2.0319956951370346e-06, + "loss": 0.1427, + "step": 6316 + }, + { + "epoch": 0.582024231814622, + "grad_norm": 0.9385348869940181, + "learning_rate": 2.0312474011663857e-06, + "loss": 0.1479, + "step": 6317 + }, + { + "epoch": 0.582116367991892, + "grad_norm": 0.9268995831678168, + "learning_rate": 2.030499150719466e-06, + "loss": 0.1337, + "step": 6318 + }, + { + "epoch": 0.582208504169162, + "grad_norm": 0.9028226590767937, + "learning_rate": 2.02975094386575e-06, + "loss": 0.1309, + "step": 6319 + }, + { + "epoch": 0.582300640346432, + "grad_norm": 0.9370832163666957, + "learning_rate": 2.02900278067471e-06, + "loss": 0.1258, + "step": 6320 + }, + { + "epoch": 0.582392776523702, + "grad_norm": 0.9655358616877272, + "learning_rate": 2.0282546612158116e-06, + "loss": 0.1372, + "step": 6321 + }, + { + "epoch": 0.582484912700972, + "grad_norm": 0.9148607737633211, + "learning_rate": 2.02750658555852e-06, + "loss": 0.1211, + "step": 6322 + }, + { + "epoch": 0.582577048878242, + "grad_norm": 0.8864078738002699, + "learning_rate": 2.026758553772292e-06, + "loss": 0.1305, + "step": 6323 + }, + { + "epoch": 0.582669185055512, + "grad_norm": 1.002989551156702, + "learning_rate": 2.026010565926583e-06, + "loss": 0.152, + "step": 6324 + }, + { + "epoch": 0.582761321232782, + "grad_norm": 0.9183628328083221, + "learning_rate": 2.0252626220908448e-06, + "loss": 0.1376, + "step": 6325 + }, + { + "epoch": 0.582853457410052, + "grad_norm": 0.8937942177656565, + "learning_rate": 2.0245147223345235e-06, + "loss": 0.1316, + "step": 6326 + }, + { + "epoch": 0.582945593587322, + "grad_norm": 0.8875351589120594, + "learning_rate": 2.0237668667270603e-06, + "loss": 0.1303, + "step": 6327 + }, + { + "epoch": 0.5830377297645921, + "grad_norm": 0.9263707122808156, + "learning_rate": 2.023019055337895e-06, + "loss": 0.1328, + "step": 6328 + }, + { + "epoch": 0.5831298659418621, + "grad_norm": 0.915973919313679, + "learning_rate": 2.0222712882364617e-06, + "loss": 0.1386, + "step": 6329 + }, + { + "epoch": 0.5832220021191321, + "grad_norm": 0.9156431422133637, + "learning_rate": 2.0215235654921912e-06, + "loss": 0.1362, + "step": 6330 + }, + { + "epoch": 0.5833141382964021, + "grad_norm": 0.8978505787324376, + "learning_rate": 2.0207758871745088e-06, + "loss": 0.1229, + "step": 6331 + }, + { + "epoch": 0.5834062744736721, + "grad_norm": 0.8691338078911657, + "learning_rate": 2.0200282533528367e-06, + "loss": 0.1373, + "step": 6332 + }, + { + "epoch": 0.5834984106509421, + "grad_norm": 0.938257513392887, + "learning_rate": 2.019280664096593e-06, + "loss": 0.1378, + "step": 6333 + }, + { + "epoch": 0.5835905468282121, + "grad_norm": 0.8559661977591175, + "learning_rate": 2.018533119475191e-06, + "loss": 0.1215, + "step": 6334 + }, + { + "epoch": 0.5836826830054821, + "grad_norm": 0.9573360476871194, + "learning_rate": 2.017785619558042e-06, + "loss": 0.1355, + "step": 6335 + }, + { + "epoch": 0.5837748191827521, + "grad_norm": 0.8571583060040944, + "learning_rate": 2.0170381644145492e-06, + "loss": 0.1224, + "step": 6336 + }, + { + "epoch": 0.5838669553600221, + "grad_norm": 0.9268463083225386, + "learning_rate": 2.0162907541141146e-06, + "loss": 0.1348, + "step": 6337 + }, + { + "epoch": 0.5839590915372921, + "grad_norm": 0.8793654369992728, + "learning_rate": 2.0155433887261362e-06, + "loss": 0.1289, + "step": 6338 + }, + { + "epoch": 0.5840512277145621, + "grad_norm": 0.9334765040012325, + "learning_rate": 2.0147960683200064e-06, + "loss": 0.1351, + "step": 6339 + }, + { + "epoch": 0.5841433638918321, + "grad_norm": 0.913189689611009, + "learning_rate": 2.014048792965113e-06, + "loss": 0.1306, + "step": 6340 + }, + { + "epoch": 0.5842355000691022, + "grad_norm": 0.9441334337706997, + "learning_rate": 2.013301562730842e-06, + "loss": 0.1384, + "step": 6341 + }, + { + "epoch": 0.5843276362463722, + "grad_norm": 0.8629368354740582, + "learning_rate": 2.0125543776865723e-06, + "loss": 0.1213, + "step": 6342 + }, + { + "epoch": 0.5844197724236422, + "grad_norm": 0.8799947858308416, + "learning_rate": 2.011807237901683e-06, + "loss": 0.1168, + "step": 6343 + }, + { + "epoch": 0.5845119086009122, + "grad_norm": 0.9652486866537766, + "learning_rate": 2.011060143445543e-06, + "loss": 0.141, + "step": 6344 + }, + { + "epoch": 0.5846040447781822, + "grad_norm": 0.8773569096523852, + "learning_rate": 2.010313094387521e-06, + "loss": 0.1324, + "step": 6345 + }, + { + "epoch": 0.5846961809554522, + "grad_norm": 0.9465727325543921, + "learning_rate": 2.0095660907969816e-06, + "loss": 0.1331, + "step": 6346 + }, + { + "epoch": 0.5847883171327222, + "grad_norm": 0.8830422552735072, + "learning_rate": 2.0088191327432838e-06, + "loss": 0.1205, + "step": 6347 + }, + { + "epoch": 0.5848804533099922, + "grad_norm": 0.8911291650874078, + "learning_rate": 2.0080722202957813e-06, + "loss": 0.1269, + "step": 6348 + }, + { + "epoch": 0.5849725894872622, + "grad_norm": 0.919726354522966, + "learning_rate": 2.0073253535238266e-06, + "loss": 0.1348, + "step": 6349 + }, + { + "epoch": 0.5850647256645322, + "grad_norm": 0.9153772381047109, + "learning_rate": 2.0065785324967654e-06, + "loss": 0.1231, + "step": 6350 + }, + { + "epoch": 0.5851568618418022, + "grad_norm": 0.9585102905267247, + "learning_rate": 2.0058317572839418e-06, + "loss": 0.1361, + "step": 6351 + }, + { + "epoch": 0.5852489980190722, + "grad_norm": 0.944155411460431, + "learning_rate": 2.0050850279546918e-06, + "loss": 0.1421, + "step": 6352 + }, + { + "epoch": 0.5853411341963421, + "grad_norm": 0.9449499207179856, + "learning_rate": 2.00433834457835e-06, + "loss": 0.1311, + "step": 6353 + }, + { + "epoch": 0.5854332703736121, + "grad_norm": 0.8963039607572094, + "learning_rate": 2.0035917072242463e-06, + "loss": 0.1365, + "step": 6354 + }, + { + "epoch": 0.5855254065508823, + "grad_norm": 0.9372716167180863, + "learning_rate": 2.002845115961707e-06, + "loss": 0.1304, + "step": 6355 + }, + { + "epoch": 0.5856175427281523, + "grad_norm": 0.9593342332592468, + "learning_rate": 2.002098570860051e-06, + "loss": 0.1341, + "step": 6356 + }, + { + "epoch": 0.5857096789054222, + "grad_norm": 0.9355281745635248, + "learning_rate": 2.001352071988597e-06, + "loss": 0.1395, + "step": 6357 + }, + { + "epoch": 0.5858018150826922, + "grad_norm": 0.9088372778534395, + "learning_rate": 2.000605619416656e-06, + "loss": 0.133, + "step": 6358 + }, + { + "epoch": 0.5858939512599622, + "grad_norm": 0.9209424144820311, + "learning_rate": 1.999859213213538e-06, + "loss": 0.1437, + "step": 6359 + }, + { + "epoch": 0.5859860874372322, + "grad_norm": 0.9475953953719402, + "learning_rate": 1.9991128534485454e-06, + "loss": 0.1315, + "step": 6360 + }, + { + "epoch": 0.5860782236145022, + "grad_norm": 0.9238882849548906, + "learning_rate": 1.998366540190978e-06, + "loss": 0.1228, + "step": 6361 + }, + { + "epoch": 0.5861703597917722, + "grad_norm": 0.937878020643912, + "learning_rate": 1.9976202735101314e-06, + "loss": 0.1385, + "step": 6362 + }, + { + "epoch": 0.5862624959690422, + "grad_norm": 0.8840553821623729, + "learning_rate": 1.9968740534752965e-06, + "loss": 0.1201, + "step": 6363 + }, + { + "epoch": 0.5863546321463122, + "grad_norm": 0.9642337662716879, + "learning_rate": 1.9961278801557606e-06, + "loss": 0.1402, + "step": 6364 + }, + { + "epoch": 0.5864467683235822, + "grad_norm": 0.9253050908043012, + "learning_rate": 1.9953817536208046e-06, + "loss": 0.1289, + "step": 6365 + }, + { + "epoch": 0.5865389045008522, + "grad_norm": 0.9555889041598452, + "learning_rate": 1.994635673939707e-06, + "loss": 0.1401, + "step": 6366 + }, + { + "epoch": 0.5866310406781222, + "grad_norm": 0.9409992712837372, + "learning_rate": 1.9938896411817416e-06, + "loss": 0.1389, + "step": 6367 + }, + { + "epoch": 0.5867231768553922, + "grad_norm": 0.8298665944219692, + "learning_rate": 1.9931436554161783e-06, + "loss": 0.1194, + "step": 6368 + }, + { + "epoch": 0.5868153130326623, + "grad_norm": 0.9843872433472307, + "learning_rate": 1.9923977167122797e-06, + "loss": 0.1489, + "step": 6369 + }, + { + "epoch": 0.5869074492099323, + "grad_norm": 0.9426206584555298, + "learning_rate": 1.9916518251393085e-06, + "loss": 0.1354, + "step": 6370 + }, + { + "epoch": 0.5869995853872023, + "grad_norm": 0.9291998492432275, + "learning_rate": 1.9909059807665195e-06, + "loss": 0.1257, + "step": 6371 + }, + { + "epoch": 0.5870917215644723, + "grad_norm": 0.9164376882762449, + "learning_rate": 1.990160183663166e-06, + "loss": 0.1348, + "step": 6372 + }, + { + "epoch": 0.5871838577417423, + "grad_norm": 0.916904820338903, + "learning_rate": 1.9894144338984937e-06, + "loss": 0.132, + "step": 6373 + }, + { + "epoch": 0.5872759939190123, + "grad_norm": 0.9084949228829207, + "learning_rate": 1.9886687315417456e-06, + "loss": 0.1344, + "step": 6374 + }, + { + "epoch": 0.5873681300962823, + "grad_norm": 0.9304152484544277, + "learning_rate": 1.9879230766621616e-06, + "loss": 0.1354, + "step": 6375 + }, + { + "epoch": 0.5874602662735523, + "grad_norm": 0.9262616287041675, + "learning_rate": 1.9871774693289754e-06, + "loss": 0.1279, + "step": 6376 + }, + { + "epoch": 0.5875524024508223, + "grad_norm": 0.8867273085909185, + "learning_rate": 1.9864319096114152e-06, + "loss": 0.1301, + "step": 6377 + }, + { + "epoch": 0.5876445386280923, + "grad_norm": 1.0611872055913358, + "learning_rate": 1.985686397578708e-06, + "loss": 0.1477, + "step": 6378 + }, + { + "epoch": 0.5877366748053623, + "grad_norm": 0.8468401803460077, + "learning_rate": 1.984940933300074e-06, + "loss": 0.1174, + "step": 6379 + }, + { + "epoch": 0.5878288109826323, + "grad_norm": 0.8830701936617826, + "learning_rate": 1.984195516844731e-06, + "loss": 0.1329, + "step": 6380 + }, + { + "epoch": 0.5879209471599023, + "grad_norm": 0.9153760956552899, + "learning_rate": 1.9834501482818885e-06, + "loss": 0.1394, + "step": 6381 + }, + { + "epoch": 0.5880130833371724, + "grad_norm": 0.884290099600892, + "learning_rate": 1.9827048276807552e-06, + "loss": 0.1361, + "step": 6382 + }, + { + "epoch": 0.5881052195144424, + "grad_norm": 0.9630089565926064, + "learning_rate": 1.9819595551105346e-06, + "loss": 0.142, + "step": 6383 + }, + { + "epoch": 0.5881973556917124, + "grad_norm": 0.9348061290004007, + "learning_rate": 1.9812143306404262e-06, + "loss": 0.1325, + "step": 6384 + }, + { + "epoch": 0.5882894918689824, + "grad_norm": 0.8751611132400253, + "learning_rate": 1.9804691543396213e-06, + "loss": 0.1255, + "step": 6385 + }, + { + "epoch": 0.5883816280462524, + "grad_norm": 0.9355019028595838, + "learning_rate": 1.9797240262773122e-06, + "loss": 0.1325, + "step": 6386 + }, + { + "epoch": 0.5884737642235224, + "grad_norm": 0.8332639547487759, + "learning_rate": 1.9789789465226825e-06, + "loss": 0.1253, + "step": 6387 + }, + { + "epoch": 0.5885659004007924, + "grad_norm": 0.9403565805795623, + "learning_rate": 1.978233915144915e-06, + "loss": 0.1401, + "step": 6388 + }, + { + "epoch": 0.5886580365780624, + "grad_norm": 0.8968199420083651, + "learning_rate": 1.977488932213184e-06, + "loss": 0.1397, + "step": 6389 + }, + { + "epoch": 0.5887501727553324, + "grad_norm": 0.9429483455249357, + "learning_rate": 1.976743997796661e-06, + "loss": 0.1431, + "step": 6390 + }, + { + "epoch": 0.5888423089326024, + "grad_norm": 0.886059394188964, + "learning_rate": 1.975999111964515e-06, + "loss": 0.1311, + "step": 6391 + }, + { + "epoch": 0.5889344451098724, + "grad_norm": 0.9131107533925749, + "learning_rate": 1.9752542747859076e-06, + "loss": 0.1332, + "step": 6392 + }, + { + "epoch": 0.5890265812871424, + "grad_norm": 0.8724833844713735, + "learning_rate": 1.974509486329998e-06, + "loss": 0.1185, + "step": 6393 + }, + { + "epoch": 0.5891187174644124, + "grad_norm": 0.8517216431460966, + "learning_rate": 1.973764746665938e-06, + "loss": 0.1244, + "step": 6394 + }, + { + "epoch": 0.5892108536416824, + "grad_norm": 0.8853967206799523, + "learning_rate": 1.9730200558628784e-06, + "loss": 0.1361, + "step": 6395 + }, + { + "epoch": 0.5893029898189525, + "grad_norm": 0.933921719916813, + "learning_rate": 1.972275413989963e-06, + "loss": 0.1276, + "step": 6396 + }, + { + "epoch": 0.5893951259962225, + "grad_norm": 0.9412040556603793, + "learning_rate": 1.971530821116333e-06, + "loss": 0.1351, + "step": 6397 + }, + { + "epoch": 0.5894872621734925, + "grad_norm": 0.9419111745680039, + "learning_rate": 1.970786277311123e-06, + "loss": 0.135, + "step": 6398 + }, + { + "epoch": 0.5895793983507625, + "grad_norm": 0.9215559284783924, + "learning_rate": 1.9700417826434633e-06, + "loss": 0.1351, + "step": 6399 + }, + { + "epoch": 0.5896715345280324, + "grad_norm": 0.996475566243809, + "learning_rate": 1.969297337182482e-06, + "loss": 0.1521, + "step": 6400 + }, + { + "epoch": 0.5897636707053024, + "grad_norm": 0.9291448076226406, + "learning_rate": 1.9685529409973e-06, + "loss": 0.1278, + "step": 6401 + }, + { + "epoch": 0.5898558068825724, + "grad_norm": 0.8924108059641798, + "learning_rate": 1.967808594157034e-06, + "loss": 0.1267, + "step": 6402 + }, + { + "epoch": 0.5899479430598424, + "grad_norm": 0.9645722547839062, + "learning_rate": 1.9670642967307974e-06, + "loss": 0.1399, + "step": 6403 + }, + { + "epoch": 0.5900400792371124, + "grad_norm": 0.8925650886648214, + "learning_rate": 1.9663200487876983e-06, + "loss": 0.1282, + "step": 6404 + }, + { + "epoch": 0.5901322154143824, + "grad_norm": 0.8834690207967658, + "learning_rate": 1.965575850396841e-06, + "loss": 0.1313, + "step": 6405 + }, + { + "epoch": 0.5902243515916524, + "grad_norm": 0.882154265370182, + "learning_rate": 1.9648317016273227e-06, + "loss": 0.1169, + "step": 6406 + }, + { + "epoch": 0.5903164877689224, + "grad_norm": 0.9405001358124874, + "learning_rate": 1.964087602548238e-06, + "loss": 0.134, + "step": 6407 + }, + { + "epoch": 0.5904086239461924, + "grad_norm": 0.9444373778196875, + "learning_rate": 1.9633435532286775e-06, + "loss": 0.1397, + "step": 6408 + }, + { + "epoch": 0.5905007601234625, + "grad_norm": 0.8989319983039749, + "learning_rate": 1.9625995537377268e-06, + "loss": 0.1155, + "step": 6409 + }, + { + "epoch": 0.5905928963007325, + "grad_norm": 0.9366810824860379, + "learning_rate": 1.961855604144464e-06, + "loss": 0.1409, + "step": 6410 + }, + { + "epoch": 0.5906850324780025, + "grad_norm": 0.9492399799170057, + "learning_rate": 1.961111704517967e-06, + "loss": 0.1441, + "step": 6411 + }, + { + "epoch": 0.5907771686552725, + "grad_norm": 0.9306708910687921, + "learning_rate": 1.9603678549273054e-06, + "loss": 0.1392, + "step": 6412 + }, + { + "epoch": 0.5908693048325425, + "grad_norm": 0.9310169038231709, + "learning_rate": 1.959624055441548e-06, + "loss": 0.1309, + "step": 6413 + }, + { + "epoch": 0.5909614410098125, + "grad_norm": 0.9509199776543236, + "learning_rate": 1.9588803061297544e-06, + "loss": 0.1469, + "step": 6414 + }, + { + "epoch": 0.5910535771870825, + "grad_norm": 0.8560938591568295, + "learning_rate": 1.9581366070609824e-06, + "loss": 0.1137, + "step": 6415 + }, + { + "epoch": 0.5911457133643525, + "grad_norm": 0.935421361088011, + "learning_rate": 1.957392958304285e-06, + "loss": 0.1302, + "step": 6416 + }, + { + "epoch": 0.5912378495416225, + "grad_norm": 0.9578568430913035, + "learning_rate": 1.9566493599287103e-06, + "loss": 0.1378, + "step": 6417 + }, + { + "epoch": 0.5913299857188925, + "grad_norm": 0.8990248820513408, + "learning_rate": 1.9559058120032997e-06, + "loss": 0.1172, + "step": 6418 + }, + { + "epoch": 0.5914221218961625, + "grad_norm": 0.9364115670604503, + "learning_rate": 1.955162314597094e-06, + "loss": 0.1382, + "step": 6419 + }, + { + "epoch": 0.5915142580734325, + "grad_norm": 0.9513877830712206, + "learning_rate": 1.9544188677791253e-06, + "loss": 0.1395, + "step": 6420 + }, + { + "epoch": 0.5916063942507025, + "grad_norm": 0.9556704700985831, + "learning_rate": 1.9536754716184244e-06, + "loss": 0.1324, + "step": 6421 + }, + { + "epoch": 0.5916985304279725, + "grad_norm": 0.9350415543063032, + "learning_rate": 1.9529321261840148e-06, + "loss": 0.1286, + "step": 6422 + }, + { + "epoch": 0.5917906666052426, + "grad_norm": 0.9438832953394088, + "learning_rate": 1.952188831544915e-06, + "loss": 0.1425, + "step": 6423 + }, + { + "epoch": 0.5918828027825126, + "grad_norm": 0.9351384678963927, + "learning_rate": 1.951445587770142e-06, + "loss": 0.1236, + "step": 6424 + }, + { + "epoch": 0.5919749389597826, + "grad_norm": 0.9389889457570394, + "learning_rate": 1.9507023949287045e-06, + "loss": 0.135, + "step": 6425 + }, + { + "epoch": 0.5920670751370526, + "grad_norm": 0.8928792043876652, + "learning_rate": 1.94995925308961e-06, + "loss": 0.1191, + "step": 6426 + }, + { + "epoch": 0.5921592113143226, + "grad_norm": 0.907511539500844, + "learning_rate": 1.9492161623218576e-06, + "loss": 0.1347, + "step": 6427 + }, + { + "epoch": 0.5922513474915926, + "grad_norm": 0.9123849674907245, + "learning_rate": 1.9484731226944427e-06, + "loss": 0.1301, + "step": 6428 + }, + { + "epoch": 0.5923434836688626, + "grad_norm": 0.8943152512986418, + "learning_rate": 1.9477301342763587e-06, + "loss": 0.1356, + "step": 6429 + }, + { + "epoch": 0.5924356198461326, + "grad_norm": 0.968239050353456, + "learning_rate": 1.946987197136592e-06, + "loss": 0.1356, + "step": 6430 + }, + { + "epoch": 0.5925277560234026, + "grad_norm": 0.9555806245021822, + "learning_rate": 1.946244311344122e-06, + "loss": 0.1442, + "step": 6431 + }, + { + "epoch": 0.5926198922006726, + "grad_norm": 0.9023064613208227, + "learning_rate": 1.945501476967928e-06, + "loss": 0.1334, + "step": 6432 + }, + { + "epoch": 0.5927120283779426, + "grad_norm": 0.8772285953239736, + "learning_rate": 1.9447586940769808e-06, + "loss": 0.1251, + "step": 6433 + }, + { + "epoch": 0.5928041645552126, + "grad_norm": 0.8673699715919908, + "learning_rate": 1.9440159627402497e-06, + "loss": 0.1224, + "step": 6434 + }, + { + "epoch": 0.5928963007324826, + "grad_norm": 0.8963161155771975, + "learning_rate": 1.9432732830266958e-06, + "loss": 0.1314, + "step": 6435 + }, + { + "epoch": 0.5929884369097527, + "grad_norm": 0.9320802532189435, + "learning_rate": 1.9425306550052774e-06, + "loss": 0.1438, + "step": 6436 + }, + { + "epoch": 0.5930805730870227, + "grad_norm": 0.9223967293003285, + "learning_rate": 1.9417880787449476e-06, + "loss": 0.1359, + "step": 6437 + }, + { + "epoch": 0.5931727092642927, + "grad_norm": 0.8711183973077686, + "learning_rate": 1.9410455543146554e-06, + "loss": 0.1305, + "step": 6438 + }, + { + "epoch": 0.5932648454415627, + "grad_norm": 0.9242491659975414, + "learning_rate": 1.9403030817833428e-06, + "loss": 0.134, + "step": 6439 + }, + { + "epoch": 0.5933569816188327, + "grad_norm": 0.91941387536559, + "learning_rate": 1.93956066121995e-06, + "loss": 0.1367, + "step": 6440 + }, + { + "epoch": 0.5934491177961027, + "grad_norm": 0.948051439287453, + "learning_rate": 1.938818292693409e-06, + "loss": 0.1379, + "step": 6441 + }, + { + "epoch": 0.5935412539733727, + "grad_norm": 0.8998731076445318, + "learning_rate": 1.9380759762726512e-06, + "loss": 0.1439, + "step": 6442 + }, + { + "epoch": 0.5936333901506426, + "grad_norm": 0.8641438471342882, + "learning_rate": 1.9373337120265993e-06, + "loss": 0.1262, + "step": 6443 + }, + { + "epoch": 0.5937255263279126, + "grad_norm": 0.9469896305094141, + "learning_rate": 1.936591500024172e-06, + "loss": 0.1448, + "step": 6444 + }, + { + "epoch": 0.5938176625051826, + "grad_norm": 0.9327091962545253, + "learning_rate": 1.935849340334285e-06, + "loss": 0.1316, + "step": 6445 + }, + { + "epoch": 0.5939097986824526, + "grad_norm": 0.9333644067409109, + "learning_rate": 1.9351072330258483e-06, + "loss": 0.1345, + "step": 6446 + }, + { + "epoch": 0.5940019348597226, + "grad_norm": 0.9562785312219442, + "learning_rate": 1.9343651781677648e-06, + "loss": 0.142, + "step": 6447 + }, + { + "epoch": 0.5940940710369926, + "grad_norm": 0.9451689710704071, + "learning_rate": 1.933623175828935e-06, + "loss": 0.1385, + "step": 6448 + }, + { + "epoch": 0.5941862072142626, + "grad_norm": 0.9044973124651259, + "learning_rate": 1.932881226078255e-06, + "loss": 0.1287, + "step": 6449 + }, + { + "epoch": 0.5942783433915327, + "grad_norm": 0.9583553094337743, + "learning_rate": 1.932139328984614e-06, + "loss": 0.1265, + "step": 6450 + }, + { + "epoch": 0.5943704795688027, + "grad_norm": 0.9073283509605476, + "learning_rate": 1.931397484616898e-06, + "loss": 0.1322, + "step": 6451 + }, + { + "epoch": 0.5944626157460727, + "grad_norm": 0.918641355087958, + "learning_rate": 1.9306556930439857e-06, + "loss": 0.1389, + "step": 6452 + }, + { + "epoch": 0.5945547519233427, + "grad_norm": 0.9440290388840534, + "learning_rate": 1.929913954334754e-06, + "loss": 0.1404, + "step": 6453 + }, + { + "epoch": 0.5946468881006127, + "grad_norm": 0.9119563904232301, + "learning_rate": 1.929172268558073e-06, + "loss": 0.1386, + "step": 6454 + }, + { + "epoch": 0.5947390242778827, + "grad_norm": 0.8866487591480595, + "learning_rate": 1.928430635782809e-06, + "loss": 0.1194, + "step": 6455 + }, + { + "epoch": 0.5948311604551527, + "grad_norm": 0.8549168890829434, + "learning_rate": 1.9276890560778215e-06, + "loss": 0.1148, + "step": 6456 + }, + { + "epoch": 0.5949232966324227, + "grad_norm": 0.9034668553279024, + "learning_rate": 1.9269475295119663e-06, + "loss": 0.1354, + "step": 6457 + }, + { + "epoch": 0.5950154328096927, + "grad_norm": 0.8658038474446961, + "learning_rate": 1.9262060561540946e-06, + "loss": 0.1336, + "step": 6458 + }, + { + "epoch": 0.5951075689869627, + "grad_norm": 0.8666140071243187, + "learning_rate": 1.9254646360730533e-06, + "loss": 0.1215, + "step": 6459 + }, + { + "epoch": 0.5951997051642327, + "grad_norm": 0.8869437410154752, + "learning_rate": 1.9247232693376815e-06, + "loss": 0.1241, + "step": 6460 + }, + { + "epoch": 0.5952918413415027, + "grad_norm": 0.9344054957254118, + "learning_rate": 1.9239819560168165e-06, + "loss": 0.1386, + "step": 6461 + }, + { + "epoch": 0.5953839775187727, + "grad_norm": 0.9095798537916769, + "learning_rate": 1.9232406961792884e-06, + "loss": 0.1361, + "step": 6462 + }, + { + "epoch": 0.5954761136960427, + "grad_norm": 0.8624089349049318, + "learning_rate": 1.9224994898939247e-06, + "loss": 0.1223, + "step": 6463 + }, + { + "epoch": 0.5955682498733128, + "grad_norm": 0.861278948017207, + "learning_rate": 1.9217583372295446e-06, + "loss": 0.1257, + "step": 6464 + }, + { + "epoch": 0.5956603860505828, + "grad_norm": 0.9454217068596863, + "learning_rate": 1.921017238254965e-06, + "loss": 0.145, + "step": 6465 + }, + { + "epoch": 0.5957525222278528, + "grad_norm": 0.9079234970512026, + "learning_rate": 1.920276193038997e-06, + "loss": 0.1299, + "step": 6466 + }, + { + "epoch": 0.5958446584051228, + "grad_norm": 0.9043646425298468, + "learning_rate": 1.9195352016504486e-06, + "loss": 0.1288, + "step": 6467 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 0.9208332337385433, + "learning_rate": 1.9187942641581174e-06, + "loss": 0.1367, + "step": 6468 + }, + { + "epoch": 0.5960289307596628, + "grad_norm": 0.8611028175581442, + "learning_rate": 1.9180533806308017e-06, + "loss": 0.1256, + "step": 6469 + }, + { + "epoch": 0.5961210669369328, + "grad_norm": 0.9491166063148969, + "learning_rate": 1.9173125511372923e-06, + "loss": 0.1349, + "step": 6470 + }, + { + "epoch": 0.5962132031142028, + "grad_norm": 0.9318622192071003, + "learning_rate": 1.916571775746376e-06, + "loss": 0.1375, + "step": 6471 + }, + { + "epoch": 0.5963053392914728, + "grad_norm": 0.9168563890482114, + "learning_rate": 1.915831054526832e-06, + "loss": 0.1412, + "step": 6472 + }, + { + "epoch": 0.5963974754687428, + "grad_norm": 0.883224161497809, + "learning_rate": 1.915090387547438e-06, + "loss": 0.1223, + "step": 6473 + }, + { + "epoch": 0.5964896116460128, + "grad_norm": 0.8794085943148341, + "learning_rate": 1.914349774876964e-06, + "loss": 0.1307, + "step": 6474 + }, + { + "epoch": 0.5965817478232828, + "grad_norm": 0.9169286039253907, + "learning_rate": 1.9136092165841776e-06, + "loss": 0.1385, + "step": 6475 + }, + { + "epoch": 0.5966738840005528, + "grad_norm": 0.8934819174982507, + "learning_rate": 1.9128687127378376e-06, + "loss": 0.1324, + "step": 6476 + }, + { + "epoch": 0.5967660201778229, + "grad_norm": 0.9454788969073651, + "learning_rate": 1.9121282634067008e-06, + "loss": 0.1299, + "step": 6477 + }, + { + "epoch": 0.5968581563550929, + "grad_norm": 0.9455512022162309, + "learning_rate": 1.911387868659518e-06, + "loss": 0.1314, + "step": 6478 + }, + { + "epoch": 0.5969502925323629, + "grad_norm": 0.8730096720038547, + "learning_rate": 1.9106475285650345e-06, + "loss": 0.1144, + "step": 6479 + }, + { + "epoch": 0.5970424287096329, + "grad_norm": 0.9349386390282505, + "learning_rate": 1.909907243191993e-06, + "loss": 0.1352, + "step": 6480 + }, + { + "epoch": 0.5971345648869029, + "grad_norm": 0.9006142107363241, + "learning_rate": 1.9091670126091264e-06, + "loss": 0.1368, + "step": 6481 + }, + { + "epoch": 0.5972267010641729, + "grad_norm": 0.9440688968299398, + "learning_rate": 1.908426836885166e-06, + "loss": 0.1277, + "step": 6482 + }, + { + "epoch": 0.5973188372414429, + "grad_norm": 0.9115356328181138, + "learning_rate": 1.907686716088838e-06, + "loss": 0.13, + "step": 6483 + }, + { + "epoch": 0.5974109734187129, + "grad_norm": 0.9107915908099802, + "learning_rate": 1.9069466502888625e-06, + "loss": 0.123, + "step": 6484 + }, + { + "epoch": 0.5975031095959829, + "grad_norm": 0.9531637489514408, + "learning_rate": 1.9062066395539535e-06, + "loss": 0.1436, + "step": 6485 + }, + { + "epoch": 0.5975952457732528, + "grad_norm": 0.9428193773075232, + "learning_rate": 1.9054666839528225e-06, + "loss": 0.1232, + "step": 6486 + }, + { + "epoch": 0.5976873819505228, + "grad_norm": 0.9627495716809685, + "learning_rate": 1.904726783554173e-06, + "loss": 0.1331, + "step": 6487 + }, + { + "epoch": 0.5977795181277928, + "grad_norm": 0.9517407487393909, + "learning_rate": 1.903986938426707e-06, + "loss": 0.1305, + "step": 6488 + }, + { + "epoch": 0.5978716543050628, + "grad_norm": 0.9264042232053695, + "learning_rate": 1.9032471486391175e-06, + "loss": 0.1359, + "step": 6489 + }, + { + "epoch": 0.5979637904823328, + "grad_norm": 0.904126855844614, + "learning_rate": 1.9025074142600935e-06, + "loss": 0.1244, + "step": 6490 + }, + { + "epoch": 0.5980559266596029, + "grad_norm": 0.9136459862220759, + "learning_rate": 1.9017677353583213e-06, + "loss": 0.1312, + "step": 6491 + }, + { + "epoch": 0.5981480628368729, + "grad_norm": 0.9007666304246773, + "learning_rate": 1.90102811200248e-06, + "loss": 0.1354, + "step": 6492 + }, + { + "epoch": 0.5982401990141429, + "grad_norm": 0.9122491086990855, + "learning_rate": 1.9002885442612413e-06, + "loss": 0.1296, + "step": 6493 + }, + { + "epoch": 0.5983323351914129, + "grad_norm": 0.8637309458864265, + "learning_rate": 1.8995490322032767e-06, + "loss": 0.1177, + "step": 6494 + }, + { + "epoch": 0.5984244713686829, + "grad_norm": 0.9140446335579833, + "learning_rate": 1.8988095758972485e-06, + "loss": 0.126, + "step": 6495 + }, + { + "epoch": 0.5985166075459529, + "grad_norm": 0.891049772866993, + "learning_rate": 1.8980701754118168e-06, + "loss": 0.1407, + "step": 6496 + }, + { + "epoch": 0.5986087437232229, + "grad_norm": 0.9308146796900931, + "learning_rate": 1.8973308308156337e-06, + "loss": 0.146, + "step": 6497 + }, + { + "epoch": 0.5987008799004929, + "grad_norm": 0.9217024801030724, + "learning_rate": 1.8965915421773473e-06, + "loss": 0.1385, + "step": 6498 + }, + { + "epoch": 0.5987930160777629, + "grad_norm": 0.9446524923822498, + "learning_rate": 1.8958523095656016e-06, + "loss": 0.1264, + "step": 6499 + }, + { + "epoch": 0.5988851522550329, + "grad_norm": 0.87348994025732, + "learning_rate": 1.8951131330490347e-06, + "loss": 0.1209, + "step": 6500 + }, + { + "epoch": 0.5988851522550329, + "eval_loss": 0.1315893828868866, + "eval_runtime": 299.1326, + "eval_samples_per_second": 23.458, + "eval_steps_per_second": 2.935, + "step": 6500 + }, + { + "epoch": 0.5989772884323029, + "grad_norm": 0.890060441599928, + "learning_rate": 1.8943740126962774e-06, + "loss": 0.1333, + "step": 6501 + }, + { + "epoch": 0.5990694246095729, + "grad_norm": 0.8535593592292823, + "learning_rate": 1.8936349485759586e-06, + "loss": 0.1216, + "step": 6502 + }, + { + "epoch": 0.5991615607868429, + "grad_norm": 0.8947532542781077, + "learning_rate": 1.8928959407566994e-06, + "loss": 0.1378, + "step": 6503 + }, + { + "epoch": 0.599253696964113, + "grad_norm": 0.8657797376728128, + "learning_rate": 1.8921569893071187e-06, + "loss": 0.1238, + "step": 6504 + }, + { + "epoch": 0.599345833141383, + "grad_norm": 0.8597590797050176, + "learning_rate": 1.8914180942958265e-06, + "loss": 0.1328, + "step": 6505 + }, + { + "epoch": 0.599437969318653, + "grad_norm": 0.9403836820908409, + "learning_rate": 1.890679255791429e-06, + "loss": 0.1387, + "step": 6506 + }, + { + "epoch": 0.599530105495923, + "grad_norm": 0.92284449719563, + "learning_rate": 1.8899404738625288e-06, + "loss": 0.1357, + "step": 6507 + }, + { + "epoch": 0.599622241673193, + "grad_norm": 0.8763065217252709, + "learning_rate": 1.8892017485777208e-06, + "loss": 0.1314, + "step": 6508 + }, + { + "epoch": 0.599714377850463, + "grad_norm": 0.9386674388659384, + "learning_rate": 1.8884630800055973e-06, + "loss": 0.1223, + "step": 6509 + }, + { + "epoch": 0.599806514027733, + "grad_norm": 0.9215754757081813, + "learning_rate": 1.8877244682147419e-06, + "loss": 0.1371, + "step": 6510 + }, + { + "epoch": 0.599898650205003, + "grad_norm": 0.8830249289667409, + "learning_rate": 1.886985913273735e-06, + "loss": 0.1346, + "step": 6511 + }, + { + "epoch": 0.599990786382273, + "grad_norm": 1.0090984121003168, + "learning_rate": 1.8862474152511529e-06, + "loss": 0.1506, + "step": 6512 + }, + { + "epoch": 0.600082922559543, + "grad_norm": 0.8890967804641455, + "learning_rate": 1.8855089742155647e-06, + "loss": 0.1314, + "step": 6513 + }, + { + "epoch": 0.600175058736813, + "grad_norm": 0.9269506146509996, + "learning_rate": 1.8847705902355332e-06, + "loss": 0.1349, + "step": 6514 + }, + { + "epoch": 0.600267194914083, + "grad_norm": 0.889886502404544, + "learning_rate": 1.8840322633796191e-06, + "loss": 0.126, + "step": 6515 + }, + { + "epoch": 0.600359331091353, + "grad_norm": 0.9002338360481987, + "learning_rate": 1.8832939937163753e-06, + "loss": 0.1241, + "step": 6516 + }, + { + "epoch": 0.600451467268623, + "grad_norm": 0.9666422702216696, + "learning_rate": 1.8825557813143513e-06, + "loss": 0.1313, + "step": 6517 + }, + { + "epoch": 0.6005436034458931, + "grad_norm": 0.8794516400849002, + "learning_rate": 1.8818176262420893e-06, + "loss": 0.1217, + "step": 6518 + }, + { + "epoch": 0.6006357396231631, + "grad_norm": 0.8712755680659606, + "learning_rate": 1.8810795285681263e-06, + "loss": 0.1133, + "step": 6519 + }, + { + "epoch": 0.6007278758004331, + "grad_norm": 0.9428832972839667, + "learning_rate": 1.8803414883609967e-06, + "loss": 0.1384, + "step": 6520 + }, + { + "epoch": 0.6008200119777031, + "grad_norm": 0.8541956118384897, + "learning_rate": 1.8796035056892268e-06, + "loss": 0.1218, + "step": 6521 + }, + { + "epoch": 0.6009121481549731, + "grad_norm": 0.9310338659977105, + "learning_rate": 1.8788655806213372e-06, + "loss": 0.1251, + "step": 6522 + }, + { + "epoch": 0.6010042843322431, + "grad_norm": 0.9325252258734855, + "learning_rate": 1.8781277132258458e-06, + "loss": 0.1349, + "step": 6523 + }, + { + "epoch": 0.6010964205095131, + "grad_norm": 0.95634726705644, + "learning_rate": 1.8773899035712622e-06, + "loss": 0.1349, + "step": 6524 + }, + { + "epoch": 0.6011885566867831, + "grad_norm": 0.904487386830463, + "learning_rate": 1.8766521517260946e-06, + "loss": 0.1216, + "step": 6525 + }, + { + "epoch": 0.6012806928640531, + "grad_norm": 0.9604402323364885, + "learning_rate": 1.875914457758841e-06, + "loss": 0.1414, + "step": 6526 + }, + { + "epoch": 0.601372829041323, + "grad_norm": 0.8887907178428888, + "learning_rate": 1.8751768217379973e-06, + "loss": 0.1283, + "step": 6527 + }, + { + "epoch": 0.601464965218593, + "grad_norm": 0.9066603462086329, + "learning_rate": 1.874439243732053e-06, + "loss": 0.1317, + "step": 6528 + }, + { + "epoch": 0.601557101395863, + "grad_norm": 0.9088621832752597, + "learning_rate": 1.8737017238094926e-06, + "loss": 0.1323, + "step": 6529 + }, + { + "epoch": 0.601649237573133, + "grad_norm": 0.8955444159164336, + "learning_rate": 1.8729642620387935e-06, + "loss": 0.1264, + "step": 6530 + }, + { + "epoch": 0.601741373750403, + "grad_norm": 0.9723213160380102, + "learning_rate": 1.8722268584884312e-06, + "loss": 0.1385, + "step": 6531 + }, + { + "epoch": 0.6018335099276731, + "grad_norm": 0.9220388614910245, + "learning_rate": 1.8714895132268718e-06, + "loss": 0.1387, + "step": 6532 + }, + { + "epoch": 0.6019256461049431, + "grad_norm": 0.9805865380748094, + "learning_rate": 1.8707522263225797e-06, + "loss": 0.1388, + "step": 6533 + }, + { + "epoch": 0.6020177822822131, + "grad_norm": 0.9279629753238866, + "learning_rate": 1.8700149978440105e-06, + "loss": 0.1287, + "step": 6534 + }, + { + "epoch": 0.6021099184594831, + "grad_norm": 0.9464551237736833, + "learning_rate": 1.8692778278596162e-06, + "loss": 0.1321, + "step": 6535 + }, + { + "epoch": 0.6022020546367531, + "grad_norm": 0.9800861251562089, + "learning_rate": 1.868540716437844e-06, + "loss": 0.1338, + "step": 6536 + }, + { + "epoch": 0.6022941908140231, + "grad_norm": 0.9347287257761518, + "learning_rate": 1.8678036636471336e-06, + "loss": 0.1269, + "step": 6537 + }, + { + "epoch": 0.6023863269912931, + "grad_norm": 0.9972581832144547, + "learning_rate": 1.867066669555922e-06, + "loss": 0.1431, + "step": 6538 + }, + { + "epoch": 0.6024784631685631, + "grad_norm": 0.9908423627612108, + "learning_rate": 1.8663297342326381e-06, + "loss": 0.1363, + "step": 6539 + }, + { + "epoch": 0.6025705993458331, + "grad_norm": 0.897036984294902, + "learning_rate": 1.8655928577457058e-06, + "loss": 0.1231, + "step": 6540 + }, + { + "epoch": 0.6026627355231031, + "grad_norm": 0.9047862414880307, + "learning_rate": 1.8648560401635448e-06, + "loss": 0.134, + "step": 6541 + }, + { + "epoch": 0.6027548717003731, + "grad_norm": 0.9144810403633027, + "learning_rate": 1.8641192815545705e-06, + "loss": 0.1301, + "step": 6542 + }, + { + "epoch": 0.6028470078776431, + "grad_norm": 0.9365388153587314, + "learning_rate": 1.8633825819871881e-06, + "loss": 0.1347, + "step": 6543 + }, + { + "epoch": 0.6029391440549131, + "grad_norm": 0.8949914004485408, + "learning_rate": 1.8626459415298012e-06, + "loss": 0.122, + "step": 6544 + }, + { + "epoch": 0.6030312802321832, + "grad_norm": 0.8895585861402004, + "learning_rate": 1.8619093602508075e-06, + "loss": 0.1173, + "step": 6545 + }, + { + "epoch": 0.6031234164094532, + "grad_norm": 0.9070371139758455, + "learning_rate": 1.8611728382185995e-06, + "loss": 0.1338, + "step": 6546 + }, + { + "epoch": 0.6032155525867232, + "grad_norm": 0.9223856940001541, + "learning_rate": 1.860436375501561e-06, + "loss": 0.1228, + "step": 6547 + }, + { + "epoch": 0.6033076887639932, + "grad_norm": 0.9695663122827732, + "learning_rate": 1.8596999721680743e-06, + "loss": 0.138, + "step": 6548 + }, + { + "epoch": 0.6033998249412632, + "grad_norm": 0.9665305664799301, + "learning_rate": 1.858963628286513e-06, + "loss": 0.136, + "step": 6549 + }, + { + "epoch": 0.6034919611185332, + "grad_norm": 0.8799821475459378, + "learning_rate": 1.8582273439252497e-06, + "loss": 0.1295, + "step": 6550 + }, + { + "epoch": 0.6035840972958032, + "grad_norm": 0.9107048517759243, + "learning_rate": 1.8574911191526456e-06, + "loss": 0.1199, + "step": 6551 + }, + { + "epoch": 0.6036762334730732, + "grad_norm": 0.9273771857397388, + "learning_rate": 1.85675495403706e-06, + "loss": 0.1419, + "step": 6552 + }, + { + "epoch": 0.6037683696503432, + "grad_norm": 0.879572291727904, + "learning_rate": 1.8560188486468463e-06, + "loss": 0.1257, + "step": 6553 + }, + { + "epoch": 0.6038605058276132, + "grad_norm": 0.917819635353749, + "learning_rate": 1.8552828030503528e-06, + "loss": 0.1362, + "step": 6554 + }, + { + "epoch": 0.6039526420048832, + "grad_norm": 0.854519148192027, + "learning_rate": 1.854546817315919e-06, + "loss": 0.1247, + "step": 6555 + }, + { + "epoch": 0.6040447781821532, + "grad_norm": 0.9358624951200376, + "learning_rate": 1.8538108915118833e-06, + "loss": 0.1385, + "step": 6556 + }, + { + "epoch": 0.6041369143594232, + "grad_norm": 0.8987979977343791, + "learning_rate": 1.8530750257065752e-06, + "loss": 0.1299, + "step": 6557 + }, + { + "epoch": 0.6042290505366932, + "grad_norm": 0.8655912576757261, + "learning_rate": 1.8523392199683218e-06, + "loss": 0.1232, + "step": 6558 + }, + { + "epoch": 0.6043211867139633, + "grad_norm": 0.8844496081591054, + "learning_rate": 1.851603474365441e-06, + "loss": 0.1381, + "step": 6559 + }, + { + "epoch": 0.6044133228912333, + "grad_norm": 0.9974044514544727, + "learning_rate": 1.8508677889662469e-06, + "loss": 0.1425, + "step": 6560 + }, + { + "epoch": 0.6045054590685033, + "grad_norm": 0.8670146001504918, + "learning_rate": 1.850132163839049e-06, + "loss": 0.1305, + "step": 6561 + }, + { + "epoch": 0.6045975952457733, + "grad_norm": 0.8994301785848731, + "learning_rate": 1.849396599052149e-06, + "loss": 0.1161, + "step": 6562 + }, + { + "epoch": 0.6046897314230433, + "grad_norm": 0.9049796994871221, + "learning_rate": 1.848661094673846e-06, + "loss": 0.126, + "step": 6563 + }, + { + "epoch": 0.6047818676003133, + "grad_norm": 0.9022404281815567, + "learning_rate": 1.8479256507724297e-06, + "loss": 0.1187, + "step": 6564 + }, + { + "epoch": 0.6048740037775833, + "grad_norm": 0.9916709809090737, + "learning_rate": 1.8471902674161863e-06, + "loss": 0.1475, + "step": 6565 + }, + { + "epoch": 0.6049661399548533, + "grad_norm": 0.9205985919823508, + "learning_rate": 1.8464549446733976e-06, + "loss": 0.1355, + "step": 6566 + }, + { + "epoch": 0.6050582761321233, + "grad_norm": 0.8901635257949637, + "learning_rate": 1.8457196826123381e-06, + "loss": 0.1254, + "step": 6567 + }, + { + "epoch": 0.6051504123093933, + "grad_norm": 0.8943163998141479, + "learning_rate": 1.8449844813012755e-06, + "loss": 0.1238, + "step": 6568 + }, + { + "epoch": 0.6052425484866633, + "grad_norm": 0.9384468453458241, + "learning_rate": 1.8442493408084746e-06, + "loss": 0.135, + "step": 6569 + }, + { + "epoch": 0.6053346846639333, + "grad_norm": 0.8784833757143954, + "learning_rate": 1.8435142612021929e-06, + "loss": 0.1356, + "step": 6570 + }, + { + "epoch": 0.6054268208412033, + "grad_norm": 0.9014973212837387, + "learning_rate": 1.8427792425506833e-06, + "loss": 0.1255, + "step": 6571 + }, + { + "epoch": 0.6055189570184734, + "grad_norm": 0.9243350184650585, + "learning_rate": 1.8420442849221915e-06, + "loss": 0.136, + "step": 6572 + }, + { + "epoch": 0.6056110931957434, + "grad_norm": 0.8652026511605855, + "learning_rate": 1.8413093883849579e-06, + "loss": 0.1181, + "step": 6573 + }, + { + "epoch": 0.6057032293730134, + "grad_norm": 0.9563469319601445, + "learning_rate": 1.840574553007219e-06, + "loss": 0.1281, + "step": 6574 + }, + { + "epoch": 0.6057953655502833, + "grad_norm": 0.8706380604782731, + "learning_rate": 1.8398397788572046e-06, + "loss": 0.1345, + "step": 6575 + }, + { + "epoch": 0.6058875017275533, + "grad_norm": 0.9457792347195052, + "learning_rate": 1.8391050660031364e-06, + "loss": 0.142, + "step": 6576 + }, + { + "epoch": 0.6059796379048233, + "grad_norm": 0.9546506661285495, + "learning_rate": 1.8383704145132347e-06, + "loss": 0.1328, + "step": 6577 + }, + { + "epoch": 0.6060717740820933, + "grad_norm": 0.9360510999701172, + "learning_rate": 1.8376358244557108e-06, + "loss": 0.1405, + "step": 6578 + }, + { + "epoch": 0.6061639102593633, + "grad_norm": 0.953871600525164, + "learning_rate": 1.8369012958987728e-06, + "loss": 0.1294, + "step": 6579 + }, + { + "epoch": 0.6062560464366333, + "grad_norm": 0.9827266384723413, + "learning_rate": 1.8361668289106204e-06, + "loss": 0.1404, + "step": 6580 + }, + { + "epoch": 0.6063481826139033, + "grad_norm": 0.9683587334877363, + "learning_rate": 1.8354324235594488e-06, + "loss": 0.1409, + "step": 6581 + }, + { + "epoch": 0.6064403187911733, + "grad_norm": 0.8877532566082904, + "learning_rate": 1.8346980799134485e-06, + "loss": 0.1232, + "step": 6582 + }, + { + "epoch": 0.6065324549684433, + "grad_norm": 0.9025550006506345, + "learning_rate": 1.8339637980408038e-06, + "loss": 0.1153, + "step": 6583 + }, + { + "epoch": 0.6066245911457133, + "grad_norm": 0.9428207407116446, + "learning_rate": 1.833229578009691e-06, + "loss": 0.1303, + "step": 6584 + }, + { + "epoch": 0.6067167273229833, + "grad_norm": 0.9556142056967836, + "learning_rate": 1.8324954198882843e-06, + "loss": 0.1438, + "step": 6585 + }, + { + "epoch": 0.6068088635002534, + "grad_norm": 0.8780366492422997, + "learning_rate": 1.831761323744749e-06, + "loss": 0.1214, + "step": 6586 + }, + { + "epoch": 0.6069009996775234, + "grad_norm": 0.9234513915076087, + "learning_rate": 1.831027289647248e-06, + "loss": 0.1257, + "step": 6587 + }, + { + "epoch": 0.6069931358547934, + "grad_norm": 0.9163158474665248, + "learning_rate": 1.8302933176639346e-06, + "loss": 0.1326, + "step": 6588 + }, + { + "epoch": 0.6070852720320634, + "grad_norm": 0.9169583283659786, + "learning_rate": 1.829559407862958e-06, + "loss": 0.1313, + "step": 6589 + }, + { + "epoch": 0.6071774082093334, + "grad_norm": 0.9362982236980304, + "learning_rate": 1.8288255603124632e-06, + "loss": 0.1338, + "step": 6590 + }, + { + "epoch": 0.6072695443866034, + "grad_norm": 0.9062618567697522, + "learning_rate": 1.8280917750805865e-06, + "loss": 0.1259, + "step": 6591 + }, + { + "epoch": 0.6073616805638734, + "grad_norm": 0.8924801799490278, + "learning_rate": 1.8273580522354622e-06, + "loss": 0.1189, + "step": 6592 + }, + { + "epoch": 0.6074538167411434, + "grad_norm": 0.925574310988602, + "learning_rate": 1.8266243918452149e-06, + "loss": 0.128, + "step": 6593 + }, + { + "epoch": 0.6075459529184134, + "grad_norm": 0.9140715166358384, + "learning_rate": 1.825890793977964e-06, + "loss": 0.128, + "step": 6594 + }, + { + "epoch": 0.6076380890956834, + "grad_norm": 0.9680327082562757, + "learning_rate": 1.8251572587018268e-06, + "loss": 0.1515, + "step": 6595 + }, + { + "epoch": 0.6077302252729534, + "grad_norm": 0.9601890925112531, + "learning_rate": 1.8244237860849108e-06, + "loss": 0.1322, + "step": 6596 + }, + { + "epoch": 0.6078223614502234, + "grad_norm": 0.8684119145542256, + "learning_rate": 1.8236903761953178e-06, + "loss": 0.1176, + "step": 6597 + }, + { + "epoch": 0.6079144976274934, + "grad_norm": 0.8648843664557521, + "learning_rate": 1.8229570291011468e-06, + "loss": 0.1168, + "step": 6598 + }, + { + "epoch": 0.6080066338047634, + "grad_norm": 0.9193053247872316, + "learning_rate": 1.8222237448704882e-06, + "loss": 0.1301, + "step": 6599 + }, + { + "epoch": 0.6080987699820335, + "grad_norm": 0.92509832524853, + "learning_rate": 1.8214905235714286e-06, + "loss": 0.128, + "step": 6600 + }, + { + "epoch": 0.6081909061593035, + "grad_norm": 0.8970623170235373, + "learning_rate": 1.8207573652720467e-06, + "loss": 0.1161, + "step": 6601 + }, + { + "epoch": 0.6082830423365735, + "grad_norm": 0.8532064017979026, + "learning_rate": 1.8200242700404159e-06, + "loss": 0.1201, + "step": 6602 + }, + { + "epoch": 0.6083751785138435, + "grad_norm": 0.9100390031276187, + "learning_rate": 1.8192912379446048e-06, + "loss": 0.1226, + "step": 6603 + }, + { + "epoch": 0.6084673146911135, + "grad_norm": 0.9301923891417945, + "learning_rate": 1.8185582690526765e-06, + "loss": 0.1332, + "step": 6604 + }, + { + "epoch": 0.6085594508683835, + "grad_norm": 0.9588130238961836, + "learning_rate": 1.8178253634326854e-06, + "loss": 0.1323, + "step": 6605 + }, + { + "epoch": 0.6086515870456535, + "grad_norm": 0.9464922450102702, + "learning_rate": 1.817092521152683e-06, + "loss": 0.1243, + "step": 6606 + }, + { + "epoch": 0.6087437232229235, + "grad_norm": 0.9661556183879925, + "learning_rate": 1.816359742280713e-06, + "loss": 0.1309, + "step": 6607 + }, + { + "epoch": 0.6088358594001935, + "grad_norm": 0.8915377822004282, + "learning_rate": 1.8156270268848155e-06, + "loss": 0.1183, + "step": 6608 + }, + { + "epoch": 0.6089279955774635, + "grad_norm": 0.9370935343576614, + "learning_rate": 1.8148943750330216e-06, + "loss": 0.1296, + "step": 6609 + }, + { + "epoch": 0.6090201317547335, + "grad_norm": 0.9175329703834323, + "learning_rate": 1.8141617867933586e-06, + "loss": 0.1215, + "step": 6610 + }, + { + "epoch": 0.6091122679320035, + "grad_norm": 0.939823774501705, + "learning_rate": 1.813429262233848e-06, + "loss": 0.1316, + "step": 6611 + }, + { + "epoch": 0.6092044041092735, + "grad_norm": 0.9438374697151484, + "learning_rate": 1.8126968014225044e-06, + "loss": 0.1381, + "step": 6612 + }, + { + "epoch": 0.6092965402865436, + "grad_norm": 0.9026520719916683, + "learning_rate": 1.811964404427336e-06, + "loss": 0.1357, + "step": 6613 + }, + { + "epoch": 0.6093886764638136, + "grad_norm": 0.8733218006894725, + "learning_rate": 1.811232071316347e-06, + "loss": 0.1118, + "step": 6614 + }, + { + "epoch": 0.6094808126410836, + "grad_norm": 0.8841490921267199, + "learning_rate": 1.8104998021575337e-06, + "loss": 0.1295, + "step": 6615 + }, + { + "epoch": 0.6095729488183536, + "grad_norm": 0.9359678194340786, + "learning_rate": 1.8097675970188894e-06, + "loss": 0.1327, + "step": 6616 + }, + { + "epoch": 0.6096650849956236, + "grad_norm": 0.9605463356795557, + "learning_rate": 1.8090354559683972e-06, + "loss": 0.1378, + "step": 6617 + }, + { + "epoch": 0.6097572211728935, + "grad_norm": 0.9054243649757274, + "learning_rate": 1.8083033790740368e-06, + "loss": 0.1289, + "step": 6618 + }, + { + "epoch": 0.6098493573501635, + "grad_norm": 0.8842290378463926, + "learning_rate": 1.8075713664037823e-06, + "loss": 0.1289, + "step": 6619 + }, + { + "epoch": 0.6099414935274335, + "grad_norm": 0.8947640414559052, + "learning_rate": 1.806839418025601e-06, + "loss": 0.1275, + "step": 6620 + }, + { + "epoch": 0.6100336297047035, + "grad_norm": 0.921951402811935, + "learning_rate": 1.8061075340074556e-06, + "loss": 0.1297, + "step": 6621 + }, + { + "epoch": 0.6101257658819735, + "grad_norm": 0.9294620374457182, + "learning_rate": 1.8053757144172987e-06, + "loss": 0.1237, + "step": 6622 + }, + { + "epoch": 0.6102179020592435, + "grad_norm": 0.8834759916422078, + "learning_rate": 1.8046439593230822e-06, + "loss": 0.1147, + "step": 6623 + }, + { + "epoch": 0.6103100382365135, + "grad_norm": 0.9385060723312659, + "learning_rate": 1.8039122687927485e-06, + "loss": 0.1386, + "step": 6624 + }, + { + "epoch": 0.6104021744137835, + "grad_norm": 0.9310030772186891, + "learning_rate": 1.803180642894236e-06, + "loss": 0.1389, + "step": 6625 + }, + { + "epoch": 0.6104943105910535, + "grad_norm": 0.8966613217875679, + "learning_rate": 1.802449081695476e-06, + "loss": 0.1269, + "step": 6626 + }, + { + "epoch": 0.6105864467683236, + "grad_norm": 0.8572229985519388, + "learning_rate": 1.801717585264393e-06, + "loss": 0.1282, + "step": 6627 + }, + { + "epoch": 0.6106785829455936, + "grad_norm": 0.930161237084897, + "learning_rate": 1.800986153668908e-06, + "loss": 0.125, + "step": 6628 + }, + { + "epoch": 0.6107707191228636, + "grad_norm": 0.9925047377703466, + "learning_rate": 1.8002547869769344e-06, + "loss": 0.1297, + "step": 6629 + }, + { + "epoch": 0.6108628553001336, + "grad_norm": 0.8971787402961252, + "learning_rate": 1.7995234852563779e-06, + "loss": 0.1228, + "step": 6630 + }, + { + "epoch": 0.6109549914774036, + "grad_norm": 0.9592466914464529, + "learning_rate": 1.7987922485751418e-06, + "loss": 0.1383, + "step": 6631 + }, + { + "epoch": 0.6110471276546736, + "grad_norm": 0.9101582801987562, + "learning_rate": 1.7980610770011203e-06, + "loss": 0.1217, + "step": 6632 + }, + { + "epoch": 0.6111392638319436, + "grad_norm": 0.9388719801247791, + "learning_rate": 1.7973299706022046e-06, + "loss": 0.1289, + "step": 6633 + }, + { + "epoch": 0.6112314000092136, + "grad_norm": 0.8983598124593635, + "learning_rate": 1.796598929446276e-06, + "loss": 0.1196, + "step": 6634 + }, + { + "epoch": 0.6113235361864836, + "grad_norm": 0.9182147997516233, + "learning_rate": 1.7958679536012118e-06, + "loss": 0.1207, + "step": 6635 + }, + { + "epoch": 0.6114156723637536, + "grad_norm": 0.8619585830126839, + "learning_rate": 1.795137043134885e-06, + "loss": 0.1232, + "step": 6636 + }, + { + "epoch": 0.6115078085410236, + "grad_norm": 1.0263119069785722, + "learning_rate": 1.7944061981151598e-06, + "loss": 0.1374, + "step": 6637 + }, + { + "epoch": 0.6115999447182936, + "grad_norm": 0.9085278130780867, + "learning_rate": 1.793675418609894e-06, + "loss": 0.124, + "step": 6638 + }, + { + "epoch": 0.6116920808955636, + "grad_norm": 0.890303263816946, + "learning_rate": 1.792944704686942e-06, + "loss": 0.1253, + "step": 6639 + }, + { + "epoch": 0.6117842170728337, + "grad_norm": 0.8922482946265187, + "learning_rate": 1.79221405641415e-06, + "loss": 0.1204, + "step": 6640 + }, + { + "epoch": 0.6118763532501037, + "grad_norm": 0.9219285468355103, + "learning_rate": 1.79148347385936e-06, + "loss": 0.1339, + "step": 6641 + }, + { + "epoch": 0.6119684894273737, + "grad_norm": 0.904392406416535, + "learning_rate": 1.790752957090405e-06, + "loss": 0.1285, + "step": 6642 + }, + { + "epoch": 0.6120606256046437, + "grad_norm": 0.9040831409584187, + "learning_rate": 1.7900225061751136e-06, + "loss": 0.1305, + "step": 6643 + }, + { + "epoch": 0.6121527617819137, + "grad_norm": 0.8813255919074837, + "learning_rate": 1.78929212118131e-06, + "loss": 0.1205, + "step": 6644 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.8850407998747193, + "learning_rate": 1.7885618021768097e-06, + "loss": 0.1253, + "step": 6645 + }, + { + "epoch": 0.6123370341364537, + "grad_norm": 0.9142336900388941, + "learning_rate": 1.7878315492294213e-06, + "loss": 0.1248, + "step": 6646 + }, + { + "epoch": 0.6124291703137237, + "grad_norm": 0.9290446184856952, + "learning_rate": 1.7871013624069511e-06, + "loss": 0.1391, + "step": 6647 + }, + { + "epoch": 0.6125213064909937, + "grad_norm": 0.9968257121332454, + "learning_rate": 1.7863712417771956e-06, + "loss": 0.1348, + "step": 6648 + }, + { + "epoch": 0.6126134426682637, + "grad_norm": 0.9863639721347642, + "learning_rate": 1.7856411874079475e-06, + "loss": 0.1402, + "step": 6649 + }, + { + "epoch": 0.6127055788455337, + "grad_norm": 0.9698492821648672, + "learning_rate": 1.7849111993669924e-06, + "loss": 0.1345, + "step": 6650 + }, + { + "epoch": 0.6127977150228037, + "grad_norm": 0.9558952945055265, + "learning_rate": 1.7841812777221085e-06, + "loss": 0.1246, + "step": 6651 + }, + { + "epoch": 0.6128898512000737, + "grad_norm": 0.9316749933029452, + "learning_rate": 1.7834514225410704e-06, + "loss": 0.1283, + "step": 6652 + }, + { + "epoch": 0.6129819873773437, + "grad_norm": 0.9578132894362446, + "learning_rate": 1.7827216338916444e-06, + "loss": 0.1362, + "step": 6653 + }, + { + "epoch": 0.6130741235546138, + "grad_norm": 0.9942396128253892, + "learning_rate": 1.781991911841593e-06, + "loss": 0.1384, + "step": 6654 + }, + { + "epoch": 0.6131662597318838, + "grad_norm": 0.8876883802815898, + "learning_rate": 1.7812622564586695e-06, + "loss": 0.1197, + "step": 6655 + }, + { + "epoch": 0.6132583959091538, + "grad_norm": 0.9207165138441854, + "learning_rate": 1.7805326678106221e-06, + "loss": 0.1306, + "step": 6656 + }, + { + "epoch": 0.6133505320864238, + "grad_norm": 0.8709068748396972, + "learning_rate": 1.7798031459651942e-06, + "loss": 0.1205, + "step": 6657 + }, + { + "epoch": 0.6134426682636938, + "grad_norm": 0.8609800545742414, + "learning_rate": 1.7790736909901228e-06, + "loss": 0.1251, + "step": 6658 + }, + { + "epoch": 0.6135348044409638, + "grad_norm": 0.8481561432664178, + "learning_rate": 1.7783443029531352e-06, + "loss": 0.1252, + "step": 6659 + }, + { + "epoch": 0.6136269406182338, + "grad_norm": 0.9299442690236165, + "learning_rate": 1.7776149819219574e-06, + "loss": 0.1355, + "step": 6660 + }, + { + "epoch": 0.6137190767955037, + "grad_norm": 0.9289067001977687, + "learning_rate": 1.776885727964306e-06, + "loss": 0.1263, + "step": 6661 + }, + { + "epoch": 0.6138112129727737, + "grad_norm": 0.8770126865790537, + "learning_rate": 1.7761565411478935e-06, + "loss": 0.1341, + "step": 6662 + }, + { + "epoch": 0.6139033491500437, + "grad_norm": 0.8953296834576114, + "learning_rate": 1.7754274215404234e-06, + "loss": 0.139, + "step": 6663 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 0.9011900760471201, + "learning_rate": 1.7746983692095947e-06, + "loss": 0.1273, + "step": 6664 + }, + { + "epoch": 0.6140876215045837, + "grad_norm": 0.9071509181765444, + "learning_rate": 1.7739693842231008e-06, + "loss": 0.1192, + "step": 6665 + }, + { + "epoch": 0.6141797576818537, + "grad_norm": 0.953660969909198, + "learning_rate": 1.7732404666486289e-06, + "loss": 0.1402, + "step": 6666 + }, + { + "epoch": 0.6142718938591237, + "grad_norm": 0.9647631217108921, + "learning_rate": 1.7725116165538564e-06, + "loss": 0.1391, + "step": 6667 + }, + { + "epoch": 0.6143640300363938, + "grad_norm": 0.8881822165996579, + "learning_rate": 1.7717828340064592e-06, + "loss": 0.1262, + "step": 6668 + }, + { + "epoch": 0.6144561662136638, + "grad_norm": 0.9178819920338384, + "learning_rate": 1.7710541190741037e-06, + "loss": 0.1242, + "step": 6669 + }, + { + "epoch": 0.6145483023909338, + "grad_norm": 0.9312720344544685, + "learning_rate": 1.7703254718244525e-06, + "loss": 0.1339, + "step": 6670 + }, + { + "epoch": 0.6146404385682038, + "grad_norm": 0.9442221013026411, + "learning_rate": 1.7695968923251593e-06, + "loss": 0.1309, + "step": 6671 + }, + { + "epoch": 0.6147325747454738, + "grad_norm": 0.9321429789620098, + "learning_rate": 1.7688683806438731e-06, + "loss": 0.1339, + "step": 6672 + }, + { + "epoch": 0.6148247109227438, + "grad_norm": 0.9000161388757578, + "learning_rate": 1.7681399368482367e-06, + "loss": 0.1306, + "step": 6673 + }, + { + "epoch": 0.6149168471000138, + "grad_norm": 0.9606303166573105, + "learning_rate": 1.7674115610058864e-06, + "loss": 0.133, + "step": 6674 + }, + { + "epoch": 0.6150089832772838, + "grad_norm": 0.9437709741521769, + "learning_rate": 1.7666832531844508e-06, + "loss": 0.1258, + "step": 6675 + }, + { + "epoch": 0.6151011194545538, + "grad_norm": 0.9434879871809514, + "learning_rate": 1.765955013451554e-06, + "loss": 0.1313, + "step": 6676 + }, + { + "epoch": 0.6151932556318238, + "grad_norm": 0.9881526594306695, + "learning_rate": 1.765226841874813e-06, + "loss": 0.1447, + "step": 6677 + }, + { + "epoch": 0.6152853918090938, + "grad_norm": 0.9328846527215926, + "learning_rate": 1.7644987385218395e-06, + "loss": 0.1345, + "step": 6678 + }, + { + "epoch": 0.6153775279863638, + "grad_norm": 0.9727818346921414, + "learning_rate": 1.7637707034602378e-06, + "loss": 0.1464, + "step": 6679 + }, + { + "epoch": 0.6154696641636338, + "grad_norm": 0.9213397997962777, + "learning_rate": 1.7630427367576042e-06, + "loss": 0.131, + "step": 6680 + }, + { + "epoch": 0.6155618003409039, + "grad_norm": 0.9437472003416133, + "learning_rate": 1.7623148384815326e-06, + "loss": 0.1251, + "step": 6681 + }, + { + "epoch": 0.6156539365181739, + "grad_norm": 0.9166929533684961, + "learning_rate": 1.7615870086996067e-06, + "loss": 0.1384, + "step": 6682 + }, + { + "epoch": 0.6157460726954439, + "grad_norm": 0.9249383528462048, + "learning_rate": 1.7608592474794078e-06, + "loss": 0.1295, + "step": 6683 + }, + { + "epoch": 0.6158382088727139, + "grad_norm": 0.9679979891312327, + "learning_rate": 1.760131554888507e-06, + "loss": 0.1295, + "step": 6684 + }, + { + "epoch": 0.6159303450499839, + "grad_norm": 0.9300305618407569, + "learning_rate": 1.7594039309944702e-06, + "loss": 0.1392, + "step": 6685 + }, + { + "epoch": 0.6160224812272539, + "grad_norm": 0.9864960844070007, + "learning_rate": 1.7586763758648587e-06, + "loss": 0.1485, + "step": 6686 + }, + { + "epoch": 0.6161146174045239, + "grad_norm": 0.91785390219516, + "learning_rate": 1.757948889567226e-06, + "loss": 0.1287, + "step": 6687 + }, + { + "epoch": 0.6162067535817939, + "grad_norm": 0.8841484437270067, + "learning_rate": 1.7572214721691178e-06, + "loss": 0.1213, + "step": 6688 + }, + { + "epoch": 0.6162988897590639, + "grad_norm": 0.9395321082526199, + "learning_rate": 1.7564941237380761e-06, + "loss": 0.1293, + "step": 6689 + }, + { + "epoch": 0.6163910259363339, + "grad_norm": 0.8857358471622105, + "learning_rate": 1.7557668443416348e-06, + "loss": 0.124, + "step": 6690 + }, + { + "epoch": 0.6164831621136039, + "grad_norm": 0.903927286093644, + "learning_rate": 1.755039634047323e-06, + "loss": 0.1228, + "step": 6691 + }, + { + "epoch": 0.6165752982908739, + "grad_norm": 0.9233988812330909, + "learning_rate": 1.7543124929226608e-06, + "loss": 0.1355, + "step": 6692 + }, + { + "epoch": 0.6166674344681439, + "grad_norm": 0.9142573934713351, + "learning_rate": 1.7535854210351635e-06, + "loss": 0.1368, + "step": 6693 + }, + { + "epoch": 0.6167595706454139, + "grad_norm": 0.8897398795942182, + "learning_rate": 1.7528584184523407e-06, + "loss": 0.1232, + "step": 6694 + }, + { + "epoch": 0.616851706822684, + "grad_norm": 0.9684599266466104, + "learning_rate": 1.7521314852416946e-06, + "loss": 0.1412, + "step": 6695 + }, + { + "epoch": 0.616943842999954, + "grad_norm": 0.8633355016137957, + "learning_rate": 1.7514046214707195e-06, + "loss": 0.1163, + "step": 6696 + }, + { + "epoch": 0.617035979177224, + "grad_norm": 0.9556729220047715, + "learning_rate": 1.7506778272069064e-06, + "loss": 0.1381, + "step": 6697 + }, + { + "epoch": 0.617128115354494, + "grad_norm": 0.8849642829869895, + "learning_rate": 1.7499511025177373e-06, + "loss": 0.1264, + "step": 6698 + }, + { + "epoch": 0.617220251531764, + "grad_norm": 0.9186363005241283, + "learning_rate": 1.7492244474706898e-06, + "loss": 0.1423, + "step": 6699 + }, + { + "epoch": 0.617312387709034, + "grad_norm": 0.8928578767583446, + "learning_rate": 1.748497862133233e-06, + "loss": 0.1265, + "step": 6700 + }, + { + "epoch": 0.617404523886304, + "grad_norm": 0.8535895801436509, + "learning_rate": 1.7477713465728296e-06, + "loss": 0.1246, + "step": 6701 + }, + { + "epoch": 0.617496660063574, + "grad_norm": 0.9062694992292313, + "learning_rate": 1.747044900856938e-06, + "loss": 0.1244, + "step": 6702 + }, + { + "epoch": 0.617588796240844, + "grad_norm": 1.0026481306418769, + "learning_rate": 1.7463185250530102e-06, + "loss": 0.1309, + "step": 6703 + }, + { + "epoch": 0.617680932418114, + "grad_norm": 0.9401301173996884, + "learning_rate": 1.7455922192284864e-06, + "loss": 0.1313, + "step": 6704 + }, + { + "epoch": 0.617773068595384, + "grad_norm": 0.8910759203702324, + "learning_rate": 1.7448659834508064e-06, + "loss": 0.1298, + "step": 6705 + }, + { + "epoch": 0.6178652047726539, + "grad_norm": 0.9252349150341446, + "learning_rate": 1.7441398177874015e-06, + "loss": 0.1308, + "step": 6706 + }, + { + "epoch": 0.6179573409499239, + "grad_norm": 0.8962704413424406, + "learning_rate": 1.7434137223056954e-06, + "loss": 0.1253, + "step": 6707 + }, + { + "epoch": 0.618049477127194, + "grad_norm": 0.9382417961474702, + "learning_rate": 1.7426876970731076e-06, + "loss": 0.1267, + "step": 6708 + }, + { + "epoch": 0.618141613304464, + "grad_norm": 0.8782716906829233, + "learning_rate": 1.7419617421570483e-06, + "loss": 0.1298, + "step": 6709 + }, + { + "epoch": 0.618233749481734, + "grad_norm": 0.8877730559324124, + "learning_rate": 1.7412358576249222e-06, + "loss": 0.1242, + "step": 6710 + }, + { + "epoch": 0.618325885659004, + "grad_norm": 1.031141967978923, + "learning_rate": 1.740510043544129e-06, + "loss": 0.1433, + "step": 6711 + }, + { + "epoch": 0.618418021836274, + "grad_norm": 0.8924542958205756, + "learning_rate": 1.7397842999820605e-06, + "loss": 0.1228, + "step": 6712 + }, + { + "epoch": 0.618510158013544, + "grad_norm": 0.951354922791097, + "learning_rate": 1.7390586270061005e-06, + "loss": 0.1304, + "step": 6713 + }, + { + "epoch": 0.618602294190814, + "grad_norm": 0.963162432927016, + "learning_rate": 1.7383330246836294e-06, + "loss": 0.1266, + "step": 6714 + }, + { + "epoch": 0.618694430368084, + "grad_norm": 0.8984043647800838, + "learning_rate": 1.737607493082018e-06, + "loss": 0.1252, + "step": 6715 + }, + { + "epoch": 0.618786566545354, + "grad_norm": 0.9279719290802259, + "learning_rate": 1.7368820322686345e-06, + "loss": 0.1294, + "step": 6716 + }, + { + "epoch": 0.618878702722624, + "grad_norm": 0.952211601716075, + "learning_rate": 1.7361566423108355e-06, + "loss": 0.1312, + "step": 6717 + }, + { + "epoch": 0.618970838899894, + "grad_norm": 0.8749680465755658, + "learning_rate": 1.7354313232759745e-06, + "loss": 0.1179, + "step": 6718 + }, + { + "epoch": 0.619062975077164, + "grad_norm": 0.8577709338059307, + "learning_rate": 1.7347060752313978e-06, + "loss": 0.1235, + "step": 6719 + }, + { + "epoch": 0.619155111254434, + "grad_norm": 0.955540440918245, + "learning_rate": 1.7339808982444444e-06, + "loss": 0.1394, + "step": 6720 + }, + { + "epoch": 0.619247247431704, + "grad_norm": 0.9616325266472617, + "learning_rate": 1.7332557923824463e-06, + "loss": 0.1355, + "step": 6721 + }, + { + "epoch": 0.6193393836089741, + "grad_norm": 0.9302492116295052, + "learning_rate": 1.732530757712731e-06, + "loss": 0.121, + "step": 6722 + }, + { + "epoch": 0.6194315197862441, + "grad_norm": 0.8914909942723336, + "learning_rate": 1.7318057943026169e-06, + "loss": 0.1194, + "step": 6723 + }, + { + "epoch": 0.6195236559635141, + "grad_norm": 0.8862393627090467, + "learning_rate": 1.7310809022194184e-06, + "loss": 0.1279, + "step": 6724 + }, + { + "epoch": 0.6196157921407841, + "grad_norm": 0.8809958029695578, + "learning_rate": 1.7303560815304404e-06, + "loss": 0.1274, + "step": 6725 + }, + { + "epoch": 0.6197079283180541, + "grad_norm": 0.9235688604500591, + "learning_rate": 1.7296313323029825e-06, + "loss": 0.129, + "step": 6726 + }, + { + "epoch": 0.6198000644953241, + "grad_norm": 0.9150737133599645, + "learning_rate": 1.7289066546043386e-06, + "loss": 0.1189, + "step": 6727 + }, + { + "epoch": 0.6198922006725941, + "grad_norm": 0.9212605410966181, + "learning_rate": 1.7281820485017958e-06, + "loss": 0.1359, + "step": 6728 + }, + { + "epoch": 0.6199843368498641, + "grad_norm": 0.8655563428055735, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.1287, + "step": 6729 + }, + { + "epoch": 0.6200764730271341, + "grad_norm": 0.9011894322767224, + "learning_rate": 1.726733051354121e-06, + "loss": 0.1325, + "step": 6730 + }, + { + "epoch": 0.6201686092044041, + "grad_norm": 0.8678920213692736, + "learning_rate": 1.7260086604435295e-06, + "loss": 0.1222, + "step": 6731 + }, + { + "epoch": 0.6202607453816741, + "grad_norm": 0.9225506544414821, + "learning_rate": 1.7252843413981176e-06, + "loss": 0.1291, + "step": 6732 + }, + { + "epoch": 0.6203528815589441, + "grad_norm": 0.8741405868411134, + "learning_rate": 1.7245600942851378e-06, + "loss": 0.1168, + "step": 6733 + }, + { + "epoch": 0.6204450177362141, + "grad_norm": 0.88964218818895, + "learning_rate": 1.7238359191718362e-06, + "loss": 0.1158, + "step": 6734 + }, + { + "epoch": 0.6205371539134841, + "grad_norm": 0.8860723797025962, + "learning_rate": 1.7231118161254534e-06, + "loss": 0.1201, + "step": 6735 + }, + { + "epoch": 0.6206292900907542, + "grad_norm": 0.9866557709375795, + "learning_rate": 1.7223877852132218e-06, + "loss": 0.1349, + "step": 6736 + }, + { + "epoch": 0.6207214262680242, + "grad_norm": 0.8892445642733556, + "learning_rate": 1.721663826502369e-06, + "loss": 0.1218, + "step": 6737 + }, + { + "epoch": 0.6208135624452942, + "grad_norm": 0.9688094575884019, + "learning_rate": 1.7209399400601128e-06, + "loss": 0.1346, + "step": 6738 + }, + { + "epoch": 0.6209056986225642, + "grad_norm": 0.926838705061562, + "learning_rate": 1.720216125953667e-06, + "loss": 0.1334, + "step": 6739 + }, + { + "epoch": 0.6209978347998342, + "grad_norm": 0.9396059515548089, + "learning_rate": 1.7194923842502382e-06, + "loss": 0.1324, + "step": 6740 + }, + { + "epoch": 0.6210899709771042, + "grad_norm": 0.9830328224533756, + "learning_rate": 1.7187687150170257e-06, + "loss": 0.1345, + "step": 6741 + }, + { + "epoch": 0.6211821071543742, + "grad_norm": 0.9430353714861338, + "learning_rate": 1.7180451183212217e-06, + "loss": 0.1211, + "step": 6742 + }, + { + "epoch": 0.6212742433316442, + "grad_norm": 0.9121922967727294, + "learning_rate": 1.7173215942300125e-06, + "loss": 0.1428, + "step": 6743 + }, + { + "epoch": 0.6213663795089142, + "grad_norm": 0.8985273162486016, + "learning_rate": 1.7165981428105771e-06, + "loss": 0.14, + "step": 6744 + }, + { + "epoch": 0.6214585156861842, + "grad_norm": 0.9074990241015248, + "learning_rate": 1.71587476413009e-06, + "loss": 0.1354, + "step": 6745 + }, + { + "epoch": 0.6215506518634542, + "grad_norm": 0.8835713991487871, + "learning_rate": 1.7151514582557144e-06, + "loss": 0.1295, + "step": 6746 + }, + { + "epoch": 0.6216427880407241, + "grad_norm": 0.9563138522145045, + "learning_rate": 1.71442822525461e-06, + "loss": 0.1472, + "step": 6747 + }, + { + "epoch": 0.6217349242179941, + "grad_norm": 0.9397063902023289, + "learning_rate": 1.71370506519393e-06, + "loss": 0.1251, + "step": 6748 + }, + { + "epoch": 0.6218270603952643, + "grad_norm": 0.8707621777233324, + "learning_rate": 1.7129819781408197e-06, + "loss": 0.1278, + "step": 6749 + }, + { + "epoch": 0.6219191965725342, + "grad_norm": 0.8388729362903343, + "learning_rate": 1.7122589641624166e-06, + "loss": 0.1181, + "step": 6750 + }, + { + "epoch": 0.6220113327498042, + "grad_norm": 0.9139806446176466, + "learning_rate": 1.7115360233258537e-06, + "loss": 0.1202, + "step": 6751 + }, + { + "epoch": 0.6221034689270742, + "grad_norm": 0.9311401798162033, + "learning_rate": 1.7108131556982554e-06, + "loss": 0.1372, + "step": 6752 + }, + { + "epoch": 0.6221956051043442, + "grad_norm": 0.9511568955126489, + "learning_rate": 1.7100903613467419e-06, + "loss": 0.1214, + "step": 6753 + }, + { + "epoch": 0.6222877412816142, + "grad_norm": 0.9239186752009969, + "learning_rate": 1.7093676403384223e-06, + "loss": 0.1301, + "step": 6754 + }, + { + "epoch": 0.6223798774588842, + "grad_norm": 0.913588956394364, + "learning_rate": 1.7086449927404025e-06, + "loss": 0.1243, + "step": 6755 + }, + { + "epoch": 0.6224720136361542, + "grad_norm": 0.905711339806985, + "learning_rate": 1.7079224186197804e-06, + "loss": 0.1294, + "step": 6756 + }, + { + "epoch": 0.6225641498134242, + "grad_norm": 0.9681244319244915, + "learning_rate": 1.7071999180436477e-06, + "loss": 0.133, + "step": 6757 + }, + { + "epoch": 0.6226562859906942, + "grad_norm": 0.9155233722559435, + "learning_rate": 1.7064774910790865e-06, + "loss": 0.1234, + "step": 6758 + }, + { + "epoch": 0.6227484221679642, + "grad_norm": 0.9435640978336151, + "learning_rate": 1.7057551377931767e-06, + "loss": 0.1307, + "step": 6759 + }, + { + "epoch": 0.6228405583452342, + "grad_norm": 0.9545102508423661, + "learning_rate": 1.705032858252987e-06, + "loss": 0.1422, + "step": 6760 + }, + { + "epoch": 0.6229326945225042, + "grad_norm": 0.9415499869138108, + "learning_rate": 1.7043106525255831e-06, + "loss": 0.1395, + "step": 6761 + }, + { + "epoch": 0.6230248306997742, + "grad_norm": 0.905530428956616, + "learning_rate": 1.70358852067802e-06, + "loss": 0.1226, + "step": 6762 + }, + { + "epoch": 0.6231169668770443, + "grad_norm": 0.9482156251625352, + "learning_rate": 1.7028664627773483e-06, + "loss": 0.138, + "step": 6763 + }, + { + "epoch": 0.6232091030543143, + "grad_norm": 0.8947426819048082, + "learning_rate": 1.7021444788906117e-06, + "loss": 0.1271, + "step": 6764 + }, + { + "epoch": 0.6233012392315843, + "grad_norm": 0.9270692925867124, + "learning_rate": 1.7014225690848458e-06, + "loss": 0.1285, + "step": 6765 + }, + { + "epoch": 0.6233933754088543, + "grad_norm": 0.8786188858935942, + "learning_rate": 1.7007007334270809e-06, + "loss": 0.1205, + "step": 6766 + }, + { + "epoch": 0.6234855115861243, + "grad_norm": 0.9163708159636691, + "learning_rate": 1.6999789719843388e-06, + "loss": 0.1272, + "step": 6767 + }, + { + "epoch": 0.6235776477633943, + "grad_norm": 0.8979496261233568, + "learning_rate": 1.6992572848236343e-06, + "loss": 0.1181, + "step": 6768 + }, + { + "epoch": 0.6236697839406643, + "grad_norm": 0.9156614779990734, + "learning_rate": 1.698535672011978e-06, + "loss": 0.1341, + "step": 6769 + }, + { + "epoch": 0.6237619201179343, + "grad_norm": 0.9391056956844751, + "learning_rate": 1.6978141336163713e-06, + "loss": 0.1263, + "step": 6770 + }, + { + "epoch": 0.6238540562952043, + "grad_norm": 0.8858195058935551, + "learning_rate": 1.6970926697038073e-06, + "loss": 0.1247, + "step": 6771 + }, + { + "epoch": 0.6239461924724743, + "grad_norm": 0.8735010962850701, + "learning_rate": 1.6963712803412761e-06, + "loss": 0.126, + "step": 6772 + }, + { + "epoch": 0.6240383286497443, + "grad_norm": 0.9453628701261408, + "learning_rate": 1.6956499655957577e-06, + "loss": 0.1434, + "step": 6773 + }, + { + "epoch": 0.6241304648270143, + "grad_norm": 0.9657158049888763, + "learning_rate": 1.694928725534227e-06, + "loss": 0.1342, + "step": 6774 + }, + { + "epoch": 0.6242226010042843, + "grad_norm": 0.9759493120509639, + "learning_rate": 1.6942075602236507e-06, + "loss": 0.125, + "step": 6775 + }, + { + "epoch": 0.6243147371815544, + "grad_norm": 0.9688092020766974, + "learning_rate": 1.6934864697309883e-06, + "loss": 0.1469, + "step": 6776 + }, + { + "epoch": 0.6244068733588244, + "grad_norm": 0.9382127371598054, + "learning_rate": 1.6927654541231941e-06, + "loss": 0.1304, + "step": 6777 + }, + { + "epoch": 0.6244990095360944, + "grad_norm": 0.9522838200564424, + "learning_rate": 1.6920445134672162e-06, + "loss": 0.126, + "step": 6778 + }, + { + "epoch": 0.6245911457133644, + "grad_norm": 0.94904956281575, + "learning_rate": 1.6913236478299906e-06, + "loss": 0.1314, + "step": 6779 + }, + { + "epoch": 0.6246832818906344, + "grad_norm": 0.9386781279946481, + "learning_rate": 1.6906028572784511e-06, + "loss": 0.1298, + "step": 6780 + }, + { + "epoch": 0.6247754180679044, + "grad_norm": 0.9081504529487514, + "learning_rate": 1.6898821418795237e-06, + "loss": 0.1349, + "step": 6781 + }, + { + "epoch": 0.6248675542451744, + "grad_norm": 0.8914673156367624, + "learning_rate": 1.6891615017001272e-06, + "loss": 0.1137, + "step": 6782 + }, + { + "epoch": 0.6249596904224444, + "grad_norm": 0.891940546485863, + "learning_rate": 1.6884409368071718e-06, + "loss": 0.1231, + "step": 6783 + }, + { + "epoch": 0.6250518265997144, + "grad_norm": 0.914512509293828, + "learning_rate": 1.6877204472675634e-06, + "loss": 0.1291, + "step": 6784 + }, + { + "epoch": 0.6251439627769844, + "grad_norm": 0.8841333746458065, + "learning_rate": 1.687000033148198e-06, + "loss": 0.1289, + "step": 6785 + }, + { + "epoch": 0.6252360989542544, + "grad_norm": 0.9418903021503365, + "learning_rate": 1.686279694515968e-06, + "loss": 0.1313, + "step": 6786 + }, + { + "epoch": 0.6253282351315244, + "grad_norm": 0.8870373175179938, + "learning_rate": 1.685559431437756e-06, + "loss": 0.1176, + "step": 6787 + }, + { + "epoch": 0.6254203713087944, + "grad_norm": 0.8973213859270821, + "learning_rate": 1.6848392439804374e-06, + "loss": 0.1247, + "step": 6788 + }, + { + "epoch": 0.6255125074860644, + "grad_norm": 0.9056535481430396, + "learning_rate": 1.6841191322108835e-06, + "loss": 0.1316, + "step": 6789 + }, + { + "epoch": 0.6256046436633345, + "grad_norm": 0.8611185014978975, + "learning_rate": 1.6833990961959562e-06, + "loss": 0.1105, + "step": 6790 + }, + { + "epoch": 0.6256967798406045, + "grad_norm": 0.8714595196924181, + "learning_rate": 1.6826791360025103e-06, + "loss": 0.118, + "step": 6791 + }, + { + "epoch": 0.6257889160178745, + "grad_norm": 0.939216708887262, + "learning_rate": 1.6819592516973942e-06, + "loss": 0.1294, + "step": 6792 + }, + { + "epoch": 0.6258810521951444, + "grad_norm": 0.8805849507628835, + "learning_rate": 1.6812394433474497e-06, + "loss": 0.1303, + "step": 6793 + }, + { + "epoch": 0.6259731883724144, + "grad_norm": 0.9904622658783916, + "learning_rate": 1.6805197110195115e-06, + "loss": 0.1366, + "step": 6794 + }, + { + "epoch": 0.6260653245496844, + "grad_norm": 0.8452414433909419, + "learning_rate": 1.6798000547804066e-06, + "loss": 0.1138, + "step": 6795 + }, + { + "epoch": 0.6261574607269544, + "grad_norm": 0.9647793184636386, + "learning_rate": 1.6790804746969542e-06, + "loss": 0.1318, + "step": 6796 + }, + { + "epoch": 0.6262495969042244, + "grad_norm": 0.9190751074302138, + "learning_rate": 1.6783609708359683e-06, + "loss": 0.1272, + "step": 6797 + }, + { + "epoch": 0.6263417330814944, + "grad_norm": 1.0135874834100627, + "learning_rate": 1.677641543264254e-06, + "loss": 0.1356, + "step": 6798 + }, + { + "epoch": 0.6264338692587644, + "grad_norm": 0.9578041665275557, + "learning_rate": 1.6769221920486123e-06, + "loss": 0.1333, + "step": 6799 + }, + { + "epoch": 0.6265260054360344, + "grad_norm": 0.9120272599831393, + "learning_rate": 1.676202917255833e-06, + "loss": 0.1236, + "step": 6800 + }, + { + "epoch": 0.6266181416133044, + "grad_norm": 0.9545336241142, + "learning_rate": 1.675483718952701e-06, + "loss": 0.1505, + "step": 6801 + }, + { + "epoch": 0.6267102777905744, + "grad_norm": 0.8996118102214581, + "learning_rate": 1.6747645972059949e-06, + "loss": 0.1156, + "step": 6802 + }, + { + "epoch": 0.6268024139678445, + "grad_norm": 0.9780721556358306, + "learning_rate": 1.6740455520824852e-06, + "loss": 0.1311, + "step": 6803 + }, + { + "epoch": 0.6268945501451145, + "grad_norm": 0.8756953274991542, + "learning_rate": 1.673326583648934e-06, + "loss": 0.1223, + "step": 6804 + }, + { + "epoch": 0.6269866863223845, + "grad_norm": 0.9590846378967668, + "learning_rate": 1.672607691972099e-06, + "loss": 0.1327, + "step": 6805 + }, + { + "epoch": 0.6270788224996545, + "grad_norm": 0.9725555167066449, + "learning_rate": 1.671888877118728e-06, + "loss": 0.1394, + "step": 6806 + }, + { + "epoch": 0.6271709586769245, + "grad_norm": 0.9680149550710828, + "learning_rate": 1.6711701391555654e-06, + "loss": 0.1458, + "step": 6807 + }, + { + "epoch": 0.6272630948541945, + "grad_norm": 0.9266144277449202, + "learning_rate": 1.6704514781493439e-06, + "loss": 0.1318, + "step": 6808 + }, + { + "epoch": 0.6273552310314645, + "grad_norm": 0.9044064187078539, + "learning_rate": 1.6697328941667911e-06, + "loss": 0.1309, + "step": 6809 + }, + { + "epoch": 0.6274473672087345, + "grad_norm": 0.9161445150421038, + "learning_rate": 1.6690143872746295e-06, + "loss": 0.1359, + "step": 6810 + }, + { + "epoch": 0.6275395033860045, + "grad_norm": 0.9039846306537077, + "learning_rate": 1.6682959575395717e-06, + "loss": 0.13, + "step": 6811 + }, + { + "epoch": 0.6276316395632745, + "grad_norm": 0.8368274819032405, + "learning_rate": 1.6675776050283228e-06, + "loss": 0.1117, + "step": 6812 + }, + { + "epoch": 0.6277237757405445, + "grad_norm": 0.9653527218946011, + "learning_rate": 1.666859329807583e-06, + "loss": 0.1362, + "step": 6813 + }, + { + "epoch": 0.6278159119178145, + "grad_norm": 0.9458104946736843, + "learning_rate": 1.666141131944044e-06, + "loss": 0.1323, + "step": 6814 + }, + { + "epoch": 0.6279080480950845, + "grad_norm": 0.8872800410857472, + "learning_rate": 1.6654230115043915e-06, + "loss": 0.1275, + "step": 6815 + }, + { + "epoch": 0.6280001842723545, + "grad_norm": 0.915733456696928, + "learning_rate": 1.6647049685553018e-06, + "loss": 0.1295, + "step": 6816 + }, + { + "epoch": 0.6280923204496246, + "grad_norm": 0.9299225804478112, + "learning_rate": 1.663987003163445e-06, + "loss": 0.1289, + "step": 6817 + }, + { + "epoch": 0.6281844566268946, + "grad_norm": 0.893925699523802, + "learning_rate": 1.6632691153954855e-06, + "loss": 0.1281, + "step": 6818 + }, + { + "epoch": 0.6282765928041646, + "grad_norm": 0.9610798463300602, + "learning_rate": 1.6625513053180791e-06, + "loss": 0.1255, + "step": 6819 + }, + { + "epoch": 0.6283687289814346, + "grad_norm": 0.9394658550305951, + "learning_rate": 1.6618335729978736e-06, + "loss": 0.1263, + "step": 6820 + }, + { + "epoch": 0.6284608651587046, + "grad_norm": 0.941465892891746, + "learning_rate": 1.661115918501511e-06, + "loss": 0.1254, + "step": 6821 + }, + { + "epoch": 0.6285530013359746, + "grad_norm": 0.9393058701052077, + "learning_rate": 1.6603983418956254e-06, + "loss": 0.1328, + "step": 6822 + }, + { + "epoch": 0.6286451375132446, + "grad_norm": 0.9625490206776947, + "learning_rate": 1.6596808432468445e-06, + "loss": 0.1394, + "step": 6823 + }, + { + "epoch": 0.6287372736905146, + "grad_norm": 0.934085002587979, + "learning_rate": 1.6589634226217883e-06, + "loss": 0.1328, + "step": 6824 + }, + { + "epoch": 0.6288294098677846, + "grad_norm": 0.9148072326121705, + "learning_rate": 1.6582460800870675e-06, + "loss": 0.1284, + "step": 6825 + }, + { + "epoch": 0.6289215460450546, + "grad_norm": 0.9431341023115458, + "learning_rate": 1.6575288157092898e-06, + "loss": 0.1236, + "step": 6826 + }, + { + "epoch": 0.6290136822223246, + "grad_norm": 1.0046117659117793, + "learning_rate": 1.6568116295550515e-06, + "loss": 0.1313, + "step": 6827 + }, + { + "epoch": 0.6291058183995946, + "grad_norm": 0.8757042152992928, + "learning_rate": 1.6560945216909451e-06, + "loss": 0.1144, + "step": 6828 + }, + { + "epoch": 0.6291979545768646, + "grad_norm": 0.8499443288587102, + "learning_rate": 1.6553774921835528e-06, + "loss": 0.1107, + "step": 6829 + }, + { + "epoch": 0.6292900907541346, + "grad_norm": 0.8959817137425333, + "learning_rate": 1.6546605410994507e-06, + "loss": 0.1255, + "step": 6830 + }, + { + "epoch": 0.6293822269314047, + "grad_norm": 0.8993665748741935, + "learning_rate": 1.6539436685052087e-06, + "loss": 0.1263, + "step": 6831 + }, + { + "epoch": 0.6294743631086747, + "grad_norm": 0.9006040587776355, + "learning_rate": 1.6532268744673887e-06, + "loss": 0.1154, + "step": 6832 + }, + { + "epoch": 0.6295664992859447, + "grad_norm": 0.8915285615887515, + "learning_rate": 1.6525101590525435e-06, + "loss": 0.1241, + "step": 6833 + }, + { + "epoch": 0.6296586354632147, + "grad_norm": 0.8626544961619003, + "learning_rate": 1.651793522327222e-06, + "loss": 0.1275, + "step": 6834 + }, + { + "epoch": 0.6297507716404847, + "grad_norm": 0.9064252376215327, + "learning_rate": 1.6510769643579625e-06, + "loss": 0.133, + "step": 6835 + }, + { + "epoch": 0.6298429078177546, + "grad_norm": 0.893570508838155, + "learning_rate": 1.6503604852112992e-06, + "loss": 0.1174, + "step": 6836 + }, + { + "epoch": 0.6299350439950246, + "grad_norm": 0.9209261279569575, + "learning_rate": 1.649644084953756e-06, + "loss": 0.1335, + "step": 6837 + }, + { + "epoch": 0.6300271801722946, + "grad_norm": 0.9612542183985056, + "learning_rate": 1.6489277636518503e-06, + "loss": 0.1364, + "step": 6838 + }, + { + "epoch": 0.6301193163495646, + "grad_norm": 0.9030891613008248, + "learning_rate": 1.6482115213720939e-06, + "loss": 0.128, + "step": 6839 + }, + { + "epoch": 0.6302114525268346, + "grad_norm": 0.892199661038204, + "learning_rate": 1.64749535818099e-06, + "loss": 0.1234, + "step": 6840 + }, + { + "epoch": 0.6303035887041046, + "grad_norm": 0.892341611186558, + "learning_rate": 1.6467792741450328e-06, + "loss": 0.1262, + "step": 6841 + }, + { + "epoch": 0.6303957248813746, + "grad_norm": 0.9303003796224806, + "learning_rate": 1.6460632693307122e-06, + "loss": 0.1332, + "step": 6842 + }, + { + "epoch": 0.6304878610586446, + "grad_norm": 0.971311474946169, + "learning_rate": 1.6453473438045088e-06, + "loss": 0.1282, + "step": 6843 + }, + { + "epoch": 0.6305799972359147, + "grad_norm": 0.9128566574816395, + "learning_rate": 1.644631497632897e-06, + "loss": 0.1268, + "step": 6844 + }, + { + "epoch": 0.6306721334131847, + "grad_norm": 0.9329959880146634, + "learning_rate": 1.6439157308823425e-06, + "loss": 0.1281, + "step": 6845 + }, + { + "epoch": 0.6307642695904547, + "grad_norm": 0.9643591086782031, + "learning_rate": 1.6432000436193042e-06, + "loss": 0.14, + "step": 6846 + }, + { + "epoch": 0.6308564057677247, + "grad_norm": 0.8949048912604123, + "learning_rate": 1.642484435910234e-06, + "loss": 0.1275, + "step": 6847 + }, + { + "epoch": 0.6309485419449947, + "grad_norm": 0.9294314454625718, + "learning_rate": 1.6417689078215771e-06, + "loss": 0.1317, + "step": 6848 + }, + { + "epoch": 0.6310406781222647, + "grad_norm": 0.9092264943448212, + "learning_rate": 1.6410534594197687e-06, + "loss": 0.1243, + "step": 6849 + }, + { + "epoch": 0.6311328142995347, + "grad_norm": 0.8915342261422562, + "learning_rate": 1.640338090771239e-06, + "loss": 0.126, + "step": 6850 + }, + { + "epoch": 0.6312249504768047, + "grad_norm": 0.9631374154156046, + "learning_rate": 1.6396228019424099e-06, + "loss": 0.1193, + "step": 6851 + }, + { + "epoch": 0.6313170866540747, + "grad_norm": 1.028510708487749, + "learning_rate": 1.6389075929996961e-06, + "loss": 0.1425, + "step": 6852 + }, + { + "epoch": 0.6314092228313447, + "grad_norm": 0.9457079527488705, + "learning_rate": 1.6381924640095065e-06, + "loss": 0.1232, + "step": 6853 + }, + { + "epoch": 0.6315013590086147, + "grad_norm": 0.8672064169139518, + "learning_rate": 1.6374774150382377e-06, + "loss": 0.1191, + "step": 6854 + }, + { + "epoch": 0.6315934951858847, + "grad_norm": 0.9792257903977444, + "learning_rate": 1.6367624461522841e-06, + "loss": 0.1303, + "step": 6855 + }, + { + "epoch": 0.6316856313631547, + "grad_norm": 0.9661371788971164, + "learning_rate": 1.6360475574180306e-06, + "loss": 0.136, + "step": 6856 + }, + { + "epoch": 0.6317777675404247, + "grad_norm": 0.9543594834857416, + "learning_rate": 1.635332748901855e-06, + "loss": 0.1294, + "step": 6857 + }, + { + "epoch": 0.6318699037176948, + "grad_norm": 0.8929324029831897, + "learning_rate": 1.6346180206701256e-06, + "loss": 0.1237, + "step": 6858 + }, + { + "epoch": 0.6319620398949648, + "grad_norm": 0.9010697000995774, + "learning_rate": 1.6339033727892067e-06, + "loss": 0.1271, + "step": 6859 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 0.8981085366716448, + "learning_rate": 1.6331888053254521e-06, + "loss": 0.1287, + "step": 6860 + }, + { + "epoch": 0.6321463122495048, + "grad_norm": 0.9238622764409322, + "learning_rate": 1.6324743183452113e-06, + "loss": 0.1316, + "step": 6861 + }, + { + "epoch": 0.6322384484267748, + "grad_norm": 0.9109979518487612, + "learning_rate": 1.631759911914823e-06, + "loss": 0.1232, + "step": 6862 + }, + { + "epoch": 0.6323305846040448, + "grad_norm": 0.9189914997104379, + "learning_rate": 1.63104558610062e-06, + "loss": 0.1279, + "step": 6863 + }, + { + "epoch": 0.6324227207813148, + "grad_norm": 0.9492831449731975, + "learning_rate": 1.630331340968928e-06, + "loss": 0.139, + "step": 6864 + }, + { + "epoch": 0.6325148569585848, + "grad_norm": 0.9602943007686003, + "learning_rate": 1.6296171765860651e-06, + "loss": 0.1392, + "step": 6865 + }, + { + "epoch": 0.6326069931358548, + "grad_norm": 0.96860825635109, + "learning_rate": 1.6289030930183403e-06, + "loss": 0.1261, + "step": 6866 + }, + { + "epoch": 0.6326991293131248, + "grad_norm": 0.9355225957084158, + "learning_rate": 1.6281890903320574e-06, + "loss": 0.1349, + "step": 6867 + }, + { + "epoch": 0.6327912654903948, + "grad_norm": 0.8994626792406648, + "learning_rate": 1.627475168593511e-06, + "loss": 0.1232, + "step": 6868 + }, + { + "epoch": 0.6328834016676648, + "grad_norm": 0.8585321066061194, + "learning_rate": 1.6267613278689898e-06, + "loss": 0.1172, + "step": 6869 + }, + { + "epoch": 0.6329755378449348, + "grad_norm": 1.0178195384314648, + "learning_rate": 1.626047568224773e-06, + "loss": 0.1346, + "step": 6870 + }, + { + "epoch": 0.6330676740222049, + "grad_norm": 0.9180606266375119, + "learning_rate": 1.625333889727133e-06, + "loss": 0.1305, + "step": 6871 + }, + { + "epoch": 0.6331598101994749, + "grad_norm": 0.86008781184332, + "learning_rate": 1.624620292442336e-06, + "loss": 0.1128, + "step": 6872 + }, + { + "epoch": 0.6332519463767449, + "grad_norm": 0.8953508403267575, + "learning_rate": 1.6239067764366396e-06, + "loss": 0.1295, + "step": 6873 + }, + { + "epoch": 0.6333440825540149, + "grad_norm": 0.9701241179780662, + "learning_rate": 1.6231933417762918e-06, + "loss": 0.132, + "step": 6874 + }, + { + "epoch": 0.6334362187312849, + "grad_norm": 0.8964266930393143, + "learning_rate": 1.6224799885275378e-06, + "loss": 0.131, + "step": 6875 + }, + { + "epoch": 0.6335283549085549, + "grad_norm": 0.8920990399552023, + "learning_rate": 1.6217667167566103e-06, + "loss": 0.1214, + "step": 6876 + }, + { + "epoch": 0.6336204910858249, + "grad_norm": 0.9034818516436899, + "learning_rate": 1.6210535265297389e-06, + "loss": 0.1195, + "step": 6877 + }, + { + "epoch": 0.6337126272630949, + "grad_norm": 0.9099483703963683, + "learning_rate": 1.6203404179131415e-06, + "loss": 0.1269, + "step": 6878 + }, + { + "epoch": 0.6338047634403648, + "grad_norm": 0.919892064573786, + "learning_rate": 1.6196273909730303e-06, + "loss": 0.1291, + "step": 6879 + }, + { + "epoch": 0.6338968996176348, + "grad_norm": 0.9388586022519625, + "learning_rate": 1.6189144457756118e-06, + "loss": 0.1282, + "step": 6880 + }, + { + "epoch": 0.6339890357949048, + "grad_norm": 0.9493861638877241, + "learning_rate": 1.6182015823870805e-06, + "loss": 0.129, + "step": 6881 + }, + { + "epoch": 0.6340811719721748, + "grad_norm": 0.9149259117449804, + "learning_rate": 1.617488800873629e-06, + "loss": 0.1364, + "step": 6882 + }, + { + "epoch": 0.6341733081494448, + "grad_norm": 0.9344518751317833, + "learning_rate": 1.616776101301436e-06, + "loss": 0.1343, + "step": 6883 + }, + { + "epoch": 0.6342654443267148, + "grad_norm": 0.8832340238190585, + "learning_rate": 1.6160634837366771e-06, + "loss": 0.1162, + "step": 6884 + }, + { + "epoch": 0.6343575805039849, + "grad_norm": 0.8869408664575485, + "learning_rate": 1.615350948245519e-06, + "loss": 0.1304, + "step": 6885 + }, + { + "epoch": 0.6344497166812549, + "grad_norm": 0.9268014816586394, + "learning_rate": 1.6146384948941213e-06, + "loss": 0.1257, + "step": 6886 + }, + { + "epoch": 0.6345418528585249, + "grad_norm": 0.9930422635091828, + "learning_rate": 1.6139261237486337e-06, + "loss": 0.1316, + "step": 6887 + }, + { + "epoch": 0.6346339890357949, + "grad_norm": 0.9828861548408764, + "learning_rate": 1.6132138348752013e-06, + "loss": 0.1396, + "step": 6888 + }, + { + "epoch": 0.6347261252130649, + "grad_norm": 0.9392508922508789, + "learning_rate": 1.6125016283399592e-06, + "loss": 0.1238, + "step": 6889 + }, + { + "epoch": 0.6348182613903349, + "grad_norm": 0.9367414950444616, + "learning_rate": 1.6117895042090374e-06, + "loss": 0.1369, + "step": 6890 + }, + { + "epoch": 0.6349103975676049, + "grad_norm": 0.8913932424175949, + "learning_rate": 1.6110774625485554e-06, + "loss": 0.1277, + "step": 6891 + }, + { + "epoch": 0.6350025337448749, + "grad_norm": 0.9487507579836935, + "learning_rate": 1.6103655034246256e-06, + "loss": 0.1293, + "step": 6892 + }, + { + "epoch": 0.6350946699221449, + "grad_norm": 0.9078867583893813, + "learning_rate": 1.6096536269033557e-06, + "loss": 0.1253, + "step": 6893 + }, + { + "epoch": 0.6351868060994149, + "grad_norm": 0.9657934294617364, + "learning_rate": 1.6089418330508427e-06, + "loss": 0.1303, + "step": 6894 + }, + { + "epoch": 0.6352789422766849, + "grad_norm": 0.9369656368147398, + "learning_rate": 1.6082301219331754e-06, + "loss": 0.1361, + "step": 6895 + }, + { + "epoch": 0.6353710784539549, + "grad_norm": 0.9221568322665612, + "learning_rate": 1.6075184936164377e-06, + "loss": 0.1186, + "step": 6896 + }, + { + "epoch": 0.6354632146312249, + "grad_norm": 0.9646737106447447, + "learning_rate": 1.606806948166703e-06, + "loss": 0.1283, + "step": 6897 + }, + { + "epoch": 0.6355553508084949, + "grad_norm": 0.985452183450984, + "learning_rate": 1.606095485650041e-06, + "loss": 0.1385, + "step": 6898 + }, + { + "epoch": 0.635647486985765, + "grad_norm": 0.9455020939527603, + "learning_rate": 1.6053841061325086e-06, + "loss": 0.1298, + "step": 6899 + }, + { + "epoch": 0.635739623163035, + "grad_norm": 0.9890402174988893, + "learning_rate": 1.6046728096801575e-06, + "loss": 0.1357, + "step": 6900 + }, + { + "epoch": 0.635831759340305, + "grad_norm": 0.9609982352039083, + "learning_rate": 1.6039615963590332e-06, + "loss": 0.126, + "step": 6901 + }, + { + "epoch": 0.635923895517575, + "grad_norm": 0.956556821723389, + "learning_rate": 1.6032504662351713e-06, + "loss": 0.1325, + "step": 6902 + }, + { + "epoch": 0.636016031694845, + "grad_norm": 0.9432174119765748, + "learning_rate": 1.6025394193745993e-06, + "loss": 0.1276, + "step": 6903 + }, + { + "epoch": 0.636108167872115, + "grad_norm": 0.9059817128700657, + "learning_rate": 1.6018284558433395e-06, + "loss": 0.1302, + "step": 6904 + }, + { + "epoch": 0.636200304049385, + "grad_norm": 0.9010824136241979, + "learning_rate": 1.6011175757074035e-06, + "loss": 0.1273, + "step": 6905 + }, + { + "epoch": 0.636292440226655, + "grad_norm": 0.9038667529472869, + "learning_rate": 1.6004067790327983e-06, + "loss": 0.1332, + "step": 6906 + }, + { + "epoch": 0.636384576403925, + "grad_norm": 0.9335549013514323, + "learning_rate": 1.5996960658855201e-06, + "loss": 0.1453, + "step": 6907 + }, + { + "epoch": 0.636476712581195, + "grad_norm": 0.9466004752685642, + "learning_rate": 1.5989854363315585e-06, + "loss": 0.136, + "step": 6908 + }, + { + "epoch": 0.636568848758465, + "grad_norm": 0.9078843220602426, + "learning_rate": 1.5982748904368966e-06, + "loss": 0.132, + "step": 6909 + }, + { + "epoch": 0.636660984935735, + "grad_norm": 0.9085035676803845, + "learning_rate": 1.5975644282675077e-06, + "loss": 0.1229, + "step": 6910 + }, + { + "epoch": 0.636753121113005, + "grad_norm": 0.9707588244468391, + "learning_rate": 1.5968540498893598e-06, + "loss": 0.1399, + "step": 6911 + }, + { + "epoch": 0.6368452572902751, + "grad_norm": 0.903772261250293, + "learning_rate": 1.59614375536841e-06, + "loss": 0.1325, + "step": 6912 + }, + { + "epoch": 0.6369373934675451, + "grad_norm": 0.877488888929862, + "learning_rate": 1.5954335447706093e-06, + "loss": 0.1202, + "step": 6913 + }, + { + "epoch": 0.6370295296448151, + "grad_norm": 0.9048261590261063, + "learning_rate": 1.5947234181619017e-06, + "loss": 0.126, + "step": 6914 + }, + { + "epoch": 0.6371216658220851, + "grad_norm": 0.8992318434307321, + "learning_rate": 1.5940133756082226e-06, + "loss": 0.1235, + "step": 6915 + }, + { + "epoch": 0.6372138019993551, + "grad_norm": 0.8816222891695241, + "learning_rate": 1.5933034171754985e-06, + "loss": 0.1233, + "step": 6916 + }, + { + "epoch": 0.6373059381766251, + "grad_norm": 0.9163390848974939, + "learning_rate": 1.5925935429296499e-06, + "loss": 0.1227, + "step": 6917 + }, + { + "epoch": 0.6373980743538951, + "grad_norm": 0.9121612444035625, + "learning_rate": 1.5918837529365884e-06, + "loss": 0.1197, + "step": 6918 + }, + { + "epoch": 0.6374902105311651, + "grad_norm": 0.9739517316471556, + "learning_rate": 1.5911740472622184e-06, + "loss": 0.1325, + "step": 6919 + }, + { + "epoch": 0.637582346708435, + "grad_norm": 0.9782508014301649, + "learning_rate": 1.590464425972436e-06, + "loss": 0.1369, + "step": 6920 + }, + { + "epoch": 0.637674482885705, + "grad_norm": 0.9666734281729935, + "learning_rate": 1.5897548891331288e-06, + "loss": 0.1424, + "step": 6921 + }, + { + "epoch": 0.637766619062975, + "grad_norm": 0.928424842338119, + "learning_rate": 1.5890454368101788e-06, + "loss": 0.1319, + "step": 6922 + }, + { + "epoch": 0.637858755240245, + "grad_norm": 0.9400505264698045, + "learning_rate": 1.5883360690694582e-06, + "loss": 0.1248, + "step": 6923 + }, + { + "epoch": 0.637950891417515, + "grad_norm": 0.8670288793128172, + "learning_rate": 1.587626785976831e-06, + "loss": 0.1156, + "step": 6924 + }, + { + "epoch": 0.638043027594785, + "grad_norm": 0.8702635503997724, + "learning_rate": 1.5869175875981551e-06, + "loss": 0.1228, + "step": 6925 + }, + { + "epoch": 0.6381351637720551, + "grad_norm": 0.9992073872034697, + "learning_rate": 1.5862084739992794e-06, + "loss": 0.1425, + "step": 6926 + }, + { + "epoch": 0.6382272999493251, + "grad_norm": 0.9676724696571829, + "learning_rate": 1.585499445246046e-06, + "loss": 0.1381, + "step": 6927 + }, + { + "epoch": 0.6383194361265951, + "grad_norm": 0.9479667525779824, + "learning_rate": 1.584790501404287e-06, + "loss": 0.1414, + "step": 6928 + }, + { + "epoch": 0.6384115723038651, + "grad_norm": 0.9202645800521777, + "learning_rate": 1.5840816425398282e-06, + "loss": 0.1329, + "step": 6929 + }, + { + "epoch": 0.6385037084811351, + "grad_norm": 0.9281144590684807, + "learning_rate": 1.5833728687184868e-06, + "loss": 0.1288, + "step": 6930 + }, + { + "epoch": 0.6385958446584051, + "grad_norm": 0.9212398432492384, + "learning_rate": 1.5826641800060755e-06, + "loss": 0.1235, + "step": 6931 + }, + { + "epoch": 0.6386879808356751, + "grad_norm": 0.907492438307411, + "learning_rate": 1.581955576468392e-06, + "loss": 0.1297, + "step": 6932 + }, + { + "epoch": 0.6387801170129451, + "grad_norm": 0.9343330686647542, + "learning_rate": 1.581247058171232e-06, + "loss": 0.1308, + "step": 6933 + }, + { + "epoch": 0.6388722531902151, + "grad_norm": 0.8988949949443719, + "learning_rate": 1.5805386251803818e-06, + "loss": 0.1183, + "step": 6934 + }, + { + "epoch": 0.6389643893674851, + "grad_norm": 0.9210865319052437, + "learning_rate": 1.5798302775616198e-06, + "loss": 0.1257, + "step": 6935 + }, + { + "epoch": 0.6390565255447551, + "grad_norm": 0.8669787078566003, + "learning_rate": 1.5791220153807146e-06, + "loss": 0.1099, + "step": 6936 + }, + { + "epoch": 0.6391486617220251, + "grad_norm": 0.9472952498117674, + "learning_rate": 1.5784138387034302e-06, + "loss": 0.1283, + "step": 6937 + }, + { + "epoch": 0.6392407978992951, + "grad_norm": 0.9301216379397955, + "learning_rate": 1.5777057475955194e-06, + "loss": 0.1332, + "step": 6938 + }, + { + "epoch": 0.6393329340765652, + "grad_norm": 0.9069566491323627, + "learning_rate": 1.5769977421227295e-06, + "loss": 0.1201, + "step": 6939 + }, + { + "epoch": 0.6394250702538352, + "grad_norm": 0.9762702867137394, + "learning_rate": 1.5762898223507989e-06, + "loss": 0.1368, + "step": 6940 + }, + { + "epoch": 0.6395172064311052, + "grad_norm": 0.9753252251298636, + "learning_rate": 1.575581988345457e-06, + "loss": 0.1377, + "step": 6941 + }, + { + "epoch": 0.6396093426083752, + "grad_norm": 0.9245488069439479, + "learning_rate": 1.5748742401724276e-06, + "loss": 0.1277, + "step": 6942 + }, + { + "epoch": 0.6397014787856452, + "grad_norm": 0.9222913829706925, + "learning_rate": 1.5741665778974239e-06, + "loss": 0.1288, + "step": 6943 + }, + { + "epoch": 0.6397936149629152, + "grad_norm": 0.9101223878977888, + "learning_rate": 1.5734590015861539e-06, + "loss": 0.1309, + "step": 6944 + }, + { + "epoch": 0.6398857511401852, + "grad_norm": 0.960200928451764, + "learning_rate": 1.5727515113043152e-06, + "loss": 0.1398, + "step": 6945 + }, + { + "epoch": 0.6399778873174552, + "grad_norm": 0.8788028856828145, + "learning_rate": 1.5720441071175976e-06, + "loss": 0.1145, + "step": 6946 + }, + { + "epoch": 0.6400700234947252, + "grad_norm": 0.8708461112250071, + "learning_rate": 1.5713367890916852e-06, + "loss": 0.1225, + "step": 6947 + }, + { + "epoch": 0.6401621596719952, + "grad_norm": 0.8772041016771617, + "learning_rate": 1.5706295572922524e-06, + "loss": 0.1173, + "step": 6948 + }, + { + "epoch": 0.6402542958492652, + "grad_norm": 0.897848427339447, + "learning_rate": 1.5699224117849644e-06, + "loss": 0.1382, + "step": 6949 + }, + { + "epoch": 0.6403464320265352, + "grad_norm": 0.8624040978043598, + "learning_rate": 1.569215352635481e-06, + "loss": 0.1172, + "step": 6950 + }, + { + "epoch": 0.6404385682038052, + "grad_norm": 0.9391168244715019, + "learning_rate": 1.5685083799094513e-06, + "loss": 0.1246, + "step": 6951 + }, + { + "epoch": 0.6405307043810752, + "grad_norm": 0.9483088170825726, + "learning_rate": 1.56780149367252e-06, + "loss": 0.1306, + "step": 6952 + }, + { + "epoch": 0.6406228405583453, + "grad_norm": 0.9378810721645574, + "learning_rate": 1.5670946939903201e-06, + "loss": 0.1317, + "step": 6953 + }, + { + "epoch": 0.6407149767356153, + "grad_norm": 0.8918097080165212, + "learning_rate": 1.5663879809284777e-06, + "loss": 0.1223, + "step": 6954 + }, + { + "epoch": 0.6408071129128853, + "grad_norm": 0.9183976546797075, + "learning_rate": 1.565681354552612e-06, + "loss": 0.1322, + "step": 6955 + }, + { + "epoch": 0.6408992490901553, + "grad_norm": 0.8927835519177011, + "learning_rate": 1.5649748149283339e-06, + "loss": 0.1156, + "step": 6956 + }, + { + "epoch": 0.6409913852674253, + "grad_norm": 0.9527061176749169, + "learning_rate": 1.5642683621212435e-06, + "loss": 0.1264, + "step": 6957 + }, + { + "epoch": 0.6410835214446953, + "grad_norm": 0.9276007981920011, + "learning_rate": 1.5635619961969372e-06, + "loss": 0.1224, + "step": 6958 + }, + { + "epoch": 0.6411756576219653, + "grad_norm": 0.9341219872288229, + "learning_rate": 1.5628557172209997e-06, + "loss": 0.1242, + "step": 6959 + }, + { + "epoch": 0.6412677937992353, + "grad_norm": 0.9087165767479295, + "learning_rate": 1.5621495252590108e-06, + "loss": 0.1232, + "step": 6960 + }, + { + "epoch": 0.6413599299765053, + "grad_norm": 0.9845108733921047, + "learning_rate": 1.561443420376539e-06, + "loss": 0.1318, + "step": 6961 + }, + { + "epoch": 0.6414520661537753, + "grad_norm": 0.8579120613839665, + "learning_rate": 1.560737402639146e-06, + "loss": 0.1142, + "step": 6962 + }, + { + "epoch": 0.6415442023310453, + "grad_norm": 0.9145911360522224, + "learning_rate": 1.5600314721123866e-06, + "loss": 0.1305, + "step": 6963 + }, + { + "epoch": 0.6416363385083153, + "grad_norm": 1.0023486798357177, + "learning_rate": 1.5593256288618067e-06, + "loss": 0.1388, + "step": 6964 + }, + { + "epoch": 0.6417284746855852, + "grad_norm": 0.897665746625067, + "learning_rate": 1.5586198729529422e-06, + "loss": 0.1224, + "step": 6965 + }, + { + "epoch": 0.6418206108628552, + "grad_norm": 0.9742657750149276, + "learning_rate": 1.5579142044513248e-06, + "loss": 0.1296, + "step": 6966 + }, + { + "epoch": 0.6419127470401254, + "grad_norm": 0.9864819104891229, + "learning_rate": 1.5572086234224743e-06, + "loss": 0.127, + "step": 6967 + }, + { + "epoch": 0.6420048832173954, + "grad_norm": 0.9440948450795339, + "learning_rate": 1.556503129931905e-06, + "loss": 0.1329, + "step": 6968 + }, + { + "epoch": 0.6420970193946653, + "grad_norm": 0.8435423134543755, + "learning_rate": 1.5557977240451223e-06, + "loss": 0.1153, + "step": 6969 + }, + { + "epoch": 0.6421891555719353, + "grad_norm": 0.9570481012038368, + "learning_rate": 1.5550924058276213e-06, + "loss": 0.127, + "step": 6970 + }, + { + "epoch": 0.6422812917492053, + "grad_norm": 0.9603904086633094, + "learning_rate": 1.5543871753448924e-06, + "loss": 0.1355, + "step": 6971 + }, + { + "epoch": 0.6423734279264753, + "grad_norm": 0.8370803839751185, + "learning_rate": 1.5536820326624159e-06, + "loss": 0.1134, + "step": 6972 + }, + { + "epoch": 0.6424655641037453, + "grad_norm": 0.878586123110724, + "learning_rate": 1.5529769778456654e-06, + "loss": 0.1255, + "step": 6973 + }, + { + "epoch": 0.6425577002810153, + "grad_norm": 0.934161796637777, + "learning_rate": 1.5522720109601039e-06, + "loss": 0.1338, + "step": 6974 + }, + { + "epoch": 0.6426498364582853, + "grad_norm": 0.9010882629753968, + "learning_rate": 1.5515671320711877e-06, + "loss": 0.1239, + "step": 6975 + }, + { + "epoch": 0.6427419726355553, + "grad_norm": 0.8943824566421207, + "learning_rate": 1.5508623412443657e-06, + "loss": 0.1302, + "step": 6976 + }, + { + "epoch": 0.6428341088128253, + "grad_norm": 0.9430422757119458, + "learning_rate": 1.5501576385450785e-06, + "loss": 0.138, + "step": 6977 + }, + { + "epoch": 0.6429262449900953, + "grad_norm": 0.8608802032031488, + "learning_rate": 1.5494530240387552e-06, + "loss": 0.1157, + "step": 6978 + }, + { + "epoch": 0.6430183811673653, + "grad_norm": 0.8931155875083723, + "learning_rate": 1.5487484977908219e-06, + "loss": 0.1209, + "step": 6979 + }, + { + "epoch": 0.6431105173446354, + "grad_norm": 0.934058212786175, + "learning_rate": 1.5480440598666918e-06, + "loss": 0.1286, + "step": 6980 + }, + { + "epoch": 0.6432026535219054, + "grad_norm": 0.9499195487628329, + "learning_rate": 1.5473397103317748e-06, + "loss": 0.1289, + "step": 6981 + }, + { + "epoch": 0.6432947896991754, + "grad_norm": 0.9172407650858186, + "learning_rate": 1.5466354492514675e-06, + "loss": 0.1234, + "step": 6982 + }, + { + "epoch": 0.6433869258764454, + "grad_norm": 0.9040807218912611, + "learning_rate": 1.5459312766911607e-06, + "loss": 0.1147, + "step": 6983 + }, + { + "epoch": 0.6434790620537154, + "grad_norm": 0.9821552436596821, + "learning_rate": 1.5452271927162381e-06, + "loss": 0.1332, + "step": 6984 + }, + { + "epoch": 0.6435711982309854, + "grad_norm": 0.9677205513939832, + "learning_rate": 1.5445231973920744e-06, + "loss": 0.1383, + "step": 6985 + }, + { + "epoch": 0.6436633344082554, + "grad_norm": 0.9278700960504868, + "learning_rate": 1.543819290784033e-06, + "loss": 0.1203, + "step": 6986 + }, + { + "epoch": 0.6437554705855254, + "grad_norm": 1.0164771249246947, + "learning_rate": 1.5431154729574743e-06, + "loss": 0.1382, + "step": 6987 + }, + { + "epoch": 0.6438476067627954, + "grad_norm": 0.9576436472546979, + "learning_rate": 1.5424117439777458e-06, + "loss": 0.1241, + "step": 6988 + }, + { + "epoch": 0.6439397429400654, + "grad_norm": 0.8990740819754796, + "learning_rate": 1.5417081039101916e-06, + "loss": 0.1356, + "step": 6989 + }, + { + "epoch": 0.6440318791173354, + "grad_norm": 0.9008021527591362, + "learning_rate": 1.5410045528201423e-06, + "loss": 0.1152, + "step": 6990 + }, + { + "epoch": 0.6441240152946054, + "grad_norm": 0.8895201725561562, + "learning_rate": 1.5403010907729233e-06, + "loss": 0.1225, + "step": 6991 + }, + { + "epoch": 0.6442161514718754, + "grad_norm": 0.9060414956049287, + "learning_rate": 1.5395977178338511e-06, + "loss": 0.1231, + "step": 6992 + }, + { + "epoch": 0.6443082876491454, + "grad_norm": 0.9858058843844217, + "learning_rate": 1.5388944340682352e-06, + "loss": 0.1293, + "step": 6993 + }, + { + "epoch": 0.6444004238264155, + "grad_norm": 0.9830503873904406, + "learning_rate": 1.5381912395413733e-06, + "loss": 0.1328, + "step": 6994 + }, + { + "epoch": 0.6444925600036855, + "grad_norm": 0.8953062308564167, + "learning_rate": 1.5374881343185592e-06, + "loss": 0.1175, + "step": 6995 + }, + { + "epoch": 0.6445846961809555, + "grad_norm": 0.932027313269444, + "learning_rate": 1.5367851184650745e-06, + "loss": 0.1366, + "step": 6996 + }, + { + "epoch": 0.6446768323582255, + "grad_norm": 0.9017210709791099, + "learning_rate": 1.536082192046196e-06, + "loss": 0.1241, + "step": 6997 + }, + { + "epoch": 0.6447689685354955, + "grad_norm": 0.8736522849637017, + "learning_rate": 1.53537935512719e-06, + "loss": 0.124, + "step": 6998 + }, + { + "epoch": 0.6448611047127655, + "grad_norm": 0.9243176302835587, + "learning_rate": 1.5346766077733138e-06, + "loss": 0.1266, + "step": 6999 + }, + { + "epoch": 0.6449532408900355, + "grad_norm": 0.9500637805596197, + "learning_rate": 1.5339739500498189e-06, + "loss": 0.137, + "step": 7000 + }, + { + "epoch": 0.6449532408900355, + "eval_loss": 0.1284143477678299, + "eval_runtime": 299.5444, + "eval_samples_per_second": 23.426, + "eval_steps_per_second": 2.931, + "step": 7000 + }, + { + "epoch": 0.6450453770673055, + "grad_norm": 0.9176312502247079, + "learning_rate": 1.5332713820219461e-06, + "loss": 0.124, + "step": 7001 + }, + { + "epoch": 0.6451375132445755, + "grad_norm": 0.9517224833402965, + "learning_rate": 1.5325689037549307e-06, + "loss": 0.1278, + "step": 7002 + }, + { + "epoch": 0.6452296494218455, + "grad_norm": 0.9012201702409783, + "learning_rate": 1.531866515313996e-06, + "loss": 0.1294, + "step": 7003 + }, + { + "epoch": 0.6453217855991155, + "grad_norm": 0.870380747830478, + "learning_rate": 1.5311642167643592e-06, + "loss": 0.1127, + "step": 7004 + }, + { + "epoch": 0.6454139217763855, + "grad_norm": 0.9019711406867783, + "learning_rate": 1.530462008171229e-06, + "loss": 0.1284, + "step": 7005 + }, + { + "epoch": 0.6455060579536555, + "grad_norm": 0.9153502803076784, + "learning_rate": 1.5297598895998076e-06, + "loss": 0.128, + "step": 7006 + }, + { + "epoch": 0.6455981941309256, + "grad_norm": 0.9397863736754628, + "learning_rate": 1.529057861115283e-06, + "loss": 0.1225, + "step": 7007 + }, + { + "epoch": 0.6456903303081956, + "grad_norm": 0.9821388898224823, + "learning_rate": 1.5283559227828404e-06, + "loss": 0.1292, + "step": 7008 + }, + { + "epoch": 0.6457824664854656, + "grad_norm": 0.9461970827854304, + "learning_rate": 1.5276540746676558e-06, + "loss": 0.1184, + "step": 7009 + }, + { + "epoch": 0.6458746026627356, + "grad_norm": 0.9689676881269678, + "learning_rate": 1.5269523168348954e-06, + "loss": 0.1319, + "step": 7010 + }, + { + "epoch": 0.6459667388400055, + "grad_norm": 0.9410910844995799, + "learning_rate": 1.5262506493497159e-06, + "loss": 0.1259, + "step": 7011 + }, + { + "epoch": 0.6460588750172755, + "grad_norm": 0.983799739077306, + "learning_rate": 1.525549072277269e-06, + "loss": 0.1288, + "step": 7012 + }, + { + "epoch": 0.6461510111945455, + "grad_norm": 0.8946262414629372, + "learning_rate": 1.524847585682695e-06, + "loss": 0.1265, + "step": 7013 + }, + { + "epoch": 0.6462431473718155, + "grad_norm": 0.925928189408186, + "learning_rate": 1.5241461896311288e-06, + "loss": 0.1333, + "step": 7014 + }, + { + "epoch": 0.6463352835490855, + "grad_norm": 0.9306429155305737, + "learning_rate": 1.5234448841876935e-06, + "loss": 0.1275, + "step": 7015 + }, + { + "epoch": 0.6464274197263555, + "grad_norm": 0.9580787768584496, + "learning_rate": 1.5227436694175052e-06, + "loss": 0.1344, + "step": 7016 + }, + { + "epoch": 0.6465195559036255, + "grad_norm": 0.917554371682307, + "learning_rate": 1.5220425453856728e-06, + "loss": 0.1173, + "step": 7017 + }, + { + "epoch": 0.6466116920808955, + "grad_norm": 0.9025472883792388, + "learning_rate": 1.5213415121572959e-06, + "loss": 0.1194, + "step": 7018 + }, + { + "epoch": 0.6467038282581655, + "grad_norm": 0.9103463103642951, + "learning_rate": 1.5206405697974635e-06, + "loss": 0.1353, + "step": 7019 + }, + { + "epoch": 0.6467959644354355, + "grad_norm": 0.9179849344012468, + "learning_rate": 1.5199397183712606e-06, + "loss": 0.1301, + "step": 7020 + }, + { + "epoch": 0.6468881006127056, + "grad_norm": 0.9046015817619245, + "learning_rate": 1.5192389579437596e-06, + "loss": 0.1208, + "step": 7021 + }, + { + "epoch": 0.6469802367899756, + "grad_norm": 0.9338287299267279, + "learning_rate": 1.5185382885800282e-06, + "loss": 0.13, + "step": 7022 + }, + { + "epoch": 0.6470723729672456, + "grad_norm": 0.8913972315365095, + "learning_rate": 1.5178377103451213e-06, + "loss": 0.1284, + "step": 7023 + }, + { + "epoch": 0.6471645091445156, + "grad_norm": 0.9767424424928457, + "learning_rate": 1.5171372233040887e-06, + "loss": 0.1427, + "step": 7024 + }, + { + "epoch": 0.6472566453217856, + "grad_norm": 0.9782173682999218, + "learning_rate": 1.516436827521971e-06, + "loss": 0.1308, + "step": 7025 + }, + { + "epoch": 0.6473487814990556, + "grad_norm": 0.8844971219928716, + "learning_rate": 1.5157365230637993e-06, + "loss": 0.1233, + "step": 7026 + }, + { + "epoch": 0.6474409176763256, + "grad_norm": 0.9548655466581969, + "learning_rate": 1.5150363099945984e-06, + "loss": 0.133, + "step": 7027 + }, + { + "epoch": 0.6475330538535956, + "grad_norm": 0.9232492722707007, + "learning_rate": 1.5143361883793814e-06, + "loss": 0.1379, + "step": 7028 + }, + { + "epoch": 0.6476251900308656, + "grad_norm": 0.9127807484244733, + "learning_rate": 1.513636158283155e-06, + "loss": 0.1252, + "step": 7029 + }, + { + "epoch": 0.6477173262081356, + "grad_norm": 0.9288643059106027, + "learning_rate": 1.512936219770918e-06, + "loss": 0.1258, + "step": 7030 + }, + { + "epoch": 0.6478094623854056, + "grad_norm": 0.9159865631279025, + "learning_rate": 1.5122363729076595e-06, + "loss": 0.1152, + "step": 7031 + }, + { + "epoch": 0.6479015985626756, + "grad_norm": 0.9582038384688031, + "learning_rate": 1.5115366177583596e-06, + "loss": 0.1245, + "step": 7032 + }, + { + "epoch": 0.6479937347399456, + "grad_norm": 0.9135516336834137, + "learning_rate": 1.510836954387991e-06, + "loss": 0.1258, + "step": 7033 + }, + { + "epoch": 0.6480858709172156, + "grad_norm": 0.9392462355403146, + "learning_rate": 1.5101373828615172e-06, + "loss": 0.1286, + "step": 7034 + }, + { + "epoch": 0.6481780070944857, + "grad_norm": 0.8760360479867124, + "learning_rate": 1.5094379032438956e-06, + "loss": 0.1309, + "step": 7035 + }, + { + "epoch": 0.6482701432717557, + "grad_norm": 0.8521056439065474, + "learning_rate": 1.50873851560007e-06, + "loss": 0.1213, + "step": 7036 + }, + { + "epoch": 0.6483622794490257, + "grad_norm": 0.9119974004512107, + "learning_rate": 1.50803921999498e-06, + "loss": 0.1277, + "step": 7037 + }, + { + "epoch": 0.6484544156262957, + "grad_norm": 0.864820230481895, + "learning_rate": 1.5073400164935554e-06, + "loss": 0.1098, + "step": 7038 + }, + { + "epoch": 0.6485465518035657, + "grad_norm": 0.9371074072350675, + "learning_rate": 1.5066409051607175e-06, + "loss": 0.1283, + "step": 7039 + }, + { + "epoch": 0.6486386879808357, + "grad_norm": 0.9440464798003158, + "learning_rate": 1.5059418860613779e-06, + "loss": 0.1239, + "step": 7040 + }, + { + "epoch": 0.6487308241581057, + "grad_norm": 1.0084876710261992, + "learning_rate": 1.5052429592604411e-06, + "loss": 0.1346, + "step": 7041 + }, + { + "epoch": 0.6488229603353757, + "grad_norm": 0.9344953956197914, + "learning_rate": 1.5045441248228024e-06, + "loss": 0.1346, + "step": 7042 + }, + { + "epoch": 0.6489150965126457, + "grad_norm": 0.9987049159397682, + "learning_rate": 1.5038453828133498e-06, + "loss": 0.1408, + "step": 7043 + }, + { + "epoch": 0.6490072326899157, + "grad_norm": 0.9838768617974539, + "learning_rate": 1.50314673329696e-06, + "loss": 0.1397, + "step": 7044 + }, + { + "epoch": 0.6490993688671857, + "grad_norm": 0.8951590164756219, + "learning_rate": 1.502448176338503e-06, + "loss": 0.1207, + "step": 7045 + }, + { + "epoch": 0.6491915050444557, + "grad_norm": 0.8974579872474433, + "learning_rate": 1.5017497120028404e-06, + "loss": 0.1296, + "step": 7046 + }, + { + "epoch": 0.6492836412217257, + "grad_norm": 0.8768387129540236, + "learning_rate": 1.5010513403548253e-06, + "loss": 0.1255, + "step": 7047 + }, + { + "epoch": 0.6493757773989958, + "grad_norm": 0.8970459648430776, + "learning_rate": 1.5003530614592995e-06, + "loss": 0.1323, + "step": 7048 + }, + { + "epoch": 0.6494679135762658, + "grad_norm": 0.9665064017272036, + "learning_rate": 1.4996548753811001e-06, + "loss": 0.1355, + "step": 7049 + }, + { + "epoch": 0.6495600497535358, + "grad_norm": 0.8812970511013521, + "learning_rate": 1.4989567821850527e-06, + "loss": 0.1211, + "step": 7050 + }, + { + "epoch": 0.6496521859308058, + "grad_norm": 0.9746647652708732, + "learning_rate": 1.4982587819359767e-06, + "loss": 0.1253, + "step": 7051 + }, + { + "epoch": 0.6497443221080758, + "grad_norm": 0.932466168486249, + "learning_rate": 1.4975608746986802e-06, + "loss": 0.1296, + "step": 7052 + }, + { + "epoch": 0.6498364582853458, + "grad_norm": 0.9063329649266557, + "learning_rate": 1.4968630605379642e-06, + "loss": 0.1188, + "step": 7053 + }, + { + "epoch": 0.6499285944626157, + "grad_norm": 0.9583750002128922, + "learning_rate": 1.496165339518621e-06, + "loss": 0.1356, + "step": 7054 + }, + { + "epoch": 0.6500207306398857, + "grad_norm": 0.9622381338587453, + "learning_rate": 1.495467711705434e-06, + "loss": 0.1427, + "step": 7055 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 0.8843393452517173, + "learning_rate": 1.4947701771631788e-06, + "loss": 0.1214, + "step": 7056 + }, + { + "epoch": 0.6502050029944257, + "grad_norm": 0.9571823770880513, + "learning_rate": 1.4940727359566205e-06, + "loss": 0.14, + "step": 7057 + }, + { + "epoch": 0.6502971391716957, + "grad_norm": 0.8934023203022077, + "learning_rate": 1.493375388150516e-06, + "loss": 0.1211, + "step": 7058 + }, + { + "epoch": 0.6503892753489657, + "grad_norm": 0.9452011328243307, + "learning_rate": 1.4926781338096158e-06, + "loss": 0.135, + "step": 7059 + }, + { + "epoch": 0.6504814115262357, + "grad_norm": 0.9565891549170992, + "learning_rate": 1.4919809729986598e-06, + "loss": 0.1308, + "step": 7060 + }, + { + "epoch": 0.6505735477035057, + "grad_norm": 0.927163730231958, + "learning_rate": 1.491283905782378e-06, + "loss": 0.116, + "step": 7061 + }, + { + "epoch": 0.6506656838807758, + "grad_norm": 0.9764884266369946, + "learning_rate": 1.4905869322254946e-06, + "loss": 0.1253, + "step": 7062 + }, + { + "epoch": 0.6507578200580458, + "grad_norm": 0.9093361220955308, + "learning_rate": 1.4898900523927224e-06, + "loss": 0.1217, + "step": 7063 + }, + { + "epoch": 0.6508499562353158, + "grad_norm": 1.00659867942203, + "learning_rate": 1.489193266348769e-06, + "loss": 0.1323, + "step": 7064 + }, + { + "epoch": 0.6509420924125858, + "grad_norm": 0.9328286355155382, + "learning_rate": 1.4884965741583288e-06, + "loss": 0.1242, + "step": 7065 + }, + { + "epoch": 0.6510342285898558, + "grad_norm": 0.9492231265782749, + "learning_rate": 1.48779997588609e-06, + "loss": 0.1352, + "step": 7066 + }, + { + "epoch": 0.6511263647671258, + "grad_norm": 0.9336815769927269, + "learning_rate": 1.4871034715967331e-06, + "loss": 0.1321, + "step": 7067 + }, + { + "epoch": 0.6512185009443958, + "grad_norm": 0.8837768145065813, + "learning_rate": 1.4864070613549284e-06, + "loss": 0.1234, + "step": 7068 + }, + { + "epoch": 0.6513106371216658, + "grad_norm": 0.8757121977193029, + "learning_rate": 1.485710745225336e-06, + "loss": 0.1177, + "step": 7069 + }, + { + "epoch": 0.6514027732989358, + "grad_norm": 0.925605697107438, + "learning_rate": 1.4850145232726104e-06, + "loss": 0.1207, + "step": 7070 + }, + { + "epoch": 0.6514949094762058, + "grad_norm": 0.9039591896508201, + "learning_rate": 1.4843183955613955e-06, + "loss": 0.1262, + "step": 7071 + }, + { + "epoch": 0.6515870456534758, + "grad_norm": 0.9261306989585284, + "learning_rate": 1.4836223621563272e-06, + "loss": 0.1188, + "step": 7072 + }, + { + "epoch": 0.6516791818307458, + "grad_norm": 0.9079997328951442, + "learning_rate": 1.4829264231220319e-06, + "loss": 0.1319, + "step": 7073 + }, + { + "epoch": 0.6517713180080158, + "grad_norm": 0.9149117830703877, + "learning_rate": 1.4822305785231273e-06, + "loss": 0.1248, + "step": 7074 + }, + { + "epoch": 0.6518634541852859, + "grad_norm": 0.9574311441669855, + "learning_rate": 1.4815348284242234e-06, + "loss": 0.1353, + "step": 7075 + }, + { + "epoch": 0.6519555903625559, + "grad_norm": 0.9621595470781569, + "learning_rate": 1.4808391728899206e-06, + "loss": 0.1353, + "step": 7076 + }, + { + "epoch": 0.6520477265398259, + "grad_norm": 0.912082741700463, + "learning_rate": 1.4801436119848096e-06, + "loss": 0.1279, + "step": 7077 + }, + { + "epoch": 0.6521398627170959, + "grad_norm": 0.8963015605594415, + "learning_rate": 1.4794481457734743e-06, + "loss": 0.1232, + "step": 7078 + }, + { + "epoch": 0.6522319988943659, + "grad_norm": 0.9239801719949823, + "learning_rate": 1.478752774320488e-06, + "loss": 0.1279, + "step": 7079 + }, + { + "epoch": 0.6523241350716359, + "grad_norm": 0.8899030279037768, + "learning_rate": 1.4780574976904174e-06, + "loss": 0.1159, + "step": 7080 + }, + { + "epoch": 0.6524162712489059, + "grad_norm": 0.8542351029355911, + "learning_rate": 1.4773623159478178e-06, + "loss": 0.1102, + "step": 7081 + }, + { + "epoch": 0.6525084074261759, + "grad_norm": 0.844299544578612, + "learning_rate": 1.4766672291572364e-06, + "loss": 0.114, + "step": 7082 + }, + { + "epoch": 0.6526005436034459, + "grad_norm": 0.9211202958944511, + "learning_rate": 1.4759722373832135e-06, + "loss": 0.1159, + "step": 7083 + }, + { + "epoch": 0.6526926797807159, + "grad_norm": 0.9334540781839048, + "learning_rate": 1.4752773406902788e-06, + "loss": 0.1275, + "step": 7084 + }, + { + "epoch": 0.6527848159579859, + "grad_norm": 0.8839225399468466, + "learning_rate": 1.4745825391429537e-06, + "loss": 0.1225, + "step": 7085 + }, + { + "epoch": 0.6528769521352559, + "grad_norm": 0.9135405613448878, + "learning_rate": 1.4738878328057493e-06, + "loss": 0.1272, + "step": 7086 + }, + { + "epoch": 0.6529690883125259, + "grad_norm": 0.9291388808423455, + "learning_rate": 1.4731932217431704e-06, + "loss": 0.1304, + "step": 7087 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.9333671336031034, + "learning_rate": 1.472498706019711e-06, + "loss": 0.1282, + "step": 7088 + }, + { + "epoch": 0.653153360667066, + "grad_norm": 0.9549475483412628, + "learning_rate": 1.4718042856998582e-06, + "loss": 0.1323, + "step": 7089 + }, + { + "epoch": 0.653245496844336, + "grad_norm": 0.9327030362108195, + "learning_rate": 1.4711099608480878e-06, + "loss": 0.126, + "step": 7090 + }, + { + "epoch": 0.653337633021606, + "grad_norm": 0.90652969378768, + "learning_rate": 1.4704157315288676e-06, + "loss": 0.1287, + "step": 7091 + }, + { + "epoch": 0.653429769198876, + "grad_norm": 0.9976244196720754, + "learning_rate": 1.469721597806658e-06, + "loss": 0.1358, + "step": 7092 + }, + { + "epoch": 0.653521905376146, + "grad_norm": 0.9056095269673432, + "learning_rate": 1.4690275597459097e-06, + "loss": 0.126, + "step": 7093 + }, + { + "epoch": 0.653614041553416, + "grad_norm": 0.9103089182932546, + "learning_rate": 1.4683336174110622e-06, + "loss": 0.1302, + "step": 7094 + }, + { + "epoch": 0.653706177730686, + "grad_norm": 0.8777755066961466, + "learning_rate": 1.4676397708665496e-06, + "loss": 0.1223, + "step": 7095 + }, + { + "epoch": 0.653798313907956, + "grad_norm": 0.9541944228782124, + "learning_rate": 1.4669460201767954e-06, + "loss": 0.1439, + "step": 7096 + }, + { + "epoch": 0.653890450085226, + "grad_norm": 0.8908592177465279, + "learning_rate": 1.4662523654062153e-06, + "loss": 0.1268, + "step": 7097 + }, + { + "epoch": 0.653982586262496, + "grad_norm": 0.934638689368783, + "learning_rate": 1.4655588066192135e-06, + "loss": 0.132, + "step": 7098 + }, + { + "epoch": 0.6540747224397659, + "grad_norm": 0.9183179883275167, + "learning_rate": 1.4648653438801876e-06, + "loss": 0.1213, + "step": 7099 + }, + { + "epoch": 0.6541668586170359, + "grad_norm": 0.899781104969002, + "learning_rate": 1.4641719772535265e-06, + "loss": 0.1157, + "step": 7100 + }, + { + "epoch": 0.6542589947943059, + "grad_norm": 0.9516515431961823, + "learning_rate": 1.463478706803609e-06, + "loss": 0.1344, + "step": 7101 + }, + { + "epoch": 0.6543511309715759, + "grad_norm": 0.8622076564585328, + "learning_rate": 1.4627855325948044e-06, + "loss": 0.1218, + "step": 7102 + }, + { + "epoch": 0.654443267148846, + "grad_norm": 0.9622965357298842, + "learning_rate": 1.462092454691475e-06, + "loss": 0.1363, + "step": 7103 + }, + { + "epoch": 0.654535403326116, + "grad_norm": 0.9411786160752286, + "learning_rate": 1.461399473157973e-06, + "loss": 0.1293, + "step": 7104 + }, + { + "epoch": 0.654627539503386, + "grad_norm": 0.9203989734120116, + "learning_rate": 1.4607065880586418e-06, + "loss": 0.1234, + "step": 7105 + }, + { + "epoch": 0.654719675680656, + "grad_norm": 0.9211111609012533, + "learning_rate": 1.4600137994578156e-06, + "loss": 0.1248, + "step": 7106 + }, + { + "epoch": 0.654811811857926, + "grad_norm": 0.9447252888204418, + "learning_rate": 1.4593211074198202e-06, + "loss": 0.1305, + "step": 7107 + }, + { + "epoch": 0.654903948035196, + "grad_norm": 0.9322211454899155, + "learning_rate": 1.4586285120089713e-06, + "loss": 0.1321, + "step": 7108 + }, + { + "epoch": 0.654996084212466, + "grad_norm": 0.870988539601446, + "learning_rate": 1.457936013289578e-06, + "loss": 0.1242, + "step": 7109 + }, + { + "epoch": 0.655088220389736, + "grad_norm": 0.9532198461498228, + "learning_rate": 1.4572436113259376e-06, + "loss": 0.138, + "step": 7110 + }, + { + "epoch": 0.655180356567006, + "grad_norm": 0.9982937398063886, + "learning_rate": 1.4565513061823394e-06, + "loss": 0.1395, + "step": 7111 + }, + { + "epoch": 0.655272492744276, + "grad_norm": 0.9169145577965592, + "learning_rate": 1.4558590979230663e-06, + "loss": 0.1288, + "step": 7112 + }, + { + "epoch": 0.655364628921546, + "grad_norm": 0.9951663238796098, + "learning_rate": 1.4551669866123868e-06, + "loss": 0.1485, + "step": 7113 + }, + { + "epoch": 0.655456765098816, + "grad_norm": 0.9797069270513918, + "learning_rate": 1.4544749723145665e-06, + "loss": 0.1385, + "step": 7114 + }, + { + "epoch": 0.655548901276086, + "grad_norm": 0.9252067413656129, + "learning_rate": 1.4537830550938563e-06, + "loss": 0.1301, + "step": 7115 + }, + { + "epoch": 0.6556410374533561, + "grad_norm": 0.8954911117919563, + "learning_rate": 1.453091235014502e-06, + "loss": 0.1304, + "step": 7116 + }, + { + "epoch": 0.6557331736306261, + "grad_norm": 0.8981284612320173, + "learning_rate": 1.4523995121407402e-06, + "loss": 0.1213, + "step": 7117 + }, + { + "epoch": 0.6558253098078961, + "grad_norm": 0.9051040761470195, + "learning_rate": 1.4517078865367968e-06, + "loss": 0.1301, + "step": 7118 + }, + { + "epoch": 0.6559174459851661, + "grad_norm": 0.9465935974796041, + "learning_rate": 1.4510163582668876e-06, + "loss": 0.1374, + "step": 7119 + }, + { + "epoch": 0.6560095821624361, + "grad_norm": 0.9314265941965006, + "learning_rate": 1.4503249273952224e-06, + "loss": 0.1384, + "step": 7120 + }, + { + "epoch": 0.6561017183397061, + "grad_norm": 0.9425029363283532, + "learning_rate": 1.449633593986001e-06, + "loss": 0.1316, + "step": 7121 + }, + { + "epoch": 0.6561938545169761, + "grad_norm": 0.90970340809109, + "learning_rate": 1.448942358103414e-06, + "loss": 0.1249, + "step": 7122 + }, + { + "epoch": 0.6562859906942461, + "grad_norm": 0.9009364371908557, + "learning_rate": 1.4482512198116424e-06, + "loss": 0.1281, + "step": 7123 + }, + { + "epoch": 0.6563781268715161, + "grad_norm": 0.9697235851276125, + "learning_rate": 1.4475601791748572e-06, + "loss": 0.1255, + "step": 7124 + }, + { + "epoch": 0.6564702630487861, + "grad_norm": 0.9906107648348492, + "learning_rate": 1.4468692362572228e-06, + "loss": 0.1391, + "step": 7125 + }, + { + "epoch": 0.6565623992260561, + "grad_norm": 0.9367678289236904, + "learning_rate": 1.4461783911228938e-06, + "loss": 0.1134, + "step": 7126 + }, + { + "epoch": 0.6566545354033261, + "grad_norm": 0.960257839404813, + "learning_rate": 1.4454876438360138e-06, + "loss": 0.1315, + "step": 7127 + }, + { + "epoch": 0.6567466715805961, + "grad_norm": 1.0250542021519866, + "learning_rate": 1.4447969944607207e-06, + "loss": 0.143, + "step": 7128 + }, + { + "epoch": 0.6568388077578661, + "grad_norm": 0.9299221218118151, + "learning_rate": 1.444106443061139e-06, + "loss": 0.1223, + "step": 7129 + }, + { + "epoch": 0.6569309439351362, + "grad_norm": 0.9862847658068205, + "learning_rate": 1.443415989701389e-06, + "loss": 0.1296, + "step": 7130 + }, + { + "epoch": 0.6570230801124062, + "grad_norm": 0.9101288069296816, + "learning_rate": 1.4427256344455764e-06, + "loss": 0.1304, + "step": 7131 + }, + { + "epoch": 0.6571152162896762, + "grad_norm": 0.843608595661738, + "learning_rate": 1.442035377357803e-06, + "loss": 0.1175, + "step": 7132 + }, + { + "epoch": 0.6572073524669462, + "grad_norm": 0.9150555649451192, + "learning_rate": 1.4413452185021594e-06, + "loss": 0.1304, + "step": 7133 + }, + { + "epoch": 0.6572994886442162, + "grad_norm": 0.9201461415510122, + "learning_rate": 1.4406551579427264e-06, + "loss": 0.1314, + "step": 7134 + }, + { + "epoch": 0.6573916248214862, + "grad_norm": 0.8949364453594446, + "learning_rate": 1.4399651957435751e-06, + "loss": 0.1136, + "step": 7135 + }, + { + "epoch": 0.6574837609987562, + "grad_norm": 0.8541001653937904, + "learning_rate": 1.439275331968769e-06, + "loss": 0.1188, + "step": 7136 + }, + { + "epoch": 0.6575758971760262, + "grad_norm": 0.951903553111977, + "learning_rate": 1.4385855666823628e-06, + "loss": 0.1298, + "step": 7137 + }, + { + "epoch": 0.6576680333532962, + "grad_norm": 0.9391325848473654, + "learning_rate": 1.4378958999484021e-06, + "loss": 0.125, + "step": 7138 + }, + { + "epoch": 0.6577601695305662, + "grad_norm": 0.9498586655518755, + "learning_rate": 1.4372063318309213e-06, + "loss": 0.1338, + "step": 7139 + }, + { + "epoch": 0.6578523057078361, + "grad_norm": 0.9439890757247625, + "learning_rate": 1.4365168623939458e-06, + "loss": 0.1247, + "step": 7140 + }, + { + "epoch": 0.6579444418851061, + "grad_norm": 0.9715129796688949, + "learning_rate": 1.4358274917014942e-06, + "loss": 0.1319, + "step": 7141 + }, + { + "epoch": 0.6580365780623761, + "grad_norm": 0.952778753231662, + "learning_rate": 1.4351382198175745e-06, + "loss": 0.1251, + "step": 7142 + }, + { + "epoch": 0.6581287142396463, + "grad_norm": 0.8580832806372259, + "learning_rate": 1.4344490468061867e-06, + "loss": 0.115, + "step": 7143 + }, + { + "epoch": 0.6582208504169162, + "grad_norm": 0.9595668294661949, + "learning_rate": 1.4337599727313196e-06, + "loss": 0.1347, + "step": 7144 + }, + { + "epoch": 0.6583129865941862, + "grad_norm": 0.9272250033635803, + "learning_rate": 1.4330709976569526e-06, + "loss": 0.1285, + "step": 7145 + }, + { + "epoch": 0.6584051227714562, + "grad_norm": 0.9198354354531479, + "learning_rate": 1.4323821216470585e-06, + "loss": 0.1313, + "step": 7146 + }, + { + "epoch": 0.6584972589487262, + "grad_norm": 0.940128449050614, + "learning_rate": 1.4316933447656e-06, + "loss": 0.1316, + "step": 7147 + }, + { + "epoch": 0.6585893951259962, + "grad_norm": 0.9651435947834156, + "learning_rate": 1.4310046670765288e-06, + "loss": 0.1331, + "step": 7148 + }, + { + "epoch": 0.6586815313032662, + "grad_norm": 0.926751113334902, + "learning_rate": 1.43031608864379e-06, + "loss": 0.1335, + "step": 7149 + }, + { + "epoch": 0.6587736674805362, + "grad_norm": 0.9034975813817566, + "learning_rate": 1.4296276095313168e-06, + "loss": 0.1274, + "step": 7150 + }, + { + "epoch": 0.6588658036578062, + "grad_norm": 0.9131412602543207, + "learning_rate": 1.4289392298030362e-06, + "loss": 0.119, + "step": 7151 + }, + { + "epoch": 0.6589579398350762, + "grad_norm": 0.9423968798876814, + "learning_rate": 1.4282509495228622e-06, + "loss": 0.1281, + "step": 7152 + }, + { + "epoch": 0.6590500760123462, + "grad_norm": 0.939229125385478, + "learning_rate": 1.4275627687547027e-06, + "loss": 0.122, + "step": 7153 + }, + { + "epoch": 0.6591422121896162, + "grad_norm": 1.0008297804235626, + "learning_rate": 1.4268746875624572e-06, + "loss": 0.1361, + "step": 7154 + }, + { + "epoch": 0.6592343483668862, + "grad_norm": 0.975070832200004, + "learning_rate": 1.426186706010012e-06, + "loss": 0.1279, + "step": 7155 + }, + { + "epoch": 0.6593264845441562, + "grad_norm": 0.952944740044869, + "learning_rate": 1.4254988241612456e-06, + "loss": 0.1216, + "step": 7156 + }, + { + "epoch": 0.6594186207214263, + "grad_norm": 1.0120820938152526, + "learning_rate": 1.4248110420800293e-06, + "loss": 0.1241, + "step": 7157 + }, + { + "epoch": 0.6595107568986963, + "grad_norm": 0.9205810165081968, + "learning_rate": 1.4241233598302233e-06, + "loss": 0.1232, + "step": 7158 + }, + { + "epoch": 0.6596028930759663, + "grad_norm": 1.0440449077545666, + "learning_rate": 1.4234357774756802e-06, + "loss": 0.1349, + "step": 7159 + }, + { + "epoch": 0.6596950292532363, + "grad_norm": 0.9193202768365726, + "learning_rate": 1.422748295080241e-06, + "loss": 0.1246, + "step": 7160 + }, + { + "epoch": 0.6597871654305063, + "grad_norm": 0.8943555491454667, + "learning_rate": 1.4220609127077373e-06, + "loss": 0.1195, + "step": 7161 + }, + { + "epoch": 0.6598793016077763, + "grad_norm": 0.9391692237037247, + "learning_rate": 1.4213736304219945e-06, + "loss": 0.1328, + "step": 7162 + }, + { + "epoch": 0.6599714377850463, + "grad_norm": 0.9963129927890189, + "learning_rate": 1.4206864482868265e-06, + "loss": 0.1263, + "step": 7163 + }, + { + "epoch": 0.6600635739623163, + "grad_norm": 0.933541761483965, + "learning_rate": 1.4199993663660372e-06, + "loss": 0.1315, + "step": 7164 + }, + { + "epoch": 0.6601557101395863, + "grad_norm": 0.9339266259054608, + "learning_rate": 1.419312384723423e-06, + "loss": 0.1255, + "step": 7165 + }, + { + "epoch": 0.6602478463168563, + "grad_norm": 0.9000373393357917, + "learning_rate": 1.4186255034227714e-06, + "loss": 0.1114, + "step": 7166 + }, + { + "epoch": 0.6603399824941263, + "grad_norm": 0.8703795566468919, + "learning_rate": 1.4179387225278568e-06, + "loss": 0.1116, + "step": 7167 + }, + { + "epoch": 0.6604321186713963, + "grad_norm": 0.955579337973247, + "learning_rate": 1.4172520421024493e-06, + "loss": 0.1269, + "step": 7168 + }, + { + "epoch": 0.6605242548486663, + "grad_norm": 0.9639282983172128, + "learning_rate": 1.4165654622103054e-06, + "loss": 0.1206, + "step": 7169 + }, + { + "epoch": 0.6606163910259364, + "grad_norm": 0.8729449392995738, + "learning_rate": 1.4158789829151747e-06, + "loss": 0.1203, + "step": 7170 + }, + { + "epoch": 0.6607085272032064, + "grad_norm": 0.9116857232463509, + "learning_rate": 1.4151926042807985e-06, + "loss": 0.1147, + "step": 7171 + }, + { + "epoch": 0.6608006633804764, + "grad_norm": 0.9490013290219388, + "learning_rate": 1.4145063263709056e-06, + "loss": 0.1421, + "step": 7172 + }, + { + "epoch": 0.6608927995577464, + "grad_norm": 0.9488839253969639, + "learning_rate": 1.413820149249216e-06, + "loss": 0.1311, + "step": 7173 + }, + { + "epoch": 0.6609849357350164, + "grad_norm": 1.0096436437834344, + "learning_rate": 1.4131340729794424e-06, + "loss": 0.1288, + "step": 7174 + }, + { + "epoch": 0.6610770719122864, + "grad_norm": 0.9178413047426945, + "learning_rate": 1.4124480976252872e-06, + "loss": 0.1257, + "step": 7175 + }, + { + "epoch": 0.6611692080895564, + "grad_norm": 0.8598420566610736, + "learning_rate": 1.4117622232504442e-06, + "loss": 0.1186, + "step": 7176 + }, + { + "epoch": 0.6612613442668264, + "grad_norm": 0.9438112564377011, + "learning_rate": 1.4110764499185957e-06, + "loss": 0.1368, + "step": 7177 + }, + { + "epoch": 0.6613534804440964, + "grad_norm": 0.9398113472022501, + "learning_rate": 1.410390777693415e-06, + "loss": 0.1287, + "step": 7178 + }, + { + "epoch": 0.6614456166213664, + "grad_norm": 0.8882446435476032, + "learning_rate": 1.409705206638568e-06, + "loss": 0.1223, + "step": 7179 + }, + { + "epoch": 0.6615377527986364, + "grad_norm": 0.9108996799402447, + "learning_rate": 1.409019736817711e-06, + "loss": 0.1313, + "step": 7180 + }, + { + "epoch": 0.6616298889759064, + "grad_norm": 0.9425541395961481, + "learning_rate": 1.4083343682944878e-06, + "loss": 0.14, + "step": 7181 + }, + { + "epoch": 0.6617220251531764, + "grad_norm": 0.9257644599811672, + "learning_rate": 1.4076491011325372e-06, + "loss": 0.1366, + "step": 7182 + }, + { + "epoch": 0.6618141613304463, + "grad_norm": 0.9321219321891313, + "learning_rate": 1.4069639353954837e-06, + "loss": 0.1335, + "step": 7183 + }, + { + "epoch": 0.6619062975077165, + "grad_norm": 0.9094495757127697, + "learning_rate": 1.4062788711469478e-06, + "loss": 0.1259, + "step": 7184 + }, + { + "epoch": 0.6619984336849865, + "grad_norm": 0.9848326389326956, + "learning_rate": 1.405593908450535e-06, + "loss": 0.1272, + "step": 7185 + }, + { + "epoch": 0.6620905698622565, + "grad_norm": 0.9063900632096158, + "learning_rate": 1.4049090473698457e-06, + "loss": 0.1238, + "step": 7186 + }, + { + "epoch": 0.6621827060395264, + "grad_norm": 0.9357963658738954, + "learning_rate": 1.4042242879684703e-06, + "loss": 0.1282, + "step": 7187 + }, + { + "epoch": 0.6622748422167964, + "grad_norm": 0.8979794771571415, + "learning_rate": 1.403539630309988e-06, + "loss": 0.121, + "step": 7188 + }, + { + "epoch": 0.6623669783940664, + "grad_norm": 0.892611802740368, + "learning_rate": 1.4028550744579677e-06, + "loss": 0.1236, + "step": 7189 + }, + { + "epoch": 0.6624591145713364, + "grad_norm": 0.8850509948496229, + "learning_rate": 1.4021706204759716e-06, + "loss": 0.1263, + "step": 7190 + }, + { + "epoch": 0.6625512507486064, + "grad_norm": 0.9465343552302885, + "learning_rate": 1.4014862684275522e-06, + "loss": 0.1319, + "step": 7191 + }, + { + "epoch": 0.6626433869258764, + "grad_norm": 0.9608608477037041, + "learning_rate": 1.4008020183762513e-06, + "loss": 0.1287, + "step": 7192 + }, + { + "epoch": 0.6627355231031464, + "grad_norm": 0.9568016008936874, + "learning_rate": 1.4001178703856016e-06, + "loss": 0.1249, + "step": 7193 + }, + { + "epoch": 0.6628276592804164, + "grad_norm": 0.9681371499803646, + "learning_rate": 1.3994338245191249e-06, + "loss": 0.1298, + "step": 7194 + }, + { + "epoch": 0.6629197954576864, + "grad_norm": 0.9707806670367282, + "learning_rate": 1.398749880840336e-06, + "loss": 0.1291, + "step": 7195 + }, + { + "epoch": 0.6630119316349564, + "grad_norm": 1.0292775882377738, + "learning_rate": 1.3980660394127394e-06, + "loss": 0.128, + "step": 7196 + }, + { + "epoch": 0.6631040678122264, + "grad_norm": 0.9334590025297743, + "learning_rate": 1.3973823002998305e-06, + "loss": 0.1248, + "step": 7197 + }, + { + "epoch": 0.6631962039894965, + "grad_norm": 0.8671221737926933, + "learning_rate": 1.3966986635650936e-06, + "loss": 0.1019, + "step": 7198 + }, + { + "epoch": 0.6632883401667665, + "grad_norm": 0.906986379921877, + "learning_rate": 1.3960151292720039e-06, + "loss": 0.1225, + "step": 7199 + }, + { + "epoch": 0.6633804763440365, + "grad_norm": 0.9726558373988553, + "learning_rate": 1.395331697484028e-06, + "loss": 0.1252, + "step": 7200 + }, + { + "epoch": 0.6634726125213065, + "grad_norm": 0.8928249593897785, + "learning_rate": 1.394648368264624e-06, + "loss": 0.1179, + "step": 7201 + }, + { + "epoch": 0.6635647486985765, + "grad_norm": 0.9246855574964215, + "learning_rate": 1.3939651416772365e-06, + "loss": 0.1227, + "step": 7202 + }, + { + "epoch": 0.6636568848758465, + "grad_norm": 0.9321724830410227, + "learning_rate": 1.3932820177853062e-06, + "loss": 0.1238, + "step": 7203 + }, + { + "epoch": 0.6637490210531165, + "grad_norm": 0.9507285348694358, + "learning_rate": 1.3925989966522585e-06, + "loss": 0.1268, + "step": 7204 + }, + { + "epoch": 0.6638411572303865, + "grad_norm": 0.9361993103413431, + "learning_rate": 1.391916078341514e-06, + "loss": 0.1215, + "step": 7205 + }, + { + "epoch": 0.6639332934076565, + "grad_norm": 0.8919295341094984, + "learning_rate": 1.3912332629164798e-06, + "loss": 0.1179, + "step": 7206 + }, + { + "epoch": 0.6640254295849265, + "grad_norm": 0.8853783172570733, + "learning_rate": 1.3905505504405567e-06, + "loss": 0.124, + "step": 7207 + }, + { + "epoch": 0.6641175657621965, + "grad_norm": 0.9334584777070276, + "learning_rate": 1.3898679409771355e-06, + "loss": 0.1283, + "step": 7208 + }, + { + "epoch": 0.6642097019394665, + "grad_norm": 0.9539829238214229, + "learning_rate": 1.389185434589595e-06, + "loss": 0.1266, + "step": 7209 + }, + { + "epoch": 0.6643018381167365, + "grad_norm": 0.9241016852887277, + "learning_rate": 1.3885030313413056e-06, + "loss": 0.1196, + "step": 7210 + }, + { + "epoch": 0.6643939742940066, + "grad_norm": 0.9586669345335255, + "learning_rate": 1.3878207312956295e-06, + "loss": 0.1351, + "step": 7211 + }, + { + "epoch": 0.6644861104712766, + "grad_norm": 0.9450592859824014, + "learning_rate": 1.3871385345159183e-06, + "loss": 0.1323, + "step": 7212 + }, + { + "epoch": 0.6645782466485466, + "grad_norm": 0.9612734215640861, + "learning_rate": 1.3864564410655149e-06, + "loss": 0.1225, + "step": 7213 + }, + { + "epoch": 0.6646703828258166, + "grad_norm": 0.9395580497180398, + "learning_rate": 1.3857744510077507e-06, + "loss": 0.125, + "step": 7214 + }, + { + "epoch": 0.6647625190030866, + "grad_norm": 0.9515079076163937, + "learning_rate": 1.3850925644059475e-06, + "loss": 0.1413, + "step": 7215 + }, + { + "epoch": 0.6648546551803566, + "grad_norm": 0.9818483068556666, + "learning_rate": 1.3844107813234197e-06, + "loss": 0.1317, + "step": 7216 + }, + { + "epoch": 0.6649467913576266, + "grad_norm": 0.9217060441337399, + "learning_rate": 1.3837291018234723e-06, + "loss": 0.1327, + "step": 7217 + }, + { + "epoch": 0.6650389275348966, + "grad_norm": 0.9614026258662707, + "learning_rate": 1.3830475259693964e-06, + "loss": 0.1324, + "step": 7218 + }, + { + "epoch": 0.6651310637121666, + "grad_norm": 0.8997301921202329, + "learning_rate": 1.3823660538244793e-06, + "loss": 0.114, + "step": 7219 + }, + { + "epoch": 0.6652231998894366, + "grad_norm": 0.9264679973250439, + "learning_rate": 1.3816846854519934e-06, + "loss": 0.1366, + "step": 7220 + }, + { + "epoch": 0.6653153360667066, + "grad_norm": 0.9397209333426656, + "learning_rate": 1.3810034209152057e-06, + "loss": 0.1224, + "step": 7221 + }, + { + "epoch": 0.6654074722439766, + "grad_norm": 0.9140976327735287, + "learning_rate": 1.3803222602773696e-06, + "loss": 0.1209, + "step": 7222 + }, + { + "epoch": 0.6654996084212466, + "grad_norm": 0.9275928232405652, + "learning_rate": 1.379641203601732e-06, + "loss": 0.1295, + "step": 7223 + }, + { + "epoch": 0.6655917445985166, + "grad_norm": 0.8812472686313237, + "learning_rate": 1.3789602509515306e-06, + "loss": 0.126, + "step": 7224 + }, + { + "epoch": 0.6656838807757867, + "grad_norm": 0.9507714938881616, + "learning_rate": 1.3782794023899899e-06, + "loss": 0.1271, + "step": 7225 + }, + { + "epoch": 0.6657760169530567, + "grad_norm": 0.8546349341742145, + "learning_rate": 1.3775986579803276e-06, + "loss": 0.111, + "step": 7226 + }, + { + "epoch": 0.6658681531303267, + "grad_norm": 0.9735538532092373, + "learning_rate": 1.37691801778575e-06, + "loss": 0.1267, + "step": 7227 + }, + { + "epoch": 0.6659602893075967, + "grad_norm": 0.9649175198066844, + "learning_rate": 1.3762374818694558e-06, + "loss": 0.1273, + "step": 7228 + }, + { + "epoch": 0.6660524254848667, + "grad_norm": 0.9308364929610364, + "learning_rate": 1.3755570502946324e-06, + "loss": 0.135, + "step": 7229 + }, + { + "epoch": 0.6661445616621366, + "grad_norm": 0.9482080604533994, + "learning_rate": 1.3748767231244587e-06, + "loss": 0.1295, + "step": 7230 + }, + { + "epoch": 0.6662366978394066, + "grad_norm": 0.9044964317809467, + "learning_rate": 1.3741965004221012e-06, + "loss": 0.1201, + "step": 7231 + }, + { + "epoch": 0.6663288340166766, + "grad_norm": 0.9573803087194334, + "learning_rate": 1.3735163822507196e-06, + "loss": 0.1425, + "step": 7232 + }, + { + "epoch": 0.6664209701939466, + "grad_norm": 0.879465266637758, + "learning_rate": 1.372836368673463e-06, + "loss": 0.1097, + "step": 7233 + }, + { + "epoch": 0.6665131063712166, + "grad_norm": 1.007476252708849, + "learning_rate": 1.3721564597534723e-06, + "loss": 0.1303, + "step": 7234 + }, + { + "epoch": 0.6666052425484866, + "grad_norm": 0.930209822547841, + "learning_rate": 1.3714766555538755e-06, + "loss": 0.1259, + "step": 7235 + }, + { + "epoch": 0.6666973787257566, + "grad_norm": 0.9989347154200431, + "learning_rate": 1.3707969561377915e-06, + "loss": 0.1465, + "step": 7236 + }, + { + "epoch": 0.6667895149030266, + "grad_norm": 0.854671371178768, + "learning_rate": 1.370117361568332e-06, + "loss": 0.1137, + "step": 7237 + }, + { + "epoch": 0.6668816510802967, + "grad_norm": 0.9819684863020709, + "learning_rate": 1.3694378719085976e-06, + "loss": 0.1388, + "step": 7238 + }, + { + "epoch": 0.6669737872575667, + "grad_norm": 0.9252396488730227, + "learning_rate": 1.368758487221678e-06, + "loss": 0.1285, + "step": 7239 + }, + { + "epoch": 0.6670659234348367, + "grad_norm": 0.9453506499299281, + "learning_rate": 1.3680792075706545e-06, + "loss": 0.1397, + "step": 7240 + }, + { + "epoch": 0.6671580596121067, + "grad_norm": 0.8709913858856868, + "learning_rate": 1.367400033018599e-06, + "loss": 0.1148, + "step": 7241 + }, + { + "epoch": 0.6672501957893767, + "grad_norm": 0.9137545587799056, + "learning_rate": 1.3667209636285727e-06, + "loss": 0.131, + "step": 7242 + }, + { + "epoch": 0.6673423319666467, + "grad_norm": 0.9159588879600266, + "learning_rate": 1.366041999463626e-06, + "loss": 0.13, + "step": 7243 + }, + { + "epoch": 0.6674344681439167, + "grad_norm": 0.9230621320209568, + "learning_rate": 1.3653631405868011e-06, + "loss": 0.1232, + "step": 7244 + }, + { + "epoch": 0.6675266043211867, + "grad_norm": 0.8953404755783205, + "learning_rate": 1.3646843870611313e-06, + "loss": 0.1277, + "step": 7245 + }, + { + "epoch": 0.6676187404984567, + "grad_norm": 0.9414937946712962, + "learning_rate": 1.3640057389496392e-06, + "loss": 0.1319, + "step": 7246 + }, + { + "epoch": 0.6677108766757267, + "grad_norm": 0.9230244067093492, + "learning_rate": 1.3633271963153363e-06, + "loss": 0.1258, + "step": 7247 + }, + { + "epoch": 0.6678030128529967, + "grad_norm": 0.9482742954005706, + "learning_rate": 1.3626487592212245e-06, + "loss": 0.1285, + "step": 7248 + }, + { + "epoch": 0.6678951490302667, + "grad_norm": 0.9053301065986715, + "learning_rate": 1.361970427730298e-06, + "loss": 0.1221, + "step": 7249 + }, + { + "epoch": 0.6679872852075367, + "grad_norm": 0.9269917273591195, + "learning_rate": 1.3612922019055409e-06, + "loss": 0.1278, + "step": 7250 + }, + { + "epoch": 0.6680794213848067, + "grad_norm": 0.9945437288140861, + "learning_rate": 1.3606140818099243e-06, + "loss": 0.1283, + "step": 7251 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 0.8834575369543373, + "learning_rate": 1.3599360675064139e-06, + "loss": 0.1173, + "step": 7252 + }, + { + "epoch": 0.6682636937393468, + "grad_norm": 0.9297478423868443, + "learning_rate": 1.359258159057961e-06, + "loss": 0.1243, + "step": 7253 + }, + { + "epoch": 0.6683558299166168, + "grad_norm": 0.8803633005549261, + "learning_rate": 1.358580356527511e-06, + "loss": 0.1271, + "step": 7254 + }, + { + "epoch": 0.6684479660938868, + "grad_norm": 0.9436775723358185, + "learning_rate": 1.3579026599779988e-06, + "loss": 0.1261, + "step": 7255 + }, + { + "epoch": 0.6685401022711568, + "grad_norm": 0.8952694224685467, + "learning_rate": 1.3572250694723465e-06, + "loss": 0.1245, + "step": 7256 + }, + { + "epoch": 0.6686322384484268, + "grad_norm": 0.8551919104756415, + "learning_rate": 1.3565475850734706e-06, + "loss": 0.1139, + "step": 7257 + }, + { + "epoch": 0.6687243746256968, + "grad_norm": 0.9032991002081001, + "learning_rate": 1.355870206844273e-06, + "loss": 0.111, + "step": 7258 + }, + { + "epoch": 0.6688165108029668, + "grad_norm": 0.9402261271430783, + "learning_rate": 1.3551929348476512e-06, + "loss": 0.125, + "step": 7259 + }, + { + "epoch": 0.6689086469802368, + "grad_norm": 0.8590361852348019, + "learning_rate": 1.3545157691464878e-06, + "loss": 0.1212, + "step": 7260 + }, + { + "epoch": 0.6690007831575068, + "grad_norm": 0.953889062291394, + "learning_rate": 1.353838709803658e-06, + "loss": 0.1246, + "step": 7261 + }, + { + "epoch": 0.6690929193347768, + "grad_norm": 0.9024711962523686, + "learning_rate": 1.3531617568820287e-06, + "loss": 0.1256, + "step": 7262 + }, + { + "epoch": 0.6691850555120468, + "grad_norm": 0.9029203771714646, + "learning_rate": 1.3524849104444537e-06, + "loss": 0.1259, + "step": 7263 + }, + { + "epoch": 0.6692771916893168, + "grad_norm": 0.9757431844463745, + "learning_rate": 1.3518081705537771e-06, + "loss": 0.1329, + "step": 7264 + }, + { + "epoch": 0.6693693278665868, + "grad_norm": 0.9514092658235749, + "learning_rate": 1.3511315372728357e-06, + "loss": 0.1181, + "step": 7265 + }, + { + "epoch": 0.6694614640438569, + "grad_norm": 0.9673560639479292, + "learning_rate": 1.3504550106644542e-06, + "loss": 0.1253, + "step": 7266 + }, + { + "epoch": 0.6695536002211269, + "grad_norm": 0.9354307139797914, + "learning_rate": 1.34977859079145e-06, + "loss": 0.1139, + "step": 7267 + }, + { + "epoch": 0.6696457363983969, + "grad_norm": 0.8914156946294755, + "learning_rate": 1.3491022777166276e-06, + "loss": 0.1156, + "step": 7268 + }, + { + "epoch": 0.6697378725756669, + "grad_norm": 0.9649736445005475, + "learning_rate": 1.3484260715027813e-06, + "loss": 0.1436, + "step": 7269 + }, + { + "epoch": 0.6698300087529369, + "grad_norm": 0.9483659023489992, + "learning_rate": 1.3477499722126985e-06, + "loss": 0.1323, + "step": 7270 + }, + { + "epoch": 0.6699221449302069, + "grad_norm": 0.9200966092532619, + "learning_rate": 1.3470739799091555e-06, + "loss": 0.1322, + "step": 7271 + }, + { + "epoch": 0.6700142811074769, + "grad_norm": 0.9403584085916542, + "learning_rate": 1.3463980946549166e-06, + "loss": 0.1382, + "step": 7272 + }, + { + "epoch": 0.6701064172847468, + "grad_norm": 0.8378406184781029, + "learning_rate": 1.3457223165127397e-06, + "loss": 0.1088, + "step": 7273 + }, + { + "epoch": 0.6701985534620168, + "grad_norm": 0.9551913632098953, + "learning_rate": 1.3450466455453693e-06, + "loss": 0.1342, + "step": 7274 + }, + { + "epoch": 0.6702906896392868, + "grad_norm": 0.8597495274518914, + "learning_rate": 1.3443710818155428e-06, + "loss": 0.1191, + "step": 7275 + }, + { + "epoch": 0.6703828258165568, + "grad_norm": 0.9307987876921514, + "learning_rate": 1.3436956253859851e-06, + "loss": 0.1119, + "step": 7276 + }, + { + "epoch": 0.6704749619938268, + "grad_norm": 0.913451889373323, + "learning_rate": 1.3430202763194125e-06, + "loss": 0.1242, + "step": 7277 + }, + { + "epoch": 0.6705670981710968, + "grad_norm": 0.9174767959610509, + "learning_rate": 1.342345034678533e-06, + "loss": 0.1207, + "step": 7278 + }, + { + "epoch": 0.6706592343483669, + "grad_norm": 0.9625576179556493, + "learning_rate": 1.3416699005260416e-06, + "loss": 0.131, + "step": 7279 + }, + { + "epoch": 0.6707513705256369, + "grad_norm": 0.9322744623558886, + "learning_rate": 1.3409948739246236e-06, + "loss": 0.1193, + "step": 7280 + }, + { + "epoch": 0.6708435067029069, + "grad_norm": 0.9144503231058142, + "learning_rate": 1.3403199549369564e-06, + "loss": 0.1288, + "step": 7281 + }, + { + "epoch": 0.6709356428801769, + "grad_norm": 0.9216913139875762, + "learning_rate": 1.3396451436257062e-06, + "loss": 0.1307, + "step": 7282 + }, + { + "epoch": 0.6710277790574469, + "grad_norm": 0.9487383042584467, + "learning_rate": 1.3389704400535303e-06, + "loss": 0.1294, + "step": 7283 + }, + { + "epoch": 0.6711199152347169, + "grad_norm": 0.8999456824995248, + "learning_rate": 1.3382958442830737e-06, + "loss": 0.1239, + "step": 7284 + }, + { + "epoch": 0.6712120514119869, + "grad_norm": 0.9535840771969801, + "learning_rate": 1.337621356376972e-06, + "loss": 0.1264, + "step": 7285 + }, + { + "epoch": 0.6713041875892569, + "grad_norm": 0.9497062608622805, + "learning_rate": 1.3369469763978527e-06, + "loss": 0.1316, + "step": 7286 + }, + { + "epoch": 0.6713963237665269, + "grad_norm": 0.9452724159743188, + "learning_rate": 1.3362727044083318e-06, + "loss": 0.1418, + "step": 7287 + }, + { + "epoch": 0.6714884599437969, + "grad_norm": 0.91445117456331, + "learning_rate": 1.3355985404710164e-06, + "loss": 0.1174, + "step": 7288 + }, + { + "epoch": 0.6715805961210669, + "grad_norm": 0.9449257270380875, + "learning_rate": 1.3349244846485022e-06, + "loss": 0.1301, + "step": 7289 + }, + { + "epoch": 0.6716727322983369, + "grad_norm": 0.964496022253499, + "learning_rate": 1.3342505370033736e-06, + "loss": 0.1125, + "step": 7290 + }, + { + "epoch": 0.6717648684756069, + "grad_norm": 0.936912152887504, + "learning_rate": 1.3335766975982082e-06, + "loss": 0.1243, + "step": 7291 + }, + { + "epoch": 0.6718570046528769, + "grad_norm": 0.8577965724245058, + "learning_rate": 1.3329029664955729e-06, + "loss": 0.1002, + "step": 7292 + }, + { + "epoch": 0.671949140830147, + "grad_norm": 0.990204302102415, + "learning_rate": 1.332229343758022e-06, + "loss": 0.1265, + "step": 7293 + }, + { + "epoch": 0.672041277007417, + "grad_norm": 1.0281368081432407, + "learning_rate": 1.331555829448103e-06, + "loss": 0.1257, + "step": 7294 + }, + { + "epoch": 0.672133413184687, + "grad_norm": 0.9288074008539412, + "learning_rate": 1.33088242362835e-06, + "loss": 0.1209, + "step": 7295 + }, + { + "epoch": 0.672225549361957, + "grad_norm": 0.9119209022595895, + "learning_rate": 1.3302091263612907e-06, + "loss": 0.1174, + "step": 7296 + }, + { + "epoch": 0.672317685539227, + "grad_norm": 0.9143317228925907, + "learning_rate": 1.3295359377094392e-06, + "loss": 0.1295, + "step": 7297 + }, + { + "epoch": 0.672409821716497, + "grad_norm": 0.8944269171000773, + "learning_rate": 1.3288628577353014e-06, + "loss": 0.1251, + "step": 7298 + }, + { + "epoch": 0.672501957893767, + "grad_norm": 0.952822769694342, + "learning_rate": 1.3281898865013749e-06, + "loss": 0.1422, + "step": 7299 + }, + { + "epoch": 0.672594094071037, + "grad_norm": 0.9731154772916867, + "learning_rate": 1.327517024070143e-06, + "loss": 0.129, + "step": 7300 + }, + { + "epoch": 0.672686230248307, + "grad_norm": 0.9820492852941306, + "learning_rate": 1.3268442705040808e-06, + "loss": 0.1319, + "step": 7301 + }, + { + "epoch": 0.672778366425577, + "grad_norm": 0.8859435352688277, + "learning_rate": 1.3261716258656543e-06, + "loss": 0.127, + "step": 7302 + }, + { + "epoch": 0.672870502602847, + "grad_norm": 0.9183943013213689, + "learning_rate": 1.3254990902173187e-06, + "loss": 0.1272, + "step": 7303 + }, + { + "epoch": 0.672962638780117, + "grad_norm": 0.9189656727211152, + "learning_rate": 1.3248266636215202e-06, + "loss": 0.125, + "step": 7304 + }, + { + "epoch": 0.673054774957387, + "grad_norm": 0.867060049690445, + "learning_rate": 1.324154346140692e-06, + "loss": 0.1094, + "step": 7305 + }, + { + "epoch": 0.6731469111346571, + "grad_norm": 1.0020192646596813, + "learning_rate": 1.3234821378372586e-06, + "loss": 0.1299, + "step": 7306 + }, + { + "epoch": 0.6732390473119271, + "grad_norm": 0.9670118862538364, + "learning_rate": 1.3228100387736353e-06, + "loss": 0.1377, + "step": 7307 + }, + { + "epoch": 0.6733311834891971, + "grad_norm": 0.9237445429388595, + "learning_rate": 1.3221380490122276e-06, + "loss": 0.1286, + "step": 7308 + }, + { + "epoch": 0.6734233196664671, + "grad_norm": 0.9893606523013816, + "learning_rate": 1.321466168615428e-06, + "loss": 0.1281, + "step": 7309 + }, + { + "epoch": 0.6735154558437371, + "grad_norm": 0.9510171646860528, + "learning_rate": 1.3207943976456223e-06, + "loss": 0.1281, + "step": 7310 + }, + { + "epoch": 0.6736075920210071, + "grad_norm": 0.9048525408219783, + "learning_rate": 1.3201227361651824e-06, + "loss": 0.1258, + "step": 7311 + }, + { + "epoch": 0.6736997281982771, + "grad_norm": 0.9484261385740433, + "learning_rate": 1.3194511842364738e-06, + "loss": 0.1188, + "step": 7312 + }, + { + "epoch": 0.6737918643755471, + "grad_norm": 0.9452879348278467, + "learning_rate": 1.3187797419218506e-06, + "loss": 0.1274, + "step": 7313 + }, + { + "epoch": 0.673884000552817, + "grad_norm": 1.006014751364486, + "learning_rate": 1.3181084092836544e-06, + "loss": 0.1361, + "step": 7314 + }, + { + "epoch": 0.673976136730087, + "grad_norm": 0.9742076329629352, + "learning_rate": 1.31743718638422e-06, + "loss": 0.133, + "step": 7315 + }, + { + "epoch": 0.674068272907357, + "grad_norm": 0.99308011921525, + "learning_rate": 1.3167660732858705e-06, + "loss": 0.145, + "step": 7316 + }, + { + "epoch": 0.674160409084627, + "grad_norm": 0.9007162632362339, + "learning_rate": 1.316095070050919e-06, + "loss": 0.1169, + "step": 7317 + }, + { + "epoch": 0.674252545261897, + "grad_norm": 0.9403181481368799, + "learning_rate": 1.3154241767416665e-06, + "loss": 0.1266, + "step": 7318 + }, + { + "epoch": 0.674344681439167, + "grad_norm": 0.8985720780848173, + "learning_rate": 1.3147533934204065e-06, + "loss": 0.1148, + "step": 7319 + }, + { + "epoch": 0.6744368176164371, + "grad_norm": 0.8679435821138797, + "learning_rate": 1.3140827201494215e-06, + "loss": 0.1242, + "step": 7320 + }, + { + "epoch": 0.6745289537937071, + "grad_norm": 0.9096030324469376, + "learning_rate": 1.313412156990985e-06, + "loss": 0.1295, + "step": 7321 + }, + { + "epoch": 0.6746210899709771, + "grad_norm": 0.9524739379153031, + "learning_rate": 1.312741704007357e-06, + "loss": 0.136, + "step": 7322 + }, + { + "epoch": 0.6747132261482471, + "grad_norm": 0.9364638633954382, + "learning_rate": 1.3120713612607888e-06, + "loss": 0.1259, + "step": 7323 + }, + { + "epoch": 0.6748053623255171, + "grad_norm": 0.9004953190539895, + "learning_rate": 1.3114011288135225e-06, + "loss": 0.1264, + "step": 7324 + }, + { + "epoch": 0.6748974985027871, + "grad_norm": 0.9161284739862543, + "learning_rate": 1.31073100672779e-06, + "loss": 0.1181, + "step": 7325 + }, + { + "epoch": 0.6749896346800571, + "grad_norm": 0.9414480947386061, + "learning_rate": 1.3100609950658109e-06, + "loss": 0.1223, + "step": 7326 + }, + { + "epoch": 0.6750817708573271, + "grad_norm": 0.904014912340006, + "learning_rate": 1.3093910938897972e-06, + "loss": 0.1266, + "step": 7327 + }, + { + "epoch": 0.6751739070345971, + "grad_norm": 0.8955658720631855, + "learning_rate": 1.3087213032619478e-06, + "loss": 0.117, + "step": 7328 + }, + { + "epoch": 0.6752660432118671, + "grad_norm": 0.9207589123517635, + "learning_rate": 1.3080516232444545e-06, + "loss": 0.1212, + "step": 7329 + }, + { + "epoch": 0.6753581793891371, + "grad_norm": 0.9392613606296041, + "learning_rate": 1.3073820538994952e-06, + "loss": 0.1307, + "step": 7330 + }, + { + "epoch": 0.6754503155664071, + "grad_norm": 0.8560776006565872, + "learning_rate": 1.3067125952892408e-06, + "loss": 0.1074, + "step": 7331 + }, + { + "epoch": 0.6755424517436771, + "grad_norm": 0.8545937841158776, + "learning_rate": 1.3060432474758508e-06, + "loss": 0.1121, + "step": 7332 + }, + { + "epoch": 0.6756345879209471, + "grad_norm": 0.9466866706239028, + "learning_rate": 1.3053740105214741e-06, + "loss": 0.1374, + "step": 7333 + }, + { + "epoch": 0.6757267240982172, + "grad_norm": 0.8949652527185565, + "learning_rate": 1.3047048844882481e-06, + "loss": 0.1158, + "step": 7334 + }, + { + "epoch": 0.6758188602754872, + "grad_norm": 0.8689164231602294, + "learning_rate": 1.304035869438302e-06, + "loss": 0.111, + "step": 7335 + }, + { + "epoch": 0.6759109964527572, + "grad_norm": 0.9635373718004209, + "learning_rate": 1.3033669654337544e-06, + "loss": 0.1406, + "step": 7336 + }, + { + "epoch": 0.6760031326300272, + "grad_norm": 0.9337974075065598, + "learning_rate": 1.302698172536714e-06, + "loss": 0.1214, + "step": 7337 + }, + { + "epoch": 0.6760952688072972, + "grad_norm": 0.8946904614413712, + "learning_rate": 1.3020294908092767e-06, + "loss": 0.1255, + "step": 7338 + }, + { + "epoch": 0.6761874049845672, + "grad_norm": 0.9691555259612942, + "learning_rate": 1.3013609203135297e-06, + "loss": 0.1268, + "step": 7339 + }, + { + "epoch": 0.6762795411618372, + "grad_norm": 0.9120333872404491, + "learning_rate": 1.3006924611115495e-06, + "loss": 0.1238, + "step": 7340 + }, + { + "epoch": 0.6763716773391072, + "grad_norm": 0.9405321337635649, + "learning_rate": 1.300024113265404e-06, + "loss": 0.1232, + "step": 7341 + }, + { + "epoch": 0.6764638135163772, + "grad_norm": 0.9354071231407695, + "learning_rate": 1.2993558768371494e-06, + "loss": 0.1262, + "step": 7342 + }, + { + "epoch": 0.6765559496936472, + "grad_norm": 0.8924751034495181, + "learning_rate": 1.2986877518888307e-06, + "loss": 0.1272, + "step": 7343 + }, + { + "epoch": 0.6766480858709172, + "grad_norm": 0.9034305897586791, + "learning_rate": 1.2980197384824828e-06, + "loss": 0.1189, + "step": 7344 + }, + { + "epoch": 0.6767402220481872, + "grad_norm": 0.8854680747678314, + "learning_rate": 1.2973518366801315e-06, + "loss": 0.1124, + "step": 7345 + }, + { + "epoch": 0.6768323582254572, + "grad_norm": 0.9314473818384239, + "learning_rate": 1.2966840465437923e-06, + "loss": 0.1333, + "step": 7346 + }, + { + "epoch": 0.6769244944027273, + "grad_norm": 0.9275619556132121, + "learning_rate": 1.2960163681354683e-06, + "loss": 0.1216, + "step": 7347 + }, + { + "epoch": 0.6770166305799973, + "grad_norm": 0.8869121761739572, + "learning_rate": 1.2953488015171551e-06, + "loss": 0.1186, + "step": 7348 + }, + { + "epoch": 0.6771087667572673, + "grad_norm": 0.818807364000548, + "learning_rate": 1.294681346750834e-06, + "loss": 0.1117, + "step": 7349 + }, + { + "epoch": 0.6772009029345373, + "grad_norm": 0.9352558114757723, + "learning_rate": 1.294014003898481e-06, + "loss": 0.1212, + "step": 7350 + }, + { + "epoch": 0.6772930391118073, + "grad_norm": 1.041324186976536, + "learning_rate": 1.2933467730220562e-06, + "loss": 0.1429, + "step": 7351 + }, + { + "epoch": 0.6773851752890773, + "grad_norm": 0.9600592474772713, + "learning_rate": 1.2926796541835135e-06, + "loss": 0.1338, + "step": 7352 + }, + { + "epoch": 0.6774773114663473, + "grad_norm": 0.9034424041374622, + "learning_rate": 1.2920126474447957e-06, + "loss": 0.1153, + "step": 7353 + }, + { + "epoch": 0.6775694476436173, + "grad_norm": 1.0130396180529198, + "learning_rate": 1.2913457528678335e-06, + "loss": 0.1418, + "step": 7354 + }, + { + "epoch": 0.6776615838208873, + "grad_norm": 0.9177987594541854, + "learning_rate": 1.2906789705145475e-06, + "loss": 0.1257, + "step": 7355 + }, + { + "epoch": 0.6777537199981573, + "grad_norm": 0.943278223201491, + "learning_rate": 1.2900123004468493e-06, + "loss": 0.1156, + "step": 7356 + }, + { + "epoch": 0.6778458561754273, + "grad_norm": 0.9010057141578023, + "learning_rate": 1.289345742726639e-06, + "loss": 0.1326, + "step": 7357 + }, + { + "epoch": 0.6779379923526972, + "grad_norm": 0.9178511003467293, + "learning_rate": 1.288679297415808e-06, + "loss": 0.1248, + "step": 7358 + }, + { + "epoch": 0.6780301285299672, + "grad_norm": 0.9031359655111605, + "learning_rate": 1.2880129645762344e-06, + "loss": 0.1176, + "step": 7359 + }, + { + "epoch": 0.6781222647072372, + "grad_norm": 0.9436874791851032, + "learning_rate": 1.2873467442697862e-06, + "loss": 0.1354, + "step": 7360 + }, + { + "epoch": 0.6782144008845074, + "grad_norm": 0.9241098072391299, + "learning_rate": 1.286680636558324e-06, + "loss": 0.1179, + "step": 7361 + }, + { + "epoch": 0.6783065370617773, + "grad_norm": 0.9397598292849974, + "learning_rate": 1.2860146415036957e-06, + "loss": 0.1177, + "step": 7362 + }, + { + "epoch": 0.6783986732390473, + "grad_norm": 0.9488656040076715, + "learning_rate": 1.2853487591677377e-06, + "loss": 0.1226, + "step": 7363 + }, + { + "epoch": 0.6784908094163173, + "grad_norm": 0.8690902550103791, + "learning_rate": 1.2846829896122792e-06, + "loss": 0.1116, + "step": 7364 + }, + { + "epoch": 0.6785829455935873, + "grad_norm": 0.9312076330382117, + "learning_rate": 1.284017332899135e-06, + "loss": 0.1259, + "step": 7365 + }, + { + "epoch": 0.6786750817708573, + "grad_norm": 0.923409407118127, + "learning_rate": 1.283351789090113e-06, + "loss": 0.1182, + "step": 7366 + }, + { + "epoch": 0.6787672179481273, + "grad_norm": 0.9441374226173443, + "learning_rate": 1.2826863582470078e-06, + "loss": 0.1254, + "step": 7367 + }, + { + "epoch": 0.6788593541253973, + "grad_norm": 0.9020458710503054, + "learning_rate": 1.2820210404316053e-06, + "loss": 0.1124, + "step": 7368 + }, + { + "epoch": 0.6789514903026673, + "grad_norm": 0.9364867537207302, + "learning_rate": 1.2813558357056806e-06, + "loss": 0.1288, + "step": 7369 + }, + { + "epoch": 0.6790436264799373, + "grad_norm": 1.0236374395047134, + "learning_rate": 1.2806907441309974e-06, + "loss": 0.1373, + "step": 7370 + }, + { + "epoch": 0.6791357626572073, + "grad_norm": 0.8698015409414299, + "learning_rate": 1.2800257657693105e-06, + "loss": 0.1099, + "step": 7371 + }, + { + "epoch": 0.6792278988344773, + "grad_norm": 0.9205891854687096, + "learning_rate": 1.2793609006823615e-06, + "loss": 0.1208, + "step": 7372 + }, + { + "epoch": 0.6793200350117473, + "grad_norm": 1.0175982193294322, + "learning_rate": 1.2786961489318842e-06, + "loss": 0.1436, + "step": 7373 + }, + { + "epoch": 0.6794121711890174, + "grad_norm": 0.9473057310918341, + "learning_rate": 1.278031510579602e-06, + "loss": 0.1197, + "step": 7374 + }, + { + "epoch": 0.6795043073662874, + "grad_norm": 0.9997006968274136, + "learning_rate": 1.2773669856872256e-06, + "loss": 0.1398, + "step": 7375 + }, + { + "epoch": 0.6795964435435574, + "grad_norm": 0.879881394116734, + "learning_rate": 1.2767025743164551e-06, + "loss": 0.1183, + "step": 7376 + }, + { + "epoch": 0.6796885797208274, + "grad_norm": 0.8972780003606752, + "learning_rate": 1.2760382765289821e-06, + "loss": 0.1192, + "step": 7377 + }, + { + "epoch": 0.6797807158980974, + "grad_norm": 0.9385686665101676, + "learning_rate": 1.275374092386487e-06, + "loss": 0.1281, + "step": 7378 + }, + { + "epoch": 0.6798728520753674, + "grad_norm": 0.8774908119041022, + "learning_rate": 1.2747100219506404e-06, + "loss": 0.1199, + "step": 7379 + }, + { + "epoch": 0.6799649882526374, + "grad_norm": 0.9192489819603069, + "learning_rate": 1.2740460652831e-06, + "loss": 0.1258, + "step": 7380 + }, + { + "epoch": 0.6800571244299074, + "grad_norm": 0.9323643542426164, + "learning_rate": 1.2733822224455133e-06, + "loss": 0.1309, + "step": 7381 + }, + { + "epoch": 0.6801492606071774, + "grad_norm": 1.0334964401625448, + "learning_rate": 1.272718493499519e-06, + "loss": 0.1378, + "step": 7382 + }, + { + "epoch": 0.6802413967844474, + "grad_norm": 0.9062491939851149, + "learning_rate": 1.272054878506746e-06, + "loss": 0.126, + "step": 7383 + }, + { + "epoch": 0.6803335329617174, + "grad_norm": 0.8902882054217731, + "learning_rate": 1.2713913775288086e-06, + "loss": 0.1158, + "step": 7384 + }, + { + "epoch": 0.6804256691389874, + "grad_norm": 0.8903105818667769, + "learning_rate": 1.2707279906273152e-06, + "loss": 0.1278, + "step": 7385 + }, + { + "epoch": 0.6805178053162574, + "grad_norm": 0.9324559481614326, + "learning_rate": 1.270064717863859e-06, + "loss": 0.1305, + "step": 7386 + }, + { + "epoch": 0.6806099414935274, + "grad_norm": 0.8959648512122598, + "learning_rate": 1.269401559300027e-06, + "loss": 0.1273, + "step": 7387 + }, + { + "epoch": 0.6807020776707975, + "grad_norm": 0.9152200447009375, + "learning_rate": 1.2687385149973919e-06, + "loss": 0.1353, + "step": 7388 + }, + { + "epoch": 0.6807942138480675, + "grad_norm": 0.8724993170234326, + "learning_rate": 1.268075585017518e-06, + "loss": 0.1169, + "step": 7389 + }, + { + "epoch": 0.6808863500253375, + "grad_norm": 0.9065934723664036, + "learning_rate": 1.2674127694219588e-06, + "loss": 0.1258, + "step": 7390 + }, + { + "epoch": 0.6809784862026075, + "grad_norm": 1.0023250137596291, + "learning_rate": 1.2667500682722584e-06, + "loss": 0.1295, + "step": 7391 + }, + { + "epoch": 0.6810706223798775, + "grad_norm": 0.9139762424853833, + "learning_rate": 1.266087481629945e-06, + "loss": 0.1222, + "step": 7392 + }, + { + "epoch": 0.6811627585571475, + "grad_norm": 0.986025500116549, + "learning_rate": 1.2654250095565417e-06, + "loss": 0.1304, + "step": 7393 + }, + { + "epoch": 0.6812548947344175, + "grad_norm": 0.9687913893761001, + "learning_rate": 1.2647626521135592e-06, + "loss": 0.1219, + "step": 7394 + }, + { + "epoch": 0.6813470309116875, + "grad_norm": 1.0229554853329992, + "learning_rate": 1.2641004093624981e-06, + "loss": 0.131, + "step": 7395 + }, + { + "epoch": 0.6814391670889575, + "grad_norm": 0.9457520902813298, + "learning_rate": 1.2634382813648462e-06, + "loss": 0.13, + "step": 7396 + }, + { + "epoch": 0.6815313032662275, + "grad_norm": 0.8981736708544468, + "learning_rate": 1.262776268182084e-06, + "loss": 0.1344, + "step": 7397 + }, + { + "epoch": 0.6816234394434975, + "grad_norm": 0.9310753904940697, + "learning_rate": 1.2621143698756778e-06, + "loss": 0.1256, + "step": 7398 + }, + { + "epoch": 0.6817155756207675, + "grad_norm": 0.9271288321729084, + "learning_rate": 1.2614525865070848e-06, + "loss": 0.1283, + "step": 7399 + }, + { + "epoch": 0.6818077117980375, + "grad_norm": 0.9112315579438799, + "learning_rate": 1.260790918137754e-06, + "loss": 0.1319, + "step": 7400 + }, + { + "epoch": 0.6818998479753074, + "grad_norm": 0.9539092052600017, + "learning_rate": 1.2601293648291184e-06, + "loss": 0.1239, + "step": 7401 + }, + { + "epoch": 0.6819919841525776, + "grad_norm": 0.9703369630185199, + "learning_rate": 1.2594679266426063e-06, + "loss": 0.1301, + "step": 7402 + }, + { + "epoch": 0.6820841203298476, + "grad_norm": 0.9137480063164818, + "learning_rate": 1.2588066036396294e-06, + "loss": 0.1171, + "step": 7403 + }, + { + "epoch": 0.6821762565071176, + "grad_norm": 0.9058815845580901, + "learning_rate": 1.2581453958815937e-06, + "loss": 0.1195, + "step": 7404 + }, + { + "epoch": 0.6822683926843875, + "grad_norm": 0.9045402132109076, + "learning_rate": 1.2574843034298912e-06, + "loss": 0.1265, + "step": 7405 + }, + { + "epoch": 0.6823605288616575, + "grad_norm": 0.9698781357496576, + "learning_rate": 1.2568233263459042e-06, + "loss": 0.1312, + "step": 7406 + }, + { + "epoch": 0.6824526650389275, + "grad_norm": 1.0278967298327988, + "learning_rate": 1.2561624646910064e-06, + "loss": 0.1346, + "step": 7407 + }, + { + "epoch": 0.6825448012161975, + "grad_norm": 0.9620339931384371, + "learning_rate": 1.2555017185265578e-06, + "loss": 0.1376, + "step": 7408 + }, + { + "epoch": 0.6826369373934675, + "grad_norm": 0.8746590584378339, + "learning_rate": 1.2548410879139072e-06, + "loss": 0.105, + "step": 7409 + }, + { + "epoch": 0.6827290735707375, + "grad_norm": 0.9537799769999069, + "learning_rate": 1.254180572914396e-06, + "loss": 0.1309, + "step": 7410 + }, + { + "epoch": 0.6828212097480075, + "grad_norm": 0.8835486213792817, + "learning_rate": 1.2535201735893526e-06, + "loss": 0.1168, + "step": 7411 + }, + { + "epoch": 0.6829133459252775, + "grad_norm": 0.9677867977994593, + "learning_rate": 1.252859890000096e-06, + "loss": 0.1247, + "step": 7412 + }, + { + "epoch": 0.6830054821025475, + "grad_norm": 0.9173729144330235, + "learning_rate": 1.252199722207933e-06, + "loss": 0.1182, + "step": 7413 + }, + { + "epoch": 0.6830976182798175, + "grad_norm": 0.9489197558146778, + "learning_rate": 1.2515396702741593e-06, + "loss": 0.1275, + "step": 7414 + }, + { + "epoch": 0.6831897544570876, + "grad_norm": 1.0231784534441966, + "learning_rate": 1.2508797342600613e-06, + "loss": 0.137, + "step": 7415 + }, + { + "epoch": 0.6832818906343576, + "grad_norm": 0.9137554998444966, + "learning_rate": 1.2502199142269154e-06, + "loss": 0.1262, + "step": 7416 + }, + { + "epoch": 0.6833740268116276, + "grad_norm": 0.959616464056135, + "learning_rate": 1.2495602102359837e-06, + "loss": 0.1159, + "step": 7417 + }, + { + "epoch": 0.6834661629888976, + "grad_norm": 0.9443226724457511, + "learning_rate": 1.2489006223485225e-06, + "loss": 0.1318, + "step": 7418 + }, + { + "epoch": 0.6835582991661676, + "grad_norm": 0.9055951065867156, + "learning_rate": 1.2482411506257722e-06, + "loss": 0.1217, + "step": 7419 + }, + { + "epoch": 0.6836504353434376, + "grad_norm": 0.8964410503789555, + "learning_rate": 1.2475817951289665e-06, + "loss": 0.1219, + "step": 7420 + }, + { + "epoch": 0.6837425715207076, + "grad_norm": 0.9692218760916794, + "learning_rate": 1.2469225559193251e-06, + "loss": 0.1351, + "step": 7421 + }, + { + "epoch": 0.6838347076979776, + "grad_norm": 0.913552989477623, + "learning_rate": 1.2462634330580593e-06, + "loss": 0.1179, + "step": 7422 + }, + { + "epoch": 0.6839268438752476, + "grad_norm": 0.9013278373038858, + "learning_rate": 1.2456044266063694e-06, + "loss": 0.1231, + "step": 7423 + }, + { + "epoch": 0.6840189800525176, + "grad_norm": 0.9833694142508832, + "learning_rate": 1.2449455366254434e-06, + "loss": 0.1342, + "step": 7424 + }, + { + "epoch": 0.6841111162297876, + "grad_norm": 0.8884726425002472, + "learning_rate": 1.2442867631764588e-06, + "loss": 0.1214, + "step": 7425 + }, + { + "epoch": 0.6842032524070576, + "grad_norm": 0.9212373080375857, + "learning_rate": 1.2436281063205833e-06, + "loss": 0.1264, + "step": 7426 + }, + { + "epoch": 0.6842953885843276, + "grad_norm": 0.961439810407205, + "learning_rate": 1.2429695661189731e-06, + "loss": 0.1297, + "step": 7427 + }, + { + "epoch": 0.6843875247615976, + "grad_norm": 0.9165466783704851, + "learning_rate": 1.242311142632775e-06, + "loss": 0.133, + "step": 7428 + }, + { + "epoch": 0.6844796609388677, + "grad_norm": 0.9031234366618096, + "learning_rate": 1.2416528359231228e-06, + "loss": 0.1201, + "step": 7429 + }, + { + "epoch": 0.6845717971161377, + "grad_norm": 0.923906786213281, + "learning_rate": 1.240994646051139e-06, + "loss": 0.1336, + "step": 7430 + }, + { + "epoch": 0.6846639332934077, + "grad_norm": 0.959066739495178, + "learning_rate": 1.2403365730779383e-06, + "loss": 0.1339, + "step": 7431 + }, + { + "epoch": 0.6847560694706777, + "grad_norm": 0.9688337673057995, + "learning_rate": 1.2396786170646218e-06, + "loss": 0.128, + "step": 7432 + }, + { + "epoch": 0.6848482056479477, + "grad_norm": 0.9374828794417941, + "learning_rate": 1.2390207780722827e-06, + "loss": 0.1191, + "step": 7433 + }, + { + "epoch": 0.6849403418252177, + "grad_norm": 0.9645857123533416, + "learning_rate": 1.238363056162e-06, + "loss": 0.1268, + "step": 7434 + }, + { + "epoch": 0.6850324780024877, + "grad_norm": 0.917977034506336, + "learning_rate": 1.2377054513948423e-06, + "loss": 0.1317, + "step": 7435 + }, + { + "epoch": 0.6851246141797577, + "grad_norm": 0.903871262985241, + "learning_rate": 1.2370479638318692e-06, + "loss": 0.1263, + "step": 7436 + }, + { + "epoch": 0.6852167503570277, + "grad_norm": 0.9181670238680173, + "learning_rate": 1.2363905935341295e-06, + "loss": 0.1237, + "step": 7437 + }, + { + "epoch": 0.6853088865342977, + "grad_norm": 0.9205604948512467, + "learning_rate": 1.235733340562658e-06, + "loss": 0.1302, + "step": 7438 + }, + { + "epoch": 0.6854010227115677, + "grad_norm": 0.8849849610348361, + "learning_rate": 1.2350762049784835e-06, + "loss": 0.1192, + "step": 7439 + }, + { + "epoch": 0.6854931588888377, + "grad_norm": 0.9159893977573488, + "learning_rate": 1.2344191868426181e-06, + "loss": 0.1278, + "step": 7440 + }, + { + "epoch": 0.6855852950661077, + "grad_norm": 0.9363760794106957, + "learning_rate": 1.2337622862160687e-06, + "loss": 0.1238, + "step": 7441 + }, + { + "epoch": 0.6856774312433778, + "grad_norm": 0.8938566236978432, + "learning_rate": 1.233105503159826e-06, + "loss": 0.1244, + "step": 7442 + }, + { + "epoch": 0.6857695674206478, + "grad_norm": 0.8527983351894046, + "learning_rate": 1.2324488377348736e-06, + "loss": 0.1087, + "step": 7443 + }, + { + "epoch": 0.6858617035979178, + "grad_norm": 0.907390478133244, + "learning_rate": 1.2317922900021843e-06, + "loss": 0.1223, + "step": 7444 + }, + { + "epoch": 0.6859538397751878, + "grad_norm": 0.9151103834295761, + "learning_rate": 1.2311358600227172e-06, + "loss": 0.118, + "step": 7445 + }, + { + "epoch": 0.6860459759524578, + "grad_norm": 0.9384525478458614, + "learning_rate": 1.2304795478574211e-06, + "loss": 0.1201, + "step": 7446 + }, + { + "epoch": 0.6861381121297278, + "grad_norm": 0.8967253804927608, + "learning_rate": 1.2298233535672357e-06, + "loss": 0.1254, + "step": 7447 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 0.9329650086140051, + "learning_rate": 1.2291672772130885e-06, + "loss": 0.1379, + "step": 7448 + }, + { + "epoch": 0.6863223844842677, + "grad_norm": 0.9485151638142946, + "learning_rate": 1.2285113188558975e-06, + "loss": 0.1324, + "step": 7449 + }, + { + "epoch": 0.6864145206615377, + "grad_norm": 0.9190267891724095, + "learning_rate": 1.2278554785565671e-06, + "loss": 0.1328, + "step": 7450 + }, + { + "epoch": 0.6865066568388077, + "grad_norm": 0.929404692376141, + "learning_rate": 1.2271997563759918e-06, + "loss": 0.1326, + "step": 7451 + }, + { + "epoch": 0.6865987930160777, + "grad_norm": 0.9353304920500429, + "learning_rate": 1.226544152375056e-06, + "loss": 0.1309, + "step": 7452 + }, + { + "epoch": 0.6866909291933477, + "grad_norm": 0.9166655846212474, + "learning_rate": 1.2258886666146335e-06, + "loss": 0.1204, + "step": 7453 + }, + { + "epoch": 0.6867830653706177, + "grad_norm": 0.8790079523079889, + "learning_rate": 1.2252332991555846e-06, + "loss": 0.1168, + "step": 7454 + }, + { + "epoch": 0.6868752015478877, + "grad_norm": 0.9637556673833563, + "learning_rate": 1.224578050058762e-06, + "loss": 0.1382, + "step": 7455 + }, + { + "epoch": 0.6869673377251578, + "grad_norm": 0.8295194190943669, + "learning_rate": 1.2239229193850039e-06, + "loss": 0.1025, + "step": 7456 + }, + { + "epoch": 0.6870594739024278, + "grad_norm": 0.9419495896513839, + "learning_rate": 1.2232679071951398e-06, + "loss": 0.129, + "step": 7457 + }, + { + "epoch": 0.6871516100796978, + "grad_norm": 0.90361666710837, + "learning_rate": 1.2226130135499891e-06, + "loss": 0.1215, + "step": 7458 + }, + { + "epoch": 0.6872437462569678, + "grad_norm": 0.8778510526809243, + "learning_rate": 1.2219582385103564e-06, + "loss": 0.1186, + "step": 7459 + }, + { + "epoch": 0.6873358824342378, + "grad_norm": 0.9003346616886775, + "learning_rate": 1.2213035821370401e-06, + "loss": 0.1193, + "step": 7460 + }, + { + "epoch": 0.6874280186115078, + "grad_norm": 0.9341321475259016, + "learning_rate": 1.2206490444908226e-06, + "loss": 0.1184, + "step": 7461 + }, + { + "epoch": 0.6875201547887778, + "grad_norm": 1.029120204312684, + "learning_rate": 1.21999462563248e-06, + "loss": 0.139, + "step": 7462 + }, + { + "epoch": 0.6876122909660478, + "grad_norm": 0.991534769651168, + "learning_rate": 1.2193403256227731e-06, + "loss": 0.1412, + "step": 7463 + }, + { + "epoch": 0.6877044271433178, + "grad_norm": 0.9770012066753403, + "learning_rate": 1.2186861445224548e-06, + "loss": 0.1279, + "step": 7464 + }, + { + "epoch": 0.6877965633205878, + "grad_norm": 0.9755692964671077, + "learning_rate": 1.2180320823922662e-06, + "loss": 0.1307, + "step": 7465 + }, + { + "epoch": 0.6878886994978578, + "grad_norm": 0.9479539739269719, + "learning_rate": 1.2173781392929383e-06, + "loss": 0.1119, + "step": 7466 + }, + { + "epoch": 0.6879808356751278, + "grad_norm": 0.9075991770684234, + "learning_rate": 1.2167243152851862e-06, + "loss": 0.1236, + "step": 7467 + }, + { + "epoch": 0.6880729718523978, + "grad_norm": 0.9412169301247907, + "learning_rate": 1.21607061042972e-06, + "loss": 0.1231, + "step": 7468 + }, + { + "epoch": 0.6881651080296678, + "grad_norm": 0.9310202024859918, + "learning_rate": 1.2154170247872354e-06, + "loss": 0.1295, + "step": 7469 + }, + { + "epoch": 0.6882572442069379, + "grad_norm": 0.8808125116214799, + "learning_rate": 1.2147635584184194e-06, + "loss": 0.1219, + "step": 7470 + }, + { + "epoch": 0.6883493803842079, + "grad_norm": 0.9574965439429187, + "learning_rate": 1.2141102113839442e-06, + "loss": 0.126, + "step": 7471 + }, + { + "epoch": 0.6884415165614779, + "grad_norm": 0.9585891755121796, + "learning_rate": 1.2134569837444755e-06, + "loss": 0.1192, + "step": 7472 + }, + { + "epoch": 0.6885336527387479, + "grad_norm": 0.9390435430840817, + "learning_rate": 1.2128038755606632e-06, + "loss": 0.1312, + "step": 7473 + }, + { + "epoch": 0.6886257889160179, + "grad_norm": 0.9404765976585037, + "learning_rate": 1.2121508868931507e-06, + "loss": 0.1272, + "step": 7474 + }, + { + "epoch": 0.6887179250932879, + "grad_norm": 0.9462706939442835, + "learning_rate": 1.2114980178025657e-06, + "loss": 0.1238, + "step": 7475 + }, + { + "epoch": 0.6888100612705579, + "grad_norm": 0.9467478685012891, + "learning_rate": 1.2108452683495286e-06, + "loss": 0.1275, + "step": 7476 + }, + { + "epoch": 0.6889021974478279, + "grad_norm": 0.9634104501865915, + "learning_rate": 1.210192638594648e-06, + "loss": 0.135, + "step": 7477 + }, + { + "epoch": 0.6889943336250979, + "grad_norm": 0.9728865732091142, + "learning_rate": 1.2095401285985197e-06, + "loss": 0.1333, + "step": 7478 + }, + { + "epoch": 0.6890864698023679, + "grad_norm": 0.933124824968402, + "learning_rate": 1.2088877384217286e-06, + "loss": 0.1293, + "step": 7479 + }, + { + "epoch": 0.6891786059796379, + "grad_norm": 0.9059485628010226, + "learning_rate": 1.2082354681248495e-06, + "loss": 0.1212, + "step": 7480 + }, + { + "epoch": 0.6892707421569079, + "grad_norm": 0.8854842763292889, + "learning_rate": 1.2075833177684465e-06, + "loss": 0.1144, + "step": 7481 + }, + { + "epoch": 0.6893628783341779, + "grad_norm": 0.922192406975205, + "learning_rate": 1.2069312874130725e-06, + "loss": 0.1121, + "step": 7482 + }, + { + "epoch": 0.689455014511448, + "grad_norm": 0.9320451445217117, + "learning_rate": 1.2062793771192676e-06, + "loss": 0.1246, + "step": 7483 + }, + { + "epoch": 0.689547150688718, + "grad_norm": 0.8946586316246357, + "learning_rate": 1.2056275869475606e-06, + "loss": 0.1246, + "step": 7484 + }, + { + "epoch": 0.689639286865988, + "grad_norm": 0.8953871059867827, + "learning_rate": 1.2049759169584718e-06, + "loss": 0.12, + "step": 7485 + }, + { + "epoch": 0.689731423043258, + "grad_norm": 0.9397138554660357, + "learning_rate": 1.2043243672125083e-06, + "loss": 0.1231, + "step": 7486 + }, + { + "epoch": 0.689823559220528, + "grad_norm": 0.870590564525269, + "learning_rate": 1.2036729377701679e-06, + "loss": 0.1123, + "step": 7487 + }, + { + "epoch": 0.689915695397798, + "grad_norm": 0.9123543737092503, + "learning_rate": 1.2030216286919343e-06, + "loss": 0.1241, + "step": 7488 + }, + { + "epoch": 0.690007831575068, + "grad_norm": 0.9315347547200894, + "learning_rate": 1.2023704400382813e-06, + "loss": 0.1266, + "step": 7489 + }, + { + "epoch": 0.690099967752338, + "grad_norm": 0.9419355207806347, + "learning_rate": 1.201719371869673e-06, + "loss": 0.1259, + "step": 7490 + }, + { + "epoch": 0.690192103929608, + "grad_norm": 1.0714501807054846, + "learning_rate": 1.2010684242465612e-06, + "loss": 0.1364, + "step": 7491 + }, + { + "epoch": 0.6902842401068779, + "grad_norm": 0.9120663194555401, + "learning_rate": 1.2004175972293852e-06, + "loss": 0.1118, + "step": 7492 + }, + { + "epoch": 0.6903763762841479, + "grad_norm": 0.9301664178778738, + "learning_rate": 1.1997668908785761e-06, + "loss": 0.1289, + "step": 7493 + }, + { + "epoch": 0.6904685124614179, + "grad_norm": 0.9135039431916775, + "learning_rate": 1.1991163052545502e-06, + "loss": 0.1254, + "step": 7494 + }, + { + "epoch": 0.6905606486386879, + "grad_norm": 0.9372952523507923, + "learning_rate": 1.1984658404177162e-06, + "loss": 0.1101, + "step": 7495 + }, + { + "epoch": 0.6906527848159579, + "grad_norm": 0.9081435909843156, + "learning_rate": 1.1978154964284683e-06, + "loss": 0.1182, + "step": 7496 + }, + { + "epoch": 0.690744920993228, + "grad_norm": 0.9207021186709088, + "learning_rate": 1.1971652733471915e-06, + "loss": 0.1251, + "step": 7497 + }, + { + "epoch": 0.690837057170498, + "grad_norm": 0.9124595681631252, + "learning_rate": 1.19651517123426e-06, + "loss": 0.126, + "step": 7498 + }, + { + "epoch": 0.690929193347768, + "grad_norm": 0.9578194252866348, + "learning_rate": 1.1958651901500356e-06, + "loss": 0.1233, + "step": 7499 + }, + { + "epoch": 0.691021329525038, + "grad_norm": 0.8642829438776286, + "learning_rate": 1.1952153301548674e-06, + "loss": 0.117, + "step": 7500 + }, + { + "epoch": 0.691021329525038, + "eval_loss": 0.12531189620494843, + "eval_runtime": 299.4056, + "eval_samples_per_second": 23.436, + "eval_steps_per_second": 2.932, + "step": 7500 + }, + { + "epoch": 0.691113465702308, + "grad_norm": 0.9056786919297217, + "learning_rate": 1.1945655913090965e-06, + "loss": 0.1273, + "step": 7501 + }, + { + "epoch": 0.691205601879578, + "grad_norm": 0.9366350244304404, + "learning_rate": 1.1939159736730508e-06, + "loss": 0.1304, + "step": 7502 + }, + { + "epoch": 0.691297738056848, + "grad_norm": 0.9873293906173173, + "learning_rate": 1.1932664773070483e-06, + "loss": 0.1305, + "step": 7503 + }, + { + "epoch": 0.691389874234118, + "grad_norm": 0.879909629424238, + "learning_rate": 1.192617102271394e-06, + "loss": 0.1246, + "step": 7504 + }, + { + "epoch": 0.691482010411388, + "grad_norm": 0.879155311662118, + "learning_rate": 1.1919678486263817e-06, + "loss": 0.1142, + "step": 7505 + }, + { + "epoch": 0.691574146588658, + "grad_norm": 0.9510831799893671, + "learning_rate": 1.1913187164322954e-06, + "loss": 0.1268, + "step": 7506 + }, + { + "epoch": 0.691666282765928, + "grad_norm": 0.8829687120011289, + "learning_rate": 1.190669705749408e-06, + "loss": 0.1266, + "step": 7507 + }, + { + "epoch": 0.691758418943198, + "grad_norm": 0.8564115074734555, + "learning_rate": 1.1900208166379784e-06, + "loss": 0.1147, + "step": 7508 + }, + { + "epoch": 0.691850555120468, + "grad_norm": 0.9176006723966627, + "learning_rate": 1.1893720491582579e-06, + "loss": 0.1236, + "step": 7509 + }, + { + "epoch": 0.6919426912977381, + "grad_norm": 0.9154865319735849, + "learning_rate": 1.1887234033704827e-06, + "loss": 0.1153, + "step": 7510 + }, + { + "epoch": 0.6920348274750081, + "grad_norm": 0.8846755111687101, + "learning_rate": 1.1880748793348818e-06, + "loss": 0.1264, + "step": 7511 + }, + { + "epoch": 0.6921269636522781, + "grad_norm": 0.9063995376950635, + "learning_rate": 1.1874264771116684e-06, + "loss": 0.1233, + "step": 7512 + }, + { + "epoch": 0.6922190998295481, + "grad_norm": 0.9119144408492244, + "learning_rate": 1.1867781967610478e-06, + "loss": 0.1286, + "step": 7513 + }, + { + "epoch": 0.6923112360068181, + "grad_norm": 0.9227075633333177, + "learning_rate": 1.186130038343214e-06, + "loss": 0.1237, + "step": 7514 + }, + { + "epoch": 0.6924033721840881, + "grad_norm": 0.9447847319047454, + "learning_rate": 1.1854820019183467e-06, + "loss": 0.1213, + "step": 7515 + }, + { + "epoch": 0.6924955083613581, + "grad_norm": 0.9557400253206907, + "learning_rate": 1.1848340875466176e-06, + "loss": 0.1241, + "step": 7516 + }, + { + "epoch": 0.6925876445386281, + "grad_norm": 0.9213338639911549, + "learning_rate": 1.1841862952881845e-06, + "loss": 0.1216, + "step": 7517 + }, + { + "epoch": 0.6926797807158981, + "grad_norm": 0.929761471437544, + "learning_rate": 1.183538625203195e-06, + "loss": 0.1161, + "step": 7518 + }, + { + "epoch": 0.6927719168931681, + "grad_norm": 0.956777668991843, + "learning_rate": 1.182891077351787e-06, + "loss": 0.1286, + "step": 7519 + }, + { + "epoch": 0.6928640530704381, + "grad_norm": 0.9351053697619062, + "learning_rate": 1.1822436517940844e-06, + "loss": 0.1376, + "step": 7520 + }, + { + "epoch": 0.6929561892477081, + "grad_norm": 0.9307576923675359, + "learning_rate": 1.1815963485901994e-06, + "loss": 0.1233, + "step": 7521 + }, + { + "epoch": 0.6930483254249781, + "grad_norm": 0.9209797831879457, + "learning_rate": 1.1809491678002356e-06, + "loss": 0.1235, + "step": 7522 + }, + { + "epoch": 0.6931404616022481, + "grad_norm": 0.8927925400652459, + "learning_rate": 1.1803021094842831e-06, + "loss": 0.1257, + "step": 7523 + }, + { + "epoch": 0.6932325977795182, + "grad_norm": 0.8842257147429041, + "learning_rate": 1.179655173702423e-06, + "loss": 0.1131, + "step": 7524 + }, + { + "epoch": 0.6933247339567882, + "grad_norm": 0.960345088928631, + "learning_rate": 1.1790083605147221e-06, + "loss": 0.1219, + "step": 7525 + }, + { + "epoch": 0.6934168701340582, + "grad_norm": 0.9421499013450262, + "learning_rate": 1.1783616699812362e-06, + "loss": 0.1343, + "step": 7526 + }, + { + "epoch": 0.6935090063113282, + "grad_norm": 0.9502278428418999, + "learning_rate": 1.1777151021620113e-06, + "loss": 0.1312, + "step": 7527 + }, + { + "epoch": 0.6936011424885982, + "grad_norm": 1.001310947592207, + "learning_rate": 1.1770686571170824e-06, + "loss": 0.1333, + "step": 7528 + }, + { + "epoch": 0.6936932786658682, + "grad_norm": 0.9471759067515962, + "learning_rate": 1.17642233490647e-06, + "loss": 0.1271, + "step": 7529 + }, + { + "epoch": 0.6937854148431382, + "grad_norm": 0.8974159275769846, + "learning_rate": 1.1757761355901875e-06, + "loss": 0.1104, + "step": 7530 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 0.9724327109166822, + "learning_rate": 1.1751300592282325e-06, + "loss": 0.1385, + "step": 7531 + }, + { + "epoch": 0.6939696871976782, + "grad_norm": 0.9465696548880492, + "learning_rate": 1.1744841058805947e-06, + "loss": 0.1143, + "step": 7532 + }, + { + "epoch": 0.6940618233749482, + "grad_norm": 0.979571877068151, + "learning_rate": 1.1738382756072495e-06, + "loss": 0.1469, + "step": 7533 + }, + { + "epoch": 0.6941539595522181, + "grad_norm": 0.9206628125733626, + "learning_rate": 1.1731925684681631e-06, + "loss": 0.1203, + "step": 7534 + }, + { + "epoch": 0.6942460957294881, + "grad_norm": 0.9375005379901026, + "learning_rate": 1.1725469845232906e-06, + "loss": 0.1331, + "step": 7535 + }, + { + "epoch": 0.6943382319067581, + "grad_norm": 0.9411215714914509, + "learning_rate": 1.1719015238325731e-06, + "loss": 0.1205, + "step": 7536 + }, + { + "epoch": 0.6944303680840282, + "grad_norm": 0.8819105340485545, + "learning_rate": 1.1712561864559415e-06, + "loss": 0.1173, + "step": 7537 + }, + { + "epoch": 0.6945225042612982, + "grad_norm": 0.9772820854322495, + "learning_rate": 1.1706109724533158e-06, + "loss": 0.1277, + "step": 7538 + }, + { + "epoch": 0.6946146404385682, + "grad_norm": 0.8949011843616254, + "learning_rate": 1.1699658818846044e-06, + "loss": 0.1225, + "step": 7539 + }, + { + "epoch": 0.6947067766158382, + "grad_norm": 0.8911294765958074, + "learning_rate": 1.1693209148097049e-06, + "loss": 0.1134, + "step": 7540 + }, + { + "epoch": 0.6947989127931082, + "grad_norm": 0.9990537451251741, + "learning_rate": 1.1686760712885018e-06, + "loss": 0.1421, + "step": 7541 + }, + { + "epoch": 0.6948910489703782, + "grad_norm": 0.929425147367063, + "learning_rate": 1.1680313513808677e-06, + "loss": 0.1273, + "step": 7542 + }, + { + "epoch": 0.6949831851476482, + "grad_norm": 0.9202734532577793, + "learning_rate": 1.1673867551466658e-06, + "loss": 0.1231, + "step": 7543 + }, + { + "epoch": 0.6950753213249182, + "grad_norm": 0.8998933513621242, + "learning_rate": 1.1667422826457475e-06, + "loss": 0.1212, + "step": 7544 + }, + { + "epoch": 0.6951674575021882, + "grad_norm": 0.9106168799928811, + "learning_rate": 1.1660979339379524e-06, + "loss": 0.13, + "step": 7545 + }, + { + "epoch": 0.6952595936794582, + "grad_norm": 0.9458638903120887, + "learning_rate": 1.1654537090831069e-06, + "loss": 0.1245, + "step": 7546 + }, + { + "epoch": 0.6953517298567282, + "grad_norm": 0.9451228064476396, + "learning_rate": 1.164809608141029e-06, + "loss": 0.1291, + "step": 7547 + }, + { + "epoch": 0.6954438660339982, + "grad_norm": 0.9613647716535906, + "learning_rate": 1.1641656311715218e-06, + "loss": 0.1357, + "step": 7548 + }, + { + "epoch": 0.6955360022112682, + "grad_norm": 0.890685272110586, + "learning_rate": 1.1635217782343801e-06, + "loss": 0.1119, + "step": 7549 + }, + { + "epoch": 0.6956281383885382, + "grad_norm": 0.9401787383359983, + "learning_rate": 1.1628780493893849e-06, + "loss": 0.1146, + "step": 7550 + }, + { + "epoch": 0.6957202745658083, + "grad_norm": 0.9264539393150472, + "learning_rate": 1.162234444696306e-06, + "loss": 0.1108, + "step": 7551 + }, + { + "epoch": 0.6958124107430783, + "grad_norm": 0.966080326479011, + "learning_rate": 1.1615909642149042e-06, + "loss": 0.1218, + "step": 7552 + }, + { + "epoch": 0.6959045469203483, + "grad_norm": 0.9639128496683719, + "learning_rate": 1.1609476080049254e-06, + "loss": 0.1365, + "step": 7553 + }, + { + "epoch": 0.6959966830976183, + "grad_norm": 0.9204230071876955, + "learning_rate": 1.1603043761261043e-06, + "loss": 0.1189, + "step": 7554 + }, + { + "epoch": 0.6960888192748883, + "grad_norm": 0.9068194418459751, + "learning_rate": 1.159661268638166e-06, + "loss": 0.119, + "step": 7555 + }, + { + "epoch": 0.6961809554521583, + "grad_norm": 0.9406348953180761, + "learning_rate": 1.1590182856008233e-06, + "loss": 0.1251, + "step": 7556 + }, + { + "epoch": 0.6962730916294283, + "grad_norm": 1.0019374287174063, + "learning_rate": 1.158375427073778e-06, + "loss": 0.1549, + "step": 7557 + }, + { + "epoch": 0.6963652278066983, + "grad_norm": 0.9614692218032151, + "learning_rate": 1.1577326931167184e-06, + "loss": 0.1194, + "step": 7558 + }, + { + "epoch": 0.6964573639839683, + "grad_norm": 0.9627814151373388, + "learning_rate": 1.1570900837893223e-06, + "loss": 0.1276, + "step": 7559 + }, + { + "epoch": 0.6965495001612383, + "grad_norm": 0.9401742982964019, + "learning_rate": 1.1564475991512562e-06, + "loss": 0.1195, + "step": 7560 + }, + { + "epoch": 0.6966416363385083, + "grad_norm": 0.9606928144175957, + "learning_rate": 1.1558052392621758e-06, + "loss": 0.1386, + "step": 7561 + }, + { + "epoch": 0.6967337725157783, + "grad_norm": 0.8242620094607082, + "learning_rate": 1.155163004181723e-06, + "loss": 0.1104, + "step": 7562 + }, + { + "epoch": 0.6968259086930483, + "grad_norm": 0.9661538812059097, + "learning_rate": 1.1545208939695306e-06, + "loss": 0.1233, + "step": 7563 + }, + { + "epoch": 0.6969180448703183, + "grad_norm": 0.9185722917327637, + "learning_rate": 1.1538789086852173e-06, + "loss": 0.1172, + "step": 7564 + }, + { + "epoch": 0.6970101810475884, + "grad_norm": 0.9559839661690173, + "learning_rate": 1.1532370483883931e-06, + "loss": 0.1308, + "step": 7565 + }, + { + "epoch": 0.6971023172248584, + "grad_norm": 0.9926826971490361, + "learning_rate": 1.152595313138653e-06, + "loss": 0.1378, + "step": 7566 + }, + { + "epoch": 0.6971944534021284, + "grad_norm": 0.8957768567257417, + "learning_rate": 1.151953702995583e-06, + "loss": 0.1221, + "step": 7567 + }, + { + "epoch": 0.6972865895793984, + "grad_norm": 0.919822250860159, + "learning_rate": 1.1513122180187577e-06, + "loss": 0.127, + "step": 7568 + }, + { + "epoch": 0.6973787257566684, + "grad_norm": 0.9537334807702881, + "learning_rate": 1.150670858267738e-06, + "loss": 0.1254, + "step": 7569 + }, + { + "epoch": 0.6974708619339384, + "grad_norm": 0.9311165498976608, + "learning_rate": 1.150029623802074e-06, + "loss": 0.1345, + "step": 7570 + }, + { + "epoch": 0.6975629981112084, + "grad_norm": 0.8904827529578065, + "learning_rate": 1.1493885146813042e-06, + "loss": 0.12, + "step": 7571 + }, + { + "epoch": 0.6976551342884784, + "grad_norm": 0.9123497944185116, + "learning_rate": 1.148747530964956e-06, + "loss": 0.1288, + "step": 7572 + }, + { + "epoch": 0.6977472704657484, + "grad_norm": 0.9531719885240827, + "learning_rate": 1.1481066727125463e-06, + "loss": 0.1291, + "step": 7573 + }, + { + "epoch": 0.6978394066430184, + "grad_norm": 0.9819613269245462, + "learning_rate": 1.1474659399835772e-06, + "loss": 0.1309, + "step": 7574 + }, + { + "epoch": 0.6979315428202884, + "grad_norm": 0.9657481867646279, + "learning_rate": 1.1468253328375404e-06, + "loss": 0.1156, + "step": 7575 + }, + { + "epoch": 0.6980236789975584, + "grad_norm": 0.9068680219647643, + "learning_rate": 1.1461848513339168e-06, + "loss": 0.1199, + "step": 7576 + }, + { + "epoch": 0.6981158151748283, + "grad_norm": 0.9583994048161718, + "learning_rate": 1.145544495532176e-06, + "loss": 0.1276, + "step": 7577 + }, + { + "epoch": 0.6982079513520985, + "grad_norm": 0.9183051486498401, + "learning_rate": 1.144904265491775e-06, + "loss": 0.1166, + "step": 7578 + }, + { + "epoch": 0.6983000875293685, + "grad_norm": 0.9558218358159374, + "learning_rate": 1.1442641612721588e-06, + "loss": 0.1264, + "step": 7579 + }, + { + "epoch": 0.6983922237066384, + "grad_norm": 0.9610830469191616, + "learning_rate": 1.1436241829327605e-06, + "loss": 0.1265, + "step": 7580 + }, + { + "epoch": 0.6984843598839084, + "grad_norm": 0.925704991359198, + "learning_rate": 1.1429843305330027e-06, + "loss": 0.1258, + "step": 7581 + }, + { + "epoch": 0.6985764960611784, + "grad_norm": 0.9619080391790512, + "learning_rate": 1.1423446041322967e-06, + "loss": 0.1166, + "step": 7582 + }, + { + "epoch": 0.6986686322384484, + "grad_norm": 0.9200858544359843, + "learning_rate": 1.1417050037900393e-06, + "loss": 0.1187, + "step": 7583 + }, + { + "epoch": 0.6987607684157184, + "grad_norm": 0.9165477155141011, + "learning_rate": 1.1410655295656196e-06, + "loss": 0.1289, + "step": 7584 + }, + { + "epoch": 0.6988529045929884, + "grad_norm": 0.9558792693314704, + "learning_rate": 1.1404261815184105e-06, + "loss": 0.1297, + "step": 7585 + }, + { + "epoch": 0.6989450407702584, + "grad_norm": 0.9447699533790705, + "learning_rate": 1.1397869597077783e-06, + "loss": 0.1296, + "step": 7586 + }, + { + "epoch": 0.6990371769475284, + "grad_norm": 0.8962723565665709, + "learning_rate": 1.1391478641930716e-06, + "loss": 0.1199, + "step": 7587 + }, + { + "epoch": 0.6991293131247984, + "grad_norm": 0.9362449149914991, + "learning_rate": 1.1385088950336329e-06, + "loss": 0.1236, + "step": 7588 + }, + { + "epoch": 0.6992214493020684, + "grad_norm": 0.9195942463818534, + "learning_rate": 1.1378700522887903e-06, + "loss": 0.1101, + "step": 7589 + }, + { + "epoch": 0.6993135854793384, + "grad_norm": 0.9756437558705977, + "learning_rate": 1.13723133601786e-06, + "loss": 0.1366, + "step": 7590 + }, + { + "epoch": 0.6994057216566084, + "grad_norm": 0.8944513848537959, + "learning_rate": 1.136592746280146e-06, + "loss": 0.1117, + "step": 7591 + }, + { + "epoch": 0.6994978578338785, + "grad_norm": 0.9330843733670557, + "learning_rate": 1.1359542831349422e-06, + "loss": 0.1278, + "step": 7592 + }, + { + "epoch": 0.6995899940111485, + "grad_norm": 0.8788149107087988, + "learning_rate": 1.1353159466415298e-06, + "loss": 0.1139, + "step": 7593 + }, + { + "epoch": 0.6996821301884185, + "grad_norm": 0.9187660811222983, + "learning_rate": 1.1346777368591797e-06, + "loss": 0.1227, + "step": 7594 + }, + { + "epoch": 0.6997742663656885, + "grad_norm": 0.9293501590683788, + "learning_rate": 1.1340396538471488e-06, + "loss": 0.127, + "step": 7595 + }, + { + "epoch": 0.6998664025429585, + "grad_norm": 0.9122089651037086, + "learning_rate": 1.133401697664682e-06, + "loss": 0.1247, + "step": 7596 + }, + { + "epoch": 0.6999585387202285, + "grad_norm": 0.9514888912155925, + "learning_rate": 1.1327638683710146e-06, + "loss": 0.1195, + "step": 7597 + }, + { + "epoch": 0.7000506748974985, + "grad_norm": 0.9093223008533924, + "learning_rate": 1.13212616602537e-06, + "loss": 0.1128, + "step": 7598 + }, + { + "epoch": 0.7001428110747685, + "grad_norm": 0.8769014325228602, + "learning_rate": 1.1314885906869575e-06, + "loss": 0.1092, + "step": 7599 + }, + { + "epoch": 0.7002349472520385, + "grad_norm": 0.8726147241906409, + "learning_rate": 1.1308511424149774e-06, + "loss": 0.1093, + "step": 7600 + }, + { + "epoch": 0.7003270834293085, + "grad_norm": 0.8949518559323022, + "learning_rate": 1.1302138212686152e-06, + "loss": 0.1192, + "step": 7601 + }, + { + "epoch": 0.7004192196065785, + "grad_norm": 0.9241564642082613, + "learning_rate": 1.1295766273070469e-06, + "loss": 0.1218, + "step": 7602 + }, + { + "epoch": 0.7005113557838485, + "grad_norm": 0.9788950224549492, + "learning_rate": 1.1289395605894374e-06, + "loss": 0.1283, + "step": 7603 + }, + { + "epoch": 0.7006034919611185, + "grad_norm": 1.0243458990881766, + "learning_rate": 1.1283026211749362e-06, + "loss": 0.1368, + "step": 7604 + }, + { + "epoch": 0.7006956281383886, + "grad_norm": 0.9374500966330062, + "learning_rate": 1.127665809122685e-06, + "loss": 0.1191, + "step": 7605 + }, + { + "epoch": 0.7007877643156586, + "grad_norm": 0.9205880293432904, + "learning_rate": 1.1270291244918106e-06, + "loss": 0.1316, + "step": 7606 + }, + { + "epoch": 0.7008799004929286, + "grad_norm": 0.9307359895923712, + "learning_rate": 1.1263925673414303e-06, + "loss": 0.1207, + "step": 7607 + }, + { + "epoch": 0.7009720366701986, + "grad_norm": 0.9457635843783687, + "learning_rate": 1.1257561377306471e-06, + "loss": 0.1286, + "step": 7608 + }, + { + "epoch": 0.7010641728474686, + "grad_norm": 0.9370338619779593, + "learning_rate": 1.1251198357185547e-06, + "loss": 0.1295, + "step": 7609 + }, + { + "epoch": 0.7011563090247386, + "grad_norm": 0.9616310297957011, + "learning_rate": 1.1244836613642342e-06, + "loss": 0.1313, + "step": 7610 + }, + { + "epoch": 0.7012484452020086, + "grad_norm": 0.8917892618492652, + "learning_rate": 1.1238476147267537e-06, + "loss": 0.1225, + "step": 7611 + }, + { + "epoch": 0.7013405813792786, + "grad_norm": 0.9314012246164496, + "learning_rate": 1.1232116958651695e-06, + "loss": 0.1284, + "step": 7612 + }, + { + "epoch": 0.7014327175565486, + "grad_norm": 0.9447613390064742, + "learning_rate": 1.1225759048385276e-06, + "loss": 0.124, + "step": 7613 + }, + { + "epoch": 0.7015248537338186, + "grad_norm": 0.9139852735566912, + "learning_rate": 1.1219402417058611e-06, + "loss": 0.1252, + "step": 7614 + }, + { + "epoch": 0.7016169899110886, + "grad_norm": 0.8944899755971756, + "learning_rate": 1.1213047065261922e-06, + "loss": 0.1209, + "step": 7615 + }, + { + "epoch": 0.7017091260883586, + "grad_norm": 0.9037644191735968, + "learning_rate": 1.12066929935853e-06, + "loss": 0.1197, + "step": 7616 + }, + { + "epoch": 0.7018012622656286, + "grad_norm": 0.9280483040196563, + "learning_rate": 1.1200340202618706e-06, + "loss": 0.1194, + "step": 7617 + }, + { + "epoch": 0.7018933984428986, + "grad_norm": 0.9171715200445577, + "learning_rate": 1.1193988692952012e-06, + "loss": 0.1111, + "step": 7618 + }, + { + "epoch": 0.7019855346201687, + "grad_norm": 0.9146803179316838, + "learning_rate": 1.118763846517496e-06, + "loss": 0.1183, + "step": 7619 + }, + { + "epoch": 0.7020776707974387, + "grad_norm": 0.923647975551502, + "learning_rate": 1.1181289519877156e-06, + "loss": 0.1249, + "step": 7620 + }, + { + "epoch": 0.7021698069747087, + "grad_norm": 0.9747801502638038, + "learning_rate": 1.1174941857648105e-06, + "loss": 0.126, + "step": 7621 + }, + { + "epoch": 0.7022619431519787, + "grad_norm": 0.9391905801381869, + "learning_rate": 1.11685954790772e-06, + "loss": 0.1226, + "step": 7622 + }, + { + "epoch": 0.7023540793292486, + "grad_norm": 0.9048641404195729, + "learning_rate": 1.1162250384753697e-06, + "loss": 0.1193, + "step": 7623 + }, + { + "epoch": 0.7024462155065186, + "grad_norm": 0.960076295126353, + "learning_rate": 1.1155906575266722e-06, + "loss": 0.129, + "step": 7624 + }, + { + "epoch": 0.7025383516837886, + "grad_norm": 0.9925166288127805, + "learning_rate": 1.1149564051205314e-06, + "loss": 0.1347, + "step": 7625 + }, + { + "epoch": 0.7026304878610586, + "grad_norm": 0.9426529785588942, + "learning_rate": 1.114322281315837e-06, + "loss": 0.1283, + "step": 7626 + }, + { + "epoch": 0.7027226240383286, + "grad_norm": 0.9194194860925674, + "learning_rate": 1.1136882861714692e-06, + "loss": 0.1162, + "step": 7627 + }, + { + "epoch": 0.7028147602155986, + "grad_norm": 0.937081948310993, + "learning_rate": 1.1130544197462933e-06, + "loss": 0.1226, + "step": 7628 + }, + { + "epoch": 0.7029068963928686, + "grad_norm": 0.9750960465487545, + "learning_rate": 1.1124206820991628e-06, + "loss": 0.1211, + "step": 7629 + }, + { + "epoch": 0.7029990325701386, + "grad_norm": 0.9309418428024832, + "learning_rate": 1.1117870732889214e-06, + "loss": 0.1249, + "step": 7630 + }, + { + "epoch": 0.7030911687474086, + "grad_norm": 0.9511119877832231, + "learning_rate": 1.111153593374399e-06, + "loss": 0.129, + "step": 7631 + }, + { + "epoch": 0.7031833049246786, + "grad_norm": 0.9528627594483212, + "learning_rate": 1.1105202424144165e-06, + "loss": 0.1297, + "step": 7632 + }, + { + "epoch": 0.7032754411019487, + "grad_norm": 0.9814987830163073, + "learning_rate": 1.109887020467779e-06, + "loss": 0.1245, + "step": 7633 + }, + { + "epoch": 0.7033675772792187, + "grad_norm": 0.9242159264865467, + "learning_rate": 1.10925392759328e-06, + "loss": 0.1288, + "step": 7634 + }, + { + "epoch": 0.7034597134564887, + "grad_norm": 0.9120110671753703, + "learning_rate": 1.1086209638497038e-06, + "loss": 0.1262, + "step": 7635 + }, + { + "epoch": 0.7035518496337587, + "grad_norm": 0.9969698964166246, + "learning_rate": 1.1079881292958217e-06, + "loss": 0.1417, + "step": 7636 + }, + { + "epoch": 0.7036439858110287, + "grad_norm": 0.9703225929392982, + "learning_rate": 1.1073554239903905e-06, + "loss": 0.1427, + "step": 7637 + }, + { + "epoch": 0.7037361219882987, + "grad_norm": 0.880006758354232, + "learning_rate": 1.106722847992159e-06, + "loss": 0.1191, + "step": 7638 + }, + { + "epoch": 0.7038282581655687, + "grad_norm": 0.9694607975674124, + "learning_rate": 1.1060904013598604e-06, + "loss": 0.1183, + "step": 7639 + }, + { + "epoch": 0.7039203943428387, + "grad_norm": 0.9711437397169369, + "learning_rate": 1.1054580841522188e-06, + "loss": 0.1263, + "step": 7640 + }, + { + "epoch": 0.7040125305201087, + "grad_norm": 0.8726884936753815, + "learning_rate": 1.1048258964279432e-06, + "loss": 0.1206, + "step": 7641 + }, + { + "epoch": 0.7041046666973787, + "grad_norm": 0.9384163356051172, + "learning_rate": 1.1041938382457332e-06, + "loss": 0.1274, + "step": 7642 + }, + { + "epoch": 0.7041968028746487, + "grad_norm": 0.8797955140060947, + "learning_rate": 1.1035619096642766e-06, + "loss": 0.125, + "step": 7643 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 0.9592333120315895, + "learning_rate": 1.102930110742247e-06, + "loss": 0.1305, + "step": 7644 + }, + { + "epoch": 0.7043810752291887, + "grad_norm": 0.9883696119753728, + "learning_rate": 1.102298441538306e-06, + "loss": 0.1262, + "step": 7645 + }, + { + "epoch": 0.7044732114064588, + "grad_norm": 0.9316484289506399, + "learning_rate": 1.101666902111105e-06, + "loss": 0.1188, + "step": 7646 + }, + { + "epoch": 0.7045653475837288, + "grad_norm": 0.9159885584557372, + "learning_rate": 1.1010354925192826e-06, + "loss": 0.1153, + "step": 7647 + }, + { + "epoch": 0.7046574837609988, + "grad_norm": 0.971625755678242, + "learning_rate": 1.1004042128214664e-06, + "loss": 0.1325, + "step": 7648 + }, + { + "epoch": 0.7047496199382688, + "grad_norm": 0.9208399755488353, + "learning_rate": 1.0997730630762697e-06, + "loss": 0.1183, + "step": 7649 + }, + { + "epoch": 0.7048417561155388, + "grad_norm": 0.9003396206133496, + "learning_rate": 1.0991420433422936e-06, + "loss": 0.1078, + "step": 7650 + }, + { + "epoch": 0.7049338922928088, + "grad_norm": 0.9159612364008332, + "learning_rate": 1.0985111536781298e-06, + "loss": 0.1178, + "step": 7651 + }, + { + "epoch": 0.7050260284700788, + "grad_norm": 0.9660724779459582, + "learning_rate": 1.0978803941423572e-06, + "loss": 0.129, + "step": 7652 + }, + { + "epoch": 0.7051181646473488, + "grad_norm": 0.9196254695327626, + "learning_rate": 1.0972497647935396e-06, + "loss": 0.122, + "step": 7653 + }, + { + "epoch": 0.7052103008246188, + "grad_norm": 0.9473601084641282, + "learning_rate": 1.0966192656902335e-06, + "loss": 0.1081, + "step": 7654 + }, + { + "epoch": 0.7053024370018888, + "grad_norm": 1.0074305480369767, + "learning_rate": 1.0959888968909784e-06, + "loss": 0.1314, + "step": 7655 + }, + { + "epoch": 0.7053945731791588, + "grad_norm": 0.9923751289787686, + "learning_rate": 1.0953586584543066e-06, + "loss": 0.1221, + "step": 7656 + }, + { + "epoch": 0.7054867093564288, + "grad_norm": 0.8843998573581785, + "learning_rate": 1.0947285504387337e-06, + "loss": 0.1217, + "step": 7657 + }, + { + "epoch": 0.7055788455336988, + "grad_norm": 0.8849026382738153, + "learning_rate": 1.094098572902766e-06, + "loss": 0.114, + "step": 7658 + }, + { + "epoch": 0.7056709817109688, + "grad_norm": 0.9500578248983311, + "learning_rate": 1.0934687259048975e-06, + "loss": 0.1178, + "step": 7659 + }, + { + "epoch": 0.7057631178882389, + "grad_norm": 0.9258739494949468, + "learning_rate": 1.092839009503609e-06, + "loss": 0.1229, + "step": 7660 + }, + { + "epoch": 0.7058552540655089, + "grad_norm": 0.9621025980170279, + "learning_rate": 1.0922094237573706e-06, + "loss": 0.1261, + "step": 7661 + }, + { + "epoch": 0.7059473902427789, + "grad_norm": 0.9641510931839957, + "learning_rate": 1.0915799687246376e-06, + "loss": 0.1221, + "step": 7662 + }, + { + "epoch": 0.7060395264200489, + "grad_norm": 0.9571666499830939, + "learning_rate": 1.0909506444638563e-06, + "loss": 0.1247, + "step": 7663 + }, + { + "epoch": 0.7061316625973189, + "grad_norm": 0.9417941506505101, + "learning_rate": 1.09032145103346e-06, + "loss": 0.1145, + "step": 7664 + }, + { + "epoch": 0.7062237987745889, + "grad_norm": 0.8922995138858749, + "learning_rate": 1.0896923884918687e-06, + "loss": 0.1172, + "step": 7665 + }, + { + "epoch": 0.7063159349518588, + "grad_norm": 0.9455943222219495, + "learning_rate": 1.0890634568974901e-06, + "loss": 0.1307, + "step": 7666 + }, + { + "epoch": 0.7064080711291288, + "grad_norm": 0.8963449786751144, + "learning_rate": 1.0884346563087214e-06, + "loss": 0.1148, + "step": 7667 + }, + { + "epoch": 0.7065002073063988, + "grad_norm": 0.9238178351632784, + "learning_rate": 1.0878059867839469e-06, + "loss": 0.1219, + "step": 7668 + }, + { + "epoch": 0.7065923434836688, + "grad_norm": 0.9717273503839575, + "learning_rate": 1.0871774483815393e-06, + "loss": 0.124, + "step": 7669 + }, + { + "epoch": 0.7066844796609388, + "grad_norm": 0.9020615180773476, + "learning_rate": 1.0865490411598576e-06, + "loss": 0.1226, + "step": 7670 + }, + { + "epoch": 0.7067766158382088, + "grad_norm": 0.9052631909123191, + "learning_rate": 1.0859207651772485e-06, + "loss": 0.1221, + "step": 7671 + }, + { + "epoch": 0.7068687520154788, + "grad_norm": 0.9363740585233611, + "learning_rate": 1.0852926204920488e-06, + "loss": 0.1254, + "step": 7672 + }, + { + "epoch": 0.7069608881927489, + "grad_norm": 0.942073473131626, + "learning_rate": 1.084664607162582e-06, + "loss": 0.1197, + "step": 7673 + }, + { + "epoch": 0.7070530243700189, + "grad_norm": 0.9533912073107669, + "learning_rate": 1.0840367252471583e-06, + "loss": 0.123, + "step": 7674 + }, + { + "epoch": 0.7071451605472889, + "grad_norm": 1.0371830317376796, + "learning_rate": 1.083408974804078e-06, + "loss": 0.1408, + "step": 7675 + }, + { + "epoch": 0.7072372967245589, + "grad_norm": 0.9234521408562314, + "learning_rate": 1.082781355891626e-06, + "loss": 0.1181, + "step": 7676 + }, + { + "epoch": 0.7073294329018289, + "grad_norm": 0.904399098255252, + "learning_rate": 1.0821538685680783e-06, + "loss": 0.1183, + "step": 7677 + }, + { + "epoch": 0.7074215690790989, + "grad_norm": 0.9279812511551865, + "learning_rate": 1.0815265128916955e-06, + "loss": 0.127, + "step": 7678 + }, + { + "epoch": 0.7075137052563689, + "grad_norm": 0.8684164999023688, + "learning_rate": 1.0808992889207287e-06, + "loss": 0.1134, + "step": 7679 + }, + { + "epoch": 0.7076058414336389, + "grad_norm": 0.9084082832703506, + "learning_rate": 1.0802721967134167e-06, + "loss": 0.1234, + "step": 7680 + }, + { + "epoch": 0.7076979776109089, + "grad_norm": 0.9542334556040545, + "learning_rate": 1.0796452363279838e-06, + "loss": 0.125, + "step": 7681 + }, + { + "epoch": 0.7077901137881789, + "grad_norm": 0.9552003098411618, + "learning_rate": 1.079018407822643e-06, + "loss": 0.1178, + "step": 7682 + }, + { + "epoch": 0.7078822499654489, + "grad_norm": 0.9786307431398847, + "learning_rate": 1.0783917112555956e-06, + "loss": 0.1213, + "step": 7683 + }, + { + "epoch": 0.7079743861427189, + "grad_norm": 0.9443212053638332, + "learning_rate": 1.0777651466850308e-06, + "loss": 0.123, + "step": 7684 + }, + { + "epoch": 0.7080665223199889, + "grad_norm": 0.906375176394638, + "learning_rate": 1.0771387141691265e-06, + "loss": 0.1239, + "step": 7685 + }, + { + "epoch": 0.7081586584972589, + "grad_norm": 0.9974511076859406, + "learning_rate": 1.0765124137660454e-06, + "loss": 0.1199, + "step": 7686 + }, + { + "epoch": 0.708250794674529, + "grad_norm": 0.9252319719364072, + "learning_rate": 1.075886245533939e-06, + "loss": 0.1322, + "step": 7687 + }, + { + "epoch": 0.708342930851799, + "grad_norm": 0.9953283487894312, + "learning_rate": 1.075260209530948e-06, + "loss": 0.1346, + "step": 7688 + }, + { + "epoch": 0.708435067029069, + "grad_norm": 0.9675526374551723, + "learning_rate": 1.0746343058151998e-06, + "loss": 0.1359, + "step": 7689 + }, + { + "epoch": 0.708527203206339, + "grad_norm": 0.9600303445058818, + "learning_rate": 1.074008534444811e-06, + "loss": 0.1341, + "step": 7690 + }, + { + "epoch": 0.708619339383609, + "grad_norm": 0.8930072632299815, + "learning_rate": 1.0733828954778827e-06, + "loss": 0.1127, + "step": 7691 + }, + { + "epoch": 0.708711475560879, + "grad_norm": 0.9679509417395482, + "learning_rate": 1.0727573889725053e-06, + "loss": 0.125, + "step": 7692 + }, + { + "epoch": 0.708803611738149, + "grad_norm": 0.8541878797863817, + "learning_rate": 1.0721320149867582e-06, + "loss": 0.1176, + "step": 7693 + }, + { + "epoch": 0.708895747915419, + "grad_norm": 0.9332963996090269, + "learning_rate": 1.0715067735787079e-06, + "loss": 0.1304, + "step": 7694 + }, + { + "epoch": 0.708987884092689, + "grad_norm": 0.9552468087190065, + "learning_rate": 1.0708816648064067e-06, + "loss": 0.1362, + "step": 7695 + }, + { + "epoch": 0.709080020269959, + "grad_norm": 0.9699586270892624, + "learning_rate": 1.0702566887278975e-06, + "loss": 0.1363, + "step": 7696 + }, + { + "epoch": 0.709172156447229, + "grad_norm": 0.9025744016802671, + "learning_rate": 1.0696318454012074e-06, + "loss": 0.1194, + "step": 7697 + }, + { + "epoch": 0.709264292624499, + "grad_norm": 0.9032683279099893, + "learning_rate": 1.0690071348843559e-06, + "loss": 0.1263, + "step": 7698 + }, + { + "epoch": 0.709356428801769, + "grad_norm": 0.9876043723097531, + "learning_rate": 1.0683825572353447e-06, + "loss": 0.1268, + "step": 7699 + }, + { + "epoch": 0.709448564979039, + "grad_norm": 0.9741498985940272, + "learning_rate": 1.0677581125121672e-06, + "loss": 0.1292, + "step": 7700 + }, + { + "epoch": 0.7095407011563091, + "grad_norm": 0.9413448494069492, + "learning_rate": 1.067133800772803e-06, + "loss": 0.1328, + "step": 7701 + }, + { + "epoch": 0.7096328373335791, + "grad_norm": 0.9214270401858093, + "learning_rate": 1.0665096220752214e-06, + "loss": 0.1256, + "step": 7702 + }, + { + "epoch": 0.7097249735108491, + "grad_norm": 0.9671108647251513, + "learning_rate": 1.065885576477374e-06, + "loss": 0.125, + "step": 7703 + }, + { + "epoch": 0.7098171096881191, + "grad_norm": 0.8983338985867325, + "learning_rate": 1.0652616640372051e-06, + "loss": 0.1227, + "step": 7704 + }, + { + "epoch": 0.7099092458653891, + "grad_norm": 0.9167380926561612, + "learning_rate": 1.064637884812645e-06, + "loss": 0.1225, + "step": 7705 + }, + { + "epoch": 0.7100013820426591, + "grad_norm": 0.9416855433977324, + "learning_rate": 1.0640142388616128e-06, + "loss": 0.1216, + "step": 7706 + }, + { + "epoch": 0.710093518219929, + "grad_norm": 0.9558540627602284, + "learning_rate": 1.063390726242012e-06, + "loss": 0.1171, + "step": 7707 + }, + { + "epoch": 0.710185654397199, + "grad_norm": 0.9049250133502065, + "learning_rate": 1.062767347011738e-06, + "loss": 0.1179, + "step": 7708 + }, + { + "epoch": 0.710277790574469, + "grad_norm": 0.9117542944386543, + "learning_rate": 1.0621441012286696e-06, + "loss": 0.1249, + "step": 7709 + }, + { + "epoch": 0.710369926751739, + "grad_norm": 0.9522183662298654, + "learning_rate": 1.061520988950677e-06, + "loss": 0.1297, + "step": 7710 + }, + { + "epoch": 0.710462062929009, + "grad_norm": 0.9050768681888326, + "learning_rate": 1.0608980102356146e-06, + "loss": 0.1155, + "step": 7711 + }, + { + "epoch": 0.710554199106279, + "grad_norm": 0.9802646477753757, + "learning_rate": 1.0602751651413264e-06, + "loss": 0.1335, + "step": 7712 + }, + { + "epoch": 0.710646335283549, + "grad_norm": 0.9303028811692908, + "learning_rate": 1.0596524537256453e-06, + "loss": 0.1095, + "step": 7713 + }, + { + "epoch": 0.7107384714608191, + "grad_norm": 0.9190098625800486, + "learning_rate": 1.0590298760463879e-06, + "loss": 0.1199, + "step": 7714 + }, + { + "epoch": 0.7108306076380891, + "grad_norm": 0.94415621243913, + "learning_rate": 1.0584074321613625e-06, + "loss": 0.1242, + "step": 7715 + }, + { + "epoch": 0.7109227438153591, + "grad_norm": 0.9319898998438171, + "learning_rate": 1.0577851221283614e-06, + "loss": 0.1165, + "step": 7716 + }, + { + "epoch": 0.7110148799926291, + "grad_norm": 0.958963020372258, + "learning_rate": 1.0571629460051665e-06, + "loss": 0.1329, + "step": 7717 + }, + { + "epoch": 0.7111070161698991, + "grad_norm": 0.992606267592599, + "learning_rate": 1.0565409038495486e-06, + "loss": 0.1371, + "step": 7718 + }, + { + "epoch": 0.7111991523471691, + "grad_norm": 0.9600979422255448, + "learning_rate": 1.055918995719263e-06, + "loss": 0.1222, + "step": 7719 + }, + { + "epoch": 0.7112912885244391, + "grad_norm": 0.922753779521535, + "learning_rate": 1.0552972216720534e-06, + "loss": 0.121, + "step": 7720 + }, + { + "epoch": 0.7113834247017091, + "grad_norm": 0.9470964491509773, + "learning_rate": 1.054675581765652e-06, + "loss": 0.1318, + "step": 7721 + }, + { + "epoch": 0.7114755608789791, + "grad_norm": 0.9243871817173555, + "learning_rate": 1.0540540760577785e-06, + "loss": 0.1285, + "step": 7722 + }, + { + "epoch": 0.7115676970562491, + "grad_norm": 0.9789267036795136, + "learning_rate": 1.0534327046061404e-06, + "loss": 0.1306, + "step": 7723 + }, + { + "epoch": 0.7116598332335191, + "grad_norm": 0.9684236064786368, + "learning_rate": 1.0528114674684318e-06, + "loss": 0.1198, + "step": 7724 + }, + { + "epoch": 0.7117519694107891, + "grad_norm": 0.9082055129941168, + "learning_rate": 1.0521903647023327e-06, + "loss": 0.126, + "step": 7725 + }, + { + "epoch": 0.7118441055880591, + "grad_norm": 0.8603222142268356, + "learning_rate": 1.0515693963655144e-06, + "loss": 0.1087, + "step": 7726 + }, + { + "epoch": 0.7119362417653291, + "grad_norm": 0.907707522654929, + "learning_rate": 1.0509485625156342e-06, + "loss": 0.1198, + "step": 7727 + }, + { + "epoch": 0.7120283779425992, + "grad_norm": 0.9058331237453139, + "learning_rate": 1.0503278632103353e-06, + "loss": 0.1204, + "step": 7728 + }, + { + "epoch": 0.7121205141198692, + "grad_norm": 0.9647120715897719, + "learning_rate": 1.0497072985072509e-06, + "loss": 0.138, + "step": 7729 + }, + { + "epoch": 0.7122126502971392, + "grad_norm": 0.9429718054470461, + "learning_rate": 1.0490868684639994e-06, + "loss": 0.1377, + "step": 7730 + }, + { + "epoch": 0.7123047864744092, + "grad_norm": 1.019549767972903, + "learning_rate": 1.0484665731381892e-06, + "loss": 0.1275, + "step": 7731 + }, + { + "epoch": 0.7123969226516792, + "grad_norm": 0.8914456137526924, + "learning_rate": 1.0478464125874126e-06, + "loss": 0.1211, + "step": 7732 + }, + { + "epoch": 0.7124890588289492, + "grad_norm": 0.8732468987409082, + "learning_rate": 1.047226386869253e-06, + "loss": 0.1144, + "step": 7733 + }, + { + "epoch": 0.7125811950062192, + "grad_norm": 0.9361439907322878, + "learning_rate": 1.046606496041281e-06, + "loss": 0.1236, + "step": 7734 + }, + { + "epoch": 0.7126733311834892, + "grad_norm": 0.8808363617293428, + "learning_rate": 1.0459867401610519e-06, + "loss": 0.1174, + "step": 7735 + }, + { + "epoch": 0.7127654673607592, + "grad_norm": 0.9359884244207574, + "learning_rate": 1.0453671192861095e-06, + "loss": 0.1251, + "step": 7736 + }, + { + "epoch": 0.7128576035380292, + "grad_norm": 0.9779404856607452, + "learning_rate": 1.0447476334739867e-06, + "loss": 0.1321, + "step": 7737 + }, + { + "epoch": 0.7129497397152992, + "grad_norm": 0.9287380972756585, + "learning_rate": 1.0441282827822027e-06, + "loss": 0.1257, + "step": 7738 + }, + { + "epoch": 0.7130418758925692, + "grad_norm": 0.9621747448313562, + "learning_rate": 1.0435090672682655e-06, + "loss": 0.1311, + "step": 7739 + }, + { + "epoch": 0.7131340120698392, + "grad_norm": 0.9258715491762729, + "learning_rate": 1.042889986989668e-06, + "loss": 0.1309, + "step": 7740 + }, + { + "epoch": 0.7132261482471093, + "grad_norm": 0.9039074847686032, + "learning_rate": 1.0422710420038912e-06, + "loss": 0.1259, + "step": 7741 + }, + { + "epoch": 0.7133182844243793, + "grad_norm": 0.8822465315547272, + "learning_rate": 1.0416522323684048e-06, + "loss": 0.1116, + "step": 7742 + }, + { + "epoch": 0.7134104206016493, + "grad_norm": 0.952450362988351, + "learning_rate": 1.0410335581406657e-06, + "loss": 0.1275, + "step": 7743 + }, + { + "epoch": 0.7135025567789193, + "grad_norm": 0.9446338466948969, + "learning_rate": 1.0404150193781187e-06, + "loss": 0.1285, + "step": 7744 + }, + { + "epoch": 0.7135946929561893, + "grad_norm": 0.886851570227021, + "learning_rate": 1.0397966161381943e-06, + "loss": 0.1163, + "step": 7745 + }, + { + "epoch": 0.7136868291334593, + "grad_norm": 0.9493597158807928, + "learning_rate": 1.03917834847831e-06, + "loss": 0.1261, + "step": 7746 + }, + { + "epoch": 0.7137789653107293, + "grad_norm": 0.872603891886761, + "learning_rate": 1.0385602164558735e-06, + "loss": 0.109, + "step": 7747 + }, + { + "epoch": 0.7138711014879993, + "grad_norm": 0.9153516353588165, + "learning_rate": 1.037942220128279e-06, + "loss": 0.1214, + "step": 7748 + }, + { + "epoch": 0.7139632376652693, + "grad_norm": 0.918045785934361, + "learning_rate": 1.0373243595529058e-06, + "loss": 0.1218, + "step": 7749 + }, + { + "epoch": 0.7140553738425393, + "grad_norm": 0.9248276605451051, + "learning_rate": 1.0367066347871243e-06, + "loss": 0.118, + "step": 7750 + }, + { + "epoch": 0.7141475100198093, + "grad_norm": 0.9388758721031797, + "learning_rate": 1.0360890458882882e-06, + "loss": 0.1277, + "step": 7751 + }, + { + "epoch": 0.7142396461970792, + "grad_norm": 0.8690529484848437, + "learning_rate": 1.0354715929137429e-06, + "loss": 0.1118, + "step": 7752 + }, + { + "epoch": 0.7143317823743492, + "grad_norm": 0.8640578234777775, + "learning_rate": 1.0348542759208166e-06, + "loss": 0.1122, + "step": 7753 + }, + { + "epoch": 0.7144239185516192, + "grad_norm": 0.9843435962019589, + "learning_rate": 1.0342370949668287e-06, + "loss": 0.1282, + "step": 7754 + }, + { + "epoch": 0.7145160547288893, + "grad_norm": 0.8969391758762647, + "learning_rate": 1.0336200501090848e-06, + "loss": 0.118, + "step": 7755 + }, + { + "epoch": 0.7146081909061593, + "grad_norm": 0.9532247647644668, + "learning_rate": 1.0330031414048775e-06, + "loss": 0.1266, + "step": 7756 + }, + { + "epoch": 0.7147003270834293, + "grad_norm": 0.9088908017504251, + "learning_rate": 1.0323863689114851e-06, + "loss": 0.1212, + "step": 7757 + }, + { + "epoch": 0.7147924632606993, + "grad_norm": 0.906425226852019, + "learning_rate": 1.0317697326861766e-06, + "loss": 0.1239, + "step": 7758 + }, + { + "epoch": 0.7148845994379693, + "grad_norm": 0.9227457737134785, + "learning_rate": 1.0311532327862064e-06, + "loss": 0.1219, + "step": 7759 + }, + { + "epoch": 0.7149767356152393, + "grad_norm": 0.9987736939867745, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.1372, + "step": 7760 + }, + { + "epoch": 0.7150688717925093, + "grad_norm": 0.9714466510612396, + "learning_rate": 1.0299206421912382e-06, + "loss": 0.1201, + "step": 7761 + }, + { + "epoch": 0.7151610079697793, + "grad_norm": 0.8732980566662378, + "learning_rate": 1.0293045516106848e-06, + "loss": 0.1085, + "step": 7762 + }, + { + "epoch": 0.7152531441470493, + "grad_norm": 0.9074801485331541, + "learning_rate": 1.0286885975843621e-06, + "loss": 0.1145, + "step": 7763 + }, + { + "epoch": 0.7153452803243193, + "grad_norm": 0.9109621722743166, + "learning_rate": 1.0280727801694624e-06, + "loss": 0.1216, + "step": 7764 + }, + { + "epoch": 0.7154374165015893, + "grad_norm": 0.922807759731232, + "learning_rate": 1.0274570994231622e-06, + "loss": 0.1254, + "step": 7765 + }, + { + "epoch": 0.7155295526788593, + "grad_norm": 0.9349066676402771, + "learning_rate": 1.02684155540263e-06, + "loss": 0.1273, + "step": 7766 + }, + { + "epoch": 0.7156216888561293, + "grad_norm": 0.8751564876537657, + "learning_rate": 1.026226148165017e-06, + "loss": 0.1203, + "step": 7767 + }, + { + "epoch": 0.7157138250333993, + "grad_norm": 0.917719645814074, + "learning_rate": 1.0256108777674656e-06, + "loss": 0.1198, + "step": 7768 + }, + { + "epoch": 0.7158059612106694, + "grad_norm": 0.8858931449744197, + "learning_rate": 1.024995744267102e-06, + "loss": 0.1157, + "step": 7769 + }, + { + "epoch": 0.7158980973879394, + "grad_norm": 0.8912225251790697, + "learning_rate": 1.0243807477210423e-06, + "loss": 0.13, + "step": 7770 + }, + { + "epoch": 0.7159902335652094, + "grad_norm": 0.8764687308266974, + "learning_rate": 1.0237658881863898e-06, + "loss": 0.1114, + "step": 7771 + }, + { + "epoch": 0.7160823697424794, + "grad_norm": 0.9088978751333338, + "learning_rate": 1.0231511657202327e-06, + "loss": 0.1203, + "step": 7772 + }, + { + "epoch": 0.7161745059197494, + "grad_norm": 0.9229428912327712, + "learning_rate": 1.0225365803796498e-06, + "loss": 0.1236, + "step": 7773 + }, + { + "epoch": 0.7162666420970194, + "grad_norm": 0.8660219223718931, + "learning_rate": 1.0219221322217032e-06, + "loss": 0.1101, + "step": 7774 + }, + { + "epoch": 0.7163587782742894, + "grad_norm": 0.9744313899778617, + "learning_rate": 1.0213078213034457e-06, + "loss": 0.1358, + "step": 7775 + }, + { + "epoch": 0.7164509144515594, + "grad_norm": 0.9000387318873999, + "learning_rate": 1.0206936476819165e-06, + "loss": 0.1147, + "step": 7776 + }, + { + "epoch": 0.7165430506288294, + "grad_norm": 0.9151921247508992, + "learning_rate": 1.0200796114141428e-06, + "loss": 0.1173, + "step": 7777 + }, + { + "epoch": 0.7166351868060994, + "grad_norm": 0.9873702603847861, + "learning_rate": 1.0194657125571347e-06, + "loss": 0.1267, + "step": 7778 + }, + { + "epoch": 0.7167273229833694, + "grad_norm": 0.8744889522529582, + "learning_rate": 1.0188519511678946e-06, + "loss": 0.1134, + "step": 7779 + }, + { + "epoch": 0.7168194591606394, + "grad_norm": 0.9000947869027155, + "learning_rate": 1.0182383273034102e-06, + "loss": 0.1138, + "step": 7780 + }, + { + "epoch": 0.7169115953379094, + "grad_norm": 0.9534692999739216, + "learning_rate": 1.0176248410206577e-06, + "loss": 0.131, + "step": 7781 + }, + { + "epoch": 0.7170037315151795, + "grad_norm": 0.9343304549611972, + "learning_rate": 1.017011492376597e-06, + "loss": 0.1226, + "step": 7782 + }, + { + "epoch": 0.7170958676924495, + "grad_norm": 0.9079633458736804, + "learning_rate": 1.0163982814281797e-06, + "loss": 0.1081, + "step": 7783 + }, + { + "epoch": 0.7171880038697195, + "grad_norm": 0.945362100863493, + "learning_rate": 1.0157852082323411e-06, + "loss": 0.117, + "step": 7784 + }, + { + "epoch": 0.7172801400469895, + "grad_norm": 0.9305854058984073, + "learning_rate": 1.0151722728460064e-06, + "loss": 0.1232, + "step": 7785 + }, + { + "epoch": 0.7173722762242595, + "grad_norm": 0.9193933839229599, + "learning_rate": 1.0145594753260849e-06, + "loss": 0.1162, + "step": 7786 + }, + { + "epoch": 0.7174644124015295, + "grad_norm": 0.9311548944071399, + "learning_rate": 1.0139468157294762e-06, + "loss": 0.1218, + "step": 7787 + }, + { + "epoch": 0.7175565485787995, + "grad_norm": 0.9143005687332517, + "learning_rate": 1.0133342941130664e-06, + "loss": 0.1112, + "step": 7788 + }, + { + "epoch": 0.7176486847560695, + "grad_norm": 0.9207452007285798, + "learning_rate": 1.0127219105337274e-06, + "loss": 0.1259, + "step": 7789 + }, + { + "epoch": 0.7177408209333395, + "grad_norm": 0.9215304391889503, + "learning_rate": 1.0121096650483182e-06, + "loss": 0.1169, + "step": 7790 + }, + { + "epoch": 0.7178329571106095, + "grad_norm": 0.8909392126110938, + "learning_rate": 1.0114975577136866e-06, + "loss": 0.1194, + "step": 7791 + }, + { + "epoch": 0.7179250932878795, + "grad_norm": 0.9693442968946673, + "learning_rate": 1.010885588586667e-06, + "loss": 0.1277, + "step": 7792 + }, + { + "epoch": 0.7180172294651495, + "grad_norm": 0.9316443174949289, + "learning_rate": 1.0102737577240818e-06, + "loss": 0.1202, + "step": 7793 + }, + { + "epoch": 0.7181093656424195, + "grad_norm": 0.9187961844110014, + "learning_rate": 1.0096620651827382e-06, + "loss": 0.1214, + "step": 7794 + }, + { + "epoch": 0.7182015018196894, + "grad_norm": 0.8998126418946526, + "learning_rate": 1.0090505110194315e-06, + "loss": 0.1128, + "step": 7795 + }, + { + "epoch": 0.7182936379969596, + "grad_norm": 0.9585336588680275, + "learning_rate": 1.0084390952909456e-06, + "loss": 0.1269, + "step": 7796 + }, + { + "epoch": 0.7183857741742296, + "grad_norm": 0.9211680803712629, + "learning_rate": 1.0078278180540507e-06, + "loss": 0.1154, + "step": 7797 + }, + { + "epoch": 0.7184779103514995, + "grad_norm": 0.8997122910133363, + "learning_rate": 1.0072166793655027e-06, + "loss": 0.1106, + "step": 7798 + }, + { + "epoch": 0.7185700465287695, + "grad_norm": 0.9414360029244899, + "learning_rate": 1.0066056792820478e-06, + "loss": 0.1301, + "step": 7799 + }, + { + "epoch": 0.7186621827060395, + "grad_norm": 0.8439284973205966, + "learning_rate": 1.0059948178604154e-06, + "loss": 0.1053, + "step": 7800 + }, + { + "epoch": 0.7187543188833095, + "grad_norm": 0.8981596749183159, + "learning_rate": 1.0053840951573247e-06, + "loss": 0.1349, + "step": 7801 + }, + { + "epoch": 0.7188464550605795, + "grad_norm": 0.8912489563990994, + "learning_rate": 1.0047735112294827e-06, + "loss": 0.1157, + "step": 7802 + }, + { + "epoch": 0.7189385912378495, + "grad_norm": 0.9025423474052598, + "learning_rate": 1.00416306613358e-06, + "loss": 0.1197, + "step": 7803 + }, + { + "epoch": 0.7190307274151195, + "grad_norm": 0.9318053328867276, + "learning_rate": 1.0035527599262988e-06, + "loss": 0.1341, + "step": 7804 + }, + { + "epoch": 0.7191228635923895, + "grad_norm": 0.9344718743643173, + "learning_rate": 1.0029425926643035e-06, + "loss": 0.1309, + "step": 7805 + }, + { + "epoch": 0.7192149997696595, + "grad_norm": 0.9345501597826967, + "learning_rate": 1.0023325644042508e-06, + "loss": 0.1228, + "step": 7806 + }, + { + "epoch": 0.7193071359469295, + "grad_norm": 0.8698344847523616, + "learning_rate": 1.0017226752027798e-06, + "loss": 0.1173, + "step": 7807 + }, + { + "epoch": 0.7193992721241995, + "grad_norm": 0.8908033217302351, + "learning_rate": 1.0011129251165198e-06, + "loss": 0.1264, + "step": 7808 + }, + { + "epoch": 0.7194914083014696, + "grad_norm": 0.8889868667942962, + "learning_rate": 1.0005033142020868e-06, + "loss": 0.1195, + "step": 7809 + }, + { + "epoch": 0.7195835444787396, + "grad_norm": 0.8608771688254551, + "learning_rate": 9.998938425160822e-07, + "loss": 0.1127, + "step": 7810 + }, + { + "epoch": 0.7196756806560096, + "grad_norm": 0.8824505031638588, + "learning_rate": 9.992845101150949e-07, + "loss": 0.1177, + "step": 7811 + }, + { + "epoch": 0.7197678168332796, + "grad_norm": 0.9015088142832688, + "learning_rate": 9.986753170557026e-07, + "loss": 0.1272, + "step": 7812 + }, + { + "epoch": 0.7198599530105496, + "grad_norm": 0.9486135831290708, + "learning_rate": 9.980662633944687e-07, + "loss": 0.1306, + "step": 7813 + }, + { + "epoch": 0.7199520891878196, + "grad_norm": 0.9331244271896649, + "learning_rate": 9.974573491879447e-07, + "loss": 0.1187, + "step": 7814 + }, + { + "epoch": 0.7200442253650896, + "grad_norm": 1.0721474757596579, + "learning_rate": 9.968485744926673e-07, + "loss": 0.1422, + "step": 7815 + }, + { + "epoch": 0.7201363615423596, + "grad_norm": 0.9312764151210932, + "learning_rate": 9.962399393651608e-07, + "loss": 0.1283, + "step": 7816 + }, + { + "epoch": 0.7202284977196296, + "grad_norm": 0.9758367075704081, + "learning_rate": 9.95631443861938e-07, + "loss": 0.1113, + "step": 7817 + }, + { + "epoch": 0.7203206338968996, + "grad_norm": 0.9924718172406113, + "learning_rate": 9.95023088039498e-07, + "loss": 0.1239, + "step": 7818 + }, + { + "epoch": 0.7204127700741696, + "grad_norm": 0.8883754010136571, + "learning_rate": 9.94414871954326e-07, + "loss": 0.1211, + "step": 7819 + }, + { + "epoch": 0.7205049062514396, + "grad_norm": 0.9535928744680094, + "learning_rate": 9.938067956628955e-07, + "loss": 0.1275, + "step": 7820 + }, + { + "epoch": 0.7205970424287096, + "grad_norm": 0.9243576442784894, + "learning_rate": 9.931988592216654e-07, + "loss": 0.1234, + "step": 7821 + }, + { + "epoch": 0.7206891786059796, + "grad_norm": 0.8921816864655784, + "learning_rate": 9.925910626870841e-07, + "loss": 0.1247, + "step": 7822 + }, + { + "epoch": 0.7207813147832497, + "grad_norm": 1.0087296503855505, + "learning_rate": 9.919834061155841e-07, + "loss": 0.1403, + "step": 7823 + }, + { + "epoch": 0.7208734509605197, + "grad_norm": 0.9161566983649667, + "learning_rate": 9.913758895635872e-07, + "loss": 0.1207, + "step": 7824 + }, + { + "epoch": 0.7209655871377897, + "grad_norm": 0.9466811448920842, + "learning_rate": 9.907685130875022e-07, + "loss": 0.1305, + "step": 7825 + }, + { + "epoch": 0.7210577233150597, + "grad_norm": 0.9427068503028089, + "learning_rate": 9.901612767437233e-07, + "loss": 0.1322, + "step": 7826 + }, + { + "epoch": 0.7211498594923297, + "grad_norm": 0.8997398397326478, + "learning_rate": 9.89554180588631e-07, + "loss": 0.126, + "step": 7827 + }, + { + "epoch": 0.7212419956695997, + "grad_norm": 0.9158172595074155, + "learning_rate": 9.889472246785962e-07, + "loss": 0.1259, + "step": 7828 + }, + { + "epoch": 0.7213341318468697, + "grad_norm": 0.9590521031356755, + "learning_rate": 9.883404090699739e-07, + "loss": 0.123, + "step": 7829 + }, + { + "epoch": 0.7214262680241397, + "grad_norm": 0.9197380638874784, + "learning_rate": 9.877337338191081e-07, + "loss": 0.1255, + "step": 7830 + }, + { + "epoch": 0.7215184042014097, + "grad_norm": 0.9495198865360079, + "learning_rate": 9.871271989823279e-07, + "loss": 0.1257, + "step": 7831 + }, + { + "epoch": 0.7216105403786797, + "grad_norm": 0.9618388100017592, + "learning_rate": 9.865208046159493e-07, + "loss": 0.1222, + "step": 7832 + }, + { + "epoch": 0.7217026765559497, + "grad_norm": 0.9187912885939795, + "learning_rate": 9.85914550776277e-07, + "loss": 0.1315, + "step": 7833 + }, + { + "epoch": 0.7217948127332197, + "grad_norm": 0.9463336421029137, + "learning_rate": 9.853084375196013e-07, + "loss": 0.1376, + "step": 7834 + }, + { + "epoch": 0.7218869489104897, + "grad_norm": 0.8998408177725145, + "learning_rate": 9.847024649022014e-07, + "loss": 0.1134, + "step": 7835 + }, + { + "epoch": 0.7219790850877598, + "grad_norm": 0.9016223772783057, + "learning_rate": 9.840966329803404e-07, + "loss": 0.1229, + "step": 7836 + }, + { + "epoch": 0.7220712212650298, + "grad_norm": 0.9254109877176989, + "learning_rate": 9.834909418102694e-07, + "loss": 0.1241, + "step": 7837 + }, + { + "epoch": 0.7221633574422998, + "grad_norm": 0.9144500299198667, + "learning_rate": 9.828853914482276e-07, + "loss": 0.1148, + "step": 7838 + }, + { + "epoch": 0.7222554936195698, + "grad_norm": 0.8920230252999829, + "learning_rate": 9.822799819504413e-07, + "loss": 0.1161, + "step": 7839 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 0.865650754828013, + "learning_rate": 9.816747133731213e-07, + "loss": 0.1116, + "step": 7840 + }, + { + "epoch": 0.7224397659741097, + "grad_norm": 0.9425761369193745, + "learning_rate": 9.810695857724685e-07, + "loss": 0.1177, + "step": 7841 + }, + { + "epoch": 0.7225319021513797, + "grad_norm": 0.9392678290824935, + "learning_rate": 9.80464599204667e-07, + "loss": 0.1274, + "step": 7842 + }, + { + "epoch": 0.7226240383286497, + "grad_norm": 0.8952324686337915, + "learning_rate": 9.798597537258921e-07, + "loss": 0.118, + "step": 7843 + }, + { + "epoch": 0.7227161745059197, + "grad_norm": 0.9485924421467364, + "learning_rate": 9.79255049392302e-07, + "loss": 0.1253, + "step": 7844 + }, + { + "epoch": 0.7228083106831897, + "grad_norm": 0.9258101069064801, + "learning_rate": 9.78650486260044e-07, + "loss": 0.1185, + "step": 7845 + }, + { + "epoch": 0.7229004468604597, + "grad_norm": 0.9892771454693859, + "learning_rate": 9.78046064385253e-07, + "loss": 0.1191, + "step": 7846 + }, + { + "epoch": 0.7229925830377297, + "grad_norm": 0.9281111956495237, + "learning_rate": 9.774417838240485e-07, + "loss": 0.1203, + "step": 7847 + }, + { + "epoch": 0.7230847192149997, + "grad_norm": 0.9200675350911685, + "learning_rate": 9.768376446325376e-07, + "loss": 0.1163, + "step": 7848 + }, + { + "epoch": 0.7231768553922697, + "grad_norm": 0.8588371158345436, + "learning_rate": 9.762336468668151e-07, + "loss": 0.1151, + "step": 7849 + }, + { + "epoch": 0.7232689915695398, + "grad_norm": 0.848370777709939, + "learning_rate": 9.756297905829627e-07, + "loss": 0.111, + "step": 7850 + }, + { + "epoch": 0.7233611277468098, + "grad_norm": 0.8984725801587228, + "learning_rate": 9.75026075837049e-07, + "loss": 0.1085, + "step": 7851 + }, + { + "epoch": 0.7234532639240798, + "grad_norm": 1.0168488186269962, + "learning_rate": 9.744225026851284e-07, + "loss": 0.125, + "step": 7852 + }, + { + "epoch": 0.7235454001013498, + "grad_norm": 0.8882011781278885, + "learning_rate": 9.738190711832415e-07, + "loss": 0.1143, + "step": 7853 + }, + { + "epoch": 0.7236375362786198, + "grad_norm": 0.9282850267501597, + "learning_rate": 9.732157813874185e-07, + "loss": 0.1284, + "step": 7854 + }, + { + "epoch": 0.7237296724558898, + "grad_norm": 0.9204717778431273, + "learning_rate": 9.72612633353675e-07, + "loss": 0.1117, + "step": 7855 + }, + { + "epoch": 0.7238218086331598, + "grad_norm": 0.8853711615838081, + "learning_rate": 9.720096271380122e-07, + "loss": 0.1122, + "step": 7856 + }, + { + "epoch": 0.7239139448104298, + "grad_norm": 0.9593573703467069, + "learning_rate": 9.714067627964199e-07, + "loss": 0.1265, + "step": 7857 + }, + { + "epoch": 0.7240060809876998, + "grad_norm": 0.8933130363258317, + "learning_rate": 9.708040403848752e-07, + "loss": 0.1265, + "step": 7858 + }, + { + "epoch": 0.7240982171649698, + "grad_norm": 0.9035103404027475, + "learning_rate": 9.70201459959339e-07, + "loss": 0.1134, + "step": 7859 + }, + { + "epoch": 0.7241903533422398, + "grad_norm": 0.9124284467326477, + "learning_rate": 9.695990215757625e-07, + "loss": 0.1183, + "step": 7860 + }, + { + "epoch": 0.7242824895195098, + "grad_norm": 0.9554134809640706, + "learning_rate": 9.689967252900809e-07, + "loss": 0.1205, + "step": 7861 + }, + { + "epoch": 0.7243746256967798, + "grad_norm": 0.947696514674594, + "learning_rate": 9.683945711582181e-07, + "loss": 0.1195, + "step": 7862 + }, + { + "epoch": 0.7244667618740498, + "grad_norm": 0.9489888944270076, + "learning_rate": 9.677925592360851e-07, + "loss": 0.1353, + "step": 7863 + }, + { + "epoch": 0.7245588980513199, + "grad_norm": 0.9408998478748832, + "learning_rate": 9.671906895795779e-07, + "loss": 0.1206, + "step": 7864 + }, + { + "epoch": 0.7246510342285899, + "grad_norm": 0.9700947586152011, + "learning_rate": 9.665889622445792e-07, + "loss": 0.1356, + "step": 7865 + }, + { + "epoch": 0.7247431704058599, + "grad_norm": 0.961635334352596, + "learning_rate": 9.659873772869601e-07, + "loss": 0.121, + "step": 7866 + }, + { + "epoch": 0.7248353065831299, + "grad_norm": 0.9745334075339324, + "learning_rate": 9.653859347625786e-07, + "loss": 0.1267, + "step": 7867 + }, + { + "epoch": 0.7249274427603999, + "grad_norm": 0.8762381292464243, + "learning_rate": 9.647846347272788e-07, + "loss": 0.1128, + "step": 7868 + }, + { + "epoch": 0.7250195789376699, + "grad_norm": 0.9462395663888918, + "learning_rate": 9.64183477236891e-07, + "loss": 0.1276, + "step": 7869 + }, + { + "epoch": 0.7251117151149399, + "grad_norm": 0.9656221813969156, + "learning_rate": 9.635824623472317e-07, + "loss": 0.1354, + "step": 7870 + }, + { + "epoch": 0.7252038512922099, + "grad_norm": 0.9520423537311674, + "learning_rate": 9.629815901141062e-07, + "loss": 0.1242, + "step": 7871 + }, + { + "epoch": 0.7252959874694799, + "grad_norm": 0.9210581281415772, + "learning_rate": 9.623808605933063e-07, + "loss": 0.1182, + "step": 7872 + }, + { + "epoch": 0.7253881236467499, + "grad_norm": 0.8821609441885799, + "learning_rate": 9.617802738406082e-07, + "loss": 0.1125, + "step": 7873 + }, + { + "epoch": 0.7254802598240199, + "grad_norm": 0.8783025714845024, + "learning_rate": 9.611798299117778e-07, + "loss": 0.1193, + "step": 7874 + }, + { + "epoch": 0.7255723960012899, + "grad_norm": 0.9102543279448607, + "learning_rate": 9.605795288625652e-07, + "loss": 0.118, + "step": 7875 + }, + { + "epoch": 0.7256645321785599, + "grad_norm": 0.923045665646691, + "learning_rate": 9.599793707487098e-07, + "loss": 0.1302, + "step": 7876 + }, + { + "epoch": 0.72575666835583, + "grad_norm": 0.9074231261058356, + "learning_rate": 9.593793556259347e-07, + "loss": 0.1216, + "step": 7877 + }, + { + "epoch": 0.7258488045331, + "grad_norm": 0.9150292738479361, + "learning_rate": 9.587794835499523e-07, + "loss": 0.1231, + "step": 7878 + }, + { + "epoch": 0.72594094071037, + "grad_norm": 0.8998385147836466, + "learning_rate": 9.581797545764614e-07, + "loss": 0.1155, + "step": 7879 + }, + { + "epoch": 0.72603307688764, + "grad_norm": 0.9142946009765255, + "learning_rate": 9.575801687611464e-07, + "loss": 0.1185, + "step": 7880 + }, + { + "epoch": 0.72612521306491, + "grad_norm": 1.0152010957770305, + "learning_rate": 9.569807261596779e-07, + "loss": 0.1298, + "step": 7881 + }, + { + "epoch": 0.72621734924218, + "grad_norm": 0.9170396942900356, + "learning_rate": 9.56381426827715e-07, + "loss": 0.1248, + "step": 7882 + }, + { + "epoch": 0.72630948541945, + "grad_norm": 0.9811977448714191, + "learning_rate": 9.557822708209025e-07, + "loss": 0.1295, + "step": 7883 + }, + { + "epoch": 0.72640162159672, + "grad_norm": 0.9161193726605645, + "learning_rate": 9.551832581948733e-07, + "loss": 0.1181, + "step": 7884 + }, + { + "epoch": 0.72649375777399, + "grad_norm": 0.9462600455397618, + "learning_rate": 9.54584389005245e-07, + "loss": 0.1184, + "step": 7885 + }, + { + "epoch": 0.7265858939512599, + "grad_norm": 0.9143213594519252, + "learning_rate": 9.539856633076217e-07, + "loss": 0.1144, + "step": 7886 + }, + { + "epoch": 0.7266780301285299, + "grad_norm": 0.8958649242270569, + "learning_rate": 9.533870811575957e-07, + "loss": 0.1206, + "step": 7887 + }, + { + "epoch": 0.7267701663057999, + "grad_norm": 1.0084483158787285, + "learning_rate": 9.527886426107458e-07, + "loss": 0.132, + "step": 7888 + }, + { + "epoch": 0.7268623024830699, + "grad_norm": 0.8899448954141135, + "learning_rate": 9.52190347722638e-07, + "loss": 0.1217, + "step": 7889 + }, + { + "epoch": 0.7269544386603399, + "grad_norm": 0.9445700892184191, + "learning_rate": 9.515921965488226e-07, + "loss": 0.1214, + "step": 7890 + }, + { + "epoch": 0.72704657483761, + "grad_norm": 0.9425465602318627, + "learning_rate": 9.509941891448376e-07, + "loss": 0.113, + "step": 7891 + }, + { + "epoch": 0.72713871101488, + "grad_norm": 0.9188640731856463, + "learning_rate": 9.503963255662091e-07, + "loss": 0.1235, + "step": 7892 + }, + { + "epoch": 0.72723084719215, + "grad_norm": 0.9183542312222722, + "learning_rate": 9.497986058684491e-07, + "loss": 0.124, + "step": 7893 + }, + { + "epoch": 0.72732298336942, + "grad_norm": 0.9609361929729627, + "learning_rate": 9.492010301070548e-07, + "loss": 0.123, + "step": 7894 + }, + { + "epoch": 0.72741511954669, + "grad_norm": 0.920395848689612, + "learning_rate": 9.486035983375125e-07, + "loss": 0.1203, + "step": 7895 + }, + { + "epoch": 0.72750725572396, + "grad_norm": 0.8742457567724701, + "learning_rate": 9.48006310615292e-07, + "loss": 0.1061, + "step": 7896 + }, + { + "epoch": 0.72759939190123, + "grad_norm": 0.9205268230772412, + "learning_rate": 9.474091669958538e-07, + "loss": 0.1166, + "step": 7897 + }, + { + "epoch": 0.7276915280785, + "grad_norm": 0.9528723866899338, + "learning_rate": 9.468121675346406e-07, + "loss": 0.1117, + "step": 7898 + }, + { + "epoch": 0.72778366425577, + "grad_norm": 0.8977194231147801, + "learning_rate": 9.462153122870846e-07, + "loss": 0.1227, + "step": 7899 + }, + { + "epoch": 0.72787580043304, + "grad_norm": 0.9545650384877833, + "learning_rate": 9.456186013086049e-07, + "loss": 0.1275, + "step": 7900 + }, + { + "epoch": 0.72796793661031, + "grad_norm": 0.9970229892888035, + "learning_rate": 9.450220346546057e-07, + "loss": 0.1252, + "step": 7901 + }, + { + "epoch": 0.72806007278758, + "grad_norm": 0.9069698030197497, + "learning_rate": 9.444256123804768e-07, + "loss": 0.1342, + "step": 7902 + }, + { + "epoch": 0.72815220896485, + "grad_norm": 0.917366478706477, + "learning_rate": 9.438293345415972e-07, + "loss": 0.1233, + "step": 7903 + }, + { + "epoch": 0.7282443451421201, + "grad_norm": 0.9816361473567158, + "learning_rate": 9.432332011933315e-07, + "loss": 0.119, + "step": 7904 + }, + { + "epoch": 0.7283364813193901, + "grad_norm": 0.912167553796657, + "learning_rate": 9.426372123910313e-07, + "loss": 0.1209, + "step": 7905 + }, + { + "epoch": 0.7284286174966601, + "grad_norm": 0.9049561632030253, + "learning_rate": 9.420413681900337e-07, + "loss": 0.1126, + "step": 7906 + }, + { + "epoch": 0.7285207536739301, + "grad_norm": 0.9459876967547667, + "learning_rate": 9.414456686456619e-07, + "loss": 0.1184, + "step": 7907 + }, + { + "epoch": 0.7286128898512001, + "grad_norm": 0.9084561231139721, + "learning_rate": 9.408501138132273e-07, + "loss": 0.1154, + "step": 7908 + }, + { + "epoch": 0.7287050260284701, + "grad_norm": 0.9798235306309518, + "learning_rate": 9.402547037480284e-07, + "loss": 0.1241, + "step": 7909 + }, + { + "epoch": 0.7287971622057401, + "grad_norm": 0.9169403190088212, + "learning_rate": 9.396594385053473e-07, + "loss": 0.1188, + "step": 7910 + }, + { + "epoch": 0.7288892983830101, + "grad_norm": 0.8842824376641447, + "learning_rate": 9.39064318140456e-07, + "loss": 0.1157, + "step": 7911 + }, + { + "epoch": 0.7289814345602801, + "grad_norm": 0.8815960009445793, + "learning_rate": 9.3846934270861e-07, + "loss": 0.1129, + "step": 7912 + }, + { + "epoch": 0.7290735707375501, + "grad_norm": 0.8735058660585312, + "learning_rate": 9.378745122650545e-07, + "loss": 0.0989, + "step": 7913 + }, + { + "epoch": 0.7291657069148201, + "grad_norm": 0.9964407796113233, + "learning_rate": 9.372798268650177e-07, + "loss": 0.128, + "step": 7914 + }, + { + "epoch": 0.7292578430920901, + "grad_norm": 0.9077958451134759, + "learning_rate": 9.366852865637171e-07, + "loss": 0.1119, + "step": 7915 + }, + { + "epoch": 0.7293499792693601, + "grad_norm": 0.9191567318461464, + "learning_rate": 9.360908914163569e-07, + "loss": 0.1165, + "step": 7916 + }, + { + "epoch": 0.7294421154466301, + "grad_norm": 0.9137556085419295, + "learning_rate": 9.354966414781247e-07, + "loss": 0.1151, + "step": 7917 + }, + { + "epoch": 0.7295342516239002, + "grad_norm": 0.9377389661455451, + "learning_rate": 9.349025368041989e-07, + "loss": 0.1299, + "step": 7918 + }, + { + "epoch": 0.7296263878011702, + "grad_norm": 0.9697573167162219, + "learning_rate": 9.343085774497399e-07, + "loss": 0.1303, + "step": 7919 + }, + { + "epoch": 0.7297185239784402, + "grad_norm": 0.9276544404006211, + "learning_rate": 9.337147634698979e-07, + "loss": 0.1114, + "step": 7920 + }, + { + "epoch": 0.7298106601557102, + "grad_norm": 0.9316178448679642, + "learning_rate": 9.331210949198097e-07, + "loss": 0.1186, + "step": 7921 + }, + { + "epoch": 0.7299027963329802, + "grad_norm": 0.9297104126646819, + "learning_rate": 9.325275718545962e-07, + "loss": 0.1175, + "step": 7922 + }, + { + "epoch": 0.7299949325102502, + "grad_norm": 0.9459703058813975, + "learning_rate": 9.319341943293659e-07, + "loss": 0.1143, + "step": 7923 + }, + { + "epoch": 0.7300870686875202, + "grad_norm": 0.903636093967031, + "learning_rate": 9.31340962399214e-07, + "loss": 0.1131, + "step": 7924 + }, + { + "epoch": 0.7301792048647902, + "grad_norm": 0.9130854765962986, + "learning_rate": 9.307478761192229e-07, + "loss": 0.1215, + "step": 7925 + }, + { + "epoch": 0.7302713410420602, + "grad_norm": 0.9250366455134187, + "learning_rate": 9.301549355444611e-07, + "loss": 0.1148, + "step": 7926 + }, + { + "epoch": 0.7303634772193301, + "grad_norm": 0.9753728160621229, + "learning_rate": 9.295621407299824e-07, + "loss": 0.1234, + "step": 7927 + }, + { + "epoch": 0.7304556133966001, + "grad_norm": 0.901388541959924, + "learning_rate": 9.289694917308273e-07, + "loss": 0.1241, + "step": 7928 + }, + { + "epoch": 0.7305477495738701, + "grad_norm": 0.9071666280203143, + "learning_rate": 9.283769886020238e-07, + "loss": 0.1211, + "step": 7929 + }, + { + "epoch": 0.7306398857511401, + "grad_norm": 0.9197162146244954, + "learning_rate": 9.277846313985869e-07, + "loss": 0.1112, + "step": 7930 + }, + { + "epoch": 0.7307320219284101, + "grad_norm": 0.9720399464274628, + "learning_rate": 9.271924201755153e-07, + "loss": 0.1291, + "step": 7931 + }, + { + "epoch": 0.7308241581056802, + "grad_norm": 0.9462594696606331, + "learning_rate": 9.26600354987797e-07, + "loss": 0.1146, + "step": 7932 + }, + { + "epoch": 0.7309162942829502, + "grad_norm": 0.9549579771749659, + "learning_rate": 9.260084358904056e-07, + "loss": 0.1253, + "step": 7933 + }, + { + "epoch": 0.7310084304602202, + "grad_norm": 0.9346643983496812, + "learning_rate": 9.254166629383005e-07, + "loss": 0.118, + "step": 7934 + }, + { + "epoch": 0.7311005666374902, + "grad_norm": 0.9197443173940072, + "learning_rate": 9.24825036186427e-07, + "loss": 0.1155, + "step": 7935 + }, + { + "epoch": 0.7311927028147602, + "grad_norm": 0.9111826786059956, + "learning_rate": 9.242335556897181e-07, + "loss": 0.125, + "step": 7936 + }, + { + "epoch": 0.7312848389920302, + "grad_norm": 0.8963685992216108, + "learning_rate": 9.236422215030932e-07, + "loss": 0.1071, + "step": 7937 + }, + { + "epoch": 0.7313769751693002, + "grad_norm": 0.9996120834457425, + "learning_rate": 9.230510336814586e-07, + "loss": 0.1308, + "step": 7938 + }, + { + "epoch": 0.7314691113465702, + "grad_norm": 0.9100280131314815, + "learning_rate": 9.224599922797053e-07, + "loss": 0.1254, + "step": 7939 + }, + { + "epoch": 0.7315612475238402, + "grad_norm": 0.877434720202932, + "learning_rate": 9.218690973527106e-07, + "loss": 0.1145, + "step": 7940 + }, + { + "epoch": 0.7316533837011102, + "grad_norm": 1.0262118485516254, + "learning_rate": 9.212783489553401e-07, + "loss": 0.1336, + "step": 7941 + }, + { + "epoch": 0.7317455198783802, + "grad_norm": 0.9555725570664751, + "learning_rate": 9.206877471424455e-07, + "loss": 0.1341, + "step": 7942 + }, + { + "epoch": 0.7318376560556502, + "grad_norm": 0.9392503740018264, + "learning_rate": 9.200972919688628e-07, + "loss": 0.1216, + "step": 7943 + }, + { + "epoch": 0.7319297922329202, + "grad_norm": 0.964214487072524, + "learning_rate": 9.195069834894174e-07, + "loss": 0.1232, + "step": 7944 + }, + { + "epoch": 0.7320219284101903, + "grad_norm": 0.9505944973653115, + "learning_rate": 9.189168217589178e-07, + "loss": 0.1208, + "step": 7945 + }, + { + "epoch": 0.7321140645874603, + "grad_norm": 0.9211807278405854, + "learning_rate": 9.183268068321616e-07, + "loss": 0.1156, + "step": 7946 + }, + { + "epoch": 0.7322062007647303, + "grad_norm": 0.8903747447623426, + "learning_rate": 9.177369387639323e-07, + "loss": 0.1082, + "step": 7947 + }, + { + "epoch": 0.7322983369420003, + "grad_norm": 0.8724081988043988, + "learning_rate": 9.171472176089977e-07, + "loss": 0.1116, + "step": 7948 + }, + { + "epoch": 0.7323904731192703, + "grad_norm": 0.9842222839255699, + "learning_rate": 9.165576434221152e-07, + "loss": 0.1304, + "step": 7949 + }, + { + "epoch": 0.7324826092965403, + "grad_norm": 0.8799032026581489, + "learning_rate": 9.15968216258025e-07, + "loss": 0.11, + "step": 7950 + }, + { + "epoch": 0.7325747454738103, + "grad_norm": 0.9429682413677968, + "learning_rate": 9.153789361714573e-07, + "loss": 0.1214, + "step": 7951 + }, + { + "epoch": 0.7326668816510803, + "grad_norm": 0.9521147026715193, + "learning_rate": 9.147898032171251e-07, + "loss": 0.1265, + "step": 7952 + }, + { + "epoch": 0.7327590178283503, + "grad_norm": 0.9118057426636171, + "learning_rate": 9.142008174497302e-07, + "loss": 0.1164, + "step": 7953 + }, + { + "epoch": 0.7328511540056203, + "grad_norm": 0.9177961028689359, + "learning_rate": 9.136119789239612e-07, + "loss": 0.1167, + "step": 7954 + }, + { + "epoch": 0.7329432901828903, + "grad_norm": 0.9368447428001684, + "learning_rate": 9.130232876944903e-07, + "loss": 0.1242, + "step": 7955 + }, + { + "epoch": 0.7330354263601603, + "grad_norm": 0.9623873942205943, + "learning_rate": 9.124347438159772e-07, + "loss": 0.1272, + "step": 7956 + }, + { + "epoch": 0.7331275625374303, + "grad_norm": 0.9679388638840875, + "learning_rate": 9.118463473430689e-07, + "loss": 0.1247, + "step": 7957 + }, + { + "epoch": 0.7332196987147003, + "grad_norm": 0.9003206232231331, + "learning_rate": 9.112580983303984e-07, + "loss": 0.1175, + "step": 7958 + }, + { + "epoch": 0.7333118348919704, + "grad_norm": 0.9656950894298632, + "learning_rate": 9.106699968325849e-07, + "loss": 0.121, + "step": 7959 + }, + { + "epoch": 0.7334039710692404, + "grad_norm": 0.9890202665502145, + "learning_rate": 9.100820429042337e-07, + "loss": 0.1317, + "step": 7960 + }, + { + "epoch": 0.7334961072465104, + "grad_norm": 0.9233138280132024, + "learning_rate": 9.094942365999349e-07, + "loss": 0.1193, + "step": 7961 + }, + { + "epoch": 0.7335882434237804, + "grad_norm": 0.9742207172348182, + "learning_rate": 9.089065779742673e-07, + "loss": 0.1299, + "step": 7962 + }, + { + "epoch": 0.7336803796010504, + "grad_norm": 0.8990113391460285, + "learning_rate": 9.083190670817963e-07, + "loss": 0.1145, + "step": 7963 + }, + { + "epoch": 0.7337725157783204, + "grad_norm": 0.861187500883117, + "learning_rate": 9.0773170397707e-07, + "loss": 0.1143, + "step": 7964 + }, + { + "epoch": 0.7338646519555904, + "grad_norm": 0.9339588576002366, + "learning_rate": 9.071444887146275e-07, + "loss": 0.125, + "step": 7965 + }, + { + "epoch": 0.7339567881328604, + "grad_norm": 0.9228986922230902, + "learning_rate": 9.065574213489897e-07, + "loss": 0.1177, + "step": 7966 + }, + { + "epoch": 0.7340489243101304, + "grad_norm": 0.8987184447458356, + "learning_rate": 9.059705019346676e-07, + "loss": 0.1147, + "step": 7967 + }, + { + "epoch": 0.7341410604874004, + "grad_norm": 0.9718902501296317, + "learning_rate": 9.053837305261551e-07, + "loss": 0.117, + "step": 7968 + }, + { + "epoch": 0.7342331966646704, + "grad_norm": 0.882640269301587, + "learning_rate": 9.047971071779349e-07, + "loss": 0.1014, + "step": 7969 + }, + { + "epoch": 0.7343253328419403, + "grad_norm": 0.9295900436726557, + "learning_rate": 9.042106319444757e-07, + "loss": 0.1282, + "step": 7970 + }, + { + "epoch": 0.7344174690192103, + "grad_norm": 0.9431206523555368, + "learning_rate": 9.036243048802312e-07, + "loss": 0.1172, + "step": 7971 + }, + { + "epoch": 0.7345096051964805, + "grad_norm": 0.9400908273680944, + "learning_rate": 9.030381260396409e-07, + "loss": 0.1143, + "step": 7972 + }, + { + "epoch": 0.7346017413737504, + "grad_norm": 0.9688951073944904, + "learning_rate": 9.024520954771326e-07, + "loss": 0.131, + "step": 7973 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.9851741676237068, + "learning_rate": 9.018662132471189e-07, + "loss": 0.1241, + "step": 7974 + }, + { + "epoch": 0.7347860137282904, + "grad_norm": 0.9057714871865711, + "learning_rate": 9.012804794040003e-07, + "loss": 0.1179, + "step": 7975 + }, + { + "epoch": 0.7348781499055604, + "grad_norm": 0.9750342427400404, + "learning_rate": 9.006948940021612e-07, + "loss": 0.121, + "step": 7976 + }, + { + "epoch": 0.7349702860828304, + "grad_norm": 0.9246612470094258, + "learning_rate": 9.001094570959726e-07, + "loss": 0.1145, + "step": 7977 + }, + { + "epoch": 0.7350624222601004, + "grad_norm": 0.9570937459286335, + "learning_rate": 8.995241687397929e-07, + "loss": 0.1219, + "step": 7978 + }, + { + "epoch": 0.7351545584373704, + "grad_norm": 0.9961421625364182, + "learning_rate": 8.989390289879665e-07, + "loss": 0.1294, + "step": 7979 + }, + { + "epoch": 0.7352466946146404, + "grad_norm": 0.9600761953924268, + "learning_rate": 8.983540378948244e-07, + "loss": 0.1242, + "step": 7980 + }, + { + "epoch": 0.7353388307919104, + "grad_norm": 0.9304715372533625, + "learning_rate": 8.977691955146823e-07, + "loss": 0.1137, + "step": 7981 + }, + { + "epoch": 0.7354309669691804, + "grad_norm": 0.9413815198495417, + "learning_rate": 8.971845019018419e-07, + "loss": 0.1279, + "step": 7982 + }, + { + "epoch": 0.7355231031464504, + "grad_norm": 0.9586301278205637, + "learning_rate": 8.965999571105929e-07, + "loss": 0.1302, + "step": 7983 + }, + { + "epoch": 0.7356152393237204, + "grad_norm": 1.0239076800420843, + "learning_rate": 8.960155611952115e-07, + "loss": 0.139, + "step": 7984 + }, + { + "epoch": 0.7357073755009904, + "grad_norm": 0.8874691209556264, + "learning_rate": 8.954313142099568e-07, + "loss": 0.1162, + "step": 7985 + }, + { + "epoch": 0.7357995116782605, + "grad_norm": 0.8906043874184616, + "learning_rate": 8.948472162090782e-07, + "loss": 0.1218, + "step": 7986 + }, + { + "epoch": 0.7358916478555305, + "grad_norm": 0.8774243675542508, + "learning_rate": 8.942632672468077e-07, + "loss": 0.1146, + "step": 7987 + }, + { + "epoch": 0.7359837840328005, + "grad_norm": 0.9447564057822249, + "learning_rate": 8.936794673773661e-07, + "loss": 0.1266, + "step": 7988 + }, + { + "epoch": 0.7360759202100705, + "grad_norm": 0.9399500136330979, + "learning_rate": 8.930958166549583e-07, + "loss": 0.1266, + "step": 7989 + }, + { + "epoch": 0.7361680563873405, + "grad_norm": 0.9102773778780244, + "learning_rate": 8.925123151337767e-07, + "loss": 0.1176, + "step": 7990 + }, + { + "epoch": 0.7362601925646105, + "grad_norm": 0.9128385627600489, + "learning_rate": 8.919289628680005e-07, + "loss": 0.1205, + "step": 7991 + }, + { + "epoch": 0.7363523287418805, + "grad_norm": 0.9342386016200168, + "learning_rate": 8.913457599117933e-07, + "loss": 0.1156, + "step": 7992 + }, + { + "epoch": 0.7364444649191505, + "grad_norm": 0.9199680452398934, + "learning_rate": 8.907627063193045e-07, + "loss": 0.1165, + "step": 7993 + }, + { + "epoch": 0.7365366010964205, + "grad_norm": 0.9437755517472011, + "learning_rate": 8.901798021446714e-07, + "loss": 0.1275, + "step": 7994 + }, + { + "epoch": 0.7366287372736905, + "grad_norm": 0.8953777925202131, + "learning_rate": 8.895970474420171e-07, + "loss": 0.1101, + "step": 7995 + }, + { + "epoch": 0.7367208734509605, + "grad_norm": 0.9170465764857951, + "learning_rate": 8.890144422654512e-07, + "loss": 0.1194, + "step": 7996 + }, + { + "epoch": 0.7368130096282305, + "grad_norm": 0.9822108915381218, + "learning_rate": 8.884319866690674e-07, + "loss": 0.1174, + "step": 7997 + }, + { + "epoch": 0.7369051458055005, + "grad_norm": 0.933804644233933, + "learning_rate": 8.878496807069464e-07, + "loss": 0.1168, + "step": 7998 + }, + { + "epoch": 0.7369972819827705, + "grad_norm": 0.9261693186397323, + "learning_rate": 8.87267524433156e-07, + "loss": 0.1199, + "step": 7999 + }, + { + "epoch": 0.7370894181600406, + "grad_norm": 0.9344138753722313, + "learning_rate": 8.866855179017505e-07, + "loss": 0.116, + "step": 8000 + }, + { + "epoch": 0.7370894181600406, + "eval_loss": 0.12275012582540512, + "eval_runtime": 299.0394, + "eval_samples_per_second": 23.465, + "eval_steps_per_second": 2.936, + "step": 8000 + }, + { + "epoch": 0.7371815543373106, + "grad_norm": 0.9069037559452209, + "learning_rate": 8.861036611667676e-07, + "loss": 0.1074, + "step": 8001 + }, + { + "epoch": 0.7372736905145806, + "grad_norm": 0.916023672944426, + "learning_rate": 8.855219542822341e-07, + "loss": 0.1059, + "step": 8002 + }, + { + "epoch": 0.7373658266918506, + "grad_norm": 0.9141703814175853, + "learning_rate": 8.8494039730216e-07, + "loss": 0.1236, + "step": 8003 + }, + { + "epoch": 0.7374579628691206, + "grad_norm": 0.961606399248731, + "learning_rate": 8.843589902805438e-07, + "loss": 0.1317, + "step": 8004 + }, + { + "epoch": 0.7375500990463906, + "grad_norm": 0.934755066682781, + "learning_rate": 8.837777332713701e-07, + "loss": 0.1203, + "step": 8005 + }, + { + "epoch": 0.7376422352236606, + "grad_norm": 0.9025835586979728, + "learning_rate": 8.831966263286071e-07, + "loss": 0.1187, + "step": 8006 + }, + { + "epoch": 0.7377343714009306, + "grad_norm": 0.9078088364126208, + "learning_rate": 8.826156695062113e-07, + "loss": 0.1241, + "step": 8007 + }, + { + "epoch": 0.7378265075782006, + "grad_norm": 0.9342661255740071, + "learning_rate": 8.820348628581254e-07, + "loss": 0.1228, + "step": 8008 + }, + { + "epoch": 0.7379186437554706, + "grad_norm": 0.9935337510975628, + "learning_rate": 8.814542064382767e-07, + "loss": 0.1244, + "step": 8009 + }, + { + "epoch": 0.7380107799327406, + "grad_norm": 0.9048942427175215, + "learning_rate": 8.808737003005782e-07, + "loss": 0.1214, + "step": 8010 + }, + { + "epoch": 0.7381029161100106, + "grad_norm": 0.9564069497268002, + "learning_rate": 8.802933444989308e-07, + "loss": 0.1257, + "step": 8011 + }, + { + "epoch": 0.7381950522872806, + "grad_norm": 0.8980904699977105, + "learning_rate": 8.797131390872207e-07, + "loss": 0.1104, + "step": 8012 + }, + { + "epoch": 0.7382871884645507, + "grad_norm": 0.8601604398887479, + "learning_rate": 8.79133084119321e-07, + "loss": 0.1216, + "step": 8013 + }, + { + "epoch": 0.7383793246418207, + "grad_norm": 0.9231462426012692, + "learning_rate": 8.78553179649089e-07, + "loss": 0.1287, + "step": 8014 + }, + { + "epoch": 0.7384714608190907, + "grad_norm": 0.9342174944000972, + "learning_rate": 8.779734257303677e-07, + "loss": 0.1248, + "step": 8015 + }, + { + "epoch": 0.7385635969963606, + "grad_norm": 0.9247205293120123, + "learning_rate": 8.773938224169884e-07, + "loss": 0.119, + "step": 8016 + }, + { + "epoch": 0.7386557331736306, + "grad_norm": 0.9345834504752919, + "learning_rate": 8.768143697627681e-07, + "loss": 0.1301, + "step": 8017 + }, + { + "epoch": 0.7387478693509006, + "grad_norm": 0.8466531869104624, + "learning_rate": 8.762350678215076e-07, + "loss": 0.1085, + "step": 8018 + }, + { + "epoch": 0.7388400055281706, + "grad_norm": 0.948753230207356, + "learning_rate": 8.756559166469966e-07, + "loss": 0.1299, + "step": 8019 + }, + { + "epoch": 0.7389321417054406, + "grad_norm": 0.9596076569583665, + "learning_rate": 8.750769162930076e-07, + "loss": 0.1221, + "step": 8020 + }, + { + "epoch": 0.7390242778827106, + "grad_norm": 0.9550525329866566, + "learning_rate": 8.744980668133026e-07, + "loss": 0.1208, + "step": 8021 + }, + { + "epoch": 0.7391164140599806, + "grad_norm": 0.9490302288135481, + "learning_rate": 8.739193682616265e-07, + "loss": 0.1197, + "step": 8022 + }, + { + "epoch": 0.7392085502372506, + "grad_norm": 0.9200381002548552, + "learning_rate": 8.733408206917118e-07, + "loss": 0.1226, + "step": 8023 + }, + { + "epoch": 0.7393006864145206, + "grad_norm": 0.9650781421693304, + "learning_rate": 8.727624241572779e-07, + "loss": 0.1292, + "step": 8024 + }, + { + "epoch": 0.7393928225917906, + "grad_norm": 0.904301714517166, + "learning_rate": 8.72184178712028e-07, + "loss": 0.1146, + "step": 8025 + }, + { + "epoch": 0.7394849587690606, + "grad_norm": 0.8895832457122699, + "learning_rate": 8.716060844096514e-07, + "loss": 0.1159, + "step": 8026 + }, + { + "epoch": 0.7395770949463307, + "grad_norm": 0.8901681509477561, + "learning_rate": 8.710281413038252e-07, + "loss": 0.1127, + "step": 8027 + }, + { + "epoch": 0.7396692311236007, + "grad_norm": 0.973436941777358, + "learning_rate": 8.704503494482114e-07, + "loss": 0.1323, + "step": 8028 + }, + { + "epoch": 0.7397613673008707, + "grad_norm": 0.8806928065451151, + "learning_rate": 8.698727088964587e-07, + "loss": 0.1144, + "step": 8029 + }, + { + "epoch": 0.7398535034781407, + "grad_norm": 0.9283017152565503, + "learning_rate": 8.692952197022006e-07, + "loss": 0.1126, + "step": 8030 + }, + { + "epoch": 0.7399456396554107, + "grad_norm": 0.9541252081277901, + "learning_rate": 8.687178819190558e-07, + "loss": 0.1252, + "step": 8031 + }, + { + "epoch": 0.7400377758326807, + "grad_norm": 0.9693080055760379, + "learning_rate": 8.681406956006316e-07, + "loss": 0.1327, + "step": 8032 + }, + { + "epoch": 0.7401299120099507, + "grad_norm": 0.9120977367696028, + "learning_rate": 8.675636608005191e-07, + "loss": 0.1218, + "step": 8033 + }, + { + "epoch": 0.7402220481872207, + "grad_norm": 0.9133310755374804, + "learning_rate": 8.669867775722973e-07, + "loss": 0.1308, + "step": 8034 + }, + { + "epoch": 0.7403141843644907, + "grad_norm": 0.8719559549311144, + "learning_rate": 8.66410045969529e-07, + "loss": 0.1094, + "step": 8035 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 0.9203157162386649, + "learning_rate": 8.658334660457629e-07, + "loss": 0.1215, + "step": 8036 + }, + { + "epoch": 0.7404984567190307, + "grad_norm": 0.8475985566773456, + "learning_rate": 8.652570378545355e-07, + "loss": 0.1024, + "step": 8037 + }, + { + "epoch": 0.7405905928963007, + "grad_norm": 0.8714394463711818, + "learning_rate": 8.646807614493685e-07, + "loss": 0.1074, + "step": 8038 + }, + { + "epoch": 0.7406827290735707, + "grad_norm": 0.8951025111622029, + "learning_rate": 8.641046368837682e-07, + "loss": 0.119, + "step": 8039 + }, + { + "epoch": 0.7407748652508408, + "grad_norm": 0.8940983168209601, + "learning_rate": 8.635286642112295e-07, + "loss": 0.1068, + "step": 8040 + }, + { + "epoch": 0.7408670014281108, + "grad_norm": 0.9700185466019189, + "learning_rate": 8.629528434852294e-07, + "loss": 0.1261, + "step": 8041 + }, + { + "epoch": 0.7409591376053808, + "grad_norm": 0.9605648103993476, + "learning_rate": 8.623771747592347e-07, + "loss": 0.1303, + "step": 8042 + }, + { + "epoch": 0.7410512737826508, + "grad_norm": 0.8647742808321739, + "learning_rate": 8.618016580866947e-07, + "loss": 0.113, + "step": 8043 + }, + { + "epoch": 0.7411434099599208, + "grad_norm": 0.874872810061018, + "learning_rate": 8.612262935210472e-07, + "loss": 0.1088, + "step": 8044 + }, + { + "epoch": 0.7412355461371908, + "grad_norm": 0.959941985103914, + "learning_rate": 8.606510811157154e-07, + "loss": 0.1181, + "step": 8045 + }, + { + "epoch": 0.7413276823144608, + "grad_norm": 0.9386067809347824, + "learning_rate": 8.600760209241074e-07, + "loss": 0.1277, + "step": 8046 + }, + { + "epoch": 0.7414198184917308, + "grad_norm": 0.9597688620808285, + "learning_rate": 8.595011129996164e-07, + "loss": 0.1219, + "step": 8047 + }, + { + "epoch": 0.7415119546690008, + "grad_norm": 0.9520032460057815, + "learning_rate": 8.589263573956236e-07, + "loss": 0.1226, + "step": 8048 + }, + { + "epoch": 0.7416040908462708, + "grad_norm": 0.9018956330326757, + "learning_rate": 8.583517541654951e-07, + "loss": 0.1117, + "step": 8049 + }, + { + "epoch": 0.7416962270235408, + "grad_norm": 0.9368276434977748, + "learning_rate": 8.577773033625836e-07, + "loss": 0.1226, + "step": 8050 + }, + { + "epoch": 0.7417883632008108, + "grad_norm": 0.8922899110331376, + "learning_rate": 8.572030050402264e-07, + "loss": 0.1094, + "step": 8051 + }, + { + "epoch": 0.7418804993780808, + "grad_norm": 0.9630269672440016, + "learning_rate": 8.566288592517461e-07, + "loss": 0.1222, + "step": 8052 + }, + { + "epoch": 0.7419726355553508, + "grad_norm": 0.9077601187164204, + "learning_rate": 8.560548660504531e-07, + "loss": 0.1109, + "step": 8053 + }, + { + "epoch": 0.7420647717326209, + "grad_norm": 0.9147053432114655, + "learning_rate": 8.554810254896434e-07, + "loss": 0.1222, + "step": 8054 + }, + { + "epoch": 0.7421569079098909, + "grad_norm": 0.8670005809699466, + "learning_rate": 8.54907337622597e-07, + "loss": 0.1051, + "step": 8055 + }, + { + "epoch": 0.7422490440871609, + "grad_norm": 0.9849815202966365, + "learning_rate": 8.543338025025818e-07, + "loss": 0.1295, + "step": 8056 + }, + { + "epoch": 0.7423411802644309, + "grad_norm": 0.9317265536420508, + "learning_rate": 8.537604201828495e-07, + "loss": 0.1194, + "step": 8057 + }, + { + "epoch": 0.7424333164417009, + "grad_norm": 0.9531648343620684, + "learning_rate": 8.5318719071664e-07, + "loss": 0.1215, + "step": 8058 + }, + { + "epoch": 0.7425254526189708, + "grad_norm": 0.9383372475678683, + "learning_rate": 8.526141141571764e-07, + "loss": 0.1263, + "step": 8059 + }, + { + "epoch": 0.7426175887962408, + "grad_norm": 0.8973356402044012, + "learning_rate": 8.520411905576697e-07, + "loss": 0.1238, + "step": 8060 + }, + { + "epoch": 0.7427097249735108, + "grad_norm": 0.8774857932653227, + "learning_rate": 8.514684199713166e-07, + "loss": 0.1126, + "step": 8061 + }, + { + "epoch": 0.7428018611507808, + "grad_norm": 0.9196376921082318, + "learning_rate": 8.508958024512972e-07, + "loss": 0.1186, + "step": 8062 + }, + { + "epoch": 0.7428939973280508, + "grad_norm": 0.9235244197036236, + "learning_rate": 8.503233380507808e-07, + "loss": 0.1221, + "step": 8063 + }, + { + "epoch": 0.7429861335053208, + "grad_norm": 0.8945240160141257, + "learning_rate": 8.497510268229192e-07, + "loss": 0.125, + "step": 8064 + }, + { + "epoch": 0.7430782696825908, + "grad_norm": 0.9899562846771053, + "learning_rate": 8.491788688208524e-07, + "loss": 0.1272, + "step": 8065 + }, + { + "epoch": 0.7431704058598608, + "grad_norm": 0.9773907890691923, + "learning_rate": 8.486068640977063e-07, + "loss": 0.1312, + "step": 8066 + }, + { + "epoch": 0.7432625420371308, + "grad_norm": 0.9148469368667612, + "learning_rate": 8.480350127065904e-07, + "loss": 0.1144, + "step": 8067 + }, + { + "epoch": 0.7433546782144009, + "grad_norm": 0.8556460111218671, + "learning_rate": 8.474633147006006e-07, + "loss": 0.116, + "step": 8068 + }, + { + "epoch": 0.7434468143916709, + "grad_norm": 0.9538636763158576, + "learning_rate": 8.468917701328197e-07, + "loss": 0.1281, + "step": 8069 + }, + { + "epoch": 0.7435389505689409, + "grad_norm": 0.9288207869960571, + "learning_rate": 8.46320379056316e-07, + "loss": 0.1262, + "step": 8070 + }, + { + "epoch": 0.7436310867462109, + "grad_norm": 0.9360281891768948, + "learning_rate": 8.45749141524144e-07, + "loss": 0.1239, + "step": 8071 + }, + { + "epoch": 0.7437232229234809, + "grad_norm": 0.8474422015239383, + "learning_rate": 8.45178057589342e-07, + "loss": 0.1068, + "step": 8072 + }, + { + "epoch": 0.7438153591007509, + "grad_norm": 0.9805074107349353, + "learning_rate": 8.446071273049347e-07, + "loss": 0.1309, + "step": 8073 + }, + { + "epoch": 0.7439074952780209, + "grad_norm": 0.9428578922557328, + "learning_rate": 8.440363507239338e-07, + "loss": 0.122, + "step": 8074 + }, + { + "epoch": 0.7439996314552909, + "grad_norm": 0.8616051880037617, + "learning_rate": 8.434657278993369e-07, + "loss": 0.1103, + "step": 8075 + }, + { + "epoch": 0.7440917676325609, + "grad_norm": 0.8825856553344199, + "learning_rate": 8.428952588841247e-07, + "loss": 0.1141, + "step": 8076 + }, + { + "epoch": 0.7441839038098309, + "grad_norm": 0.945774709888264, + "learning_rate": 8.423249437312667e-07, + "loss": 0.1267, + "step": 8077 + }, + { + "epoch": 0.7442760399871009, + "grad_norm": 0.9036109907141343, + "learning_rate": 8.41754782493715e-07, + "loss": 0.1132, + "step": 8078 + }, + { + "epoch": 0.7443681761643709, + "grad_norm": 0.9153835514619679, + "learning_rate": 8.411847752244115e-07, + "loss": 0.1165, + "step": 8079 + }, + { + "epoch": 0.7444603123416409, + "grad_norm": 1.0480110548918762, + "learning_rate": 8.406149219762791e-07, + "loss": 0.1479, + "step": 8080 + }, + { + "epoch": 0.744552448518911, + "grad_norm": 0.8801512229098181, + "learning_rate": 8.400452228022296e-07, + "loss": 0.1087, + "step": 8081 + }, + { + "epoch": 0.744644584696181, + "grad_norm": 0.9245546425567027, + "learning_rate": 8.394756777551602e-07, + "loss": 0.125, + "step": 8082 + }, + { + "epoch": 0.744736720873451, + "grad_norm": 0.9070405580998772, + "learning_rate": 8.389062868879541e-07, + "loss": 0.1173, + "step": 8083 + }, + { + "epoch": 0.744828857050721, + "grad_norm": 0.8722507342722666, + "learning_rate": 8.383370502534765e-07, + "loss": 0.1102, + "step": 8084 + }, + { + "epoch": 0.744920993227991, + "grad_norm": 0.9207796493390706, + "learning_rate": 8.377679679045828e-07, + "loss": 0.1245, + "step": 8085 + }, + { + "epoch": 0.745013129405261, + "grad_norm": 0.9025661348734192, + "learning_rate": 8.371990398941121e-07, + "loss": 0.1255, + "step": 8086 + }, + { + "epoch": 0.745105265582531, + "grad_norm": 0.9436641765010778, + "learning_rate": 8.366302662748901e-07, + "loss": 0.1198, + "step": 8087 + }, + { + "epoch": 0.745197401759801, + "grad_norm": 0.9440084983200447, + "learning_rate": 8.360616470997263e-07, + "loss": 0.1132, + "step": 8088 + }, + { + "epoch": 0.745289537937071, + "grad_norm": 0.9353058639345495, + "learning_rate": 8.354931824214185e-07, + "loss": 0.1203, + "step": 8089 + }, + { + "epoch": 0.745381674114341, + "grad_norm": 0.9620361208227816, + "learning_rate": 8.349248722927469e-07, + "loss": 0.1263, + "step": 8090 + }, + { + "epoch": 0.745473810291611, + "grad_norm": 0.9390265581383841, + "learning_rate": 8.343567167664801e-07, + "loss": 0.123, + "step": 8091 + }, + { + "epoch": 0.745565946468881, + "grad_norm": 0.9554238810465895, + "learning_rate": 8.337887158953723e-07, + "loss": 0.1203, + "step": 8092 + }, + { + "epoch": 0.745658082646151, + "grad_norm": 0.9129449457442721, + "learning_rate": 8.332208697321606e-07, + "loss": 0.1192, + "step": 8093 + }, + { + "epoch": 0.745750218823421, + "grad_norm": 0.9137176785711236, + "learning_rate": 8.326531783295716e-07, + "loss": 0.115, + "step": 8094 + }, + { + "epoch": 0.7458423550006911, + "grad_norm": 0.9545661166830167, + "learning_rate": 8.320856417403134e-07, + "loss": 0.1287, + "step": 8095 + }, + { + "epoch": 0.7459344911779611, + "grad_norm": 0.9471937440524988, + "learning_rate": 8.315182600170838e-07, + "loss": 0.1241, + "step": 8096 + }, + { + "epoch": 0.7460266273552311, + "grad_norm": 0.9942778081193968, + "learning_rate": 8.309510332125623e-07, + "loss": 0.134, + "step": 8097 + }, + { + "epoch": 0.7461187635325011, + "grad_norm": 0.8990687326017974, + "learning_rate": 8.30383961379417e-07, + "loss": 0.1176, + "step": 8098 + }, + { + "epoch": 0.7462108997097711, + "grad_norm": 0.9198246571850542, + "learning_rate": 8.298170445703016e-07, + "loss": 0.1205, + "step": 8099 + }, + { + "epoch": 0.746303035887041, + "grad_norm": 0.8982667907638325, + "learning_rate": 8.292502828378534e-07, + "loss": 0.1242, + "step": 8100 + }, + { + "epoch": 0.746395172064311, + "grad_norm": 0.9183599913087285, + "learning_rate": 8.286836762346953e-07, + "loss": 0.1261, + "step": 8101 + }, + { + "epoch": 0.746487308241581, + "grad_norm": 0.9390779289689044, + "learning_rate": 8.281172248134376e-07, + "loss": 0.1253, + "step": 8102 + }, + { + "epoch": 0.746579444418851, + "grad_norm": 0.915484652855498, + "learning_rate": 8.275509286266755e-07, + "loss": 0.1213, + "step": 8103 + }, + { + "epoch": 0.746671580596121, + "grad_norm": 0.9337864533144221, + "learning_rate": 8.26984787726991e-07, + "loss": 0.124, + "step": 8104 + }, + { + "epoch": 0.746763716773391, + "grad_norm": 0.9738610570783551, + "learning_rate": 8.264188021669483e-07, + "loss": 0.1287, + "step": 8105 + }, + { + "epoch": 0.746855852950661, + "grad_norm": 0.8942501970326633, + "learning_rate": 8.258529719990996e-07, + "loss": 0.1213, + "step": 8106 + }, + { + "epoch": 0.746947989127931, + "grad_norm": 1.0538439434415687, + "learning_rate": 8.252872972759826e-07, + "loss": 0.1349, + "step": 8107 + }, + { + "epoch": 0.7470401253052011, + "grad_norm": 0.9313257811750428, + "learning_rate": 8.24721778050121e-07, + "loss": 0.1182, + "step": 8108 + }, + { + "epoch": 0.7471322614824711, + "grad_norm": 0.9095931652756736, + "learning_rate": 8.241564143740216e-07, + "loss": 0.1245, + "step": 8109 + }, + { + "epoch": 0.7472243976597411, + "grad_norm": 0.9127789000547557, + "learning_rate": 8.235912063001805e-07, + "loss": 0.1196, + "step": 8110 + }, + { + "epoch": 0.7473165338370111, + "grad_norm": 0.9008185338290264, + "learning_rate": 8.230261538810755e-07, + "loss": 0.1161, + "step": 8111 + }, + { + "epoch": 0.7474086700142811, + "grad_norm": 0.9802778555969498, + "learning_rate": 8.224612571691734e-07, + "loss": 0.1207, + "step": 8112 + }, + { + "epoch": 0.7475008061915511, + "grad_norm": 0.9268639919324714, + "learning_rate": 8.218965162169232e-07, + "loss": 0.1223, + "step": 8113 + }, + { + "epoch": 0.7475929423688211, + "grad_norm": 0.9181135011446372, + "learning_rate": 8.21331931076762e-07, + "loss": 0.1238, + "step": 8114 + }, + { + "epoch": 0.7476850785460911, + "grad_norm": 0.9651031360768163, + "learning_rate": 8.207675018011127e-07, + "loss": 0.1277, + "step": 8115 + }, + { + "epoch": 0.7477772147233611, + "grad_norm": 0.9007347893819367, + "learning_rate": 8.202032284423817e-07, + "loss": 0.1207, + "step": 8116 + }, + { + "epoch": 0.7478693509006311, + "grad_norm": 0.9716022381465899, + "learning_rate": 8.196391110529606e-07, + "loss": 0.1241, + "step": 8117 + }, + { + "epoch": 0.7479614870779011, + "grad_norm": 0.8951591662409739, + "learning_rate": 8.19075149685229e-07, + "loss": 0.1097, + "step": 8118 + }, + { + "epoch": 0.7480536232551711, + "grad_norm": 0.9058294291306436, + "learning_rate": 8.185113443915504e-07, + "loss": 0.1178, + "step": 8119 + }, + { + "epoch": 0.7481457594324411, + "grad_norm": 0.9752842361448212, + "learning_rate": 8.179476952242757e-07, + "loss": 0.1184, + "step": 8120 + }, + { + "epoch": 0.7482378956097111, + "grad_norm": 0.9086052019177948, + "learning_rate": 8.173842022357381e-07, + "loss": 0.1134, + "step": 8121 + }, + { + "epoch": 0.7483300317869812, + "grad_norm": 0.9355291015960355, + "learning_rate": 8.168208654782578e-07, + "loss": 0.1188, + "step": 8122 + }, + { + "epoch": 0.7484221679642512, + "grad_norm": 0.9463130206534003, + "learning_rate": 8.162576850041415e-07, + "loss": 0.1269, + "step": 8123 + }, + { + "epoch": 0.7485143041415212, + "grad_norm": 0.9097943913503486, + "learning_rate": 8.156946608656799e-07, + "loss": 0.1197, + "step": 8124 + }, + { + "epoch": 0.7486064403187912, + "grad_norm": 0.9261635841305669, + "learning_rate": 8.151317931151514e-07, + "loss": 0.1065, + "step": 8125 + }, + { + "epoch": 0.7486985764960612, + "grad_norm": 0.9384266586122866, + "learning_rate": 8.145690818048171e-07, + "loss": 0.1379, + "step": 8126 + }, + { + "epoch": 0.7487907126733312, + "grad_norm": 0.996044063579827, + "learning_rate": 8.140065269869244e-07, + "loss": 0.1245, + "step": 8127 + }, + { + "epoch": 0.7488828488506012, + "grad_norm": 0.9224711715286494, + "learning_rate": 8.134441287137068e-07, + "loss": 0.1114, + "step": 8128 + }, + { + "epoch": 0.7489749850278712, + "grad_norm": 0.9220396484994093, + "learning_rate": 8.128818870373845e-07, + "loss": 0.1084, + "step": 8129 + }, + { + "epoch": 0.7490671212051412, + "grad_norm": 0.9545910800584627, + "learning_rate": 8.123198020101594e-07, + "loss": 0.1278, + "step": 8130 + }, + { + "epoch": 0.7491592573824112, + "grad_norm": 0.9261706948520899, + "learning_rate": 8.117578736842232e-07, + "loss": 0.1276, + "step": 8131 + }, + { + "epoch": 0.7492513935596812, + "grad_norm": 0.9033268659393892, + "learning_rate": 8.111961021117496e-07, + "loss": 0.12, + "step": 8132 + }, + { + "epoch": 0.7493435297369512, + "grad_norm": 0.9488735125394763, + "learning_rate": 8.106344873449001e-07, + "loss": 0.1216, + "step": 8133 + }, + { + "epoch": 0.7494356659142212, + "grad_norm": 0.9861922120436211, + "learning_rate": 8.100730294358197e-07, + "loss": 0.1268, + "step": 8134 + }, + { + "epoch": 0.7495278020914912, + "grad_norm": 0.9421639498933942, + "learning_rate": 8.095117284366405e-07, + "loss": 0.1239, + "step": 8135 + }, + { + "epoch": 0.7496199382687613, + "grad_norm": 0.9059836146120238, + "learning_rate": 8.089505843994797e-07, + "loss": 0.1165, + "step": 8136 + }, + { + "epoch": 0.7497120744460313, + "grad_norm": 0.8729933498186071, + "learning_rate": 8.083895973764394e-07, + "loss": 0.1101, + "step": 8137 + }, + { + "epoch": 0.7498042106233013, + "grad_norm": 0.9354667026261895, + "learning_rate": 8.078287674196061e-07, + "loss": 0.1259, + "step": 8138 + }, + { + "epoch": 0.7498963468005713, + "grad_norm": 0.9393954330086781, + "learning_rate": 8.072680945810538e-07, + "loss": 0.1264, + "step": 8139 + }, + { + "epoch": 0.7499884829778413, + "grad_norm": 0.982062774545163, + "learning_rate": 8.067075789128412e-07, + "loss": 0.1351, + "step": 8140 + }, + { + "epoch": 0.7500806191551113, + "grad_norm": 0.8870732838171413, + "learning_rate": 8.061472204670129e-07, + "loss": 0.1151, + "step": 8141 + }, + { + "epoch": 0.7501727553323813, + "grad_norm": 0.9705648123376437, + "learning_rate": 8.055870192955975e-07, + "loss": 0.1183, + "step": 8142 + }, + { + "epoch": 0.7502648915096513, + "grad_norm": 0.891933701352055, + "learning_rate": 8.050269754506091e-07, + "loss": 0.1179, + "step": 8143 + }, + { + "epoch": 0.7503570276869213, + "grad_norm": 0.974472806167114, + "learning_rate": 8.044670889840481e-07, + "loss": 0.1281, + "step": 8144 + }, + { + "epoch": 0.7504491638641912, + "grad_norm": 0.9393965328736213, + "learning_rate": 8.039073599479014e-07, + "loss": 0.1243, + "step": 8145 + }, + { + "epoch": 0.7505413000414612, + "grad_norm": 0.9261251014677941, + "learning_rate": 8.033477883941379e-07, + "loss": 0.1229, + "step": 8146 + }, + { + "epoch": 0.7506334362187312, + "grad_norm": 0.9257164254893021, + "learning_rate": 8.027883743747156e-07, + "loss": 0.1275, + "step": 8147 + }, + { + "epoch": 0.7507255723960012, + "grad_norm": 0.8801199783379451, + "learning_rate": 8.022291179415747e-07, + "loss": 0.1158, + "step": 8148 + }, + { + "epoch": 0.7508177085732713, + "grad_norm": 0.9106903812355431, + "learning_rate": 8.016700191466431e-07, + "loss": 0.1265, + "step": 8149 + }, + { + "epoch": 0.7509098447505413, + "grad_norm": 0.9141324302012125, + "learning_rate": 8.011110780418335e-07, + "loss": 0.1172, + "step": 8150 + }, + { + "epoch": 0.7510019809278113, + "grad_norm": 0.9587514921406114, + "learning_rate": 8.005522946790426e-07, + "loss": 0.1294, + "step": 8151 + }, + { + "epoch": 0.7510941171050813, + "grad_norm": 0.9030493525414592, + "learning_rate": 7.999936691101545e-07, + "loss": 0.1256, + "step": 8152 + }, + { + "epoch": 0.7511862532823513, + "grad_norm": 0.9972586634465126, + "learning_rate": 7.994352013870366e-07, + "loss": 0.1276, + "step": 8153 + }, + { + "epoch": 0.7512783894596213, + "grad_norm": 0.8537576474310897, + "learning_rate": 7.988768915615441e-07, + "loss": 0.1093, + "step": 8154 + }, + { + "epoch": 0.7513705256368913, + "grad_norm": 0.9041807136029697, + "learning_rate": 7.983187396855144e-07, + "loss": 0.1131, + "step": 8155 + }, + { + "epoch": 0.7514626618141613, + "grad_norm": 0.8912359097150397, + "learning_rate": 7.977607458107731e-07, + "loss": 0.1218, + "step": 8156 + }, + { + "epoch": 0.7515547979914313, + "grad_norm": 0.921588521542897, + "learning_rate": 7.972029099891293e-07, + "loss": 0.1189, + "step": 8157 + }, + { + "epoch": 0.7516469341687013, + "grad_norm": 0.931955611648895, + "learning_rate": 7.966452322723806e-07, + "loss": 0.1142, + "step": 8158 + }, + { + "epoch": 0.7517390703459713, + "grad_norm": 0.9486463061139534, + "learning_rate": 7.960877127123038e-07, + "loss": 0.1199, + "step": 8159 + }, + { + "epoch": 0.7518312065232413, + "grad_norm": 0.930525832367596, + "learning_rate": 7.955303513606657e-07, + "loss": 0.1117, + "step": 8160 + }, + { + "epoch": 0.7519233427005113, + "grad_norm": 0.9429285315072875, + "learning_rate": 7.949731482692185e-07, + "loss": 0.1276, + "step": 8161 + }, + { + "epoch": 0.7520154788777813, + "grad_norm": 0.9014224398876224, + "learning_rate": 7.944161034896986e-07, + "loss": 0.1145, + "step": 8162 + }, + { + "epoch": 0.7521076150550514, + "grad_norm": 0.9373335528043448, + "learning_rate": 7.93859217073826e-07, + "loss": 0.1261, + "step": 8163 + }, + { + "epoch": 0.7521997512323214, + "grad_norm": 0.9031086836900423, + "learning_rate": 7.933024890733099e-07, + "loss": 0.1186, + "step": 8164 + }, + { + "epoch": 0.7522918874095914, + "grad_norm": 0.9747283781206498, + "learning_rate": 7.927459195398404e-07, + "loss": 0.1238, + "step": 8165 + }, + { + "epoch": 0.7523840235868614, + "grad_norm": 1.0216403307734223, + "learning_rate": 7.921895085250967e-07, + "loss": 0.1281, + "step": 8166 + }, + { + "epoch": 0.7524761597641314, + "grad_norm": 0.9079435690483035, + "learning_rate": 7.916332560807402e-07, + "loss": 0.1115, + "step": 8167 + }, + { + "epoch": 0.7525682959414014, + "grad_norm": 0.9732939639285801, + "learning_rate": 7.910771622584199e-07, + "loss": 0.1172, + "step": 8168 + }, + { + "epoch": 0.7526604321186714, + "grad_norm": 0.9380322737134049, + "learning_rate": 7.905212271097696e-07, + "loss": 0.1144, + "step": 8169 + }, + { + "epoch": 0.7527525682959414, + "grad_norm": 0.902154398830762, + "learning_rate": 7.899654506864074e-07, + "loss": 0.1203, + "step": 8170 + }, + { + "epoch": 0.7528447044732114, + "grad_norm": 0.9161360802401297, + "learning_rate": 7.894098330399363e-07, + "loss": 0.1143, + "step": 8171 + }, + { + "epoch": 0.7529368406504814, + "grad_norm": 0.9832148574859536, + "learning_rate": 7.888543742219462e-07, + "loss": 0.1297, + "step": 8172 + }, + { + "epoch": 0.7530289768277514, + "grad_norm": 0.9578861811755155, + "learning_rate": 7.882990742840119e-07, + "loss": 0.1205, + "step": 8173 + }, + { + "epoch": 0.7531211130050214, + "grad_norm": 0.9560100009959844, + "learning_rate": 7.877439332776934e-07, + "loss": 0.1246, + "step": 8174 + }, + { + "epoch": 0.7532132491822914, + "grad_norm": 0.8931628999965731, + "learning_rate": 7.87188951254535e-07, + "loss": 0.1078, + "step": 8175 + }, + { + "epoch": 0.7533053853595615, + "grad_norm": 0.9894713605728191, + "learning_rate": 7.866341282660661e-07, + "loss": 0.1246, + "step": 8176 + }, + { + "epoch": 0.7533975215368315, + "grad_norm": 0.940458859644715, + "learning_rate": 7.860794643638026e-07, + "loss": 0.1194, + "step": 8177 + }, + { + "epoch": 0.7534896577141015, + "grad_norm": 0.919068557096056, + "learning_rate": 7.855249595992454e-07, + "loss": 0.1274, + "step": 8178 + }, + { + "epoch": 0.7535817938913715, + "grad_norm": 0.8730537807997693, + "learning_rate": 7.849706140238808e-07, + "loss": 0.1141, + "step": 8179 + }, + { + "epoch": 0.7536739300686415, + "grad_norm": 0.8816936498803757, + "learning_rate": 7.844164276891794e-07, + "loss": 0.1171, + "step": 8180 + }, + { + "epoch": 0.7537660662459115, + "grad_norm": 0.9362278283930622, + "learning_rate": 7.838624006465967e-07, + "loss": 0.124, + "step": 8181 + }, + { + "epoch": 0.7538582024231815, + "grad_norm": 0.9231753539436256, + "learning_rate": 7.833085329475748e-07, + "loss": 0.1174, + "step": 8182 + }, + { + "epoch": 0.7539503386004515, + "grad_norm": 0.9753567791774588, + "learning_rate": 7.82754824643541e-07, + "loss": 0.1298, + "step": 8183 + }, + { + "epoch": 0.7540424747777215, + "grad_norm": 0.934180096557911, + "learning_rate": 7.822012757859057e-07, + "loss": 0.1291, + "step": 8184 + }, + { + "epoch": 0.7541346109549915, + "grad_norm": 0.9257184287763613, + "learning_rate": 7.816478864260677e-07, + "loss": 0.1147, + "step": 8185 + }, + { + "epoch": 0.7542267471322615, + "grad_norm": 0.9686431756661043, + "learning_rate": 7.810946566154076e-07, + "loss": 0.1255, + "step": 8186 + }, + { + "epoch": 0.7543188833095315, + "grad_norm": 0.9168752484252765, + "learning_rate": 7.805415864052942e-07, + "loss": 0.1261, + "step": 8187 + }, + { + "epoch": 0.7544110194868014, + "grad_norm": 0.9408848273284718, + "learning_rate": 7.79988675847079e-07, + "loss": 0.123, + "step": 8188 + }, + { + "epoch": 0.7545031556640714, + "grad_norm": 0.9802139431624383, + "learning_rate": 7.794359249921004e-07, + "loss": 0.1225, + "step": 8189 + }, + { + "epoch": 0.7545952918413416, + "grad_norm": 0.888342210491898, + "learning_rate": 7.78883333891682e-07, + "loss": 0.1172, + "step": 8190 + }, + { + "epoch": 0.7546874280186116, + "grad_norm": 0.9059491622012312, + "learning_rate": 7.783309025971314e-07, + "loss": 0.1158, + "step": 8191 + }, + { + "epoch": 0.7547795641958815, + "grad_norm": 0.9274618100083608, + "learning_rate": 7.777786311597408e-07, + "loss": 0.1343, + "step": 8192 + }, + { + "epoch": 0.7548717003731515, + "grad_norm": 0.9096886113886077, + "learning_rate": 7.772265196307896e-07, + "loss": 0.1216, + "step": 8193 + }, + { + "epoch": 0.7549638365504215, + "grad_norm": 0.9364608277777419, + "learning_rate": 7.766745680615417e-07, + "loss": 0.1193, + "step": 8194 + }, + { + "epoch": 0.7550559727276915, + "grad_norm": 0.8856154937055789, + "learning_rate": 7.761227765032464e-07, + "loss": 0.1125, + "step": 8195 + }, + { + "epoch": 0.7551481089049615, + "grad_norm": 0.9611275417568343, + "learning_rate": 7.755711450071365e-07, + "loss": 0.1198, + "step": 8196 + }, + { + "epoch": 0.7552402450822315, + "grad_norm": 0.9812449711132765, + "learning_rate": 7.750196736244309e-07, + "loss": 0.1275, + "step": 8197 + }, + { + "epoch": 0.7553323812595015, + "grad_norm": 0.963089860945346, + "learning_rate": 7.744683624063343e-07, + "loss": 0.1175, + "step": 8198 + }, + { + "epoch": 0.7554245174367715, + "grad_norm": 0.9683624391402031, + "learning_rate": 7.739172114040366e-07, + "loss": 0.1212, + "step": 8199 + }, + { + "epoch": 0.7555166536140415, + "grad_norm": 0.9774515231953995, + "learning_rate": 7.733662206687106e-07, + "loss": 0.1191, + "step": 8200 + }, + { + "epoch": 0.7556087897913115, + "grad_norm": 0.9521027886273846, + "learning_rate": 7.728153902515181e-07, + "loss": 0.13, + "step": 8201 + }, + { + "epoch": 0.7557009259685815, + "grad_norm": 1.0001992095848415, + "learning_rate": 7.722647202036012e-07, + "loss": 0.1409, + "step": 8202 + }, + { + "epoch": 0.7557930621458516, + "grad_norm": 0.9519675971196492, + "learning_rate": 7.717142105760922e-07, + "loss": 0.1164, + "step": 8203 + }, + { + "epoch": 0.7558851983231216, + "grad_norm": 0.9638373797322568, + "learning_rate": 7.711638614201037e-07, + "loss": 0.132, + "step": 8204 + }, + { + "epoch": 0.7559773345003916, + "grad_norm": 0.9227872124153469, + "learning_rate": 7.706136727867366e-07, + "loss": 0.1185, + "step": 8205 + }, + { + "epoch": 0.7560694706776616, + "grad_norm": 0.9195582592993863, + "learning_rate": 7.700636447270773e-07, + "loss": 0.1142, + "step": 8206 + }, + { + "epoch": 0.7561616068549316, + "grad_norm": 0.9565199463656906, + "learning_rate": 7.695137772921938e-07, + "loss": 0.125, + "step": 8207 + }, + { + "epoch": 0.7562537430322016, + "grad_norm": 0.9254905477252218, + "learning_rate": 7.68964070533143e-07, + "loss": 0.1211, + "step": 8208 + }, + { + "epoch": 0.7563458792094716, + "grad_norm": 1.000115280608466, + "learning_rate": 7.684145245009639e-07, + "loss": 0.1332, + "step": 8209 + }, + { + "epoch": 0.7564380153867416, + "grad_norm": 0.9401494439330383, + "learning_rate": 7.678651392466824e-07, + "loss": 0.1177, + "step": 8210 + }, + { + "epoch": 0.7565301515640116, + "grad_norm": 0.9091567374279103, + "learning_rate": 7.6731591482131e-07, + "loss": 0.1197, + "step": 8211 + }, + { + "epoch": 0.7566222877412816, + "grad_norm": 0.9063596577785816, + "learning_rate": 7.667668512758414e-07, + "loss": 0.1129, + "step": 8212 + }, + { + "epoch": 0.7567144239185516, + "grad_norm": 0.9020032849310663, + "learning_rate": 7.662179486612561e-07, + "loss": 0.1158, + "step": 8213 + }, + { + "epoch": 0.7568065600958216, + "grad_norm": 0.9373934933001632, + "learning_rate": 7.656692070285212e-07, + "loss": 0.127, + "step": 8214 + }, + { + "epoch": 0.7568986962730916, + "grad_norm": 0.9687990581181442, + "learning_rate": 7.651206264285871e-07, + "loss": 0.1314, + "step": 8215 + }, + { + "epoch": 0.7569908324503616, + "grad_norm": 0.9003439445100029, + "learning_rate": 7.645722069123904e-07, + "loss": 0.118, + "step": 8216 + }, + { + "epoch": 0.7570829686276317, + "grad_norm": 0.9023461782986097, + "learning_rate": 7.640239485308506e-07, + "loss": 0.1108, + "step": 8217 + }, + { + "epoch": 0.7571751048049017, + "grad_norm": 0.9496536924462029, + "learning_rate": 7.634758513348737e-07, + "loss": 0.1242, + "step": 8218 + }, + { + "epoch": 0.7572672409821717, + "grad_norm": 0.9003447933629831, + "learning_rate": 7.629279153753508e-07, + "loss": 0.1134, + "step": 8219 + }, + { + "epoch": 0.7573593771594417, + "grad_norm": 0.9717119656589931, + "learning_rate": 7.623801407031586e-07, + "loss": 0.1257, + "step": 8220 + }, + { + "epoch": 0.7574515133367117, + "grad_norm": 0.9319051148074415, + "learning_rate": 7.618325273691565e-07, + "loss": 0.1285, + "step": 8221 + }, + { + "epoch": 0.7575436495139817, + "grad_norm": 0.898820556761226, + "learning_rate": 7.612850754241921e-07, + "loss": 0.1191, + "step": 8222 + }, + { + "epoch": 0.7576357856912517, + "grad_norm": 0.9545287362385483, + "learning_rate": 7.607377849190947e-07, + "loss": 0.129, + "step": 8223 + }, + { + "epoch": 0.7577279218685217, + "grad_norm": 0.9509213799133734, + "learning_rate": 7.601906559046824e-07, + "loss": 0.1252, + "step": 8224 + }, + { + "epoch": 0.7578200580457917, + "grad_norm": 0.99532066898558, + "learning_rate": 7.596436884317537e-07, + "loss": 0.1418, + "step": 8225 + }, + { + "epoch": 0.7579121942230617, + "grad_norm": 0.9568231647521277, + "learning_rate": 7.590968825510958e-07, + "loss": 0.1316, + "step": 8226 + }, + { + "epoch": 0.7580043304003317, + "grad_norm": 0.9676381668975514, + "learning_rate": 7.585502383134807e-07, + "loss": 0.1246, + "step": 8227 + }, + { + "epoch": 0.7580964665776017, + "grad_norm": 0.9686951983539297, + "learning_rate": 7.580037557696634e-07, + "loss": 0.1281, + "step": 8228 + }, + { + "epoch": 0.7581886027548717, + "grad_norm": 0.9575475261171695, + "learning_rate": 7.574574349703839e-07, + "loss": 0.1277, + "step": 8229 + }, + { + "epoch": 0.7582807389321417, + "grad_norm": 0.966501487465626, + "learning_rate": 7.569112759663693e-07, + "loss": 0.1169, + "step": 8230 + }, + { + "epoch": 0.7583728751094118, + "grad_norm": 0.9650356114616608, + "learning_rate": 7.5636527880833e-07, + "loss": 0.1259, + "step": 8231 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 0.9896181093613594, + "learning_rate": 7.558194435469634e-07, + "loss": 0.1325, + "step": 8232 + }, + { + "epoch": 0.7585571474639518, + "grad_norm": 0.9215878238578664, + "learning_rate": 7.55273770232949e-07, + "loss": 0.1194, + "step": 8233 + }, + { + "epoch": 0.7586492836412217, + "grad_norm": 0.8898782537530946, + "learning_rate": 7.547282589169519e-07, + "loss": 0.1212, + "step": 8234 + }, + { + "epoch": 0.7587414198184917, + "grad_norm": 0.961252642668289, + "learning_rate": 7.541829096496239e-07, + "loss": 0.1148, + "step": 8235 + }, + { + "epoch": 0.7588335559957617, + "grad_norm": 0.978911061379541, + "learning_rate": 7.536377224816008e-07, + "loss": 0.1273, + "step": 8236 + }, + { + "epoch": 0.7589256921730317, + "grad_norm": 0.9326585979443505, + "learning_rate": 7.530926974635036e-07, + "loss": 0.1308, + "step": 8237 + }, + { + "epoch": 0.7590178283503017, + "grad_norm": 0.999088474363938, + "learning_rate": 7.525478346459369e-07, + "loss": 0.1287, + "step": 8238 + }, + { + "epoch": 0.7591099645275717, + "grad_norm": 0.9697618917331947, + "learning_rate": 7.520031340794926e-07, + "loss": 0.131, + "step": 8239 + }, + { + "epoch": 0.7592021007048417, + "grad_norm": 0.9103230425606077, + "learning_rate": 7.514585958147444e-07, + "loss": 0.1231, + "step": 8240 + }, + { + "epoch": 0.7592942368821117, + "grad_norm": 0.9963197904910523, + "learning_rate": 7.509142199022545e-07, + "loss": 0.1251, + "step": 8241 + }, + { + "epoch": 0.7593863730593817, + "grad_norm": 0.9044203026817673, + "learning_rate": 7.50370006392567e-07, + "loss": 0.1167, + "step": 8242 + }, + { + "epoch": 0.7594785092366517, + "grad_norm": 0.9102232962101179, + "learning_rate": 7.498259553362128e-07, + "loss": 0.1197, + "step": 8243 + }, + { + "epoch": 0.7595706454139218, + "grad_norm": 0.9331151108211787, + "learning_rate": 7.492820667837075e-07, + "loss": 0.1209, + "step": 8244 + }, + { + "epoch": 0.7596627815911918, + "grad_norm": 0.9581217043945808, + "learning_rate": 7.487383407855508e-07, + "loss": 0.1313, + "step": 8245 + }, + { + "epoch": 0.7597549177684618, + "grad_norm": 0.859473471190459, + "learning_rate": 7.481947773922269e-07, + "loss": 0.1056, + "step": 8246 + }, + { + "epoch": 0.7598470539457318, + "grad_norm": 0.906385166241462, + "learning_rate": 7.476513766542065e-07, + "loss": 0.1188, + "step": 8247 + }, + { + "epoch": 0.7599391901230018, + "grad_norm": 0.9363292229597777, + "learning_rate": 7.471081386219442e-07, + "loss": 0.1308, + "step": 8248 + }, + { + "epoch": 0.7600313263002718, + "grad_norm": 0.9867804550389615, + "learning_rate": 7.465650633458807e-07, + "loss": 0.139, + "step": 8249 + }, + { + "epoch": 0.7601234624775418, + "grad_norm": 0.9925921139578151, + "learning_rate": 7.460221508764398e-07, + "loss": 0.124, + "step": 8250 + }, + { + "epoch": 0.7602155986548118, + "grad_norm": 0.9089636015164811, + "learning_rate": 7.454794012640301e-07, + "loss": 0.1086, + "step": 8251 + }, + { + "epoch": 0.7603077348320818, + "grad_norm": 0.8845980297368551, + "learning_rate": 7.449368145590469e-07, + "loss": 0.1212, + "step": 8252 + }, + { + "epoch": 0.7603998710093518, + "grad_norm": 0.9422456456596634, + "learning_rate": 7.443943908118703e-07, + "loss": 0.1256, + "step": 8253 + }, + { + "epoch": 0.7604920071866218, + "grad_norm": 0.9566109791919529, + "learning_rate": 7.438521300728624e-07, + "loss": 0.1354, + "step": 8254 + }, + { + "epoch": 0.7605841433638918, + "grad_norm": 0.9351359065136909, + "learning_rate": 7.433100323923742e-07, + "loss": 0.1264, + "step": 8255 + }, + { + "epoch": 0.7606762795411618, + "grad_norm": 0.9693764918733778, + "learning_rate": 7.427680978207378e-07, + "loss": 0.1195, + "step": 8256 + }, + { + "epoch": 0.7607684157184318, + "grad_norm": 0.903810912945681, + "learning_rate": 7.422263264082732e-07, + "loss": 0.1026, + "step": 8257 + }, + { + "epoch": 0.7608605518957019, + "grad_norm": 0.9003919306304144, + "learning_rate": 7.416847182052825e-07, + "loss": 0.1048, + "step": 8258 + }, + { + "epoch": 0.7609526880729719, + "grad_norm": 0.9647571730317374, + "learning_rate": 7.411432732620552e-07, + "loss": 0.121, + "step": 8259 + }, + { + "epoch": 0.7610448242502419, + "grad_norm": 0.9315295549134404, + "learning_rate": 7.406019916288651e-07, + "loss": 0.1267, + "step": 8260 + }, + { + "epoch": 0.7611369604275119, + "grad_norm": 0.8841359303961115, + "learning_rate": 7.400608733559692e-07, + "loss": 0.1151, + "step": 8261 + }, + { + "epoch": 0.7612290966047819, + "grad_norm": 0.9333795268076007, + "learning_rate": 7.395199184936099e-07, + "loss": 0.1235, + "step": 8262 + }, + { + "epoch": 0.7613212327820519, + "grad_norm": 1.0032582232476528, + "learning_rate": 7.389791270920158e-07, + "loss": 0.1182, + "step": 8263 + }, + { + "epoch": 0.7614133689593219, + "grad_norm": 1.008498522600006, + "learning_rate": 7.38438499201399e-07, + "loss": 0.1416, + "step": 8264 + }, + { + "epoch": 0.7615055051365919, + "grad_norm": 0.9695756514176418, + "learning_rate": 7.378980348719581e-07, + "loss": 0.1188, + "step": 8265 + }, + { + "epoch": 0.7615976413138619, + "grad_norm": 0.9189796715543543, + "learning_rate": 7.373577341538742e-07, + "loss": 0.1153, + "step": 8266 + }, + { + "epoch": 0.7616897774911319, + "grad_norm": 0.9377850605107226, + "learning_rate": 7.368175970973138e-07, + "loss": 0.11, + "step": 8267 + }, + { + "epoch": 0.7617819136684019, + "grad_norm": 0.9201398168024537, + "learning_rate": 7.362776237524291e-07, + "loss": 0.1197, + "step": 8268 + }, + { + "epoch": 0.7618740498456719, + "grad_norm": 1.072128024202621, + "learning_rate": 7.357378141693569e-07, + "loss": 0.1197, + "step": 8269 + }, + { + "epoch": 0.7619661860229419, + "grad_norm": 0.9460766863322817, + "learning_rate": 7.351981683982193e-07, + "loss": 0.1239, + "step": 8270 + }, + { + "epoch": 0.762058322200212, + "grad_norm": 0.9692369089181001, + "learning_rate": 7.346586864891217e-07, + "loss": 0.127, + "step": 8271 + }, + { + "epoch": 0.762150458377482, + "grad_norm": 0.8880843044237137, + "learning_rate": 7.341193684921541e-07, + "loss": 0.1118, + "step": 8272 + }, + { + "epoch": 0.762242594554752, + "grad_norm": 0.926515471093357, + "learning_rate": 7.335802144573933e-07, + "loss": 0.1177, + "step": 8273 + }, + { + "epoch": 0.762334730732022, + "grad_norm": 0.9482733992199582, + "learning_rate": 7.330412244349005e-07, + "loss": 0.1163, + "step": 8274 + }, + { + "epoch": 0.762426866909292, + "grad_norm": 0.9582116484025304, + "learning_rate": 7.325023984747195e-07, + "loss": 0.1334, + "step": 8275 + }, + { + "epoch": 0.762519003086562, + "grad_norm": 0.9336639018459388, + "learning_rate": 7.319637366268817e-07, + "loss": 0.1225, + "step": 8276 + }, + { + "epoch": 0.762611139263832, + "grad_norm": 0.9118007724591288, + "learning_rate": 7.314252389414003e-07, + "loss": 0.1131, + "step": 8277 + }, + { + "epoch": 0.762703275441102, + "grad_norm": 0.9695491078378007, + "learning_rate": 7.308869054682769e-07, + "loss": 0.1247, + "step": 8278 + }, + { + "epoch": 0.7627954116183719, + "grad_norm": 0.9668242101632564, + "learning_rate": 7.303487362574938e-07, + "loss": 0.1261, + "step": 8279 + }, + { + "epoch": 0.7628875477956419, + "grad_norm": 1.0067394909831462, + "learning_rate": 7.298107313590208e-07, + "loss": 0.1337, + "step": 8280 + }, + { + "epoch": 0.7629796839729119, + "grad_norm": 0.9437521576960384, + "learning_rate": 7.292728908228127e-07, + "loss": 0.1247, + "step": 8281 + }, + { + "epoch": 0.7630718201501819, + "grad_norm": 0.945932213840081, + "learning_rate": 7.287352146988075e-07, + "loss": 0.1222, + "step": 8282 + }, + { + "epoch": 0.7631639563274519, + "grad_norm": 0.8918765975327134, + "learning_rate": 7.281977030369275e-07, + "loss": 0.1031, + "step": 8283 + }, + { + "epoch": 0.7632560925047219, + "grad_norm": 0.9158648763429635, + "learning_rate": 7.276603558870812e-07, + "loss": 0.1208, + "step": 8284 + }, + { + "epoch": 0.763348228681992, + "grad_norm": 0.9292601320925606, + "learning_rate": 7.271231732991619e-07, + "loss": 0.1164, + "step": 8285 + }, + { + "epoch": 0.763440364859262, + "grad_norm": 0.8994955400839384, + "learning_rate": 7.265861553230472e-07, + "loss": 0.1222, + "step": 8286 + }, + { + "epoch": 0.763532501036532, + "grad_norm": 0.951619859318042, + "learning_rate": 7.26049302008599e-07, + "loss": 0.1282, + "step": 8287 + }, + { + "epoch": 0.763624637213802, + "grad_norm": 0.988325789476459, + "learning_rate": 7.255126134056631e-07, + "loss": 0.1399, + "step": 8288 + }, + { + "epoch": 0.763716773391072, + "grad_norm": 1.055346800786715, + "learning_rate": 7.249760895640723e-07, + "loss": 0.1243, + "step": 8289 + }, + { + "epoch": 0.763808909568342, + "grad_norm": 0.9044247369547711, + "learning_rate": 7.244397305336423e-07, + "loss": 0.1155, + "step": 8290 + }, + { + "epoch": 0.763901045745612, + "grad_norm": 0.9472862151962139, + "learning_rate": 7.239035363641752e-07, + "loss": 0.1219, + "step": 8291 + }, + { + "epoch": 0.763993181922882, + "grad_norm": 0.9502049407566469, + "learning_rate": 7.233675071054564e-07, + "loss": 0.1287, + "step": 8292 + }, + { + "epoch": 0.764085318100152, + "grad_norm": 0.9285316003020824, + "learning_rate": 7.228316428072546e-07, + "loss": 0.1236, + "step": 8293 + }, + { + "epoch": 0.764177454277422, + "grad_norm": 0.9390689107258569, + "learning_rate": 7.222959435193258e-07, + "loss": 0.1158, + "step": 8294 + }, + { + "epoch": 0.764269590454692, + "grad_norm": 0.95789176362226, + "learning_rate": 7.21760409291411e-07, + "loss": 0.1343, + "step": 8295 + }, + { + "epoch": 0.764361726631962, + "grad_norm": 0.9941924628392527, + "learning_rate": 7.212250401732329e-07, + "loss": 0.127, + "step": 8296 + }, + { + "epoch": 0.764453862809232, + "grad_norm": 0.9252169197213751, + "learning_rate": 7.206898362145021e-07, + "loss": 0.1196, + "step": 8297 + }, + { + "epoch": 0.764545998986502, + "grad_norm": 0.8762698700489192, + "learning_rate": 7.201547974649104e-07, + "loss": 0.1122, + "step": 8298 + }, + { + "epoch": 0.7646381351637721, + "grad_norm": 0.9739201895395896, + "learning_rate": 7.196199239741383e-07, + "loss": 0.1236, + "step": 8299 + }, + { + "epoch": 0.7647302713410421, + "grad_norm": 0.9127234805633828, + "learning_rate": 7.190852157918468e-07, + "loss": 0.1167, + "step": 8300 + }, + { + "epoch": 0.7648224075183121, + "grad_norm": 0.9358931953978479, + "learning_rate": 7.185506729676849e-07, + "loss": 0.1136, + "step": 8301 + }, + { + "epoch": 0.7649145436955821, + "grad_norm": 0.9726783433144469, + "learning_rate": 7.180162955512856e-07, + "loss": 0.1223, + "step": 8302 + }, + { + "epoch": 0.7650066798728521, + "grad_norm": 0.9370684221290889, + "learning_rate": 7.174820835922649e-07, + "loss": 0.1136, + "step": 8303 + }, + { + "epoch": 0.7650988160501221, + "grad_norm": 0.8790475206195711, + "learning_rate": 7.16948037140224e-07, + "loss": 0.1164, + "step": 8304 + }, + { + "epoch": 0.7651909522273921, + "grad_norm": 1.0491204399969105, + "learning_rate": 7.164141562447497e-07, + "loss": 0.1411, + "step": 8305 + }, + { + "epoch": 0.7652830884046621, + "grad_norm": 0.9571333133358144, + "learning_rate": 7.158804409554126e-07, + "loss": 0.1118, + "step": 8306 + }, + { + "epoch": 0.7653752245819321, + "grad_norm": 0.964770395513596, + "learning_rate": 7.153468913217695e-07, + "loss": 0.1277, + "step": 8307 + }, + { + "epoch": 0.7654673607592021, + "grad_norm": 0.9070299902224167, + "learning_rate": 7.148135073933599e-07, + "loss": 0.1164, + "step": 8308 + }, + { + "epoch": 0.7655594969364721, + "grad_norm": 0.941404523811673, + "learning_rate": 7.142802892197071e-07, + "loss": 0.1253, + "step": 8309 + }, + { + "epoch": 0.7656516331137421, + "grad_norm": 0.9066252176601356, + "learning_rate": 7.137472368503217e-07, + "loss": 0.1269, + "step": 8310 + }, + { + "epoch": 0.7657437692910121, + "grad_norm": 0.9096259039649669, + "learning_rate": 7.132143503346986e-07, + "loss": 0.1159, + "step": 8311 + }, + { + "epoch": 0.7658359054682822, + "grad_norm": 0.9385880965115956, + "learning_rate": 7.126816297223147e-07, + "loss": 0.1301, + "step": 8312 + }, + { + "epoch": 0.7659280416455522, + "grad_norm": 0.9567068424142435, + "learning_rate": 7.121490750626342e-07, + "loss": 0.1231, + "step": 8313 + }, + { + "epoch": 0.7660201778228222, + "grad_norm": 0.9173460135425626, + "learning_rate": 7.116166864051038e-07, + "loss": 0.1137, + "step": 8314 + }, + { + "epoch": 0.7661123140000922, + "grad_norm": 0.9294880243189472, + "learning_rate": 7.110844637991574e-07, + "loss": 0.1232, + "step": 8315 + }, + { + "epoch": 0.7662044501773622, + "grad_norm": 0.9152657770262788, + "learning_rate": 7.105524072942105e-07, + "loss": 0.1164, + "step": 8316 + }, + { + "epoch": 0.7662965863546322, + "grad_norm": 0.9396243490752815, + "learning_rate": 7.100205169396649e-07, + "loss": 0.1227, + "step": 8317 + }, + { + "epoch": 0.7663887225319022, + "grad_norm": 0.9206804869509456, + "learning_rate": 7.09488792784907e-07, + "loss": 0.1218, + "step": 8318 + }, + { + "epoch": 0.7664808587091722, + "grad_norm": 0.935610637811038, + "learning_rate": 7.089572348793081e-07, + "loss": 0.1202, + "step": 8319 + }, + { + "epoch": 0.7665729948864421, + "grad_norm": 0.9489679146040931, + "learning_rate": 7.084258432722227e-07, + "loss": 0.1177, + "step": 8320 + }, + { + "epoch": 0.7666651310637121, + "grad_norm": 0.9163551987157174, + "learning_rate": 7.078946180129898e-07, + "loss": 0.1164, + "step": 8321 + }, + { + "epoch": 0.7667572672409821, + "grad_norm": 0.8913692547879215, + "learning_rate": 7.073635591509345e-07, + "loss": 0.1064, + "step": 8322 + }, + { + "epoch": 0.7668494034182521, + "grad_norm": 0.9434948927728509, + "learning_rate": 7.068326667353659e-07, + "loss": 0.1205, + "step": 8323 + }, + { + "epoch": 0.7669415395955221, + "grad_norm": 0.9222390272621265, + "learning_rate": 7.063019408155777e-07, + "loss": 0.1213, + "step": 8324 + }, + { + "epoch": 0.7670336757727921, + "grad_norm": 0.9288456312089602, + "learning_rate": 7.057713814408473e-07, + "loss": 0.1217, + "step": 8325 + }, + { + "epoch": 0.7671258119500622, + "grad_norm": 0.9694355795017486, + "learning_rate": 7.052409886604364e-07, + "loss": 0.1292, + "step": 8326 + }, + { + "epoch": 0.7672179481273322, + "grad_norm": 1.0037949780961066, + "learning_rate": 7.04710762523593e-07, + "loss": 0.1229, + "step": 8327 + }, + { + "epoch": 0.7673100843046022, + "grad_norm": 1.0245295525799747, + "learning_rate": 7.041807030795495e-07, + "loss": 0.1302, + "step": 8328 + }, + { + "epoch": 0.7674022204818722, + "grad_norm": 0.9915072745998507, + "learning_rate": 7.036508103775199e-07, + "loss": 0.1288, + "step": 8329 + }, + { + "epoch": 0.7674943566591422, + "grad_norm": 0.9209649117333428, + "learning_rate": 7.031210844667066e-07, + "loss": 0.1177, + "step": 8330 + }, + { + "epoch": 0.7675864928364122, + "grad_norm": 0.9777283812460724, + "learning_rate": 7.025915253962934e-07, + "loss": 0.1194, + "step": 8331 + }, + { + "epoch": 0.7676786290136822, + "grad_norm": 0.9028823303836367, + "learning_rate": 7.020621332154512e-07, + "loss": 0.123, + "step": 8332 + }, + { + "epoch": 0.7677707651909522, + "grad_norm": 0.8798941197971895, + "learning_rate": 7.015329079733327e-07, + "loss": 0.104, + "step": 8333 + }, + { + "epoch": 0.7678629013682222, + "grad_norm": 0.9322675197448133, + "learning_rate": 7.010038497190774e-07, + "loss": 0.1159, + "step": 8334 + }, + { + "epoch": 0.7679550375454922, + "grad_norm": 1.0339398169724536, + "learning_rate": 7.004749585018089e-07, + "loss": 0.1274, + "step": 8335 + }, + { + "epoch": 0.7680471737227622, + "grad_norm": 0.9907312337014706, + "learning_rate": 6.999462343706339e-07, + "loss": 0.1261, + "step": 8336 + }, + { + "epoch": 0.7681393099000322, + "grad_norm": 0.9265113005572841, + "learning_rate": 6.994176773746445e-07, + "loss": 0.1127, + "step": 8337 + }, + { + "epoch": 0.7682314460773022, + "grad_norm": 0.8763629187072877, + "learning_rate": 6.988892875629172e-07, + "loss": 0.1123, + "step": 8338 + }, + { + "epoch": 0.7683235822545723, + "grad_norm": 0.9473048227249817, + "learning_rate": 6.983610649845136e-07, + "loss": 0.1266, + "step": 8339 + }, + { + "epoch": 0.7684157184318423, + "grad_norm": 0.9376358861988066, + "learning_rate": 6.978330096884794e-07, + "loss": 0.118, + "step": 8340 + }, + { + "epoch": 0.7685078546091123, + "grad_norm": 0.9698111754881962, + "learning_rate": 6.973051217238444e-07, + "loss": 0.1259, + "step": 8341 + }, + { + "epoch": 0.7685999907863823, + "grad_norm": 0.912876262065055, + "learning_rate": 6.967774011396222e-07, + "loss": 0.1077, + "step": 8342 + }, + { + "epoch": 0.7686921269636523, + "grad_norm": 0.9663238447952306, + "learning_rate": 6.962498479848124e-07, + "loss": 0.1203, + "step": 8343 + }, + { + "epoch": 0.7687842631409223, + "grad_norm": 0.8994434460789928, + "learning_rate": 6.957224623083989e-07, + "loss": 0.1223, + "step": 8344 + }, + { + "epoch": 0.7688763993181923, + "grad_norm": 0.8717941859314399, + "learning_rate": 6.951952441593482e-07, + "loss": 0.1125, + "step": 8345 + }, + { + "epoch": 0.7689685354954623, + "grad_norm": 0.8812348398963182, + "learning_rate": 6.946681935866143e-07, + "loss": 0.1082, + "step": 8346 + }, + { + "epoch": 0.7690606716727323, + "grad_norm": 1.0019169701459543, + "learning_rate": 6.941413106391321e-07, + "loss": 0.128, + "step": 8347 + }, + { + "epoch": 0.7691528078500023, + "grad_norm": 0.9169717950582553, + "learning_rate": 6.936145953658233e-07, + "loss": 0.1194, + "step": 8348 + }, + { + "epoch": 0.7692449440272723, + "grad_norm": 0.9331417687311467, + "learning_rate": 6.930880478155946e-07, + "loss": 0.1198, + "step": 8349 + }, + { + "epoch": 0.7693370802045423, + "grad_norm": 0.8913529397204782, + "learning_rate": 6.925616680373346e-07, + "loss": 0.1165, + "step": 8350 + }, + { + "epoch": 0.7694292163818123, + "grad_norm": 0.889454311489956, + "learning_rate": 6.920354560799189e-07, + "loss": 0.1176, + "step": 8351 + }, + { + "epoch": 0.7695213525590823, + "grad_norm": 0.9482997599159537, + "learning_rate": 6.915094119922048e-07, + "loss": 0.1333, + "step": 8352 + }, + { + "epoch": 0.7696134887363524, + "grad_norm": 0.9522924476603231, + "learning_rate": 6.909835358230372e-07, + "loss": 0.1208, + "step": 8353 + }, + { + "epoch": 0.7697056249136224, + "grad_norm": 0.9205945084224799, + "learning_rate": 6.904578276212423e-07, + "loss": 0.1298, + "step": 8354 + }, + { + "epoch": 0.7697977610908924, + "grad_norm": 0.9486688534035198, + "learning_rate": 6.899322874356329e-07, + "loss": 0.1303, + "step": 8355 + }, + { + "epoch": 0.7698898972681624, + "grad_norm": 0.9265740419588309, + "learning_rate": 6.89406915315006e-07, + "loss": 0.1253, + "step": 8356 + }, + { + "epoch": 0.7699820334454324, + "grad_norm": 0.9274708447215646, + "learning_rate": 6.888817113081419e-07, + "loss": 0.1262, + "step": 8357 + }, + { + "epoch": 0.7700741696227024, + "grad_norm": 0.9727274919868594, + "learning_rate": 6.883566754638052e-07, + "loss": 0.1318, + "step": 8358 + }, + { + "epoch": 0.7701663057999724, + "grad_norm": 0.8822267025867091, + "learning_rate": 6.878318078307461e-07, + "loss": 0.1122, + "step": 8359 + }, + { + "epoch": 0.7702584419772424, + "grad_norm": 0.9559628727055969, + "learning_rate": 6.873071084576985e-07, + "loss": 0.1169, + "step": 8360 + }, + { + "epoch": 0.7703505781545124, + "grad_norm": 0.9606378148147342, + "learning_rate": 6.86782577393382e-07, + "loss": 0.1277, + "step": 8361 + }, + { + "epoch": 0.7704427143317824, + "grad_norm": 0.9531820149960578, + "learning_rate": 6.862582146864982e-07, + "loss": 0.1244, + "step": 8362 + }, + { + "epoch": 0.7705348505090523, + "grad_norm": 0.90167924888066, + "learning_rate": 6.857340203857335e-07, + "loss": 0.1117, + "step": 8363 + }, + { + "epoch": 0.7706269866863223, + "grad_norm": 0.947891508516917, + "learning_rate": 6.852099945397603e-07, + "loss": 0.1195, + "step": 8364 + }, + { + "epoch": 0.7707191228635923, + "grad_norm": 0.9378585554874798, + "learning_rate": 6.846861371972355e-07, + "loss": 0.1244, + "step": 8365 + }, + { + "epoch": 0.7708112590408623, + "grad_norm": 0.9705047499227291, + "learning_rate": 6.841624484067971e-07, + "loss": 0.1309, + "step": 8366 + }, + { + "epoch": 0.7709033952181324, + "grad_norm": 0.8928153670107007, + "learning_rate": 6.836389282170716e-07, + "loss": 0.1127, + "step": 8367 + }, + { + "epoch": 0.7709955313954024, + "grad_norm": 0.9662324933740548, + "learning_rate": 6.831155766766665e-07, + "loss": 0.1336, + "step": 8368 + }, + { + "epoch": 0.7710876675726724, + "grad_norm": 0.8815268225850014, + "learning_rate": 6.825923938341761e-07, + "loss": 0.108, + "step": 8369 + }, + { + "epoch": 0.7711798037499424, + "grad_norm": 0.9841686059201685, + "learning_rate": 6.820693797381769e-07, + "loss": 0.1247, + "step": 8370 + }, + { + "epoch": 0.7712719399272124, + "grad_norm": 0.9536490084886785, + "learning_rate": 6.815465344372316e-07, + "loss": 0.1251, + "step": 8371 + }, + { + "epoch": 0.7713640761044824, + "grad_norm": 0.9289453206959042, + "learning_rate": 6.81023857979887e-07, + "loss": 0.1124, + "step": 8372 + }, + { + "epoch": 0.7714562122817524, + "grad_norm": 0.9312998796681076, + "learning_rate": 6.805013504146729e-07, + "loss": 0.1187, + "step": 8373 + }, + { + "epoch": 0.7715483484590224, + "grad_norm": 0.9395698669875502, + "learning_rate": 6.799790117901034e-07, + "loss": 0.1231, + "step": 8374 + }, + { + "epoch": 0.7716404846362924, + "grad_norm": 0.9166696590685488, + "learning_rate": 6.794568421546785e-07, + "loss": 0.1228, + "step": 8375 + }, + { + "epoch": 0.7717326208135624, + "grad_norm": 0.9550481131258569, + "learning_rate": 6.78934841556882e-07, + "loss": 0.1279, + "step": 8376 + }, + { + "epoch": 0.7718247569908324, + "grad_norm": 0.913707817793788, + "learning_rate": 6.784130100451819e-07, + "loss": 0.1173, + "step": 8377 + }, + { + "epoch": 0.7719168931681024, + "grad_norm": 0.9996482206708629, + "learning_rate": 6.778913476680302e-07, + "loss": 0.1227, + "step": 8378 + }, + { + "epoch": 0.7720090293453724, + "grad_norm": 0.9288326450663615, + "learning_rate": 6.773698544738619e-07, + "loss": 0.1274, + "step": 8379 + }, + { + "epoch": 0.7721011655226425, + "grad_norm": 0.9094294673119427, + "learning_rate": 6.768485305110989e-07, + "loss": 0.1158, + "step": 8380 + }, + { + "epoch": 0.7721933016999125, + "grad_norm": 0.9455830954897821, + "learning_rate": 6.763273758281458e-07, + "loss": 0.1176, + "step": 8381 + }, + { + "epoch": 0.7722854378771825, + "grad_norm": 0.9650385136922474, + "learning_rate": 6.758063904733933e-07, + "loss": 0.1245, + "step": 8382 + }, + { + "epoch": 0.7723775740544525, + "grad_norm": 0.9541341797982005, + "learning_rate": 6.752855744952136e-07, + "loss": 0.1195, + "step": 8383 + }, + { + "epoch": 0.7724697102317225, + "grad_norm": 0.9791283036740815, + "learning_rate": 6.747649279419638e-07, + "loss": 0.1189, + "step": 8384 + }, + { + "epoch": 0.7725618464089925, + "grad_norm": 0.864555357722854, + "learning_rate": 6.742444508619869e-07, + "loss": 0.1061, + "step": 8385 + }, + { + "epoch": 0.7726539825862625, + "grad_norm": 0.9755164798582352, + "learning_rate": 6.737241433036101e-07, + "loss": 0.1258, + "step": 8386 + }, + { + "epoch": 0.7727461187635325, + "grad_norm": 0.9382934330514728, + "learning_rate": 6.732040053151423e-07, + "loss": 0.1226, + "step": 8387 + }, + { + "epoch": 0.7728382549408025, + "grad_norm": 0.9150629750841084, + "learning_rate": 6.7268403694488e-07, + "loss": 0.1173, + "step": 8388 + }, + { + "epoch": 0.7729303911180725, + "grad_norm": 0.9115809997858356, + "learning_rate": 6.721642382411006e-07, + "loss": 0.1146, + "step": 8389 + }, + { + "epoch": 0.7730225272953425, + "grad_norm": 0.9670908852073896, + "learning_rate": 6.716446092520696e-07, + "loss": 0.1306, + "step": 8390 + }, + { + "epoch": 0.7731146634726125, + "grad_norm": 0.9394456052321907, + "learning_rate": 6.711251500260322e-07, + "loss": 0.1175, + "step": 8391 + }, + { + "epoch": 0.7732067996498825, + "grad_norm": 1.0224125514598037, + "learning_rate": 6.706058606112217e-07, + "loss": 0.1264, + "step": 8392 + }, + { + "epoch": 0.7732989358271525, + "grad_norm": 0.9136807246539035, + "learning_rate": 6.700867410558537e-07, + "loss": 0.1133, + "step": 8393 + }, + { + "epoch": 0.7733910720044226, + "grad_norm": 0.9579652653107388, + "learning_rate": 6.695677914081303e-07, + "loss": 0.1211, + "step": 8394 + }, + { + "epoch": 0.7734832081816926, + "grad_norm": 0.8953201784587433, + "learning_rate": 6.690490117162333e-07, + "loss": 0.1177, + "step": 8395 + }, + { + "epoch": 0.7735753443589626, + "grad_norm": 0.8947390913684228, + "learning_rate": 6.68530402028332e-07, + "loss": 0.1175, + "step": 8396 + }, + { + "epoch": 0.7736674805362326, + "grad_norm": 0.930339743335267, + "learning_rate": 6.680119623925804e-07, + "loss": 0.1119, + "step": 8397 + }, + { + "epoch": 0.7737596167135026, + "grad_norm": 0.9691742182397383, + "learning_rate": 6.67493692857116e-07, + "loss": 0.1232, + "step": 8398 + }, + { + "epoch": 0.7738517528907726, + "grad_norm": 0.9782640805198125, + "learning_rate": 6.669755934700586e-07, + "loss": 0.1178, + "step": 8399 + }, + { + "epoch": 0.7739438890680426, + "grad_norm": 0.946658092447457, + "learning_rate": 6.664576642795153e-07, + "loss": 0.1211, + "step": 8400 + }, + { + "epoch": 0.7740360252453126, + "grad_norm": 0.9171463357835422, + "learning_rate": 6.659399053335747e-07, + "loss": 0.1158, + "step": 8401 + }, + { + "epoch": 0.7741281614225826, + "grad_norm": 0.9237405035504593, + "learning_rate": 6.654223166803117e-07, + "loss": 0.118, + "step": 8402 + }, + { + "epoch": 0.7742202975998526, + "grad_norm": 0.8654426344561063, + "learning_rate": 6.649048983677834e-07, + "loss": 0.1139, + "step": 8403 + }, + { + "epoch": 0.7743124337771226, + "grad_norm": 0.9287901506367029, + "learning_rate": 6.643876504440327e-07, + "loss": 0.1165, + "step": 8404 + }, + { + "epoch": 0.7744045699543926, + "grad_norm": 0.936571591755619, + "learning_rate": 6.638705729570871e-07, + "loss": 0.1256, + "step": 8405 + }, + { + "epoch": 0.7744967061316625, + "grad_norm": 0.9037966720069851, + "learning_rate": 6.633536659549558e-07, + "loss": 0.1117, + "step": 8406 + }, + { + "epoch": 0.7745888423089327, + "grad_norm": 0.930053506926587, + "learning_rate": 6.628369294856349e-07, + "loss": 0.1152, + "step": 8407 + }, + { + "epoch": 0.7746809784862027, + "grad_norm": 1.0105164231758588, + "learning_rate": 6.623203635971018e-07, + "loss": 0.1324, + "step": 8408 + }, + { + "epoch": 0.7747731146634727, + "grad_norm": 0.9280072407677732, + "learning_rate": 6.618039683373209e-07, + "loss": 0.1121, + "step": 8409 + }, + { + "epoch": 0.7748652508407426, + "grad_norm": 0.931988772652393, + "learning_rate": 6.612877437542403e-07, + "loss": 0.1175, + "step": 8410 + }, + { + "epoch": 0.7749573870180126, + "grad_norm": 0.9055453680389911, + "learning_rate": 6.607716898957903e-07, + "loss": 0.1134, + "step": 8411 + }, + { + "epoch": 0.7750495231952826, + "grad_norm": 0.9570405732563926, + "learning_rate": 6.602558068098864e-07, + "loss": 0.126, + "step": 8412 + }, + { + "epoch": 0.7751416593725526, + "grad_norm": 0.8901939988992377, + "learning_rate": 6.597400945444285e-07, + "loss": 0.1098, + "step": 8413 + }, + { + "epoch": 0.7752337955498226, + "grad_norm": 0.9457230744765637, + "learning_rate": 6.59224553147301e-07, + "loss": 0.1268, + "step": 8414 + }, + { + "epoch": 0.7753259317270926, + "grad_norm": 0.9090096740087552, + "learning_rate": 6.587091826663728e-07, + "loss": 0.1165, + "step": 8415 + }, + { + "epoch": 0.7754180679043626, + "grad_norm": 0.9268957635480991, + "learning_rate": 6.581939831494948e-07, + "loss": 0.1165, + "step": 8416 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 0.8867965628336227, + "learning_rate": 6.576789546445031e-07, + "loss": 0.1091, + "step": 8417 + }, + { + "epoch": 0.7756023402589026, + "grad_norm": 0.9892905172706016, + "learning_rate": 6.571640971992188e-07, + "loss": 0.1157, + "step": 8418 + }, + { + "epoch": 0.7756944764361726, + "grad_norm": 0.9380259789018441, + "learning_rate": 6.566494108614471e-07, + "loss": 0.1152, + "step": 8419 + }, + { + "epoch": 0.7757866126134426, + "grad_norm": 0.9206879628963276, + "learning_rate": 6.561348956789751e-07, + "loss": 0.1111, + "step": 8420 + }, + { + "epoch": 0.7758787487907127, + "grad_norm": 0.9614768658559342, + "learning_rate": 6.556205516995772e-07, + "loss": 0.1194, + "step": 8421 + }, + { + "epoch": 0.7759708849679827, + "grad_norm": 1.0010135251033254, + "learning_rate": 6.551063789710091e-07, + "loss": 0.1287, + "step": 8422 + }, + { + "epoch": 0.7760630211452527, + "grad_norm": 0.9381981477040692, + "learning_rate": 6.545923775410129e-07, + "loss": 0.1111, + "step": 8423 + }, + { + "epoch": 0.7761551573225227, + "grad_norm": 0.991357248153485, + "learning_rate": 6.540785474573121e-07, + "loss": 0.1266, + "step": 8424 + }, + { + "epoch": 0.7762472934997927, + "grad_norm": 0.9387943094150902, + "learning_rate": 6.535648887676171e-07, + "loss": 0.1191, + "step": 8425 + }, + { + "epoch": 0.7763394296770627, + "grad_norm": 0.9860413451407051, + "learning_rate": 6.530514015196218e-07, + "loss": 0.1304, + "step": 8426 + }, + { + "epoch": 0.7764315658543327, + "grad_norm": 0.985186213663649, + "learning_rate": 6.525380857610022e-07, + "loss": 0.1286, + "step": 8427 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 0.9875065104492615, + "learning_rate": 6.520249415394197e-07, + "loss": 0.1194, + "step": 8428 + }, + { + "epoch": 0.7766158382088727, + "grad_norm": 0.926501626127916, + "learning_rate": 6.515119689025201e-07, + "loss": 0.1156, + "step": 8429 + }, + { + "epoch": 0.7767079743861427, + "grad_norm": 1.0045405628938215, + "learning_rate": 6.509991678979333e-07, + "loss": 0.1312, + "step": 8430 + }, + { + "epoch": 0.7768001105634127, + "grad_norm": 0.9431212698157658, + "learning_rate": 6.504865385732734e-07, + "loss": 0.1197, + "step": 8431 + }, + { + "epoch": 0.7768922467406827, + "grad_norm": 0.9141056582506759, + "learning_rate": 6.499740809761373e-07, + "loss": 0.1198, + "step": 8432 + }, + { + "epoch": 0.7769843829179527, + "grad_norm": 0.9006244298850004, + "learning_rate": 6.494617951541063e-07, + "loss": 0.1112, + "step": 8433 + }, + { + "epoch": 0.7770765190952227, + "grad_norm": 1.0513186465071094, + "learning_rate": 6.489496811547468e-07, + "loss": 0.1287, + "step": 8434 + }, + { + "epoch": 0.7771686552724928, + "grad_norm": 0.9398083516416804, + "learning_rate": 6.484377390256086e-07, + "loss": 0.1223, + "step": 8435 + }, + { + "epoch": 0.7772607914497628, + "grad_norm": 0.8917848864473544, + "learning_rate": 6.479259688142261e-07, + "loss": 0.1034, + "step": 8436 + }, + { + "epoch": 0.7773529276270328, + "grad_norm": 0.9237163314143305, + "learning_rate": 6.474143705681171e-07, + "loss": 0.113, + "step": 8437 + }, + { + "epoch": 0.7774450638043028, + "grad_norm": 0.8793476209899715, + "learning_rate": 6.469029443347821e-07, + "loss": 0.1095, + "step": 8438 + }, + { + "epoch": 0.7775371999815728, + "grad_norm": 0.9637901601407782, + "learning_rate": 6.463916901617084e-07, + "loss": 0.1255, + "step": 8439 + }, + { + "epoch": 0.7776293361588428, + "grad_norm": 0.9257448847903722, + "learning_rate": 6.458806080963664e-07, + "loss": 0.1217, + "step": 8440 + }, + { + "epoch": 0.7777214723361128, + "grad_norm": 0.973025415024205, + "learning_rate": 6.453696981862087e-07, + "loss": 0.119, + "step": 8441 + }, + { + "epoch": 0.7778136085133828, + "grad_norm": 0.9278105107397764, + "learning_rate": 6.448589604786748e-07, + "loss": 0.1176, + "step": 8442 + }, + { + "epoch": 0.7779057446906528, + "grad_norm": 0.9748580678259127, + "learning_rate": 6.443483950211854e-07, + "loss": 0.1338, + "step": 8443 + }, + { + "epoch": 0.7779978808679228, + "grad_norm": 0.9426796776933819, + "learning_rate": 6.438380018611481e-07, + "loss": 0.1161, + "step": 8444 + }, + { + "epoch": 0.7780900170451928, + "grad_norm": 0.9374426050170451, + "learning_rate": 6.433277810459512e-07, + "loss": 0.126, + "step": 8445 + }, + { + "epoch": 0.7781821532224628, + "grad_norm": 0.9156606080642972, + "learning_rate": 6.428177326229698e-07, + "loss": 0.1196, + "step": 8446 + }, + { + "epoch": 0.7782742893997328, + "grad_norm": 0.9785305397305804, + "learning_rate": 6.423078566395624e-07, + "loss": 0.1278, + "step": 8447 + }, + { + "epoch": 0.7783664255770029, + "grad_norm": 0.9472568149749723, + "learning_rate": 6.417981531430705e-07, + "loss": 0.1195, + "step": 8448 + }, + { + "epoch": 0.7784585617542729, + "grad_norm": 0.9267730257758037, + "learning_rate": 6.412886221808193e-07, + "loss": 0.1217, + "step": 8449 + }, + { + "epoch": 0.7785506979315429, + "grad_norm": 0.9245783333379712, + "learning_rate": 6.407792638001195e-07, + "loss": 0.118, + "step": 8450 + }, + { + "epoch": 0.7786428341088129, + "grad_norm": 0.8951265439753134, + "learning_rate": 6.402700780482651e-07, + "loss": 0.1143, + "step": 8451 + }, + { + "epoch": 0.7787349702860829, + "grad_norm": 0.9733184947776784, + "learning_rate": 6.39761064972535e-07, + "loss": 0.1192, + "step": 8452 + }, + { + "epoch": 0.7788271064633528, + "grad_norm": 0.9653018548665765, + "learning_rate": 6.392522246201902e-07, + "loss": 0.1194, + "step": 8453 + }, + { + "epoch": 0.7789192426406228, + "grad_norm": 0.9366809581149949, + "learning_rate": 6.387435570384759e-07, + "loss": 0.1185, + "step": 8454 + }, + { + "epoch": 0.7790113788178928, + "grad_norm": 0.9412472670526044, + "learning_rate": 6.382350622746225e-07, + "loss": 0.1257, + "step": 8455 + }, + { + "epoch": 0.7791035149951628, + "grad_norm": 0.9377271262047409, + "learning_rate": 6.377267403758447e-07, + "loss": 0.1122, + "step": 8456 + }, + { + "epoch": 0.7791956511724328, + "grad_norm": 0.9410613069969594, + "learning_rate": 6.372185913893389e-07, + "loss": 0.1252, + "step": 8457 + }, + { + "epoch": 0.7792877873497028, + "grad_norm": 1.0330182142744149, + "learning_rate": 6.367106153622879e-07, + "loss": 0.1228, + "step": 8458 + }, + { + "epoch": 0.7793799235269728, + "grad_norm": 0.9656816711177176, + "learning_rate": 6.362028123418562e-07, + "loss": 0.1174, + "step": 8459 + }, + { + "epoch": 0.7794720597042428, + "grad_norm": 0.9552540658961965, + "learning_rate": 6.356951823751947e-07, + "loss": 0.1094, + "step": 8460 + }, + { + "epoch": 0.7795641958815128, + "grad_norm": 0.8951826862510767, + "learning_rate": 6.351877255094352e-07, + "loss": 0.1064, + "step": 8461 + }, + { + "epoch": 0.7796563320587829, + "grad_norm": 0.8842323676196631, + "learning_rate": 6.346804417916963e-07, + "loss": 0.1091, + "step": 8462 + }, + { + "epoch": 0.7797484682360529, + "grad_norm": 1.021858302504246, + "learning_rate": 6.341733312690798e-07, + "loss": 0.1224, + "step": 8463 + }, + { + "epoch": 0.7798406044133229, + "grad_norm": 0.9373733957984577, + "learning_rate": 6.336663939886695e-07, + "loss": 0.1152, + "step": 8464 + }, + { + "epoch": 0.7799327405905929, + "grad_norm": 0.9418487894872865, + "learning_rate": 6.331596299975362e-07, + "loss": 0.1287, + "step": 8465 + }, + { + "epoch": 0.7800248767678629, + "grad_norm": 0.9485846360794888, + "learning_rate": 6.326530393427316e-07, + "loss": 0.1197, + "step": 8466 + }, + { + "epoch": 0.7801170129451329, + "grad_norm": 0.9556435873882474, + "learning_rate": 6.321466220712929e-07, + "loss": 0.1156, + "step": 8467 + }, + { + "epoch": 0.7802091491224029, + "grad_norm": 0.8926722100933544, + "learning_rate": 6.316403782302416e-07, + "loss": 0.113, + "step": 8468 + }, + { + "epoch": 0.7803012852996729, + "grad_norm": 0.9739250990079734, + "learning_rate": 6.311343078665835e-07, + "loss": 0.1289, + "step": 8469 + }, + { + "epoch": 0.7803934214769429, + "grad_norm": 0.988398143690067, + "learning_rate": 6.306284110273047e-07, + "loss": 0.1248, + "step": 8470 + }, + { + "epoch": 0.7804855576542129, + "grad_norm": 0.9728431078034094, + "learning_rate": 6.301226877593794e-07, + "loss": 0.1162, + "step": 8471 + }, + { + "epoch": 0.7805776938314829, + "grad_norm": 0.9367164880906897, + "learning_rate": 6.296171381097635e-07, + "loss": 0.1259, + "step": 8472 + }, + { + "epoch": 0.7806698300087529, + "grad_norm": 0.914892332720901, + "learning_rate": 6.291117621253984e-07, + "loss": 0.1188, + "step": 8473 + }, + { + "epoch": 0.7807619661860229, + "grad_norm": 0.9445604136684088, + "learning_rate": 6.286065598532065e-07, + "loss": 0.1215, + "step": 8474 + }, + { + "epoch": 0.780854102363293, + "grad_norm": 0.971002437022487, + "learning_rate": 6.281015313400981e-07, + "loss": 0.127, + "step": 8475 + }, + { + "epoch": 0.780946238540563, + "grad_norm": 0.9458980424013934, + "learning_rate": 6.27596676632963e-07, + "loss": 0.1233, + "step": 8476 + }, + { + "epoch": 0.781038374717833, + "grad_norm": 0.9575720735157595, + "learning_rate": 6.270919957786789e-07, + "loss": 0.1276, + "step": 8477 + }, + { + "epoch": 0.781130510895103, + "grad_norm": 0.9043773140494776, + "learning_rate": 6.265874888241035e-07, + "loss": 0.1148, + "step": 8478 + }, + { + "epoch": 0.781222647072373, + "grad_norm": 0.9233761176859706, + "learning_rate": 6.260831558160818e-07, + "loss": 0.1167, + "step": 8479 + }, + { + "epoch": 0.781314783249643, + "grad_norm": 0.98013498969769, + "learning_rate": 6.255789968014411e-07, + "loss": 0.1319, + "step": 8480 + }, + { + "epoch": 0.781406919426913, + "grad_norm": 0.9404213995261698, + "learning_rate": 6.250750118269927e-07, + "loss": 0.1244, + "step": 8481 + }, + { + "epoch": 0.781499055604183, + "grad_norm": 0.9519949447236915, + "learning_rate": 6.245712009395303e-07, + "loss": 0.1134, + "step": 8482 + }, + { + "epoch": 0.781591191781453, + "grad_norm": 0.9317461820724098, + "learning_rate": 6.240675641858335e-07, + "loss": 0.1192, + "step": 8483 + }, + { + "epoch": 0.781683327958723, + "grad_norm": 0.9129871386527313, + "learning_rate": 6.235641016126653e-07, + "loss": 0.1166, + "step": 8484 + }, + { + "epoch": 0.781775464135993, + "grad_norm": 1.048428062985597, + "learning_rate": 6.230608132667732e-07, + "loss": 0.1334, + "step": 8485 + }, + { + "epoch": 0.781867600313263, + "grad_norm": 0.9451185634087925, + "learning_rate": 6.225576991948865e-07, + "loss": 0.1149, + "step": 8486 + }, + { + "epoch": 0.781959736490533, + "grad_norm": 0.99581861616513, + "learning_rate": 6.220547594437188e-07, + "loss": 0.1187, + "step": 8487 + }, + { + "epoch": 0.782051872667803, + "grad_norm": 1.0278512933151998, + "learning_rate": 6.215519940599687e-07, + "loss": 0.1317, + "step": 8488 + }, + { + "epoch": 0.7821440088450731, + "grad_norm": 0.9460848053133615, + "learning_rate": 6.210494030903188e-07, + "loss": 0.1257, + "step": 8489 + }, + { + "epoch": 0.7822361450223431, + "grad_norm": 0.9789169783690399, + "learning_rate": 6.205469865814334e-07, + "loss": 0.1198, + "step": 8490 + }, + { + "epoch": 0.7823282811996131, + "grad_norm": 0.9355875645829241, + "learning_rate": 6.200447445799631e-07, + "loss": 0.1239, + "step": 8491 + }, + { + "epoch": 0.7824204173768831, + "grad_norm": 0.931025680256216, + "learning_rate": 6.195426771325402e-07, + "loss": 0.1197, + "step": 8492 + }, + { + "epoch": 0.7825125535541531, + "grad_norm": 0.9038272461792805, + "learning_rate": 6.190407842857818e-07, + "loss": 0.109, + "step": 8493 + }, + { + "epoch": 0.782604689731423, + "grad_norm": 0.9291283890197121, + "learning_rate": 6.1853906608629e-07, + "loss": 0.1144, + "step": 8494 + }, + { + "epoch": 0.782696825908693, + "grad_norm": 0.9313673945332007, + "learning_rate": 6.180375225806475e-07, + "loss": 0.1174, + "step": 8495 + }, + { + "epoch": 0.782788962085963, + "grad_norm": 0.8837655167803492, + "learning_rate": 6.175361538154243e-07, + "loss": 0.1078, + "step": 8496 + }, + { + "epoch": 0.782881098263233, + "grad_norm": 0.9261810944905562, + "learning_rate": 6.170349598371711e-07, + "loss": 0.1135, + "step": 8497 + }, + { + "epoch": 0.782973234440503, + "grad_norm": 0.9705748971066371, + "learning_rate": 6.165339406924253e-07, + "loss": 0.1284, + "step": 8498 + }, + { + "epoch": 0.783065370617773, + "grad_norm": 0.931338790555389, + "learning_rate": 6.16033096427705e-07, + "loss": 0.1156, + "step": 8499 + }, + { + "epoch": 0.783157506795043, + "grad_norm": 0.9412160383814583, + "learning_rate": 6.155324270895144e-07, + "loss": 0.1259, + "step": 8500 + }, + { + "epoch": 0.783157506795043, + "eval_loss": 0.12055304646492004, + "eval_runtime": 299.0742, + "eval_samples_per_second": 23.462, + "eval_steps_per_second": 2.936, + "step": 8500 + }, + { + "epoch": 0.783249642972313, + "grad_norm": 0.9786974177274947, + "learning_rate": 6.150319327243417e-07, + "loss": 0.1266, + "step": 8501 + }, + { + "epoch": 0.783341779149583, + "grad_norm": 0.9663666635302712, + "learning_rate": 6.145316133786569e-07, + "loss": 0.1236, + "step": 8502 + }, + { + "epoch": 0.7834339153268531, + "grad_norm": 1.0066102428653365, + "learning_rate": 6.140314690989138e-07, + "loss": 0.1201, + "step": 8503 + }, + { + "epoch": 0.7835260515041231, + "grad_norm": 0.9032566231861803, + "learning_rate": 6.135314999315517e-07, + "loss": 0.1188, + "step": 8504 + }, + { + "epoch": 0.7836181876813931, + "grad_norm": 0.8936724534841473, + "learning_rate": 6.130317059229932e-07, + "loss": 0.1112, + "step": 8505 + }, + { + "epoch": 0.7837103238586631, + "grad_norm": 0.8765532770050644, + "learning_rate": 6.125320871196445e-07, + "loss": 0.1215, + "step": 8506 + }, + { + "epoch": 0.7838024600359331, + "grad_norm": 0.9812082328646534, + "learning_rate": 6.120326435678945e-07, + "loss": 0.1204, + "step": 8507 + }, + { + "epoch": 0.7838945962132031, + "grad_norm": 0.9941206603675594, + "learning_rate": 6.115333753141159e-07, + "loss": 0.1267, + "step": 8508 + }, + { + "epoch": 0.7839867323904731, + "grad_norm": 0.9400949014093748, + "learning_rate": 6.110342824046667e-07, + "loss": 0.1191, + "step": 8509 + }, + { + "epoch": 0.7840788685677431, + "grad_norm": 0.9885242657468389, + "learning_rate": 6.105353648858887e-07, + "loss": 0.134, + "step": 8510 + }, + { + "epoch": 0.7841710047450131, + "grad_norm": 0.9742890666878563, + "learning_rate": 6.100366228041043e-07, + "loss": 0.1279, + "step": 8511 + }, + { + "epoch": 0.7842631409222831, + "grad_norm": 0.9855302153041672, + "learning_rate": 6.095380562056238e-07, + "loss": 0.128, + "step": 8512 + }, + { + "epoch": 0.7843552770995531, + "grad_norm": 0.9553368441789891, + "learning_rate": 6.090396651367375e-07, + "loss": 0.1217, + "step": 8513 + }, + { + "epoch": 0.7844474132768231, + "grad_norm": 0.9679957894036133, + "learning_rate": 6.085414496437226e-07, + "loss": 0.1314, + "step": 8514 + }, + { + "epoch": 0.7845395494540931, + "grad_norm": 0.9478895842952598, + "learning_rate": 6.080434097728368e-07, + "loss": 0.1304, + "step": 8515 + }, + { + "epoch": 0.7846316856313632, + "grad_norm": 0.8701797009684351, + "learning_rate": 6.075455455703242e-07, + "loss": 0.1086, + "step": 8516 + }, + { + "epoch": 0.7847238218086332, + "grad_norm": 0.9906259373704634, + "learning_rate": 6.070478570824118e-07, + "loss": 0.1289, + "step": 8517 + }, + { + "epoch": 0.7848159579859032, + "grad_norm": 0.8989965341827344, + "learning_rate": 6.065503443553097e-07, + "loss": 0.1077, + "step": 8518 + }, + { + "epoch": 0.7849080941631732, + "grad_norm": 0.959007954608302, + "learning_rate": 6.060530074352114e-07, + "loss": 0.1263, + "step": 8519 + }, + { + "epoch": 0.7850002303404432, + "grad_norm": 0.9246171708824706, + "learning_rate": 6.055558463682948e-07, + "loss": 0.1178, + "step": 8520 + }, + { + "epoch": 0.7850923665177132, + "grad_norm": 0.9477951948238026, + "learning_rate": 6.050588612007221e-07, + "loss": 0.1191, + "step": 8521 + }, + { + "epoch": 0.7851845026949832, + "grad_norm": 0.9658569636554907, + "learning_rate": 6.045620519786386e-07, + "loss": 0.1155, + "step": 8522 + }, + { + "epoch": 0.7852766388722532, + "grad_norm": 0.957644890519468, + "learning_rate": 6.040654187481726e-07, + "loss": 0.1172, + "step": 8523 + }, + { + "epoch": 0.7853687750495232, + "grad_norm": 0.9855829358535408, + "learning_rate": 6.035689615554358e-07, + "loss": 0.1212, + "step": 8524 + }, + { + "epoch": 0.7854609112267932, + "grad_norm": 0.8942975631899651, + "learning_rate": 6.030726804465251e-07, + "loss": 0.1116, + "step": 8525 + }, + { + "epoch": 0.7855530474040632, + "grad_norm": 0.9451557449490459, + "learning_rate": 6.0257657546752e-07, + "loss": 0.12, + "step": 8526 + }, + { + "epoch": 0.7856451835813332, + "grad_norm": 0.9301666727799931, + "learning_rate": 6.020806466644849e-07, + "loss": 0.1243, + "step": 8527 + }, + { + "epoch": 0.7857373197586032, + "grad_norm": 0.9620046325714793, + "learning_rate": 6.015848940834662e-07, + "loss": 0.1187, + "step": 8528 + }, + { + "epoch": 0.7858294559358732, + "grad_norm": 0.933481477737301, + "learning_rate": 6.010893177704935e-07, + "loss": 0.1183, + "step": 8529 + }, + { + "epoch": 0.7859215921131433, + "grad_norm": 0.9539924108362, + "learning_rate": 6.00593917771582e-07, + "loss": 0.128, + "step": 8530 + }, + { + "epoch": 0.7860137282904133, + "grad_norm": 0.9199764287416644, + "learning_rate": 6.000986941327303e-07, + "loss": 0.114, + "step": 8531 + }, + { + "epoch": 0.7861058644676833, + "grad_norm": 0.9631603455245744, + "learning_rate": 5.996036468999187e-07, + "loss": 0.1209, + "step": 8532 + }, + { + "epoch": 0.7861980006449533, + "grad_norm": 0.946933611017911, + "learning_rate": 5.991087761191136e-07, + "loss": 0.1266, + "step": 8533 + }, + { + "epoch": 0.7862901368222233, + "grad_norm": 0.9249986114077867, + "learning_rate": 5.986140818362626e-07, + "loss": 0.1131, + "step": 8534 + }, + { + "epoch": 0.7863822729994933, + "grad_norm": 0.9484468777362128, + "learning_rate": 5.981195640972995e-07, + "loss": 0.1282, + "step": 8535 + }, + { + "epoch": 0.7864744091767633, + "grad_norm": 0.9371057825811394, + "learning_rate": 5.976252229481385e-07, + "loss": 0.124, + "step": 8536 + }, + { + "epoch": 0.7865665453540333, + "grad_norm": 0.9188426598072358, + "learning_rate": 5.971310584346807e-07, + "loss": 0.1148, + "step": 8537 + }, + { + "epoch": 0.7866586815313032, + "grad_norm": 0.9073770852124188, + "learning_rate": 5.966370706028094e-07, + "loss": 0.1145, + "step": 8538 + }, + { + "epoch": 0.7867508177085732, + "grad_norm": 0.9547670801763313, + "learning_rate": 5.96143259498391e-07, + "loss": 0.1266, + "step": 8539 + }, + { + "epoch": 0.7868429538858432, + "grad_norm": 0.9827821775876324, + "learning_rate": 5.956496251672752e-07, + "loss": 0.128, + "step": 8540 + }, + { + "epoch": 0.7869350900631132, + "grad_norm": 0.9456692696948865, + "learning_rate": 5.951561676552966e-07, + "loss": 0.1215, + "step": 8541 + }, + { + "epoch": 0.7870272262403832, + "grad_norm": 0.9267593713920448, + "learning_rate": 5.946628870082729e-07, + "loss": 0.123, + "step": 8542 + }, + { + "epoch": 0.7871193624176533, + "grad_norm": 0.9393627351534874, + "learning_rate": 5.941697832720058e-07, + "loss": 0.1325, + "step": 8543 + }, + { + "epoch": 0.7872114985949233, + "grad_norm": 0.9550911434469457, + "learning_rate": 5.936768564922796e-07, + "loss": 0.1203, + "step": 8544 + }, + { + "epoch": 0.7873036347721933, + "grad_norm": 0.9004499570914317, + "learning_rate": 5.931841067148616e-07, + "loss": 0.1176, + "step": 8545 + }, + { + "epoch": 0.7873957709494633, + "grad_norm": 0.8932941203298196, + "learning_rate": 5.926915339855044e-07, + "loss": 0.1213, + "step": 8546 + }, + { + "epoch": 0.7874879071267333, + "grad_norm": 0.8922074120043654, + "learning_rate": 5.921991383499445e-07, + "loss": 0.1156, + "step": 8547 + }, + { + "epoch": 0.7875800433040033, + "grad_norm": 0.9050717612224399, + "learning_rate": 5.917069198538991e-07, + "loss": 0.1103, + "step": 8548 + }, + { + "epoch": 0.7876721794812733, + "grad_norm": 0.9122458952803807, + "learning_rate": 5.912148785430713e-07, + "loss": 0.1023, + "step": 8549 + }, + { + "epoch": 0.7877643156585433, + "grad_norm": 0.9437602990583515, + "learning_rate": 5.907230144631485e-07, + "loss": 0.1245, + "step": 8550 + }, + { + "epoch": 0.7878564518358133, + "grad_norm": 0.8797876753243765, + "learning_rate": 5.902313276597984e-07, + "loss": 0.1112, + "step": 8551 + }, + { + "epoch": 0.7879485880130833, + "grad_norm": 0.9023470141440695, + "learning_rate": 5.89739818178676e-07, + "loss": 0.1145, + "step": 8552 + }, + { + "epoch": 0.7880407241903533, + "grad_norm": 1.0168757196023077, + "learning_rate": 5.892484860654163e-07, + "loss": 0.1374, + "step": 8553 + }, + { + "epoch": 0.7881328603676233, + "grad_norm": 0.9206918918167758, + "learning_rate": 5.8875733136564e-07, + "loss": 0.1309, + "step": 8554 + }, + { + "epoch": 0.7882249965448933, + "grad_norm": 0.9902963814438155, + "learning_rate": 5.882663541249523e-07, + "loss": 0.128, + "step": 8555 + }, + { + "epoch": 0.7883171327221633, + "grad_norm": 0.9183091186455521, + "learning_rate": 5.877755543889391e-07, + "loss": 0.1202, + "step": 8556 + }, + { + "epoch": 0.7884092688994334, + "grad_norm": 0.9533822683824176, + "learning_rate": 5.872849322031706e-07, + "loss": 0.1213, + "step": 8557 + }, + { + "epoch": 0.7885014050767034, + "grad_norm": 0.9373591429059158, + "learning_rate": 5.867944876132022e-07, + "loss": 0.1227, + "step": 8558 + }, + { + "epoch": 0.7885935412539734, + "grad_norm": 0.9103034475496797, + "learning_rate": 5.863042206645716e-07, + "loss": 0.1034, + "step": 8559 + }, + { + "epoch": 0.7886856774312434, + "grad_norm": 0.985633430923528, + "learning_rate": 5.858141314028007e-07, + "loss": 0.1258, + "step": 8560 + }, + { + "epoch": 0.7887778136085134, + "grad_norm": 0.8959172713941098, + "learning_rate": 5.853242198733938e-07, + "loss": 0.1123, + "step": 8561 + }, + { + "epoch": 0.7888699497857834, + "grad_norm": 0.9396584454256341, + "learning_rate": 5.848344861218383e-07, + "loss": 0.1252, + "step": 8562 + }, + { + "epoch": 0.7889620859630534, + "grad_norm": 0.9604002869692225, + "learning_rate": 5.843449301936068e-07, + "loss": 0.1263, + "step": 8563 + }, + { + "epoch": 0.7890542221403234, + "grad_norm": 0.9298803788187124, + "learning_rate": 5.838555521341558e-07, + "loss": 0.1278, + "step": 8564 + }, + { + "epoch": 0.7891463583175934, + "grad_norm": 0.9559648352234692, + "learning_rate": 5.833663519889218e-07, + "loss": 0.1244, + "step": 8565 + }, + { + "epoch": 0.7892384944948634, + "grad_norm": 0.8911343469758822, + "learning_rate": 5.828773298033294e-07, + "loss": 0.1109, + "step": 8566 + }, + { + "epoch": 0.7893306306721334, + "grad_norm": 0.9454641222726273, + "learning_rate": 5.823884856227824e-07, + "loss": 0.1232, + "step": 8567 + }, + { + "epoch": 0.7894227668494034, + "grad_norm": 0.9859866096920007, + "learning_rate": 5.818998194926714e-07, + "loss": 0.13, + "step": 8568 + }, + { + "epoch": 0.7895149030266734, + "grad_norm": 0.947179149432619, + "learning_rate": 5.81411331458368e-07, + "loss": 0.1206, + "step": 8569 + }, + { + "epoch": 0.7896070392039435, + "grad_norm": 0.9052920558861215, + "learning_rate": 5.809230215652292e-07, + "loss": 0.1077, + "step": 8570 + }, + { + "epoch": 0.7896991753812135, + "grad_norm": 0.9195765167110864, + "learning_rate": 5.804348898585949e-07, + "loss": 0.1171, + "step": 8571 + }, + { + "epoch": 0.7897913115584835, + "grad_norm": 0.9482984019981444, + "learning_rate": 5.799469363837876e-07, + "loss": 0.1248, + "step": 8572 + }, + { + "epoch": 0.7898834477357535, + "grad_norm": 0.9463055742926426, + "learning_rate": 5.794591611861134e-07, + "loss": 0.1208, + "step": 8573 + }, + { + "epoch": 0.7899755839130235, + "grad_norm": 0.9494961706911567, + "learning_rate": 5.789715643108623e-07, + "loss": 0.1139, + "step": 8574 + }, + { + "epoch": 0.7900677200902935, + "grad_norm": 0.919851610503083, + "learning_rate": 5.784841458033086e-07, + "loss": 0.1215, + "step": 8575 + }, + { + "epoch": 0.7901598562675635, + "grad_norm": 0.9245264796890834, + "learning_rate": 5.779969057087095e-07, + "loss": 0.1176, + "step": 8576 + }, + { + "epoch": 0.7902519924448335, + "grad_norm": 0.9287192440639785, + "learning_rate": 5.775098440723042e-07, + "loss": 0.1222, + "step": 8577 + }, + { + "epoch": 0.7903441286221035, + "grad_norm": 0.9450777250855027, + "learning_rate": 5.770229609393166e-07, + "loss": 0.1129, + "step": 8578 + }, + { + "epoch": 0.7904362647993735, + "grad_norm": 0.9492026706943572, + "learning_rate": 5.765362563549537e-07, + "loss": 0.1278, + "step": 8579 + }, + { + "epoch": 0.7905284009766435, + "grad_norm": 0.9202309222182878, + "learning_rate": 5.760497303644063e-07, + "loss": 0.1103, + "step": 8580 + }, + { + "epoch": 0.7906205371539134, + "grad_norm": 0.9245925883002144, + "learning_rate": 5.755633830128493e-07, + "loss": 0.1257, + "step": 8581 + }, + { + "epoch": 0.7907126733311834, + "grad_norm": 0.9517509945193647, + "learning_rate": 5.750772143454395e-07, + "loss": 0.1261, + "step": 8582 + }, + { + "epoch": 0.7908048095084534, + "grad_norm": 0.9080767566545325, + "learning_rate": 5.745912244073166e-07, + "loss": 0.1067, + "step": 8583 + }, + { + "epoch": 0.7908969456857236, + "grad_norm": 0.9902988389524588, + "learning_rate": 5.741054132436058e-07, + "loss": 0.1311, + "step": 8584 + }, + { + "epoch": 0.7909890818629935, + "grad_norm": 0.9161014687025334, + "learning_rate": 5.736197808994151e-07, + "loss": 0.1168, + "step": 8585 + }, + { + "epoch": 0.7910812180402635, + "grad_norm": 0.9407576129491564, + "learning_rate": 5.731343274198348e-07, + "loss": 0.1181, + "step": 8586 + }, + { + "epoch": 0.7911733542175335, + "grad_norm": 0.9222839395438274, + "learning_rate": 5.726490528499398e-07, + "loss": 0.1241, + "step": 8587 + }, + { + "epoch": 0.7912654903948035, + "grad_norm": 0.989642538165895, + "learning_rate": 5.721639572347873e-07, + "loss": 0.1298, + "step": 8588 + }, + { + "epoch": 0.7913576265720735, + "grad_norm": 0.910192558868216, + "learning_rate": 5.716790406194195e-07, + "loss": 0.1155, + "step": 8589 + }, + { + "epoch": 0.7914497627493435, + "grad_norm": 0.9427897115154177, + "learning_rate": 5.711943030488595e-07, + "loss": 0.1208, + "step": 8590 + }, + { + "epoch": 0.7915418989266135, + "grad_norm": 0.9230754902994186, + "learning_rate": 5.707097445681162e-07, + "loss": 0.1156, + "step": 8591 + }, + { + "epoch": 0.7916340351038835, + "grad_norm": 0.9736347600748048, + "learning_rate": 5.702253652221815e-07, + "loss": 0.1209, + "step": 8592 + }, + { + "epoch": 0.7917261712811535, + "grad_norm": 0.8885640767094255, + "learning_rate": 5.697411650560292e-07, + "loss": 0.1093, + "step": 8593 + }, + { + "epoch": 0.7918183074584235, + "grad_norm": 0.9367737786103164, + "learning_rate": 5.692571441146167e-07, + "loss": 0.1137, + "step": 8594 + }, + { + "epoch": 0.7919104436356935, + "grad_norm": 0.9616450487450995, + "learning_rate": 5.68773302442886e-07, + "loss": 0.1251, + "step": 8595 + }, + { + "epoch": 0.7920025798129635, + "grad_norm": 0.994312643078694, + "learning_rate": 5.682896400857623e-07, + "loss": 0.1328, + "step": 8596 + }, + { + "epoch": 0.7920947159902335, + "grad_norm": 0.9133848572401454, + "learning_rate": 5.678061570881541e-07, + "loss": 0.1175, + "step": 8597 + }, + { + "epoch": 0.7921868521675036, + "grad_norm": 0.9092316720540026, + "learning_rate": 5.673228534949521e-07, + "loss": 0.1126, + "step": 8598 + }, + { + "epoch": 0.7922789883447736, + "grad_norm": 0.9170357165667093, + "learning_rate": 5.668397293510303e-07, + "loss": 0.1095, + "step": 8599 + }, + { + "epoch": 0.7923711245220436, + "grad_norm": 0.9195323389718666, + "learning_rate": 5.66356784701248e-07, + "loss": 0.1139, + "step": 8600 + }, + { + "epoch": 0.7924632606993136, + "grad_norm": 0.9813287621184931, + "learning_rate": 5.658740195904466e-07, + "loss": 0.1224, + "step": 8601 + }, + { + "epoch": 0.7925553968765836, + "grad_norm": 0.9381776235684026, + "learning_rate": 5.653914340634504e-07, + "loss": 0.1183, + "step": 8602 + }, + { + "epoch": 0.7926475330538536, + "grad_norm": 0.9323615832783858, + "learning_rate": 5.649090281650682e-07, + "loss": 0.1234, + "step": 8603 + }, + { + "epoch": 0.7927396692311236, + "grad_norm": 0.9313543955066301, + "learning_rate": 5.644268019400903e-07, + "loss": 0.1118, + "step": 8604 + }, + { + "epoch": 0.7928318054083936, + "grad_norm": 0.9637787040109845, + "learning_rate": 5.639447554332928e-07, + "loss": 0.1296, + "step": 8605 + }, + { + "epoch": 0.7929239415856636, + "grad_norm": 0.9278148056738393, + "learning_rate": 5.634628886894324e-07, + "loss": 0.1084, + "step": 8606 + }, + { + "epoch": 0.7930160777629336, + "grad_norm": 0.9547852357619248, + "learning_rate": 5.629812017532515e-07, + "loss": 0.1208, + "step": 8607 + }, + { + "epoch": 0.7931082139402036, + "grad_norm": 0.9578763992541558, + "learning_rate": 5.62499694669475e-07, + "loss": 0.1269, + "step": 8608 + }, + { + "epoch": 0.7932003501174736, + "grad_norm": 0.954352297806697, + "learning_rate": 5.620183674828094e-07, + "loss": 0.1304, + "step": 8609 + }, + { + "epoch": 0.7932924862947436, + "grad_norm": 0.9215896860684726, + "learning_rate": 5.615372202379482e-07, + "loss": 0.1208, + "step": 8610 + }, + { + "epoch": 0.7933846224720137, + "grad_norm": 0.8858931374801061, + "learning_rate": 5.610562529795635e-07, + "loss": 0.1101, + "step": 8611 + }, + { + "epoch": 0.7934767586492837, + "grad_norm": 0.974136599445881, + "learning_rate": 5.605754657523147e-07, + "loss": 0.1253, + "step": 8612 + }, + { + "epoch": 0.7935688948265537, + "grad_norm": 0.9639846350892, + "learning_rate": 5.600948586008432e-07, + "loss": 0.1208, + "step": 8613 + }, + { + "epoch": 0.7936610310038237, + "grad_norm": 0.9604929500826802, + "learning_rate": 5.59614431569773e-07, + "loss": 0.122, + "step": 8614 + }, + { + "epoch": 0.7937531671810937, + "grad_norm": 0.9931624424857912, + "learning_rate": 5.591341847037107e-07, + "loss": 0.1276, + "step": 8615 + }, + { + "epoch": 0.7938453033583637, + "grad_norm": 0.9549498208056124, + "learning_rate": 5.586541180472485e-07, + "loss": 0.1233, + "step": 8616 + }, + { + "epoch": 0.7939374395356337, + "grad_norm": 0.8682677683889473, + "learning_rate": 5.581742316449601e-07, + "loss": 0.1024, + "step": 8617 + }, + { + "epoch": 0.7940295757129037, + "grad_norm": 0.9493113700230978, + "learning_rate": 5.57694525541404e-07, + "loss": 0.128, + "step": 8618 + }, + { + "epoch": 0.7941217118901737, + "grad_norm": 0.9031374769038998, + "learning_rate": 5.572149997811205e-07, + "loss": 0.1172, + "step": 8619 + }, + { + "epoch": 0.7942138480674437, + "grad_norm": 0.9079305929186919, + "learning_rate": 5.567356544086325e-07, + "loss": 0.1236, + "step": 8620 + }, + { + "epoch": 0.7943059842447137, + "grad_norm": 0.9274268890673875, + "learning_rate": 5.56256489468448e-07, + "loss": 0.1204, + "step": 8621 + }, + { + "epoch": 0.7943981204219837, + "grad_norm": 0.9514779351411853, + "learning_rate": 5.557775050050584e-07, + "loss": 0.1267, + "step": 8622 + }, + { + "epoch": 0.7944902565992537, + "grad_norm": 0.8546480418647092, + "learning_rate": 5.552987010629363e-07, + "loss": 0.1062, + "step": 8623 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 0.8933129494153353, + "learning_rate": 5.54820077686539e-07, + "loss": 0.1191, + "step": 8624 + }, + { + "epoch": 0.7946745289537938, + "grad_norm": 0.8839504359155155, + "learning_rate": 5.543416349203071e-07, + "loss": 0.121, + "step": 8625 + }, + { + "epoch": 0.7947666651310638, + "grad_norm": 0.9199143993897311, + "learning_rate": 5.538633728086643e-07, + "loss": 0.1174, + "step": 8626 + }, + { + "epoch": 0.7948588013083338, + "grad_norm": 0.9234319246914519, + "learning_rate": 5.533852913960158e-07, + "loss": 0.1123, + "step": 8627 + }, + { + "epoch": 0.7949509374856037, + "grad_norm": 0.9564128250888884, + "learning_rate": 5.529073907267526e-07, + "loss": 0.1296, + "step": 8628 + }, + { + "epoch": 0.7950430736628737, + "grad_norm": 0.9131763952358534, + "learning_rate": 5.524296708452476e-07, + "loss": 0.1155, + "step": 8629 + }, + { + "epoch": 0.7951352098401437, + "grad_norm": 0.9744724093653233, + "learning_rate": 5.519521317958581e-07, + "loss": 0.1262, + "step": 8630 + }, + { + "epoch": 0.7952273460174137, + "grad_norm": 0.9906485184260504, + "learning_rate": 5.514747736229225e-07, + "loss": 0.1203, + "step": 8631 + }, + { + "epoch": 0.7953194821946837, + "grad_norm": 0.9125838552784906, + "learning_rate": 5.509975963707636e-07, + "loss": 0.1228, + "step": 8632 + }, + { + "epoch": 0.7954116183719537, + "grad_norm": 0.9283033424610722, + "learning_rate": 5.505206000836874e-07, + "loss": 0.1193, + "step": 8633 + }, + { + "epoch": 0.7955037545492237, + "grad_norm": 0.9203411373141805, + "learning_rate": 5.500437848059842e-07, + "loss": 0.122, + "step": 8634 + }, + { + "epoch": 0.7955958907264937, + "grad_norm": 0.9121874766032623, + "learning_rate": 5.495671505819244e-07, + "loss": 0.1182, + "step": 8635 + }, + { + "epoch": 0.7956880269037637, + "grad_norm": 0.9416908807262255, + "learning_rate": 5.490906974557655e-07, + "loss": 0.1212, + "step": 8636 + }, + { + "epoch": 0.7957801630810337, + "grad_norm": 0.9425940298472445, + "learning_rate": 5.486144254717446e-07, + "loss": 0.1258, + "step": 8637 + }, + { + "epoch": 0.7958722992583038, + "grad_norm": 0.9684145490905561, + "learning_rate": 5.481383346740843e-07, + "loss": 0.1163, + "step": 8638 + }, + { + "epoch": 0.7959644354355738, + "grad_norm": 0.9555213332359374, + "learning_rate": 5.476624251069904e-07, + "loss": 0.1226, + "step": 8639 + }, + { + "epoch": 0.7960565716128438, + "grad_norm": 0.9718769525251739, + "learning_rate": 5.471866968146497e-07, + "loss": 0.1128, + "step": 8640 + }, + { + "epoch": 0.7961487077901138, + "grad_norm": 0.9887299932093059, + "learning_rate": 5.46711149841235e-07, + "loss": 0.1306, + "step": 8641 + }, + { + "epoch": 0.7962408439673838, + "grad_norm": 0.9353887995608104, + "learning_rate": 5.462357842308997e-07, + "loss": 0.1194, + "step": 8642 + }, + { + "epoch": 0.7963329801446538, + "grad_norm": 0.9103463409879694, + "learning_rate": 5.457606000277826e-07, + "loss": 0.1155, + "step": 8643 + }, + { + "epoch": 0.7964251163219238, + "grad_norm": 0.8959125789177207, + "learning_rate": 5.452855972760035e-07, + "loss": 0.1126, + "step": 8644 + }, + { + "epoch": 0.7965172524991938, + "grad_norm": 0.9415821381686513, + "learning_rate": 5.448107760196672e-07, + "loss": 0.1086, + "step": 8645 + }, + { + "epoch": 0.7966093886764638, + "grad_norm": 0.9484822727507379, + "learning_rate": 5.443361363028612e-07, + "loss": 0.1196, + "step": 8646 + }, + { + "epoch": 0.7967015248537338, + "grad_norm": 0.9729626443315508, + "learning_rate": 5.438616781696557e-07, + "loss": 0.1292, + "step": 8647 + }, + { + "epoch": 0.7967936610310038, + "grad_norm": 0.9903115214641103, + "learning_rate": 5.43387401664103e-07, + "loss": 0.1288, + "step": 8648 + }, + { + "epoch": 0.7968857972082738, + "grad_norm": 0.9035118638518556, + "learning_rate": 5.429133068302405e-07, + "loss": 0.1161, + "step": 8649 + }, + { + "epoch": 0.7969779333855438, + "grad_norm": 0.8673452284329635, + "learning_rate": 5.424393937120884e-07, + "loss": 0.0994, + "step": 8650 + }, + { + "epoch": 0.7970700695628138, + "grad_norm": 0.9479150604513178, + "learning_rate": 5.419656623536498e-07, + "loss": 0.124, + "step": 8651 + }, + { + "epoch": 0.7971622057400839, + "grad_norm": 0.8588902296167942, + "learning_rate": 5.414921127989104e-07, + "loss": 0.1042, + "step": 8652 + }, + { + "epoch": 0.7972543419173539, + "grad_norm": 0.9322158612905973, + "learning_rate": 5.410187450918381e-07, + "loss": 0.1294, + "step": 8653 + }, + { + "epoch": 0.7973464780946239, + "grad_norm": 0.9631584611479438, + "learning_rate": 5.405455592763864e-07, + "loss": 0.1296, + "step": 8654 + }, + { + "epoch": 0.7974386142718939, + "grad_norm": 0.919274177152333, + "learning_rate": 5.400725553964908e-07, + "loss": 0.1147, + "step": 8655 + }, + { + "epoch": 0.7975307504491639, + "grad_norm": 0.9032358397233256, + "learning_rate": 5.39599733496069e-07, + "loss": 0.1142, + "step": 8656 + }, + { + "epoch": 0.7976228866264339, + "grad_norm": 0.9549715100342432, + "learning_rate": 5.391270936190232e-07, + "loss": 0.1179, + "step": 8657 + }, + { + "epoch": 0.7977150228037039, + "grad_norm": 0.8774598769374871, + "learning_rate": 5.386546358092376e-07, + "loss": 0.112, + "step": 8658 + }, + { + "epoch": 0.7978071589809739, + "grad_norm": 0.9155784556515261, + "learning_rate": 5.381823601105804e-07, + "loss": 0.1172, + "step": 8659 + }, + { + "epoch": 0.7978992951582439, + "grad_norm": 0.8967035583649721, + "learning_rate": 5.377102665669018e-07, + "loss": 0.1094, + "step": 8660 + }, + { + "epoch": 0.7979914313355139, + "grad_norm": 0.9265403880030733, + "learning_rate": 5.372383552220358e-07, + "loss": 0.1221, + "step": 8661 + }, + { + "epoch": 0.7980835675127839, + "grad_norm": 0.9252246274238533, + "learning_rate": 5.36766626119801e-07, + "loss": 0.1191, + "step": 8662 + }, + { + "epoch": 0.7981757036900539, + "grad_norm": 0.9738521523534015, + "learning_rate": 5.362950793039959e-07, + "loss": 0.1222, + "step": 8663 + }, + { + "epoch": 0.7982678398673239, + "grad_norm": 0.9111419202800259, + "learning_rate": 5.358237148184034e-07, + "loss": 0.1258, + "step": 8664 + }, + { + "epoch": 0.7983599760445939, + "grad_norm": 0.8727192448465615, + "learning_rate": 5.353525327067902e-07, + "loss": 0.103, + "step": 8665 + }, + { + "epoch": 0.798452112221864, + "grad_norm": 0.9316417754395357, + "learning_rate": 5.348815330129059e-07, + "loss": 0.1191, + "step": 8666 + }, + { + "epoch": 0.798544248399134, + "grad_norm": 0.9191480528464787, + "learning_rate": 5.344107157804834e-07, + "loss": 0.1198, + "step": 8667 + }, + { + "epoch": 0.798636384576404, + "grad_norm": 0.9361991190966118, + "learning_rate": 5.339400810532375e-07, + "loss": 0.1292, + "step": 8668 + }, + { + "epoch": 0.798728520753674, + "grad_norm": 0.9428379963087531, + "learning_rate": 5.334696288748661e-07, + "loss": 0.1162, + "step": 8669 + }, + { + "epoch": 0.798820656930944, + "grad_norm": 0.8926886143774425, + "learning_rate": 5.329993592890512e-07, + "loss": 0.1116, + "step": 8670 + }, + { + "epoch": 0.798912793108214, + "grad_norm": 0.9405908396749681, + "learning_rate": 5.325292723394573e-07, + "loss": 0.1155, + "step": 8671 + }, + { + "epoch": 0.799004929285484, + "grad_norm": 0.8988881879842964, + "learning_rate": 5.320593680697331e-07, + "loss": 0.1146, + "step": 8672 + }, + { + "epoch": 0.7990970654627539, + "grad_norm": 0.9517179598930791, + "learning_rate": 5.315896465235084e-07, + "loss": 0.1254, + "step": 8673 + }, + { + "epoch": 0.7991892016400239, + "grad_norm": 0.905955864737142, + "learning_rate": 5.311201077443961e-07, + "loss": 0.1063, + "step": 8674 + }, + { + "epoch": 0.7992813378172939, + "grad_norm": 0.9489280018205543, + "learning_rate": 5.306507517759937e-07, + "loss": 0.1186, + "step": 8675 + }, + { + "epoch": 0.7993734739945639, + "grad_norm": 0.9753417396799335, + "learning_rate": 5.301815786618816e-07, + "loss": 0.1227, + "step": 8676 + }, + { + "epoch": 0.7994656101718339, + "grad_norm": 0.920701395541936, + "learning_rate": 5.297125884456214e-07, + "loss": 0.1161, + "step": 8677 + }, + { + "epoch": 0.7995577463491039, + "grad_norm": 0.9251078334924024, + "learning_rate": 5.292437811707599e-07, + "loss": 0.1262, + "step": 8678 + }, + { + "epoch": 0.799649882526374, + "grad_norm": 0.9327552040475929, + "learning_rate": 5.287751568808247e-07, + "loss": 0.1186, + "step": 8679 + }, + { + "epoch": 0.799742018703644, + "grad_norm": 0.9619542917813455, + "learning_rate": 5.283067156193292e-07, + "loss": 0.1203, + "step": 8680 + }, + { + "epoch": 0.799834154880914, + "grad_norm": 0.9289332567081827, + "learning_rate": 5.278384574297665e-07, + "loss": 0.1234, + "step": 8681 + }, + { + "epoch": 0.799926291058184, + "grad_norm": 0.9064646891261017, + "learning_rate": 5.273703823556153e-07, + "loss": 0.1223, + "step": 8682 + }, + { + "epoch": 0.800018427235454, + "grad_norm": 0.9316922917712442, + "learning_rate": 5.269024904403372e-07, + "loss": 0.1134, + "step": 8683 + }, + { + "epoch": 0.800110563412724, + "grad_norm": 0.9607883705940592, + "learning_rate": 5.264347817273752e-07, + "loss": 0.1259, + "step": 8684 + }, + { + "epoch": 0.800202699589994, + "grad_norm": 0.9179273484417532, + "learning_rate": 5.259672562601553e-07, + "loss": 0.1121, + "step": 8685 + }, + { + "epoch": 0.800294835767264, + "grad_norm": 0.9843152416241994, + "learning_rate": 5.25499914082088e-07, + "loss": 0.1197, + "step": 8686 + }, + { + "epoch": 0.800386971944534, + "grad_norm": 0.9651351060190287, + "learning_rate": 5.250327552365664e-07, + "loss": 0.1215, + "step": 8687 + }, + { + "epoch": 0.800479108121804, + "grad_norm": 0.9119186870911993, + "learning_rate": 5.245657797669665e-07, + "loss": 0.1174, + "step": 8688 + }, + { + "epoch": 0.800571244299074, + "grad_norm": 0.9238367856143697, + "learning_rate": 5.24098987716647e-07, + "loss": 0.1147, + "step": 8689 + }, + { + "epoch": 0.800663380476344, + "grad_norm": 0.9374656049331135, + "learning_rate": 5.236323791289479e-07, + "loss": 0.12, + "step": 8690 + }, + { + "epoch": 0.800755516653614, + "grad_norm": 0.9654942943206993, + "learning_rate": 5.231659540471954e-07, + "loss": 0.1316, + "step": 8691 + }, + { + "epoch": 0.800847652830884, + "grad_norm": 0.9591977272622936, + "learning_rate": 5.226997125146973e-07, + "loss": 0.1253, + "step": 8692 + }, + { + "epoch": 0.8009397890081541, + "grad_norm": 0.9494439671142985, + "learning_rate": 5.222336545747434e-07, + "loss": 0.1198, + "step": 8693 + }, + { + "epoch": 0.8010319251854241, + "grad_norm": 0.9537456425037063, + "learning_rate": 5.217677802706078e-07, + "loss": 0.1193, + "step": 8694 + }, + { + "epoch": 0.8011240613626941, + "grad_norm": 0.9259471935018194, + "learning_rate": 5.213020896455462e-07, + "loss": 0.1141, + "step": 8695 + }, + { + "epoch": 0.8012161975399641, + "grad_norm": 0.9460346609610221, + "learning_rate": 5.208365827427985e-07, + "loss": 0.1207, + "step": 8696 + }, + { + "epoch": 0.8013083337172341, + "grad_norm": 0.9684295469141999, + "learning_rate": 5.203712596055876e-07, + "loss": 0.127, + "step": 8697 + }, + { + "epoch": 0.8014004698945041, + "grad_norm": 0.9765445354106874, + "learning_rate": 5.19906120277118e-07, + "loss": 0.1216, + "step": 8698 + }, + { + "epoch": 0.8014926060717741, + "grad_norm": 0.8910773017327671, + "learning_rate": 5.194411648005778e-07, + "loss": 0.1112, + "step": 8699 + }, + { + "epoch": 0.8015847422490441, + "grad_norm": 1.006495006076603, + "learning_rate": 5.189763932191396e-07, + "loss": 0.1264, + "step": 8700 + }, + { + "epoch": 0.8016768784263141, + "grad_norm": 0.8756991823220688, + "learning_rate": 5.185118055759564e-07, + "loss": 0.116, + "step": 8701 + }, + { + "epoch": 0.8017690146035841, + "grad_norm": 1.0276078013137269, + "learning_rate": 5.180474019141646e-07, + "loss": 0.1299, + "step": 8702 + }, + { + "epoch": 0.8018611507808541, + "grad_norm": 1.008868504080066, + "learning_rate": 5.175831822768848e-07, + "loss": 0.1204, + "step": 8703 + }, + { + "epoch": 0.8019532869581241, + "grad_norm": 0.9189685434248478, + "learning_rate": 5.171191467072201e-07, + "loss": 0.1096, + "step": 8704 + }, + { + "epoch": 0.8020454231353941, + "grad_norm": 0.9502132889598285, + "learning_rate": 5.166552952482565e-07, + "loss": 0.1232, + "step": 8705 + }, + { + "epoch": 0.8021375593126642, + "grad_norm": 0.8942341109417716, + "learning_rate": 5.161916279430623e-07, + "loss": 0.1178, + "step": 8706 + }, + { + "epoch": 0.8022296954899342, + "grad_norm": 1.00440052137423, + "learning_rate": 5.157281448346882e-07, + "loss": 0.1317, + "step": 8707 + }, + { + "epoch": 0.8023218316672042, + "grad_norm": 0.9128632666446306, + "learning_rate": 5.152648459661694e-07, + "loss": 0.1075, + "step": 8708 + }, + { + "epoch": 0.8024139678444742, + "grad_norm": 0.9950914671950657, + "learning_rate": 5.148017313805237e-07, + "loss": 0.1369, + "step": 8709 + }, + { + "epoch": 0.8025061040217442, + "grad_norm": 0.9559709503514494, + "learning_rate": 5.143388011207506e-07, + "loss": 0.1216, + "step": 8710 + }, + { + "epoch": 0.8025982401990142, + "grad_norm": 0.9386882981548607, + "learning_rate": 5.138760552298338e-07, + "loss": 0.1104, + "step": 8711 + }, + { + "epoch": 0.8026903763762842, + "grad_norm": 0.9275125291849861, + "learning_rate": 5.134134937507387e-07, + "loss": 0.1152, + "step": 8712 + }, + { + "epoch": 0.8027825125535542, + "grad_norm": 0.9156843741584696, + "learning_rate": 5.129511167264151e-07, + "loss": 0.1127, + "step": 8713 + }, + { + "epoch": 0.8028746487308241, + "grad_norm": 0.9505867161038387, + "learning_rate": 5.124889241997935e-07, + "loss": 0.1187, + "step": 8714 + }, + { + "epoch": 0.8029667849080941, + "grad_norm": 0.99898568796701, + "learning_rate": 5.120269162137889e-07, + "loss": 0.1231, + "step": 8715 + }, + { + "epoch": 0.8030589210853641, + "grad_norm": 0.9897462954997086, + "learning_rate": 5.115650928113e-07, + "loss": 0.1229, + "step": 8716 + }, + { + "epoch": 0.8031510572626341, + "grad_norm": 0.945085721014716, + "learning_rate": 5.111034540352064e-07, + "loss": 0.1245, + "step": 8717 + }, + { + "epoch": 0.8032431934399041, + "grad_norm": 0.9743912216678187, + "learning_rate": 5.106419999283702e-07, + "loss": 0.1279, + "step": 8718 + }, + { + "epoch": 0.8033353296171741, + "grad_norm": 0.9710579013133132, + "learning_rate": 5.101807305336385e-07, + "loss": 0.1339, + "step": 8719 + }, + { + "epoch": 0.8034274657944442, + "grad_norm": 0.9155853366901943, + "learning_rate": 5.0971964589384e-07, + "loss": 0.1218, + "step": 8720 + }, + { + "epoch": 0.8035196019717142, + "grad_norm": 0.958206277205222, + "learning_rate": 5.092587460517873e-07, + "loss": 0.1239, + "step": 8721 + }, + { + "epoch": 0.8036117381489842, + "grad_norm": 0.959328723295657, + "learning_rate": 5.087980310502743e-07, + "loss": 0.1182, + "step": 8722 + }, + { + "epoch": 0.8037038743262542, + "grad_norm": 0.905234879072177, + "learning_rate": 5.083375009320779e-07, + "loss": 0.1129, + "step": 8723 + }, + { + "epoch": 0.8037960105035242, + "grad_norm": 0.8962931301615531, + "learning_rate": 5.078771557399586e-07, + "loss": 0.1242, + "step": 8724 + }, + { + "epoch": 0.8038881466807942, + "grad_norm": 0.9194498456557597, + "learning_rate": 5.0741699551666e-07, + "loss": 0.1107, + "step": 8725 + }, + { + "epoch": 0.8039802828580642, + "grad_norm": 0.9100045586682919, + "learning_rate": 5.069570203049085e-07, + "loss": 0.1124, + "step": 8726 + }, + { + "epoch": 0.8040724190353342, + "grad_norm": 0.9295523104342945, + "learning_rate": 5.06497230147412e-07, + "loss": 0.1179, + "step": 8727 + }, + { + "epoch": 0.8041645552126042, + "grad_norm": 0.9613944140250847, + "learning_rate": 5.060376250868615e-07, + "loss": 0.1192, + "step": 8728 + }, + { + "epoch": 0.8042566913898742, + "grad_norm": 0.9457683779137299, + "learning_rate": 5.055782051659322e-07, + "loss": 0.1268, + "step": 8729 + }, + { + "epoch": 0.8043488275671442, + "grad_norm": 0.9653237653604977, + "learning_rate": 5.051189704272819e-07, + "loss": 0.1179, + "step": 8730 + }, + { + "epoch": 0.8044409637444142, + "grad_norm": 0.9299770816436291, + "learning_rate": 5.046599209135492e-07, + "loss": 0.1243, + "step": 8731 + }, + { + "epoch": 0.8045330999216842, + "grad_norm": 0.9681021548052552, + "learning_rate": 5.042010566673583e-07, + "loss": 0.1159, + "step": 8732 + }, + { + "epoch": 0.8046252360989542, + "grad_norm": 0.8858392947078415, + "learning_rate": 5.037423777313132e-07, + "loss": 0.1032, + "step": 8733 + }, + { + "epoch": 0.8047173722762243, + "grad_norm": 0.9198330998999283, + "learning_rate": 5.032838841480042e-07, + "loss": 0.105, + "step": 8734 + }, + { + "epoch": 0.8048095084534943, + "grad_norm": 0.948083821447076, + "learning_rate": 5.028255759600004e-07, + "loss": 0.1232, + "step": 8735 + }, + { + "epoch": 0.8049016446307643, + "grad_norm": 0.8914840218195179, + "learning_rate": 5.023674532098571e-07, + "loss": 0.1129, + "step": 8736 + }, + { + "epoch": 0.8049937808080343, + "grad_norm": 0.9627700380196931, + "learning_rate": 5.019095159401113e-07, + "loss": 0.1217, + "step": 8737 + }, + { + "epoch": 0.8050859169853043, + "grad_norm": 0.9673563842251529, + "learning_rate": 5.01451764193282e-07, + "loss": 0.1292, + "step": 8738 + }, + { + "epoch": 0.8051780531625743, + "grad_norm": 0.9294744242883962, + "learning_rate": 5.009941980118707e-07, + "loss": 0.1202, + "step": 8739 + }, + { + "epoch": 0.8052701893398443, + "grad_norm": 0.9123434635710035, + "learning_rate": 5.005368174383634e-07, + "loss": 0.1123, + "step": 8740 + }, + { + "epoch": 0.8053623255171143, + "grad_norm": 0.8951060347141389, + "learning_rate": 5.000796225152277e-07, + "loss": 0.1152, + "step": 8741 + }, + { + "epoch": 0.8054544616943843, + "grad_norm": 0.9299640822902256, + "learning_rate": 4.996226132849149e-07, + "loss": 0.1098, + "step": 8742 + }, + { + "epoch": 0.8055465978716543, + "grad_norm": 0.9759922313677241, + "learning_rate": 4.99165789789858e-07, + "loss": 0.1253, + "step": 8743 + }, + { + "epoch": 0.8056387340489243, + "grad_norm": 0.9719818047810473, + "learning_rate": 4.987091520724721e-07, + "loss": 0.1178, + "step": 8744 + }, + { + "epoch": 0.8057308702261943, + "grad_norm": 0.9429694738572041, + "learning_rate": 4.982527001751567e-07, + "loss": 0.1197, + "step": 8745 + }, + { + "epoch": 0.8058230064034643, + "grad_norm": 0.9728986864512471, + "learning_rate": 4.977964341402941e-07, + "loss": 0.1221, + "step": 8746 + }, + { + "epoch": 0.8059151425807344, + "grad_norm": 0.8994651161363463, + "learning_rate": 4.973403540102476e-07, + "loss": 0.1085, + "step": 8747 + }, + { + "epoch": 0.8060072787580044, + "grad_norm": 0.888762649636477, + "learning_rate": 4.968844598273653e-07, + "loss": 0.1055, + "step": 8748 + }, + { + "epoch": 0.8060994149352744, + "grad_norm": 0.9187766828800995, + "learning_rate": 4.964287516339758e-07, + "loss": 0.111, + "step": 8749 + }, + { + "epoch": 0.8061915511125444, + "grad_norm": 0.9374319253049951, + "learning_rate": 4.959732294723932e-07, + "loss": 0.1192, + "step": 8750 + }, + { + "epoch": 0.8062836872898144, + "grad_norm": 0.9165076691998195, + "learning_rate": 4.955178933849111e-07, + "loss": 0.1178, + "step": 8751 + }, + { + "epoch": 0.8063758234670844, + "grad_norm": 0.917632651699605, + "learning_rate": 4.950627434138083e-07, + "loss": 0.1223, + "step": 8752 + }, + { + "epoch": 0.8064679596443544, + "grad_norm": 0.939684098512974, + "learning_rate": 4.946077796013462e-07, + "loss": 0.1138, + "step": 8753 + }, + { + "epoch": 0.8065600958216244, + "grad_norm": 0.9575212037934554, + "learning_rate": 4.941530019897669e-07, + "loss": 0.1214, + "step": 8754 + }, + { + "epoch": 0.8066522319988944, + "grad_norm": 0.9044347534310851, + "learning_rate": 4.93698410621298e-07, + "loss": 0.1144, + "step": 8755 + }, + { + "epoch": 0.8067443681761644, + "grad_norm": 0.9558305259406638, + "learning_rate": 4.932440055381471e-07, + "loss": 0.1187, + "step": 8756 + }, + { + "epoch": 0.8068365043534343, + "grad_norm": 0.9950767718575722, + "learning_rate": 4.92789786782506e-07, + "loss": 0.1384, + "step": 8757 + }, + { + "epoch": 0.8069286405307043, + "grad_norm": 0.9110712406529344, + "learning_rate": 4.923357543965498e-07, + "loss": 0.1064, + "step": 8758 + }, + { + "epoch": 0.8070207767079743, + "grad_norm": 0.9519585325823425, + "learning_rate": 4.918819084224353e-07, + "loss": 0.109, + "step": 8759 + }, + { + "epoch": 0.8071129128852443, + "grad_norm": 0.947790510305991, + "learning_rate": 4.914282489023006e-07, + "loss": 0.1275, + "step": 8760 + }, + { + "epoch": 0.8072050490625144, + "grad_norm": 1.0226051981887092, + "learning_rate": 4.909747758782693e-07, + "loss": 0.1239, + "step": 8761 + }, + { + "epoch": 0.8072971852397844, + "grad_norm": 0.9217178327591092, + "learning_rate": 4.905214893924462e-07, + "loss": 0.1128, + "step": 8762 + }, + { + "epoch": 0.8073893214170544, + "grad_norm": 0.9737955286681498, + "learning_rate": 4.900683894869198e-07, + "loss": 0.1307, + "step": 8763 + }, + { + "epoch": 0.8074814575943244, + "grad_norm": 0.9299037243211407, + "learning_rate": 4.8961547620376e-07, + "loss": 0.1242, + "step": 8764 + }, + { + "epoch": 0.8075735937715944, + "grad_norm": 0.9475722332886723, + "learning_rate": 4.891627495850188e-07, + "loss": 0.1142, + "step": 8765 + }, + { + "epoch": 0.8076657299488644, + "grad_norm": 0.9662419417795653, + "learning_rate": 4.887102096727326e-07, + "loss": 0.1167, + "step": 8766 + }, + { + "epoch": 0.8077578661261344, + "grad_norm": 0.9339979179707536, + "learning_rate": 4.882578565089205e-07, + "loss": 0.1185, + "step": 8767 + }, + { + "epoch": 0.8078500023034044, + "grad_norm": 0.9596979033805477, + "learning_rate": 4.878056901355823e-07, + "loss": 0.1279, + "step": 8768 + }, + { + "epoch": 0.8079421384806744, + "grad_norm": 0.9218071788920659, + "learning_rate": 4.873537105947029e-07, + "loss": 0.1229, + "step": 8769 + }, + { + "epoch": 0.8080342746579444, + "grad_norm": 0.9480480193282101, + "learning_rate": 4.869019179282478e-07, + "loss": 0.1285, + "step": 8770 + }, + { + "epoch": 0.8081264108352144, + "grad_norm": 0.9074065825918513, + "learning_rate": 4.864503121781666e-07, + "loss": 0.1232, + "step": 8771 + }, + { + "epoch": 0.8082185470124844, + "grad_norm": 0.9385402031446006, + "learning_rate": 4.859988933863898e-07, + "loss": 0.1199, + "step": 8772 + }, + { + "epoch": 0.8083106831897544, + "grad_norm": 0.9707115923915903, + "learning_rate": 4.85547661594833e-07, + "loss": 0.1255, + "step": 8773 + }, + { + "epoch": 0.8084028193670245, + "grad_norm": 0.9897153814911818, + "learning_rate": 4.850966168453922e-07, + "loss": 0.1305, + "step": 8774 + }, + { + "epoch": 0.8084949555442945, + "grad_norm": 0.9438068143981858, + "learning_rate": 4.846457591799489e-07, + "loss": 0.1166, + "step": 8775 + }, + { + "epoch": 0.8085870917215645, + "grad_norm": 0.929569865631667, + "learning_rate": 4.841950886403623e-07, + "loss": 0.1201, + "step": 8776 + }, + { + "epoch": 0.8086792278988345, + "grad_norm": 0.9544358854771392, + "learning_rate": 4.837446052684788e-07, + "loss": 0.1337, + "step": 8777 + }, + { + "epoch": 0.8087713640761045, + "grad_norm": 0.8648015438610358, + "learning_rate": 4.832943091061257e-07, + "loss": 0.1097, + "step": 8778 + }, + { + "epoch": 0.8088635002533745, + "grad_norm": 0.9271638424570154, + "learning_rate": 4.828442001951136e-07, + "loss": 0.112, + "step": 8779 + }, + { + "epoch": 0.8089556364306445, + "grad_norm": 0.8825506069463267, + "learning_rate": 4.82394278577234e-07, + "loss": 0.1071, + "step": 8780 + }, + { + "epoch": 0.8090477726079145, + "grad_norm": 0.982444580480559, + "learning_rate": 4.819445442942633e-07, + "loss": 0.13, + "step": 8781 + }, + { + "epoch": 0.8091399087851845, + "grad_norm": 0.9338257947288606, + "learning_rate": 4.814949973879582e-07, + "loss": 0.1148, + "step": 8782 + }, + { + "epoch": 0.8092320449624545, + "grad_norm": 0.9345277141507186, + "learning_rate": 4.8104563790006e-07, + "loss": 0.1189, + "step": 8783 + }, + { + "epoch": 0.8093241811397245, + "grad_norm": 0.9422878793863659, + "learning_rate": 4.805964658722922e-07, + "loss": 0.1242, + "step": 8784 + }, + { + "epoch": 0.8094163173169945, + "grad_norm": 0.9994874290723805, + "learning_rate": 4.801474813463591e-07, + "loss": 0.1259, + "step": 8785 + }, + { + "epoch": 0.8095084534942645, + "grad_norm": 1.0136117696237055, + "learning_rate": 4.796986843639506e-07, + "loss": 0.1269, + "step": 8786 + }, + { + "epoch": 0.8096005896715345, + "grad_norm": 0.9477824056753072, + "learning_rate": 4.792500749667359e-07, + "loss": 0.1225, + "step": 8787 + }, + { + "epoch": 0.8096927258488046, + "grad_norm": 0.9447988320137191, + "learning_rate": 4.788016531963699e-07, + "loss": 0.1164, + "step": 8788 + }, + { + "epoch": 0.8097848620260746, + "grad_norm": 0.9110046721363823, + "learning_rate": 4.783534190944872e-07, + "loss": 0.1176, + "step": 8789 + }, + { + "epoch": 0.8098769982033446, + "grad_norm": 0.9650411635983291, + "learning_rate": 4.779053727027072e-07, + "loss": 0.118, + "step": 8790 + }, + { + "epoch": 0.8099691343806146, + "grad_norm": 0.9194160816683928, + "learning_rate": 4.774575140626317e-07, + "loss": 0.1167, + "step": 8791 + }, + { + "epoch": 0.8100612705578846, + "grad_norm": 0.9730949942743494, + "learning_rate": 4.770098432158434e-07, + "loss": 0.1186, + "step": 8792 + }, + { + "epoch": 0.8101534067351546, + "grad_norm": 1.0026966470323755, + "learning_rate": 4.765623602039085e-07, + "loss": 0.1261, + "step": 8793 + }, + { + "epoch": 0.8102455429124246, + "grad_norm": 0.8959270267150095, + "learning_rate": 4.76115065068376e-07, + "loss": 0.1143, + "step": 8794 + }, + { + "epoch": 0.8103376790896946, + "grad_norm": 0.9745090604240599, + "learning_rate": 4.756679578507778e-07, + "loss": 0.1198, + "step": 8795 + }, + { + "epoch": 0.8104298152669646, + "grad_norm": 0.9752302606638509, + "learning_rate": 4.7522103859262813e-07, + "loss": 0.1246, + "step": 8796 + }, + { + "epoch": 0.8105219514442346, + "grad_norm": 0.9154425716815816, + "learning_rate": 4.7477430733542273e-07, + "loss": 0.1088, + "step": 8797 + }, + { + "epoch": 0.8106140876215046, + "grad_norm": 0.9209746568913927, + "learning_rate": 4.7432776412064034e-07, + "loss": 0.1164, + "step": 8798 + }, + { + "epoch": 0.8107062237987746, + "grad_norm": 0.8858703515048639, + "learning_rate": 4.738814089897431e-07, + "loss": 0.1118, + "step": 8799 + }, + { + "epoch": 0.8107983599760445, + "grad_norm": 0.9071064876375858, + "learning_rate": 4.734352419841756e-07, + "loss": 0.1128, + "step": 8800 + }, + { + "epoch": 0.8108904961533145, + "grad_norm": 0.9638341312522045, + "learning_rate": 4.7298926314536364e-07, + "loss": 0.1121, + "step": 8801 + }, + { + "epoch": 0.8109826323305847, + "grad_norm": 0.9563234435681683, + "learning_rate": 4.725434725147171e-07, + "loss": 0.1154, + "step": 8802 + }, + { + "epoch": 0.8110747685078546, + "grad_norm": 0.96543556775914, + "learning_rate": 4.720978701336268e-07, + "loss": 0.1266, + "step": 8803 + }, + { + "epoch": 0.8111669046851246, + "grad_norm": 0.9933176223015399, + "learning_rate": 4.716524560434679e-07, + "loss": 0.1339, + "step": 8804 + }, + { + "epoch": 0.8112590408623946, + "grad_norm": 0.9202425773610461, + "learning_rate": 4.7120723028559633e-07, + "loss": 0.1199, + "step": 8805 + }, + { + "epoch": 0.8113511770396646, + "grad_norm": 0.9422875480282432, + "learning_rate": 4.70762192901352e-07, + "loss": 0.1225, + "step": 8806 + }, + { + "epoch": 0.8114433132169346, + "grad_norm": 0.9709769506635573, + "learning_rate": 4.7031734393205683e-07, + "loss": 0.1239, + "step": 8807 + }, + { + "epoch": 0.8115354493942046, + "grad_norm": 0.9206405274403489, + "learning_rate": 4.6987268341901455e-07, + "loss": 0.1179, + "step": 8808 + }, + { + "epoch": 0.8116275855714746, + "grad_norm": 0.9756799458791882, + "learning_rate": 4.6942821140351174e-07, + "loss": 0.1174, + "step": 8809 + }, + { + "epoch": 0.8117197217487446, + "grad_norm": 0.9470559153669584, + "learning_rate": 4.6898392792681796e-07, + "loss": 0.1187, + "step": 8810 + }, + { + "epoch": 0.8118118579260146, + "grad_norm": 0.9201916935032508, + "learning_rate": 4.6853983303018493e-07, + "loss": 0.1206, + "step": 8811 + }, + { + "epoch": 0.8119039941032846, + "grad_norm": 0.9265359599326323, + "learning_rate": 4.680959267548479e-07, + "loss": 0.1181, + "step": 8812 + }, + { + "epoch": 0.8119961302805546, + "grad_norm": 0.9643827677740148, + "learning_rate": 4.676522091420227e-07, + "loss": 0.1295, + "step": 8813 + }, + { + "epoch": 0.8120882664578246, + "grad_norm": 0.9221162699337527, + "learning_rate": 4.672086802329079e-07, + "loss": 0.1106, + "step": 8814 + }, + { + "epoch": 0.8121804026350947, + "grad_norm": 0.9718760958585148, + "learning_rate": 4.667653400686858e-07, + "loss": 0.1226, + "step": 8815 + }, + { + "epoch": 0.8122725388123647, + "grad_norm": 0.8960897169593115, + "learning_rate": 4.6632218869052085e-07, + "loss": 0.1046, + "step": 8816 + }, + { + "epoch": 0.8123646749896347, + "grad_norm": 0.9716052904056378, + "learning_rate": 4.6587922613956005e-07, + "loss": 0.1274, + "step": 8817 + }, + { + "epoch": 0.8124568111669047, + "grad_norm": 0.9329622015203816, + "learning_rate": 4.6543645245693215e-07, + "loss": 0.1234, + "step": 8818 + }, + { + "epoch": 0.8125489473441747, + "grad_norm": 0.9354174342448097, + "learning_rate": 4.649938676837479e-07, + "loss": 0.1283, + "step": 8819 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 0.9504482807872802, + "learning_rate": 4.6455147186110217e-07, + "loss": 0.1253, + "step": 8820 + }, + { + "epoch": 0.8127332196987147, + "grad_norm": 0.8710928394218936, + "learning_rate": 4.6410926503007187e-07, + "loss": 0.1121, + "step": 8821 + }, + { + "epoch": 0.8128253558759847, + "grad_norm": 0.9167945722916967, + "learning_rate": 4.636672472317147e-07, + "loss": 0.1219, + "step": 8822 + }, + { + "epoch": 0.8129174920532547, + "grad_norm": 0.931250359567654, + "learning_rate": 4.6322541850707336e-07, + "loss": 0.1179, + "step": 8823 + }, + { + "epoch": 0.8130096282305247, + "grad_norm": 0.9173628142401588, + "learning_rate": 4.6278377889717064e-07, + "loss": 0.1201, + "step": 8824 + }, + { + "epoch": 0.8131017644077947, + "grad_norm": 0.9259153539552193, + "learning_rate": 4.62342328443014e-07, + "loss": 0.1073, + "step": 8825 + }, + { + "epoch": 0.8131939005850647, + "grad_norm": 0.9620459880501422, + "learning_rate": 4.6190106718559056e-07, + "loss": 0.1228, + "step": 8826 + }, + { + "epoch": 0.8132860367623347, + "grad_norm": 0.8858775284263333, + "learning_rate": 4.614599951658727e-07, + "loss": 0.1136, + "step": 8827 + }, + { + "epoch": 0.8133781729396047, + "grad_norm": 0.8963968749457515, + "learning_rate": 4.6101911242481396e-07, + "loss": 0.1072, + "step": 8828 + }, + { + "epoch": 0.8134703091168748, + "grad_norm": 0.9429238031032099, + "learning_rate": 4.605784190033502e-07, + "loss": 0.1195, + "step": 8829 + }, + { + "epoch": 0.8135624452941448, + "grad_norm": 0.9081363924275805, + "learning_rate": 4.6013791494239927e-07, + "loss": 0.1129, + "step": 8830 + }, + { + "epoch": 0.8136545814714148, + "grad_norm": 0.928463106569425, + "learning_rate": 4.5969760028286236e-07, + "loss": 0.1193, + "step": 8831 + }, + { + "epoch": 0.8137467176486848, + "grad_norm": 0.9505794219950519, + "learning_rate": 4.5925747506562287e-07, + "loss": 0.1192, + "step": 8832 + }, + { + "epoch": 0.8138388538259548, + "grad_norm": 0.9083831854315285, + "learning_rate": 4.5881753933154695e-07, + "loss": 0.1143, + "step": 8833 + }, + { + "epoch": 0.8139309900032248, + "grad_norm": 1.0098806197047803, + "learning_rate": 4.5837779312148225e-07, + "loss": 0.1279, + "step": 8834 + }, + { + "epoch": 0.8140231261804948, + "grad_norm": 0.9610060245617507, + "learning_rate": 4.5793823647625856e-07, + "loss": 0.129, + "step": 8835 + }, + { + "epoch": 0.8141152623577648, + "grad_norm": 0.9499210345688431, + "learning_rate": 4.574988694366894e-07, + "loss": 0.1237, + "step": 8836 + }, + { + "epoch": 0.8142073985350348, + "grad_norm": 0.916244337778968, + "learning_rate": 4.570596920435708e-07, + "loss": 0.1119, + "step": 8837 + }, + { + "epoch": 0.8142995347123048, + "grad_norm": 0.9170844203897126, + "learning_rate": 4.566207043376789e-07, + "loss": 0.1114, + "step": 8838 + }, + { + "epoch": 0.8143916708895748, + "grad_norm": 0.9254240160408157, + "learning_rate": 4.56181906359775e-07, + "loss": 0.123, + "step": 8839 + }, + { + "epoch": 0.8144838070668448, + "grad_norm": 0.9655610903066298, + "learning_rate": 4.557432981506005e-07, + "loss": 0.1217, + "step": 8840 + }, + { + "epoch": 0.8145759432441148, + "grad_norm": 0.9875316818462202, + "learning_rate": 4.5530487975088076e-07, + "loss": 0.1194, + "step": 8841 + }, + { + "epoch": 0.8146680794213849, + "grad_norm": 0.9574896478337922, + "learning_rate": 4.548666512013236e-07, + "loss": 0.1219, + "step": 8842 + }, + { + "epoch": 0.8147602155986549, + "grad_norm": 0.921246124798205, + "learning_rate": 4.5442861254261753e-07, + "loss": 0.1186, + "step": 8843 + }, + { + "epoch": 0.8148523517759249, + "grad_norm": 0.9451372908762516, + "learning_rate": 4.5399076381543536e-07, + "loss": 0.1192, + "step": 8844 + }, + { + "epoch": 0.8149444879531949, + "grad_norm": 0.9134889434581871, + "learning_rate": 4.5355310506043053e-07, + "loss": 0.112, + "step": 8845 + }, + { + "epoch": 0.8150366241304648, + "grad_norm": 0.902262399393891, + "learning_rate": 4.531156363182407e-07, + "loss": 0.1112, + "step": 8846 + }, + { + "epoch": 0.8151287603077348, + "grad_norm": 0.950956875411372, + "learning_rate": 4.526783576294835e-07, + "loss": 0.1274, + "step": 8847 + }, + { + "epoch": 0.8152208964850048, + "grad_norm": 0.8980087170242911, + "learning_rate": 4.5224126903476136e-07, + "loss": 0.1144, + "step": 8848 + }, + { + "epoch": 0.8153130326622748, + "grad_norm": 0.9102210614967016, + "learning_rate": 4.518043705746578e-07, + "loss": 0.121, + "step": 8849 + }, + { + "epoch": 0.8154051688395448, + "grad_norm": 0.9794454379572055, + "learning_rate": 4.5136766228974005e-07, + "loss": 0.1259, + "step": 8850 + }, + { + "epoch": 0.8154973050168148, + "grad_norm": 0.9445608794611002, + "learning_rate": 4.509311442205538e-07, + "loss": 0.1128, + "step": 8851 + }, + { + "epoch": 0.8155894411940848, + "grad_norm": 0.9359598682098146, + "learning_rate": 4.504948164076317e-07, + "loss": 0.1173, + "step": 8852 + }, + { + "epoch": 0.8156815773713548, + "grad_norm": 0.9334601600662357, + "learning_rate": 4.5005867889148626e-07, + "loss": 0.108, + "step": 8853 + }, + { + "epoch": 0.8157737135486248, + "grad_norm": 0.8918500853209136, + "learning_rate": 4.4962273171261393e-07, + "loss": 0.1177, + "step": 8854 + }, + { + "epoch": 0.8158658497258948, + "grad_norm": 0.9167522940965833, + "learning_rate": 4.491869749114908e-07, + "loss": 0.1016, + "step": 8855 + }, + { + "epoch": 0.8159579859031649, + "grad_norm": 0.9233508244470227, + "learning_rate": 4.4875140852857854e-07, + "loss": 0.1213, + "step": 8856 + }, + { + "epoch": 0.8160501220804349, + "grad_norm": 0.9724640957299692, + "learning_rate": 4.4831603260431787e-07, + "loss": 0.1307, + "step": 8857 + }, + { + "epoch": 0.8161422582577049, + "grad_norm": 0.8845130255857112, + "learning_rate": 4.478808471791354e-07, + "loss": 0.1084, + "step": 8858 + }, + { + "epoch": 0.8162343944349749, + "grad_norm": 0.9911551925112649, + "learning_rate": 4.474458522934361e-07, + "loss": 0.1268, + "step": 8859 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.9575733009268137, + "learning_rate": 4.470110479876105e-07, + "loss": 0.1232, + "step": 8860 + }, + { + "epoch": 0.8164186667895149, + "grad_norm": 0.881394940578178, + "learning_rate": 4.4657643430203067e-07, + "loss": 0.1086, + "step": 8861 + }, + { + "epoch": 0.8165108029667849, + "grad_norm": 0.9874625512103724, + "learning_rate": 4.461420112770501e-07, + "loss": 0.1232, + "step": 8862 + }, + { + "epoch": 0.8166029391440549, + "grad_norm": 0.9262867632138934, + "learning_rate": 4.45707778953004e-07, + "loss": 0.1166, + "step": 8863 + }, + { + "epoch": 0.8166950753213249, + "grad_norm": 0.9675648924666996, + "learning_rate": 4.452737373702115e-07, + "loss": 0.1229, + "step": 8864 + }, + { + "epoch": 0.8167872114985949, + "grad_norm": 0.887245591991255, + "learning_rate": 4.44839886568974e-07, + "loss": 0.1105, + "step": 8865 + }, + { + "epoch": 0.8168793476758649, + "grad_norm": 0.9227953394306798, + "learning_rate": 4.444062265895746e-07, + "loss": 0.1207, + "step": 8866 + }, + { + "epoch": 0.8169714838531349, + "grad_norm": 0.928324613846221, + "learning_rate": 4.439727574722783e-07, + "loss": 0.1159, + "step": 8867 + }, + { + "epoch": 0.8170636200304049, + "grad_norm": 0.9133012167675768, + "learning_rate": 4.435394792573322e-07, + "loss": 0.114, + "step": 8868 + }, + { + "epoch": 0.8171557562076749, + "grad_norm": 0.9615531030979356, + "learning_rate": 4.431063919849668e-07, + "loss": 0.1171, + "step": 8869 + }, + { + "epoch": 0.817247892384945, + "grad_norm": 0.9009290620554269, + "learning_rate": 4.4267349569539404e-07, + "loss": 0.114, + "step": 8870 + }, + { + "epoch": 0.817340028562215, + "grad_norm": 0.9012808267067186, + "learning_rate": 4.422407904288095e-07, + "loss": 0.1259, + "step": 8871 + }, + { + "epoch": 0.817432164739485, + "grad_norm": 0.9080862780039042, + "learning_rate": 4.418082762253889e-07, + "loss": 0.1072, + "step": 8872 + }, + { + "epoch": 0.817524300916755, + "grad_norm": 0.9247328780388934, + "learning_rate": 4.4137595312529066e-07, + "loss": 0.1004, + "step": 8873 + }, + { + "epoch": 0.817616437094025, + "grad_norm": 0.9123210256606668, + "learning_rate": 4.4094382116865704e-07, + "loss": 0.1161, + "step": 8874 + }, + { + "epoch": 0.817708573271295, + "grad_norm": 0.9462605452591462, + "learning_rate": 4.4051188039561156e-07, + "loss": 0.1294, + "step": 8875 + }, + { + "epoch": 0.817800709448565, + "grad_norm": 0.9238681176353081, + "learning_rate": 4.400801308462591e-07, + "loss": 0.1223, + "step": 8876 + }, + { + "epoch": 0.817892845625835, + "grad_norm": 0.9192630991789277, + "learning_rate": 4.396485725606886e-07, + "loss": 0.1133, + "step": 8877 + }, + { + "epoch": 0.817984981803105, + "grad_norm": 0.9292618059590252, + "learning_rate": 4.3921720557896953e-07, + "loss": 0.1228, + "step": 8878 + }, + { + "epoch": 0.818077117980375, + "grad_norm": 0.941305059372131, + "learning_rate": 4.387860299411553e-07, + "loss": 0.1203, + "step": 8879 + }, + { + "epoch": 0.818169254157645, + "grad_norm": 0.9116056962903435, + "learning_rate": 4.383550456872793e-07, + "loss": 0.1145, + "step": 8880 + }, + { + "epoch": 0.818261390334915, + "grad_norm": 0.9624185944825424, + "learning_rate": 4.3792425285735935e-07, + "loss": 0.1277, + "step": 8881 + }, + { + "epoch": 0.818353526512185, + "grad_norm": 0.9945802215949294, + "learning_rate": 4.3749365149139493e-07, + "loss": 0.1229, + "step": 8882 + }, + { + "epoch": 0.8184456626894551, + "grad_norm": 0.8888004929847416, + "learning_rate": 4.3706324162936684e-07, + "loss": 0.1073, + "step": 8883 + }, + { + "epoch": 0.8185377988667251, + "grad_norm": 0.9515796576447026, + "learning_rate": 4.3663302331123815e-07, + "loss": 0.119, + "step": 8884 + }, + { + "epoch": 0.8186299350439951, + "grad_norm": 0.9196874221149748, + "learning_rate": 4.362029965769554e-07, + "loss": 0.1149, + "step": 8885 + }, + { + "epoch": 0.8187220712212651, + "grad_norm": 0.9462812699661997, + "learning_rate": 4.3577316146644677e-07, + "loss": 0.1249, + "step": 8886 + }, + { + "epoch": 0.818814207398535, + "grad_norm": 0.9160239522286485, + "learning_rate": 4.353435180196225e-07, + "loss": 0.1158, + "step": 8887 + }, + { + "epoch": 0.818906343575805, + "grad_norm": 0.9343357130198869, + "learning_rate": 4.349140662763751e-07, + "loss": 0.118, + "step": 8888 + }, + { + "epoch": 0.818998479753075, + "grad_norm": 0.9273075309006215, + "learning_rate": 4.3448480627657804e-07, + "loss": 0.1262, + "step": 8889 + }, + { + "epoch": 0.819090615930345, + "grad_norm": 0.9436024913907296, + "learning_rate": 4.3405573806008905e-07, + "loss": 0.1236, + "step": 8890 + }, + { + "epoch": 0.819182752107615, + "grad_norm": 0.9298816928569952, + "learning_rate": 4.336268616667477e-07, + "loss": 0.1198, + "step": 8891 + }, + { + "epoch": 0.819274888284885, + "grad_norm": 0.9208677732375474, + "learning_rate": 4.3319817713637415e-07, + "loss": 0.1159, + "step": 8892 + }, + { + "epoch": 0.819367024462155, + "grad_norm": 0.9685921400717235, + "learning_rate": 4.327696845087728e-07, + "loss": 0.1145, + "step": 8893 + }, + { + "epoch": 0.819459160639425, + "grad_norm": 0.9149193983462024, + "learning_rate": 4.323413838237281e-07, + "loss": 0.1148, + "step": 8894 + }, + { + "epoch": 0.819551296816695, + "grad_norm": 0.8670286833052321, + "learning_rate": 4.319132751210084e-07, + "loss": 0.1044, + "step": 8895 + }, + { + "epoch": 0.819643432993965, + "grad_norm": 0.9696750526422799, + "learning_rate": 4.3148535844036444e-07, + "loss": 0.1277, + "step": 8896 + }, + { + "epoch": 0.8197355691712351, + "grad_norm": 0.905676179841572, + "learning_rate": 4.310576338215269e-07, + "loss": 0.1042, + "step": 8897 + }, + { + "epoch": 0.8198277053485051, + "grad_norm": 0.9891893214431671, + "learning_rate": 4.3063010130421133e-07, + "loss": 0.1137, + "step": 8898 + }, + { + "epoch": 0.8199198415257751, + "grad_norm": 0.9729248302550854, + "learning_rate": 4.302027609281129e-07, + "loss": 0.1196, + "step": 8899 + }, + { + "epoch": 0.8200119777030451, + "grad_norm": 1.0186935346621, + "learning_rate": 4.2977561273291166e-07, + "loss": 0.1255, + "step": 8900 + }, + { + "epoch": 0.8201041138803151, + "grad_norm": 0.9395849723041353, + "learning_rate": 4.2934865675826666e-07, + "loss": 0.1116, + "step": 8901 + }, + { + "epoch": 0.8201962500575851, + "grad_norm": 0.9359575585273199, + "learning_rate": 4.289218930438219e-07, + "loss": 0.1194, + "step": 8902 + }, + { + "epoch": 0.8202883862348551, + "grad_norm": 0.930147304393213, + "learning_rate": 4.284953216292029e-07, + "loss": 0.1276, + "step": 8903 + }, + { + "epoch": 0.8203805224121251, + "grad_norm": 0.9492365348281026, + "learning_rate": 4.280689425540163e-07, + "loss": 0.1178, + "step": 8904 + }, + { + "epoch": 0.8204726585893951, + "grad_norm": 0.8818809813498917, + "learning_rate": 4.2764275585785054e-07, + "loss": 0.1096, + "step": 8905 + }, + { + "epoch": 0.8205647947666651, + "grad_norm": 0.9030730135112734, + "learning_rate": 4.27216761580278e-07, + "loss": 0.1183, + "step": 8906 + }, + { + "epoch": 0.8206569309439351, + "grad_norm": 0.9361132602718762, + "learning_rate": 4.2679095976085217e-07, + "loss": 0.1202, + "step": 8907 + }, + { + "epoch": 0.8207490671212051, + "grad_norm": 0.9450714386955552, + "learning_rate": 4.2636535043910965e-07, + "loss": 0.1181, + "step": 8908 + }, + { + "epoch": 0.8208412032984751, + "grad_norm": 0.9815734649085485, + "learning_rate": 4.2593993365456746e-07, + "loss": 0.1126, + "step": 8909 + }, + { + "epoch": 0.8209333394757452, + "grad_norm": 0.9795019317978423, + "learning_rate": 4.255147094467249e-07, + "loss": 0.1177, + "step": 8910 + }, + { + "epoch": 0.8210254756530152, + "grad_norm": 0.9814500011535171, + "learning_rate": 4.250896778550648e-07, + "loss": 0.1262, + "step": 8911 + }, + { + "epoch": 0.8211176118302852, + "grad_norm": 0.8741343412062761, + "learning_rate": 4.246648389190522e-07, + "loss": 0.1115, + "step": 8912 + }, + { + "epoch": 0.8212097480075552, + "grad_norm": 1.001494769248629, + "learning_rate": 4.24240192678132e-07, + "loss": 0.1262, + "step": 8913 + }, + { + "epoch": 0.8213018841848252, + "grad_norm": 0.9333485530166209, + "learning_rate": 4.23815739171734e-07, + "loss": 0.115, + "step": 8914 + }, + { + "epoch": 0.8213940203620952, + "grad_norm": 0.9608338050250689, + "learning_rate": 4.233914784392673e-07, + "loss": 0.1298, + "step": 8915 + }, + { + "epoch": 0.8214861565393652, + "grad_norm": 0.9396240080659503, + "learning_rate": 4.229674105201259e-07, + "loss": 0.1166, + "step": 8916 + }, + { + "epoch": 0.8215782927166352, + "grad_norm": 0.9271200896495708, + "learning_rate": 4.225435354536833e-07, + "loss": 0.116, + "step": 8917 + }, + { + "epoch": 0.8216704288939052, + "grad_norm": 0.9403145613253563, + "learning_rate": 4.221198532792972e-07, + "loss": 0.1141, + "step": 8918 + }, + { + "epoch": 0.8217625650711752, + "grad_norm": 0.8984489683434724, + "learning_rate": 4.2169636403630697e-07, + "loss": 0.1184, + "step": 8919 + }, + { + "epoch": 0.8218547012484452, + "grad_norm": 0.9492944717039095, + "learning_rate": 4.212730677640328e-07, + "loss": 0.1157, + "step": 8920 + }, + { + "epoch": 0.8219468374257152, + "grad_norm": 0.9388579066857203, + "learning_rate": 4.2084996450177744e-07, + "loss": 0.12, + "step": 8921 + }, + { + "epoch": 0.8220389736029852, + "grad_norm": 0.9925571358138545, + "learning_rate": 4.20427054288827e-07, + "loss": 0.1286, + "step": 8922 + }, + { + "epoch": 0.8221311097802552, + "grad_norm": 0.9465299652537837, + "learning_rate": 4.20004337164448e-07, + "loss": 0.1252, + "step": 8923 + }, + { + "epoch": 0.8222232459575253, + "grad_norm": 0.8823315904062302, + "learning_rate": 4.1958181316789084e-07, + "loss": 0.1116, + "step": 8924 + }, + { + "epoch": 0.8223153821347953, + "grad_norm": 0.9751808798700181, + "learning_rate": 4.1915948233838625e-07, + "loss": 0.1131, + "step": 8925 + }, + { + "epoch": 0.8224075183120653, + "grad_norm": 0.8996660151380795, + "learning_rate": 4.1873734471514685e-07, + "loss": 0.119, + "step": 8926 + }, + { + "epoch": 0.8224996544893353, + "grad_norm": 0.9027774874768812, + "learning_rate": 4.1831540033736935e-07, + "loss": 0.1098, + "step": 8927 + }, + { + "epoch": 0.8225917906666053, + "grad_norm": 0.9123413341572139, + "learning_rate": 4.1789364924423067e-07, + "loss": 0.1201, + "step": 8928 + }, + { + "epoch": 0.8226839268438753, + "grad_norm": 0.9650092844823791, + "learning_rate": 4.174720914748914e-07, + "loss": 0.1212, + "step": 8929 + }, + { + "epoch": 0.8227760630211453, + "grad_norm": 0.9097635241979137, + "learning_rate": 4.1705072706849287e-07, + "loss": 0.1146, + "step": 8930 + }, + { + "epoch": 0.8228681991984153, + "grad_norm": 0.9642879397283781, + "learning_rate": 4.166295560641576e-07, + "loss": 0.1214, + "step": 8931 + }, + { + "epoch": 0.8229603353756852, + "grad_norm": 0.9629087714634937, + "learning_rate": 4.1620857850099227e-07, + "loss": 0.1155, + "step": 8932 + }, + { + "epoch": 0.8230524715529552, + "grad_norm": 0.9983273086133829, + "learning_rate": 4.157877944180852e-07, + "loss": 0.1293, + "step": 8933 + }, + { + "epoch": 0.8231446077302252, + "grad_norm": 0.9935451639675307, + "learning_rate": 4.153672038545054e-07, + "loss": 0.128, + "step": 8934 + }, + { + "epoch": 0.8232367439074952, + "grad_norm": 1.061281927988124, + "learning_rate": 4.1494680684930485e-07, + "loss": 0.1413, + "step": 8935 + }, + { + "epoch": 0.8233288800847652, + "grad_norm": 0.9332443265144761, + "learning_rate": 4.1452660344151826e-07, + "loss": 0.1247, + "step": 8936 + }, + { + "epoch": 0.8234210162620353, + "grad_norm": 0.8874742579698003, + "learning_rate": 4.141065936701613e-07, + "loss": 0.1045, + "step": 8937 + }, + { + "epoch": 0.8235131524393053, + "grad_norm": 0.994749623574586, + "learning_rate": 4.1368677757423064e-07, + "loss": 0.1285, + "step": 8938 + }, + { + "epoch": 0.8236052886165753, + "grad_norm": 0.954874028724842, + "learning_rate": 4.1326715519270725e-07, + "loss": 0.1302, + "step": 8939 + }, + { + "epoch": 0.8236974247938453, + "grad_norm": 0.9643041528254239, + "learning_rate": 4.1284772656455334e-07, + "loss": 0.1276, + "step": 8940 + }, + { + "epoch": 0.8237895609711153, + "grad_norm": 0.9820897900287507, + "learning_rate": 4.12428491728713e-07, + "loss": 0.1341, + "step": 8941 + }, + { + "epoch": 0.8238816971483853, + "grad_norm": 0.9548532478634758, + "learning_rate": 4.1200945072411207e-07, + "loss": 0.121, + "step": 8942 + }, + { + "epoch": 0.8239738333256553, + "grad_norm": 0.9437093357718186, + "learning_rate": 4.1159060358965745e-07, + "loss": 0.1326, + "step": 8943 + }, + { + "epoch": 0.8240659695029253, + "grad_norm": 0.9282509901707379, + "learning_rate": 4.111719503642403e-07, + "loss": 0.1183, + "step": 8944 + }, + { + "epoch": 0.8241581056801953, + "grad_norm": 0.971786827764824, + "learning_rate": 4.1075349108673306e-07, + "loss": 0.1355, + "step": 8945 + }, + { + "epoch": 0.8242502418574653, + "grad_norm": 0.9875630667869919, + "learning_rate": 4.1033522579598804e-07, + "loss": 0.1413, + "step": 8946 + }, + { + "epoch": 0.8243423780347353, + "grad_norm": 0.9528042142165237, + "learning_rate": 4.0991715453084307e-07, + "loss": 0.1146, + "step": 8947 + }, + { + "epoch": 0.8244345142120053, + "grad_norm": 0.9010012156168963, + "learning_rate": 4.0949927733011455e-07, + "loss": 0.1172, + "step": 8948 + }, + { + "epoch": 0.8245266503892753, + "grad_norm": 0.90929624679404, + "learning_rate": 4.0908159423260374e-07, + "loss": 0.1142, + "step": 8949 + }, + { + "epoch": 0.8246187865665453, + "grad_norm": 0.943372601274594, + "learning_rate": 4.086641052770915e-07, + "loss": 0.1162, + "step": 8950 + }, + { + "epoch": 0.8247109227438154, + "grad_norm": 0.9659695477866573, + "learning_rate": 4.082468105023418e-07, + "loss": 0.1209, + "step": 8951 + }, + { + "epoch": 0.8248030589210854, + "grad_norm": 0.8942946250102998, + "learning_rate": 4.078297099471018e-07, + "loss": 0.1143, + "step": 8952 + }, + { + "epoch": 0.8248951950983554, + "grad_norm": 0.9233344249672104, + "learning_rate": 4.0741280365009765e-07, + "loss": 0.1254, + "step": 8953 + }, + { + "epoch": 0.8249873312756254, + "grad_norm": 0.9404317994762078, + "learning_rate": 4.069960916500404e-07, + "loss": 0.118, + "step": 8954 + }, + { + "epoch": 0.8250794674528954, + "grad_norm": 0.9542843518189766, + "learning_rate": 4.065795739856207e-07, + "loss": 0.1236, + "step": 8955 + }, + { + "epoch": 0.8251716036301654, + "grad_norm": 0.8768924661820494, + "learning_rate": 4.0616325069551296e-07, + "loss": 0.1026, + "step": 8956 + }, + { + "epoch": 0.8252637398074354, + "grad_norm": 0.910496711637403, + "learning_rate": 4.057471218183734e-07, + "loss": 0.1185, + "step": 8957 + }, + { + "epoch": 0.8253558759847054, + "grad_norm": 0.9617153732562939, + "learning_rate": 4.0533118739283864e-07, + "loss": 0.1287, + "step": 8958 + }, + { + "epoch": 0.8254480121619754, + "grad_norm": 0.9206590523649169, + "learning_rate": 4.049154474575284e-07, + "loss": 0.1096, + "step": 8959 + }, + { + "epoch": 0.8255401483392454, + "grad_norm": 0.9911337506691006, + "learning_rate": 4.04499902051044e-07, + "loss": 0.1254, + "step": 8960 + }, + { + "epoch": 0.8256322845165154, + "grad_norm": 0.9393601602299094, + "learning_rate": 4.0408455121196957e-07, + "loss": 0.1216, + "step": 8961 + }, + { + "epoch": 0.8257244206937854, + "grad_norm": 0.90987307073812, + "learning_rate": 4.0366939497887033e-07, + "loss": 0.1069, + "step": 8962 + }, + { + "epoch": 0.8258165568710554, + "grad_norm": 0.9690470990393921, + "learning_rate": 4.032544333902935e-07, + "loss": 0.1267, + "step": 8963 + }, + { + "epoch": 0.8259086930483254, + "grad_norm": 0.9604445922242358, + "learning_rate": 4.028396664847678e-07, + "loss": 0.1182, + "step": 8964 + }, + { + "epoch": 0.8260008292255955, + "grad_norm": 0.9183917505789629, + "learning_rate": 4.0242509430080456e-07, + "loss": 0.1126, + "step": 8965 + }, + { + "epoch": 0.8260929654028655, + "grad_norm": 1.0160636850060218, + "learning_rate": 4.0201071687689746e-07, + "loss": 0.1324, + "step": 8966 + }, + { + "epoch": 0.8261851015801355, + "grad_norm": 0.9045916383232453, + "learning_rate": 4.0159653425152074e-07, + "loss": 0.1165, + "step": 8967 + }, + { + "epoch": 0.8262772377574055, + "grad_norm": 0.9083173799399996, + "learning_rate": 4.011825464631322e-07, + "loss": 0.1046, + "step": 8968 + }, + { + "epoch": 0.8263693739346755, + "grad_norm": 0.9147366746741465, + "learning_rate": 4.0076875355016975e-07, + "loss": 0.113, + "step": 8969 + }, + { + "epoch": 0.8264615101119455, + "grad_norm": 0.9910503681642115, + "learning_rate": 4.003551555510549e-07, + "loss": 0.1175, + "step": 8970 + }, + { + "epoch": 0.8265536462892155, + "grad_norm": 1.0082243762021406, + "learning_rate": 3.99941752504189e-07, + "loss": 0.1273, + "step": 8971 + }, + { + "epoch": 0.8266457824664855, + "grad_norm": 0.9544300821805285, + "learning_rate": 3.995285444479574e-07, + "loss": 0.1277, + "step": 8972 + }, + { + "epoch": 0.8267379186437555, + "grad_norm": 0.9873879835090933, + "learning_rate": 3.9911553142072733e-07, + "loss": 0.1156, + "step": 8973 + }, + { + "epoch": 0.8268300548210255, + "grad_norm": 1.0143048636164989, + "learning_rate": 3.987027134608462e-07, + "loss": 0.1268, + "step": 8974 + }, + { + "epoch": 0.8269221909982954, + "grad_norm": 0.9406111544099246, + "learning_rate": 3.9829009060664363e-07, + "loss": 0.1246, + "step": 8975 + }, + { + "epoch": 0.8270143271755654, + "grad_norm": 0.9211034325729525, + "learning_rate": 3.9787766289643233e-07, + "loss": 0.1066, + "step": 8976 + }, + { + "epoch": 0.8271064633528354, + "grad_norm": 0.9369902884160378, + "learning_rate": 3.974654303685063e-07, + "loss": 0.1135, + "step": 8977 + }, + { + "epoch": 0.8271985995301055, + "grad_norm": 0.9431563970089338, + "learning_rate": 3.97053393061142e-07, + "loss": 0.111, + "step": 8978 + }, + { + "epoch": 0.8272907357073755, + "grad_norm": 0.9293997578153168, + "learning_rate": 3.966415510125965e-07, + "loss": 0.1093, + "step": 8979 + }, + { + "epoch": 0.8273828718846455, + "grad_norm": 0.9676726803687037, + "learning_rate": 3.9622990426110867e-07, + "loss": 0.1204, + "step": 8980 + }, + { + "epoch": 0.8274750080619155, + "grad_norm": 0.9486005141441863, + "learning_rate": 3.958184528449005e-07, + "loss": 0.1204, + "step": 8981 + }, + { + "epoch": 0.8275671442391855, + "grad_norm": 0.9918274525746862, + "learning_rate": 3.954071968021755e-07, + "loss": 0.127, + "step": 8982 + }, + { + "epoch": 0.8276592804164555, + "grad_norm": 0.9000784607704694, + "learning_rate": 3.9499613617111965e-07, + "loss": 0.1071, + "step": 8983 + }, + { + "epoch": 0.8277514165937255, + "grad_norm": 0.954022230403387, + "learning_rate": 3.945852709898987e-07, + "loss": 0.1233, + "step": 8984 + }, + { + "epoch": 0.8278435527709955, + "grad_norm": 1.0309117530762104, + "learning_rate": 3.941746012966616e-07, + "loss": 0.1155, + "step": 8985 + }, + { + "epoch": 0.8279356889482655, + "grad_norm": 0.9673615839155028, + "learning_rate": 3.937641271295392e-07, + "loss": 0.1245, + "step": 8986 + }, + { + "epoch": 0.8280278251255355, + "grad_norm": 0.996234111747313, + "learning_rate": 3.93353848526645e-07, + "loss": 0.1329, + "step": 8987 + }, + { + "epoch": 0.8281199613028055, + "grad_norm": 0.9257195359719688, + "learning_rate": 3.9294376552607233e-07, + "loss": 0.1149, + "step": 8988 + }, + { + "epoch": 0.8282120974800755, + "grad_norm": 0.8982916316625367, + "learning_rate": 3.92533878165898e-07, + "loss": 0.1091, + "step": 8989 + }, + { + "epoch": 0.8283042336573455, + "grad_norm": 0.9789169488412091, + "learning_rate": 3.921241864841793e-07, + "loss": 0.1123, + "step": 8990 + }, + { + "epoch": 0.8283963698346155, + "grad_norm": 0.9511169814629293, + "learning_rate": 3.917146905189576e-07, + "loss": 0.1122, + "step": 8991 + }, + { + "epoch": 0.8284885060118856, + "grad_norm": 0.9369304152077549, + "learning_rate": 3.913053903082531e-07, + "loss": 0.1155, + "step": 8992 + }, + { + "epoch": 0.8285806421891556, + "grad_norm": 0.9261575552816484, + "learning_rate": 3.9089628589007e-07, + "loss": 0.1204, + "step": 8993 + }, + { + "epoch": 0.8286727783664256, + "grad_norm": 0.9719530306024485, + "learning_rate": 3.9048737730239427e-07, + "loss": 0.1287, + "step": 8994 + }, + { + "epoch": 0.8287649145436956, + "grad_norm": 0.9022992796256752, + "learning_rate": 3.9007866458319275e-07, + "loss": 0.1032, + "step": 8995 + }, + { + "epoch": 0.8288570507209656, + "grad_norm": 0.9154891131247639, + "learning_rate": 3.8967014777041344e-07, + "loss": 0.1183, + "step": 8996 + }, + { + "epoch": 0.8289491868982356, + "grad_norm": 0.9790283403390899, + "learning_rate": 3.892618269019882e-07, + "loss": 0.12, + "step": 8997 + }, + { + "epoch": 0.8290413230755056, + "grad_norm": 0.8989171769636702, + "learning_rate": 3.888537020158295e-07, + "loss": 0.1162, + "step": 8998 + }, + { + "epoch": 0.8291334592527756, + "grad_norm": 0.9513898800003195, + "learning_rate": 3.8844577314983254e-07, + "loss": 0.1206, + "step": 8999 + }, + { + "epoch": 0.8292255954300456, + "grad_norm": 0.8928187592109958, + "learning_rate": 3.8803804034187235e-07, + "loss": 0.1147, + "step": 9000 + }, + { + "epoch": 0.8292255954300456, + "eval_loss": 0.11866238713264465, + "eval_runtime": 298.9938, + "eval_samples_per_second": 23.469, + "eval_steps_per_second": 2.937, + "step": 9000 + }, + { + "epoch": 0.8293177316073156, + "grad_norm": 0.8963570408818765, + "learning_rate": 3.8763050362980723e-07, + "loss": 0.1019, + "step": 9001 + }, + { + "epoch": 0.8294098677845856, + "grad_norm": 0.9809451351037467, + "learning_rate": 3.8722316305147693e-07, + "loss": 0.1274, + "step": 9002 + }, + { + "epoch": 0.8295020039618556, + "grad_norm": 0.8795144198454591, + "learning_rate": 3.8681601864470396e-07, + "loss": 0.1017, + "step": 9003 + }, + { + "epoch": 0.8295941401391256, + "grad_norm": 0.9827458108554684, + "learning_rate": 3.864090704472906e-07, + "loss": 0.1246, + "step": 9004 + }, + { + "epoch": 0.8296862763163957, + "grad_norm": 0.9325877389996922, + "learning_rate": 3.86002318497023e-07, + "loss": 0.118, + "step": 9005 + }, + { + "epoch": 0.8297784124936657, + "grad_norm": 0.9312298769357221, + "learning_rate": 3.855957628316673e-07, + "loss": 0.1339, + "step": 9006 + }, + { + "epoch": 0.8298705486709357, + "grad_norm": 0.9946128464419297, + "learning_rate": 3.8518940348897277e-07, + "loss": 0.1323, + "step": 9007 + }, + { + "epoch": 0.8299626848482057, + "grad_norm": 0.9313704014210649, + "learning_rate": 3.8478324050666926e-07, + "loss": 0.1181, + "step": 9008 + }, + { + "epoch": 0.8300548210254757, + "grad_norm": 0.9198770011959588, + "learning_rate": 3.8437727392246966e-07, + "loss": 0.1142, + "step": 9009 + }, + { + "epoch": 0.8301469572027457, + "grad_norm": 0.9180412820101661, + "learning_rate": 3.839715037740677e-07, + "loss": 0.1252, + "step": 9010 + }, + { + "epoch": 0.8302390933800157, + "grad_norm": 0.9675721326632333, + "learning_rate": 3.835659300991401e-07, + "loss": 0.1252, + "step": 9011 + }, + { + "epoch": 0.8303312295572857, + "grad_norm": 0.9748217354158463, + "learning_rate": 3.8316055293534353e-07, + "loss": 0.1179, + "step": 9012 + }, + { + "epoch": 0.8304233657345557, + "grad_norm": 0.8849626621010777, + "learning_rate": 3.82755372320317e-07, + "loss": 0.1157, + "step": 9013 + }, + { + "epoch": 0.8305155019118257, + "grad_norm": 0.9417758302521162, + "learning_rate": 3.823503882916818e-07, + "loss": 0.1145, + "step": 9014 + }, + { + "epoch": 0.8306076380890957, + "grad_norm": 0.9352672162324204, + "learning_rate": 3.819456008870412e-07, + "loss": 0.1112, + "step": 9015 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 0.9546735786454992, + "learning_rate": 3.815410101439798e-07, + "loss": 0.124, + "step": 9016 + }, + { + "epoch": 0.8307919104436357, + "grad_norm": 0.8982406751220058, + "learning_rate": 3.8113661610006375e-07, + "loss": 0.108, + "step": 9017 + }, + { + "epoch": 0.8308840466209056, + "grad_norm": 0.936872634455003, + "learning_rate": 3.8073241879284045e-07, + "loss": 0.125, + "step": 9018 + }, + { + "epoch": 0.8309761827981758, + "grad_norm": 0.9029042550032307, + "learning_rate": 3.803284182598399e-07, + "loss": 0.111, + "step": 9019 + }, + { + "epoch": 0.8310683189754458, + "grad_norm": 0.9279239531114798, + "learning_rate": 3.799246145385746e-07, + "loss": 0.1183, + "step": 9020 + }, + { + "epoch": 0.8311604551527157, + "grad_norm": 0.9001601906556236, + "learning_rate": 3.795210076665362e-07, + "loss": 0.1062, + "step": 9021 + }, + { + "epoch": 0.8312525913299857, + "grad_norm": 0.9096130257836612, + "learning_rate": 3.791175976812014e-07, + "loss": 0.1121, + "step": 9022 + }, + { + "epoch": 0.8313447275072557, + "grad_norm": 0.9841772012761711, + "learning_rate": 3.78714384620025e-07, + "loss": 0.1291, + "step": 9023 + }, + { + "epoch": 0.8314368636845257, + "grad_norm": 0.978523579768035, + "learning_rate": 3.7831136852044705e-07, + "loss": 0.1217, + "step": 9024 + }, + { + "epoch": 0.8315289998617957, + "grad_norm": 0.9247283536985508, + "learning_rate": 3.7790854941988615e-07, + "loss": 0.1077, + "step": 9025 + }, + { + "epoch": 0.8316211360390657, + "grad_norm": 0.9377240447676977, + "learning_rate": 3.7750592735574494e-07, + "loss": 0.1153, + "step": 9026 + }, + { + "epoch": 0.8317132722163357, + "grad_norm": 0.9592075723655732, + "learning_rate": 3.7710350236540737e-07, + "loss": 0.1163, + "step": 9027 + }, + { + "epoch": 0.8318054083936057, + "grad_norm": 0.9618403112391543, + "learning_rate": 3.7670127448623804e-07, + "loss": 0.1261, + "step": 9028 + }, + { + "epoch": 0.8318975445708757, + "grad_norm": 0.9207309046891926, + "learning_rate": 3.7629924375558347e-07, + "loss": 0.1142, + "step": 9029 + }, + { + "epoch": 0.8319896807481457, + "grad_norm": 0.9393609681547822, + "learning_rate": 3.7589741021077234e-07, + "loss": 0.1193, + "step": 9030 + }, + { + "epoch": 0.8320818169254157, + "grad_norm": 0.9288187420162773, + "learning_rate": 3.7549577388911546e-07, + "loss": 0.1123, + "step": 9031 + }, + { + "epoch": 0.8321739531026857, + "grad_norm": 0.9124189814585899, + "learning_rate": 3.7509433482790515e-07, + "loss": 0.11, + "step": 9032 + }, + { + "epoch": 0.8322660892799558, + "grad_norm": 0.9163760756223935, + "learning_rate": 3.7469309306441466e-07, + "loss": 0.1089, + "step": 9033 + }, + { + "epoch": 0.8323582254572258, + "grad_norm": 1.020152579275757, + "learning_rate": 3.742920486358986e-07, + "loss": 0.1325, + "step": 9034 + }, + { + "epoch": 0.8324503616344958, + "grad_norm": 0.9058071226707416, + "learning_rate": 3.738912015795945e-07, + "loss": 0.1124, + "step": 9035 + }, + { + "epoch": 0.8325424978117658, + "grad_norm": 0.9111574495433679, + "learning_rate": 3.734905519327217e-07, + "loss": 0.1197, + "step": 9036 + }, + { + "epoch": 0.8326346339890358, + "grad_norm": 0.9817191595487917, + "learning_rate": 3.7309009973247963e-07, + "loss": 0.1235, + "step": 9037 + }, + { + "epoch": 0.8327267701663058, + "grad_norm": 0.8820936392317466, + "learning_rate": 3.72689845016051e-07, + "loss": 0.1057, + "step": 9038 + }, + { + "epoch": 0.8328189063435758, + "grad_norm": 0.9080842498257791, + "learning_rate": 3.722897878205989e-07, + "loss": 0.1238, + "step": 9039 + }, + { + "epoch": 0.8329110425208458, + "grad_norm": 0.9689835563120064, + "learning_rate": 3.718899281832686e-07, + "loss": 0.1158, + "step": 9040 + }, + { + "epoch": 0.8330031786981158, + "grad_norm": 0.9041628190128206, + "learning_rate": 3.7149026614118844e-07, + "loss": 0.1161, + "step": 9041 + }, + { + "epoch": 0.8330953148753858, + "grad_norm": 0.8889635923127017, + "learning_rate": 3.7109080173146543e-07, + "loss": 0.1125, + "step": 9042 + }, + { + "epoch": 0.8331874510526558, + "grad_norm": 0.9497760569923801, + "learning_rate": 3.7069153499119134e-07, + "loss": 0.1229, + "step": 9043 + }, + { + "epoch": 0.8332795872299258, + "grad_norm": 0.9338812497801194, + "learning_rate": 3.7029246595743666e-07, + "loss": 0.1178, + "step": 9044 + }, + { + "epoch": 0.8333717234071958, + "grad_norm": 0.8848813835864621, + "learning_rate": 3.6989359466725603e-07, + "loss": 0.109, + "step": 9045 + }, + { + "epoch": 0.8334638595844659, + "grad_norm": 0.8906596859436642, + "learning_rate": 3.6949492115768425e-07, + "loss": 0.1136, + "step": 9046 + }, + { + "epoch": 0.8335559957617359, + "grad_norm": 0.926402940035174, + "learning_rate": 3.6909644546573806e-07, + "loss": 0.1124, + "step": 9047 + }, + { + "epoch": 0.8336481319390059, + "grad_norm": 0.9310849307554305, + "learning_rate": 3.686981676284171e-07, + "loss": 0.1116, + "step": 9048 + }, + { + "epoch": 0.8337402681162759, + "grad_norm": 0.9872574472717625, + "learning_rate": 3.6830008768270033e-07, + "loss": 0.1274, + "step": 9049 + }, + { + "epoch": 0.8338324042935459, + "grad_norm": 0.9384635786716498, + "learning_rate": 3.679022056655493e-07, + "loss": 0.115, + "step": 9050 + }, + { + "epoch": 0.8339245404708159, + "grad_norm": 0.950804946256567, + "learning_rate": 3.675045216139081e-07, + "loss": 0.1113, + "step": 9051 + }, + { + "epoch": 0.8340166766480859, + "grad_norm": 0.9885673160133664, + "learning_rate": 3.6710703556470136e-07, + "loss": 0.1242, + "step": 9052 + }, + { + "epoch": 0.8341088128253559, + "grad_norm": 0.9689834654878836, + "learning_rate": 3.667097475548367e-07, + "loss": 0.1243, + "step": 9053 + }, + { + "epoch": 0.8342009490026259, + "grad_norm": 0.9672726322538227, + "learning_rate": 3.663126576212014e-07, + "loss": 0.1226, + "step": 9054 + }, + { + "epoch": 0.8342930851798959, + "grad_norm": 0.9226458262636499, + "learning_rate": 3.659157658006651e-07, + "loss": 0.1171, + "step": 9055 + }, + { + "epoch": 0.8343852213571659, + "grad_norm": 0.9283165671065761, + "learning_rate": 3.655190721300794e-07, + "loss": 0.1175, + "step": 9056 + }, + { + "epoch": 0.8344773575344359, + "grad_norm": 0.892800777812627, + "learning_rate": 3.651225766462782e-07, + "loss": 0.1133, + "step": 9057 + }, + { + "epoch": 0.8345694937117059, + "grad_norm": 1.0480032457880637, + "learning_rate": 3.647262793860751e-07, + "loss": 0.1168, + "step": 9058 + }, + { + "epoch": 0.8346616298889759, + "grad_norm": 0.9495060160798997, + "learning_rate": 3.643301803862673e-07, + "loss": 0.1251, + "step": 9059 + }, + { + "epoch": 0.834753766066246, + "grad_norm": 0.9767279275355083, + "learning_rate": 3.639342796836312e-07, + "loss": 0.1249, + "step": 9060 + }, + { + "epoch": 0.834845902243516, + "grad_norm": 0.956717822836148, + "learning_rate": 3.63538577314928e-07, + "loss": 0.1167, + "step": 9061 + }, + { + "epoch": 0.834938038420786, + "grad_norm": 0.8805632595773724, + "learning_rate": 3.6314307331689725e-07, + "loss": 0.1089, + "step": 9062 + }, + { + "epoch": 0.835030174598056, + "grad_norm": 0.9608933523035653, + "learning_rate": 3.6274776772626197e-07, + "loss": 0.1173, + "step": 9063 + }, + { + "epoch": 0.835122310775326, + "grad_norm": 0.9490880216010513, + "learning_rate": 3.6235266057972727e-07, + "loss": 0.1236, + "step": 9064 + }, + { + "epoch": 0.835214446952596, + "grad_norm": 0.9741016505813996, + "learning_rate": 3.6195775191397776e-07, + "loss": 0.1312, + "step": 9065 + }, + { + "epoch": 0.8353065831298659, + "grad_norm": 1.061742094019043, + "learning_rate": 3.615630417656807e-07, + "loss": 0.1333, + "step": 9066 + }, + { + "epoch": 0.8353987193071359, + "grad_norm": 0.9404116017239663, + "learning_rate": 3.611685301714854e-07, + "loss": 0.1159, + "step": 9067 + }, + { + "epoch": 0.8354908554844059, + "grad_norm": 0.948652738305397, + "learning_rate": 3.607742171680223e-07, + "loss": 0.1156, + "step": 9068 + }, + { + "epoch": 0.8355829916616759, + "grad_norm": 1.0125976280025968, + "learning_rate": 3.6038010279190376e-07, + "loss": 0.136, + "step": 9069 + }, + { + "epoch": 0.8356751278389459, + "grad_norm": 0.8919477550099669, + "learning_rate": 3.5998618707972303e-07, + "loss": 0.1076, + "step": 9070 + }, + { + "epoch": 0.8357672640162159, + "grad_norm": 0.9215454245045038, + "learning_rate": 3.5959247006805446e-07, + "loss": 0.1148, + "step": 9071 + }, + { + "epoch": 0.8358594001934859, + "grad_norm": 0.9527340209068846, + "learning_rate": 3.5919895179345546e-07, + "loss": 0.1193, + "step": 9072 + }, + { + "epoch": 0.835951536370756, + "grad_norm": 0.964974428331, + "learning_rate": 3.5880563229246434e-07, + "loss": 0.1248, + "step": 9073 + }, + { + "epoch": 0.836043672548026, + "grad_norm": 0.9511935127041369, + "learning_rate": 3.58412511601601e-07, + "loss": 0.1213, + "step": 9074 + }, + { + "epoch": 0.836135808725296, + "grad_norm": 0.9306185915223708, + "learning_rate": 3.5801958975736653e-07, + "loss": 0.1158, + "step": 9075 + }, + { + "epoch": 0.836227944902566, + "grad_norm": 0.8972295969668357, + "learning_rate": 3.576268667962432e-07, + "loss": 0.1121, + "step": 9076 + }, + { + "epoch": 0.836320081079836, + "grad_norm": 0.9471002321711262, + "learning_rate": 3.5723434275469593e-07, + "loss": 0.1185, + "step": 9077 + }, + { + "epoch": 0.836412217257106, + "grad_norm": 0.9437730017790239, + "learning_rate": 3.5684201766917087e-07, + "loss": 0.1203, + "step": 9078 + }, + { + "epoch": 0.836504353434376, + "grad_norm": 0.9323885982962973, + "learning_rate": 3.564498915760947e-07, + "loss": 0.1246, + "step": 9079 + }, + { + "epoch": 0.836596489611646, + "grad_norm": 0.9202495340206708, + "learning_rate": 3.560579645118775e-07, + "loss": 0.1123, + "step": 9080 + }, + { + "epoch": 0.836688625788916, + "grad_norm": 0.9643478363526602, + "learning_rate": 3.556662365129082e-07, + "loss": 0.1242, + "step": 9081 + }, + { + "epoch": 0.836780761966186, + "grad_norm": 0.9643206560127056, + "learning_rate": 3.5527470761556024e-07, + "loss": 0.1313, + "step": 9082 + }, + { + "epoch": 0.836872898143456, + "grad_norm": 0.9966865526476313, + "learning_rate": 3.5488337785618607e-07, + "loss": 0.1277, + "step": 9083 + }, + { + "epoch": 0.836965034320726, + "grad_norm": 0.9510760700309292, + "learning_rate": 3.5449224727112094e-07, + "loss": 0.1239, + "step": 9084 + }, + { + "epoch": 0.837057170497996, + "grad_norm": 0.937440293224199, + "learning_rate": 3.541013158966816e-07, + "loss": 0.1185, + "step": 9085 + }, + { + "epoch": 0.837149306675266, + "grad_norm": 0.9609619870780663, + "learning_rate": 3.5371058376916733e-07, + "loss": 0.1136, + "step": 9086 + }, + { + "epoch": 0.8372414428525361, + "grad_norm": 0.9648051774960312, + "learning_rate": 3.5332005092485496e-07, + "loss": 0.1196, + "step": 9087 + }, + { + "epoch": 0.8373335790298061, + "grad_norm": 0.9183894976690667, + "learning_rate": 3.529297174000071e-07, + "loss": 0.1114, + "step": 9088 + }, + { + "epoch": 0.8374257152070761, + "grad_norm": 0.9582901426501818, + "learning_rate": 3.525395832308659e-07, + "loss": 0.114, + "step": 9089 + }, + { + "epoch": 0.8375178513843461, + "grad_norm": 0.9000415692013729, + "learning_rate": 3.52149648453656e-07, + "loss": 0.117, + "step": 9090 + }, + { + "epoch": 0.8376099875616161, + "grad_norm": 0.9444437649158218, + "learning_rate": 3.5175991310458207e-07, + "loss": 0.1179, + "step": 9091 + }, + { + "epoch": 0.8377021237388861, + "grad_norm": 0.941489120512911, + "learning_rate": 3.513703772198318e-07, + "loss": 0.1279, + "step": 9092 + }, + { + "epoch": 0.8377942599161561, + "grad_norm": 0.9868011195136057, + "learning_rate": 3.5098104083557264e-07, + "loss": 0.1223, + "step": 9093 + }, + { + "epoch": 0.8378863960934261, + "grad_norm": 0.8705477423458002, + "learning_rate": 3.5059190398795603e-07, + "loss": 0.1046, + "step": 9094 + }, + { + "epoch": 0.8379785322706961, + "grad_norm": 0.9114211371590364, + "learning_rate": 3.5020296671311154e-07, + "loss": 0.1206, + "step": 9095 + }, + { + "epoch": 0.8380706684479661, + "grad_norm": 0.8950724214900556, + "learning_rate": 3.498142290471529e-07, + "loss": 0.113, + "step": 9096 + }, + { + "epoch": 0.8381628046252361, + "grad_norm": 0.9521022208521928, + "learning_rate": 3.4942569102617534e-07, + "loss": 0.1135, + "step": 9097 + }, + { + "epoch": 0.8382549408025061, + "grad_norm": 0.8837117200628214, + "learning_rate": 3.490373526862531e-07, + "loss": 0.1017, + "step": 9098 + }, + { + "epoch": 0.8383470769797761, + "grad_norm": 0.8962134046824323, + "learning_rate": 3.4864921406344483e-07, + "loss": 0.1135, + "step": 9099 + }, + { + "epoch": 0.8384392131570461, + "grad_norm": 0.8887076436299933, + "learning_rate": 3.482612751937878e-07, + "loss": 0.1116, + "step": 9100 + }, + { + "epoch": 0.8385313493343162, + "grad_norm": 0.9453219724054187, + "learning_rate": 3.4787353611330317e-07, + "loss": 0.1188, + "step": 9101 + }, + { + "epoch": 0.8386234855115862, + "grad_norm": 0.9381143409167717, + "learning_rate": 3.4748599685799274e-07, + "loss": 0.1167, + "step": 9102 + }, + { + "epoch": 0.8387156216888562, + "grad_norm": 0.9600988951205893, + "learning_rate": 3.470986574638391e-07, + "loss": 0.1087, + "step": 9103 + }, + { + "epoch": 0.8388077578661262, + "grad_norm": 0.984011588082167, + "learning_rate": 3.4671151796680654e-07, + "loss": 0.1325, + "step": 9104 + }, + { + "epoch": 0.8388998940433962, + "grad_norm": 0.9611615555873271, + "learning_rate": 3.46324578402841e-07, + "loss": 0.1305, + "step": 9105 + }, + { + "epoch": 0.8389920302206662, + "grad_norm": 0.9586580145664007, + "learning_rate": 3.459378388078702e-07, + "loss": 0.1225, + "step": 9106 + }, + { + "epoch": 0.8390841663979361, + "grad_norm": 0.9436411042590699, + "learning_rate": 3.4555129921780337e-07, + "loss": 0.117, + "step": 9107 + }, + { + "epoch": 0.8391763025752061, + "grad_norm": 0.9925703690511307, + "learning_rate": 3.4516495966853054e-07, + "loss": 0.1177, + "step": 9108 + }, + { + "epoch": 0.8392684387524761, + "grad_norm": 0.9548357208284326, + "learning_rate": 3.447788201959223e-07, + "loss": 0.1247, + "step": 9109 + }, + { + "epoch": 0.8393605749297461, + "grad_norm": 0.9972576838402836, + "learning_rate": 3.443928808358327e-07, + "loss": 0.1367, + "step": 9110 + }, + { + "epoch": 0.8394527111070161, + "grad_norm": 0.923211368304357, + "learning_rate": 3.4400714162409644e-07, + "loss": 0.1182, + "step": 9111 + }, + { + "epoch": 0.8395448472842861, + "grad_norm": 0.9053249971071133, + "learning_rate": 3.4362160259652887e-07, + "loss": 0.109, + "step": 9112 + }, + { + "epoch": 0.8396369834615561, + "grad_norm": 1.02878674745858, + "learning_rate": 3.4323626378892775e-07, + "loss": 0.1369, + "step": 9113 + }, + { + "epoch": 0.8397291196388262, + "grad_norm": 0.8885051736781546, + "learning_rate": 3.4285112523707143e-07, + "loss": 0.1153, + "step": 9114 + }, + { + "epoch": 0.8398212558160962, + "grad_norm": 0.9073801662881984, + "learning_rate": 3.424661869767207e-07, + "loss": 0.1089, + "step": 9115 + }, + { + "epoch": 0.8399133919933662, + "grad_norm": 0.928370265115947, + "learning_rate": 3.4208144904361613e-07, + "loss": 0.1121, + "step": 9116 + }, + { + "epoch": 0.8400055281706362, + "grad_norm": 0.9212881770336953, + "learning_rate": 3.4169691147348137e-07, + "loss": 0.1259, + "step": 9117 + }, + { + "epoch": 0.8400976643479062, + "grad_norm": 0.9788684272042728, + "learning_rate": 3.4131257430202124e-07, + "loss": 0.1266, + "step": 9118 + }, + { + "epoch": 0.8401898005251762, + "grad_norm": 0.9755763215915222, + "learning_rate": 3.4092843756492075e-07, + "loss": 0.1256, + "step": 9119 + }, + { + "epoch": 0.8402819367024462, + "grad_norm": 0.9751484992123327, + "learning_rate": 3.405445012978467e-07, + "loss": 0.1343, + "step": 9120 + }, + { + "epoch": 0.8403740728797162, + "grad_norm": 1.013270465324915, + "learning_rate": 3.4016076553644806e-07, + "loss": 0.1311, + "step": 9121 + }, + { + "epoch": 0.8404662090569862, + "grad_norm": 0.9903544054884075, + "learning_rate": 3.397772303163549e-07, + "loss": 0.1291, + "step": 9122 + }, + { + "epoch": 0.8405583452342562, + "grad_norm": 1.0099661007009835, + "learning_rate": 3.3939389567317876e-07, + "loss": 0.1196, + "step": 9123 + }, + { + "epoch": 0.8406504814115262, + "grad_norm": 0.9572207364409202, + "learning_rate": 3.39010761642512e-07, + "loss": 0.1328, + "step": 9124 + }, + { + "epoch": 0.8407426175887962, + "grad_norm": 0.9433063660527777, + "learning_rate": 3.386278282599281e-07, + "loss": 0.1244, + "step": 9125 + }, + { + "epoch": 0.8408347537660662, + "grad_norm": 0.9561219271919276, + "learning_rate": 3.3824509556098314e-07, + "loss": 0.1117, + "step": 9126 + }, + { + "epoch": 0.8409268899433362, + "grad_norm": 0.9526955884568686, + "learning_rate": 3.3786256358121327e-07, + "loss": 0.1171, + "step": 9127 + }, + { + "epoch": 0.8410190261206063, + "grad_norm": 0.9421580524280039, + "learning_rate": 3.37480232356138e-07, + "loss": 0.1185, + "step": 9128 + }, + { + "epoch": 0.8411111622978763, + "grad_norm": 0.9007740123953568, + "learning_rate": 3.3709810192125576e-07, + "loss": 0.1124, + "step": 9129 + }, + { + "epoch": 0.8412032984751463, + "grad_norm": 0.9257483251297324, + "learning_rate": 3.3671617231204683e-07, + "loss": 0.1231, + "step": 9130 + }, + { + "epoch": 0.8412954346524163, + "grad_norm": 0.9196705659888632, + "learning_rate": 3.3633444356397425e-07, + "loss": 0.1145, + "step": 9131 + }, + { + "epoch": 0.8413875708296863, + "grad_norm": 0.9501560375264937, + "learning_rate": 3.3595291571248216e-07, + "loss": 0.1174, + "step": 9132 + }, + { + "epoch": 0.8414797070069563, + "grad_norm": 0.9529617492932547, + "learning_rate": 3.355715887929939e-07, + "loss": 0.1235, + "step": 9133 + }, + { + "epoch": 0.8415718431842263, + "grad_norm": 0.9237901196829995, + "learning_rate": 3.351904628409172e-07, + "loss": 0.1113, + "step": 9134 + }, + { + "epoch": 0.8416639793614963, + "grad_norm": 0.9690381039536827, + "learning_rate": 3.348095378916386e-07, + "loss": 0.1195, + "step": 9135 + }, + { + "epoch": 0.8417561155387663, + "grad_norm": 0.9498145803212865, + "learning_rate": 3.34428813980528e-07, + "loss": 0.1163, + "step": 9136 + }, + { + "epoch": 0.8418482517160363, + "grad_norm": 0.8832640227036879, + "learning_rate": 3.3404829114293437e-07, + "loss": 0.1051, + "step": 9137 + }, + { + "epoch": 0.8419403878933063, + "grad_norm": 0.8932009094651336, + "learning_rate": 3.336679694141898e-07, + "loss": 0.1023, + "step": 9138 + }, + { + "epoch": 0.8420325240705763, + "grad_norm": 0.9251539366264352, + "learning_rate": 3.3328784882960817e-07, + "loss": 0.1073, + "step": 9139 + }, + { + "epoch": 0.8421246602478463, + "grad_norm": 0.9107574392909054, + "learning_rate": 3.3290792942448303e-07, + "loss": 0.1217, + "step": 9140 + }, + { + "epoch": 0.8422167964251164, + "grad_norm": 0.9267056836585295, + "learning_rate": 3.325282112340894e-07, + "loss": 0.1176, + "step": 9141 + }, + { + "epoch": 0.8423089326023864, + "grad_norm": 0.9393315446042251, + "learning_rate": 3.321486942936844e-07, + "loss": 0.123, + "step": 9142 + }, + { + "epoch": 0.8424010687796564, + "grad_norm": 0.9063891028187058, + "learning_rate": 3.317693786385065e-07, + "loss": 0.1099, + "step": 9143 + }, + { + "epoch": 0.8424932049569264, + "grad_norm": 0.9198452193152991, + "learning_rate": 3.3139026430377583e-07, + "loss": 0.1144, + "step": 9144 + }, + { + "epoch": 0.8425853411341964, + "grad_norm": 0.9107477136619364, + "learning_rate": 3.3101135132469237e-07, + "loss": 0.1102, + "step": 9145 + }, + { + "epoch": 0.8426774773114664, + "grad_norm": 0.9865477556790888, + "learning_rate": 3.306326397364379e-07, + "loss": 0.1203, + "step": 9146 + }, + { + "epoch": 0.8427696134887364, + "grad_norm": 0.9823050755373736, + "learning_rate": 3.3025412957417624e-07, + "loss": 0.1287, + "step": 9147 + }, + { + "epoch": 0.8428617496660064, + "grad_norm": 0.9272156435057771, + "learning_rate": 3.298758208730529e-07, + "loss": 0.1219, + "step": 9148 + }, + { + "epoch": 0.8429538858432764, + "grad_norm": 0.9622429969672128, + "learning_rate": 3.294977136681923e-07, + "loss": 0.1287, + "step": 9149 + }, + { + "epoch": 0.8430460220205463, + "grad_norm": 1.012662531193091, + "learning_rate": 3.291198079947033e-07, + "loss": 0.1199, + "step": 9150 + }, + { + "epoch": 0.8431381581978163, + "grad_norm": 0.903122594351644, + "learning_rate": 3.2874210388767313e-07, + "loss": 0.1208, + "step": 9151 + }, + { + "epoch": 0.8432302943750863, + "grad_norm": 0.9666984134750198, + "learning_rate": 3.2836460138217287e-07, + "loss": 0.1169, + "step": 9152 + }, + { + "epoch": 0.8433224305523563, + "grad_norm": 1.0222716317336589, + "learning_rate": 3.279873005132525e-07, + "loss": 0.1298, + "step": 9153 + }, + { + "epoch": 0.8434145667296263, + "grad_norm": 0.9444079754719301, + "learning_rate": 3.276102013159452e-07, + "loss": 0.1129, + "step": 9154 + }, + { + "epoch": 0.8435067029068964, + "grad_norm": 0.9178977470183698, + "learning_rate": 3.272333038252648e-07, + "loss": 0.1152, + "step": 9155 + }, + { + "epoch": 0.8435988390841664, + "grad_norm": 0.990355918764317, + "learning_rate": 3.2685660807620563e-07, + "loss": 0.122, + "step": 9156 + }, + { + "epoch": 0.8436909752614364, + "grad_norm": 0.994814328810813, + "learning_rate": 3.2648011410374463e-07, + "loss": 0.1273, + "step": 9157 + }, + { + "epoch": 0.8437831114387064, + "grad_norm": 0.9642134855127644, + "learning_rate": 3.2610382194283865e-07, + "loss": 0.115, + "step": 9158 + }, + { + "epoch": 0.8438752476159764, + "grad_norm": 0.998202330870969, + "learning_rate": 3.257277316284266e-07, + "loss": 0.1175, + "step": 9159 + }, + { + "epoch": 0.8439673837932464, + "grad_norm": 0.9220879009717579, + "learning_rate": 3.253518431954286e-07, + "loss": 0.1144, + "step": 9160 + }, + { + "epoch": 0.8440595199705164, + "grad_norm": 0.9456319757338142, + "learning_rate": 3.249761566787474e-07, + "loss": 0.1138, + "step": 9161 + }, + { + "epoch": 0.8441516561477864, + "grad_norm": 0.9517620248803692, + "learning_rate": 3.2460067211326274e-07, + "loss": 0.1183, + "step": 9162 + }, + { + "epoch": 0.8442437923250564, + "grad_norm": 0.9205843267337143, + "learning_rate": 3.2422538953383986e-07, + "loss": 0.1125, + "step": 9163 + }, + { + "epoch": 0.8443359285023264, + "grad_norm": 0.9913122073528652, + "learning_rate": 3.2385030897532364e-07, + "loss": 0.1214, + "step": 9164 + }, + { + "epoch": 0.8444280646795964, + "grad_norm": 0.9507436181178945, + "learning_rate": 3.234754304725413e-07, + "loss": 0.1172, + "step": 9165 + }, + { + "epoch": 0.8445202008568664, + "grad_norm": 0.9003711742448463, + "learning_rate": 3.2310075406029875e-07, + "loss": 0.1084, + "step": 9166 + }, + { + "epoch": 0.8446123370341364, + "grad_norm": 1.0251546343565063, + "learning_rate": 3.22726279773386e-07, + "loss": 0.1329, + "step": 9167 + }, + { + "epoch": 0.8447044732114064, + "grad_norm": 0.9484851964975073, + "learning_rate": 3.223520076465719e-07, + "loss": 0.1116, + "step": 9168 + }, + { + "epoch": 0.8447966093886765, + "grad_norm": 0.9361520273962691, + "learning_rate": 3.219779377146087e-07, + "loss": 0.1112, + "step": 9169 + }, + { + "epoch": 0.8448887455659465, + "grad_norm": 0.9910150266298312, + "learning_rate": 3.2160407001222805e-07, + "loss": 0.1319, + "step": 9170 + }, + { + "epoch": 0.8449808817432165, + "grad_norm": 0.9437007956534685, + "learning_rate": 3.2123040457414377e-07, + "loss": 0.1196, + "step": 9171 + }, + { + "epoch": 0.8450730179204865, + "grad_norm": 0.9162880175898103, + "learning_rate": 3.208569414350515e-07, + "loss": 0.1205, + "step": 9172 + }, + { + "epoch": 0.8451651540977565, + "grad_norm": 0.9328157543095837, + "learning_rate": 3.204836806296269e-07, + "loss": 0.1247, + "step": 9173 + }, + { + "epoch": 0.8452572902750265, + "grad_norm": 1.0276232815812092, + "learning_rate": 3.201106221925265e-07, + "loss": 0.1333, + "step": 9174 + }, + { + "epoch": 0.8453494264522965, + "grad_norm": 0.8934105754776828, + "learning_rate": 3.197377661583892e-07, + "loss": 0.1047, + "step": 9175 + }, + { + "epoch": 0.8454415626295665, + "grad_norm": 0.8901250527712635, + "learning_rate": 3.1936511256183524e-07, + "loss": 0.108, + "step": 9176 + }, + { + "epoch": 0.8455336988068365, + "grad_norm": 0.9456476762090105, + "learning_rate": 3.1899266143746556e-07, + "loss": 0.119, + "step": 9177 + }, + { + "epoch": 0.8456258349841065, + "grad_norm": 0.9369759218485262, + "learning_rate": 3.1862041281986224e-07, + "loss": 0.1151, + "step": 9178 + }, + { + "epoch": 0.8457179711613765, + "grad_norm": 0.9202364542455173, + "learning_rate": 3.182483667435876e-07, + "loss": 0.1141, + "step": 9179 + }, + { + "epoch": 0.8458101073386465, + "grad_norm": 0.8767559307939479, + "learning_rate": 3.1787652324318715e-07, + "loss": 0.107, + "step": 9180 + }, + { + "epoch": 0.8459022435159165, + "grad_norm": 0.9779144285133847, + "learning_rate": 3.1750488235318675e-07, + "loss": 0.1229, + "step": 9181 + }, + { + "epoch": 0.8459943796931866, + "grad_norm": 1.0063964020443654, + "learning_rate": 3.171334441080923e-07, + "loss": 0.124, + "step": 9182 + }, + { + "epoch": 0.8460865158704566, + "grad_norm": 0.9256576366855372, + "learning_rate": 3.1676220854239326e-07, + "loss": 0.1179, + "step": 9183 + }, + { + "epoch": 0.8461786520477266, + "grad_norm": 0.9470305673494429, + "learning_rate": 3.1639117569055744e-07, + "loss": 0.1211, + "step": 9184 + }, + { + "epoch": 0.8462707882249966, + "grad_norm": 0.9269075595700427, + "learning_rate": 3.160203455870359e-07, + "loss": 0.115, + "step": 9185 + }, + { + "epoch": 0.8463629244022666, + "grad_norm": 0.9437783625013559, + "learning_rate": 3.156497182662607e-07, + "loss": 0.1096, + "step": 9186 + }, + { + "epoch": 0.8464550605795366, + "grad_norm": 1.0404071839324958, + "learning_rate": 3.1527929376264393e-07, + "loss": 0.1346, + "step": 9187 + }, + { + "epoch": 0.8465471967568066, + "grad_norm": 0.9384504047967224, + "learning_rate": 3.149090721105805e-07, + "loss": 0.1204, + "step": 9188 + }, + { + "epoch": 0.8466393329340766, + "grad_norm": 0.9687143806916653, + "learning_rate": 3.145390533444442e-07, + "loss": 0.1205, + "step": 9189 + }, + { + "epoch": 0.8467314691113466, + "grad_norm": 0.9811809575277504, + "learning_rate": 3.1416923749859244e-07, + "loss": 0.124, + "step": 9190 + }, + { + "epoch": 0.8468236052886166, + "grad_norm": 0.9215113535637731, + "learning_rate": 3.1379962460736183e-07, + "loss": 0.118, + "step": 9191 + }, + { + "epoch": 0.8469157414658866, + "grad_norm": 0.9453843360808005, + "learning_rate": 3.1343021470507134e-07, + "loss": 0.1262, + "step": 9192 + }, + { + "epoch": 0.8470078776431565, + "grad_norm": 0.9679969128743854, + "learning_rate": 3.1306100782602126e-07, + "loss": 0.1217, + "step": 9193 + }, + { + "epoch": 0.8471000138204265, + "grad_norm": 0.943336778635726, + "learning_rate": 3.12692004004492e-07, + "loss": 0.1292, + "step": 9194 + }, + { + "epoch": 0.8471921499976965, + "grad_norm": 0.9273773600918996, + "learning_rate": 3.1232320327474504e-07, + "loss": 0.111, + "step": 9195 + }, + { + "epoch": 0.8472842861749666, + "grad_norm": 0.9294729570727469, + "learning_rate": 3.11954605671024e-07, + "loss": 0.1091, + "step": 9196 + }, + { + "epoch": 0.8473764223522366, + "grad_norm": 1.0014883154780123, + "learning_rate": 3.1158621122755336e-07, + "loss": 0.1209, + "step": 9197 + }, + { + "epoch": 0.8474685585295066, + "grad_norm": 0.9293479352770679, + "learning_rate": 3.112180199785389e-07, + "loss": 0.1186, + "step": 9198 + }, + { + "epoch": 0.8475606947067766, + "grad_norm": 0.925782398043939, + "learning_rate": 3.1085003195816697e-07, + "loss": 0.1168, + "step": 9199 + }, + { + "epoch": 0.8476528308840466, + "grad_norm": 0.9633031131247594, + "learning_rate": 3.1048224720060473e-07, + "loss": 0.1167, + "step": 9200 + }, + { + "epoch": 0.8477449670613166, + "grad_norm": 0.883630976694059, + "learning_rate": 3.1011466574000144e-07, + "loss": 0.1063, + "step": 9201 + }, + { + "epoch": 0.8478371032385866, + "grad_norm": 0.8883849522188346, + "learning_rate": 3.097472876104876e-07, + "loss": 0.1141, + "step": 9202 + }, + { + "epoch": 0.8479292394158566, + "grad_norm": 0.9053676263367607, + "learning_rate": 3.093801128461735e-07, + "loss": 0.1119, + "step": 9203 + }, + { + "epoch": 0.8480213755931266, + "grad_norm": 0.8851968674906724, + "learning_rate": 3.0901314148115203e-07, + "loss": 0.1122, + "step": 9204 + }, + { + "epoch": 0.8481135117703966, + "grad_norm": 0.9304807734264189, + "learning_rate": 3.0864637354949545e-07, + "loss": 0.1162, + "step": 9205 + }, + { + "epoch": 0.8482056479476666, + "grad_norm": 0.956308995162976, + "learning_rate": 3.082798090852596e-07, + "loss": 0.1159, + "step": 9206 + }, + { + "epoch": 0.8482977841249366, + "grad_norm": 0.938749830136273, + "learning_rate": 3.079134481224788e-07, + "loss": 0.1139, + "step": 9207 + }, + { + "epoch": 0.8483899203022066, + "grad_norm": 0.9347902376284178, + "learning_rate": 3.075472906951704e-07, + "loss": 0.1213, + "step": 9208 + }, + { + "epoch": 0.8484820564794767, + "grad_norm": 0.9532417511976031, + "learning_rate": 3.0718133683733247e-07, + "loss": 0.1308, + "step": 9209 + }, + { + "epoch": 0.8485741926567467, + "grad_norm": 0.9022635398220523, + "learning_rate": 3.0681558658294334e-07, + "loss": 0.114, + "step": 9210 + }, + { + "epoch": 0.8486663288340167, + "grad_norm": 0.953635552226034, + "learning_rate": 3.0645003996596254e-07, + "loss": 0.1167, + "step": 9211 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 0.9560485444002419, + "learning_rate": 3.0608469702033185e-07, + "loss": 0.1199, + "step": 9212 + }, + { + "epoch": 0.8488506011885567, + "grad_norm": 0.9130642873786164, + "learning_rate": 3.057195577799729e-07, + "loss": 0.1144, + "step": 9213 + }, + { + "epoch": 0.8489427373658267, + "grad_norm": 0.9626403344881101, + "learning_rate": 3.053546222787895e-07, + "loss": 0.1234, + "step": 9214 + }, + { + "epoch": 0.8490348735430967, + "grad_norm": 0.9327186071825609, + "learning_rate": 3.0498989055066597e-07, + "loss": 0.1122, + "step": 9215 + }, + { + "epoch": 0.8491270097203667, + "grad_norm": 0.9708981271608061, + "learning_rate": 3.046253626294665e-07, + "loss": 0.1278, + "step": 9216 + }, + { + "epoch": 0.8492191458976367, + "grad_norm": 0.8690144829639436, + "learning_rate": 3.0426103854903856e-07, + "loss": 0.1021, + "step": 9217 + }, + { + "epoch": 0.8493112820749067, + "grad_norm": 0.928018364641159, + "learning_rate": 3.0389691834320944e-07, + "loss": 0.1133, + "step": 9218 + }, + { + "epoch": 0.8494034182521767, + "grad_norm": 0.8965255066759291, + "learning_rate": 3.0353300204578854e-07, + "loss": 0.1102, + "step": 9219 + }, + { + "epoch": 0.8494955544294467, + "grad_norm": 0.8911346876173641, + "learning_rate": 3.031692896905644e-07, + "loss": 0.1097, + "step": 9220 + }, + { + "epoch": 0.8495876906067167, + "grad_norm": 0.96694668047903, + "learning_rate": 3.0280578131130805e-07, + "loss": 0.1198, + "step": 9221 + }, + { + "epoch": 0.8496798267839867, + "grad_norm": 0.8842727538731018, + "learning_rate": 3.024424769417711e-07, + "loss": 0.1018, + "step": 9222 + }, + { + "epoch": 0.8497719629612568, + "grad_norm": 0.9233797044528547, + "learning_rate": 3.020793766156871e-07, + "loss": 0.1181, + "step": 9223 + }, + { + "epoch": 0.8498640991385268, + "grad_norm": 0.9490387802732133, + "learning_rate": 3.017164803667691e-07, + "loss": 0.1215, + "step": 9224 + }, + { + "epoch": 0.8499562353157968, + "grad_norm": 1.0032873233600246, + "learning_rate": 3.013537882287132e-07, + "loss": 0.1153, + "step": 9225 + }, + { + "epoch": 0.8500483714930668, + "grad_norm": 0.9287605713152437, + "learning_rate": 3.0099130023519384e-07, + "loss": 0.1153, + "step": 9226 + }, + { + "epoch": 0.8501405076703368, + "grad_norm": 0.9260430559067087, + "learning_rate": 3.0062901641986967e-07, + "loss": 0.1125, + "step": 9227 + }, + { + "epoch": 0.8502326438476068, + "grad_norm": 0.9559548455870349, + "learning_rate": 3.002669368163774e-07, + "loss": 0.1192, + "step": 9228 + }, + { + "epoch": 0.8503247800248768, + "grad_norm": 0.9856506355542799, + "learning_rate": 2.999050614583368e-07, + "loss": 0.1319, + "step": 9229 + }, + { + "epoch": 0.8504169162021468, + "grad_norm": 0.9733649361017951, + "learning_rate": 2.995433903793485e-07, + "loss": 0.1178, + "step": 9230 + }, + { + "epoch": 0.8505090523794168, + "grad_norm": 0.9179807223176983, + "learning_rate": 2.9918192361299335e-07, + "loss": 0.1113, + "step": 9231 + }, + { + "epoch": 0.8506011885566868, + "grad_norm": 0.9337430679762098, + "learning_rate": 2.988206611928329e-07, + "loss": 0.1157, + "step": 9232 + }, + { + "epoch": 0.8506933247339568, + "grad_norm": 0.9471775562002914, + "learning_rate": 2.984596031524109e-07, + "loss": 0.1199, + "step": 9233 + }, + { + "epoch": 0.8507854609112268, + "grad_norm": 0.8764880163175652, + "learning_rate": 2.980987495252516e-07, + "loss": 0.0979, + "step": 9234 + }, + { + "epoch": 0.8508775970884968, + "grad_norm": 0.9735124287920944, + "learning_rate": 2.9773810034486095e-07, + "loss": 0.128, + "step": 9235 + }, + { + "epoch": 0.8509697332657667, + "grad_norm": 0.9763026567613571, + "learning_rate": 2.973776556447247e-07, + "loss": 0.117, + "step": 9236 + }, + { + "epoch": 0.8510618694430369, + "grad_norm": 0.9657511144374241, + "learning_rate": 2.970174154583097e-07, + "loss": 0.1349, + "step": 9237 + }, + { + "epoch": 0.8511540056203069, + "grad_norm": 0.9732217261684756, + "learning_rate": 2.9665737981906475e-07, + "loss": 0.1157, + "step": 9238 + }, + { + "epoch": 0.8512461417975768, + "grad_norm": 0.9557352112393618, + "learning_rate": 2.962975487604197e-07, + "loss": 0.1131, + "step": 9239 + }, + { + "epoch": 0.8513382779748468, + "grad_norm": 0.879001540732117, + "learning_rate": 2.9593792231578407e-07, + "loss": 0.1091, + "step": 9240 + }, + { + "epoch": 0.8514304141521168, + "grad_norm": 0.9464176135109578, + "learning_rate": 2.9557850051854935e-07, + "loss": 0.1133, + "step": 9241 + }, + { + "epoch": 0.8515225503293868, + "grad_norm": 0.9042055512690569, + "learning_rate": 2.9521928340208867e-07, + "loss": 0.1073, + "step": 9242 + }, + { + "epoch": 0.8516146865066568, + "grad_norm": 0.9290529203658289, + "learning_rate": 2.9486027099975416e-07, + "loss": 0.1111, + "step": 9243 + }, + { + "epoch": 0.8517068226839268, + "grad_norm": 0.9999503573416042, + "learning_rate": 2.9450146334488144e-07, + "loss": 0.1238, + "step": 9244 + }, + { + "epoch": 0.8517989588611968, + "grad_norm": 0.9769975220666243, + "learning_rate": 2.9414286047078495e-07, + "loss": 0.1185, + "step": 9245 + }, + { + "epoch": 0.8518910950384668, + "grad_norm": 0.9805675881687931, + "learning_rate": 2.937844624107608e-07, + "loss": 0.1249, + "step": 9246 + }, + { + "epoch": 0.8519832312157368, + "grad_norm": 0.9392593771275237, + "learning_rate": 2.934262691980877e-07, + "loss": 0.1271, + "step": 9247 + }, + { + "epoch": 0.8520753673930068, + "grad_norm": 0.9420394121292485, + "learning_rate": 2.930682808660226e-07, + "loss": 0.1238, + "step": 9248 + }, + { + "epoch": 0.8521675035702768, + "grad_norm": 0.9336936611055597, + "learning_rate": 2.927104974478048e-07, + "loss": 0.112, + "step": 9249 + }, + { + "epoch": 0.8522596397475469, + "grad_norm": 0.9187592636253514, + "learning_rate": 2.9235291897665497e-07, + "loss": 0.1095, + "step": 9250 + }, + { + "epoch": 0.8523517759248169, + "grad_norm": 0.9382325552043895, + "learning_rate": 2.91995545485774e-07, + "loss": 0.123, + "step": 9251 + }, + { + "epoch": 0.8524439121020869, + "grad_norm": 0.9385540114569236, + "learning_rate": 2.9163837700834473e-07, + "loss": 0.1111, + "step": 9252 + }, + { + "epoch": 0.8525360482793569, + "grad_norm": 1.0194309807321986, + "learning_rate": 2.912814135775299e-07, + "loss": 0.1201, + "step": 9253 + }, + { + "epoch": 0.8526281844566269, + "grad_norm": 0.9422330543550835, + "learning_rate": 2.909246552264733e-07, + "loss": 0.1185, + "step": 9254 + }, + { + "epoch": 0.8527203206338969, + "grad_norm": 0.9714237671246142, + "learning_rate": 2.905681019882997e-07, + "loss": 0.1233, + "step": 9255 + }, + { + "epoch": 0.8528124568111669, + "grad_norm": 0.889642435524911, + "learning_rate": 2.902117538961166e-07, + "loss": 0.1089, + "step": 9256 + }, + { + "epoch": 0.8529045929884369, + "grad_norm": 0.9627728725749669, + "learning_rate": 2.898556109830092e-07, + "loss": 0.1232, + "step": 9257 + }, + { + "epoch": 0.8529967291657069, + "grad_norm": 0.924709180557171, + "learning_rate": 2.894996732820468e-07, + "loss": 0.1198, + "step": 9258 + }, + { + "epoch": 0.8530888653429769, + "grad_norm": 0.995663879425923, + "learning_rate": 2.8914394082627694e-07, + "loss": 0.1308, + "step": 9259 + }, + { + "epoch": 0.8531810015202469, + "grad_norm": 0.8870383336303369, + "learning_rate": 2.8878841364873067e-07, + "loss": 0.108, + "step": 9260 + }, + { + "epoch": 0.8532731376975169, + "grad_norm": 0.939204703933531, + "learning_rate": 2.8843309178241766e-07, + "loss": 0.1229, + "step": 9261 + }, + { + "epoch": 0.8533652738747869, + "grad_norm": 0.9353801215931173, + "learning_rate": 2.880779752603302e-07, + "loss": 0.117, + "step": 9262 + }, + { + "epoch": 0.8534574100520569, + "grad_norm": 0.9319288195935074, + "learning_rate": 2.877230641154413e-07, + "loss": 0.1109, + "step": 9263 + }, + { + "epoch": 0.853549546229327, + "grad_norm": 0.9812789745070393, + "learning_rate": 2.873683583807038e-07, + "loss": 0.1217, + "step": 9264 + }, + { + "epoch": 0.853641682406597, + "grad_norm": 0.9535806751362134, + "learning_rate": 2.8701385808905217e-07, + "loss": 0.1217, + "step": 9265 + }, + { + "epoch": 0.853733818583867, + "grad_norm": 1.0046894585013302, + "learning_rate": 2.8665956327340175e-07, + "loss": 0.1176, + "step": 9266 + }, + { + "epoch": 0.853825954761137, + "grad_norm": 0.9303632824861731, + "learning_rate": 2.8630547396664905e-07, + "loss": 0.1179, + "step": 9267 + }, + { + "epoch": 0.853918090938407, + "grad_norm": 0.9387604544667134, + "learning_rate": 2.8595159020167186e-07, + "loss": 0.1261, + "step": 9268 + }, + { + "epoch": 0.854010227115677, + "grad_norm": 0.9702272703985197, + "learning_rate": 2.855979120113278e-07, + "loss": 0.1234, + "step": 9269 + }, + { + "epoch": 0.854102363292947, + "grad_norm": 0.9525420190866508, + "learning_rate": 2.8524443942845567e-07, + "loss": 0.1259, + "step": 9270 + }, + { + "epoch": 0.854194499470217, + "grad_norm": 0.9265347955081413, + "learning_rate": 2.848911724858755e-07, + "loss": 0.1159, + "step": 9271 + }, + { + "epoch": 0.854286635647487, + "grad_norm": 0.8711773749514531, + "learning_rate": 2.8453811121638834e-07, + "loss": 0.1004, + "step": 9272 + }, + { + "epoch": 0.854378771824757, + "grad_norm": 0.9477565096775316, + "learning_rate": 2.841852556527763e-07, + "loss": 0.1227, + "step": 9273 + }, + { + "epoch": 0.854470908002027, + "grad_norm": 0.9126296349395749, + "learning_rate": 2.8383260582780206e-07, + "loss": 0.1102, + "step": 9274 + }, + { + "epoch": 0.854563044179297, + "grad_norm": 0.9525387630469448, + "learning_rate": 2.8348016177420833e-07, + "loss": 0.1181, + "step": 9275 + }, + { + "epoch": 0.854655180356567, + "grad_norm": 0.9081397804878282, + "learning_rate": 2.8312792352472003e-07, + "loss": 0.1146, + "step": 9276 + }, + { + "epoch": 0.8547473165338371, + "grad_norm": 0.9292285804423576, + "learning_rate": 2.8277589111204315e-07, + "loss": 0.1052, + "step": 9277 + }, + { + "epoch": 0.8548394527111071, + "grad_norm": 0.957473562369141, + "learning_rate": 2.824240645688628e-07, + "loss": 0.1172, + "step": 9278 + }, + { + "epoch": 0.8549315888883771, + "grad_norm": 0.9231740975710168, + "learning_rate": 2.8207244392784715e-07, + "loss": 0.1088, + "step": 9279 + }, + { + "epoch": 0.8550237250656471, + "grad_norm": 0.9829296508725345, + "learning_rate": 2.817210292216435e-07, + "loss": 0.1206, + "step": 9280 + }, + { + "epoch": 0.855115861242917, + "grad_norm": 0.9620000672282338, + "learning_rate": 2.813698204828816e-07, + "loss": 0.1142, + "step": 9281 + }, + { + "epoch": 0.855207997420187, + "grad_norm": 0.9896587979379299, + "learning_rate": 2.8101881774416975e-07, + "loss": 0.1293, + "step": 9282 + }, + { + "epoch": 0.855300133597457, + "grad_norm": 0.9736012672651115, + "learning_rate": 2.806680210380999e-07, + "loss": 0.116, + "step": 9283 + }, + { + "epoch": 0.855392269774727, + "grad_norm": 0.946172637330825, + "learning_rate": 2.8031743039724337e-07, + "loss": 0.1172, + "step": 9284 + }, + { + "epoch": 0.855484405951997, + "grad_norm": 0.9513361916546126, + "learning_rate": 2.7996704585415227e-07, + "loss": 0.1127, + "step": 9285 + }, + { + "epoch": 0.855576542129267, + "grad_norm": 0.881332691957772, + "learning_rate": 2.796168674413596e-07, + "loss": 0.1091, + "step": 9286 + }, + { + "epoch": 0.855668678306537, + "grad_norm": 0.9543852740058231, + "learning_rate": 2.7926689519137963e-07, + "loss": 0.1069, + "step": 9287 + }, + { + "epoch": 0.855760814483807, + "grad_norm": 1.0190560153967547, + "learning_rate": 2.7891712913670765e-07, + "loss": 0.127, + "step": 9288 + }, + { + "epoch": 0.855852950661077, + "grad_norm": 0.9276584149377606, + "learning_rate": 2.785675693098194e-07, + "loss": 0.1174, + "step": 9289 + }, + { + "epoch": 0.855945086838347, + "grad_norm": 0.9138106971403834, + "learning_rate": 2.782182157431718e-07, + "loss": 0.1122, + "step": 9290 + }, + { + "epoch": 0.8560372230156171, + "grad_norm": 0.9154240943613224, + "learning_rate": 2.778690684692012e-07, + "loss": 0.1131, + "step": 9291 + }, + { + "epoch": 0.8561293591928871, + "grad_norm": 1.0292627495010502, + "learning_rate": 2.7752012752032683e-07, + "loss": 0.1357, + "step": 9292 + }, + { + "epoch": 0.8562214953701571, + "grad_norm": 0.8692529164590608, + "learning_rate": 2.7717139292894824e-07, + "loss": 0.1097, + "step": 9293 + }, + { + "epoch": 0.8563136315474271, + "grad_norm": 0.9380278752902587, + "learning_rate": 2.768228647274446e-07, + "loss": 0.1174, + "step": 9294 + }, + { + "epoch": 0.8564057677246971, + "grad_norm": 0.9121772464230438, + "learning_rate": 2.7647454294817773e-07, + "loss": 0.1146, + "step": 9295 + }, + { + "epoch": 0.8564979039019671, + "grad_norm": 0.9604678113327662, + "learning_rate": 2.7612642762348844e-07, + "loss": 0.1086, + "step": 9296 + }, + { + "epoch": 0.8565900400792371, + "grad_norm": 0.9726017733465043, + "learning_rate": 2.757785187857001e-07, + "loss": 0.1143, + "step": 9297 + }, + { + "epoch": 0.8566821762565071, + "grad_norm": 0.9420082236664085, + "learning_rate": 2.7543081646711487e-07, + "loss": 0.1122, + "step": 9298 + }, + { + "epoch": 0.8567743124337771, + "grad_norm": 0.9700969708005628, + "learning_rate": 2.7508332070001807e-07, + "loss": 0.1133, + "step": 9299 + }, + { + "epoch": 0.8568664486110471, + "grad_norm": 0.9530651641631397, + "learning_rate": 2.74736031516675e-07, + "loss": 0.1231, + "step": 9300 + }, + { + "epoch": 0.8569585847883171, + "grad_norm": 0.9404951910508061, + "learning_rate": 2.7438894894933013e-07, + "loss": 0.125, + "step": 9301 + }, + { + "epoch": 0.8570507209655871, + "grad_norm": 0.9157359729471527, + "learning_rate": 2.7404207303021153e-07, + "loss": 0.1151, + "step": 9302 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.9539864507426042, + "learning_rate": 2.736954037915254e-07, + "loss": 0.1224, + "step": 9303 + }, + { + "epoch": 0.8572349933201272, + "grad_norm": 1.0345955402987674, + "learning_rate": 2.733489412654608e-07, + "loss": 0.1293, + "step": 9304 + }, + { + "epoch": 0.8573271294973972, + "grad_norm": 0.9601556995270799, + "learning_rate": 2.730026854841869e-07, + "loss": 0.1279, + "step": 9305 + }, + { + "epoch": 0.8574192656746672, + "grad_norm": 0.9853873195926093, + "learning_rate": 2.7265663647985357e-07, + "loss": 0.127, + "step": 9306 + }, + { + "epoch": 0.8575114018519372, + "grad_norm": 0.9441216337976664, + "learning_rate": 2.723107942845907e-07, + "loss": 0.1252, + "step": 9307 + }, + { + "epoch": 0.8576035380292072, + "grad_norm": 0.9547990137449248, + "learning_rate": 2.7196515893051003e-07, + "loss": 0.1186, + "step": 9308 + }, + { + "epoch": 0.8576956742064772, + "grad_norm": 0.9812925813959527, + "learning_rate": 2.7161973044970453e-07, + "loss": 0.1247, + "step": 9309 + }, + { + "epoch": 0.8577878103837472, + "grad_norm": 0.9238639272845356, + "learning_rate": 2.712745088742472e-07, + "loss": 0.1087, + "step": 9310 + }, + { + "epoch": 0.8578799465610172, + "grad_norm": 0.9477555453822856, + "learning_rate": 2.7092949423619145e-07, + "loss": 0.1201, + "step": 9311 + }, + { + "epoch": 0.8579720827382872, + "grad_norm": 0.9510317673971791, + "learning_rate": 2.7058468656757183e-07, + "loss": 0.1166, + "step": 9312 + }, + { + "epoch": 0.8580642189155572, + "grad_norm": 0.8710369385781217, + "learning_rate": 2.702400859004037e-07, + "loss": 0.108, + "step": 9313 + }, + { + "epoch": 0.8581563550928272, + "grad_norm": 0.9460929888604528, + "learning_rate": 2.698956922666843e-07, + "loss": 0.1199, + "step": 9314 + }, + { + "epoch": 0.8582484912700972, + "grad_norm": 0.9849952167511389, + "learning_rate": 2.695515056983894e-07, + "loss": 0.116, + "step": 9315 + }, + { + "epoch": 0.8583406274473672, + "grad_norm": 0.9495136032256005, + "learning_rate": 2.692075262274771e-07, + "loss": 0.1138, + "step": 9316 + }, + { + "epoch": 0.8584327636246372, + "grad_norm": 0.970959517015624, + "learning_rate": 2.6886375388588656e-07, + "loss": 0.125, + "step": 9317 + }, + { + "epoch": 0.8585248998019073, + "grad_norm": 0.9827648328416002, + "learning_rate": 2.6852018870553664e-07, + "loss": 0.1359, + "step": 9318 + }, + { + "epoch": 0.8586170359791773, + "grad_norm": 0.9965665810884955, + "learning_rate": 2.6817683071832687e-07, + "loss": 0.1345, + "step": 9319 + }, + { + "epoch": 0.8587091721564473, + "grad_norm": 0.8961725861415342, + "learning_rate": 2.6783367995613846e-07, + "loss": 0.1088, + "step": 9320 + }, + { + "epoch": 0.8588013083337173, + "grad_norm": 0.9104434301871034, + "learning_rate": 2.6749073645083333e-07, + "loss": 0.1125, + "step": 9321 + }, + { + "epoch": 0.8588934445109873, + "grad_norm": 0.9244596919139304, + "learning_rate": 2.6714800023425385e-07, + "loss": 0.1189, + "step": 9322 + }, + { + "epoch": 0.8589855806882573, + "grad_norm": 0.9368277789935434, + "learning_rate": 2.668054713382229e-07, + "loss": 0.1195, + "step": 9323 + }, + { + "epoch": 0.8590777168655273, + "grad_norm": 0.9729852872130562, + "learning_rate": 2.6646314979454386e-07, + "loss": 0.1161, + "step": 9324 + }, + { + "epoch": 0.8591698530427972, + "grad_norm": 0.947259066999539, + "learning_rate": 2.6612103563500165e-07, + "loss": 0.1269, + "step": 9325 + }, + { + "epoch": 0.8592619892200672, + "grad_norm": 0.9575031330131318, + "learning_rate": 2.657791288913622e-07, + "loss": 0.1229, + "step": 9326 + }, + { + "epoch": 0.8593541253973372, + "grad_norm": 0.8933964099773779, + "learning_rate": 2.6543742959537074e-07, + "loss": 0.1115, + "step": 9327 + }, + { + "epoch": 0.8594462615746072, + "grad_norm": 0.880212005558546, + "learning_rate": 2.650959377787549e-07, + "loss": 0.1084, + "step": 9328 + }, + { + "epoch": 0.8595383977518772, + "grad_norm": 0.9167841432310003, + "learning_rate": 2.647546534732209e-07, + "loss": 0.1102, + "step": 9329 + }, + { + "epoch": 0.8596305339291472, + "grad_norm": 0.9309781613665132, + "learning_rate": 2.6441357671045833e-07, + "loss": 0.1158, + "step": 9330 + }, + { + "epoch": 0.8597226701064172, + "grad_norm": 0.923428605833851, + "learning_rate": 2.640727075221361e-07, + "loss": 0.1164, + "step": 9331 + }, + { + "epoch": 0.8598148062836873, + "grad_norm": 0.9617325314977327, + "learning_rate": 2.637320459399031e-07, + "loss": 0.1201, + "step": 9332 + }, + { + "epoch": 0.8599069424609573, + "grad_norm": 0.8600354204904673, + "learning_rate": 2.6339159199539085e-07, + "loss": 0.1084, + "step": 9333 + }, + { + "epoch": 0.8599990786382273, + "grad_norm": 0.9258377390351011, + "learning_rate": 2.6305134572020943e-07, + "loss": 0.1143, + "step": 9334 + }, + { + "epoch": 0.8600912148154973, + "grad_norm": 0.9465789010041532, + "learning_rate": 2.6271130714595164e-07, + "loss": 0.1144, + "step": 9335 + }, + { + "epoch": 0.8601833509927673, + "grad_norm": 0.8715165159103272, + "learning_rate": 2.623714763041896e-07, + "loss": 0.1077, + "step": 9336 + }, + { + "epoch": 0.8602754871700373, + "grad_norm": 0.9303896406648642, + "learning_rate": 2.620318532264765e-07, + "loss": 0.1173, + "step": 9337 + }, + { + "epoch": 0.8603676233473073, + "grad_norm": 0.9786268133601739, + "learning_rate": 2.6169243794434725e-07, + "loss": 0.1242, + "step": 9338 + }, + { + "epoch": 0.8604597595245773, + "grad_norm": 0.9589918812170614, + "learning_rate": 2.613532304893163e-07, + "loss": 0.1232, + "step": 9339 + }, + { + "epoch": 0.8605518957018473, + "grad_norm": 0.8785947502166375, + "learning_rate": 2.610142308928779e-07, + "loss": 0.1135, + "step": 9340 + }, + { + "epoch": 0.8606440318791173, + "grad_norm": 0.9312433513423943, + "learning_rate": 2.6067543918650935e-07, + "loss": 0.1242, + "step": 9341 + }, + { + "epoch": 0.8607361680563873, + "grad_norm": 0.9325694372347554, + "learning_rate": 2.603368554016672e-07, + "loss": 0.1129, + "step": 9342 + }, + { + "epoch": 0.8608283042336573, + "grad_norm": 0.9469327317443748, + "learning_rate": 2.5999847956978963e-07, + "loss": 0.1158, + "step": 9343 + }, + { + "epoch": 0.8609204404109273, + "grad_norm": 0.90638245153095, + "learning_rate": 2.5966031172229427e-07, + "loss": 0.1109, + "step": 9344 + }, + { + "epoch": 0.8610125765881974, + "grad_norm": 0.9413146612471566, + "learning_rate": 2.593223518905796e-07, + "loss": 0.1157, + "step": 9345 + }, + { + "epoch": 0.8611047127654674, + "grad_norm": 0.8941304858723512, + "learning_rate": 2.589846001060259e-07, + "loss": 0.1084, + "step": 9346 + }, + { + "epoch": 0.8611968489427374, + "grad_norm": 0.9540972957902403, + "learning_rate": 2.586470563999935e-07, + "loss": 0.1254, + "step": 9347 + }, + { + "epoch": 0.8612889851200074, + "grad_norm": 0.9197783560409639, + "learning_rate": 2.5830972080382265e-07, + "loss": 0.1179, + "step": 9348 + }, + { + "epoch": 0.8613811212972774, + "grad_norm": 1.0135846500017713, + "learning_rate": 2.5797259334883613e-07, + "loss": 0.1269, + "step": 9349 + }, + { + "epoch": 0.8614732574745474, + "grad_norm": 0.9956640673249679, + "learning_rate": 2.5763567406633496e-07, + "loss": 0.1274, + "step": 9350 + }, + { + "epoch": 0.8615653936518174, + "grad_norm": 0.9982956385527713, + "learning_rate": 2.5729896298760325e-07, + "loss": 0.1256, + "step": 9351 + }, + { + "epoch": 0.8616575298290874, + "grad_norm": 0.9997035981643813, + "learning_rate": 2.569624601439039e-07, + "loss": 0.1186, + "step": 9352 + }, + { + "epoch": 0.8617496660063574, + "grad_norm": 0.9473032276378903, + "learning_rate": 2.566261655664812e-07, + "loss": 0.113, + "step": 9353 + }, + { + "epoch": 0.8618418021836274, + "grad_norm": 0.8904282708574572, + "learning_rate": 2.562900792865611e-07, + "loss": 0.1014, + "step": 9354 + }, + { + "epoch": 0.8619339383608974, + "grad_norm": 0.9421774910383477, + "learning_rate": 2.5595420133534887e-07, + "loss": 0.1127, + "step": 9355 + }, + { + "epoch": 0.8620260745381674, + "grad_norm": 0.9437685397025191, + "learning_rate": 2.5561853174402964e-07, + "loss": 0.1104, + "step": 9356 + }, + { + "epoch": 0.8621182107154374, + "grad_norm": 0.9365914283115228, + "learning_rate": 2.5528307054377145e-07, + "loss": 0.1077, + "step": 9357 + }, + { + "epoch": 0.8622103468927074, + "grad_norm": 0.9285492507636135, + "learning_rate": 2.549478177657219e-07, + "loss": 0.1051, + "step": 9358 + }, + { + "epoch": 0.8623024830699775, + "grad_norm": 0.9765891819348158, + "learning_rate": 2.546127734410095e-07, + "loss": 0.1252, + "step": 9359 + }, + { + "epoch": 0.8623946192472475, + "grad_norm": 0.9287397592156162, + "learning_rate": 2.542779376007426e-07, + "loss": 0.119, + "step": 9360 + }, + { + "epoch": 0.8624867554245175, + "grad_norm": 0.9004090598160094, + "learning_rate": 2.5394331027601056e-07, + "loss": 0.1024, + "step": 9361 + }, + { + "epoch": 0.8625788916017875, + "grad_norm": 0.9722848299604637, + "learning_rate": 2.5360889149788375e-07, + "loss": 0.1241, + "step": 9362 + }, + { + "epoch": 0.8626710277790575, + "grad_norm": 0.906785297368422, + "learning_rate": 2.532746812974132e-07, + "loss": 0.114, + "step": 9363 + }, + { + "epoch": 0.8627631639563275, + "grad_norm": 0.9589773361342592, + "learning_rate": 2.529406797056305e-07, + "loss": 0.1213, + "step": 9364 + }, + { + "epoch": 0.8628553001335975, + "grad_norm": 0.9966890284084031, + "learning_rate": 2.5260688675354806e-07, + "loss": 0.1229, + "step": 9365 + }, + { + "epoch": 0.8629474363108675, + "grad_norm": 0.9667983213030846, + "learning_rate": 2.5227330247215716e-07, + "loss": 0.1208, + "step": 9366 + }, + { + "epoch": 0.8630395724881375, + "grad_norm": 0.9291822525151221, + "learning_rate": 2.519399268924322e-07, + "loss": 0.1139, + "step": 9367 + }, + { + "epoch": 0.8631317086654074, + "grad_norm": 0.8994909111600728, + "learning_rate": 2.516067600453273e-07, + "loss": 0.1034, + "step": 9368 + }, + { + "epoch": 0.8632238448426774, + "grad_norm": 1.0207921145021992, + "learning_rate": 2.5127380196177634e-07, + "loss": 0.1284, + "step": 9369 + }, + { + "epoch": 0.8633159810199474, + "grad_norm": 0.9177180668568711, + "learning_rate": 2.509410526726952e-07, + "loss": 0.1095, + "step": 9370 + }, + { + "epoch": 0.8634081171972174, + "grad_norm": 0.8712347431050033, + "learning_rate": 2.5060851220897906e-07, + "loss": 0.1001, + "step": 9371 + }, + { + "epoch": 0.8635002533744875, + "grad_norm": 0.9886160970135562, + "learning_rate": 2.5027618060150526e-07, + "loss": 0.1185, + "step": 9372 + }, + { + "epoch": 0.8635923895517575, + "grad_norm": 0.9670625304491519, + "learning_rate": 2.4994405788112933e-07, + "loss": 0.1218, + "step": 9373 + }, + { + "epoch": 0.8636845257290275, + "grad_norm": 0.9168572699846476, + "learning_rate": 2.4961214407869e-07, + "loss": 0.1081, + "step": 9374 + }, + { + "epoch": 0.8637766619062975, + "grad_norm": 0.9599698651657775, + "learning_rate": 2.492804392250059e-07, + "loss": 0.1181, + "step": 9375 + }, + { + "epoch": 0.8638687980835675, + "grad_norm": 0.9619307269182482, + "learning_rate": 2.489489433508752e-07, + "loss": 0.1295, + "step": 9376 + }, + { + "epoch": 0.8639609342608375, + "grad_norm": 0.9576970010297813, + "learning_rate": 2.486176564870768e-07, + "loss": 0.1197, + "step": 9377 + }, + { + "epoch": 0.8640530704381075, + "grad_norm": 1.0203235684877179, + "learning_rate": 2.4828657866437123e-07, + "loss": 0.1371, + "step": 9378 + }, + { + "epoch": 0.8641452066153775, + "grad_norm": 0.9495758122219625, + "learning_rate": 2.479557099134991e-07, + "loss": 0.1164, + "step": 9379 + }, + { + "epoch": 0.8642373427926475, + "grad_norm": 0.9214388616761735, + "learning_rate": 2.4762505026518224e-07, + "loss": 0.1094, + "step": 9380 + }, + { + "epoch": 0.8643294789699175, + "grad_norm": 0.9413072214892965, + "learning_rate": 2.4729459975012194e-07, + "loss": 0.1158, + "step": 9381 + }, + { + "epoch": 0.8644216151471875, + "grad_norm": 0.9354397693146485, + "learning_rate": 2.46964358399e-07, + "loss": 0.1136, + "step": 9382 + }, + { + "epoch": 0.8645137513244575, + "grad_norm": 0.9088302027421223, + "learning_rate": 2.4663432624247975e-07, + "loss": 0.1124, + "step": 9383 + }, + { + "epoch": 0.8646058875017275, + "grad_norm": 0.8911370384300263, + "learning_rate": 2.4630450331120547e-07, + "loss": 0.1108, + "step": 9384 + }, + { + "epoch": 0.8646980236789975, + "grad_norm": 0.9630587035183339, + "learning_rate": 2.4597488963579995e-07, + "loss": 0.1162, + "step": 9385 + }, + { + "epoch": 0.8647901598562676, + "grad_norm": 0.9209292229525733, + "learning_rate": 2.4564548524686925e-07, + "loss": 0.1106, + "step": 9386 + }, + { + "epoch": 0.8648822960335376, + "grad_norm": 0.9432471817490239, + "learning_rate": 2.4531629017499724e-07, + "loss": 0.1097, + "step": 9387 + }, + { + "epoch": 0.8649744322108076, + "grad_norm": 0.9825332338153762, + "learning_rate": 2.449873044507503e-07, + "loss": 0.1241, + "step": 9388 + }, + { + "epoch": 0.8650665683880776, + "grad_norm": 0.9408846109600006, + "learning_rate": 2.446585281046751e-07, + "loss": 0.1127, + "step": 9389 + }, + { + "epoch": 0.8651587045653476, + "grad_norm": 0.9401838784688298, + "learning_rate": 2.443299611672981e-07, + "loss": 0.1269, + "step": 9390 + }, + { + "epoch": 0.8652508407426176, + "grad_norm": 0.9111818343603258, + "learning_rate": 2.44001603669127e-07, + "loss": 0.119, + "step": 9391 + }, + { + "epoch": 0.8653429769198876, + "grad_norm": 0.906464655426303, + "learning_rate": 2.4367345564065003e-07, + "loss": 0.1097, + "step": 9392 + }, + { + "epoch": 0.8654351130971576, + "grad_norm": 0.875938489337497, + "learning_rate": 2.433455171123356e-07, + "loss": 0.1069, + "step": 9393 + }, + { + "epoch": 0.8655272492744276, + "grad_norm": 0.9003402477495523, + "learning_rate": 2.4301778811463255e-07, + "loss": 0.1077, + "step": 9394 + }, + { + "epoch": 0.8656193854516976, + "grad_norm": 0.9109081886705044, + "learning_rate": 2.426902686779706e-07, + "loss": 0.1186, + "step": 9395 + }, + { + "epoch": 0.8657115216289676, + "grad_norm": 0.9609017270753635, + "learning_rate": 2.4236295883276e-07, + "loss": 0.1208, + "step": 9396 + }, + { + "epoch": 0.8658036578062376, + "grad_norm": 0.898910913025197, + "learning_rate": 2.420358586093921e-07, + "loss": 0.1047, + "step": 9397 + }, + { + "epoch": 0.8658957939835076, + "grad_norm": 0.9206843573844598, + "learning_rate": 2.4170896803823785e-07, + "loss": 0.1115, + "step": 9398 + }, + { + "epoch": 0.8659879301607776, + "grad_norm": 0.9272830235116014, + "learning_rate": 2.4138228714964853e-07, + "loss": 0.1091, + "step": 9399 + }, + { + "epoch": 0.8660800663380477, + "grad_norm": 0.9547090967557088, + "learning_rate": 2.4105581597395705e-07, + "loss": 0.1232, + "step": 9400 + }, + { + "epoch": 0.8661722025153177, + "grad_norm": 0.9480953903908746, + "learning_rate": 2.4072955454147643e-07, + "loss": 0.1203, + "step": 9401 + }, + { + "epoch": 0.8662643386925877, + "grad_norm": 0.9798031746500194, + "learning_rate": 2.4040350288249944e-07, + "loss": 0.1242, + "step": 9402 + }, + { + "epoch": 0.8663564748698577, + "grad_norm": 0.9522213276476387, + "learning_rate": 2.400776610273006e-07, + "loss": 0.1155, + "step": 9403 + }, + { + "epoch": 0.8664486110471277, + "grad_norm": 0.9347048484577208, + "learning_rate": 2.397520290061339e-07, + "loss": 0.1121, + "step": 9404 + }, + { + "epoch": 0.8665407472243977, + "grad_norm": 0.9501824992765463, + "learning_rate": 2.394266068492351e-07, + "loss": 0.1171, + "step": 9405 + }, + { + "epoch": 0.8666328834016677, + "grad_norm": 0.8502324502165575, + "learning_rate": 2.391013945868187e-07, + "loss": 0.0941, + "step": 9406 + }, + { + "epoch": 0.8667250195789377, + "grad_norm": 0.9133438904722627, + "learning_rate": 2.38776392249081e-07, + "loss": 0.1126, + "step": 9407 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 0.9547356127125062, + "learning_rate": 2.38451599866199e-07, + "loss": 0.1272, + "step": 9408 + }, + { + "epoch": 0.8669092919334777, + "grad_norm": 1.0557212654833275, + "learning_rate": 2.381270174683295e-07, + "loss": 0.1321, + "step": 9409 + }, + { + "epoch": 0.8670014281107477, + "grad_norm": 0.9335280236927755, + "learning_rate": 2.3780264508560942e-07, + "loss": 0.1165, + "step": 9410 + }, + { + "epoch": 0.8670935642880176, + "grad_norm": 0.9290477286297832, + "learning_rate": 2.3747848274815716e-07, + "loss": 0.1198, + "step": 9411 + }, + { + "epoch": 0.8671857004652876, + "grad_norm": 0.9789407678799273, + "learning_rate": 2.3715453048607118e-07, + "loss": 0.1263, + "step": 9412 + }, + { + "epoch": 0.8672778366425578, + "grad_norm": 1.0063340760093704, + "learning_rate": 2.368307883294313e-07, + "loss": 0.1116, + "step": 9413 + }, + { + "epoch": 0.8673699728198278, + "grad_norm": 0.9504096460020395, + "learning_rate": 2.3650725630829598e-07, + "loss": 0.1236, + "step": 9414 + }, + { + "epoch": 0.8674621089970977, + "grad_norm": 0.9909104584146814, + "learning_rate": 2.3618393445270504e-07, + "loss": 0.1241, + "step": 9415 + }, + { + "epoch": 0.8675542451743677, + "grad_norm": 0.8738058725690909, + "learning_rate": 2.3586082279267952e-07, + "loss": 0.1042, + "step": 9416 + }, + { + "epoch": 0.8676463813516377, + "grad_norm": 0.9207349471071707, + "learning_rate": 2.355379213582204e-07, + "loss": 0.121, + "step": 9417 + }, + { + "epoch": 0.8677385175289077, + "grad_norm": 0.9308193808957346, + "learning_rate": 2.3521523017930954e-07, + "loss": 0.1124, + "step": 9418 + }, + { + "epoch": 0.8678306537061777, + "grad_norm": 0.9365520666702065, + "learning_rate": 2.3489274928590795e-07, + "loss": 0.1225, + "step": 9419 + }, + { + "epoch": 0.8679227898834477, + "grad_norm": 0.9492932580673579, + "learning_rate": 2.3457047870795808e-07, + "loss": 0.1151, + "step": 9420 + }, + { + "epoch": 0.8680149260607177, + "grad_norm": 0.9490480593481834, + "learning_rate": 2.3424841847538292e-07, + "loss": 0.1168, + "step": 9421 + }, + { + "epoch": 0.8681070622379877, + "grad_norm": 0.9449269641389587, + "learning_rate": 2.3392656861808666e-07, + "loss": 0.1193, + "step": 9422 + }, + { + "epoch": 0.8681991984152577, + "grad_norm": 0.9233460585881546, + "learning_rate": 2.3360492916595174e-07, + "loss": 0.1207, + "step": 9423 + }, + { + "epoch": 0.8682913345925277, + "grad_norm": 0.9503848489495152, + "learning_rate": 2.332835001488437e-07, + "loss": 0.1091, + "step": 9424 + }, + { + "epoch": 0.8683834707697977, + "grad_norm": 0.9610048378704811, + "learning_rate": 2.3296228159660594e-07, + "loss": 0.125, + "step": 9425 + }, + { + "epoch": 0.8684756069470677, + "grad_norm": 0.9498156086623913, + "learning_rate": 2.3264127353906485e-07, + "loss": 0.1147, + "step": 9426 + }, + { + "epoch": 0.8685677431243378, + "grad_norm": 0.914848634914354, + "learning_rate": 2.323204760060252e-07, + "loss": 0.1096, + "step": 9427 + }, + { + "epoch": 0.8686598793016078, + "grad_norm": 0.8394385416081372, + "learning_rate": 2.3199988902727317e-07, + "loss": 0.1005, + "step": 9428 + }, + { + "epoch": 0.8687520154788778, + "grad_norm": 0.9696929167719233, + "learning_rate": 2.3167951263257633e-07, + "loss": 0.1304, + "step": 9429 + }, + { + "epoch": 0.8688441516561478, + "grad_norm": 0.9503528872420482, + "learning_rate": 2.313593468516806e-07, + "loss": 0.1127, + "step": 9430 + }, + { + "epoch": 0.8689362878334178, + "grad_norm": 0.9408948277709294, + "learning_rate": 2.3103939171431305e-07, + "loss": 0.1247, + "step": 9431 + }, + { + "epoch": 0.8690284240106878, + "grad_norm": 0.9920825083534364, + "learning_rate": 2.307196472501824e-07, + "loss": 0.1306, + "step": 9432 + }, + { + "epoch": 0.8691205601879578, + "grad_norm": 0.906482830614097, + "learning_rate": 2.3040011348897689e-07, + "loss": 0.1105, + "step": 9433 + }, + { + "epoch": 0.8692126963652278, + "grad_norm": 0.9142460311821695, + "learning_rate": 2.3008079046036525e-07, + "loss": 0.115, + "step": 9434 + }, + { + "epoch": 0.8693048325424978, + "grad_norm": 0.9322906020030624, + "learning_rate": 2.2976167819399652e-07, + "loss": 0.1076, + "step": 9435 + }, + { + "epoch": 0.8693969687197678, + "grad_norm": 0.8997499764924649, + "learning_rate": 2.294427767195001e-07, + "loss": 0.1058, + "step": 9436 + }, + { + "epoch": 0.8694891048970378, + "grad_norm": 0.9470817054742046, + "learning_rate": 2.291240860664859e-07, + "loss": 0.1161, + "step": 9437 + }, + { + "epoch": 0.8695812410743078, + "grad_norm": 0.922169310738529, + "learning_rate": 2.288056062645455e-07, + "loss": 0.1101, + "step": 9438 + }, + { + "epoch": 0.8696733772515778, + "grad_norm": 0.8836399103375951, + "learning_rate": 2.2848733734324835e-07, + "loss": 0.1008, + "step": 9439 + }, + { + "epoch": 0.8697655134288479, + "grad_norm": 0.9790710542189891, + "learning_rate": 2.281692793321469e-07, + "loss": 0.1293, + "step": 9440 + }, + { + "epoch": 0.8698576496061179, + "grad_norm": 0.9605080787153812, + "learning_rate": 2.2785143226077166e-07, + "loss": 0.1233, + "step": 9441 + }, + { + "epoch": 0.8699497857833879, + "grad_norm": 0.9728766436641451, + "learning_rate": 2.2753379615863575e-07, + "loss": 0.1296, + "step": 9442 + }, + { + "epoch": 0.8700419219606579, + "grad_norm": 0.9012599081009991, + "learning_rate": 2.2721637105523193e-07, + "loss": 0.1169, + "step": 9443 + }, + { + "epoch": 0.8701340581379279, + "grad_norm": 0.9449224767636791, + "learning_rate": 2.268991569800319e-07, + "loss": 0.1167, + "step": 9444 + }, + { + "epoch": 0.8702261943151979, + "grad_norm": 0.9397622392423732, + "learning_rate": 2.2658215396249046e-07, + "loss": 0.1224, + "step": 9445 + }, + { + "epoch": 0.8703183304924679, + "grad_norm": 1.0206942614633827, + "learning_rate": 2.2626536203204014e-07, + "loss": 0.1238, + "step": 9446 + }, + { + "epoch": 0.8704104666697379, + "grad_norm": 0.9508061328678594, + "learning_rate": 2.2594878121809633e-07, + "loss": 0.1181, + "step": 9447 + }, + { + "epoch": 0.8705026028470079, + "grad_norm": 0.9776800478848374, + "learning_rate": 2.2563241155005216e-07, + "loss": 0.1227, + "step": 9448 + }, + { + "epoch": 0.8705947390242779, + "grad_norm": 0.9499755485624733, + "learning_rate": 2.2531625305728362e-07, + "loss": 0.1176, + "step": 9449 + }, + { + "epoch": 0.8706868752015479, + "grad_norm": 0.9626981166059405, + "learning_rate": 2.2500030576914606e-07, + "loss": 0.1041, + "step": 9450 + }, + { + "epoch": 0.8707790113788179, + "grad_norm": 0.9322085622455839, + "learning_rate": 2.2468456971497493e-07, + "loss": 0.1215, + "step": 9451 + }, + { + "epoch": 0.8708711475560879, + "grad_norm": 1.0594714922946018, + "learning_rate": 2.2436904492408596e-07, + "loss": 0.1197, + "step": 9452 + }, + { + "epoch": 0.8709632837333579, + "grad_norm": 0.9755136585207467, + "learning_rate": 2.2405373142577597e-07, + "loss": 0.1261, + "step": 9453 + }, + { + "epoch": 0.871055419910628, + "grad_norm": 0.9455054395345747, + "learning_rate": 2.237386292493221e-07, + "loss": 0.127, + "step": 9454 + }, + { + "epoch": 0.871147556087898, + "grad_norm": 0.9658714454786119, + "learning_rate": 2.2342373842398208e-07, + "loss": 0.122, + "step": 9455 + }, + { + "epoch": 0.871239692265168, + "grad_norm": 0.9632137472514127, + "learning_rate": 2.2310905897899275e-07, + "loss": 0.117, + "step": 9456 + }, + { + "epoch": 0.871331828442438, + "grad_norm": 0.9449426092351376, + "learning_rate": 2.227945909435719e-07, + "loss": 0.1128, + "step": 9457 + }, + { + "epoch": 0.871423964619708, + "grad_norm": 0.9567787882080494, + "learning_rate": 2.224803343469184e-07, + "loss": 0.1213, + "step": 9458 + }, + { + "epoch": 0.8715161007969779, + "grad_norm": 0.9823985814029087, + "learning_rate": 2.2216628921821138e-07, + "loss": 0.1114, + "step": 9459 + }, + { + "epoch": 0.8716082369742479, + "grad_norm": 0.9140140102629726, + "learning_rate": 2.2185245558660918e-07, + "loss": 0.1113, + "step": 9460 + }, + { + "epoch": 0.8717003731515179, + "grad_norm": 1.0126259446458135, + "learning_rate": 2.215388334812521e-07, + "loss": 0.1237, + "step": 9461 + }, + { + "epoch": 0.8717925093287879, + "grad_norm": 0.875242477779259, + "learning_rate": 2.2122542293125883e-07, + "loss": 0.1051, + "step": 9462 + }, + { + "epoch": 0.8718846455060579, + "grad_norm": 0.9070067464486096, + "learning_rate": 2.2091222396573104e-07, + "loss": 0.105, + "step": 9463 + }, + { + "epoch": 0.8719767816833279, + "grad_norm": 0.8634324183365186, + "learning_rate": 2.20599236613748e-07, + "loss": 0.1051, + "step": 9464 + }, + { + "epoch": 0.8720689178605979, + "grad_norm": 0.9097237773646777, + "learning_rate": 2.2028646090437117e-07, + "loss": 0.1156, + "step": 9465 + }, + { + "epoch": 0.8721610540378679, + "grad_norm": 0.9388952429509481, + "learning_rate": 2.199738968666418e-07, + "loss": 0.1137, + "step": 9466 + }, + { + "epoch": 0.8722531902151379, + "grad_norm": 0.9697976669781796, + "learning_rate": 2.1966154452958216e-07, + "loss": 0.1247, + "step": 9467 + }, + { + "epoch": 0.872345326392408, + "grad_norm": 0.8989941935550352, + "learning_rate": 2.1934940392219272e-07, + "loss": 0.1066, + "step": 9468 + }, + { + "epoch": 0.872437462569678, + "grad_norm": 0.94116231002316, + "learning_rate": 2.190374750734567e-07, + "loss": 0.1197, + "step": 9469 + }, + { + "epoch": 0.872529598746948, + "grad_norm": 0.9274780247234806, + "learning_rate": 2.187257580123367e-07, + "loss": 0.1091, + "step": 9470 + }, + { + "epoch": 0.872621734924218, + "grad_norm": 0.9605073108855209, + "learning_rate": 2.1841425276777544e-07, + "loss": 0.1204, + "step": 9471 + }, + { + "epoch": 0.872713871101488, + "grad_norm": 0.9908749541818052, + "learning_rate": 2.1810295936869675e-07, + "loss": 0.1285, + "step": 9472 + }, + { + "epoch": 0.872806007278758, + "grad_norm": 0.9926019750513289, + "learning_rate": 2.1779187784400385e-07, + "loss": 0.1288, + "step": 9473 + }, + { + "epoch": 0.872898143456028, + "grad_norm": 0.9483689313827183, + "learning_rate": 2.1748100822258034e-07, + "loss": 0.1141, + "step": 9474 + }, + { + "epoch": 0.872990279633298, + "grad_norm": 0.9137883591856961, + "learning_rate": 2.171703505332909e-07, + "loss": 0.108, + "step": 9475 + }, + { + "epoch": 0.873082415810568, + "grad_norm": 0.9370216914924847, + "learning_rate": 2.1685990480498048e-07, + "loss": 0.126, + "step": 9476 + }, + { + "epoch": 0.873174551987838, + "grad_norm": 0.9391599852602918, + "learning_rate": 2.1654967106647328e-07, + "loss": 0.1134, + "step": 9477 + }, + { + "epoch": 0.873266688165108, + "grad_norm": 0.9333893143260931, + "learning_rate": 2.1623964934657516e-07, + "loss": 0.1207, + "step": 9478 + }, + { + "epoch": 0.873358824342378, + "grad_norm": 0.921403332616667, + "learning_rate": 2.159298396740711e-07, + "loss": 0.1184, + "step": 9479 + }, + { + "epoch": 0.873450960519648, + "grad_norm": 0.989075346325913, + "learning_rate": 2.1562024207772758e-07, + "loss": 0.1221, + "step": 9480 + }, + { + "epoch": 0.8735430966969181, + "grad_norm": 0.8899903434445905, + "learning_rate": 2.1531085658628992e-07, + "loss": 0.1073, + "step": 9481 + }, + { + "epoch": 0.8736352328741881, + "grad_norm": 0.9766428519858893, + "learning_rate": 2.1500168322848515e-07, + "loss": 0.1219, + "step": 9482 + }, + { + "epoch": 0.8737273690514581, + "grad_norm": 0.9650695272721265, + "learning_rate": 2.1469272203302055e-07, + "loss": 0.1207, + "step": 9483 + }, + { + "epoch": 0.8738195052287281, + "grad_norm": 0.9154717048048135, + "learning_rate": 2.143839730285824e-07, + "loss": 0.1136, + "step": 9484 + }, + { + "epoch": 0.8739116414059981, + "grad_norm": 0.9660021884154858, + "learning_rate": 2.1407543624383798e-07, + "loss": 0.1162, + "step": 9485 + }, + { + "epoch": 0.8740037775832681, + "grad_norm": 0.9456680462862941, + "learning_rate": 2.1376711170743553e-07, + "loss": 0.1213, + "step": 9486 + }, + { + "epoch": 0.8740959137605381, + "grad_norm": 0.9118077582257975, + "learning_rate": 2.134589994480027e-07, + "loss": 0.1139, + "step": 9487 + }, + { + "epoch": 0.8741880499378081, + "grad_norm": 0.9085493885735575, + "learning_rate": 2.1315109949414824e-07, + "loss": 0.1191, + "step": 9488 + }, + { + "epoch": 0.8742801861150781, + "grad_norm": 0.9387869250564302, + "learning_rate": 2.1284341187446046e-07, + "loss": 0.1219, + "step": 9489 + }, + { + "epoch": 0.8743723222923481, + "grad_norm": 0.9152316051258936, + "learning_rate": 2.1253593661750727e-07, + "loss": 0.1052, + "step": 9490 + }, + { + "epoch": 0.8744644584696181, + "grad_norm": 0.979761804929908, + "learning_rate": 2.1222867375183893e-07, + "loss": 0.1236, + "step": 9491 + }, + { + "epoch": 0.8745565946468881, + "grad_norm": 0.931801417869082, + "learning_rate": 2.1192162330598453e-07, + "loss": 0.1145, + "step": 9492 + }, + { + "epoch": 0.8746487308241581, + "grad_norm": 0.8688388184702385, + "learning_rate": 2.1161478530845353e-07, + "loss": 0.1095, + "step": 9493 + }, + { + "epoch": 0.8747408670014281, + "grad_norm": 0.9228613560998604, + "learning_rate": 2.1130815978773616e-07, + "loss": 0.1261, + "step": 9494 + }, + { + "epoch": 0.8748330031786982, + "grad_norm": 0.9595529954449742, + "learning_rate": 2.1100174677230217e-07, + "loss": 0.1235, + "step": 9495 + }, + { + "epoch": 0.8749251393559682, + "grad_norm": 0.8906914567887639, + "learning_rate": 2.1069554629060297e-07, + "loss": 0.1102, + "step": 9496 + }, + { + "epoch": 0.8750172755332382, + "grad_norm": 0.9461347234134713, + "learning_rate": 2.10389558371068e-07, + "loss": 0.1151, + "step": 9497 + }, + { + "epoch": 0.8751094117105082, + "grad_norm": 0.9429358001436208, + "learning_rate": 2.1008378304210876e-07, + "loss": 0.1251, + "step": 9498 + }, + { + "epoch": 0.8752015478877782, + "grad_norm": 0.9420068293333895, + "learning_rate": 2.0977822033211748e-07, + "loss": 0.1164, + "step": 9499 + }, + { + "epoch": 0.8752936840650481, + "grad_norm": 1.0217294748310057, + "learning_rate": 2.0947287026946428e-07, + "loss": 0.1175, + "step": 9500 + }, + { + "epoch": 0.8752936840650481, + "eval_loss": 0.1174582913517952, + "eval_runtime": 299.0187, + "eval_samples_per_second": 23.467, + "eval_steps_per_second": 2.936, + "step": 9500 + }, + { + "epoch": 0.8753858202423181, + "grad_norm": 0.8911898224450792, + "learning_rate": 2.091677328825023e-07, + "loss": 0.1057, + "step": 9501 + }, + { + "epoch": 0.8754779564195881, + "grad_norm": 0.9062661157706274, + "learning_rate": 2.0886280819956223e-07, + "loss": 0.1122, + "step": 9502 + }, + { + "epoch": 0.8755700925968581, + "grad_norm": 0.9721174742130575, + "learning_rate": 2.0855809624895694e-07, + "loss": 0.1268, + "step": 9503 + }, + { + "epoch": 0.8756622287741281, + "grad_norm": 0.9541176980173214, + "learning_rate": 2.082535970589794e-07, + "loss": 0.1225, + "step": 9504 + }, + { + "epoch": 0.8757543649513981, + "grad_norm": 0.9238750790195943, + "learning_rate": 2.0794931065790226e-07, + "loss": 0.1134, + "step": 9505 + }, + { + "epoch": 0.8758465011286681, + "grad_norm": 0.9486934776566898, + "learning_rate": 2.076452370739776e-07, + "loss": 0.1185, + "step": 9506 + }, + { + "epoch": 0.8759386373059381, + "grad_norm": 1.0458887278730715, + "learning_rate": 2.0734137633543954e-07, + "loss": 0.1309, + "step": 9507 + }, + { + "epoch": 0.8760307734832082, + "grad_norm": 0.944357664608892, + "learning_rate": 2.0703772847050136e-07, + "loss": 0.1102, + "step": 9508 + }, + { + "epoch": 0.8761229096604782, + "grad_norm": 0.9642165230298722, + "learning_rate": 2.0673429350735742e-07, + "loss": 0.1214, + "step": 9509 + }, + { + "epoch": 0.8762150458377482, + "grad_norm": 0.8546678704665883, + "learning_rate": 2.06431071474181e-07, + "loss": 0.1051, + "step": 9510 + }, + { + "epoch": 0.8763071820150182, + "grad_norm": 0.9556155038340977, + "learning_rate": 2.0612806239912602e-07, + "loss": 0.1133, + "step": 9511 + }, + { + "epoch": 0.8763993181922882, + "grad_norm": 0.940205240213181, + "learning_rate": 2.0582526631032745e-07, + "loss": 0.1122, + "step": 9512 + }, + { + "epoch": 0.8764914543695582, + "grad_norm": 0.9471384282662257, + "learning_rate": 2.0552268323590002e-07, + "loss": 0.1183, + "step": 9513 + }, + { + "epoch": 0.8765835905468282, + "grad_norm": 0.9996225062721664, + "learning_rate": 2.052203132039382e-07, + "loss": 0.1233, + "step": 9514 + }, + { + "epoch": 0.8766757267240982, + "grad_norm": 0.9260611802417887, + "learning_rate": 2.0491815624251733e-07, + "loss": 0.1184, + "step": 9515 + }, + { + "epoch": 0.8767678629013682, + "grad_norm": 0.9071613612784006, + "learning_rate": 2.046162123796927e-07, + "loss": 0.1145, + "step": 9516 + }, + { + "epoch": 0.8768599990786382, + "grad_norm": 0.9863259108007564, + "learning_rate": 2.043144816434997e-07, + "loss": 0.1326, + "step": 9517 + }, + { + "epoch": 0.8769521352559082, + "grad_norm": 0.8971641251120323, + "learning_rate": 2.0401296406195426e-07, + "loss": 0.1136, + "step": 9518 + }, + { + "epoch": 0.8770442714331782, + "grad_norm": 0.897002354512559, + "learning_rate": 2.0371165966305173e-07, + "loss": 0.1127, + "step": 9519 + }, + { + "epoch": 0.8771364076104482, + "grad_norm": 0.9075910207315132, + "learning_rate": 2.0341056847476947e-07, + "loss": 0.1084, + "step": 9520 + }, + { + "epoch": 0.8772285437877182, + "grad_norm": 0.9592976228033577, + "learning_rate": 2.031096905250629e-07, + "loss": 0.1112, + "step": 9521 + }, + { + "epoch": 0.8773206799649883, + "grad_norm": 0.9076761866207034, + "learning_rate": 2.0280902584186828e-07, + "loss": 0.1172, + "step": 9522 + }, + { + "epoch": 0.8774128161422583, + "grad_norm": 1.0029980746167146, + "learning_rate": 2.02508574453103e-07, + "loss": 0.1247, + "step": 9523 + }, + { + "epoch": 0.8775049523195283, + "grad_norm": 0.9757391160570924, + "learning_rate": 2.0220833638666393e-07, + "loss": 0.1203, + "step": 9524 + }, + { + "epoch": 0.8775970884967983, + "grad_norm": 0.943872802307776, + "learning_rate": 2.0190831167042846e-07, + "loss": 0.1152, + "step": 9525 + }, + { + "epoch": 0.8776892246740683, + "grad_norm": 0.9287559296126232, + "learning_rate": 2.016085003322535e-07, + "loss": 0.1214, + "step": 9526 + }, + { + "epoch": 0.8777813608513383, + "grad_norm": 0.9162111580851442, + "learning_rate": 2.013089023999762e-07, + "loss": 0.1146, + "step": 9527 + }, + { + "epoch": 0.8778734970286083, + "grad_norm": 0.9129520056654986, + "learning_rate": 2.010095179014146e-07, + "loss": 0.1116, + "step": 9528 + }, + { + "epoch": 0.8779656332058783, + "grad_norm": 0.9523827132430306, + "learning_rate": 2.00710346864367e-07, + "loss": 0.1279, + "step": 9529 + }, + { + "epoch": 0.8780577693831483, + "grad_norm": 0.9521087669536956, + "learning_rate": 2.0041138931661124e-07, + "loss": 0.1249, + "step": 9530 + }, + { + "epoch": 0.8781499055604183, + "grad_norm": 0.9189194425795829, + "learning_rate": 2.0011264528590562e-07, + "loss": 0.1194, + "step": 9531 + }, + { + "epoch": 0.8782420417376883, + "grad_norm": 0.9487815314113962, + "learning_rate": 1.9981411479998798e-07, + "loss": 0.1085, + "step": 9532 + }, + { + "epoch": 0.8783341779149583, + "grad_norm": 0.963493565405593, + "learning_rate": 1.9951579788657748e-07, + "loss": 0.1102, + "step": 9533 + }, + { + "epoch": 0.8784263140922283, + "grad_norm": 0.9495227621628857, + "learning_rate": 1.9921769457337286e-07, + "loss": 0.1242, + "step": 9534 + }, + { + "epoch": 0.8785184502694983, + "grad_norm": 0.9305475533838578, + "learning_rate": 1.9891980488805278e-07, + "loss": 0.114, + "step": 9535 + }, + { + "epoch": 0.8786105864467684, + "grad_norm": 0.9693885781366122, + "learning_rate": 1.986221288582768e-07, + "loss": 0.1306, + "step": 9536 + }, + { + "epoch": 0.8787027226240384, + "grad_norm": 0.9847652103702126, + "learning_rate": 1.9832466651168337e-07, + "loss": 0.1214, + "step": 9537 + }, + { + "epoch": 0.8787948588013084, + "grad_norm": 0.8562607646907413, + "learning_rate": 1.9802741787589258e-07, + "loss": 0.1056, + "step": 9538 + }, + { + "epoch": 0.8788869949785784, + "grad_norm": 0.9232649256601291, + "learning_rate": 1.977303829785035e-07, + "loss": 0.116, + "step": 9539 + }, + { + "epoch": 0.8789791311558484, + "grad_norm": 0.9829657229119272, + "learning_rate": 1.9743356184709628e-07, + "loss": 0.1243, + "step": 9540 + }, + { + "epoch": 0.8790712673331184, + "grad_norm": 0.9827295767927928, + "learning_rate": 1.9713695450923054e-07, + "loss": 0.129, + "step": 9541 + }, + { + "epoch": 0.8791634035103884, + "grad_norm": 0.94977481989151, + "learning_rate": 1.968405609924473e-07, + "loss": 0.1198, + "step": 9542 + }, + { + "epoch": 0.8792555396876583, + "grad_norm": 0.9891164225609158, + "learning_rate": 1.9654438132426485e-07, + "loss": 0.1231, + "step": 9543 + }, + { + "epoch": 0.8793476758649283, + "grad_norm": 0.9193503040315959, + "learning_rate": 1.9624841553218476e-07, + "loss": 0.1083, + "step": 9544 + }, + { + "epoch": 0.8794398120421983, + "grad_norm": 1.0548903153761093, + "learning_rate": 1.9595266364368705e-07, + "loss": 0.1317, + "step": 9545 + }, + { + "epoch": 0.8795319482194683, + "grad_norm": 0.8884916257850111, + "learning_rate": 1.9565712568623274e-07, + "loss": 0.1113, + "step": 9546 + }, + { + "epoch": 0.8796240843967383, + "grad_norm": 0.9715085221952553, + "learning_rate": 1.9536180168726214e-07, + "loss": 0.1127, + "step": 9547 + }, + { + "epoch": 0.8797162205740083, + "grad_norm": 0.9821115353932246, + "learning_rate": 1.9506669167419667e-07, + "loss": 0.119, + "step": 9548 + }, + { + "epoch": 0.8798083567512784, + "grad_norm": 0.9843575732122866, + "learning_rate": 1.9477179567443632e-07, + "loss": 0.1274, + "step": 9549 + }, + { + "epoch": 0.8799004929285484, + "grad_norm": 0.9642181310696486, + "learning_rate": 1.9447711371536365e-07, + "loss": 0.1202, + "step": 9550 + }, + { + "epoch": 0.8799926291058184, + "grad_norm": 1.0037179892947732, + "learning_rate": 1.9418264582433844e-07, + "loss": 0.1171, + "step": 9551 + }, + { + "epoch": 0.8800847652830884, + "grad_norm": 0.9337763265065536, + "learning_rate": 1.9388839202870268e-07, + "loss": 0.1167, + "step": 9552 + }, + { + "epoch": 0.8801769014603584, + "grad_norm": 0.8987444148210977, + "learning_rate": 1.9359435235577818e-07, + "loss": 0.1042, + "step": 9553 + }, + { + "epoch": 0.8802690376376284, + "grad_norm": 0.9502015836724473, + "learning_rate": 1.9330052683286666e-07, + "loss": 0.1253, + "step": 9554 + }, + { + "epoch": 0.8803611738148984, + "grad_norm": 0.93018808175646, + "learning_rate": 1.930069154872488e-07, + "loss": 0.1134, + "step": 9555 + }, + { + "epoch": 0.8804533099921684, + "grad_norm": 0.9617089178446352, + "learning_rate": 1.92713518346187e-07, + "loss": 0.1213, + "step": 9556 + }, + { + "epoch": 0.8805454461694384, + "grad_norm": 0.9348159689266482, + "learning_rate": 1.9242033543692362e-07, + "loss": 0.1185, + "step": 9557 + }, + { + "epoch": 0.8806375823467084, + "grad_norm": 0.9509850882088483, + "learning_rate": 1.9212736678668075e-07, + "loss": 0.1214, + "step": 9558 + }, + { + "epoch": 0.8807297185239784, + "grad_norm": 0.9288825792030714, + "learning_rate": 1.9183461242266027e-07, + "loss": 0.1142, + "step": 9559 + }, + { + "epoch": 0.8808218547012484, + "grad_norm": 0.9294423662297463, + "learning_rate": 1.9154207237204403e-07, + "loss": 0.1147, + "step": 9560 + }, + { + "epoch": 0.8809139908785184, + "grad_norm": 1.0171452491428885, + "learning_rate": 1.9124974666199476e-07, + "loss": 0.1299, + "step": 9561 + }, + { + "epoch": 0.8810061270557884, + "grad_norm": 0.9263947519637243, + "learning_rate": 1.909576353196549e-07, + "loss": 0.1131, + "step": 9562 + }, + { + "epoch": 0.8810982632330585, + "grad_norm": 0.8906518666115324, + "learning_rate": 1.9066573837214773e-07, + "loss": 0.113, + "step": 9563 + }, + { + "epoch": 0.8811903994103285, + "grad_norm": 0.9373704014564624, + "learning_rate": 1.90374055846575e-07, + "loss": 0.117, + "step": 9564 + }, + { + "epoch": 0.8812825355875985, + "grad_norm": 0.9669182368686189, + "learning_rate": 1.9008258777001963e-07, + "loss": 0.1208, + "step": 9565 + }, + { + "epoch": 0.8813746717648685, + "grad_norm": 0.9085498077985098, + "learning_rate": 1.8979133416954453e-07, + "loss": 0.1116, + "step": 9566 + }, + { + "epoch": 0.8814668079421385, + "grad_norm": 0.9457559872850566, + "learning_rate": 1.8950029507219302e-07, + "loss": 0.1252, + "step": 9567 + }, + { + "epoch": 0.8815589441194085, + "grad_norm": 0.9720738745786429, + "learning_rate": 1.8920947050498711e-07, + "loss": 0.1207, + "step": 9568 + }, + { + "epoch": 0.8816510802966785, + "grad_norm": 0.9886987730813052, + "learning_rate": 1.889188604949313e-07, + "loss": 0.1293, + "step": 9569 + }, + { + "epoch": 0.8817432164739485, + "grad_norm": 0.8695655667801883, + "learning_rate": 1.8862846506900762e-07, + "loss": 0.1005, + "step": 9570 + }, + { + "epoch": 0.8818353526512185, + "grad_norm": 0.9139453399544554, + "learning_rate": 1.8833828425418006e-07, + "loss": 0.1113, + "step": 9571 + }, + { + "epoch": 0.8819274888284885, + "grad_norm": 0.9206137569411968, + "learning_rate": 1.8804831807739094e-07, + "loss": 0.1122, + "step": 9572 + }, + { + "epoch": 0.8820196250057585, + "grad_norm": 0.9661739250181918, + "learning_rate": 1.8775856656556458e-07, + "loss": 0.118, + "step": 9573 + }, + { + "epoch": 0.8821117611830285, + "grad_norm": 0.9716508124012724, + "learning_rate": 1.8746902974560443e-07, + "loss": 0.1242, + "step": 9574 + }, + { + "epoch": 0.8822038973602985, + "grad_norm": 0.9133529948046311, + "learning_rate": 1.8717970764439374e-07, + "loss": 0.123, + "step": 9575 + }, + { + "epoch": 0.8822960335375686, + "grad_norm": 0.9832013176200599, + "learning_rate": 1.8689060028879602e-07, + "loss": 0.1278, + "step": 9576 + }, + { + "epoch": 0.8823881697148386, + "grad_norm": 0.8860667781542512, + "learning_rate": 1.866017077056545e-07, + "loss": 0.1193, + "step": 9577 + }, + { + "epoch": 0.8824803058921086, + "grad_norm": 0.9641731545183843, + "learning_rate": 1.8631302992179383e-07, + "loss": 0.1184, + "step": 9578 + }, + { + "epoch": 0.8825724420693786, + "grad_norm": 0.8761458096650978, + "learning_rate": 1.860245669640176e-07, + "loss": 0.1092, + "step": 9579 + }, + { + "epoch": 0.8826645782466486, + "grad_norm": 0.9386085977402518, + "learning_rate": 1.857363188591091e-07, + "loss": 0.1111, + "step": 9580 + }, + { + "epoch": 0.8827567144239186, + "grad_norm": 0.9293784733624652, + "learning_rate": 1.8544828563383243e-07, + "loss": 0.1079, + "step": 9581 + }, + { + "epoch": 0.8828488506011886, + "grad_norm": 0.9749859909166957, + "learning_rate": 1.8516046731493127e-07, + "loss": 0.1275, + "step": 9582 + }, + { + "epoch": 0.8829409867784586, + "grad_norm": 0.9022892040966464, + "learning_rate": 1.848728639291303e-07, + "loss": 0.1096, + "step": 9583 + }, + { + "epoch": 0.8830331229557286, + "grad_norm": 0.9433013663273301, + "learning_rate": 1.8458547550313287e-07, + "loss": 0.1191, + "step": 9584 + }, + { + "epoch": 0.8831252591329986, + "grad_norm": 0.8964672985576979, + "learning_rate": 1.8429830206362325e-07, + "loss": 0.1137, + "step": 9585 + }, + { + "epoch": 0.8832173953102685, + "grad_norm": 0.9174450175751114, + "learning_rate": 1.8401134363726536e-07, + "loss": 0.1138, + "step": 9586 + }, + { + "epoch": 0.8833095314875385, + "grad_norm": 0.8803209048806083, + "learning_rate": 1.8372460025070343e-07, + "loss": 0.1152, + "step": 9587 + }, + { + "epoch": 0.8834016676648085, + "grad_norm": 0.9100598997602551, + "learning_rate": 1.8343807193056201e-07, + "loss": 0.1113, + "step": 9588 + }, + { + "epoch": 0.8834938038420785, + "grad_norm": 0.8753907155320467, + "learning_rate": 1.8315175870344455e-07, + "loss": 0.1149, + "step": 9589 + }, + { + "epoch": 0.8835859400193486, + "grad_norm": 0.9462524201593473, + "learning_rate": 1.8286566059593615e-07, + "loss": 0.1154, + "step": 9590 + }, + { + "epoch": 0.8836780761966186, + "grad_norm": 0.9212877418915894, + "learning_rate": 1.825797776346e-07, + "loss": 0.1185, + "step": 9591 + }, + { + "epoch": 0.8837702123738886, + "grad_norm": 0.946329462649158, + "learning_rate": 1.8229410984598128e-07, + "loss": 0.1243, + "step": 9592 + }, + { + "epoch": 0.8838623485511586, + "grad_norm": 0.9357474153015102, + "learning_rate": 1.820086572566035e-07, + "loss": 0.1225, + "step": 9593 + }, + { + "epoch": 0.8839544847284286, + "grad_norm": 0.9432417654817284, + "learning_rate": 1.8172341989297154e-07, + "loss": 0.1082, + "step": 9594 + }, + { + "epoch": 0.8840466209056986, + "grad_norm": 0.9728284426637823, + "learning_rate": 1.814383977815698e-07, + "loss": 0.1082, + "step": 9595 + }, + { + "epoch": 0.8841387570829686, + "grad_norm": 0.9521973139587481, + "learning_rate": 1.8115359094886238e-07, + "loss": 0.1237, + "step": 9596 + }, + { + "epoch": 0.8842308932602386, + "grad_norm": 0.9412655233696202, + "learning_rate": 1.808689994212931e-07, + "loss": 0.1181, + "step": 9597 + }, + { + "epoch": 0.8843230294375086, + "grad_norm": 0.9095779965593743, + "learning_rate": 1.8058462322528698e-07, + "loss": 0.1093, + "step": 9598 + }, + { + "epoch": 0.8844151656147786, + "grad_norm": 0.9580227956852666, + "learning_rate": 1.8030046238724814e-07, + "loss": 0.1191, + "step": 9599 + }, + { + "epoch": 0.8845073017920486, + "grad_norm": 0.9635109661247991, + "learning_rate": 1.8001651693356131e-07, + "loss": 0.1228, + "step": 9600 + }, + { + "epoch": 0.8845994379693186, + "grad_norm": 0.8568137536842804, + "learning_rate": 1.797327868905907e-07, + "loss": 0.0991, + "step": 9601 + }, + { + "epoch": 0.8846915741465886, + "grad_norm": 0.9547474068403831, + "learning_rate": 1.7944927228467995e-07, + "loss": 0.1168, + "step": 9602 + }, + { + "epoch": 0.8847837103238586, + "grad_norm": 0.9584155615411788, + "learning_rate": 1.791659731421541e-07, + "loss": 0.1077, + "step": 9603 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 0.928296303372466, + "learning_rate": 1.7888288948931799e-07, + "loss": 0.1165, + "step": 9604 + }, + { + "epoch": 0.8849679826783987, + "grad_norm": 0.9114506107172179, + "learning_rate": 1.786000213524547e-07, + "loss": 0.1154, + "step": 9605 + }, + { + "epoch": 0.8850601188556687, + "grad_norm": 0.9032049431949399, + "learning_rate": 1.783173687578299e-07, + "loss": 0.1077, + "step": 9606 + }, + { + "epoch": 0.8851522550329387, + "grad_norm": 1.031588892107339, + "learning_rate": 1.7803493173168679e-07, + "loss": 0.1249, + "step": 9607 + }, + { + "epoch": 0.8852443912102087, + "grad_norm": 0.9265771709711623, + "learning_rate": 1.777527103002505e-07, + "loss": 0.1197, + "step": 9608 + }, + { + "epoch": 0.8853365273874787, + "grad_norm": 0.9544289104713, + "learning_rate": 1.7747070448972475e-07, + "loss": 0.1178, + "step": 9609 + }, + { + "epoch": 0.8854286635647487, + "grad_norm": 0.9971281602223147, + "learning_rate": 1.7718891432629392e-07, + "loss": 0.1295, + "step": 9610 + }, + { + "epoch": 0.8855207997420187, + "grad_norm": 0.9433321387081206, + "learning_rate": 1.769073398361229e-07, + "loss": 0.1169, + "step": 9611 + }, + { + "epoch": 0.8856129359192887, + "grad_norm": 0.9677629402688539, + "learning_rate": 1.7662598104535522e-07, + "loss": 0.1211, + "step": 9612 + }, + { + "epoch": 0.8857050720965587, + "grad_norm": 0.9972427061874779, + "learning_rate": 1.7634483798011498e-07, + "loss": 0.1217, + "step": 9613 + }, + { + "epoch": 0.8857972082738287, + "grad_norm": 0.8859263153374594, + "learning_rate": 1.760639106665063e-07, + "loss": 0.1138, + "step": 9614 + }, + { + "epoch": 0.8858893444510987, + "grad_norm": 0.9202629453001602, + "learning_rate": 1.7578319913061387e-07, + "loss": 0.1135, + "step": 9615 + }, + { + "epoch": 0.8859814806283687, + "grad_norm": 0.9096230922774695, + "learning_rate": 1.7550270339850212e-07, + "loss": 0.108, + "step": 9616 + }, + { + "epoch": 0.8860736168056388, + "grad_norm": 0.9214369389617435, + "learning_rate": 1.7522242349621438e-07, + "loss": 0.1213, + "step": 9617 + }, + { + "epoch": 0.8861657529829088, + "grad_norm": 0.9458955777475399, + "learning_rate": 1.7494235944977427e-07, + "loss": 0.1252, + "step": 9618 + }, + { + "epoch": 0.8862578891601788, + "grad_norm": 0.9374083870077097, + "learning_rate": 1.7466251128518629e-07, + "loss": 0.1088, + "step": 9619 + }, + { + "epoch": 0.8863500253374488, + "grad_norm": 0.9111588907855831, + "learning_rate": 1.7438287902843465e-07, + "loss": 0.1051, + "step": 9620 + }, + { + "epoch": 0.8864421615147188, + "grad_norm": 0.9315197242699069, + "learning_rate": 1.7410346270548328e-07, + "loss": 0.1175, + "step": 9621 + }, + { + "epoch": 0.8865342976919888, + "grad_norm": 1.0174523449968809, + "learning_rate": 1.7382426234227562e-07, + "loss": 0.1309, + "step": 9622 + }, + { + "epoch": 0.8866264338692588, + "grad_norm": 0.9663714692842666, + "learning_rate": 1.735452779647351e-07, + "loss": 0.1191, + "step": 9623 + }, + { + "epoch": 0.8867185700465288, + "grad_norm": 0.9881143534895991, + "learning_rate": 1.7326650959876595e-07, + "loss": 0.1308, + "step": 9624 + }, + { + "epoch": 0.8868107062237988, + "grad_norm": 0.9652210535066099, + "learning_rate": 1.7298795727025226e-07, + "loss": 0.1118, + "step": 9625 + }, + { + "epoch": 0.8869028424010688, + "grad_norm": 0.9190988726721089, + "learning_rate": 1.7270962100505688e-07, + "loss": 0.1202, + "step": 9626 + }, + { + "epoch": 0.8869949785783388, + "grad_norm": 0.9356727701411983, + "learning_rate": 1.724315008290234e-07, + "loss": 0.1208, + "step": 9627 + }, + { + "epoch": 0.8870871147556088, + "grad_norm": 0.9449533031642502, + "learning_rate": 1.7215359676797604e-07, + "loss": 0.1204, + "step": 9628 + }, + { + "epoch": 0.8871792509328787, + "grad_norm": 0.9224941964563907, + "learning_rate": 1.7187590884771789e-07, + "loss": 0.1137, + "step": 9629 + }, + { + "epoch": 0.8872713871101487, + "grad_norm": 0.9184212287329622, + "learning_rate": 1.7159843709403156e-07, + "loss": 0.1184, + "step": 9630 + }, + { + "epoch": 0.8873635232874189, + "grad_norm": 1.033680961033416, + "learning_rate": 1.7132118153268097e-07, + "loss": 0.1182, + "step": 9631 + }, + { + "epoch": 0.8874556594646889, + "grad_norm": 0.925393325283817, + "learning_rate": 1.7104414218940934e-07, + "loss": 0.1161, + "step": 9632 + }, + { + "epoch": 0.8875477956419588, + "grad_norm": 0.9012734623981898, + "learning_rate": 1.7076731908994032e-07, + "loss": 0.1063, + "step": 9633 + }, + { + "epoch": 0.8876399318192288, + "grad_norm": 0.9821681211632581, + "learning_rate": 1.704907122599761e-07, + "loss": 0.1297, + "step": 9634 + }, + { + "epoch": 0.8877320679964988, + "grad_norm": 0.937963156497377, + "learning_rate": 1.7021432172519974e-07, + "loss": 0.1123, + "step": 9635 + }, + { + "epoch": 0.8878242041737688, + "grad_norm": 0.8937454742138694, + "learning_rate": 1.6993814751127435e-07, + "loss": 0.115, + "step": 9636 + }, + { + "epoch": 0.8879163403510388, + "grad_norm": 0.8840091538217429, + "learning_rate": 1.69662189643843e-07, + "loss": 0.103, + "step": 9637 + }, + { + "epoch": 0.8880084765283088, + "grad_norm": 0.9022781888770508, + "learning_rate": 1.69386448148528e-07, + "loss": 0.1093, + "step": 9638 + }, + { + "epoch": 0.8881006127055788, + "grad_norm": 0.948615735169581, + "learning_rate": 1.691109230509322e-07, + "loss": 0.1155, + "step": 9639 + }, + { + "epoch": 0.8881927488828488, + "grad_norm": 0.9448712983100227, + "learning_rate": 1.6883561437663788e-07, + "loss": 0.1164, + "step": 9640 + }, + { + "epoch": 0.8882848850601188, + "grad_norm": 0.8863119815353013, + "learning_rate": 1.6856052215120794e-07, + "loss": 0.1115, + "step": 9641 + }, + { + "epoch": 0.8883770212373888, + "grad_norm": 0.9719080592411924, + "learning_rate": 1.682856464001839e-07, + "loss": 0.1166, + "step": 9642 + }, + { + "epoch": 0.8884691574146588, + "grad_norm": 0.8809419811084558, + "learning_rate": 1.680109871490887e-07, + "loss": 0.1115, + "step": 9643 + }, + { + "epoch": 0.8885612935919289, + "grad_norm": 0.9303992924384262, + "learning_rate": 1.6773654442342468e-07, + "loss": 0.1105, + "step": 9644 + }, + { + "epoch": 0.8886534297691989, + "grad_norm": 1.0080826201157422, + "learning_rate": 1.6746231824867316e-07, + "loss": 0.1284, + "step": 9645 + }, + { + "epoch": 0.8887455659464689, + "grad_norm": 0.9524532559452373, + "learning_rate": 1.671883086502968e-07, + "loss": 0.1139, + "step": 9646 + }, + { + "epoch": 0.8888377021237389, + "grad_norm": 0.9658574906611174, + "learning_rate": 1.669145156537366e-07, + "loss": 0.1339, + "step": 9647 + }, + { + "epoch": 0.8889298383010089, + "grad_norm": 0.9751522991921692, + "learning_rate": 1.6664093928441456e-07, + "loss": 0.1203, + "step": 9648 + }, + { + "epoch": 0.8890219744782789, + "grad_norm": 0.951320518261064, + "learning_rate": 1.6636757956773302e-07, + "loss": 0.12, + "step": 9649 + }, + { + "epoch": 0.8891141106555489, + "grad_norm": 0.9416676468741589, + "learning_rate": 1.6609443652907287e-07, + "loss": 0.1227, + "step": 9650 + }, + { + "epoch": 0.8892062468328189, + "grad_norm": 1.0030344928799633, + "learning_rate": 1.6582151019379517e-07, + "loss": 0.1299, + "step": 9651 + }, + { + "epoch": 0.8892983830100889, + "grad_norm": 0.9258602981126589, + "learning_rate": 1.655488005872413e-07, + "loss": 0.1126, + "step": 9652 + }, + { + "epoch": 0.8893905191873589, + "grad_norm": 0.9424174799317092, + "learning_rate": 1.6527630773473248e-07, + "loss": 0.1178, + "step": 9653 + }, + { + "epoch": 0.8894826553646289, + "grad_norm": 0.9481218238285717, + "learning_rate": 1.650040316615703e-07, + "loss": 0.1246, + "step": 9654 + }, + { + "epoch": 0.8895747915418989, + "grad_norm": 0.9605790380659346, + "learning_rate": 1.647319723930349e-07, + "loss": 0.124, + "step": 9655 + }, + { + "epoch": 0.8896669277191689, + "grad_norm": 0.9472833340946128, + "learning_rate": 1.6446012995438688e-07, + "loss": 0.1252, + "step": 9656 + }, + { + "epoch": 0.8897590638964389, + "grad_norm": 0.9887156441835274, + "learning_rate": 1.6418850437086715e-07, + "loss": 0.1311, + "step": 9657 + }, + { + "epoch": 0.889851200073709, + "grad_norm": 0.9639073024387345, + "learning_rate": 1.6391709566769664e-07, + "loss": 0.1232, + "step": 9658 + }, + { + "epoch": 0.889943336250979, + "grad_norm": 0.8845699968934604, + "learning_rate": 1.6364590387007468e-07, + "loss": 0.1053, + "step": 9659 + }, + { + "epoch": 0.890035472428249, + "grad_norm": 0.9467171501941805, + "learning_rate": 1.6337492900318246e-07, + "loss": 0.1118, + "step": 9660 + }, + { + "epoch": 0.890127608605519, + "grad_norm": 0.8784549110877983, + "learning_rate": 1.6310417109217906e-07, + "loss": 0.11, + "step": 9661 + }, + { + "epoch": 0.890219744782789, + "grad_norm": 0.9112414536641252, + "learning_rate": 1.6283363016220548e-07, + "loss": 0.1077, + "step": 9662 + }, + { + "epoch": 0.890311880960059, + "grad_norm": 0.9555138237898598, + "learning_rate": 1.6256330623838024e-07, + "loss": 0.126, + "step": 9663 + }, + { + "epoch": 0.890404017137329, + "grad_norm": 0.9146987594311619, + "learning_rate": 1.6229319934580378e-07, + "loss": 0.108, + "step": 9664 + }, + { + "epoch": 0.890496153314599, + "grad_norm": 0.9870921357946368, + "learning_rate": 1.6202330950955552e-07, + "loss": 0.1216, + "step": 9665 + }, + { + "epoch": 0.890588289491869, + "grad_norm": 0.9640885538855887, + "learning_rate": 1.6175363675469485e-07, + "loss": 0.1239, + "step": 9666 + }, + { + "epoch": 0.890680425669139, + "grad_norm": 0.9317904692179457, + "learning_rate": 1.6148418110626008e-07, + "loss": 0.1171, + "step": 9667 + }, + { + "epoch": 0.890772561846409, + "grad_norm": 0.9345846725693181, + "learning_rate": 1.612149425892709e-07, + "loss": 0.1137, + "step": 9668 + }, + { + "epoch": 0.890864698023679, + "grad_norm": 0.9183218187820292, + "learning_rate": 1.6094592122872594e-07, + "loss": 0.1142, + "step": 9669 + }, + { + "epoch": 0.890956834200949, + "grad_norm": 0.9700195116573087, + "learning_rate": 1.6067711704960408e-07, + "loss": 0.1146, + "step": 9670 + }, + { + "epoch": 0.8910489703782191, + "grad_norm": 0.9380884900362914, + "learning_rate": 1.60408530076864e-07, + "loss": 0.1258, + "step": 9671 + }, + { + "epoch": 0.8911411065554891, + "grad_norm": 0.9505680280600464, + "learning_rate": 1.6014016033544329e-07, + "loss": 0.1125, + "step": 9672 + }, + { + "epoch": 0.8912332427327591, + "grad_norm": 0.9153781576364745, + "learning_rate": 1.5987200785026024e-07, + "loss": 0.1134, + "step": 9673 + }, + { + "epoch": 0.891325378910029, + "grad_norm": 0.9722948079778472, + "learning_rate": 1.5960407264621335e-07, + "loss": 0.1234, + "step": 9674 + }, + { + "epoch": 0.891417515087299, + "grad_norm": 0.9108011291797209, + "learning_rate": 1.5933635474818048e-07, + "loss": 0.1207, + "step": 9675 + }, + { + "epoch": 0.891509651264569, + "grad_norm": 0.9460370912653332, + "learning_rate": 1.5906885418101897e-07, + "loss": 0.1169, + "step": 9676 + }, + { + "epoch": 0.891601787441839, + "grad_norm": 0.9385372386749276, + "learning_rate": 1.588015709695659e-07, + "loss": 0.1155, + "step": 9677 + }, + { + "epoch": 0.891693923619109, + "grad_norm": 0.9241971228937185, + "learning_rate": 1.5853450513863887e-07, + "loss": 0.112, + "step": 9678 + }, + { + "epoch": 0.891786059796379, + "grad_norm": 0.8890910453397396, + "learning_rate": 1.582676567130356e-07, + "loss": 0.1149, + "step": 9679 + }, + { + "epoch": 0.891878195973649, + "grad_norm": 0.9725011168763831, + "learning_rate": 1.5800102571753185e-07, + "loss": 0.1197, + "step": 9680 + }, + { + "epoch": 0.891970332150919, + "grad_norm": 0.9501294692501856, + "learning_rate": 1.5773461217688552e-07, + "loss": 0.118, + "step": 9681 + }, + { + "epoch": 0.892062468328189, + "grad_norm": 0.9353526816347267, + "learning_rate": 1.5746841611583185e-07, + "loss": 0.1155, + "step": 9682 + }, + { + "epoch": 0.892154604505459, + "grad_norm": 0.8727846936420787, + "learning_rate": 1.572024375590883e-07, + "loss": 0.1078, + "step": 9683 + }, + { + "epoch": 0.892246740682729, + "grad_norm": 0.9458301911588138, + "learning_rate": 1.5693667653135043e-07, + "loss": 0.1154, + "step": 9684 + }, + { + "epoch": 0.8923388768599991, + "grad_norm": 0.9325030933389573, + "learning_rate": 1.56671133057294e-07, + "loss": 0.1133, + "step": 9685 + }, + { + "epoch": 0.8924310130372691, + "grad_norm": 0.9620657518647695, + "learning_rate": 1.5640580716157566e-07, + "loss": 0.1101, + "step": 9686 + }, + { + "epoch": 0.8925231492145391, + "grad_norm": 0.886768718834908, + "learning_rate": 1.5614069886883021e-07, + "loss": 0.1093, + "step": 9687 + }, + { + "epoch": 0.8926152853918091, + "grad_norm": 1.0028608248597497, + "learning_rate": 1.5587580820367294e-07, + "loss": 0.1294, + "step": 9688 + }, + { + "epoch": 0.8927074215690791, + "grad_norm": 0.9258946821905417, + "learning_rate": 1.5561113519069887e-07, + "loss": 0.1232, + "step": 9689 + }, + { + "epoch": 0.8927995577463491, + "grad_norm": 0.8868177500137526, + "learning_rate": 1.5534667985448336e-07, + "loss": 0.1154, + "step": 9690 + }, + { + "epoch": 0.8928916939236191, + "grad_norm": 0.9208808651503403, + "learning_rate": 1.5508244221958125e-07, + "loss": 0.1114, + "step": 9691 + }, + { + "epoch": 0.8929838301008891, + "grad_norm": 0.9465660393051565, + "learning_rate": 1.5481842231052702e-07, + "loss": 0.1236, + "step": 9692 + }, + { + "epoch": 0.8930759662781591, + "grad_norm": 0.8850488686636715, + "learning_rate": 1.5455462015183388e-07, + "loss": 0.0956, + "step": 9693 + }, + { + "epoch": 0.8931681024554291, + "grad_norm": 0.9198730188731642, + "learning_rate": 1.5429103576799692e-07, + "loss": 0.1096, + "step": 9694 + }, + { + "epoch": 0.8932602386326991, + "grad_norm": 0.9843260253433014, + "learning_rate": 1.540276691834902e-07, + "loss": 0.1274, + "step": 9695 + }, + { + "epoch": 0.8933523748099691, + "grad_norm": 1.0059626536122228, + "learning_rate": 1.537645204227664e-07, + "loss": 0.1311, + "step": 9696 + }, + { + "epoch": 0.8934445109872391, + "grad_norm": 0.980951404503035, + "learning_rate": 1.5350158951025957e-07, + "loss": 0.1183, + "step": 9697 + }, + { + "epoch": 0.8935366471645091, + "grad_norm": 0.9806529946895199, + "learning_rate": 1.5323887647038266e-07, + "loss": 0.1249, + "step": 9698 + }, + { + "epoch": 0.8936287833417792, + "grad_norm": 0.9610970927025647, + "learning_rate": 1.5297638132752867e-07, + "loss": 0.1138, + "step": 9699 + }, + { + "epoch": 0.8937209195190492, + "grad_norm": 0.9565492528451189, + "learning_rate": 1.5271410410607008e-07, + "loss": 0.1103, + "step": 9700 + }, + { + "epoch": 0.8938130556963192, + "grad_norm": 0.9128992345808833, + "learning_rate": 1.5245204483035958e-07, + "loss": 0.1194, + "step": 9701 + }, + { + "epoch": 0.8939051918735892, + "grad_norm": 0.9813926968534401, + "learning_rate": 1.5219020352472914e-07, + "loss": 0.1208, + "step": 9702 + }, + { + "epoch": 0.8939973280508592, + "grad_norm": 0.947509072446224, + "learning_rate": 1.519285802134915e-07, + "loss": 0.113, + "step": 9703 + }, + { + "epoch": 0.8940894642281292, + "grad_norm": 0.9932594914438546, + "learning_rate": 1.5166717492093808e-07, + "loss": 0.1217, + "step": 9704 + }, + { + "epoch": 0.8941816004053992, + "grad_norm": 0.892243585536645, + "learning_rate": 1.5140598767133947e-07, + "loss": 0.1104, + "step": 9705 + }, + { + "epoch": 0.8942737365826692, + "grad_norm": 0.9090227640422955, + "learning_rate": 1.5114501848894792e-07, + "loss": 0.1129, + "step": 9706 + }, + { + "epoch": 0.8943658727599392, + "grad_norm": 0.9667171942105707, + "learning_rate": 1.5088426739799405e-07, + "loss": 0.1195, + "step": 9707 + }, + { + "epoch": 0.8944580089372092, + "grad_norm": 0.9372614582454015, + "learning_rate": 1.5062373442268908e-07, + "loss": 0.1222, + "step": 9708 + }, + { + "epoch": 0.8945501451144792, + "grad_norm": 0.933453218296901, + "learning_rate": 1.5036341958722334e-07, + "loss": 0.1151, + "step": 9709 + }, + { + "epoch": 0.8946422812917492, + "grad_norm": 0.9340782377512039, + "learning_rate": 1.501033229157667e-07, + "loss": 0.1205, + "step": 9710 + }, + { + "epoch": 0.8947344174690192, + "grad_norm": 0.9239264734069119, + "learning_rate": 1.4984344443246924e-07, + "loss": 0.1126, + "step": 9711 + }, + { + "epoch": 0.8948265536462893, + "grad_norm": 0.9349919081011316, + "learning_rate": 1.495837841614614e-07, + "loss": 0.0968, + "step": 9712 + }, + { + "epoch": 0.8949186898235593, + "grad_norm": 0.9218295205011691, + "learning_rate": 1.493243421268517e-07, + "loss": 0.1116, + "step": 9713 + }, + { + "epoch": 0.8950108260008293, + "grad_norm": 0.9371288620444782, + "learning_rate": 1.4906511835273003e-07, + "loss": 0.1048, + "step": 9714 + }, + { + "epoch": 0.8951029621780993, + "grad_norm": 0.9444301518399142, + "learning_rate": 1.4880611286316487e-07, + "loss": 0.122, + "step": 9715 + }, + { + "epoch": 0.8951950983553693, + "grad_norm": 0.9990493560331191, + "learning_rate": 1.4854732568220566e-07, + "loss": 0.1316, + "step": 9716 + }, + { + "epoch": 0.8952872345326393, + "grad_norm": 0.9358320327837656, + "learning_rate": 1.4828875683387977e-07, + "loss": 0.1169, + "step": 9717 + }, + { + "epoch": 0.8953793707099093, + "grad_norm": 0.9190529344035475, + "learning_rate": 1.4803040634219612e-07, + "loss": 0.1094, + "step": 9718 + }, + { + "epoch": 0.8954715068871792, + "grad_norm": 0.9687233614559736, + "learning_rate": 1.4777227423114271e-07, + "loss": 0.12, + "step": 9719 + }, + { + "epoch": 0.8955636430644492, + "grad_norm": 1.0284364178633627, + "learning_rate": 1.4751436052468677e-07, + "loss": 0.1258, + "step": 9720 + }, + { + "epoch": 0.8956557792417192, + "grad_norm": 0.87744147112822, + "learning_rate": 1.4725666524677496e-07, + "loss": 0.1086, + "step": 9721 + }, + { + "epoch": 0.8957479154189892, + "grad_norm": 0.9439290320132014, + "learning_rate": 1.4699918842133536e-07, + "loss": 0.1211, + "step": 9722 + }, + { + "epoch": 0.8958400515962592, + "grad_norm": 0.8633906867334843, + "learning_rate": 1.4674193007227416e-07, + "loss": 0.1035, + "step": 9723 + }, + { + "epoch": 0.8959321877735292, + "grad_norm": 0.9399171676121051, + "learning_rate": 1.464848902234786e-07, + "loss": 0.1188, + "step": 9724 + }, + { + "epoch": 0.8960243239507992, + "grad_norm": 1.200021321856677, + "learning_rate": 1.4622806889881407e-07, + "loss": 0.1209, + "step": 9725 + }, + { + "epoch": 0.8961164601280693, + "grad_norm": 0.9660581201978683, + "learning_rate": 1.4597146612212622e-07, + "loss": 0.1165, + "step": 9726 + }, + { + "epoch": 0.8962085963053393, + "grad_norm": 0.9637032682306422, + "learning_rate": 1.457150819172412e-07, + "loss": 0.123, + "step": 9727 + }, + { + "epoch": 0.8963007324826093, + "grad_norm": 0.9216934942341704, + "learning_rate": 1.454589163079645e-07, + "loss": 0.1088, + "step": 9728 + }, + { + "epoch": 0.8963928686598793, + "grad_norm": 0.8845350904114487, + "learning_rate": 1.4520296931808064e-07, + "loss": 0.112, + "step": 9729 + }, + { + "epoch": 0.8964850048371493, + "grad_norm": 0.9438486506585017, + "learning_rate": 1.449472409713548e-07, + "loss": 0.1236, + "step": 9730 + }, + { + "epoch": 0.8965771410144193, + "grad_norm": 0.957677136788932, + "learning_rate": 1.4469173129153052e-07, + "loss": 0.1181, + "step": 9731 + }, + { + "epoch": 0.8966692771916893, + "grad_norm": 0.9933864337772672, + "learning_rate": 1.4443644030233268e-07, + "loss": 0.1305, + "step": 9732 + }, + { + "epoch": 0.8967614133689593, + "grad_norm": 0.8999166286380204, + "learning_rate": 1.4418136802746507e-07, + "loss": 0.1068, + "step": 9733 + }, + { + "epoch": 0.8968535495462293, + "grad_norm": 0.9631857910045679, + "learning_rate": 1.4392651449061075e-07, + "loss": 0.1221, + "step": 9734 + }, + { + "epoch": 0.8969456857234993, + "grad_norm": 0.8684313853698533, + "learning_rate": 1.4367187971543352e-07, + "loss": 0.1085, + "step": 9735 + }, + { + "epoch": 0.8970378219007693, + "grad_norm": 0.8674388014206332, + "learning_rate": 1.434174637255753e-07, + "loss": 0.0992, + "step": 9736 + }, + { + "epoch": 0.8971299580780393, + "grad_norm": 0.9874178781613823, + "learning_rate": 1.4316326654465972e-07, + "loss": 0.128, + "step": 9737 + }, + { + "epoch": 0.8972220942553093, + "grad_norm": 0.9477060570555671, + "learning_rate": 1.429092881962882e-07, + "loss": 0.1228, + "step": 9738 + }, + { + "epoch": 0.8973142304325794, + "grad_norm": 0.9149830735214507, + "learning_rate": 1.4265552870404265e-07, + "loss": 0.1189, + "step": 9739 + }, + { + "epoch": 0.8974063666098494, + "grad_norm": 0.9544319608703057, + "learning_rate": 1.4240198809148537e-07, + "loss": 0.1261, + "step": 9740 + }, + { + "epoch": 0.8974985027871194, + "grad_norm": 0.9299058821822609, + "learning_rate": 1.421486663821575e-07, + "loss": 0.1085, + "step": 9741 + }, + { + "epoch": 0.8975906389643894, + "grad_norm": 0.9000451973033925, + "learning_rate": 1.4189556359957917e-07, + "loss": 0.1145, + "step": 9742 + }, + { + "epoch": 0.8976827751416594, + "grad_norm": 0.9255410508309111, + "learning_rate": 1.4164267976725154e-07, + "loss": 0.1196, + "step": 9743 + }, + { + "epoch": 0.8977749113189294, + "grad_norm": 0.9849259768868959, + "learning_rate": 1.41390014908655e-07, + "loss": 0.1232, + "step": 9744 + }, + { + "epoch": 0.8978670474961994, + "grad_norm": 0.9322779341663346, + "learning_rate": 1.4113756904724967e-07, + "loss": 0.1113, + "step": 9745 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.9525263463078988, + "learning_rate": 1.4088534220647487e-07, + "loss": 0.124, + "step": 9746 + }, + { + "epoch": 0.8980513198507394, + "grad_norm": 0.921740201644219, + "learning_rate": 1.4063333440974963e-07, + "loss": 0.1108, + "step": 9747 + }, + { + "epoch": 0.8981434560280094, + "grad_norm": 0.9209820144117232, + "learning_rate": 1.403815456804733e-07, + "loss": 0.1071, + "step": 9748 + }, + { + "epoch": 0.8982355922052794, + "grad_norm": 0.9262180070109551, + "learning_rate": 1.4012997604202466e-07, + "loss": 0.1143, + "step": 9749 + }, + { + "epoch": 0.8983277283825494, + "grad_norm": 0.8761195305273904, + "learning_rate": 1.3987862551776143e-07, + "loss": 0.1079, + "step": 9750 + }, + { + "epoch": 0.8984198645598194, + "grad_norm": 0.9368357522890151, + "learning_rate": 1.3962749413102216e-07, + "loss": 0.1136, + "step": 9751 + }, + { + "epoch": 0.8985120007370894, + "grad_norm": 0.9377461590430607, + "learning_rate": 1.3937658190512377e-07, + "loss": 0.1209, + "step": 9752 + }, + { + "epoch": 0.8986041369143595, + "grad_norm": 0.8844793469481224, + "learning_rate": 1.3912588886336397e-07, + "loss": 0.114, + "step": 9753 + }, + { + "epoch": 0.8986962730916295, + "grad_norm": 0.9335683035286864, + "learning_rate": 1.388754150290192e-07, + "loss": 0.1234, + "step": 9754 + }, + { + "epoch": 0.8987884092688995, + "grad_norm": 0.9053742346487745, + "learning_rate": 1.3862516042534634e-07, + "loss": 0.1042, + "step": 9755 + }, + { + "epoch": 0.8988805454461695, + "grad_norm": 0.9444620424685304, + "learning_rate": 1.3837512507558188e-07, + "loss": 0.1192, + "step": 9756 + }, + { + "epoch": 0.8989726816234395, + "grad_norm": 0.8816034579977505, + "learning_rate": 1.3812530900294107e-07, + "loss": 0.1131, + "step": 9757 + }, + { + "epoch": 0.8990648178007095, + "grad_norm": 0.8964634166927596, + "learning_rate": 1.37875712230619e-07, + "loss": 0.1141, + "step": 9758 + }, + { + "epoch": 0.8991569539779795, + "grad_norm": 1.020273171079443, + "learning_rate": 1.376263347817916e-07, + "loss": 0.1255, + "step": 9759 + }, + { + "epoch": 0.8992490901552495, + "grad_norm": 0.9267078640525189, + "learning_rate": 1.3737717667961308e-07, + "loss": 0.1206, + "step": 9760 + }, + { + "epoch": 0.8993412263325195, + "grad_norm": 0.930151281773034, + "learning_rate": 1.371282379472183e-07, + "loss": 0.1116, + "step": 9761 + }, + { + "epoch": 0.8994333625097894, + "grad_norm": 0.88549447679142, + "learning_rate": 1.3687951860772098e-07, + "loss": 0.1131, + "step": 9762 + }, + { + "epoch": 0.8995254986870594, + "grad_norm": 0.913221976034471, + "learning_rate": 1.366310186842143e-07, + "loss": 0.1157, + "step": 9763 + }, + { + "epoch": 0.8996176348643294, + "grad_norm": 0.9949130994048048, + "learning_rate": 1.3638273819977205e-07, + "loss": 0.1205, + "step": 9764 + }, + { + "epoch": 0.8997097710415994, + "grad_norm": 0.9465781966051717, + "learning_rate": 1.3613467717744661e-07, + "loss": 0.1234, + "step": 9765 + }, + { + "epoch": 0.8998019072188694, + "grad_norm": 0.9411758903196948, + "learning_rate": 1.358868356402715e-07, + "loss": 0.122, + "step": 9766 + }, + { + "epoch": 0.8998940433961395, + "grad_norm": 1.0203783648280509, + "learning_rate": 1.3563921361125804e-07, + "loss": 0.1273, + "step": 9767 + }, + { + "epoch": 0.8999861795734095, + "grad_norm": 0.9816594213978099, + "learning_rate": 1.3539181111339754e-07, + "loss": 0.1182, + "step": 9768 + }, + { + "epoch": 0.9000783157506795, + "grad_norm": 0.889791816825393, + "learning_rate": 1.3514462816966195e-07, + "loss": 0.106, + "step": 9769 + }, + { + "epoch": 0.9001704519279495, + "grad_norm": 0.9633466334197318, + "learning_rate": 1.3489766480300232e-07, + "loss": 0.1252, + "step": 9770 + }, + { + "epoch": 0.9002625881052195, + "grad_norm": 0.9379316337177459, + "learning_rate": 1.3465092103634892e-07, + "loss": 0.1209, + "step": 9771 + }, + { + "epoch": 0.9003547242824895, + "grad_norm": 0.9954431482111418, + "learning_rate": 1.3440439689261232e-07, + "loss": 0.1249, + "step": 9772 + }, + { + "epoch": 0.9004468604597595, + "grad_norm": 0.9238096644006695, + "learning_rate": 1.3415809239468198e-07, + "loss": 0.1133, + "step": 9773 + }, + { + "epoch": 0.9005389966370295, + "grad_norm": 0.9604790895841134, + "learning_rate": 1.3391200756542738e-07, + "loss": 0.1198, + "step": 9774 + }, + { + "epoch": 0.9006311328142995, + "grad_norm": 0.974038640437757, + "learning_rate": 1.336661424276972e-07, + "loss": 0.1186, + "step": 9775 + }, + { + "epoch": 0.9007232689915695, + "grad_norm": 0.9207157778378379, + "learning_rate": 1.334204970043204e-07, + "loss": 0.112, + "step": 9776 + }, + { + "epoch": 0.9008154051688395, + "grad_norm": 0.9607697109332836, + "learning_rate": 1.331750713181054e-07, + "loss": 0.1186, + "step": 9777 + }, + { + "epoch": 0.9009075413461095, + "grad_norm": 0.9612023119466364, + "learning_rate": 1.3292986539184011e-07, + "loss": 0.1217, + "step": 9778 + }, + { + "epoch": 0.9009996775233795, + "grad_norm": 0.9352391442078847, + "learning_rate": 1.32684879248291e-07, + "loss": 0.1181, + "step": 9779 + }, + { + "epoch": 0.9010918137006496, + "grad_norm": 0.938840287739617, + "learning_rate": 1.324401129102057e-07, + "loss": 0.113, + "step": 9780 + }, + { + "epoch": 0.9011839498779196, + "grad_norm": 0.9599975054539138, + "learning_rate": 1.321955664003105e-07, + "loss": 0.1229, + "step": 9781 + }, + { + "epoch": 0.9012760860551896, + "grad_norm": 0.9002389769621546, + "learning_rate": 1.3195123974131252e-07, + "loss": 0.109, + "step": 9782 + }, + { + "epoch": 0.9013682222324596, + "grad_norm": 0.9532838208954857, + "learning_rate": 1.317071329558961e-07, + "loss": 0.1244, + "step": 9783 + }, + { + "epoch": 0.9014603584097296, + "grad_norm": 0.9008130939211582, + "learning_rate": 1.3146324606672754e-07, + "loss": 0.1171, + "step": 9784 + }, + { + "epoch": 0.9015524945869996, + "grad_norm": 0.9205065612854544, + "learning_rate": 1.3121957909645155e-07, + "loss": 0.1112, + "step": 9785 + }, + { + "epoch": 0.9016446307642696, + "grad_norm": 0.963261859456219, + "learning_rate": 1.309761320676925e-07, + "loss": 0.1296, + "step": 9786 + }, + { + "epoch": 0.9017367669415396, + "grad_norm": 0.9543745475845056, + "learning_rate": 1.3073290500305452e-07, + "loss": 0.1211, + "step": 9787 + }, + { + "epoch": 0.9018289031188096, + "grad_norm": 0.939430723758642, + "learning_rate": 1.3048989792512096e-07, + "loss": 0.1165, + "step": 9788 + }, + { + "epoch": 0.9019210392960796, + "grad_norm": 0.9899501282112528, + "learning_rate": 1.3024711085645597e-07, + "loss": 0.1174, + "step": 9789 + }, + { + "epoch": 0.9020131754733496, + "grad_norm": 0.956721372717787, + "learning_rate": 1.3000454381960127e-07, + "loss": 0.1151, + "step": 9790 + }, + { + "epoch": 0.9021053116506196, + "grad_norm": 0.9518141144978524, + "learning_rate": 1.297621968370802e-07, + "loss": 0.113, + "step": 9791 + }, + { + "epoch": 0.9021974478278896, + "grad_norm": 0.998188656043015, + "learning_rate": 1.2952006993139393e-07, + "loss": 0.13, + "step": 9792 + }, + { + "epoch": 0.9022895840051596, + "grad_norm": 0.9881266943062582, + "learning_rate": 1.2927816312502422e-07, + "loss": 0.1248, + "step": 9793 + }, + { + "epoch": 0.9023817201824297, + "grad_norm": 0.9594850371411043, + "learning_rate": 1.2903647644043254e-07, + "loss": 0.121, + "step": 9794 + }, + { + "epoch": 0.9024738563596997, + "grad_norm": 0.9412583884780079, + "learning_rate": 1.2879500990005926e-07, + "loss": 0.1118, + "step": 9795 + }, + { + "epoch": 0.9025659925369697, + "grad_norm": 0.9863617128648952, + "learning_rate": 1.2855376352632427e-07, + "loss": 0.1199, + "step": 9796 + }, + { + "epoch": 0.9026581287142397, + "grad_norm": 0.9075230786695033, + "learning_rate": 1.2831273734162736e-07, + "loss": 0.1116, + "step": 9797 + }, + { + "epoch": 0.9027502648915097, + "grad_norm": 0.970810405883738, + "learning_rate": 1.280719313683479e-07, + "loss": 0.127, + "step": 9798 + }, + { + "epoch": 0.9028424010687797, + "grad_norm": 0.969479796883492, + "learning_rate": 1.2783134562884547e-07, + "loss": 0.1309, + "step": 9799 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 0.9723007625084326, + "learning_rate": 1.275909801454578e-07, + "loss": 0.1242, + "step": 9800 + }, + { + "epoch": 0.9030266734233197, + "grad_norm": 0.9245601520664418, + "learning_rate": 1.2735083494050255e-07, + "loss": 0.1172, + "step": 9801 + }, + { + "epoch": 0.9031188096005897, + "grad_norm": 0.939522410574133, + "learning_rate": 1.2711091003627773e-07, + "loss": 0.1197, + "step": 9802 + }, + { + "epoch": 0.9032109457778597, + "grad_norm": 0.9359774641408325, + "learning_rate": 1.2687120545506054e-07, + "loss": 0.1187, + "step": 9803 + }, + { + "epoch": 0.9033030819551296, + "grad_norm": 0.9609895198256817, + "learning_rate": 1.2663172121910705e-07, + "loss": 0.1189, + "step": 9804 + }, + { + "epoch": 0.9033952181323996, + "grad_norm": 0.9098035827692788, + "learning_rate": 1.263924573506542e-07, + "loss": 0.1092, + "step": 9805 + }, + { + "epoch": 0.9034873543096696, + "grad_norm": 0.9864415404989244, + "learning_rate": 1.2615341387191644e-07, + "loss": 0.1253, + "step": 9806 + }, + { + "epoch": 0.9035794904869398, + "grad_norm": 0.9141531124611704, + "learning_rate": 1.2591459080509017e-07, + "loss": 0.1108, + "step": 9807 + }, + { + "epoch": 0.9036716266642097, + "grad_norm": 0.938159819537608, + "learning_rate": 1.2567598817234932e-07, + "loss": 0.1146, + "step": 9808 + }, + { + "epoch": 0.9037637628414797, + "grad_norm": 0.9704965870545296, + "learning_rate": 1.2543760599584842e-07, + "loss": 0.1208, + "step": 9809 + }, + { + "epoch": 0.9038558990187497, + "grad_norm": 0.933503554525939, + "learning_rate": 1.2519944429772168e-07, + "loss": 0.1109, + "step": 9810 + }, + { + "epoch": 0.9039480351960197, + "grad_norm": 1.0386870504682086, + "learning_rate": 1.2496150310008226e-07, + "loss": 0.1333, + "step": 9811 + }, + { + "epoch": 0.9040401713732897, + "grad_norm": 0.9608349972329964, + "learning_rate": 1.2472378242502247e-07, + "loss": 0.1166, + "step": 9812 + }, + { + "epoch": 0.9041323075505597, + "grad_norm": 0.9204922383585812, + "learning_rate": 1.2448628229461522e-07, + "loss": 0.1123, + "step": 9813 + }, + { + "epoch": 0.9042244437278297, + "grad_norm": 0.8858240303231056, + "learning_rate": 1.2424900273091206e-07, + "loss": 0.1059, + "step": 9814 + }, + { + "epoch": 0.9043165799050997, + "grad_norm": 0.93623941741783, + "learning_rate": 1.2401194375594532e-07, + "loss": 0.1171, + "step": 9815 + }, + { + "epoch": 0.9044087160823697, + "grad_norm": 0.9047481046424645, + "learning_rate": 1.237751053917252e-07, + "loss": 0.1137, + "step": 9816 + }, + { + "epoch": 0.9045008522596397, + "grad_norm": 0.9396357652997752, + "learning_rate": 1.235384876602419e-07, + "loss": 0.1166, + "step": 9817 + }, + { + "epoch": 0.9045929884369097, + "grad_norm": 0.942299322546276, + "learning_rate": 1.233020905834656e-07, + "loss": 0.1109, + "step": 9818 + }, + { + "epoch": 0.9046851246141797, + "grad_norm": 0.9482094011622528, + "learning_rate": 1.2306591418334624e-07, + "loss": 0.1236, + "step": 9819 + }, + { + "epoch": 0.9047772607914497, + "grad_norm": 0.973923981370888, + "learning_rate": 1.2282995848181267e-07, + "loss": 0.1238, + "step": 9820 + }, + { + "epoch": 0.9048693969687198, + "grad_norm": 0.851301869657395, + "learning_rate": 1.2259422350077348e-07, + "loss": 0.1026, + "step": 9821 + }, + { + "epoch": 0.9049615331459898, + "grad_norm": 0.924457941406162, + "learning_rate": 1.223587092621162e-07, + "loss": 0.1147, + "step": 9822 + }, + { + "epoch": 0.9050536693232598, + "grad_norm": 0.9255580493486694, + "learning_rate": 1.2212341578770854e-07, + "loss": 0.1168, + "step": 9823 + }, + { + "epoch": 0.9051458055005298, + "grad_norm": 0.9023318089918937, + "learning_rate": 1.2188834309939806e-07, + "loss": 0.1158, + "step": 9824 + }, + { + "epoch": 0.9052379416777998, + "grad_norm": 0.8721923584983146, + "learning_rate": 1.2165349121901037e-07, + "loss": 0.1126, + "step": 9825 + }, + { + "epoch": 0.9053300778550698, + "grad_norm": 0.9105543472402409, + "learning_rate": 1.2141886016835246e-07, + "loss": 0.1137, + "step": 9826 + }, + { + "epoch": 0.9054222140323398, + "grad_norm": 0.9505582223687297, + "learning_rate": 1.2118444996920887e-07, + "loss": 0.1172, + "step": 9827 + }, + { + "epoch": 0.9055143502096098, + "grad_norm": 0.9050373360289897, + "learning_rate": 1.2095026064334548e-07, + "loss": 0.1186, + "step": 9828 + }, + { + "epoch": 0.9056064863868798, + "grad_norm": 0.9383008904163106, + "learning_rate": 1.207162922125063e-07, + "loss": 0.1197, + "step": 9829 + }, + { + "epoch": 0.9056986225641498, + "grad_norm": 0.913038696222506, + "learning_rate": 1.2048254469841508e-07, + "loss": 0.1071, + "step": 9830 + }, + { + "epoch": 0.9057907587414198, + "grad_norm": 0.9626035864124302, + "learning_rate": 1.2024901812277639e-07, + "loss": 0.1223, + "step": 9831 + }, + { + "epoch": 0.9058828949186898, + "grad_norm": 0.9518301451963114, + "learning_rate": 1.2001571250727233e-07, + "loss": 0.1132, + "step": 9832 + }, + { + "epoch": 0.9059750310959598, + "grad_norm": 0.8904070068361779, + "learning_rate": 1.1978262787356504e-07, + "loss": 0.1131, + "step": 9833 + }, + { + "epoch": 0.9060671672732298, + "grad_norm": 0.8945005093728289, + "learning_rate": 1.1954976424329716e-07, + "loss": 0.114, + "step": 9834 + }, + { + "epoch": 0.9061593034504999, + "grad_norm": 0.9772335300116917, + "learning_rate": 1.193171216380895e-07, + "loss": 0.1168, + "step": 9835 + }, + { + "epoch": 0.9062514396277699, + "grad_norm": 0.9413307860703609, + "learning_rate": 1.1908470007954392e-07, + "loss": 0.1175, + "step": 9836 + }, + { + "epoch": 0.9063435758050399, + "grad_norm": 0.8985101580638164, + "learning_rate": 1.188524995892401e-07, + "loss": 0.1165, + "step": 9837 + }, + { + "epoch": 0.9064357119823099, + "grad_norm": 0.9558883496465006, + "learning_rate": 1.1862052018873777e-07, + "loss": 0.1181, + "step": 9838 + }, + { + "epoch": 0.9065278481595799, + "grad_norm": 0.9821413035235732, + "learning_rate": 1.1838876189957632e-07, + "loss": 0.1342, + "step": 9839 + }, + { + "epoch": 0.9066199843368499, + "grad_norm": 0.9294065453209202, + "learning_rate": 1.1815722474327495e-07, + "loss": 0.1159, + "step": 9840 + }, + { + "epoch": 0.9067121205141199, + "grad_norm": 0.941478958853304, + "learning_rate": 1.1792590874133119e-07, + "loss": 0.115, + "step": 9841 + }, + { + "epoch": 0.9068042566913899, + "grad_norm": 0.9511182921769091, + "learning_rate": 1.176948139152237e-07, + "loss": 0.1225, + "step": 9842 + }, + { + "epoch": 0.9068963928686599, + "grad_norm": 0.9305028355356414, + "learning_rate": 1.1746394028640862e-07, + "loss": 0.1236, + "step": 9843 + }, + { + "epoch": 0.9069885290459299, + "grad_norm": 0.909927237435103, + "learning_rate": 1.1723328787632354e-07, + "loss": 0.119, + "step": 9844 + }, + { + "epoch": 0.9070806652231999, + "grad_norm": 0.9264740890403086, + "learning_rate": 1.1700285670638356e-07, + "loss": 0.1105, + "step": 9845 + }, + { + "epoch": 0.9071728014004699, + "grad_norm": 0.9332255111550347, + "learning_rate": 1.1677264679798489e-07, + "loss": 0.1135, + "step": 9846 + }, + { + "epoch": 0.9072649375777398, + "grad_norm": 0.8932121363052186, + "learning_rate": 1.1654265817250294e-07, + "loss": 0.0993, + "step": 9847 + }, + { + "epoch": 0.90735707375501, + "grad_norm": 0.984756358362903, + "learning_rate": 1.1631289085129143e-07, + "loss": 0.1209, + "step": 9848 + }, + { + "epoch": 0.90744920993228, + "grad_norm": 0.8797646764277562, + "learning_rate": 1.1608334485568446e-07, + "loss": 0.1099, + "step": 9849 + }, + { + "epoch": 0.90754134610955, + "grad_norm": 0.9266451524625748, + "learning_rate": 1.1585402020699548e-07, + "loss": 0.1128, + "step": 9850 + }, + { + "epoch": 0.90763348228682, + "grad_norm": 0.9936081829017386, + "learning_rate": 1.1562491692651723e-07, + "loss": 0.1223, + "step": 9851 + }, + { + "epoch": 0.90772561846409, + "grad_norm": 1.0064731245012568, + "learning_rate": 1.153960350355221e-07, + "loss": 0.124, + "step": 9852 + }, + { + "epoch": 0.9078177546413599, + "grad_norm": 0.9249707712056661, + "learning_rate": 1.1516737455526228e-07, + "loss": 0.1159, + "step": 9853 + }, + { + "epoch": 0.9079098908186299, + "grad_norm": 0.9030367953806261, + "learning_rate": 1.14938935506968e-07, + "loss": 0.1184, + "step": 9854 + }, + { + "epoch": 0.9080020269958999, + "grad_norm": 1.007252102658938, + "learning_rate": 1.1471071791185007e-07, + "loss": 0.1362, + "step": 9855 + }, + { + "epoch": 0.9080941631731699, + "grad_norm": 0.9551043810463008, + "learning_rate": 1.1448272179109848e-07, + "loss": 0.1233, + "step": 9856 + }, + { + "epoch": 0.9081862993504399, + "grad_norm": 1.0072214892101452, + "learning_rate": 1.1425494716588353e-07, + "loss": 0.1238, + "step": 9857 + }, + { + "epoch": 0.9082784355277099, + "grad_norm": 0.9042976003595324, + "learning_rate": 1.1402739405735303e-07, + "loss": 0.1067, + "step": 9858 + }, + { + "epoch": 0.9083705717049799, + "grad_norm": 0.9074508823982074, + "learning_rate": 1.1380006248663616e-07, + "loss": 0.1212, + "step": 9859 + }, + { + "epoch": 0.9084627078822499, + "grad_norm": 0.9749475166102289, + "learning_rate": 1.1357295247483997e-07, + "loss": 0.1184, + "step": 9860 + }, + { + "epoch": 0.9085548440595199, + "grad_norm": 0.9722337825584535, + "learning_rate": 1.1334606404305226e-07, + "loss": 0.1253, + "step": 9861 + }, + { + "epoch": 0.90864698023679, + "grad_norm": 0.9000281767600011, + "learning_rate": 1.13119397212339e-07, + "loss": 0.1063, + "step": 9862 + }, + { + "epoch": 0.90873911641406, + "grad_norm": 0.8865674739066136, + "learning_rate": 1.1289295200374667e-07, + "loss": 0.1021, + "step": 9863 + }, + { + "epoch": 0.90883125259133, + "grad_norm": 0.9338951859368928, + "learning_rate": 1.1266672843830095e-07, + "loss": 0.1124, + "step": 9864 + }, + { + "epoch": 0.9089233887686, + "grad_norm": 0.9355079432027297, + "learning_rate": 1.1244072653700644e-07, + "loss": 0.1071, + "step": 9865 + }, + { + "epoch": 0.90901552494587, + "grad_norm": 0.9657351672980083, + "learning_rate": 1.122149463208469e-07, + "loss": 0.1143, + "step": 9866 + }, + { + "epoch": 0.90910766112314, + "grad_norm": 0.9550170888128628, + "learning_rate": 1.1198938781078694e-07, + "loss": 0.1183, + "step": 9867 + }, + { + "epoch": 0.90919979730041, + "grad_norm": 0.9679298136534235, + "learning_rate": 1.1176405102776899e-07, + "loss": 0.1078, + "step": 9868 + }, + { + "epoch": 0.90929193347768, + "grad_norm": 0.9730850211805437, + "learning_rate": 1.1153893599271631e-07, + "loss": 0.1149, + "step": 9869 + }, + { + "epoch": 0.90938406965495, + "grad_norm": 0.9591042974754497, + "learning_rate": 1.1131404272653051e-07, + "loss": 0.1122, + "step": 9870 + }, + { + "epoch": 0.90947620583222, + "grad_norm": 0.9421743521813682, + "learning_rate": 1.1108937125009266e-07, + "loss": 0.1212, + "step": 9871 + }, + { + "epoch": 0.90956834200949, + "grad_norm": 0.9474137614317413, + "learning_rate": 1.1086492158426387e-07, + "loss": 0.1276, + "step": 9872 + }, + { + "epoch": 0.90966047818676, + "grad_norm": 0.9496210187132605, + "learning_rate": 1.106406937498844e-07, + "loss": 0.117, + "step": 9873 + }, + { + "epoch": 0.90975261436403, + "grad_norm": 0.9623210428089947, + "learning_rate": 1.1041668776777342e-07, + "loss": 0.1188, + "step": 9874 + }, + { + "epoch": 0.9098447505413001, + "grad_norm": 0.9688398891785434, + "learning_rate": 1.1019290365873042e-07, + "loss": 0.1167, + "step": 9875 + }, + { + "epoch": 0.9099368867185701, + "grad_norm": 0.9438973611520155, + "learning_rate": 1.0996934144353322e-07, + "loss": 0.1238, + "step": 9876 + }, + { + "epoch": 0.9100290228958401, + "grad_norm": 1.0123754346766263, + "learning_rate": 1.0974600114293993e-07, + "loss": 0.131, + "step": 9877 + }, + { + "epoch": 0.9101211590731101, + "grad_norm": 0.9369173621826444, + "learning_rate": 1.0952288277768786e-07, + "loss": 0.1243, + "step": 9878 + }, + { + "epoch": 0.9102132952503801, + "grad_norm": 0.9104725907747405, + "learning_rate": 1.0929998636849321e-07, + "loss": 0.1114, + "step": 9879 + }, + { + "epoch": 0.9103054314276501, + "grad_norm": 0.8911323682845301, + "learning_rate": 1.090773119360522e-07, + "loss": 0.1047, + "step": 9880 + }, + { + "epoch": 0.9103975676049201, + "grad_norm": 0.9328489809284518, + "learning_rate": 1.0885485950103997e-07, + "loss": 0.115, + "step": 9881 + }, + { + "epoch": 0.9104897037821901, + "grad_norm": 0.9185301495705797, + "learning_rate": 1.0863262908411165e-07, + "loss": 0.1102, + "step": 9882 + }, + { + "epoch": 0.9105818399594601, + "grad_norm": 0.9338449898577366, + "learning_rate": 1.0841062070590074e-07, + "loss": 0.1195, + "step": 9883 + }, + { + "epoch": 0.9106739761367301, + "grad_norm": 0.9028633254451033, + "learning_rate": 1.0818883438702105e-07, + "loss": 0.107, + "step": 9884 + }, + { + "epoch": 0.9107661123140001, + "grad_norm": 0.9441449194556039, + "learning_rate": 1.0796727014806607e-07, + "loss": 0.1209, + "step": 9885 + }, + { + "epoch": 0.9108582484912701, + "grad_norm": 0.943313665217802, + "learning_rate": 1.0774592800960715e-07, + "loss": 0.119, + "step": 9886 + }, + { + "epoch": 0.9109503846685401, + "grad_norm": 0.9296020252630918, + "learning_rate": 1.0752480799219616e-07, + "loss": 0.117, + "step": 9887 + }, + { + "epoch": 0.91104252084581, + "grad_norm": 0.9667448070909315, + "learning_rate": 1.073039101163642e-07, + "loss": 0.1242, + "step": 9888 + }, + { + "epoch": 0.9111346570230802, + "grad_norm": 0.909844133598178, + "learning_rate": 1.0708323440262153e-07, + "loss": 0.103, + "step": 9889 + }, + { + "epoch": 0.9112267932003502, + "grad_norm": 0.9273168759001507, + "learning_rate": 1.0686278087145868e-07, + "loss": 0.1183, + "step": 9890 + }, + { + "epoch": 0.9113189293776202, + "grad_norm": 0.9491675032145784, + "learning_rate": 1.0664254954334402e-07, + "loss": 0.112, + "step": 9891 + }, + { + "epoch": 0.9114110655548902, + "grad_norm": 0.9436010409635454, + "learning_rate": 1.064225404387259e-07, + "loss": 0.1163, + "step": 9892 + }, + { + "epoch": 0.9115032017321602, + "grad_norm": 0.9369245507410711, + "learning_rate": 1.0620275357803244e-07, + "loss": 0.1219, + "step": 9893 + }, + { + "epoch": 0.9115953379094301, + "grad_norm": 0.934345303014593, + "learning_rate": 1.059831889816712e-07, + "loss": 0.1149, + "step": 9894 + }, + { + "epoch": 0.9116874740867001, + "grad_norm": 0.9164697181421179, + "learning_rate": 1.0576384667002837e-07, + "loss": 0.1091, + "step": 9895 + }, + { + "epoch": 0.9117796102639701, + "grad_norm": 0.9185437968697807, + "learning_rate": 1.0554472666347043e-07, + "loss": 0.1245, + "step": 9896 + }, + { + "epoch": 0.9118717464412401, + "grad_norm": 0.9367584691113693, + "learning_rate": 1.0532582898234167e-07, + "loss": 0.1224, + "step": 9897 + }, + { + "epoch": 0.9119638826185101, + "grad_norm": 0.9166216071894971, + "learning_rate": 1.0510715364696806e-07, + "loss": 0.1278, + "step": 9898 + }, + { + "epoch": 0.9120560187957801, + "grad_norm": 0.8699664872405362, + "learning_rate": 1.048887006776525e-07, + "loss": 0.1071, + "step": 9899 + }, + { + "epoch": 0.9121481549730501, + "grad_norm": 0.9623063255443589, + "learning_rate": 1.0467047009467878e-07, + "loss": 0.1179, + "step": 9900 + }, + { + "epoch": 0.9122402911503201, + "grad_norm": 0.9751253664006017, + "learning_rate": 1.0445246191831015e-07, + "loss": 0.1222, + "step": 9901 + }, + { + "epoch": 0.9123324273275901, + "grad_norm": 0.9550208126269293, + "learning_rate": 1.0423467616878819e-07, + "loss": 0.1223, + "step": 9902 + }, + { + "epoch": 0.9124245635048602, + "grad_norm": 0.9520464240161787, + "learning_rate": 1.040171128663342e-07, + "loss": 0.1113, + "step": 9903 + }, + { + "epoch": 0.9125166996821302, + "grad_norm": 0.9800160176815804, + "learning_rate": 1.03799772031149e-07, + "loss": 0.1258, + "step": 9904 + }, + { + "epoch": 0.9126088358594002, + "grad_norm": 0.9131937286932651, + "learning_rate": 1.035826536834128e-07, + "loss": 0.1107, + "step": 9905 + }, + { + "epoch": 0.9127009720366702, + "grad_norm": 0.9198729568337226, + "learning_rate": 1.0336575784328534e-07, + "loss": 0.1129, + "step": 9906 + }, + { + "epoch": 0.9127931082139402, + "grad_norm": 0.9354820808259293, + "learning_rate": 1.031490845309055e-07, + "loss": 0.1155, + "step": 9907 + }, + { + "epoch": 0.9128852443912102, + "grad_norm": 0.9341493220138887, + "learning_rate": 1.029326337663905e-07, + "loss": 0.1136, + "step": 9908 + }, + { + "epoch": 0.9129773805684802, + "grad_norm": 0.9341967204483506, + "learning_rate": 1.0271640556983875e-07, + "loss": 0.1117, + "step": 9909 + }, + { + "epoch": 0.9130695167457502, + "grad_norm": 0.9531420223699355, + "learning_rate": 1.0250039996132637e-07, + "loss": 0.1191, + "step": 9910 + }, + { + "epoch": 0.9131616529230202, + "grad_norm": 0.9487379938366196, + "learning_rate": 1.0228461696091041e-07, + "loss": 0.116, + "step": 9911 + }, + { + "epoch": 0.9132537891002902, + "grad_norm": 0.9383824540449928, + "learning_rate": 1.0206905658862592e-07, + "loss": 0.1203, + "step": 9912 + }, + { + "epoch": 0.9133459252775602, + "grad_norm": 0.8714240991607098, + "learning_rate": 1.0185371886448719e-07, + "loss": 0.1039, + "step": 9913 + }, + { + "epoch": 0.9134380614548302, + "grad_norm": 0.9609563547823312, + "learning_rate": 1.016386038084885e-07, + "loss": 0.1293, + "step": 9914 + }, + { + "epoch": 0.9135301976321002, + "grad_norm": 0.9029910797521731, + "learning_rate": 1.0142371144060414e-07, + "loss": 0.1026, + "step": 9915 + }, + { + "epoch": 0.9136223338093703, + "grad_norm": 0.9201532523605698, + "learning_rate": 1.0120904178078594e-07, + "loss": 0.1125, + "step": 9916 + }, + { + "epoch": 0.9137144699866403, + "grad_norm": 0.935929708191619, + "learning_rate": 1.0099459484896684e-07, + "loss": 0.1161, + "step": 9917 + }, + { + "epoch": 0.9138066061639103, + "grad_norm": 0.9579121910083173, + "learning_rate": 1.007803706650573e-07, + "loss": 0.1201, + "step": 9918 + }, + { + "epoch": 0.9138987423411803, + "grad_norm": 0.8990511111015285, + "learning_rate": 1.0056636924894864e-07, + "loss": 0.1082, + "step": 9919 + }, + { + "epoch": 0.9139908785184503, + "grad_norm": 0.9419869327103809, + "learning_rate": 1.0035259062051079e-07, + "loss": 0.111, + "step": 9920 + }, + { + "epoch": 0.9140830146957203, + "grad_norm": 0.9481056579845162, + "learning_rate": 1.0013903479959313e-07, + "loss": 0.1107, + "step": 9921 + }, + { + "epoch": 0.9141751508729903, + "grad_norm": 0.9419350381453994, + "learning_rate": 9.992570180602484e-08, + "loss": 0.1183, + "step": 9922 + }, + { + "epoch": 0.9142672870502603, + "grad_norm": 0.9768882094713995, + "learning_rate": 9.971259165961312e-08, + "loss": 0.1186, + "step": 9923 + }, + { + "epoch": 0.9143594232275303, + "grad_norm": 0.9286412912317882, + "learning_rate": 9.949970438014544e-08, + "loss": 0.1103, + "step": 9924 + }, + { + "epoch": 0.9144515594048003, + "grad_norm": 0.8956628077663775, + "learning_rate": 9.928703998738853e-08, + "loss": 0.1068, + "step": 9925 + }, + { + "epoch": 0.9145436955820703, + "grad_norm": 0.9399039480242753, + "learning_rate": 9.907459850108824e-08, + "loss": 0.1131, + "step": 9926 + }, + { + "epoch": 0.9146358317593403, + "grad_norm": 0.9283382943599822, + "learning_rate": 9.886237994097048e-08, + "loss": 0.1147, + "step": 9927 + }, + { + "epoch": 0.9147279679366103, + "grad_norm": 0.9424001613736077, + "learning_rate": 9.86503843267389e-08, + "loss": 0.115, + "step": 9928 + }, + { + "epoch": 0.9148201041138803, + "grad_norm": 0.9528230598198278, + "learning_rate": 9.843861167807722e-08, + "loss": 0.1271, + "step": 9929 + }, + { + "epoch": 0.9149122402911504, + "grad_norm": 0.8861050600901383, + "learning_rate": 9.822706201464915e-08, + "loss": 0.1077, + "step": 9930 + }, + { + "epoch": 0.9150043764684204, + "grad_norm": 0.9660146570263729, + "learning_rate": 9.801573535609677e-08, + "loss": 0.1187, + "step": 9931 + }, + { + "epoch": 0.9150965126456904, + "grad_norm": 0.9516861737652685, + "learning_rate": 9.780463172204186e-08, + "loss": 0.122, + "step": 9932 + }, + { + "epoch": 0.9151886488229604, + "grad_norm": 0.955635835206404, + "learning_rate": 9.759375113208541e-08, + "loss": 0.1159, + "step": 9933 + }, + { + "epoch": 0.9152807850002304, + "grad_norm": 0.9384288101603031, + "learning_rate": 9.738309360580789e-08, + "loss": 0.1297, + "step": 9934 + }, + { + "epoch": 0.9153729211775004, + "grad_norm": 0.9568435927717273, + "learning_rate": 9.717265916276863e-08, + "loss": 0.1173, + "step": 9935 + }, + { + "epoch": 0.9154650573547704, + "grad_norm": 0.8958470833309566, + "learning_rate": 9.696244782250675e-08, + "loss": 0.1032, + "step": 9936 + }, + { + "epoch": 0.9155571935320403, + "grad_norm": 0.9548627720230944, + "learning_rate": 9.675245960453966e-08, + "loss": 0.1208, + "step": 9937 + }, + { + "epoch": 0.9156493297093103, + "grad_norm": 0.9615401456254469, + "learning_rate": 9.654269452836567e-08, + "loss": 0.1273, + "step": 9938 + }, + { + "epoch": 0.9157414658865803, + "grad_norm": 0.9336984788547501, + "learning_rate": 9.633315261346115e-08, + "loss": 0.1111, + "step": 9939 + }, + { + "epoch": 0.9158336020638503, + "grad_norm": 0.9676566615609804, + "learning_rate": 9.612383387928248e-08, + "loss": 0.1238, + "step": 9940 + }, + { + "epoch": 0.9159257382411203, + "grad_norm": 0.9365548618439336, + "learning_rate": 9.59147383452641e-08, + "loss": 0.1171, + "step": 9941 + }, + { + "epoch": 0.9160178744183903, + "grad_norm": 0.9679108953343, + "learning_rate": 9.570586603082078e-08, + "loss": 0.1187, + "step": 9942 + }, + { + "epoch": 0.9161100105956604, + "grad_norm": 0.9693508193733982, + "learning_rate": 9.549721695534669e-08, + "loss": 0.1153, + "step": 9943 + }, + { + "epoch": 0.9162021467729304, + "grad_norm": 0.8984541268521159, + "learning_rate": 9.528879113821526e-08, + "loss": 0.1151, + "step": 9944 + }, + { + "epoch": 0.9162942829502004, + "grad_norm": 0.9104078134300633, + "learning_rate": 9.508058859877794e-08, + "loss": 0.1149, + "step": 9945 + }, + { + "epoch": 0.9163864191274704, + "grad_norm": 0.9581148806404735, + "learning_rate": 9.487260935636678e-08, + "loss": 0.1153, + "step": 9946 + }, + { + "epoch": 0.9164785553047404, + "grad_norm": 0.8834590853387024, + "learning_rate": 9.466485343029269e-08, + "loss": 0.109, + "step": 9947 + }, + { + "epoch": 0.9165706914820104, + "grad_norm": 0.9112418757641385, + "learning_rate": 9.44573208398461e-08, + "loss": 0.1125, + "step": 9948 + }, + { + "epoch": 0.9166628276592804, + "grad_norm": 0.9450438994219061, + "learning_rate": 9.425001160429603e-08, + "loss": 0.1159, + "step": 9949 + }, + { + "epoch": 0.9167549638365504, + "grad_norm": 0.9472996467579019, + "learning_rate": 9.404292574289126e-08, + "loss": 0.1183, + "step": 9950 + }, + { + "epoch": 0.9168471000138204, + "grad_norm": 0.8966195092536017, + "learning_rate": 9.383606327485973e-08, + "loss": 0.1044, + "step": 9951 + }, + { + "epoch": 0.9169392361910904, + "grad_norm": 0.9532262581930256, + "learning_rate": 9.362942421940885e-08, + "loss": 0.1214, + "step": 9952 + }, + { + "epoch": 0.9170313723683604, + "grad_norm": 0.9579253822700269, + "learning_rate": 9.342300859572467e-08, + "loss": 0.1217, + "step": 9953 + }, + { + "epoch": 0.9171235085456304, + "grad_norm": 0.9434676831721229, + "learning_rate": 9.321681642297298e-08, + "loss": 0.1228, + "step": 9954 + }, + { + "epoch": 0.9172156447229004, + "grad_norm": 0.9044628242856921, + "learning_rate": 9.301084772029928e-08, + "loss": 0.1061, + "step": 9955 + }, + { + "epoch": 0.9173077809001704, + "grad_norm": 0.9238703964017801, + "learning_rate": 9.280510250682745e-08, + "loss": 0.123, + "step": 9956 + }, + { + "epoch": 0.9173999170774405, + "grad_norm": 0.8527902116443628, + "learning_rate": 9.259958080166081e-08, + "loss": 0.1017, + "step": 9957 + }, + { + "epoch": 0.9174920532547105, + "grad_norm": 0.9095947345394509, + "learning_rate": 9.23942826238819e-08, + "loss": 0.1098, + "step": 9958 + }, + { + "epoch": 0.9175841894319805, + "grad_norm": 0.8956255963339151, + "learning_rate": 9.218920799255293e-08, + "loss": 0.1111, + "step": 9959 + }, + { + "epoch": 0.9176763256092505, + "grad_norm": 0.971998459665577, + "learning_rate": 9.198435692671565e-08, + "loss": 0.1157, + "step": 9960 + }, + { + "epoch": 0.9177684617865205, + "grad_norm": 0.9697718892880428, + "learning_rate": 9.177972944538982e-08, + "loss": 0.1131, + "step": 9961 + }, + { + "epoch": 0.9178605979637905, + "grad_norm": 0.9544535119686984, + "learning_rate": 9.157532556757526e-08, + "loss": 0.1171, + "step": 9962 + }, + { + "epoch": 0.9179527341410605, + "grad_norm": 0.9497954111135992, + "learning_rate": 9.137114531225066e-08, + "loss": 0.115, + "step": 9963 + }, + { + "epoch": 0.9180448703183305, + "grad_norm": 0.9652083061124271, + "learning_rate": 9.116718869837449e-08, + "loss": 0.1291, + "step": 9964 + }, + { + "epoch": 0.9181370064956005, + "grad_norm": 0.9278351761508028, + "learning_rate": 9.096345574488435e-08, + "loss": 0.1164, + "step": 9965 + }, + { + "epoch": 0.9182291426728705, + "grad_norm": 0.9466540329969422, + "learning_rate": 9.075994647069653e-08, + "loss": 0.1134, + "step": 9966 + }, + { + "epoch": 0.9183212788501405, + "grad_norm": 0.9546339745574965, + "learning_rate": 9.0556660894707e-08, + "loss": 0.1246, + "step": 9967 + }, + { + "epoch": 0.9184134150274105, + "grad_norm": 0.9071641213415044, + "learning_rate": 9.035359903579039e-08, + "loss": 0.1085, + "step": 9968 + }, + { + "epoch": 0.9185055512046805, + "grad_norm": 0.9984994115231136, + "learning_rate": 9.015076091280189e-08, + "loss": 0.116, + "step": 9969 + }, + { + "epoch": 0.9185976873819506, + "grad_norm": 0.8703859546656373, + "learning_rate": 8.994814654457451e-08, + "loss": 0.1085, + "step": 9970 + }, + { + "epoch": 0.9186898235592206, + "grad_norm": 1.0156798267992415, + "learning_rate": 8.974575594992124e-08, + "loss": 0.1122, + "step": 9971 + }, + { + "epoch": 0.9187819597364906, + "grad_norm": 0.921122595652039, + "learning_rate": 8.954358914763373e-08, + "loss": 0.11, + "step": 9972 + }, + { + "epoch": 0.9188740959137606, + "grad_norm": 0.8869429691351998, + "learning_rate": 8.934164615648333e-08, + "loss": 0.1037, + "step": 9973 + }, + { + "epoch": 0.9189662320910306, + "grad_norm": 0.9087355779950654, + "learning_rate": 8.913992699522062e-08, + "loss": 0.1063, + "step": 9974 + }, + { + "epoch": 0.9190583682683006, + "grad_norm": 0.9171403022519661, + "learning_rate": 8.893843168257504e-08, + "loss": 0.115, + "step": 9975 + }, + { + "epoch": 0.9191505044455706, + "grad_norm": 0.9090886668961073, + "learning_rate": 8.873716023725581e-08, + "loss": 0.105, + "step": 9976 + }, + { + "epoch": 0.9192426406228406, + "grad_norm": 0.9517240376938657, + "learning_rate": 8.853611267795076e-08, + "loss": 0.1249, + "step": 9977 + }, + { + "epoch": 0.9193347768001106, + "grad_norm": 0.8765303454447209, + "learning_rate": 8.833528902332688e-08, + "loss": 0.0996, + "step": 9978 + }, + { + "epoch": 0.9194269129773806, + "grad_norm": 0.9423252168440611, + "learning_rate": 8.813468929203095e-08, + "loss": 0.1141, + "step": 9979 + }, + { + "epoch": 0.9195190491546505, + "grad_norm": 0.9496341928199417, + "learning_rate": 8.793431350268861e-08, + "loss": 0.1184, + "step": 9980 + }, + { + "epoch": 0.9196111853319205, + "grad_norm": 0.9484875310380811, + "learning_rate": 8.773416167390525e-08, + "loss": 0.1151, + "step": 9981 + }, + { + "epoch": 0.9197033215091905, + "grad_norm": 0.9375366241158559, + "learning_rate": 8.753423382426463e-08, + "loss": 0.1048, + "step": 9982 + }, + { + "epoch": 0.9197954576864605, + "grad_norm": 0.9891837534169375, + "learning_rate": 8.733452997232967e-08, + "loss": 0.1165, + "step": 9983 + }, + { + "epoch": 0.9198875938637306, + "grad_norm": 0.9944041684722955, + "learning_rate": 8.713505013664303e-08, + "loss": 0.1319, + "step": 9984 + }, + { + "epoch": 0.9199797300410006, + "grad_norm": 0.9809021101391047, + "learning_rate": 8.693579433572741e-08, + "loss": 0.1245, + "step": 9985 + }, + { + "epoch": 0.9200718662182706, + "grad_norm": 1.0109863882514742, + "learning_rate": 8.673676258808244e-08, + "loss": 0.1272, + "step": 9986 + }, + { + "epoch": 0.9201640023955406, + "grad_norm": 0.9204852372456744, + "learning_rate": 8.653795491218891e-08, + "loss": 0.105, + "step": 9987 + }, + { + "epoch": 0.9202561385728106, + "grad_norm": 0.9258040595618826, + "learning_rate": 8.633937132650593e-08, + "loss": 0.1134, + "step": 9988 + }, + { + "epoch": 0.9203482747500806, + "grad_norm": 0.9293784596137714, + "learning_rate": 8.614101184947238e-08, + "loss": 0.1193, + "step": 9989 + }, + { + "epoch": 0.9204404109273506, + "grad_norm": 0.9030214124625668, + "learning_rate": 8.594287649950544e-08, + "loss": 0.1097, + "step": 9990 + }, + { + "epoch": 0.9205325471046206, + "grad_norm": 0.9118329350693379, + "learning_rate": 8.574496529500209e-08, + "loss": 0.1103, + "step": 9991 + }, + { + "epoch": 0.9206246832818906, + "grad_norm": 0.9446461098329167, + "learning_rate": 8.554727825433872e-08, + "loss": 0.1192, + "step": 9992 + }, + { + "epoch": 0.9207168194591606, + "grad_norm": 0.9733014763065767, + "learning_rate": 8.53498153958704e-08, + "loss": 0.1158, + "step": 9993 + }, + { + "epoch": 0.9208089556364306, + "grad_norm": 0.9264514392864119, + "learning_rate": 8.515257673793159e-08, + "loss": 0.1144, + "step": 9994 + }, + { + "epoch": 0.9209010918137006, + "grad_norm": 0.9796407782816811, + "learning_rate": 8.4955562298836e-08, + "loss": 0.1202, + "step": 9995 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 0.9384931539095959, + "learning_rate": 8.475877209687594e-08, + "loss": 0.1071, + "step": 9996 + }, + { + "epoch": 0.9210853641682406, + "grad_norm": 0.9675554946533331, + "learning_rate": 8.456220615032429e-08, + "loss": 0.1209, + "step": 9997 + }, + { + "epoch": 0.9211775003455107, + "grad_norm": 0.9773182683269641, + "learning_rate": 8.436586447743172e-08, + "loss": 0.1226, + "step": 9998 + }, + { + "epoch": 0.9212696365227807, + "grad_norm": 0.9262159911644006, + "learning_rate": 8.416974709642839e-08, + "loss": 0.1157, + "step": 9999 + }, + { + "epoch": 0.9213617727000507, + "grad_norm": 0.9153048649416058, + "learning_rate": 8.397385402552415e-08, + "loss": 0.1117, + "step": 10000 + }, + { + "epoch": 0.9213617727000507, + "eval_loss": 0.11681114137172699, + "eval_runtime": 300.109, + "eval_samples_per_second": 23.382, + "eval_steps_per_second": 2.926, + "step": 10000 + }, + { + "epoch": 0.9214539088773207, + "grad_norm": 0.8953472769400774, + "learning_rate": 8.377818528290754e-08, + "loss": 0.1116, + "step": 10001 + }, + { + "epoch": 0.9215460450545907, + "grad_norm": 0.9605648976460276, + "learning_rate": 8.358274088674651e-08, + "loss": 0.1208, + "step": 10002 + }, + { + "epoch": 0.9216381812318607, + "grad_norm": 0.9322621523925322, + "learning_rate": 8.338752085518819e-08, + "loss": 0.1114, + "step": 10003 + }, + { + "epoch": 0.9217303174091307, + "grad_norm": 0.9272123065068724, + "learning_rate": 8.31925252063584e-08, + "loss": 0.1073, + "step": 10004 + }, + { + "epoch": 0.9218224535864007, + "grad_norm": 0.9332881339966433, + "learning_rate": 8.299775395836262e-08, + "loss": 0.1133, + "step": 10005 + }, + { + "epoch": 0.9219145897636707, + "grad_norm": 0.9102158918404231, + "learning_rate": 8.280320712928585e-08, + "loss": 0.1163, + "step": 10006 + }, + { + "epoch": 0.9220067259409407, + "grad_norm": 0.9326940952236754, + "learning_rate": 8.260888473719114e-08, + "loss": 0.1158, + "step": 10007 + }, + { + "epoch": 0.9220988621182107, + "grad_norm": 0.9461833110524152, + "learning_rate": 8.241478680012183e-08, + "loss": 0.116, + "step": 10008 + }, + { + "epoch": 0.9221909982954807, + "grad_norm": 0.9551382205517904, + "learning_rate": 8.222091333609989e-08, + "loss": 0.1193, + "step": 10009 + }, + { + "epoch": 0.9222831344727507, + "grad_norm": 0.9370064660179406, + "learning_rate": 8.202726436312619e-08, + "loss": 0.1092, + "step": 10010 + }, + { + "epoch": 0.9223752706500208, + "grad_norm": 0.9791261039611153, + "learning_rate": 8.183383989918109e-08, + "loss": 0.1194, + "step": 10011 + }, + { + "epoch": 0.9224674068272908, + "grad_norm": 0.9504216348772377, + "learning_rate": 8.164063996222438e-08, + "loss": 0.12, + "step": 10012 + }, + { + "epoch": 0.9225595430045608, + "grad_norm": 0.936257019374371, + "learning_rate": 8.14476645701942e-08, + "loss": 0.1137, + "step": 10013 + }, + { + "epoch": 0.9226516791818308, + "grad_norm": 0.9381555121278079, + "learning_rate": 8.125491374100902e-08, + "loss": 0.1155, + "step": 10014 + }, + { + "epoch": 0.9227438153591008, + "grad_norm": 1.0079460709363284, + "learning_rate": 8.106238749256562e-08, + "loss": 0.1185, + "step": 10015 + }, + { + "epoch": 0.9228359515363708, + "grad_norm": 0.958334489638207, + "learning_rate": 8.087008584273942e-08, + "loss": 0.1193, + "step": 10016 + }, + { + "epoch": 0.9229280877136408, + "grad_norm": 0.9068490658601089, + "learning_rate": 8.067800880938615e-08, + "loss": 0.1079, + "step": 10017 + }, + { + "epoch": 0.9230202238909108, + "grad_norm": 0.9435404073158213, + "learning_rate": 8.048615641034013e-08, + "loss": 0.1118, + "step": 10018 + }, + { + "epoch": 0.9231123600681808, + "grad_norm": 0.9200330865058696, + "learning_rate": 8.029452866341492e-08, + "loss": 0.1182, + "step": 10019 + }, + { + "epoch": 0.9232044962454508, + "grad_norm": 0.914248690361516, + "learning_rate": 8.010312558640348e-08, + "loss": 0.1062, + "step": 10020 + }, + { + "epoch": 0.9232966324227208, + "grad_norm": 0.937085460107984, + "learning_rate": 7.991194719707663e-08, + "loss": 0.1207, + "step": 10021 + }, + { + "epoch": 0.9233887685999908, + "grad_norm": 0.9829486329659389, + "learning_rate": 7.972099351318624e-08, + "loss": 0.1263, + "step": 10022 + }, + { + "epoch": 0.9234809047772607, + "grad_norm": 0.9274488053766404, + "learning_rate": 7.953026455246233e-08, + "loss": 0.1171, + "step": 10023 + }, + { + "epoch": 0.9235730409545307, + "grad_norm": 0.9512581459490765, + "learning_rate": 7.933976033261348e-08, + "loss": 0.1118, + "step": 10024 + }, + { + "epoch": 0.9236651771318009, + "grad_norm": 0.9332557412920207, + "learning_rate": 7.914948087132862e-08, + "loss": 0.1221, + "step": 10025 + }, + { + "epoch": 0.9237573133090708, + "grad_norm": 0.9534856497766078, + "learning_rate": 7.895942618627472e-08, + "loss": 0.1164, + "step": 10026 + }, + { + "epoch": 0.9238494494863408, + "grad_norm": 0.9511982683101914, + "learning_rate": 7.876959629509907e-08, + "loss": 0.1168, + "step": 10027 + }, + { + "epoch": 0.9239415856636108, + "grad_norm": 0.9863978122827085, + "learning_rate": 7.85799912154267e-08, + "loss": 0.1153, + "step": 10028 + }, + { + "epoch": 0.9240337218408808, + "grad_norm": 0.856384935668297, + "learning_rate": 7.839061096486273e-08, + "loss": 0.1043, + "step": 10029 + }, + { + "epoch": 0.9241258580181508, + "grad_norm": 0.9659271632688743, + "learning_rate": 7.82014555609914e-08, + "loss": 0.1252, + "step": 10030 + }, + { + "epoch": 0.9242179941954208, + "grad_norm": 0.9124472370190727, + "learning_rate": 7.801252502137535e-08, + "loss": 0.1076, + "step": 10031 + }, + { + "epoch": 0.9243101303726908, + "grad_norm": 0.943476757994991, + "learning_rate": 7.782381936355693e-08, + "loss": 0.1244, + "step": 10032 + }, + { + "epoch": 0.9244022665499608, + "grad_norm": 0.9477335815226061, + "learning_rate": 7.763533860505767e-08, + "loss": 0.1219, + "step": 10033 + }, + { + "epoch": 0.9244944027272308, + "grad_norm": 0.9474630709253589, + "learning_rate": 7.744708276337776e-08, + "loss": 0.1115, + "step": 10034 + }, + { + "epoch": 0.9245865389045008, + "grad_norm": 0.9124267452326419, + "learning_rate": 7.725905185599735e-08, + "loss": 0.111, + "step": 10035 + }, + { + "epoch": 0.9246786750817708, + "grad_norm": 0.9073193546423393, + "learning_rate": 7.707124590037445e-08, + "loss": 0.1079, + "step": 10036 + }, + { + "epoch": 0.9247708112590408, + "grad_norm": 0.8735953297989576, + "learning_rate": 7.688366491394706e-08, + "loss": 0.1057, + "step": 10037 + }, + { + "epoch": 0.9248629474363109, + "grad_norm": 0.8926319327455886, + "learning_rate": 7.669630891413204e-08, + "loss": 0.1076, + "step": 10038 + }, + { + "epoch": 0.9249550836135809, + "grad_norm": 0.9033458944044545, + "learning_rate": 7.650917791832608e-08, + "loss": 0.108, + "step": 10039 + }, + { + "epoch": 0.9250472197908509, + "grad_norm": 0.9280311985098185, + "learning_rate": 7.632227194390301e-08, + "loss": 0.1187, + "step": 10040 + }, + { + "epoch": 0.9251393559681209, + "grad_norm": 0.8983958238137177, + "learning_rate": 7.613559100821843e-08, + "loss": 0.1087, + "step": 10041 + }, + { + "epoch": 0.9252314921453909, + "grad_norm": 0.9239527817591513, + "learning_rate": 7.594913512860485e-08, + "loss": 0.1131, + "step": 10042 + }, + { + "epoch": 0.9253236283226609, + "grad_norm": 0.920847354160995, + "learning_rate": 7.57629043223751e-08, + "loss": 0.1129, + "step": 10043 + }, + { + "epoch": 0.9254157644999309, + "grad_norm": 0.9208440112474245, + "learning_rate": 7.557689860682032e-08, + "loss": 0.1123, + "step": 10044 + }, + { + "epoch": 0.9255079006772009, + "grad_norm": 0.9693290819377761, + "learning_rate": 7.539111799921145e-08, + "loss": 0.1235, + "step": 10045 + }, + { + "epoch": 0.9256000368544709, + "grad_norm": 0.9707370388186811, + "learning_rate": 7.520556251679856e-08, + "loss": 0.1157, + "step": 10046 + }, + { + "epoch": 0.9256921730317409, + "grad_norm": 0.9373089571232343, + "learning_rate": 7.502023217680982e-08, + "loss": 0.114, + "step": 10047 + }, + { + "epoch": 0.9257843092090109, + "grad_norm": 0.9666360388533218, + "learning_rate": 7.483512699645368e-08, + "loss": 0.1203, + "step": 10048 + }, + { + "epoch": 0.9258764453862809, + "grad_norm": 0.9974723494362501, + "learning_rate": 7.465024699291696e-08, + "loss": 0.1253, + "step": 10049 + }, + { + "epoch": 0.9259685815635509, + "grad_norm": 0.9684830257669516, + "learning_rate": 7.446559218336563e-08, + "loss": 0.1269, + "step": 10050 + }, + { + "epoch": 0.9260607177408209, + "grad_norm": 0.941063326323892, + "learning_rate": 7.428116258494545e-08, + "loss": 0.1184, + "step": 10051 + }, + { + "epoch": 0.926152853918091, + "grad_norm": 0.959037420198221, + "learning_rate": 7.409695821478046e-08, + "loss": 0.117, + "step": 10052 + }, + { + "epoch": 0.926244990095361, + "grad_norm": 0.9716484105543598, + "learning_rate": 7.391297908997341e-08, + "loss": 0.117, + "step": 10053 + }, + { + "epoch": 0.926337126272631, + "grad_norm": 0.9243109744426414, + "learning_rate": 7.372922522760755e-08, + "loss": 0.1085, + "step": 10054 + }, + { + "epoch": 0.926429262449901, + "grad_norm": 0.9668232342765302, + "learning_rate": 7.354569664474426e-08, + "loss": 0.1202, + "step": 10055 + }, + { + "epoch": 0.926521398627171, + "grad_norm": 0.9634841438555413, + "learning_rate": 7.33623933584246e-08, + "loss": 0.1233, + "step": 10056 + }, + { + "epoch": 0.926613534804441, + "grad_norm": 0.9764871788307892, + "learning_rate": 7.317931538566747e-08, + "loss": 0.1249, + "step": 10057 + }, + { + "epoch": 0.926705670981711, + "grad_norm": 0.9915884927220054, + "learning_rate": 7.299646274347205e-08, + "loss": 0.1226, + "step": 10058 + }, + { + "epoch": 0.926797807158981, + "grad_norm": 0.9689646354780623, + "learning_rate": 7.281383544881642e-08, + "loss": 0.1245, + "step": 10059 + }, + { + "epoch": 0.926889943336251, + "grad_norm": 1.0001034469069645, + "learning_rate": 7.263143351865759e-08, + "loss": 0.1312, + "step": 10060 + }, + { + "epoch": 0.926982079513521, + "grad_norm": 0.975500103508216, + "learning_rate": 7.244925696993088e-08, + "loss": 0.1208, + "step": 10061 + }, + { + "epoch": 0.927074215690791, + "grad_norm": 0.9932454627539781, + "learning_rate": 7.226730581955249e-08, + "loss": 0.1289, + "step": 10062 + }, + { + "epoch": 0.927166351868061, + "grad_norm": 0.9673568247424288, + "learning_rate": 7.208558008441557e-08, + "loss": 0.1232, + "step": 10063 + }, + { + "epoch": 0.927258488045331, + "grad_norm": 0.8870914398841915, + "learning_rate": 7.190407978139413e-08, + "loss": 0.1098, + "step": 10064 + }, + { + "epoch": 0.927350624222601, + "grad_norm": 0.9218735423503247, + "learning_rate": 7.172280492733996e-08, + "loss": 0.1188, + "step": 10065 + }, + { + "epoch": 0.9274427603998711, + "grad_norm": 0.9383407246695775, + "learning_rate": 7.15417555390846e-08, + "loss": 0.1243, + "step": 10066 + }, + { + "epoch": 0.927534896577141, + "grad_norm": 0.9179094929795911, + "learning_rate": 7.136093163343877e-08, + "loss": 0.1197, + "step": 10067 + }, + { + "epoch": 0.927627032754411, + "grad_norm": 0.9438528092701421, + "learning_rate": 7.118033322719209e-08, + "loss": 0.1086, + "step": 10068 + }, + { + "epoch": 0.927719168931681, + "grad_norm": 0.9768979953603585, + "learning_rate": 7.099996033711254e-08, + "loss": 0.1215, + "step": 10069 + }, + { + "epoch": 0.927811305108951, + "grad_norm": 0.92007035394007, + "learning_rate": 7.081981297994784e-08, + "loss": 0.1098, + "step": 10070 + }, + { + "epoch": 0.927903441286221, + "grad_norm": 0.9547575066326877, + "learning_rate": 7.063989117242514e-08, + "loss": 0.1197, + "step": 10071 + }, + { + "epoch": 0.927995577463491, + "grad_norm": 0.9142725216376145, + "learning_rate": 7.046019493125028e-08, + "loss": 0.1078, + "step": 10072 + }, + { + "epoch": 0.928087713640761, + "grad_norm": 0.9579050206705731, + "learning_rate": 7.028072427310767e-08, + "loss": 0.1216, + "step": 10073 + }, + { + "epoch": 0.928179849818031, + "grad_norm": 0.9297320319381187, + "learning_rate": 7.010147921466121e-08, + "loss": 0.1143, + "step": 10074 + }, + { + "epoch": 0.928271985995301, + "grad_norm": 0.9497750407350231, + "learning_rate": 6.992245977255369e-08, + "loss": 0.1201, + "step": 10075 + }, + { + "epoch": 0.928364122172571, + "grad_norm": 0.9907453496156512, + "learning_rate": 6.974366596340765e-08, + "loss": 0.1263, + "step": 10076 + }, + { + "epoch": 0.928456258349841, + "grad_norm": 0.9239784611446478, + "learning_rate": 6.95650978038237e-08, + "loss": 0.1206, + "step": 10077 + }, + { + "epoch": 0.928548394527111, + "grad_norm": 0.9308862971712853, + "learning_rate": 6.93867553103822e-08, + "loss": 0.1152, + "step": 10078 + }, + { + "epoch": 0.9286405307043811, + "grad_norm": 0.9778397868120814, + "learning_rate": 6.920863849964154e-08, + "loss": 0.1242, + "step": 10079 + }, + { + "epoch": 0.9287326668816511, + "grad_norm": 0.9989156628975393, + "learning_rate": 6.903074738814047e-08, + "loss": 0.1256, + "step": 10080 + }, + { + "epoch": 0.9288248030589211, + "grad_norm": 0.9604361889555622, + "learning_rate": 6.88530819923966e-08, + "loss": 0.1183, + "step": 10081 + }, + { + "epoch": 0.9289169392361911, + "grad_norm": 0.9902749552785988, + "learning_rate": 6.867564232890534e-08, + "loss": 0.1297, + "step": 10082 + }, + { + "epoch": 0.9290090754134611, + "grad_norm": 0.9193712003271258, + "learning_rate": 6.849842841414239e-08, + "loss": 0.1128, + "step": 10083 + }, + { + "epoch": 0.9291012115907311, + "grad_norm": 0.927277125839814, + "learning_rate": 6.832144026456211e-08, + "loss": 0.1116, + "step": 10084 + }, + { + "epoch": 0.9291933477680011, + "grad_norm": 0.9509926978450719, + "learning_rate": 6.8144677896598e-08, + "loss": 0.118, + "step": 10085 + }, + { + "epoch": 0.9292854839452711, + "grad_norm": 0.89136791200629, + "learning_rate": 6.796814132666196e-08, + "loss": 0.1133, + "step": 10086 + }, + { + "epoch": 0.9293776201225411, + "grad_norm": 0.9316915181788125, + "learning_rate": 6.779183057114585e-08, + "loss": 0.1161, + "step": 10087 + }, + { + "epoch": 0.9294697562998111, + "grad_norm": 0.955332942284261, + "learning_rate": 6.761574564641993e-08, + "loss": 0.1207, + "step": 10088 + }, + { + "epoch": 0.9295618924770811, + "grad_norm": 0.9516950712819171, + "learning_rate": 6.743988656883388e-08, + "loss": 0.1209, + "step": 10089 + }, + { + "epoch": 0.9296540286543511, + "grad_norm": 0.955229675549783, + "learning_rate": 6.726425335471632e-08, + "loss": 0.1173, + "step": 10090 + }, + { + "epoch": 0.9297461648316211, + "grad_norm": 0.8921231535918545, + "learning_rate": 6.708884602037446e-08, + "loss": 0.1112, + "step": 10091 + }, + { + "epoch": 0.9298383010088911, + "grad_norm": 0.9489220528435535, + "learning_rate": 6.691366458209503e-08, + "loss": 0.1244, + "step": 10092 + }, + { + "epoch": 0.9299304371861612, + "grad_norm": 0.9299314366674031, + "learning_rate": 6.673870905614387e-08, + "loss": 0.1152, + "step": 10093 + }, + { + "epoch": 0.9300225733634312, + "grad_norm": 0.9705811815401614, + "learning_rate": 6.656397945876525e-08, + "loss": 0.1192, + "step": 10094 + }, + { + "epoch": 0.9301147095407012, + "grad_norm": 0.9309252837569718, + "learning_rate": 6.638947580618338e-08, + "loss": 0.1152, + "step": 10095 + }, + { + "epoch": 0.9302068457179712, + "grad_norm": 0.9649772172337729, + "learning_rate": 6.621519811460003e-08, + "loss": 0.1184, + "step": 10096 + }, + { + "epoch": 0.9302989818952412, + "grad_norm": 0.9312697012951024, + "learning_rate": 6.60411464001981e-08, + "loss": 0.1102, + "step": 10097 + }, + { + "epoch": 0.9303911180725112, + "grad_norm": 0.9764272745006919, + "learning_rate": 6.586732067913715e-08, + "loss": 0.1119, + "step": 10098 + }, + { + "epoch": 0.9304832542497812, + "grad_norm": 0.9436377619779318, + "learning_rate": 6.56937209675576e-08, + "loss": 0.1234, + "step": 10099 + }, + { + "epoch": 0.9305753904270512, + "grad_norm": 0.9793883894185555, + "learning_rate": 6.552034728157824e-08, + "loss": 0.1198, + "step": 10100 + }, + { + "epoch": 0.9306675266043212, + "grad_norm": 0.9409414214358416, + "learning_rate": 6.534719963729646e-08, + "loss": 0.1124, + "step": 10101 + }, + { + "epoch": 0.9307596627815912, + "grad_norm": 0.9221372089696581, + "learning_rate": 6.517427805078913e-08, + "loss": 0.1198, + "step": 10102 + }, + { + "epoch": 0.9308517989588612, + "grad_norm": 0.875186624270954, + "learning_rate": 6.500158253811228e-08, + "loss": 0.0994, + "step": 10103 + }, + { + "epoch": 0.9309439351361312, + "grad_norm": 0.9267212533368236, + "learning_rate": 6.482911311530033e-08, + "loss": 0.1136, + "step": 10104 + }, + { + "epoch": 0.9310360713134012, + "grad_norm": 0.8883942370936049, + "learning_rate": 6.465686979836766e-08, + "loss": 0.1036, + "step": 10105 + }, + { + "epoch": 0.9311282074906713, + "grad_norm": 0.9235460645528614, + "learning_rate": 6.44848526033065e-08, + "loss": 0.1159, + "step": 10106 + }, + { + "epoch": 0.9312203436679413, + "grad_norm": 0.9071083021798902, + "learning_rate": 6.43130615460888e-08, + "loss": 0.1105, + "step": 10107 + }, + { + "epoch": 0.9313124798452113, + "grad_norm": 0.9968005269119061, + "learning_rate": 6.41414966426654e-08, + "loss": 0.1284, + "step": 10108 + }, + { + "epoch": 0.9314046160224813, + "grad_norm": 0.9761930692958624, + "learning_rate": 6.397015790896633e-08, + "loss": 0.1205, + "step": 10109 + }, + { + "epoch": 0.9314967521997513, + "grad_norm": 0.9365364534222561, + "learning_rate": 6.379904536090053e-08, + "loss": 0.1076, + "step": 10110 + }, + { + "epoch": 0.9315888883770213, + "grad_norm": 0.8918112749411182, + "learning_rate": 6.362815901435532e-08, + "loss": 0.1098, + "step": 10111 + }, + { + "epoch": 0.9316810245542912, + "grad_norm": 0.9663526734184886, + "learning_rate": 6.34574988851977e-08, + "loss": 0.1224, + "step": 10112 + }, + { + "epoch": 0.9317731607315612, + "grad_norm": 0.927169379210311, + "learning_rate": 6.328706498927361e-08, + "loss": 0.117, + "step": 10113 + }, + { + "epoch": 0.9318652969088312, + "grad_norm": 0.8784108002332701, + "learning_rate": 6.311685734240791e-08, + "loss": 0.1074, + "step": 10114 + }, + { + "epoch": 0.9319574330861012, + "grad_norm": 0.9172721768927773, + "learning_rate": 6.294687596040406e-08, + "loss": 0.119, + "step": 10115 + }, + { + "epoch": 0.9320495692633712, + "grad_norm": 0.903584060250854, + "learning_rate": 6.277712085904524e-08, + "loss": 0.1014, + "step": 10116 + }, + { + "epoch": 0.9321417054406412, + "grad_norm": 0.9725426775760924, + "learning_rate": 6.260759205409278e-08, + "loss": 0.134, + "step": 10117 + }, + { + "epoch": 0.9322338416179112, + "grad_norm": 0.8691523075785011, + "learning_rate": 6.243828956128794e-08, + "loss": 0.1008, + "step": 10118 + }, + { + "epoch": 0.9323259777951812, + "grad_norm": 0.9136529794135902, + "learning_rate": 6.226921339635012e-08, + "loss": 0.1025, + "step": 10119 + }, + { + "epoch": 0.9324181139724513, + "grad_norm": 1.0116626279614331, + "learning_rate": 6.210036357497811e-08, + "loss": 0.1259, + "step": 10120 + }, + { + "epoch": 0.9325102501497213, + "grad_norm": 0.93766496587313, + "learning_rate": 6.193174011284997e-08, + "loss": 0.1199, + "step": 10121 + }, + { + "epoch": 0.9326023863269913, + "grad_norm": 0.889965497266823, + "learning_rate": 6.176334302562204e-08, + "loss": 0.1029, + "step": 10122 + }, + { + "epoch": 0.9326945225042613, + "grad_norm": 0.9855453499663499, + "learning_rate": 6.159517232893014e-08, + "loss": 0.121, + "step": 10123 + }, + { + "epoch": 0.9327866586815313, + "grad_norm": 0.9743008981087332, + "learning_rate": 6.142722803838874e-08, + "loss": 0.124, + "step": 10124 + }, + { + "epoch": 0.9328787948588013, + "grad_norm": 0.958869092253525, + "learning_rate": 6.125951016959175e-08, + "loss": 0.1247, + "step": 10125 + }, + { + "epoch": 0.9329709310360713, + "grad_norm": 0.9640490274238452, + "learning_rate": 6.109201873811171e-08, + "loss": 0.1254, + "step": 10126 + }, + { + "epoch": 0.9330630672133413, + "grad_norm": 0.8968523241668316, + "learning_rate": 6.092475375950035e-08, + "loss": 0.1058, + "step": 10127 + }, + { + "epoch": 0.9331552033906113, + "grad_norm": 0.8935034810307682, + "learning_rate": 6.075771524928804e-08, + "loss": 0.1118, + "step": 10128 + }, + { + "epoch": 0.9332473395678813, + "grad_norm": 0.9125924372741135, + "learning_rate": 6.0590903222984e-08, + "loss": 0.1099, + "step": 10129 + }, + { + "epoch": 0.9333394757451513, + "grad_norm": 0.8980119979407162, + "learning_rate": 6.042431769607782e-08, + "loss": 0.0987, + "step": 10130 + }, + { + "epoch": 0.9334316119224213, + "grad_norm": 0.9152540334694288, + "learning_rate": 6.025795868403573e-08, + "loss": 0.1221, + "step": 10131 + }, + { + "epoch": 0.9335237480996913, + "grad_norm": 0.9265595110349005, + "learning_rate": 6.009182620230508e-08, + "loss": 0.1153, + "step": 10132 + }, + { + "epoch": 0.9336158842769613, + "grad_norm": 0.9307369152940632, + "learning_rate": 5.992592026631078e-08, + "loss": 0.1064, + "step": 10133 + }, + { + "epoch": 0.9337080204542314, + "grad_norm": 0.9215489600399154, + "learning_rate": 5.976024089145715e-08, + "loss": 0.1137, + "step": 10134 + }, + { + "epoch": 0.9338001566315014, + "grad_norm": 0.9070052737981305, + "learning_rate": 5.95947880931283e-08, + "loss": 0.118, + "step": 10135 + }, + { + "epoch": 0.9338922928087714, + "grad_norm": 0.9934855918706669, + "learning_rate": 5.942956188668553e-08, + "loss": 0.1296, + "step": 10136 + }, + { + "epoch": 0.9339844289860414, + "grad_norm": 0.9732008800545134, + "learning_rate": 5.926456228747102e-08, + "loss": 0.1276, + "step": 10137 + }, + { + "epoch": 0.9340765651633114, + "grad_norm": 0.9041660583571358, + "learning_rate": 5.909978931080418e-08, + "loss": 0.1115, + "step": 10138 + }, + { + "epoch": 0.9341687013405814, + "grad_norm": 0.9207987007708457, + "learning_rate": 5.8935242971984993e-08, + "loss": 0.1173, + "step": 10139 + }, + { + "epoch": 0.9342608375178514, + "grad_norm": 0.9634402298027609, + "learning_rate": 5.877092328629097e-08, + "loss": 0.1201, + "step": 10140 + }, + { + "epoch": 0.9343529736951214, + "grad_norm": 0.9358564013054519, + "learning_rate": 5.8606830268979344e-08, + "loss": 0.1115, + "step": 10141 + }, + { + "epoch": 0.9344451098723914, + "grad_norm": 0.9296780517476542, + "learning_rate": 5.8442963935286535e-08, + "loss": 0.1194, + "step": 10142 + }, + { + "epoch": 0.9345372460496614, + "grad_norm": 0.8830379602856283, + "learning_rate": 5.827932430042732e-08, + "loss": 0.1087, + "step": 10143 + }, + { + "epoch": 0.9346293822269314, + "grad_norm": 0.9437410925435715, + "learning_rate": 5.811591137959538e-08, + "loss": 0.1187, + "step": 10144 + }, + { + "epoch": 0.9347215184042014, + "grad_norm": 0.9544053264277833, + "learning_rate": 5.7952725187963855e-08, + "loss": 0.1108, + "step": 10145 + }, + { + "epoch": 0.9348136545814714, + "grad_norm": 0.9956439833964636, + "learning_rate": 5.778976574068451e-08, + "loss": 0.1212, + "step": 10146 + }, + { + "epoch": 0.9349057907587415, + "grad_norm": 0.8916894305196384, + "learning_rate": 5.762703305288858e-08, + "loss": 0.1154, + "step": 10147 + }, + { + "epoch": 0.9349979269360115, + "grad_norm": 0.9453557024716712, + "learning_rate": 5.746452713968564e-08, + "loss": 0.1232, + "step": 10148 + }, + { + "epoch": 0.9350900631132815, + "grad_norm": 0.9616711906703019, + "learning_rate": 5.730224801616391e-08, + "loss": 0.1175, + "step": 10149 + }, + { + "epoch": 0.9351821992905515, + "grad_norm": 0.9178001385739297, + "learning_rate": 5.714019569739132e-08, + "loss": 0.1115, + "step": 10150 + }, + { + "epoch": 0.9352743354678215, + "grad_norm": 0.9475727631500085, + "learning_rate": 5.697837019841446e-08, + "loss": 0.125, + "step": 10151 + }, + { + "epoch": 0.9353664716450915, + "grad_norm": 0.9648214850088555, + "learning_rate": 5.6816771534258794e-08, + "loss": 0.1196, + "step": 10152 + }, + { + "epoch": 0.9354586078223615, + "grad_norm": 0.9432824464052948, + "learning_rate": 5.6655399719929286e-08, + "loss": 0.1149, + "step": 10153 + }, + { + "epoch": 0.9355507439996315, + "grad_norm": 0.8934593126027819, + "learning_rate": 5.649425477040837e-08, + "loss": 0.1107, + "step": 10154 + }, + { + "epoch": 0.9356428801769014, + "grad_norm": 0.9901492980135598, + "learning_rate": 5.63333367006591e-08, + "loss": 0.1259, + "step": 10155 + }, + { + "epoch": 0.9357350163541714, + "grad_norm": 0.9389511635432886, + "learning_rate": 5.617264552562229e-08, + "loss": 0.1262, + "step": 10156 + }, + { + "epoch": 0.9358271525314414, + "grad_norm": 0.9962631342478383, + "learning_rate": 5.6012181260218514e-08, + "loss": 0.1219, + "step": 10157 + }, + { + "epoch": 0.9359192887087114, + "grad_norm": 0.9589826323723284, + "learning_rate": 5.5851943919346394e-08, + "loss": 0.1155, + "step": 10158 + }, + { + "epoch": 0.9360114248859814, + "grad_norm": 1.0008099775740829, + "learning_rate": 5.569193351788516e-08, + "loss": 0.1258, + "step": 10159 + }, + { + "epoch": 0.9361035610632514, + "grad_norm": 0.9701895891527986, + "learning_rate": 5.5532150070690404e-08, + "loss": 0.1144, + "step": 10160 + }, + { + "epoch": 0.9361956972405215, + "grad_norm": 0.9887847060003988, + "learning_rate": 5.5372593592598333e-08, + "loss": 0.1239, + "step": 10161 + }, + { + "epoch": 0.9362878334177915, + "grad_norm": 0.917151189793383, + "learning_rate": 5.521326409842431e-08, + "loss": 0.1072, + "step": 10162 + }, + { + "epoch": 0.9363799695950615, + "grad_norm": 0.9511802364714468, + "learning_rate": 5.5054161602961786e-08, + "loss": 0.1128, + "step": 10163 + }, + { + "epoch": 0.9364721057723315, + "grad_norm": 0.9196778717957829, + "learning_rate": 5.489528612098366e-08, + "loss": 0.1104, + "step": 10164 + }, + { + "epoch": 0.9365642419496015, + "grad_norm": 0.9109397752453979, + "learning_rate": 5.4736637667241465e-08, + "loss": 0.1188, + "step": 10165 + }, + { + "epoch": 0.9366563781268715, + "grad_norm": 0.9572866214282587, + "learning_rate": 5.457821625646537e-08, + "loss": 0.121, + "step": 10166 + }, + { + "epoch": 0.9367485143041415, + "grad_norm": 0.9675168468217017, + "learning_rate": 5.442002190336498e-08, + "loss": 0.1255, + "step": 10167 + }, + { + "epoch": 0.9368406504814115, + "grad_norm": 0.9939171972835509, + "learning_rate": 5.426205462262884e-08, + "loss": 0.1363, + "step": 10168 + }, + { + "epoch": 0.9369327866586815, + "grad_norm": 1.00824738921418, + "learning_rate": 5.410431442892411e-08, + "loss": 0.1318, + "step": 10169 + }, + { + "epoch": 0.9370249228359515, + "grad_norm": 0.9286004221876722, + "learning_rate": 5.3946801336897395e-08, + "loss": 0.1152, + "step": 10170 + }, + { + "epoch": 0.9371170590132215, + "grad_norm": 0.9394035973214175, + "learning_rate": 5.37895153611731e-08, + "loss": 0.1161, + "step": 10171 + }, + { + "epoch": 0.9372091951904915, + "grad_norm": 0.9356338375134339, + "learning_rate": 5.363245651635568e-08, + "loss": 0.1163, + "step": 10172 + }, + { + "epoch": 0.9373013313677615, + "grad_norm": 0.9705226684470347, + "learning_rate": 5.3475624817027614e-08, + "loss": 0.1221, + "step": 10173 + }, + { + "epoch": 0.9373934675450316, + "grad_norm": 0.9778732464822383, + "learning_rate": 5.331902027775143e-08, + "loss": 0.1226, + "step": 10174 + }, + { + "epoch": 0.9374856037223016, + "grad_norm": 0.9344922249700046, + "learning_rate": 5.316264291306744e-08, + "loss": 0.1171, + "step": 10175 + }, + { + "epoch": 0.9375777398995716, + "grad_norm": 0.930084190711884, + "learning_rate": 5.300649273749542e-08, + "loss": 0.1196, + "step": 10176 + }, + { + "epoch": 0.9376698760768416, + "grad_norm": 0.9179768576305335, + "learning_rate": 5.2850569765533766e-08, + "loss": 0.1175, + "step": 10177 + }, + { + "epoch": 0.9377620122541116, + "grad_norm": 0.92930569119096, + "learning_rate": 5.2694874011660066e-08, + "loss": 0.1195, + "step": 10178 + }, + { + "epoch": 0.9378541484313816, + "grad_norm": 0.9196928713129051, + "learning_rate": 5.253940549033082e-08, + "loss": 0.1064, + "step": 10179 + }, + { + "epoch": 0.9379462846086516, + "grad_norm": 0.9418730370988134, + "learning_rate": 5.238416421598142e-08, + "loss": 0.1188, + "step": 10180 + }, + { + "epoch": 0.9380384207859216, + "grad_norm": 0.9425062455243531, + "learning_rate": 5.2229150203025604e-08, + "loss": 0.117, + "step": 10181 + }, + { + "epoch": 0.9381305569631916, + "grad_norm": 0.9634946865174938, + "learning_rate": 5.2074363465856316e-08, + "loss": 0.1138, + "step": 10182 + }, + { + "epoch": 0.9382226931404616, + "grad_norm": 0.9414468299208367, + "learning_rate": 5.191980401884594e-08, + "loss": 0.1181, + "step": 10183 + }, + { + "epoch": 0.9383148293177316, + "grad_norm": 0.9099367437520887, + "learning_rate": 5.176547187634551e-08, + "loss": 0.1173, + "step": 10184 + }, + { + "epoch": 0.9384069654950016, + "grad_norm": 0.940932781792052, + "learning_rate": 5.161136705268438e-08, + "loss": 0.1197, + "step": 10185 + }, + { + "epoch": 0.9384991016722716, + "grad_norm": 0.9422087246290408, + "learning_rate": 5.145748956217139e-08, + "loss": 0.1186, + "step": 10186 + }, + { + "epoch": 0.9385912378495416, + "grad_norm": 0.9339898499942781, + "learning_rate": 5.130383941909372e-08, + "loss": 0.1077, + "step": 10187 + }, + { + "epoch": 0.9386833740268117, + "grad_norm": 0.9095970890077272, + "learning_rate": 5.1150416637718306e-08, + "loss": 0.1071, + "step": 10188 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 0.9610721856654976, + "learning_rate": 5.0997221232290115e-08, + "loss": 0.1182, + "step": 10189 + }, + { + "epoch": 0.9388676463813517, + "grad_norm": 0.9442119623237161, + "learning_rate": 5.0844253217033624e-08, + "loss": 0.1253, + "step": 10190 + }, + { + "epoch": 0.9389597825586217, + "grad_norm": 0.9104684661627135, + "learning_rate": 5.06915126061519e-08, + "loss": 0.1107, + "step": 10191 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 0.9213554783526815, + "learning_rate": 5.0538999413826393e-08, + "loss": 0.1178, + "step": 10192 + }, + { + "epoch": 0.9391440549131617, + "grad_norm": 0.9993405206730799, + "learning_rate": 5.0386713654218825e-08, + "loss": 0.1308, + "step": 10193 + }, + { + "epoch": 0.9392361910904317, + "grad_norm": 0.9736502010870575, + "learning_rate": 5.023465534146843e-08, + "loss": 0.1214, + "step": 10194 + }, + { + "epoch": 0.9393283272677017, + "grad_norm": 0.8838783987621547, + "learning_rate": 5.008282448969393e-08, + "loss": 0.1084, + "step": 10195 + }, + { + "epoch": 0.9394204634449717, + "grad_norm": 0.948623222140314, + "learning_rate": 4.9931221112992924e-08, + "loss": 0.1198, + "step": 10196 + }, + { + "epoch": 0.9395125996222417, + "grad_norm": 0.8956293742678936, + "learning_rate": 4.977984522544166e-08, + "loss": 0.1091, + "step": 10197 + }, + { + "epoch": 0.9396047357995116, + "grad_norm": 0.9255954230214183, + "learning_rate": 4.962869684109528e-08, + "loss": 0.1097, + "step": 10198 + }, + { + "epoch": 0.9396968719767816, + "grad_norm": 0.9149389171061979, + "learning_rate": 4.947777597398812e-08, + "loss": 0.1145, + "step": 10199 + }, + { + "epoch": 0.9397890081540516, + "grad_norm": 0.9200927674585153, + "learning_rate": 4.932708263813341e-08, + "loss": 0.1114, + "step": 10200 + }, + { + "epoch": 0.9398811443313216, + "grad_norm": 0.9676136590853243, + "learning_rate": 4.917661684752273e-08, + "loss": 0.1214, + "step": 10201 + }, + { + "epoch": 0.9399732805085917, + "grad_norm": 0.9156287716615673, + "learning_rate": 4.9026378616127133e-08, + "loss": 0.118, + "step": 10202 + }, + { + "epoch": 0.9400654166858617, + "grad_norm": 0.889927739280015, + "learning_rate": 4.8876367957895744e-08, + "loss": 0.1084, + "step": 10203 + }, + { + "epoch": 0.9401575528631317, + "grad_norm": 0.9299760065843369, + "learning_rate": 4.872658488675741e-08, + "loss": 0.1127, + "step": 10204 + }, + { + "epoch": 0.9402496890404017, + "grad_norm": 0.9132737348421209, + "learning_rate": 4.8577029416619625e-08, + "loss": 0.1131, + "step": 10205 + }, + { + "epoch": 0.9403418252176717, + "grad_norm": 0.88489448512981, + "learning_rate": 4.84277015613685e-08, + "loss": 0.1083, + "step": 10206 + }, + { + "epoch": 0.9404339613949417, + "grad_norm": 1.0037419903582985, + "learning_rate": 4.8278601334869056e-08, + "loss": 0.1192, + "step": 10207 + }, + { + "epoch": 0.9405260975722117, + "grad_norm": 0.9268411474644621, + "learning_rate": 4.8129728750965224e-08, + "loss": 0.1128, + "step": 10208 + }, + { + "epoch": 0.9406182337494817, + "grad_norm": 0.9709277587211269, + "learning_rate": 4.79810838234801e-08, + "loss": 0.1303, + "step": 10209 + }, + { + "epoch": 0.9407103699267517, + "grad_norm": 0.8769457665825694, + "learning_rate": 4.7832666566215156e-08, + "loss": 0.0994, + "step": 10210 + }, + { + "epoch": 0.9408025061040217, + "grad_norm": 0.9298202881373417, + "learning_rate": 4.7684476992951033e-08, + "loss": 0.1122, + "step": 10211 + }, + { + "epoch": 0.9408946422812917, + "grad_norm": 0.8818956452577879, + "learning_rate": 4.753651511744728e-08, + "loss": 0.1072, + "step": 10212 + }, + { + "epoch": 0.9409867784585617, + "grad_norm": 0.9343822631531623, + "learning_rate": 4.738878095344207e-08, + "loss": 0.1167, + "step": 10213 + }, + { + "epoch": 0.9410789146358317, + "grad_norm": 0.9605236137441521, + "learning_rate": 4.7241274514652217e-08, + "loss": 0.114, + "step": 10214 + }, + { + "epoch": 0.9411710508131018, + "grad_norm": 0.9555618347520516, + "learning_rate": 4.7093995814773975e-08, + "loss": 0.1234, + "step": 10215 + }, + { + "epoch": 0.9412631869903718, + "grad_norm": 0.9108221732663505, + "learning_rate": 4.694694486748225e-08, + "loss": 0.1163, + "step": 10216 + }, + { + "epoch": 0.9413553231676418, + "grad_norm": 0.9386691959416757, + "learning_rate": 4.680012168643111e-08, + "loss": 0.1156, + "step": 10217 + }, + { + "epoch": 0.9414474593449118, + "grad_norm": 0.9589183202714626, + "learning_rate": 4.6653526285252437e-08, + "loss": 0.1186, + "step": 10218 + }, + { + "epoch": 0.9415395955221818, + "grad_norm": 0.9383960225445924, + "learning_rate": 4.650715867755784e-08, + "loss": 0.1124, + "step": 10219 + }, + { + "epoch": 0.9416317316994518, + "grad_norm": 0.9599936149241113, + "learning_rate": 4.636101887693756e-08, + "loss": 0.1156, + "step": 10220 + }, + { + "epoch": 0.9417238678767218, + "grad_norm": 0.9530960877295532, + "learning_rate": 4.621510689696046e-08, + "loss": 0.1234, + "step": 10221 + }, + { + "epoch": 0.9418160040539918, + "grad_norm": 0.9144532827198258, + "learning_rate": 4.606942275117543e-08, + "loss": 0.1082, + "step": 10222 + }, + { + "epoch": 0.9419081402312618, + "grad_norm": 0.9337094101499579, + "learning_rate": 4.5923966453108315e-08, + "loss": 0.1154, + "step": 10223 + }, + { + "epoch": 0.9420002764085318, + "grad_norm": 0.9107710053439749, + "learning_rate": 4.57787380162647e-08, + "loss": 0.1105, + "step": 10224 + }, + { + "epoch": 0.9420924125858018, + "grad_norm": 0.9887729337567444, + "learning_rate": 4.5633737454129636e-08, + "loss": 0.1222, + "step": 10225 + }, + { + "epoch": 0.9421845487630718, + "grad_norm": 1.0127249337390898, + "learning_rate": 4.548896478016651e-08, + "loss": 0.1213, + "step": 10226 + }, + { + "epoch": 0.9422766849403418, + "grad_norm": 0.8962299567936826, + "learning_rate": 4.5344420007816526e-08, + "loss": 0.1125, + "step": 10227 + }, + { + "epoch": 0.9423688211176118, + "grad_norm": 0.8985914309284595, + "learning_rate": 4.5200103150501996e-08, + "loss": 0.1132, + "step": 10228 + }, + { + "epoch": 0.9424609572948819, + "grad_norm": 0.8947342003365959, + "learning_rate": 4.5056014221621645e-08, + "loss": 0.1106, + "step": 10229 + }, + { + "epoch": 0.9425530934721519, + "grad_norm": 0.9554543314067567, + "learning_rate": 4.4912153234554777e-08, + "loss": 0.1163, + "step": 10230 + }, + { + "epoch": 0.9426452296494219, + "grad_norm": 0.8884133929587698, + "learning_rate": 4.4768520202658484e-08, + "loss": 0.1036, + "step": 10231 + }, + { + "epoch": 0.9427373658266919, + "grad_norm": 0.9632133595908018, + "learning_rate": 4.4625115139269314e-08, + "loss": 0.1237, + "step": 10232 + }, + { + "epoch": 0.9428295020039619, + "grad_norm": 0.9369237909664713, + "learning_rate": 4.448193805770273e-08, + "loss": 0.1118, + "step": 10233 + }, + { + "epoch": 0.9429216381812319, + "grad_norm": 0.9721798178694152, + "learning_rate": 4.4338988971252275e-08, + "loss": 0.1255, + "step": 10234 + }, + { + "epoch": 0.9430137743585019, + "grad_norm": 0.9855443317803402, + "learning_rate": 4.4196267893190926e-08, + "loss": 0.1181, + "step": 10235 + }, + { + "epoch": 0.9431059105357719, + "grad_norm": 0.9723576412165987, + "learning_rate": 4.4053774836770315e-08, + "loss": 0.1235, + "step": 10236 + }, + { + "epoch": 0.9431980467130419, + "grad_norm": 0.9171237878625916, + "learning_rate": 4.3911509815221244e-08, + "loss": 0.1065, + "step": 10237 + }, + { + "epoch": 0.9432901828903119, + "grad_norm": 0.9440385935903521, + "learning_rate": 4.3769472841752866e-08, + "loss": 0.1147, + "step": 10238 + }, + { + "epoch": 0.9433823190675819, + "grad_norm": 0.9451368053622421, + "learning_rate": 4.362766392955325e-08, + "loss": 0.1158, + "step": 10239 + }, + { + "epoch": 0.9434744552448519, + "grad_norm": 0.9943988884305303, + "learning_rate": 4.348608309178909e-08, + "loss": 0.1229, + "step": 10240 + }, + { + "epoch": 0.9435665914221218, + "grad_norm": 0.9477728019676438, + "learning_rate": 4.33447303416068e-08, + "loss": 0.1156, + "step": 10241 + }, + { + "epoch": 0.943658727599392, + "grad_norm": 0.9560883582930023, + "learning_rate": 4.320360569213061e-08, + "loss": 0.1123, + "step": 10242 + }, + { + "epoch": 0.943750863776662, + "grad_norm": 0.9330272969248564, + "learning_rate": 4.3062709156463936e-08, + "loss": 0.1157, + "step": 10243 + }, + { + "epoch": 0.943842999953932, + "grad_norm": 0.9343240680576235, + "learning_rate": 4.292204074768908e-08, + "loss": 0.1169, + "step": 10244 + }, + { + "epoch": 0.943935136131202, + "grad_norm": 0.9823045448492629, + "learning_rate": 4.278160047886753e-08, + "loss": 0.1266, + "step": 10245 + }, + { + "epoch": 0.9440272723084719, + "grad_norm": 0.9301202110619059, + "learning_rate": 4.264138836303861e-08, + "loss": 0.1203, + "step": 10246 + }, + { + "epoch": 0.9441194084857419, + "grad_norm": 0.9450681121818032, + "learning_rate": 4.250140441322131e-08, + "loss": 0.1272, + "step": 10247 + }, + { + "epoch": 0.9442115446630119, + "grad_norm": 0.9737075656993364, + "learning_rate": 4.236164864241277e-08, + "loss": 0.1184, + "step": 10248 + }, + { + "epoch": 0.9443036808402819, + "grad_norm": 0.9491703019406365, + "learning_rate": 4.22221210635898e-08, + "loss": 0.1153, + "step": 10249 + }, + { + "epoch": 0.9443958170175519, + "grad_norm": 0.9452455622285862, + "learning_rate": 4.208282168970762e-08, + "loss": 0.1173, + "step": 10250 + }, + { + "epoch": 0.9444879531948219, + "grad_norm": 1.0134904272797585, + "learning_rate": 4.1943750533700036e-08, + "loss": 0.1264, + "step": 10251 + }, + { + "epoch": 0.9445800893720919, + "grad_norm": 0.9769733494426245, + "learning_rate": 4.1804907608479494e-08, + "loss": 0.1312, + "step": 10252 + }, + { + "epoch": 0.9446722255493619, + "grad_norm": 0.9095232445090923, + "learning_rate": 4.166629292693791e-08, + "loss": 0.1095, + "step": 10253 + }, + { + "epoch": 0.9447643617266319, + "grad_norm": 0.9253806676477634, + "learning_rate": 4.1527906501945547e-08, + "loss": 0.1136, + "step": 10254 + }, + { + "epoch": 0.9448564979039019, + "grad_norm": 0.9756487468351606, + "learning_rate": 4.138974834635157e-08, + "loss": 0.123, + "step": 10255 + }, + { + "epoch": 0.944948634081172, + "grad_norm": 0.9438368581540895, + "learning_rate": 4.1251818472984315e-08, + "loss": 0.1184, + "step": 10256 + }, + { + "epoch": 0.945040770258442, + "grad_norm": 0.9009313023621, + "learning_rate": 4.1114116894650225e-08, + "loss": 0.1161, + "step": 10257 + }, + { + "epoch": 0.945132906435712, + "grad_norm": 0.9339442841007484, + "learning_rate": 4.0976643624134896e-08, + "loss": 0.114, + "step": 10258 + }, + { + "epoch": 0.945225042612982, + "grad_norm": 0.9994792676185544, + "learning_rate": 4.0839398674203114e-08, + "loss": 0.1172, + "step": 10259 + }, + { + "epoch": 0.945317178790252, + "grad_norm": 0.8813178121583533, + "learning_rate": 4.0702382057597465e-08, + "loss": 0.1085, + "step": 10260 + }, + { + "epoch": 0.945409314967522, + "grad_norm": 0.9806576557306624, + "learning_rate": 4.0565593787040555e-08, + "loss": 0.1229, + "step": 10261 + }, + { + "epoch": 0.945501451144792, + "grad_norm": 0.9284548478345062, + "learning_rate": 4.042903387523278e-08, + "loss": 0.1133, + "step": 10262 + }, + { + "epoch": 0.945593587322062, + "grad_norm": 0.961086375262101, + "learning_rate": 4.029270233485427e-08, + "loss": 0.1178, + "step": 10263 + }, + { + "epoch": 0.945685723499332, + "grad_norm": 0.924194713183989, + "learning_rate": 4.0156599178562686e-08, + "loss": 0.1157, + "step": 10264 + }, + { + "epoch": 0.945777859676602, + "grad_norm": 0.9922060533952876, + "learning_rate": 4.002072441899568e-08, + "loss": 0.1299, + "step": 10265 + }, + { + "epoch": 0.945869995853872, + "grad_norm": 0.8994135466256734, + "learning_rate": 3.988507806876929e-08, + "loss": 0.1099, + "step": 10266 + }, + { + "epoch": 0.945962132031142, + "grad_norm": 0.8825297317998159, + "learning_rate": 3.974966014047815e-08, + "loss": 0.1083, + "step": 10267 + }, + { + "epoch": 0.946054268208412, + "grad_norm": 0.8886493644730223, + "learning_rate": 3.961447064669582e-08, + "loss": 0.1077, + "step": 10268 + }, + { + "epoch": 0.946146404385682, + "grad_norm": 0.9517858277726193, + "learning_rate": 3.9479509599974486e-08, + "loss": 0.1205, + "step": 10269 + }, + { + "epoch": 0.9462385405629521, + "grad_norm": 0.9357653566018014, + "learning_rate": 3.9344777012845504e-08, + "loss": 0.1142, + "step": 10270 + }, + { + "epoch": 0.9463306767402221, + "grad_norm": 0.9994080080071049, + "learning_rate": 3.921027289781915e-08, + "loss": 0.1286, + "step": 10271 + }, + { + "epoch": 0.9464228129174921, + "grad_norm": 0.9701132832326953, + "learning_rate": 3.907599726738348e-08, + "loss": 0.1214, + "step": 10272 + }, + { + "epoch": 0.9465149490947621, + "grad_norm": 0.9475842718266658, + "learning_rate": 3.894195013400631e-08, + "loss": 0.1087, + "step": 10273 + }, + { + "epoch": 0.9466070852720321, + "grad_norm": 0.9428359279407029, + "learning_rate": 3.8808131510134074e-08, + "loss": 0.1263, + "step": 10274 + }, + { + "epoch": 0.9466992214493021, + "grad_norm": 0.97188586109351, + "learning_rate": 3.8674541408191824e-08, + "loss": 0.1213, + "step": 10275 + }, + { + "epoch": 0.9467913576265721, + "grad_norm": 0.97344652490752, + "learning_rate": 3.854117984058298e-08, + "loss": 0.1212, + "step": 10276 + }, + { + "epoch": 0.9468834938038421, + "grad_norm": 0.9359938550829311, + "learning_rate": 3.840804681969068e-08, + "loss": 0.1067, + "step": 10277 + }, + { + "epoch": 0.9469756299811121, + "grad_norm": 0.9344899765877749, + "learning_rate": 3.827514235787616e-08, + "loss": 0.1217, + "step": 10278 + }, + { + "epoch": 0.9470677661583821, + "grad_norm": 0.9439926374477899, + "learning_rate": 3.8142466467479265e-08, + "loss": 0.1105, + "step": 10279 + }, + { + "epoch": 0.9471599023356521, + "grad_norm": 0.9322879507849088, + "learning_rate": 3.801001916081987e-08, + "loss": 0.1158, + "step": 10280 + }, + { + "epoch": 0.9472520385129221, + "grad_norm": 0.935923550064033, + "learning_rate": 3.787780045019479e-08, + "loss": 0.119, + "step": 10281 + }, + { + "epoch": 0.947344174690192, + "grad_norm": 0.9640710670440555, + "learning_rate": 3.774581034788116e-08, + "loss": 0.1135, + "step": 10282 + }, + { + "epoch": 0.9474363108674622, + "grad_norm": 0.9311183807439256, + "learning_rate": 3.7614048866133624e-08, + "loss": 0.1173, + "step": 10283 + }, + { + "epoch": 0.9475284470447322, + "grad_norm": 0.9739842153491166, + "learning_rate": 3.748251601718711e-08, + "loss": 0.1223, + "step": 10284 + }, + { + "epoch": 0.9476205832220022, + "grad_norm": 0.9989638639290763, + "learning_rate": 3.7351211813253795e-08, + "loss": 0.1196, + "step": 10285 + }, + { + "epoch": 0.9477127193992722, + "grad_norm": 0.9387016890086035, + "learning_rate": 3.722013626652532e-08, + "loss": 0.1178, + "step": 10286 + }, + { + "epoch": 0.9478048555765421, + "grad_norm": 0.9004645700352083, + "learning_rate": 3.70892893891725e-08, + "loss": 0.1016, + "step": 10287 + }, + { + "epoch": 0.9478969917538121, + "grad_norm": 0.9072326721273851, + "learning_rate": 3.695867119334423e-08, + "loss": 0.1173, + "step": 10288 + }, + { + "epoch": 0.9479891279310821, + "grad_norm": 0.9778845392909062, + "learning_rate": 3.682828169116831e-08, + "loss": 0.1272, + "step": 10289 + }, + { + "epoch": 0.9480812641083521, + "grad_norm": 0.909610617103646, + "learning_rate": 3.669812089475144e-08, + "loss": 0.1161, + "step": 10290 + }, + { + "epoch": 0.9481734002856221, + "grad_norm": 0.9546052020757654, + "learning_rate": 3.656818881617924e-08, + "loss": 0.1204, + "step": 10291 + }, + { + "epoch": 0.9482655364628921, + "grad_norm": 0.9205463321641423, + "learning_rate": 3.6438485467515935e-08, + "loss": 0.1015, + "step": 10292 + }, + { + "epoch": 0.9483576726401621, + "grad_norm": 0.9848823587850997, + "learning_rate": 3.630901086080441e-08, + "loss": 0.1283, + "step": 10293 + }, + { + "epoch": 0.9484498088174321, + "grad_norm": 0.9526839289073572, + "learning_rate": 3.6179765008066134e-08, + "loss": 0.117, + "step": 10294 + }, + { + "epoch": 0.9485419449947021, + "grad_norm": 0.9203969240464469, + "learning_rate": 3.605074792130181e-08, + "loss": 0.1089, + "step": 10295 + }, + { + "epoch": 0.9486340811719721, + "grad_norm": 0.9418423365790961, + "learning_rate": 3.5921959612491006e-08, + "loss": 0.1118, + "step": 10296 + }, + { + "epoch": 0.9487262173492422, + "grad_norm": 0.9103613162527556, + "learning_rate": 3.5793400093591394e-08, + "loss": 0.1092, + "step": 10297 + }, + { + "epoch": 0.9488183535265122, + "grad_norm": 0.9557914846405104, + "learning_rate": 3.5665069376539796e-08, + "loss": 0.1276, + "step": 10298 + }, + { + "epoch": 0.9489104897037822, + "grad_norm": 0.9745637820171693, + "learning_rate": 3.553696747325142e-08, + "loss": 0.1125, + "step": 10299 + }, + { + "epoch": 0.9490026258810522, + "grad_norm": 0.9457597041155954, + "learning_rate": 3.540909439562118e-08, + "loss": 0.1186, + "step": 10300 + }, + { + "epoch": 0.9490947620583222, + "grad_norm": 1.0035411325101824, + "learning_rate": 3.528145015552154e-08, + "loss": 0.1309, + "step": 10301 + }, + { + "epoch": 0.9491868982355922, + "grad_norm": 0.9519212745079059, + "learning_rate": 3.515403476480439e-08, + "loss": 0.1118, + "step": 10302 + }, + { + "epoch": 0.9492790344128622, + "grad_norm": 0.9403154344488309, + "learning_rate": 3.5026848235300834e-08, + "loss": 0.11, + "step": 10303 + }, + { + "epoch": 0.9493711705901322, + "grad_norm": 0.9136726499930174, + "learning_rate": 3.489989057881948e-08, + "loss": 0.1133, + "step": 10304 + }, + { + "epoch": 0.9494633067674022, + "grad_norm": 0.955839488777064, + "learning_rate": 3.47731618071484e-08, + "loss": 0.1243, + "step": 10305 + }, + { + "epoch": 0.9495554429446722, + "grad_norm": 0.964765603667002, + "learning_rate": 3.4646661932054846e-08, + "loss": 0.1175, + "step": 10306 + }, + { + "epoch": 0.9496475791219422, + "grad_norm": 0.9055605967583641, + "learning_rate": 3.452039096528359e-08, + "loss": 0.1169, + "step": 10307 + }, + { + "epoch": 0.9497397152992122, + "grad_norm": 0.9543169846360642, + "learning_rate": 3.439434891855997e-08, + "loss": 0.1117, + "step": 10308 + }, + { + "epoch": 0.9498318514764822, + "grad_norm": 0.90718852701149, + "learning_rate": 3.42685358035863e-08, + "loss": 0.117, + "step": 10309 + }, + { + "epoch": 0.9499239876537523, + "grad_norm": 0.9405814649991735, + "learning_rate": 3.4142951632044065e-08, + "loss": 0.1161, + "step": 10310 + }, + { + "epoch": 0.9500161238310223, + "grad_norm": 1.0220816495719443, + "learning_rate": 3.401759641559449e-08, + "loss": 0.1268, + "step": 10311 + }, + { + "epoch": 0.9501082600082923, + "grad_norm": 0.9680480102811909, + "learning_rate": 3.3892470165876045e-08, + "loss": 0.1143, + "step": 10312 + }, + { + "epoch": 0.9502003961855623, + "grad_norm": 0.9410029223109165, + "learning_rate": 3.376757289450777e-08, + "loss": 0.1156, + "step": 10313 + }, + { + "epoch": 0.9502925323628323, + "grad_norm": 0.8905182542520986, + "learning_rate": 3.3642904613085393e-08, + "loss": 0.1117, + "step": 10314 + }, + { + "epoch": 0.9503846685401023, + "grad_norm": 0.9202657757695479, + "learning_rate": 3.3518465333184925e-08, + "loss": 0.1192, + "step": 10315 + }, + { + "epoch": 0.9504768047173723, + "grad_norm": 0.914152702938313, + "learning_rate": 3.339425506636018e-08, + "loss": 0.1054, + "step": 10316 + }, + { + "epoch": 0.9505689408946423, + "grad_norm": 0.9279714745854521, + "learning_rate": 3.327027382414444e-08, + "loss": 0.1131, + "step": 10317 + }, + { + "epoch": 0.9506610770719123, + "grad_norm": 0.9479869182555316, + "learning_rate": 3.314652161804932e-08, + "loss": 0.1161, + "step": 10318 + }, + { + "epoch": 0.9507532132491823, + "grad_norm": 0.954111959930808, + "learning_rate": 3.30229984595648e-08, + "loss": 0.1201, + "step": 10319 + }, + { + "epoch": 0.9508453494264523, + "grad_norm": 0.9734374242952998, + "learning_rate": 3.289970436016088e-08, + "loss": 0.1221, + "step": 10320 + }, + { + "epoch": 0.9509374856037223, + "grad_norm": 0.9428411208222813, + "learning_rate": 3.2776639331284774e-08, + "loss": 0.1144, + "step": 10321 + }, + { + "epoch": 0.9510296217809923, + "grad_norm": 0.9317668049150903, + "learning_rate": 3.2653803384362914e-08, + "loss": 0.1181, + "step": 10322 + }, + { + "epoch": 0.9511217579582623, + "grad_norm": 0.9298411302304473, + "learning_rate": 3.253119653080117e-08, + "loss": 0.1223, + "step": 10323 + }, + { + "epoch": 0.9512138941355324, + "grad_norm": 0.9151941756140258, + "learning_rate": 3.240881878198349e-08, + "loss": 0.1185, + "step": 10324 + }, + { + "epoch": 0.9513060303128024, + "grad_norm": 0.8593540933721838, + "learning_rate": 3.228667014927245e-08, + "loss": 0.1042, + "step": 10325 + }, + { + "epoch": 0.9513981664900724, + "grad_norm": 0.9377788085836343, + "learning_rate": 3.2164750644009814e-08, + "loss": 0.1217, + "step": 10326 + }, + { + "epoch": 0.9514903026673424, + "grad_norm": 0.9133361500053995, + "learning_rate": 3.204306027751541e-08, + "loss": 0.1116, + "step": 10327 + }, + { + "epoch": 0.9515824388446124, + "grad_norm": 0.9487359846675105, + "learning_rate": 3.1921599061088546e-08, + "loss": 0.1249, + "step": 10328 + }, + { + "epoch": 0.9516745750218824, + "grad_norm": 0.9343318497067161, + "learning_rate": 3.180036700600686e-08, + "loss": 0.1114, + "step": 10329 + }, + { + "epoch": 0.9517667111991523, + "grad_norm": 0.9296746242117467, + "learning_rate": 3.1679364123526625e-08, + "loss": 0.1187, + "step": 10330 + }, + { + "epoch": 0.9518588473764223, + "grad_norm": 0.9647571322981348, + "learning_rate": 3.1558590424883294e-08, + "loss": 0.1276, + "step": 10331 + }, + { + "epoch": 0.9519509835536923, + "grad_norm": 0.9685146771017278, + "learning_rate": 3.1438045921290404e-08, + "loss": 0.124, + "step": 10332 + }, + { + "epoch": 0.9520431197309623, + "grad_norm": 0.8476579863265571, + "learning_rate": 3.1317730623940665e-08, + "loss": 0.1033, + "step": 10333 + }, + { + "epoch": 0.9521352559082323, + "grad_norm": 0.9532082684247339, + "learning_rate": 3.119764454400515e-08, + "loss": 0.1218, + "step": 10334 + }, + { + "epoch": 0.9522273920855023, + "grad_norm": 0.9453757489867697, + "learning_rate": 3.1077787692634085e-08, + "loss": 0.121, + "step": 10335 + }, + { + "epoch": 0.9523195282627723, + "grad_norm": 0.9229009382067201, + "learning_rate": 3.095816008095637e-08, + "loss": 0.1149, + "step": 10336 + }, + { + "epoch": 0.9524116644400424, + "grad_norm": 1.0206879719212576, + "learning_rate": 3.083876172007894e-08, + "loss": 0.122, + "step": 10337 + }, + { + "epoch": 0.9525038006173124, + "grad_norm": 0.8983665500372922, + "learning_rate": 3.071959262108848e-08, + "loss": 0.1063, + "step": 10338 + }, + { + "epoch": 0.9525959367945824, + "grad_norm": 0.9479813529207537, + "learning_rate": 3.0600652795049204e-08, + "loss": 0.112, + "step": 10339 + }, + { + "epoch": 0.9526880729718524, + "grad_norm": 0.9177433261172819, + "learning_rate": 3.048194225300532e-08, + "loss": 0.112, + "step": 10340 + }, + { + "epoch": 0.9527802091491224, + "grad_norm": 0.9370376744228802, + "learning_rate": 3.0363461005978865e-08, + "loss": 0.1103, + "step": 10341 + }, + { + "epoch": 0.9528723453263924, + "grad_norm": 0.9060338446923794, + "learning_rate": 3.024520906497103e-08, + "loss": 0.1088, + "step": 10342 + }, + { + "epoch": 0.9529644815036624, + "grad_norm": 0.9151253682669213, + "learning_rate": 3.012718644096107e-08, + "loss": 0.116, + "step": 10343 + }, + { + "epoch": 0.9530566176809324, + "grad_norm": 0.9662487060386523, + "learning_rate": 3.0009393144907475e-08, + "loss": 0.1025, + "step": 10344 + }, + { + "epoch": 0.9531487538582024, + "grad_norm": 0.9507650484823077, + "learning_rate": 2.989182918774786e-08, + "loss": 0.1208, + "step": 10345 + }, + { + "epoch": 0.9532408900354724, + "grad_norm": 1.045514593246912, + "learning_rate": 2.977449458039766e-08, + "loss": 0.1339, + "step": 10346 + }, + { + "epoch": 0.9533330262127424, + "grad_norm": 0.9427860617410642, + "learning_rate": 2.9657389333751784e-08, + "loss": 0.1154, + "step": 10347 + }, + { + "epoch": 0.9534251623900124, + "grad_norm": 0.9225913930801135, + "learning_rate": 2.954051345868264e-08, + "loss": 0.1031, + "step": 10348 + }, + { + "epoch": 0.9535172985672824, + "grad_norm": 0.9038936333486661, + "learning_rate": 2.9423866966042935e-08, + "loss": 0.1116, + "step": 10349 + }, + { + "epoch": 0.9536094347445524, + "grad_norm": 0.9543339656289974, + "learning_rate": 2.9307449866663174e-08, + "loss": 0.1174, + "step": 10350 + }, + { + "epoch": 0.9537015709218225, + "grad_norm": 0.9056991309017217, + "learning_rate": 2.9191262171352486e-08, + "loss": 0.1111, + "step": 10351 + }, + { + "epoch": 0.9537937070990925, + "grad_norm": 0.9861185880449297, + "learning_rate": 2.9075303890899187e-08, + "loss": 0.1282, + "step": 10352 + }, + { + "epoch": 0.9538858432763625, + "grad_norm": 0.9621592809326136, + "learning_rate": 2.895957503606939e-08, + "loss": 0.1274, + "step": 10353 + }, + { + "epoch": 0.9539779794536325, + "grad_norm": 0.8922039949226299, + "learning_rate": 2.8844075617609492e-08, + "loss": 0.1105, + "step": 10354 + }, + { + "epoch": 0.9540701156309025, + "grad_norm": 0.9228402469930214, + "learning_rate": 2.8728805646242863e-08, + "loss": 0.1128, + "step": 10355 + }, + { + "epoch": 0.9541622518081725, + "grad_norm": 0.9617168790205287, + "learning_rate": 2.8613765132672612e-08, + "loss": 0.1128, + "step": 10356 + }, + { + "epoch": 0.9542543879854425, + "grad_norm": 0.8580565446966045, + "learning_rate": 2.8498954087580187e-08, + "loss": 0.0943, + "step": 10357 + }, + { + "epoch": 0.9543465241627125, + "grad_norm": 0.9520942535427239, + "learning_rate": 2.8384372521626236e-08, + "loss": 0.1172, + "step": 10358 + }, + { + "epoch": 0.9544386603399825, + "grad_norm": 0.9365343890221104, + "learning_rate": 2.827002044544891e-08, + "loss": 0.1148, + "step": 10359 + }, + { + "epoch": 0.9545307965172525, + "grad_norm": 0.933691258317069, + "learning_rate": 2.8155897869666105e-08, + "loss": 0.1206, + "step": 10360 + }, + { + "epoch": 0.9546229326945225, + "grad_norm": 0.9104238953866819, + "learning_rate": 2.8042004804874346e-08, + "loss": 0.1116, + "step": 10361 + }, + { + "epoch": 0.9547150688717925, + "grad_norm": 0.8681396545723196, + "learning_rate": 2.7928341261648507e-08, + "loss": 0.1018, + "step": 10362 + }, + { + "epoch": 0.9548072050490625, + "grad_norm": 0.9477791985994918, + "learning_rate": 2.7814907250542368e-08, + "loss": 0.1133, + "step": 10363 + }, + { + "epoch": 0.9548993412263325, + "grad_norm": 0.9830223390489866, + "learning_rate": 2.770170278208806e-08, + "loss": 0.1211, + "step": 10364 + }, + { + "epoch": 0.9549914774036026, + "grad_norm": 0.9651177678762349, + "learning_rate": 2.7588727866796617e-08, + "loss": 0.1193, + "step": 10365 + }, + { + "epoch": 0.9550836135808726, + "grad_norm": 0.9231632131169901, + "learning_rate": 2.7475982515157986e-08, + "loss": 0.108, + "step": 10366 + }, + { + "epoch": 0.9551757497581426, + "grad_norm": 0.9546876190711204, + "learning_rate": 2.7363466737640453e-08, + "loss": 0.1126, + "step": 10367 + }, + { + "epoch": 0.9552678859354126, + "grad_norm": 0.9264536078414163, + "learning_rate": 2.7251180544691225e-08, + "loss": 0.1166, + "step": 10368 + }, + { + "epoch": 0.9553600221126826, + "grad_norm": 0.948299270414955, + "learning_rate": 2.7139123946735847e-08, + "loss": 0.1229, + "step": 10369 + }, + { + "epoch": 0.9554521582899526, + "grad_norm": 0.9773410646908133, + "learning_rate": 2.7027296954178773e-08, + "loss": 0.1169, + "step": 10370 + }, + { + "epoch": 0.9555442944672226, + "grad_norm": 0.9411232679657067, + "learning_rate": 2.6915699577403644e-08, + "loss": 0.1119, + "step": 10371 + }, + { + "epoch": 0.9556364306444926, + "grad_norm": 0.9173845585128807, + "learning_rate": 2.680433182677189e-08, + "loss": 0.1151, + "step": 10372 + }, + { + "epoch": 0.9557285668217625, + "grad_norm": 0.9916694029075123, + "learning_rate": 2.6693193712624133e-08, + "loss": 0.1183, + "step": 10373 + }, + { + "epoch": 0.9558207029990325, + "grad_norm": 0.8983875748435834, + "learning_rate": 2.6582285245279338e-08, + "loss": 0.1121, + "step": 10374 + }, + { + "epoch": 0.9559128391763025, + "grad_norm": 0.916895182755261, + "learning_rate": 2.6471606435035934e-08, + "loss": 0.1075, + "step": 10375 + }, + { + "epoch": 0.9560049753535725, + "grad_norm": 0.9347044640622088, + "learning_rate": 2.6361157292169593e-08, + "loss": 0.1195, + "step": 10376 + }, + { + "epoch": 0.9560971115308425, + "grad_norm": 0.9614171099198148, + "learning_rate": 2.6250937826936274e-08, + "loss": 0.12, + "step": 10377 + }, + { + "epoch": 0.9561892477081126, + "grad_norm": 0.891382660362729, + "learning_rate": 2.6140948049569737e-08, + "loss": 0.111, + "step": 10378 + }, + { + "epoch": 0.9562813838853826, + "grad_norm": 0.9757278767219288, + "learning_rate": 2.603118797028209e-08, + "loss": 0.1256, + "step": 10379 + }, + { + "epoch": 0.9563735200626526, + "grad_norm": 0.9466608595122338, + "learning_rate": 2.592165759926518e-08, + "loss": 0.1157, + "step": 10380 + }, + { + "epoch": 0.9564656562399226, + "grad_norm": 0.9383107114478125, + "learning_rate": 2.5812356946688376e-08, + "loss": 0.1102, + "step": 10381 + }, + { + "epoch": 0.9565577924171926, + "grad_norm": 0.9982385805871572, + "learning_rate": 2.5703286022700503e-08, + "loss": 0.1209, + "step": 10382 + }, + { + "epoch": 0.9566499285944626, + "grad_norm": 0.9727363491362462, + "learning_rate": 2.559444483742901e-08, + "loss": 0.122, + "step": 10383 + }, + { + "epoch": 0.9567420647717326, + "grad_norm": 0.9155845511468633, + "learning_rate": 2.548583340097971e-08, + "loss": 0.1099, + "step": 10384 + }, + { + "epoch": 0.9568342009490026, + "grad_norm": 0.9366488234735828, + "learning_rate": 2.5377451723436753e-08, + "loss": 0.1122, + "step": 10385 + }, + { + "epoch": 0.9569263371262726, + "grad_norm": 0.9856402469594859, + "learning_rate": 2.5269299814863756e-08, + "loss": 0.125, + "step": 10386 + }, + { + "epoch": 0.9570184733035426, + "grad_norm": 0.9558864294357874, + "learning_rate": 2.5161377685302968e-08, + "loss": 0.1283, + "step": 10387 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 0.9764377825032131, + "learning_rate": 2.505368534477415e-08, + "loss": 0.1199, + "step": 10388 + }, + { + "epoch": 0.9572027456580826, + "grad_norm": 0.8978698039692437, + "learning_rate": 2.4946222803277354e-08, + "loss": 0.0986, + "step": 10389 + }, + { + "epoch": 0.9572948818353526, + "grad_norm": 0.9801110295191998, + "learning_rate": 2.483899007078988e-08, + "loss": 0.1306, + "step": 10390 + }, + { + "epoch": 0.9573870180126226, + "grad_norm": 0.9985048936130674, + "learning_rate": 2.4731987157268768e-08, + "loss": 0.1247, + "step": 10391 + }, + { + "epoch": 0.9574791541898927, + "grad_norm": 0.9170543020639506, + "learning_rate": 2.462521407264912e-08, + "loss": 0.1122, + "step": 10392 + }, + { + "epoch": 0.9575712903671627, + "grad_norm": 0.9342437829032304, + "learning_rate": 2.4518670826844393e-08, + "loss": 0.1129, + "step": 10393 + }, + { + "epoch": 0.9576634265444327, + "grad_norm": 0.9286150111365903, + "learning_rate": 2.4412357429747514e-08, + "loss": 0.1156, + "step": 10394 + }, + { + "epoch": 0.9577555627217027, + "grad_norm": 0.9298276602062225, + "learning_rate": 2.4306273891230025e-08, + "loss": 0.1104, + "step": 10395 + }, + { + "epoch": 0.9578476988989727, + "grad_norm": 1.0298146827458847, + "learning_rate": 2.4200420221141274e-08, + "loss": 0.1333, + "step": 10396 + }, + { + "epoch": 0.9579398350762427, + "grad_norm": 1.0308681272587117, + "learning_rate": 2.4094796429310063e-08, + "loss": 0.1398, + "step": 10397 + }, + { + "epoch": 0.9580319712535127, + "grad_norm": 0.9327133822677386, + "learning_rate": 2.398940252554327e-08, + "loss": 0.1067, + "step": 10398 + }, + { + "epoch": 0.9581241074307827, + "grad_norm": 0.9462850181389516, + "learning_rate": 2.3884238519626957e-08, + "loss": 0.1124, + "step": 10399 + }, + { + "epoch": 0.9582162436080527, + "grad_norm": 0.9239641699815608, + "learning_rate": 2.3779304421325532e-08, + "loss": 0.1116, + "step": 10400 + }, + { + "epoch": 0.9583083797853227, + "grad_norm": 0.9651337343147548, + "learning_rate": 2.3674600240382594e-08, + "loss": 0.1218, + "step": 10401 + }, + { + "epoch": 0.9584005159625927, + "grad_norm": 0.9622370171729949, + "learning_rate": 2.3570125986518977e-08, + "loss": 0.1234, + "step": 10402 + }, + { + "epoch": 0.9584926521398627, + "grad_norm": 0.9408894748477682, + "learning_rate": 2.346588166943581e-08, + "loss": 0.115, + "step": 10403 + }, + { + "epoch": 0.9585847883171327, + "grad_norm": 0.9783764871100286, + "learning_rate": 2.336186729881229e-08, + "loss": 0.1194, + "step": 10404 + }, + { + "epoch": 0.9586769244944028, + "grad_norm": 0.9756132049009393, + "learning_rate": 2.32580828843057e-08, + "loss": 0.1186, + "step": 10405 + }, + { + "epoch": 0.9587690606716728, + "grad_norm": 0.9510515294492892, + "learning_rate": 2.3154528435553046e-08, + "loss": 0.1092, + "step": 10406 + }, + { + "epoch": 0.9588611968489428, + "grad_norm": 0.8845538346516818, + "learning_rate": 2.3051203962168588e-08, + "loss": 0.0984, + "step": 10407 + }, + { + "epoch": 0.9589533330262128, + "grad_norm": 0.9793169380312722, + "learning_rate": 2.2948109473746593e-08, + "loss": 0.1201, + "step": 10408 + }, + { + "epoch": 0.9590454692034828, + "grad_norm": 0.9673626523788653, + "learning_rate": 2.2845244979859127e-08, + "loss": 0.1207, + "step": 10409 + }, + { + "epoch": 0.9591376053807528, + "grad_norm": 0.9894712915426592, + "learning_rate": 2.274261049005716e-08, + "loss": 0.1243, + "step": 10410 + }, + { + "epoch": 0.9592297415580228, + "grad_norm": 0.9445587376170052, + "learning_rate": 2.264020601387057e-08, + "loss": 0.1186, + "step": 10411 + }, + { + "epoch": 0.9593218777352928, + "grad_norm": 0.9923210445189716, + "learning_rate": 2.2538031560807584e-08, + "loss": 0.1362, + "step": 10412 + }, + { + "epoch": 0.9594140139125628, + "grad_norm": 0.9947331667454813, + "learning_rate": 2.243608714035478e-08, + "loss": 0.1276, + "step": 10413 + }, + { + "epoch": 0.9595061500898328, + "grad_norm": 0.9192672464588488, + "learning_rate": 2.2334372761977918e-08, + "loss": 0.1123, + "step": 10414 + }, + { + "epoch": 0.9595982862671028, + "grad_norm": 0.9176602052717612, + "learning_rate": 2.2232888435121115e-08, + "loss": 0.1082, + "step": 10415 + }, + { + "epoch": 0.9596904224443727, + "grad_norm": 0.8490744571811496, + "learning_rate": 2.213163416920766e-08, + "loss": 0.1001, + "step": 10416 + }, + { + "epoch": 0.9597825586216427, + "grad_norm": 1.0246092310280255, + "learning_rate": 2.203060997363837e-08, + "loss": 0.1357, + "step": 10417 + }, + { + "epoch": 0.9598746947989127, + "grad_norm": 0.9870790553638401, + "learning_rate": 2.1929815857793802e-08, + "loss": 0.1232, + "step": 10418 + }, + { + "epoch": 0.9599668309761828, + "grad_norm": 0.8683466030495433, + "learning_rate": 2.1829251831032293e-08, + "loss": 0.1048, + "step": 10419 + }, + { + "epoch": 0.9600589671534528, + "grad_norm": 0.9473957011467433, + "learning_rate": 2.172891790269166e-08, + "loss": 0.1191, + "step": 10420 + }, + { + "epoch": 0.9601511033307228, + "grad_norm": 0.9543960166674035, + "learning_rate": 2.1628814082087503e-08, + "loss": 0.1059, + "step": 10421 + }, + { + "epoch": 0.9602432395079928, + "grad_norm": 0.9508773504440561, + "learning_rate": 2.1528940378514885e-08, + "loss": 0.1162, + "step": 10422 + }, + { + "epoch": 0.9603353756852628, + "grad_norm": 0.9490154248407026, + "learning_rate": 2.142929680124667e-08, + "loss": 0.1193, + "step": 10423 + }, + { + "epoch": 0.9604275118625328, + "grad_norm": 0.9539270143381033, + "learning_rate": 2.1329883359535174e-08, + "loss": 0.1224, + "step": 10424 + }, + { + "epoch": 0.9605196480398028, + "grad_norm": 0.9445515396370072, + "learning_rate": 2.12307000626108e-08, + "loss": 0.1116, + "step": 10425 + }, + { + "epoch": 0.9606117842170728, + "grad_norm": 0.9336698772896715, + "learning_rate": 2.113174691968256e-08, + "loss": 0.1222, + "step": 10426 + }, + { + "epoch": 0.9607039203943428, + "grad_norm": 0.9736147739548708, + "learning_rate": 2.103302393993867e-08, + "loss": 0.1215, + "step": 10427 + }, + { + "epoch": 0.9607960565716128, + "grad_norm": 0.9610154886959913, + "learning_rate": 2.0934531132544845e-08, + "loss": 0.1119, + "step": 10428 + }, + { + "epoch": 0.9608881927488828, + "grad_norm": 0.9233255796310752, + "learning_rate": 2.0836268506647108e-08, + "loss": 0.1159, + "step": 10429 + }, + { + "epoch": 0.9609803289261528, + "grad_norm": 1.0019170763885927, + "learning_rate": 2.0738236071368157e-08, + "loss": 0.1267, + "step": 10430 + }, + { + "epoch": 0.9610724651034228, + "grad_norm": 0.9475199883723734, + "learning_rate": 2.0640433835810992e-08, + "loss": 0.124, + "step": 10431 + }, + { + "epoch": 0.9611646012806928, + "grad_norm": 0.9505982590713935, + "learning_rate": 2.0542861809056403e-08, + "loss": 0.1195, + "step": 10432 + }, + { + "epoch": 0.9612567374579629, + "grad_norm": 0.9229846484513212, + "learning_rate": 2.044552000016409e-08, + "loss": 0.1221, + "step": 10433 + }, + { + "epoch": 0.9613488736352329, + "grad_norm": 1.0038863547016854, + "learning_rate": 2.0348408418172095e-08, + "loss": 0.1264, + "step": 10434 + }, + { + "epoch": 0.9614410098125029, + "grad_norm": 0.9560645257488093, + "learning_rate": 2.025152707209682e-08, + "loss": 0.1254, + "step": 10435 + }, + { + "epoch": 0.9615331459897729, + "grad_norm": 0.9731366034584815, + "learning_rate": 2.0154875970934406e-08, + "loss": 0.1203, + "step": 10436 + }, + { + "epoch": 0.9616252821670429, + "grad_norm": 0.87193348832153, + "learning_rate": 2.0058455123658783e-08, + "loss": 0.1034, + "step": 10437 + }, + { + "epoch": 0.9617174183443129, + "grad_norm": 0.9036349774099822, + "learning_rate": 1.996226453922251e-08, + "loss": 0.1133, + "step": 10438 + }, + { + "epoch": 0.9618095545215829, + "grad_norm": 0.9620815398911813, + "learning_rate": 1.98663042265565e-08, + "loss": 0.122, + "step": 10439 + }, + { + "epoch": 0.9619016906988529, + "grad_norm": 0.9346474041117219, + "learning_rate": 1.97705741945714e-08, + "loss": 0.1164, + "step": 10440 + }, + { + "epoch": 0.9619938268761229, + "grad_norm": 0.9400603989396563, + "learning_rate": 1.9675074452155385e-08, + "loss": 0.122, + "step": 10441 + }, + { + "epoch": 0.9620859630533929, + "grad_norm": 0.9234460209107835, + "learning_rate": 1.9579805008175524e-08, + "loss": 0.1125, + "step": 10442 + }, + { + "epoch": 0.9621780992306629, + "grad_norm": 0.9414505506791186, + "learning_rate": 1.9484765871477795e-08, + "loss": 0.124, + "step": 10443 + }, + { + "epoch": 0.9622702354079329, + "grad_norm": 1.0014286121077527, + "learning_rate": 1.9389957050886255e-08, + "loss": 0.1299, + "step": 10444 + }, + { + "epoch": 0.9623623715852029, + "grad_norm": 0.9142956574674312, + "learning_rate": 1.9295378555204692e-08, + "loss": 0.1108, + "step": 10445 + }, + { + "epoch": 0.962454507762473, + "grad_norm": 0.8926286890511982, + "learning_rate": 1.920103039321386e-08, + "loss": 0.1038, + "step": 10446 + }, + { + "epoch": 0.962546643939743, + "grad_norm": 0.9334442791983129, + "learning_rate": 1.910691257367425e-08, + "loss": 0.1185, + "step": 10447 + }, + { + "epoch": 0.962638780117013, + "grad_norm": 0.9226218209835843, + "learning_rate": 1.9013025105324988e-08, + "loss": 0.1156, + "step": 10448 + }, + { + "epoch": 0.962730916294283, + "grad_norm": 1.0008713216068006, + "learning_rate": 1.8919367996883263e-08, + "loss": 0.1358, + "step": 10449 + }, + { + "epoch": 0.962823052471553, + "grad_norm": 0.9420362549155533, + "learning_rate": 1.8825941257045178e-08, + "loss": 0.1215, + "step": 10450 + }, + { + "epoch": 0.962915188648823, + "grad_norm": 0.9479544981846614, + "learning_rate": 1.8732744894485732e-08, + "loss": 0.1147, + "step": 10451 + }, + { + "epoch": 0.963007324826093, + "grad_norm": 0.911141437496614, + "learning_rate": 1.8639778917857732e-08, + "loss": 0.1154, + "step": 10452 + }, + { + "epoch": 0.963099461003363, + "grad_norm": 0.9618422043315887, + "learning_rate": 1.8547043335793435e-08, + "loss": 0.1207, + "step": 10453 + }, + { + "epoch": 0.963191597180633, + "grad_norm": 0.9380728000761778, + "learning_rate": 1.845453815690318e-08, + "loss": 0.1165, + "step": 10454 + }, + { + "epoch": 0.963283733357903, + "grad_norm": 0.9173483292287571, + "learning_rate": 1.8362263389775926e-08, + "loss": 0.1158, + "step": 10455 + }, + { + "epoch": 0.963375869535173, + "grad_norm": 0.9578019512184618, + "learning_rate": 1.827021904297982e-08, + "loss": 0.1068, + "step": 10456 + }, + { + "epoch": 0.963468005712443, + "grad_norm": 0.9760195511875756, + "learning_rate": 1.8178405125060804e-08, + "loss": 0.1254, + "step": 10457 + }, + { + "epoch": 0.963560141889713, + "grad_norm": 0.9428947080982596, + "learning_rate": 1.8086821644544283e-08, + "loss": 0.1093, + "step": 10458 + }, + { + "epoch": 0.963652278066983, + "grad_norm": 0.952402275184984, + "learning_rate": 1.7995468609933176e-08, + "loss": 0.1197, + "step": 10459 + }, + { + "epoch": 0.9637444142442531, + "grad_norm": 0.9459210733717505, + "learning_rate": 1.790434602971014e-08, + "loss": 0.1174, + "step": 10460 + }, + { + "epoch": 0.963836550421523, + "grad_norm": 0.9453907795866404, + "learning_rate": 1.7813453912335354e-08, + "loss": 0.1207, + "step": 10461 + }, + { + "epoch": 0.963928686598793, + "grad_norm": 0.9770369213173757, + "learning_rate": 1.772279226624901e-08, + "loss": 0.1189, + "step": 10462 + }, + { + "epoch": 0.964020822776063, + "grad_norm": 0.9195107982697946, + "learning_rate": 1.7632361099867988e-08, + "loss": 0.1148, + "step": 10463 + }, + { + "epoch": 0.964112958953333, + "grad_norm": 0.9331915458684126, + "learning_rate": 1.7542160421590017e-08, + "loss": 0.1204, + "step": 10464 + }, + { + "epoch": 0.964205095130603, + "grad_norm": 0.8800809838568525, + "learning_rate": 1.7452190239789225e-08, + "loss": 0.1125, + "step": 10465 + }, + { + "epoch": 0.964297231307873, + "grad_norm": 0.932302653775477, + "learning_rate": 1.7362450562819765e-08, + "loss": 0.1112, + "step": 10466 + }, + { + "epoch": 0.964389367485143, + "grad_norm": 0.9339782759306302, + "learning_rate": 1.7272941399013865e-08, + "loss": 0.1203, + "step": 10467 + }, + { + "epoch": 0.964481503662413, + "grad_norm": 0.9185719997260997, + "learning_rate": 1.718366275668265e-08, + "loss": 0.1135, + "step": 10468 + }, + { + "epoch": 0.964573639839683, + "grad_norm": 0.947190121034125, + "learning_rate": 1.7094614644115605e-08, + "loss": 0.1202, + "step": 10469 + }, + { + "epoch": 0.964665776016953, + "grad_norm": 0.9272255443211805, + "learning_rate": 1.700579706958083e-08, + "loss": 0.1116, + "step": 10470 + }, + { + "epoch": 0.964757912194223, + "grad_norm": 0.9462412750065758, + "learning_rate": 1.6917210041325073e-08, + "loss": 0.1239, + "step": 10471 + }, + { + "epoch": 0.964850048371493, + "grad_norm": 0.8873810871132549, + "learning_rate": 1.6828853567573413e-08, + "loss": 0.1103, + "step": 10472 + }, + { + "epoch": 0.9649421845487631, + "grad_norm": 0.9566149931244136, + "learning_rate": 1.6740727656529844e-08, + "loss": 0.1229, + "step": 10473 + }, + { + "epoch": 0.9650343207260331, + "grad_norm": 0.8996264087378854, + "learning_rate": 1.6652832316377264e-08, + "loss": 0.1136, + "step": 10474 + }, + { + "epoch": 0.9651264569033031, + "grad_norm": 0.9032876556833013, + "learning_rate": 1.6565167555276373e-08, + "loss": 0.108, + "step": 10475 + }, + { + "epoch": 0.9652185930805731, + "grad_norm": 0.9747145064783178, + "learning_rate": 1.6477733381367043e-08, + "loss": 0.1172, + "step": 10476 + }, + { + "epoch": 0.9653107292578431, + "grad_norm": 0.955471389790214, + "learning_rate": 1.639052980276723e-08, + "loss": 0.1171, + "step": 10477 + }, + { + "epoch": 0.9654028654351131, + "grad_norm": 0.9422341858068135, + "learning_rate": 1.6303556827574062e-08, + "loss": 0.1103, + "step": 10478 + }, + { + "epoch": 0.9654950016123831, + "grad_norm": 0.9833514913519336, + "learning_rate": 1.6216814463863028e-08, + "loss": 0.1112, + "step": 10479 + }, + { + "epoch": 0.9655871377896531, + "grad_norm": 0.9459532688328239, + "learning_rate": 1.6130302719687962e-08, + "loss": 0.1148, + "step": 10480 + }, + { + "epoch": 0.9656792739669231, + "grad_norm": 0.9261751377453326, + "learning_rate": 1.6044021603081607e-08, + "loss": 0.1188, + "step": 10481 + }, + { + "epoch": 0.9657714101441931, + "grad_norm": 0.9328018965578682, + "learning_rate": 1.5957971122055327e-08, + "loss": 0.1224, + "step": 10482 + }, + { + "epoch": 0.9658635463214631, + "grad_norm": 0.9127921231957702, + "learning_rate": 1.5872151284598848e-08, + "loss": 0.1106, + "step": 10483 + }, + { + "epoch": 0.9659556824987331, + "grad_norm": 0.9700118414319213, + "learning_rate": 1.5786562098680235e-08, + "loss": 0.1217, + "step": 10484 + }, + { + "epoch": 0.9660478186760031, + "grad_norm": 0.9464480098243716, + "learning_rate": 1.570120357224647e-08, + "loss": 0.1133, + "step": 10485 + }, + { + "epoch": 0.9661399548532731, + "grad_norm": 0.9112094496345272, + "learning_rate": 1.561607571322371e-08, + "loss": 0.1093, + "step": 10486 + }, + { + "epoch": 0.9662320910305432, + "grad_norm": 0.9105593590010037, + "learning_rate": 1.5531178529515635e-08, + "loss": 0.1097, + "step": 10487 + }, + { + "epoch": 0.9663242272078132, + "grad_norm": 0.9055822997245453, + "learning_rate": 1.54465120290051e-08, + "loss": 0.1054, + "step": 10488 + }, + { + "epoch": 0.9664163633850832, + "grad_norm": 0.9365050243180014, + "learning_rate": 1.5362076219553048e-08, + "loss": 0.1167, + "step": 10489 + }, + { + "epoch": 0.9665084995623532, + "grad_norm": 0.9324783994074396, + "learning_rate": 1.5277871108999586e-08, + "loss": 0.1196, + "step": 10490 + }, + { + "epoch": 0.9666006357396232, + "grad_norm": 0.9687391645872897, + "learning_rate": 1.519389670516347e-08, + "loss": 0.1264, + "step": 10491 + }, + { + "epoch": 0.9666927719168932, + "grad_norm": 0.9326297055675994, + "learning_rate": 1.511015301584151e-08, + "loss": 0.1155, + "step": 10492 + }, + { + "epoch": 0.9667849080941632, + "grad_norm": 0.9116280800410203, + "learning_rate": 1.502664004880888e-08, + "loss": 0.1104, + "step": 10493 + }, + { + "epoch": 0.9668770442714332, + "grad_norm": 0.9768260070146615, + "learning_rate": 1.4943357811820492e-08, + "loss": 0.1179, + "step": 10494 + }, + { + "epoch": 0.9669691804487032, + "grad_norm": 1.0156559451484888, + "learning_rate": 1.4860306312608762e-08, + "loss": 0.1246, + "step": 10495 + }, + { + "epoch": 0.9670613166259732, + "grad_norm": 0.9438961994256015, + "learning_rate": 1.4777485558884753e-08, + "loss": 0.1193, + "step": 10496 + }, + { + "epoch": 0.9671534528032432, + "grad_norm": 0.9865514247200415, + "learning_rate": 1.4694895558338972e-08, + "loss": 0.1222, + "step": 10497 + }, + { + "epoch": 0.9672455889805132, + "grad_norm": 0.9609002911659604, + "learning_rate": 1.4612536318639459e-08, + "loss": 0.1188, + "step": 10498 + }, + { + "epoch": 0.9673377251577832, + "grad_norm": 0.9682466775746156, + "learning_rate": 1.4530407847433702e-08, + "loss": 0.1288, + "step": 10499 + }, + { + "epoch": 0.9674298613350532, + "grad_norm": 0.9339595766099066, + "learning_rate": 1.4448510152346717e-08, + "loss": 0.1133, + "step": 10500 + }, + { + "epoch": 0.9674298613350532, + "eval_loss": 0.11658257246017456, + "eval_runtime": 300.5591, + "eval_samples_per_second": 23.346, + "eval_steps_per_second": 2.921, + "step": 10500 + }, + { + "epoch": 0.9675219975123233, + "grad_norm": 0.9331219890938893, + "learning_rate": 1.4366843240982975e-08, + "loss": 0.1149, + "step": 10501 + }, + { + "epoch": 0.9676141336895933, + "grad_norm": 0.9234308420470323, + "learning_rate": 1.4285407120925854e-08, + "loss": 0.1203, + "step": 10502 + }, + { + "epoch": 0.9677062698668633, + "grad_norm": 0.9100047418427226, + "learning_rate": 1.4204201799735973e-08, + "loss": 0.1048, + "step": 10503 + }, + { + "epoch": 0.9677984060441333, + "grad_norm": 0.9445857702495147, + "learning_rate": 1.412322728495341e-08, + "loss": 0.1076, + "step": 10504 + }, + { + "epoch": 0.9678905422214032, + "grad_norm": 0.9000245874971252, + "learning_rate": 1.40424835840966e-08, + "loss": 0.1125, + "step": 10505 + }, + { + "epoch": 0.9679826783986732, + "grad_norm": 0.9193210702366165, + "learning_rate": 1.3961970704662875e-08, + "loss": 0.1061, + "step": 10506 + }, + { + "epoch": 0.9680748145759432, + "grad_norm": 0.876365817619803, + "learning_rate": 1.3881688654127645e-08, + "loss": 0.1051, + "step": 10507 + }, + { + "epoch": 0.9681669507532132, + "grad_norm": 0.9577008365137175, + "learning_rate": 1.3801637439945225e-08, + "loss": 0.1203, + "step": 10508 + }, + { + "epoch": 0.9682590869304832, + "grad_norm": 0.9348907292875536, + "learning_rate": 1.3721817069548282e-08, + "loss": 0.1178, + "step": 10509 + }, + { + "epoch": 0.9683512231077532, + "grad_norm": 0.9456956045310755, + "learning_rate": 1.3642227550348387e-08, + "loss": 0.1199, + "step": 10510 + }, + { + "epoch": 0.9684433592850232, + "grad_norm": 0.9169617118871369, + "learning_rate": 1.3562868889735182e-08, + "loss": 0.104, + "step": 10511 + }, + { + "epoch": 0.9685354954622932, + "grad_norm": 0.9437229497003773, + "learning_rate": 1.348374109507694e-08, + "loss": 0.1223, + "step": 10512 + }, + { + "epoch": 0.9686276316395632, + "grad_norm": 0.980640074035574, + "learning_rate": 1.3404844173721398e-08, + "loss": 0.1183, + "step": 10513 + }, + { + "epoch": 0.9687197678168333, + "grad_norm": 0.9759068217403593, + "learning_rate": 1.332617813299325e-08, + "loss": 0.1234, + "step": 10514 + }, + { + "epoch": 0.9688119039941033, + "grad_norm": 0.9122055207582814, + "learning_rate": 1.324774298019721e-08, + "loss": 0.1115, + "step": 10515 + }, + { + "epoch": 0.9689040401713733, + "grad_norm": 0.9770911885826764, + "learning_rate": 1.316953872261606e-08, + "loss": 0.1205, + "step": 10516 + }, + { + "epoch": 0.9689961763486433, + "grad_norm": 0.9367724440014724, + "learning_rate": 1.3091565367510661e-08, + "loss": 0.1111, + "step": 10517 + }, + { + "epoch": 0.9690883125259133, + "grad_norm": 0.9332000236767728, + "learning_rate": 1.3013822922121332e-08, + "loss": 0.1164, + "step": 10518 + }, + { + "epoch": 0.9691804487031833, + "grad_norm": 0.9715687328064153, + "learning_rate": 1.2936311393665912e-08, + "loss": 0.1154, + "step": 10519 + }, + { + "epoch": 0.9692725848804533, + "grad_norm": 0.9494480811734403, + "learning_rate": 1.2859030789341698e-08, + "loss": 0.1109, + "step": 10520 + }, + { + "epoch": 0.9693647210577233, + "grad_norm": 0.8713494760993586, + "learning_rate": 1.278198111632406e-08, + "loss": 0.1073, + "step": 10521 + }, + { + "epoch": 0.9694568572349933, + "grad_norm": 0.9593584205782064, + "learning_rate": 1.2705162381767277e-08, + "loss": 0.1166, + "step": 10522 + }, + { + "epoch": 0.9695489934122633, + "grad_norm": 0.9071851172527483, + "learning_rate": 1.2628574592803977e-08, + "loss": 0.1131, + "step": 10523 + }, + { + "epoch": 0.9696411295895333, + "grad_norm": 0.9053877155338074, + "learning_rate": 1.2552217756545137e-08, + "loss": 0.1115, + "step": 10524 + }, + { + "epoch": 0.9697332657668033, + "grad_norm": 0.9666711819309619, + "learning_rate": 1.2476091880080366e-08, + "loss": 0.1189, + "step": 10525 + }, + { + "epoch": 0.9698254019440733, + "grad_norm": 0.9263691427304236, + "learning_rate": 1.240019697047845e-08, + "loss": 0.1234, + "step": 10526 + }, + { + "epoch": 0.9699175381213433, + "grad_norm": 0.9379347777164991, + "learning_rate": 1.2324533034785702e-08, + "loss": 0.1086, + "step": 10527 + }, + { + "epoch": 0.9700096742986134, + "grad_norm": 0.9637020462719162, + "learning_rate": 1.2249100080028164e-08, + "loss": 0.1089, + "step": 10528 + }, + { + "epoch": 0.9701018104758834, + "grad_norm": 0.9419892430527868, + "learning_rate": 1.2173898113209126e-08, + "loss": 0.1191, + "step": 10529 + }, + { + "epoch": 0.9701939466531534, + "grad_norm": 0.9343313110426041, + "learning_rate": 1.2098927141311333e-08, + "loss": 0.1188, + "step": 10530 + }, + { + "epoch": 0.9702860828304234, + "grad_norm": 0.9821001712352756, + "learning_rate": 1.2024187171296165e-08, + "loss": 0.1207, + "step": 10531 + }, + { + "epoch": 0.9703782190076934, + "grad_norm": 0.936560662898743, + "learning_rate": 1.1949678210102788e-08, + "loss": 0.1141, + "step": 10532 + }, + { + "epoch": 0.9704703551849634, + "grad_norm": 0.9816223567816494, + "learning_rate": 1.1875400264649562e-08, + "loss": 0.1137, + "step": 10533 + }, + { + "epoch": 0.9705624913622334, + "grad_norm": 0.9258232675199926, + "learning_rate": 1.1801353341833466e-08, + "loss": 0.1106, + "step": 10534 + }, + { + "epoch": 0.9706546275395034, + "grad_norm": 0.9150708660768186, + "learning_rate": 1.1727537448529003e-08, + "loss": 0.1079, + "step": 10535 + }, + { + "epoch": 0.9707467637167734, + "grad_norm": 0.9204217315060662, + "learning_rate": 1.1653952591590967e-08, + "loss": 0.1169, + "step": 10536 + }, + { + "epoch": 0.9708388998940434, + "grad_norm": 0.9549901951555877, + "learning_rate": 1.1580598777850837e-08, + "loss": 0.1115, + "step": 10537 + }, + { + "epoch": 0.9709310360713134, + "grad_norm": 0.9021095506714563, + "learning_rate": 1.1507476014120112e-08, + "loss": 0.1081, + "step": 10538 + }, + { + "epoch": 0.9710231722485834, + "grad_norm": 0.9292179876258926, + "learning_rate": 1.143458430718808e-08, + "loss": 0.12, + "step": 10539 + }, + { + "epoch": 0.9711153084258534, + "grad_norm": 0.9377578388308517, + "learning_rate": 1.136192366382266e-08, + "loss": 0.1229, + "step": 10540 + }, + { + "epoch": 0.9712074446031235, + "grad_norm": 0.9572734492491797, + "learning_rate": 1.128949409077068e-08, + "loss": 0.1144, + "step": 10541 + }, + { + "epoch": 0.9712995807803935, + "grad_norm": 0.9870783763304519, + "learning_rate": 1.121729559475676e-08, + "loss": 0.118, + "step": 10542 + }, + { + "epoch": 0.9713917169576635, + "grad_norm": 0.9116902070429109, + "learning_rate": 1.1145328182484706e-08, + "loss": 0.1097, + "step": 10543 + }, + { + "epoch": 0.9714838531349335, + "grad_norm": 0.9063023701626302, + "learning_rate": 1.1073591860636946e-08, + "loss": 0.1156, + "step": 10544 + }, + { + "epoch": 0.9715759893122035, + "grad_norm": 0.9457383706995881, + "learning_rate": 1.1002086635873987e-08, + "loss": 0.1241, + "step": 10545 + }, + { + "epoch": 0.9716681254894735, + "grad_norm": 0.9838563803147564, + "learning_rate": 1.0930812514835243e-08, + "loss": 0.1213, + "step": 10546 + }, + { + "epoch": 0.9717602616667435, + "grad_norm": 0.8800524682172473, + "learning_rate": 1.0859769504138196e-08, + "loss": 0.1083, + "step": 10547 + }, + { + "epoch": 0.9718523978440134, + "grad_norm": 0.910856796387508, + "learning_rate": 1.0788957610379791e-08, + "loss": 0.1113, + "step": 10548 + }, + { + "epoch": 0.9719445340212834, + "grad_norm": 0.8808796503629651, + "learning_rate": 1.0718376840134214e-08, + "loss": 0.1117, + "step": 10549 + }, + { + "epoch": 0.9720366701985534, + "grad_norm": 0.9406849715983612, + "learning_rate": 1.0648027199955391e-08, + "loss": 0.1214, + "step": 10550 + }, + { + "epoch": 0.9721288063758234, + "grad_norm": 0.9069532479196183, + "learning_rate": 1.0577908696375316e-08, + "loss": 0.1165, + "step": 10551 + }, + { + "epoch": 0.9722209425530934, + "grad_norm": 0.9548171410344726, + "learning_rate": 1.0508021335904061e-08, + "loss": 0.1156, + "step": 10552 + }, + { + "epoch": 0.9723130787303634, + "grad_norm": 0.9573038324311234, + "learning_rate": 1.0438365125031158e-08, + "loss": 0.1192, + "step": 10553 + }, + { + "epoch": 0.9724052149076334, + "grad_norm": 0.9533383138256408, + "learning_rate": 1.0368940070223932e-08, + "loss": 0.1135, + "step": 10554 + }, + { + "epoch": 0.9724973510849035, + "grad_norm": 0.9615848577133012, + "learning_rate": 1.0299746177928338e-08, + "loss": 0.1158, + "step": 10555 + }, + { + "epoch": 0.9725894872621735, + "grad_norm": 0.9373204121607308, + "learning_rate": 1.0230783454569515e-08, + "loss": 0.1143, + "step": 10556 + }, + { + "epoch": 0.9726816234394435, + "grad_norm": 0.9449065421907685, + "learning_rate": 1.0162051906550397e-08, + "loss": 0.1136, + "step": 10557 + }, + { + "epoch": 0.9727737596167135, + "grad_norm": 0.88676617682467, + "learning_rate": 1.0093551540252822e-08, + "loss": 0.1071, + "step": 10558 + }, + { + "epoch": 0.9728658957939835, + "grad_norm": 0.905225458618946, + "learning_rate": 1.0025282362036704e-08, + "loss": 0.1206, + "step": 10559 + }, + { + "epoch": 0.9729580319712535, + "grad_norm": 0.9183727079728304, + "learning_rate": 9.957244378241138e-09, + "loss": 0.1101, + "step": 10560 + }, + { + "epoch": 0.9730501681485235, + "grad_norm": 0.9845449747492825, + "learning_rate": 9.889437595183293e-09, + "loss": 0.1244, + "step": 10561 + }, + { + "epoch": 0.9731423043257935, + "grad_norm": 0.9502006703842801, + "learning_rate": 9.821862019159522e-09, + "loss": 0.1231, + "step": 10562 + }, + { + "epoch": 0.9732344405030635, + "grad_norm": 0.9617262844228264, + "learning_rate": 9.754517656443697e-09, + "loss": 0.1221, + "step": 10563 + }, + { + "epoch": 0.9733265766803335, + "grad_norm": 0.9519604835311853, + "learning_rate": 9.68740451328859e-09, + "loss": 0.1171, + "step": 10564 + }, + { + "epoch": 0.9734187128576035, + "grad_norm": 0.9637928773849125, + "learning_rate": 9.62052259592644e-09, + "loss": 0.1243, + "step": 10565 + }, + { + "epoch": 0.9735108490348735, + "grad_norm": 0.9288945824254763, + "learning_rate": 9.553871910566448e-09, + "loss": 0.1239, + "step": 10566 + }, + { + "epoch": 0.9736029852121435, + "grad_norm": 0.9472650891783666, + "learning_rate": 9.487452463397828e-09, + "loss": 0.1033, + "step": 10567 + }, + { + "epoch": 0.9736951213894135, + "grad_norm": 0.9534769666335019, + "learning_rate": 9.421264260587038e-09, + "loss": 0.1191, + "step": 10568 + }, + { + "epoch": 0.9737872575666836, + "grad_norm": 0.9897087742581929, + "learning_rate": 9.355307308279992e-09, + "loss": 0.1246, + "step": 10569 + }, + { + "epoch": 0.9738793937439536, + "grad_norm": 0.9127514393900824, + "learning_rate": 9.289581612600684e-09, + "loss": 0.1194, + "step": 10570 + }, + { + "epoch": 0.9739715299212236, + "grad_norm": 0.9052157433771952, + "learning_rate": 9.224087179651731e-09, + "loss": 0.1099, + "step": 10571 + }, + { + "epoch": 0.9740636660984936, + "grad_norm": 0.9536514537796765, + "learning_rate": 9.158824015514378e-09, + "loss": 0.1174, + "step": 10572 + }, + { + "epoch": 0.9741558022757636, + "grad_norm": 1.0070035166714224, + "learning_rate": 9.093792126248224e-09, + "loss": 0.1177, + "step": 10573 + }, + { + "epoch": 0.9742479384530336, + "grad_norm": 0.9381398739474147, + "learning_rate": 9.028991517891495e-09, + "loss": 0.119, + "step": 10574 + }, + { + "epoch": 0.9743400746303036, + "grad_norm": 0.9668291092324927, + "learning_rate": 8.964422196461042e-09, + "loss": 0.1296, + "step": 10575 + }, + { + "epoch": 0.9744322108075736, + "grad_norm": 0.8874480746623137, + "learning_rate": 8.900084167952072e-09, + "loss": 0.1066, + "step": 10576 + }, + { + "epoch": 0.9745243469848436, + "grad_norm": 0.93955289545439, + "learning_rate": 8.835977438338417e-09, + "loss": 0.114, + "step": 10577 + }, + { + "epoch": 0.9746164831621136, + "grad_norm": 0.9040397264793893, + "learning_rate": 8.772102013572537e-09, + "loss": 0.1093, + "step": 10578 + }, + { + "epoch": 0.9747086193393836, + "grad_norm": 0.9045961887807713, + "learning_rate": 8.708457899584965e-09, + "loss": 0.1128, + "step": 10579 + }, + { + "epoch": 0.9748007555166536, + "grad_norm": 0.9615250420317627, + "learning_rate": 8.645045102285143e-09, + "loss": 0.1171, + "step": 10580 + }, + { + "epoch": 0.9748928916939236, + "grad_norm": 0.9054598326002232, + "learning_rate": 8.58186362756086e-09, + "loss": 0.1186, + "step": 10581 + }, + { + "epoch": 0.9749850278711937, + "grad_norm": 0.9889995978585244, + "learning_rate": 8.518913481278812e-09, + "loss": 0.1214, + "step": 10582 + }, + { + "epoch": 0.9750771640484637, + "grad_norm": 0.9742523995692353, + "learning_rate": 8.456194669284046e-09, + "loss": 0.1277, + "step": 10583 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 0.9658412586374955, + "learning_rate": 8.393707197399404e-09, + "loss": 0.1194, + "step": 10584 + }, + { + "epoch": 0.9752614364030037, + "grad_norm": 0.8987042660004325, + "learning_rate": 8.331451071427188e-09, + "loss": 0.1154, + "step": 10585 + }, + { + "epoch": 0.9753535725802737, + "grad_norm": 0.9238970510872365, + "learning_rate": 8.269426297148053e-09, + "loss": 0.1093, + "step": 10586 + }, + { + "epoch": 0.9754457087575437, + "grad_norm": 0.9426614072312355, + "learning_rate": 8.207632880320727e-09, + "loss": 0.1089, + "step": 10587 + }, + { + "epoch": 0.9755378449348137, + "grad_norm": 1.0276725978874846, + "learning_rate": 8.146070826683116e-09, + "loss": 0.1195, + "step": 10588 + }, + { + "epoch": 0.9756299811120837, + "grad_norm": 0.9563523741611569, + "learning_rate": 8.084740141950653e-09, + "loss": 0.1236, + "step": 10589 + }, + { + "epoch": 0.9757221172893537, + "grad_norm": 0.9156272141595668, + "learning_rate": 8.023640831818502e-09, + "loss": 0.1095, + "step": 10590 + }, + { + "epoch": 0.9758142534666236, + "grad_norm": 0.9179595642185693, + "learning_rate": 7.962772901959348e-09, + "loss": 0.112, + "step": 10591 + }, + { + "epoch": 0.9759063896438936, + "grad_norm": 0.9278463762785488, + "learning_rate": 7.902136358025058e-09, + "loss": 0.1138, + "step": 10592 + }, + { + "epoch": 0.9759985258211636, + "grad_norm": 0.95021990936773, + "learning_rate": 7.841731205645576e-09, + "loss": 0.124, + "step": 10593 + }, + { + "epoch": 0.9760906619984336, + "grad_norm": 0.8940755815437218, + "learning_rate": 7.781557450429467e-09, + "loss": 0.1009, + "step": 10594 + }, + { + "epoch": 0.9761827981757036, + "grad_norm": 0.9717988157956657, + "learning_rate": 7.72161509796393e-09, + "loss": 0.1119, + "step": 10595 + }, + { + "epoch": 0.9762749343529737, + "grad_norm": 0.9008600970075309, + "learning_rate": 7.661904153814793e-09, + "loss": 0.1133, + "step": 10596 + }, + { + "epoch": 0.9763670705302437, + "grad_norm": 0.9419592224579174, + "learning_rate": 7.60242462352595e-09, + "loss": 0.1228, + "step": 10597 + }, + { + "epoch": 0.9764592067075137, + "grad_norm": 0.9587973988727028, + "learning_rate": 7.543176512620487e-09, + "loss": 0.1168, + "step": 10598 + }, + { + "epoch": 0.9765513428847837, + "grad_norm": 0.9321975028306119, + "learning_rate": 7.484159826599002e-09, + "loss": 0.1176, + "step": 10599 + }, + { + "epoch": 0.9766434790620537, + "grad_norm": 0.922618089187133, + "learning_rate": 7.425374570941557e-09, + "loss": 0.1141, + "step": 10600 + }, + { + "epoch": 0.9767356152393237, + "grad_norm": 0.8982462835378838, + "learning_rate": 7.366820751106562e-09, + "loss": 0.1142, + "step": 10601 + }, + { + "epoch": 0.9768277514165937, + "grad_norm": 0.9843917017683375, + "learning_rate": 7.308498372530226e-09, + "loss": 0.1259, + "step": 10602 + }, + { + "epoch": 0.9769198875938637, + "grad_norm": 0.9473204680331596, + "learning_rate": 7.250407440628493e-09, + "loss": 0.1234, + "step": 10603 + }, + { + "epoch": 0.9770120237711337, + "grad_norm": 1.0040227923056881, + "learning_rate": 7.192547960794549e-09, + "loss": 0.1217, + "step": 10604 + }, + { + "epoch": 0.9771041599484037, + "grad_norm": 0.947542454325738, + "learning_rate": 7.134919938400486e-09, + "loss": 0.1178, + "step": 10605 + }, + { + "epoch": 0.9771962961256737, + "grad_norm": 0.9500522546507386, + "learning_rate": 7.077523378797579e-09, + "loss": 0.1136, + "step": 10606 + }, + { + "epoch": 0.9772884323029437, + "grad_norm": 0.8961019047765427, + "learning_rate": 7.0203582873151764e-09, + "loss": 0.106, + "step": 10607 + }, + { + "epoch": 0.9773805684802137, + "grad_norm": 0.9211980530137002, + "learning_rate": 6.963424669260421e-09, + "loss": 0.1176, + "step": 10608 + }, + { + "epoch": 0.9774727046574838, + "grad_norm": 0.929269443134366, + "learning_rate": 6.906722529920196e-09, + "loss": 0.1233, + "step": 10609 + }, + { + "epoch": 0.9775648408347538, + "grad_norm": 0.9473648799487335, + "learning_rate": 6.850251874559177e-09, + "loss": 0.1128, + "step": 10610 + }, + { + "epoch": 0.9776569770120238, + "grad_norm": 0.9169410453945872, + "learning_rate": 6.7940127084203945e-09, + "loss": 0.1125, + "step": 10611 + }, + { + "epoch": 0.9777491131892938, + "grad_norm": 0.9092218025061077, + "learning_rate": 6.738005036726059e-09, + "loss": 0.1082, + "step": 10612 + }, + { + "epoch": 0.9778412493665638, + "grad_norm": 0.9487463863743852, + "learning_rate": 6.682228864675899e-09, + "loss": 0.117, + "step": 10613 + }, + { + "epoch": 0.9779333855438338, + "grad_norm": 0.9286186855517777, + "learning_rate": 6.626684197449384e-09, + "loss": 0.1073, + "step": 10614 + }, + { + "epoch": 0.9780255217211038, + "grad_norm": 0.9033427687803351, + "learning_rate": 6.5713710402037775e-09, + "loss": 0.111, + "step": 10615 + }, + { + "epoch": 0.9781176578983738, + "grad_norm": 0.9705632853143739, + "learning_rate": 6.516289398074416e-09, + "loss": 0.1272, + "step": 10616 + }, + { + "epoch": 0.9782097940756438, + "grad_norm": 0.9143294975731162, + "learning_rate": 6.461439276176096e-09, + "loss": 0.1156, + "step": 10617 + }, + { + "epoch": 0.9783019302529138, + "grad_norm": 0.9656105337016601, + "learning_rate": 6.406820679601411e-09, + "loss": 0.1257, + "step": 10618 + }, + { + "epoch": 0.9783940664301838, + "grad_norm": 1.006782451352813, + "learning_rate": 6.35243361342186e-09, + "loss": 0.1294, + "step": 10619 + }, + { + "epoch": 0.9784862026074538, + "grad_norm": 0.930881618709819, + "learning_rate": 6.298278082687015e-09, + "loss": 0.1229, + "step": 10620 + }, + { + "epoch": 0.9785783387847238, + "grad_norm": 0.9250924437201751, + "learning_rate": 6.244354092425631e-09, + "loss": 0.1092, + "step": 10621 + }, + { + "epoch": 0.9786704749619938, + "grad_norm": 0.9004668653298935, + "learning_rate": 6.190661647644259e-09, + "loss": 0.0974, + "step": 10622 + }, + { + "epoch": 0.9787626111392639, + "grad_norm": 0.9078763930835535, + "learning_rate": 6.137200753328354e-09, + "loss": 0.1078, + "step": 10623 + }, + { + "epoch": 0.9788547473165339, + "grad_norm": 0.9246549887726953, + "learning_rate": 6.083971414442003e-09, + "loss": 0.1059, + "step": 10624 + }, + { + "epoch": 0.9789468834938039, + "grad_norm": 0.9760946282787811, + "learning_rate": 6.030973635926807e-09, + "loss": 0.1196, + "step": 10625 + }, + { + "epoch": 0.9790390196710739, + "grad_norm": 0.9148879730601133, + "learning_rate": 5.9782074227046625e-09, + "loss": 0.1106, + "step": 10626 + }, + { + "epoch": 0.9791311558483439, + "grad_norm": 0.9830983954452395, + "learning_rate": 5.925672779673875e-09, + "loss": 0.13, + "step": 10627 + }, + { + "epoch": 0.9792232920256139, + "grad_norm": 0.9168069775130212, + "learning_rate": 5.87336971171304e-09, + "loss": 0.1117, + "step": 10628 + }, + { + "epoch": 0.9793154282028839, + "grad_norm": 0.927218481484028, + "learning_rate": 5.821298223678274e-09, + "loss": 0.1152, + "step": 10629 + }, + { + "epoch": 0.9794075643801539, + "grad_norm": 0.9453897188162311, + "learning_rate": 5.76945832040432e-09, + "loss": 0.1227, + "step": 10630 + }, + { + "epoch": 0.9794997005574239, + "grad_norm": 0.9079700350552379, + "learning_rate": 5.717850006704551e-09, + "loss": 0.1071, + "step": 10631 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.9516553063862506, + "learning_rate": 5.666473287370966e-09, + "loss": 0.1245, + "step": 10632 + }, + { + "epoch": 0.9796839729119639, + "grad_norm": 0.9634462965013579, + "learning_rate": 5.615328167173639e-09, + "loss": 0.1178, + "step": 10633 + }, + { + "epoch": 0.9797761090892338, + "grad_norm": 0.9206987163610182, + "learning_rate": 5.564414650861549e-09, + "loss": 0.1154, + "step": 10634 + }, + { + "epoch": 0.9798682452665038, + "grad_norm": 0.9014004204638184, + "learning_rate": 5.513732743162303e-09, + "loss": 0.1063, + "step": 10635 + }, + { + "epoch": 0.9799603814437738, + "grad_norm": 0.8760217490984519, + "learning_rate": 5.463282448781027e-09, + "loss": 0.1048, + "step": 10636 + }, + { + "epoch": 0.980052517621044, + "grad_norm": 0.9331572014494885, + "learning_rate": 5.41306377240286e-09, + "loss": 0.1111, + "step": 10637 + }, + { + "epoch": 0.980144653798314, + "grad_norm": 0.9934794177717047, + "learning_rate": 5.363076718689908e-09, + "loss": 0.1257, + "step": 10638 + }, + { + "epoch": 0.9802367899755839, + "grad_norm": 0.9255709555345145, + "learning_rate": 5.313321292283735e-09, + "loss": 0.1118, + "step": 10639 + }, + { + "epoch": 0.9803289261528539, + "grad_norm": 0.9304134226487328, + "learning_rate": 5.263797497804257e-09, + "loss": 0.1202, + "step": 10640 + }, + { + "epoch": 0.9804210623301239, + "grad_norm": 0.9036732654637581, + "learning_rate": 5.2145053398494626e-09, + "loss": 0.113, + "step": 10641 + }, + { + "epoch": 0.9805131985073939, + "grad_norm": 0.884189794772743, + "learning_rate": 5.165444822996801e-09, + "loss": 0.1096, + "step": 10642 + }, + { + "epoch": 0.9806053346846639, + "grad_norm": 0.9227208369894664, + "learning_rate": 5.116615951800685e-09, + "loss": 0.1138, + "step": 10643 + }, + { + "epoch": 0.9806974708619339, + "grad_norm": 0.9727271841430377, + "learning_rate": 5.068018730795543e-09, + "loss": 0.1278, + "step": 10644 + }, + { + "epoch": 0.9807896070392039, + "grad_norm": 0.8889123529269067, + "learning_rate": 5.019653164493044e-09, + "loss": 0.1126, + "step": 10645 + }, + { + "epoch": 0.9808817432164739, + "grad_norm": 0.9502701859156512, + "learning_rate": 4.971519257384316e-09, + "loss": 0.1164, + "step": 10646 + }, + { + "epoch": 0.9809738793937439, + "grad_norm": 0.9612182962807315, + "learning_rate": 4.9236170139388415e-09, + "loss": 0.1181, + "step": 10647 + }, + { + "epoch": 0.9810660155710139, + "grad_norm": 0.9677109113081508, + "learning_rate": 4.875946438603896e-09, + "loss": 0.1192, + "step": 10648 + }, + { + "epoch": 0.9811581517482839, + "grad_norm": 0.934500406425529, + "learning_rate": 4.828507535805937e-09, + "loss": 0.1155, + "step": 10649 + }, + { + "epoch": 0.981250287925554, + "grad_norm": 0.9816515440764665, + "learning_rate": 4.781300309949221e-09, + "loss": 0.122, + "step": 10650 + }, + { + "epoch": 0.981342424102824, + "grad_norm": 0.8982107358428756, + "learning_rate": 4.734324765417741e-09, + "loss": 0.1081, + "step": 10651 + }, + { + "epoch": 0.981434560280094, + "grad_norm": 0.9325950183542184, + "learning_rate": 4.687580906572453e-09, + "loss": 0.1197, + "step": 10652 + }, + { + "epoch": 0.981526696457364, + "grad_norm": 0.9549246082334452, + "learning_rate": 4.6410687377540505e-09, + "loss": 0.1154, + "step": 10653 + }, + { + "epoch": 0.981618832634634, + "grad_norm": 0.9353084160232171, + "learning_rate": 4.5947882632810244e-09, + "loss": 0.1083, + "step": 10654 + }, + { + "epoch": 0.981710968811904, + "grad_norm": 0.8924218274524065, + "learning_rate": 4.5487394874502155e-09, + "loss": 0.105, + "step": 10655 + }, + { + "epoch": 0.981803104989174, + "grad_norm": 0.9371527891685825, + "learning_rate": 4.502922414537647e-09, + "loss": 0.1162, + "step": 10656 + }, + { + "epoch": 0.981895241166444, + "grad_norm": 0.9043040642121698, + "learning_rate": 4.457337048797139e-09, + "loss": 0.112, + "step": 10657 + }, + { + "epoch": 0.981987377343714, + "grad_norm": 0.9966084418021494, + "learning_rate": 4.411983394461694e-09, + "loss": 0.1213, + "step": 10658 + }, + { + "epoch": 0.982079513520984, + "grad_norm": 0.9518988653583343, + "learning_rate": 4.366861455742111e-09, + "loss": 0.1136, + "step": 10659 + }, + { + "epoch": 0.982171649698254, + "grad_norm": 0.9182319295200211, + "learning_rate": 4.321971236827815e-09, + "loss": 0.1124, + "step": 10660 + }, + { + "epoch": 0.982263785875524, + "grad_norm": 0.9072685324613959, + "learning_rate": 4.277312741887418e-09, + "loss": 0.1162, + "step": 10661 + }, + { + "epoch": 0.982355922052794, + "grad_norm": 0.9127397436133858, + "learning_rate": 4.232885975066769e-09, + "loss": 0.1073, + "step": 10662 + }, + { + "epoch": 0.982448058230064, + "grad_norm": 0.9097974449201501, + "learning_rate": 4.188690940491457e-09, + "loss": 0.1221, + "step": 10663 + }, + { + "epoch": 0.9825401944073341, + "grad_norm": 0.921889778134402, + "learning_rate": 4.144727642264867e-09, + "loss": 0.1033, + "step": 10664 + }, + { + "epoch": 0.9826323305846041, + "grad_norm": 0.9406033364188552, + "learning_rate": 4.100996084468734e-09, + "loss": 0.1169, + "step": 10665 + }, + { + "epoch": 0.9827244667618741, + "grad_norm": 0.948017766212838, + "learning_rate": 4.057496271163974e-09, + "loss": 0.1186, + "step": 10666 + }, + { + "epoch": 0.9828166029391441, + "grad_norm": 0.9274946903488767, + "learning_rate": 4.014228206389026e-09, + "loss": 0.1189, + "step": 10667 + }, + { + "epoch": 0.9829087391164141, + "grad_norm": 0.9316329448709069, + "learning_rate": 3.971191894161785e-09, + "loss": 0.1147, + "step": 10668 + }, + { + "epoch": 0.9830008752936841, + "grad_norm": 0.9329760448425564, + "learning_rate": 3.9283873384779455e-09, + "loss": 0.1111, + "step": 10669 + }, + { + "epoch": 0.9830930114709541, + "grad_norm": 0.9303551616716375, + "learning_rate": 3.8858145433118275e-09, + "loss": 0.1149, + "step": 10670 + }, + { + "epoch": 0.9831851476482241, + "grad_norm": 1.0299478339943349, + "learning_rate": 3.843473512616658e-09, + "loss": 0.1233, + "step": 10671 + }, + { + "epoch": 0.9832772838254941, + "grad_norm": 0.9347356160626646, + "learning_rate": 3.801364250323458e-09, + "loss": 0.1204, + "step": 10672 + }, + { + "epoch": 0.9833694200027641, + "grad_norm": 0.9154680976488679, + "learning_rate": 3.759486760342435e-09, + "loss": 0.1131, + "step": 10673 + }, + { + "epoch": 0.9834615561800341, + "grad_norm": 0.9648845681864151, + "learning_rate": 3.7178410465615876e-09, + "loss": 0.1074, + "step": 10674 + }, + { + "epoch": 0.983553692357304, + "grad_norm": 0.9607039283534857, + "learning_rate": 3.676427112848102e-09, + "loss": 0.108, + "step": 10675 + }, + { + "epoch": 0.983645828534574, + "grad_norm": 0.9682615771187273, + "learning_rate": 3.63524496304668e-09, + "loss": 0.1303, + "step": 10676 + }, + { + "epoch": 0.9837379647118442, + "grad_norm": 1.004190066912663, + "learning_rate": 3.5942946009814848e-09, + "loss": 0.1222, + "step": 10677 + }, + { + "epoch": 0.9838301008891142, + "grad_norm": 0.9169312481844852, + "learning_rate": 3.553576030454753e-09, + "loss": 0.1105, + "step": 10678 + }, + { + "epoch": 0.9839222370663842, + "grad_norm": 0.9427475770972022, + "learning_rate": 3.5130892552473485e-09, + "loss": 0.122, + "step": 10679 + }, + { + "epoch": 0.9840143732436542, + "grad_norm": 0.909718197191416, + "learning_rate": 3.4728342791179313e-09, + "loss": 0.1105, + "step": 10680 + }, + { + "epoch": 0.9841065094209241, + "grad_norm": 0.958982757205956, + "learning_rate": 3.432811105804623e-09, + "loss": 0.118, + "step": 10681 + }, + { + "epoch": 0.9841986455981941, + "grad_norm": 0.9241596455799383, + "learning_rate": 3.3930197390236175e-09, + "loss": 0.114, + "step": 10682 + }, + { + "epoch": 0.9842907817754641, + "grad_norm": 0.9580131120057819, + "learning_rate": 3.353460182469459e-09, + "loss": 0.1243, + "step": 10683 + }, + { + "epoch": 0.9843829179527341, + "grad_norm": 0.9537908329817629, + "learning_rate": 3.3141324398150434e-09, + "loss": 0.1231, + "step": 10684 + }, + { + "epoch": 0.9844750541300041, + "grad_norm": 0.9846295496352669, + "learning_rate": 3.275036514712171e-09, + "loss": 0.1181, + "step": 10685 + }, + { + "epoch": 0.9845671903072741, + "grad_norm": 1.0269472534084152, + "learning_rate": 3.236172410790994e-09, + "loss": 0.1243, + "step": 10686 + }, + { + "epoch": 0.9846593264845441, + "grad_norm": 0.9138841791343236, + "learning_rate": 3.1975401316597376e-09, + "loss": 0.1114, + "step": 10687 + }, + { + "epoch": 0.9847514626618141, + "grad_norm": 0.9102065333454794, + "learning_rate": 3.1591396809055317e-09, + "loss": 0.1126, + "step": 10688 + }, + { + "epoch": 0.9848435988390841, + "grad_norm": 0.9490694422798538, + "learning_rate": 3.120971062094136e-09, + "loss": 0.1243, + "step": 10689 + }, + { + "epoch": 0.9849357350163541, + "grad_norm": 0.9356220763269121, + "learning_rate": 3.0830342787693814e-09, + "loss": 0.116, + "step": 10690 + }, + { + "epoch": 0.9850278711936242, + "grad_norm": 0.9780782171880228, + "learning_rate": 3.0453293344534507e-09, + "loss": 0.1134, + "step": 10691 + }, + { + "epoch": 0.9851200073708942, + "grad_norm": 0.927888813498766, + "learning_rate": 3.007856232647155e-09, + "loss": 0.1128, + "step": 10692 + }, + { + "epoch": 0.9852121435481642, + "grad_norm": 0.987536393409098, + "learning_rate": 2.970614976830488e-09, + "loss": 0.1182, + "step": 10693 + }, + { + "epoch": 0.9853042797254342, + "grad_norm": 0.9263137120241127, + "learning_rate": 2.933605570460962e-09, + "loss": 0.1059, + "step": 10694 + }, + { + "epoch": 0.9853964159027042, + "grad_norm": 0.9625610479510435, + "learning_rate": 2.8968280169747177e-09, + "loss": 0.1197, + "step": 10695 + }, + { + "epoch": 0.9854885520799742, + "grad_norm": 0.9172728982589524, + "learning_rate": 2.8602823197868e-09, + "loss": 0.1093, + "step": 10696 + }, + { + "epoch": 0.9855806882572442, + "grad_norm": 0.9632436012736146, + "learning_rate": 2.823968482290329e-09, + "loss": 0.1253, + "step": 10697 + }, + { + "epoch": 0.9856728244345142, + "grad_norm": 0.9223452021204606, + "learning_rate": 2.787886507857329e-09, + "loss": 0.1129, + "step": 10698 + }, + { + "epoch": 0.9857649606117842, + "grad_norm": 0.8890863207340997, + "learning_rate": 2.7520363998376208e-09, + "loss": 0.1153, + "step": 10699 + }, + { + "epoch": 0.9858570967890542, + "grad_norm": 0.9555634706681613, + "learning_rate": 2.716418161560208e-09, + "loss": 0.1153, + "step": 10700 + }, + { + "epoch": 0.9859492329663242, + "grad_norm": 0.9023523390251761, + "learning_rate": 2.6810317963321674e-09, + "loss": 0.116, + "step": 10701 + }, + { + "epoch": 0.9860413691435942, + "grad_norm": 0.9676451487324907, + "learning_rate": 2.6458773074389266e-09, + "loss": 0.1128, + "step": 10702 + }, + { + "epoch": 0.9861335053208642, + "grad_norm": 0.9623707144940772, + "learning_rate": 2.610954698145096e-09, + "loss": 0.1231, + "step": 10703 + }, + { + "epoch": 0.9862256414981343, + "grad_norm": 0.9138591551329498, + "learning_rate": 2.5762639716925274e-09, + "loss": 0.1164, + "step": 10704 + }, + { + "epoch": 0.9863177776754043, + "grad_norm": 0.9348545180273251, + "learning_rate": 2.5418051313028102e-09, + "loss": 0.116, + "step": 10705 + }, + { + "epoch": 0.9864099138526743, + "grad_norm": 0.9045750930758, + "learning_rate": 2.507578180175052e-09, + "loss": 0.1059, + "step": 10706 + }, + { + "epoch": 0.9865020500299443, + "grad_norm": 0.9558289080953325, + "learning_rate": 2.473583121487544e-09, + "loss": 0.1191, + "step": 10707 + }, + { + "epoch": 0.9865941862072143, + "grad_norm": 0.9052879057162557, + "learning_rate": 2.43981995839665e-09, + "loss": 0.1145, + "step": 10708 + }, + { + "epoch": 0.9866863223844843, + "grad_norm": 0.951275220374051, + "learning_rate": 2.406288694037362e-09, + "loss": 0.1206, + "step": 10709 + }, + { + "epoch": 0.9867784585617543, + "grad_norm": 0.9285907818070983, + "learning_rate": 2.3729893315230234e-09, + "loss": 0.1151, + "step": 10710 + }, + { + "epoch": 0.9868705947390243, + "grad_norm": 0.9395711360817434, + "learning_rate": 2.339921873945328e-09, + "loss": 0.1192, + "step": 10711 + }, + { + "epoch": 0.9869627309162943, + "grad_norm": 0.90872727900148, + "learning_rate": 2.3070863243745967e-09, + "loss": 0.1197, + "step": 10712 + }, + { + "epoch": 0.9870548670935643, + "grad_norm": 0.9485592509220756, + "learning_rate": 2.2744826858597803e-09, + "loss": 0.1159, + "step": 10713 + }, + { + "epoch": 0.9871470032708343, + "grad_norm": 0.9328209116258498, + "learning_rate": 2.2421109614279015e-09, + "loss": 0.1123, + "step": 10714 + }, + { + "epoch": 0.9872391394481043, + "grad_norm": 1.0114257066594805, + "learning_rate": 2.209971154084889e-09, + "loss": 0.129, + "step": 10715 + }, + { + "epoch": 0.9873312756253743, + "grad_norm": 0.9576620679676022, + "learning_rate": 2.1780632668150226e-09, + "loss": 0.1192, + "step": 10716 + }, + { + "epoch": 0.9874234118026443, + "grad_norm": 0.9937479557344089, + "learning_rate": 2.1463873025806547e-09, + "loss": 0.13, + "step": 10717 + }, + { + "epoch": 0.9875155479799144, + "grad_norm": 0.8957813580668273, + "learning_rate": 2.1149432643233213e-09, + "loss": 0.1015, + "step": 10718 + }, + { + "epoch": 0.9876076841571844, + "grad_norm": 0.9371753197462088, + "learning_rate": 2.0837311549620763e-09, + "loss": 0.1105, + "step": 10719 + }, + { + "epoch": 0.9876998203344544, + "grad_norm": 0.9100179659422312, + "learning_rate": 2.052750977395157e-09, + "loss": 0.1109, + "step": 10720 + }, + { + "epoch": 0.9877919565117244, + "grad_norm": 0.9000073802913596, + "learning_rate": 2.0220027344994285e-09, + "loss": 0.1063, + "step": 10721 + }, + { + "epoch": 0.9878840926889944, + "grad_norm": 0.913903695944919, + "learning_rate": 1.9914864291292747e-09, + "loss": 0.1135, + "step": 10722 + }, + { + "epoch": 0.9879762288662643, + "grad_norm": 0.9678585764440448, + "learning_rate": 1.961202064118539e-09, + "loss": 0.1222, + "step": 10723 + }, + { + "epoch": 0.9880683650435343, + "grad_norm": 0.9568563012010259, + "learning_rate": 1.9311496422791398e-09, + "loss": 0.1183, + "step": 10724 + }, + { + "epoch": 0.9881605012208043, + "grad_norm": 0.9891580511080038, + "learning_rate": 1.9013291664013445e-09, + "loss": 0.1247, + "step": 10725 + }, + { + "epoch": 0.9882526373980743, + "grad_norm": 0.8893119723719847, + "learning_rate": 1.8717406392537718e-09, + "loss": 0.1032, + "step": 10726 + }, + { + "epoch": 0.9883447735753443, + "grad_norm": 0.9638566658251766, + "learning_rate": 1.8423840635842237e-09, + "loss": 0.1255, + "step": 10727 + }, + { + "epoch": 0.9884369097526143, + "grad_norm": 0.9313632687076696, + "learning_rate": 1.8132594421180206e-09, + "loss": 0.1166, + "step": 10728 + }, + { + "epoch": 0.9885290459298843, + "grad_norm": 0.9253116813594806, + "learning_rate": 1.7843667775593875e-09, + "loss": 0.111, + "step": 10729 + }, + { + "epoch": 0.9886211821071543, + "grad_norm": 0.8668298633727463, + "learning_rate": 1.7557060725914566e-09, + "loss": 0.0993, + "step": 10730 + }, + { + "epoch": 0.9887133182844243, + "grad_norm": 0.9730611123571105, + "learning_rate": 1.7272773298748769e-09, + "loss": 0.1258, + "step": 10731 + }, + { + "epoch": 0.9888054544616944, + "grad_norm": 0.9001091638779297, + "learning_rate": 1.6990805520494813e-09, + "loss": 0.1135, + "step": 10732 + }, + { + "epoch": 0.9888975906389644, + "grad_norm": 0.9557725581836793, + "learning_rate": 1.6711157417334533e-09, + "loss": 0.1169, + "step": 10733 + }, + { + "epoch": 0.9889897268162344, + "grad_norm": 0.9184313551073652, + "learning_rate": 1.6433829015230497e-09, + "loss": 0.1135, + "step": 10734 + }, + { + "epoch": 0.9890818629935044, + "grad_norm": 0.8740577853942184, + "learning_rate": 1.6158820339937098e-09, + "loss": 0.1076, + "step": 10735 + }, + { + "epoch": 0.9891739991707744, + "grad_norm": 0.9511768299527451, + "learning_rate": 1.5886131416981144e-09, + "loss": 0.1036, + "step": 10736 + }, + { + "epoch": 0.9892661353480444, + "grad_norm": 0.9315575336147978, + "learning_rate": 1.5615762271689593e-09, + "loss": 0.1257, + "step": 10737 + }, + { + "epoch": 0.9893582715253144, + "grad_norm": 0.9735995642076678, + "learning_rate": 1.5347712929164594e-09, + "loss": 0.1243, + "step": 10738 + }, + { + "epoch": 0.9894504077025844, + "grad_norm": 0.9609590717734398, + "learning_rate": 1.508198341429179e-09, + "loss": 0.1245, + "step": 10739 + }, + { + "epoch": 0.9895425438798544, + "grad_norm": 0.9803389685165724, + "learning_rate": 1.481857375174589e-09, + "loss": 0.1245, + "step": 10740 + }, + { + "epoch": 0.9896346800571244, + "grad_norm": 0.9447187065505281, + "learning_rate": 1.4557483965985109e-09, + "loss": 0.1217, + "step": 10741 + }, + { + "epoch": 0.9897268162343944, + "grad_norm": 0.9222073699752521, + "learning_rate": 1.4298714081248389e-09, + "loss": 0.1154, + "step": 10742 + }, + { + "epoch": 0.9898189524116644, + "grad_norm": 0.9511417934769287, + "learning_rate": 1.4042264121566507e-09, + "loss": 0.1159, + "step": 10743 + }, + { + "epoch": 0.9899110885889344, + "grad_norm": 0.9365644687505328, + "learning_rate": 1.3788134110750972e-09, + "loss": 0.1162, + "step": 10744 + }, + { + "epoch": 0.9900032247662045, + "grad_norm": 0.9530663162797376, + "learning_rate": 1.3536324072394026e-09, + "loss": 0.1217, + "step": 10745 + }, + { + "epoch": 0.9900953609434745, + "grad_norm": 0.9721468538247888, + "learning_rate": 1.3286834029879735e-09, + "loss": 0.1193, + "step": 10746 + }, + { + "epoch": 0.9901874971207445, + "grad_norm": 0.917636144996345, + "learning_rate": 1.303966400637291e-09, + "loss": 0.1153, + "step": 10747 + }, + { + "epoch": 0.9902796332980145, + "grad_norm": 0.8968146551784478, + "learning_rate": 1.279481402481908e-09, + "loss": 0.1095, + "step": 10748 + }, + { + "epoch": 0.9903717694752845, + "grad_norm": 0.9204522570969634, + "learning_rate": 1.255228410795839e-09, + "loss": 0.1132, + "step": 10749 + }, + { + "epoch": 0.9904639056525545, + "grad_norm": 0.909280443704693, + "learning_rate": 1.2312074278308939e-09, + "loss": 0.1053, + "step": 10750 + }, + { + "epoch": 0.9905560418298245, + "grad_norm": 0.999015146759792, + "learning_rate": 1.2074184558169554e-09, + "loss": 0.1238, + "step": 10751 + }, + { + "epoch": 0.9906481780070945, + "grad_norm": 1.0173745580641365, + "learning_rate": 1.1838614969633678e-09, + "loss": 0.134, + "step": 10752 + }, + { + "epoch": 0.9907403141843645, + "grad_norm": 0.9423136441090145, + "learning_rate": 1.1605365534569922e-09, + "loss": 0.1165, + "step": 10753 + }, + { + "epoch": 0.9908324503616345, + "grad_norm": 0.9484240848690269, + "learning_rate": 1.1374436274635968e-09, + "loss": 0.1207, + "step": 10754 + }, + { + "epoch": 0.9909245865389045, + "grad_norm": 0.9701778726691909, + "learning_rate": 1.1145827211278548e-09, + "loss": 0.1191, + "step": 10755 + }, + { + "epoch": 0.9910167227161745, + "grad_norm": 0.918169085936637, + "learning_rate": 1.0919538365716797e-09, + "loss": 0.1095, + "step": 10756 + }, + { + "epoch": 0.9911088588934445, + "grad_norm": 0.9436039067062373, + "learning_rate": 1.069556975896724e-09, + "loss": 0.1164, + "step": 10757 + }, + { + "epoch": 0.9912009950707145, + "grad_norm": 0.961966720663876, + "learning_rate": 1.047392141182435e-09, + "loss": 0.1157, + "step": 10758 + }, + { + "epoch": 0.9912931312479846, + "grad_norm": 0.9031848710547364, + "learning_rate": 1.0254593344866115e-09, + "loss": 0.1156, + "step": 10759 + }, + { + "epoch": 0.9913852674252546, + "grad_norm": 0.9289358118171895, + "learning_rate": 1.00375855784568e-09, + "loss": 0.1161, + "step": 10760 + }, + { + "epoch": 0.9914774036025246, + "grad_norm": 0.9280764395282413, + "learning_rate": 9.822898132749726e-10, + "loss": 0.1127, + "step": 10761 + }, + { + "epoch": 0.9915695397797946, + "grad_norm": 0.9245548787551704, + "learning_rate": 9.610531027673398e-10, + "loss": 0.1099, + "step": 10762 + }, + { + "epoch": 0.9916616759570646, + "grad_norm": 0.9526967095042527, + "learning_rate": 9.400484282950928e-10, + "loss": 0.1116, + "step": 10763 + }, + { + "epoch": 0.9917538121343346, + "grad_norm": 0.9584267650838287, + "learning_rate": 9.192757918083383e-10, + "loss": 0.1247, + "step": 10764 + }, + { + "epoch": 0.9918459483116046, + "grad_norm": 0.9599737471955169, + "learning_rate": 8.987351952355338e-10, + "loss": 0.1188, + "step": 10765 + }, + { + "epoch": 0.9919380844888745, + "grad_norm": 0.9471789520600152, + "learning_rate": 8.7842664048432e-10, + "loss": 0.1232, + "step": 10766 + }, + { + "epoch": 0.9920302206661445, + "grad_norm": 0.9560335153255229, + "learning_rate": 8.58350129440133e-10, + "loss": 0.123, + "step": 10767 + }, + { + "epoch": 0.9921223568434145, + "grad_norm": 0.9297203583707537, + "learning_rate": 8.385056639670375e-10, + "loss": 0.1104, + "step": 10768 + }, + { + "epoch": 0.9922144930206845, + "grad_norm": 0.9557436945001951, + "learning_rate": 8.188932459077259e-10, + "loss": 0.1148, + "step": 10769 + }, + { + "epoch": 0.9923066291979545, + "grad_norm": 0.9382086601075487, + "learning_rate": 7.995128770829641e-10, + "loss": 0.1229, + "step": 10770 + }, + { + "epoch": 0.9923987653752245, + "grad_norm": 0.9776951290640383, + "learning_rate": 7.803645592927012e-10, + "loss": 0.1261, + "step": 10771 + }, + { + "epoch": 0.9924909015524946, + "grad_norm": 0.9234222588432033, + "learning_rate": 7.614482943144041e-10, + "loss": 0.1078, + "step": 10772 + }, + { + "epoch": 0.9925830377297646, + "grad_norm": 0.9425926563945666, + "learning_rate": 7.427640839044458e-10, + "loss": 0.1165, + "step": 10773 + }, + { + "epoch": 0.9926751739070346, + "grad_norm": 0.9997978213398736, + "learning_rate": 7.243119297981049e-10, + "loss": 0.1301, + "step": 10774 + }, + { + "epoch": 0.9927673100843046, + "grad_norm": 0.9622010406970339, + "learning_rate": 7.060918337081779e-10, + "loss": 0.1298, + "step": 10775 + }, + { + "epoch": 0.9928594462615746, + "grad_norm": 0.9629025338726946, + "learning_rate": 6.881037973266447e-10, + "loss": 0.1085, + "step": 10776 + }, + { + "epoch": 0.9929515824388446, + "grad_norm": 0.9442954815335793, + "learning_rate": 6.703478223235582e-10, + "loss": 0.1183, + "step": 10777 + }, + { + "epoch": 0.9930437186161146, + "grad_norm": 0.9778711461998094, + "learning_rate": 6.528239103478773e-10, + "loss": 0.1207, + "step": 10778 + }, + { + "epoch": 0.9931358547933846, + "grad_norm": 0.8971965968414244, + "learning_rate": 6.355320630263561e-10, + "loss": 0.108, + "step": 10779 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 0.944982803475713, + "learning_rate": 6.184722819646549e-10, + "loss": 0.1075, + "step": 10780 + }, + { + "epoch": 0.9933201271479246, + "grad_norm": 0.9722778893436133, + "learning_rate": 6.016445687467842e-10, + "loss": 0.1195, + "step": 10781 + }, + { + "epoch": 0.9934122633251946, + "grad_norm": 0.9633531416564072, + "learning_rate": 5.850489249351054e-10, + "loss": 0.1316, + "step": 10782 + }, + { + "epoch": 0.9935043995024646, + "grad_norm": 0.8947891332160514, + "learning_rate": 5.686853520708857e-10, + "loss": 0.1036, + "step": 10783 + }, + { + "epoch": 0.9935965356797346, + "grad_norm": 0.8921400585380197, + "learning_rate": 5.525538516729101e-10, + "loss": 0.1088, + "step": 10784 + }, + { + "epoch": 0.9936886718570046, + "grad_norm": 0.9192509868974265, + "learning_rate": 5.366544252397021e-10, + "loss": 0.1181, + "step": 10785 + }, + { + "epoch": 0.9937808080342747, + "grad_norm": 0.9294284724982631, + "learning_rate": 5.209870742467482e-10, + "loss": 0.1192, + "step": 10786 + }, + { + "epoch": 0.9938729442115447, + "grad_norm": 0.9204015586179748, + "learning_rate": 5.055518001492731e-10, + "loss": 0.1158, + "step": 10787 + }, + { + "epoch": 0.9939650803888147, + "grad_norm": 0.9626572036280917, + "learning_rate": 4.903486043802974e-10, + "loss": 0.1205, + "step": 10788 + }, + { + "epoch": 0.9940572165660847, + "grad_norm": 0.9249171298942248, + "learning_rate": 4.75377488351747e-10, + "loss": 0.1172, + "step": 10789 + }, + { + "epoch": 0.9941493527433547, + "grad_norm": 0.916272268645282, + "learning_rate": 4.6063845345306613e-10, + "loss": 0.1096, + "step": 10790 + }, + { + "epoch": 0.9942414889206247, + "grad_norm": 0.9606174055026327, + "learning_rate": 4.4613150105315974e-10, + "loss": 0.1159, + "step": 10791 + }, + { + "epoch": 0.9943336250978947, + "grad_norm": 0.9366043316921812, + "learning_rate": 4.3185663249900587e-10, + "loss": 0.1151, + "step": 10792 + }, + { + "epoch": 0.9944257612751647, + "grad_norm": 0.9470015355101646, + "learning_rate": 4.1781384911593336e-10, + "loss": 0.1106, + "step": 10793 + }, + { + "epoch": 0.9945178974524347, + "grad_norm": 0.9553448889331428, + "learning_rate": 4.040031522078991e-10, + "loss": 0.1155, + "step": 10794 + }, + { + "epoch": 0.9946100336297047, + "grad_norm": 0.943307344715156, + "learning_rate": 3.904245430569331e-10, + "loss": 0.1182, + "step": 10795 + }, + { + "epoch": 0.9947021698069747, + "grad_norm": 0.941980364543447, + "learning_rate": 3.7707802292424877e-10, + "loss": 0.1068, + "step": 10796 + }, + { + "epoch": 0.9947943059842447, + "grad_norm": 0.9633082238006477, + "learning_rate": 3.639635930491325e-10, + "loss": 0.1169, + "step": 10797 + }, + { + "epoch": 0.9948864421615147, + "grad_norm": 0.9486204676908485, + "learning_rate": 3.5108125464866636e-10, + "loss": 0.1149, + "step": 10798 + }, + { + "epoch": 0.9949785783387847, + "grad_norm": 0.9663686096850286, + "learning_rate": 3.3843100891939316e-10, + "loss": 0.118, + "step": 10799 + }, + { + "epoch": 0.9950707145160548, + "grad_norm": 0.922095330879719, + "learning_rate": 3.260128570359289e-10, + "loss": 0.1164, + "step": 10800 + }, + { + "epoch": 0.9951628506933248, + "grad_norm": 1.000098514142364, + "learning_rate": 3.138268001509626e-10, + "loss": 0.1222, + "step": 10801 + }, + { + "epoch": 0.9952549868705948, + "grad_norm": 0.9719465806233039, + "learning_rate": 3.018728393963666e-10, + "loss": 0.1253, + "step": 10802 + }, + { + "epoch": 0.9953471230478648, + "grad_norm": 0.9543309585482759, + "learning_rate": 2.901509758820864e-10, + "loss": 0.1206, + "step": 10803 + }, + { + "epoch": 0.9954392592251348, + "grad_norm": 0.9013190921660513, + "learning_rate": 2.786612106961406e-10, + "loss": 0.11, + "step": 10804 + }, + { + "epoch": 0.9955313954024048, + "grad_norm": 0.9654267379006883, + "learning_rate": 2.674035449054535e-10, + "loss": 0.114, + "step": 10805 + }, + { + "epoch": 0.9956235315796748, + "grad_norm": 0.995907441347612, + "learning_rate": 2.563779795553001e-10, + "loss": 0.1223, + "step": 10806 + }, + { + "epoch": 0.9957156677569448, + "grad_norm": 0.9689107866786532, + "learning_rate": 2.455845156695835e-10, + "loss": 0.1231, + "step": 10807 + }, + { + "epoch": 0.9958078039342148, + "grad_norm": 0.8690683165999983, + "learning_rate": 2.350231542502801e-10, + "loss": 0.0981, + "step": 10808 + }, + { + "epoch": 0.9958999401114847, + "grad_norm": 0.9517071622004358, + "learning_rate": 2.2469389627827188e-10, + "loss": 0.1242, + "step": 10809 + }, + { + "epoch": 0.9959920762887547, + "grad_norm": 0.9644950560808359, + "learning_rate": 2.1459674271251397e-10, + "loss": 0.1179, + "step": 10810 + }, + { + "epoch": 0.9960842124660247, + "grad_norm": 0.9585515695749373, + "learning_rate": 2.0473169449031217e-10, + "loss": 0.1181, + "step": 10811 + }, + { + "epoch": 0.9961763486432947, + "grad_norm": 0.8761769302597302, + "learning_rate": 1.9509875252787803e-10, + "loss": 0.1071, + "step": 10812 + }, + { + "epoch": 0.9962684848205648, + "grad_norm": 0.9240358594187232, + "learning_rate": 1.856979177194962e-10, + "loss": 0.1127, + "step": 10813 + }, + { + "epoch": 0.9963606209978348, + "grad_norm": 0.90524334167604, + "learning_rate": 1.7652919093807952e-10, + "loss": 0.1084, + "step": 10814 + }, + { + "epoch": 0.9964527571751048, + "grad_norm": 0.9512308901033537, + "learning_rate": 1.675925730348915e-10, + "loss": 0.1181, + "step": 10815 + }, + { + "epoch": 0.9965448933523748, + "grad_norm": 0.8919879468606943, + "learning_rate": 1.5888806484010143e-10, + "loss": 0.1092, + "step": 10816 + }, + { + "epoch": 0.9966370295296448, + "grad_norm": 0.928927024723345, + "learning_rate": 1.5041566716139656e-10, + "loss": 0.1152, + "step": 10817 + }, + { + "epoch": 0.9967291657069148, + "grad_norm": 0.9682198116348629, + "learning_rate": 1.4217538078536985e-10, + "loss": 0.121, + "step": 10818 + }, + { + "epoch": 0.9968213018841848, + "grad_norm": 0.9812596295403685, + "learning_rate": 1.3416720647779768e-10, + "loss": 0.1254, + "step": 10819 + }, + { + "epoch": 0.9969134380614548, + "grad_norm": 0.9476046140101004, + "learning_rate": 1.263911449816968e-10, + "loss": 0.1228, + "step": 10820 + }, + { + "epoch": 0.9970055742387248, + "grad_norm": 0.8940822513705196, + "learning_rate": 1.1884719701926727e-10, + "loss": 0.1175, + "step": 10821 + }, + { + "epoch": 0.9970977104159948, + "grad_norm": 0.9495262015744932, + "learning_rate": 1.1153536329078229e-10, + "loss": 0.1171, + "step": 10822 + }, + { + "epoch": 0.9971898465932648, + "grad_norm": 0.9466157440151927, + "learning_rate": 1.0445564447542078e-10, + "loss": 0.1192, + "step": 10823 + }, + { + "epoch": 0.9972819827705348, + "grad_norm": 0.9413453778321897, + "learning_rate": 9.760804123015721e-11, + "loss": 0.1141, + "step": 10824 + }, + { + "epoch": 0.9973741189478048, + "grad_norm": 0.9226586113738102, + "learning_rate": 9.099255419114938e-11, + "loss": 0.1177, + "step": 10825 + }, + { + "epoch": 0.9974662551250748, + "grad_norm": 0.9383011446764592, + "learning_rate": 8.460918397262818e-11, + "loss": 0.1128, + "step": 10826 + }, + { + "epoch": 0.9975583913023449, + "grad_norm": 0.9252669138882176, + "learning_rate": 7.845793116717515e-11, + "loss": 0.1134, + "step": 10827 + }, + { + "epoch": 0.9976505274796149, + "grad_norm": 0.9388645197912326, + "learning_rate": 7.253879634600003e-11, + "loss": 0.1238, + "step": 10828 + }, + { + "epoch": 0.9977426636568849, + "grad_norm": 0.9377529336829813, + "learning_rate": 6.685178005838567e-11, + "loss": 0.1114, + "step": 10829 + }, + { + "epoch": 0.9978347998341549, + "grad_norm": 1.0046882927242726, + "learning_rate": 6.139688283279821e-11, + "loss": 0.1272, + "step": 10830 + }, + { + "epoch": 0.9979269360114249, + "grad_norm": 0.9156009519149251, + "learning_rate": 5.617410517549937e-11, + "loss": 0.1112, + "step": 10831 + }, + { + "epoch": 0.9980190721886949, + "grad_norm": 0.9902312030633188, + "learning_rate": 5.118344757165661e-11, + "loss": 0.1272, + "step": 10832 + }, + { + "epoch": 0.9981112083659649, + "grad_norm": 0.9937864237747287, + "learning_rate": 4.6424910484232924e-11, + "loss": 0.1351, + "step": 10833 + }, + { + "epoch": 0.9982033445432349, + "grad_norm": 0.9211831585262729, + "learning_rate": 4.189849435565219e-11, + "loss": 0.1033, + "step": 10834 + }, + { + "epoch": 0.9982954807205049, + "grad_norm": 0.9613565174736062, + "learning_rate": 3.7604199605578705e-11, + "loss": 0.1195, + "step": 10835 + }, + { + "epoch": 0.9983876168977749, + "grad_norm": 0.9636026205660925, + "learning_rate": 3.35420266328601e-11, + "loss": 0.1267, + "step": 10836 + }, + { + "epoch": 0.9984797530750449, + "grad_norm": 0.9489533548789341, + "learning_rate": 2.9711975814972205e-11, + "loss": 0.1205, + "step": 10837 + }, + { + "epoch": 0.9985718892523149, + "grad_norm": 0.9312987097224975, + "learning_rate": 2.6114047507463936e-11, + "loss": 0.1177, + "step": 10838 + }, + { + "epoch": 0.9986640254295849, + "grad_norm": 0.9687758766239608, + "learning_rate": 2.2748242044234868e-11, + "loss": 0.1155, + "step": 10839 + }, + { + "epoch": 0.998756161606855, + "grad_norm": 0.9124547506561508, + "learning_rate": 1.9614559738090345e-11, + "loss": 0.1206, + "step": 10840 + }, + { + "epoch": 0.998848297784125, + "grad_norm": 0.9012464083753811, + "learning_rate": 1.671300087935368e-11, + "loss": 0.1137, + "step": 10841 + }, + { + "epoch": 0.998940433961395, + "grad_norm": 0.9556438025043256, + "learning_rate": 1.4043565738364184e-11, + "loss": 0.112, + "step": 10842 + }, + { + "epoch": 0.999032570138665, + "grad_norm": 0.9389183452029211, + "learning_rate": 1.1606254562146479e-11, + "loss": 0.1132, + "step": 10843 + }, + { + "epoch": 0.999124706315935, + "grad_norm": 0.906558779678973, + "learning_rate": 9.401067577463618e-12, + "loss": 0.1083, + "step": 10844 + }, + { + "epoch": 0.999216842493205, + "grad_norm": 0.9626475810080135, + "learning_rate": 7.428004988874194e-12, + "loss": 0.1201, + "step": 10845 + }, + { + "epoch": 0.999308978670475, + "grad_norm": 0.9183400382526548, + "learning_rate": 5.687066979565003e-12, + "loss": 0.1133, + "step": 10846 + }, + { + "epoch": 0.999401114847745, + "grad_norm": 0.9157534567793171, + "learning_rate": 4.178253711351054e-12, + "loss": 0.1123, + "step": 10847 + }, + { + "epoch": 0.999493251025015, + "grad_norm": 0.9607427044331877, + "learning_rate": 2.9015653243980034e-12, + "loss": 0.1194, + "step": 10848 + }, + { + "epoch": 0.999585387202285, + "grad_norm": 0.9133340683724209, + "learning_rate": 1.8570019369446025e-12, + "loss": 0.1209, + "step": 10849 + }, + { + "epoch": 0.999677523379555, + "grad_norm": 0.9797017728895512, + "learning_rate": 1.044563646135366e-12, + "loss": 0.1213, + "step": 10850 + }, + { + "epoch": 0.999769659556825, + "grad_norm": 0.950547168845273, + "learning_rate": 4.642505274654596e-13, + "loss": 0.1109, + "step": 10851 + }, + { + "epoch": 0.999861795734095, + "grad_norm": 0.9546759905442439, + "learning_rate": 1.1606263450314458e-13, + "loss": 0.1163, + "step": 10852 + }, + { + "epoch": 0.999953931911365, + "grad_norm": 0.99401938130207, + "learning_rate": 0.0, + "loss": 0.1287, + "step": 10853 + }, + { + "epoch": 0.999953931911365, + "step": 10853, + "total_flos": 1908258935930880.0, + "train_loss": 0.14796658507510943, + "train_runtime": 123515.0295, + "train_samples_per_second": 5.624, + "train_steps_per_second": 0.088 } ], "logging_steps": 1, - "max_steps": 1270, + "max_steps": 10853, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -8941,7 +76174,7 @@ "attributes": {} } }, - "total_flos": 223330201436160.0, + "total_flos": 1908258935930880.0, "train_batch_size": 1, "trial_name": null, "trial_params": null