diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14065 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.17538354874909054, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.769177437454527e-05, + "grad_norm": 7.59375, + "learning_rate": 3e-06, + "loss": 4.1911, + "step": 1 + }, + { + "epoch": 0.00017538354874909053, + "grad_norm": 7.6875, + "learning_rate": 6e-06, + "loss": 4.1743, + "step": 2 + }, + { + "epoch": 0.0002630753231236358, + "grad_norm": 8.1875, + "learning_rate": 9e-06, + "loss": 4.1165, + "step": 3 + }, + { + "epoch": 0.00035076709749818106, + "grad_norm": 8.0, + "learning_rate": 1.2e-05, + "loss": 4.2652, + "step": 4 + }, + { + "epoch": 0.00043845887187272633, + "grad_norm": 7.9375, + "learning_rate": 1.5e-05, + "loss": 4.1691, + "step": 5 + }, + { + "epoch": 0.0005261506462472716, + "grad_norm": 8.0625, + "learning_rate": 1.8e-05, + "loss": 4.2089, + "step": 6 + }, + { + "epoch": 0.0006138424206218169, + "grad_norm": 8.25, + "learning_rate": 2.1000000000000002e-05, + "loss": 4.1677, + "step": 7 + }, + { + "epoch": 0.0007015341949963621, + "grad_norm": 7.8125, + "learning_rate": 2.4e-05, + "loss": 4.1564, + "step": 8 + }, + { + "epoch": 0.0007892259693709074, + "grad_norm": 7.75, + "learning_rate": 2.7e-05, + "loss": 4.1722, + "step": 9 + }, + { + "epoch": 0.0008769177437454527, + "grad_norm": 7.3125, + "learning_rate": 3e-05, + "loss": 4.1554, + "step": 10 + }, + { + "epoch": 0.0009646095181199979, + "grad_norm": 7.53125, + "learning_rate": 3.2999999999999996e-05, + "loss": 4.1511, + "step": 11 + }, + { + "epoch": 0.0010523012924945432, + "grad_norm": 7.1875, + "learning_rate": 3.6e-05, + "loss": 4.152, + "step": 12 + }, + { + "epoch": 0.0011399930668690886, + "grad_norm": 6.8125, + "learning_rate": 3.9e-05, + "loss": 4.1581, + "step": 13 + }, + { + "epoch": 0.0012276848412436337, + "grad_norm": 6.625, + "learning_rate": 4.2000000000000004e-05, + "loss": 4.0485, + "step": 14 + }, + { + "epoch": 0.001315376615618179, + "grad_norm": 6.28125, + "learning_rate": 4.4999999999999996e-05, + "loss": 4.083, + "step": 15 + }, + { + "epoch": 0.0014030683899927242, + "grad_norm": 5.625, + "learning_rate": 4.8e-05, + "loss": 4.075, + "step": 16 + }, + { + "epoch": 0.0014907601643672696, + "grad_norm": 5.625, + "learning_rate": 5.1000000000000006e-05, + "loss": 4.0571, + "step": 17 + }, + { + "epoch": 0.0015784519387418148, + "grad_norm": 5.53125, + "learning_rate": 5.4e-05, + "loss": 4.0213, + "step": 18 + }, + { + "epoch": 0.0016661437131163602, + "grad_norm": 4.5625, + "learning_rate": 5.7e-05, + "loss": 4.0436, + "step": 19 + }, + { + "epoch": 0.0017538354874909053, + "grad_norm": 4.6875, + "learning_rate": 6e-05, + "loss": 4.0306, + "step": 20 + }, + { + "epoch": 0.0018415272618654507, + "grad_norm": 4.125, + "learning_rate": 6.3e-05, + "loss": 4.0364, + "step": 21 + }, + { + "epoch": 0.0019292190362399958, + "grad_norm": 3.875, + "learning_rate": 6.599999999999999e-05, + "loss": 3.9774, + "step": 22 + }, + { + "epoch": 0.002016910810614541, + "grad_norm": 3.421875, + "learning_rate": 6.9e-05, + "loss": 3.9513, + "step": 23 + }, + { + "epoch": 0.0021046025849890864, + "grad_norm": 3.21875, + "learning_rate": 7.2e-05, + "loss": 3.9453, + "step": 24 + }, + { + "epoch": 0.0021922943593636317, + "grad_norm": 2.734375, + "learning_rate": 7.500000000000001e-05, + "loss": 3.9338, + "step": 25 + }, + { + "epoch": 0.002279986133738177, + "grad_norm": 2.375, + "learning_rate": 7.8e-05, + "loss": 3.9318, + "step": 26 + }, + { + "epoch": 0.0023676779081127225, + "grad_norm": 1.921875, + "learning_rate": 8.1e-05, + "loss": 3.8758, + "step": 27 + }, + { + "epoch": 0.0024553696824872674, + "grad_norm": 1.828125, + "learning_rate": 8.400000000000001e-05, + "loss": 3.8885, + "step": 28 + }, + { + "epoch": 0.002543061456861813, + "grad_norm": 1.765625, + "learning_rate": 8.7e-05, + "loss": 3.8412, + "step": 29 + }, + { + "epoch": 0.002630753231236358, + "grad_norm": 1.65625, + "learning_rate": 8.999999999999999e-05, + "loss": 3.8114, + "step": 30 + }, + { + "epoch": 0.0027184450056109036, + "grad_norm": 1.5234375, + "learning_rate": 9.3e-05, + "loss": 3.778, + "step": 31 + }, + { + "epoch": 0.0028061367799854485, + "grad_norm": 1.515625, + "learning_rate": 9.6e-05, + "loss": 3.7355, + "step": 32 + }, + { + "epoch": 0.002893828554359994, + "grad_norm": 1.5, + "learning_rate": 9.900000000000001e-05, + "loss": 3.7612, + "step": 33 + }, + { + "epoch": 0.0029815203287345392, + "grad_norm": 1.390625, + "learning_rate": 0.00010200000000000001, + "loss": 3.7869, + "step": 34 + }, + { + "epoch": 0.0030692121031090846, + "grad_norm": 1.328125, + "learning_rate": 0.00010500000000000002, + "loss": 3.6998, + "step": 35 + }, + { + "epoch": 0.0031569038774836296, + "grad_norm": 1.3203125, + "learning_rate": 0.000108, + "loss": 3.645, + "step": 36 + }, + { + "epoch": 0.003244595651858175, + "grad_norm": 1.265625, + "learning_rate": 0.000111, + "loss": 3.6778, + "step": 37 + }, + { + "epoch": 0.0033322874262327203, + "grad_norm": 1.28125, + "learning_rate": 0.000114, + "loss": 3.6011, + "step": 38 + }, + { + "epoch": 0.0034199792006072657, + "grad_norm": 1.28125, + "learning_rate": 0.000117, + "loss": 3.6147, + "step": 39 + }, + { + "epoch": 0.0035076709749818106, + "grad_norm": 1.2265625, + "learning_rate": 0.00012, + "loss": 3.5427, + "step": 40 + }, + { + "epoch": 0.003595362749356356, + "grad_norm": 1.1484375, + "learning_rate": 0.000123, + "loss": 3.5252, + "step": 41 + }, + { + "epoch": 0.0036830545237309014, + "grad_norm": 1.140625, + "learning_rate": 0.000126, + "loss": 3.5946, + "step": 42 + }, + { + "epoch": 0.0037707462981054467, + "grad_norm": 1.1328125, + "learning_rate": 0.000129, + "loss": 3.498, + "step": 43 + }, + { + "epoch": 0.0038584380724799917, + "grad_norm": 1.4140625, + "learning_rate": 0.00013199999999999998, + "loss": 3.4045, + "step": 44 + }, + { + "epoch": 0.0039461298468545375, + "grad_norm": 1.46875, + "learning_rate": 0.000135, + "loss": 3.4418, + "step": 45 + }, + { + "epoch": 0.004033821621229082, + "grad_norm": 1.5390625, + "learning_rate": 0.000138, + "loss": 3.3624, + "step": 46 + }, + { + "epoch": 0.004121513395603627, + "grad_norm": 1.6328125, + "learning_rate": 0.000141, + "loss": 3.3476, + "step": 47 + }, + { + "epoch": 0.004209205169978173, + "grad_norm": 1.5078125, + "learning_rate": 0.000144, + "loss": 3.2416, + "step": 48 + }, + { + "epoch": 0.004296896944352718, + "grad_norm": 1.171875, + "learning_rate": 0.000147, + "loss": 3.263, + "step": 49 + }, + { + "epoch": 0.0043845887187272635, + "grad_norm": 1.1328125, + "learning_rate": 0.00015000000000000001, + "loss": 3.2391, + "step": 50 + }, + { + "epoch": 0.004472280493101809, + "grad_norm": 0.8671875, + "learning_rate": 0.000153, + "loss": 3.1366, + "step": 51 + }, + { + "epoch": 0.004559972267476354, + "grad_norm": 0.75390625, + "learning_rate": 0.000156, + "loss": 3.103, + "step": 52 + }, + { + "epoch": 0.0046476640418509, + "grad_norm": 0.77734375, + "learning_rate": 0.000159, + "loss": 3.1938, + "step": 53 + }, + { + "epoch": 0.004735355816225445, + "grad_norm": 0.69140625, + "learning_rate": 0.000162, + "loss": 3.1503, + "step": 54 + }, + { + "epoch": 0.0048230475905999895, + "grad_norm": 0.70703125, + "learning_rate": 0.000165, + "loss": 3.1695, + "step": 55 + }, + { + "epoch": 0.004910739364974535, + "grad_norm": 0.67578125, + "learning_rate": 0.00016800000000000002, + "loss": 3.0348, + "step": 56 + }, + { + "epoch": 0.00499843113934908, + "grad_norm": 0.68359375, + "learning_rate": 0.000171, + "loss": 3.0016, + "step": 57 + }, + { + "epoch": 0.005086122913723626, + "grad_norm": 0.640625, + "learning_rate": 0.000174, + "loss": 2.9536, + "step": 58 + }, + { + "epoch": 0.005173814688098171, + "grad_norm": 0.59765625, + "learning_rate": 0.000177, + "loss": 2.9378, + "step": 59 + }, + { + "epoch": 0.005261506462472716, + "grad_norm": 0.56640625, + "learning_rate": 0.00017999999999999998, + "loss": 2.9442, + "step": 60 + }, + { + "epoch": 0.005349198236847262, + "grad_norm": 0.55078125, + "learning_rate": 0.000183, + "loss": 2.8868, + "step": 61 + }, + { + "epoch": 0.005436890011221807, + "grad_norm": 0.55078125, + "learning_rate": 0.000186, + "loss": 2.8555, + "step": 62 + }, + { + "epoch": 0.005524581785596352, + "grad_norm": 0.55078125, + "learning_rate": 0.000189, + "loss": 2.8699, + "step": 63 + }, + { + "epoch": 0.005612273559970897, + "grad_norm": 0.59375, + "learning_rate": 0.000192, + "loss": 2.9004, + "step": 64 + }, + { + "epoch": 0.005699965334345442, + "grad_norm": 0.625, + "learning_rate": 0.00019500000000000002, + "loss": 2.8415, + "step": 65 + }, + { + "epoch": 0.005787657108719988, + "grad_norm": 0.5703125, + "learning_rate": 0.00019800000000000002, + "loss": 2.756, + "step": 66 + }, + { + "epoch": 0.005875348883094533, + "grad_norm": 0.54296875, + "learning_rate": 0.000201, + "loss": 2.8199, + "step": 67 + }, + { + "epoch": 0.0059630406574690785, + "grad_norm": 0.52734375, + "learning_rate": 0.00020400000000000003, + "loss": 2.753, + "step": 68 + }, + { + "epoch": 0.006050732431843624, + "grad_norm": 0.46484375, + "learning_rate": 0.00020700000000000002, + "loss": 2.677, + "step": 69 + }, + { + "epoch": 0.006138424206218169, + "grad_norm": 0.5546875, + "learning_rate": 0.00021000000000000004, + "loss": 2.7099, + "step": 70 + }, + { + "epoch": 0.006226115980592715, + "grad_norm": 0.5859375, + "learning_rate": 0.00021299999999999997, + "loss": 2.6848, + "step": 71 + }, + { + "epoch": 0.006313807754967259, + "grad_norm": 0.439453125, + "learning_rate": 0.000216, + "loss": 2.6327, + "step": 72 + }, + { + "epoch": 0.0064014995293418045, + "grad_norm": 0.43359375, + "learning_rate": 0.00021899999999999998, + "loss": 2.6438, + "step": 73 + }, + { + "epoch": 0.00648919130371635, + "grad_norm": 0.44921875, + "learning_rate": 0.000222, + "loss": 2.6331, + "step": 74 + }, + { + "epoch": 0.006576883078090895, + "grad_norm": 0.4296875, + "learning_rate": 0.000225, + "loss": 2.616, + "step": 75 + }, + { + "epoch": 0.006664574852465441, + "grad_norm": 0.3828125, + "learning_rate": 0.000228, + "loss": 2.4532, + "step": 76 + }, + { + "epoch": 0.006752266626839986, + "grad_norm": 0.318359375, + "learning_rate": 0.000231, + "loss": 2.4668, + "step": 77 + }, + { + "epoch": 0.006839958401214531, + "grad_norm": 0.3203125, + "learning_rate": 0.000234, + "loss": 2.4524, + "step": 78 + }, + { + "epoch": 0.006927650175589077, + "grad_norm": 0.353515625, + "learning_rate": 0.00023700000000000001, + "loss": 2.4424, + "step": 79 + }, + { + "epoch": 0.007015341949963621, + "grad_norm": 0.30859375, + "learning_rate": 0.00024, + "loss": 2.403, + "step": 80 + }, + { + "epoch": 0.007103033724338167, + "grad_norm": 0.294921875, + "learning_rate": 0.00024300000000000002, + "loss": 2.4171, + "step": 81 + }, + { + "epoch": 0.007190725498712712, + "grad_norm": 0.298828125, + "learning_rate": 0.000246, + "loss": 2.4317, + "step": 82 + }, + { + "epoch": 0.007278417273087257, + "grad_norm": 0.259765625, + "learning_rate": 0.00024900000000000004, + "loss": 2.3678, + "step": 83 + }, + { + "epoch": 0.007366109047461803, + "grad_norm": 0.341796875, + "learning_rate": 0.000252, + "loss": 2.3322, + "step": 84 + }, + { + "epoch": 0.007453800821836348, + "grad_norm": 0.310546875, + "learning_rate": 0.000255, + "loss": 2.2875, + "step": 85 + }, + { + "epoch": 0.0075414925962108935, + "grad_norm": 0.248046875, + "learning_rate": 0.000258, + "loss": 2.2755, + "step": 86 + }, + { + "epoch": 0.007629184370585439, + "grad_norm": 0.2578125, + "learning_rate": 0.000261, + "loss": 2.245, + "step": 87 + }, + { + "epoch": 0.007716876144959983, + "grad_norm": 0.220703125, + "learning_rate": 0.00026399999999999997, + "loss": 2.1879, + "step": 88 + }, + { + "epoch": 0.007804567919334529, + "grad_norm": 0.279296875, + "learning_rate": 0.000267, + "loss": 2.263, + "step": 89 + }, + { + "epoch": 0.007892259693709075, + "grad_norm": 0.240234375, + "learning_rate": 0.00027, + "loss": 2.2339, + "step": 90 + }, + { + "epoch": 0.00797995146808362, + "grad_norm": 0.23828125, + "learning_rate": 0.000273, + "loss": 2.2315, + "step": 91 + }, + { + "epoch": 0.008067643242458164, + "grad_norm": 0.2119140625, + "learning_rate": 0.000276, + "loss": 2.1785, + "step": 92 + }, + { + "epoch": 0.00815533501683271, + "grad_norm": 0.216796875, + "learning_rate": 0.000279, + "loss": 2.1301, + "step": 93 + }, + { + "epoch": 0.008243026791207255, + "grad_norm": 0.203125, + "learning_rate": 0.000282, + "loss": 2.1555, + "step": 94 + }, + { + "epoch": 0.008330718565581801, + "grad_norm": 0.2734375, + "learning_rate": 0.000285, + "loss": 2.0664, + "step": 95 + }, + { + "epoch": 0.008418410339956345, + "grad_norm": 0.212890625, + "learning_rate": 0.000288, + "loss": 2.0808, + "step": 96 + }, + { + "epoch": 0.008506102114330892, + "grad_norm": 0.2060546875, + "learning_rate": 0.000291, + "loss": 1.9987, + "step": 97 + }, + { + "epoch": 0.008593793888705436, + "grad_norm": 0.2177734375, + "learning_rate": 0.000294, + "loss": 2.0368, + "step": 98 + }, + { + "epoch": 0.008681485663079982, + "grad_norm": 0.1953125, + "learning_rate": 0.000297, + "loss": 2.0081, + "step": 99 + }, + { + "epoch": 0.008769177437454527, + "grad_norm": 0.28515625, + "learning_rate": 0.00030000000000000003, + "loss": 2.0451, + "step": 100 + }, + { + "epoch": 0.008856869211829071, + "grad_norm": 0.1875, + "learning_rate": 0.00030300000000000005, + "loss": 1.9185, + "step": 101 + }, + { + "epoch": 0.008944560986203618, + "grad_norm": 0.18359375, + "learning_rate": 0.000306, + "loss": 1.9602, + "step": 102 + }, + { + "epoch": 0.009032252760578162, + "grad_norm": 0.197265625, + "learning_rate": 0.000309, + "loss": 1.9397, + "step": 103 + }, + { + "epoch": 0.009119944534952708, + "grad_norm": 0.271484375, + "learning_rate": 0.000312, + "loss": 1.9686, + "step": 104 + }, + { + "epoch": 0.009207636309327253, + "grad_norm": 0.2314453125, + "learning_rate": 0.000315, + "loss": 1.903, + "step": 105 + }, + { + "epoch": 0.0092953280837018, + "grad_norm": 0.1923828125, + "learning_rate": 0.000318, + "loss": 1.9059, + "step": 106 + }, + { + "epoch": 0.009383019858076344, + "grad_norm": 0.189453125, + "learning_rate": 0.000321, + "loss": 1.8646, + "step": 107 + }, + { + "epoch": 0.00947071163245089, + "grad_norm": 0.1826171875, + "learning_rate": 0.000324, + "loss": 1.8592, + "step": 108 + }, + { + "epoch": 0.009558403406825434, + "grad_norm": 0.2314453125, + "learning_rate": 0.000327, + "loss": 1.8891, + "step": 109 + }, + { + "epoch": 0.009646095181199979, + "grad_norm": 0.1474609375, + "learning_rate": 0.00033, + "loss": 1.7582, + "step": 110 + }, + { + "epoch": 0.009733786955574525, + "grad_norm": 0.1708984375, + "learning_rate": 0.000333, + "loss": 1.7959, + "step": 111 + }, + { + "epoch": 0.00982147872994907, + "grad_norm": 0.2255859375, + "learning_rate": 0.00033600000000000004, + "loss": 1.8503, + "step": 112 + }, + { + "epoch": 0.009909170504323616, + "grad_norm": 0.1640625, + "learning_rate": 0.000339, + "loss": 1.8528, + "step": 113 + }, + { + "epoch": 0.00999686227869816, + "grad_norm": 0.1474609375, + "learning_rate": 0.000342, + "loss": 1.7961, + "step": 114 + }, + { + "epoch": 0.010084554053072707, + "grad_norm": 0.1337890625, + "learning_rate": 0.00034500000000000004, + "loss": 1.7692, + "step": 115 + }, + { + "epoch": 0.010172245827447251, + "grad_norm": 0.1376953125, + "learning_rate": 0.000348, + "loss": 1.781, + "step": 116 + }, + { + "epoch": 0.010259937601821796, + "grad_norm": 0.1669921875, + "learning_rate": 0.000351, + "loss": 1.7401, + "step": 117 + }, + { + "epoch": 0.010347629376196342, + "grad_norm": 0.1328125, + "learning_rate": 0.000354, + "loss": 1.7695, + "step": 118 + }, + { + "epoch": 0.010435321150570886, + "grad_norm": 0.1337890625, + "learning_rate": 0.000357, + "loss": 1.744, + "step": 119 + }, + { + "epoch": 0.010523012924945433, + "grad_norm": 0.166015625, + "learning_rate": 0.00035999999999999997, + "loss": 1.7182, + "step": 120 + }, + { + "epoch": 0.010610704699319977, + "grad_norm": 0.1474609375, + "learning_rate": 0.000363, + "loss": 1.7426, + "step": 121 + }, + { + "epoch": 0.010698396473694523, + "grad_norm": 0.1416015625, + "learning_rate": 0.000366, + "loss": 1.6804, + "step": 122 + }, + { + "epoch": 0.010786088248069068, + "grad_norm": 0.12353515625, + "learning_rate": 0.000369, + "loss": 1.753, + "step": 123 + }, + { + "epoch": 0.010873780022443614, + "grad_norm": 0.14453125, + "learning_rate": 0.000372, + "loss": 1.7363, + "step": 124 + }, + { + "epoch": 0.010961471796818159, + "grad_norm": 0.138671875, + "learning_rate": 0.000375, + "loss": 1.7182, + "step": 125 + }, + { + "epoch": 0.011049163571192703, + "grad_norm": 0.158203125, + "learning_rate": 0.000378, + "loss": 1.6622, + "step": 126 + }, + { + "epoch": 0.01113685534556725, + "grad_norm": 0.1376953125, + "learning_rate": 0.000381, + "loss": 1.6381, + "step": 127 + }, + { + "epoch": 0.011224547119941794, + "grad_norm": 0.1142578125, + "learning_rate": 0.000384, + "loss": 1.6721, + "step": 128 + }, + { + "epoch": 0.01131223889431634, + "grad_norm": 0.1669921875, + "learning_rate": 0.00038700000000000003, + "loss": 1.6262, + "step": 129 + }, + { + "epoch": 0.011399930668690885, + "grad_norm": 0.1376953125, + "learning_rate": 0.00039000000000000005, + "loss": 1.7461, + "step": 130 + }, + { + "epoch": 0.011487622443065431, + "grad_norm": 0.2353515625, + "learning_rate": 0.000393, + "loss": 1.6759, + "step": 131 + }, + { + "epoch": 0.011575314217439975, + "grad_norm": 0.208984375, + "learning_rate": 0.00039600000000000003, + "loss": 1.6878, + "step": 132 + }, + { + "epoch": 0.011663005991814522, + "grad_norm": 0.29296875, + "learning_rate": 0.00039900000000000005, + "loss": 1.6794, + "step": 133 + }, + { + "epoch": 0.011750697766189066, + "grad_norm": 0.2080078125, + "learning_rate": 0.000402, + "loss": 1.6118, + "step": 134 + }, + { + "epoch": 0.01183838954056361, + "grad_norm": 0.1982421875, + "learning_rate": 0.00040500000000000003, + "loss": 1.6834, + "step": 135 + }, + { + "epoch": 0.011926081314938157, + "grad_norm": 0.1474609375, + "learning_rate": 0.00040800000000000005, + "loss": 1.6754, + "step": 136 + }, + { + "epoch": 0.012013773089312701, + "grad_norm": 0.1572265625, + "learning_rate": 0.000411, + "loss": 1.6149, + "step": 137 + }, + { + "epoch": 0.012101464863687248, + "grad_norm": 0.10546875, + "learning_rate": 0.00041400000000000003, + "loss": 1.7056, + "step": 138 + }, + { + "epoch": 0.012189156638061792, + "grad_norm": 0.1650390625, + "learning_rate": 0.00041700000000000005, + "loss": 1.6293, + "step": 139 + }, + { + "epoch": 0.012276848412436338, + "grad_norm": 0.140625, + "learning_rate": 0.00042000000000000007, + "loss": 1.6483, + "step": 140 + }, + { + "epoch": 0.012364540186810883, + "grad_norm": 0.1181640625, + "learning_rate": 0.000423, + "loss": 1.6171, + "step": 141 + }, + { + "epoch": 0.01245223196118543, + "grad_norm": 0.12255859375, + "learning_rate": 0.00042599999999999995, + "loss": 1.6129, + "step": 142 + }, + { + "epoch": 0.012539923735559974, + "grad_norm": 0.1337890625, + "learning_rate": 0.00042899999999999997, + "loss": 1.6066, + "step": 143 + }, + { + "epoch": 0.012627615509934518, + "grad_norm": 0.154296875, + "learning_rate": 0.000432, + "loss": 1.5817, + "step": 144 + }, + { + "epoch": 0.012715307284309064, + "grad_norm": 0.1162109375, + "learning_rate": 0.000435, + "loss": 1.666, + "step": 145 + }, + { + "epoch": 0.012802999058683609, + "grad_norm": 0.1123046875, + "learning_rate": 0.00043799999999999997, + "loss": 1.6477, + "step": 146 + }, + { + "epoch": 0.012890690833058155, + "grad_norm": 0.1533203125, + "learning_rate": 0.000441, + "loss": 1.5617, + "step": 147 + }, + { + "epoch": 0.0129783826074327, + "grad_norm": 0.154296875, + "learning_rate": 0.000444, + "loss": 1.5307, + "step": 148 + }, + { + "epoch": 0.013066074381807246, + "grad_norm": 0.09716796875, + "learning_rate": 0.00044699999999999997, + "loss": 1.5576, + "step": 149 + }, + { + "epoch": 0.01315376615618179, + "grad_norm": 0.1533203125, + "learning_rate": 0.00045, + "loss": 1.5612, + "step": 150 + }, + { + "epoch": 0.013241457930556335, + "grad_norm": 0.1064453125, + "learning_rate": 0.000453, + "loss": 1.5933, + "step": 151 + }, + { + "epoch": 0.013329149704930881, + "grad_norm": 0.140625, + "learning_rate": 0.000456, + "loss": 1.5505, + "step": 152 + }, + { + "epoch": 0.013416841479305426, + "grad_norm": 0.11376953125, + "learning_rate": 0.000459, + "loss": 1.4969, + "step": 153 + }, + { + "epoch": 0.013504533253679972, + "grad_norm": 0.1318359375, + "learning_rate": 0.000462, + "loss": 1.5668, + "step": 154 + }, + { + "epoch": 0.013592225028054516, + "grad_norm": 0.095703125, + "learning_rate": 0.000465, + "loss": 1.5287, + "step": 155 + }, + { + "epoch": 0.013679916802429063, + "grad_norm": 0.1044921875, + "learning_rate": 0.000468, + "loss": 1.545, + "step": 156 + }, + { + "epoch": 0.013767608576803607, + "grad_norm": 0.1298828125, + "learning_rate": 0.000471, + "loss": 1.615, + "step": 157 + }, + { + "epoch": 0.013855300351178153, + "grad_norm": 0.134765625, + "learning_rate": 0.00047400000000000003, + "loss": 1.5476, + "step": 158 + }, + { + "epoch": 0.013942992125552698, + "grad_norm": 0.10546875, + "learning_rate": 0.000477, + "loss": 1.5439, + "step": 159 + }, + { + "epoch": 0.014030683899927242, + "grad_norm": 0.189453125, + "learning_rate": 0.00048, + "loss": 1.5647, + "step": 160 + }, + { + "epoch": 0.014118375674301789, + "grad_norm": 0.11669921875, + "learning_rate": 0.00048300000000000003, + "loss": 1.6254, + "step": 161 + }, + { + "epoch": 0.014206067448676333, + "grad_norm": 0.10791015625, + "learning_rate": 0.00048600000000000005, + "loss": 1.4983, + "step": 162 + }, + { + "epoch": 0.01429375922305088, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004890000000000001, + "loss": 1.5548, + "step": 163 + }, + { + "epoch": 0.014381450997425424, + "grad_norm": 0.11376953125, + "learning_rate": 0.000492, + "loss": 1.4867, + "step": 164 + }, + { + "epoch": 0.01446914277179997, + "grad_norm": 0.1162109375, + "learning_rate": 0.000495, + "loss": 1.6054, + "step": 165 + }, + { + "epoch": 0.014556834546174515, + "grad_norm": 0.09716796875, + "learning_rate": 0.0004980000000000001, + "loss": 1.553, + "step": 166 + }, + { + "epoch": 0.014644526320549061, + "grad_norm": 0.162109375, + "learning_rate": 0.000501, + "loss": 1.568, + "step": 167 + }, + { + "epoch": 0.014732218094923605, + "grad_norm": 0.11328125, + "learning_rate": 0.000504, + "loss": 1.5792, + "step": 168 + }, + { + "epoch": 0.01481990986929815, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005070000000000001, + "loss": 1.5449, + "step": 169 + }, + { + "epoch": 0.014907601643672696, + "grad_norm": 0.1103515625, + "learning_rate": 0.00051, + "loss": 1.5593, + "step": 170 + }, + { + "epoch": 0.01499529341804724, + "grad_norm": 0.1064453125, + "learning_rate": 0.000513, + "loss": 1.4883, + "step": 171 + }, + { + "epoch": 0.015082985192421787, + "grad_norm": 0.1064453125, + "learning_rate": 0.000516, + "loss": 1.529, + "step": 172 + }, + { + "epoch": 0.015170676966796331, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005189999999999999, + "loss": 1.466, + "step": 173 + }, + { + "epoch": 0.015258368741170878, + "grad_norm": 0.09033203125, + "learning_rate": 0.000522, + "loss": 1.4907, + "step": 174 + }, + { + "epoch": 0.015346060515545422, + "grad_norm": 0.16796875, + "learning_rate": 0.000525, + "loss": 1.5739, + "step": 175 + }, + { + "epoch": 0.015433752289919967, + "grad_norm": 0.115234375, + "learning_rate": 0.0005279999999999999, + "loss": 1.5142, + "step": 176 + }, + { + "epoch": 0.015521444064294513, + "grad_norm": 0.1904296875, + "learning_rate": 0.000531, + "loss": 1.4573, + "step": 177 + }, + { + "epoch": 0.015609135838669057, + "grad_norm": 0.1142578125, + "learning_rate": 0.000534, + "loss": 1.5181, + "step": 178 + }, + { + "epoch": 0.015696827613043602, + "grad_norm": 0.1572265625, + "learning_rate": 0.000537, + "loss": 1.4548, + "step": 179 + }, + { + "epoch": 0.01578451938741815, + "grad_norm": 0.10498046875, + "learning_rate": 0.00054, + "loss": 1.4154, + "step": 180 + }, + { + "epoch": 0.015872211161792694, + "grad_norm": 0.11376953125, + "learning_rate": 0.000543, + "loss": 1.4346, + "step": 181 + }, + { + "epoch": 0.01595990293616724, + "grad_norm": 0.103515625, + "learning_rate": 0.000546, + "loss": 1.6055, + "step": 182 + }, + { + "epoch": 0.016047594710541783, + "grad_norm": 0.11376953125, + "learning_rate": 0.000549, + "loss": 1.4598, + "step": 183 + }, + { + "epoch": 0.016135286484916328, + "grad_norm": 0.11572265625, + "learning_rate": 0.000552, + "loss": 1.4608, + "step": 184 + }, + { + "epoch": 0.016222978259290876, + "grad_norm": 0.099609375, + "learning_rate": 0.000555, + "loss": 1.5092, + "step": 185 + }, + { + "epoch": 0.01631067003366542, + "grad_norm": 0.099609375, + "learning_rate": 0.000558, + "loss": 1.5338, + "step": 186 + }, + { + "epoch": 0.016398361808039965, + "grad_norm": 0.1708984375, + "learning_rate": 0.000561, + "loss": 1.4916, + "step": 187 + }, + { + "epoch": 0.01648605358241451, + "grad_norm": 0.11376953125, + "learning_rate": 0.000564, + "loss": 1.4815, + "step": 188 + }, + { + "epoch": 0.016573745356789057, + "grad_norm": 0.2470703125, + "learning_rate": 0.000567, + "loss": 1.4502, + "step": 189 + }, + { + "epoch": 0.016661437131163602, + "grad_norm": 0.10302734375, + "learning_rate": 0.00057, + "loss": 1.5195, + "step": 190 + }, + { + "epoch": 0.016749128905538146, + "grad_norm": 0.15234375, + "learning_rate": 0.000573, + "loss": 1.545, + "step": 191 + }, + { + "epoch": 0.01683682067991269, + "grad_norm": 0.146484375, + "learning_rate": 0.000576, + "loss": 1.5517, + "step": 192 + }, + { + "epoch": 0.016924512454287235, + "grad_norm": 0.10791015625, + "learning_rate": 0.000579, + "loss": 1.5469, + "step": 193 + }, + { + "epoch": 0.017012204228661783, + "grad_norm": 0.1416015625, + "learning_rate": 0.000582, + "loss": 1.4311, + "step": 194 + }, + { + "epoch": 0.017099896003036328, + "grad_norm": 0.10888671875, + "learning_rate": 0.000585, + "loss": 1.5292, + "step": 195 + }, + { + "epoch": 0.017187587777410872, + "grad_norm": 0.1240234375, + "learning_rate": 0.000588, + "loss": 1.4326, + "step": 196 + }, + { + "epoch": 0.017275279551785417, + "grad_norm": 0.10205078125, + "learning_rate": 0.000591, + "loss": 1.4556, + "step": 197 + }, + { + "epoch": 0.017362971326159965, + "grad_norm": 0.09521484375, + "learning_rate": 0.000594, + "loss": 1.4719, + "step": 198 + }, + { + "epoch": 0.01745066310053451, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005970000000000001, + "loss": 1.5113, + "step": 199 + }, + { + "epoch": 0.017538354874909054, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006000000000000001, + "loss": 1.4525, + "step": 200 + }, + { + "epoch": 0.0176260466492836, + "grad_norm": 0.0888671875, + "learning_rate": 0.000603, + "loss": 1.4616, + "step": 201 + }, + { + "epoch": 0.017713738423658143, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006060000000000001, + "loss": 1.4662, + "step": 202 + }, + { + "epoch": 0.01780143019803269, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006090000000000001, + "loss": 1.4347, + "step": 203 + }, + { + "epoch": 0.017889121972407235, + "grad_norm": 0.09912109375, + "learning_rate": 0.000612, + "loss": 1.5131, + "step": 204 + }, + { + "epoch": 0.01797681374678178, + "grad_norm": 0.1220703125, + "learning_rate": 0.000615, + "loss": 1.4848, + "step": 205 + }, + { + "epoch": 0.018064505521156324, + "grad_norm": 0.10498046875, + "learning_rate": 0.000618, + "loss": 1.4201, + "step": 206 + }, + { + "epoch": 0.018152197295530872, + "grad_norm": 0.1103515625, + "learning_rate": 0.000621, + "loss": 1.4722, + "step": 207 + }, + { + "epoch": 0.018239889069905417, + "grad_norm": 0.0966796875, + "learning_rate": 0.000624, + "loss": 1.345, + "step": 208 + }, + { + "epoch": 0.01832758084427996, + "grad_norm": 0.08740234375, + "learning_rate": 0.000627, + "loss": 1.3573, + "step": 209 + }, + { + "epoch": 0.018415272618654506, + "grad_norm": 0.1494140625, + "learning_rate": 0.00063, + "loss": 1.4277, + "step": 210 + }, + { + "epoch": 0.01850296439302905, + "grad_norm": 0.0849609375, + "learning_rate": 0.000633, + "loss": 1.4629, + "step": 211 + }, + { + "epoch": 0.0185906561674036, + "grad_norm": 0.1142578125, + "learning_rate": 0.000636, + "loss": 1.4655, + "step": 212 + }, + { + "epoch": 0.018678347941778143, + "grad_norm": 0.1328125, + "learning_rate": 0.000639, + "loss": 1.4401, + "step": 213 + }, + { + "epoch": 0.018766039716152687, + "grad_norm": 0.1025390625, + "learning_rate": 0.000642, + "loss": 1.4852, + "step": 214 + }, + { + "epoch": 0.018853731490527232, + "grad_norm": 0.099609375, + "learning_rate": 0.000645, + "loss": 1.3802, + "step": 215 + }, + { + "epoch": 0.01894142326490178, + "grad_norm": 0.111328125, + "learning_rate": 0.000648, + "loss": 1.4288, + "step": 216 + }, + { + "epoch": 0.019029115039276324, + "grad_norm": 0.126953125, + "learning_rate": 0.000651, + "loss": 1.467, + "step": 217 + }, + { + "epoch": 0.01911680681365087, + "grad_norm": 0.11376953125, + "learning_rate": 0.000654, + "loss": 1.4569, + "step": 218 + }, + { + "epoch": 0.019204498588025413, + "grad_norm": 0.1064453125, + "learning_rate": 0.000657, + "loss": 1.4554, + "step": 219 + }, + { + "epoch": 0.019292190362399958, + "grad_norm": 0.154296875, + "learning_rate": 0.00066, + "loss": 1.4055, + "step": 220 + }, + { + "epoch": 0.019379882136774506, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006630000000000001, + "loss": 1.4994, + "step": 221 + }, + { + "epoch": 0.01946757391114905, + "grad_norm": 0.11767578125, + "learning_rate": 0.000666, + "loss": 1.4716, + "step": 222 + }, + { + "epoch": 0.019555265685523595, + "grad_norm": 0.10498046875, + "learning_rate": 0.000669, + "loss": 1.3835, + "step": 223 + }, + { + "epoch": 0.01964295745989814, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006720000000000001, + "loss": 1.3807, + "step": 224 + }, + { + "epoch": 0.019730649234272687, + "grad_norm": 0.1123046875, + "learning_rate": 0.000675, + "loss": 1.4361, + "step": 225 + }, + { + "epoch": 0.019818341008647232, + "grad_norm": 0.109375, + "learning_rate": 0.000678, + "loss": 1.4103, + "step": 226 + }, + { + "epoch": 0.019906032783021776, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006810000000000001, + "loss": 1.4801, + "step": 227 + }, + { + "epoch": 0.01999372455739632, + "grad_norm": 0.10107421875, + "learning_rate": 0.000684, + "loss": 1.4361, + "step": 228 + }, + { + "epoch": 0.020081416331770865, + "grad_norm": 0.11328125, + "learning_rate": 0.000687, + "loss": 1.4731, + "step": 229 + }, + { + "epoch": 0.020169108106145413, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006900000000000001, + "loss": 1.4505, + "step": 230 + }, + { + "epoch": 0.020256799880519958, + "grad_norm": 0.1484375, + "learning_rate": 0.000693, + "loss": 1.3999, + "step": 231 + }, + { + "epoch": 0.020344491654894502, + "grad_norm": 0.0859375, + "learning_rate": 0.000696, + "loss": 1.4207, + "step": 232 + }, + { + "epoch": 0.020432183429269047, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006990000000000001, + "loss": 1.4174, + "step": 233 + }, + { + "epoch": 0.02051987520364359, + "grad_norm": 0.1279296875, + "learning_rate": 0.000702, + "loss": 1.4266, + "step": 234 + }, + { + "epoch": 0.02060756697801814, + "grad_norm": 0.10595703125, + "learning_rate": 0.000705, + "loss": 1.3834, + "step": 235 + }, + { + "epoch": 0.020695258752392684, + "grad_norm": 0.09326171875, + "learning_rate": 0.000708, + "loss": 1.3796, + "step": 236 + }, + { + "epoch": 0.02078295052676723, + "grad_norm": 0.1455078125, + "learning_rate": 0.0007109999999999999, + "loss": 1.4036, + "step": 237 + }, + { + "epoch": 0.020870642301141773, + "grad_norm": 0.1220703125, + "learning_rate": 0.000714, + "loss": 1.3561, + "step": 238 + }, + { + "epoch": 0.02095833407551632, + "grad_norm": 0.0849609375, + "learning_rate": 0.000717, + "loss": 1.4045, + "step": 239 + }, + { + "epoch": 0.021046025849890865, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007199999999999999, + "loss": 1.41, + "step": 240 + }, + { + "epoch": 0.02113371762426541, + "grad_norm": 0.1396484375, + "learning_rate": 0.000723, + "loss": 1.4192, + "step": 241 + }, + { + "epoch": 0.021221409398639954, + "grad_norm": 0.11328125, + "learning_rate": 0.000726, + "loss": 1.3841, + "step": 242 + }, + { + "epoch": 0.0213091011730145, + "grad_norm": 0.1240234375, + "learning_rate": 0.000729, + "loss": 1.3593, + "step": 243 + }, + { + "epoch": 0.021396792947389047, + "grad_norm": 0.1181640625, + "learning_rate": 0.000732, + "loss": 1.3946, + "step": 244 + }, + { + "epoch": 0.02148448472176359, + "grad_norm": 0.10400390625, + "learning_rate": 0.000735, + "loss": 1.4397, + "step": 245 + }, + { + "epoch": 0.021572176496138136, + "grad_norm": 0.10595703125, + "learning_rate": 0.000738, + "loss": 1.3494, + "step": 246 + }, + { + "epoch": 0.02165986827051268, + "grad_norm": 0.1396484375, + "learning_rate": 0.000741, + "loss": 1.4703, + "step": 247 + }, + { + "epoch": 0.02174756004488723, + "grad_norm": 0.11181640625, + "learning_rate": 0.000744, + "loss": 1.3589, + "step": 248 + }, + { + "epoch": 0.021835251819261773, + "grad_norm": 0.1220703125, + "learning_rate": 0.000747, + "loss": 1.3954, + "step": 249 + }, + { + "epoch": 0.021922943593636317, + "grad_norm": 0.1044921875, + "learning_rate": 0.00075, + "loss": 1.3917, + "step": 250 + }, + { + "epoch": 0.022010635368010862, + "grad_norm": 0.11376953125, + "learning_rate": 0.000753, + "loss": 1.3834, + "step": 251 + }, + { + "epoch": 0.022098327142385406, + "grad_norm": 0.0986328125, + "learning_rate": 0.000756, + "loss": 1.4159, + "step": 252 + }, + { + "epoch": 0.022186018916759954, + "grad_norm": 0.103515625, + "learning_rate": 0.000759, + "loss": 1.4357, + "step": 253 + }, + { + "epoch": 0.0222737106911345, + "grad_norm": 0.10888671875, + "learning_rate": 0.000762, + "loss": 1.4194, + "step": 254 + }, + { + "epoch": 0.022361402465509043, + "grad_norm": 0.09375, + "learning_rate": 0.0007650000000000001, + "loss": 1.3669, + "step": 255 + }, + { + "epoch": 0.022449094239883588, + "grad_norm": 0.095703125, + "learning_rate": 0.000768, + "loss": 1.4522, + "step": 256 + }, + { + "epoch": 0.022536786014258136, + "grad_norm": 0.1103515625, + "learning_rate": 0.000771, + "loss": 1.3672, + "step": 257 + }, + { + "epoch": 0.02262447778863268, + "grad_norm": 0.08984375, + "learning_rate": 0.0007740000000000001, + "loss": 1.3274, + "step": 258 + }, + { + "epoch": 0.022712169563007225, + "grad_norm": 0.1298828125, + "learning_rate": 0.000777, + "loss": 1.3584, + "step": 259 + }, + { + "epoch": 0.02279986133738177, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007800000000000001, + "loss": 1.3933, + "step": 260 + }, + { + "epoch": 0.022887553111756314, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007830000000000001, + "loss": 1.4353, + "step": 261 + }, + { + "epoch": 0.022975244886130862, + "grad_norm": 0.10205078125, + "learning_rate": 0.000786, + "loss": 1.3784, + "step": 262 + }, + { + "epoch": 0.023062936660505406, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007890000000000001, + "loss": 1.4389, + "step": 263 + }, + { + "epoch": 0.02315062843487995, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007920000000000001, + "loss": 1.3353, + "step": 264 + }, + { + "epoch": 0.023238320209254495, + "grad_norm": 0.208984375, + "learning_rate": 0.000795, + "loss": 1.375, + "step": 265 + }, + { + "epoch": 0.023326011983629043, + "grad_norm": 0.0849609375, + "learning_rate": 0.0007980000000000001, + "loss": 1.3568, + "step": 266 + }, + { + "epoch": 0.023413703758003588, + "grad_norm": 0.15625, + "learning_rate": 0.0008010000000000001, + "loss": 1.3975, + "step": 267 + }, + { + "epoch": 0.023501395532378132, + "grad_norm": 0.09716796875, + "learning_rate": 0.000804, + "loss": 1.4104, + "step": 268 + }, + { + "epoch": 0.023589087306752677, + "grad_norm": 0.1201171875, + "learning_rate": 0.0008070000000000001, + "loss": 1.4314, + "step": 269 + }, + { + "epoch": 0.02367677908112722, + "grad_norm": 0.08935546875, + "learning_rate": 0.0008100000000000001, + "loss": 1.3977, + "step": 270 + }, + { + "epoch": 0.02376447085550177, + "grad_norm": 0.087890625, + "learning_rate": 0.000813, + "loss": 1.4509, + "step": 271 + }, + { + "epoch": 0.023852162629876314, + "grad_norm": 0.154296875, + "learning_rate": 0.0008160000000000001, + "loss": 1.4231, + "step": 272 + }, + { + "epoch": 0.02393985440425086, + "grad_norm": 0.099609375, + "learning_rate": 0.0008190000000000001, + "loss": 1.381, + "step": 273 + }, + { + "epoch": 0.024027546178625403, + "grad_norm": 0.224609375, + "learning_rate": 0.000822, + "loss": 1.423, + "step": 274 + }, + { + "epoch": 0.02411523795299995, + "grad_norm": 0.16015625, + "learning_rate": 0.0008250000000000001, + "loss": 1.4066, + "step": 275 + }, + { + "epoch": 0.024202929727374495, + "grad_norm": 0.189453125, + "learning_rate": 0.0008280000000000001, + "loss": 1.374, + "step": 276 + }, + { + "epoch": 0.02429062150174904, + "grad_norm": 0.341796875, + "learning_rate": 0.0008310000000000001, + "loss": 1.3806, + "step": 277 + }, + { + "epoch": 0.024378313276123584, + "grad_norm": 0.1298828125, + "learning_rate": 0.0008340000000000001, + "loss": 1.4127, + "step": 278 + }, + { + "epoch": 0.02446600505049813, + "grad_norm": 0.220703125, + "learning_rate": 0.0008370000000000001, + "loss": 1.3966, + "step": 279 + }, + { + "epoch": 0.024553696824872677, + "grad_norm": 0.2138671875, + "learning_rate": 0.0008400000000000001, + "loss": 1.401, + "step": 280 + }, + { + "epoch": 0.02464138859924722, + "grad_norm": 0.1513671875, + "learning_rate": 0.0008430000000000001, + "loss": 1.4582, + "step": 281 + }, + { + "epoch": 0.024729080373621766, + "grad_norm": 0.138671875, + "learning_rate": 0.000846, + "loss": 1.3452, + "step": 282 + }, + { + "epoch": 0.02481677214799631, + "grad_norm": 0.09814453125, + "learning_rate": 0.0008489999999999999, + "loss": 1.3579, + "step": 283 + }, + { + "epoch": 0.02490446392237086, + "grad_norm": 0.162109375, + "learning_rate": 0.0008519999999999999, + "loss": 1.3803, + "step": 284 + }, + { + "epoch": 0.024992155696745403, + "grad_norm": 0.1279296875, + "learning_rate": 0.000855, + "loss": 1.3182, + "step": 285 + }, + { + "epoch": 0.025079847471119947, + "grad_norm": 0.1884765625, + "learning_rate": 0.0008579999999999999, + "loss": 1.4524, + "step": 286 + }, + { + "epoch": 0.025167539245494492, + "grad_norm": 0.126953125, + "learning_rate": 0.000861, + "loss": 1.3664, + "step": 287 + }, + { + "epoch": 0.025255231019869036, + "grad_norm": 0.1640625, + "learning_rate": 0.000864, + "loss": 1.4199, + "step": 288 + }, + { + "epoch": 0.025342922794243584, + "grad_norm": 0.1591796875, + "learning_rate": 0.0008669999999999999, + "loss": 1.3351, + "step": 289 + }, + { + "epoch": 0.02543061456861813, + "grad_norm": 0.111328125, + "learning_rate": 0.00087, + "loss": 1.3552, + "step": 290 + }, + { + "epoch": 0.025518306342992673, + "grad_norm": 0.193359375, + "learning_rate": 0.000873, + "loss": 1.3583, + "step": 291 + }, + { + "epoch": 0.025605998117367218, + "grad_norm": 0.10205078125, + "learning_rate": 0.0008759999999999999, + "loss": 1.3731, + "step": 292 + }, + { + "epoch": 0.025693689891741762, + "grad_norm": 0.181640625, + "learning_rate": 0.000879, + "loss": 1.3517, + "step": 293 + }, + { + "epoch": 0.02578138166611631, + "grad_norm": 0.09521484375, + "learning_rate": 0.000882, + "loss": 1.4209, + "step": 294 + }, + { + "epoch": 0.025869073440490855, + "grad_norm": 0.1357421875, + "learning_rate": 0.0008849999999999999, + "loss": 1.3834, + "step": 295 + }, + { + "epoch": 0.0259567652148654, + "grad_norm": 0.12353515625, + "learning_rate": 0.000888, + "loss": 1.4558, + "step": 296 + }, + { + "epoch": 0.026044456989239944, + "grad_norm": 0.10791015625, + "learning_rate": 0.000891, + "loss": 1.4401, + "step": 297 + }, + { + "epoch": 0.026132148763614492, + "grad_norm": 0.1630859375, + "learning_rate": 0.0008939999999999999, + "loss": 1.3981, + "step": 298 + }, + { + "epoch": 0.026219840537989036, + "grad_norm": 0.08740234375, + "learning_rate": 0.000897, + "loss": 1.3309, + "step": 299 + }, + { + "epoch": 0.02630753231236358, + "grad_norm": 0.240234375, + "learning_rate": 0.0009, + "loss": 1.34, + "step": 300 + }, + { + "epoch": 0.026395224086738125, + "grad_norm": 0.083984375, + "learning_rate": 0.0009029999999999999, + "loss": 1.3813, + "step": 301 + }, + { + "epoch": 0.02648291586111267, + "grad_norm": 0.2041015625, + "learning_rate": 0.000906, + "loss": 1.4285, + "step": 302 + }, + { + "epoch": 0.026570607635487218, + "grad_norm": 0.11572265625, + "learning_rate": 0.000909, + "loss": 1.3756, + "step": 303 + }, + { + "epoch": 0.026658299409861762, + "grad_norm": 0.119140625, + "learning_rate": 0.000912, + "loss": 1.394, + "step": 304 + }, + { + "epoch": 0.026745991184236307, + "grad_norm": 0.1552734375, + "learning_rate": 0.000915, + "loss": 1.4289, + "step": 305 + }, + { + "epoch": 0.02683368295861085, + "grad_norm": 0.09228515625, + "learning_rate": 0.000918, + "loss": 1.3585, + "step": 306 + }, + { + "epoch": 0.0269213747329854, + "grad_norm": 0.193359375, + "learning_rate": 0.000921, + "loss": 1.3685, + "step": 307 + }, + { + "epoch": 0.027009066507359944, + "grad_norm": 0.08984375, + "learning_rate": 0.000924, + "loss": 1.3597, + "step": 308 + }, + { + "epoch": 0.02709675828173449, + "grad_norm": 0.12451171875, + "learning_rate": 0.000927, + "loss": 1.3464, + "step": 309 + }, + { + "epoch": 0.027184450056109033, + "grad_norm": 0.11865234375, + "learning_rate": 0.00093, + "loss": 1.4275, + "step": 310 + }, + { + "epoch": 0.027272141830483577, + "grad_norm": 0.10693359375, + "learning_rate": 0.000933, + "loss": 1.3538, + "step": 311 + }, + { + "epoch": 0.027359833604858125, + "grad_norm": 0.08740234375, + "learning_rate": 0.000936, + "loss": 1.3189, + "step": 312 + }, + { + "epoch": 0.02744752537923267, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009390000000000001, + "loss": 1.337, + "step": 313 + }, + { + "epoch": 0.027535217153607214, + "grad_norm": 0.099609375, + "learning_rate": 0.000942, + "loss": 1.3509, + "step": 314 + }, + { + "epoch": 0.02762290892798176, + "grad_norm": 0.10302734375, + "learning_rate": 0.000945, + "loss": 1.4155, + "step": 315 + }, + { + "epoch": 0.027710600702356307, + "grad_norm": 0.12060546875, + "learning_rate": 0.0009480000000000001, + "loss": 1.3987, + "step": 316 + }, + { + "epoch": 0.02779829247673085, + "grad_norm": 0.09765625, + "learning_rate": 0.000951, + "loss": 1.3961, + "step": 317 + }, + { + "epoch": 0.027885984251105396, + "grad_norm": 0.10888671875, + "learning_rate": 0.000954, + "loss": 1.3766, + "step": 318 + }, + { + "epoch": 0.02797367602547994, + "grad_norm": 0.1171875, + "learning_rate": 0.0009570000000000001, + "loss": 1.3738, + "step": 319 + }, + { + "epoch": 0.028061367799854485, + "grad_norm": 0.18359375, + "learning_rate": 0.00096, + "loss": 1.4111, + "step": 320 + }, + { + "epoch": 0.028149059574229033, + "grad_norm": 0.111328125, + "learning_rate": 0.000963, + "loss": 1.344, + "step": 321 + }, + { + "epoch": 0.028236751348603577, + "grad_norm": 0.2333984375, + "learning_rate": 0.0009660000000000001, + "loss": 1.3448, + "step": 322 + }, + { + "epoch": 0.028324443122978122, + "grad_norm": 0.09033203125, + "learning_rate": 0.000969, + "loss": 1.4107, + "step": 323 + }, + { + "epoch": 0.028412134897352666, + "grad_norm": 0.1708984375, + "learning_rate": 0.0009720000000000001, + "loss": 1.4008, + "step": 324 + }, + { + "epoch": 0.028499826671727214, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009750000000000001, + "loss": 1.3442, + "step": 325 + }, + { + "epoch": 0.02858751844610176, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009780000000000001, + "loss": 1.4229, + "step": 326 + }, + { + "epoch": 0.028675210220476303, + "grad_norm": 0.1123046875, + "learning_rate": 0.000981, + "loss": 1.3201, + "step": 327 + }, + { + "epoch": 0.028762901994850848, + "grad_norm": 0.10595703125, + "learning_rate": 0.000984, + "loss": 1.4149, + "step": 328 + }, + { + "epoch": 0.028850593769225392, + "grad_norm": 0.162109375, + "learning_rate": 0.000987, + "loss": 1.3379, + "step": 329 + }, + { + "epoch": 0.02893828554359994, + "grad_norm": 0.126953125, + "learning_rate": 0.00099, + "loss": 1.3845, + "step": 330 + }, + { + "epoch": 0.029025977317974485, + "grad_norm": 0.126953125, + "learning_rate": 0.0009930000000000002, + "loss": 1.3366, + "step": 331 + }, + { + "epoch": 0.02911366909234903, + "grad_norm": 0.130859375, + "learning_rate": 0.0009960000000000001, + "loss": 1.4185, + "step": 332 + }, + { + "epoch": 0.029201360866723574, + "grad_norm": 0.126953125, + "learning_rate": 0.000999, + "loss": 1.3812, + "step": 333 + }, + { + "epoch": 0.029289052641098122, + "grad_norm": 0.10400390625, + "learning_rate": 0.001002, + "loss": 1.3699, + "step": 334 + }, + { + "epoch": 0.029376744415472666, + "grad_norm": 0.1171875, + "learning_rate": 0.001005, + "loss": 1.3294, + "step": 335 + }, + { + "epoch": 0.02946443618984721, + "grad_norm": 0.1552734375, + "learning_rate": 0.001008, + "loss": 1.3399, + "step": 336 + }, + { + "epoch": 0.029552127964221755, + "grad_norm": 0.09765625, + "learning_rate": 0.0010110000000000002, + "loss": 1.3641, + "step": 337 + }, + { + "epoch": 0.0296398197385963, + "grad_norm": 0.1171875, + "learning_rate": 0.0010140000000000001, + "loss": 1.3034, + "step": 338 + }, + { + "epoch": 0.029727511512970848, + "grad_norm": 0.09375, + "learning_rate": 0.0010170000000000001, + "loss": 1.4665, + "step": 339 + }, + { + "epoch": 0.029815203287345392, + "grad_norm": 0.095703125, + "learning_rate": 0.00102, + "loss": 1.362, + "step": 340 + }, + { + "epoch": 0.029902895061719937, + "grad_norm": 0.1328125, + "learning_rate": 0.001023, + "loss": 1.4033, + "step": 341 + }, + { + "epoch": 0.02999058683609448, + "grad_norm": 0.091796875, + "learning_rate": 0.001026, + "loss": 1.3324, + "step": 342 + }, + { + "epoch": 0.030078278610469026, + "grad_norm": 0.130859375, + "learning_rate": 0.0010290000000000002, + "loss": 1.3564, + "step": 343 + }, + { + "epoch": 0.030165970384843574, + "grad_norm": 0.1044921875, + "learning_rate": 0.001032, + "loss": 1.4015, + "step": 344 + }, + { + "epoch": 0.03025366215921812, + "grad_norm": 0.1044921875, + "learning_rate": 0.001035, + "loss": 1.4183, + "step": 345 + }, + { + "epoch": 0.030341353933592663, + "grad_norm": 0.09716796875, + "learning_rate": 0.0010379999999999999, + "loss": 1.402, + "step": 346 + }, + { + "epoch": 0.030429045707967207, + "grad_norm": 0.1279296875, + "learning_rate": 0.001041, + "loss": 1.4356, + "step": 347 + }, + { + "epoch": 0.030516737482341755, + "grad_norm": 0.10888671875, + "learning_rate": 0.001044, + "loss": 1.3444, + "step": 348 + }, + { + "epoch": 0.0306044292567163, + "grad_norm": 0.1005859375, + "learning_rate": 0.001047, + "loss": 1.2408, + "step": 349 + }, + { + "epoch": 0.030692121031090844, + "grad_norm": 0.1181640625, + "learning_rate": 0.00105, + "loss": 1.3412, + "step": 350 + }, + { + "epoch": 0.03077981280546539, + "grad_norm": 0.107421875, + "learning_rate": 0.001053, + "loss": 1.3695, + "step": 351 + }, + { + "epoch": 0.030867504579839933, + "grad_norm": 0.10498046875, + "learning_rate": 0.0010559999999999999, + "loss": 1.2612, + "step": 352 + }, + { + "epoch": 0.03095519635421448, + "grad_norm": 0.126953125, + "learning_rate": 0.001059, + "loss": 1.3209, + "step": 353 + }, + { + "epoch": 0.031042888128589026, + "grad_norm": 0.11181640625, + "learning_rate": 0.001062, + "loss": 1.3619, + "step": 354 + }, + { + "epoch": 0.03113057990296357, + "grad_norm": 0.11669921875, + "learning_rate": 0.001065, + "loss": 1.3898, + "step": 355 + }, + { + "epoch": 0.031218271677338115, + "grad_norm": 0.1083984375, + "learning_rate": 0.001068, + "loss": 1.3756, + "step": 356 + }, + { + "epoch": 0.03130596345171266, + "grad_norm": 0.09814453125, + "learning_rate": 0.001071, + "loss": 1.3883, + "step": 357 + }, + { + "epoch": 0.031393655226087204, + "grad_norm": 0.1015625, + "learning_rate": 0.001074, + "loss": 1.3457, + "step": 358 + }, + { + "epoch": 0.031481347000461755, + "grad_norm": 0.1318359375, + "learning_rate": 0.001077, + "loss": 1.3499, + "step": 359 + }, + { + "epoch": 0.0315690387748363, + "grad_norm": 0.125, + "learning_rate": 0.00108, + "loss": 1.385, + "step": 360 + }, + { + "epoch": 0.031656730549210844, + "grad_norm": 0.0966796875, + "learning_rate": 0.001083, + "loss": 1.3921, + "step": 361 + }, + { + "epoch": 0.03174442232358539, + "grad_norm": 0.10888671875, + "learning_rate": 0.001086, + "loss": 1.3615, + "step": 362 + }, + { + "epoch": 0.03183211409795993, + "grad_norm": 0.1376953125, + "learning_rate": 0.001089, + "loss": 1.3125, + "step": 363 + }, + { + "epoch": 0.03191980587233448, + "grad_norm": 0.2080078125, + "learning_rate": 0.001092, + "loss": 1.3186, + "step": 364 + }, + { + "epoch": 0.03200749764670902, + "grad_norm": 0.09375, + "learning_rate": 0.001095, + "loss": 1.364, + "step": 365 + }, + { + "epoch": 0.03209518942108357, + "grad_norm": 0.205078125, + "learning_rate": 0.001098, + "loss": 1.4273, + "step": 366 + }, + { + "epoch": 0.03218288119545811, + "grad_norm": 0.09375, + "learning_rate": 0.001101, + "loss": 1.3585, + "step": 367 + }, + { + "epoch": 0.032270572969832656, + "grad_norm": 0.111328125, + "learning_rate": 0.001104, + "loss": 1.3809, + "step": 368 + }, + { + "epoch": 0.03235826474420721, + "grad_norm": 0.11572265625, + "learning_rate": 0.001107, + "loss": 1.4134, + "step": 369 + }, + { + "epoch": 0.03244595651858175, + "grad_norm": 0.154296875, + "learning_rate": 0.00111, + "loss": 1.3046, + "step": 370 + }, + { + "epoch": 0.032533648292956296, + "grad_norm": 0.1455078125, + "learning_rate": 0.001113, + "loss": 1.44, + "step": 371 + }, + { + "epoch": 0.03262134006733084, + "grad_norm": 0.10400390625, + "learning_rate": 0.001116, + "loss": 1.3767, + "step": 372 + }, + { + "epoch": 0.032709031841705385, + "grad_norm": 0.10400390625, + "learning_rate": 0.001119, + "loss": 1.3568, + "step": 373 + }, + { + "epoch": 0.03279672361607993, + "grad_norm": 0.10302734375, + "learning_rate": 0.001122, + "loss": 1.2876, + "step": 374 + }, + { + "epoch": 0.032884415390454474, + "grad_norm": 0.09326171875, + "learning_rate": 0.0011250000000000001, + "loss": 1.3378, + "step": 375 + }, + { + "epoch": 0.03297210716482902, + "grad_norm": 0.12255859375, + "learning_rate": 0.001128, + "loss": 1.2907, + "step": 376 + }, + { + "epoch": 0.033059798939203563, + "grad_norm": 0.091796875, + "learning_rate": 0.001131, + "loss": 1.3901, + "step": 377 + }, + { + "epoch": 0.033147490713578115, + "grad_norm": 0.12158203125, + "learning_rate": 0.001134, + "loss": 1.3821, + "step": 378 + }, + { + "epoch": 0.03323518248795266, + "grad_norm": 0.083984375, + "learning_rate": 0.001137, + "loss": 1.3682, + "step": 379 + }, + { + "epoch": 0.033322874262327204, + "grad_norm": 0.1123046875, + "learning_rate": 0.00114, + "loss": 1.3609, + "step": 380 + }, + { + "epoch": 0.03341056603670175, + "grad_norm": 0.0966796875, + "learning_rate": 0.0011430000000000001, + "loss": 1.3734, + "step": 381 + }, + { + "epoch": 0.03349825781107629, + "grad_norm": 0.1591796875, + "learning_rate": 0.001146, + "loss": 1.426, + "step": 382 + }, + { + "epoch": 0.03358594958545084, + "grad_norm": 0.1376953125, + "learning_rate": 0.001149, + "loss": 1.3942, + "step": 383 + }, + { + "epoch": 0.03367364135982538, + "grad_norm": 0.0908203125, + "learning_rate": 0.001152, + "loss": 1.3952, + "step": 384 + }, + { + "epoch": 0.033761333134199926, + "grad_norm": 0.09033203125, + "learning_rate": 0.001155, + "loss": 1.4302, + "step": 385 + }, + { + "epoch": 0.03384902490857447, + "grad_norm": 0.10009765625, + "learning_rate": 0.001158, + "loss": 1.3961, + "step": 386 + }, + { + "epoch": 0.03393671668294902, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011610000000000001, + "loss": 1.3277, + "step": 387 + }, + { + "epoch": 0.03402440845732357, + "grad_norm": 0.09765625, + "learning_rate": 0.001164, + "loss": 1.2633, + "step": 388 + }, + { + "epoch": 0.03411210023169811, + "grad_norm": 0.1220703125, + "learning_rate": 0.001167, + "loss": 1.3971, + "step": 389 + }, + { + "epoch": 0.034199792006072656, + "grad_norm": 0.09619140625, + "learning_rate": 0.00117, + "loss": 1.3659, + "step": 390 + }, + { + "epoch": 0.0342874837804472, + "grad_norm": 0.1640625, + "learning_rate": 0.001173, + "loss": 1.4147, + "step": 391 + }, + { + "epoch": 0.034375175554821745, + "grad_norm": 0.10595703125, + "learning_rate": 0.001176, + "loss": 1.3504, + "step": 392 + }, + { + "epoch": 0.03446286732919629, + "grad_norm": 0.15625, + "learning_rate": 0.0011790000000000001, + "loss": 1.3842, + "step": 393 + }, + { + "epoch": 0.034550559103570834, + "grad_norm": 0.11279296875, + "learning_rate": 0.001182, + "loss": 1.4277, + "step": 394 + }, + { + "epoch": 0.03463825087794538, + "grad_norm": 0.142578125, + "learning_rate": 0.001185, + "loss": 1.3052, + "step": 395 + }, + { + "epoch": 0.03472594265231993, + "grad_norm": 0.140625, + "learning_rate": 0.001188, + "loss": 1.41, + "step": 396 + }, + { + "epoch": 0.034813634426694474, + "grad_norm": 0.154296875, + "learning_rate": 0.001191, + "loss": 1.3605, + "step": 397 + }, + { + "epoch": 0.03490132620106902, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011940000000000002, + "loss": 1.3762, + "step": 398 + }, + { + "epoch": 0.03498901797544356, + "grad_norm": 0.1845703125, + "learning_rate": 0.0011970000000000001, + "loss": 1.2598, + "step": 399 + }, + { + "epoch": 0.03507670974981811, + "grad_norm": 0.2236328125, + "learning_rate": 0.0012000000000000001, + "loss": 1.352, + "step": 400 + }, + { + "epoch": 0.03516440152419265, + "grad_norm": 0.1259765625, + "learning_rate": 0.001203, + "loss": 1.3672, + "step": 401 + }, + { + "epoch": 0.0352520932985672, + "grad_norm": 0.1767578125, + "learning_rate": 0.001206, + "loss": 1.4493, + "step": 402 + }, + { + "epoch": 0.03533978507294174, + "grad_norm": 0.2236328125, + "learning_rate": 0.001209, + "loss": 1.3753, + "step": 403 + }, + { + "epoch": 0.035427476847316286, + "grad_norm": 0.15234375, + "learning_rate": 0.0012120000000000002, + "loss": 1.3092, + "step": 404 + }, + { + "epoch": 0.03551516862169084, + "grad_norm": 0.12890625, + "learning_rate": 0.0012150000000000002, + "loss": 1.3083, + "step": 405 + }, + { + "epoch": 0.03560286039606538, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012180000000000001, + "loss": 1.2666, + "step": 406 + }, + { + "epoch": 0.035690552170439926, + "grad_norm": 0.18359375, + "learning_rate": 0.0012209999999999999, + "loss": 1.31, + "step": 407 + }, + { + "epoch": 0.03577824394481447, + "grad_norm": 0.130859375, + "learning_rate": 0.001224, + "loss": 1.3723, + "step": 408 + }, + { + "epoch": 0.035865935719189015, + "grad_norm": 0.10400390625, + "learning_rate": 0.001227, + "loss": 1.296, + "step": 409 + }, + { + "epoch": 0.03595362749356356, + "grad_norm": 0.1689453125, + "learning_rate": 0.00123, + "loss": 1.3209, + "step": 410 + }, + { + "epoch": 0.036041319267938104, + "grad_norm": 0.0869140625, + "learning_rate": 0.001233, + "loss": 1.3439, + "step": 411 + }, + { + "epoch": 0.03612901104231265, + "grad_norm": 0.1640625, + "learning_rate": 0.001236, + "loss": 1.361, + "step": 412 + }, + { + "epoch": 0.036216702816687193, + "grad_norm": 0.150390625, + "learning_rate": 0.0012389999999999999, + "loss": 1.3845, + "step": 413 + }, + { + "epoch": 0.036304394591061745, + "grad_norm": 0.12255859375, + "learning_rate": 0.001242, + "loss": 1.3486, + "step": 414 + }, + { + "epoch": 0.03639208636543629, + "grad_norm": 0.173828125, + "learning_rate": 0.001245, + "loss": 1.3611, + "step": 415 + }, + { + "epoch": 0.036479778139810834, + "grad_norm": 0.158203125, + "learning_rate": 0.001248, + "loss": 1.3968, + "step": 416 + }, + { + "epoch": 0.03656746991418538, + "grad_norm": 0.1904296875, + "learning_rate": 0.001251, + "loss": 1.3516, + "step": 417 + }, + { + "epoch": 0.03665516168855992, + "grad_norm": 0.216796875, + "learning_rate": 0.001254, + "loss": 1.3711, + "step": 418 + }, + { + "epoch": 0.03674285346293447, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012569999999999999, + "loss": 1.3854, + "step": 419 + }, + { + "epoch": 0.03683054523730901, + "grad_norm": 0.1650390625, + "learning_rate": 0.00126, + "loss": 1.3558, + "step": 420 + }, + { + "epoch": 0.036918237011683556, + "grad_norm": 0.1728515625, + "learning_rate": 0.001263, + "loss": 1.2976, + "step": 421 + }, + { + "epoch": 0.0370059287860581, + "grad_norm": 0.103515625, + "learning_rate": 0.001266, + "loss": 1.3582, + "step": 422 + }, + { + "epoch": 0.03709362056043265, + "grad_norm": 0.1689453125, + "learning_rate": 0.001269, + "loss": 1.3893, + "step": 423 + }, + { + "epoch": 0.0371813123348072, + "grad_norm": 0.10986328125, + "learning_rate": 0.001272, + "loss": 1.2884, + "step": 424 + }, + { + "epoch": 0.03726900410918174, + "grad_norm": 0.1181640625, + "learning_rate": 0.001275, + "loss": 1.3434, + "step": 425 + }, + { + "epoch": 0.037356695883556286, + "grad_norm": 0.1044921875, + "learning_rate": 0.001278, + "loss": 1.3381, + "step": 426 + }, + { + "epoch": 0.03744438765793083, + "grad_norm": 0.140625, + "learning_rate": 0.001281, + "loss": 1.3903, + "step": 427 + }, + { + "epoch": 0.037532079432305375, + "grad_norm": 0.08935546875, + "learning_rate": 0.001284, + "loss": 1.3506, + "step": 428 + }, + { + "epoch": 0.03761977120667992, + "grad_norm": 0.09814453125, + "learning_rate": 0.001287, + "loss": 1.2621, + "step": 429 + }, + { + "epoch": 0.037707462981054464, + "grad_norm": 0.09814453125, + "learning_rate": 0.00129, + "loss": 1.3424, + "step": 430 + }, + { + "epoch": 0.03779515475542901, + "grad_norm": 0.1162109375, + "learning_rate": 0.001293, + "loss": 1.4801, + "step": 431 + }, + { + "epoch": 0.03788284652980356, + "grad_norm": 0.09814453125, + "learning_rate": 0.001296, + "loss": 1.3269, + "step": 432 + }, + { + "epoch": 0.037970538304178104, + "grad_norm": 0.138671875, + "learning_rate": 0.001299, + "loss": 1.3656, + "step": 433 + }, + { + "epoch": 0.03805823007855265, + "grad_norm": 0.08837890625, + "learning_rate": 0.001302, + "loss": 1.3883, + "step": 434 + }, + { + "epoch": 0.03814592185292719, + "grad_norm": 0.1044921875, + "learning_rate": 0.001305, + "loss": 1.3636, + "step": 435 + }, + { + "epoch": 0.03823361362730174, + "grad_norm": 0.09130859375, + "learning_rate": 0.001308, + "loss": 1.3846, + "step": 436 + }, + { + "epoch": 0.03832130540167628, + "grad_norm": 0.1201171875, + "learning_rate": 0.001311, + "loss": 1.3506, + "step": 437 + }, + { + "epoch": 0.03840899717605083, + "grad_norm": 0.0859375, + "learning_rate": 0.001314, + "loss": 1.3023, + "step": 438 + }, + { + "epoch": 0.03849668895042537, + "grad_norm": 0.109375, + "learning_rate": 0.001317, + "loss": 1.3817, + "step": 439 + }, + { + "epoch": 0.038584380724799916, + "grad_norm": 0.1435546875, + "learning_rate": 0.00132, + "loss": 1.3687, + "step": 440 + }, + { + "epoch": 0.03867207249917447, + "grad_norm": 0.13671875, + "learning_rate": 0.001323, + "loss": 1.3563, + "step": 441 + }, + { + "epoch": 0.03875976427354901, + "grad_norm": 0.08837890625, + "learning_rate": 0.0013260000000000001, + "loss": 1.3338, + "step": 442 + }, + { + "epoch": 0.038847456047923556, + "grad_norm": 0.1103515625, + "learning_rate": 0.001329, + "loss": 1.3922, + "step": 443 + }, + { + "epoch": 0.0389351478222981, + "grad_norm": 0.12158203125, + "learning_rate": 0.001332, + "loss": 1.2568, + "step": 444 + }, + { + "epoch": 0.039022839596672645, + "grad_norm": 0.1337890625, + "learning_rate": 0.001335, + "loss": 1.3628, + "step": 445 + }, + { + "epoch": 0.03911053137104719, + "grad_norm": 0.1064453125, + "learning_rate": 0.001338, + "loss": 1.3751, + "step": 446 + }, + { + "epoch": 0.039198223145421734, + "grad_norm": 0.126953125, + "learning_rate": 0.001341, + "loss": 1.3495, + "step": 447 + }, + { + "epoch": 0.03928591491979628, + "grad_norm": 0.1474609375, + "learning_rate": 0.0013440000000000001, + "loss": 1.317, + "step": 448 + }, + { + "epoch": 0.039373606694170823, + "grad_norm": 0.09765625, + "learning_rate": 0.001347, + "loss": 1.3535, + "step": 449 + }, + { + "epoch": 0.039461298468545375, + "grad_norm": 0.12890625, + "learning_rate": 0.00135, + "loss": 1.2891, + "step": 450 + }, + { + "epoch": 0.03954899024291992, + "grad_norm": 0.1376953125, + "learning_rate": 0.001353, + "loss": 1.3959, + "step": 451 + }, + { + "epoch": 0.039636682017294464, + "grad_norm": 0.154296875, + "learning_rate": 0.001356, + "loss": 1.3192, + "step": 452 + }, + { + "epoch": 0.03972437379166901, + "grad_norm": 0.1572265625, + "learning_rate": 0.001359, + "loss": 1.4324, + "step": 453 + }, + { + "epoch": 0.03981206556604355, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013620000000000001, + "loss": 1.3246, + "step": 454 + }, + { + "epoch": 0.0398997573404181, + "grad_norm": 0.138671875, + "learning_rate": 0.0013650000000000001, + "loss": 1.2848, + "step": 455 + }, + { + "epoch": 0.03998744911479264, + "grad_norm": 0.142578125, + "learning_rate": 0.001368, + "loss": 1.3973, + "step": 456 + }, + { + "epoch": 0.040075140889167186, + "grad_norm": 0.1572265625, + "learning_rate": 0.001371, + "loss": 1.3794, + "step": 457 + }, + { + "epoch": 0.04016283266354173, + "grad_norm": 0.099609375, + "learning_rate": 0.001374, + "loss": 1.2708, + "step": 458 + }, + { + "epoch": 0.04025052443791628, + "grad_norm": 0.171875, + "learning_rate": 0.0013770000000000002, + "loss": 1.3587, + "step": 459 + }, + { + "epoch": 0.04033821621229083, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013800000000000002, + "loss": 1.2727, + "step": 460 + }, + { + "epoch": 0.04042590798666537, + "grad_norm": 0.171875, + "learning_rate": 0.0013830000000000001, + "loss": 1.3551, + "step": 461 + }, + { + "epoch": 0.040513599761039916, + "grad_norm": 0.1328125, + "learning_rate": 0.001386, + "loss": 1.3897, + "step": 462 + }, + { + "epoch": 0.04060129153541446, + "grad_norm": 0.14453125, + "learning_rate": 0.001389, + "loss": 1.3319, + "step": 463 + }, + { + "epoch": 0.040688983309789005, + "grad_norm": 0.16796875, + "learning_rate": 0.001392, + "loss": 1.2967, + "step": 464 + }, + { + "epoch": 0.04077667508416355, + "grad_norm": 0.138671875, + "learning_rate": 0.0013950000000000002, + "loss": 1.3376, + "step": 465 + }, + { + "epoch": 0.040864366858538094, + "grad_norm": 0.146484375, + "learning_rate": 0.0013980000000000002, + "loss": 1.3628, + "step": 466 + }, + { + "epoch": 0.04095205863291264, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014010000000000001, + "loss": 1.2868, + "step": 467 + }, + { + "epoch": 0.04103975040728718, + "grad_norm": 0.0947265625, + "learning_rate": 0.001404, + "loss": 1.3618, + "step": 468 + }, + { + "epoch": 0.041127442181661734, + "grad_norm": 0.2197265625, + "learning_rate": 0.001407, + "loss": 1.3419, + "step": 469 + }, + { + "epoch": 0.04121513395603628, + "grad_norm": 0.09765625, + "learning_rate": 0.00141, + "loss": 1.3019, + "step": 470 + }, + { + "epoch": 0.04130282573041082, + "grad_norm": 0.2470703125, + "learning_rate": 0.001413, + "loss": 1.4217, + "step": 471 + }, + { + "epoch": 0.04139051750478537, + "grad_norm": 0.0986328125, + "learning_rate": 0.001416, + "loss": 1.3689, + "step": 472 + }, + { + "epoch": 0.04147820927915991, + "grad_norm": 0.173828125, + "learning_rate": 0.001419, + "loss": 1.3036, + "step": 473 + }, + { + "epoch": 0.04156590105353446, + "grad_norm": 0.0986328125, + "learning_rate": 0.0014219999999999999, + "loss": 1.3835, + "step": 474 + }, + { + "epoch": 0.041653592827909, + "grad_norm": 0.1611328125, + "learning_rate": 0.001425, + "loss": 1.2816, + "step": 475 + }, + { + "epoch": 0.041741284602283546, + "grad_norm": 0.107421875, + "learning_rate": 0.001428, + "loss": 1.3424, + "step": 476 + }, + { + "epoch": 0.04182897637665809, + "grad_norm": 0.1044921875, + "learning_rate": 0.001431, + "loss": 1.3255, + "step": 477 + }, + { + "epoch": 0.04191666815103264, + "grad_norm": 0.11376953125, + "learning_rate": 0.001434, + "loss": 1.358, + "step": 478 + }, + { + "epoch": 0.042004359925407186, + "grad_norm": 0.103515625, + "learning_rate": 0.001437, + "loss": 1.3379, + "step": 479 + }, + { + "epoch": 0.04209205169978173, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014399999999999999, + "loss": 1.2963, + "step": 480 + }, + { + "epoch": 0.042179743474156275, + "grad_norm": 0.08984375, + "learning_rate": 0.001443, + "loss": 1.2964, + "step": 481 + }, + { + "epoch": 0.04226743524853082, + "grad_norm": 0.08837890625, + "learning_rate": 0.001446, + "loss": 1.2804, + "step": 482 + }, + { + "epoch": 0.042355127022905364, + "grad_norm": 0.1181640625, + "learning_rate": 0.001449, + "loss": 1.353, + "step": 483 + }, + { + "epoch": 0.04244281879727991, + "grad_norm": 0.140625, + "learning_rate": 0.001452, + "loss": 1.2805, + "step": 484 + }, + { + "epoch": 0.042530510571654453, + "grad_norm": 0.130859375, + "learning_rate": 0.001455, + "loss": 1.3353, + "step": 485 + }, + { + "epoch": 0.042618202346029, + "grad_norm": 0.1845703125, + "learning_rate": 0.001458, + "loss": 1.4368, + "step": 486 + }, + { + "epoch": 0.04270589412040355, + "grad_norm": 0.1416015625, + "learning_rate": 0.001461, + "loss": 1.355, + "step": 487 + }, + { + "epoch": 0.042793585894778094, + "grad_norm": 0.1318359375, + "learning_rate": 0.001464, + "loss": 1.3571, + "step": 488 + }, + { + "epoch": 0.04288127766915264, + "grad_norm": 0.12890625, + "learning_rate": 0.001467, + "loss": 1.3144, + "step": 489 + }, + { + "epoch": 0.04296896944352718, + "grad_norm": 0.09716796875, + "learning_rate": 0.00147, + "loss": 1.3431, + "step": 490 + }, + { + "epoch": 0.04305666121790173, + "grad_norm": 0.119140625, + "learning_rate": 0.001473, + "loss": 1.3331, + "step": 491 + }, + { + "epoch": 0.04314435299227627, + "grad_norm": 0.10205078125, + "learning_rate": 0.001476, + "loss": 1.3873, + "step": 492 + }, + { + "epoch": 0.043232044766650816, + "grad_norm": 0.1181640625, + "learning_rate": 0.001479, + "loss": 1.3456, + "step": 493 + }, + { + "epoch": 0.04331973654102536, + "grad_norm": 0.10107421875, + "learning_rate": 0.001482, + "loss": 1.354, + "step": 494 + }, + { + "epoch": 0.043407428315399905, + "grad_norm": 0.1513671875, + "learning_rate": 0.001485, + "loss": 1.3155, + "step": 495 + }, + { + "epoch": 0.04349512008977446, + "grad_norm": 0.1318359375, + "learning_rate": 0.001488, + "loss": 1.3318, + "step": 496 + }, + { + "epoch": 0.043582811864149, + "grad_norm": 0.1318359375, + "learning_rate": 0.001491, + "loss": 1.3467, + "step": 497 + }, + { + "epoch": 0.043670503638523546, + "grad_norm": 0.1005859375, + "learning_rate": 0.001494, + "loss": 1.33, + "step": 498 + }, + { + "epoch": 0.04375819541289809, + "grad_norm": 0.09765625, + "learning_rate": 0.001497, + "loss": 1.3274, + "step": 499 + }, + { + "epoch": 0.043845887187272635, + "grad_norm": 0.138671875, + "learning_rate": 0.0015, + "loss": 1.3365, + "step": 500 + }, + { + "epoch": 0.043845887187272635, + "eval_loss": 1.3519667387008667, + "eval_runtime": 427.9384, + "eval_samples_per_second": 33.76, + "eval_steps_per_second": 8.44, + "step": 500 + }, + { + "epoch": 0.04393357896164718, + "grad_norm": 0.1767578125, + "learning_rate": 0.001503, + "loss": 1.3827, + "step": 501 + }, + { + "epoch": 0.044021270736021724, + "grad_norm": 0.08984375, + "learning_rate": 0.001506, + "loss": 1.3601, + "step": 502 + }, + { + "epoch": 0.04410896251039627, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015090000000000001, + "loss": 1.3613, + "step": 503 + }, + { + "epoch": 0.04419665428477081, + "grad_norm": 0.15234375, + "learning_rate": 0.001512, + "loss": 1.3476, + "step": 504 + }, + { + "epoch": 0.044284346059145364, + "grad_norm": 0.09619140625, + "learning_rate": 0.001515, + "loss": 1.382, + "step": 505 + }, + { + "epoch": 0.04437203783351991, + "grad_norm": 0.134765625, + "learning_rate": 0.001518, + "loss": 1.3764, + "step": 506 + }, + { + "epoch": 0.04445972960789445, + "grad_norm": 0.09130859375, + "learning_rate": 0.001521, + "loss": 1.265, + "step": 507 + }, + { + "epoch": 0.044547421382269, + "grad_norm": 0.0908203125, + "learning_rate": 0.001524, + "loss": 1.3309, + "step": 508 + }, + { + "epoch": 0.04463511315664354, + "grad_norm": 0.095703125, + "learning_rate": 0.0015270000000000001, + "loss": 1.3333, + "step": 509 + }, + { + "epoch": 0.04472280493101809, + "grad_norm": 0.09765625, + "learning_rate": 0.0015300000000000001, + "loss": 1.3741, + "step": 510 + }, + { + "epoch": 0.04481049670539263, + "grad_norm": 0.1279296875, + "learning_rate": 0.001533, + "loss": 1.3363, + "step": 511 + }, + { + "epoch": 0.044898188479767176, + "grad_norm": 0.12890625, + "learning_rate": 0.001536, + "loss": 1.3045, + "step": 512 + }, + { + "epoch": 0.04498588025414172, + "grad_norm": 0.15625, + "learning_rate": 0.001539, + "loss": 1.4171, + "step": 513 + }, + { + "epoch": 0.04507357202851627, + "grad_norm": 0.0849609375, + "learning_rate": 0.001542, + "loss": 1.3343, + "step": 514 + }, + { + "epoch": 0.045161263802890816, + "grad_norm": 0.1640625, + "learning_rate": 0.0015450000000000001, + "loss": 1.3433, + "step": 515 + }, + { + "epoch": 0.04524895557726536, + "grad_norm": 0.09912109375, + "learning_rate": 0.0015480000000000001, + "loss": 1.281, + "step": 516 + }, + { + "epoch": 0.045336647351639905, + "grad_norm": 0.1171875, + "learning_rate": 0.001551, + "loss": 1.3229, + "step": 517 + }, + { + "epoch": 0.04542433912601445, + "grad_norm": 0.0869140625, + "learning_rate": 0.001554, + "loss": 1.3284, + "step": 518 + }, + { + "epoch": 0.045512030900388994, + "grad_norm": 0.11767578125, + "learning_rate": 0.001557, + "loss": 1.3185, + "step": 519 + }, + { + "epoch": 0.04559972267476354, + "grad_norm": 0.12255859375, + "learning_rate": 0.0015600000000000002, + "loss": 1.4157, + "step": 520 + }, + { + "epoch": 0.045687414449138083, + "grad_norm": 0.1171875, + "learning_rate": 0.0015630000000000002, + "loss": 1.3225, + "step": 521 + }, + { + "epoch": 0.04577510622351263, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015660000000000001, + "loss": 1.2768, + "step": 522 + }, + { + "epoch": 0.04586279799788718, + "grad_norm": 0.1083984375, + "learning_rate": 0.001569, + "loss": 1.3378, + "step": 523 + }, + { + "epoch": 0.045950489772261724, + "grad_norm": 0.2158203125, + "learning_rate": 0.001572, + "loss": 1.3312, + "step": 524 + }, + { + "epoch": 0.04603818154663627, + "grad_norm": 0.146484375, + "learning_rate": 0.001575, + "loss": 1.3473, + "step": 525 + }, + { + "epoch": 0.04612587332101081, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015780000000000002, + "loss": 1.3193, + "step": 526 + }, + { + "epoch": 0.04621356509538536, + "grad_norm": 0.166015625, + "learning_rate": 0.0015810000000000002, + "loss": 1.3617, + "step": 527 + }, + { + "epoch": 0.0463012568697599, + "grad_norm": 0.146484375, + "learning_rate": 0.0015840000000000001, + "loss": 1.4205, + "step": 528 + }, + { + "epoch": 0.046388948644134446, + "grad_norm": 0.1943359375, + "learning_rate": 0.001587, + "loss": 1.3412, + "step": 529 + }, + { + "epoch": 0.04647664041850899, + "grad_norm": 0.138671875, + "learning_rate": 0.00159, + "loss": 1.3731, + "step": 530 + }, + { + "epoch": 0.046564332192883535, + "grad_norm": 0.158203125, + "learning_rate": 0.001593, + "loss": 1.3481, + "step": 531 + }, + { + "epoch": 0.04665202396725809, + "grad_norm": 0.11865234375, + "learning_rate": 0.0015960000000000002, + "loss": 1.3438, + "step": 532 + }, + { + "epoch": 0.04673971574163263, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015990000000000002, + "loss": 1.3893, + "step": 533 + }, + { + "epoch": 0.046827407516007176, + "grad_norm": 0.12060546875, + "learning_rate": 0.0016020000000000001, + "loss": 1.3365, + "step": 534 + }, + { + "epoch": 0.04691509929038172, + "grad_norm": 0.177734375, + "learning_rate": 0.001605, + "loss": 1.3429, + "step": 535 + }, + { + "epoch": 0.047002791064756265, + "grad_norm": 0.109375, + "learning_rate": 0.001608, + "loss": 1.3629, + "step": 536 + }, + { + "epoch": 0.04709048283913081, + "grad_norm": 0.177734375, + "learning_rate": 0.0016110000000000002, + "loss": 1.2756, + "step": 537 + }, + { + "epoch": 0.047178174613505354, + "grad_norm": 0.1103515625, + "learning_rate": 0.0016140000000000002, + "loss": 1.3064, + "step": 538 + }, + { + "epoch": 0.0472658663878799, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016170000000000002, + "loss": 1.3871, + "step": 539 + }, + { + "epoch": 0.04735355816225444, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016200000000000001, + "loss": 1.2368, + "step": 540 + }, + { + "epoch": 0.047441249936628994, + "grad_norm": 0.1572265625, + "learning_rate": 0.001623, + "loss": 1.376, + "step": 541 + }, + { + "epoch": 0.04752894171100354, + "grad_norm": 0.16015625, + "learning_rate": 0.001626, + "loss": 1.3061, + "step": 542 + }, + { + "epoch": 0.04761663348537808, + "grad_norm": 0.130859375, + "learning_rate": 0.0016290000000000002, + "loss": 1.2628, + "step": 543 + }, + { + "epoch": 0.04770432525975263, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016320000000000002, + "loss": 1.2792, + "step": 544 + }, + { + "epoch": 0.04779201703412717, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016350000000000002, + "loss": 1.285, + "step": 545 + }, + { + "epoch": 0.04787970880850172, + "grad_norm": 0.193359375, + "learning_rate": 0.0016380000000000001, + "loss": 1.3308, + "step": 546 + }, + { + "epoch": 0.04796740058287626, + "grad_norm": 0.10302734375, + "learning_rate": 0.001641, + "loss": 1.3833, + "step": 547 + }, + { + "epoch": 0.048055092357250806, + "grad_norm": 0.2080078125, + "learning_rate": 0.001644, + "loss": 1.3428, + "step": 548 + }, + { + "epoch": 0.04814278413162535, + "grad_norm": 0.083984375, + "learning_rate": 0.0016470000000000002, + "loss": 1.3376, + "step": 549 + }, + { + "epoch": 0.0482304759059999, + "grad_norm": 0.099609375, + "learning_rate": 0.0016500000000000002, + "loss": 1.4305, + "step": 550 + }, + { + "epoch": 0.048318167680374446, + "grad_norm": 0.140625, + "learning_rate": 0.0016530000000000002, + "loss": 1.3828, + "step": 551 + }, + { + "epoch": 0.04840585945474899, + "grad_norm": 0.09716796875, + "learning_rate": 0.0016560000000000001, + "loss": 1.323, + "step": 552 + }, + { + "epoch": 0.048493551229123535, + "grad_norm": 0.12890625, + "learning_rate": 0.001659, + "loss": 1.3733, + "step": 553 + }, + { + "epoch": 0.04858124300349808, + "grad_norm": 0.08837890625, + "learning_rate": 0.0016620000000000003, + "loss": 1.3101, + "step": 554 + }, + { + "epoch": 0.048668934777872624, + "grad_norm": 0.1064453125, + "learning_rate": 0.0016650000000000002, + "loss": 1.3806, + "step": 555 + }, + { + "epoch": 0.04875662655224717, + "grad_norm": 0.119140625, + "learning_rate": 0.0016680000000000002, + "loss": 1.321, + "step": 556 + }, + { + "epoch": 0.04884431832662171, + "grad_norm": 0.1416015625, + "learning_rate": 0.0016710000000000002, + "loss": 1.3756, + "step": 557 + }, + { + "epoch": 0.04893201010099626, + "grad_norm": 0.1572265625, + "learning_rate": 0.0016740000000000001, + "loss": 1.3487, + "step": 558 + }, + { + "epoch": 0.04901970187537081, + "grad_norm": 0.11669921875, + "learning_rate": 0.001677, + "loss": 1.2843, + "step": 559 + }, + { + "epoch": 0.049107393649745354, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016800000000000003, + "loss": 1.3796, + "step": 560 + }, + { + "epoch": 0.0491950854241199, + "grad_norm": 0.08056640625, + "learning_rate": 0.0016830000000000003, + "loss": 1.3371, + "step": 561 + }, + { + "epoch": 0.04928277719849444, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016860000000000002, + "loss": 1.3571, + "step": 562 + }, + { + "epoch": 0.04937046897286899, + "grad_norm": 0.140625, + "learning_rate": 0.001689, + "loss": 1.3039, + "step": 563 + }, + { + "epoch": 0.04945816074724353, + "grad_norm": 0.0888671875, + "learning_rate": 0.001692, + "loss": 1.3606, + "step": 564 + }, + { + "epoch": 0.049545852521618076, + "grad_norm": 0.1259765625, + "learning_rate": 0.001695, + "loss": 1.348, + "step": 565 + }, + { + "epoch": 0.04963354429599262, + "grad_norm": 0.10400390625, + "learning_rate": 0.0016979999999999999, + "loss": 1.291, + "step": 566 + }, + { + "epoch": 0.049721236070367165, + "grad_norm": 0.142578125, + "learning_rate": 0.0017009999999999998, + "loss": 1.3581, + "step": 567 + }, + { + "epoch": 0.04980892784474172, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017039999999999998, + "loss": 1.311, + "step": 568 + }, + { + "epoch": 0.04989661961911626, + "grad_norm": 0.1416015625, + "learning_rate": 0.001707, + "loss": 1.3596, + "step": 569 + }, + { + "epoch": 0.049984311393490806, + "grad_norm": 0.11376953125, + "learning_rate": 0.00171, + "loss": 1.3427, + "step": 570 + }, + { + "epoch": 0.05007200316786535, + "grad_norm": 0.1396484375, + "learning_rate": 0.001713, + "loss": 1.3852, + "step": 571 + }, + { + "epoch": 0.050159694942239895, + "grad_norm": 0.1767578125, + "learning_rate": 0.0017159999999999999, + "loss": 1.3147, + "step": 572 + }, + { + "epoch": 0.05024738671661444, + "grad_norm": 0.11181640625, + "learning_rate": 0.0017189999999999998, + "loss": 1.3285, + "step": 573 + }, + { + "epoch": 0.050335078490988984, + "grad_norm": 0.1865234375, + "learning_rate": 0.001722, + "loss": 1.3213, + "step": 574 + }, + { + "epoch": 0.05042277026536353, + "grad_norm": 0.09619140625, + "learning_rate": 0.001725, + "loss": 1.2918, + "step": 575 + }, + { + "epoch": 0.05051046203973807, + "grad_norm": 0.09912109375, + "learning_rate": 0.001728, + "loss": 1.3342, + "step": 576 + }, + { + "epoch": 0.05059815381411262, + "grad_norm": 0.11865234375, + "learning_rate": 0.001731, + "loss": 1.3351, + "step": 577 + }, + { + "epoch": 0.05068584558848717, + "grad_norm": 0.140625, + "learning_rate": 0.0017339999999999999, + "loss": 1.4707, + "step": 578 + }, + { + "epoch": 0.05077353736286171, + "grad_norm": 0.125, + "learning_rate": 0.0017369999999999998, + "loss": 1.3766, + "step": 579 + }, + { + "epoch": 0.05086122913723626, + "grad_norm": 0.1630859375, + "learning_rate": 0.00174, + "loss": 1.3022, + "step": 580 + }, + { + "epoch": 0.0509489209116108, + "grad_norm": 0.109375, + "learning_rate": 0.001743, + "loss": 1.3365, + "step": 581 + }, + { + "epoch": 0.05103661268598535, + "grad_norm": 0.1552734375, + "learning_rate": 0.001746, + "loss": 1.2805, + "step": 582 + }, + { + "epoch": 0.05112430446035989, + "grad_norm": 0.1396484375, + "learning_rate": 0.001749, + "loss": 1.3641, + "step": 583 + }, + { + "epoch": 0.051211996234734436, + "grad_norm": 0.130859375, + "learning_rate": 0.0017519999999999999, + "loss": 1.3591, + "step": 584 + }, + { + "epoch": 0.05129968800910898, + "grad_norm": 0.1123046875, + "learning_rate": 0.0017549999999999998, + "loss": 1.2795, + "step": 585 + }, + { + "epoch": 0.051387379783483525, + "grad_norm": 0.09814453125, + "learning_rate": 0.001758, + "loss": 1.4044, + "step": 586 + }, + { + "epoch": 0.051475071557858076, + "grad_norm": 0.142578125, + "learning_rate": 0.001761, + "loss": 1.2848, + "step": 587 + }, + { + "epoch": 0.05156276333223262, + "grad_norm": 0.203125, + "learning_rate": 0.001764, + "loss": 1.3604, + "step": 588 + }, + { + "epoch": 0.051650455106607165, + "grad_norm": 0.1396484375, + "learning_rate": 0.001767, + "loss": 1.265, + "step": 589 + }, + { + "epoch": 0.05173814688098171, + "grad_norm": 0.1708984375, + "learning_rate": 0.0017699999999999999, + "loss": 1.3369, + "step": 590 + }, + { + "epoch": 0.051825838655356254, + "grad_norm": 0.1123046875, + "learning_rate": 0.001773, + "loss": 1.3202, + "step": 591 + }, + { + "epoch": 0.0519135304297308, + "grad_norm": 0.1044921875, + "learning_rate": 0.001776, + "loss": 1.3488, + "step": 592 + }, + { + "epoch": 0.05200122220410534, + "grad_norm": 0.1435546875, + "learning_rate": 0.001779, + "loss": 1.3653, + "step": 593 + }, + { + "epoch": 0.05208891397847989, + "grad_norm": 0.111328125, + "learning_rate": 0.001782, + "loss": 1.3817, + "step": 594 + }, + { + "epoch": 0.05217660575285443, + "grad_norm": 0.1259765625, + "learning_rate": 0.001785, + "loss": 1.3422, + "step": 595 + }, + { + "epoch": 0.052264297527228984, + "grad_norm": 0.10302734375, + "learning_rate": 0.0017879999999999999, + "loss": 1.3877, + "step": 596 + }, + { + "epoch": 0.05235198930160353, + "grad_norm": 0.1318359375, + "learning_rate": 0.001791, + "loss": 1.3137, + "step": 597 + }, + { + "epoch": 0.05243968107597807, + "grad_norm": 0.10986328125, + "learning_rate": 0.001794, + "loss": 1.3798, + "step": 598 + }, + { + "epoch": 0.05252737285035262, + "grad_norm": 0.11474609375, + "learning_rate": 0.001797, + "loss": 1.3497, + "step": 599 + }, + { + "epoch": 0.05261506462472716, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018, + "loss": 1.3417, + "step": 600 + }, + { + "epoch": 0.052702756399101706, + "grad_norm": 0.125, + "learning_rate": 0.001803, + "loss": 1.3298, + "step": 601 + }, + { + "epoch": 0.05279044817347625, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018059999999999999, + "loss": 1.3585, + "step": 602 + }, + { + "epoch": 0.052878139947850795, + "grad_norm": 0.11279296875, + "learning_rate": 0.001809, + "loss": 1.331, + "step": 603 + }, + { + "epoch": 0.05296583172222534, + "grad_norm": 0.11376953125, + "learning_rate": 0.001812, + "loss": 1.2992, + "step": 604 + }, + { + "epoch": 0.05305352349659989, + "grad_norm": 0.09619140625, + "learning_rate": 0.001815, + "loss": 1.2915, + "step": 605 + }, + { + "epoch": 0.053141215270974436, + "grad_norm": 0.09033203125, + "learning_rate": 0.001818, + "loss": 1.32, + "step": 606 + }, + { + "epoch": 0.05322890704534898, + "grad_norm": 0.0771484375, + "learning_rate": 0.001821, + "loss": 1.3601, + "step": 607 + }, + { + "epoch": 0.053316598819723525, + "grad_norm": 0.0927734375, + "learning_rate": 0.001824, + "loss": 1.3336, + "step": 608 + }, + { + "epoch": 0.05340429059409807, + "grad_norm": 0.09765625, + "learning_rate": 0.001827, + "loss": 1.347, + "step": 609 + }, + { + "epoch": 0.053491982368472614, + "grad_norm": 0.1650390625, + "learning_rate": 0.00183, + "loss": 1.3347, + "step": 610 + }, + { + "epoch": 0.05357967414284716, + "grad_norm": 0.10546875, + "learning_rate": 0.001833, + "loss": 1.3217, + "step": 611 + }, + { + "epoch": 0.0536673659172217, + "grad_norm": 0.09716796875, + "learning_rate": 0.001836, + "loss": 1.2653, + "step": 612 + }, + { + "epoch": 0.05375505769159625, + "grad_norm": 0.11083984375, + "learning_rate": 0.001839, + "loss": 1.3131, + "step": 613 + }, + { + "epoch": 0.0538427494659708, + "grad_norm": 0.166015625, + "learning_rate": 0.001842, + "loss": 1.3646, + "step": 614 + }, + { + "epoch": 0.05393044124034534, + "grad_norm": 0.091796875, + "learning_rate": 0.001845, + "loss": 1.3012, + "step": 615 + }, + { + "epoch": 0.05401813301471989, + "grad_norm": 0.11181640625, + "learning_rate": 0.001848, + "loss": 1.2882, + "step": 616 + }, + { + "epoch": 0.05410582478909443, + "grad_norm": 0.0927734375, + "learning_rate": 0.001851, + "loss": 1.2802, + "step": 617 + }, + { + "epoch": 0.05419351656346898, + "grad_norm": 0.107421875, + "learning_rate": 0.001854, + "loss": 1.3446, + "step": 618 + }, + { + "epoch": 0.05428120833784352, + "grad_norm": 0.138671875, + "learning_rate": 0.001857, + "loss": 1.3352, + "step": 619 + }, + { + "epoch": 0.054368900112218066, + "grad_norm": 0.14453125, + "learning_rate": 0.00186, + "loss": 1.3795, + "step": 620 + }, + { + "epoch": 0.05445659188659261, + "grad_norm": 0.142578125, + "learning_rate": 0.001863, + "loss": 1.3184, + "step": 621 + }, + { + "epoch": 0.054544283660967155, + "grad_norm": 0.12890625, + "learning_rate": 0.001866, + "loss": 1.306, + "step": 622 + }, + { + "epoch": 0.054631975435341706, + "grad_norm": 0.1357421875, + "learning_rate": 0.001869, + "loss": 1.3334, + "step": 623 + }, + { + "epoch": 0.05471966720971625, + "grad_norm": 0.09765625, + "learning_rate": 0.001872, + "loss": 1.3268, + "step": 624 + }, + { + "epoch": 0.054807358984090795, + "grad_norm": 0.11181640625, + "learning_rate": 0.001875, + "loss": 1.3805, + "step": 625 + }, + { + "epoch": 0.05489505075846534, + "grad_norm": 0.083984375, + "learning_rate": 0.0018780000000000001, + "loss": 1.4231, + "step": 626 + }, + { + "epoch": 0.054982742532839884, + "grad_norm": 0.1455078125, + "learning_rate": 0.001881, + "loss": 1.3484, + "step": 627 + }, + { + "epoch": 0.05507043430721443, + "grad_norm": 0.1064453125, + "learning_rate": 0.001884, + "loss": 1.3057, + "step": 628 + }, + { + "epoch": 0.05515812608158897, + "grad_norm": 0.1181640625, + "learning_rate": 0.001887, + "loss": 1.3342, + "step": 629 + }, + { + "epoch": 0.05524581785596352, + "grad_norm": 0.1396484375, + "learning_rate": 0.00189, + "loss": 1.3539, + "step": 630 + }, + { + "epoch": 0.05533350963033806, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018930000000000002, + "loss": 1.3593, + "step": 631 + }, + { + "epoch": 0.055421201404712614, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018960000000000001, + "loss": 1.3262, + "step": 632 + }, + { + "epoch": 0.05550889317908716, + "grad_norm": 0.126953125, + "learning_rate": 0.001899, + "loss": 1.3629, + "step": 633 + }, + { + "epoch": 0.0555965849534617, + "grad_norm": 0.107421875, + "learning_rate": 0.001902, + "loss": 1.313, + "step": 634 + }, + { + "epoch": 0.05568427672783625, + "grad_norm": 0.1376953125, + "learning_rate": 0.001905, + "loss": 1.3779, + "step": 635 + }, + { + "epoch": 0.05577196850221079, + "grad_norm": 0.07958984375, + "learning_rate": 0.001908, + "loss": 1.3722, + "step": 636 + }, + { + "epoch": 0.055859660276585336, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019110000000000002, + "loss": 1.2903, + "step": 637 + }, + { + "epoch": 0.05594735205095988, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019140000000000001, + "loss": 1.2875, + "step": 638 + }, + { + "epoch": 0.056035043825334425, + "grad_norm": 0.1337890625, + "learning_rate": 0.001917, + "loss": 1.3276, + "step": 639 + }, + { + "epoch": 0.05612273559970897, + "grad_norm": 0.1142578125, + "learning_rate": 0.00192, + "loss": 1.2509, + "step": 640 + }, + { + "epoch": 0.05621042737408352, + "grad_norm": 0.083984375, + "learning_rate": 0.001923, + "loss": 1.3149, + "step": 641 + }, + { + "epoch": 0.056298119148458066, + "grad_norm": 0.1279296875, + "learning_rate": 0.001926, + "loss": 1.3123, + "step": 642 + }, + { + "epoch": 0.05638581092283261, + "grad_norm": 0.09765625, + "learning_rate": 0.0019290000000000002, + "loss": 1.3152, + "step": 643 + }, + { + "epoch": 0.056473502697207155, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019320000000000001, + "loss": 1.337, + "step": 644 + }, + { + "epoch": 0.0565611944715817, + "grad_norm": 0.11767578125, + "learning_rate": 0.001935, + "loss": 1.3222, + "step": 645 + }, + { + "epoch": 0.056648886245956244, + "grad_norm": 0.1484375, + "learning_rate": 0.001938, + "loss": 1.2985, + "step": 646 + }, + { + "epoch": 0.05673657802033079, + "grad_norm": 0.107421875, + "learning_rate": 0.001941, + "loss": 1.4031, + "step": 647 + }, + { + "epoch": 0.05682426979470533, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019440000000000002, + "loss": 1.2429, + "step": 648 + }, + { + "epoch": 0.05691196156907988, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019470000000000002, + "loss": 1.3375, + "step": 649 + }, + { + "epoch": 0.05699965334345443, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019500000000000001, + "loss": 1.3173, + "step": 650 + }, + { + "epoch": 0.05708734511782897, + "grad_norm": 0.1171875, + "learning_rate": 0.001953, + "loss": 1.3144, + "step": 651 + }, + { + "epoch": 0.05717503689220352, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019560000000000003, + "loss": 1.4079, + "step": 652 + }, + { + "epoch": 0.05726272866657806, + "grad_norm": 0.134765625, + "learning_rate": 0.0019590000000000002, + "loss": 1.4005, + "step": 653 + }, + { + "epoch": 0.05735042044095261, + "grad_norm": 0.142578125, + "learning_rate": 0.001962, + "loss": 1.4134, + "step": 654 + }, + { + "epoch": 0.05743811221532715, + "grad_norm": 0.169921875, + "learning_rate": 0.001965, + "loss": 1.3191, + "step": 655 + }, + { + "epoch": 0.057525803989701696, + "grad_norm": 0.1220703125, + "learning_rate": 0.001968, + "loss": 1.3546, + "step": 656 + }, + { + "epoch": 0.05761349576407624, + "grad_norm": 0.1513671875, + "learning_rate": 0.001971, + "loss": 1.3522, + "step": 657 + }, + { + "epoch": 0.057701187538450785, + "grad_norm": 0.1484375, + "learning_rate": 0.001974, + "loss": 1.318, + "step": 658 + }, + { + "epoch": 0.057788879312825336, + "grad_norm": 0.1103515625, + "learning_rate": 0.001977, + "loss": 1.3126, + "step": 659 + }, + { + "epoch": 0.05787657108719988, + "grad_norm": 0.0947265625, + "learning_rate": 0.00198, + "loss": 1.3112, + "step": 660 + }, + { + "epoch": 0.057964262861574425, + "grad_norm": 0.1044921875, + "learning_rate": 0.001983, + "loss": 1.3347, + "step": 661 + }, + { + "epoch": 0.05805195463594897, + "grad_norm": 0.125, + "learning_rate": 0.0019860000000000004, + "loss": 1.384, + "step": 662 + }, + { + "epoch": 0.058139646410323514, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019890000000000003, + "loss": 1.3456, + "step": 663 + }, + { + "epoch": 0.05822733818469806, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019920000000000003, + "loss": 1.3589, + "step": 664 + }, + { + "epoch": 0.0583150299590726, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019950000000000002, + "loss": 1.3744, + "step": 665 + }, + { + "epoch": 0.05840272173344715, + "grad_norm": 0.1474609375, + "learning_rate": 0.001998, + "loss": 1.3614, + "step": 666 + }, + { + "epoch": 0.05849041350782169, + "grad_norm": 0.10400390625, + "learning_rate": 0.002001, + "loss": 1.3101, + "step": 667 + }, + { + "epoch": 0.058578105282196244, + "grad_norm": 0.11767578125, + "learning_rate": 0.002004, + "loss": 1.3496, + "step": 668 + }, + { + "epoch": 0.05866579705657079, + "grad_norm": 0.09716796875, + "learning_rate": 0.002007, + "loss": 1.3225, + "step": 669 + }, + { + "epoch": 0.05875348883094533, + "grad_norm": 0.1318359375, + "learning_rate": 0.00201, + "loss": 1.3426, + "step": 670 + }, + { + "epoch": 0.05884118060531988, + "grad_norm": 0.09521484375, + "learning_rate": 0.002013, + "loss": 1.2791, + "step": 671 + }, + { + "epoch": 0.05892887237969442, + "grad_norm": 0.11279296875, + "learning_rate": 0.002016, + "loss": 1.2956, + "step": 672 + }, + { + "epoch": 0.059016564154068966, + "grad_norm": 0.10400390625, + "learning_rate": 0.002019, + "loss": 1.2956, + "step": 673 + }, + { + "epoch": 0.05910425592844351, + "grad_norm": 0.09228515625, + "learning_rate": 0.0020220000000000004, + "loss": 1.3212, + "step": 674 + }, + { + "epoch": 0.059191947702818055, + "grad_norm": 0.10498046875, + "learning_rate": 0.0020250000000000003, + "loss": 1.3192, + "step": 675 + }, + { + "epoch": 0.0592796394771926, + "grad_norm": 0.07958984375, + "learning_rate": 0.0020280000000000003, + "loss": 1.373, + "step": 676 + }, + { + "epoch": 0.059367331251567144, + "grad_norm": 0.09423828125, + "learning_rate": 0.0020310000000000003, + "loss": 1.4266, + "step": 677 + }, + { + "epoch": 0.059455023025941696, + "grad_norm": 0.08447265625, + "learning_rate": 0.0020340000000000002, + "loss": 1.3394, + "step": 678 + }, + { + "epoch": 0.05954271480031624, + "grad_norm": 0.10400390625, + "learning_rate": 0.002037, + "loss": 1.3182, + "step": 679 + }, + { + "epoch": 0.059630406574690785, + "grad_norm": 0.11328125, + "learning_rate": 0.00204, + "loss": 1.3112, + "step": 680 + }, + { + "epoch": 0.05971809834906533, + "grad_norm": 0.08740234375, + "learning_rate": 0.002043, + "loss": 1.2881, + "step": 681 + }, + { + "epoch": 0.059805790123439874, + "grad_norm": 0.1279296875, + "learning_rate": 0.002046, + "loss": 1.3234, + "step": 682 + }, + { + "epoch": 0.05989348189781442, + "grad_norm": 0.1220703125, + "learning_rate": 0.002049, + "loss": 1.3557, + "step": 683 + }, + { + "epoch": 0.05998117367218896, + "grad_norm": 0.138671875, + "learning_rate": 0.002052, + "loss": 1.3126, + "step": 684 + }, + { + "epoch": 0.06006886544656351, + "grad_norm": 0.1259765625, + "learning_rate": 0.0020550000000000004, + "loss": 1.3457, + "step": 685 + }, + { + "epoch": 0.06015655722093805, + "grad_norm": 0.0986328125, + "learning_rate": 0.0020580000000000004, + "loss": 1.295, + "step": 686 + }, + { + "epoch": 0.0602442489953126, + "grad_norm": 0.115234375, + "learning_rate": 0.0020610000000000003, + "loss": 1.3832, + "step": 687 + }, + { + "epoch": 0.06033194076968715, + "grad_norm": 0.130859375, + "learning_rate": 0.002064, + "loss": 1.338, + "step": 688 + }, + { + "epoch": 0.06041963254406169, + "grad_norm": 0.08984375, + "learning_rate": 0.002067, + "loss": 1.3339, + "step": 689 + }, + { + "epoch": 0.06050732431843624, + "grad_norm": 0.138671875, + "learning_rate": 0.00207, + "loss": 1.3436, + "step": 690 + }, + { + "epoch": 0.06059501609281078, + "grad_norm": 0.09423828125, + "learning_rate": 0.0020729999999999998, + "loss": 1.3324, + "step": 691 + }, + { + "epoch": 0.060682707867185326, + "grad_norm": 0.1435546875, + "learning_rate": 0.0020759999999999997, + "loss": 1.3587, + "step": 692 + }, + { + "epoch": 0.06077039964155987, + "grad_norm": 0.15234375, + "learning_rate": 0.0020789999999999997, + "loss": 1.3409, + "step": 693 + }, + { + "epoch": 0.060858091415934415, + "grad_norm": 0.09716796875, + "learning_rate": 0.002082, + "loss": 1.358, + "step": 694 + }, + { + "epoch": 0.06094578319030896, + "grad_norm": 0.11767578125, + "learning_rate": 0.002085, + "loss": 1.2721, + "step": 695 + }, + { + "epoch": 0.06103347496468351, + "grad_norm": 0.1533203125, + "learning_rate": 0.002088, + "loss": 1.3178, + "step": 696 + }, + { + "epoch": 0.061121166739058055, + "grad_norm": 0.1494140625, + "learning_rate": 0.002091, + "loss": 1.2921, + "step": 697 + }, + { + "epoch": 0.0612088585134326, + "grad_norm": 0.07958984375, + "learning_rate": 0.002094, + "loss": 1.2661, + "step": 698 + }, + { + "epoch": 0.061296550287807144, + "grad_norm": 0.1259765625, + "learning_rate": 0.002097, + "loss": 1.3789, + "step": 699 + }, + { + "epoch": 0.06138424206218169, + "grad_norm": 0.0927734375, + "learning_rate": 0.0021, + "loss": 1.3957, + "step": 700 + }, + { + "epoch": 0.06147193383655623, + "grad_norm": 0.125, + "learning_rate": 0.002103, + "loss": 1.348, + "step": 701 + }, + { + "epoch": 0.06155962561093078, + "grad_norm": 0.103515625, + "learning_rate": 0.002106, + "loss": 1.3111, + "step": 702 + }, + { + "epoch": 0.06164731738530532, + "grad_norm": 0.08251953125, + "learning_rate": 0.0021089999999999998, + "loss": 1.321, + "step": 703 + }, + { + "epoch": 0.06173500915967987, + "grad_norm": 0.09814453125, + "learning_rate": 0.0021119999999999997, + "loss": 1.4042, + "step": 704 + }, + { + "epoch": 0.06182270093405442, + "grad_norm": 0.12890625, + "learning_rate": 0.002115, + "loss": 1.2891, + "step": 705 + }, + { + "epoch": 0.06191039270842896, + "grad_norm": 0.125, + "learning_rate": 0.002118, + "loss": 1.2994, + "step": 706 + }, + { + "epoch": 0.06199808448280351, + "grad_norm": 0.1640625, + "learning_rate": 0.002121, + "loss": 1.4005, + "step": 707 + }, + { + "epoch": 0.06208577625717805, + "grad_norm": 0.109375, + "learning_rate": 0.002124, + "loss": 1.3539, + "step": 708 + }, + { + "epoch": 0.062173468031552596, + "grad_norm": 0.1513671875, + "learning_rate": 0.002127, + "loss": 1.3263, + "step": 709 + }, + { + "epoch": 0.06226115980592714, + "grad_norm": 0.083984375, + "learning_rate": 0.00213, + "loss": 1.3272, + "step": 710 + }, + { + "epoch": 0.062348851580301685, + "grad_norm": 0.11865234375, + "learning_rate": 0.002133, + "loss": 1.3914, + "step": 711 + }, + { + "epoch": 0.06243654335467623, + "grad_norm": 0.0966796875, + "learning_rate": 0.002136, + "loss": 1.3741, + "step": 712 + }, + { + "epoch": 0.06252423512905078, + "grad_norm": 0.1298828125, + "learning_rate": 0.002139, + "loss": 1.3691, + "step": 713 + }, + { + "epoch": 0.06261192690342532, + "grad_norm": 0.08984375, + "learning_rate": 0.002142, + "loss": 1.3387, + "step": 714 + }, + { + "epoch": 0.06269961867779987, + "grad_norm": 0.095703125, + "learning_rate": 0.0021449999999999998, + "loss": 1.3177, + "step": 715 + }, + { + "epoch": 0.06278731045217441, + "grad_norm": 0.08447265625, + "learning_rate": 0.002148, + "loss": 1.3291, + "step": 716 + }, + { + "epoch": 0.06287500222654896, + "grad_norm": 0.115234375, + "learning_rate": 0.002151, + "loss": 1.3442, + "step": 717 + }, + { + "epoch": 0.06296269400092351, + "grad_norm": 0.08154296875, + "learning_rate": 0.002154, + "loss": 1.352, + "step": 718 + }, + { + "epoch": 0.06305038577529805, + "grad_norm": 0.12255859375, + "learning_rate": 0.002157, + "loss": 1.3209, + "step": 719 + }, + { + "epoch": 0.0631380775496726, + "grad_norm": 0.080078125, + "learning_rate": 0.00216, + "loss": 1.3168, + "step": 720 + }, + { + "epoch": 0.06322576932404714, + "grad_norm": 0.12060546875, + "learning_rate": 0.002163, + "loss": 1.3624, + "step": 721 + }, + { + "epoch": 0.06331346109842169, + "grad_norm": 0.09619140625, + "learning_rate": 0.002166, + "loss": 1.3289, + "step": 722 + }, + { + "epoch": 0.06340115287279623, + "grad_norm": 0.1416015625, + "learning_rate": 0.002169, + "loss": 1.3062, + "step": 723 + }, + { + "epoch": 0.06348884464717078, + "grad_norm": 0.09912109375, + "learning_rate": 0.002172, + "loss": 1.2927, + "step": 724 + }, + { + "epoch": 0.06357653642154532, + "grad_norm": 0.126953125, + "learning_rate": 0.002175, + "loss": 1.3349, + "step": 725 + }, + { + "epoch": 0.06366422819591987, + "grad_norm": 0.162109375, + "learning_rate": 0.002178, + "loss": 1.4218, + "step": 726 + }, + { + "epoch": 0.0637519199702944, + "grad_norm": 0.23828125, + "learning_rate": 0.0021809999999999998, + "loss": 1.4004, + "step": 727 + }, + { + "epoch": 0.06383961174466896, + "grad_norm": 0.1533203125, + "learning_rate": 0.002184, + "loss": 1.3236, + "step": 728 + }, + { + "epoch": 0.06392730351904351, + "grad_norm": 0.1640625, + "learning_rate": 0.002187, + "loss": 1.2873, + "step": 729 + }, + { + "epoch": 0.06401499529341804, + "grad_norm": 0.10400390625, + "learning_rate": 0.00219, + "loss": 1.3388, + "step": 730 + }, + { + "epoch": 0.0641026870677926, + "grad_norm": 0.16796875, + "learning_rate": 0.002193, + "loss": 1.3179, + "step": 731 + }, + { + "epoch": 0.06419037884216713, + "grad_norm": 0.1318359375, + "learning_rate": 0.002196, + "loss": 1.3483, + "step": 732 + }, + { + "epoch": 0.06427807061654169, + "grad_norm": 0.1357421875, + "learning_rate": 0.002199, + "loss": 1.3194, + "step": 733 + }, + { + "epoch": 0.06436576239091622, + "grad_norm": 0.1318359375, + "learning_rate": 0.002202, + "loss": 1.3404, + "step": 734 + }, + { + "epoch": 0.06445345416529077, + "grad_norm": 0.125, + "learning_rate": 0.002205, + "loss": 1.2977, + "step": 735 + }, + { + "epoch": 0.06454114593966531, + "grad_norm": 0.1083984375, + "learning_rate": 0.002208, + "loss": 1.362, + "step": 736 + }, + { + "epoch": 0.06462883771403986, + "grad_norm": 0.09423828125, + "learning_rate": 0.002211, + "loss": 1.3174, + "step": 737 + }, + { + "epoch": 0.06471652948841441, + "grad_norm": 0.1435546875, + "learning_rate": 0.002214, + "loss": 1.3242, + "step": 738 + }, + { + "epoch": 0.06480422126278895, + "grad_norm": 0.10888671875, + "learning_rate": 0.0022170000000000002, + "loss": 1.296, + "step": 739 + }, + { + "epoch": 0.0648919130371635, + "grad_norm": 0.08837890625, + "learning_rate": 0.00222, + "loss": 1.3848, + "step": 740 + }, + { + "epoch": 0.06497960481153804, + "grad_norm": 0.10693359375, + "learning_rate": 0.002223, + "loss": 1.3586, + "step": 741 + }, + { + "epoch": 0.06506729658591259, + "grad_norm": 0.09912109375, + "learning_rate": 0.002226, + "loss": 1.2866, + "step": 742 + }, + { + "epoch": 0.06515498836028713, + "grad_norm": 0.1044921875, + "learning_rate": 0.002229, + "loss": 1.2886, + "step": 743 + }, + { + "epoch": 0.06524268013466168, + "grad_norm": 0.10009765625, + "learning_rate": 0.002232, + "loss": 1.3764, + "step": 744 + }, + { + "epoch": 0.06533037190903622, + "grad_norm": 0.08837890625, + "learning_rate": 0.002235, + "loss": 1.3943, + "step": 745 + }, + { + "epoch": 0.06541806368341077, + "grad_norm": 0.12890625, + "learning_rate": 0.002238, + "loss": 1.3645, + "step": 746 + }, + { + "epoch": 0.06550575545778532, + "grad_norm": 0.0869140625, + "learning_rate": 0.002241, + "loss": 1.2752, + "step": 747 + }, + { + "epoch": 0.06559344723215986, + "grad_norm": 0.11474609375, + "learning_rate": 0.002244, + "loss": 1.2695, + "step": 748 + }, + { + "epoch": 0.06568113900653441, + "grad_norm": 0.1357421875, + "learning_rate": 0.002247, + "loss": 1.3008, + "step": 749 + }, + { + "epoch": 0.06576883078090895, + "grad_norm": 0.126953125, + "learning_rate": 0.0022500000000000003, + "loss": 1.2855, + "step": 750 + }, + { + "epoch": 0.0658565225552835, + "grad_norm": 0.09033203125, + "learning_rate": 0.0022530000000000002, + "loss": 1.3534, + "step": 751 + }, + { + "epoch": 0.06594421432965804, + "grad_norm": 0.08642578125, + "learning_rate": 0.002256, + "loss": 1.3676, + "step": 752 + }, + { + "epoch": 0.06603190610403259, + "grad_norm": 0.1279296875, + "learning_rate": 0.002259, + "loss": 1.2932, + "step": 753 + }, + { + "epoch": 0.06611959787840713, + "grad_norm": 0.1298828125, + "learning_rate": 0.002262, + "loss": 1.3035, + "step": 754 + }, + { + "epoch": 0.06620728965278168, + "grad_norm": 0.103515625, + "learning_rate": 0.002265, + "loss": 1.2848, + "step": 755 + }, + { + "epoch": 0.06629498142715623, + "grad_norm": 0.126953125, + "learning_rate": 0.002268, + "loss": 1.2856, + "step": 756 + }, + { + "epoch": 0.06638267320153077, + "grad_norm": 0.109375, + "learning_rate": 0.002271, + "loss": 1.3616, + "step": 757 + }, + { + "epoch": 0.06647036497590532, + "grad_norm": 0.1259765625, + "learning_rate": 0.002274, + "loss": 1.3047, + "step": 758 + }, + { + "epoch": 0.06655805675027986, + "grad_norm": 0.1689453125, + "learning_rate": 0.002277, + "loss": 1.4104, + "step": 759 + }, + { + "epoch": 0.06664574852465441, + "grad_norm": 0.11865234375, + "learning_rate": 0.00228, + "loss": 1.2926, + "step": 760 + }, + { + "epoch": 0.06673344029902895, + "grad_norm": 0.0986328125, + "learning_rate": 0.002283, + "loss": 1.3173, + "step": 761 + }, + { + "epoch": 0.0668211320734035, + "grad_norm": 0.1796875, + "learning_rate": 0.0022860000000000003, + "loss": 1.3172, + "step": 762 + }, + { + "epoch": 0.06690882384777803, + "grad_norm": 0.1201171875, + "learning_rate": 0.0022890000000000002, + "loss": 1.3202, + "step": 763 + }, + { + "epoch": 0.06699651562215259, + "grad_norm": 0.1435546875, + "learning_rate": 0.002292, + "loss": 1.3341, + "step": 764 + }, + { + "epoch": 0.06708420739652714, + "grad_norm": 0.1767578125, + "learning_rate": 0.002295, + "loss": 1.3316, + "step": 765 + }, + { + "epoch": 0.06717189917090167, + "grad_norm": 0.09716796875, + "learning_rate": 0.002298, + "loss": 1.3074, + "step": 766 + }, + { + "epoch": 0.06725959094527623, + "grad_norm": 0.26171875, + "learning_rate": 0.002301, + "loss": 1.3714, + "step": 767 + }, + { + "epoch": 0.06734728271965076, + "grad_norm": 0.21484375, + "learning_rate": 0.002304, + "loss": 1.3505, + "step": 768 + }, + { + "epoch": 0.06743497449402532, + "grad_norm": 0.091796875, + "learning_rate": 0.002307, + "loss": 1.2795, + "step": 769 + }, + { + "epoch": 0.06752266626839985, + "grad_norm": 0.146484375, + "learning_rate": 0.00231, + "loss": 1.2834, + "step": 770 + }, + { + "epoch": 0.0676103580427744, + "grad_norm": 0.09619140625, + "learning_rate": 0.002313, + "loss": 1.2901, + "step": 771 + }, + { + "epoch": 0.06769804981714894, + "grad_norm": 0.11083984375, + "learning_rate": 0.002316, + "loss": 1.2928, + "step": 772 + }, + { + "epoch": 0.0677857415915235, + "grad_norm": 0.0869140625, + "learning_rate": 0.0023190000000000003, + "loss": 1.3493, + "step": 773 + }, + { + "epoch": 0.06787343336589804, + "grad_norm": 0.1044921875, + "learning_rate": 0.0023220000000000003, + "loss": 1.3889, + "step": 774 + }, + { + "epoch": 0.06796112514027258, + "grad_norm": 0.10009765625, + "learning_rate": 0.0023250000000000002, + "loss": 1.2773, + "step": 775 + }, + { + "epoch": 0.06804881691464713, + "grad_norm": 0.08740234375, + "learning_rate": 0.002328, + "loss": 1.3119, + "step": 776 + }, + { + "epoch": 0.06813650868902167, + "grad_norm": 0.11376953125, + "learning_rate": 0.002331, + "loss": 1.3411, + "step": 777 + }, + { + "epoch": 0.06822420046339622, + "grad_norm": 0.0771484375, + "learning_rate": 0.002334, + "loss": 1.3557, + "step": 778 + }, + { + "epoch": 0.06831189223777076, + "grad_norm": 0.11669921875, + "learning_rate": 0.002337, + "loss": 1.3282, + "step": 779 + }, + { + "epoch": 0.06839958401214531, + "grad_norm": 0.07666015625, + "learning_rate": 0.00234, + "loss": 1.3127, + "step": 780 + }, + { + "epoch": 0.06848727578651985, + "grad_norm": 0.119140625, + "learning_rate": 0.002343, + "loss": 1.3505, + "step": 781 + }, + { + "epoch": 0.0685749675608944, + "grad_norm": 0.08447265625, + "learning_rate": 0.002346, + "loss": 1.368, + "step": 782 + }, + { + "epoch": 0.06866265933526895, + "grad_norm": 0.1572265625, + "learning_rate": 0.002349, + "loss": 1.3141, + "step": 783 + }, + { + "epoch": 0.06875035110964349, + "grad_norm": 0.0830078125, + "learning_rate": 0.002352, + "loss": 1.3129, + "step": 784 + }, + { + "epoch": 0.06883804288401804, + "grad_norm": 0.1240234375, + "learning_rate": 0.0023550000000000003, + "loss": 1.3506, + "step": 785 + }, + { + "epoch": 0.06892573465839258, + "grad_norm": 0.0888671875, + "learning_rate": 0.0023580000000000003, + "loss": 1.3198, + "step": 786 + }, + { + "epoch": 0.06901342643276713, + "grad_norm": 0.1162109375, + "learning_rate": 0.0023610000000000003, + "loss": 1.3936, + "step": 787 + }, + { + "epoch": 0.06910111820714167, + "grad_norm": 0.09326171875, + "learning_rate": 0.002364, + "loss": 1.3044, + "step": 788 + }, + { + "epoch": 0.06918880998151622, + "grad_norm": 0.13671875, + "learning_rate": 0.002367, + "loss": 1.3279, + "step": 789 + }, + { + "epoch": 0.06927650175589076, + "grad_norm": 0.134765625, + "learning_rate": 0.00237, + "loss": 1.3941, + "step": 790 + }, + { + "epoch": 0.06936419353026531, + "grad_norm": 0.08349609375, + "learning_rate": 0.002373, + "loss": 1.3437, + "step": 791 + }, + { + "epoch": 0.06945188530463986, + "grad_norm": 0.0859375, + "learning_rate": 0.002376, + "loss": 1.4024, + "step": 792 + }, + { + "epoch": 0.0695395770790144, + "grad_norm": 0.08642578125, + "learning_rate": 0.002379, + "loss": 1.3566, + "step": 793 + }, + { + "epoch": 0.06962726885338895, + "grad_norm": 0.09814453125, + "learning_rate": 0.002382, + "loss": 1.2291, + "step": 794 + }, + { + "epoch": 0.06971496062776349, + "grad_norm": 0.0849609375, + "learning_rate": 0.002385, + "loss": 1.3785, + "step": 795 + }, + { + "epoch": 0.06980265240213804, + "grad_norm": 0.1103515625, + "learning_rate": 0.0023880000000000004, + "loss": 1.3573, + "step": 796 + }, + { + "epoch": 0.06989034417651258, + "grad_norm": 0.140625, + "learning_rate": 0.0023910000000000003, + "loss": 1.3114, + "step": 797 + }, + { + "epoch": 0.06997803595088713, + "grad_norm": 0.09228515625, + "learning_rate": 0.0023940000000000003, + "loss": 1.3111, + "step": 798 + }, + { + "epoch": 0.07006572772526166, + "grad_norm": 0.1728515625, + "learning_rate": 0.0023970000000000003, + "loss": 1.416, + "step": 799 + }, + { + "epoch": 0.07015341949963622, + "grad_norm": 0.181640625, + "learning_rate": 0.0024000000000000002, + "loss": 1.2953, + "step": 800 + }, + { + "epoch": 0.07024111127401077, + "grad_norm": 0.150390625, + "learning_rate": 0.002403, + "loss": 1.3164, + "step": 801 + }, + { + "epoch": 0.0703288030483853, + "grad_norm": 0.205078125, + "learning_rate": 0.002406, + "loss": 1.44, + "step": 802 + }, + { + "epoch": 0.07041649482275986, + "grad_norm": 0.12158203125, + "learning_rate": 0.002409, + "loss": 1.3285, + "step": 803 + }, + { + "epoch": 0.0705041865971344, + "grad_norm": 0.11962890625, + "learning_rate": 0.002412, + "loss": 1.2978, + "step": 804 + }, + { + "epoch": 0.07059187837150895, + "grad_norm": 0.10791015625, + "learning_rate": 0.002415, + "loss": 1.3385, + "step": 805 + }, + { + "epoch": 0.07067957014588348, + "grad_norm": 0.16796875, + "learning_rate": 0.002418, + "loss": 1.2502, + "step": 806 + }, + { + "epoch": 0.07076726192025803, + "grad_norm": 0.09619140625, + "learning_rate": 0.0024210000000000004, + "loss": 1.2764, + "step": 807 + }, + { + "epoch": 0.07085495369463257, + "grad_norm": 0.1884765625, + "learning_rate": 0.0024240000000000004, + "loss": 1.306, + "step": 808 + }, + { + "epoch": 0.07094264546900712, + "grad_norm": 0.103515625, + "learning_rate": 0.0024270000000000003, + "loss": 1.2523, + "step": 809 + }, + { + "epoch": 0.07103033724338167, + "grad_norm": 0.251953125, + "learning_rate": 0.0024300000000000003, + "loss": 1.3436, + "step": 810 + }, + { + "epoch": 0.07111802901775621, + "grad_norm": 0.076171875, + "learning_rate": 0.0024330000000000003, + "loss": 1.329, + "step": 811 + }, + { + "epoch": 0.07120572079213076, + "grad_norm": 0.2578125, + "learning_rate": 0.0024360000000000002, + "loss": 1.3534, + "step": 812 + }, + { + "epoch": 0.0712934125665053, + "grad_norm": 0.0869140625, + "learning_rate": 0.0024389999999999998, + "loss": 1.2848, + "step": 813 + }, + { + "epoch": 0.07138110434087985, + "grad_norm": 0.1787109375, + "learning_rate": 0.0024419999999999997, + "loss": 1.3431, + "step": 814 + }, + { + "epoch": 0.07146879611525439, + "grad_norm": 0.09716796875, + "learning_rate": 0.0024449999999999997, + "loss": 1.2798, + "step": 815 + }, + { + "epoch": 0.07155648788962894, + "grad_norm": 0.1640625, + "learning_rate": 0.002448, + "loss": 1.3048, + "step": 816 + }, + { + "epoch": 0.07164417966400348, + "grad_norm": 0.11865234375, + "learning_rate": 0.002451, + "loss": 1.2944, + "step": 817 + }, + { + "epoch": 0.07173187143837803, + "grad_norm": 0.296875, + "learning_rate": 0.002454, + "loss": 1.4416, + "step": 818 + }, + { + "epoch": 0.07181956321275258, + "grad_norm": 0.1767578125, + "learning_rate": 0.002457, + "loss": 1.3192, + "step": 819 + }, + { + "epoch": 0.07190725498712712, + "grad_norm": 0.2138671875, + "learning_rate": 0.00246, + "loss": 1.3287, + "step": 820 + }, + { + "epoch": 0.07199494676150167, + "grad_norm": 0.1591796875, + "learning_rate": 0.002463, + "loss": 1.3128, + "step": 821 + }, + { + "epoch": 0.07208263853587621, + "grad_norm": 0.162109375, + "learning_rate": 0.002466, + "loss": 1.3772, + "step": 822 + }, + { + "epoch": 0.07217033031025076, + "grad_norm": 0.140625, + "learning_rate": 0.002469, + "loss": 1.3326, + "step": 823 + }, + { + "epoch": 0.0722580220846253, + "grad_norm": 0.16796875, + "learning_rate": 0.002472, + "loss": 1.3595, + "step": 824 + }, + { + "epoch": 0.07234571385899985, + "grad_norm": 0.12060546875, + "learning_rate": 0.0024749999999999998, + "loss": 1.3271, + "step": 825 + }, + { + "epoch": 0.07243340563337439, + "grad_norm": 0.169921875, + "learning_rate": 0.0024779999999999997, + "loss": 1.3399, + "step": 826 + }, + { + "epoch": 0.07252109740774894, + "grad_norm": 0.142578125, + "learning_rate": 0.002481, + "loss": 1.3322, + "step": 827 + }, + { + "epoch": 0.07260878918212349, + "grad_norm": 0.20703125, + "learning_rate": 0.002484, + "loss": 1.3868, + "step": 828 + }, + { + "epoch": 0.07269648095649803, + "grad_norm": 0.1767578125, + "learning_rate": 0.002487, + "loss": 1.3419, + "step": 829 + }, + { + "epoch": 0.07278417273087258, + "grad_norm": 0.1865234375, + "learning_rate": 0.00249, + "loss": 1.3186, + "step": 830 + }, + { + "epoch": 0.07287186450524712, + "grad_norm": 0.1640625, + "learning_rate": 0.002493, + "loss": 1.2891, + "step": 831 + }, + { + "epoch": 0.07295955627962167, + "grad_norm": 0.154296875, + "learning_rate": 0.002496, + "loss": 1.3262, + "step": 832 + }, + { + "epoch": 0.0730472480539962, + "grad_norm": 0.12451171875, + "learning_rate": 0.002499, + "loss": 1.3283, + "step": 833 + }, + { + "epoch": 0.07313493982837076, + "grad_norm": 0.10009765625, + "learning_rate": 0.002502, + "loss": 1.3316, + "step": 834 + }, + { + "epoch": 0.0732226316027453, + "grad_norm": 0.0888671875, + "learning_rate": 0.002505, + "loss": 1.324, + "step": 835 + }, + { + "epoch": 0.07331032337711985, + "grad_norm": 0.08984375, + "learning_rate": 0.002508, + "loss": 1.3424, + "step": 836 + }, + { + "epoch": 0.0733980151514944, + "grad_norm": 0.09765625, + "learning_rate": 0.0025109999999999998, + "loss": 1.3264, + "step": 837 + }, + { + "epoch": 0.07348570692586893, + "grad_norm": 0.08740234375, + "learning_rate": 0.0025139999999999997, + "loss": 1.3647, + "step": 838 + }, + { + "epoch": 0.07357339870024349, + "grad_norm": 0.10693359375, + "learning_rate": 0.002517, + "loss": 1.3417, + "step": 839 + }, + { + "epoch": 0.07366109047461802, + "grad_norm": 0.10888671875, + "learning_rate": 0.00252, + "loss": 1.2623, + "step": 840 + }, + { + "epoch": 0.07374878224899258, + "grad_norm": 0.142578125, + "learning_rate": 0.002523, + "loss": 1.3707, + "step": 841 + }, + { + "epoch": 0.07383647402336711, + "grad_norm": 0.10400390625, + "learning_rate": 0.002526, + "loss": 1.398, + "step": 842 + }, + { + "epoch": 0.07392416579774166, + "grad_norm": 0.09912109375, + "learning_rate": 0.002529, + "loss": 1.319, + "step": 843 + }, + { + "epoch": 0.0740118575721162, + "grad_norm": 0.11962890625, + "learning_rate": 0.002532, + "loss": 1.324, + "step": 844 + }, + { + "epoch": 0.07409954934649075, + "grad_norm": 0.16015625, + "learning_rate": 0.002535, + "loss": 1.2863, + "step": 845 + }, + { + "epoch": 0.0741872411208653, + "grad_norm": 0.1396484375, + "learning_rate": 0.002538, + "loss": 1.3849, + "step": 846 + }, + { + "epoch": 0.07427493289523984, + "grad_norm": 0.12353515625, + "learning_rate": 0.002541, + "loss": 1.3583, + "step": 847 + }, + { + "epoch": 0.0743626246696144, + "grad_norm": 0.1396484375, + "learning_rate": 0.002544, + "loss": 1.383, + "step": 848 + }, + { + "epoch": 0.07445031644398893, + "grad_norm": 0.1298828125, + "learning_rate": 0.002547, + "loss": 1.2858, + "step": 849 + }, + { + "epoch": 0.07453800821836348, + "grad_norm": 0.1904296875, + "learning_rate": 0.00255, + "loss": 1.3304, + "step": 850 + }, + { + "epoch": 0.07462569999273802, + "grad_norm": 0.1796875, + "learning_rate": 0.002553, + "loss": 1.4514, + "step": 851 + }, + { + "epoch": 0.07471339176711257, + "grad_norm": 0.177734375, + "learning_rate": 0.002556, + "loss": 1.3373, + "step": 852 + }, + { + "epoch": 0.07480108354148711, + "grad_norm": 0.1875, + "learning_rate": 0.002559, + "loss": 1.4162, + "step": 853 + }, + { + "epoch": 0.07488877531586166, + "grad_norm": 0.1484375, + "learning_rate": 0.002562, + "loss": 1.3607, + "step": 854 + }, + { + "epoch": 0.07497646709023621, + "grad_norm": 0.1435546875, + "learning_rate": 0.002565, + "loss": 1.3378, + "step": 855 + }, + { + "epoch": 0.07506415886461075, + "grad_norm": 0.1435546875, + "learning_rate": 0.002568, + "loss": 1.3712, + "step": 856 + }, + { + "epoch": 0.0751518506389853, + "grad_norm": 0.111328125, + "learning_rate": 0.002571, + "loss": 1.3485, + "step": 857 + }, + { + "epoch": 0.07523954241335984, + "grad_norm": 0.0869140625, + "learning_rate": 0.002574, + "loss": 1.3551, + "step": 858 + }, + { + "epoch": 0.07532723418773439, + "grad_norm": 0.072265625, + "learning_rate": 0.002577, + "loss": 1.3312, + "step": 859 + }, + { + "epoch": 0.07541492596210893, + "grad_norm": 0.09912109375, + "learning_rate": 0.00258, + "loss": 1.3454, + "step": 860 + }, + { + "epoch": 0.07550261773648348, + "grad_norm": 0.103515625, + "learning_rate": 0.0025830000000000002, + "loss": 1.3705, + "step": 861 + }, + { + "epoch": 0.07559030951085802, + "grad_norm": 0.11181640625, + "learning_rate": 0.002586, + "loss": 1.2941, + "step": 862 + }, + { + "epoch": 0.07567800128523257, + "grad_norm": 0.0947265625, + "learning_rate": 0.002589, + "loss": 1.3392, + "step": 863 + }, + { + "epoch": 0.07576569305960712, + "grad_norm": 0.1357421875, + "learning_rate": 0.002592, + "loss": 1.3258, + "step": 864 + }, + { + "epoch": 0.07585338483398166, + "grad_norm": 0.07568359375, + "learning_rate": 0.002595, + "loss": 1.288, + "step": 865 + }, + { + "epoch": 0.07594107660835621, + "grad_norm": 0.17578125, + "learning_rate": 0.002598, + "loss": 1.4316, + "step": 866 + }, + { + "epoch": 0.07602876838273075, + "grad_norm": 0.130859375, + "learning_rate": 0.002601, + "loss": 1.3078, + "step": 867 + }, + { + "epoch": 0.0761164601571053, + "grad_norm": 0.1552734375, + "learning_rate": 0.002604, + "loss": 1.2932, + "step": 868 + }, + { + "epoch": 0.07620415193147984, + "grad_norm": 0.1669921875, + "learning_rate": 0.002607, + "loss": 1.3165, + "step": 869 + }, + { + "epoch": 0.07629184370585439, + "grad_norm": 0.123046875, + "learning_rate": 0.00261, + "loss": 1.3193, + "step": 870 + }, + { + "epoch": 0.07637953548022892, + "grad_norm": 0.09765625, + "learning_rate": 0.002613, + "loss": 1.3277, + "step": 871 + }, + { + "epoch": 0.07646722725460348, + "grad_norm": 0.111328125, + "learning_rate": 0.002616, + "loss": 1.2847, + "step": 872 + }, + { + "epoch": 0.07655491902897803, + "grad_norm": 0.12158203125, + "learning_rate": 0.0026190000000000002, + "loss": 1.335, + "step": 873 + }, + { + "epoch": 0.07664261080335256, + "grad_norm": 0.09130859375, + "learning_rate": 0.002622, + "loss": 1.2934, + "step": 874 + }, + { + "epoch": 0.07673030257772712, + "grad_norm": 0.11181640625, + "learning_rate": 0.002625, + "loss": 1.3118, + "step": 875 + }, + { + "epoch": 0.07681799435210165, + "grad_norm": 0.08837890625, + "learning_rate": 0.002628, + "loss": 1.268, + "step": 876 + }, + { + "epoch": 0.0769056861264762, + "grad_norm": 0.1630859375, + "learning_rate": 0.002631, + "loss": 1.3718, + "step": 877 + }, + { + "epoch": 0.07699337790085074, + "grad_norm": 0.087890625, + "learning_rate": 0.002634, + "loss": 1.347, + "step": 878 + }, + { + "epoch": 0.0770810696752253, + "grad_norm": 0.1025390625, + "learning_rate": 0.002637, + "loss": 1.321, + "step": 879 + }, + { + "epoch": 0.07716876144959983, + "grad_norm": 0.10498046875, + "learning_rate": 0.00264, + "loss": 1.3698, + "step": 880 + }, + { + "epoch": 0.07725645322397438, + "grad_norm": 0.1083984375, + "learning_rate": 0.002643, + "loss": 1.3034, + "step": 881 + }, + { + "epoch": 0.07734414499834893, + "grad_norm": 0.08251953125, + "learning_rate": 0.002646, + "loss": 1.3008, + "step": 882 + }, + { + "epoch": 0.07743183677272347, + "grad_norm": 0.107421875, + "learning_rate": 0.002649, + "loss": 1.3692, + "step": 883 + }, + { + "epoch": 0.07751952854709802, + "grad_norm": 0.1044921875, + "learning_rate": 0.0026520000000000003, + "loss": 1.3591, + "step": 884 + }, + { + "epoch": 0.07760722032147256, + "grad_norm": 0.0732421875, + "learning_rate": 0.0026550000000000002, + "loss": 1.2839, + "step": 885 + }, + { + "epoch": 0.07769491209584711, + "grad_norm": 0.10107421875, + "learning_rate": 0.002658, + "loss": 1.3315, + "step": 886 + }, + { + "epoch": 0.07778260387022165, + "grad_norm": 0.10595703125, + "learning_rate": 0.002661, + "loss": 1.306, + "step": 887 + }, + { + "epoch": 0.0778702956445962, + "grad_norm": 0.103515625, + "learning_rate": 0.002664, + "loss": 1.3655, + "step": 888 + }, + { + "epoch": 0.07795798741897074, + "grad_norm": 0.0908203125, + "learning_rate": 0.002667, + "loss": 1.3113, + "step": 889 + }, + { + "epoch": 0.07804567919334529, + "grad_norm": 0.1884765625, + "learning_rate": 0.00267, + "loss": 1.3352, + "step": 890 + }, + { + "epoch": 0.07813337096771984, + "grad_norm": 0.1689453125, + "learning_rate": 0.002673, + "loss": 1.2859, + "step": 891 + }, + { + "epoch": 0.07822106274209438, + "grad_norm": 0.10205078125, + "learning_rate": 0.002676, + "loss": 1.3539, + "step": 892 + }, + { + "epoch": 0.07830875451646893, + "grad_norm": 0.234375, + "learning_rate": 0.002679, + "loss": 1.3543, + "step": 893 + }, + { + "epoch": 0.07839644629084347, + "grad_norm": 0.2080078125, + "learning_rate": 0.002682, + "loss": 1.3276, + "step": 894 + }, + { + "epoch": 0.07848413806521802, + "grad_norm": 0.10302734375, + "learning_rate": 0.0026850000000000003, + "loss": 1.3294, + "step": 895 + }, + { + "epoch": 0.07857182983959256, + "grad_norm": 0.111328125, + "learning_rate": 0.0026880000000000003, + "loss": 1.2468, + "step": 896 + }, + { + "epoch": 0.07865952161396711, + "grad_norm": 0.1845703125, + "learning_rate": 0.0026910000000000002, + "loss": 1.3238, + "step": 897 + }, + { + "epoch": 0.07874721338834165, + "grad_norm": 0.1767578125, + "learning_rate": 0.002694, + "loss": 1.3838, + "step": 898 + }, + { + "epoch": 0.0788349051627162, + "grad_norm": 0.205078125, + "learning_rate": 0.002697, + "loss": 1.3141, + "step": 899 + }, + { + "epoch": 0.07892259693709075, + "grad_norm": 0.1455078125, + "learning_rate": 0.0027, + "loss": 1.2963, + "step": 900 + }, + { + "epoch": 0.07901028871146529, + "grad_norm": 0.15234375, + "learning_rate": 0.002703, + "loss": 1.256, + "step": 901 + }, + { + "epoch": 0.07909798048583984, + "grad_norm": 0.1259765625, + "learning_rate": 0.002706, + "loss": 1.3802, + "step": 902 + }, + { + "epoch": 0.07918567226021438, + "grad_norm": 0.10693359375, + "learning_rate": 0.002709, + "loss": 1.2523, + "step": 903 + }, + { + "epoch": 0.07927336403458893, + "grad_norm": 0.11083984375, + "learning_rate": 0.002712, + "loss": 1.2966, + "step": 904 + }, + { + "epoch": 0.07936105580896347, + "grad_norm": 0.1103515625, + "learning_rate": 0.002715, + "loss": 1.2996, + "step": 905 + }, + { + "epoch": 0.07944874758333802, + "grad_norm": 0.10986328125, + "learning_rate": 0.002718, + "loss": 1.3653, + "step": 906 + }, + { + "epoch": 0.07953643935771255, + "grad_norm": 0.1396484375, + "learning_rate": 0.0027210000000000003, + "loss": 1.3168, + "step": 907 + }, + { + "epoch": 0.0796241311320871, + "grad_norm": 0.095703125, + "learning_rate": 0.0027240000000000003, + "loss": 1.4055, + "step": 908 + }, + { + "epoch": 0.07971182290646166, + "grad_norm": 0.1357421875, + "learning_rate": 0.0027270000000000003, + "loss": 1.3193, + "step": 909 + }, + { + "epoch": 0.0797995146808362, + "grad_norm": 0.09912109375, + "learning_rate": 0.0027300000000000002, + "loss": 1.3323, + "step": 910 + }, + { + "epoch": 0.07988720645521075, + "grad_norm": 0.193359375, + "learning_rate": 0.002733, + "loss": 1.361, + "step": 911 + }, + { + "epoch": 0.07997489822958528, + "grad_norm": 0.255859375, + "learning_rate": 0.002736, + "loss": 1.3238, + "step": 912 + }, + { + "epoch": 0.08006259000395984, + "grad_norm": 0.1005859375, + "learning_rate": 0.002739, + "loss": 1.3205, + "step": 913 + }, + { + "epoch": 0.08015028177833437, + "grad_norm": 0.111328125, + "learning_rate": 0.002742, + "loss": 1.2774, + "step": 914 + }, + { + "epoch": 0.08023797355270892, + "grad_norm": 0.09521484375, + "learning_rate": 0.002745, + "loss": 1.3301, + "step": 915 + }, + { + "epoch": 0.08032566532708346, + "grad_norm": 0.09423828125, + "learning_rate": 0.002748, + "loss": 1.3214, + "step": 916 + }, + { + "epoch": 0.08041335710145801, + "grad_norm": 0.1416015625, + "learning_rate": 0.002751, + "loss": 1.3142, + "step": 917 + }, + { + "epoch": 0.08050104887583256, + "grad_norm": 0.07470703125, + "learning_rate": 0.0027540000000000004, + "loss": 1.3766, + "step": 918 + }, + { + "epoch": 0.0805887406502071, + "grad_norm": 0.11376953125, + "learning_rate": 0.0027570000000000003, + "loss": 1.2802, + "step": 919 + }, + { + "epoch": 0.08067643242458165, + "grad_norm": 0.087890625, + "learning_rate": 0.0027600000000000003, + "loss": 1.3042, + "step": 920 + }, + { + "epoch": 0.08076412419895619, + "grad_norm": 0.10986328125, + "learning_rate": 0.0027630000000000003, + "loss": 1.3302, + "step": 921 + }, + { + "epoch": 0.08085181597333074, + "grad_norm": 0.1044921875, + "learning_rate": 0.0027660000000000002, + "loss": 1.3524, + "step": 922 + }, + { + "epoch": 0.08093950774770528, + "grad_norm": 0.10888671875, + "learning_rate": 0.002769, + "loss": 1.229, + "step": 923 + }, + { + "epoch": 0.08102719952207983, + "grad_norm": 0.10107421875, + "learning_rate": 0.002772, + "loss": 1.334, + "step": 924 + }, + { + "epoch": 0.08111489129645437, + "grad_norm": 0.1318359375, + "learning_rate": 0.002775, + "loss": 1.3263, + "step": 925 + }, + { + "epoch": 0.08120258307082892, + "grad_norm": 0.201171875, + "learning_rate": 0.002778, + "loss": 1.3455, + "step": 926 + }, + { + "epoch": 0.08129027484520347, + "grad_norm": 0.0849609375, + "learning_rate": 0.002781, + "loss": 1.3536, + "step": 927 + }, + { + "epoch": 0.08137796661957801, + "grad_norm": 0.11865234375, + "learning_rate": 0.002784, + "loss": 1.3306, + "step": 928 + }, + { + "epoch": 0.08146565839395256, + "grad_norm": 0.0771484375, + "learning_rate": 0.0027870000000000004, + "loss": 1.3462, + "step": 929 + }, + { + "epoch": 0.0815533501683271, + "grad_norm": 0.12890625, + "learning_rate": 0.0027900000000000004, + "loss": 1.2984, + "step": 930 + }, + { + "epoch": 0.08164104194270165, + "grad_norm": 0.10205078125, + "learning_rate": 0.0027930000000000003, + "loss": 1.3302, + "step": 931 + }, + { + "epoch": 0.08172873371707619, + "grad_norm": 0.1005859375, + "learning_rate": 0.0027960000000000003, + "loss": 1.356, + "step": 932 + }, + { + "epoch": 0.08181642549145074, + "grad_norm": 0.091796875, + "learning_rate": 0.0027990000000000003, + "loss": 1.3102, + "step": 933 + }, + { + "epoch": 0.08190411726582528, + "grad_norm": 0.10205078125, + "learning_rate": 0.0028020000000000002, + "loss": 1.4226, + "step": 934 + }, + { + "epoch": 0.08199180904019983, + "grad_norm": 0.0869140625, + "learning_rate": 0.002805, + "loss": 1.3052, + "step": 935 + }, + { + "epoch": 0.08207950081457437, + "grad_norm": 0.095703125, + "learning_rate": 0.002808, + "loss": 1.3552, + "step": 936 + }, + { + "epoch": 0.08216719258894892, + "grad_norm": 0.07958984375, + "learning_rate": 0.002811, + "loss": 1.3334, + "step": 937 + }, + { + "epoch": 0.08225488436332347, + "grad_norm": 0.15234375, + "learning_rate": 0.002814, + "loss": 1.3238, + "step": 938 + }, + { + "epoch": 0.082342576137698, + "grad_norm": 0.08154296875, + "learning_rate": 0.002817, + "loss": 1.2994, + "step": 939 + }, + { + "epoch": 0.08243026791207256, + "grad_norm": 0.12158203125, + "learning_rate": 0.00282, + "loss": 1.2673, + "step": 940 + }, + { + "epoch": 0.0825179596864471, + "grad_norm": 0.126953125, + "learning_rate": 0.002823, + "loss": 1.2656, + "step": 941 + }, + { + "epoch": 0.08260565146082165, + "grad_norm": 0.10009765625, + "learning_rate": 0.002826, + "loss": 1.3713, + "step": 942 + }, + { + "epoch": 0.08269334323519618, + "grad_norm": 0.27734375, + "learning_rate": 0.002829, + "loss": 1.3689, + "step": 943 + }, + { + "epoch": 0.08278103500957074, + "grad_norm": 0.1533203125, + "learning_rate": 0.002832, + "loss": 1.3078, + "step": 944 + }, + { + "epoch": 0.08286872678394527, + "grad_norm": 0.201171875, + "learning_rate": 0.002835, + "loss": 1.3463, + "step": 945 + }, + { + "epoch": 0.08295641855831982, + "grad_norm": 0.2109375, + "learning_rate": 0.002838, + "loss": 1.2728, + "step": 946 + }, + { + "epoch": 0.08304411033269438, + "grad_norm": 0.10205078125, + "learning_rate": 0.0028409999999999998, + "loss": 1.2866, + "step": 947 + }, + { + "epoch": 0.08313180210706891, + "grad_norm": 0.09375, + "learning_rate": 0.0028439999999999997, + "loss": 1.3144, + "step": 948 + }, + { + "epoch": 0.08321949388144347, + "grad_norm": 0.10986328125, + "learning_rate": 0.002847, + "loss": 1.2829, + "step": 949 + }, + { + "epoch": 0.083307185655818, + "grad_norm": 0.07763671875, + "learning_rate": 0.00285, + "loss": 1.3705, + "step": 950 + }, + { + "epoch": 0.08339487743019255, + "grad_norm": 0.107421875, + "learning_rate": 0.002853, + "loss": 1.3597, + "step": 951 + }, + { + "epoch": 0.08348256920456709, + "grad_norm": 0.10791015625, + "learning_rate": 0.002856, + "loss": 1.3528, + "step": 952 + }, + { + "epoch": 0.08357026097894164, + "grad_norm": 0.126953125, + "learning_rate": 0.002859, + "loss": 1.3411, + "step": 953 + }, + { + "epoch": 0.08365795275331618, + "grad_norm": 0.1376953125, + "learning_rate": 0.002862, + "loss": 1.3656, + "step": 954 + }, + { + "epoch": 0.08374564452769073, + "grad_norm": 0.259765625, + "learning_rate": 0.002865, + "loss": 1.3907, + "step": 955 + }, + { + "epoch": 0.08383333630206528, + "grad_norm": 0.10986328125, + "learning_rate": 0.002868, + "loss": 1.3499, + "step": 956 + }, + { + "epoch": 0.08392102807643982, + "grad_norm": 0.13671875, + "learning_rate": 0.002871, + "loss": 1.326, + "step": 957 + }, + { + "epoch": 0.08400871985081437, + "grad_norm": 0.11083984375, + "learning_rate": 0.002874, + "loss": 1.2958, + "step": 958 + }, + { + "epoch": 0.08409641162518891, + "grad_norm": 0.087890625, + "learning_rate": 0.002877, + "loss": 1.2943, + "step": 959 + }, + { + "epoch": 0.08418410339956346, + "grad_norm": 0.15625, + "learning_rate": 0.0028799999999999997, + "loss": 1.2968, + "step": 960 + }, + { + "epoch": 0.084271795173938, + "grad_norm": 0.119140625, + "learning_rate": 0.002883, + "loss": 1.3424, + "step": 961 + }, + { + "epoch": 0.08435948694831255, + "grad_norm": 0.130859375, + "learning_rate": 0.002886, + "loss": 1.326, + "step": 962 + }, + { + "epoch": 0.08444717872268709, + "grad_norm": 0.1845703125, + "learning_rate": 0.002889, + "loss": 1.3498, + "step": 963 + }, + { + "epoch": 0.08453487049706164, + "grad_norm": 0.1787109375, + "learning_rate": 0.002892, + "loss": 1.2791, + "step": 964 + }, + { + "epoch": 0.08462256227143619, + "grad_norm": 0.0849609375, + "learning_rate": 0.002895, + "loss": 1.3582, + "step": 965 + }, + { + "epoch": 0.08471025404581073, + "grad_norm": 0.1337890625, + "learning_rate": 0.002898, + "loss": 1.3123, + "step": 966 + }, + { + "epoch": 0.08479794582018528, + "grad_norm": 0.1689453125, + "learning_rate": 0.002901, + "loss": 1.3389, + "step": 967 + }, + { + "epoch": 0.08488563759455982, + "grad_norm": 0.111328125, + "learning_rate": 0.002904, + "loss": 1.3073, + "step": 968 + }, + { + "epoch": 0.08497332936893437, + "grad_norm": 0.388671875, + "learning_rate": 0.002907, + "loss": 1.3919, + "step": 969 + }, + { + "epoch": 0.08506102114330891, + "grad_norm": 0.255859375, + "learning_rate": 0.00291, + "loss": 1.3619, + "step": 970 + }, + { + "epoch": 0.08514871291768346, + "grad_norm": 0.1748046875, + "learning_rate": 0.002913, + "loss": 1.2457, + "step": 971 + }, + { + "epoch": 0.085236404692058, + "grad_norm": 0.140625, + "learning_rate": 0.002916, + "loss": 1.3531, + "step": 972 + }, + { + "epoch": 0.08532409646643255, + "grad_norm": 0.19921875, + "learning_rate": 0.002919, + "loss": 1.3493, + "step": 973 + }, + { + "epoch": 0.0854117882408071, + "grad_norm": 0.1865234375, + "learning_rate": 0.002922, + "loss": 1.3564, + "step": 974 + }, + { + "epoch": 0.08549948001518164, + "grad_norm": 0.1572265625, + "learning_rate": 0.002925, + "loss": 1.3556, + "step": 975 + }, + { + "epoch": 0.08558717178955619, + "grad_norm": 0.10791015625, + "learning_rate": 0.002928, + "loss": 1.3064, + "step": 976 + }, + { + "epoch": 0.08567486356393073, + "grad_norm": 0.181640625, + "learning_rate": 0.002931, + "loss": 1.3248, + "step": 977 + }, + { + "epoch": 0.08576255533830528, + "grad_norm": 0.09619140625, + "learning_rate": 0.002934, + "loss": 1.2805, + "step": 978 + }, + { + "epoch": 0.08585024711267981, + "grad_norm": 0.138671875, + "learning_rate": 0.002937, + "loss": 1.3346, + "step": 979 + }, + { + "epoch": 0.08593793888705437, + "grad_norm": 0.08740234375, + "learning_rate": 0.00294, + "loss": 1.2495, + "step": 980 + }, + { + "epoch": 0.0860256306614289, + "grad_norm": 0.10693359375, + "learning_rate": 0.002943, + "loss": 1.3576, + "step": 981 + }, + { + "epoch": 0.08611332243580345, + "grad_norm": 0.0751953125, + "learning_rate": 0.002946, + "loss": 1.2991, + "step": 982 + }, + { + "epoch": 0.086201014210178, + "grad_norm": 0.08935546875, + "learning_rate": 0.0029490000000000002, + "loss": 1.4072, + "step": 983 + }, + { + "epoch": 0.08628870598455254, + "grad_norm": 0.10546875, + "learning_rate": 0.002952, + "loss": 1.3707, + "step": 984 + }, + { + "epoch": 0.0863763977589271, + "grad_norm": 0.095703125, + "learning_rate": 0.002955, + "loss": 1.3455, + "step": 985 + }, + { + "epoch": 0.08646408953330163, + "grad_norm": 0.099609375, + "learning_rate": 0.002958, + "loss": 1.3252, + "step": 986 + }, + { + "epoch": 0.08655178130767618, + "grad_norm": 0.08935546875, + "learning_rate": 0.002961, + "loss": 1.3285, + "step": 987 + }, + { + "epoch": 0.08663947308205072, + "grad_norm": 0.1328125, + "learning_rate": 0.002964, + "loss": 1.4067, + "step": 988 + }, + { + "epoch": 0.08672716485642527, + "grad_norm": 0.10009765625, + "learning_rate": 0.002967, + "loss": 1.321, + "step": 989 + }, + { + "epoch": 0.08681485663079981, + "grad_norm": 0.111328125, + "learning_rate": 0.00297, + "loss": 1.3814, + "step": 990 + }, + { + "epoch": 0.08690254840517436, + "grad_norm": 0.1953125, + "learning_rate": 0.002973, + "loss": 1.3265, + "step": 991 + }, + { + "epoch": 0.08699024017954891, + "grad_norm": 0.1865234375, + "learning_rate": 0.002976, + "loss": 1.4072, + "step": 992 + }, + { + "epoch": 0.08707793195392345, + "grad_norm": 0.10302734375, + "learning_rate": 0.002979, + "loss": 1.3561, + "step": 993 + }, + { + "epoch": 0.087165623728298, + "grad_norm": 0.08349609375, + "learning_rate": 0.002982, + "loss": 1.3047, + "step": 994 + }, + { + "epoch": 0.08725331550267254, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029850000000000002, + "loss": 1.2992, + "step": 995 + }, + { + "epoch": 0.08734100727704709, + "grad_norm": 0.11572265625, + "learning_rate": 0.002988, + "loss": 1.3093, + "step": 996 + }, + { + "epoch": 0.08742869905142163, + "grad_norm": 0.08984375, + "learning_rate": 0.002991, + "loss": 1.3291, + "step": 997 + }, + { + "epoch": 0.08751639082579618, + "grad_norm": 0.1484375, + "learning_rate": 0.002994, + "loss": 1.4177, + "step": 998 + }, + { + "epoch": 0.08760408260017072, + "grad_norm": 0.1513671875, + "learning_rate": 0.002997, + "loss": 1.2768, + "step": 999 + }, + { + "epoch": 0.08769177437454527, + "grad_norm": 0.10205078125, + "learning_rate": 0.003, + "loss": 1.3108, + "step": 1000 + }, + { + "epoch": 0.08769177437454527, + "eval_loss": 1.3424164056777954, + "eval_runtime": 429.1223, + "eval_samples_per_second": 33.666, + "eval_steps_per_second": 8.417, + "step": 1000 + }, + { + "epoch": 0.08777946614891982, + "grad_norm": 0.1044921875, + "learning_rate": 0.0029999999384417424, + "loss": 1.3823, + "step": 1001 + }, + { + "epoch": 0.08786715792329436, + "grad_norm": 0.16015625, + "learning_rate": 0.0029999997537669756, + "loss": 1.4031, + "step": 1002 + }, + { + "epoch": 0.08795484969766891, + "grad_norm": 0.1845703125, + "learning_rate": 0.002999999445975716, + "loss": 1.3088, + "step": 1003 + }, + { + "epoch": 0.08804254147204345, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029999990150679926, + "loss": 1.3848, + "step": 1004 + }, + { + "epoch": 0.088130233246418, + "grad_norm": 0.16015625, + "learning_rate": 0.002999998461043843, + "loss": 1.3547, + "step": 1005 + }, + { + "epoch": 0.08821792502079254, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029999977839033198, + "loss": 1.2681, + "step": 1006 + }, + { + "epoch": 0.08830561679516709, + "grad_norm": 0.150390625, + "learning_rate": 0.0029999969836464833, + "loss": 1.3275, + "step": 1007 + }, + { + "epoch": 0.08839330856954163, + "grad_norm": 0.11083984375, + "learning_rate": 0.002999996060273407, + "loss": 1.3565, + "step": 1008 + }, + { + "epoch": 0.08848100034391618, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029999950137841744, + "loss": 1.2566, + "step": 1009 + }, + { + "epoch": 0.08856869211829073, + "grad_norm": 0.087890625, + "learning_rate": 0.002999993844178882, + "loss": 1.3053, + "step": 1010 + }, + { + "epoch": 0.08865638389266527, + "grad_norm": 0.091796875, + "learning_rate": 0.002999992551457636, + "loss": 1.307, + "step": 1011 + }, + { + "epoch": 0.08874407566703982, + "grad_norm": 0.140625, + "learning_rate": 0.002999991135620554, + "loss": 1.3178, + "step": 1012 + }, + { + "epoch": 0.08883176744141436, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029999895966677658, + "loss": 1.2969, + "step": 1013 + }, + { + "epoch": 0.0889194592157889, + "grad_norm": 0.083984375, + "learning_rate": 0.0029999879345994113, + "loss": 1.378, + "step": 1014 + }, + { + "epoch": 0.08900715099016344, + "grad_norm": 0.083984375, + "learning_rate": 0.002999986149415642, + "loss": 1.3091, + "step": 1015 + }, + { + "epoch": 0.089094842764538, + "grad_norm": 0.130859375, + "learning_rate": 0.002999984241116621, + "loss": 1.3379, + "step": 1016 + }, + { + "epoch": 0.08918253453891253, + "grad_norm": 0.283203125, + "learning_rate": 0.0029999822097025223, + "loss": 1.3691, + "step": 1017 + }, + { + "epoch": 0.08927022631328708, + "grad_norm": 0.166015625, + "learning_rate": 0.0029999800551735304, + "loss": 1.3491, + "step": 1018 + }, + { + "epoch": 0.08935791808766164, + "grad_norm": 0.1025390625, + "learning_rate": 0.002999977777529843, + "loss": 1.3031, + "step": 1019 + }, + { + "epoch": 0.08944560986203617, + "grad_norm": 0.10791015625, + "learning_rate": 0.002999975376771667, + "loss": 1.3721, + "step": 1020 + }, + { + "epoch": 0.08953330163641073, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029999728528992214, + "loss": 1.3064, + "step": 1021 + }, + { + "epoch": 0.08962099341078526, + "grad_norm": 0.111328125, + "learning_rate": 0.002999970205912737, + "loss": 1.3724, + "step": 1022 + }, + { + "epoch": 0.08970868518515981, + "grad_norm": 0.189453125, + "learning_rate": 0.002999967435812455, + "loss": 1.3757, + "step": 1023 + }, + { + "epoch": 0.08979637695953435, + "grad_norm": 0.26953125, + "learning_rate": 0.0029999645425986265, + "loss": 1.3566, + "step": 1024 + }, + { + "epoch": 0.0898840687339089, + "grad_norm": 0.109375, + "learning_rate": 0.0029999615262715183, + "loss": 1.393, + "step": 1025 + }, + { + "epoch": 0.08997176050828344, + "grad_norm": 0.302734375, + "learning_rate": 0.0029999583868314025, + "loss": 1.3272, + "step": 1026 + }, + { + "epoch": 0.09005945228265799, + "grad_norm": 0.224609375, + "learning_rate": 0.0029999551242785674, + "loss": 1.3077, + "step": 1027 + }, + { + "epoch": 0.09014714405703254, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029999517386133097, + "loss": 1.2973, + "step": 1028 + }, + { + "epoch": 0.09023483583140708, + "grad_norm": 0.291015625, + "learning_rate": 0.0029999482298359386, + "loss": 1.3414, + "step": 1029 + }, + { + "epoch": 0.09032252760578163, + "grad_norm": 0.1396484375, + "learning_rate": 0.002999944597946773, + "loss": 1.2904, + "step": 1030 + }, + { + "epoch": 0.09041021938015617, + "grad_norm": 0.2109375, + "learning_rate": 0.0029999408429461456, + "loss": 1.3405, + "step": 1031 + }, + { + "epoch": 0.09049791115453072, + "grad_norm": 0.2158203125, + "learning_rate": 0.0029999369648343976, + "loss": 1.3186, + "step": 1032 + }, + { + "epoch": 0.09058560292890526, + "grad_norm": 0.11474609375, + "learning_rate": 0.0029999329636118837, + "loss": 1.2944, + "step": 1033 + }, + { + "epoch": 0.09067329470327981, + "grad_norm": 0.1630859375, + "learning_rate": 0.0029999288392789686, + "loss": 1.3552, + "step": 1034 + }, + { + "epoch": 0.09076098647765435, + "grad_norm": 0.10791015625, + "learning_rate": 0.002999924591836028, + "loss": 1.3493, + "step": 1035 + }, + { + "epoch": 0.0908486782520289, + "grad_norm": 0.193359375, + "learning_rate": 0.002999920221283449, + "loss": 1.3785, + "step": 1036 + }, + { + "epoch": 0.09093637002640345, + "grad_norm": 0.1484375, + "learning_rate": 0.002999915727621631, + "loss": 1.341, + "step": 1037 + }, + { + "epoch": 0.09102406180077799, + "grad_norm": 0.1474609375, + "learning_rate": 0.0029999111108509834, + "loss": 1.3182, + "step": 1038 + }, + { + "epoch": 0.09111175357515254, + "grad_norm": 0.1572265625, + "learning_rate": 0.0029999063709719278, + "loss": 1.3108, + "step": 1039 + }, + { + "epoch": 0.09119944534952708, + "grad_norm": 0.1376953125, + "learning_rate": 0.002999901507984895, + "loss": 1.3294, + "step": 1040 + }, + { + "epoch": 0.09128713712390163, + "grad_norm": 0.1669921875, + "learning_rate": 0.0029998965218903297, + "loss": 1.2971, + "step": 1041 + }, + { + "epoch": 0.09137482889827617, + "grad_norm": 0.10791015625, + "learning_rate": 0.0029998914126886864, + "loss": 1.3484, + "step": 1042 + }, + { + "epoch": 0.09146252067265072, + "grad_norm": 0.1005859375, + "learning_rate": 0.002999886180380431, + "loss": 1.3318, + "step": 1043 + }, + { + "epoch": 0.09155021244702526, + "grad_norm": 0.1171875, + "learning_rate": 0.0029998808249660407, + "loss": 1.2998, + "step": 1044 + }, + { + "epoch": 0.09163790422139981, + "grad_norm": 0.10546875, + "learning_rate": 0.0029998753464460038, + "loss": 1.3707, + "step": 1045 + }, + { + "epoch": 0.09172559599577436, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029998697448208205, + "loss": 1.348, + "step": 1046 + }, + { + "epoch": 0.0918132877701489, + "grad_norm": 0.171875, + "learning_rate": 0.0029998640200910006, + "loss": 1.3921, + "step": 1047 + }, + { + "epoch": 0.09190097954452345, + "grad_norm": 0.2060546875, + "learning_rate": 0.002999858172257067, + "loss": 1.368, + "step": 1048 + }, + { + "epoch": 0.09198867131889799, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029998522013195525, + "loss": 1.3864, + "step": 1049 + }, + { + "epoch": 0.09207636309327254, + "grad_norm": 0.1025390625, + "learning_rate": 0.0029998461072790017, + "loss": 1.3486, + "step": 1050 + }, + { + "epoch": 0.09216405486764707, + "grad_norm": 0.08203125, + "learning_rate": 0.002999839890135971, + "loss": 1.3157, + "step": 1051 + }, + { + "epoch": 0.09225174664202163, + "grad_norm": 0.072265625, + "learning_rate": 0.0029998335498910263, + "loss": 1.3175, + "step": 1052 + }, + { + "epoch": 0.09233943841639616, + "grad_norm": 0.09375, + "learning_rate": 0.002999827086544747, + "loss": 1.4045, + "step": 1053 + }, + { + "epoch": 0.09242713019077071, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029998205000977213, + "loss": 1.3117, + "step": 1054 + }, + { + "epoch": 0.09251482196514527, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029998137905505513, + "loss": 1.3054, + "step": 1055 + }, + { + "epoch": 0.0926025137395198, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029998069579038476, + "loss": 1.342, + "step": 1056 + }, + { + "epoch": 0.09269020551389436, + "grad_norm": 0.1083984375, + "learning_rate": 0.0029998000021582345, + "loss": 1.2806, + "step": 1057 + }, + { + "epoch": 0.09277789728826889, + "grad_norm": 0.1337890625, + "learning_rate": 0.002999792923314345, + "loss": 1.3156, + "step": 1058 + }, + { + "epoch": 0.09286558906264344, + "grad_norm": 0.0966796875, + "learning_rate": 0.0029997857213728257, + "loss": 1.3024, + "step": 1059 + }, + { + "epoch": 0.09295328083701798, + "grad_norm": 0.103515625, + "learning_rate": 0.0029997783963343332, + "loss": 1.3808, + "step": 1060 + }, + { + "epoch": 0.09304097261139253, + "grad_norm": 0.181640625, + "learning_rate": 0.002999770948199535, + "loss": 1.3611, + "step": 1061 + }, + { + "epoch": 0.09312866438576707, + "grad_norm": 0.11572265625, + "learning_rate": 0.002999763376969111, + "loss": 1.3256, + "step": 1062 + }, + { + "epoch": 0.09321635616014162, + "grad_norm": 0.12353515625, + "learning_rate": 0.002999755682643751, + "loss": 1.3321, + "step": 1063 + }, + { + "epoch": 0.09330404793451617, + "grad_norm": 0.18359375, + "learning_rate": 0.0029997478652241576, + "loss": 1.372, + "step": 1064 + }, + { + "epoch": 0.09339173970889071, + "grad_norm": 0.11865234375, + "learning_rate": 0.002999739924711043, + "loss": 1.3459, + "step": 1065 + }, + { + "epoch": 0.09347943148326526, + "grad_norm": 0.1416015625, + "learning_rate": 0.002999731861105132, + "loss": 1.3769, + "step": 1066 + }, + { + "epoch": 0.0935671232576398, + "grad_norm": 0.259765625, + "learning_rate": 0.002999723674407159, + "loss": 1.3364, + "step": 1067 + }, + { + "epoch": 0.09365481503201435, + "grad_norm": 0.1416015625, + "learning_rate": 0.0029997153646178715, + "loss": 1.3105, + "step": 1068 + }, + { + "epoch": 0.09374250680638889, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029997069317380274, + "loss": 1.3385, + "step": 1069 + }, + { + "epoch": 0.09383019858076344, + "grad_norm": 0.11962890625, + "learning_rate": 0.002999698375768395, + "loss": 1.3305, + "step": 1070 + }, + { + "epoch": 0.09391789035513798, + "grad_norm": 0.08642578125, + "learning_rate": 0.002999689696709755, + "loss": 1.3452, + "step": 1071 + }, + { + "epoch": 0.09400558212951253, + "grad_norm": 0.1640625, + "learning_rate": 0.0029996808945628984, + "loss": 1.3774, + "step": 1072 + }, + { + "epoch": 0.09409327390388708, + "grad_norm": 0.2099609375, + "learning_rate": 0.002999671969328629, + "loss": 1.3165, + "step": 1073 + }, + { + "epoch": 0.09418096567826162, + "grad_norm": 0.0673828125, + "learning_rate": 0.00299966292100776, + "loss": 1.3589, + "step": 1074 + }, + { + "epoch": 0.09426865745263617, + "grad_norm": 0.26171875, + "learning_rate": 0.0029996537496011166, + "loss": 1.3527, + "step": 1075 + }, + { + "epoch": 0.09435634922701071, + "grad_norm": 0.19921875, + "learning_rate": 0.0029996444551095352, + "loss": 1.3123, + "step": 1076 + }, + { + "epoch": 0.09444404100138526, + "grad_norm": 0.1767578125, + "learning_rate": 0.002999635037533864, + "loss": 1.3793, + "step": 1077 + }, + { + "epoch": 0.0945317327757598, + "grad_norm": 0.30078125, + "learning_rate": 0.0029996254968749614, + "loss": 1.3356, + "step": 1078 + }, + { + "epoch": 0.09461942455013435, + "grad_norm": 0.115234375, + "learning_rate": 0.002999615833133697, + "loss": 1.36, + "step": 1079 + }, + { + "epoch": 0.09470711632450889, + "grad_norm": 0.19921875, + "learning_rate": 0.0029996060463109535, + "loss": 1.2863, + "step": 1080 + }, + { + "epoch": 0.09479480809888344, + "grad_norm": 0.1171875, + "learning_rate": 0.002999596136407622, + "loss": 1.3104, + "step": 1081 + }, + { + "epoch": 0.09488249987325799, + "grad_norm": 0.1083984375, + "learning_rate": 0.002999586103424607, + "loss": 1.2665, + "step": 1082 + }, + { + "epoch": 0.09497019164763253, + "grad_norm": 0.1416015625, + "learning_rate": 0.0029995759473628227, + "loss": 1.3407, + "step": 1083 + }, + { + "epoch": 0.09505788342200708, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029995656682231964, + "loss": 1.3338, + "step": 1084 + }, + { + "epoch": 0.09514557519638162, + "grad_norm": 0.1484375, + "learning_rate": 0.0029995552660066656, + "loss": 1.3139, + "step": 1085 + }, + { + "epoch": 0.09523326697075617, + "grad_norm": 0.11328125, + "learning_rate": 0.0029995447407141777, + "loss": 1.3548, + "step": 1086 + }, + { + "epoch": 0.0953209587451307, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029995340923466935, + "loss": 1.268, + "step": 1087 + }, + { + "epoch": 0.09540865051950526, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029995233209051835, + "loss": 1.3708, + "step": 1088 + }, + { + "epoch": 0.0954963422938798, + "grad_norm": 0.158203125, + "learning_rate": 0.002999512426390631, + "loss": 1.2914, + "step": 1089 + }, + { + "epoch": 0.09558403406825434, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029995014088040287, + "loss": 1.3179, + "step": 1090 + }, + { + "epoch": 0.0956717258426289, + "grad_norm": 0.16796875, + "learning_rate": 0.0029994902681463815, + "loss": 1.3853, + "step": 1091 + }, + { + "epoch": 0.09575941761700343, + "grad_norm": 0.111328125, + "learning_rate": 0.0029994790044187056, + "loss": 1.2732, + "step": 1092 + }, + { + "epoch": 0.09584710939137799, + "grad_norm": 0.0751953125, + "learning_rate": 0.002999467617622028, + "loss": 1.2913, + "step": 1093 + }, + { + "epoch": 0.09593480116575252, + "grad_norm": 0.154296875, + "learning_rate": 0.002999456107757388, + "loss": 1.399, + "step": 1094 + }, + { + "epoch": 0.09602249294012707, + "grad_norm": 0.1865234375, + "learning_rate": 0.0029994444748258344, + "loss": 1.364, + "step": 1095 + }, + { + "epoch": 0.09611018471450161, + "grad_norm": 0.08203125, + "learning_rate": 0.0029994327188284276, + "loss": 1.363, + "step": 1096 + }, + { + "epoch": 0.09619787648887616, + "grad_norm": 0.13671875, + "learning_rate": 0.002999420839766241, + "loss": 1.3517, + "step": 1097 + }, + { + "epoch": 0.0962855682632507, + "grad_norm": 0.08203125, + "learning_rate": 0.002999408837640357, + "loss": 1.3214, + "step": 1098 + }, + { + "epoch": 0.09637326003762525, + "grad_norm": 0.087890625, + "learning_rate": 0.0029993967124518706, + "loss": 1.2575, + "step": 1099 + }, + { + "epoch": 0.0964609518119998, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029993844642018873, + "loss": 1.2762, + "step": 1100 + }, + { + "epoch": 0.09654864358637434, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029993720928915245, + "loss": 1.3552, + "step": 1101 + }, + { + "epoch": 0.09663633536074889, + "grad_norm": 0.10546875, + "learning_rate": 0.0029993595985219105, + "loss": 1.3962, + "step": 1102 + }, + { + "epoch": 0.09672402713512343, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029993469810941837, + "loss": 1.3563, + "step": 1103 + }, + { + "epoch": 0.09681171890949798, + "grad_norm": 0.11865234375, + "learning_rate": 0.0029993342406094965, + "loss": 1.2809, + "step": 1104 + }, + { + "epoch": 0.09689941068387252, + "grad_norm": 0.0947265625, + "learning_rate": 0.002999321377069009, + "loss": 1.3422, + "step": 1105 + }, + { + "epoch": 0.09698710245824707, + "grad_norm": 0.173828125, + "learning_rate": 0.0029993083904738954, + "loss": 1.333, + "step": 1106 + }, + { + "epoch": 0.09707479423262161, + "grad_norm": 0.166015625, + "learning_rate": 0.00299929528082534, + "loss": 1.2935, + "step": 1107 + }, + { + "epoch": 0.09716248600699616, + "grad_norm": 0.1328125, + "learning_rate": 0.0029992820481245385, + "loss": 1.3219, + "step": 1108 + }, + { + "epoch": 0.09725017778137071, + "grad_norm": 0.1083984375, + "learning_rate": 0.0029992686923726963, + "loss": 1.3029, + "step": 1109 + }, + { + "epoch": 0.09733786955574525, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029992552135710334, + "loss": 1.265, + "step": 1110 + }, + { + "epoch": 0.0974255613301198, + "grad_norm": 0.12451171875, + "learning_rate": 0.0029992416117207775, + "loss": 1.3016, + "step": 1111 + }, + { + "epoch": 0.09751325310449434, + "grad_norm": 0.08154296875, + "learning_rate": 0.00299922788682317, + "loss": 1.3026, + "step": 1112 + }, + { + "epoch": 0.09760094487886889, + "grad_norm": 0.10888671875, + "learning_rate": 0.0029992140388794626, + "loss": 1.3753, + "step": 1113 + }, + { + "epoch": 0.09768863665324343, + "grad_norm": 0.09375, + "learning_rate": 0.0029992000678909173, + "loss": 1.3631, + "step": 1114 + }, + { + "epoch": 0.09777632842761798, + "grad_norm": 0.08935546875, + "learning_rate": 0.002999185973858809, + "loss": 1.3922, + "step": 1115 + }, + { + "epoch": 0.09786402020199252, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029991717567844226, + "loss": 1.2998, + "step": 1116 + }, + { + "epoch": 0.09795171197636707, + "grad_norm": 0.10888671875, + "learning_rate": 0.002999157416669055, + "loss": 1.2946, + "step": 1117 + }, + { + "epoch": 0.09803940375074162, + "grad_norm": 0.08203125, + "learning_rate": 0.002999142953514014, + "loss": 1.3215, + "step": 1118 + }, + { + "epoch": 0.09812709552511616, + "grad_norm": 0.1064453125, + "learning_rate": 0.002999128367320618, + "loss": 1.407, + "step": 1119 + }, + { + "epoch": 0.09821478729949071, + "grad_norm": 0.1298828125, + "learning_rate": 0.002999113658090198, + "loss": 1.3248, + "step": 1120 + }, + { + "epoch": 0.09830247907386525, + "grad_norm": 0.08740234375, + "learning_rate": 0.002999098825824095, + "loss": 1.3874, + "step": 1121 + }, + { + "epoch": 0.0983901708482398, + "grad_norm": 0.08984375, + "learning_rate": 0.0029990838705236614, + "loss": 1.3076, + "step": 1122 + }, + { + "epoch": 0.09847786262261433, + "grad_norm": 0.08740234375, + "learning_rate": 0.002999068792190262, + "loss": 1.3557, + "step": 1123 + }, + { + "epoch": 0.09856555439698889, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029990535908252713, + "loss": 1.2909, + "step": 1124 + }, + { + "epoch": 0.09865324617136342, + "grad_norm": 0.10546875, + "learning_rate": 0.0029990382664300754, + "loss": 1.2819, + "step": 1125 + }, + { + "epoch": 0.09874093794573797, + "grad_norm": 0.140625, + "learning_rate": 0.0029990228190060722, + "loss": 1.3976, + "step": 1126 + }, + { + "epoch": 0.09882862972011253, + "grad_norm": 0.11669921875, + "learning_rate": 0.0029990072485546705, + "loss": 1.3246, + "step": 1127 + }, + { + "epoch": 0.09891632149448706, + "grad_norm": 0.1376953125, + "learning_rate": 0.00299899155507729, + "loss": 1.4496, + "step": 1128 + }, + { + "epoch": 0.09900401326886162, + "grad_norm": 0.376953125, + "learning_rate": 0.002998975738575362, + "loss": 1.3328, + "step": 1129 + }, + { + "epoch": 0.09909170504323615, + "grad_norm": 0.25390625, + "learning_rate": 0.002998959799050329, + "loss": 1.3249, + "step": 1130 + }, + { + "epoch": 0.0991793968176107, + "grad_norm": 0.12060546875, + "learning_rate": 0.0029989437365036454, + "loss": 1.2959, + "step": 1131 + }, + { + "epoch": 0.09926708859198524, + "grad_norm": 0.13671875, + "learning_rate": 0.0029989275509367747, + "loss": 1.3706, + "step": 1132 + }, + { + "epoch": 0.0993547803663598, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029989112423511933, + "loss": 1.312, + "step": 1133 + }, + { + "epoch": 0.09944247214073433, + "grad_norm": 0.08740234375, + "learning_rate": 0.0029988948107483896, + "loss": 1.3564, + "step": 1134 + }, + { + "epoch": 0.09953016391510888, + "grad_norm": 0.099609375, + "learning_rate": 0.0029988782561298608, + "loss": 1.3321, + "step": 1135 + }, + { + "epoch": 0.09961785568948343, + "grad_norm": 0.1171875, + "learning_rate": 0.002998861578497117, + "loss": 1.3139, + "step": 1136 + }, + { + "epoch": 0.09970554746385797, + "grad_norm": 0.1396484375, + "learning_rate": 0.00299884477785168, + "loss": 1.2867, + "step": 1137 + }, + { + "epoch": 0.09979323923823252, + "grad_norm": 0.0966796875, + "learning_rate": 0.0029988278541950805, + "loss": 1.2831, + "step": 1138 + }, + { + "epoch": 0.09988093101260706, + "grad_norm": 0.08544921875, + "learning_rate": 0.002998810807528863, + "loss": 1.3432, + "step": 1139 + }, + { + "epoch": 0.09996862278698161, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029987936378545813, + "loss": 1.3169, + "step": 1140 + }, + { + "epoch": 0.10005631456135615, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029987763451738026, + "loss": 1.3212, + "step": 1141 + }, + { + "epoch": 0.1001440063357307, + "grad_norm": 0.087890625, + "learning_rate": 0.0029987589294881026, + "loss": 1.382, + "step": 1142 + }, + { + "epoch": 0.10023169811010524, + "grad_norm": 0.111328125, + "learning_rate": 0.0029987413907990703, + "loss": 1.2966, + "step": 1143 + }, + { + "epoch": 0.10031938988447979, + "grad_norm": 0.1455078125, + "learning_rate": 0.0029987237291083046, + "loss": 1.2973, + "step": 1144 + }, + { + "epoch": 0.10040708165885433, + "grad_norm": 0.1044921875, + "learning_rate": 0.002998705944417417, + "loss": 1.3291, + "step": 1145 + }, + { + "epoch": 0.10049477343322888, + "grad_norm": 0.11474609375, + "learning_rate": 0.002998688036728028, + "loss": 1.3089, + "step": 1146 + }, + { + "epoch": 0.10058246520760343, + "grad_norm": 0.076171875, + "learning_rate": 0.0029986700060417723, + "loss": 1.2835, + "step": 1147 + }, + { + "epoch": 0.10067015698197797, + "grad_norm": 0.11474609375, + "learning_rate": 0.0029986518523602936, + "loss": 1.3405, + "step": 1148 + }, + { + "epoch": 0.10075784875635252, + "grad_norm": 0.0966796875, + "learning_rate": 0.0029986335756852474, + "loss": 1.3233, + "step": 1149 + }, + { + "epoch": 0.10084554053072706, + "grad_norm": 0.09912109375, + "learning_rate": 0.002998615176018301, + "loss": 1.282, + "step": 1150 + }, + { + "epoch": 0.10093323230510161, + "grad_norm": 0.08203125, + "learning_rate": 0.0029985966533611313, + "loss": 1.3426, + "step": 1151 + }, + { + "epoch": 0.10102092407947615, + "grad_norm": 0.0859375, + "learning_rate": 0.0029985780077154286, + "loss": 1.2735, + "step": 1152 + }, + { + "epoch": 0.1011086158538507, + "grad_norm": 0.146484375, + "learning_rate": 0.002998559239082893, + "loss": 1.3627, + "step": 1153 + }, + { + "epoch": 0.10119630762822523, + "grad_norm": 0.162109375, + "learning_rate": 0.002998540347465236, + "loss": 1.2934, + "step": 1154 + }, + { + "epoch": 0.10128399940259979, + "grad_norm": 0.12255859375, + "learning_rate": 0.0029985213328641803, + "loss": 1.3252, + "step": 1155 + }, + { + "epoch": 0.10137169117697434, + "grad_norm": 0.1728515625, + "learning_rate": 0.0029985021952814604, + "loss": 1.2824, + "step": 1156 + }, + { + "epoch": 0.10145938295134888, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029984829347188212, + "loss": 1.356, + "step": 1157 + }, + { + "epoch": 0.10154707472572343, + "grad_norm": 0.326171875, + "learning_rate": 0.00299846355117802, + "loss": 1.3524, + "step": 1158 + }, + { + "epoch": 0.10163476650009796, + "grad_norm": 0.19140625, + "learning_rate": 0.0029984440446608235, + "loss": 1.3321, + "step": 1159 + }, + { + "epoch": 0.10172245827447252, + "grad_norm": 0.2216796875, + "learning_rate": 0.0029984244151690116, + "loss": 1.3648, + "step": 1160 + }, + { + "epoch": 0.10181015004884705, + "grad_norm": 0.1787109375, + "learning_rate": 0.0029984046627043737, + "loss": 1.324, + "step": 1161 + }, + { + "epoch": 0.1018978418232216, + "grad_norm": 0.1171875, + "learning_rate": 0.0029983847872687114, + "loss": 1.3589, + "step": 1162 + }, + { + "epoch": 0.10198553359759614, + "grad_norm": 0.09716796875, + "learning_rate": 0.002998364788863837, + "loss": 1.3402, + "step": 1163 + }, + { + "epoch": 0.1020732253719707, + "grad_norm": 0.10205078125, + "learning_rate": 0.002998344667491575, + "loss": 1.3162, + "step": 1164 + }, + { + "epoch": 0.10216091714634525, + "grad_norm": 0.126953125, + "learning_rate": 0.00299832442315376, + "loss": 1.3257, + "step": 1165 + }, + { + "epoch": 0.10224860892071978, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029983040558522384, + "loss": 1.4214, + "step": 1166 + }, + { + "epoch": 0.10233630069509433, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029982835655888674, + "loss": 1.348, + "step": 1167 + }, + { + "epoch": 0.10242399246946887, + "grad_norm": 0.10546875, + "learning_rate": 0.002998262952365516, + "loss": 1.2412, + "step": 1168 + }, + { + "epoch": 0.10251168424384342, + "grad_norm": 0.10888671875, + "learning_rate": 0.0029982422161840636, + "loss": 1.3103, + "step": 1169 + }, + { + "epoch": 0.10259937601821796, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029982213570464017, + "loss": 1.3279, + "step": 1170 + }, + { + "epoch": 0.10268706779259251, + "grad_norm": 0.109375, + "learning_rate": 0.0029982003749544324, + "loss": 1.34, + "step": 1171 + }, + { + "epoch": 0.10277475956696705, + "grad_norm": 0.0966796875, + "learning_rate": 0.0029981792699100692, + "loss": 1.3723, + "step": 1172 + }, + { + "epoch": 0.1028624513413416, + "grad_norm": 0.099609375, + "learning_rate": 0.0029981580419152372, + "loss": 1.2719, + "step": 1173 + }, + { + "epoch": 0.10295014311571615, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029981366909718715, + "loss": 1.2699, + "step": 1174 + }, + { + "epoch": 0.10303783489009069, + "grad_norm": 0.1015625, + "learning_rate": 0.0029981152170819206, + "loss": 1.3496, + "step": 1175 + }, + { + "epoch": 0.10312552666446524, + "grad_norm": 0.1552734375, + "learning_rate": 0.0029980936202473416, + "loss": 1.3897, + "step": 1176 + }, + { + "epoch": 0.10321321843883978, + "grad_norm": 0.2099609375, + "learning_rate": 0.002998071900470104, + "loss": 1.3579, + "step": 1177 + }, + { + "epoch": 0.10330091021321433, + "grad_norm": 0.1103515625, + "learning_rate": 0.00299805005775219, + "loss": 1.2813, + "step": 1178 + }, + { + "epoch": 0.10338860198758887, + "grad_norm": 0.2578125, + "learning_rate": 0.00299802809209559, + "loss": 1.2954, + "step": 1179 + }, + { + "epoch": 0.10347629376196342, + "grad_norm": 0.1044921875, + "learning_rate": 0.002998006003502308, + "loss": 1.4056, + "step": 1180 + }, + { + "epoch": 0.10356398553633796, + "grad_norm": 0.1572265625, + "learning_rate": 0.0029979837919743586, + "loss": 1.3866, + "step": 1181 + }, + { + "epoch": 0.10365167731071251, + "grad_norm": 0.158203125, + "learning_rate": 0.0029979614575137673, + "loss": 1.3485, + "step": 1182 + }, + { + "epoch": 0.10373936908508706, + "grad_norm": 0.373046875, + "learning_rate": 0.00299793900012257, + "loss": 1.3949, + "step": 1183 + }, + { + "epoch": 0.1038270608594616, + "grad_norm": 0.37109375, + "learning_rate": 0.002997916419802817, + "loss": 1.391, + "step": 1184 + }, + { + "epoch": 0.10391475263383615, + "grad_norm": 0.158203125, + "learning_rate": 0.0029978937165565656, + "loss": 1.3314, + "step": 1185 + }, + { + "epoch": 0.10400244440821069, + "grad_norm": 0.10888671875, + "learning_rate": 0.002997870890385886, + "loss": 1.3823, + "step": 1186 + }, + { + "epoch": 0.10409013618258524, + "grad_norm": 0.1181640625, + "learning_rate": 0.002997847941292861, + "loss": 1.391, + "step": 1187 + }, + { + "epoch": 0.10417782795695978, + "grad_norm": 0.12353515625, + "learning_rate": 0.0029978248692795837, + "loss": 1.2936, + "step": 1188 + }, + { + "epoch": 0.10426551973133433, + "grad_norm": 0.12060546875, + "learning_rate": 0.0029978016743481576, + "loss": 1.2837, + "step": 1189 + }, + { + "epoch": 0.10435321150570886, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029977783565006983, + "loss": 1.2655, + "step": 1190 + }, + { + "epoch": 0.10444090328008342, + "grad_norm": 0.111328125, + "learning_rate": 0.0029977549157393316, + "loss": 1.2235, + "step": 1191 + }, + { + "epoch": 0.10452859505445797, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029977313520661956, + "loss": 1.3755, + "step": 1192 + }, + { + "epoch": 0.1046162868288325, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029977076654834406, + "loss": 1.3078, + "step": 1193 + }, + { + "epoch": 0.10470397860320706, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029976838559932248, + "loss": 1.3872, + "step": 1194 + }, + { + "epoch": 0.1047916703775816, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029976599235977206, + "loss": 1.3248, + "step": 1195 + }, + { + "epoch": 0.10487936215195615, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029976358682991104, + "loss": 1.2766, + "step": 1196 + }, + { + "epoch": 0.10496705392633068, + "grad_norm": 0.09130859375, + "learning_rate": 0.002997611690099587, + "loss": 1.3499, + "step": 1197 + }, + { + "epoch": 0.10505474570070523, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029975873890013575, + "loss": 1.3031, + "step": 1198 + }, + { + "epoch": 0.10514243747507977, + "grad_norm": 0.158203125, + "learning_rate": 0.0029975629650066367, + "loss": 1.3624, + "step": 1199 + }, + { + "epoch": 0.10523012924945432, + "grad_norm": 0.14453125, + "learning_rate": 0.0029975384181176513, + "loss": 1.3101, + "step": 1200 + }, + { + "epoch": 0.10531782102382888, + "grad_norm": 0.10888671875, + "learning_rate": 0.0029975137483366416, + "loss": 1.2886, + "step": 1201 + }, + { + "epoch": 0.10540551279820341, + "grad_norm": 0.232421875, + "learning_rate": 0.0029974889556658568, + "loss": 1.2838, + "step": 1202 + }, + { + "epoch": 0.10549320457257796, + "grad_norm": 0.12353515625, + "learning_rate": 0.0029974640401075575, + "loss": 1.3332, + "step": 1203 + }, + { + "epoch": 0.1055808963469525, + "grad_norm": 0.1171875, + "learning_rate": 0.002997439001664016, + "loss": 1.3389, + "step": 1204 + }, + { + "epoch": 0.10566858812132705, + "grad_norm": 0.1728515625, + "learning_rate": 0.002997413840337516, + "loss": 1.309, + "step": 1205 + }, + { + "epoch": 0.10575627989570159, + "grad_norm": 0.12060546875, + "learning_rate": 0.0029973885561303524, + "loss": 1.2586, + "step": 1206 + }, + { + "epoch": 0.10584397167007614, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029973631490448306, + "loss": 1.3604, + "step": 1207 + }, + { + "epoch": 0.10593166344445068, + "grad_norm": 0.1259765625, + "learning_rate": 0.0029973376190832674, + "loss": 1.3604, + "step": 1208 + }, + { + "epoch": 0.10601935521882523, + "grad_norm": 0.08740234375, + "learning_rate": 0.002997311966247992, + "loss": 1.3153, + "step": 1209 + }, + { + "epoch": 0.10610704699319978, + "grad_norm": 0.1279296875, + "learning_rate": 0.0029972861905413436, + "loss": 1.3669, + "step": 1210 + }, + { + "epoch": 0.10619473876757432, + "grad_norm": 0.123046875, + "learning_rate": 0.002997260291965672, + "loss": 1.2343, + "step": 1211 + }, + { + "epoch": 0.10628243054194887, + "grad_norm": 0.11279296875, + "learning_rate": 0.0029972342705233404, + "loss": 1.2207, + "step": 1212 + }, + { + "epoch": 0.10637012231632341, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029972081262167197, + "loss": 1.3529, + "step": 1213 + }, + { + "epoch": 0.10645781409069796, + "grad_norm": 0.08203125, + "learning_rate": 0.0029971818590481974, + "loss": 1.3519, + "step": 1214 + }, + { + "epoch": 0.1065455058650725, + "grad_norm": 0.142578125, + "learning_rate": 0.0029971554690201665, + "loss": 1.305, + "step": 1215 + }, + { + "epoch": 0.10663319763944705, + "grad_norm": 0.146484375, + "learning_rate": 0.0029971289561350344, + "loss": 1.3518, + "step": 1216 + }, + { + "epoch": 0.10672088941382159, + "grad_norm": 0.083984375, + "learning_rate": 0.0029971023203952192, + "loss": 1.3848, + "step": 1217 + }, + { + "epoch": 0.10680858118819614, + "grad_norm": 0.138671875, + "learning_rate": 0.00299707556180315, + "loss": 1.3416, + "step": 1218 + }, + { + "epoch": 0.10689627296257069, + "grad_norm": 0.1162109375, + "learning_rate": 0.0029970486803612673, + "loss": 1.3402, + "step": 1219 + }, + { + "epoch": 0.10698396473694523, + "grad_norm": 0.07568359375, + "learning_rate": 0.002997021676072022, + "loss": 1.3151, + "step": 1220 + }, + { + "epoch": 0.10707165651131978, + "grad_norm": 0.109375, + "learning_rate": 0.002996994548937877, + "loss": 1.3481, + "step": 1221 + }, + { + "epoch": 0.10715934828569432, + "grad_norm": 0.1162109375, + "learning_rate": 0.002996967298961307, + "loss": 1.2862, + "step": 1222 + }, + { + "epoch": 0.10724704006006887, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029969399261447955, + "loss": 1.2572, + "step": 1223 + }, + { + "epoch": 0.1073347318344434, + "grad_norm": 0.1005859375, + "learning_rate": 0.002996912430490841, + "loss": 1.284, + "step": 1224 + }, + { + "epoch": 0.10742242360881796, + "grad_norm": 0.12451171875, + "learning_rate": 0.002996884812001949, + "loss": 1.3279, + "step": 1225 + }, + { + "epoch": 0.1075101153831925, + "grad_norm": 0.203125, + "learning_rate": 0.002996857070680639, + "loss": 1.3086, + "step": 1226 + }, + { + "epoch": 0.10759780715756705, + "grad_norm": 0.10400390625, + "learning_rate": 0.002996829206529442, + "loss": 1.3585, + "step": 1227 + }, + { + "epoch": 0.1076854989319416, + "grad_norm": 0.16015625, + "learning_rate": 0.0029968012195508978, + "loss": 1.3435, + "step": 1228 + }, + { + "epoch": 0.10777319070631614, + "grad_norm": 0.1884765625, + "learning_rate": 0.0029967731097475586, + "loss": 1.3828, + "step": 1229 + }, + { + "epoch": 0.10786088248069069, + "grad_norm": 0.2314453125, + "learning_rate": 0.002996744877121989, + "loss": 1.3482, + "step": 1230 + }, + { + "epoch": 0.10794857425506522, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029967165216767634, + "loss": 1.3508, + "step": 1231 + }, + { + "epoch": 0.10803626602943978, + "grad_norm": 0.2236328125, + "learning_rate": 0.0029966880434144673, + "loss": 1.2748, + "step": 1232 + }, + { + "epoch": 0.10812395780381431, + "grad_norm": 0.158203125, + "learning_rate": 0.0029966594423376973, + "loss": 1.2977, + "step": 1233 + }, + { + "epoch": 0.10821164957818886, + "grad_norm": 0.142578125, + "learning_rate": 0.002996630718449064, + "loss": 1.3324, + "step": 1234 + }, + { + "epoch": 0.1082993413525634, + "grad_norm": 0.1728515625, + "learning_rate": 0.0029966018717511845, + "loss": 1.2831, + "step": 1235 + }, + { + "epoch": 0.10838703312693795, + "grad_norm": 0.06884765625, + "learning_rate": 0.002996572902246691, + "loss": 1.2907, + "step": 1236 + }, + { + "epoch": 0.1084747249013125, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029965438099382245, + "loss": 1.3001, + "step": 1237 + }, + { + "epoch": 0.10856241667568704, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029965145948284387, + "loss": 1.3221, + "step": 1238 + }, + { + "epoch": 0.1086501084500616, + "grad_norm": 0.11083984375, + "learning_rate": 0.0029964852569199984, + "loss": 1.2634, + "step": 1239 + }, + { + "epoch": 0.10873780022443613, + "grad_norm": 0.072265625, + "learning_rate": 0.002996455796215578, + "loss": 1.294, + "step": 1240 + }, + { + "epoch": 0.10882549199881068, + "grad_norm": 0.09765625, + "learning_rate": 0.0029964262127178654, + "loss": 1.3118, + "step": 1241 + }, + { + "epoch": 0.10891318377318522, + "grad_norm": 0.10009765625, + "learning_rate": 0.0029963965064295573, + "loss": 1.27, + "step": 1242 + }, + { + "epoch": 0.10900087554755977, + "grad_norm": 0.0830078125, + "learning_rate": 0.002996366677353364, + "loss": 1.3144, + "step": 1243 + }, + { + "epoch": 0.10908856732193431, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029963367254920055, + "loss": 1.339, + "step": 1244 + }, + { + "epoch": 0.10917625909630886, + "grad_norm": 0.11376953125, + "learning_rate": 0.002996306650848213, + "loss": 1.3353, + "step": 1245 + }, + { + "epoch": 0.10926395087068341, + "grad_norm": 0.1611328125, + "learning_rate": 0.0029962764534247287, + "loss": 1.2875, + "step": 1246 + }, + { + "epoch": 0.10935164264505795, + "grad_norm": 0.076171875, + "learning_rate": 0.0029962461332243085, + "loss": 1.3335, + "step": 1247 + }, + { + "epoch": 0.1094393344194325, + "grad_norm": 0.1513671875, + "learning_rate": 0.0029962156902497154, + "loss": 1.2532, + "step": 1248 + }, + { + "epoch": 0.10952702619380704, + "grad_norm": 0.091796875, + "learning_rate": 0.002996185124503727, + "loss": 1.2625, + "step": 1249 + }, + { + "epoch": 0.10961471796818159, + "grad_norm": 0.232421875, + "learning_rate": 0.0029961544359891302, + "loss": 1.3298, + "step": 1250 + }, + { + "epoch": 0.10970240974255613, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029961236247087234, + "loss": 1.2993, + "step": 1251 + }, + { + "epoch": 0.10979010151693068, + "grad_norm": 0.193359375, + "learning_rate": 0.0029960926906653185, + "loss": 1.3254, + "step": 1252 + }, + { + "epoch": 0.10987779329130522, + "grad_norm": 0.1162109375, + "learning_rate": 0.0029960616338617335, + "loss": 1.3134, + "step": 1253 + }, + { + "epoch": 0.10996548506567977, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029960304543008034, + "loss": 1.2123, + "step": 1254 + }, + { + "epoch": 0.11005317684005432, + "grad_norm": 0.0654296875, + "learning_rate": 0.00299599915198537, + "loss": 1.3634, + "step": 1255 + }, + { + "epoch": 0.11014086861442886, + "grad_norm": 0.09521484375, + "learning_rate": 0.002995967726918289, + "loss": 1.3638, + "step": 1256 + }, + { + "epoch": 0.11022856038880341, + "grad_norm": 0.10595703125, + "learning_rate": 0.002995936179102426, + "loss": 1.2941, + "step": 1257 + }, + { + "epoch": 0.11031625216317795, + "grad_norm": 0.1533203125, + "learning_rate": 0.0029959045085406573, + "loss": 1.3817, + "step": 1258 + }, + { + "epoch": 0.1104039439375525, + "grad_norm": 0.130859375, + "learning_rate": 0.002995872715235873, + "loss": 1.241, + "step": 1259 + }, + { + "epoch": 0.11049163571192704, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029958407991909704, + "loss": 1.3193, + "step": 1260 + }, + { + "epoch": 0.11057932748630159, + "grad_norm": 0.10400390625, + "learning_rate": 0.0029958087604088617, + "loss": 1.3237, + "step": 1261 + }, + { + "epoch": 0.11066701926067612, + "grad_norm": 0.08349609375, + "learning_rate": 0.002995776598892468, + "loss": 1.303, + "step": 1262 + }, + { + "epoch": 0.11075471103505068, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029957443146447224, + "loss": 1.3359, + "step": 1263 + }, + { + "epoch": 0.11084240280942523, + "grad_norm": 0.125, + "learning_rate": 0.0029957119076685695, + "loss": 1.292, + "step": 1264 + }, + { + "epoch": 0.11093009458379977, + "grad_norm": 0.0693359375, + "learning_rate": 0.002995679377966965, + "loss": 1.3467, + "step": 1265 + }, + { + "epoch": 0.11101778635817432, + "grad_norm": 0.1328125, + "learning_rate": 0.0029956467255428743, + "loss": 1.2914, + "step": 1266 + }, + { + "epoch": 0.11110547813254885, + "grad_norm": 0.1416015625, + "learning_rate": 0.0029956139503992765, + "loss": 1.3085, + "step": 1267 + }, + { + "epoch": 0.1111931699069234, + "grad_norm": 0.201171875, + "learning_rate": 0.0029955810525391603, + "loss": 1.2696, + "step": 1268 + }, + { + "epoch": 0.11128086168129794, + "grad_norm": 0.1015625, + "learning_rate": 0.002995548031965526, + "loss": 1.3236, + "step": 1269 + }, + { + "epoch": 0.1113685534556725, + "grad_norm": 0.2138671875, + "learning_rate": 0.0029955148886813836, + "loss": 1.3616, + "step": 1270 + }, + { + "epoch": 0.11145624523004703, + "grad_norm": 0.10791015625, + "learning_rate": 0.0029954816226897573, + "loss": 1.3317, + "step": 1271 + }, + { + "epoch": 0.11154393700442158, + "grad_norm": 0.1162109375, + "learning_rate": 0.0029954482339936803, + "loss": 1.3282, + "step": 1272 + }, + { + "epoch": 0.11163162877879614, + "grad_norm": 0.0859375, + "learning_rate": 0.002995414722596198, + "loss": 1.3022, + "step": 1273 + }, + { + "epoch": 0.11171932055317067, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029953810885003655, + "loss": 1.3222, + "step": 1274 + }, + { + "epoch": 0.11180701232754522, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029953473317092514, + "loss": 1.3184, + "step": 1275 + }, + { + "epoch": 0.11189470410191976, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029953134522259337, + "loss": 1.2974, + "step": 1276 + }, + { + "epoch": 0.11198239587629431, + "grad_norm": 0.1123046875, + "learning_rate": 0.0029952794500535014, + "loss": 1.3494, + "step": 1277 + }, + { + "epoch": 0.11207008765066885, + "grad_norm": 0.107421875, + "learning_rate": 0.0029952453251950563, + "loss": 1.2715, + "step": 1278 + }, + { + "epoch": 0.1121577794250434, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029952110776537105, + "loss": 1.2665, + "step": 1279 + }, + { + "epoch": 0.11224547119941794, + "grad_norm": 0.119140625, + "learning_rate": 0.002995176707432587, + "loss": 1.3398, + "step": 1280 + }, + { + "epoch": 0.11233316297379249, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029951422145348206, + "loss": 1.2775, + "step": 1281 + }, + { + "epoch": 0.11242085474816704, + "grad_norm": 0.059814453125, + "learning_rate": 0.0029951075989635566, + "loss": 1.2788, + "step": 1282 + }, + { + "epoch": 0.11250854652254158, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029950728607219513, + "loss": 1.3765, + "step": 1283 + }, + { + "epoch": 0.11259623829691613, + "grad_norm": 0.11962890625, + "learning_rate": 0.0029950379998131744, + "loss": 1.2966, + "step": 1284 + }, + { + "epoch": 0.11268393007129067, + "grad_norm": 0.12158203125, + "learning_rate": 0.0029950030162404035, + "loss": 1.2517, + "step": 1285 + }, + { + "epoch": 0.11277162184566522, + "grad_norm": 0.1455078125, + "learning_rate": 0.0029949679100068297, + "loss": 1.3251, + "step": 1286 + }, + { + "epoch": 0.11285931362003976, + "grad_norm": 0.11865234375, + "learning_rate": 0.002994932681115655, + "loss": 1.3349, + "step": 1287 + }, + { + "epoch": 0.11294700539441431, + "grad_norm": 0.1669921875, + "learning_rate": 0.0029948973295700907, + "loss": 1.3208, + "step": 1288 + }, + { + "epoch": 0.11303469716878885, + "grad_norm": 0.126953125, + "learning_rate": 0.002994861855373362, + "loss": 1.4167, + "step": 1289 + }, + { + "epoch": 0.1131223889431634, + "grad_norm": 0.09912109375, + "learning_rate": 0.002994826258528705, + "loss": 1.2385, + "step": 1290 + }, + { + "epoch": 0.11321008071753795, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029947905390393637, + "loss": 1.2785, + "step": 1291 + }, + { + "epoch": 0.11329777249191249, + "grad_norm": 0.267578125, + "learning_rate": 0.002994754696908597, + "loss": 1.3461, + "step": 1292 + }, + { + "epoch": 0.11338546426628704, + "grad_norm": 0.24609375, + "learning_rate": 0.0029947187321396735, + "loss": 1.3384, + "step": 1293 + }, + { + "epoch": 0.11347315604066158, + "grad_norm": 0.0732421875, + "learning_rate": 0.002994682644735873, + "loss": 1.3497, + "step": 1294 + }, + { + "epoch": 0.11356084781503613, + "grad_norm": 0.193359375, + "learning_rate": 0.0029946464347004867, + "loss": 1.3835, + "step": 1295 + }, + { + "epoch": 0.11364853958941067, + "grad_norm": 0.09765625, + "learning_rate": 0.002994610102036817, + "loss": 1.3218, + "step": 1296 + }, + { + "epoch": 0.11373623136378522, + "grad_norm": 0.12353515625, + "learning_rate": 0.0029945736467481767, + "loss": 1.2858, + "step": 1297 + }, + { + "epoch": 0.11382392313815975, + "grad_norm": 0.0859375, + "learning_rate": 0.002994537068837891, + "loss": 1.2978, + "step": 1298 + }, + { + "epoch": 0.1139116149125343, + "grad_norm": 0.09814453125, + "learning_rate": 0.002994500368309295, + "loss": 1.3046, + "step": 1299 + }, + { + "epoch": 0.11399930668690886, + "grad_norm": 0.10400390625, + "learning_rate": 0.0029944635451657365, + "loss": 1.2858, + "step": 1300 + }, + { + "epoch": 0.1140869984612834, + "grad_norm": 0.10498046875, + "learning_rate": 0.002994426599410574, + "loss": 1.3892, + "step": 1301 + }, + { + "epoch": 0.11417469023565795, + "grad_norm": 0.1220703125, + "learning_rate": 0.0029943895310471755, + "loss": 1.3058, + "step": 1302 + }, + { + "epoch": 0.11426238201003248, + "grad_norm": 0.115234375, + "learning_rate": 0.0029943523400789225, + "loss": 1.3764, + "step": 1303 + }, + { + "epoch": 0.11435007378440704, + "grad_norm": 0.09326171875, + "learning_rate": 0.0029943150265092067, + "loss": 1.3253, + "step": 1304 + }, + { + "epoch": 0.11443776555878157, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029942775903414307, + "loss": 1.3296, + "step": 1305 + }, + { + "epoch": 0.11452545733315612, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029942400315790085, + "loss": 1.3896, + "step": 1306 + }, + { + "epoch": 0.11461314910753066, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029942023502253657, + "loss": 1.3069, + "step": 1307 + }, + { + "epoch": 0.11470084088190521, + "grad_norm": 0.087890625, + "learning_rate": 0.0029941645462839388, + "loss": 1.2887, + "step": 1308 + }, + { + "epoch": 0.11478853265627977, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029941266197581755, + "loss": 1.3141, + "step": 1309 + }, + { + "epoch": 0.1148762244306543, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029940885706515336, + "loss": 1.3429, + "step": 1310 + }, + { + "epoch": 0.11496391620502885, + "grad_norm": 0.0703125, + "learning_rate": 0.002994050398967484, + "loss": 1.3575, + "step": 1311 + }, + { + "epoch": 0.11505160797940339, + "grad_norm": 0.11669921875, + "learning_rate": 0.002994012104709508, + "loss": 1.2967, + "step": 1312 + }, + { + "epoch": 0.11513929975377794, + "grad_norm": 0.111328125, + "learning_rate": 0.002993973687881097, + "loss": 1.4125, + "step": 1313 + }, + { + "epoch": 0.11522699152815248, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029939351484857555, + "loss": 1.2555, + "step": 1314 + }, + { + "epoch": 0.11531468330252703, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029938964865269977, + "loss": 1.2359, + "step": 1315 + }, + { + "epoch": 0.11540237507690157, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029938577020083503, + "loss": 1.3448, + "step": 1316 + }, + { + "epoch": 0.11549006685127612, + "grad_norm": 0.0703125, + "learning_rate": 0.0029938187949333484, + "loss": 1.2827, + "step": 1317 + }, + { + "epoch": 0.11557775862565067, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029937797653055423, + "loss": 1.3329, + "step": 1318 + }, + { + "epoch": 0.11566545040002521, + "grad_norm": 0.083984375, + "learning_rate": 0.00299374061312849, + "loss": 1.2784, + "step": 1319 + }, + { + "epoch": 0.11575314217439976, + "grad_norm": 0.12255859375, + "learning_rate": 0.002993701338405763, + "loss": 1.3145, + "step": 1320 + }, + { + "epoch": 0.1158408339487743, + "grad_norm": 0.09765625, + "learning_rate": 0.002993661941140943, + "loss": 1.3254, + "step": 1321 + }, + { + "epoch": 0.11592852572314885, + "grad_norm": 0.06298828125, + "learning_rate": 0.002993622421337622, + "loss": 1.3961, + "step": 1322 + }, + { + "epoch": 0.11601621749752339, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029935827789994048, + "loss": 1.3328, + "step": 1323 + }, + { + "epoch": 0.11610390927189794, + "grad_norm": 0.1015625, + "learning_rate": 0.002993543014129907, + "loss": 1.2933, + "step": 1324 + }, + { + "epoch": 0.11619160104627248, + "grad_norm": 0.2392578125, + "learning_rate": 0.0029935031267327543, + "loss": 1.3627, + "step": 1325 + }, + { + "epoch": 0.11627929282064703, + "grad_norm": 0.205078125, + "learning_rate": 0.0029934631168115847, + "loss": 1.272, + "step": 1326 + }, + { + "epoch": 0.11636698459502158, + "grad_norm": 0.09912109375, + "learning_rate": 0.002993422984370047, + "loss": 1.288, + "step": 1327 + }, + { + "epoch": 0.11645467636939612, + "grad_norm": 0.15625, + "learning_rate": 0.002993382729411801, + "loss": 1.3332, + "step": 1328 + }, + { + "epoch": 0.11654236814377067, + "grad_norm": 0.25, + "learning_rate": 0.0029933423519405184, + "loss": 1.3108, + "step": 1329 + }, + { + "epoch": 0.1166300599181452, + "grad_norm": 0.1064453125, + "learning_rate": 0.002993301851959881, + "loss": 1.2691, + "step": 1330 + }, + { + "epoch": 0.11671775169251976, + "grad_norm": 0.158203125, + "learning_rate": 0.002993261229473582, + "loss": 1.3041, + "step": 1331 + }, + { + "epoch": 0.1168054434668943, + "grad_norm": 0.1279296875, + "learning_rate": 0.0029932204844853273, + "loss": 1.3623, + "step": 1332 + }, + { + "epoch": 0.11689313524126885, + "grad_norm": 0.06640625, + "learning_rate": 0.0029931796169988313, + "loss": 1.29, + "step": 1333 + }, + { + "epoch": 0.11698082701564338, + "grad_norm": 0.08837890625, + "learning_rate": 0.002993138627017822, + "loss": 1.3212, + "step": 1334 + }, + { + "epoch": 0.11706851879001794, + "grad_norm": 0.0849609375, + "learning_rate": 0.002993097514546037, + "loss": 1.3031, + "step": 1335 + }, + { + "epoch": 0.11715621056439249, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029930562795872258, + "loss": 1.2398, + "step": 1336 + }, + { + "epoch": 0.11724390233876703, + "grad_norm": 0.0703125, + "learning_rate": 0.002993014922145149, + "loss": 1.3357, + "step": 1337 + }, + { + "epoch": 0.11733159411314158, + "grad_norm": 0.11181640625, + "learning_rate": 0.002992973442223578, + "loss": 1.3013, + "step": 1338 + }, + { + "epoch": 0.11741928588751611, + "grad_norm": 0.095703125, + "learning_rate": 0.0029929318398262965, + "loss": 1.3366, + "step": 1339 + }, + { + "epoch": 0.11750697766189067, + "grad_norm": 0.146484375, + "learning_rate": 0.002992890114957098, + "loss": 1.3721, + "step": 1340 + }, + { + "epoch": 0.1175946694362652, + "grad_norm": 0.18359375, + "learning_rate": 0.0029928482676197872, + "loss": 1.2499, + "step": 1341 + }, + { + "epoch": 0.11768236121063975, + "grad_norm": 0.1611328125, + "learning_rate": 0.002992806297818181, + "loss": 1.3645, + "step": 1342 + }, + { + "epoch": 0.11777005298501429, + "grad_norm": 0.1142578125, + "learning_rate": 0.002992764205556107, + "loss": 1.2865, + "step": 1343 + }, + { + "epoch": 0.11785774475938884, + "grad_norm": 0.1025390625, + "learning_rate": 0.0029927219908374037, + "loss": 1.3024, + "step": 1344 + }, + { + "epoch": 0.1179454365337634, + "grad_norm": 0.1904296875, + "learning_rate": 0.002992679653665921, + "loss": 1.3191, + "step": 1345 + }, + { + "epoch": 0.11803312830813793, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029926371940455204, + "loss": 1.3488, + "step": 1346 + }, + { + "epoch": 0.11812082008251248, + "grad_norm": 0.2890625, + "learning_rate": 0.0029925946119800737, + "loss": 1.3971, + "step": 1347 + }, + { + "epoch": 0.11820851185688702, + "grad_norm": 0.251953125, + "learning_rate": 0.0029925519074734635, + "loss": 1.3229, + "step": 1348 + }, + { + "epoch": 0.11829620363126157, + "grad_norm": 0.10693359375, + "learning_rate": 0.002992509080529586, + "loss": 1.3529, + "step": 1349 + }, + { + "epoch": 0.11838389540563611, + "grad_norm": 0.265625, + "learning_rate": 0.002992466131152345, + "loss": 1.4095, + "step": 1350 + }, + { + "epoch": 0.11847158718001066, + "grad_norm": 0.1904296875, + "learning_rate": 0.0029924230593456596, + "loss": 1.2716, + "step": 1351 + }, + { + "epoch": 0.1185592789543852, + "grad_norm": 0.123046875, + "learning_rate": 0.002992379865113456, + "loss": 1.2908, + "step": 1352 + }, + { + "epoch": 0.11864697072875975, + "grad_norm": 0.208984375, + "learning_rate": 0.0029923365484596743, + "loss": 1.2965, + "step": 1353 + }, + { + "epoch": 0.11873466250313429, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029922931093882645, + "loss": 1.3475, + "step": 1354 + }, + { + "epoch": 0.11882235427750884, + "grad_norm": 0.29296875, + "learning_rate": 0.002992249547903188, + "loss": 1.2868, + "step": 1355 + }, + { + "epoch": 0.11891004605188339, + "grad_norm": 0.1455078125, + "learning_rate": 0.002992205864008418, + "loss": 1.3291, + "step": 1356 + }, + { + "epoch": 0.11899773782625793, + "grad_norm": 0.1552734375, + "learning_rate": 0.002992162057707938, + "loss": 1.3299, + "step": 1357 + }, + { + "epoch": 0.11908542960063248, + "grad_norm": 0.146484375, + "learning_rate": 0.0029921181290057425, + "loss": 1.3613, + "step": 1358 + }, + { + "epoch": 0.11917312137500702, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029920740779058393, + "loss": 1.3172, + "step": 1359 + }, + { + "epoch": 0.11926081314938157, + "grad_norm": 0.1064453125, + "learning_rate": 0.0029920299044122445, + "loss": 1.3173, + "step": 1360 + }, + { + "epoch": 0.11934850492375611, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029919856085289863, + "loss": 1.3222, + "step": 1361 + }, + { + "epoch": 0.11943619669813066, + "grad_norm": 0.1689453125, + "learning_rate": 0.0029919411902601055, + "loss": 1.3778, + "step": 1362 + }, + { + "epoch": 0.1195238884725052, + "grad_norm": 0.12158203125, + "learning_rate": 0.002991896649609652, + "loss": 1.3441, + "step": 1363 + }, + { + "epoch": 0.11961158024687975, + "grad_norm": 0.1279296875, + "learning_rate": 0.0029918519865816885, + "loss": 1.3452, + "step": 1364 + }, + { + "epoch": 0.1196992720212543, + "grad_norm": 0.1787109375, + "learning_rate": 0.0029918072011802872, + "loss": 1.2676, + "step": 1365 + }, + { + "epoch": 0.11978696379562884, + "grad_norm": 0.103515625, + "learning_rate": 0.002991762293409534, + "loss": 1.2948, + "step": 1366 + }, + { + "epoch": 0.11987465557000339, + "grad_norm": 0.150390625, + "learning_rate": 0.002991717263273523, + "loss": 1.2419, + "step": 1367 + }, + { + "epoch": 0.11996234734437793, + "grad_norm": 0.10107421875, + "learning_rate": 0.0029916721107763606, + "loss": 1.2408, + "step": 1368 + }, + { + "epoch": 0.12005003911875248, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029916268359221655, + "loss": 1.3819, + "step": 1369 + }, + { + "epoch": 0.12013773089312701, + "grad_norm": 0.11865234375, + "learning_rate": 0.002991581438715066, + "loss": 1.2392, + "step": 1370 + }, + { + "epoch": 0.12022542266750157, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029915359191592036, + "loss": 1.2534, + "step": 1371 + }, + { + "epoch": 0.1203131144418761, + "grad_norm": 0.087890625, + "learning_rate": 0.002991490277258728, + "loss": 1.2898, + "step": 1372 + }, + { + "epoch": 0.12040080621625066, + "grad_norm": 0.1279296875, + "learning_rate": 0.0029914445130178016, + "loss": 1.2815, + "step": 1373 + }, + { + "epoch": 0.1204884979906252, + "grad_norm": 0.09716796875, + "learning_rate": 0.002991398626440599, + "loss": 1.3045, + "step": 1374 + }, + { + "epoch": 0.12057618976499974, + "grad_norm": 0.1435546875, + "learning_rate": 0.0029913526175313052, + "loss": 1.2907, + "step": 1375 + }, + { + "epoch": 0.1206638815393743, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029913064862941144, + "loss": 1.3564, + "step": 1376 + }, + { + "epoch": 0.12075157331374883, + "grad_norm": 0.076171875, + "learning_rate": 0.002991260232733235, + "loss": 1.2927, + "step": 1377 + }, + { + "epoch": 0.12083926508812338, + "grad_norm": 0.1044921875, + "learning_rate": 0.0029912138568528846, + "loss": 1.3473, + "step": 1378 + }, + { + "epoch": 0.12092695686249792, + "grad_norm": 0.07666015625, + "learning_rate": 0.002991167358657293, + "loss": 1.3737, + "step": 1379 + }, + { + "epoch": 0.12101464863687247, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029911207381507, + "loss": 1.2672, + "step": 1380 + }, + { + "epoch": 0.12110234041124701, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029910739953373584, + "loss": 1.267, + "step": 1381 + }, + { + "epoch": 0.12119003218562156, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029910271302215304, + "loss": 1.2971, + "step": 1382 + }, + { + "epoch": 0.12127772395999611, + "grad_norm": 0.10009765625, + "learning_rate": 0.0029909801428074896, + "loss": 1.3148, + "step": 1383 + }, + { + "epoch": 0.12136541573437065, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029909330330995213, + "loss": 1.3299, + "step": 1384 + }, + { + "epoch": 0.1214531075087452, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029908858011019226, + "loss": 1.2774, + "step": 1385 + }, + { + "epoch": 0.12154079928311974, + "grad_norm": 0.1474609375, + "learning_rate": 0.002990838446819, + "loss": 1.3522, + "step": 1386 + }, + { + "epoch": 0.12162849105749429, + "grad_norm": 0.2333984375, + "learning_rate": 0.002990790970255072, + "loss": 1.2872, + "step": 1387 + }, + { + "epoch": 0.12171618283186883, + "grad_norm": 0.09765625, + "learning_rate": 0.0029907433714144696, + "loss": 1.2608, + "step": 1388 + }, + { + "epoch": 0.12180387460624338, + "grad_norm": 0.2373046875, + "learning_rate": 0.0029906956503015325, + "loss": 1.2954, + "step": 1389 + }, + { + "epoch": 0.12189156638061792, + "grad_norm": 0.1181640625, + "learning_rate": 0.002990647806920613, + "loss": 1.3108, + "step": 1390 + }, + { + "epoch": 0.12197925815499247, + "grad_norm": 0.228515625, + "learning_rate": 0.0029905998412760744, + "loss": 1.3155, + "step": 1391 + }, + { + "epoch": 0.12206694992936702, + "grad_norm": 0.15625, + "learning_rate": 0.002990551753372291, + "loss": 1.274, + "step": 1392 + }, + { + "epoch": 0.12215464170374156, + "grad_norm": 0.1611328125, + "learning_rate": 0.0029905035432136484, + "loss": 1.2705, + "step": 1393 + }, + { + "epoch": 0.12224233347811611, + "grad_norm": 0.1728515625, + "learning_rate": 0.0029904552108045426, + "loss": 1.3025, + "step": 1394 + }, + { + "epoch": 0.12233002525249065, + "grad_norm": 0.1669921875, + "learning_rate": 0.0029904067561493824, + "loss": 1.3554, + "step": 1395 + }, + { + "epoch": 0.1224177170268652, + "grad_norm": 0.1650390625, + "learning_rate": 0.0029903581792525866, + "loss": 1.2629, + "step": 1396 + }, + { + "epoch": 0.12250540880123974, + "grad_norm": 0.12890625, + "learning_rate": 0.0029903094801185847, + "loss": 1.3087, + "step": 1397 + }, + { + "epoch": 0.12259310057561429, + "grad_norm": 0.162109375, + "learning_rate": 0.002990260658751818, + "loss": 1.2803, + "step": 1398 + }, + { + "epoch": 0.12268079234998883, + "grad_norm": 0.1494140625, + "learning_rate": 0.0029902117151567394, + "loss": 1.3107, + "step": 1399 + }, + { + "epoch": 0.12276848412436338, + "grad_norm": 0.162109375, + "learning_rate": 0.002990162649337812, + "loss": 1.3432, + "step": 1400 + }, + { + "epoch": 0.12285617589873793, + "grad_norm": 0.09228515625, + "learning_rate": 0.00299011346129951, + "loss": 1.3475, + "step": 1401 + }, + { + "epoch": 0.12294386767311247, + "grad_norm": 0.10107421875, + "learning_rate": 0.00299006415104632, + "loss": 1.3691, + "step": 1402 + }, + { + "epoch": 0.12303155944748702, + "grad_norm": 0.08203125, + "learning_rate": 0.00299001471858274, + "loss": 1.3435, + "step": 1403 + }, + { + "epoch": 0.12311925122186156, + "grad_norm": 0.126953125, + "learning_rate": 0.002989965163913276, + "loss": 1.2595, + "step": 1404 + }, + { + "epoch": 0.12320694299623611, + "grad_norm": 0.07666015625, + "learning_rate": 0.002989915487042448, + "loss": 1.2682, + "step": 1405 + }, + { + "epoch": 0.12329463477061064, + "grad_norm": 0.1298828125, + "learning_rate": 0.002989865687974787, + "loss": 1.3321, + "step": 1406 + }, + { + "epoch": 0.1233823265449852, + "grad_norm": 0.07861328125, + "learning_rate": 0.0029898157667148334, + "loss": 1.315, + "step": 1407 + }, + { + "epoch": 0.12347001831935973, + "grad_norm": 0.130859375, + "learning_rate": 0.0029897657232671408, + "loss": 1.3025, + "step": 1408 + }, + { + "epoch": 0.12355771009373429, + "grad_norm": 0.12060546875, + "learning_rate": 0.002989715557636273, + "loss": 1.3442, + "step": 1409 + }, + { + "epoch": 0.12364540186810884, + "grad_norm": 0.10498046875, + "learning_rate": 0.0029896652698268057, + "loss": 1.3382, + "step": 1410 + }, + { + "epoch": 0.12373309364248337, + "grad_norm": 0.13671875, + "learning_rate": 0.0029896148598433227, + "loss": 1.2784, + "step": 1411 + }, + { + "epoch": 0.12382078541685793, + "grad_norm": 0.1875, + "learning_rate": 0.0029895643276904235, + "loss": 1.2851, + "step": 1412 + }, + { + "epoch": 0.12390847719123246, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029895136733727157, + "loss": 1.2747, + "step": 1413 + }, + { + "epoch": 0.12399616896560701, + "grad_norm": 0.173828125, + "learning_rate": 0.0029894628968948184, + "loss": 1.3125, + "step": 1414 + }, + { + "epoch": 0.12408386073998155, + "grad_norm": 0.07080078125, + "learning_rate": 0.002989411998261363, + "loss": 1.3735, + "step": 1415 + }, + { + "epoch": 0.1241715525143561, + "grad_norm": 0.095703125, + "learning_rate": 0.0029893609774769908, + "loss": 1.3115, + "step": 1416 + }, + { + "epoch": 0.12425924428873064, + "grad_norm": 0.1025390625, + "learning_rate": 0.0029893098345463547, + "loss": 1.3742, + "step": 1417 + }, + { + "epoch": 0.12434693606310519, + "grad_norm": 0.1142578125, + "learning_rate": 0.0029892585694741196, + "loss": 1.3222, + "step": 1418 + }, + { + "epoch": 0.12443462783747974, + "grad_norm": 0.09619140625, + "learning_rate": 0.00298920718226496, + "loss": 1.3342, + "step": 1419 + }, + { + "epoch": 0.12452231961185428, + "grad_norm": 0.09326171875, + "learning_rate": 0.0029891556729235626, + "loss": 1.3557, + "step": 1420 + }, + { + "epoch": 0.12461001138622883, + "grad_norm": 0.0791015625, + "learning_rate": 0.002989104041454625, + "loss": 1.3099, + "step": 1421 + }, + { + "epoch": 0.12469770316060337, + "grad_norm": 0.15625, + "learning_rate": 0.002989052287862856, + "loss": 1.3465, + "step": 1422 + }, + { + "epoch": 0.12478539493497792, + "grad_norm": 0.1796875, + "learning_rate": 0.002989000412152975, + "loss": 1.3353, + "step": 1423 + }, + { + "epoch": 0.12487308670935246, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029889484143297124, + "loss": 1.318, + "step": 1424 + }, + { + "epoch": 0.12496077848372701, + "grad_norm": 0.169921875, + "learning_rate": 0.0029888962943978113, + "loss": 1.2814, + "step": 1425 + }, + { + "epoch": 0.12504847025810156, + "grad_norm": 0.09130859375, + "learning_rate": 0.002988844052362024, + "loss": 1.3305, + "step": 1426 + }, + { + "epoch": 0.1251361620324761, + "grad_norm": 0.2099609375, + "learning_rate": 0.002988791688227116, + "loss": 1.3228, + "step": 1427 + }, + { + "epoch": 0.12522385380685064, + "grad_norm": 0.130859375, + "learning_rate": 0.002988739201997862, + "loss": 1.3273, + "step": 1428 + }, + { + "epoch": 0.1253115455812252, + "grad_norm": 0.1318359375, + "learning_rate": 0.002988686593679048, + "loss": 1.2191, + "step": 1429 + }, + { + "epoch": 0.12539923735559974, + "grad_norm": 0.0986328125, + "learning_rate": 0.0029886338632754733, + "loss": 1.2992, + "step": 1430 + }, + { + "epoch": 0.12548692912997428, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029885810107919456, + "loss": 1.3225, + "step": 1431 + }, + { + "epoch": 0.12557462090434882, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029885280362332853, + "loss": 1.3324, + "step": 1432 + }, + { + "epoch": 0.12566231267872338, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029884749396043237, + "loss": 1.3346, + "step": 1433 + }, + { + "epoch": 0.12575000445309792, + "grad_norm": 0.080078125, + "learning_rate": 0.0029884217209099023, + "loss": 1.2389, + "step": 1434 + }, + { + "epoch": 0.12583769622747246, + "grad_norm": 0.1279296875, + "learning_rate": 0.0029883683801548754, + "loss": 1.3179, + "step": 1435 + }, + { + "epoch": 0.12592538800184702, + "grad_norm": 0.1640625, + "learning_rate": 0.0029883149173441075, + "loss": 1.2994, + "step": 1436 + }, + { + "epoch": 0.12601307977622156, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029882613324824737, + "loss": 1.2993, + "step": 1437 + }, + { + "epoch": 0.1261007715505961, + "grad_norm": 0.2734375, + "learning_rate": 0.002988207625574861, + "loss": 1.2902, + "step": 1438 + }, + { + "epoch": 0.12618846332497063, + "grad_norm": 0.302734375, + "learning_rate": 0.0029881537966261673, + "loss": 1.295, + "step": 1439 + }, + { + "epoch": 0.1262761550993452, + "grad_norm": 0.0703125, + "learning_rate": 0.0029880998456413023, + "loss": 1.3192, + "step": 1440 + }, + { + "epoch": 0.12636384687371974, + "grad_norm": 0.220703125, + "learning_rate": 0.002988045772625185, + "loss": 1.2559, + "step": 1441 + }, + { + "epoch": 0.12645153864809427, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029879915775827483, + "loss": 1.2742, + "step": 1442 + }, + { + "epoch": 0.1265392304224688, + "grad_norm": 0.1591796875, + "learning_rate": 0.002987937260518933, + "loss": 1.3096, + "step": 1443 + }, + { + "epoch": 0.12662692219684338, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029878828214386934, + "loss": 1.3017, + "step": 1444 + }, + { + "epoch": 0.12671461397121792, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029878282603469945, + "loss": 1.3181, + "step": 1445 + }, + { + "epoch": 0.12680230574559245, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029877735772488117, + "loss": 1.3489, + "step": 1446 + }, + { + "epoch": 0.12688999751996702, + "grad_norm": 0.11865234375, + "learning_rate": 0.0029877187721491323, + "loss": 1.2814, + "step": 1447 + }, + { + "epoch": 0.12697768929434156, + "grad_norm": 0.0859375, + "learning_rate": 0.002987663845052954, + "loss": 1.2852, + "step": 1448 + }, + { + "epoch": 0.1270653810687161, + "grad_norm": 0.14453125, + "learning_rate": 0.002987608795965286, + "loss": 1.3262, + "step": 1449 + }, + { + "epoch": 0.12715307284309063, + "grad_norm": 0.1337890625, + "learning_rate": 0.002987553624891149, + "loss": 1.312, + "step": 1450 + }, + { + "epoch": 0.1272407646174652, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029874983318355745, + "loss": 1.334, + "step": 1451 + }, + { + "epoch": 0.12732845639183973, + "grad_norm": 0.19921875, + "learning_rate": 0.0029874429168036047, + "loss": 1.3343, + "step": 1452 + }, + { + "epoch": 0.12741614816621427, + "grad_norm": 1.8125, + "learning_rate": 0.0029873873798002934, + "loss": 1.3057, + "step": 1453 + }, + { + "epoch": 0.1275038399405888, + "grad_norm": 0.11279296875, + "learning_rate": 0.0029873317208307056, + "loss": 1.35, + "step": 1454 + }, + { + "epoch": 0.12759153171496337, + "grad_norm": 0.2578125, + "learning_rate": 0.002987275939899917, + "loss": 1.3545, + "step": 1455 + }, + { + "epoch": 0.1276792234893379, + "grad_norm": 0.357421875, + "learning_rate": 0.002987220037013015, + "loss": 1.2747, + "step": 1456 + }, + { + "epoch": 0.12776691526371245, + "grad_norm": 0.154296875, + "learning_rate": 0.002987164012175098, + "loss": 1.3802, + "step": 1457 + }, + { + "epoch": 0.12785460703808701, + "grad_norm": 0.12255859375, + "learning_rate": 0.0029871078653912744, + "loss": 1.3492, + "step": 1458 + }, + { + "epoch": 0.12794229881246155, + "grad_norm": 0.1982421875, + "learning_rate": 0.0029870515966666654, + "loss": 1.3027, + "step": 1459 + }, + { + "epoch": 0.1280299905868361, + "grad_norm": 0.220703125, + "learning_rate": 0.002986995206006402, + "loss": 1.3125, + "step": 1460 + }, + { + "epoch": 0.12811768236121063, + "grad_norm": 0.203125, + "learning_rate": 0.0029869386934156276, + "loss": 1.2989, + "step": 1461 + }, + { + "epoch": 0.1282053741355852, + "grad_norm": 0.12353515625, + "learning_rate": 0.0029868820588994957, + "loss": 1.2363, + "step": 1462 + }, + { + "epoch": 0.12829306590995973, + "grad_norm": 0.1201171875, + "learning_rate": 0.0029868253024631715, + "loss": 1.3255, + "step": 1463 + }, + { + "epoch": 0.12838075768433427, + "grad_norm": 0.28125, + "learning_rate": 0.00298676842411183, + "loss": 1.317, + "step": 1464 + }, + { + "epoch": 0.12846844945870883, + "grad_norm": 0.11474609375, + "learning_rate": 0.00298671142385066, + "loss": 1.2981, + "step": 1465 + }, + { + "epoch": 0.12855614123308337, + "grad_norm": 0.267578125, + "learning_rate": 0.0029866543016848577, + "loss": 1.3597, + "step": 1466 + }, + { + "epoch": 0.1286438330074579, + "grad_norm": 0.1376953125, + "learning_rate": 0.002986597057619634, + "loss": 1.2674, + "step": 1467 + }, + { + "epoch": 0.12873152478183245, + "grad_norm": 0.1806640625, + "learning_rate": 0.0029865396916602094, + "loss": 1.3773, + "step": 1468 + }, + { + "epoch": 0.128819216556207, + "grad_norm": 0.201171875, + "learning_rate": 0.002986482203811815, + "loss": 1.331, + "step": 1469 + }, + { + "epoch": 0.12890690833058155, + "grad_norm": 0.1240234375, + "learning_rate": 0.002986424594079694, + "loss": 1.3114, + "step": 1470 + }, + { + "epoch": 0.1289946001049561, + "grad_norm": 0.2060546875, + "learning_rate": 0.0029863668624690995, + "loss": 1.3031, + "step": 1471 + }, + { + "epoch": 0.12908229187933062, + "grad_norm": 0.365234375, + "learning_rate": 0.0029863090089852972, + "loss": 1.2871, + "step": 1472 + }, + { + "epoch": 0.1291699836537052, + "grad_norm": 0.10888671875, + "learning_rate": 0.0029862510336335635, + "loss": 1.3051, + "step": 1473 + }, + { + "epoch": 0.12925767542807973, + "grad_norm": 0.185546875, + "learning_rate": 0.002986192936419184, + "loss": 1.3369, + "step": 1474 + }, + { + "epoch": 0.12934536720245426, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029861347173474584, + "loss": 1.3769, + "step": 1475 + }, + { + "epoch": 0.12943305897682883, + "grad_norm": 0.1572265625, + "learning_rate": 0.0029860763764236963, + "loss": 1.3722, + "step": 1476 + }, + { + "epoch": 0.12952075075120337, + "grad_norm": 0.1611328125, + "learning_rate": 0.0029860179136532174, + "loss": 1.3819, + "step": 1477 + }, + { + "epoch": 0.1296084425255779, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029859593290413535, + "loss": 1.3006, + "step": 1478 + }, + { + "epoch": 0.12969613429995244, + "grad_norm": 0.1298828125, + "learning_rate": 0.002985900622593448, + "loss": 1.2738, + "step": 1479 + }, + { + "epoch": 0.129783826074327, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029858417943148534, + "loss": 1.3179, + "step": 1480 + }, + { + "epoch": 0.12987151784870155, + "grad_norm": 0.072265625, + "learning_rate": 0.0029857828442109366, + "loss": 1.2314, + "step": 1481 + }, + { + "epoch": 0.12995920962307608, + "grad_norm": 0.10791015625, + "learning_rate": 0.0029857237722870724, + "loss": 1.3601, + "step": 1482 + }, + { + "epoch": 0.13004690139745065, + "grad_norm": 0.0654296875, + "learning_rate": 0.002985664578548648, + "loss": 1.2792, + "step": 1483 + }, + { + "epoch": 0.13013459317182519, + "grad_norm": 0.09375, + "learning_rate": 0.0029856052630010625, + "loss": 1.2767, + "step": 1484 + }, + { + "epoch": 0.13022228494619972, + "grad_norm": 0.07861328125, + "learning_rate": 0.002985545825649725, + "loss": 1.336, + "step": 1485 + }, + { + "epoch": 0.13030997672057426, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029854862665000554, + "loss": 1.3138, + "step": 1486 + }, + { + "epoch": 0.13039766849494883, + "grad_norm": 0.12255859375, + "learning_rate": 0.002985426585557486, + "loss": 1.252, + "step": 1487 + }, + { + "epoch": 0.13048536026932336, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029853667828274593, + "loss": 1.2989, + "step": 1488 + }, + { + "epoch": 0.1305730520436979, + "grad_norm": 0.546875, + "learning_rate": 0.0029853068583154292, + "loss": 1.3, + "step": 1489 + }, + { + "epoch": 0.13066074381807244, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029852468120268607, + "loss": 1.3237, + "step": 1490 + }, + { + "epoch": 0.130748435592447, + "grad_norm": 0.1494140625, + "learning_rate": 0.00298518664396723, + "loss": 1.3524, + "step": 1491 + }, + { + "epoch": 0.13083612736682154, + "grad_norm": 0.1220703125, + "learning_rate": 0.0029851263541420246, + "loss": 1.3396, + "step": 1492 + }, + { + "epoch": 0.13092381914119608, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029850659425567413, + "loss": 1.3305, + "step": 1493 + }, + { + "epoch": 0.13101151091557064, + "grad_norm": 0.181640625, + "learning_rate": 0.0029850054092168915, + "loss": 1.3134, + "step": 1494 + }, + { + "epoch": 0.13109920268994518, + "grad_norm": 0.0966796875, + "learning_rate": 0.002984944754127994, + "loss": 1.3054, + "step": 1495 + }, + { + "epoch": 0.13118689446431972, + "grad_norm": 0.1484375, + "learning_rate": 0.0029848839772955815, + "loss": 1.2187, + "step": 1496 + }, + { + "epoch": 0.13127458623869426, + "grad_norm": 0.158203125, + "learning_rate": 0.002984823078725196, + "loss": 1.3308, + "step": 1497 + }, + { + "epoch": 0.13136227801306882, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029847620584223917, + "loss": 1.2468, + "step": 1498 + }, + { + "epoch": 0.13144996978744336, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029847009163927333, + "loss": 1.2942, + "step": 1499 + }, + { + "epoch": 0.1315376615618179, + "grad_norm": 0.083984375, + "learning_rate": 0.0029846396526417964, + "loss": 1.2777, + "step": 1500 + }, + { + "epoch": 0.1315376615618179, + "eval_loss": 1.3182919025421143, + "eval_runtime": 429.272, + "eval_samples_per_second": 33.655, + "eval_steps_per_second": 8.414, + "step": 1500 + }, + { + "epoch": 0.13162535333619246, + "grad_norm": 0.076171875, + "learning_rate": 0.002984578267175169, + "loss": 1.284, + "step": 1501 + }, + { + "epoch": 0.131713045110567, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029845167599984487, + "loss": 1.3167, + "step": 1502 + }, + { + "epoch": 0.13180073688494154, + "grad_norm": 0.0791015625, + "learning_rate": 0.002984455131117245, + "loss": 1.2918, + "step": 1503 + }, + { + "epoch": 0.13188842865931608, + "grad_norm": 0.091796875, + "learning_rate": 0.0029843933805371783, + "loss": 1.2734, + "step": 1504 + }, + { + "epoch": 0.13197612043369064, + "grad_norm": 0.0712890625, + "learning_rate": 0.00298433150826388, + "loss": 1.3092, + "step": 1505 + }, + { + "epoch": 0.13206381220806518, + "grad_norm": 0.1318359375, + "learning_rate": 0.0029842695143029925, + "loss": 1.2861, + "step": 1506 + }, + { + "epoch": 0.13215150398243972, + "grad_norm": 0.2353515625, + "learning_rate": 0.0029842073986601696, + "loss": 1.2986, + "step": 1507 + }, + { + "epoch": 0.13223919575681425, + "grad_norm": 0.1552734375, + "learning_rate": 0.0029841451613410765, + "loss": 1.3754, + "step": 1508 + }, + { + "epoch": 0.13232688753118882, + "grad_norm": 0.11962890625, + "learning_rate": 0.0029840828023513888, + "loss": 1.2943, + "step": 1509 + }, + { + "epoch": 0.13241457930556336, + "grad_norm": 0.1328125, + "learning_rate": 0.0029840203216967933, + "loss": 1.256, + "step": 1510 + }, + { + "epoch": 0.1325022710799379, + "grad_norm": 0.1259765625, + "learning_rate": 0.002983957719382988, + "loss": 1.3142, + "step": 1511 + }, + { + "epoch": 0.13258996285431246, + "grad_norm": 0.10498046875, + "learning_rate": 0.0029838949954156826, + "loss": 1.3331, + "step": 1512 + }, + { + "epoch": 0.132677654628687, + "grad_norm": 0.166015625, + "learning_rate": 0.002983832149800597, + "loss": 1.3206, + "step": 1513 + }, + { + "epoch": 0.13276534640306153, + "grad_norm": 0.1533203125, + "learning_rate": 0.0029837691825434624, + "loss": 1.4007, + "step": 1514 + }, + { + "epoch": 0.13285303817743607, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029837060936500214, + "loss": 1.3109, + "step": 1515 + }, + { + "epoch": 0.13294072995181064, + "grad_norm": 0.1357421875, + "learning_rate": 0.002983642883126028, + "loss": 1.3233, + "step": 1516 + }, + { + "epoch": 0.13302842172618518, + "grad_norm": 0.07373046875, + "learning_rate": 0.002983579550977246, + "loss": 1.3065, + "step": 1517 + }, + { + "epoch": 0.1331161135005597, + "grad_norm": 0.08154296875, + "learning_rate": 0.002983516097209452, + "loss": 1.3287, + "step": 1518 + }, + { + "epoch": 0.13320380527493428, + "grad_norm": 0.076171875, + "learning_rate": 0.002983452521828432, + "loss": 1.3336, + "step": 1519 + }, + { + "epoch": 0.13329149704930882, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029833888248399845, + "loss": 1.3169, + "step": 1520 + }, + { + "epoch": 0.13337918882368335, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029833250062499188, + "loss": 1.3375, + "step": 1521 + }, + { + "epoch": 0.1334668805980579, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029832610660640536, + "loss": 1.3951, + "step": 1522 + }, + { + "epoch": 0.13355457237243246, + "grad_norm": 0.107421875, + "learning_rate": 0.0029831970042882216, + "loss": 1.2699, + "step": 1523 + }, + { + "epoch": 0.133642264146807, + "grad_norm": 0.130859375, + "learning_rate": 0.0029831328209282645, + "loss": 1.2956, + "step": 1524 + }, + { + "epoch": 0.13372995592118153, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029830685159900347, + "loss": 1.3326, + "step": 1525 + }, + { + "epoch": 0.13381764769555607, + "grad_norm": 0.1787109375, + "learning_rate": 0.0029830040894793983, + "loss": 1.3339, + "step": 1526 + }, + { + "epoch": 0.13390533946993063, + "grad_norm": 0.3203125, + "learning_rate": 0.0029829395414022303, + "loss": 1.3356, + "step": 1527 + }, + { + "epoch": 0.13399303124430517, + "grad_norm": 0.1953125, + "learning_rate": 0.0029828748717644167, + "loss": 1.3342, + "step": 1528 + }, + { + "epoch": 0.1340807230186797, + "grad_norm": 0.0869140625, + "learning_rate": 0.0029828100805718554, + "loss": 1.2823, + "step": 1529 + }, + { + "epoch": 0.13416841479305427, + "grad_norm": 0.1572265625, + "learning_rate": 0.0029827451678304555, + "loss": 1.2867, + "step": 1530 + }, + { + "epoch": 0.1342561065674288, + "grad_norm": 0.10302734375, + "learning_rate": 0.002982680133546137, + "loss": 1.3465, + "step": 1531 + }, + { + "epoch": 0.13434379834180335, + "grad_norm": 0.1484375, + "learning_rate": 0.0029826149777248305, + "loss": 1.2621, + "step": 1532 + }, + { + "epoch": 0.1344314901161779, + "grad_norm": 0.0908203125, + "learning_rate": 0.002982549700372478, + "loss": 1.3645, + "step": 1533 + }, + { + "epoch": 0.13451918189055245, + "grad_norm": 0.16015625, + "learning_rate": 0.0029824843014950326, + "loss": 1.2433, + "step": 1534 + }, + { + "epoch": 0.134606873664927, + "grad_norm": 0.07470703125, + "learning_rate": 0.002982418781098459, + "loss": 1.3254, + "step": 1535 + }, + { + "epoch": 0.13469456543930153, + "grad_norm": 0.1513671875, + "learning_rate": 0.0029823531391887322, + "loss": 1.3154, + "step": 1536 + }, + { + "epoch": 0.1347822572136761, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029822873757718387, + "loss": 1.3446, + "step": 1537 + }, + { + "epoch": 0.13486994898805063, + "grad_norm": 0.138671875, + "learning_rate": 0.0029822214908537758, + "loss": 1.2785, + "step": 1538 + }, + { + "epoch": 0.13495764076242517, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029821554844405517, + "loss": 1.3284, + "step": 1539 + }, + { + "epoch": 0.1350453325367997, + "grad_norm": 0.1728515625, + "learning_rate": 0.002982089356538187, + "loss": 1.3399, + "step": 1540 + }, + { + "epoch": 0.13513302431117427, + "grad_norm": 0.11279296875, + "learning_rate": 0.002982023107152711, + "loss": 1.3512, + "step": 1541 + }, + { + "epoch": 0.1352207160855488, + "grad_norm": 0.11767578125, + "learning_rate": 0.0029819567362901664, + "loss": 1.3023, + "step": 1542 + }, + { + "epoch": 0.13530840785992335, + "grad_norm": 0.0888671875, + "learning_rate": 0.002981890243956606, + "loss": 1.3128, + "step": 1543 + }, + { + "epoch": 0.13539609963429788, + "grad_norm": 0.107421875, + "learning_rate": 0.002981823630158094, + "loss": 1.3158, + "step": 1544 + }, + { + "epoch": 0.13548379140867245, + "grad_norm": 0.0966796875, + "learning_rate": 0.0029817568949007047, + "loss": 1.3173, + "step": 1545 + }, + { + "epoch": 0.135571483183047, + "grad_norm": 0.08935546875, + "learning_rate": 0.002981690038190524, + "loss": 1.302, + "step": 1546 + }, + { + "epoch": 0.13565917495742152, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029816230600336504, + "loss": 1.2854, + "step": 1547 + }, + { + "epoch": 0.1357468667317961, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029815559604361905, + "loss": 1.2837, + "step": 1548 + }, + { + "epoch": 0.13583455850617063, + "grad_norm": 0.1435546875, + "learning_rate": 0.002981488739404265, + "loss": 1.2879, + "step": 1549 + }, + { + "epoch": 0.13592225028054516, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029814213969440034, + "loss": 1.3034, + "step": 1550 + }, + { + "epoch": 0.1360099420549197, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029813539330615477, + "loss": 1.3412, + "step": 1551 + }, + { + "epoch": 0.13609763382929427, + "grad_norm": 0.08203125, + "learning_rate": 0.0029812863477630502, + "loss": 1.2951, + "step": 1552 + }, + { + "epoch": 0.1361853256036688, + "grad_norm": 0.126953125, + "learning_rate": 0.002981218641054674, + "loss": 1.2204, + "step": 1553 + }, + { + "epoch": 0.13627301737804334, + "grad_norm": 0.10107421875, + "learning_rate": 0.002981150812942595, + "loss": 1.2347, + "step": 1554 + }, + { + "epoch": 0.1363607091524179, + "grad_norm": 0.08935546875, + "learning_rate": 0.0029810828634329973, + "loss": 1.2983, + "step": 1555 + }, + { + "epoch": 0.13644840092679245, + "grad_norm": 0.06689453125, + "learning_rate": 0.002981014792532079, + "loss": 1.3449, + "step": 1556 + }, + { + "epoch": 0.13653609270116698, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029809466002460477, + "loss": 1.3148, + "step": 1557 + }, + { + "epoch": 0.13662378447554152, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029808782865811223, + "loss": 1.3352, + "step": 1558 + }, + { + "epoch": 0.1367114762499161, + "grad_norm": 0.2041015625, + "learning_rate": 0.002980809851543533, + "loss": 1.4218, + "step": 1559 + }, + { + "epoch": 0.13679916802429062, + "grad_norm": 0.134765625, + "learning_rate": 0.0029807412951395203, + "loss": 1.3049, + "step": 1560 + }, + { + "epoch": 0.13688685979866516, + "grad_norm": 0.11767578125, + "learning_rate": 0.0029806726173753372, + "loss": 1.3568, + "step": 1561 + }, + { + "epoch": 0.1369745515730397, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029806038182572463, + "loss": 1.3048, + "step": 1562 + }, + { + "epoch": 0.13706224334741426, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029805348977915216, + "loss": 1.3356, + "step": 1563 + }, + { + "epoch": 0.1371499351217888, + "grad_norm": 0.06396484375, + "learning_rate": 0.00298046585598445, + "loss": 1.296, + "step": 1564 + }, + { + "epoch": 0.13723762689616334, + "grad_norm": 0.1240234375, + "learning_rate": 0.0029803966928423262, + "loss": 1.2985, + "step": 1565 + }, + { + "epoch": 0.1373253186705379, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029803274083714582, + "loss": 1.2839, + "step": 1566 + }, + { + "epoch": 0.13741301044491244, + "grad_norm": 0.095703125, + "learning_rate": 0.0029802580025781655, + "loss": 1.3112, + "step": 1567 + }, + { + "epoch": 0.13750070221928698, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029801884754687767, + "loss": 1.3728, + "step": 1568 + }, + { + "epoch": 0.13758839399366152, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029801188270496327, + "loss": 1.3091, + "step": 1569 + }, + { + "epoch": 0.13767608576803608, + "grad_norm": 0.07666015625, + "learning_rate": 0.002980049057327086, + "loss": 1.2518, + "step": 1570 + }, + { + "epoch": 0.13776377754241062, + "grad_norm": 0.10888671875, + "learning_rate": 0.002979979166307498, + "loss": 1.3197, + "step": 1571 + }, + { + "epoch": 0.13785146931678516, + "grad_norm": 0.0849609375, + "learning_rate": 0.002979909153997243, + "loss": 1.3024, + "step": 1572 + }, + { + "epoch": 0.13793916109115972, + "grad_norm": 0.0859375, + "learning_rate": 0.002979839020402707, + "loss": 1.2366, + "step": 1573 + }, + { + "epoch": 0.13802685286553426, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029797687655302853, + "loss": 1.2927, + "step": 1574 + }, + { + "epoch": 0.1381145446399088, + "grad_norm": 0.0791015625, + "learning_rate": 0.002979698389386385, + "loss": 1.3856, + "step": 1575 + }, + { + "epoch": 0.13820223641428334, + "grad_norm": 0.08251953125, + "learning_rate": 0.002979627891977424, + "loss": 1.3791, + "step": 1576 + }, + { + "epoch": 0.1382899281886579, + "grad_norm": 0.11474609375, + "learning_rate": 0.0029795572733098317, + "loss": 1.2658, + "step": 1577 + }, + { + "epoch": 0.13837761996303244, + "grad_norm": 0.062255859375, + "learning_rate": 0.002979486533390048, + "loss": 1.3256, + "step": 1578 + }, + { + "epoch": 0.13846531173740698, + "grad_norm": 0.1689453125, + "learning_rate": 0.002979415672224525, + "loss": 1.2792, + "step": 1579 + }, + { + "epoch": 0.13855300351178151, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029793446898197244, + "loss": 1.2801, + "step": 1580 + }, + { + "epoch": 0.13864069528615608, + "grad_norm": 0.1875, + "learning_rate": 0.00297927358618212, + "loss": 1.3045, + "step": 1581 + }, + { + "epoch": 0.13872838706053062, + "grad_norm": 0.1728515625, + "learning_rate": 0.0029792023613181957, + "loss": 1.2642, + "step": 1582 + }, + { + "epoch": 0.13881607883490515, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029791310152344473, + "loss": 1.2261, + "step": 1583 + }, + { + "epoch": 0.13890377060927972, + "grad_norm": 0.142578125, + "learning_rate": 0.0029790595479373813, + "loss": 1.3044, + "step": 1584 + }, + { + "epoch": 0.13899146238365426, + "grad_norm": 0.16015625, + "learning_rate": 0.0029789879594335164, + "loss": 1.3165, + "step": 1585 + }, + { + "epoch": 0.1390791541580288, + "grad_norm": 0.0859375, + "learning_rate": 0.0029789162497293794, + "loss": 1.2949, + "step": 1586 + }, + { + "epoch": 0.13916684593240333, + "grad_norm": 0.10205078125, + "learning_rate": 0.002978844418831511, + "loss": 1.3131, + "step": 1587 + }, + { + "epoch": 0.1392545377067779, + "grad_norm": 0.23046875, + "learning_rate": 0.0029787724667464624, + "loss": 1.3262, + "step": 1588 + }, + { + "epoch": 0.13934222948115244, + "grad_norm": 0.1728515625, + "learning_rate": 0.002978700393480795, + "loss": 1.217, + "step": 1589 + }, + { + "epoch": 0.13942992125552697, + "grad_norm": 0.1904296875, + "learning_rate": 0.0029786281990410823, + "loss": 1.3964, + "step": 1590 + }, + { + "epoch": 0.13951761302990154, + "grad_norm": 0.30078125, + "learning_rate": 0.0029785558834339067, + "loss": 1.3288, + "step": 1591 + }, + { + "epoch": 0.13960530480427608, + "grad_norm": 0.0791015625, + "learning_rate": 0.002978483446665865, + "loss": 1.2713, + "step": 1592 + }, + { + "epoch": 0.1396929965786506, + "grad_norm": 0.296875, + "learning_rate": 0.0029784108887435617, + "loss": 1.3323, + "step": 1593 + }, + { + "epoch": 0.13978068835302515, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029783382096736145, + "loss": 1.3132, + "step": 1594 + }, + { + "epoch": 0.13986838012739972, + "grad_norm": 0.341796875, + "learning_rate": 0.0029782654094626525, + "loss": 1.3063, + "step": 1595 + }, + { + "epoch": 0.13995607190177425, + "grad_norm": 0.10107421875, + "learning_rate": 0.0029781924881173137, + "loss": 1.3055, + "step": 1596 + }, + { + "epoch": 0.1400437636761488, + "grad_norm": 0.29296875, + "learning_rate": 0.0029781194456442485, + "loss": 1.3568, + "step": 1597 + }, + { + "epoch": 0.14013145545052333, + "grad_norm": 0.1416015625, + "learning_rate": 0.0029780462820501188, + "loss": 1.2725, + "step": 1598 + }, + { + "epoch": 0.1402191472248979, + "grad_norm": 0.232421875, + "learning_rate": 0.0029779729973415958, + "loss": 1.2852, + "step": 1599 + }, + { + "epoch": 0.14030683899927243, + "grad_norm": 0.23046875, + "learning_rate": 0.0029778995915253643, + "loss": 1.3203, + "step": 1600 + }, + { + "epoch": 0.14039453077364697, + "grad_norm": 0.11962890625, + "learning_rate": 0.002977826064608118, + "loss": 1.2179, + "step": 1601 + }, + { + "epoch": 0.14048222254802153, + "grad_norm": 0.1240234375, + "learning_rate": 0.002977752416596562, + "loss": 1.238, + "step": 1602 + }, + { + "epoch": 0.14056991432239607, + "grad_norm": 0.1630859375, + "learning_rate": 0.002977678647497413, + "loss": 1.3133, + "step": 1603 + }, + { + "epoch": 0.1406576060967706, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029776047573173993, + "loss": 1.3257, + "step": 1604 + }, + { + "epoch": 0.14074529787114515, + "grad_norm": 0.19140625, + "learning_rate": 0.0029775307460632584, + "loss": 1.289, + "step": 1605 + }, + { + "epoch": 0.1408329896455197, + "grad_norm": 0.09423828125, + "learning_rate": 0.002977456613741741, + "loss": 1.3245, + "step": 1606 + }, + { + "epoch": 0.14092068141989425, + "grad_norm": 0.1630859375, + "learning_rate": 0.002977382360359607, + "loss": 1.3248, + "step": 1607 + }, + { + "epoch": 0.1410083731942688, + "grad_norm": 0.078125, + "learning_rate": 0.002977307985923628, + "loss": 1.3047, + "step": 1608 + }, + { + "epoch": 0.14109606496864335, + "grad_norm": 0.1787109375, + "learning_rate": 0.0029772334904405876, + "loss": 1.2525, + "step": 1609 + }, + { + "epoch": 0.1411837567430179, + "grad_norm": 0.07470703125, + "learning_rate": 0.002977158873917279, + "loss": 1.3219, + "step": 1610 + }, + { + "epoch": 0.14127144851739243, + "grad_norm": 0.1484375, + "learning_rate": 0.0029770841363605067, + "loss": 1.2669, + "step": 1611 + }, + { + "epoch": 0.14135914029176697, + "grad_norm": 0.1064453125, + "learning_rate": 0.0029770092777770874, + "loss": 1.3024, + "step": 1612 + }, + { + "epoch": 0.14144683206614153, + "grad_norm": 0.10791015625, + "learning_rate": 0.002976934298173848, + "loss": 1.3109, + "step": 1613 + }, + { + "epoch": 0.14153452384051607, + "grad_norm": 0.1298828125, + "learning_rate": 0.0029768591975576253, + "loss": 1.2855, + "step": 1614 + }, + { + "epoch": 0.1416222156148906, + "grad_norm": 0.12451171875, + "learning_rate": 0.0029767839759352694, + "loss": 1.2878, + "step": 1615 + }, + { + "epoch": 0.14170990738926514, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029767086333136405, + "loss": 1.3303, + "step": 1616 + }, + { + "epoch": 0.1417975991636397, + "grad_norm": 0.13671875, + "learning_rate": 0.0029766331696996083, + "loss": 1.3565, + "step": 1617 + }, + { + "epoch": 0.14188529093801425, + "grad_norm": 0.11767578125, + "learning_rate": 0.002976557585100056, + "loss": 1.3557, + "step": 1618 + }, + { + "epoch": 0.14197298271238878, + "grad_norm": 0.1015625, + "learning_rate": 0.0029764818795218763, + "loss": 1.3797, + "step": 1619 + }, + { + "epoch": 0.14206067448676335, + "grad_norm": 0.2060546875, + "learning_rate": 0.002976406052971974, + "loss": 1.3364, + "step": 1620 + }, + { + "epoch": 0.1421483662611379, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029763301054572633, + "loss": 1.3215, + "step": 1621 + }, + { + "epoch": 0.14223605803551242, + "grad_norm": 0.154296875, + "learning_rate": 0.0029762540369846708, + "loss": 1.2655, + "step": 1622 + }, + { + "epoch": 0.14232374980988696, + "grad_norm": 0.09228515625, + "learning_rate": 0.002976177847561134, + "loss": 1.2315, + "step": 1623 + }, + { + "epoch": 0.14241144158426153, + "grad_norm": 0.1396484375, + "learning_rate": 0.0029761015371936013, + "loss": 1.315, + "step": 1624 + }, + { + "epoch": 0.14249913335863607, + "grad_norm": 0.09619140625, + "learning_rate": 0.0029760251058890312, + "loss": 1.3761, + "step": 1625 + }, + { + "epoch": 0.1425868251330106, + "grad_norm": 0.171875, + "learning_rate": 0.002975948553654395, + "loss": 1.3818, + "step": 1626 + }, + { + "epoch": 0.14267451690738517, + "grad_norm": 0.09716796875, + "learning_rate": 0.002975871880496673, + "loss": 1.2746, + "step": 1627 + }, + { + "epoch": 0.1427622086817597, + "grad_norm": 0.150390625, + "learning_rate": 0.002975795086422859, + "loss": 1.2724, + "step": 1628 + }, + { + "epoch": 0.14284990045613424, + "grad_norm": 0.0634765625, + "learning_rate": 0.002975718171439955, + "loss": 1.2987, + "step": 1629 + }, + { + "epoch": 0.14293759223050878, + "grad_norm": 0.1376953125, + "learning_rate": 0.002975641135554977, + "loss": 1.303, + "step": 1630 + }, + { + "epoch": 0.14302528400488335, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029755639787749488, + "loss": 1.3062, + "step": 1631 + }, + { + "epoch": 0.14311297577925788, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029754867011069076, + "loss": 1.2757, + "step": 1632 + }, + { + "epoch": 0.14320066755363242, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029754093025579015, + "loss": 1.2948, + "step": 1633 + }, + { + "epoch": 0.14328835932800696, + "grad_norm": 0.138671875, + "learning_rate": 0.002975331783134988, + "loss": 1.2779, + "step": 1634 + }, + { + "epoch": 0.14337605110238152, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029752541428452375, + "loss": 1.254, + "step": 1635 + }, + { + "epoch": 0.14346374287675606, + "grad_norm": 0.09375, + "learning_rate": 0.0029751763816957305, + "loss": 1.2744, + "step": 1636 + }, + { + "epoch": 0.1435514346511306, + "grad_norm": 0.0888671875, + "learning_rate": 0.002975098499693558, + "loss": 1.3138, + "step": 1637 + }, + { + "epoch": 0.14363912642550516, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029750204968458233, + "loss": 1.3148, + "step": 1638 + }, + { + "epoch": 0.1437268181998797, + "grad_norm": 0.09619140625, + "learning_rate": 0.0029749423731596394, + "loss": 1.2637, + "step": 1639 + }, + { + "epoch": 0.14381450997425424, + "grad_norm": 0.0732421875, + "learning_rate": 0.002974864128642132, + "loss": 1.273, + "step": 1640 + }, + { + "epoch": 0.14390220174862878, + "grad_norm": 0.072265625, + "learning_rate": 0.002974785763300436, + "loss": 1.3235, + "step": 1641 + }, + { + "epoch": 0.14398989352300334, + "grad_norm": 0.1044921875, + "learning_rate": 0.002974707277141698, + "loss": 1.2963, + "step": 1642 + }, + { + "epoch": 0.14407758529737788, + "grad_norm": 0.083984375, + "learning_rate": 0.0029746286701730763, + "loss": 1.3017, + "step": 1643 + }, + { + "epoch": 0.14416527707175242, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029745499424017395, + "loss": 1.3372, + "step": 1644 + }, + { + "epoch": 0.14425296884612698, + "grad_norm": 0.142578125, + "learning_rate": 0.002974471093834867, + "loss": 1.3172, + "step": 1645 + }, + { + "epoch": 0.14434066062050152, + "grad_norm": 0.1494140625, + "learning_rate": 0.00297439212447965, + "loss": 1.3033, + "step": 1646 + }, + { + "epoch": 0.14442835239487606, + "grad_norm": 0.09130859375, + "learning_rate": 0.00297431303434329, + "loss": 1.3191, + "step": 1647 + }, + { + "epoch": 0.1445160441692506, + "grad_norm": 0.1689453125, + "learning_rate": 0.002974233823433, + "loss": 1.326, + "step": 1648 + }, + { + "epoch": 0.14460373594362516, + "grad_norm": 0.1513671875, + "learning_rate": 0.002974154491756004, + "loss": 1.2827, + "step": 1649 + }, + { + "epoch": 0.1446914277179997, + "grad_norm": 0.076171875, + "learning_rate": 0.002974075039319536, + "loss": 1.249, + "step": 1650 + }, + { + "epoch": 0.14477911949237424, + "grad_norm": 0.10693359375, + "learning_rate": 0.0029739954661308437, + "loss": 1.2419, + "step": 1651 + }, + { + "epoch": 0.14486681126674877, + "grad_norm": 0.08251953125, + "learning_rate": 0.002973915772197182, + "loss": 1.2723, + "step": 1652 + }, + { + "epoch": 0.14495450304112334, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029738359575258192, + "loss": 1.333, + "step": 1653 + }, + { + "epoch": 0.14504219481549788, + "grad_norm": 0.06689453125, + "learning_rate": 0.002973756022124035, + "loss": 1.308, + "step": 1654 + }, + { + "epoch": 0.14512988658987241, + "grad_norm": 0.062255859375, + "learning_rate": 0.0029736759659991186, + "loss": 1.3056, + "step": 1655 + }, + { + "epoch": 0.14521757836424698, + "grad_norm": 0.078125, + "learning_rate": 0.0029735957891583713, + "loss": 1.2666, + "step": 1656 + }, + { + "epoch": 0.14530527013862152, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029735154916091053, + "loss": 1.3374, + "step": 1657 + }, + { + "epoch": 0.14539296191299605, + "grad_norm": 0.0830078125, + "learning_rate": 0.002973435073358643, + "loss": 1.335, + "step": 1658 + }, + { + "epoch": 0.1454806536873706, + "grad_norm": 0.1328125, + "learning_rate": 0.0029733545344143176, + "loss": 1.3012, + "step": 1659 + }, + { + "epoch": 0.14556834546174516, + "grad_norm": 0.10595703125, + "learning_rate": 0.002973273874783475, + "loss": 1.2851, + "step": 1660 + }, + { + "epoch": 0.1456560372361197, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029731930944734723, + "loss": 1.32, + "step": 1661 + }, + { + "epoch": 0.14574372901049423, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029731121934916736, + "loss": 1.3265, + "step": 1662 + }, + { + "epoch": 0.14583142078486877, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029730311718454594, + "loss": 1.4071, + "step": 1663 + }, + { + "epoch": 0.14591911255924334, + "grad_norm": 0.1181640625, + "learning_rate": 0.002972950029542218, + "loss": 1.3511, + "step": 1664 + }, + { + "epoch": 0.14600680433361787, + "grad_norm": 0.279296875, + "learning_rate": 0.002972868766589348, + "loss": 1.3421, + "step": 1665 + }, + { + "epoch": 0.1460944961079924, + "grad_norm": 0.31640625, + "learning_rate": 0.0029727873829942623, + "loss": 1.343, + "step": 1666 + }, + { + "epoch": 0.14618218788236698, + "grad_norm": 0.07568359375, + "learning_rate": 0.002972705878764382, + "loss": 1.325, + "step": 1667 + }, + { + "epoch": 0.1462698796567415, + "grad_norm": 0.2216796875, + "learning_rate": 0.0029726242539071398, + "loss": 1.2319, + "step": 1668 + }, + { + "epoch": 0.14635757143111605, + "grad_norm": 0.091796875, + "learning_rate": 0.0029725425084299803, + "loss": 1.2526, + "step": 1669 + }, + { + "epoch": 0.1464452632054906, + "grad_norm": 0.25390625, + "learning_rate": 0.0029724606423403577, + "loss": 1.3712, + "step": 1670 + }, + { + "epoch": 0.14653295497986515, + "grad_norm": 0.10107421875, + "learning_rate": 0.0029723786556457386, + "loss": 1.2874, + "step": 1671 + }, + { + "epoch": 0.1466206467542397, + "grad_norm": 0.20703125, + "learning_rate": 0.0029722965483535996, + "loss": 1.2487, + "step": 1672 + }, + { + "epoch": 0.14670833852861423, + "grad_norm": 0.091796875, + "learning_rate": 0.0029722143204714293, + "loss": 1.3243, + "step": 1673 + }, + { + "epoch": 0.1467960303029888, + "grad_norm": 0.1982421875, + "learning_rate": 0.0029721319720067262, + "loss": 1.314, + "step": 1674 + }, + { + "epoch": 0.14688372207736333, + "grad_norm": 0.10888671875, + "learning_rate": 0.002972049502967, + "loss": 1.3093, + "step": 1675 + }, + { + "epoch": 0.14697141385173787, + "grad_norm": 0.1455078125, + "learning_rate": 0.0029719669133597723, + "loss": 1.3873, + "step": 1676 + }, + { + "epoch": 0.1470591056261124, + "grad_norm": 0.13671875, + "learning_rate": 0.002971884203192575, + "loss": 1.3275, + "step": 1677 + }, + { + "epoch": 0.14714679740048697, + "grad_norm": 0.07568359375, + "learning_rate": 0.00297180137247295, + "loss": 1.3174, + "step": 1678 + }, + { + "epoch": 0.1472344891748615, + "grad_norm": 0.1328125, + "learning_rate": 0.002971718421208453, + "loss": 1.295, + "step": 1679 + }, + { + "epoch": 0.14732218094923605, + "grad_norm": 0.150390625, + "learning_rate": 0.002971635349406647, + "loss": 1.2753, + "step": 1680 + }, + { + "epoch": 0.14740987272361059, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029715521570751096, + "loss": 1.3296, + "step": 1681 + }, + { + "epoch": 0.14749756449798515, + "grad_norm": 0.1259765625, + "learning_rate": 0.002971468844221427, + "loss": 1.2714, + "step": 1682 + }, + { + "epoch": 0.1475852562723597, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029713854108531965, + "loss": 1.2962, + "step": 1683 + }, + { + "epoch": 0.14767294804673423, + "grad_norm": 0.08203125, + "learning_rate": 0.0029713018569780284, + "loss": 1.2805, + "step": 1684 + }, + { + "epoch": 0.1477606398211088, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029712181826035415, + "loss": 1.2831, + "step": 1685 + }, + { + "epoch": 0.14784833159548333, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029711343877373672, + "loss": 1.295, + "step": 1686 + }, + { + "epoch": 0.14793602336985787, + "grad_norm": 0.0947265625, + "learning_rate": 0.0029710504723871474, + "loss": 1.2647, + "step": 1687 + }, + { + "epoch": 0.1480237151442324, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029709664365605345, + "loss": 1.3441, + "step": 1688 + }, + { + "epoch": 0.14811140691860697, + "grad_norm": 0.1904296875, + "learning_rate": 0.0029708822802651928, + "loss": 1.3064, + "step": 1689 + }, + { + "epoch": 0.1481990986929815, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029707980035087967, + "loss": 1.2509, + "step": 1690 + }, + { + "epoch": 0.14828679046735604, + "grad_norm": 0.224609375, + "learning_rate": 0.002970713606299033, + "loss": 1.2616, + "step": 1691 + }, + { + "epoch": 0.1483744822417306, + "grad_norm": 0.1474609375, + "learning_rate": 0.002970629088643597, + "loss": 1.3159, + "step": 1692 + }, + { + "epoch": 0.14846217401610515, + "grad_norm": 0.11083984375, + "learning_rate": 0.0029705444505501977, + "loss": 1.3307, + "step": 1693 + }, + { + "epoch": 0.14854986579047968, + "grad_norm": 0.1416015625, + "learning_rate": 0.0029704596920265536, + "loss": 1.2977, + "step": 1694 + }, + { + "epoch": 0.14863755756485422, + "grad_norm": 0.09619140625, + "learning_rate": 0.0029703748130803943, + "loss": 1.2808, + "step": 1695 + }, + { + "epoch": 0.1487252493392288, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029702898137194604, + "loss": 1.2882, + "step": 1696 + }, + { + "epoch": 0.14881294111360333, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029702046939515036, + "loss": 1.3025, + "step": 1697 + }, + { + "epoch": 0.14890063288797786, + "grad_norm": 0.06640625, + "learning_rate": 0.002970119453784287, + "loss": 1.3931, + "step": 1698 + }, + { + "epoch": 0.1489883246623524, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029700340932255842, + "loss": 1.2934, + "step": 1699 + }, + { + "epoch": 0.14907601643672697, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029699486122831795, + "loss": 1.354, + "step": 1700 + }, + { + "epoch": 0.1491637082111015, + "grad_norm": 0.1708984375, + "learning_rate": 0.002969863010964869, + "loss": 1.277, + "step": 1701 + }, + { + "epoch": 0.14925139998547604, + "grad_norm": 0.1318359375, + "learning_rate": 0.002969777289278459, + "loss": 1.2748, + "step": 1702 + }, + { + "epoch": 0.1493390917598506, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029696914472317672, + "loss": 1.332, + "step": 1703 + }, + { + "epoch": 0.14942678353422514, + "grad_norm": 0.1201171875, + "learning_rate": 0.0029696054848326226, + "loss": 1.3173, + "step": 1704 + }, + { + "epoch": 0.14951447530859968, + "grad_norm": 0.1435546875, + "learning_rate": 0.002969519402088864, + "loss": 1.3306, + "step": 1705 + }, + { + "epoch": 0.14960216708297422, + "grad_norm": 0.14453125, + "learning_rate": 0.002969433199008342, + "loss": 1.2826, + "step": 1706 + }, + { + "epoch": 0.14968985885734878, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029693468755989188, + "loss": 1.3347, + "step": 1707 + }, + { + "epoch": 0.14977755063172332, + "grad_norm": 0.10693359375, + "learning_rate": 0.002969260431868466, + "loss": 1.2668, + "step": 1708 + }, + { + "epoch": 0.14986524240609786, + "grad_norm": 0.142578125, + "learning_rate": 0.002969173867824868, + "loss": 1.2978, + "step": 1709 + }, + { + "epoch": 0.14995293418047242, + "grad_norm": 0.12255859375, + "learning_rate": 0.002969087183476018, + "loss": 1.3336, + "step": 1710 + }, + { + "epoch": 0.15004062595484696, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029690003788298224, + "loss": 1.3322, + "step": 1711 + }, + { + "epoch": 0.1501283177292215, + "grad_norm": 0.0625, + "learning_rate": 0.002968913453894197, + "loss": 1.2582, + "step": 1712 + }, + { + "epoch": 0.15021600950359604, + "grad_norm": 0.126953125, + "learning_rate": 0.00296882640867707, + "loss": 1.2509, + "step": 1713 + }, + { + "epoch": 0.1503037012779706, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029687392431863788, + "loss": 1.3008, + "step": 1714 + }, + { + "epoch": 0.15039139305234514, + "grad_norm": 0.1826171875, + "learning_rate": 0.0029686519574300724, + "loss": 1.4032, + "step": 1715 + }, + { + "epoch": 0.15047908482671968, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029685645514161123, + "loss": 1.2226, + "step": 1716 + }, + { + "epoch": 0.15056677660109422, + "grad_norm": 0.130859375, + "learning_rate": 0.0029684770251524684, + "loss": 1.258, + "step": 1717 + }, + { + "epoch": 0.15065446837546878, + "grad_norm": 0.1591796875, + "learning_rate": 0.002968389378647124, + "loss": 1.3325, + "step": 1718 + }, + { + "epoch": 0.15074216014984332, + "grad_norm": 0.083984375, + "learning_rate": 0.0029683016119080715, + "loss": 1.37, + "step": 1719 + }, + { + "epoch": 0.15082985192421786, + "grad_norm": 0.33203125, + "learning_rate": 0.002968213724943315, + "loss": 1.3357, + "step": 1720 + }, + { + "epoch": 0.15091754369859242, + "grad_norm": 0.25, + "learning_rate": 0.0029681257177608697, + "loss": 1.3055, + "step": 1721 + }, + { + "epoch": 0.15100523547296696, + "grad_norm": 0.1962890625, + "learning_rate": 0.0029680375903687614, + "loss": 1.2907, + "step": 1722 + }, + { + "epoch": 0.1510929272473415, + "grad_norm": 0.26953125, + "learning_rate": 0.0029679493427750277, + "loss": 1.3671, + "step": 1723 + }, + { + "epoch": 0.15118061902171603, + "grad_norm": 0.10888671875, + "learning_rate": 0.002967860974987716, + "loss": 1.3317, + "step": 1724 + }, + { + "epoch": 0.1512683107960906, + "grad_norm": 0.2392578125, + "learning_rate": 0.002967772487014886, + "loss": 1.3038, + "step": 1725 + }, + { + "epoch": 0.15135600257046514, + "grad_norm": 0.061279296875, + "learning_rate": 0.0029676838788646066, + "loss": 1.3724, + "step": 1726 + }, + { + "epoch": 0.15144369434483967, + "grad_norm": 0.1279296875, + "learning_rate": 0.0029675951505449593, + "loss": 1.3427, + "step": 1727 + }, + { + "epoch": 0.15153138611921424, + "grad_norm": 0.0791015625, + "learning_rate": 0.002967506302064035, + "loss": 1.3172, + "step": 1728 + }, + { + "epoch": 0.15161907789358878, + "grad_norm": 0.1015625, + "learning_rate": 0.0029674173334299377, + "loss": 1.266, + "step": 1729 + }, + { + "epoch": 0.15170676966796331, + "grad_norm": 0.07177734375, + "learning_rate": 0.00296732824465078, + "loss": 1.3791, + "step": 1730 + }, + { + "epoch": 0.15179446144233785, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029672390357346873, + "loss": 1.2942, + "step": 1731 + }, + { + "epoch": 0.15188215321671242, + "grad_norm": 0.09375, + "learning_rate": 0.002967149706689795, + "loss": 1.3361, + "step": 1732 + }, + { + "epoch": 0.15196984499108696, + "grad_norm": 0.146484375, + "learning_rate": 0.00296706025752425, + "loss": 1.2497, + "step": 1733 + }, + { + "epoch": 0.1520575367654615, + "grad_norm": 0.06689453125, + "learning_rate": 0.002966970688246209, + "loss": 1.2908, + "step": 1734 + }, + { + "epoch": 0.15214522853983603, + "grad_norm": 0.10009765625, + "learning_rate": 0.0029668809988638405, + "loss": 1.3164, + "step": 1735 + }, + { + "epoch": 0.1522329203142106, + "grad_norm": 0.0751953125, + "learning_rate": 0.002966791189385325, + "loss": 1.2638, + "step": 1736 + }, + { + "epoch": 0.15232061208858513, + "grad_norm": 0.11376953125, + "learning_rate": 0.0029667012598188526, + "loss": 1.331, + "step": 1737 + }, + { + "epoch": 0.15240830386295967, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029666112101726237, + "loss": 1.2418, + "step": 1738 + }, + { + "epoch": 0.15249599563733424, + "grad_norm": 0.1279296875, + "learning_rate": 0.002966521040454852, + "loss": 1.31, + "step": 1739 + }, + { + "epoch": 0.15258368741170877, + "grad_norm": 0.16015625, + "learning_rate": 0.0029664307506737596, + "loss": 1.3026, + "step": 1740 + }, + { + "epoch": 0.1526713791860833, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029663403408375813, + "loss": 1.2625, + "step": 1741 + }, + { + "epoch": 0.15275907096045785, + "grad_norm": 0.1318359375, + "learning_rate": 0.002966249810954561, + "loss": 1.2715, + "step": 1742 + }, + { + "epoch": 0.15284676273483241, + "grad_norm": 0.1142578125, + "learning_rate": 0.002966159161032957, + "loss": 1.2995, + "step": 1743 + }, + { + "epoch": 0.15293445450920695, + "grad_norm": 0.109375, + "learning_rate": 0.002966068391081035, + "loss": 1.2396, + "step": 1744 + }, + { + "epoch": 0.1530221462835815, + "grad_norm": 0.119140625, + "learning_rate": 0.002965977501107073, + "loss": 1.2447, + "step": 1745 + }, + { + "epoch": 0.15310983805795605, + "grad_norm": 0.099609375, + "learning_rate": 0.0029658864911193596, + "loss": 1.3183, + "step": 1746 + }, + { + "epoch": 0.1531975298323306, + "grad_norm": 0.06201171875, + "learning_rate": 0.0029657953611261956, + "loss": 1.2936, + "step": 1747 + }, + { + "epoch": 0.15328522160670513, + "grad_norm": 0.1025390625, + "learning_rate": 0.0029657041111358917, + "loss": 1.2693, + "step": 1748 + }, + { + "epoch": 0.15337291338107967, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029656127411567686, + "loss": 1.2768, + "step": 1749 + }, + { + "epoch": 0.15346060515545423, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029655212511971603, + "loss": 1.2926, + "step": 1750 + }, + { + "epoch": 0.15354829692982877, + "grad_norm": 0.1416015625, + "learning_rate": 0.0029654296412654096, + "loss": 1.3025, + "step": 1751 + }, + { + "epoch": 0.1536359887042033, + "grad_norm": 0.1630859375, + "learning_rate": 0.0029653379113698717, + "loss": 1.3208, + "step": 1752 + }, + { + "epoch": 0.15372368047857785, + "grad_norm": 0.10888671875, + "learning_rate": 0.0029652460615189114, + "loss": 1.2135, + "step": 1753 + }, + { + "epoch": 0.1538113722529524, + "grad_norm": 0.142578125, + "learning_rate": 0.002965154091720906, + "loss": 1.3043, + "step": 1754 + }, + { + "epoch": 0.15389906402732695, + "grad_norm": 0.10498046875, + "learning_rate": 0.002965062001984242, + "loss": 1.3056, + "step": 1755 + }, + { + "epoch": 0.15398675580170149, + "grad_norm": 0.29296875, + "learning_rate": 0.0029649697923173185, + "loss": 1.3145, + "step": 1756 + }, + { + "epoch": 0.15407444757607605, + "grad_norm": 0.171875, + "learning_rate": 0.0029648774627285446, + "loss": 1.2702, + "step": 1757 + }, + { + "epoch": 0.1541621393504506, + "grad_norm": 0.302734375, + "learning_rate": 0.0029647850132263403, + "loss": 1.387, + "step": 1758 + }, + { + "epoch": 0.15424983112482513, + "grad_norm": 0.453125, + "learning_rate": 0.002964692443819137, + "loss": 1.3019, + "step": 1759 + }, + { + "epoch": 0.15433752289919966, + "grad_norm": 0.1552734375, + "learning_rate": 0.002964599754515377, + "loss": 1.2936, + "step": 1760 + }, + { + "epoch": 0.15442521467357423, + "grad_norm": 0.478515625, + "learning_rate": 0.002964506945323512, + "loss": 1.3148, + "step": 1761 + }, + { + "epoch": 0.15451290644794877, + "grad_norm": 0.1806640625, + "learning_rate": 0.002964414016252008, + "loss": 1.271, + "step": 1762 + }, + { + "epoch": 0.1546005982223233, + "grad_norm": 0.392578125, + "learning_rate": 0.0029643209673093384, + "loss": 1.3235, + "step": 1763 + }, + { + "epoch": 0.15468828999669787, + "grad_norm": 0.1396484375, + "learning_rate": 0.0029642277985039893, + "loss": 1.3947, + "step": 1764 + }, + { + "epoch": 0.1547759817710724, + "grad_norm": 0.267578125, + "learning_rate": 0.0029641345098444585, + "loss": 1.2814, + "step": 1765 + }, + { + "epoch": 0.15486367354544694, + "grad_norm": 0.12890625, + "learning_rate": 0.002964041101339252, + "loss": 1.3002, + "step": 1766 + }, + { + "epoch": 0.15495136531982148, + "grad_norm": 0.16015625, + "learning_rate": 0.0029639475729968893, + "loss": 1.2534, + "step": 1767 + }, + { + "epoch": 0.15503905709419605, + "grad_norm": 0.1796875, + "learning_rate": 0.0029638539248259, + "loss": 1.3228, + "step": 1768 + }, + { + "epoch": 0.15512674886857059, + "grad_norm": 0.08837890625, + "learning_rate": 0.002963760156834825, + "loss": 1.2834, + "step": 1769 + }, + { + "epoch": 0.15521444064294512, + "grad_norm": 0.2001953125, + "learning_rate": 0.0029636662690322143, + "loss": 1.3193, + "step": 1770 + }, + { + "epoch": 0.15530213241731966, + "grad_norm": 0.072265625, + "learning_rate": 0.0029635722614266315, + "loss": 1.3285, + "step": 1771 + }, + { + "epoch": 0.15538982419169423, + "grad_norm": 0.1123046875, + "learning_rate": 0.002963478134026649, + "loss": 1.3342, + "step": 1772 + }, + { + "epoch": 0.15547751596606876, + "grad_norm": 0.12060546875, + "learning_rate": 0.002963383886840852, + "loss": 1.2917, + "step": 1773 + }, + { + "epoch": 0.1555652077404433, + "grad_norm": 0.1015625, + "learning_rate": 0.002963289519877835, + "loss": 1.2716, + "step": 1774 + }, + { + "epoch": 0.15565289951481787, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029631950331462037, + "loss": 1.3703, + "step": 1775 + }, + { + "epoch": 0.1557405912891924, + "grad_norm": 0.109375, + "learning_rate": 0.0029631004266545756, + "loss": 1.2962, + "step": 1776 + }, + { + "epoch": 0.15582828306356694, + "grad_norm": 0.10986328125, + "learning_rate": 0.002963005700411578, + "loss": 1.2938, + "step": 1777 + }, + { + "epoch": 0.15591597483794148, + "grad_norm": 0.14453125, + "learning_rate": 0.00296291085442585, + "loss": 1.3536, + "step": 1778 + }, + { + "epoch": 0.15600366661231604, + "grad_norm": 0.08984375, + "learning_rate": 0.0029628158887060416, + "loss": 1.2311, + "step": 1779 + }, + { + "epoch": 0.15609135838669058, + "grad_norm": 0.13671875, + "learning_rate": 0.002962720803260813, + "loss": 1.2521, + "step": 1780 + }, + { + "epoch": 0.15617905016106512, + "grad_norm": 0.1181640625, + "learning_rate": 0.0029626255980988365, + "loss": 1.2764, + "step": 1781 + }, + { + "epoch": 0.15626674193543968, + "grad_norm": 0.08740234375, + "learning_rate": 0.0029625302732287934, + "loss": 1.2357, + "step": 1782 + }, + { + "epoch": 0.15635443370981422, + "grad_norm": 0.076171875, + "learning_rate": 0.0029624348286593776, + "loss": 1.3649, + "step": 1783 + }, + { + "epoch": 0.15644212548418876, + "grad_norm": 0.10693359375, + "learning_rate": 0.0029623392643992937, + "loss": 1.3346, + "step": 1784 + }, + { + "epoch": 0.1565298172585633, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029622435804572563, + "loss": 1.3027, + "step": 1785 + }, + { + "epoch": 0.15661750903293786, + "grad_norm": 0.1484375, + "learning_rate": 0.0029621477768419927, + "loss": 1.3065, + "step": 1786 + }, + { + "epoch": 0.1567052008073124, + "grad_norm": 0.1064453125, + "learning_rate": 0.002962051853562238, + "loss": 1.2124, + "step": 1787 + }, + { + "epoch": 0.15679289258168694, + "grad_norm": 0.11962890625, + "learning_rate": 0.0029619558106267424, + "loss": 1.3163, + "step": 1788 + }, + { + "epoch": 0.15688058435606148, + "grad_norm": 0.142578125, + "learning_rate": 0.0029618596480442635, + "loss": 1.266, + "step": 1789 + }, + { + "epoch": 0.15696827613043604, + "grad_norm": 0.1201171875, + "learning_rate": 0.0029617633658235707, + "loss": 1.3783, + "step": 1790 + }, + { + "epoch": 0.15705596790481058, + "grad_norm": 0.2177734375, + "learning_rate": 0.0029616669639734457, + "loss": 1.3268, + "step": 1791 + }, + { + "epoch": 0.15714365967918512, + "grad_norm": 0.083984375, + "learning_rate": 0.002961570442502679, + "loss": 1.2861, + "step": 1792 + }, + { + "epoch": 0.15723135145355968, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029614738014200745, + "loss": 1.3748, + "step": 1793 + }, + { + "epoch": 0.15731904322793422, + "grad_norm": 0.0859375, + "learning_rate": 0.0029613770407344447, + "loss": 1.2581, + "step": 1794 + }, + { + "epoch": 0.15740673500230876, + "grad_norm": 0.07763671875, + "learning_rate": 0.002961280160454614, + "loss": 1.2899, + "step": 1795 + }, + { + "epoch": 0.1574944267766833, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029611831605894172, + "loss": 1.2457, + "step": 1796 + }, + { + "epoch": 0.15758211855105786, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029610860411477015, + "loss": 1.3766, + "step": 1797 + }, + { + "epoch": 0.1576698103254324, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029609888021383235, + "loss": 1.3176, + "step": 1798 + }, + { + "epoch": 0.15775750209980693, + "grad_norm": 0.0869140625, + "learning_rate": 0.002960891443570151, + "loss": 1.2908, + "step": 1799 + }, + { + "epoch": 0.1578451938741815, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029607939654520627, + "loss": 1.305, + "step": 1800 + }, + { + "epoch": 0.15793288564855604, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029606963677929485, + "loss": 1.2922, + "step": 1801 + }, + { + "epoch": 0.15802057742293057, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029605986506017093, + "loss": 1.2961, + "step": 1802 + }, + { + "epoch": 0.1581082691973051, + "grad_norm": 0.11328125, + "learning_rate": 0.0029605008138872562, + "loss": 1.2114, + "step": 1803 + }, + { + "epoch": 0.15819596097167968, + "grad_norm": 0.126953125, + "learning_rate": 0.0029604028576585124, + "loss": 1.2436, + "step": 1804 + }, + { + "epoch": 0.15828365274605422, + "grad_norm": 0.1298828125, + "learning_rate": 0.00296030478192441, + "loss": 1.3183, + "step": 1805 + }, + { + "epoch": 0.15837134452042875, + "grad_norm": 0.171875, + "learning_rate": 0.002960206586693895, + "loss": 1.2487, + "step": 1806 + }, + { + "epoch": 0.1584590362948033, + "grad_norm": 0.0703125, + "learning_rate": 0.002960108271975921, + "loss": 1.3022, + "step": 1807 + }, + { + "epoch": 0.15854672806917786, + "grad_norm": 0.158203125, + "learning_rate": 0.0029600098377794543, + "loss": 1.3295, + "step": 1808 + }, + { + "epoch": 0.1586344198435524, + "grad_norm": 0.1748046875, + "learning_rate": 0.0029599112841134723, + "loss": 1.297, + "step": 1809 + }, + { + "epoch": 0.15872211161792693, + "grad_norm": 0.0966796875, + "learning_rate": 0.0029598126109869633, + "loss": 1.3147, + "step": 1810 + }, + { + "epoch": 0.1588098033923015, + "grad_norm": 0.1435546875, + "learning_rate": 0.002959713818408925, + "loss": 1.2913, + "step": 1811 + }, + { + "epoch": 0.15889749516667603, + "grad_norm": 0.2255859375, + "learning_rate": 0.0029596149063883677, + "loss": 1.233, + "step": 1812 + }, + { + "epoch": 0.15898518694105057, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029595158749343114, + "loss": 1.2982, + "step": 1813 + }, + { + "epoch": 0.1590728787154251, + "grad_norm": 0.2412109375, + "learning_rate": 0.0029594167240557883, + "loss": 1.3088, + "step": 1814 + }, + { + "epoch": 0.15916057048979967, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029593174537618392, + "loss": 1.3154, + "step": 1815 + }, + { + "epoch": 0.1592482622641742, + "grad_norm": 0.169921875, + "learning_rate": 0.0029592180640615195, + "loss": 1.2972, + "step": 1816 + }, + { + "epoch": 0.15933595403854875, + "grad_norm": 0.1650390625, + "learning_rate": 0.0029591185549638914, + "loss": 1.32, + "step": 1817 + }, + { + "epoch": 0.15942364581292331, + "grad_norm": 0.111328125, + "learning_rate": 0.002959018926478031, + "loss": 1.2938, + "step": 1818 + }, + { + "epoch": 0.15951133758729785, + "grad_norm": 0.09375, + "learning_rate": 0.002958919178613023, + "loss": 1.2777, + "step": 1819 + }, + { + "epoch": 0.1595990293616724, + "grad_norm": 0.08251953125, + "learning_rate": 0.002958819311377965, + "loss": 1.3409, + "step": 1820 + }, + { + "epoch": 0.15968672113604693, + "grad_norm": 0.080078125, + "learning_rate": 0.002958719324781965, + "loss": 1.2738, + "step": 1821 + }, + { + "epoch": 0.1597744129104215, + "grad_norm": 0.0869140625, + "learning_rate": 0.002958619218834141, + "loss": 1.2636, + "step": 1822 + }, + { + "epoch": 0.15986210468479603, + "grad_norm": 0.10888671875, + "learning_rate": 0.002958518993543622, + "loss": 1.3311, + "step": 1823 + }, + { + "epoch": 0.15994979645917057, + "grad_norm": 0.06640625, + "learning_rate": 0.002958418648919549, + "loss": 1.3638, + "step": 1824 + }, + { + "epoch": 0.1600374882335451, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029583181849710733, + "loss": 1.3066, + "step": 1825 + }, + { + "epoch": 0.16012518000791967, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029582176017073558, + "loss": 1.3149, + "step": 1826 + }, + { + "epoch": 0.1602128717822942, + "grad_norm": 0.07080078125, + "learning_rate": 0.00295811689913757, + "loss": 1.2287, + "step": 1827 + }, + { + "epoch": 0.16030056355666875, + "grad_norm": 0.0859375, + "learning_rate": 0.002958016077270901, + "loss": 1.2944, + "step": 1828 + }, + { + "epoch": 0.1603882553310433, + "grad_norm": 0.15625, + "learning_rate": 0.0029579151361165414, + "loss": 1.3327, + "step": 1829 + }, + { + "epoch": 0.16047594710541785, + "grad_norm": 0.1083984375, + "learning_rate": 0.0029578140756836985, + "loss": 1.2067, + "step": 1830 + }, + { + "epoch": 0.1605636388797924, + "grad_norm": 0.12060546875, + "learning_rate": 0.0029577128959815875, + "loss": 1.2926, + "step": 1831 + }, + { + "epoch": 0.16065133065416692, + "grad_norm": 0.17578125, + "learning_rate": 0.0029576115970194362, + "loss": 1.3344, + "step": 1832 + }, + { + "epoch": 0.1607390224285415, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029575101788064826, + "loss": 1.2922, + "step": 1833 + }, + { + "epoch": 0.16082671420291603, + "grad_norm": 0.171875, + "learning_rate": 0.0029574086413519766, + "loss": 1.2902, + "step": 1834 + }, + { + "epoch": 0.16091440597729056, + "grad_norm": 0.068359375, + "learning_rate": 0.0029573069846651773, + "loss": 1.3136, + "step": 1835 + }, + { + "epoch": 0.16100209775166513, + "grad_norm": 0.1337890625, + "learning_rate": 0.002957205208755356, + "loss": 1.1953, + "step": 1836 + }, + { + "epoch": 0.16108978952603967, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029571033136317937, + "loss": 1.3136, + "step": 1837 + }, + { + "epoch": 0.1611774813004142, + "grad_norm": 0.2138671875, + "learning_rate": 0.002957001299303784, + "loss": 1.2416, + "step": 1838 + }, + { + "epoch": 0.16126517307478874, + "grad_norm": 0.1201171875, + "learning_rate": 0.0029568991657806295, + "loss": 1.2981, + "step": 1839 + }, + { + "epoch": 0.1613528648491633, + "grad_norm": 0.11669921875, + "learning_rate": 0.002956796913071645, + "loss": 1.273, + "step": 1840 + }, + { + "epoch": 0.16144055662353785, + "grad_norm": 0.06298828125, + "learning_rate": 0.002956694541186155, + "loss": 1.2522, + "step": 1841 + }, + { + "epoch": 0.16152824839791238, + "grad_norm": 0.08984375, + "learning_rate": 0.002956592050133497, + "loss": 1.2785, + "step": 1842 + }, + { + "epoch": 0.16161594017228692, + "grad_norm": 0.0869140625, + "learning_rate": 0.0029564894399230165, + "loss": 1.2929, + "step": 1843 + }, + { + "epoch": 0.16170363194666149, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029563867105640716, + "loss": 1.3277, + "step": 1844 + }, + { + "epoch": 0.16179132372103602, + "grad_norm": 0.08251953125, + "learning_rate": 0.002956283862066031, + "loss": 1.2834, + "step": 1845 + }, + { + "epoch": 0.16187901549541056, + "grad_norm": 0.162109375, + "learning_rate": 0.0029561808944382744, + "loss": 1.24, + "step": 1846 + }, + { + "epoch": 0.16196670726978513, + "grad_norm": 0.083984375, + "learning_rate": 0.0029560778076901926, + "loss": 1.3224, + "step": 1847 + }, + { + "epoch": 0.16205439904415966, + "grad_norm": 0.07421875, + "learning_rate": 0.0029559746018311857, + "loss": 1.296, + "step": 1848 + }, + { + "epoch": 0.1621420908185342, + "grad_norm": 0.08154296875, + "learning_rate": 0.002955871276870667, + "loss": 1.3479, + "step": 1849 + }, + { + "epoch": 0.16222978259290874, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029557678328180586, + "loss": 1.3016, + "step": 1850 + }, + { + "epoch": 0.1623174743672833, + "grad_norm": 0.0849609375, + "learning_rate": 0.002955664269682795, + "loss": 1.3169, + "step": 1851 + }, + { + "epoch": 0.16240516614165784, + "grad_norm": 0.107421875, + "learning_rate": 0.0029555605874743204, + "loss": 1.2505, + "step": 1852 + }, + { + "epoch": 0.16249285791603238, + "grad_norm": 0.09375, + "learning_rate": 0.0029554567862020904, + "loss": 1.3084, + "step": 1853 + }, + { + "epoch": 0.16258054969040694, + "grad_norm": 0.162109375, + "learning_rate": 0.002955352865875572, + "loss": 1.3385, + "step": 1854 + }, + { + "epoch": 0.16266824146478148, + "grad_norm": 0.115234375, + "learning_rate": 0.002955248826504241, + "loss": 1.3753, + "step": 1855 + }, + { + "epoch": 0.16275593323915602, + "grad_norm": 0.11962890625, + "learning_rate": 0.002955144668097588, + "loss": 1.2402, + "step": 1856 + }, + { + "epoch": 0.16284362501353056, + "grad_norm": 0.1552734375, + "learning_rate": 0.0029550403906651087, + "loss": 1.2476, + "step": 1857 + }, + { + "epoch": 0.16293131678790512, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029549359942163157, + "loss": 1.2843, + "step": 1858 + }, + { + "epoch": 0.16301900856227966, + "grad_norm": 0.12890625, + "learning_rate": 0.0029548314787607288, + "loss": 1.2993, + "step": 1859 + }, + { + "epoch": 0.1631067003366542, + "grad_norm": 0.12890625, + "learning_rate": 0.002954726844307879, + "loss": 1.3391, + "step": 1860 + }, + { + "epoch": 0.16319439211102874, + "grad_norm": 0.1259765625, + "learning_rate": 0.0029546220908673094, + "loss": 1.319, + "step": 1861 + }, + { + "epoch": 0.1632820838854033, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029545172184485733, + "loss": 1.2632, + "step": 1862 + }, + { + "epoch": 0.16336977565977784, + "grad_norm": 0.09033203125, + "learning_rate": 0.002954412227061234, + "loss": 1.2358, + "step": 1863 + }, + { + "epoch": 0.16345746743415238, + "grad_norm": 0.0869140625, + "learning_rate": 0.0029543071167148667, + "loss": 1.2597, + "step": 1864 + }, + { + "epoch": 0.16354515920852694, + "grad_norm": 0.0927734375, + "learning_rate": 0.002954201887419058, + "loss": 1.2416, + "step": 1865 + }, + { + "epoch": 0.16363285098290148, + "grad_norm": 0.08544921875, + "learning_rate": 0.002954096539183404, + "loss": 1.2841, + "step": 1866 + }, + { + "epoch": 0.16372054275727602, + "grad_norm": 0.087890625, + "learning_rate": 0.0029539910720175113, + "loss": 1.2665, + "step": 1867 + }, + { + "epoch": 0.16380823453165055, + "grad_norm": 0.1044921875, + "learning_rate": 0.0029538854859309997, + "loss": 1.3402, + "step": 1868 + }, + { + "epoch": 0.16389592630602512, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029537797809334977, + "loss": 1.2414, + "step": 1869 + }, + { + "epoch": 0.16398361808039966, + "grad_norm": 0.07275390625, + "learning_rate": 0.002953673957034645, + "loss": 1.2813, + "step": 1870 + }, + { + "epoch": 0.1640713098547742, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029535680142440935, + "loss": 1.2876, + "step": 1871 + }, + { + "epoch": 0.16415900162914873, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029534619525715036, + "loss": 1.2672, + "step": 1872 + }, + { + "epoch": 0.1642466934035233, + "grad_norm": 0.080078125, + "learning_rate": 0.0029533557720265494, + "loss": 1.3085, + "step": 1873 + }, + { + "epoch": 0.16433438517789783, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029532494726189126, + "loss": 1.2638, + "step": 1874 + }, + { + "epoch": 0.16442207695227237, + "grad_norm": 0.15625, + "learning_rate": 0.002953143054358288, + "loss": 1.2597, + "step": 1875 + }, + { + "epoch": 0.16450976872664694, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029530365172543817, + "loss": 1.2191, + "step": 1876 + }, + { + "epoch": 0.16459746050102148, + "grad_norm": 0.0830078125, + "learning_rate": 0.002952929861316908, + "loss": 1.297, + "step": 1877 + }, + { + "epoch": 0.164685152275396, + "grad_norm": 0.0673828125, + "learning_rate": 0.002952823086555595, + "loss": 1.2995, + "step": 1878 + }, + { + "epoch": 0.16477284404977055, + "grad_norm": 0.0595703125, + "learning_rate": 0.0029527161929801798, + "loss": 1.2785, + "step": 1879 + }, + { + "epoch": 0.16486053582414512, + "grad_norm": 0.0703125, + "learning_rate": 0.00295260918060041, + "loss": 1.2669, + "step": 1880 + }, + { + "epoch": 0.16494822759851965, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029525020494260462, + "loss": 1.2682, + "step": 1881 + }, + { + "epoch": 0.1650359193728942, + "grad_norm": 0.056396484375, + "learning_rate": 0.0029523947994668583, + "loss": 1.3312, + "step": 1882 + }, + { + "epoch": 0.16512361114726876, + "grad_norm": 0.12890625, + "learning_rate": 0.0029522874307326263, + "loss": 1.3099, + "step": 1883 + }, + { + "epoch": 0.1652113029216433, + "grad_norm": 0.16796875, + "learning_rate": 0.002952179943233142, + "loss": 1.2679, + "step": 1884 + }, + { + "epoch": 0.16529899469601783, + "grad_norm": 0.07421875, + "learning_rate": 0.002952072336978209, + "loss": 1.2699, + "step": 1885 + }, + { + "epoch": 0.16538668647039237, + "grad_norm": 0.10205078125, + "learning_rate": 0.00295196461197764, + "loss": 1.2717, + "step": 1886 + }, + { + "epoch": 0.16547437824476693, + "grad_norm": 0.076171875, + "learning_rate": 0.0029518567682412593, + "loss": 1.2344, + "step": 1887 + }, + { + "epoch": 0.16556207001914147, + "grad_norm": 0.1044921875, + "learning_rate": 0.0029517488057789025, + "loss": 1.2496, + "step": 1888 + }, + { + "epoch": 0.165649761793516, + "grad_norm": 0.1748046875, + "learning_rate": 0.002951640724600414, + "loss": 1.32, + "step": 1889 + }, + { + "epoch": 0.16573745356789055, + "grad_norm": 0.158203125, + "learning_rate": 0.0029515325247156526, + "loss": 1.3256, + "step": 1890 + }, + { + "epoch": 0.1658251453422651, + "grad_norm": 0.11328125, + "learning_rate": 0.0029514242061344847, + "loss": 1.3123, + "step": 1891 + }, + { + "epoch": 0.16591283711663965, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029513157688667883, + "loss": 1.317, + "step": 1892 + }, + { + "epoch": 0.1660005288910142, + "grad_norm": 0.12158203125, + "learning_rate": 0.002951207212922454, + "loss": 1.2483, + "step": 1893 + }, + { + "epoch": 0.16608822066538875, + "grad_norm": 0.07373046875, + "learning_rate": 0.00295109853831138, + "loss": 1.3007, + "step": 1894 + }, + { + "epoch": 0.1661759124397633, + "grad_norm": 0.0625, + "learning_rate": 0.0029509897450434785, + "loss": 1.2602, + "step": 1895 + }, + { + "epoch": 0.16626360421413783, + "grad_norm": 0.0751953125, + "learning_rate": 0.002950880833128671, + "loss": 1.3378, + "step": 1896 + }, + { + "epoch": 0.16635129598851237, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029507718025768895, + "loss": 1.3414, + "step": 1897 + }, + { + "epoch": 0.16643898776288693, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029506626533980776, + "loss": 1.268, + "step": 1898 + }, + { + "epoch": 0.16652667953726147, + "grad_norm": 0.08544921875, + "learning_rate": 0.002950553385602189, + "loss": 1.2873, + "step": 1899 + }, + { + "epoch": 0.166614371311636, + "grad_norm": 0.07958984375, + "learning_rate": 0.002950443999199189, + "loss": 1.2687, + "step": 1900 + }, + { + "epoch": 0.16670206308601057, + "grad_norm": 0.06494140625, + "learning_rate": 0.002950334494199054, + "loss": 1.307, + "step": 1901 + }, + { + "epoch": 0.1667897548603851, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029502248706117693, + "loss": 1.3191, + "step": 1902 + }, + { + "epoch": 0.16687744663475965, + "grad_norm": 0.0849609375, + "learning_rate": 0.002950115128447333, + "loss": 1.3759, + "step": 1903 + }, + { + "epoch": 0.16696513840913418, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029500052677157535, + "loss": 1.2491, + "step": 1904 + }, + { + "epoch": 0.16705283018350875, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029498952884270493, + "loss": 1.2773, + "step": 1905 + }, + { + "epoch": 0.1671405219578833, + "grad_norm": 0.1337890625, + "learning_rate": 0.00294978519059125, + "loss": 1.2975, + "step": 1906 + }, + { + "epoch": 0.16722821373225782, + "grad_norm": 0.1923828125, + "learning_rate": 0.0029496749742183976, + "loss": 1.3059, + "step": 1907 + }, + { + "epoch": 0.16731590550663236, + "grad_norm": 0.1884765625, + "learning_rate": 0.002949564639318542, + "loss": 1.3331, + "step": 1908 + }, + { + "epoch": 0.16740359728100693, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029494541859017465, + "loss": 1.2586, + "step": 1909 + }, + { + "epoch": 0.16749128905538146, + "grad_norm": 0.193359375, + "learning_rate": 0.0029493436139780838, + "loss": 1.3039, + "step": 1910 + }, + { + "epoch": 0.167578980829756, + "grad_norm": 0.1044921875, + "learning_rate": 0.0029492329235576375, + "loss": 1.281, + "step": 1911 + }, + { + "epoch": 0.16766667260413057, + "grad_norm": 0.0859375, + "learning_rate": 0.0029491221146505024, + "loss": 1.2381, + "step": 1912 + }, + { + "epoch": 0.1677543643785051, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029490111872667846, + "loss": 1.2411, + "step": 1913 + }, + { + "epoch": 0.16784205615287964, + "grad_norm": 0.1044921875, + "learning_rate": 0.0029489001414165997, + "loss": 1.2881, + "step": 1914 + }, + { + "epoch": 0.16792974792725418, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029487889771100746, + "loss": 1.1976, + "step": 1915 + }, + { + "epoch": 0.16801743970162875, + "grad_norm": 0.1025390625, + "learning_rate": 0.002948677694357348, + "loss": 1.2403, + "step": 1916 + }, + { + "epoch": 0.16810513147600328, + "grad_norm": 0.07861328125, + "learning_rate": 0.002948566293168568, + "loss": 1.2295, + "step": 1917 + }, + { + "epoch": 0.16819282325037782, + "grad_norm": 0.1630859375, + "learning_rate": 0.0029484547735538946, + "loss": 1.3096, + "step": 1918 + }, + { + "epoch": 0.1682805150247524, + "grad_norm": 0.1337890625, + "learning_rate": 0.0029483431355234973, + "loss": 1.2894, + "step": 1919 + }, + { + "epoch": 0.16836820679912692, + "grad_norm": 0.1064453125, + "learning_rate": 0.002948231379087558, + "loss": 1.3074, + "step": 1920 + }, + { + "epoch": 0.16845589857350146, + "grad_norm": 0.130859375, + "learning_rate": 0.0029481195042562686, + "loss": 1.2618, + "step": 1921 + }, + { + "epoch": 0.168543590347876, + "grad_norm": 0.07666015625, + "learning_rate": 0.002948007511039831, + "loss": 1.3031, + "step": 1922 + }, + { + "epoch": 0.16863128212225056, + "grad_norm": 0.1494140625, + "learning_rate": 0.002947895399448459, + "loss": 1.317, + "step": 1923 + }, + { + "epoch": 0.1687189738966251, + "grad_norm": 0.125, + "learning_rate": 0.0029477831694923774, + "loss": 1.2723, + "step": 1924 + }, + { + "epoch": 0.16880666567099964, + "grad_norm": 0.154296875, + "learning_rate": 0.0029476708211818212, + "loss": 1.3451, + "step": 1925 + }, + { + "epoch": 0.16889435744537418, + "grad_norm": 0.146484375, + "learning_rate": 0.002947558354527036, + "loss": 1.2635, + "step": 1926 + }, + { + "epoch": 0.16898204921974874, + "grad_norm": 0.1455078125, + "learning_rate": 0.002947445769538278, + "loss": 1.276, + "step": 1927 + }, + { + "epoch": 0.16906974099412328, + "grad_norm": 0.1396484375, + "learning_rate": 0.0029473330662258155, + "loss": 1.3344, + "step": 1928 + }, + { + "epoch": 0.16915743276849782, + "grad_norm": 0.09130859375, + "learning_rate": 0.002947220244599926, + "loss": 1.2571, + "step": 1929 + }, + { + "epoch": 0.16924512454287238, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029471073046709, + "loss": 1.2501, + "step": 1930 + }, + { + "epoch": 0.16933281631724692, + "grad_norm": 0.076171875, + "learning_rate": 0.0029469942464490353, + "loss": 1.2875, + "step": 1931 + }, + { + "epoch": 0.16942050809162146, + "grad_norm": 0.08837890625, + "learning_rate": 0.002946881069944644, + "loss": 1.2909, + "step": 1932 + }, + { + "epoch": 0.169508199865996, + "grad_norm": 0.072265625, + "learning_rate": 0.002946767775168047, + "loss": 1.3028, + "step": 1933 + }, + { + "epoch": 0.16959589164037056, + "grad_norm": 0.1044921875, + "learning_rate": 0.002946654362129576, + "loss": 1.2602, + "step": 1934 + }, + { + "epoch": 0.1696835834147451, + "grad_norm": 0.1142578125, + "learning_rate": 0.0029465408308395748, + "loss": 1.319, + "step": 1935 + }, + { + "epoch": 0.16977127518911964, + "grad_norm": 0.078125, + "learning_rate": 0.002946427181308397, + "loss": 1.2907, + "step": 1936 + }, + { + "epoch": 0.1698589669634942, + "grad_norm": 0.11474609375, + "learning_rate": 0.0029463134135464066, + "loss": 1.2207, + "step": 1937 + }, + { + "epoch": 0.16994665873786874, + "grad_norm": 0.10546875, + "learning_rate": 0.0029461995275639795, + "loss": 1.2865, + "step": 1938 + }, + { + "epoch": 0.17003435051224328, + "grad_norm": 0.1328125, + "learning_rate": 0.0029460855233715013, + "loss": 1.2687, + "step": 1939 + }, + { + "epoch": 0.17012204228661781, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029459714009793694, + "loss": 1.2635, + "step": 1940 + }, + { + "epoch": 0.17020973406099238, + "grad_norm": 0.1083984375, + "learning_rate": 0.0029458571603979914, + "loss": 1.3113, + "step": 1941 + }, + { + "epoch": 0.17029742583536692, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029457428016377856, + "loss": 1.2634, + "step": 1942 + }, + { + "epoch": 0.17038511760974145, + "grad_norm": 0.07275390625, + "learning_rate": 0.002945628324709181, + "loss": 1.3609, + "step": 1943 + }, + { + "epoch": 0.170472809384116, + "grad_norm": 0.1787109375, + "learning_rate": 0.0029455137296226183, + "loss": 1.2946, + "step": 1944 + }, + { + "epoch": 0.17056050115849056, + "grad_norm": 0.111328125, + "learning_rate": 0.0029453990163885472, + "loss": 1.3569, + "step": 1945 + }, + { + "epoch": 0.1706481929328651, + "grad_norm": 0.1298828125, + "learning_rate": 0.00294528418501743, + "loss": 1.2823, + "step": 1946 + }, + { + "epoch": 0.17073588470723963, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029451692355197396, + "loss": 1.2662, + "step": 1947 + }, + { + "epoch": 0.1708235764816142, + "grad_norm": 0.1025390625, + "learning_rate": 0.0029450541679059573, + "loss": 1.2931, + "step": 1948 + }, + { + "epoch": 0.17091126825598874, + "grad_norm": 0.134765625, + "learning_rate": 0.0029449389821865793, + "loss": 1.1969, + "step": 1949 + }, + { + "epoch": 0.17099896003036327, + "grad_norm": 0.12158203125, + "learning_rate": 0.0029448236783721077, + "loss": 1.2791, + "step": 1950 + }, + { + "epoch": 0.1710866518047378, + "grad_norm": 0.07666015625, + "learning_rate": 0.00294470825647306, + "loss": 1.3965, + "step": 1951 + }, + { + "epoch": 0.17117434357911238, + "grad_norm": 0.072265625, + "learning_rate": 0.0029445927164999616, + "loss": 1.3565, + "step": 1952 + }, + { + "epoch": 0.1712620353534869, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029444770584633488, + "loss": 1.2791, + "step": 1953 + }, + { + "epoch": 0.17134972712786145, + "grad_norm": 0.119140625, + "learning_rate": 0.0029443612823737706, + "loss": 1.2292, + "step": 1954 + }, + { + "epoch": 0.17143741890223602, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029442453882417845, + "loss": 1.2841, + "step": 1955 + }, + { + "epoch": 0.17152511067661055, + "grad_norm": 0.1005859375, + "learning_rate": 0.00294412937607796, + "loss": 1.2832, + "step": 1956 + }, + { + "epoch": 0.1716128024509851, + "grad_norm": 0.0712890625, + "learning_rate": 0.002944013245892877, + "loss": 1.3156, + "step": 1957 + }, + { + "epoch": 0.17170049422535963, + "grad_norm": 0.21484375, + "learning_rate": 0.0029438969976971273, + "loss": 1.291, + "step": 1958 + }, + { + "epoch": 0.1717881859997342, + "grad_norm": 0.11328125, + "learning_rate": 0.002943780631501311, + "loss": 1.2569, + "step": 1959 + }, + { + "epoch": 0.17187587777410873, + "grad_norm": 0.177734375, + "learning_rate": 0.002943664147316041, + "loss": 1.3101, + "step": 1960 + }, + { + "epoch": 0.17196356954848327, + "grad_norm": 0.1630859375, + "learning_rate": 0.0029435475451519404, + "loss": 1.2592, + "step": 1961 + }, + { + "epoch": 0.1720512613228578, + "grad_norm": 0.08642578125, + "learning_rate": 0.002943430825019643, + "loss": 1.3996, + "step": 1962 + }, + { + "epoch": 0.17213895309723237, + "grad_norm": 0.13671875, + "learning_rate": 0.002943313986929793, + "loss": 1.2734, + "step": 1963 + }, + { + "epoch": 0.1722266448716069, + "grad_norm": 0.07861328125, + "learning_rate": 0.0029431970308930466, + "loss": 1.3204, + "step": 1964 + }, + { + "epoch": 0.17231433664598145, + "grad_norm": 0.083984375, + "learning_rate": 0.0029430799569200687, + "loss": 1.27, + "step": 1965 + }, + { + "epoch": 0.172402028420356, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029429627650215374, + "loss": 1.2319, + "step": 1966 + }, + { + "epoch": 0.17248972019473055, + "grad_norm": 0.076171875, + "learning_rate": 0.0029428454552081393, + "loss": 1.3078, + "step": 1967 + }, + { + "epoch": 0.1725774119691051, + "grad_norm": 0.10595703125, + "learning_rate": 0.002942728027490574, + "loss": 1.2591, + "step": 1968 + }, + { + "epoch": 0.17266510374347963, + "grad_norm": 0.08642578125, + "learning_rate": 0.002942610481879549, + "loss": 1.3353, + "step": 1969 + }, + { + "epoch": 0.1727527955178542, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029424928183857846, + "loss": 1.3396, + "step": 1970 + }, + { + "epoch": 0.17284048729222873, + "grad_norm": 0.080078125, + "learning_rate": 0.002942375037020012, + "loss": 1.3243, + "step": 1971 + }, + { + "epoch": 0.17292817906660327, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029422571377929726, + "loss": 1.3106, + "step": 1972 + }, + { + "epoch": 0.17301587084097783, + "grad_norm": 0.083984375, + "learning_rate": 0.0029421391207154184, + "loss": 1.2602, + "step": 1973 + }, + { + "epoch": 0.17310356261535237, + "grad_norm": 0.064453125, + "learning_rate": 0.0029420209857981116, + "loss": 1.2806, + "step": 1974 + }, + { + "epoch": 0.1731912543897269, + "grad_norm": 0.08935546875, + "learning_rate": 0.0029419027330518255, + "loss": 1.2117, + "step": 1975 + }, + { + "epoch": 0.17327894616410144, + "grad_norm": 0.0751953125, + "learning_rate": 0.002941784362487346, + "loss": 1.3052, + "step": 1976 + }, + { + "epoch": 0.173366637938476, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029416658741154675, + "loss": 1.2397, + "step": 1977 + }, + { + "epoch": 0.17345432971285055, + "grad_norm": 0.11083984375, + "learning_rate": 0.0029415472679469954, + "loss": 1.2818, + "step": 1978 + }, + { + "epoch": 0.17354202148722508, + "grad_norm": 0.11083984375, + "learning_rate": 0.0029414285439927465, + "loss": 1.2803, + "step": 1979 + }, + { + "epoch": 0.17362971326159962, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029413097022635486, + "loss": 1.2976, + "step": 1980 + }, + { + "epoch": 0.1737174050359742, + "grad_norm": 0.07080078125, + "learning_rate": 0.002941190742770239, + "loss": 1.3047, + "step": 1981 + }, + { + "epoch": 0.17380509681034872, + "grad_norm": 0.07421875, + "learning_rate": 0.002941071665523667, + "loss": 1.2546, + "step": 1982 + }, + { + "epoch": 0.17389278858472326, + "grad_norm": 0.091796875, + "learning_rate": 0.0029409524705346917, + "loss": 1.2918, + "step": 1983 + }, + { + "epoch": 0.17398048035909783, + "grad_norm": 0.16015625, + "learning_rate": 0.002940833157814184, + "loss": 1.3862, + "step": 1984 + }, + { + "epoch": 0.17406817213347237, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029407137273730244, + "loss": 1.3218, + "step": 1985 + }, + { + "epoch": 0.1741558639078469, + "grad_norm": 0.1884765625, + "learning_rate": 0.002940594179222105, + "loss": 1.3668, + "step": 1986 + }, + { + "epoch": 0.17424355568222144, + "grad_norm": 0.23046875, + "learning_rate": 0.002940474513372328, + "loss": 1.2302, + "step": 1987 + }, + { + "epoch": 0.174331247456596, + "grad_norm": 0.07421875, + "learning_rate": 0.0029403547298346064, + "loss": 1.3277, + "step": 1988 + }, + { + "epoch": 0.17441893923097054, + "grad_norm": 0.2578125, + "learning_rate": 0.0029402348286198653, + "loss": 1.2634, + "step": 1989 + }, + { + "epoch": 0.17450663100534508, + "grad_norm": 0.10498046875, + "learning_rate": 0.002940114809739038, + "loss": 1.2542, + "step": 1990 + }, + { + "epoch": 0.17459432277971965, + "grad_norm": 0.2099609375, + "learning_rate": 0.0029399946732030706, + "loss": 1.3228, + "step": 1991 + }, + { + "epoch": 0.17468201455409418, + "grad_norm": 0.1513671875, + "learning_rate": 0.0029398744190229187, + "loss": 1.2353, + "step": 1992 + }, + { + "epoch": 0.17476970632846872, + "grad_norm": 0.09033203125, + "learning_rate": 0.00293975404720955, + "loss": 1.3104, + "step": 1993 + }, + { + "epoch": 0.17485739810284326, + "grad_norm": 0.07861328125, + "learning_rate": 0.0029396335577739413, + "loss": 1.2444, + "step": 1994 + }, + { + "epoch": 0.17494508987721782, + "grad_norm": 0.095703125, + "learning_rate": 0.0029395129507270817, + "loss": 1.3507, + "step": 1995 + }, + { + "epoch": 0.17503278165159236, + "grad_norm": 0.177734375, + "learning_rate": 0.00293939222607997, + "loss": 1.3499, + "step": 1996 + }, + { + "epoch": 0.1751204734259669, + "grad_norm": 0.140625, + "learning_rate": 0.002939271383843615, + "loss": 1.2426, + "step": 1997 + }, + { + "epoch": 0.17520816520034144, + "grad_norm": 0.130859375, + "learning_rate": 0.002939150424029039, + "loss": 1.2648, + "step": 1998 + }, + { + "epoch": 0.175295856974716, + "grad_norm": 0.20703125, + "learning_rate": 0.002939029346647272, + "loss": 1.3051, + "step": 1999 + }, + { + "epoch": 0.17538354874909054, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029389081517093556, + "loss": 1.3173, + "step": 2000 + }, + { + "epoch": 0.17538354874909054, + "eval_loss": 1.2995423078536987, + "eval_runtime": 429.2876, + "eval_samples_per_second": 33.653, + "eval_steps_per_second": 8.414, + "step": 2000 + } + ], + "logging_steps": 1.0, + "max_steps": 11403, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.897567584256e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}