{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.989025177533893, "eval_steps": 500, "global_step": 1935, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025823111684958036, "grad_norm": 18.261793337291987, "learning_rate": 1.0309278350515465e-07, "loss": 2.5303, "step": 1 }, { "epoch": 0.005164622336991607, "grad_norm": 17.801147060406052, "learning_rate": 2.061855670103093e-07, "loss": 2.5624, "step": 2 }, { "epoch": 0.007746933505487412, "grad_norm": 18.16995107477998, "learning_rate": 3.0927835051546394e-07, "loss": 2.5315, "step": 3 }, { "epoch": 0.010329244673983214, "grad_norm": 18.01568136556937, "learning_rate": 4.123711340206186e-07, "loss": 2.4765, "step": 4 }, { "epoch": 0.012911555842479019, "grad_norm": 17.515246452891258, "learning_rate": 5.154639175257732e-07, "loss": 2.5302, "step": 5 }, { "epoch": 0.015493867010974823, "grad_norm": 17.590449970545567, "learning_rate": 6.185567010309279e-07, "loss": 2.5035, "step": 6 }, { "epoch": 0.018076178179470628, "grad_norm": 17.860290443777686, "learning_rate": 7.216494845360824e-07, "loss": 2.4856, "step": 7 }, { "epoch": 0.02065848934796643, "grad_norm": 16.457931144604892, "learning_rate": 8.247422680412372e-07, "loss": 2.5034, "step": 8 }, { "epoch": 0.023240800516462233, "grad_norm": 15.470780109900105, "learning_rate": 9.278350515463919e-07, "loss": 2.4852, "step": 9 }, { "epoch": 0.025823111684958037, "grad_norm": 15.512665472720686, "learning_rate": 1.0309278350515464e-06, "loss": 2.4699, "step": 10 }, { "epoch": 0.028405422853453842, "grad_norm": 11.58357921033743, "learning_rate": 1.134020618556701e-06, "loss": 2.4699, "step": 11 }, { "epoch": 0.030987734021949646, "grad_norm": 10.860729184020308, "learning_rate": 1.2371134020618557e-06, "loss": 2.4071, "step": 12 }, { "epoch": 0.03357004519044545, "grad_norm": 9.989819033444546, "learning_rate": 1.3402061855670104e-06, "loss": 2.3805, "step": 13 }, { "epoch": 0.036152356358941255, "grad_norm": 4.5174463153202415, "learning_rate": 1.4432989690721649e-06, "loss": 2.2546, "step": 14 }, { "epoch": 0.03873466752743705, "grad_norm": 4.393268427169014, "learning_rate": 1.5463917525773197e-06, "loss": 2.2604, "step": 15 }, { "epoch": 0.04131697869593286, "grad_norm": 4.427841764828794, "learning_rate": 1.6494845360824744e-06, "loss": 2.2923, "step": 16 }, { "epoch": 0.04389928986442866, "grad_norm": 4.116274907311869, "learning_rate": 1.7525773195876288e-06, "loss": 2.2216, "step": 17 }, { "epoch": 0.046481601032924466, "grad_norm": 4.133640482984593, "learning_rate": 1.8556701030927837e-06, "loss": 2.2106, "step": 18 }, { "epoch": 0.04906391220142027, "grad_norm": 4.395953110017593, "learning_rate": 1.9587628865979384e-06, "loss": 2.0858, "step": 19 }, { "epoch": 0.051646223369916075, "grad_norm": 4.474700218464569, "learning_rate": 2.061855670103093e-06, "loss": 2.0397, "step": 20 }, { "epoch": 0.05422853453841188, "grad_norm": 4.4282827611154545, "learning_rate": 2.1649484536082477e-06, "loss": 1.9898, "step": 21 }, { "epoch": 0.056810845706907684, "grad_norm": 4.327539626304312, "learning_rate": 2.268041237113402e-06, "loss": 1.9951, "step": 22 }, { "epoch": 0.05939315687540349, "grad_norm": 4.120482191834098, "learning_rate": 2.3711340206185566e-06, "loss": 1.9155, "step": 23 }, { "epoch": 0.06197546804389929, "grad_norm": 3.8070009573204358, "learning_rate": 2.4742268041237115e-06, "loss": 1.8995, "step": 24 }, { "epoch": 0.0645577792123951, "grad_norm": 4.212221981103606, "learning_rate": 2.577319587628866e-06, "loss": 1.8671, "step": 25 }, { "epoch": 0.0671400903808909, "grad_norm": 4.058515787932264, "learning_rate": 2.680412371134021e-06, "loss": 1.624, "step": 26 }, { "epoch": 0.0697224015493867, "grad_norm": 2.6837920364601398, "learning_rate": 2.7835051546391757e-06, "loss": 1.6006, "step": 27 }, { "epoch": 0.07230471271788251, "grad_norm": 2.3134143191011924, "learning_rate": 2.8865979381443297e-06, "loss": 1.5962, "step": 28 }, { "epoch": 0.07488702388637831, "grad_norm": 2.10252758773887, "learning_rate": 2.9896907216494846e-06, "loss": 1.5671, "step": 29 }, { "epoch": 0.0774693350548741, "grad_norm": 1.5299035293940784, "learning_rate": 3.0927835051546395e-06, "loss": 1.5472, "step": 30 }, { "epoch": 0.08005164622336991, "grad_norm": 1.2378467992780195, "learning_rate": 3.195876288659794e-06, "loss": 1.5826, "step": 31 }, { "epoch": 0.08263395739186571, "grad_norm": 1.1693886031886611, "learning_rate": 3.298969072164949e-06, "loss": 1.4891, "step": 32 }, { "epoch": 0.08521626856036152, "grad_norm": 1.1401616730246562, "learning_rate": 3.4020618556701037e-06, "loss": 1.4982, "step": 33 }, { "epoch": 0.08779857972885732, "grad_norm": 1.0954194107096582, "learning_rate": 3.5051546391752577e-06, "loss": 1.5325, "step": 34 }, { "epoch": 0.09038089089735313, "grad_norm": 1.0579698812847145, "learning_rate": 3.6082474226804126e-06, "loss": 1.4814, "step": 35 }, { "epoch": 0.09296320206584893, "grad_norm": 0.9653972853589035, "learning_rate": 3.7113402061855674e-06, "loss": 1.4494, "step": 36 }, { "epoch": 0.09554551323434474, "grad_norm": 0.9645799739429384, "learning_rate": 3.814432989690722e-06, "loss": 1.5063, "step": 37 }, { "epoch": 0.09812782440284054, "grad_norm": 1.00902463704413, "learning_rate": 3.917525773195877e-06, "loss": 1.4683, "step": 38 }, { "epoch": 0.10071013557133635, "grad_norm": 0.8783590838731776, "learning_rate": 4.020618556701032e-06, "loss": 1.4453, "step": 39 }, { "epoch": 0.10329244673983215, "grad_norm": 0.8107745608541321, "learning_rate": 4.123711340206186e-06, "loss": 1.4753, "step": 40 }, { "epoch": 0.10587475790832795, "grad_norm": 0.789499449431658, "learning_rate": 4.2268041237113405e-06, "loss": 1.4376, "step": 41 }, { "epoch": 0.10845706907682376, "grad_norm": 0.8025862979836251, "learning_rate": 4.329896907216495e-06, "loss": 1.4394, "step": 42 }, { "epoch": 0.11103938024531956, "grad_norm": 0.7735255000850332, "learning_rate": 4.4329896907216494e-06, "loss": 1.4252, "step": 43 }, { "epoch": 0.11362169141381537, "grad_norm": 0.655685351955956, "learning_rate": 4.536082474226804e-06, "loss": 1.4297, "step": 44 }, { "epoch": 0.11620400258231117, "grad_norm": 0.6434305676734384, "learning_rate": 4.639175257731959e-06, "loss": 1.4067, "step": 45 }, { "epoch": 0.11878631375080698, "grad_norm": 0.6261090972732297, "learning_rate": 4.742268041237113e-06, "loss": 1.4101, "step": 46 }, { "epoch": 0.12136862491930278, "grad_norm": 0.6073212568723447, "learning_rate": 4.845360824742268e-06, "loss": 1.4172, "step": 47 }, { "epoch": 0.12395093608779859, "grad_norm": 0.5593992251138936, "learning_rate": 4.948453608247423e-06, "loss": 1.3798, "step": 48 }, { "epoch": 0.1265332472562944, "grad_norm": 0.5922984047206602, "learning_rate": 5.051546391752578e-06, "loss": 1.3858, "step": 49 }, { "epoch": 0.1291155584247902, "grad_norm": 0.6051805656066229, "learning_rate": 5.154639175257732e-06, "loss": 1.3836, "step": 50 }, { "epoch": 0.131697869593286, "grad_norm": 0.5691467307958845, "learning_rate": 5.257731958762888e-06, "loss": 1.3841, "step": 51 }, { "epoch": 0.1342801807617818, "grad_norm": 0.5693752131599139, "learning_rate": 5.360824742268042e-06, "loss": 1.3529, "step": 52 }, { "epoch": 0.1368624919302776, "grad_norm": 0.5695922879694713, "learning_rate": 5.463917525773196e-06, "loss": 1.3599, "step": 53 }, { "epoch": 0.1394448030987734, "grad_norm": 0.5653409792135605, "learning_rate": 5.567010309278351e-06, "loss": 1.359, "step": 54 }, { "epoch": 0.14202711426726922, "grad_norm": 0.5181379607950953, "learning_rate": 5.670103092783505e-06, "loss": 1.358, "step": 55 }, { "epoch": 0.14460942543576502, "grad_norm": 0.5616919575863237, "learning_rate": 5.7731958762886594e-06, "loss": 1.3925, "step": 56 }, { "epoch": 0.14719173660426083, "grad_norm": 0.5785012034706822, "learning_rate": 5.876288659793815e-06, "loss": 1.3902, "step": 57 }, { "epoch": 0.14977404777275663, "grad_norm": 0.5304602501774517, "learning_rate": 5.979381443298969e-06, "loss": 1.3516, "step": 58 }, { "epoch": 0.15235635894125243, "grad_norm": 0.530893892325567, "learning_rate": 6.082474226804124e-06, "loss": 1.3556, "step": 59 }, { "epoch": 0.1549386701097482, "grad_norm": 0.5578192633717619, "learning_rate": 6.185567010309279e-06, "loss": 1.3609, "step": 60 }, { "epoch": 0.15752098127824402, "grad_norm": 0.5513788256965562, "learning_rate": 6.288659793814433e-06, "loss": 1.3494, "step": 61 }, { "epoch": 0.16010329244673982, "grad_norm": 0.5357427336170907, "learning_rate": 6.391752577319588e-06, "loss": 1.3338, "step": 62 }, { "epoch": 0.16268560361523562, "grad_norm": 0.5023717761905121, "learning_rate": 6.494845360824743e-06, "loss": 1.3329, "step": 63 }, { "epoch": 0.16526791478373143, "grad_norm": 0.5667763878793689, "learning_rate": 6.597938144329898e-06, "loss": 1.3091, "step": 64 }, { "epoch": 0.16785022595222723, "grad_norm": 0.5321682781083598, "learning_rate": 6.701030927835052e-06, "loss": 1.4019, "step": 65 }, { "epoch": 0.17043253712072304, "grad_norm": 0.5281078850382545, "learning_rate": 6.804123711340207e-06, "loss": 1.2957, "step": 66 }, { "epoch": 0.17301484828921884, "grad_norm": 0.5517071277006239, "learning_rate": 6.907216494845361e-06, "loss": 1.3387, "step": 67 }, { "epoch": 0.17559715945771465, "grad_norm": 0.49767879157016104, "learning_rate": 7.010309278350515e-06, "loss": 1.3428, "step": 68 }, { "epoch": 0.17817947062621045, "grad_norm": 0.5333338011710018, "learning_rate": 7.113402061855671e-06, "loss": 1.3275, "step": 69 }, { "epoch": 0.18076178179470626, "grad_norm": 0.5436738511723471, "learning_rate": 7.216494845360825e-06, "loss": 1.323, "step": 70 }, { "epoch": 0.18334409296320206, "grad_norm": 0.510166704780262, "learning_rate": 7.319587628865979e-06, "loss": 1.3337, "step": 71 }, { "epoch": 0.18592640413169786, "grad_norm": 0.5367005388657456, "learning_rate": 7.422680412371135e-06, "loss": 1.3157, "step": 72 }, { "epoch": 0.18850871530019367, "grad_norm": 0.5448486484799029, "learning_rate": 7.525773195876289e-06, "loss": 1.3016, "step": 73 }, { "epoch": 0.19109102646868947, "grad_norm": 0.5049919202829503, "learning_rate": 7.628865979381444e-06, "loss": 1.314, "step": 74 }, { "epoch": 0.19367333763718528, "grad_norm": 0.5203343239140137, "learning_rate": 7.731958762886599e-06, "loss": 1.3402, "step": 75 }, { "epoch": 0.19625564880568108, "grad_norm": 0.5295426487296165, "learning_rate": 7.835051546391754e-06, "loss": 1.2937, "step": 76 }, { "epoch": 0.1988379599741769, "grad_norm": 0.516845033941673, "learning_rate": 7.938144329896907e-06, "loss": 1.3334, "step": 77 }, { "epoch": 0.2014202711426727, "grad_norm": 0.5195834990483513, "learning_rate": 8.041237113402063e-06, "loss": 1.3398, "step": 78 }, { "epoch": 0.2040025823111685, "grad_norm": 0.5437220214849503, "learning_rate": 8.144329896907216e-06, "loss": 1.3328, "step": 79 }, { "epoch": 0.2065848934796643, "grad_norm": 0.5316348277354109, "learning_rate": 8.247422680412371e-06, "loss": 1.2668, "step": 80 }, { "epoch": 0.2091672046481601, "grad_norm": 0.5367859922800738, "learning_rate": 8.350515463917526e-06, "loss": 1.3455, "step": 81 }, { "epoch": 0.2117495158166559, "grad_norm": 0.5330729877188181, "learning_rate": 8.453608247422681e-06, "loss": 1.3012, "step": 82 }, { "epoch": 0.2143318269851517, "grad_norm": 0.5289538334232236, "learning_rate": 8.556701030927836e-06, "loss": 1.2987, "step": 83 }, { "epoch": 0.21691413815364752, "grad_norm": 0.5271186274617113, "learning_rate": 8.65979381443299e-06, "loss": 1.3091, "step": 84 }, { "epoch": 0.21949644932214332, "grad_norm": 0.5425463627416961, "learning_rate": 8.762886597938146e-06, "loss": 1.3312, "step": 85 }, { "epoch": 0.22207876049063913, "grad_norm": 0.5002639279766852, "learning_rate": 8.865979381443299e-06, "loss": 1.297, "step": 86 }, { "epoch": 0.22466107165913493, "grad_norm": 0.5217277201869615, "learning_rate": 8.969072164948455e-06, "loss": 1.289, "step": 87 }, { "epoch": 0.22724338282763074, "grad_norm": 0.5187640962084964, "learning_rate": 9.072164948453609e-06, "loss": 1.2987, "step": 88 }, { "epoch": 0.22982569399612654, "grad_norm": 0.5338044768965542, "learning_rate": 9.175257731958764e-06, "loss": 1.3224, "step": 89 }, { "epoch": 0.23240800516462234, "grad_norm": 0.5154318950249379, "learning_rate": 9.278350515463918e-06, "loss": 1.2765, "step": 90 }, { "epoch": 0.23499031633311815, "grad_norm": 0.5528419081350959, "learning_rate": 9.381443298969073e-06, "loss": 1.2546, "step": 91 }, { "epoch": 0.23757262750161395, "grad_norm": 0.5214303815188377, "learning_rate": 9.484536082474226e-06, "loss": 1.3196, "step": 92 }, { "epoch": 0.24015493867010976, "grad_norm": 0.5526561516212597, "learning_rate": 9.587628865979383e-06, "loss": 1.3079, "step": 93 }, { "epoch": 0.24273724983860556, "grad_norm": 0.5222706312908064, "learning_rate": 9.690721649484536e-06, "loss": 1.3066, "step": 94 }, { "epoch": 0.24531956100710137, "grad_norm": 0.5012834034257991, "learning_rate": 9.793814432989691e-06, "loss": 1.2722, "step": 95 }, { "epoch": 0.24790187217559717, "grad_norm": 0.546506717931034, "learning_rate": 9.896907216494846e-06, "loss": 1.2797, "step": 96 }, { "epoch": 0.25048418334409295, "grad_norm": 0.5253068189240363, "learning_rate": 1e-05, "loss": 1.3317, "step": 97 }, { "epoch": 0.2530664945125888, "grad_norm": 0.5611013083746247, "learning_rate": 1.0103092783505156e-05, "loss": 1.3025, "step": 98 }, { "epoch": 0.25564880568108456, "grad_norm": 0.5298450611788376, "learning_rate": 1.0206185567010309e-05, "loss": 1.3036, "step": 99 }, { "epoch": 0.2582311168495804, "grad_norm": 0.5099158069328582, "learning_rate": 1.0309278350515464e-05, "loss": 1.3022, "step": 100 }, { "epoch": 0.26081342801807617, "grad_norm": 0.5293692963261973, "learning_rate": 1.041237113402062e-05, "loss": 1.3146, "step": 101 }, { "epoch": 0.263395739186572, "grad_norm": 0.5244293197527762, "learning_rate": 1.0515463917525775e-05, "loss": 1.2885, "step": 102 }, { "epoch": 0.2659780503550678, "grad_norm": 0.5033038451383925, "learning_rate": 1.0618556701030928e-05, "loss": 1.2578, "step": 103 }, { "epoch": 0.2685603615235636, "grad_norm": 0.5274373232659619, "learning_rate": 1.0721649484536083e-05, "loss": 1.299, "step": 104 }, { "epoch": 0.2711426726920594, "grad_norm": 0.5444726311980428, "learning_rate": 1.0824742268041238e-05, "loss": 1.2929, "step": 105 }, { "epoch": 0.2737249838605552, "grad_norm": 0.5326713139943118, "learning_rate": 1.0927835051546391e-05, "loss": 1.2753, "step": 106 }, { "epoch": 0.276307295029051, "grad_norm": 0.5380363136345329, "learning_rate": 1.1030927835051548e-05, "loss": 1.3309, "step": 107 }, { "epoch": 0.2788896061975468, "grad_norm": 0.5326668305211691, "learning_rate": 1.1134020618556703e-05, "loss": 1.2572, "step": 108 }, { "epoch": 0.2814719173660426, "grad_norm": 0.546468614229743, "learning_rate": 1.1237113402061856e-05, "loss": 1.3002, "step": 109 }, { "epoch": 0.28405422853453843, "grad_norm": 0.5702827216999251, "learning_rate": 1.134020618556701e-05, "loss": 1.2785, "step": 110 }, { "epoch": 0.2866365397030342, "grad_norm": 0.6279431033493682, "learning_rate": 1.1443298969072166e-05, "loss": 1.293, "step": 111 }, { "epoch": 0.28921885087153004, "grad_norm": 0.531097886388855, "learning_rate": 1.1546391752577319e-05, "loss": 1.2583, "step": 112 }, { "epoch": 0.2918011620400258, "grad_norm": 0.5599048942527884, "learning_rate": 1.1649484536082475e-05, "loss": 1.2812, "step": 113 }, { "epoch": 0.29438347320852165, "grad_norm": 0.5156492032600126, "learning_rate": 1.175257731958763e-05, "loss": 1.2593, "step": 114 }, { "epoch": 0.2969657843770174, "grad_norm": 0.5204640872347789, "learning_rate": 1.1855670103092785e-05, "loss": 1.2222, "step": 115 }, { "epoch": 0.29954809554551326, "grad_norm": 0.5242605009448421, "learning_rate": 1.1958762886597938e-05, "loss": 1.2452, "step": 116 }, { "epoch": 0.30213040671400904, "grad_norm": 0.5194114061316515, "learning_rate": 1.2061855670103093e-05, "loss": 1.2844, "step": 117 }, { "epoch": 0.30471271788250487, "grad_norm": 0.517182795982599, "learning_rate": 1.2164948453608248e-05, "loss": 1.2494, "step": 118 }, { "epoch": 0.30729502905100065, "grad_norm": 0.5322718855106631, "learning_rate": 1.2268041237113405e-05, "loss": 1.2853, "step": 119 }, { "epoch": 0.3098773402194964, "grad_norm": 0.5477767784263254, "learning_rate": 1.2371134020618558e-05, "loss": 1.266, "step": 120 }, { "epoch": 0.31245965138799225, "grad_norm": 0.5409220884054373, "learning_rate": 1.2474226804123713e-05, "loss": 1.2862, "step": 121 }, { "epoch": 0.31504196255648803, "grad_norm": 0.5278483491044216, "learning_rate": 1.2577319587628866e-05, "loss": 1.2431, "step": 122 }, { "epoch": 0.31762427372498386, "grad_norm": 0.5366419552263015, "learning_rate": 1.268041237113402e-05, "loss": 1.2665, "step": 123 }, { "epoch": 0.32020658489347964, "grad_norm": 0.5484453793169465, "learning_rate": 1.2783505154639176e-05, "loss": 1.2466, "step": 124 }, { "epoch": 0.32278889606197547, "grad_norm": 0.5707617704740593, "learning_rate": 1.2886597938144332e-05, "loss": 1.2801, "step": 125 }, { "epoch": 0.32537120723047125, "grad_norm": 0.5545965695444451, "learning_rate": 1.2989690721649485e-05, "loss": 1.2627, "step": 126 }, { "epoch": 0.3279535183989671, "grad_norm": 0.545566481033505, "learning_rate": 1.309278350515464e-05, "loss": 1.2662, "step": 127 }, { "epoch": 0.33053582956746286, "grad_norm": 0.614998937339536, "learning_rate": 1.3195876288659795e-05, "loss": 1.278, "step": 128 }, { "epoch": 0.3331181407359587, "grad_norm": 0.8515042792729816, "learning_rate": 1.3298969072164948e-05, "loss": 1.2528, "step": 129 }, { "epoch": 0.33570045190445447, "grad_norm": 0.5478821425580377, "learning_rate": 1.3402061855670103e-05, "loss": 1.2626, "step": 130 }, { "epoch": 0.3382827630729503, "grad_norm": 0.5793920120710241, "learning_rate": 1.350515463917526e-05, "loss": 1.229, "step": 131 }, { "epoch": 0.3408650742414461, "grad_norm": 0.6386990352009216, "learning_rate": 1.3608247422680415e-05, "loss": 1.2371, "step": 132 }, { "epoch": 0.3434473854099419, "grad_norm": 0.5565581664986918, "learning_rate": 1.3711340206185568e-05, "loss": 1.2155, "step": 133 }, { "epoch": 0.3460296965784377, "grad_norm": 0.5946237867227266, "learning_rate": 1.3814432989690723e-05, "loss": 1.2841, "step": 134 }, { "epoch": 0.3486120077469335, "grad_norm": 0.5521016353163, "learning_rate": 1.3917525773195878e-05, "loss": 1.2551, "step": 135 }, { "epoch": 0.3511943189154293, "grad_norm": 0.5385185628561531, "learning_rate": 1.402061855670103e-05, "loss": 1.2419, "step": 136 }, { "epoch": 0.3537766300839251, "grad_norm": 0.5193240629735151, "learning_rate": 1.4123711340206187e-05, "loss": 1.2539, "step": 137 }, { "epoch": 0.3563589412524209, "grad_norm": 0.556121240094166, "learning_rate": 1.4226804123711342e-05, "loss": 1.2303, "step": 138 }, { "epoch": 0.35894125242091673, "grad_norm": 0.5218247569723863, "learning_rate": 1.4329896907216495e-05, "loss": 1.2644, "step": 139 }, { "epoch": 0.3615235635894125, "grad_norm": 0.530437093642517, "learning_rate": 1.443298969072165e-05, "loss": 1.2497, "step": 140 }, { "epoch": 0.36410587475790834, "grad_norm": 0.5484440453142011, "learning_rate": 1.4536082474226805e-05, "loss": 1.2324, "step": 141 }, { "epoch": 0.3666881859264041, "grad_norm": 0.5493692192470679, "learning_rate": 1.4639175257731958e-05, "loss": 1.2717, "step": 142 }, { "epoch": 0.36927049709489995, "grad_norm": 0.5413123502333834, "learning_rate": 1.4742268041237115e-05, "loss": 1.2103, "step": 143 }, { "epoch": 0.37185280826339573, "grad_norm": 0.556751121901872, "learning_rate": 1.484536082474227e-05, "loss": 1.2532, "step": 144 }, { "epoch": 0.37443511943189156, "grad_norm": 0.5408139067043912, "learning_rate": 1.4948453608247425e-05, "loss": 1.2995, "step": 145 }, { "epoch": 0.37701743060038734, "grad_norm": 0.567109158025857, "learning_rate": 1.5051546391752578e-05, "loss": 1.2412, "step": 146 }, { "epoch": 0.37959974176888317, "grad_norm": 0.5489463445020752, "learning_rate": 1.5154639175257733e-05, "loss": 1.2601, "step": 147 }, { "epoch": 0.38218205293737895, "grad_norm": 0.5269782212095577, "learning_rate": 1.5257731958762888e-05, "loss": 1.2105, "step": 148 }, { "epoch": 0.3847643641058748, "grad_norm": 0.5476377594621987, "learning_rate": 1.5360824742268042e-05, "loss": 1.251, "step": 149 }, { "epoch": 0.38734667527437056, "grad_norm": 0.5851100678927004, "learning_rate": 1.5463917525773197e-05, "loss": 1.2722, "step": 150 }, { "epoch": 0.3899289864428664, "grad_norm": 0.5906959266553361, "learning_rate": 1.5567010309278352e-05, "loss": 1.2605, "step": 151 }, { "epoch": 0.39251129761136216, "grad_norm": 0.5812493666723612, "learning_rate": 1.5670103092783507e-05, "loss": 1.278, "step": 152 }, { "epoch": 0.395093608779858, "grad_norm": 0.6197633946415279, "learning_rate": 1.5773195876288662e-05, "loss": 1.2062, "step": 153 }, { "epoch": 0.3976759199483538, "grad_norm": 0.5382604894375081, "learning_rate": 1.5876288659793813e-05, "loss": 1.1994, "step": 154 }, { "epoch": 0.4002582311168496, "grad_norm": 0.5919569403332751, "learning_rate": 1.597938144329897e-05, "loss": 1.2151, "step": 155 }, { "epoch": 0.4028405422853454, "grad_norm": 0.5540942763865888, "learning_rate": 1.6082474226804127e-05, "loss": 1.2245, "step": 156 }, { "epoch": 0.4054228534538412, "grad_norm": 0.6456250154614755, "learning_rate": 1.618556701030928e-05, "loss": 1.2407, "step": 157 }, { "epoch": 0.408005164622337, "grad_norm": 0.5495270617698018, "learning_rate": 1.6288659793814433e-05, "loss": 1.2319, "step": 158 }, { "epoch": 0.41058747579083277, "grad_norm": 0.6456750468117451, "learning_rate": 1.6391752577319588e-05, "loss": 1.2314, "step": 159 }, { "epoch": 0.4131697869593286, "grad_norm": 0.5361216417752698, "learning_rate": 1.6494845360824743e-05, "loss": 1.2252, "step": 160 }, { "epoch": 0.4157520981278244, "grad_norm": 0.5740356212634748, "learning_rate": 1.65979381443299e-05, "loss": 1.2199, "step": 161 }, { "epoch": 0.4183344092963202, "grad_norm": 0.5600044764299427, "learning_rate": 1.6701030927835052e-05, "loss": 1.2386, "step": 162 }, { "epoch": 0.420916720464816, "grad_norm": 0.5752486574243302, "learning_rate": 1.6804123711340207e-05, "loss": 1.2745, "step": 163 }, { "epoch": 0.4234990316333118, "grad_norm": 0.5653976115621461, "learning_rate": 1.6907216494845362e-05, "loss": 1.2294, "step": 164 }, { "epoch": 0.4260813428018076, "grad_norm": 0.6075219487510116, "learning_rate": 1.7010309278350517e-05, "loss": 1.2719, "step": 165 }, { "epoch": 0.4286636539703034, "grad_norm": 0.606371230053945, "learning_rate": 1.7113402061855672e-05, "loss": 1.2112, "step": 166 }, { "epoch": 0.4312459651387992, "grad_norm": 0.6078879765053119, "learning_rate": 1.7216494845360827e-05, "loss": 1.264, "step": 167 }, { "epoch": 0.43382827630729504, "grad_norm": 0.5767804822867032, "learning_rate": 1.731958762886598e-05, "loss": 1.2515, "step": 168 }, { "epoch": 0.4364105874757908, "grad_norm": 0.6130923630273873, "learning_rate": 1.7422680412371137e-05, "loss": 1.2038, "step": 169 }, { "epoch": 0.43899289864428664, "grad_norm": 0.5235102985682106, "learning_rate": 1.752577319587629e-05, "loss": 1.237, "step": 170 }, { "epoch": 0.4415752098127824, "grad_norm": 0.6204241509904728, "learning_rate": 1.7628865979381443e-05, "loss": 1.2653, "step": 171 }, { "epoch": 0.44415752098127825, "grad_norm": 0.5839862114241418, "learning_rate": 1.7731958762886598e-05, "loss": 1.2268, "step": 172 }, { "epoch": 0.44673983214977403, "grad_norm": 0.5379286158874602, "learning_rate": 1.7835051546391756e-05, "loss": 1.2237, "step": 173 }, { "epoch": 0.44932214331826986, "grad_norm": 0.5773716297805083, "learning_rate": 1.793814432989691e-05, "loss": 1.2563, "step": 174 }, { "epoch": 0.45190445448676564, "grad_norm": 0.5310519709213946, "learning_rate": 1.8041237113402062e-05, "loss": 1.1999, "step": 175 }, { "epoch": 0.45448676565526147, "grad_norm": 0.5849880582279767, "learning_rate": 1.8144329896907217e-05, "loss": 1.2286, "step": 176 }, { "epoch": 0.45706907682375725, "grad_norm": 0.5412178590822653, "learning_rate": 1.8247422680412372e-05, "loss": 1.2008, "step": 177 }, { "epoch": 0.4596513879922531, "grad_norm": 0.6133621047960811, "learning_rate": 1.8350515463917527e-05, "loss": 1.2378, "step": 178 }, { "epoch": 0.46223369916074886, "grad_norm": 0.528699955767748, "learning_rate": 1.8453608247422682e-05, "loss": 1.2259, "step": 179 }, { "epoch": 0.4648160103292447, "grad_norm": 0.5279317130021535, "learning_rate": 1.8556701030927837e-05, "loss": 1.2412, "step": 180 }, { "epoch": 0.46739832149774047, "grad_norm": 0.5853559551993786, "learning_rate": 1.865979381443299e-05, "loss": 1.2214, "step": 181 }, { "epoch": 0.4699806326662363, "grad_norm": 0.5424233833428941, "learning_rate": 1.8762886597938147e-05, "loss": 1.2458, "step": 182 }, { "epoch": 0.4725629438347321, "grad_norm": 0.5106671423999047, "learning_rate": 1.88659793814433e-05, "loss": 1.2233, "step": 183 }, { "epoch": 0.4751452550032279, "grad_norm": 0.5324533214858636, "learning_rate": 1.8969072164948453e-05, "loss": 1.2069, "step": 184 }, { "epoch": 0.4777275661717237, "grad_norm": 0.5646853613253074, "learning_rate": 1.907216494845361e-05, "loss": 1.252, "step": 185 }, { "epoch": 0.4803098773402195, "grad_norm": 0.5937004053571758, "learning_rate": 1.9175257731958766e-05, "loss": 1.2147, "step": 186 }, { "epoch": 0.4828921885087153, "grad_norm": 0.5562505209372693, "learning_rate": 1.927835051546392e-05, "loss": 1.2675, "step": 187 }, { "epoch": 0.4854744996772111, "grad_norm": 0.5563014952202203, "learning_rate": 1.9381443298969072e-05, "loss": 1.2189, "step": 188 }, { "epoch": 0.4880568108457069, "grad_norm": 0.5264086037575215, "learning_rate": 1.9484536082474227e-05, "loss": 1.2233, "step": 189 }, { "epoch": 0.49063912201420273, "grad_norm": 0.5151034326104902, "learning_rate": 1.9587628865979382e-05, "loss": 1.1885, "step": 190 }, { "epoch": 0.4932214331826985, "grad_norm": 0.5034288685234395, "learning_rate": 1.969072164948454e-05, "loss": 1.1903, "step": 191 }, { "epoch": 0.49580374435119434, "grad_norm": 0.5307684939002809, "learning_rate": 1.9793814432989692e-05, "loss": 1.1928, "step": 192 }, { "epoch": 0.4983860555196901, "grad_norm": 0.5230091554872849, "learning_rate": 1.9896907216494847e-05, "loss": 1.2519, "step": 193 }, { "epoch": 0.5009683666881859, "grad_norm": 0.5226317801919913, "learning_rate": 2e-05, "loss": 1.2413, "step": 194 }, { "epoch": 0.5035506778566817, "grad_norm": 0.5482333261742753, "learning_rate": 1.9999983719336895e-05, "loss": 1.2165, "step": 195 }, { "epoch": 0.5061329890251776, "grad_norm": 0.5162294083726316, "learning_rate": 1.999993487740058e-05, "loss": 1.241, "step": 196 }, { "epoch": 0.5087153001936734, "grad_norm": 0.5914236651547329, "learning_rate": 1.99998534743501e-05, "loss": 1.2174, "step": 197 }, { "epoch": 0.5112976113621691, "grad_norm": 0.5318708983837044, "learning_rate": 1.9999739510450505e-05, "loss": 1.2061, "step": 198 }, { "epoch": 0.513879922530665, "grad_norm": 0.5320871675914947, "learning_rate": 1.9999592986072886e-05, "loss": 1.2102, "step": 199 }, { "epoch": 0.5164622336991608, "grad_norm": 0.520490178789329, "learning_rate": 1.999941390169434e-05, "loss": 1.2041, "step": 200 }, { "epoch": 0.5190445448676565, "grad_norm": 0.5771804416096503, "learning_rate": 1.9999202257897994e-05, "loss": 1.2208, "step": 201 }, { "epoch": 0.5216268560361523, "grad_norm": 0.5820688393505574, "learning_rate": 1.9998958055372984e-05, "loss": 1.2535, "step": 202 }, { "epoch": 0.5242091672046482, "grad_norm": 0.5725413352628053, "learning_rate": 1.9998681294914463e-05, "loss": 1.2003, "step": 203 }, { "epoch": 0.526791478373144, "grad_norm": 0.5813889118499928, "learning_rate": 1.999837197742361e-05, "loss": 1.1826, "step": 204 }, { "epoch": 0.5293737895416397, "grad_norm": 0.543697742052886, "learning_rate": 1.9998030103907594e-05, "loss": 1.2383, "step": 205 }, { "epoch": 0.5319561007101355, "grad_norm": 0.5737878782200372, "learning_rate": 1.9997655675479604e-05, "loss": 1.2261, "step": 206 }, { "epoch": 0.5345384118786314, "grad_norm": 0.5318306020135024, "learning_rate": 1.999724869335883e-05, "loss": 1.2087, "step": 207 }, { "epoch": 0.5371207230471272, "grad_norm": 0.5577225248661078, "learning_rate": 1.999680915887046e-05, "loss": 1.1793, "step": 208 }, { "epoch": 0.5397030342156229, "grad_norm": 0.5246527308813651, "learning_rate": 1.9996337073445673e-05, "loss": 1.1913, "step": 209 }, { "epoch": 0.5422853453841188, "grad_norm": 0.5214804985936327, "learning_rate": 1.9995832438621646e-05, "loss": 1.2264, "step": 210 }, { "epoch": 0.5448676565526146, "grad_norm": 0.5608653450795346, "learning_rate": 1.9995295256041534e-05, "loss": 1.2269, "step": 211 }, { "epoch": 0.5474499677211104, "grad_norm": 0.516907670213089, "learning_rate": 1.9994725527454476e-05, "loss": 1.1963, "step": 212 }, { "epoch": 0.5500322788896062, "grad_norm": 0.5315857426965214, "learning_rate": 1.999412325471558e-05, "loss": 1.2646, "step": 213 }, { "epoch": 0.552614590058102, "grad_norm": 0.5281811102245024, "learning_rate": 1.999348843978593e-05, "loss": 1.2163, "step": 214 }, { "epoch": 0.5551969012265978, "grad_norm": 0.5579573481195031, "learning_rate": 1.9992821084732572e-05, "loss": 1.2262, "step": 215 }, { "epoch": 0.5577792123950936, "grad_norm": 0.5595800317478022, "learning_rate": 1.9992121191728495e-05, "loss": 1.1872, "step": 216 }, { "epoch": 0.5603615235635894, "grad_norm": 0.5344069260281085, "learning_rate": 1.9991388763052643e-05, "loss": 1.2293, "step": 217 }, { "epoch": 0.5629438347320852, "grad_norm": 0.549912373034949, "learning_rate": 1.9990623801089908e-05, "loss": 1.1958, "step": 218 }, { "epoch": 0.565526145900581, "grad_norm": 0.5482064392691067, "learning_rate": 1.9989826308331103e-05, "loss": 1.249, "step": 219 }, { "epoch": 0.5681084570690769, "grad_norm": 0.6017796916988709, "learning_rate": 1.9988996287372967e-05, "loss": 1.1591, "step": 220 }, { "epoch": 0.5706907682375726, "grad_norm": 0.5616669264753023, "learning_rate": 1.9988133740918167e-05, "loss": 1.2029, "step": 221 }, { "epoch": 0.5732730794060684, "grad_norm": 0.548070565923786, "learning_rate": 1.998723867177526e-05, "loss": 1.2113, "step": 222 }, { "epoch": 0.5758553905745643, "grad_norm": 0.5579809466395939, "learning_rate": 1.998631108285871e-05, "loss": 1.1915, "step": 223 }, { "epoch": 0.5784377017430601, "grad_norm": 0.5145023167575715, "learning_rate": 1.9985350977188877e-05, "loss": 1.2455, "step": 224 }, { "epoch": 0.5810200129115558, "grad_norm": 0.5438281690868874, "learning_rate": 1.998435835789199e-05, "loss": 1.1996, "step": 225 }, { "epoch": 0.5836023240800516, "grad_norm": 0.5062822089728274, "learning_rate": 1.9983333228200145e-05, "loss": 1.2267, "step": 226 }, { "epoch": 0.5861846352485475, "grad_norm": 0.49455916062220207, "learning_rate": 1.9982275591451304e-05, "loss": 1.2234, "step": 227 }, { "epoch": 0.5887669464170433, "grad_norm": 0.5230043234273385, "learning_rate": 1.998118545108927e-05, "loss": 1.2048, "step": 228 }, { "epoch": 0.591349257585539, "grad_norm": 0.492411804242286, "learning_rate": 1.998006281066369e-05, "loss": 1.1567, "step": 229 }, { "epoch": 0.5939315687540349, "grad_norm": 0.47969671152864785, "learning_rate": 1.997890767383002e-05, "loss": 1.1842, "step": 230 }, { "epoch": 0.5965138799225307, "grad_norm": 0.5037463603875141, "learning_rate": 1.9977720044349546e-05, "loss": 1.2071, "step": 231 }, { "epoch": 0.5990961910910265, "grad_norm": 0.5505833494748907, "learning_rate": 1.997649992608935e-05, "loss": 1.2215, "step": 232 }, { "epoch": 0.6016785022595222, "grad_norm": 0.48300379957641953, "learning_rate": 1.9975247323022286e-05, "loss": 1.1522, "step": 233 }, { "epoch": 0.6042608134280181, "grad_norm": 0.535926278201071, "learning_rate": 1.9973962239227012e-05, "loss": 1.187, "step": 234 }, { "epoch": 0.6068431245965139, "grad_norm": 0.5053573787331682, "learning_rate": 1.997264467888792e-05, "loss": 1.2264, "step": 235 }, { "epoch": 0.6094254357650097, "grad_norm": 0.5209920849371423, "learning_rate": 1.9971294646295165e-05, "loss": 1.1841, "step": 236 }, { "epoch": 0.6120077469335055, "grad_norm": 0.540792437365881, "learning_rate": 1.9969912145844633e-05, "loss": 1.2543, "step": 237 }, { "epoch": 0.6145900581020013, "grad_norm": 0.5212177041723032, "learning_rate": 1.9968497182037926e-05, "loss": 1.2561, "step": 238 }, { "epoch": 0.6171723692704971, "grad_norm": 0.5304144527565653, "learning_rate": 1.996704975948236e-05, "loss": 1.1606, "step": 239 }, { "epoch": 0.6197546804389928, "grad_norm": 0.5829183549634611, "learning_rate": 1.9965569882890924e-05, "loss": 1.196, "step": 240 }, { "epoch": 0.6223369916074887, "grad_norm": 0.5345144481383324, "learning_rate": 1.99640575570823e-05, "loss": 1.1859, "step": 241 }, { "epoch": 0.6249193027759845, "grad_norm": 0.5221286963848386, "learning_rate": 1.9962512786980825e-05, "loss": 1.1715, "step": 242 }, { "epoch": 0.6275016139444803, "grad_norm": 0.5058627006232616, "learning_rate": 1.9960935577616466e-05, "loss": 1.1821, "step": 243 }, { "epoch": 0.6300839251129761, "grad_norm": 0.5199433840698656, "learning_rate": 1.9959325934124833e-05, "loss": 1.1953, "step": 244 }, { "epoch": 0.6326662362814719, "grad_norm": 0.5096245762495348, "learning_rate": 1.9957683861747137e-05, "loss": 1.1775, "step": 245 }, { "epoch": 0.6352485474499677, "grad_norm": 0.491816602747724, "learning_rate": 1.995600936583018e-05, "loss": 1.1965, "step": 246 }, { "epoch": 0.6378308586184636, "grad_norm": 0.539351334033109, "learning_rate": 1.9954302451826343e-05, "loss": 1.1902, "step": 247 }, { "epoch": 0.6404131697869593, "grad_norm": 0.5695500043236047, "learning_rate": 1.9952563125293572e-05, "loss": 1.1805, "step": 248 }, { "epoch": 0.6429954809554551, "grad_norm": 0.5279649835109442, "learning_rate": 1.9950791391895335e-05, "loss": 1.1736, "step": 249 }, { "epoch": 0.6455777921239509, "grad_norm": 0.5082701944757544, "learning_rate": 1.9948987257400637e-05, "loss": 1.2334, "step": 250 }, { "epoch": 0.6481601032924468, "grad_norm": 0.5482093109063493, "learning_rate": 1.994715072768398e-05, "loss": 1.1802, "step": 251 }, { "epoch": 0.6507424144609425, "grad_norm": 0.50876456990971, "learning_rate": 1.9945281808725342e-05, "loss": 1.2399, "step": 252 }, { "epoch": 0.6533247256294383, "grad_norm": 0.5541690057371409, "learning_rate": 1.9943380506610177e-05, "loss": 1.1826, "step": 253 }, { "epoch": 0.6559070367979342, "grad_norm": 0.503268686466964, "learning_rate": 1.9941446827529374e-05, "loss": 1.1959, "step": 254 }, { "epoch": 0.65848934796643, "grad_norm": 0.5437511832970794, "learning_rate": 1.993948077777925e-05, "loss": 1.1953, "step": 255 }, { "epoch": 0.6610716591349257, "grad_norm": 0.4781802895283609, "learning_rate": 1.9937482363761522e-05, "loss": 1.1989, "step": 256 }, { "epoch": 0.6636539703034215, "grad_norm": 0.5449342716639853, "learning_rate": 1.9935451591983292e-05, "loss": 1.2134, "step": 257 }, { "epoch": 0.6662362814719174, "grad_norm": 0.5062143579422912, "learning_rate": 1.9933388469057026e-05, "loss": 1.2243, "step": 258 }, { "epoch": 0.6688185926404132, "grad_norm": 0.47503990693298725, "learning_rate": 1.9931293001700518e-05, "loss": 1.1859, "step": 259 }, { "epoch": 0.6714009038089089, "grad_norm": 0.5208679076366871, "learning_rate": 1.9929165196736893e-05, "loss": 1.1658, "step": 260 }, { "epoch": 0.6739832149774048, "grad_norm": 0.5159438722195342, "learning_rate": 1.9927005061094563e-05, "loss": 1.1943, "step": 261 }, { "epoch": 0.6765655261459006, "grad_norm": 0.51564921169161, "learning_rate": 1.992481260180722e-05, "loss": 1.2096, "step": 262 }, { "epoch": 0.6791478373143964, "grad_norm": 0.5272888613016128, "learning_rate": 1.99225878260138e-05, "loss": 1.2247, "step": 263 }, { "epoch": 0.6817301484828922, "grad_norm": 0.4937662552334185, "learning_rate": 1.992033074095847e-05, "loss": 1.1959, "step": 264 }, { "epoch": 0.684312459651388, "grad_norm": 0.5156120724948244, "learning_rate": 1.9918041353990593e-05, "loss": 1.1896, "step": 265 }, { "epoch": 0.6868947708198838, "grad_norm": 0.5029899107771948, "learning_rate": 1.9915719672564724e-05, "loss": 1.2029, "step": 266 }, { "epoch": 0.6894770819883796, "grad_norm": 0.5021013897512888, "learning_rate": 1.9913365704240562e-05, "loss": 1.2001, "step": 267 }, { "epoch": 0.6920593931568754, "grad_norm": 0.4874570592953185, "learning_rate": 1.9910979456682935e-05, "loss": 1.1909, "step": 268 }, { "epoch": 0.6946417043253712, "grad_norm": 0.49498289469426227, "learning_rate": 1.990856093766179e-05, "loss": 1.1823, "step": 269 }, { "epoch": 0.697224015493867, "grad_norm": 0.493423453437657, "learning_rate": 1.9906110155052142e-05, "loss": 1.226, "step": 270 }, { "epoch": 0.6998063266623629, "grad_norm": 0.5122671842264414, "learning_rate": 1.9903627116834064e-05, "loss": 1.1651, "step": 271 }, { "epoch": 0.7023886378308586, "grad_norm": 0.4937702040231816, "learning_rate": 1.990111183109266e-05, "loss": 1.1902, "step": 272 }, { "epoch": 0.7049709489993544, "grad_norm": 0.5136007768359044, "learning_rate": 1.989856430601803e-05, "loss": 1.1999, "step": 273 }, { "epoch": 0.7075532601678503, "grad_norm": 0.49202497646323023, "learning_rate": 1.9895984549905255e-05, "loss": 1.1814, "step": 274 }, { "epoch": 0.7101355713363461, "grad_norm": 0.5009139995620645, "learning_rate": 1.9893372571154362e-05, "loss": 1.19, "step": 275 }, { "epoch": 0.7127178825048418, "grad_norm": 0.5193484363230418, "learning_rate": 1.9890728378270304e-05, "loss": 1.2066, "step": 276 }, { "epoch": 0.7153001936733376, "grad_norm": 0.49927361128296144, "learning_rate": 1.9888051979862922e-05, "loss": 1.2064, "step": 277 }, { "epoch": 0.7178825048418335, "grad_norm": 0.48996929520751215, "learning_rate": 1.988534338464692e-05, "loss": 1.1653, "step": 278 }, { "epoch": 0.7204648160103292, "grad_norm": 0.499684088085507, "learning_rate": 1.988260260144185e-05, "loss": 1.1654, "step": 279 }, { "epoch": 0.723047127178825, "grad_norm": 0.48422890349549536, "learning_rate": 1.987982963917206e-05, "loss": 1.1554, "step": 280 }, { "epoch": 0.7256294383473209, "grad_norm": 0.49199667307495154, "learning_rate": 1.987702450686669e-05, "loss": 1.1908, "step": 281 }, { "epoch": 0.7282117495158167, "grad_norm": 0.4852284035856874, "learning_rate": 1.9874187213659614e-05, "loss": 1.1367, "step": 282 }, { "epoch": 0.7307940606843124, "grad_norm": 0.49745524250401135, "learning_rate": 1.987131776878944e-05, "loss": 1.1801, "step": 283 }, { "epoch": 0.7333763718528082, "grad_norm": 0.49039290208544745, "learning_rate": 1.986841618159946e-05, "loss": 1.1691, "step": 284 }, { "epoch": 0.7359586830213041, "grad_norm": 0.4905834691968128, "learning_rate": 1.986548246153763e-05, "loss": 1.1752, "step": 285 }, { "epoch": 0.7385409941897999, "grad_norm": 0.490506332640748, "learning_rate": 1.9862516618156526e-05, "loss": 1.1883, "step": 286 }, { "epoch": 0.7411233053582956, "grad_norm": 0.5248431755020703, "learning_rate": 1.9859518661113326e-05, "loss": 1.205, "step": 287 }, { "epoch": 0.7437056165267915, "grad_norm": 0.5057560855422101, "learning_rate": 1.9856488600169785e-05, "loss": 1.2279, "step": 288 }, { "epoch": 0.7462879276952873, "grad_norm": 0.5025298659401831, "learning_rate": 1.9853426445192175e-05, "loss": 1.1631, "step": 289 }, { "epoch": 0.7488702388637831, "grad_norm": 0.4851276896544048, "learning_rate": 1.9850332206151285e-05, "loss": 1.1626, "step": 290 }, { "epoch": 0.7514525500322788, "grad_norm": 0.48026264033577865, "learning_rate": 1.984720589312236e-05, "loss": 1.2098, "step": 291 }, { "epoch": 0.7540348612007747, "grad_norm": 0.5405394737905861, "learning_rate": 1.9844047516285098e-05, "loss": 1.2298, "step": 292 }, { "epoch": 0.7566171723692705, "grad_norm": 0.48769934414927935, "learning_rate": 1.9840857085923585e-05, "loss": 1.196, "step": 293 }, { "epoch": 0.7591994835377663, "grad_norm": 0.527078245348908, "learning_rate": 1.9837634612426292e-05, "loss": 1.1832, "step": 294 }, { "epoch": 0.7617817947062621, "grad_norm": 0.47968694451872484, "learning_rate": 1.983438010628602e-05, "loss": 1.176, "step": 295 }, { "epoch": 0.7643641058747579, "grad_norm": 0.5316988367330956, "learning_rate": 1.9831093578099866e-05, "loss": 1.215, "step": 296 }, { "epoch": 0.7669464170432537, "grad_norm": 0.4839737165293382, "learning_rate": 1.9827775038569203e-05, "loss": 1.1483, "step": 297 }, { "epoch": 0.7695287282117496, "grad_norm": 0.47298182478673473, "learning_rate": 1.9824424498499644e-05, "loss": 1.138, "step": 298 }, { "epoch": 0.7721110393802453, "grad_norm": 0.5037524314864462, "learning_rate": 1.9821041968800982e-05, "loss": 1.1906, "step": 299 }, { "epoch": 0.7746933505487411, "grad_norm": 0.4726414066930357, "learning_rate": 1.981762746048719e-05, "loss": 1.1872, "step": 300 }, { "epoch": 0.7772756617172369, "grad_norm": 0.4775555799653867, "learning_rate": 1.9814180984676353e-05, "loss": 1.1741, "step": 301 }, { "epoch": 0.7798579728857328, "grad_norm": 0.48936767771418854, "learning_rate": 1.981070255259066e-05, "loss": 1.1687, "step": 302 }, { "epoch": 0.7824402840542285, "grad_norm": 0.5010983431343674, "learning_rate": 1.9807192175556344e-05, "loss": 1.1563, "step": 303 }, { "epoch": 0.7850225952227243, "grad_norm": 0.49317307520583575, "learning_rate": 1.9803649865003658e-05, "loss": 1.1831, "step": 304 }, { "epoch": 0.7876049063912202, "grad_norm": 0.5072717596433491, "learning_rate": 1.9800075632466832e-05, "loss": 1.1795, "step": 305 }, { "epoch": 0.790187217559716, "grad_norm": 0.5044246633386273, "learning_rate": 1.979646948958405e-05, "loss": 1.1985, "step": 306 }, { "epoch": 0.7927695287282117, "grad_norm": 0.5075154181174161, "learning_rate": 1.979283144809738e-05, "loss": 1.1955, "step": 307 }, { "epoch": 0.7953518398967075, "grad_norm": 0.5199713188120779, "learning_rate": 1.9789161519852777e-05, "loss": 1.2114, "step": 308 }, { "epoch": 0.7979341510652034, "grad_norm": 0.5261480930927327, "learning_rate": 1.9785459716800005e-05, "loss": 1.1582, "step": 309 }, { "epoch": 0.8005164622336992, "grad_norm": 0.5271373406623951, "learning_rate": 1.978172605099264e-05, "loss": 1.1761, "step": 310 }, { "epoch": 0.8030987734021949, "grad_norm": 0.5065930895833843, "learning_rate": 1.9777960534587975e-05, "loss": 1.1915, "step": 311 }, { "epoch": 0.8056810845706908, "grad_norm": 0.5171670807714366, "learning_rate": 1.9774163179847046e-05, "loss": 1.1776, "step": 312 }, { "epoch": 0.8082633957391866, "grad_norm": 0.49429669397671067, "learning_rate": 1.9770333999134538e-05, "loss": 1.2005, "step": 313 }, { "epoch": 0.8108457069076824, "grad_norm": 0.506264732133423, "learning_rate": 1.976647300491877e-05, "loss": 1.1555, "step": 314 }, { "epoch": 0.8134280180761781, "grad_norm": 0.4975188714683081, "learning_rate": 1.9762580209771648e-05, "loss": 1.1761, "step": 315 }, { "epoch": 0.816010329244674, "grad_norm": 0.4917712576784755, "learning_rate": 1.9758655626368635e-05, "loss": 1.1769, "step": 316 }, { "epoch": 0.8185926404131698, "grad_norm": 0.4954187885530158, "learning_rate": 1.975469926748869e-05, "loss": 1.169, "step": 317 }, { "epoch": 0.8211749515816655, "grad_norm": 0.5115626137899685, "learning_rate": 1.9750711146014254e-05, "loss": 1.1737, "step": 318 }, { "epoch": 0.8237572627501614, "grad_norm": 0.4748616428095463, "learning_rate": 1.9746691274931168e-05, "loss": 1.19, "step": 319 }, { "epoch": 0.8263395739186572, "grad_norm": 0.4940492043780541, "learning_rate": 1.9742639667328666e-05, "loss": 1.1761, "step": 320 }, { "epoch": 0.828921885087153, "grad_norm": 0.5422433717627166, "learning_rate": 1.9738556336399322e-05, "loss": 1.1573, "step": 321 }, { "epoch": 0.8315041962556488, "grad_norm": 0.5003074749343184, "learning_rate": 1.9734441295439004e-05, "loss": 1.1777, "step": 322 }, { "epoch": 0.8340865074241446, "grad_norm": 0.4892259502271507, "learning_rate": 1.973029455784683e-05, "loss": 1.1696, "step": 323 }, { "epoch": 0.8366688185926404, "grad_norm": 0.5321024468246395, "learning_rate": 1.9726116137125128e-05, "loss": 1.1436, "step": 324 }, { "epoch": 0.8392511297611362, "grad_norm": 0.5045646201138196, "learning_rate": 1.9721906046879392e-05, "loss": 1.1764, "step": 325 }, { "epoch": 0.841833440929632, "grad_norm": 0.5281769148500022, "learning_rate": 1.971766430081823e-05, "loss": 1.1966, "step": 326 }, { "epoch": 0.8444157520981278, "grad_norm": 0.5202036284793085, "learning_rate": 1.971339091275333e-05, "loss": 1.1929, "step": 327 }, { "epoch": 0.8469980632666236, "grad_norm": 0.48805338005531557, "learning_rate": 1.9709085896599414e-05, "loss": 1.1713, "step": 328 }, { "epoch": 0.8495803744351195, "grad_norm": 0.48834281779827204, "learning_rate": 1.970474926637418e-05, "loss": 1.1766, "step": 329 }, { "epoch": 0.8521626856036152, "grad_norm": 0.47045086079257925, "learning_rate": 1.9700381036198278e-05, "loss": 1.1733, "step": 330 }, { "epoch": 0.854744996772111, "grad_norm": 0.5046592134608324, "learning_rate": 1.9695981220295242e-05, "loss": 1.2065, "step": 331 }, { "epoch": 0.8573273079406069, "grad_norm": 0.4815780307147996, "learning_rate": 1.9691549832991455e-05, "loss": 1.1641, "step": 332 }, { "epoch": 0.8599096191091027, "grad_norm": 0.47306399270721433, "learning_rate": 1.96870868887161e-05, "loss": 1.2106, "step": 333 }, { "epoch": 0.8624919302775984, "grad_norm": 0.48177902227189173, "learning_rate": 1.968259240200112e-05, "loss": 1.1867, "step": 334 }, { "epoch": 0.8650742414460942, "grad_norm": 0.5109266879563596, "learning_rate": 1.967806638748116e-05, "loss": 1.1835, "step": 335 }, { "epoch": 0.8676565526145901, "grad_norm": 0.4886245722535642, "learning_rate": 1.9673508859893515e-05, "loss": 1.1687, "step": 336 }, { "epoch": 0.8702388637830859, "grad_norm": 0.5248768069436597, "learning_rate": 1.966891983407811e-05, "loss": 1.1984, "step": 337 }, { "epoch": 0.8728211749515816, "grad_norm": 0.4943514825391058, "learning_rate": 1.9664299324977412e-05, "loss": 1.1891, "step": 338 }, { "epoch": 0.8754034861200775, "grad_norm": 0.47129413485615096, "learning_rate": 1.9659647347636422e-05, "loss": 1.1586, "step": 339 }, { "epoch": 0.8779857972885733, "grad_norm": 0.49462243772822845, "learning_rate": 1.9654963917202586e-05, "loss": 1.1558, "step": 340 }, { "epoch": 0.8805681084570691, "grad_norm": 0.4861824324614408, "learning_rate": 1.965024904892578e-05, "loss": 1.1683, "step": 341 }, { "epoch": 0.8831504196255648, "grad_norm": 0.4951569265192093, "learning_rate": 1.9645502758158234e-05, "loss": 1.2037, "step": 342 }, { "epoch": 0.8857327307940607, "grad_norm": 0.46700115170322853, "learning_rate": 1.9640725060354508e-05, "loss": 1.1142, "step": 343 }, { "epoch": 0.8883150419625565, "grad_norm": 0.5117387702694532, "learning_rate": 1.963591597107142e-05, "loss": 1.1944, "step": 344 }, { "epoch": 0.8908973531310523, "grad_norm": 0.46783399641185575, "learning_rate": 1.9631075505967993e-05, "loss": 1.1802, "step": 345 }, { "epoch": 0.8934796642995481, "grad_norm": 0.49795421380846105, "learning_rate": 1.9626203680805432e-05, "loss": 1.1814, "step": 346 }, { "epoch": 0.8960619754680439, "grad_norm": 0.4861596444598828, "learning_rate": 1.9621300511447043e-05, "loss": 1.1825, "step": 347 }, { "epoch": 0.8986442866365397, "grad_norm": 0.4953435099828471, "learning_rate": 1.9616366013858195e-05, "loss": 1.161, "step": 348 }, { "epoch": 0.9012265978050356, "grad_norm": 0.5023878227402143, "learning_rate": 1.961140020410627e-05, "loss": 1.1885, "step": 349 }, { "epoch": 0.9038089089735313, "grad_norm": 0.5007001061995038, "learning_rate": 1.9606403098360597e-05, "loss": 1.1989, "step": 350 }, { "epoch": 0.9063912201420271, "grad_norm": 0.4635439197533225, "learning_rate": 1.960137471289242e-05, "loss": 1.1302, "step": 351 }, { "epoch": 0.9089735313105229, "grad_norm": 0.49621446281583526, "learning_rate": 1.9596315064074826e-05, "loss": 1.1991, "step": 352 }, { "epoch": 0.9115558424790188, "grad_norm": 0.48901012014264644, "learning_rate": 1.9591224168382708e-05, "loss": 1.1818, "step": 353 }, { "epoch": 0.9141381536475145, "grad_norm": 0.4823284694023606, "learning_rate": 1.958610204239269e-05, "loss": 1.1464, "step": 354 }, { "epoch": 0.9167204648160103, "grad_norm": 0.47062926472993427, "learning_rate": 1.95809487027831e-05, "loss": 1.1966, "step": 355 }, { "epoch": 0.9193027759845062, "grad_norm": 0.4766355752313664, "learning_rate": 1.9575764166333887e-05, "loss": 1.1741, "step": 356 }, { "epoch": 0.921885087153002, "grad_norm": 0.470588804103096, "learning_rate": 1.95705484499266e-05, "loss": 1.1425, "step": 357 }, { "epoch": 0.9244673983214977, "grad_norm": 0.49029399234060356, "learning_rate": 1.9565301570544297e-05, "loss": 1.19, "step": 358 }, { "epoch": 0.9270497094899935, "grad_norm": 0.4811321660417228, "learning_rate": 1.9560023545271512e-05, "loss": 1.1617, "step": 359 }, { "epoch": 0.9296320206584894, "grad_norm": 0.47791045452781805, "learning_rate": 1.9554714391294198e-05, "loss": 1.1349, "step": 360 }, { "epoch": 0.9322143318269851, "grad_norm": 0.45457900915984395, "learning_rate": 1.9549374125899665e-05, "loss": 1.1697, "step": 361 }, { "epoch": 0.9347966429954809, "grad_norm": 0.4783672298922519, "learning_rate": 1.9544002766476523e-05, "loss": 1.1779, "step": 362 }, { "epoch": 0.9373789541639768, "grad_norm": 0.5043177366580075, "learning_rate": 1.953860033051463e-05, "loss": 1.16, "step": 363 }, { "epoch": 0.9399612653324726, "grad_norm": 0.5253476046397387, "learning_rate": 1.953316683560504e-05, "loss": 1.2074, "step": 364 }, { "epoch": 0.9425435765009683, "grad_norm": 0.4830105807169101, "learning_rate": 1.9527702299439925e-05, "loss": 1.1598, "step": 365 }, { "epoch": 0.9451258876694641, "grad_norm": 0.4603907599438401, "learning_rate": 1.9522206739812546e-05, "loss": 1.1511, "step": 366 }, { "epoch": 0.94770819883796, "grad_norm": 0.45849247679852523, "learning_rate": 1.9516680174617168e-05, "loss": 1.1873, "step": 367 }, { "epoch": 0.9502905100064558, "grad_norm": 0.4826500481926017, "learning_rate": 1.9511122621849025e-05, "loss": 1.187, "step": 368 }, { "epoch": 0.9528728211749515, "grad_norm": 0.4577162756249727, "learning_rate": 1.9505534099604245e-05, "loss": 1.1611, "step": 369 }, { "epoch": 0.9554551323434474, "grad_norm": 0.4674250946156371, "learning_rate": 1.94999146260798e-05, "loss": 1.164, "step": 370 }, { "epoch": 0.9580374435119432, "grad_norm": 0.46337504643140265, "learning_rate": 1.9494264219573433e-05, "loss": 1.1898, "step": 371 }, { "epoch": 0.960619754680439, "grad_norm": 0.48205345839710323, "learning_rate": 1.9488582898483625e-05, "loss": 1.1641, "step": 372 }, { "epoch": 0.9632020658489348, "grad_norm": 0.4927156506373219, "learning_rate": 1.9482870681309502e-05, "loss": 1.1526, "step": 373 }, { "epoch": 0.9657843770174306, "grad_norm": 0.49774685339253233, "learning_rate": 1.9477127586650812e-05, "loss": 1.1513, "step": 374 }, { "epoch": 0.9683666881859264, "grad_norm": 0.5493094035957639, "learning_rate": 1.9471353633207824e-05, "loss": 1.2067, "step": 375 }, { "epoch": 0.9709489993544222, "grad_norm": 0.4695400511675006, "learning_rate": 1.94655488397813e-05, "loss": 1.1575, "step": 376 }, { "epoch": 0.973531310522918, "grad_norm": 0.5159667326407505, "learning_rate": 1.9459713225272422e-05, "loss": 1.1785, "step": 377 }, { "epoch": 0.9761136216914138, "grad_norm": 0.5216341313782504, "learning_rate": 1.9453846808682713e-05, "loss": 1.1446, "step": 378 }, { "epoch": 0.9786959328599096, "grad_norm": 0.5182964074330236, "learning_rate": 1.9447949609114018e-05, "loss": 1.1432, "step": 379 }, { "epoch": 0.9812782440284055, "grad_norm": 0.485083144408532, "learning_rate": 1.9442021645768383e-05, "loss": 1.1275, "step": 380 }, { "epoch": 0.9838605551969012, "grad_norm": 0.4599544335039025, "learning_rate": 1.9436062937948058e-05, "loss": 1.1151, "step": 381 }, { "epoch": 0.986442866365397, "grad_norm": 0.521054277469576, "learning_rate": 1.943007350505538e-05, "loss": 1.2012, "step": 382 }, { "epoch": 0.9890251775338929, "grad_norm": 0.4751977247155287, "learning_rate": 1.942405336659274e-05, "loss": 1.1797, "step": 383 }, { "epoch": 0.9916074887023887, "grad_norm": 0.47658638371046963, "learning_rate": 1.94180025421625e-05, "loss": 1.1392, "step": 384 }, { "epoch": 0.9941897998708844, "grad_norm": 0.4712331278213858, "learning_rate": 1.9411921051466952e-05, "loss": 1.141, "step": 385 }, { "epoch": 0.9967721110393802, "grad_norm": 0.49755103398259565, "learning_rate": 1.9405808914308236e-05, "loss": 1.1328, "step": 386 }, { "epoch": 0.9993544222078761, "grad_norm": 0.4847898684583998, "learning_rate": 1.9399666150588286e-05, "loss": 1.1669, "step": 387 }, { "epoch": 1.0, "grad_norm": 0.4847898684583998, "learning_rate": 1.9393492780308745e-05, "loss": 1.1861, "step": 388 }, { "epoch": 1.0025823111684957, "grad_norm": 1.1014826257650279, "learning_rate": 1.938728882357093e-05, "loss": 1.0917, "step": 389 }, { "epoch": 1.0051646223369917, "grad_norm": 0.5639388688605913, "learning_rate": 1.938105430057575e-05, "loss": 1.0903, "step": 390 }, { "epoch": 1.0077469335054874, "grad_norm": 0.6443566019067333, "learning_rate": 1.9374789231623636e-05, "loss": 1.1009, "step": 391 }, { "epoch": 1.010329244673983, "grad_norm": 0.630435005940545, "learning_rate": 1.9368493637114483e-05, "loss": 1.1003, "step": 392 }, { "epoch": 1.012911555842479, "grad_norm": 0.5108574939050173, "learning_rate": 1.936216753754758e-05, "loss": 1.0711, "step": 393 }, { "epoch": 1.0154938670109748, "grad_norm": 0.608458106332031, "learning_rate": 1.9355810953521556e-05, "loss": 1.0595, "step": 394 }, { "epoch": 1.0180761781794707, "grad_norm": 0.5458550950072169, "learning_rate": 1.934942390573428e-05, "loss": 1.0943, "step": 395 }, { "epoch": 1.0206584893479664, "grad_norm": 0.5725197447406076, "learning_rate": 1.9343006414982827e-05, "loss": 1.0715, "step": 396 }, { "epoch": 1.0232408005164622, "grad_norm": 0.5707836595715814, "learning_rate": 1.9336558502163404e-05, "loss": 1.0845, "step": 397 }, { "epoch": 1.025823111684958, "grad_norm": 0.5436343954371967, "learning_rate": 1.933008018827127e-05, "loss": 1.1252, "step": 398 }, { "epoch": 1.0284054228534538, "grad_norm": 0.5722847310225982, "learning_rate": 1.932357149440067e-05, "loss": 1.0698, "step": 399 }, { "epoch": 1.0309877340219495, "grad_norm": 0.5620996777465161, "learning_rate": 1.9317032441744778e-05, "loss": 1.0999, "step": 400 }, { "epoch": 1.0335700451904455, "grad_norm": 0.6088252142123551, "learning_rate": 1.9310463051595612e-05, "loss": 1.1462, "step": 401 }, { "epoch": 1.0361523563589412, "grad_norm": 0.5076462836433528, "learning_rate": 1.9303863345343985e-05, "loss": 1.0755, "step": 402 }, { "epoch": 1.0387346675274371, "grad_norm": 0.5109910409402665, "learning_rate": 1.929723334447941e-05, "loss": 1.0821, "step": 403 }, { "epoch": 1.0413169786959329, "grad_norm": 0.5636026180024839, "learning_rate": 1.9290573070590053e-05, "loss": 1.079, "step": 404 }, { "epoch": 1.0438992898644286, "grad_norm": 0.5112896124943734, "learning_rate": 1.9283882545362642e-05, "loss": 1.104, "step": 405 }, { "epoch": 1.0464816010329245, "grad_norm": 0.5515513629476969, "learning_rate": 1.9277161790582425e-05, "loss": 1.094, "step": 406 }, { "epoch": 1.0490639122014203, "grad_norm": 0.514886116372971, "learning_rate": 1.9270410828133062e-05, "loss": 1.085, "step": 407 }, { "epoch": 1.051646223369916, "grad_norm": 0.5331151636377134, "learning_rate": 1.9263629679996582e-05, "loss": 1.1028, "step": 408 }, { "epoch": 1.054228534538412, "grad_norm": 0.5165475698281132, "learning_rate": 1.925681836825331e-05, "loss": 1.0848, "step": 409 }, { "epoch": 1.0568108457069076, "grad_norm": 0.5366595294372662, "learning_rate": 1.9249976915081773e-05, "loss": 1.1015, "step": 410 }, { "epoch": 1.0593931568754036, "grad_norm": 0.5193038464689262, "learning_rate": 1.9243105342758657e-05, "loss": 1.0782, "step": 411 }, { "epoch": 1.0619754680438993, "grad_norm": 0.5009182548303415, "learning_rate": 1.923620367365871e-05, "loss": 1.0516, "step": 412 }, { "epoch": 1.064557779212395, "grad_norm": 0.5029748452840229, "learning_rate": 1.922927193025468e-05, "loss": 1.0781, "step": 413 }, { "epoch": 1.067140090380891, "grad_norm": 0.5196114525480685, "learning_rate": 1.922231013511724e-05, "loss": 1.0873, "step": 414 }, { "epoch": 1.0697224015493867, "grad_norm": 0.5294324303655081, "learning_rate": 1.921531831091492e-05, "loss": 1.0955, "step": 415 }, { "epoch": 1.0723047127178824, "grad_norm": 0.5264181862085819, "learning_rate": 1.9208296480414034e-05, "loss": 1.0849, "step": 416 }, { "epoch": 1.0748870238863784, "grad_norm": 0.5072255528456242, "learning_rate": 1.9201244666478586e-05, "loss": 1.0865, "step": 417 }, { "epoch": 1.077469335054874, "grad_norm": 0.5043454850061722, "learning_rate": 1.919416289207022e-05, "loss": 1.1016, "step": 418 }, { "epoch": 1.08005164622337, "grad_norm": 0.521923847407557, "learning_rate": 1.9187051180248134e-05, "loss": 1.1006, "step": 419 }, { "epoch": 1.0826339573918657, "grad_norm": 0.5106595879796177, "learning_rate": 1.9179909554169002e-05, "loss": 1.0947, "step": 420 }, { "epoch": 1.0852162685603615, "grad_norm": 0.5021486916963932, "learning_rate": 1.9172738037086905e-05, "loss": 1.0763, "step": 421 }, { "epoch": 1.0877985797288574, "grad_norm": 0.5348145469187987, "learning_rate": 1.9165536652353256e-05, "loss": 1.1169, "step": 422 }, { "epoch": 1.0903808908973531, "grad_norm": 0.49988703400762524, "learning_rate": 1.915830542341672e-05, "loss": 1.1116, "step": 423 }, { "epoch": 1.0929632020658488, "grad_norm": 0.5234949485004682, "learning_rate": 1.915104437382313e-05, "loss": 1.088, "step": 424 }, { "epoch": 1.0955455132343448, "grad_norm": 0.5079748564445703, "learning_rate": 1.9143753527215437e-05, "loss": 1.0716, "step": 425 }, { "epoch": 1.0981278244028405, "grad_norm": 0.5084741143233746, "learning_rate": 1.91364329073336e-05, "loss": 1.0913, "step": 426 }, { "epoch": 1.1007101355713362, "grad_norm": 0.5110683137296181, "learning_rate": 1.912908253801453e-05, "loss": 1.0376, "step": 427 }, { "epoch": 1.1032924467398322, "grad_norm": 0.5104932914801271, "learning_rate": 1.9121702443191994e-05, "loss": 1.0499, "step": 428 }, { "epoch": 1.105874757908328, "grad_norm": 0.5311882952530959, "learning_rate": 1.9114292646896574e-05, "loss": 1.0875, "step": 429 }, { "epoch": 1.1084570690768238, "grad_norm": 0.4879682050554302, "learning_rate": 1.910685317325554e-05, "loss": 1.0851, "step": 430 }, { "epoch": 1.1110393802453196, "grad_norm": 0.5107417160978968, "learning_rate": 1.9099384046492807e-05, "loss": 1.1234, "step": 431 }, { "epoch": 1.1136216914138153, "grad_norm": 0.5323164257143238, "learning_rate": 1.9091885290928846e-05, "loss": 1.1051, "step": 432 }, { "epoch": 1.1162040025823112, "grad_norm": 0.5062938072356562, "learning_rate": 1.9084356930980593e-05, "loss": 1.1316, "step": 433 }, { "epoch": 1.118786313750807, "grad_norm": 0.5155486221780639, "learning_rate": 1.9076798991161395e-05, "loss": 1.1078, "step": 434 }, { "epoch": 1.121368624919303, "grad_norm": 0.5225888914215889, "learning_rate": 1.90692114960809e-05, "loss": 1.0605, "step": 435 }, { "epoch": 1.1239509360877986, "grad_norm": 0.5218586332404057, "learning_rate": 1.9061594470445e-05, "loss": 1.0544, "step": 436 }, { "epoch": 1.1265332472562943, "grad_norm": 0.5063750610874502, "learning_rate": 1.9053947939055737e-05, "loss": 1.1166, "step": 437 }, { "epoch": 1.1291155584247903, "grad_norm": 0.5243484649411593, "learning_rate": 1.9046271926811238e-05, "loss": 1.0726, "step": 438 }, { "epoch": 1.131697869593286, "grad_norm": 0.5495963106537264, "learning_rate": 1.9038566458705615e-05, "loss": 1.1032, "step": 439 }, { "epoch": 1.1342801807617817, "grad_norm": 0.4964414774064479, "learning_rate": 1.903083155982889e-05, "loss": 1.1079, "step": 440 }, { "epoch": 1.1368624919302777, "grad_norm": 0.5147288498020974, "learning_rate": 1.902306725536692e-05, "loss": 1.1177, "step": 441 }, { "epoch": 1.1394448030987734, "grad_norm": 0.5327839820642644, "learning_rate": 1.9015273570601316e-05, "loss": 1.075, "step": 442 }, { "epoch": 1.142027114267269, "grad_norm": 0.528940491782197, "learning_rate": 1.9007450530909345e-05, "loss": 1.086, "step": 443 }, { "epoch": 1.144609425435765, "grad_norm": 0.5495937382324539, "learning_rate": 1.899959816176386e-05, "loss": 1.0836, "step": 444 }, { "epoch": 1.1471917366042608, "grad_norm": 0.5259398393873413, "learning_rate": 1.899171648873322e-05, "loss": 1.0777, "step": 445 }, { "epoch": 1.1497740477727567, "grad_norm": 0.5301129579516534, "learning_rate": 1.8983805537481196e-05, "loss": 1.072, "step": 446 }, { "epoch": 1.1523563589412524, "grad_norm": 0.5472484501403468, "learning_rate": 1.8975865333766895e-05, "loss": 1.053, "step": 447 }, { "epoch": 1.1549386701097482, "grad_norm": 0.521949307121031, "learning_rate": 1.8967895903444672e-05, "loss": 1.0818, "step": 448 }, { "epoch": 1.157520981278244, "grad_norm": 0.5182353015878332, "learning_rate": 1.895989727246405e-05, "loss": 1.1152, "step": 449 }, { "epoch": 1.1601032924467398, "grad_norm": 0.5214919591138573, "learning_rate": 1.895186946686964e-05, "loss": 1.1376, "step": 450 }, { "epoch": 1.1626856036152355, "grad_norm": 0.5173501815788256, "learning_rate": 1.8943812512801023e-05, "loss": 1.1067, "step": 451 }, { "epoch": 1.1652679147837315, "grad_norm": 0.5017112933282827, "learning_rate": 1.8935726436492724e-05, "loss": 1.0956, "step": 452 }, { "epoch": 1.1678502259522272, "grad_norm": 0.5054557050103587, "learning_rate": 1.8927611264274078e-05, "loss": 1.0743, "step": 453 }, { "epoch": 1.170432537120723, "grad_norm": 0.5038866398853414, "learning_rate": 1.8919467022569163e-05, "loss": 1.0663, "step": 454 }, { "epoch": 1.1730148482892189, "grad_norm": 0.5069505760103037, "learning_rate": 1.8911293737896706e-05, "loss": 1.0918, "step": 455 }, { "epoch": 1.1755971594577146, "grad_norm": 0.49669266787942645, "learning_rate": 1.890309143687001e-05, "loss": 1.0768, "step": 456 }, { "epoch": 1.1781794706262105, "grad_norm": 0.5059342818530757, "learning_rate": 1.8894860146196848e-05, "loss": 1.0606, "step": 457 }, { "epoch": 1.1807617817947063, "grad_norm": 0.49213032174231053, "learning_rate": 1.88865998926794e-05, "loss": 1.0538, "step": 458 }, { "epoch": 1.183344092963202, "grad_norm": 0.497950829610666, "learning_rate": 1.8878310703214148e-05, "loss": 1.0797, "step": 459 }, { "epoch": 1.185926404131698, "grad_norm": 0.5085396242287622, "learning_rate": 1.8869992604791786e-05, "loss": 1.0652, "step": 460 }, { "epoch": 1.1885087153001936, "grad_norm": 0.5050911514318284, "learning_rate": 1.8861645624497154e-05, "loss": 1.0744, "step": 461 }, { "epoch": 1.1910910264686896, "grad_norm": 0.518765301787044, "learning_rate": 1.885326978950912e-05, "loss": 1.1027, "step": 462 }, { "epoch": 1.1936733376371853, "grad_norm": 0.49638458680440195, "learning_rate": 1.8844865127100517e-05, "loss": 1.0804, "step": 463 }, { "epoch": 1.196255648805681, "grad_norm": 0.5183830709936179, "learning_rate": 1.883643166463804e-05, "loss": 1.0966, "step": 464 }, { "epoch": 1.198837959974177, "grad_norm": 0.49817074390497323, "learning_rate": 1.882796942958216e-05, "loss": 1.0786, "step": 465 }, { "epoch": 1.2014202711426727, "grad_norm": 0.501779662690412, "learning_rate": 1.8819478449487034e-05, "loss": 1.0586, "step": 466 }, { "epoch": 1.2040025823111684, "grad_norm": 0.5054852727426871, "learning_rate": 1.8810958752000426e-05, "loss": 1.0553, "step": 467 }, { "epoch": 1.2065848934796644, "grad_norm": 0.5016427339348701, "learning_rate": 1.8802410364863598e-05, "loss": 1.0943, "step": 468 }, { "epoch": 1.20916720464816, "grad_norm": 0.49091502171551293, "learning_rate": 1.879383331591123e-05, "loss": 1.1174, "step": 469 }, { "epoch": 1.2117495158166558, "grad_norm": 0.5378342182934727, "learning_rate": 1.8785227633071332e-05, "loss": 1.0729, "step": 470 }, { "epoch": 1.2143318269851517, "grad_norm": 0.5106316461830734, "learning_rate": 1.877659334436515e-05, "loss": 1.1085, "step": 471 }, { "epoch": 1.2169141381536475, "grad_norm": 0.5098246784805628, "learning_rate": 1.8767930477907074e-05, "loss": 1.0957, "step": 472 }, { "epoch": 1.2194964493221434, "grad_norm": 0.506477376675333, "learning_rate": 1.875923906190454e-05, "loss": 1.0918, "step": 473 }, { "epoch": 1.2220787604906391, "grad_norm": 0.5004090128286083, "learning_rate": 1.875051912465796e-05, "loss": 1.1018, "step": 474 }, { "epoch": 1.2246610716591348, "grad_norm": 0.5117533679075642, "learning_rate": 1.8741770694560598e-05, "loss": 1.0592, "step": 475 }, { "epoch": 1.2272433828276308, "grad_norm": 0.5078739636230806, "learning_rate": 1.873299380009851e-05, "loss": 1.0705, "step": 476 }, { "epoch": 1.2298256939961265, "grad_norm": 0.5143902933122632, "learning_rate": 1.8724188469850423e-05, "loss": 1.0774, "step": 477 }, { "epoch": 1.2324080051646225, "grad_norm": 0.49232344984603593, "learning_rate": 1.871535473248766e-05, "loss": 1.0548, "step": 478 }, { "epoch": 1.2349903163331182, "grad_norm": 0.49486310972586456, "learning_rate": 1.8706492616774043e-05, "loss": 1.084, "step": 479 }, { "epoch": 1.237572627501614, "grad_norm": 0.5178421153087599, "learning_rate": 1.86976021515658e-05, "loss": 1.0907, "step": 480 }, { "epoch": 1.2401549386701098, "grad_norm": 0.524983289209404, "learning_rate": 1.8688683365811456e-05, "loss": 1.0863, "step": 481 }, { "epoch": 1.2427372498386056, "grad_norm": 0.5045481433976463, "learning_rate": 1.867973628855177e-05, "loss": 1.1053, "step": 482 }, { "epoch": 1.2453195610071013, "grad_norm": 0.5091501092813715, "learning_rate": 1.8670760948919608e-05, "loss": 1.1194, "step": 483 }, { "epoch": 1.2479018721755972, "grad_norm": 0.49967798365538585, "learning_rate": 1.8661757376139858e-05, "loss": 1.0845, "step": 484 }, { "epoch": 1.250484183344093, "grad_norm": 0.4980457942019374, "learning_rate": 1.865272559952936e-05, "loss": 1.0601, "step": 485 }, { "epoch": 1.2530664945125887, "grad_norm": 0.4972111559723752, "learning_rate": 1.864366564849677e-05, "loss": 1.0907, "step": 486 }, { "epoch": 1.2556488056810846, "grad_norm": 0.5065337348249978, "learning_rate": 1.8634577552542492e-05, "loss": 1.0795, "step": 487 }, { "epoch": 1.2582311168495803, "grad_norm": 0.513607026130961, "learning_rate": 1.862546134125857e-05, "loss": 1.0436, "step": 488 }, { "epoch": 1.2608134280180763, "grad_norm": 0.5225707833426417, "learning_rate": 1.86163170443286e-05, "loss": 1.0857, "step": 489 }, { "epoch": 1.263395739186572, "grad_norm": 0.5022229029307232, "learning_rate": 1.860714469152762e-05, "loss": 1.0872, "step": 490 }, { "epoch": 1.2659780503550677, "grad_norm": 0.5010606447774606, "learning_rate": 1.859794431272203e-05, "loss": 1.1187, "step": 491 }, { "epoch": 1.2685603615235637, "grad_norm": 0.5014036896762357, "learning_rate": 1.8588715937869487e-05, "loss": 1.0601, "step": 492 }, { "epoch": 1.2711426726920594, "grad_norm": 0.5062378400840666, "learning_rate": 1.8579459597018798e-05, "loss": 1.0733, "step": 493 }, { "epoch": 1.2737249838605553, "grad_norm": 0.49322055025801476, "learning_rate": 1.857017532030984e-05, "loss": 1.081, "step": 494 }, { "epoch": 1.276307295029051, "grad_norm": 0.4985978212218064, "learning_rate": 1.8560863137973447e-05, "loss": 1.0824, "step": 495 }, { "epoch": 1.2788896061975468, "grad_norm": 0.5035778345625428, "learning_rate": 1.8551523080331324e-05, "loss": 1.0875, "step": 496 }, { "epoch": 1.2814719173660425, "grad_norm": 0.5035572503139835, "learning_rate": 1.854215517779593e-05, "loss": 1.085, "step": 497 }, { "epoch": 1.2840542285345384, "grad_norm": 0.5464525852922026, "learning_rate": 1.8532759460870407e-05, "loss": 1.0964, "step": 498 }, { "epoch": 1.2866365397030342, "grad_norm": 0.5069778816485006, "learning_rate": 1.8523335960148446e-05, "loss": 1.1123, "step": 499 }, { "epoch": 1.28921885087153, "grad_norm": 0.5106529164099722, "learning_rate": 1.8513884706314224e-05, "loss": 1.0752, "step": 500 }, { "epoch": 1.2918011620400258, "grad_norm": 0.5009571583608718, "learning_rate": 1.8504405730142267e-05, "loss": 1.0549, "step": 501 }, { "epoch": 1.2943834732085215, "grad_norm": 0.4969158537205058, "learning_rate": 1.849489906249739e-05, "loss": 1.0756, "step": 502 }, { "epoch": 1.2969657843770175, "grad_norm": 0.5336056726258084, "learning_rate": 1.8485364734334555e-05, "loss": 1.0765, "step": 503 }, { "epoch": 1.2995480955455132, "grad_norm": 0.5171474777619819, "learning_rate": 1.84758027766988e-05, "loss": 1.0919, "step": 504 }, { "epoch": 1.3021304067140091, "grad_norm": 0.5120748195398405, "learning_rate": 1.8466213220725133e-05, "loss": 1.092, "step": 505 }, { "epoch": 1.3047127178825049, "grad_norm": 0.5018571042434719, "learning_rate": 1.8456596097638414e-05, "loss": 1.0857, "step": 506 }, { "epoch": 1.3072950290510006, "grad_norm": 0.5230975347071961, "learning_rate": 1.8446951438753272e-05, "loss": 1.0498, "step": 507 }, { "epoch": 1.3098773402194963, "grad_norm": 0.5217912907178898, "learning_rate": 1.8437279275474e-05, "loss": 1.0744, "step": 508 }, { "epoch": 1.3124596513879923, "grad_norm": 0.5317696621552458, "learning_rate": 1.8427579639294436e-05, "loss": 1.0914, "step": 509 }, { "epoch": 1.315041962556488, "grad_norm": 0.49394980658380616, "learning_rate": 1.841785256179789e-05, "loss": 1.1051, "step": 510 }, { "epoch": 1.317624273724984, "grad_norm": 0.5217876781301438, "learning_rate": 1.840809807465701e-05, "loss": 1.1077, "step": 511 }, { "epoch": 1.3202065848934796, "grad_norm": 0.49490066616909967, "learning_rate": 1.839831620963371e-05, "loss": 1.0961, "step": 512 }, { "epoch": 1.3227888960619754, "grad_norm": 0.5083891511636687, "learning_rate": 1.8388506998579025e-05, "loss": 1.0532, "step": 513 }, { "epoch": 1.3253712072304713, "grad_norm": 0.5242811566268283, "learning_rate": 1.837867047343306e-05, "loss": 1.1178, "step": 514 }, { "epoch": 1.327953518398967, "grad_norm": 0.5281173798352883, "learning_rate": 1.8368806666224836e-05, "loss": 1.1301, "step": 515 }, { "epoch": 1.330535829567463, "grad_norm": 0.5022194856315405, "learning_rate": 1.8358915609072223e-05, "loss": 1.0753, "step": 516 }, { "epoch": 1.3331181407359587, "grad_norm": 0.5275458989077836, "learning_rate": 1.8348997334181815e-05, "loss": 1.0857, "step": 517 }, { "epoch": 1.3357004519044544, "grad_norm": 0.5261446317192925, "learning_rate": 1.833905187384883e-05, "loss": 1.0724, "step": 518 }, { "epoch": 1.3382827630729504, "grad_norm": 0.4909880164376135, "learning_rate": 1.8329079260457e-05, "loss": 1.0867, "step": 519 }, { "epoch": 1.340865074241446, "grad_norm": 0.538007883447562, "learning_rate": 1.8319079526478487e-05, "loss": 1.1477, "step": 520 }, { "epoch": 1.343447385409942, "grad_norm": 0.5259603760732874, "learning_rate": 1.830905270447374e-05, "loss": 1.0871, "step": 521 }, { "epoch": 1.3460296965784377, "grad_norm": 0.5210088026374243, "learning_rate": 1.829899882709143e-05, "loss": 1.0548, "step": 522 }, { "epoch": 1.3486120077469335, "grad_norm": 0.5274100435358815, "learning_rate": 1.8288917927068315e-05, "loss": 1.0874, "step": 523 }, { "epoch": 1.3511943189154292, "grad_norm": 0.509881963592761, "learning_rate": 1.8278810037229134e-05, "loss": 1.0692, "step": 524 }, { "epoch": 1.3537766300839251, "grad_norm": 0.5425562492866538, "learning_rate": 1.8268675190486524e-05, "loss": 1.0896, "step": 525 }, { "epoch": 1.3563589412524208, "grad_norm": 0.5172299722903951, "learning_rate": 1.825851341984089e-05, "loss": 1.0332, "step": 526 }, { "epoch": 1.3589412524209168, "grad_norm": 0.5113811516912794, "learning_rate": 1.82483247583803e-05, "loss": 1.1021, "step": 527 }, { "epoch": 1.3615235635894125, "grad_norm": 0.4972938127748451, "learning_rate": 1.8238109239280393e-05, "loss": 1.0932, "step": 528 }, { "epoch": 1.3641058747579082, "grad_norm": 0.516789056255083, "learning_rate": 1.822786689580425e-05, "loss": 1.0544, "step": 529 }, { "epoch": 1.3666881859264042, "grad_norm": 0.55011290621819, "learning_rate": 1.8217597761302298e-05, "loss": 1.1075, "step": 530 }, { "epoch": 1.3692704970949, "grad_norm": 0.5273282578927786, "learning_rate": 1.8207301869212207e-05, "loss": 1.0642, "step": 531 }, { "epoch": 1.3718528082633958, "grad_norm": 0.5212309533973544, "learning_rate": 1.8196979253058765e-05, "loss": 1.1039, "step": 532 }, { "epoch": 1.3744351194318916, "grad_norm": 0.5234164457393133, "learning_rate": 1.8186629946453774e-05, "loss": 1.0697, "step": 533 }, { "epoch": 1.3770174306003873, "grad_norm": 0.5143378010325881, "learning_rate": 1.8176253983095958e-05, "loss": 1.059, "step": 534 }, { "epoch": 1.3795997417688832, "grad_norm": 0.5202992098003197, "learning_rate": 1.816585139677082e-05, "loss": 1.0832, "step": 535 }, { "epoch": 1.382182052937379, "grad_norm": 0.5137267251756589, "learning_rate": 1.8155422221350566e-05, "loss": 1.077, "step": 536 }, { "epoch": 1.384764364105875, "grad_norm": 0.5206482372388472, "learning_rate": 1.8144966490793973e-05, "loss": 1.0808, "step": 537 }, { "epoch": 1.3873466752743706, "grad_norm": 0.4977846119013132, "learning_rate": 1.813448423914629e-05, "loss": 1.0889, "step": 538 }, { "epoch": 1.3899289864428663, "grad_norm": 0.5038953337541946, "learning_rate": 1.8123975500539114e-05, "loss": 1.0517, "step": 539 }, { "epoch": 1.392511297611362, "grad_norm": 0.48358165460230973, "learning_rate": 1.811344030919029e-05, "loss": 1.0637, "step": 540 }, { "epoch": 1.395093608779858, "grad_norm": 0.5010075364080836, "learning_rate": 1.8102878699403804e-05, "loss": 1.0718, "step": 541 }, { "epoch": 1.3976759199483537, "grad_norm": 0.5031248131184225, "learning_rate": 1.8092290705569655e-05, "loss": 1.08, "step": 542 }, { "epoch": 1.4002582311168497, "grad_norm": 0.4906664062118931, "learning_rate": 1.8081676362163757e-05, "loss": 1.0582, "step": 543 }, { "epoch": 1.4028405422853454, "grad_norm": 0.5121501864467626, "learning_rate": 1.8071035703747816e-05, "loss": 1.0751, "step": 544 }, { "epoch": 1.405422853453841, "grad_norm": 0.524910024204937, "learning_rate": 1.806036876496923e-05, "loss": 1.0547, "step": 545 }, { "epoch": 1.408005164622337, "grad_norm": 0.5209029065997814, "learning_rate": 1.8049675580560965e-05, "loss": 1.1205, "step": 546 }, { "epoch": 1.4105874757908328, "grad_norm": 0.48220028684533556, "learning_rate": 1.8038956185341452e-05, "loss": 1.0426, "step": 547 }, { "epoch": 1.4131697869593287, "grad_norm": 0.503658857829128, "learning_rate": 1.8028210614214458e-05, "loss": 1.0772, "step": 548 }, { "epoch": 1.4157520981278244, "grad_norm": 0.5055334860886664, "learning_rate": 1.8017438902168987e-05, "loss": 1.0702, "step": 549 }, { "epoch": 1.4183344092963202, "grad_norm": 0.5171324204492811, "learning_rate": 1.800664108427917e-05, "loss": 1.0609, "step": 550 }, { "epoch": 1.4209167204648159, "grad_norm": 0.5189722895903278, "learning_rate": 1.799581719570412e-05, "loss": 1.0755, "step": 551 }, { "epoch": 1.4234990316333118, "grad_norm": 0.5243160862800866, "learning_rate": 1.798496727168787e-05, "loss": 1.0739, "step": 552 }, { "epoch": 1.4260813428018075, "grad_norm": 0.5317337215399274, "learning_rate": 1.7974091347559197e-05, "loss": 1.0711, "step": 553 }, { "epoch": 1.4286636539703035, "grad_norm": 0.49461538156213314, "learning_rate": 1.796318945873156e-05, "loss": 1.1056, "step": 554 }, { "epoch": 1.4312459651387992, "grad_norm": 0.5212170098454869, "learning_rate": 1.795226164070296e-05, "loss": 1.1166, "step": 555 }, { "epoch": 1.433828276307295, "grad_norm": 0.5384906886276548, "learning_rate": 1.7941307929055813e-05, "loss": 1.0836, "step": 556 }, { "epoch": 1.4364105874757909, "grad_norm": 0.538099607308631, "learning_rate": 1.7930328359456856e-05, "loss": 1.0563, "step": 557 }, { "epoch": 1.4389928986442866, "grad_norm": 0.508125640639657, "learning_rate": 1.791932296765703e-05, "loss": 1.0862, "step": 558 }, { "epoch": 1.4415752098127825, "grad_norm": 0.5258022272020693, "learning_rate": 1.7908291789491348e-05, "loss": 1.0947, "step": 559 }, { "epoch": 1.4441575209812783, "grad_norm": 0.5292716445438939, "learning_rate": 1.7897234860878783e-05, "loss": 1.0953, "step": 560 }, { "epoch": 1.446739832149774, "grad_norm": 0.4919901732224947, "learning_rate": 1.7886152217822173e-05, "loss": 1.0589, "step": 561 }, { "epoch": 1.44932214331827, "grad_norm": 0.540399625807644, "learning_rate": 1.7875043896408065e-05, "loss": 1.0868, "step": 562 }, { "epoch": 1.4519044544867656, "grad_norm": 0.5157274204814702, "learning_rate": 1.7863909932806632e-05, "loss": 1.055, "step": 563 }, { "epoch": 1.4544867656552616, "grad_norm": 0.5157064713943694, "learning_rate": 1.785275036327153e-05, "loss": 1.0743, "step": 564 }, { "epoch": 1.4570690768237573, "grad_norm": 0.5093111094768367, "learning_rate": 1.7841565224139798e-05, "loss": 1.0885, "step": 565 }, { "epoch": 1.459651387992253, "grad_norm": 0.48350148375931845, "learning_rate": 1.783035455183174e-05, "loss": 1.0795, "step": 566 }, { "epoch": 1.4622336991607487, "grad_norm": 0.5104024923814483, "learning_rate": 1.781911838285078e-05, "loss": 1.0691, "step": 567 }, { "epoch": 1.4648160103292447, "grad_norm": 0.5257774613135558, "learning_rate": 1.7807856753783387e-05, "loss": 1.0836, "step": 568 }, { "epoch": 1.4673983214977404, "grad_norm": 0.5133170260226599, "learning_rate": 1.7796569701298906e-05, "loss": 1.111, "step": 569 }, { "epoch": 1.4699806326662364, "grad_norm": 0.4960488771792151, "learning_rate": 1.778525726214949e-05, "loss": 1.0913, "step": 570 }, { "epoch": 1.472562943834732, "grad_norm": 0.4923055735581868, "learning_rate": 1.7773919473169933e-05, "loss": 1.0585, "step": 571 }, { "epoch": 1.4751452550032278, "grad_norm": 0.5127181992354112, "learning_rate": 1.7762556371277578e-05, "loss": 1.0647, "step": 572 }, { "epoch": 1.4777275661717237, "grad_norm": 0.5282994414831952, "learning_rate": 1.7751167993472198e-05, "loss": 1.1137, "step": 573 }, { "epoch": 1.4803098773402195, "grad_norm": 0.5248953393475492, "learning_rate": 1.7739754376835858e-05, "loss": 1.0999, "step": 574 }, { "epoch": 1.4828921885087154, "grad_norm": 0.5281077411991068, "learning_rate": 1.7728315558532806e-05, "loss": 1.0953, "step": 575 }, { "epoch": 1.4854744996772111, "grad_norm": 0.4914770889754758, "learning_rate": 1.7716851575809354e-05, "loss": 1.1072, "step": 576 }, { "epoch": 1.4880568108457068, "grad_norm": 0.5218556469624681, "learning_rate": 1.770536246599375e-05, "loss": 1.0899, "step": 577 }, { "epoch": 1.4906391220142028, "grad_norm": 0.5239346375890538, "learning_rate": 1.769384826649606e-05, "loss": 1.0779, "step": 578 }, { "epoch": 1.4932214331826985, "grad_norm": 0.5538774540635639, "learning_rate": 1.7682309014808043e-05, "loss": 1.0503, "step": 579 }, { "epoch": 1.4958037443511945, "grad_norm": 0.5212453237405811, "learning_rate": 1.7670744748503033e-05, "loss": 1.1206, "step": 580 }, { "epoch": 1.4983860555196902, "grad_norm": 0.5221975743446008, "learning_rate": 1.7659155505235812e-05, "loss": 1.0712, "step": 581 }, { "epoch": 1.500968366688186, "grad_norm": 0.49914883784122016, "learning_rate": 1.76475413227425e-05, "loss": 1.0649, "step": 582 }, { "epoch": 1.5035506778566816, "grad_norm": 0.5000466728400638, "learning_rate": 1.7635902238840408e-05, "loss": 1.0621, "step": 583 }, { "epoch": 1.5061329890251776, "grad_norm": 0.4994098902333712, "learning_rate": 1.762423829142794e-05, "loss": 1.0712, "step": 584 }, { "epoch": 1.5087153001936735, "grad_norm": 0.5342342246357215, "learning_rate": 1.7612549518484458e-05, "loss": 1.1141, "step": 585 }, { "epoch": 1.5112976113621692, "grad_norm": 0.5097744481641414, "learning_rate": 1.7600835958070156e-05, "loss": 1.1007, "step": 586 }, { "epoch": 1.513879922530665, "grad_norm": 0.5079344339293564, "learning_rate": 1.7589097648325936e-05, "loss": 1.0814, "step": 587 }, { "epoch": 1.5164622336991607, "grad_norm": 0.4902197192987524, "learning_rate": 1.7577334627473295e-05, "loss": 1.0589, "step": 588 }, { "epoch": 1.5190445448676564, "grad_norm": 0.4893765216413381, "learning_rate": 1.756554693381419e-05, "loss": 1.0913, "step": 589 }, { "epoch": 1.5216268560361523, "grad_norm": 0.5225764596558536, "learning_rate": 1.755373460573091e-05, "loss": 1.0923, "step": 590 }, { "epoch": 1.5242091672046483, "grad_norm": 0.5152845982203591, "learning_rate": 1.7541897681685967e-05, "loss": 1.0946, "step": 591 }, { "epoch": 1.526791478373144, "grad_norm": 0.5261227535805723, "learning_rate": 1.7530036200221955e-05, "loss": 1.1183, "step": 592 }, { "epoch": 1.5293737895416397, "grad_norm": 0.49461625515890395, "learning_rate": 1.7518150199961427e-05, "loss": 1.0876, "step": 593 }, { "epoch": 1.5319561007101354, "grad_norm": 0.5021228041031806, "learning_rate": 1.7506239719606776e-05, "loss": 1.0916, "step": 594 }, { "epoch": 1.5345384118786314, "grad_norm": 0.503576565099223, "learning_rate": 1.749430479794011e-05, "loss": 1.0943, "step": 595 }, { "epoch": 1.5371207230471273, "grad_norm": 0.5226270592481841, "learning_rate": 1.7482345473823116e-05, "loss": 1.1015, "step": 596 }, { "epoch": 1.539703034215623, "grad_norm": 0.537294451707703, "learning_rate": 1.7470361786196938e-05, "loss": 1.0954, "step": 597 }, { "epoch": 1.5422853453841188, "grad_norm": 0.504661881274588, "learning_rate": 1.7458353774082052e-05, "loss": 1.0821, "step": 598 }, { "epoch": 1.5448676565526145, "grad_norm": 0.49601719518902315, "learning_rate": 1.7446321476578138e-05, "loss": 1.0721, "step": 599 }, { "epoch": 1.5474499677211104, "grad_norm": 0.5187763220017648, "learning_rate": 1.743426493286395e-05, "loss": 1.0507, "step": 600 }, { "epoch": 1.5500322788896062, "grad_norm": 0.4989292387853037, "learning_rate": 1.7422184182197197e-05, "loss": 1.0897, "step": 601 }, { "epoch": 1.552614590058102, "grad_norm": 0.4843467686137247, "learning_rate": 1.7410079263914406e-05, "loss": 1.0631, "step": 602 }, { "epoch": 1.5551969012265978, "grad_norm": 0.5271533786423233, "learning_rate": 1.7397950217430794e-05, "loss": 1.1022, "step": 603 }, { "epoch": 1.5577792123950935, "grad_norm": 0.5088338681798537, "learning_rate": 1.7385797082240147e-05, "loss": 1.0839, "step": 604 }, { "epoch": 1.5603615235635893, "grad_norm": 0.4853868656293585, "learning_rate": 1.737361989791468e-05, "loss": 1.0539, "step": 605 }, { "epoch": 1.5629438347320852, "grad_norm": 0.49682074497284284, "learning_rate": 1.7361418704104925e-05, "loss": 1.089, "step": 606 }, { "epoch": 1.5655261459005811, "grad_norm": 0.49391110999180715, "learning_rate": 1.734919354053959e-05, "loss": 1.0829, "step": 607 }, { "epoch": 1.5681084570690769, "grad_norm": 0.5033926514159104, "learning_rate": 1.733694444702542e-05, "loss": 1.0882, "step": 608 }, { "epoch": 1.5706907682375726, "grad_norm": 0.5120915845296581, "learning_rate": 1.7324671463447092e-05, "loss": 1.1071, "step": 609 }, { "epoch": 1.5732730794060683, "grad_norm": 0.5027154616635228, "learning_rate": 1.731237462976707e-05, "loss": 1.0706, "step": 610 }, { "epoch": 1.5758553905745643, "grad_norm": 0.5017997136995258, "learning_rate": 1.7300053986025476e-05, "loss": 1.0935, "step": 611 }, { "epoch": 1.5784377017430602, "grad_norm": 0.4876968907410891, "learning_rate": 1.7287709572339958e-05, "loss": 1.0414, "step": 612 }, { "epoch": 1.581020012911556, "grad_norm": 0.5124364267745315, "learning_rate": 1.7275341428905564e-05, "loss": 1.0569, "step": 613 }, { "epoch": 1.5836023240800516, "grad_norm": 0.5192211728464061, "learning_rate": 1.7262949595994606e-05, "loss": 1.0761, "step": 614 }, { "epoch": 1.5861846352485474, "grad_norm": 0.49024680943396437, "learning_rate": 1.7250534113956543e-05, "loss": 1.08, "step": 615 }, { "epoch": 1.5887669464170433, "grad_norm": 0.4920460482587524, "learning_rate": 1.7238095023217823e-05, "loss": 1.0739, "step": 616 }, { "epoch": 1.591349257585539, "grad_norm": 0.5124627096290069, "learning_rate": 1.722563236428178e-05, "loss": 1.0507, "step": 617 }, { "epoch": 1.593931568754035, "grad_norm": 0.51198818397323, "learning_rate": 1.721314617772849e-05, "loss": 1.0922, "step": 618 }, { "epoch": 1.5965138799225307, "grad_norm": 0.5062132347505444, "learning_rate": 1.7200636504214618e-05, "loss": 1.0374, "step": 619 }, { "epoch": 1.5990961910910264, "grad_norm": 0.521687159299446, "learning_rate": 1.7188103384473334e-05, "loss": 1.064, "step": 620 }, { "epoch": 1.6016785022595221, "grad_norm": 0.47867011946021426, "learning_rate": 1.7175546859314126e-05, "loss": 1.0988, "step": 621 }, { "epoch": 1.604260813428018, "grad_norm": 0.4894260917344886, "learning_rate": 1.7162966969622713e-05, "loss": 1.0709, "step": 622 }, { "epoch": 1.606843124596514, "grad_norm": 0.5354553757656224, "learning_rate": 1.7150363756360886e-05, "loss": 1.1033, "step": 623 }, { "epoch": 1.6094254357650097, "grad_norm": 0.5039991951669948, "learning_rate": 1.713773726056637e-05, "loss": 1.1001, "step": 624 }, { "epoch": 1.6120077469335055, "grad_norm": 0.4980182271238221, "learning_rate": 1.7125087523352718e-05, "loss": 1.0788, "step": 625 }, { "epoch": 1.6145900581020012, "grad_norm": 0.5322347706228192, "learning_rate": 1.7112414585909146e-05, "loss": 1.0673, "step": 626 }, { "epoch": 1.6171723692704971, "grad_norm": 0.49515788566577773, "learning_rate": 1.7099718489500426e-05, "loss": 1.0818, "step": 627 }, { "epoch": 1.6197546804389928, "grad_norm": 0.49847544874894734, "learning_rate": 1.7086999275466727e-05, "loss": 1.071, "step": 628 }, { "epoch": 1.6223369916074888, "grad_norm": 0.4852829304995305, "learning_rate": 1.7074256985223496e-05, "loss": 1.0631, "step": 629 }, { "epoch": 1.6249193027759845, "grad_norm": 0.49795661914754413, "learning_rate": 1.706149166026132e-05, "loss": 1.0876, "step": 630 }, { "epoch": 1.6275016139444802, "grad_norm": 0.502622123586083, "learning_rate": 1.7048703342145793e-05, "loss": 1.0846, "step": 631 }, { "epoch": 1.630083925112976, "grad_norm": 0.49228802240874425, "learning_rate": 1.7035892072517373e-05, "loss": 1.1087, "step": 632 }, { "epoch": 1.632666236281472, "grad_norm": 0.4971680566338004, "learning_rate": 1.7023057893091254e-05, "loss": 1.0768, "step": 633 }, { "epoch": 1.6352485474499678, "grad_norm": 0.5131651796956291, "learning_rate": 1.7010200845657222e-05, "loss": 1.0899, "step": 634 }, { "epoch": 1.6378308586184636, "grad_norm": 0.5074519176524721, "learning_rate": 1.6997320972079536e-05, "loss": 1.081, "step": 635 }, { "epoch": 1.6404131697869593, "grad_norm": 0.5174545968485477, "learning_rate": 1.6984418314296768e-05, "loss": 1.0472, "step": 636 }, { "epoch": 1.642995480955455, "grad_norm": 0.5077381746768771, "learning_rate": 1.697149291432168e-05, "loss": 1.0926, "step": 637 }, { "epoch": 1.645577792123951, "grad_norm": 0.5354282615337868, "learning_rate": 1.6958544814241094e-05, "loss": 1.0414, "step": 638 }, { "epoch": 1.6481601032924469, "grad_norm": 0.52898951516764, "learning_rate": 1.6945574056215742e-05, "loss": 1.0973, "step": 639 }, { "epoch": 1.6507424144609426, "grad_norm": 0.5009163137242975, "learning_rate": 1.6932580682480124e-05, "loss": 1.0826, "step": 640 }, { "epoch": 1.6533247256294383, "grad_norm": 0.5066860393622376, "learning_rate": 1.6919564735342398e-05, "loss": 1.0836, "step": 641 }, { "epoch": 1.655907036797934, "grad_norm": 0.5418314242744041, "learning_rate": 1.6906526257184206e-05, "loss": 1.1132, "step": 642 }, { "epoch": 1.65848934796643, "grad_norm": 0.4999534074892505, "learning_rate": 1.689346529046057e-05, "loss": 1.0818, "step": 643 }, { "epoch": 1.6610716591349257, "grad_norm": 0.5017097366254959, "learning_rate": 1.6880381877699717e-05, "loss": 1.074, "step": 644 }, { "epoch": 1.6636539703034217, "grad_norm": 0.5233206395612633, "learning_rate": 1.686727606150299e-05, "loss": 1.0628, "step": 645 }, { "epoch": 1.6662362814719174, "grad_norm": 0.512780816400479, "learning_rate": 1.6854147884544655e-05, "loss": 1.0843, "step": 646 }, { "epoch": 1.668818592640413, "grad_norm": 0.49275381075866503, "learning_rate": 1.68409973895718e-05, "loss": 1.0843, "step": 647 }, { "epoch": 1.6714009038089088, "grad_norm": 0.5496131300871087, "learning_rate": 1.682782461940418e-05, "loss": 1.0836, "step": 648 }, { "epoch": 1.6739832149774048, "grad_norm": 0.512860264741888, "learning_rate": 1.6814629616934078e-05, "loss": 1.0743, "step": 649 }, { "epoch": 1.6765655261459007, "grad_norm": 0.4975144838656257, "learning_rate": 1.6801412425126183e-05, "loss": 1.0864, "step": 650 }, { "epoch": 1.6791478373143964, "grad_norm": 0.5014057129631031, "learning_rate": 1.678817308701741e-05, "loss": 1.0427, "step": 651 }, { "epoch": 1.6817301484828922, "grad_norm": 0.5234985943818525, "learning_rate": 1.677491164571681e-05, "loss": 1.1048, "step": 652 }, { "epoch": 1.6843124596513879, "grad_norm": 0.5201679982476692, "learning_rate": 1.6761628144405394e-05, "loss": 1.064, "step": 653 }, { "epoch": 1.6868947708198838, "grad_norm": 0.4846519175712527, "learning_rate": 1.6748322626336e-05, "loss": 1.0539, "step": 654 }, { "epoch": 1.6894770819883798, "grad_norm": 0.5173646723613604, "learning_rate": 1.6734995134833155e-05, "loss": 1.1007, "step": 655 }, { "epoch": 1.6920593931568755, "grad_norm": 0.5113183351556072, "learning_rate": 1.6721645713292953e-05, "loss": 1.0815, "step": 656 }, { "epoch": 1.6946417043253712, "grad_norm": 0.5211205639308888, "learning_rate": 1.670827440518287e-05, "loss": 1.0837, "step": 657 }, { "epoch": 1.697224015493867, "grad_norm": 0.5080702604570161, "learning_rate": 1.6694881254041657e-05, "loss": 1.1173, "step": 658 }, { "epoch": 1.6998063266623629, "grad_norm": 0.4962653526436615, "learning_rate": 1.6681466303479196e-05, "loss": 1.0352, "step": 659 }, { "epoch": 1.7023886378308586, "grad_norm": 0.5142414297852521, "learning_rate": 1.6668029597176344e-05, "loss": 1.0666, "step": 660 }, { "epoch": 1.7049709489993545, "grad_norm": 0.4901838014123924, "learning_rate": 1.66545711788848e-05, "loss": 1.0816, "step": 661 }, { "epoch": 1.7075532601678503, "grad_norm": 0.5141149184635171, "learning_rate": 1.664109109242696e-05, "loss": 1.0771, "step": 662 }, { "epoch": 1.710135571336346, "grad_norm": 0.5172010165390014, "learning_rate": 1.6627589381695763e-05, "loss": 1.0752, "step": 663 }, { "epoch": 1.7127178825048417, "grad_norm": 0.4963415308538906, "learning_rate": 1.661406609065458e-05, "loss": 1.1219, "step": 664 }, { "epoch": 1.7153001936733376, "grad_norm": 0.4877763872520322, "learning_rate": 1.6600521263337043e-05, "loss": 1.058, "step": 665 }, { "epoch": 1.7178825048418336, "grad_norm": 0.4969456781904556, "learning_rate": 1.6586954943846895e-05, "loss": 1.0834, "step": 666 }, { "epoch": 1.7204648160103293, "grad_norm": 0.4815019660085988, "learning_rate": 1.6573367176357876e-05, "loss": 1.0618, "step": 667 }, { "epoch": 1.723047127178825, "grad_norm": 0.4936696257730036, "learning_rate": 1.6559758005113564e-05, "loss": 1.0902, "step": 668 }, { "epoch": 1.7256294383473207, "grad_norm": 0.4850729841607312, "learning_rate": 1.6546127474427217e-05, "loss": 1.0499, "step": 669 }, { "epoch": 1.7282117495158167, "grad_norm": 0.48113300472686776, "learning_rate": 1.653247562868166e-05, "loss": 1.0682, "step": 670 }, { "epoch": 1.7307940606843124, "grad_norm": 0.4814780954159902, "learning_rate": 1.6518802512329105e-05, "loss": 1.083, "step": 671 }, { "epoch": 1.7333763718528084, "grad_norm": 0.5247380192600469, "learning_rate": 1.6505108169891032e-05, "loss": 1.093, "step": 672 }, { "epoch": 1.735958683021304, "grad_norm": 0.4981848964288428, "learning_rate": 1.6491392645958043e-05, "loss": 1.0656, "step": 673 }, { "epoch": 1.7385409941897998, "grad_norm": 0.5007815563313807, "learning_rate": 1.6477655985189703e-05, "loss": 1.0583, "step": 674 }, { "epoch": 1.7411233053582955, "grad_norm": 0.4924655390382668, "learning_rate": 1.6463898232314393e-05, "loss": 1.0881, "step": 675 }, { "epoch": 1.7437056165267915, "grad_norm": 0.4964112767549225, "learning_rate": 1.6450119432129185e-05, "loss": 1.0645, "step": 676 }, { "epoch": 1.7462879276952874, "grad_norm": 0.48606768423741387, "learning_rate": 1.6436319629499683e-05, "loss": 1.0984, "step": 677 }, { "epoch": 1.7488702388637831, "grad_norm": 0.5148244978248903, "learning_rate": 1.642249886935987e-05, "loss": 1.0668, "step": 678 }, { "epoch": 1.7514525500322788, "grad_norm": 0.5005608398817017, "learning_rate": 1.6408657196711977e-05, "loss": 1.0253, "step": 679 }, { "epoch": 1.7540348612007746, "grad_norm": 0.5231072514633008, "learning_rate": 1.6394794656626325e-05, "loss": 1.1069, "step": 680 }, { "epoch": 1.7566171723692705, "grad_norm": 0.5158005487144547, "learning_rate": 1.638091129424118e-05, "loss": 1.1059, "step": 681 }, { "epoch": 1.7591994835377665, "grad_norm": 0.5085221327854161, "learning_rate": 1.6367007154762616e-05, "loss": 1.0628, "step": 682 }, { "epoch": 1.7617817947062622, "grad_norm": 0.49993960938301113, "learning_rate": 1.6353082283464355e-05, "loss": 1.0774, "step": 683 }, { "epoch": 1.764364105874758, "grad_norm": 0.5136885732061924, "learning_rate": 1.633913672568762e-05, "loss": 1.0571, "step": 684 }, { "epoch": 1.7669464170432536, "grad_norm": 0.5064861116191551, "learning_rate": 1.6325170526841e-05, "loss": 1.0927, "step": 685 }, { "epoch": 1.7695287282117496, "grad_norm": 0.5065336598849988, "learning_rate": 1.631118373240029e-05, "loss": 1.0437, "step": 686 }, { "epoch": 1.7721110393802453, "grad_norm": 0.5071797770884724, "learning_rate": 1.629717638790835e-05, "loss": 1.058, "step": 687 }, { "epoch": 1.7746933505487412, "grad_norm": 0.5032716603331865, "learning_rate": 1.6283148538974943e-05, "loss": 1.108, "step": 688 }, { "epoch": 1.777275661717237, "grad_norm": 0.5168971680331349, "learning_rate": 1.6269100231276617e-05, "loss": 1.0967, "step": 689 }, { "epoch": 1.7798579728857327, "grad_norm": 0.5328504423513274, "learning_rate": 1.6255031510556513e-05, "loss": 1.0755, "step": 690 }, { "epoch": 1.7824402840542284, "grad_norm": 0.474134415576521, "learning_rate": 1.6240942422624264e-05, "loss": 1.0433, "step": 691 }, { "epoch": 1.7850225952227243, "grad_norm": 0.5062840676066106, "learning_rate": 1.62268330133558e-05, "loss": 1.0884, "step": 692 }, { "epoch": 1.7876049063912203, "grad_norm": 0.5242892989776939, "learning_rate": 1.6212703328693232e-05, "loss": 1.0813, "step": 693 }, { "epoch": 1.790187217559716, "grad_norm": 0.49294778164946207, "learning_rate": 1.6198553414644687e-05, "loss": 1.0589, "step": 694 }, { "epoch": 1.7927695287282117, "grad_norm": 0.5333831319134179, "learning_rate": 1.6184383317284163e-05, "loss": 1.0803, "step": 695 }, { "epoch": 1.7953518398967074, "grad_norm": 0.4848117805750976, "learning_rate": 1.6170193082751372e-05, "loss": 1.0651, "step": 696 }, { "epoch": 1.7979341510652034, "grad_norm": 0.4912184014826424, "learning_rate": 1.6155982757251605e-05, "loss": 1.0805, "step": 697 }, { "epoch": 1.8005164622336993, "grad_norm": 0.5209563606543747, "learning_rate": 1.614175238705556e-05, "loss": 1.0676, "step": 698 }, { "epoch": 1.803098773402195, "grad_norm": 0.4983116660478031, "learning_rate": 1.6127502018499216e-05, "loss": 1.0523, "step": 699 }, { "epoch": 1.8056810845706908, "grad_norm": 0.487693383153112, "learning_rate": 1.6113231697983658e-05, "loss": 1.0663, "step": 700 }, { "epoch": 1.8082633957391865, "grad_norm": 0.5338041396304789, "learning_rate": 1.6098941471974945e-05, "loss": 1.1128, "step": 701 }, { "epoch": 1.8108457069076824, "grad_norm": 0.5142298199439157, "learning_rate": 1.608463138700395e-05, "loss": 1.0712, "step": 702 }, { "epoch": 1.8134280180761781, "grad_norm": 0.47630790480629104, "learning_rate": 1.6070301489666203e-05, "loss": 1.0988, "step": 703 }, { "epoch": 1.816010329244674, "grad_norm": 0.4901104298299483, "learning_rate": 1.6055951826621753e-05, "loss": 1.0428, "step": 704 }, { "epoch": 1.8185926404131698, "grad_norm": 0.5227604090573119, "learning_rate": 1.6041582444595004e-05, "loss": 1.0698, "step": 705 }, { "epoch": 1.8211749515816655, "grad_norm": 0.5041405266487794, "learning_rate": 1.602719339037457e-05, "loss": 1.0753, "step": 706 }, { "epoch": 1.8237572627501613, "grad_norm": 0.5093841266418548, "learning_rate": 1.6012784710813122e-05, "loss": 1.1189, "step": 707 }, { "epoch": 1.8263395739186572, "grad_norm": 0.5166236437305157, "learning_rate": 1.599835645282723e-05, "loss": 1.07, "step": 708 }, { "epoch": 1.8289218850871531, "grad_norm": 0.5238202604739227, "learning_rate": 1.598390866339721e-05, "loss": 1.0734, "step": 709 }, { "epoch": 1.8315041962556489, "grad_norm": 0.5351507809671923, "learning_rate": 1.5969441389566995e-05, "loss": 1.0722, "step": 710 }, { "epoch": 1.8340865074241446, "grad_norm": 0.48654357580846874, "learning_rate": 1.5954954678443934e-05, "loss": 1.0581, "step": 711 }, { "epoch": 1.8366688185926403, "grad_norm": 0.48828499263183267, "learning_rate": 1.5940448577198685e-05, "loss": 1.0778, "step": 712 }, { "epoch": 1.8392511297611362, "grad_norm": 0.49993618606120815, "learning_rate": 1.5925923133065036e-05, "loss": 1.0744, "step": 713 }, { "epoch": 1.841833440929632, "grad_norm": 0.5060526028050449, "learning_rate": 1.591137839333976e-05, "loss": 1.0869, "step": 714 }, { "epoch": 1.844415752098128, "grad_norm": 0.4854542346534205, "learning_rate": 1.5896814405382455e-05, "loss": 1.0734, "step": 715 }, { "epoch": 1.8469980632666236, "grad_norm": 0.5120958440799618, "learning_rate": 1.5882231216615405e-05, "loss": 1.056, "step": 716 }, { "epoch": 1.8495803744351194, "grad_norm": 0.4857556591454626, "learning_rate": 1.58676288745234e-05, "loss": 1.0502, "step": 717 }, { "epoch": 1.852162685603615, "grad_norm": 0.49183704296233893, "learning_rate": 1.5853007426653607e-05, "loss": 1.116, "step": 718 }, { "epoch": 1.854744996772111, "grad_norm": 0.4971020661357399, "learning_rate": 1.5838366920615395e-05, "loss": 1.0535, "step": 719 }, { "epoch": 1.857327307940607, "grad_norm": 0.485071482170176, "learning_rate": 1.5823707404080196e-05, "loss": 1.0465, "step": 720 }, { "epoch": 1.8599096191091027, "grad_norm": 0.48718149162761787, "learning_rate": 1.5809028924781343e-05, "loss": 1.0787, "step": 721 }, { "epoch": 1.8624919302775984, "grad_norm": 0.4775709718268873, "learning_rate": 1.5794331530513903e-05, "loss": 1.0354, "step": 722 }, { "epoch": 1.8650742414460941, "grad_norm": 0.5008952740743758, "learning_rate": 1.577961526913455e-05, "loss": 1.0602, "step": 723 }, { "epoch": 1.86765655261459, "grad_norm": 0.5064643531485886, "learning_rate": 1.5764880188561376e-05, "loss": 1.1178, "step": 724 }, { "epoch": 1.870238863783086, "grad_norm": 0.49848568260978515, "learning_rate": 1.5750126336773755e-05, "loss": 1.0422, "step": 725 }, { "epoch": 1.8728211749515817, "grad_norm": 0.5220419002346904, "learning_rate": 1.5735353761812197e-05, "loss": 1.057, "step": 726 }, { "epoch": 1.8754034861200775, "grad_norm": 0.4971228637528045, "learning_rate": 1.5720562511778156e-05, "loss": 1.0556, "step": 727 }, { "epoch": 1.8779857972885732, "grad_norm": 0.48640138793502713, "learning_rate": 1.5705752634833908e-05, "loss": 1.0857, "step": 728 }, { "epoch": 1.8805681084570691, "grad_norm": 0.5037467924591017, "learning_rate": 1.5690924179202375e-05, "loss": 1.0581, "step": 729 }, { "epoch": 1.8831504196255648, "grad_norm": 0.5282496443218059, "learning_rate": 1.5676077193166973e-05, "loss": 1.0799, "step": 730 }, { "epoch": 1.8857327307940608, "grad_norm": 0.47021596749068756, "learning_rate": 1.5661211725071457e-05, "loss": 1.0352, "step": 731 }, { "epoch": 1.8883150419625565, "grad_norm": 0.513391496193585, "learning_rate": 1.5646327823319765e-05, "loss": 1.1031, "step": 732 }, { "epoch": 1.8908973531310522, "grad_norm": 0.5186791214030437, "learning_rate": 1.5631425536375858e-05, "loss": 1.0849, "step": 733 }, { "epoch": 1.893479664299548, "grad_norm": 0.4945851612291226, "learning_rate": 1.5616504912763554e-05, "loss": 1.0513, "step": 734 }, { "epoch": 1.896061975468044, "grad_norm": 0.4805032305818217, "learning_rate": 1.5601566001066384e-05, "loss": 1.0388, "step": 735 }, { "epoch": 1.8986442866365398, "grad_norm": 0.49355307137386584, "learning_rate": 1.5586608849927424e-05, "loss": 1.0729, "step": 736 }, { "epoch": 1.9012265978050356, "grad_norm": 0.4868522623792675, "learning_rate": 1.5571633508049148e-05, "loss": 1.0472, "step": 737 }, { "epoch": 1.9038089089735313, "grad_norm": 0.5064645951323721, "learning_rate": 1.5556640024193245e-05, "loss": 1.0592, "step": 738 }, { "epoch": 1.906391220142027, "grad_norm": 0.4935670987750482, "learning_rate": 1.5541628447180494e-05, "loss": 1.0567, "step": 739 }, { "epoch": 1.908973531310523, "grad_norm": 0.5087284962422527, "learning_rate": 1.552659882589058e-05, "loss": 1.0544, "step": 740 }, { "epoch": 1.9115558424790189, "grad_norm": 0.503398066954607, "learning_rate": 1.551155120926194e-05, "loss": 1.0416, "step": 741 }, { "epoch": 1.9141381536475146, "grad_norm": 0.5248315003621526, "learning_rate": 1.5496485646291613e-05, "loss": 1.0821, "step": 742 }, { "epoch": 1.9167204648160103, "grad_norm": 0.509374102002012, "learning_rate": 1.548140218603507e-05, "loss": 1.1231, "step": 743 }, { "epoch": 1.919302775984506, "grad_norm": 0.4763357005641916, "learning_rate": 1.5466300877606054e-05, "loss": 1.0557, "step": 744 }, { "epoch": 1.921885087153002, "grad_norm": 0.5156095352624543, "learning_rate": 1.5451181770176434e-05, "loss": 1.102, "step": 745 }, { "epoch": 1.9244673983214977, "grad_norm": 0.5116973644648233, "learning_rate": 1.543604491297602e-05, "loss": 1.1098, "step": 746 }, { "epoch": 1.9270497094899937, "grad_norm": 0.5093190129624484, "learning_rate": 1.5420890355292435e-05, "loss": 1.0528, "step": 747 }, { "epoch": 1.9296320206584894, "grad_norm": 0.506225349532536, "learning_rate": 1.5405718146470926e-05, "loss": 1.0607, "step": 748 }, { "epoch": 1.932214331826985, "grad_norm": 0.5271647344306659, "learning_rate": 1.5390528335914216e-05, "loss": 1.1065, "step": 749 }, { "epoch": 1.9347966429954808, "grad_norm": 0.5309546928129137, "learning_rate": 1.5375320973082346e-05, "loss": 1.0818, "step": 750 }, { "epoch": 1.9373789541639768, "grad_norm": 0.5187838315898233, "learning_rate": 1.53600961074925e-05, "loss": 1.0614, "step": 751 }, { "epoch": 1.9399612653324727, "grad_norm": 0.4999420933843893, "learning_rate": 1.5344853788718867e-05, "loss": 1.0385, "step": 752 }, { "epoch": 1.9425435765009684, "grad_norm": 0.5449765150372478, "learning_rate": 1.532959406639245e-05, "loss": 1.0324, "step": 753 }, { "epoch": 1.9451258876694641, "grad_norm": 0.48682953533866824, "learning_rate": 1.5314316990200933e-05, "loss": 1.0302, "step": 754 }, { "epoch": 1.9477081988379599, "grad_norm": 0.49226909739556324, "learning_rate": 1.5299022609888507e-05, "loss": 1.1016, "step": 755 }, { "epoch": 1.9502905100064558, "grad_norm": 0.5347878649288165, "learning_rate": 1.5283710975255695e-05, "loss": 1.0843, "step": 756 }, { "epoch": 1.9528728211749515, "grad_norm": 0.5028835492330221, "learning_rate": 1.5268382136159213e-05, "loss": 1.0832, "step": 757 }, { "epoch": 1.9554551323434475, "grad_norm": 0.5020788778708613, "learning_rate": 1.5253036142511794e-05, "loss": 1.0554, "step": 758 }, { "epoch": 1.9580374435119432, "grad_norm": 0.5117826424662124, "learning_rate": 1.5237673044282028e-05, "loss": 1.0407, "step": 759 }, { "epoch": 1.960619754680439, "grad_norm": 0.486751220436105, "learning_rate": 1.5222292891494204e-05, "loss": 1.1028, "step": 760 }, { "epoch": 1.9632020658489346, "grad_norm": 0.5042620665613498, "learning_rate": 1.5206895734228133e-05, "loss": 1.1089, "step": 761 }, { "epoch": 1.9657843770174306, "grad_norm": 0.4985230564456094, "learning_rate": 1.5191481622619006e-05, "loss": 1.0892, "step": 762 }, { "epoch": 1.9683666881859265, "grad_norm": 0.46564788925434075, "learning_rate": 1.5176050606857211e-05, "loss": 1.0687, "step": 763 }, { "epoch": 1.9709489993544222, "grad_norm": 0.4799444405216457, "learning_rate": 1.5160602737188184e-05, "loss": 1.0627, "step": 764 }, { "epoch": 1.973531310522918, "grad_norm": 0.517680238719764, "learning_rate": 1.514513806391224e-05, "loss": 1.1087, "step": 765 }, { "epoch": 1.9761136216914137, "grad_norm": 0.4945356852520951, "learning_rate": 1.5129656637384398e-05, "loss": 1.0333, "step": 766 }, { "epoch": 1.9786959328599096, "grad_norm": 0.47246161399068515, "learning_rate": 1.5114158508014244e-05, "loss": 1.0622, "step": 767 }, { "epoch": 1.9812782440284056, "grad_norm": 0.4792556964609251, "learning_rate": 1.509864372626574e-05, "loss": 1.0807, "step": 768 }, { "epoch": 1.9838605551969013, "grad_norm": 0.4942144710838991, "learning_rate": 1.5083112342657071e-05, "loss": 1.088, "step": 769 }, { "epoch": 1.986442866365397, "grad_norm": 0.5201249256236419, "learning_rate": 1.5067564407760485e-05, "loss": 1.0938, "step": 770 }, { "epoch": 1.9890251775338927, "grad_norm": 0.4829123624901927, "learning_rate": 1.5051999972202118e-05, "loss": 1.0353, "step": 771 }, { "epoch": 1.9916074887023887, "grad_norm": 0.5024498746492575, "learning_rate": 1.5036419086661837e-05, "loss": 1.0802, "step": 772 }, { "epoch": 1.9941897998708844, "grad_norm": 0.4963413917672638, "learning_rate": 1.5020821801873072e-05, "loss": 1.0801, "step": 773 }, { "epoch": 1.9967721110393803, "grad_norm": 0.4872864553127849, "learning_rate": 1.5005208168622649e-05, "loss": 1.0509, "step": 774 }, { "epoch": 1.999354422207876, "grad_norm": 0.47707633602130245, "learning_rate": 1.4989578237750628e-05, "loss": 1.0485, "step": 775 }, { "epoch": 2.0, "grad_norm": 0.47707633602130245, "learning_rate": 1.4973932060150142e-05, "loss": 1.0293, "step": 776 }, { "epoch": 2.0025823111684957, "grad_norm": 1.1941485826343992, "learning_rate": 1.4958269686767214e-05, "loss": 0.9552, "step": 777 }, { "epoch": 2.0051646223369914, "grad_norm": 0.7512477337089716, "learning_rate": 1.4942591168600616e-05, "loss": 0.9653, "step": 778 }, { "epoch": 2.0077469335054876, "grad_norm": 0.6413401233120336, "learning_rate": 1.4926896556701676e-05, "loss": 0.9713, "step": 779 }, { "epoch": 2.0103292446739833, "grad_norm": 0.8265225982878504, "learning_rate": 1.4911185902174134e-05, "loss": 0.9674, "step": 780 }, { "epoch": 2.012911555842479, "grad_norm": 0.7690019968988396, "learning_rate": 1.4895459256173966e-05, "loss": 0.9701, "step": 781 }, { "epoch": 2.0154938670109748, "grad_norm": 0.6695923734742643, "learning_rate": 1.4879716669909215e-05, "loss": 0.9262, "step": 782 }, { "epoch": 2.0180761781794705, "grad_norm": 0.6968627838761038, "learning_rate": 1.4863958194639828e-05, "loss": 0.9738, "step": 783 }, { "epoch": 2.020658489347966, "grad_norm": 0.7154538295447892, "learning_rate": 1.4848183881677497e-05, "loss": 0.9537, "step": 784 }, { "epoch": 2.0232408005164624, "grad_norm": 0.6599822904047927, "learning_rate": 1.4832393782385475e-05, "loss": 0.9428, "step": 785 }, { "epoch": 2.025823111684958, "grad_norm": 0.6785737464207784, "learning_rate": 1.4816587948178411e-05, "loss": 0.9377, "step": 786 }, { "epoch": 2.028405422853454, "grad_norm": 0.6659927550215519, "learning_rate": 1.4800766430522208e-05, "loss": 0.9477, "step": 787 }, { "epoch": 2.0309877340219495, "grad_norm": 0.6453008064623791, "learning_rate": 1.4784929280933819e-05, "loss": 0.9734, "step": 788 }, { "epoch": 2.0335700451904453, "grad_norm": 0.6138501650256379, "learning_rate": 1.4769076550981107e-05, "loss": 0.9485, "step": 789 }, { "epoch": 2.0361523563589414, "grad_norm": 0.628292728528221, "learning_rate": 1.4753208292282666e-05, "loss": 0.9373, "step": 790 }, { "epoch": 2.038734667527437, "grad_norm": 0.6518720740962953, "learning_rate": 1.4737324556507639e-05, "loss": 0.9854, "step": 791 }, { "epoch": 2.041316978695933, "grad_norm": 0.6417366086713475, "learning_rate": 1.472142539537559e-05, "loss": 0.9668, "step": 792 }, { "epoch": 2.0438992898644286, "grad_norm": 0.6525413690852432, "learning_rate": 1.4705510860656289e-05, "loss": 0.9429, "step": 793 }, { "epoch": 2.0464816010329243, "grad_norm": 0.6126084056396112, "learning_rate": 1.4689581004169573e-05, "loss": 0.9828, "step": 794 }, { "epoch": 2.0490639122014205, "grad_norm": 0.6169432282421197, "learning_rate": 1.4673635877785168e-05, "loss": 0.9522, "step": 795 }, { "epoch": 2.051646223369916, "grad_norm": 0.5922213822722046, "learning_rate": 1.4657675533422517e-05, "loss": 0.9478, "step": 796 }, { "epoch": 2.054228534538412, "grad_norm": 0.6211700530426607, "learning_rate": 1.4641700023050625e-05, "loss": 0.9325, "step": 797 }, { "epoch": 2.0568108457069076, "grad_norm": 0.6173048324016761, "learning_rate": 1.4625709398687862e-05, "loss": 0.9477, "step": 798 }, { "epoch": 2.0593931568754034, "grad_norm": 0.5866193216584925, "learning_rate": 1.4609703712401832e-05, "loss": 0.9378, "step": 799 }, { "epoch": 2.061975468043899, "grad_norm": 0.6128450747778748, "learning_rate": 1.4593683016309168e-05, "loss": 0.9785, "step": 800 }, { "epoch": 2.0645577792123952, "grad_norm": 0.582652343191567, "learning_rate": 1.4577647362575378e-05, "loss": 0.9318, "step": 801 }, { "epoch": 2.067140090380891, "grad_norm": 0.5589523161558311, "learning_rate": 1.4561596803414681e-05, "loss": 0.9295, "step": 802 }, { "epoch": 2.0697224015493867, "grad_norm": 0.5777811559605781, "learning_rate": 1.4545531391089826e-05, "loss": 0.9606, "step": 803 }, { "epoch": 2.0723047127178824, "grad_norm": 0.5932336435576502, "learning_rate": 1.4529451177911926e-05, "loss": 0.973, "step": 804 }, { "epoch": 2.074887023886378, "grad_norm": 0.5467403526396694, "learning_rate": 1.4513356216240287e-05, "loss": 0.8862, "step": 805 }, { "epoch": 2.0774693350548743, "grad_norm": 0.5667654235524354, "learning_rate": 1.449724655848224e-05, "loss": 0.9484, "step": 806 }, { "epoch": 2.08005164622337, "grad_norm": 0.557027598681585, "learning_rate": 1.4481122257092966e-05, "loss": 0.9537, "step": 807 }, { "epoch": 2.0826339573918657, "grad_norm": 0.5736006444800077, "learning_rate": 1.4464983364575327e-05, "loss": 0.9644, "step": 808 }, { "epoch": 2.0852162685603615, "grad_norm": 0.5911533935183022, "learning_rate": 1.44488299334797e-05, "loss": 0.9547, "step": 809 }, { "epoch": 2.087798579728857, "grad_norm": 0.5854788104570025, "learning_rate": 1.44326620164038e-05, "loss": 0.9316, "step": 810 }, { "epoch": 2.090380890897353, "grad_norm": 0.5885109768704322, "learning_rate": 1.4416479665992507e-05, "loss": 0.9468, "step": 811 }, { "epoch": 2.092963202065849, "grad_norm": 0.5860083582206628, "learning_rate": 1.4400282934937702e-05, "loss": 0.9597, "step": 812 }, { "epoch": 2.095545513234345, "grad_norm": 0.5612799121488241, "learning_rate": 1.4384071875978085e-05, "loss": 0.9291, "step": 813 }, { "epoch": 2.0981278244028405, "grad_norm": 0.5760413758972827, "learning_rate": 1.4367846541899017e-05, "loss": 0.9434, "step": 814 }, { "epoch": 2.1007101355713362, "grad_norm": 0.5872031423934213, "learning_rate": 1.4351606985532338e-05, "loss": 0.9546, "step": 815 }, { "epoch": 2.103292446739832, "grad_norm": 0.5875266718802965, "learning_rate": 1.4335353259756199e-05, "loss": 0.9739, "step": 816 }, { "epoch": 2.105874757908328, "grad_norm": 0.5834229896061526, "learning_rate": 1.4319085417494885e-05, "loss": 0.936, "step": 817 }, { "epoch": 2.108457069076824, "grad_norm": 0.5740341555688057, "learning_rate": 1.430280351171864e-05, "loss": 0.9295, "step": 818 }, { "epoch": 2.1110393802453196, "grad_norm": 0.6028061663862296, "learning_rate": 1.4286507595443527e-05, "loss": 0.9475, "step": 819 }, { "epoch": 2.1136216914138153, "grad_norm": 0.6066376132775557, "learning_rate": 1.4270197721731192e-05, "loss": 0.9748, "step": 820 }, { "epoch": 2.116204002582311, "grad_norm": 0.6009913417618149, "learning_rate": 1.4253873943688751e-05, "loss": 0.9599, "step": 821 }, { "epoch": 2.118786313750807, "grad_norm": 0.5983886456577467, "learning_rate": 1.4237536314468602e-05, "loss": 0.9594, "step": 822 }, { "epoch": 2.121368624919303, "grad_norm": 0.617011626576933, "learning_rate": 1.4221184887268218e-05, "loss": 0.9498, "step": 823 }, { "epoch": 2.1239509360877986, "grad_norm": 0.6005132003701584, "learning_rate": 1.4204819715330026e-05, "loss": 0.9503, "step": 824 }, { "epoch": 2.1265332472562943, "grad_norm": 0.5741558367115511, "learning_rate": 1.4188440851941185e-05, "loss": 0.9587, "step": 825 }, { "epoch": 2.12911555842479, "grad_norm": 0.6062156734819026, "learning_rate": 1.4172048350433457e-05, "loss": 0.969, "step": 826 }, { "epoch": 2.131697869593286, "grad_norm": 0.6128646943053142, "learning_rate": 1.4155642264182992e-05, "loss": 0.9534, "step": 827 }, { "epoch": 2.134280180761782, "grad_norm": 0.5828534204572827, "learning_rate": 1.4139222646610185e-05, "loss": 0.9388, "step": 828 }, { "epoch": 2.1368624919302777, "grad_norm": 0.6060884386262935, "learning_rate": 1.4122789551179495e-05, "loss": 0.9884, "step": 829 }, { "epoch": 2.1394448030987734, "grad_norm": 0.6061173547442686, "learning_rate": 1.4106343031399252e-05, "loss": 0.924, "step": 830 }, { "epoch": 2.142027114267269, "grad_norm": 0.5851413898430766, "learning_rate": 1.408988314082151e-05, "loss": 0.9455, "step": 831 }, { "epoch": 2.144609425435765, "grad_norm": 0.588388475305726, "learning_rate": 1.4073409933041853e-05, "loss": 0.9337, "step": 832 }, { "epoch": 2.147191736604261, "grad_norm": 0.5838096533852828, "learning_rate": 1.4056923461699232e-05, "loss": 0.9392, "step": 833 }, { "epoch": 2.1497740477727567, "grad_norm": 0.5997141349811622, "learning_rate": 1.4040423780475787e-05, "loss": 0.9593, "step": 834 }, { "epoch": 2.1523563589412524, "grad_norm": 0.6020566174282612, "learning_rate": 1.4023910943096662e-05, "loss": 0.9616, "step": 835 }, { "epoch": 2.154938670109748, "grad_norm": 0.6073751111196977, "learning_rate": 1.4007385003329847e-05, "loss": 0.9804, "step": 836 }, { "epoch": 2.157520981278244, "grad_norm": 0.640691713500995, "learning_rate": 1.3990846014985997e-05, "loss": 0.9525, "step": 837 }, { "epoch": 2.16010329244674, "grad_norm": 0.5749963474745784, "learning_rate": 1.397429403191825e-05, "loss": 0.9753, "step": 838 }, { "epoch": 2.1626856036152358, "grad_norm": 0.6017911845722985, "learning_rate": 1.3957729108022057e-05, "loss": 0.9698, "step": 839 }, { "epoch": 2.1652679147837315, "grad_norm": 0.598963393328458, "learning_rate": 1.3941151297235007e-05, "loss": 0.9828, "step": 840 }, { "epoch": 2.167850225952227, "grad_norm": 0.5829192765375827, "learning_rate": 1.3924560653536652e-05, "loss": 0.9399, "step": 841 }, { "epoch": 2.170432537120723, "grad_norm": 0.5736202743026629, "learning_rate": 1.3907957230948328e-05, "loss": 0.9414, "step": 842 }, { "epoch": 2.1730148482892186, "grad_norm": 0.6048383091141705, "learning_rate": 1.3891341083532979e-05, "loss": 0.93, "step": 843 }, { "epoch": 2.175597159457715, "grad_norm": 0.6058814179639644, "learning_rate": 1.3874712265394984e-05, "loss": 0.9625, "step": 844 }, { "epoch": 2.1781794706262105, "grad_norm": 0.6048568085747608, "learning_rate": 1.3858070830679987e-05, "loss": 0.9325, "step": 845 }, { "epoch": 2.1807617817947063, "grad_norm": 0.6166529166864086, "learning_rate": 1.3841416833574696e-05, "loss": 0.9991, "step": 846 }, { "epoch": 2.183344092963202, "grad_norm": 0.5836884801008753, "learning_rate": 1.3824750328306747e-05, "loss": 0.9567, "step": 847 }, { "epoch": 2.1859264041316977, "grad_norm": 0.5952429990454414, "learning_rate": 1.3808071369144476e-05, "loss": 0.9244, "step": 848 }, { "epoch": 2.188508715300194, "grad_norm": 0.5857084084921026, "learning_rate": 1.37913800103968e-05, "loss": 0.9655, "step": 849 }, { "epoch": 2.1910910264686896, "grad_norm": 0.6279175507836195, "learning_rate": 1.3774676306412986e-05, "loss": 0.9323, "step": 850 }, { "epoch": 2.1936733376371853, "grad_norm": 0.5863735033805826, "learning_rate": 1.3757960311582518e-05, "loss": 0.961, "step": 851 }, { "epoch": 2.196255648805681, "grad_norm": 0.5793193685107874, "learning_rate": 1.3741232080334889e-05, "loss": 0.9417, "step": 852 }, { "epoch": 2.1988379599741767, "grad_norm": 0.5779370435007501, "learning_rate": 1.3724491667139437e-05, "loss": 0.9543, "step": 853 }, { "epoch": 2.2014202711426725, "grad_norm": 0.5898150549054328, "learning_rate": 1.3707739126505168e-05, "loss": 0.9751, "step": 854 }, { "epoch": 2.2040025823111686, "grad_norm": 0.5932733176039338, "learning_rate": 1.3690974512980577e-05, "loss": 0.9453, "step": 855 }, { "epoch": 2.2065848934796644, "grad_norm": 0.5905918422617804, "learning_rate": 1.3674197881153468e-05, "loss": 0.9361, "step": 856 }, { "epoch": 2.20916720464816, "grad_norm": 0.594528411021171, "learning_rate": 1.365740928565078e-05, "loss": 0.9781, "step": 857 }, { "epoch": 2.211749515816656, "grad_norm": 0.5872952131266409, "learning_rate": 1.3640608781138407e-05, "loss": 0.9479, "step": 858 }, { "epoch": 2.2143318269851515, "grad_norm": 0.5929574963797165, "learning_rate": 1.3623796422321018e-05, "loss": 0.9488, "step": 859 }, { "epoch": 2.2169141381536477, "grad_norm": 0.6100088602969217, "learning_rate": 1.3606972263941884e-05, "loss": 0.93, "step": 860 }, { "epoch": 2.2194964493221434, "grad_norm": 0.5757485969334069, "learning_rate": 1.3590136360782697e-05, "loss": 0.9167, "step": 861 }, { "epoch": 2.222078760490639, "grad_norm": 0.5880881759424176, "learning_rate": 1.3573288767663388e-05, "loss": 0.9831, "step": 862 }, { "epoch": 2.224661071659135, "grad_norm": 0.6101438672240849, "learning_rate": 1.3556429539441957e-05, "loss": 0.9425, "step": 863 }, { "epoch": 2.2272433828276306, "grad_norm": 0.6032144416691072, "learning_rate": 1.3539558731014285e-05, "loss": 0.956, "step": 864 }, { "epoch": 2.2298256939961267, "grad_norm": 0.5877358574038184, "learning_rate": 1.3522676397313963e-05, "loss": 0.9769, "step": 865 }, { "epoch": 2.2324080051646225, "grad_norm": 0.6037905375839121, "learning_rate": 1.3505782593312108e-05, "loss": 0.9577, "step": 866 }, { "epoch": 2.234990316333118, "grad_norm": 0.5826777668673346, "learning_rate": 1.3488877374017189e-05, "loss": 0.9514, "step": 867 }, { "epoch": 2.237572627501614, "grad_norm": 0.591593499195398, "learning_rate": 1.3471960794474837e-05, "loss": 0.9563, "step": 868 }, { "epoch": 2.2401549386701096, "grad_norm": 0.5972872893141782, "learning_rate": 1.345503290976768e-05, "loss": 0.9646, "step": 869 }, { "epoch": 2.242737249838606, "grad_norm": 0.5695814980462333, "learning_rate": 1.3438093775015157e-05, "loss": 0.9295, "step": 870 }, { "epoch": 2.2453195610071015, "grad_norm": 0.5950572680113415, "learning_rate": 1.342114344537334e-05, "loss": 0.9378, "step": 871 }, { "epoch": 2.2479018721755972, "grad_norm": 0.645911801845914, "learning_rate": 1.3404181976034743e-05, "loss": 0.9889, "step": 872 }, { "epoch": 2.250484183344093, "grad_norm": 0.5891952037473503, "learning_rate": 1.3387209422228164e-05, "loss": 0.9257, "step": 873 }, { "epoch": 2.2530664945125887, "grad_norm": 0.6101696680348054, "learning_rate": 1.3370225839218494e-05, "loss": 0.9387, "step": 874 }, { "epoch": 2.2556488056810844, "grad_norm": 0.6235755995527572, "learning_rate": 1.3353231282306521e-05, "loss": 0.9699, "step": 875 }, { "epoch": 2.2582311168495806, "grad_norm": 0.6032240561162692, "learning_rate": 1.3336225806828782e-05, "loss": 0.9256, "step": 876 }, { "epoch": 2.2608134280180763, "grad_norm": 0.6200539436633388, "learning_rate": 1.3319209468157362e-05, "loss": 0.977, "step": 877 }, { "epoch": 2.263395739186572, "grad_norm": 0.6357789919117319, "learning_rate": 1.3302182321699712e-05, "loss": 0.9589, "step": 878 }, { "epoch": 2.2659780503550677, "grad_norm": 0.6102482086269118, "learning_rate": 1.3285144422898486e-05, "loss": 0.9595, "step": 879 }, { "epoch": 2.2685603615235634, "grad_norm": 0.6310634925304537, "learning_rate": 1.3268095827231333e-05, "loss": 0.9406, "step": 880 }, { "epoch": 2.2711426726920596, "grad_norm": 0.6196741175987706, "learning_rate": 1.3251036590210751e-05, "loss": 0.9623, "step": 881 }, { "epoch": 2.2737249838605553, "grad_norm": 0.5865418451174635, "learning_rate": 1.323396676738387e-05, "loss": 0.9618, "step": 882 }, { "epoch": 2.276307295029051, "grad_norm": 0.5886489491664807, "learning_rate": 1.3216886414332304e-05, "loss": 0.9654, "step": 883 }, { "epoch": 2.2788896061975468, "grad_norm": 0.6269313692986308, "learning_rate": 1.319979558667194e-05, "loss": 0.9648, "step": 884 }, { "epoch": 2.2814719173660425, "grad_norm": 0.5950331112803471, "learning_rate": 1.3182694340052785e-05, "loss": 1.0065, "step": 885 }, { "epoch": 2.284054228534538, "grad_norm": 0.5868804806129319, "learning_rate": 1.3165582730158764e-05, "loss": 0.9425, "step": 886 }, { "epoch": 2.2866365397030344, "grad_norm": 0.585709126958065, "learning_rate": 1.3148460812707549e-05, "loss": 0.9866, "step": 887 }, { "epoch": 2.28921885087153, "grad_norm": 0.5943971591153827, "learning_rate": 1.3131328643450373e-05, "loss": 0.928, "step": 888 }, { "epoch": 2.291801162040026, "grad_norm": 0.6011485207920195, "learning_rate": 1.3114186278171855e-05, "loss": 0.9471, "step": 889 }, { "epoch": 2.2943834732085215, "grad_norm": 0.6202130154424499, "learning_rate": 1.3097033772689804e-05, "loss": 0.9555, "step": 890 }, { "epoch": 2.2969657843770173, "grad_norm": 0.601191279942045, "learning_rate": 1.3079871182855056e-05, "loss": 0.9763, "step": 891 }, { "epoch": 2.2995480955455134, "grad_norm": 0.6091424415493963, "learning_rate": 1.3062698564551277e-05, "loss": 0.9564, "step": 892 }, { "epoch": 2.302130406714009, "grad_norm": 0.6322044545300952, "learning_rate": 1.3045515973694793e-05, "loss": 0.9621, "step": 893 }, { "epoch": 2.304712717882505, "grad_norm": 0.593976781762648, "learning_rate": 1.3028323466234398e-05, "loss": 0.9352, "step": 894 }, { "epoch": 2.3072950290510006, "grad_norm": 0.6093135390414695, "learning_rate": 1.3011121098151177e-05, "loss": 0.9444, "step": 895 }, { "epoch": 2.3098773402194963, "grad_norm": 0.6081280945984243, "learning_rate": 1.2993908925458318e-05, "loss": 0.9019, "step": 896 }, { "epoch": 2.312459651387992, "grad_norm": 0.5965625320422764, "learning_rate": 1.2976687004200941e-05, "loss": 0.9504, "step": 897 }, { "epoch": 2.315041962556488, "grad_norm": 0.6136358258415586, "learning_rate": 1.2959455390455906e-05, "loss": 0.9598, "step": 898 }, { "epoch": 2.317624273724984, "grad_norm": 0.614066787514822, "learning_rate": 1.294221414033163e-05, "loss": 0.9151, "step": 899 }, { "epoch": 2.3202065848934796, "grad_norm": 0.595393393778215, "learning_rate": 1.2924963309967914e-05, "loss": 0.9383, "step": 900 }, { "epoch": 2.3227888960619754, "grad_norm": 0.6123276452590078, "learning_rate": 1.2907702955535744e-05, "loss": 0.9449, "step": 901 }, { "epoch": 2.325371207230471, "grad_norm": 0.6002189347008143, "learning_rate": 1.2890433133237129e-05, "loss": 0.9648, "step": 902 }, { "epoch": 2.3279535183989672, "grad_norm": 0.5948640736384636, "learning_rate": 1.2873153899304898e-05, "loss": 0.9654, "step": 903 }, { "epoch": 2.330535829567463, "grad_norm": 0.6253192331451701, "learning_rate": 1.2855865310002526e-05, "loss": 0.9459, "step": 904 }, { "epoch": 2.3331181407359587, "grad_norm": 0.6060085962717341, "learning_rate": 1.2838567421623957e-05, "loss": 0.9648, "step": 905 }, { "epoch": 2.3357004519044544, "grad_norm": 0.5909129536256885, "learning_rate": 1.2821260290493411e-05, "loss": 0.9615, "step": 906 }, { "epoch": 2.33828276307295, "grad_norm": 0.6033489652168267, "learning_rate": 1.2803943972965193e-05, "loss": 0.9822, "step": 907 }, { "epoch": 2.340865074241446, "grad_norm": 0.6471948077451358, "learning_rate": 1.278661852542354e-05, "loss": 0.9372, "step": 908 }, { "epoch": 2.343447385409942, "grad_norm": 0.5875321400886871, "learning_rate": 1.2769284004282398e-05, "loss": 0.9283, "step": 909 }, { "epoch": 2.3460296965784377, "grad_norm": 0.57397296055963, "learning_rate": 1.2751940465985273e-05, "loss": 0.9443, "step": 910 }, { "epoch": 2.3486120077469335, "grad_norm": 0.6083870147758043, "learning_rate": 1.2734587967005025e-05, "loss": 0.9911, "step": 911 }, { "epoch": 2.351194318915429, "grad_norm": 0.5893684173951856, "learning_rate": 1.2717226563843687e-05, "loss": 0.9775, "step": 912 }, { "epoch": 2.3537766300839253, "grad_norm": 0.6098963204737635, "learning_rate": 1.26998563130323e-05, "loss": 0.9352, "step": 913 }, { "epoch": 2.356358941252421, "grad_norm": 0.6028323564667681, "learning_rate": 1.268247727113069e-05, "loss": 0.9535, "step": 914 }, { "epoch": 2.358941252420917, "grad_norm": 0.6139836763290958, "learning_rate": 1.2665089494727338e-05, "loss": 0.9543, "step": 915 }, { "epoch": 2.3615235635894125, "grad_norm": 0.5979010266216653, "learning_rate": 1.2647693040439142e-05, "loss": 0.9584, "step": 916 }, { "epoch": 2.3641058747579082, "grad_norm": 0.6035572479241811, "learning_rate": 1.2630287964911261e-05, "loss": 0.958, "step": 917 }, { "epoch": 2.366688185926404, "grad_norm": 0.5830490108904467, "learning_rate": 1.2612874324816935e-05, "loss": 0.9492, "step": 918 }, { "epoch": 2.3692704970949, "grad_norm": 0.6049407013095448, "learning_rate": 1.2595452176857283e-05, "loss": 0.9215, "step": 919 }, { "epoch": 2.371852808263396, "grad_norm": 0.5853837977544576, "learning_rate": 1.2578021577761132e-05, "loss": 0.9397, "step": 920 }, { "epoch": 2.3744351194318916, "grad_norm": 0.6270536757002744, "learning_rate": 1.2560582584284822e-05, "loss": 0.9817, "step": 921 }, { "epoch": 2.3770174306003873, "grad_norm": 0.6353840335416789, "learning_rate": 1.2543135253212027e-05, "loss": 0.9559, "step": 922 }, { "epoch": 2.379599741768883, "grad_norm": 0.585963379760864, "learning_rate": 1.2525679641353571e-05, "loss": 0.9453, "step": 923 }, { "epoch": 2.382182052937379, "grad_norm": 0.5894211852291655, "learning_rate": 1.2508215805547246e-05, "loss": 0.9251, "step": 924 }, { "epoch": 2.384764364105875, "grad_norm": 0.572359971184135, "learning_rate": 1.2490743802657614e-05, "loss": 0.9564, "step": 925 }, { "epoch": 2.3873466752743706, "grad_norm": 0.5786820742067271, "learning_rate": 1.2473263689575835e-05, "loss": 0.9291, "step": 926 }, { "epoch": 2.3899289864428663, "grad_norm": 0.590281816939995, "learning_rate": 1.2455775523219472e-05, "loss": 0.9248, "step": 927 }, { "epoch": 2.392511297611362, "grad_norm": 0.5851632591490395, "learning_rate": 1.2438279360532317e-05, "loss": 0.9558, "step": 928 }, { "epoch": 2.3950936087798578, "grad_norm": 0.6073264082842632, "learning_rate": 1.2420775258484194e-05, "loss": 0.9152, "step": 929 }, { "epoch": 2.397675919948354, "grad_norm": 0.5961742171533062, "learning_rate": 1.2403263274070786e-05, "loss": 0.9614, "step": 930 }, { "epoch": 2.4002582311168497, "grad_norm": 0.6094671056323115, "learning_rate": 1.238574346431343e-05, "loss": 0.9478, "step": 931 }, { "epoch": 2.4028405422853454, "grad_norm": 0.6141679641874132, "learning_rate": 1.2368215886258952e-05, "loss": 0.9588, "step": 932 }, { "epoch": 2.405422853453841, "grad_norm": 0.5891888079618772, "learning_rate": 1.2350680596979474e-05, "loss": 0.9748, "step": 933 }, { "epoch": 2.408005164622337, "grad_norm": 0.6220233686120056, "learning_rate": 1.233313765357222e-05, "loss": 0.9547, "step": 934 }, { "epoch": 2.410587475790833, "grad_norm": 0.6111194107579635, "learning_rate": 1.2315587113159342e-05, "loss": 0.9374, "step": 935 }, { "epoch": 2.4131697869593287, "grad_norm": 0.5816987304224244, "learning_rate": 1.2298029032887725e-05, "loss": 0.9611, "step": 936 }, { "epoch": 2.4157520981278244, "grad_norm": 0.6103256535275182, "learning_rate": 1.228046346992881e-05, "loss": 0.9388, "step": 937 }, { "epoch": 2.41833440929632, "grad_norm": 0.5861255113568193, "learning_rate": 1.22628904814784e-05, "loss": 0.9582, "step": 938 }, { "epoch": 2.420916720464816, "grad_norm": 0.5823515712678948, "learning_rate": 1.224531012475647e-05, "loss": 0.9898, "step": 939 }, { "epoch": 2.4234990316333116, "grad_norm": 0.609671663934881, "learning_rate": 1.2227722457007e-05, "loss": 0.9596, "step": 940 }, { "epoch": 2.4260813428018078, "grad_norm": 0.5972948161325082, "learning_rate": 1.221012753549776e-05, "loss": 0.9955, "step": 941 }, { "epoch": 2.4286636539703035, "grad_norm": 0.5879925977172995, "learning_rate": 1.2192525417520159e-05, "loss": 0.9615, "step": 942 }, { "epoch": 2.431245965138799, "grad_norm": 0.6075551488590047, "learning_rate": 1.2174916160389024e-05, "loss": 0.9572, "step": 943 }, { "epoch": 2.433828276307295, "grad_norm": 0.6113872256428539, "learning_rate": 1.2157299821442424e-05, "loss": 0.9671, "step": 944 }, { "epoch": 2.4364105874757906, "grad_norm": 0.5838911691926075, "learning_rate": 1.2139676458041505e-05, "loss": 0.9352, "step": 945 }, { "epoch": 2.438992898644287, "grad_norm": 0.604879771295695, "learning_rate": 1.2122046127570268e-05, "loss": 0.9541, "step": 946 }, { "epoch": 2.4415752098127825, "grad_norm": 0.6008885632399309, "learning_rate": 1.2104408887435413e-05, "loss": 0.9633, "step": 947 }, { "epoch": 2.4441575209812783, "grad_norm": 0.5834385132140035, "learning_rate": 1.2086764795066128e-05, "loss": 0.9455, "step": 948 }, { "epoch": 2.446739832149774, "grad_norm": 0.6092567677261247, "learning_rate": 1.2069113907913921e-05, "loss": 0.9564, "step": 949 }, { "epoch": 2.4493221433182697, "grad_norm": 0.5650318694461209, "learning_rate": 1.2051456283452423e-05, "loss": 0.97, "step": 950 }, { "epoch": 2.4519044544867654, "grad_norm": 0.608288549791379, "learning_rate": 1.2033791979177196e-05, "loss": 0.9628, "step": 951 }, { "epoch": 2.4544867656552616, "grad_norm": 0.6033407862962766, "learning_rate": 1.2016121052605558e-05, "loss": 0.9565, "step": 952 }, { "epoch": 2.4570690768237573, "grad_norm": 0.6028336342782669, "learning_rate": 1.1998443561276395e-05, "loss": 0.9829, "step": 953 }, { "epoch": 2.459651387992253, "grad_norm": 0.584653200324165, "learning_rate": 1.1980759562749957e-05, "loss": 0.9566, "step": 954 }, { "epoch": 2.4622336991607487, "grad_norm": 0.6030118438156815, "learning_rate": 1.1963069114607692e-05, "loss": 0.9306, "step": 955 }, { "epoch": 2.464816010329245, "grad_norm": 0.598687643898121, "learning_rate": 1.1945372274452045e-05, "loss": 0.9717, "step": 956 }, { "epoch": 2.4673983214977406, "grad_norm": 0.6007026870754814, "learning_rate": 1.1927669099906274e-05, "loss": 0.9483, "step": 957 }, { "epoch": 2.4699806326662364, "grad_norm": 0.5841035235550123, "learning_rate": 1.1909959648614262e-05, "loss": 0.9888, "step": 958 }, { "epoch": 2.472562943834732, "grad_norm": 0.5950731809881308, "learning_rate": 1.1892243978240332e-05, "loss": 0.9442, "step": 959 }, { "epoch": 2.475145255003228, "grad_norm": 0.6073950825590259, "learning_rate": 1.1874522146469056e-05, "loss": 0.9607, "step": 960 }, { "epoch": 2.4777275661717235, "grad_norm": 0.5917705341695404, "learning_rate": 1.1856794211005069e-05, "loss": 0.9288, "step": 961 }, { "epoch": 2.4803098773402197, "grad_norm": 0.5839083243509722, "learning_rate": 1.183906022957288e-05, "loss": 0.9676, "step": 962 }, { "epoch": 2.4828921885087154, "grad_norm": 0.597493851436026, "learning_rate": 1.182132025991669e-05, "loss": 0.9598, "step": 963 }, { "epoch": 2.485474499677211, "grad_norm": 0.5765315620556862, "learning_rate": 1.1803574359800179e-05, "loss": 0.9744, "step": 964 }, { "epoch": 2.488056810845707, "grad_norm": 0.5912961838573095, "learning_rate": 1.1785822587006362e-05, "loss": 0.9847, "step": 965 }, { "epoch": 2.4906391220142026, "grad_norm": 0.5816475691671312, "learning_rate": 1.1768064999337364e-05, "loss": 0.9411, "step": 966 }, { "epoch": 2.4932214331826987, "grad_norm": 0.5846058314378276, "learning_rate": 1.1750301654614242e-05, "loss": 0.9693, "step": 967 }, { "epoch": 2.4958037443511945, "grad_norm": 0.5830471307870174, "learning_rate": 1.1732532610676808e-05, "loss": 0.9354, "step": 968 }, { "epoch": 2.49838605551969, "grad_norm": 0.5836564756949956, "learning_rate": 1.1714757925383418e-05, "loss": 0.9617, "step": 969 }, { "epoch": 2.500968366688186, "grad_norm": 0.604259375943389, "learning_rate": 1.1696977656610813e-05, "loss": 0.9519, "step": 970 }, { "epoch": 2.5035506778566816, "grad_norm": 0.5742797021433684, "learning_rate": 1.1679191862253898e-05, "loss": 0.9547, "step": 971 }, { "epoch": 2.5061329890251773, "grad_norm": 0.5746553555926329, "learning_rate": 1.1661400600225588e-05, "loss": 0.9564, "step": 972 }, { "epoch": 2.5087153001936735, "grad_norm": 0.5956191491364381, "learning_rate": 1.1643603928456581e-05, "loss": 0.9315, "step": 973 }, { "epoch": 2.5112976113621692, "grad_norm": 0.5972863649697912, "learning_rate": 1.1625801904895207e-05, "loss": 0.9828, "step": 974 }, { "epoch": 2.513879922530665, "grad_norm": 0.628215407427667, "learning_rate": 1.1607994587507216e-05, "loss": 0.9791, "step": 975 }, { "epoch": 2.5164622336991607, "grad_norm": 0.5794209025299315, "learning_rate": 1.1590182034275588e-05, "loss": 0.9765, "step": 976 }, { "epoch": 2.5190445448676564, "grad_norm": 0.5995540341976862, "learning_rate": 1.157236430320037e-05, "loss": 0.9425, "step": 977 }, { "epoch": 2.5216268560361526, "grad_norm": 0.589687726091925, "learning_rate": 1.155454145229845e-05, "loss": 0.9269, "step": 978 }, { "epoch": 2.5242091672046483, "grad_norm": 0.5903300924561746, "learning_rate": 1.1536713539603392e-05, "loss": 0.9515, "step": 979 }, { "epoch": 2.526791478373144, "grad_norm": 0.5926698140037857, "learning_rate": 1.1518880623165249e-05, "loss": 0.9613, "step": 980 }, { "epoch": 2.5293737895416397, "grad_norm": 0.5882141138215461, "learning_rate": 1.1501042761050359e-05, "loss": 0.9646, "step": 981 }, { "epoch": 2.5319561007101354, "grad_norm": 0.5907114287701524, "learning_rate": 1.1483200011341172e-05, "loss": 0.9502, "step": 982 }, { "epoch": 2.534538411878631, "grad_norm": 0.5796881924318279, "learning_rate": 1.1465352432136041e-05, "loss": 0.9337, "step": 983 }, { "epoch": 2.5371207230471273, "grad_norm": 0.5738929122712656, "learning_rate": 1.1447500081549054e-05, "loss": 0.9405, "step": 984 }, { "epoch": 2.539703034215623, "grad_norm": 0.580328116392153, "learning_rate": 1.1429643017709833e-05, "loss": 0.9539, "step": 985 }, { "epoch": 2.5422853453841188, "grad_norm": 0.5881438247765939, "learning_rate": 1.1411781298763343e-05, "loss": 0.9313, "step": 986 }, { "epoch": 2.5448676565526145, "grad_norm": 0.5885562040781032, "learning_rate": 1.1393914982869711e-05, "loss": 0.9425, "step": 987 }, { "epoch": 2.5474499677211107, "grad_norm": 0.580594597098575, "learning_rate": 1.1376044128204033e-05, "loss": 0.9391, "step": 988 }, { "epoch": 2.5500322788896064, "grad_norm": 0.5952429990647207, "learning_rate": 1.1358168792956178e-05, "loss": 0.9504, "step": 989 }, { "epoch": 2.552614590058102, "grad_norm": 0.5970710014238076, "learning_rate": 1.1340289035330614e-05, "loss": 0.9878, "step": 990 }, { "epoch": 2.555196901226598, "grad_norm": 0.6152662315238809, "learning_rate": 1.1322404913546197e-05, "loss": 0.9465, "step": 991 }, { "epoch": 2.5577792123950935, "grad_norm": 0.6027222855714028, "learning_rate": 1.1304516485836002e-05, "loss": 0.971, "step": 992 }, { "epoch": 2.5603615235635893, "grad_norm": 0.5918016876334783, "learning_rate": 1.1286623810447122e-05, "loss": 0.9652, "step": 993 }, { "epoch": 2.562943834732085, "grad_norm": 0.5896636574102831, "learning_rate": 1.1268726945640483e-05, "loss": 0.9372, "step": 994 }, { "epoch": 2.565526145900581, "grad_norm": 0.5824587400275631, "learning_rate": 1.125082594969065e-05, "loss": 0.9529, "step": 995 }, { "epoch": 2.568108457069077, "grad_norm": 0.5697833980293927, "learning_rate": 1.1232920880885632e-05, "loss": 0.9554, "step": 996 }, { "epoch": 2.5706907682375726, "grad_norm": 0.5801306805314953, "learning_rate": 1.1215011797526716e-05, "loss": 0.9268, "step": 997 }, { "epoch": 2.5732730794060683, "grad_norm": 0.5965814716018379, "learning_rate": 1.119709875792825e-05, "loss": 0.962, "step": 998 }, { "epoch": 2.5758553905745645, "grad_norm": 0.6086711337973163, "learning_rate": 1.1179181820417469e-05, "loss": 0.97, "step": 999 }, { "epoch": 2.57843770174306, "grad_norm": 0.5785369755423095, "learning_rate": 1.1161261043334296e-05, "loss": 0.9495, "step": 1000 }, { "epoch": 2.581020012911556, "grad_norm": 0.608023719014441, "learning_rate": 1.1143336485031156e-05, "loss": 0.9165, "step": 1001 }, { "epoch": 2.5836023240800516, "grad_norm": 0.6320332520260791, "learning_rate": 1.1125408203872793e-05, "loss": 1.0028, "step": 1002 }, { "epoch": 2.5861846352485474, "grad_norm": 0.5833673102474324, "learning_rate": 1.1107476258236059e-05, "loss": 0.942, "step": 1003 }, { "epoch": 2.588766946417043, "grad_norm": 0.5888791372130312, "learning_rate": 1.1089540706509757e-05, "loss": 0.9548, "step": 1004 }, { "epoch": 2.591349257585539, "grad_norm": 0.5862228558392754, "learning_rate": 1.1071601607094416e-05, "loss": 0.9096, "step": 1005 }, { "epoch": 2.593931568754035, "grad_norm": 0.6178780038575998, "learning_rate": 1.1053659018402123e-05, "loss": 0.9539, "step": 1006 }, { "epoch": 2.5965138799225307, "grad_norm": 0.6227035958216502, "learning_rate": 1.1035712998856332e-05, "loss": 0.9845, "step": 1007 }, { "epoch": 2.5990961910910264, "grad_norm": 0.585793574816453, "learning_rate": 1.1017763606891653e-05, "loss": 0.9564, "step": 1008 }, { "epoch": 2.601678502259522, "grad_norm": 0.6031850388726575, "learning_rate": 1.0999810900953701e-05, "loss": 0.966, "step": 1009 }, { "epoch": 2.6042608134280183, "grad_norm": 0.6325995476999388, "learning_rate": 1.0981854939498853e-05, "loss": 0.934, "step": 1010 }, { "epoch": 2.606843124596514, "grad_norm": 0.604370954178913, "learning_rate": 1.0963895780994106e-05, "loss": 0.962, "step": 1011 }, { "epoch": 2.6094254357650097, "grad_norm": 0.6046507204858135, "learning_rate": 1.0945933483916867e-05, "loss": 0.9628, "step": 1012 }, { "epoch": 2.6120077469335055, "grad_norm": 0.6055958607582257, "learning_rate": 1.0927968106754747e-05, "loss": 0.9724, "step": 1013 }, { "epoch": 2.614590058102001, "grad_norm": 0.6142519834748665, "learning_rate": 1.0909999708005407e-05, "loss": 0.9859, "step": 1014 }, { "epoch": 2.617172369270497, "grad_norm": 0.587585188897923, "learning_rate": 1.0892028346176333e-05, "loss": 0.9337, "step": 1015 }, { "epoch": 2.6197546804389926, "grad_norm": 0.5775553775383109, "learning_rate": 1.087405407978466e-05, "loss": 0.9247, "step": 1016 }, { "epoch": 2.622336991607489, "grad_norm": 0.5914086152693361, "learning_rate": 1.0856076967356983e-05, "loss": 0.9646, "step": 1017 }, { "epoch": 2.6249193027759845, "grad_norm": 0.6252845963452488, "learning_rate": 1.0838097067429168e-05, "loss": 0.9783, "step": 1018 }, { "epoch": 2.6275016139444802, "grad_norm": 0.5861511527646114, "learning_rate": 1.0820114438546152e-05, "loss": 0.9621, "step": 1019 }, { "epoch": 2.630083925112976, "grad_norm": 0.5836312295046293, "learning_rate": 1.080212913926176e-05, "loss": 0.9554, "step": 1020 }, { "epoch": 2.632666236281472, "grad_norm": 0.6040474003003209, "learning_rate": 1.0784141228138507e-05, "loss": 0.9516, "step": 1021 }, { "epoch": 2.635248547449968, "grad_norm": 0.6355202880988752, "learning_rate": 1.0766150763747423e-05, "loss": 0.9789, "step": 1022 }, { "epoch": 2.6378308586184636, "grad_norm": 0.5897510462672635, "learning_rate": 1.0748157804667844e-05, "loss": 0.9374, "step": 1023 }, { "epoch": 2.6404131697869593, "grad_norm": 0.5899429810230572, "learning_rate": 1.0730162409487233e-05, "loss": 0.9329, "step": 1024 }, { "epoch": 2.642995480955455, "grad_norm": 0.6030569126093994, "learning_rate": 1.071216463680098e-05, "loss": 0.9662, "step": 1025 }, { "epoch": 2.6455777921239507, "grad_norm": 0.5981188227832869, "learning_rate": 1.069416454521222e-05, "loss": 0.9753, "step": 1026 }, { "epoch": 2.648160103292447, "grad_norm": 0.618406401340536, "learning_rate": 1.0676162193331642e-05, "loss": 0.9729, "step": 1027 }, { "epoch": 2.6507424144609426, "grad_norm": 0.5945181324122579, "learning_rate": 1.0658157639777285e-05, "loss": 0.9296, "step": 1028 }, { "epoch": 2.6533247256294383, "grad_norm": 0.621876814177428, "learning_rate": 1.0640150943174368e-05, "loss": 0.9628, "step": 1029 }, { "epoch": 2.655907036797934, "grad_norm": 0.5872555607480314, "learning_rate": 1.0622142162155084e-05, "loss": 0.9647, "step": 1030 }, { "epoch": 2.65848934796643, "grad_norm": 0.6016180713767454, "learning_rate": 1.060413135535841e-05, "loss": 0.9489, "step": 1031 }, { "epoch": 2.661071659134926, "grad_norm": 0.5963657410420156, "learning_rate": 1.0586118581429923e-05, "loss": 0.9476, "step": 1032 }, { "epoch": 2.6636539703034217, "grad_norm": 0.5814763983307615, "learning_rate": 1.05681038990216e-05, "loss": 0.9463, "step": 1033 }, { "epoch": 2.6662362814719174, "grad_norm": 0.5725192948619975, "learning_rate": 1.0550087366791641e-05, "loss": 0.9804, "step": 1034 }, { "epoch": 2.668818592640413, "grad_norm": 0.5916916107783017, "learning_rate": 1.053206904340426e-05, "loss": 0.9629, "step": 1035 }, { "epoch": 2.671400903808909, "grad_norm": 0.5904165915891584, "learning_rate": 1.0514048987529515e-05, "loss": 0.9579, "step": 1036 }, { "epoch": 2.6739832149774045, "grad_norm": 0.5914405056148352, "learning_rate": 1.0496027257843088e-05, "loss": 0.9807, "step": 1037 }, { "epoch": 2.6765655261459007, "grad_norm": 0.5846745644240308, "learning_rate": 1.0478003913026125e-05, "loss": 0.9679, "step": 1038 }, { "epoch": 2.6791478373143964, "grad_norm": 0.6002766375251781, "learning_rate": 1.045997901176503e-05, "loss": 0.971, "step": 1039 }, { "epoch": 2.681730148482892, "grad_norm": 0.5847650891279706, "learning_rate": 1.0441952612751267e-05, "loss": 0.9627, "step": 1040 }, { "epoch": 2.684312459651388, "grad_norm": 0.6040931012169604, "learning_rate": 1.0423924774681186e-05, "loss": 0.9503, "step": 1041 }, { "epoch": 2.686894770819884, "grad_norm": 0.5785542819032363, "learning_rate": 1.0405895556255818e-05, "loss": 0.9559, "step": 1042 }, { "epoch": 2.6894770819883798, "grad_norm": 0.6052229883487668, "learning_rate": 1.0387865016180688e-05, "loss": 0.9622, "step": 1043 }, { "epoch": 2.6920593931568755, "grad_norm": 0.5848263105245827, "learning_rate": 1.0369833213165625e-05, "loss": 0.9598, "step": 1044 }, { "epoch": 2.694641704325371, "grad_norm": 0.5926309991366325, "learning_rate": 1.035180020592457e-05, "loss": 0.9372, "step": 1045 }, { "epoch": 2.697224015493867, "grad_norm": 0.5844049554145337, "learning_rate": 1.0333766053175391e-05, "loss": 0.9439, "step": 1046 }, { "epoch": 2.6998063266623626, "grad_norm": 0.6001743480120659, "learning_rate": 1.031573081363968e-05, "loss": 0.9346, "step": 1047 }, { "epoch": 2.7023886378308584, "grad_norm": 0.5897380533051093, "learning_rate": 1.0297694546042563e-05, "loss": 0.9604, "step": 1048 }, { "epoch": 2.7049709489993545, "grad_norm": 0.584956431101729, "learning_rate": 1.0279657309112526e-05, "loss": 0.9045, "step": 1049 }, { "epoch": 2.7075532601678503, "grad_norm": 0.5712935010828868, "learning_rate": 1.02616191615812e-05, "loss": 0.9466, "step": 1050 }, { "epoch": 2.710135571336346, "grad_norm": 0.583381386123002, "learning_rate": 1.0243580162183189e-05, "loss": 0.9838, "step": 1051 }, { "epoch": 2.7127178825048417, "grad_norm": 0.5846652612272821, "learning_rate": 1.0225540369655866e-05, "loss": 0.9751, "step": 1052 }, { "epoch": 2.715300193673338, "grad_norm": 0.5978067742385131, "learning_rate": 1.0207499842739185e-05, "loss": 0.9625, "step": 1053 }, { "epoch": 2.7178825048418336, "grad_norm": 0.5853977002645502, "learning_rate": 1.01894586401755e-05, "loss": 0.9614, "step": 1054 }, { "epoch": 2.7204648160103293, "grad_norm": 0.5983002966741684, "learning_rate": 1.0171416820709356e-05, "loss": 0.9373, "step": 1055 }, { "epoch": 2.723047127178825, "grad_norm": 0.5856993759606652, "learning_rate": 1.015337444308731e-05, "loss": 0.9489, "step": 1056 }, { "epoch": 2.7256294383473207, "grad_norm": 0.5901281403453162, "learning_rate": 1.0135331566057735e-05, "loss": 0.9332, "step": 1057 }, { "epoch": 2.7282117495158165, "grad_norm": 0.5906660579573058, "learning_rate": 1.0117288248370636e-05, "loss": 0.9609, "step": 1058 }, { "epoch": 2.730794060684312, "grad_norm": 0.6062946865104221, "learning_rate": 1.0099244548777444e-05, "loss": 0.9372, "step": 1059 }, { "epoch": 2.7333763718528084, "grad_norm": 0.6025103390237757, "learning_rate": 1.008120052603084e-05, "loss": 0.9325, "step": 1060 }, { "epoch": 2.735958683021304, "grad_norm": 0.6037740140636985, "learning_rate": 1.006315623888455e-05, "loss": 0.9407, "step": 1061 }, { "epoch": 2.7385409941898, "grad_norm": 0.5818379563267816, "learning_rate": 1.0045111746093174e-05, "loss": 0.9565, "step": 1062 }, { "epoch": 2.7411233053582955, "grad_norm": 0.5972098469584126, "learning_rate": 1.0027067106411969e-05, "loss": 0.9559, "step": 1063 }, { "epoch": 2.7437056165267917, "grad_norm": 0.5921309288084705, "learning_rate": 1.000902237859668e-05, "loss": 0.9267, "step": 1064 }, { "epoch": 2.7462879276952874, "grad_norm": 0.5858852838442818, "learning_rate": 9.990977621403326e-06, "loss": 0.9778, "step": 1065 }, { "epoch": 2.748870238863783, "grad_norm": 0.5887566802759674, "learning_rate": 9.972932893588033e-06, "loss": 0.9054, "step": 1066 }, { "epoch": 2.751452550032279, "grad_norm": 0.5706187383084692, "learning_rate": 9.954888253906827e-06, "loss": 0.9482, "step": 1067 }, { "epoch": 2.7540348612007746, "grad_norm": 0.5737416712225011, "learning_rate": 9.936843761115448e-06, "loss": 0.9313, "step": 1068 }, { "epoch": 2.7566171723692703, "grad_norm": 0.5618668457848085, "learning_rate": 9.918799473969162e-06, "loss": 0.9268, "step": 1069 }, { "epoch": 2.7591994835377665, "grad_norm": 0.5945215622528138, "learning_rate": 9.90075545122256e-06, "loss": 0.9708, "step": 1070 }, { "epoch": 2.761781794706262, "grad_norm": 0.5965929940159351, "learning_rate": 9.882711751629368e-06, "loss": 0.9618, "step": 1071 }, { "epoch": 2.764364105874758, "grad_norm": 0.6238969650308814, "learning_rate": 9.864668433942266e-06, "loss": 0.9206, "step": 1072 }, { "epoch": 2.7669464170432536, "grad_norm": 0.561902457075373, "learning_rate": 9.84662555691269e-06, "loss": 0.9762, "step": 1073 }, { "epoch": 2.76952872821175, "grad_norm": 0.6148930289646558, "learning_rate": 9.828583179290645e-06, "loss": 0.9293, "step": 1074 }, { "epoch": 2.7721110393802455, "grad_norm": 0.5816613771287756, "learning_rate": 9.810541359824501e-06, "loss": 0.9591, "step": 1075 }, { "epoch": 2.774693350548741, "grad_norm": 0.6121639894598173, "learning_rate": 9.792500157260816e-06, "loss": 0.9727, "step": 1076 }, { "epoch": 2.777275661717237, "grad_norm": 0.5738661064344951, "learning_rate": 9.774459630344137e-06, "loss": 0.9067, "step": 1077 }, { "epoch": 2.7798579728857327, "grad_norm": 0.5745834880727902, "learning_rate": 9.756419837816811e-06, "loss": 0.9283, "step": 1078 }, { "epoch": 2.7824402840542284, "grad_norm": 0.6019753697435574, "learning_rate": 9.738380838418804e-06, "loss": 0.9414, "step": 1079 }, { "epoch": 2.785022595222724, "grad_norm": 0.5919495172527766, "learning_rate": 9.720342690887477e-06, "loss": 0.9464, "step": 1080 }, { "epoch": 2.7876049063912203, "grad_norm": 0.5868873130752621, "learning_rate": 9.702305453957439e-06, "loss": 0.9589, "step": 1081 }, { "epoch": 2.790187217559716, "grad_norm": 0.5951626550396919, "learning_rate": 9.684269186360325e-06, "loss": 0.9559, "step": 1082 }, { "epoch": 2.7927695287282117, "grad_norm": 0.6004222716865213, "learning_rate": 9.666233946824612e-06, "loss": 0.9812, "step": 1083 }, { "epoch": 2.7953518398967074, "grad_norm": 0.5696400324744211, "learning_rate": 9.648199794075433e-06, "loss": 0.9503, "step": 1084 }, { "epoch": 2.7979341510652036, "grad_norm": 0.5743980783747284, "learning_rate": 9.630166786834378e-06, "loss": 0.935, "step": 1085 }, { "epoch": 2.8005164622336993, "grad_norm": 0.5751575788089159, "learning_rate": 9.612134983819316e-06, "loss": 0.9294, "step": 1086 }, { "epoch": 2.803098773402195, "grad_norm": 0.5735290035862011, "learning_rate": 9.594104443744184e-06, "loss": 0.9326, "step": 1087 }, { "epoch": 2.8056810845706908, "grad_norm": 0.5850179347042352, "learning_rate": 9.576075225318817e-06, "loss": 0.9489, "step": 1088 }, { "epoch": 2.8082633957391865, "grad_norm": 0.5751681929532767, "learning_rate": 9.558047387248736e-06, "loss": 0.933, "step": 1089 }, { "epoch": 2.810845706907682, "grad_norm": 0.5767517016594284, "learning_rate": 9.540020988234972e-06, "loss": 0.9688, "step": 1090 }, { "epoch": 2.813428018076178, "grad_norm": 0.5834104537340995, "learning_rate": 9.521996086973877e-06, "loss": 1.0005, "step": 1091 }, { "epoch": 2.816010329244674, "grad_norm": 0.6121890401561288, "learning_rate": 9.503972742156917e-06, "loss": 0.9683, "step": 1092 }, { "epoch": 2.81859264041317, "grad_norm": 0.5780502941320363, "learning_rate": 9.485951012470491e-06, "loss": 0.9651, "step": 1093 }, { "epoch": 2.8211749515816655, "grad_norm": 0.5872895683685423, "learning_rate": 9.467930956595742e-06, "loss": 0.9497, "step": 1094 }, { "epoch": 2.8237572627501613, "grad_norm": 0.6012252860645063, "learning_rate": 9.449912633208362e-06, "loss": 0.962, "step": 1095 }, { "epoch": 2.8263395739186574, "grad_norm": 0.5812288173466004, "learning_rate": 9.431896100978402e-06, "loss": 0.9516, "step": 1096 }, { "epoch": 2.828921885087153, "grad_norm": 0.5903667057899601, "learning_rate": 9.413881418570082e-06, "loss": 0.933, "step": 1097 }, { "epoch": 2.831504196255649, "grad_norm": 0.574567344834327, "learning_rate": 9.395868644641594e-06, "loss": 0.9311, "step": 1098 }, { "epoch": 2.8340865074241446, "grad_norm": 0.6029860483322287, "learning_rate": 9.37785783784492e-06, "loss": 0.9365, "step": 1099 }, { "epoch": 2.8366688185926403, "grad_norm": 0.5885522147718864, "learning_rate": 9.359849056825632e-06, "loss": 0.9375, "step": 1100 }, { "epoch": 2.839251129761136, "grad_norm": 0.5897382549514845, "learning_rate": 9.341842360222717e-06, "loss": 0.9568, "step": 1101 }, { "epoch": 2.8418334409296317, "grad_norm": 0.5970771697415443, "learning_rate": 9.323837806668363e-06, "loss": 0.9544, "step": 1102 }, { "epoch": 2.844415752098128, "grad_norm": 0.5998731951808198, "learning_rate": 9.305835454787784e-06, "loss": 0.9668, "step": 1103 }, { "epoch": 2.8469980632666236, "grad_norm": 0.5820942005741839, "learning_rate": 9.287835363199026e-06, "loss": 0.9552, "step": 1104 }, { "epoch": 2.8495803744351194, "grad_norm": 0.6084126869227644, "learning_rate": 9.269837590512768e-06, "loss": 0.9628, "step": 1105 }, { "epoch": 2.852162685603615, "grad_norm": 0.66554698305709, "learning_rate": 9.25184219533216e-06, "loss": 0.9367, "step": 1106 }, { "epoch": 2.8547449967721112, "grad_norm": 0.5807306091688449, "learning_rate": 9.23384923625258e-06, "loss": 0.9692, "step": 1107 }, { "epoch": 2.857327307940607, "grad_norm": 0.5837765762229058, "learning_rate": 9.215858771861495e-06, "loss": 0.9355, "step": 1108 }, { "epoch": 2.8599096191091027, "grad_norm": 0.6108951996108233, "learning_rate": 9.197870860738245e-06, "loss": 0.9618, "step": 1109 }, { "epoch": 2.8624919302775984, "grad_norm": 0.6085207165359778, "learning_rate": 9.17988556145385e-06, "loss": 0.9749, "step": 1110 }, { "epoch": 2.865074241446094, "grad_norm": 0.6014777439249565, "learning_rate": 9.161902932570837e-06, "loss": 0.9419, "step": 1111 }, { "epoch": 2.86765655261459, "grad_norm": 0.5740295354518736, "learning_rate": 9.143923032643019e-06, "loss": 0.9325, "step": 1112 }, { "epoch": 2.870238863783086, "grad_norm": 0.5824503091707712, "learning_rate": 9.125945920215344e-06, "loss": 0.9624, "step": 1113 }, { "epoch": 2.8728211749515817, "grad_norm": 0.5881119183147646, "learning_rate": 9.10797165382367e-06, "loss": 0.9604, "step": 1114 }, { "epoch": 2.8754034861200775, "grad_norm": 0.591602227679226, "learning_rate": 9.090000291994596e-06, "loss": 0.9522, "step": 1115 }, { "epoch": 2.877985797288573, "grad_norm": 0.5894398262140761, "learning_rate": 9.072031893245256e-06, "loss": 0.9447, "step": 1116 }, { "epoch": 2.8805681084570693, "grad_norm": 0.5843901076209989, "learning_rate": 9.054066516083138e-06, "loss": 0.9651, "step": 1117 }, { "epoch": 2.883150419625565, "grad_norm": 0.5830155470269734, "learning_rate": 9.036104219005895e-06, "loss": 0.9391, "step": 1118 }, { "epoch": 2.885732730794061, "grad_norm": 0.5795525849711025, "learning_rate": 9.018145060501152e-06, "loss": 0.9522, "step": 1119 }, { "epoch": 2.8883150419625565, "grad_norm": 0.5722538427227781, "learning_rate": 9.000189099046306e-06, "loss": 0.9652, "step": 1120 }, { "epoch": 2.8908973531310522, "grad_norm": 0.5834430509021916, "learning_rate": 8.982236393108349e-06, "loss": 0.9573, "step": 1121 }, { "epoch": 2.893479664299548, "grad_norm": 0.5834678810709014, "learning_rate": 8.964287001143672e-06, "loss": 0.9901, "step": 1122 }, { "epoch": 2.8960619754680437, "grad_norm": 0.6148204310068593, "learning_rate": 8.946340981597879e-06, "loss": 0.9392, "step": 1123 }, { "epoch": 2.89864428663654, "grad_norm": 0.5823218513706327, "learning_rate": 8.92839839290559e-06, "loss": 0.9595, "step": 1124 }, { "epoch": 2.9012265978050356, "grad_norm": 0.5970215090631561, "learning_rate": 8.910459293490248e-06, "loss": 0.9334, "step": 1125 }, { "epoch": 2.9038089089735313, "grad_norm": 0.5998590562400262, "learning_rate": 8.892523741763945e-06, "loss": 0.9442, "step": 1126 }, { "epoch": 2.906391220142027, "grad_norm": 0.6101328337460503, "learning_rate": 8.874591796127213e-06, "loss": 0.9584, "step": 1127 }, { "epoch": 2.908973531310523, "grad_norm": 0.6129595613398248, "learning_rate": 8.856663514968845e-06, "loss": 0.9524, "step": 1128 }, { "epoch": 2.911555842479019, "grad_norm": 0.603793328877133, "learning_rate": 8.838738956665709e-06, "loss": 0.9197, "step": 1129 }, { "epoch": 2.9141381536475146, "grad_norm": 0.5909017292382529, "learning_rate": 8.820818179582533e-06, "loss": 0.9405, "step": 1130 }, { "epoch": 2.9167204648160103, "grad_norm": 0.5831175832584677, "learning_rate": 8.802901242071751e-06, "loss": 0.9346, "step": 1131 }, { "epoch": 2.919302775984506, "grad_norm": 0.5837762849349555, "learning_rate": 8.784988202473284e-06, "loss": 0.9333, "step": 1132 }, { "epoch": 2.9218850871530018, "grad_norm": 0.5848385359972617, "learning_rate": 8.76707911911437e-06, "loss": 0.9551, "step": 1133 }, { "epoch": 2.9244673983214975, "grad_norm": 0.6253939331925262, "learning_rate": 8.749174050309357e-06, "loss": 0.9813, "step": 1134 }, { "epoch": 2.9270497094899937, "grad_norm": 0.5845874358674058, "learning_rate": 8.73127305435952e-06, "loss": 0.9567, "step": 1135 }, { "epoch": 2.9296320206584894, "grad_norm": 0.5953343497217751, "learning_rate": 8.71337618955288e-06, "loss": 0.9953, "step": 1136 }, { "epoch": 2.932214331826985, "grad_norm": 0.6030060364823723, "learning_rate": 8.695483514163998e-06, "loss": 0.9455, "step": 1137 }, { "epoch": 2.934796642995481, "grad_norm": 0.5854700296562423, "learning_rate": 8.677595086453808e-06, "loss": 0.9408, "step": 1138 }, { "epoch": 2.937378954163977, "grad_norm": 0.5891938385206548, "learning_rate": 8.65971096466939e-06, "loss": 0.9547, "step": 1139 }, { "epoch": 2.9399612653324727, "grad_norm": 0.6001369316354844, "learning_rate": 8.641831207043823e-06, "loss": 0.9686, "step": 1140 }, { "epoch": 2.9425435765009684, "grad_norm": 0.5815314396468559, "learning_rate": 8.62395587179597e-06, "loss": 0.9582, "step": 1141 }, { "epoch": 2.945125887669464, "grad_norm": 0.5918140434178532, "learning_rate": 8.606085017130289e-06, "loss": 0.9825, "step": 1142 }, { "epoch": 2.94770819883796, "grad_norm": 0.5892397810416701, "learning_rate": 8.588218701236662e-06, "loss": 0.944, "step": 1143 }, { "epoch": 2.9502905100064556, "grad_norm": 0.5832582968436837, "learning_rate": 8.570356982290172e-06, "loss": 0.9375, "step": 1144 }, { "epoch": 2.9528728211749513, "grad_norm": 0.6173280182644898, "learning_rate": 8.552499918450949e-06, "loss": 0.9782, "step": 1145 }, { "epoch": 2.9554551323434475, "grad_norm": 0.584992585761828, "learning_rate": 8.534647567863962e-06, "loss": 0.9657, "step": 1146 }, { "epoch": 2.958037443511943, "grad_norm": 0.6102553737687162, "learning_rate": 8.516799988658833e-06, "loss": 0.9371, "step": 1147 }, { "epoch": 2.960619754680439, "grad_norm": 0.5813839505299179, "learning_rate": 8.498957238949645e-06, "loss": 0.9702, "step": 1148 }, { "epoch": 2.9632020658489346, "grad_norm": 0.5873347150678367, "learning_rate": 8.481119376834753e-06, "loss": 0.9843, "step": 1149 }, { "epoch": 2.965784377017431, "grad_norm": 0.6111402642438966, "learning_rate": 8.46328646039661e-06, "loss": 0.9697, "step": 1150 }, { "epoch": 2.9683666881859265, "grad_norm": 0.5922931684597259, "learning_rate": 8.445458547701555e-06, "loss": 0.9627, "step": 1151 }, { "epoch": 2.9709489993544222, "grad_norm": 0.5851245117575304, "learning_rate": 8.427635696799636e-06, "loss": 0.9215, "step": 1152 }, { "epoch": 2.973531310522918, "grad_norm": 0.5804327721924878, "learning_rate": 8.409817965724413e-06, "loss": 0.9716, "step": 1153 }, { "epoch": 2.9761136216914137, "grad_norm": 0.6003712250873723, "learning_rate": 8.392005412492788e-06, "loss": 0.9648, "step": 1154 }, { "epoch": 2.9786959328599094, "grad_norm": 0.6082518706572542, "learning_rate": 8.374198095104795e-06, "loss": 0.95, "step": 1155 }, { "epoch": 2.9812782440284056, "grad_norm": 0.5866011566920423, "learning_rate": 8.356396071543422e-06, "loss": 0.9444, "step": 1156 }, { "epoch": 2.9838605551969013, "grad_norm": 0.6114880019803942, "learning_rate": 8.338599399774417e-06, "loss": 0.9693, "step": 1157 }, { "epoch": 2.986442866365397, "grad_norm": 0.5927950336965607, "learning_rate": 8.320808137746104e-06, "loss": 0.9667, "step": 1158 }, { "epoch": 2.9890251775338927, "grad_norm": 0.6169016547178486, "learning_rate": 8.303022343389188e-06, "loss": 0.9406, "step": 1159 }, { "epoch": 2.991607488702389, "grad_norm": 0.582024586435705, "learning_rate": 8.285242074616582e-06, "loss": 0.9729, "step": 1160 }, { "epoch": 2.9941897998708846, "grad_norm": 0.5944691597316901, "learning_rate": 8.267467389323197e-06, "loss": 0.9649, "step": 1161 }, { "epoch": 2.9967721110393803, "grad_norm": 0.5916217281333404, "learning_rate": 8.249698345385761e-06, "loss": 0.9567, "step": 1162 }, { "epoch": 2.999354422207876, "grad_norm": 0.5736806203482997, "learning_rate": 8.231935000662641e-06, "loss": 0.9526, "step": 1163 }, { "epoch": 3.0, "grad_norm": 0.5736806203482997, "learning_rate": 8.21417741299364e-06, "loss": 0.8611, "step": 1164 }, { "epoch": 3.0025823111684957, "grad_norm": 1.3479570480783307, "learning_rate": 8.196425640199823e-06, "loss": 0.8352, "step": 1165 }, { "epoch": 3.0051646223369914, "grad_norm": 1.1968748953729984, "learning_rate": 8.178679740083317e-06, "loss": 0.8032, "step": 1166 }, { "epoch": 3.0077469335054876, "grad_norm": 0.9744412992683603, "learning_rate": 8.160939770427122e-06, "loss": 0.811, "step": 1167 }, { "epoch": 3.0103292446739833, "grad_norm": 0.8092124569788149, "learning_rate": 8.143205788994933e-06, "loss": 0.8442, "step": 1168 }, { "epoch": 3.012911555842479, "grad_norm": 1.1536699236828805, "learning_rate": 8.125477853530944e-06, "loss": 0.8623, "step": 1169 }, { "epoch": 3.0154938670109748, "grad_norm": 1.4343186656583924, "learning_rate": 8.107756021759673e-06, "loss": 0.7984, "step": 1170 }, { "epoch": 3.0180761781794705, "grad_norm": 1.1928449593644268, "learning_rate": 8.090040351385741e-06, "loss": 0.8323, "step": 1171 }, { "epoch": 3.020658489347966, "grad_norm": 0.9892158368981517, "learning_rate": 8.072330900093728e-06, "loss": 0.8219, "step": 1172 }, { "epoch": 3.0232408005164624, "grad_norm": 0.9513434336373858, "learning_rate": 8.054627725547958e-06, "loss": 0.7942, "step": 1173 }, { "epoch": 3.025823111684958, "grad_norm": 0.9160615847863206, "learning_rate": 8.036930885392308e-06, "loss": 0.824, "step": 1174 }, { "epoch": 3.028405422853454, "grad_norm": 0.9712632670523301, "learning_rate": 8.019240437250046e-06, "loss": 0.8105, "step": 1175 }, { "epoch": 3.0309877340219495, "grad_norm": 0.888585643517764, "learning_rate": 8.001556438723608e-06, "loss": 0.8133, "step": 1176 }, { "epoch": 3.0335700451904453, "grad_norm": 0.8336632805823568, "learning_rate": 7.983878947394444e-06, "loss": 0.8087, "step": 1177 }, { "epoch": 3.0361523563589414, "grad_norm": 0.9159983137263322, "learning_rate": 7.966208020822808e-06, "loss": 0.8458, "step": 1178 }, { "epoch": 3.038734667527437, "grad_norm": 0.9601758597787429, "learning_rate": 7.948543716547584e-06, "loss": 0.8261, "step": 1179 }, { "epoch": 3.041316978695933, "grad_norm": 0.876187258797956, "learning_rate": 7.930886092086084e-06, "loss": 0.8018, "step": 1180 }, { "epoch": 3.0438992898644286, "grad_norm": 0.8319336547056765, "learning_rate": 7.913235204933873e-06, "loss": 0.8301, "step": 1181 }, { "epoch": 3.0464816010329243, "grad_norm": 0.870478623487457, "learning_rate": 7.895591112564588e-06, "loss": 0.793, "step": 1182 }, { "epoch": 3.0490639122014205, "grad_norm": 0.8628894314897713, "learning_rate": 7.877953872429734e-06, "loss": 0.8174, "step": 1183 }, { "epoch": 3.051646223369916, "grad_norm": 0.7872403489116827, "learning_rate": 7.8603235419585e-06, "loss": 0.8163, "step": 1184 }, { "epoch": 3.054228534538412, "grad_norm": 0.7851405920047361, "learning_rate": 7.84270017855758e-06, "loss": 0.8178, "step": 1185 }, { "epoch": 3.0568108457069076, "grad_norm": 0.8157181746918352, "learning_rate": 7.825083839610981e-06, "loss": 0.7963, "step": 1186 }, { "epoch": 3.0593931568754034, "grad_norm": 0.8180215928459832, "learning_rate": 7.807474582479841e-06, "loss": 0.8148, "step": 1187 }, { "epoch": 3.061975468043899, "grad_norm": 0.8088389505024169, "learning_rate": 7.789872464502241e-06, "loss": 0.827, "step": 1188 }, { "epoch": 3.0645577792123952, "grad_norm": 0.7907227679234932, "learning_rate": 7.772277542993006e-06, "loss": 0.8407, "step": 1189 }, { "epoch": 3.067140090380891, "grad_norm": 0.8168926580368819, "learning_rate": 7.754689875243536e-06, "loss": 0.8252, "step": 1190 }, { "epoch": 3.0697224015493867, "grad_norm": 0.8128108171059767, "learning_rate": 7.737109518521604e-06, "loss": 0.811, "step": 1191 }, { "epoch": 3.0723047127178824, "grad_norm": 0.7723477729268966, "learning_rate": 7.71953653007119e-06, "loss": 0.8018, "step": 1192 }, { "epoch": 3.074887023886378, "grad_norm": 0.7771679955089591, "learning_rate": 7.701970967112278e-06, "loss": 0.8206, "step": 1193 }, { "epoch": 3.0774693350548743, "grad_norm": 0.7947531953649853, "learning_rate": 7.684412886840662e-06, "loss": 0.8374, "step": 1194 }, { "epoch": 3.08005164622337, "grad_norm": 0.7797780708525804, "learning_rate": 7.666862346427784e-06, "loss": 0.809, "step": 1195 }, { "epoch": 3.0826339573918657, "grad_norm": 0.7951018870568382, "learning_rate": 7.649319403020528e-06, "loss": 0.8148, "step": 1196 }, { "epoch": 3.0852162685603615, "grad_norm": 0.7768045025376982, "learning_rate": 7.631784113741048e-06, "loss": 0.7905, "step": 1197 }, { "epoch": 3.087798579728857, "grad_norm": 0.7380091530118719, "learning_rate": 7.614256535686574e-06, "loss": 0.8277, "step": 1198 }, { "epoch": 3.090380890897353, "grad_norm": 0.8090369362037133, "learning_rate": 7.596736725929218e-06, "loss": 0.7897, "step": 1199 }, { "epoch": 3.092963202065849, "grad_norm": 0.7816172334191853, "learning_rate": 7.579224741515808e-06, "loss": 0.801, "step": 1200 }, { "epoch": 3.095545513234345, "grad_norm": 0.7716968909350221, "learning_rate": 7.561720639467684e-06, "loss": 0.8253, "step": 1201 }, { "epoch": 3.0981278244028405, "grad_norm": 0.7767223781947307, "learning_rate": 7.544224476780534e-06, "loss": 0.8171, "step": 1202 }, { "epoch": 3.1007101355713362, "grad_norm": 0.7780640346641391, "learning_rate": 7.52673631042417e-06, "loss": 0.8142, "step": 1203 }, { "epoch": 3.103292446739832, "grad_norm": 0.7678875942864142, "learning_rate": 7.509256197342389e-06, "loss": 0.8437, "step": 1204 }, { "epoch": 3.105874757908328, "grad_norm": 0.7741428737890553, "learning_rate": 7.491784194452756e-06, "loss": 0.7948, "step": 1205 }, { "epoch": 3.108457069076824, "grad_norm": 0.7445434454135789, "learning_rate": 7.4743203586464286e-06, "loss": 0.8186, "step": 1206 }, { "epoch": 3.1110393802453196, "grad_norm": 0.7308461534374082, "learning_rate": 7.45686474678798e-06, "loss": 0.8117, "step": 1207 }, { "epoch": 3.1136216914138153, "grad_norm": 0.7624570651090968, "learning_rate": 7.4394174157151826e-06, "loss": 0.8184, "step": 1208 }, { "epoch": 3.116204002582311, "grad_norm": 0.7787385810762857, "learning_rate": 7.421978422238871e-06, "loss": 0.8051, "step": 1209 }, { "epoch": 3.118786313750807, "grad_norm": 0.7487622485166701, "learning_rate": 7.404547823142718e-06, "loss": 0.8065, "step": 1210 }, { "epoch": 3.121368624919303, "grad_norm": 0.7700484213439688, "learning_rate": 7.387125675183069e-06, "loss": 0.7893, "step": 1211 }, { "epoch": 3.1239509360877986, "grad_norm": 0.7498057989796449, "learning_rate": 7.369712035088743e-06, "loss": 0.8271, "step": 1212 }, { "epoch": 3.1265332472562943, "grad_norm": 0.782447832053478, "learning_rate": 7.352306959560862e-06, "loss": 0.8177, "step": 1213 }, { "epoch": 3.12911555842479, "grad_norm": 0.760945467135789, "learning_rate": 7.3349105052726635e-06, "loss": 0.8016, "step": 1214 }, { "epoch": 3.131697869593286, "grad_norm": 0.7451691837423764, "learning_rate": 7.317522728869308e-06, "loss": 0.8292, "step": 1215 }, { "epoch": 3.134280180761782, "grad_norm": 0.7594539784955314, "learning_rate": 7.3001436869677056e-06, "loss": 0.8363, "step": 1216 }, { "epoch": 3.1368624919302777, "grad_norm": 0.7551959784047992, "learning_rate": 7.2827734361563154e-06, "loss": 0.8193, "step": 1217 }, { "epoch": 3.1394448030987734, "grad_norm": 0.7523949203336101, "learning_rate": 7.265412032994977e-06, "loss": 0.8365, "step": 1218 }, { "epoch": 3.142027114267269, "grad_norm": 0.7736463491191788, "learning_rate": 7.248059534014728e-06, "loss": 0.7735, "step": 1219 }, { "epoch": 3.144609425435765, "grad_norm": 0.7260637259865401, "learning_rate": 7.230715995717605e-06, "loss": 0.816, "step": 1220 }, { "epoch": 3.147191736604261, "grad_norm": 0.7971984476822972, "learning_rate": 7.213381474576465e-06, "loss": 0.844, "step": 1221 }, { "epoch": 3.1497740477727567, "grad_norm": 0.7598850752183374, "learning_rate": 7.19605602703481e-06, "loss": 0.7923, "step": 1222 }, { "epoch": 3.1523563589412524, "grad_norm": 0.7608019681518811, "learning_rate": 7.178739709506592e-06, "loss": 0.818, "step": 1223 }, { "epoch": 3.154938670109748, "grad_norm": 0.7773577191907428, "learning_rate": 7.161432578376042e-06, "loss": 0.8353, "step": 1224 }, { "epoch": 3.157520981278244, "grad_norm": 0.7551689941223817, "learning_rate": 7.144134689997475e-06, "loss": 0.8366, "step": 1225 }, { "epoch": 3.16010329244674, "grad_norm": 0.7696289215692551, "learning_rate": 7.126846100695105e-06, "loss": 0.831, "step": 1226 }, { "epoch": 3.1626856036152358, "grad_norm": 0.7600151124859899, "learning_rate": 7.109566866762874e-06, "loss": 0.8073, "step": 1227 }, { "epoch": 3.1652679147837315, "grad_norm": 0.7520515666346982, "learning_rate": 7.092297044464256e-06, "loss": 0.8344, "step": 1228 }, { "epoch": 3.167850225952227, "grad_norm": 0.7818985132603024, "learning_rate": 7.075036690032088e-06, "loss": 0.8273, "step": 1229 }, { "epoch": 3.170432537120723, "grad_norm": 0.7438737448683109, "learning_rate": 7.057785859668373e-06, "loss": 0.8292, "step": 1230 }, { "epoch": 3.1730148482892186, "grad_norm": 0.7604238311874598, "learning_rate": 7.040544609544098e-06, "loss": 0.806, "step": 1231 }, { "epoch": 3.175597159457715, "grad_norm": 0.7739278944618028, "learning_rate": 7.023312995799062e-06, "loss": 0.8321, "step": 1232 }, { "epoch": 3.1781794706262105, "grad_norm": 0.7829719049826178, "learning_rate": 7.006091074541684e-06, "loss": 0.8207, "step": 1233 }, { "epoch": 3.1807617817947063, "grad_norm": 0.8051283397017396, "learning_rate": 6.988878901848829e-06, "loss": 0.7937, "step": 1234 }, { "epoch": 3.183344092963202, "grad_norm": 0.7723245876655893, "learning_rate": 6.9716765337656034e-06, "loss": 0.7945, "step": 1235 }, { "epoch": 3.1859264041316977, "grad_norm": 0.7838025063241568, "learning_rate": 6.954484026305208e-06, "loss": 0.7946, "step": 1236 }, { "epoch": 3.188508715300194, "grad_norm": 0.7307107229399178, "learning_rate": 6.937301435448725e-06, "loss": 0.7995, "step": 1237 }, { "epoch": 3.1910910264686896, "grad_norm": 0.8052398372954221, "learning_rate": 6.920128817144946e-06, "loss": 0.8201, "step": 1238 }, { "epoch": 3.1936733376371853, "grad_norm": 0.760805158015623, "learning_rate": 6.9029662273102015e-06, "loss": 0.7999, "step": 1239 }, { "epoch": 3.196255648805681, "grad_norm": 0.7483396811127839, "learning_rate": 6.885813721828149e-06, "loss": 0.7988, "step": 1240 }, { "epoch": 3.1988379599741767, "grad_norm": 0.7404586071162459, "learning_rate": 6.868671356549628e-06, "loss": 0.8092, "step": 1241 }, { "epoch": 3.2014202711426725, "grad_norm": 0.7813265679668377, "learning_rate": 6.851539187292453e-06, "loss": 0.8358, "step": 1242 }, { "epoch": 3.2040025823111686, "grad_norm": 0.7759146212310302, "learning_rate": 6.83441726984124e-06, "loss": 0.8228, "step": 1243 }, { "epoch": 3.2065848934796644, "grad_norm": 0.7818503076848575, "learning_rate": 6.81730565994722e-06, "loss": 0.8149, "step": 1244 }, { "epoch": 3.20916720464816, "grad_norm": 0.7402110582844729, "learning_rate": 6.800204413328062e-06, "loss": 0.8388, "step": 1245 }, { "epoch": 3.211749515816656, "grad_norm": 0.738425464653366, "learning_rate": 6.7831135856677e-06, "loss": 0.8089, "step": 1246 }, { "epoch": 3.2143318269851515, "grad_norm": 0.777277503906415, "learning_rate": 6.766033232616131e-06, "loss": 0.8233, "step": 1247 }, { "epoch": 3.2169141381536477, "grad_norm": 0.7530080273180854, "learning_rate": 6.748963409789253e-06, "loss": 0.82, "step": 1248 }, { "epoch": 3.2194964493221434, "grad_norm": 0.7527460534251285, "learning_rate": 6.731904172768668e-06, "loss": 0.7935, "step": 1249 }, { "epoch": 3.222078760490639, "grad_norm": 0.7417948760299368, "learning_rate": 6.714855577101515e-06, "loss": 0.81, "step": 1250 }, { "epoch": 3.224661071659135, "grad_norm": 0.76497441596248, "learning_rate": 6.697817678300287e-06, "loss": 0.8134, "step": 1251 }, { "epoch": 3.2272433828276306, "grad_norm": 0.7743095167259862, "learning_rate": 6.680790531842641e-06, "loss": 0.8158, "step": 1252 }, { "epoch": 3.2298256939961267, "grad_norm": 0.7564105405805621, "learning_rate": 6.6637741931712204e-06, "loss": 0.8139, "step": 1253 }, { "epoch": 3.2324080051646225, "grad_norm": 0.7569395840964698, "learning_rate": 6.646768717693484e-06, "loss": 0.8178, "step": 1254 }, { "epoch": 3.234990316333118, "grad_norm": 0.7669325942851178, "learning_rate": 6.629774160781511e-06, "loss": 0.824, "step": 1255 }, { "epoch": 3.237572627501614, "grad_norm": 0.7858446496283839, "learning_rate": 6.612790577771835e-06, "loss": 0.8176, "step": 1256 }, { "epoch": 3.2401549386701096, "grad_norm": 0.756498502430699, "learning_rate": 6.59581802396526e-06, "loss": 0.8322, "step": 1257 }, { "epoch": 3.242737249838606, "grad_norm": 0.7523880712468195, "learning_rate": 6.578856554626665e-06, "loss": 0.8179, "step": 1258 }, { "epoch": 3.2453195610071015, "grad_norm": 0.7418381111698618, "learning_rate": 6.561906224984844e-06, "loss": 0.8214, "step": 1259 }, { "epoch": 3.2479018721755972, "grad_norm": 0.748062534762086, "learning_rate": 6.544967090232321e-06, "loss": 0.8325, "step": 1260 }, { "epoch": 3.250484183344093, "grad_norm": 0.7830260472124719, "learning_rate": 6.5280392055251696e-06, "loss": 0.8245, "step": 1261 }, { "epoch": 3.2530664945125887, "grad_norm": 0.7580297797282579, "learning_rate": 6.511122625982815e-06, "loss": 0.8269, "step": 1262 }, { "epoch": 3.2556488056810844, "grad_norm": 0.7545843411413197, "learning_rate": 6.494217406687893e-06, "loss": 0.8242, "step": 1263 }, { "epoch": 3.2582311168495806, "grad_norm": 0.7439461629106354, "learning_rate": 6.477323602686039e-06, "loss": 0.8087, "step": 1264 }, { "epoch": 3.2608134280180763, "grad_norm": 0.7693068257824085, "learning_rate": 6.460441268985715e-06, "loss": 0.8333, "step": 1265 }, { "epoch": 3.263395739186572, "grad_norm": 0.7572186415123207, "learning_rate": 6.443570460558048e-06, "loss": 0.8085, "step": 1266 }, { "epoch": 3.2659780503550677, "grad_norm": 0.7558961737811011, "learning_rate": 6.426711232336613e-06, "loss": 0.8068, "step": 1267 }, { "epoch": 3.2685603615235634, "grad_norm": 0.7855400126793302, "learning_rate": 6.409863639217306e-06, "loss": 0.8147, "step": 1268 }, { "epoch": 3.2711426726920596, "grad_norm": 0.7790255090638041, "learning_rate": 6.393027736058117e-06, "loss": 0.8256, "step": 1269 }, { "epoch": 3.2737249838605553, "grad_norm": 0.7607044109115157, "learning_rate": 6.376203577678981e-06, "loss": 0.7971, "step": 1270 }, { "epoch": 3.276307295029051, "grad_norm": 0.7553997691720208, "learning_rate": 6.3593912188615966e-06, "loss": 0.842, "step": 1271 }, { "epoch": 3.2788896061975468, "grad_norm": 0.7889787626268039, "learning_rate": 6.3425907143492216e-06, "loss": 0.8183, "step": 1272 }, { "epoch": 3.2814719173660425, "grad_norm": 0.7485790330649242, "learning_rate": 6.325802118846533e-06, "loss": 0.8185, "step": 1273 }, { "epoch": 3.284054228534538, "grad_norm": 0.7536890088538672, "learning_rate": 6.309025487019425e-06, "loss": 0.8266, "step": 1274 }, { "epoch": 3.2866365397030344, "grad_norm": 0.7550810533925633, "learning_rate": 6.2922608734948355e-06, "loss": 0.8079, "step": 1275 }, { "epoch": 3.28921885087153, "grad_norm": 0.7471626158383303, "learning_rate": 6.275508332860567e-06, "loss": 0.8205, "step": 1276 }, { "epoch": 3.291801162040026, "grad_norm": 0.7300314640072086, "learning_rate": 6.258767919665113e-06, "loss": 0.8021, "step": 1277 }, { "epoch": 3.2943834732085215, "grad_norm": 0.7640658148164554, "learning_rate": 6.242039688417483e-06, "loss": 0.8132, "step": 1278 }, { "epoch": 3.2969657843770173, "grad_norm": 0.7547528610145464, "learning_rate": 6.225323693587014e-06, "loss": 0.8287, "step": 1279 }, { "epoch": 3.2995480955455134, "grad_norm": 0.7972400105837699, "learning_rate": 6.208619989603205e-06, "loss": 0.8315, "step": 1280 }, { "epoch": 3.302130406714009, "grad_norm": 0.758612295379575, "learning_rate": 6.191928630855527e-06, "loss": 0.802, "step": 1281 }, { "epoch": 3.304712717882505, "grad_norm": 0.7601220070386198, "learning_rate": 6.1752496716932576e-06, "loss": 0.834, "step": 1282 }, { "epoch": 3.3072950290510006, "grad_norm": 0.7684262734009513, "learning_rate": 6.158583166425304e-06, "loss": 0.8481, "step": 1283 }, { "epoch": 3.3098773402194963, "grad_norm": 0.7880034761965038, "learning_rate": 6.141929169320018e-06, "loss": 0.815, "step": 1284 }, { "epoch": 3.312459651387992, "grad_norm": 0.7560178543170282, "learning_rate": 6.125287734605018e-06, "loss": 0.8129, "step": 1285 }, { "epoch": 3.315041962556488, "grad_norm": 0.7706041266881096, "learning_rate": 6.108658916467025e-06, "loss": 0.8016, "step": 1286 }, { "epoch": 3.317624273724984, "grad_norm": 0.7569765139274263, "learning_rate": 6.092042769051674e-06, "loss": 0.8273, "step": 1287 }, { "epoch": 3.3202065848934796, "grad_norm": 0.7623292555288878, "learning_rate": 6.075439346463349e-06, "loss": 0.7931, "step": 1288 }, { "epoch": 3.3227888960619754, "grad_norm": 0.7427549761512925, "learning_rate": 6.0588487027649954e-06, "loss": 0.7812, "step": 1289 }, { "epoch": 3.325371207230471, "grad_norm": 0.7772449050888204, "learning_rate": 6.042270891977946e-06, "loss": 0.8305, "step": 1290 }, { "epoch": 3.3279535183989672, "grad_norm": 0.7871222544756025, "learning_rate": 6.025705968081753e-06, "loss": 0.8387, "step": 1291 }, { "epoch": 3.330535829567463, "grad_norm": 0.7583353530346796, "learning_rate": 6.009153985014003e-06, "loss": 0.8466, "step": 1292 }, { "epoch": 3.3331181407359587, "grad_norm": 0.772127846582864, "learning_rate": 5.992614996670156e-06, "loss": 0.8155, "step": 1293 }, { "epoch": 3.3357004519044544, "grad_norm": 0.7487201668100457, "learning_rate": 5.976089056903342e-06, "loss": 0.7953, "step": 1294 }, { "epoch": 3.33828276307295, "grad_norm": 0.7614204718665639, "learning_rate": 5.959576219524217e-06, "loss": 0.8131, "step": 1295 }, { "epoch": 3.340865074241446, "grad_norm": 0.7763835258668194, "learning_rate": 5.94307653830077e-06, "loss": 0.8198, "step": 1296 }, { "epoch": 3.343447385409942, "grad_norm": 0.7753456222642561, "learning_rate": 5.926590066958149e-06, "loss": 0.8356, "step": 1297 }, { "epoch": 3.3460296965784377, "grad_norm": 0.7516557804123375, "learning_rate": 5.910116859178494e-06, "loss": 0.7854, "step": 1298 }, { "epoch": 3.3486120077469335, "grad_norm": 0.7503527754212284, "learning_rate": 5.89365696860075e-06, "loss": 0.8383, "step": 1299 }, { "epoch": 3.351194318915429, "grad_norm": 0.8230846268240456, "learning_rate": 5.877210448820508e-06, "loss": 0.8282, "step": 1300 }, { "epoch": 3.3537766300839253, "grad_norm": 0.7864548933883284, "learning_rate": 5.860777353389816e-06, "loss": 0.8201, "step": 1301 }, { "epoch": 3.356358941252421, "grad_norm": 0.796951326601112, "learning_rate": 5.844357735817012e-06, "loss": 0.8124, "step": 1302 }, { "epoch": 3.358941252420917, "grad_norm": 0.7449791284424515, "learning_rate": 5.82795164956655e-06, "loss": 0.8449, "step": 1303 }, { "epoch": 3.3615235635894125, "grad_norm": 0.7629551074846378, "learning_rate": 5.811559148058817e-06, "loss": 0.787, "step": 1304 }, { "epoch": 3.3641058747579082, "grad_norm": 0.7440812424379075, "learning_rate": 5.795180284669981e-06, "loss": 0.8282, "step": 1305 }, { "epoch": 3.366688185926404, "grad_norm": 0.7744183120279426, "learning_rate": 5.7788151127317825e-06, "loss": 0.8258, "step": 1306 }, { "epoch": 3.3692704970949, "grad_norm": 0.7418245554432372, "learning_rate": 5.762463685531403e-06, "loss": 0.8284, "step": 1307 }, { "epoch": 3.371852808263396, "grad_norm": 0.7830933256063822, "learning_rate": 5.746126056311248e-06, "loss": 0.8452, "step": 1308 }, { "epoch": 3.3744351194318916, "grad_norm": 0.7909760674112923, "learning_rate": 5.729802278268813e-06, "loss": 0.8168, "step": 1309 }, { "epoch": 3.3770174306003873, "grad_norm": 0.8007258278890194, "learning_rate": 5.713492404556477e-06, "loss": 0.8027, "step": 1310 }, { "epoch": 3.379599741768883, "grad_norm": 0.7758822514790055, "learning_rate": 5.697196488281357e-06, "loss": 0.8266, "step": 1311 }, { "epoch": 3.382182052937379, "grad_norm": 0.765799650737426, "learning_rate": 5.680914582505123e-06, "loss": 0.8057, "step": 1312 }, { "epoch": 3.384764364105875, "grad_norm": 0.7655979019681851, "learning_rate": 5.6646467402438045e-06, "loss": 0.8157, "step": 1313 }, { "epoch": 3.3873466752743706, "grad_norm": 0.7766423523842311, "learning_rate": 5.6483930144676616e-06, "loss": 0.8162, "step": 1314 }, { "epoch": 3.3899289864428663, "grad_norm": 0.7589364799160417, "learning_rate": 5.632153458100985e-06, "loss": 0.8321, "step": 1315 }, { "epoch": 3.392511297611362, "grad_norm": 0.7788060942298414, "learning_rate": 5.615928124021921e-06, "loss": 0.837, "step": 1316 }, { "epoch": 3.3950936087798578, "grad_norm": 0.7694554084656864, "learning_rate": 5.599717065062302e-06, "loss": 0.8438, "step": 1317 }, { "epoch": 3.397675919948354, "grad_norm": 0.7631810781785031, "learning_rate": 5.583520334007494e-06, "loss": 0.8205, "step": 1318 }, { "epoch": 3.4002582311168497, "grad_norm": 0.7854745994817811, "learning_rate": 5.567337983596201e-06, "loss": 0.8208, "step": 1319 }, { "epoch": 3.4028405422853454, "grad_norm": 0.7690571440773396, "learning_rate": 5.551170066520299e-06, "loss": 0.814, "step": 1320 }, { "epoch": 3.405422853453841, "grad_norm": 0.7681900860146816, "learning_rate": 5.535016635424675e-06, "loss": 0.822, "step": 1321 }, { "epoch": 3.408005164622337, "grad_norm": 0.7811781678205161, "learning_rate": 5.51887774290704e-06, "loss": 0.818, "step": 1322 }, { "epoch": 3.410587475790833, "grad_norm": 0.7664479268038544, "learning_rate": 5.502753441517763e-06, "loss": 0.8331, "step": 1323 }, { "epoch": 3.4131697869593287, "grad_norm": 0.7775122726368401, "learning_rate": 5.486643783759713e-06, "loss": 0.8163, "step": 1324 }, { "epoch": 3.4157520981278244, "grad_norm": 0.7851782250823803, "learning_rate": 5.470548822088075e-06, "loss": 0.833, "step": 1325 }, { "epoch": 3.41833440929632, "grad_norm": 0.7722198216385613, "learning_rate": 5.454468608910177e-06, "loss": 0.8216, "step": 1326 }, { "epoch": 3.420916720464816, "grad_norm": 0.7650125939985358, "learning_rate": 5.43840319658532e-06, "loss": 0.8195, "step": 1327 }, { "epoch": 3.4234990316333116, "grad_norm": 0.7889877069137401, "learning_rate": 5.422352637424623e-06, "loss": 0.8356, "step": 1328 }, { "epoch": 3.4260813428018078, "grad_norm": 0.759046090525073, "learning_rate": 5.4063169836908355e-06, "loss": 0.8281, "step": 1329 }, { "epoch": 3.4286636539703035, "grad_norm": 0.7614819002487212, "learning_rate": 5.390296287598173e-06, "loss": 0.8176, "step": 1330 }, { "epoch": 3.431245965138799, "grad_norm": 0.7733681640312509, "learning_rate": 5.374290601312139e-06, "loss": 0.8347, "step": 1331 }, { "epoch": 3.433828276307295, "grad_norm": 0.7679752600393633, "learning_rate": 5.3582999769493816e-06, "loss": 0.8129, "step": 1332 }, { "epoch": 3.4364105874757906, "grad_norm": 0.7666398230614995, "learning_rate": 5.342324466577484e-06, "loss": 0.8041, "step": 1333 }, { "epoch": 3.438992898644287, "grad_norm": 0.757735562684295, "learning_rate": 5.326364122214833e-06, "loss": 0.832, "step": 1334 }, { "epoch": 3.4415752098127825, "grad_norm": 0.7658776895218172, "learning_rate": 5.310418995830429e-06, "loss": 0.8127, "step": 1335 }, { "epoch": 3.4441575209812783, "grad_norm": 0.7501769033527278, "learning_rate": 5.2944891393437145e-06, "loss": 0.8069, "step": 1336 }, { "epoch": 3.446739832149774, "grad_norm": 0.7720443515660191, "learning_rate": 5.278574604624411e-06, "loss": 0.8031, "step": 1337 }, { "epoch": 3.4493221433182697, "grad_norm": 0.7746952692548283, "learning_rate": 5.262675443492359e-06, "loss": 0.8212, "step": 1338 }, { "epoch": 3.4519044544867654, "grad_norm": 0.7721583267540482, "learning_rate": 5.246791707717343e-06, "loss": 0.806, "step": 1339 }, { "epoch": 3.4544867656552616, "grad_norm": 0.7656034684267539, "learning_rate": 5.230923449018896e-06, "loss": 0.815, "step": 1340 }, { "epoch": 3.4570690768237573, "grad_norm": 0.7847317646384887, "learning_rate": 5.215070719066182e-06, "loss": 0.8406, "step": 1341 }, { "epoch": 3.459651387992253, "grad_norm": 0.76108699723527, "learning_rate": 5.199233569477796e-06, "loss": 0.8535, "step": 1342 }, { "epoch": 3.4622336991607487, "grad_norm": 0.7642688851829174, "learning_rate": 5.183412051821591e-06, "loss": 0.8082, "step": 1343 }, { "epoch": 3.464816010329245, "grad_norm": 0.7767506589914347, "learning_rate": 5.167606217614531e-06, "loss": 0.8175, "step": 1344 }, { "epoch": 3.4673983214977406, "grad_norm": 0.7584520457174025, "learning_rate": 5.151816118322503e-06, "loss": 0.8027, "step": 1345 }, { "epoch": 3.4699806326662364, "grad_norm": 0.768873105060371, "learning_rate": 5.136041805360172e-06, "loss": 0.8109, "step": 1346 }, { "epoch": 3.472562943834732, "grad_norm": 0.7839594886363217, "learning_rate": 5.120283330090787e-06, "loss": 0.8148, "step": 1347 }, { "epoch": 3.475145255003228, "grad_norm": 0.773747886290827, "learning_rate": 5.104540743826038e-06, "loss": 0.8112, "step": 1348 }, { "epoch": 3.4777275661717235, "grad_norm": 0.7739729084947021, "learning_rate": 5.088814097825871e-06, "loss": 0.809, "step": 1349 }, { "epoch": 3.4803098773402197, "grad_norm": 0.7918183544657171, "learning_rate": 5.073103443298326e-06, "loss": 0.8455, "step": 1350 }, { "epoch": 3.4828921885087154, "grad_norm": 0.7878706269294227, "learning_rate": 5.057408831399385e-06, "loss": 0.8308, "step": 1351 }, { "epoch": 3.485474499677211, "grad_norm": 0.7823661607708897, "learning_rate": 5.041730313232786e-06, "loss": 0.8393, "step": 1352 }, { "epoch": 3.488056810845707, "grad_norm": 0.7543319732416526, "learning_rate": 5.026067939849864e-06, "loss": 0.8318, "step": 1353 }, { "epoch": 3.4906391220142026, "grad_norm": 0.769779193831718, "learning_rate": 5.0104217622493736e-06, "loss": 0.833, "step": 1354 }, { "epoch": 3.4932214331826987, "grad_norm": 0.7716520294743638, "learning_rate": 4.994791831377354e-06, "loss": 0.8222, "step": 1355 }, { "epoch": 3.4958037443511945, "grad_norm": 0.7736579686573494, "learning_rate": 4.9791781981269326e-06, "loss": 0.7974, "step": 1356 }, { "epoch": 3.49838605551969, "grad_norm": 0.7302768684285259, "learning_rate": 4.9635809133381685e-06, "loss": 0.8207, "step": 1357 }, { "epoch": 3.500968366688186, "grad_norm": 0.7621702814787035, "learning_rate": 4.948000027797885e-06, "loss": 0.8077, "step": 1358 }, { "epoch": 3.5035506778566816, "grad_norm": 0.7638463102404097, "learning_rate": 4.93243559223952e-06, "loss": 0.7849, "step": 1359 }, { "epoch": 3.5061329890251773, "grad_norm": 0.7525066168052732, "learning_rate": 4.916887657342931e-06, "loss": 0.8103, "step": 1360 }, { "epoch": 3.5087153001936735, "grad_norm": 0.7465812252567701, "learning_rate": 4.901356273734261e-06, "loss": 0.8251, "step": 1361 }, { "epoch": 3.5112976113621692, "grad_norm": 0.789989621898556, "learning_rate": 4.885841491985758e-06, "loss": 0.8156, "step": 1362 }, { "epoch": 3.513879922530665, "grad_norm": 0.7567373410192682, "learning_rate": 4.870343362615605e-06, "loss": 0.8241, "step": 1363 }, { "epoch": 3.5164622336991607, "grad_norm": 0.7748589228781302, "learning_rate": 4.8548619360877635e-06, "loss": 0.8061, "step": 1364 }, { "epoch": 3.5190445448676564, "grad_norm": 0.7738666119944785, "learning_rate": 4.839397262811814e-06, "loss": 0.8101, "step": 1365 }, { "epoch": 3.5216268560361526, "grad_norm": 0.7614113255993917, "learning_rate": 4.823949393142791e-06, "loss": 0.8237, "step": 1366 }, { "epoch": 3.5242091672046483, "grad_norm": 0.776611038166, "learning_rate": 4.808518377380999e-06, "loss": 0.8334, "step": 1367 }, { "epoch": 3.526791478373144, "grad_norm": 0.7600268059134173, "learning_rate": 4.7931042657718685e-06, "loss": 0.8221, "step": 1368 }, { "epoch": 3.5293737895416397, "grad_norm": 0.7659298097233458, "learning_rate": 4.777707108505801e-06, "loss": 0.8374, "step": 1369 }, { "epoch": 3.5319561007101354, "grad_norm": 0.740099038297969, "learning_rate": 4.762326955717972e-06, "loss": 0.8138, "step": 1370 }, { "epoch": 3.534538411878631, "grad_norm": 0.7714772416888985, "learning_rate": 4.746963857488208e-06, "loss": 0.8288, "step": 1371 }, { "epoch": 3.5371207230471273, "grad_norm": 0.7725845357201855, "learning_rate": 4.7316178638407885e-06, "loss": 0.822, "step": 1372 }, { "epoch": 3.539703034215623, "grad_norm": 0.7621909389829162, "learning_rate": 4.716289024744308e-06, "loss": 0.8231, "step": 1373 }, { "epoch": 3.5422853453841188, "grad_norm": 0.7607917831814722, "learning_rate": 4.700977390111495e-06, "loss": 0.8446, "step": 1374 }, { "epoch": 3.5448676565526145, "grad_norm": 0.7656461760658241, "learning_rate": 4.685683009799065e-06, "loss": 0.8214, "step": 1375 }, { "epoch": 3.5474499677211107, "grad_norm": 0.7752118246819907, "learning_rate": 4.670405933607554e-06, "loss": 0.8249, "step": 1376 }, { "epoch": 3.5500322788896064, "grad_norm": 0.7402699684294802, "learning_rate": 4.6551462112811384e-06, "loss": 0.8409, "step": 1377 }, { "epoch": 3.552614590058102, "grad_norm": 0.7778533292706469, "learning_rate": 4.639903892507501e-06, "loss": 0.7924, "step": 1378 }, { "epoch": 3.555196901226598, "grad_norm": 0.7705963566454496, "learning_rate": 4.624679026917658e-06, "loss": 0.8203, "step": 1379 }, { "epoch": 3.5577792123950935, "grad_norm": 0.7365633344886319, "learning_rate": 4.609471664085787e-06, "loss": 0.8123, "step": 1380 }, { "epoch": 3.5603615235635893, "grad_norm": 0.7586569356393417, "learning_rate": 4.594281853529076e-06, "loss": 0.8299, "step": 1381 }, { "epoch": 3.562943834732085, "grad_norm": 0.7652449035633111, "learning_rate": 4.5791096447075645e-06, "loss": 0.8141, "step": 1382 }, { "epoch": 3.565526145900581, "grad_norm": 0.7582437627266295, "learning_rate": 4.563955087023981e-06, "loss": 0.805, "step": 1383 }, { "epoch": 3.568108457069077, "grad_norm": 0.7826618805912787, "learning_rate": 4.548818229823568e-06, "loss": 0.8293, "step": 1384 }, { "epoch": 3.5706907682375726, "grad_norm": 0.762954307030826, "learning_rate": 4.5336991223939486e-06, "loss": 0.8456, "step": 1385 }, { "epoch": 3.5732730794060683, "grad_norm": 0.7623373985847602, "learning_rate": 4.5185978139649355e-06, "loss": 0.8192, "step": 1386 }, { "epoch": 3.5758553905745645, "grad_norm": 0.7765489599431679, "learning_rate": 4.503514353708389e-06, "loss": 0.815, "step": 1387 }, { "epoch": 3.57843770174306, "grad_norm": 0.7584186284661693, "learning_rate": 4.488448790738059e-06, "loss": 0.8301, "step": 1388 }, { "epoch": 3.581020012911556, "grad_norm": 0.7763483912193899, "learning_rate": 4.473401174109423e-06, "loss": 0.8518, "step": 1389 }, { "epoch": 3.5836023240800516, "grad_norm": 0.796977617965849, "learning_rate": 4.45837155281951e-06, "loss": 0.8258, "step": 1390 }, { "epoch": 3.5861846352485474, "grad_norm": 0.7770662519335874, "learning_rate": 4.443359975806757e-06, "loss": 0.8068, "step": 1391 }, { "epoch": 3.588766946417043, "grad_norm": 0.7790952514127689, "learning_rate": 4.428366491950854e-06, "loss": 0.8296, "step": 1392 }, { "epoch": 3.591349257585539, "grad_norm": 0.7764896929937788, "learning_rate": 4.413391150072577e-06, "loss": 0.8007, "step": 1393 }, { "epoch": 3.593931568754035, "grad_norm": 0.7607538258474698, "learning_rate": 4.39843399893362e-06, "loss": 0.8025, "step": 1394 }, { "epoch": 3.5965138799225307, "grad_norm": 0.7492819656752748, "learning_rate": 4.383495087236448e-06, "loss": 0.8157, "step": 1395 }, { "epoch": 3.5990961910910264, "grad_norm": 0.7596354310000931, "learning_rate": 4.368574463624146e-06, "loss": 0.8272, "step": 1396 }, { "epoch": 3.601678502259522, "grad_norm": 0.7407159353225491, "learning_rate": 4.353672176680236e-06, "loss": 0.8123, "step": 1397 }, { "epoch": 3.6042608134280183, "grad_norm": 0.7672926201622885, "learning_rate": 4.338788274928544e-06, "loss": 0.8086, "step": 1398 }, { "epoch": 3.606843124596514, "grad_norm": 0.7636171894829668, "learning_rate": 4.323922806833031e-06, "loss": 0.8067, "step": 1399 }, { "epoch": 3.6094254357650097, "grad_norm": 0.7677055145204084, "learning_rate": 4.3090758207976305e-06, "loss": 0.7908, "step": 1400 }, { "epoch": 3.6120077469335055, "grad_norm": 0.748818246629621, "learning_rate": 4.294247365166093e-06, "loss": 0.8312, "step": 1401 }, { "epoch": 3.614590058102001, "grad_norm": 0.7660560945307108, "learning_rate": 4.279437488221843e-06, "loss": 0.8022, "step": 1402 }, { "epoch": 3.617172369270497, "grad_norm": 0.7838437600172277, "learning_rate": 4.2646462381878076e-06, "loss": 0.8377, "step": 1403 }, { "epoch": 3.6197546804389926, "grad_norm": 0.8010185687327693, "learning_rate": 4.249873663226245e-06, "loss": 0.7993, "step": 1404 }, { "epoch": 3.622336991607489, "grad_norm": 0.7553739164896088, "learning_rate": 4.235119811438627e-06, "loss": 0.8261, "step": 1405 }, { "epoch": 3.6249193027759845, "grad_norm": 0.7716962358330001, "learning_rate": 4.220384730865456e-06, "loss": 0.8405, "step": 1406 }, { "epoch": 3.6275016139444802, "grad_norm": 0.7536919382865905, "learning_rate": 4.205668469486098e-06, "loss": 0.8108, "step": 1407 }, { "epoch": 3.630083925112976, "grad_norm": 0.7420774971418518, "learning_rate": 4.190971075218662e-06, "loss": 0.8065, "step": 1408 }, { "epoch": 3.632666236281472, "grad_norm": 0.7632144438839245, "learning_rate": 4.176292595919803e-06, "loss": 0.7927, "step": 1409 }, { "epoch": 3.635248547449968, "grad_norm": 0.7571064594393233, "learning_rate": 4.1616330793846075e-06, "loss": 0.8362, "step": 1410 }, { "epoch": 3.6378308586184636, "grad_norm": 0.7622517676642483, "learning_rate": 4.146992573346394e-06, "loss": 0.8257, "step": 1411 }, { "epoch": 3.6404131697869593, "grad_norm": 0.7835717386734106, "learning_rate": 4.1323711254766015e-06, "loss": 0.8223, "step": 1412 }, { "epoch": 3.642995480955455, "grad_norm": 0.7478789535763465, "learning_rate": 4.117768783384599e-06, "loss": 0.7949, "step": 1413 }, { "epoch": 3.6455777921239507, "grad_norm": 0.7767414531451252, "learning_rate": 4.1031855946175455e-06, "loss": 0.7961, "step": 1414 }, { "epoch": 3.648160103292447, "grad_norm": 0.7840863973404958, "learning_rate": 4.088621606660243e-06, "loss": 0.7999, "step": 1415 }, { "epoch": 3.6507424144609426, "grad_norm": 0.7547796898810605, "learning_rate": 4.074076866934967e-06, "loss": 0.818, "step": 1416 }, { "epoch": 3.6533247256294383, "grad_norm": 0.7714902567445823, "learning_rate": 4.05955142280132e-06, "loss": 0.8105, "step": 1417 }, { "epoch": 3.655907036797934, "grad_norm": 0.7532163616590967, "learning_rate": 4.0450453215560684e-06, "loss": 0.8049, "step": 1418 }, { "epoch": 3.65848934796643, "grad_norm": 0.7773578119690996, "learning_rate": 4.030558610433005e-06, "loss": 0.7914, "step": 1419 }, { "epoch": 3.661071659134926, "grad_norm": 0.763304292538588, "learning_rate": 4.016091336602789e-06, "loss": 0.8275, "step": 1420 }, { "epoch": 3.6636539703034217, "grad_norm": 0.7804812337616664, "learning_rate": 4.001643547172776e-06, "loss": 0.8377, "step": 1421 }, { "epoch": 3.6662362814719174, "grad_norm": 0.767412403491653, "learning_rate": 3.987215289186881e-06, "loss": 0.8256, "step": 1422 }, { "epoch": 3.668818592640413, "grad_norm": 0.7617867902372989, "learning_rate": 3.972806609625434e-06, "loss": 0.8106, "step": 1423 }, { "epoch": 3.671400903808909, "grad_norm": 0.7540382466723832, "learning_rate": 3.958417555404999e-06, "loss": 0.8074, "step": 1424 }, { "epoch": 3.6739832149774045, "grad_norm": 0.7748311551652659, "learning_rate": 3.9440481733782485e-06, "loss": 0.8125, "step": 1425 }, { "epoch": 3.6765655261459007, "grad_norm": 0.7623341018369493, "learning_rate": 3.929698510333799e-06, "loss": 0.8337, "step": 1426 }, { "epoch": 3.6791478373143964, "grad_norm": 0.767758139184047, "learning_rate": 3.915368612996055e-06, "loss": 0.8341, "step": 1427 }, { "epoch": 3.681730148482892, "grad_norm": 0.7519042843627542, "learning_rate": 3.901058528025055e-06, "loss": 0.8061, "step": 1428 }, { "epoch": 3.684312459651388, "grad_norm": 0.7625093078444409, "learning_rate": 3.8867683020163446e-06, "loss": 0.822, "step": 1429 }, { "epoch": 3.686894770819884, "grad_norm": 0.80028510095772, "learning_rate": 3.872497981500787e-06, "loss": 0.8502, "step": 1430 }, { "epoch": 3.6894770819883798, "grad_norm": 0.7652238383245407, "learning_rate": 3.8582476129444435e-06, "loss": 0.8163, "step": 1431 }, { "epoch": 3.6920593931568755, "grad_norm": 0.7842966117293941, "learning_rate": 3.844017242748398e-06, "loss": 0.7996, "step": 1432 }, { "epoch": 3.694641704325371, "grad_norm": 0.7495726108816106, "learning_rate": 3.829806917248631e-06, "loss": 0.8061, "step": 1433 }, { "epoch": 3.697224015493867, "grad_norm": 0.7579352515486196, "learning_rate": 3.815616682715839e-06, "loss": 0.7876, "step": 1434 }, { "epoch": 3.6998063266623626, "grad_norm": 0.7788849623252266, "learning_rate": 3.801446585355315e-06, "loss": 0.8334, "step": 1435 }, { "epoch": 3.7023886378308584, "grad_norm": 0.7445580252143607, "learning_rate": 3.7872966713067683e-06, "loss": 0.8182, "step": 1436 }, { "epoch": 3.7049709489993545, "grad_norm": 0.7772631069572105, "learning_rate": 3.773166986644202e-06, "loss": 0.8149, "step": 1437 }, { "epoch": 3.7075532601678503, "grad_norm": 0.7542443690987021, "learning_rate": 3.7590575773757378e-06, "loss": 0.8085, "step": 1438 }, { "epoch": 3.710135571336346, "grad_norm": 0.7490006884397156, "learning_rate": 3.744968489443488e-06, "loss": 0.8364, "step": 1439 }, { "epoch": 3.7127178825048417, "grad_norm": 0.7588964697785906, "learning_rate": 3.7308997687233896e-06, "loss": 0.8109, "step": 1440 }, { "epoch": 3.715300193673338, "grad_norm": 0.7401504987559857, "learning_rate": 3.7168514610250594e-06, "loss": 0.8026, "step": 1441 }, { "epoch": 3.7178825048418336, "grad_norm": 0.7554850061896863, "learning_rate": 3.7028236120916537e-06, "loss": 0.8315, "step": 1442 }, { "epoch": 3.7204648160103293, "grad_norm": 0.7804604627439944, "learning_rate": 3.688816267599713e-06, "loss": 0.8317, "step": 1443 }, { "epoch": 3.723047127178825, "grad_norm": 0.7748081363692426, "learning_rate": 3.6748294731590038e-06, "loss": 0.811, "step": 1444 }, { "epoch": 3.7256294383473207, "grad_norm": 0.7657357348049666, "learning_rate": 3.6608632743123827e-06, "loss": 0.8244, "step": 1445 }, { "epoch": 3.7282117495158165, "grad_norm": 0.775640736208981, "learning_rate": 3.6469177165356493e-06, "loss": 0.835, "step": 1446 }, { "epoch": 3.730794060684312, "grad_norm": 0.7864122051723232, "learning_rate": 3.6329928452373843e-06, "loss": 0.8354, "step": 1447 }, { "epoch": 3.7333763718528084, "grad_norm": 0.7862243619147634, "learning_rate": 3.6190887057588185e-06, "loss": 0.8311, "step": 1448 }, { "epoch": 3.735958683021304, "grad_norm": 0.7454712765612813, "learning_rate": 3.6052053433736777e-06, "loss": 0.8061, "step": 1449 }, { "epoch": 3.7385409941898, "grad_norm": 0.7477377098403756, "learning_rate": 3.591342803288027e-06, "loss": 0.7974, "step": 1450 }, { "epoch": 3.7411233053582955, "grad_norm": 0.7414265044721174, "learning_rate": 3.5775011306401317e-06, "loss": 0.8101, "step": 1451 }, { "epoch": 3.7437056165267917, "grad_norm": 0.7682965899056072, "learning_rate": 3.5636803705003174e-06, "loss": 0.8396, "step": 1452 }, { "epoch": 3.7462879276952874, "grad_norm": 0.7638628497129718, "learning_rate": 3.5498805678708172e-06, "loss": 0.8086, "step": 1453 }, { "epoch": 3.748870238863783, "grad_norm": 0.7987728579111757, "learning_rate": 3.5361017676856114e-06, "loss": 0.8301, "step": 1454 }, { "epoch": 3.751452550032279, "grad_norm": 0.7642546226790663, "learning_rate": 3.5223440148103017e-06, "loss": 0.8127, "step": 1455 }, { "epoch": 3.7540348612007746, "grad_norm": 0.7590358311679077, "learning_rate": 3.5086073540419594e-06, "loss": 0.8299, "step": 1456 }, { "epoch": 3.7566171723692703, "grad_norm": 0.7737724761627253, "learning_rate": 3.4948918301089687e-06, "loss": 0.7995, "step": 1457 }, { "epoch": 3.7591994835377665, "grad_norm": 0.7893128648030869, "learning_rate": 3.481197487670901e-06, "loss": 0.8304, "step": 1458 }, { "epoch": 3.761781794706262, "grad_norm": 0.7672293433531253, "learning_rate": 3.4675243713183436e-06, "loss": 0.8271, "step": 1459 }, { "epoch": 3.764364105874758, "grad_norm": 0.781703741527432, "learning_rate": 3.4538725255727855e-06, "loss": 0.8248, "step": 1460 }, { "epoch": 3.7669464170432536, "grad_norm": 0.7580531483398701, "learning_rate": 3.4402419948864384e-06, "loss": 0.7916, "step": 1461 }, { "epoch": 3.76952872821175, "grad_norm": 0.7486908586065847, "learning_rate": 3.426632823642123e-06, "loss": 0.8137, "step": 1462 }, { "epoch": 3.7721110393802455, "grad_norm": 0.7644509690799265, "learning_rate": 3.4130450561531102e-06, "loss": 0.8355, "step": 1463 }, { "epoch": 3.774693350548741, "grad_norm": 0.7534559123863085, "learning_rate": 3.3994787366629623e-06, "loss": 0.8255, "step": 1464 }, { "epoch": 3.777275661717237, "grad_norm": 0.7614512432818442, "learning_rate": 3.385933909345419e-06, "loss": 0.8115, "step": 1465 }, { "epoch": 3.7798579728857327, "grad_norm": 0.786112853584683, "learning_rate": 3.372410618304238e-06, "loss": 0.8559, "step": 1466 }, { "epoch": 3.7824402840542284, "grad_norm": 0.7573955890285036, "learning_rate": 3.3589089075730474e-06, "loss": 0.8079, "step": 1467 }, { "epoch": 3.785022595222724, "grad_norm": 0.7339247578928705, "learning_rate": 3.345428821115202e-06, "loss": 0.8239, "step": 1468 }, { "epoch": 3.7876049063912203, "grad_norm": 0.8027337173972606, "learning_rate": 3.3319704028236553e-06, "loss": 0.8258, "step": 1469 }, { "epoch": 3.790187217559716, "grad_norm": 0.7602204125813788, "learning_rate": 3.3185336965208057e-06, "loss": 0.8267, "step": 1470 }, { "epoch": 3.7927695287282117, "grad_norm": 0.7661870955667278, "learning_rate": 3.3051187459583454e-06, "loss": 0.8059, "step": 1471 }, { "epoch": 3.7953518398967074, "grad_norm": 0.7519474368509446, "learning_rate": 3.2917255948171366e-06, "loss": 0.8056, "step": 1472 }, { "epoch": 3.7979341510652036, "grad_norm": 0.7657682155523916, "learning_rate": 3.2783542867070538e-06, "loss": 0.8293, "step": 1473 }, { "epoch": 3.8005164622336993, "grad_norm": 0.76866731372553, "learning_rate": 3.2650048651668463e-06, "loss": 0.847, "step": 1474 }, { "epoch": 3.803098773402195, "grad_norm": 0.7770729452000317, "learning_rate": 3.251677373664004e-06, "loss": 0.8026, "step": 1475 }, { "epoch": 3.8056810845706908, "grad_norm": 0.7444822909996885, "learning_rate": 3.2383718555946098e-06, "loss": 0.8205, "step": 1476 }, { "epoch": 3.8082633957391865, "grad_norm": 0.7522360723786109, "learning_rate": 3.2250883542831933e-06, "loss": 0.7975, "step": 1477 }, { "epoch": 3.810845706907682, "grad_norm": 0.7476561413065432, "learning_rate": 3.211826912982591e-06, "loss": 0.8302, "step": 1478 }, { "epoch": 3.813428018076178, "grad_norm": 0.7624889384036899, "learning_rate": 3.1985875748738193e-06, "loss": 0.8336, "step": 1479 }, { "epoch": 3.816010329244674, "grad_norm": 0.7611831427412808, "learning_rate": 3.1853703830659223e-06, "loss": 0.8241, "step": 1480 }, { "epoch": 3.81859264041317, "grad_norm": 0.7522692175193729, "learning_rate": 3.1721753805958245e-06, "loss": 0.8464, "step": 1481 }, { "epoch": 3.8211749515816655, "grad_norm": 0.7971847308682318, "learning_rate": 3.1590026104282024e-06, "loss": 0.8315, "step": 1482 }, { "epoch": 3.8237572627501613, "grad_norm": 0.7769310232315048, "learning_rate": 3.145852115455348e-06, "loss": 0.8264, "step": 1483 }, { "epoch": 3.8263395739186574, "grad_norm": 0.7815145076904971, "learning_rate": 3.132723938497011e-06, "loss": 0.8103, "step": 1484 }, { "epoch": 3.828921885087153, "grad_norm": 0.7394387659719708, "learning_rate": 3.1196181223002842e-06, "loss": 0.8057, "step": 1485 }, { "epoch": 3.831504196255649, "grad_norm": 0.7552308531154759, "learning_rate": 3.106534709539435e-06, "loss": 0.8411, "step": 1486 }, { "epoch": 3.8340865074241446, "grad_norm": 0.7493333935655804, "learning_rate": 3.093473742815797e-06, "loss": 0.8039, "step": 1487 }, { "epoch": 3.8366688185926403, "grad_norm": 0.738912917115583, "learning_rate": 3.0804352646576052e-06, "loss": 0.8271, "step": 1488 }, { "epoch": 3.839251129761136, "grad_norm": 0.7712798674043028, "learning_rate": 3.067419317519875e-06, "loss": 0.821, "step": 1489 }, { "epoch": 3.8418334409296317, "grad_norm": 0.7673812637479288, "learning_rate": 3.054425943784265e-06, "loss": 0.8401, "step": 1490 }, { "epoch": 3.844415752098128, "grad_norm": 0.7690243688879774, "learning_rate": 3.041455185758908e-06, "loss": 0.7975, "step": 1491 }, { "epoch": 3.8469980632666236, "grad_norm": 0.7488704099566282, "learning_rate": 3.0285070856783206e-06, "loss": 0.793, "step": 1492 }, { "epoch": 3.8495803744351194, "grad_norm": 0.7468016897185966, "learning_rate": 3.015581685703237e-06, "loss": 0.8109, "step": 1493 }, { "epoch": 3.852162685603615, "grad_norm": 0.7504249930796498, "learning_rate": 3.0026790279204664e-06, "loss": 0.8314, "step": 1494 }, { "epoch": 3.8547449967721112, "grad_norm": 0.7557407963240177, "learning_rate": 2.9897991543427797e-06, "loss": 0.8327, "step": 1495 }, { "epoch": 3.857327307940607, "grad_norm": 0.7670324489918254, "learning_rate": 2.976942106908749e-06, "loss": 0.8292, "step": 1496 }, { "epoch": 3.8599096191091027, "grad_norm": 0.7638293238485997, "learning_rate": 2.9641079274826302e-06, "loss": 0.8177, "step": 1497 }, { "epoch": 3.8624919302775984, "grad_norm": 0.7712018435277216, "learning_rate": 2.951296657854209e-06, "loss": 0.8285, "step": 1498 }, { "epoch": 3.865074241446094, "grad_norm": 0.7597575954985475, "learning_rate": 2.938508339738683e-06, "loss": 0.816, "step": 1499 }, { "epoch": 3.86765655261459, "grad_norm": 0.7717927570202332, "learning_rate": 2.9257430147765096e-06, "loss": 0.8493, "step": 1500 }, { "epoch": 3.870238863783086, "grad_norm": 0.7649795668645112, "learning_rate": 2.913000724533277e-06, "loss": 0.7985, "step": 1501 }, { "epoch": 3.8728211749515817, "grad_norm": 0.7542531439383557, "learning_rate": 2.900281510499575e-06, "loss": 0.8093, "step": 1502 }, { "epoch": 3.8754034861200775, "grad_norm": 0.7519132753628803, "learning_rate": 2.8875854140908544e-06, "loss": 0.8137, "step": 1503 }, { "epoch": 3.877985797288573, "grad_norm": 0.7514626395050483, "learning_rate": 2.8749124766472858e-06, "loss": 0.8094, "step": 1504 }, { "epoch": 3.8805681084570693, "grad_norm": 0.7683645531072449, "learning_rate": 2.862262739433631e-06, "loss": 0.8132, "step": 1505 }, { "epoch": 3.883150419625565, "grad_norm": 0.7458416884658178, "learning_rate": 2.8496362436391157e-06, "loss": 0.8168, "step": 1506 }, { "epoch": 3.885732730794061, "grad_norm": 0.7746722925229123, "learning_rate": 2.8370330303772874e-06, "loss": 0.7996, "step": 1507 }, { "epoch": 3.8883150419625565, "grad_norm": 0.7553669840963009, "learning_rate": 2.8244531406858765e-06, "loss": 0.8288, "step": 1508 }, { "epoch": 3.8908973531310522, "grad_norm": 0.7573845094295684, "learning_rate": 2.81189661552667e-06, "loss": 0.8374, "step": 1509 }, { "epoch": 3.893479664299548, "grad_norm": 0.7400418799985741, "learning_rate": 2.7993634957853843e-06, "loss": 0.8375, "step": 1510 }, { "epoch": 3.8960619754680437, "grad_norm": 0.7719250258407471, "learning_rate": 2.7868538222715134e-06, "loss": 0.826, "step": 1511 }, { "epoch": 3.89864428663654, "grad_norm": 0.7563300858230262, "learning_rate": 2.774367635718217e-06, "loss": 0.7974, "step": 1512 }, { "epoch": 3.9012265978050356, "grad_norm": 0.7458708691408243, "learning_rate": 2.761904976782177e-06, "loss": 0.8012, "step": 1513 }, { "epoch": 3.9038089089735313, "grad_norm": 0.7442133907212378, "learning_rate": 2.749465886043462e-06, "loss": 0.8129, "step": 1514 }, { "epoch": 3.906391220142027, "grad_norm": 0.7606057865370476, "learning_rate": 2.7370504040053957e-06, "loss": 0.7908, "step": 1515 }, { "epoch": 3.908973531310523, "grad_norm": 0.7415475080933377, "learning_rate": 2.7246585710944383e-06, "loss": 0.8383, "step": 1516 }, { "epoch": 3.911555842479019, "grad_norm": 0.7747366485131825, "learning_rate": 2.7122904276600483e-06, "loss": 0.8299, "step": 1517 }, { "epoch": 3.9141381536475146, "grad_norm": 0.7470976085296888, "learning_rate": 2.699946013974527e-06, "loss": 0.8225, "step": 1518 }, { "epoch": 3.9167204648160103, "grad_norm": 0.759056171874551, "learning_rate": 2.68762537023293e-06, "loss": 0.8079, "step": 1519 }, { "epoch": 3.919302775984506, "grad_norm": 0.7626528987921768, "learning_rate": 2.6753285365529103e-06, "loss": 0.8272, "step": 1520 }, { "epoch": 3.9218850871530018, "grad_norm": 0.7651769802741644, "learning_rate": 2.6630555529745826e-06, "loss": 0.8338, "step": 1521 }, { "epoch": 3.9244673983214975, "grad_norm": 0.7774095564165737, "learning_rate": 2.6508064594604157e-06, "loss": 0.8203, "step": 1522 }, { "epoch": 3.9270497094899937, "grad_norm": 0.7651143778092684, "learning_rate": 2.638581295895075e-06, "loss": 0.849, "step": 1523 }, { "epoch": 3.9296320206584894, "grad_norm": 0.7903147596028517, "learning_rate": 2.626380102085322e-06, "loss": 0.8106, "step": 1524 }, { "epoch": 3.932214331826985, "grad_norm": 0.7530887651415382, "learning_rate": 2.614202917759855e-06, "loss": 0.8333, "step": 1525 }, { "epoch": 3.934796642995481, "grad_norm": 0.77568209115188, "learning_rate": 2.602049782569206e-06, "loss": 0.8137, "step": 1526 }, { "epoch": 3.937378954163977, "grad_norm": 0.7855983310276548, "learning_rate": 2.5899207360855984e-06, "loss": 0.7917, "step": 1527 }, { "epoch": 3.9399612653324727, "grad_norm": 0.7531574331667908, "learning_rate": 2.5778158178028045e-06, "loss": 0.8178, "step": 1528 }, { "epoch": 3.9425435765009684, "grad_norm": 0.7469480826297799, "learning_rate": 2.5657350671360514e-06, "loss": 0.844, "step": 1529 }, { "epoch": 3.945125887669464, "grad_norm": 0.7762679927670401, "learning_rate": 2.5536785234218664e-06, "loss": 0.8234, "step": 1530 }, { "epoch": 3.94770819883796, "grad_norm": 0.7554503148115285, "learning_rate": 2.541646225917954e-06, "loss": 0.8214, "step": 1531 }, { "epoch": 3.9502905100064556, "grad_norm": 0.7509360975231941, "learning_rate": 2.529638213803065e-06, "loss": 0.8096, "step": 1532 }, { "epoch": 3.9528728211749513, "grad_norm": 0.71654004355707, "learning_rate": 2.5176545261768847e-06, "loss": 0.8168, "step": 1533 }, { "epoch": 3.9554551323434475, "grad_norm": 0.7790891430190677, "learning_rate": 2.5056952020598913e-06, "loss": 0.8014, "step": 1534 }, { "epoch": 3.958037443511943, "grad_norm": 0.7484517687807049, "learning_rate": 2.4937602803932237e-06, "loss": 0.8326, "step": 1535 }, { "epoch": 3.960619754680439, "grad_norm": 0.7746336735865199, "learning_rate": 2.481849800038577e-06, "loss": 0.8329, "step": 1536 }, { "epoch": 3.9632020658489346, "grad_norm": 0.7508850795911187, "learning_rate": 2.4699637997780503e-06, "loss": 0.8104, "step": 1537 }, { "epoch": 3.965784377017431, "grad_norm": 0.7624683021848369, "learning_rate": 2.458102318314034e-06, "loss": 0.8195, "step": 1538 }, { "epoch": 3.9683666881859265, "grad_norm": 0.7576649154014872, "learning_rate": 2.4462653942690895e-06, "loss": 0.8154, "step": 1539 }, { "epoch": 3.9709489993544222, "grad_norm": 0.7548460407193209, "learning_rate": 2.4344530661858123e-06, "loss": 0.8193, "step": 1540 }, { "epoch": 3.973531310522918, "grad_norm": 0.7602374654670865, "learning_rate": 2.422665372526708e-06, "loss": 0.8203, "step": 1541 }, { "epoch": 3.9761136216914137, "grad_norm": 0.7675841326705145, "learning_rate": 2.410902351674066e-06, "loss": 0.8207, "step": 1542 }, { "epoch": 3.9786959328599094, "grad_norm": 0.7514141818157982, "learning_rate": 2.399164041929846e-06, "loss": 0.7885, "step": 1543 }, { "epoch": 3.9812782440284056, "grad_norm": 0.7642082673174962, "learning_rate": 2.387450481515543e-06, "loss": 0.799, "step": 1544 }, { "epoch": 3.9838605551969013, "grad_norm": 0.7494471147548254, "learning_rate": 2.3757617085720617e-06, "loss": 0.8128, "step": 1545 }, { "epoch": 3.986442866365397, "grad_norm": 0.7566122069144386, "learning_rate": 2.364097761159594e-06, "loss": 0.8212, "step": 1546 }, { "epoch": 3.9890251775338927, "grad_norm": 0.7695050070788328, "learning_rate": 2.3524586772575055e-06, "loss": 0.8265, "step": 1547 }, { "epoch": 3.991607488702389, "grad_norm": 0.767828875724427, "learning_rate": 2.3408444947641897e-06, "loss": 0.8107, "step": 1548 }, { "epoch": 3.9941897998708846, "grad_norm": 0.7241738464533221, "learning_rate": 2.3292552514969723e-06, "loss": 0.8248, "step": 1549 }, { "epoch": 3.9967721110393803, "grad_norm": 0.7625460770903586, "learning_rate": 2.3176909851919593e-06, "loss": 0.8179, "step": 1550 }, { "epoch": 3.999354422207876, "grad_norm": 0.741225630820047, "learning_rate": 2.306151733503943e-06, "loss": 0.7945, "step": 1551 }, { "epoch": 4.0, "grad_norm": 1.6164851247119985, "learning_rate": 2.294637534006251e-06, "loss": 0.789, "step": 1552 }, { "epoch": 4.002582311168496, "grad_norm": 1.424317513923708, "learning_rate": 2.2831484241906456e-06, "loss": 0.7301, "step": 1553 }, { "epoch": 4.005164622336991, "grad_norm": 1.3624952198303995, "learning_rate": 2.271684441467198e-06, "loss": 0.7151, "step": 1554 }, { "epoch": 4.007746933505487, "grad_norm": 1.2756030345247056, "learning_rate": 2.2602456231641457e-06, "loss": 0.73, "step": 1555 }, { "epoch": 4.010329244673983, "grad_norm": 1.0331700783629776, "learning_rate": 2.2488320065278034e-06, "loss": 0.6833, "step": 1556 }, { "epoch": 4.012911555842479, "grad_norm": 0.9404904636726831, "learning_rate": 2.2374436287224245e-06, "loss": 0.728, "step": 1557 }, { "epoch": 4.015493867010975, "grad_norm": 0.9727430065578684, "learning_rate": 2.22608052683007e-06, "loss": 0.7489, "step": 1558 }, { "epoch": 4.018076178179471, "grad_norm": 1.098170239940058, "learning_rate": 2.214742737850514e-06, "loss": 0.7356, "step": 1559 }, { "epoch": 4.020658489347967, "grad_norm": 1.2733638094374413, "learning_rate": 2.2034302987010938e-06, "loss": 0.7244, "step": 1560 }, { "epoch": 4.023240800516462, "grad_norm": 1.2876487754543966, "learning_rate": 2.192143246216618e-06, "loss": 0.71, "step": 1561 }, { "epoch": 4.025823111684958, "grad_norm": 1.2629073802929212, "learning_rate": 2.180881617149221e-06, "loss": 0.7205, "step": 1562 }, { "epoch": 4.028405422853454, "grad_norm": 1.179837995515697, "learning_rate": 2.169645448168265e-06, "loss": 0.7329, "step": 1563 }, { "epoch": 4.0309877340219495, "grad_norm": 1.0543627752747324, "learning_rate": 2.158434775860205e-06, "loss": 0.7173, "step": 1564 }, { "epoch": 4.033570045190445, "grad_norm": 0.9861615682326618, "learning_rate": 2.1472496367284746e-06, "loss": 0.7369, "step": 1565 }, { "epoch": 4.036152356358941, "grad_norm": 1.0060005644443055, "learning_rate": 2.1360900671933703e-06, "loss": 0.7039, "step": 1566 }, { "epoch": 4.038734667527437, "grad_norm": 0.949784297792835, "learning_rate": 2.1249561035919364e-06, "loss": 0.7236, "step": 1567 }, { "epoch": 4.041316978695932, "grad_norm": 0.9737652328085534, "learning_rate": 2.113847782177829e-06, "loss": 0.7088, "step": 1568 }, { "epoch": 4.043899289864429, "grad_norm": 0.9725504207142661, "learning_rate": 2.1027651391212158e-06, "loss": 0.7366, "step": 1569 }, { "epoch": 4.046481601032925, "grad_norm": 0.9215401564734376, "learning_rate": 2.091708210508654e-06, "loss": 0.7031, "step": 1570 }, { "epoch": 4.0490639122014205, "grad_norm": 0.902423487206708, "learning_rate": 2.0806770323429725e-06, "loss": 0.7369, "step": 1571 }, { "epoch": 4.051646223369916, "grad_norm": 0.9424761496048374, "learning_rate": 2.069671640543147e-06, "loss": 0.7624, "step": 1572 }, { "epoch": 4.054228534538412, "grad_norm": 0.9147025826090719, "learning_rate": 2.0586920709441916e-06, "loss": 0.719, "step": 1573 }, { "epoch": 4.056810845706908, "grad_norm": 0.8911633869358179, "learning_rate": 2.0477383592970445e-06, "loss": 0.6934, "step": 1574 }, { "epoch": 4.059393156875403, "grad_norm": 1.0041409401766892, "learning_rate": 2.0368105412684393e-06, "loss": 0.7207, "step": 1575 }, { "epoch": 4.061975468043899, "grad_norm": 1.0810423533153977, "learning_rate": 2.0259086524408036e-06, "loss": 0.7488, "step": 1576 }, { "epoch": 4.064557779212395, "grad_norm": 1.081615913030172, "learning_rate": 2.015032728312134e-06, "loss": 0.7308, "step": 1577 }, { "epoch": 4.0671400903808905, "grad_norm": 0.9264001598492572, "learning_rate": 2.0041828042958823e-06, "loss": 0.7099, "step": 1578 }, { "epoch": 4.069722401549387, "grad_norm": 0.9108042452749365, "learning_rate": 1.9933589157208356e-06, "loss": 0.706, "step": 1579 }, { "epoch": 4.072304712717883, "grad_norm": 0.8937819117625528, "learning_rate": 1.9825610978310127e-06, "loss": 0.7104, "step": 1580 }, { "epoch": 4.074887023886379, "grad_norm": 0.8825092444237356, "learning_rate": 1.9717893857855475e-06, "loss": 0.7053, "step": 1581 }, { "epoch": 4.077469335054874, "grad_norm": 0.8817000599546978, "learning_rate": 1.961043814658552e-06, "loss": 0.7098, "step": 1582 }, { "epoch": 4.08005164622337, "grad_norm": 0.9005717565381924, "learning_rate": 1.950324419439035e-06, "loss": 0.6968, "step": 1583 }, { "epoch": 4.082633957391866, "grad_norm": 0.9122929744904504, "learning_rate": 1.9396312350307722e-06, "loss": 0.7119, "step": 1584 }, { "epoch": 4.0852162685603615, "grad_norm": 0.8952175519583466, "learning_rate": 1.9289642962521847e-06, "loss": 0.7177, "step": 1585 }, { "epoch": 4.087798579728857, "grad_norm": 0.8962217979338948, "learning_rate": 1.918323637836247e-06, "loss": 0.7047, "step": 1586 }, { "epoch": 4.090380890897353, "grad_norm": 0.898920603548723, "learning_rate": 1.9077092944303453e-06, "loss": 0.7328, "step": 1587 }, { "epoch": 4.092963202065849, "grad_norm": 0.9407672412231067, "learning_rate": 1.8971213005961985e-06, "loss": 0.7244, "step": 1588 }, { "epoch": 4.095545513234344, "grad_norm": 0.9175278092820267, "learning_rate": 1.8865596908097105e-06, "loss": 0.7076, "step": 1589 }, { "epoch": 4.098127824402841, "grad_norm": 0.9219067326264855, "learning_rate": 1.8760244994608911e-06, "loss": 0.7205, "step": 1590 }, { "epoch": 4.100710135571337, "grad_norm": 0.9313883405997428, "learning_rate": 1.8655157608537156e-06, "loss": 0.7329, "step": 1591 }, { "epoch": 4.103292446739832, "grad_norm": 0.9069473980493022, "learning_rate": 1.855033509206029e-06, "loss": 0.7058, "step": 1592 }, { "epoch": 4.105874757908328, "grad_norm": 0.9042132782800456, "learning_rate": 1.8445777786494356e-06, "loss": 0.722, "step": 1593 }, { "epoch": 4.108457069076824, "grad_norm": 0.8984738476555618, "learning_rate": 1.8341486032291834e-06, "loss": 0.6965, "step": 1594 }, { "epoch": 4.11103938024532, "grad_norm": 0.9033341125566603, "learning_rate": 1.823746016904049e-06, "loss": 0.7043, "step": 1595 }, { "epoch": 4.113621691413815, "grad_norm": 0.9024091267438483, "learning_rate": 1.8133700535462274e-06, "loss": 0.7181, "step": 1596 }, { "epoch": 4.116204002582311, "grad_norm": 0.8840547702331727, "learning_rate": 1.8030207469412374e-06, "loss": 0.7137, "step": 1597 }, { "epoch": 4.118786313750807, "grad_norm": 0.8791770361734527, "learning_rate": 1.7926981307877944e-06, "loss": 0.707, "step": 1598 }, { "epoch": 4.1213686249193024, "grad_norm": 0.8765500683094899, "learning_rate": 1.7824022386977014e-06, "loss": 0.7332, "step": 1599 }, { "epoch": 4.123950936087798, "grad_norm": 0.8727571261897714, "learning_rate": 1.7721331041957535e-06, "loss": 0.7026, "step": 1600 }, { "epoch": 4.126533247256295, "grad_norm": 0.8893215641197096, "learning_rate": 1.7618907607196112e-06, "loss": 0.699, "step": 1601 }, { "epoch": 4.1291155584247905, "grad_norm": 0.8809282308536279, "learning_rate": 1.7516752416197013e-06, "loss": 0.6937, "step": 1602 }, { "epoch": 4.131697869593286, "grad_norm": 0.9065700149312429, "learning_rate": 1.741486580159112e-06, "loss": 0.7156, "step": 1603 }, { "epoch": 4.134280180761782, "grad_norm": 0.9172825352706213, "learning_rate": 1.7313248095134772e-06, "loss": 0.7224, "step": 1604 }, { "epoch": 4.136862491930278, "grad_norm": 0.9359192197464888, "learning_rate": 1.7211899627708694e-06, "loss": 0.7159, "step": 1605 }, { "epoch": 4.139444803098773, "grad_norm": 0.8923723735789315, "learning_rate": 1.711082072931689e-06, "loss": 0.7144, "step": 1606 }, { "epoch": 4.142027114267269, "grad_norm": 0.8698447798036731, "learning_rate": 1.7010011729085696e-06, "loss": 0.7183, "step": 1607 }, { "epoch": 4.144609425435765, "grad_norm": 0.9184478130391627, "learning_rate": 1.6909472955262596e-06, "loss": 0.7542, "step": 1608 }, { "epoch": 4.1471917366042605, "grad_norm": 0.8850461139260019, "learning_rate": 1.6809204735215179e-06, "loss": 0.7186, "step": 1609 }, { "epoch": 4.149774047772756, "grad_norm": 0.8889238278435273, "learning_rate": 1.6709207395430005e-06, "loss": 0.7405, "step": 1610 }, { "epoch": 4.152356358941253, "grad_norm": 0.9208003199238048, "learning_rate": 1.660948126151175e-06, "loss": 0.7124, "step": 1611 }, { "epoch": 4.154938670109749, "grad_norm": 0.8762844910372851, "learning_rate": 1.6510026658181866e-06, "loss": 0.7292, "step": 1612 }, { "epoch": 4.157520981278244, "grad_norm": 0.8940625291851263, "learning_rate": 1.6410843909277784e-06, "loss": 0.7186, "step": 1613 }, { "epoch": 4.16010329244674, "grad_norm": 0.8584435328232947, "learning_rate": 1.6311933337751652e-06, "loss": 0.7018, "step": 1614 }, { "epoch": 4.162685603615236, "grad_norm": 0.8889295547847345, "learning_rate": 1.6213295265669448e-06, "loss": 0.713, "step": 1615 }, { "epoch": 4.1652679147837315, "grad_norm": 0.8961567659168423, "learning_rate": 1.6114930014209763e-06, "loss": 0.716, "step": 1616 }, { "epoch": 4.167850225952227, "grad_norm": 0.8623292807248303, "learning_rate": 1.601683790366293e-06, "loss": 0.7409, "step": 1617 }, { "epoch": 4.170432537120723, "grad_norm": 0.9014567493180559, "learning_rate": 1.5919019253429923e-06, "loss": 0.7147, "step": 1618 }, { "epoch": 4.173014848289219, "grad_norm": 0.8966269088663105, "learning_rate": 1.5821474382021128e-06, "loss": 0.7202, "step": 1619 }, { "epoch": 4.175597159457714, "grad_norm": 0.9039554140412117, "learning_rate": 1.5724203607055655e-06, "loss": 0.7208, "step": 1620 }, { "epoch": 4.17817947062621, "grad_norm": 0.9117935626371781, "learning_rate": 1.5627207245260046e-06, "loss": 0.7252, "step": 1621 }, { "epoch": 4.180761781794706, "grad_norm": 0.8838537392487884, "learning_rate": 1.5530485612467317e-06, "loss": 0.7143, "step": 1622 }, { "epoch": 4.183344092963202, "grad_norm": 0.8872948835258441, "learning_rate": 1.54340390236159e-06, "loss": 0.6962, "step": 1623 }, { "epoch": 4.185926404131698, "grad_norm": 0.8960284062739021, "learning_rate": 1.5337867792748694e-06, "loss": 0.7195, "step": 1624 }, { "epoch": 4.188508715300194, "grad_norm": 0.8995137169581848, "learning_rate": 1.5241972233012015e-06, "loss": 0.6987, "step": 1625 }, { "epoch": 4.19109102646869, "grad_norm": 0.8725877233542136, "learning_rate": 1.5146352656654473e-06, "loss": 0.6822, "step": 1626 }, { "epoch": 4.193673337637185, "grad_norm": 0.9022708955863336, "learning_rate": 1.5051009375026127e-06, "loss": 0.7124, "step": 1627 }, { "epoch": 4.196255648805681, "grad_norm": 0.894833962035567, "learning_rate": 1.4955942698577341e-06, "loss": 0.7362, "step": 1628 }, { "epoch": 4.198837959974177, "grad_norm": 0.9297395231684169, "learning_rate": 1.4861152936857792e-06, "loss": 0.7272, "step": 1629 }, { "epoch": 4.2014202711426725, "grad_norm": 0.9212227444100315, "learning_rate": 1.476664039851554e-06, "loss": 0.7345, "step": 1630 }, { "epoch": 4.204002582311168, "grad_norm": 0.9040538736898476, "learning_rate": 1.4672405391295964e-06, "loss": 0.7202, "step": 1631 }, { "epoch": 4.206584893479664, "grad_norm": 0.90954184793202, "learning_rate": 1.4578448222040708e-06, "loss": 0.7144, "step": 1632 }, { "epoch": 4.2091672046481605, "grad_norm": 0.8765308727348899, "learning_rate": 1.4484769196686777e-06, "loss": 0.6932, "step": 1633 }, { "epoch": 4.211749515816656, "grad_norm": 0.8913725709084924, "learning_rate": 1.4391368620265522e-06, "loss": 0.6839, "step": 1634 }, { "epoch": 4.214331826985152, "grad_norm": 0.8949496649062495, "learning_rate": 1.4298246796901615e-06, "loss": 0.7081, "step": 1635 }, { "epoch": 4.216914138153648, "grad_norm": 0.8890323192323862, "learning_rate": 1.4205404029812043e-06, "loss": 0.7148, "step": 1636 }, { "epoch": 4.219496449322143, "grad_norm": 0.8898844083009926, "learning_rate": 1.4112840621305156e-06, "loss": 0.7055, "step": 1637 }, { "epoch": 4.222078760490639, "grad_norm": 0.8973037287194257, "learning_rate": 1.4020556872779723e-06, "loss": 0.7001, "step": 1638 }, { "epoch": 4.224661071659135, "grad_norm": 0.8881050754779889, "learning_rate": 1.3928553084723828e-06, "loss": 0.7029, "step": 1639 }, { "epoch": 4.227243382827631, "grad_norm": 0.9282818296375939, "learning_rate": 1.3836829556714027e-06, "loss": 0.7436, "step": 1640 }, { "epoch": 4.229825693996126, "grad_norm": 0.896945078296962, "learning_rate": 1.3745386587414312e-06, "loss": 0.7051, "step": 1641 }, { "epoch": 4.232408005164622, "grad_norm": 0.8904775792068886, "learning_rate": 1.3654224474575105e-06, "loss": 0.75, "step": 1642 }, { "epoch": 4.234990316333118, "grad_norm": 0.9051407253126452, "learning_rate": 1.3563343515032312e-06, "loss": 0.7122, "step": 1643 }, { "epoch": 4.237572627501614, "grad_norm": 0.875475228439986, "learning_rate": 1.3472744004706406e-06, "loss": 0.7138, "step": 1644 }, { "epoch": 4.24015493867011, "grad_norm": 0.9170090364957498, "learning_rate": 1.3382426238601443e-06, "loss": 0.7209, "step": 1645 }, { "epoch": 4.242737249838606, "grad_norm": 0.8963557044070765, "learning_rate": 1.3292390510803987e-06, "loss": 0.7207, "step": 1646 }, { "epoch": 4.2453195610071015, "grad_norm": 0.8870017097792205, "learning_rate": 1.320263711448232e-06, "loss": 0.7344, "step": 1647 }, { "epoch": 4.247901872175597, "grad_norm": 0.8643621623469757, "learning_rate": 1.3113166341885453e-06, "loss": 0.6909, "step": 1648 }, { "epoch": 4.250484183344093, "grad_norm": 0.88693541516387, "learning_rate": 1.3023978484342027e-06, "loss": 0.7172, "step": 1649 }, { "epoch": 4.253066494512589, "grad_norm": 0.8827410284281785, "learning_rate": 1.293507383225958e-06, "loss": 0.6974, "step": 1650 }, { "epoch": 4.255648805681084, "grad_norm": 0.8921892267635948, "learning_rate": 1.2846452675123412e-06, "loss": 0.7198, "step": 1651 }, { "epoch": 4.25823111684958, "grad_norm": 0.8699520076203895, "learning_rate": 1.275811530149581e-06, "loss": 0.712, "step": 1652 }, { "epoch": 4.260813428018076, "grad_norm": 0.9053418709356683, "learning_rate": 1.2670061999014926e-06, "loss": 0.711, "step": 1653 }, { "epoch": 4.263395739186572, "grad_norm": 0.8933588527941319, "learning_rate": 1.2582293054394034e-06, "loss": 0.7191, "step": 1654 }, { "epoch": 4.265978050355068, "grad_norm": 0.8957606594695622, "learning_rate": 1.249480875342044e-06, "loss": 0.7013, "step": 1655 }, { "epoch": 4.268560361523564, "grad_norm": 0.8896835872883254, "learning_rate": 1.240760938095461e-06, "loss": 0.6909, "step": 1656 }, { "epoch": 4.27114267269206, "grad_norm": 0.902860652986643, "learning_rate": 1.232069522092929e-06, "loss": 0.7023, "step": 1657 }, { "epoch": 4.273724983860555, "grad_norm": 0.8991644670101375, "learning_rate": 1.2234066556348524e-06, "loss": 0.7201, "step": 1658 }, { "epoch": 4.276307295029051, "grad_norm": 0.871953669734789, "learning_rate": 1.2147723669286703e-06, "loss": 0.7135, "step": 1659 }, { "epoch": 4.278889606197547, "grad_norm": 0.8976349496395944, "learning_rate": 1.206166684088772e-06, "loss": 0.7142, "step": 1660 }, { "epoch": 4.2814719173660425, "grad_norm": 0.9150642282309124, "learning_rate": 1.1975896351364036e-06, "loss": 0.717, "step": 1661 }, { "epoch": 4.284054228534538, "grad_norm": 0.89830091523198, "learning_rate": 1.189041247999575e-06, "loss": 0.7038, "step": 1662 }, { "epoch": 4.286636539703034, "grad_norm": 0.9002768402003223, "learning_rate": 1.1805215505129653e-06, "loss": 0.7171, "step": 1663 }, { "epoch": 4.28921885087153, "grad_norm": 0.9189338614452367, "learning_rate": 1.1720305704178436e-06, "loss": 0.7198, "step": 1664 }, { "epoch": 4.291801162040025, "grad_norm": 0.9241169326437726, "learning_rate": 1.1635683353619643e-06, "loss": 0.733, "step": 1665 }, { "epoch": 4.294383473208522, "grad_norm": 0.9074601151507607, "learning_rate": 1.1551348728994849e-06, "loss": 0.7251, "step": 1666 }, { "epoch": 4.296965784377018, "grad_norm": 0.9201495613344312, "learning_rate": 1.1467302104908796e-06, "loss": 0.7261, "step": 1667 }, { "epoch": 4.299548095545513, "grad_norm": 0.8938379902505227, "learning_rate": 1.138354375502847e-06, "loss": 0.6994, "step": 1668 }, { "epoch": 4.302130406714009, "grad_norm": 0.9336658672822801, "learning_rate": 1.1300073952082147e-06, "loss": 0.7156, "step": 1669 }, { "epoch": 4.304712717882505, "grad_norm": 0.8831440608193399, "learning_rate": 1.121689296785854e-06, "loss": 0.6995, "step": 1670 }, { "epoch": 4.307295029051001, "grad_norm": 0.8799798301503751, "learning_rate": 1.1134001073206025e-06, "loss": 0.7193, "step": 1671 }, { "epoch": 4.309877340219496, "grad_norm": 0.8804550173412993, "learning_rate": 1.1051398538031544e-06, "loss": 0.7258, "step": 1672 }, { "epoch": 4.312459651387992, "grad_norm": 0.9036007303009939, "learning_rate": 1.0969085631299946e-06, "loss": 0.708, "step": 1673 }, { "epoch": 4.315041962556488, "grad_norm": 0.9024030379125595, "learning_rate": 1.0887062621032951e-06, "loss": 0.7055, "step": 1674 }, { "epoch": 4.3176242737249835, "grad_norm": 0.8894223361973113, "learning_rate": 1.0805329774308392e-06, "loss": 0.726, "step": 1675 }, { "epoch": 4.32020658489348, "grad_norm": 0.885938538418052, "learning_rate": 1.072388735725921e-06, "loss": 0.703, "step": 1676 }, { "epoch": 4.322788896061976, "grad_norm": 0.9305506401372774, "learning_rate": 1.0642735635072764e-06, "loss": 0.7266, "step": 1677 }, { "epoch": 4.3253712072304715, "grad_norm": 0.9113060756686157, "learning_rate": 1.0561874871989775e-06, "loss": 0.7206, "step": 1678 }, { "epoch": 4.327953518398967, "grad_norm": 0.889475937075751, "learning_rate": 1.0481305331303659e-06, "loss": 0.7145, "step": 1679 }, { "epoch": 4.330535829567463, "grad_norm": 0.9193184675001664, "learning_rate": 1.0401027275359487e-06, "loss": 0.7213, "step": 1680 }, { "epoch": 4.333118140735959, "grad_norm": 0.9057888219841576, "learning_rate": 1.0321040965553286e-06, "loss": 0.7142, "step": 1681 }, { "epoch": 4.335700451904454, "grad_norm": 0.9181347146447232, "learning_rate": 1.0241346662331075e-06, "loss": 0.7001, "step": 1682 }, { "epoch": 4.33828276307295, "grad_norm": 0.8761392579902068, "learning_rate": 1.0161944625188046e-06, "loss": 0.7144, "step": 1683 }, { "epoch": 4.340865074241446, "grad_norm": 0.8957507000212769, "learning_rate": 1.008283511266781e-06, "loss": 0.72, "step": 1684 }, { "epoch": 4.343447385409942, "grad_norm": 0.9048968155853336, "learning_rate": 1.0004018382361414e-06, "loss": 0.7154, "step": 1685 }, { "epoch": 4.346029696578437, "grad_norm": 0.9061516663607375, "learning_rate": 9.92549469090659e-07, "loss": 0.7138, "step": 1686 }, { "epoch": 4.348612007746934, "grad_norm": 0.907960683381014, "learning_rate": 9.847264293986869e-07, "loss": 0.6933, "step": 1687 }, { "epoch": 4.35119431891543, "grad_norm": 0.9335500032392648, "learning_rate": 9.769327446330802e-07, "loss": 0.7203, "step": 1688 }, { "epoch": 4.353776630083925, "grad_norm": 0.8979953039902708, "learning_rate": 9.691684401711143e-07, "loss": 0.7335, "step": 1689 }, { "epoch": 4.356358941252421, "grad_norm": 0.884673020616097, "learning_rate": 9.614335412943887e-07, "loss": 0.7141, "step": 1690 }, { "epoch": 4.358941252420917, "grad_norm": 0.8749160348750349, "learning_rate": 9.537280731887644e-07, "loss": 0.7038, "step": 1691 }, { "epoch": 4.3615235635894125, "grad_norm": 0.8709729980037048, "learning_rate": 9.460520609442647e-07, "loss": 0.6812, "step": 1692 }, { "epoch": 4.364105874757908, "grad_norm": 0.8750487222127739, "learning_rate": 9.384055295550032e-07, "loss": 0.69, "step": 1693 }, { "epoch": 4.366688185926404, "grad_norm": 0.8996853241438628, "learning_rate": 9.307885039191011e-07, "loss": 0.7232, "step": 1694 }, { "epoch": 4.3692704970949, "grad_norm": 0.908802912077574, "learning_rate": 9.232010088386067e-07, "loss": 0.7062, "step": 1695 }, { "epoch": 4.371852808263395, "grad_norm": 0.9168462712654659, "learning_rate": 9.156430690194074e-07, "loss": 0.7084, "step": 1696 }, { "epoch": 4.374435119431892, "grad_norm": 0.9259940687964294, "learning_rate": 9.081147090711562e-07, "loss": 0.742, "step": 1697 }, { "epoch": 4.377017430600388, "grad_norm": 0.9383290055652561, "learning_rate": 9.006159535071945e-07, "loss": 0.7364, "step": 1698 }, { "epoch": 4.3795997417688834, "grad_norm": 0.8900989908997234, "learning_rate": 8.93146826744462e-07, "loss": 0.6925, "step": 1699 }, { "epoch": 4.382182052937379, "grad_norm": 0.906245275227718, "learning_rate": 8.8570735310343e-07, "loss": 0.7152, "step": 1700 }, { "epoch": 4.384764364105875, "grad_norm": 0.8817115384410557, "learning_rate": 8.782975568080066e-07, "loss": 0.7119, "step": 1701 }, { "epoch": 4.387346675274371, "grad_norm": 0.8936803543044946, "learning_rate": 8.709174619854766e-07, "loss": 0.7221, "step": 1702 }, { "epoch": 4.389928986442866, "grad_norm": 0.9106707340778925, "learning_rate": 8.635670926664019e-07, "loss": 0.7159, "step": 1703 }, { "epoch": 4.392511297611362, "grad_norm": 0.8995573561800408, "learning_rate": 8.562464727845621e-07, "loss": 0.7232, "step": 1704 }, { "epoch": 4.395093608779858, "grad_norm": 0.9242674966931874, "learning_rate": 8.489556261768694e-07, "loss": 0.7511, "step": 1705 }, { "epoch": 4.3976759199483535, "grad_norm": 0.8973937221578497, "learning_rate": 8.41694576583284e-07, "loss": 0.7151, "step": 1706 }, { "epoch": 4.400258231116849, "grad_norm": 0.9016096213701568, "learning_rate": 8.344633476467456e-07, "loss": 0.7555, "step": 1707 }, { "epoch": 4.402840542285345, "grad_norm": 0.9070638086527365, "learning_rate": 8.272619629130984e-07, "loss": 0.7405, "step": 1708 }, { "epoch": 4.4054228534538415, "grad_norm": 0.8882275237620317, "learning_rate": 8.200904458310022e-07, "loss": 0.6947, "step": 1709 }, { "epoch": 4.408005164622337, "grad_norm": 0.8804791482936096, "learning_rate": 8.129488197518687e-07, "loss": 0.6977, "step": 1710 }, { "epoch": 4.410587475790833, "grad_norm": 0.9272005439255128, "learning_rate": 8.0583710792978e-07, "loss": 0.7132, "step": 1711 }, { "epoch": 4.413169786959329, "grad_norm": 0.8899287069527094, "learning_rate": 7.987553335214149e-07, "loss": 0.731, "step": 1712 }, { "epoch": 4.415752098127824, "grad_norm": 0.8808619191876175, "learning_rate": 7.917035195859668e-07, "loss": 0.7265, "step": 1713 }, { "epoch": 4.41833440929632, "grad_norm": 0.8969831722003705, "learning_rate": 7.846816890850806e-07, "loss": 0.7116, "step": 1714 }, { "epoch": 4.420916720464816, "grad_norm": 0.8924810647720324, "learning_rate": 7.776898648827647e-07, "loss": 0.7146, "step": 1715 }, { "epoch": 4.423499031633312, "grad_norm": 1.1018976996499141, "learning_rate": 7.707280697453256e-07, "loss": 0.6941, "step": 1716 }, { "epoch": 4.426081342801807, "grad_norm": 0.894570214234641, "learning_rate": 7.637963263412929e-07, "loss": 0.7145, "step": 1717 }, { "epoch": 4.428663653970303, "grad_norm": 0.907267880331535, "learning_rate": 7.568946572413438e-07, "loss": 0.7239, "step": 1718 }, { "epoch": 4.4312459651388, "grad_norm": 0.904439736544676, "learning_rate": 7.500230849182278e-07, "loss": 0.7148, "step": 1719 }, { "epoch": 4.433828276307295, "grad_norm": 0.9165453956707031, "learning_rate": 7.431816317466923e-07, "loss": 0.7276, "step": 1720 }, { "epoch": 4.436410587475791, "grad_norm": 0.8958325943917459, "learning_rate": 7.363703200034177e-07, "loss": 0.7121, "step": 1721 }, { "epoch": 4.438992898644287, "grad_norm": 0.9140196598832558, "learning_rate": 7.295891718669423e-07, "loss": 0.7331, "step": 1722 }, { "epoch": 4.4415752098127825, "grad_norm": 0.8872380269740475, "learning_rate": 7.228382094175801e-07, "loss": 0.7001, "step": 1723 }, { "epoch": 4.444157520981278, "grad_norm": 0.9270216958684496, "learning_rate": 7.161174546373595e-07, "loss": 0.7181, "step": 1724 }, { "epoch": 4.446739832149774, "grad_norm": 0.9132455796590981, "learning_rate": 7.094269294099509e-07, "loss": 0.731, "step": 1725 }, { "epoch": 4.44932214331827, "grad_norm": 0.9227817170397697, "learning_rate": 7.027666555205915e-07, "loss": 0.7337, "step": 1726 }, { "epoch": 4.451904454486765, "grad_norm": 0.9210558504510359, "learning_rate": 6.961366546560156e-07, "loss": 0.7291, "step": 1727 }, { "epoch": 4.454486765655261, "grad_norm": 0.9075281818010581, "learning_rate": 6.895369484043879e-07, "loss": 0.7321, "step": 1728 }, { "epoch": 4.457069076823757, "grad_norm": 1.0617614842864411, "learning_rate": 6.829675582552253e-07, "loss": 0.6943, "step": 1729 }, { "epoch": 4.4596513879922535, "grad_norm": 0.9007403795148768, "learning_rate": 6.764285055993313e-07, "loss": 0.7094, "step": 1730 }, { "epoch": 4.462233699160749, "grad_norm": 0.9117234111423507, "learning_rate": 6.699198117287309e-07, "loss": 0.7385, "step": 1731 }, { "epoch": 4.464816010329245, "grad_norm": 0.8912704657574764, "learning_rate": 6.634414978365978e-07, "loss": 0.7145, "step": 1732 }, { "epoch": 4.467398321497741, "grad_norm": 0.9112230980418938, "learning_rate": 6.569935850171749e-07, "loss": 0.7199, "step": 1733 }, { "epoch": 4.469980632666236, "grad_norm": 0.9068319118246875, "learning_rate": 6.505760942657235e-07, "loss": 0.728, "step": 1734 }, { "epoch": 4.472562943834732, "grad_norm": 0.8698085381695577, "learning_rate": 6.441890464784473e-07, "loss": 0.6873, "step": 1735 }, { "epoch": 4.475145255003228, "grad_norm": 0.8820814075457806, "learning_rate": 6.37832462452418e-07, "loss": 0.7087, "step": 1736 }, { "epoch": 4.4777275661717235, "grad_norm": 0.9076300266568639, "learning_rate": 6.315063628855178e-07, "loss": 0.7207, "step": 1737 }, { "epoch": 4.480309877340219, "grad_norm": 0.8761294190152139, "learning_rate": 6.252107683763642e-07, "loss": 0.7028, "step": 1738 }, { "epoch": 4.482892188508715, "grad_norm": 0.9023071618115094, "learning_rate": 6.189456994242516e-07, "loss": 0.7548, "step": 1739 }, { "epoch": 4.485474499677212, "grad_norm": 0.9032118146111997, "learning_rate": 6.127111764290694e-07, "loss": 0.7198, "step": 1740 }, { "epoch": 4.488056810845707, "grad_norm": 0.9423906564703732, "learning_rate": 6.065072196912569e-07, "loss": 0.7192, "step": 1741 }, { "epoch": 4.490639122014203, "grad_norm": 0.8871664697024927, "learning_rate": 6.003338494117183e-07, "loss": 0.7261, "step": 1742 }, { "epoch": 4.493221433182699, "grad_norm": 0.8723986193214446, "learning_rate": 5.941910856917643e-07, "loss": 0.6919, "step": 1743 }, { "epoch": 4.4958037443511945, "grad_norm": 0.9160786881274069, "learning_rate": 5.880789485330484e-07, "loss": 0.7184, "step": 1744 }, { "epoch": 4.49838605551969, "grad_norm": 0.8749173043898083, "learning_rate": 5.81997457837502e-07, "loss": 0.7038, "step": 1745 }, { "epoch": 4.500968366688186, "grad_norm": 0.8836993274435628, "learning_rate": 5.75946633407265e-07, "loss": 0.7058, "step": 1746 }, { "epoch": 4.503550677856682, "grad_norm": 0.9168059326784899, "learning_rate": 5.699264949446215e-07, "loss": 0.7576, "step": 1747 }, { "epoch": 4.506132989025177, "grad_norm": 0.916608005705834, "learning_rate": 5.639370620519424e-07, "loss": 0.7176, "step": 1748 }, { "epoch": 4.508715300193673, "grad_norm": 0.8792129119048742, "learning_rate": 5.579783542316175e-07, "loss": 0.7004, "step": 1749 }, { "epoch": 4.511297611362169, "grad_norm": 0.9102642584599099, "learning_rate": 5.520503908859876e-07, "loss": 0.7296, "step": 1750 }, { "epoch": 4.5138799225306645, "grad_norm": 0.8976181656024875, "learning_rate": 5.461531913172869e-07, "loss": 0.7137, "step": 1751 }, { "epoch": 4.516462233699161, "grad_norm": 0.8955819573412418, "learning_rate": 5.40286774727582e-07, "loss": 0.7243, "step": 1752 }, { "epoch": 4.519044544867657, "grad_norm": 0.8848415473903943, "learning_rate": 5.344511602186986e-07, "loss": 0.6937, "step": 1753 }, { "epoch": 4.5216268560361526, "grad_norm": 0.8935210617924505, "learning_rate": 5.28646366792176e-07, "loss": 0.7115, "step": 1754 }, { "epoch": 4.524209167204648, "grad_norm": 0.9277553981673546, "learning_rate": 5.228724133491903e-07, "loss": 0.7464, "step": 1755 }, { "epoch": 4.526791478373144, "grad_norm": 0.8697756183845172, "learning_rate": 5.171293186904991e-07, "loss": 0.6713, "step": 1756 }, { "epoch": 4.52937378954164, "grad_norm": 0.8699331724645898, "learning_rate": 5.114171015163793e-07, "loss": 0.6981, "step": 1757 }, { "epoch": 4.531956100710135, "grad_norm": 0.8867759757326906, "learning_rate": 5.057357804265695e-07, "loss": 0.713, "step": 1758 }, { "epoch": 4.534538411878631, "grad_norm": 0.8965559347020167, "learning_rate": 5.000853739202039e-07, "loss": 0.7084, "step": 1759 }, { "epoch": 4.537120723047127, "grad_norm": 0.9176654044616335, "learning_rate": 4.944659003957564e-07, "loss": 0.7214, "step": 1760 }, { "epoch": 4.539703034215623, "grad_norm": 0.9221143520181255, "learning_rate": 4.888773781509748e-07, "loss": 0.737, "step": 1761 }, { "epoch": 4.542285345384119, "grad_norm": 0.9322259452120046, "learning_rate": 4.833198253828331e-07, "loss": 0.7416, "step": 1762 }, { "epoch": 4.544867656552615, "grad_norm": 0.9208718551935449, "learning_rate": 4.777932601874557e-07, "loss": 0.7487, "step": 1763 }, { "epoch": 4.547449967721111, "grad_norm": 0.8890662647225912, "learning_rate": 4.7229770056007707e-07, "loss": 0.6894, "step": 1764 }, { "epoch": 4.550032278889606, "grad_norm": 0.887652550850276, "learning_rate": 4.66833164394962e-07, "loss": 0.7031, "step": 1765 }, { "epoch": 4.552614590058102, "grad_norm": 0.8951195625274412, "learning_rate": 4.6139966948537064e-07, "loss": 0.7419, "step": 1766 }, { "epoch": 4.555196901226598, "grad_norm": 0.9259533456732394, "learning_rate": 4.5599723352347857e-07, "loss": 0.6975, "step": 1767 }, { "epoch": 4.5577792123950935, "grad_norm": 0.9119373622565761, "learning_rate": 4.5062587410033663e-07, "loss": 0.727, "step": 1768 }, { "epoch": 4.560361523563589, "grad_norm": 0.8749876771396683, "learning_rate": 4.452856087058044e-07, "loss": 0.6747, "step": 1769 }, { "epoch": 4.562943834732085, "grad_norm": 0.8781088555589601, "learning_rate": 4.3997645472849016e-07, "loss": 0.7024, "step": 1770 }, { "epoch": 4.565526145900581, "grad_norm": 0.8811255741349678, "learning_rate": 4.346984294557055e-07, "loss": 0.7078, "step": 1771 }, { "epoch": 4.568108457069076, "grad_norm": 0.9013328171452745, "learning_rate": 4.29451550073402e-07, "loss": 0.7471, "step": 1772 }, { "epoch": 4.570690768237572, "grad_norm": 0.9233092729022429, "learning_rate": 4.2423583366611345e-07, "loss": 0.7443, "step": 1773 }, { "epoch": 4.573273079406069, "grad_norm": 0.8783319514354955, "learning_rate": 4.190512972169036e-07, "loss": 0.7247, "step": 1774 }, { "epoch": 4.5758553905745645, "grad_norm": 0.8856387585636677, "learning_rate": 4.13897957607311e-07, "loss": 0.701, "step": 1775 }, { "epoch": 4.57843770174306, "grad_norm": 0.9032296044304654, "learning_rate": 4.0877583161729406e-07, "loss": 0.7, "step": 1776 }, { "epoch": 4.581020012911556, "grad_norm": 0.8704929901157932, "learning_rate": 4.036849359251738e-07, "loss": 0.7071, "step": 1777 }, { "epoch": 4.583602324080052, "grad_norm": 0.9068350583367365, "learning_rate": 3.986252871075813e-07, "loss": 0.6992, "step": 1778 }, { "epoch": 4.586184635248547, "grad_norm": 0.874153868567516, "learning_rate": 3.935969016394048e-07, "loss": 0.708, "step": 1779 }, { "epoch": 4.588766946417043, "grad_norm": 0.8976406116845383, "learning_rate": 3.8859979589373265e-07, "loss": 0.7182, "step": 1780 }, { "epoch": 4.591349257585539, "grad_norm": 0.8973977227814353, "learning_rate": 3.836339861418059e-07, "loss": 0.6996, "step": 1781 }, { "epoch": 4.5939315687540345, "grad_norm": 0.9057741077447837, "learning_rate": 3.786994885529582e-07, "loss": 0.707, "step": 1782 }, { "epoch": 4.596513879922531, "grad_norm": 0.9248023645258433, "learning_rate": 3.7379631919457036e-07, "loss": 0.7433, "step": 1783 }, { "epoch": 4.599096191091027, "grad_norm": 0.8815428248966636, "learning_rate": 3.6892449403200805e-07, "loss": 0.7049, "step": 1784 }, { "epoch": 4.601678502259523, "grad_norm": 0.8865336637004515, "learning_rate": 3.6408402892858297e-07, "loss": 0.7074, "step": 1785 }, { "epoch": 4.604260813428018, "grad_norm": 0.8908085795505662, "learning_rate": 3.592749396454931e-07, "loss": 0.7158, "step": 1786 }, { "epoch": 4.606843124596514, "grad_norm": 0.895409574949093, "learning_rate": 3.5449724184176695e-07, "loss": 0.7006, "step": 1787 }, { "epoch": 4.60942543576501, "grad_norm": 0.8953545545983936, "learning_rate": 3.4975095107422473e-07, "loss": 0.7043, "step": 1788 }, { "epoch": 4.6120077469335055, "grad_norm": 0.9024082202649676, "learning_rate": 3.450360827974175e-07, "loss": 0.7188, "step": 1789 }, { "epoch": 4.614590058102001, "grad_norm": 0.8901535896655877, "learning_rate": 3.403526523635825e-07, "loss": 0.7044, "step": 1790 }, { "epoch": 4.617172369270497, "grad_norm": 0.8661077661763952, "learning_rate": 3.3570067502258887e-07, "loss": 0.6895, "step": 1791 }, { "epoch": 4.619754680438993, "grad_norm": 0.8685402290165503, "learning_rate": 3.310801659218943e-07, "loss": 0.7115, "step": 1792 }, { "epoch": 4.622336991607488, "grad_norm": 0.8964236761531204, "learning_rate": 3.264911401064874e-07, "loss": 0.7268, "step": 1793 }, { "epoch": 4.624919302775984, "grad_norm": 0.8907216569293869, "learning_rate": 3.219336125188455e-07, "loss": 0.7009, "step": 1794 }, { "epoch": 4.627501613944481, "grad_norm": 0.8752296130228919, "learning_rate": 3.174075979988811e-07, "loss": 0.7155, "step": 1795 }, { "epoch": 4.630083925112976, "grad_norm": 0.8970700959270853, "learning_rate": 3.1291311128390233e-07, "loss": 0.7261, "step": 1796 }, { "epoch": 4.632666236281472, "grad_norm": 0.8832170790109071, "learning_rate": 3.0845016700854827e-07, "loss": 0.6962, "step": 1797 }, { "epoch": 4.635248547449968, "grad_norm": 0.8737294820452377, "learning_rate": 3.0401877970476e-07, "loss": 0.697, "step": 1798 }, { "epoch": 4.637830858618464, "grad_norm": 0.8664266455857452, "learning_rate": 2.996189638017233e-07, "loss": 0.7204, "step": 1799 }, { "epoch": 4.640413169786959, "grad_norm": 0.8875751806701481, "learning_rate": 2.9525073362581924e-07, "loss": 0.7349, "step": 1800 }, { "epoch": 4.642995480955455, "grad_norm": 0.8976875904169599, "learning_rate": 2.909141034005891e-07, "loss": 0.7118, "step": 1801 }, { "epoch": 4.645577792123951, "grad_norm": 0.875180687352795, "learning_rate": 2.86609087246672e-07, "loss": 0.7072, "step": 1802 }, { "epoch": 4.648160103292446, "grad_norm": 0.8837922794467638, "learning_rate": 2.8233569918177384e-07, "loss": 0.6872, "step": 1803 }, { "epoch": 4.650742414460942, "grad_norm": 0.8773613611606011, "learning_rate": 2.780939531206106e-07, "loss": 0.6997, "step": 1804 }, { "epoch": 4.653324725629439, "grad_norm": 0.9021810787669173, "learning_rate": 2.73883862874873e-07, "loss": 0.7059, "step": 1805 }, { "epoch": 4.6559070367979345, "grad_norm": 0.8823700361393249, "learning_rate": 2.6970544215317197e-07, "loss": 0.7047, "step": 1806 }, { "epoch": 4.65848934796643, "grad_norm": 0.8783238825683962, "learning_rate": 2.655587045609975e-07, "loss": 0.7249, "step": 1807 }, { "epoch": 4.661071659134926, "grad_norm": 0.8827143960172458, "learning_rate": 2.6144366360067896e-07, "loss": 0.7159, "step": 1808 }, { "epoch": 4.663653970303422, "grad_norm": 0.8851244634156046, "learning_rate": 2.57360332671337e-07, "loss": 0.7181, "step": 1809 }, { "epoch": 4.666236281471917, "grad_norm": 0.91360975849369, "learning_rate": 2.5330872506883595e-07, "loss": 0.7426, "step": 1810 }, { "epoch": 4.668818592640413, "grad_norm": 0.9075412171714544, "learning_rate": 2.492888539857485e-07, "loss": 0.6909, "step": 1811 }, { "epoch": 4.671400903808909, "grad_norm": 0.888744832476072, "learning_rate": 2.453007325113077e-07, "loss": 0.7102, "step": 1812 }, { "epoch": 4.6739832149774045, "grad_norm": 0.890507374481685, "learning_rate": 2.41344373631367e-07, "loss": 0.727, "step": 1813 }, { "epoch": 4.6765655261459, "grad_norm": 0.8866830934511432, "learning_rate": 2.374197902283548e-07, "loss": 0.7308, "step": 1814 }, { "epoch": 4.679147837314396, "grad_norm": 0.8771540551708261, "learning_rate": 2.3352699508123579e-07, "loss": 0.6949, "step": 1815 }, { "epoch": 4.681730148482892, "grad_norm": 1.0503180466316975, "learning_rate": 2.296660008654661e-07, "loss": 0.7213, "step": 1816 }, { "epoch": 4.684312459651388, "grad_norm": 0.8959301940497476, "learning_rate": 2.2583682015295593e-07, "loss": 0.7101, "step": 1817 }, { "epoch": 4.686894770819884, "grad_norm": 0.8581736544054299, "learning_rate": 2.2203946541202392e-07, "loss": 0.6836, "step": 1818 }, { "epoch": 4.68947708198838, "grad_norm": 0.8933010730764249, "learning_rate": 2.1827394900736377e-07, "loss": 0.7032, "step": 1819 }, { "epoch": 4.6920593931568755, "grad_norm": 0.8824184636051327, "learning_rate": 2.145402831999943e-07, "loss": 0.6866, "step": 1820 }, { "epoch": 4.694641704325371, "grad_norm": 0.8962203196556846, "learning_rate": 2.108384801472263e-07, "loss": 0.6891, "step": 1821 }, { "epoch": 4.697224015493867, "grad_norm": 0.8825956696146154, "learning_rate": 2.0716855190262118e-07, "loss": 0.7159, "step": 1822 }, { "epoch": 4.699806326662363, "grad_norm": 0.8978568032331434, "learning_rate": 2.035305104159546e-07, "loss": 0.6948, "step": 1823 }, { "epoch": 4.702388637830858, "grad_norm": 0.9066750181863855, "learning_rate": 1.9992436753316967e-07, "loss": 0.7321, "step": 1824 }, { "epoch": 4.704970948999354, "grad_norm": 0.8862969137730125, "learning_rate": 1.963501349963448e-07, "loss": 0.7224, "step": 1825 }, { "epoch": 4.707553260167851, "grad_norm": 0.8963573378284775, "learning_rate": 1.928078244436582e-07, "loss": 0.7078, "step": 1826 }, { "epoch": 4.710135571336346, "grad_norm": 0.8951212547195972, "learning_rate": 1.892974474093412e-07, "loss": 0.7324, "step": 1827 }, { "epoch": 4.712717882504842, "grad_norm": 0.8796409018946402, "learning_rate": 1.8581901532364722e-07, "loss": 0.6997, "step": 1828 }, { "epoch": 4.715300193673338, "grad_norm": 0.9084865841892311, "learning_rate": 1.8237253951281287e-07, "loss": 0.7176, "step": 1829 }, { "epoch": 4.717882504841834, "grad_norm": 0.8939350444456864, "learning_rate": 1.789580311990191e-07, "loss": 0.7273, "step": 1830 }, { "epoch": 4.720464816010329, "grad_norm": 0.8897734672415167, "learning_rate": 1.7557550150035906e-07, "loss": 0.7311, "step": 1831 }, { "epoch": 4.723047127178825, "grad_norm": 0.9004838760057952, "learning_rate": 1.7222496143079803e-07, "loss": 0.735, "step": 1832 }, { "epoch": 4.725629438347321, "grad_norm": 0.8899859768430314, "learning_rate": 1.6890642190013906e-07, "loss": 0.7231, "step": 1833 }, { "epoch": 4.7282117495158165, "grad_norm": 0.8945971398194342, "learning_rate": 1.6561989371398523e-07, "loss": 0.7242, "step": 1834 }, { "epoch": 4.730794060684312, "grad_norm": 0.9022295113729779, "learning_rate": 1.6236538757370967e-07, "loss": 0.7124, "step": 1835 }, { "epoch": 4.733376371852808, "grad_norm": 0.9072500620579464, "learning_rate": 1.5914291407641668e-07, "loss": 0.7252, "step": 1836 }, { "epoch": 4.735958683021304, "grad_norm": 0.9078114931306495, "learning_rate": 1.5595248371490512e-07, "loss": 0.7252, "step": 1837 }, { "epoch": 4.7385409941898, "grad_norm": 0.8961898314266026, "learning_rate": 1.5279410687764173e-07, "loss": 0.7436, "step": 1838 }, { "epoch": 4.741123305358296, "grad_norm": 0.8983212101762444, "learning_rate": 1.4966779384871789e-07, "loss": 0.7123, "step": 1839 }, { "epoch": 4.743705616526792, "grad_norm": 0.8867079379295172, "learning_rate": 1.465735548078262e-07, "loss": 0.7091, "step": 1840 }, { "epoch": 4.746287927695287, "grad_norm": 0.9004558245837394, "learning_rate": 1.4351139983021623e-07, "loss": 0.7001, "step": 1841 }, { "epoch": 4.748870238863783, "grad_norm": 0.9069705760502946, "learning_rate": 1.4048133888667436e-07, "loss": 0.7132, "step": 1842 }, { "epoch": 4.751452550032279, "grad_norm": 0.8977843363832826, "learning_rate": 1.3748338184347842e-07, "loss": 0.7348, "step": 1843 }, { "epoch": 4.754034861200775, "grad_norm": 0.9077365721137113, "learning_rate": 1.3451753846237314e-07, "loss": 0.7221, "step": 1844 }, { "epoch": 4.75661717236927, "grad_norm": 0.8989075753789744, "learning_rate": 1.3158381840054025e-07, "loss": 0.7193, "step": 1845 }, { "epoch": 4.759199483537766, "grad_norm": 0.9186459910068699, "learning_rate": 1.2868223121056178e-07, "loss": 0.722, "step": 1846 }, { "epoch": 4.761781794706262, "grad_norm": 0.9020493854829156, "learning_rate": 1.2581278634038795e-07, "loss": 0.7148, "step": 1847 }, { "epoch": 4.764364105874758, "grad_norm": 0.8768801704765319, "learning_rate": 1.229754931333127e-07, "loss": 0.7035, "step": 1848 }, { "epoch": 4.766946417043254, "grad_norm": 0.8844151561998117, "learning_rate": 1.2017036082793922e-07, "loss": 0.7184, "step": 1849 }, { "epoch": 4.76952872821175, "grad_norm": 0.8929803701346444, "learning_rate": 1.1739739855815224e-07, "loss": 0.7302, "step": 1850 }, { "epoch": 4.7721110393802455, "grad_norm": 0.883991183642243, "learning_rate": 1.1465661535308147e-07, "loss": 0.7293, "step": 1851 }, { "epoch": 4.774693350548741, "grad_norm": 0.8949880710889703, "learning_rate": 1.1194802013708151e-07, "loss": 0.723, "step": 1852 }, { "epoch": 4.777275661717237, "grad_norm": 0.8764502668642703, "learning_rate": 1.0927162172969852e-07, "loss": 0.6951, "step": 1853 }, { "epoch": 4.779857972885733, "grad_norm": 0.8800003007012933, "learning_rate": 1.0662742884563926e-07, "loss": 0.7233, "step": 1854 }, { "epoch": 4.782440284054228, "grad_norm": 0.9056650063921504, "learning_rate": 1.0401545009474768e-07, "loss": 0.7303, "step": 1855 }, { "epoch": 4.785022595222724, "grad_norm": 0.8936292984984834, "learning_rate": 1.0143569398197384e-07, "loss": 0.7165, "step": 1856 }, { "epoch": 4.78760490639122, "grad_norm": 0.8930843203036322, "learning_rate": 9.888816890734399e-08, "loss": 0.6987, "step": 1857 }, { "epoch": 4.7901872175597155, "grad_norm": 0.8914663323073332, "learning_rate": 9.637288316593718e-08, "loss": 0.7257, "step": 1858 }, { "epoch": 4.792769528728211, "grad_norm": 0.8776073044944982, "learning_rate": 9.388984494785869e-08, "loss": 0.692, "step": 1859 }, { "epoch": 4.795351839896708, "grad_norm": 0.8779266479863871, "learning_rate": 9.14390623382111e-08, "loss": 0.7133, "step": 1860 }, { "epoch": 4.797934151065204, "grad_norm": 0.9264245878689963, "learning_rate": 8.902054331706545e-08, "loss": 0.7283, "step": 1861 }, { "epoch": 4.800516462233699, "grad_norm": 0.8900359447271092, "learning_rate": 8.663429575944126e-08, "loss": 0.6855, "step": 1862 }, { "epoch": 4.803098773402195, "grad_norm": 0.8817750073930983, "learning_rate": 8.42803274352777e-08, "loss": 0.6943, "step": 1863 }, { "epoch": 4.805681084570691, "grad_norm": 0.8903555513223317, "learning_rate": 8.195864600940684e-08, "loss": 0.6981, "step": 1864 }, { "epoch": 4.8082633957391865, "grad_norm": 0.9144780971305131, "learning_rate": 7.966925904153156e-08, "loss": 0.7352, "step": 1865 }, { "epoch": 4.810845706907682, "grad_norm": 0.9009644315118552, "learning_rate": 7.741217398619993e-08, "loss": 0.6861, "step": 1866 }, { "epoch": 4.813428018076178, "grad_norm": 0.9355145906326285, "learning_rate": 7.518739819278087e-08, "loss": 0.7482, "step": 1867 }, { "epoch": 4.816010329244674, "grad_norm": 0.9073134543589949, "learning_rate": 7.29949389054374e-08, "loss": 0.7514, "step": 1868 }, { "epoch": 4.81859264041317, "grad_norm": 0.9117075631720867, "learning_rate": 7.08348032631101e-08, "loss": 0.7251, "step": 1869 }, { "epoch": 4.821174951581666, "grad_norm": 0.9074195781202343, "learning_rate": 6.870699829948479e-08, "loss": 0.7186, "step": 1870 }, { "epoch": 4.823757262750162, "grad_norm": 0.8938435858944255, "learning_rate": 6.661153094297823e-08, "loss": 0.7074, "step": 1871 }, { "epoch": 4.826339573918657, "grad_norm": 0.9003531915044409, "learning_rate": 6.454840801670803e-08, "loss": 0.7319, "step": 1872 }, { "epoch": 4.828921885087153, "grad_norm": 0.8820415069306509, "learning_rate": 6.25176362384794e-08, "loss": 0.712, "step": 1873 }, { "epoch": 4.831504196255649, "grad_norm": 0.8995431946768817, "learning_rate": 6.051922222075179e-08, "loss": 0.7358, "step": 1874 }, { "epoch": 4.834086507424145, "grad_norm": 0.9146524578848008, "learning_rate": 5.855317247062786e-08, "loss": 0.708, "step": 1875 }, { "epoch": 4.83666881859264, "grad_norm": 0.882452405450629, "learning_rate": 5.6619493389824534e-08, "loss": 0.6995, "step": 1876 }, { "epoch": 4.839251129761136, "grad_norm": 0.903967291714597, "learning_rate": 5.4718191274659716e-08, "loss": 0.7299, "step": 1877 }, { "epoch": 4.841833440929632, "grad_norm": 0.8789763679697778, "learning_rate": 5.284927231602344e-08, "loss": 0.6955, "step": 1878 }, { "epoch": 4.8444157520981275, "grad_norm": 0.8987948069283417, "learning_rate": 5.101274259936451e-08, "loss": 0.7204, "step": 1879 }, { "epoch": 4.846998063266623, "grad_norm": 0.9053366952624305, "learning_rate": 4.92086081046661e-08, "loss": 0.7246, "step": 1880 }, { "epoch": 4.84958037443512, "grad_norm": 0.8949590025267126, "learning_rate": 4.7436874706431324e-08, "loss": 0.7101, "step": 1881 }, { "epoch": 4.8521626856036155, "grad_norm": 0.9214195129359755, "learning_rate": 4.569754817365657e-08, "loss": 0.7246, "step": 1882 }, { "epoch": 4.854744996772111, "grad_norm": 0.9089598728157908, "learning_rate": 4.399063416982263e-08, "loss": 0.7218, "step": 1883 }, { "epoch": 4.857327307940607, "grad_norm": 0.901720805129725, "learning_rate": 4.2316138252866954e-08, "loss": 0.6872, "step": 1884 }, { "epoch": 4.859909619109103, "grad_norm": 0.9104687149335616, "learning_rate": 4.067406587516809e-08, "loss": 0.7451, "step": 1885 }, { "epoch": 4.862491930277598, "grad_norm": 0.9041958158253693, "learning_rate": 3.9064422383534587e-08, "loss": 0.7049, "step": 1886 }, { "epoch": 4.865074241446094, "grad_norm": 0.8939398515698603, "learning_rate": 3.748721301917724e-08, "loss": 0.7071, "step": 1887 }, { "epoch": 4.86765655261459, "grad_norm": 0.9161924402984971, "learning_rate": 3.5942442917699107e-08, "loss": 0.7442, "step": 1888 }, { "epoch": 4.870238863783086, "grad_norm": 0.8963507291873432, "learning_rate": 3.443011710907662e-08, "loss": 0.726, "step": 1889 }, { "epoch": 4.872821174951581, "grad_norm": 0.8879121159936676, "learning_rate": 3.295024051764406e-08, "loss": 0.6938, "step": 1890 }, { "epoch": 4.875403486120078, "grad_norm": 0.875832944883481, "learning_rate": 3.150281796207466e-08, "loss": 0.6997, "step": 1891 }, { "epoch": 4.877985797288574, "grad_norm": 0.8977974322344513, "learning_rate": 3.008785415536841e-08, "loss": 0.7159, "step": 1892 }, { "epoch": 4.880568108457069, "grad_norm": 0.8748881414978601, "learning_rate": 2.8705353704836515e-08, "loss": 0.7027, "step": 1893 }, { "epoch": 4.883150419625565, "grad_norm": 0.88445635221765, "learning_rate": 2.73553211120825e-08, "loss": 0.7245, "step": 1894 }, { "epoch": 4.885732730794061, "grad_norm": 0.9134713261429174, "learning_rate": 2.6037760772991138e-08, "loss": 0.7239, "step": 1895 }, { "epoch": 4.8883150419625565, "grad_norm": 0.8985059705700046, "learning_rate": 2.4752676977713997e-08, "loss": 0.7245, "step": 1896 }, { "epoch": 4.890897353131052, "grad_norm": 0.8885305233658356, "learning_rate": 2.3500073910655007e-08, "loss": 0.74, "step": 1897 }, { "epoch": 4.893479664299548, "grad_norm": 0.8823177052515677, "learning_rate": 2.2279955650456043e-08, "loss": 0.6953, "step": 1898 }, { "epoch": 4.896061975468044, "grad_norm": 0.9143103720474103, "learning_rate": 2.109232616998247e-08, "loss": 0.7159, "step": 1899 }, { "epoch": 4.898644286636539, "grad_norm": 0.9122168970845141, "learning_rate": 1.993718933631428e-08, "loss": 0.7356, "step": 1900 }, { "epoch": 4.901226597805035, "grad_norm": 0.8855771980386666, "learning_rate": 1.8814548910730535e-08, "loss": 0.7129, "step": 1901 }, { "epoch": 4.903808908973531, "grad_norm": 0.9057306349994061, "learning_rate": 1.7724408548697168e-08, "loss": 0.718, "step": 1902 }, { "epoch": 4.906391220142027, "grad_norm": 0.9014437329192154, "learning_rate": 1.6666771799855875e-08, "loss": 0.7178, "step": 1903 }, { "epoch": 4.908973531310523, "grad_norm": 0.8787325490430078, "learning_rate": 1.5641642108011888e-08, "loss": 0.698, "step": 1904 }, { "epoch": 4.911555842479019, "grad_norm": 0.9212584947084932, "learning_rate": 1.4649022811122904e-08, "loss": 0.7462, "step": 1905 }, { "epoch": 4.914138153647515, "grad_norm": 0.8828205242032983, "learning_rate": 1.368891714129017e-08, "loss": 0.6942, "step": 1906 }, { "epoch": 4.91672046481601, "grad_norm": 0.9012685033665765, "learning_rate": 1.2761328224744074e-08, "loss": 0.711, "step": 1907 }, { "epoch": 4.919302775984506, "grad_norm": 0.8922139187625531, "learning_rate": 1.1866259081837473e-08, "loss": 0.7111, "step": 1908 }, { "epoch": 4.921885087153002, "grad_norm": 0.9214550856281982, "learning_rate": 1.100371262703459e-08, "loss": 0.7151, "step": 1909 }, { "epoch": 4.9244673983214975, "grad_norm": 0.8940406772236208, "learning_rate": 1.0173691668901031e-08, "loss": 0.7364, "step": 1910 }, { "epoch": 4.927049709489993, "grad_norm": 0.9145036949820405, "learning_rate": 9.376198910094892e-09, "loss": 0.7231, "step": 1911 }, { "epoch": 4.92963202065849, "grad_norm": 0.8932180687439902, "learning_rate": 8.611236947357881e-09, "loss": 0.7347, "step": 1912 }, { "epoch": 4.9322143318269855, "grad_norm": 0.9074743226465335, "learning_rate": 7.878808271507554e-09, "loss": 0.7205, "step": 1913 }, { "epoch": 4.934796642995481, "grad_norm": 0.9340148563069326, "learning_rate": 7.178915267429531e-09, "loss": 0.7412, "step": 1914 }, { "epoch": 4.937378954163977, "grad_norm": 0.9029720647465063, "learning_rate": 6.5115602140686244e-09, "loss": 0.706, "step": 1915 }, { "epoch": 4.939961265332473, "grad_norm": 0.9024330652166745, "learning_rate": 5.876745284421059e-09, "loss": 0.7179, "step": 1916 }, { "epoch": 4.942543576500968, "grad_norm": 0.905536650405652, "learning_rate": 5.27447254552782e-09, "loss": 0.7265, "step": 1917 }, { "epoch": 4.945125887669464, "grad_norm": 0.9411331572422978, "learning_rate": 4.704743958467984e-09, "loss": 0.7411, "step": 1918 }, { "epoch": 4.94770819883796, "grad_norm": 0.8844024471969056, "learning_rate": 4.1675613783565e-09, "loss": 0.7076, "step": 1919 }, { "epoch": 4.950290510006456, "grad_norm": 0.8830101926273661, "learning_rate": 3.6629265543275393e-09, "loss": 0.7213, "step": 1920 }, { "epoch": 4.952872821174951, "grad_norm": 0.9108539534557583, "learning_rate": 3.190841129542266e-09, "loss": 0.7139, "step": 1921 }, { "epoch": 4.955455132343447, "grad_norm": 0.9188851667256828, "learning_rate": 2.7513066411699597e-09, "loss": 0.7048, "step": 1922 }, { "epoch": 4.958037443511943, "grad_norm": 0.9011821980131381, "learning_rate": 2.344324520396901e-09, "loss": 0.7166, "step": 1923 }, { "epoch": 4.960619754680439, "grad_norm": 0.9260828637057613, "learning_rate": 1.9698960924074973e-09, "loss": 0.7124, "step": 1924 }, { "epoch": 4.963202065848935, "grad_norm": 0.9176208831919505, "learning_rate": 1.6280225763931623e-09, "loss": 0.7192, "step": 1925 }, { "epoch": 4.965784377017431, "grad_norm": 0.8999670316918704, "learning_rate": 1.3187050855367755e-09, "loss": 0.7011, "step": 1926 }, { "epoch": 4.9683666881859265, "grad_norm": 0.9002005545182482, "learning_rate": 1.0419446270193423e-09, "loss": 0.7271, "step": 1927 }, { "epoch": 4.970948999354422, "grad_norm": 0.8792229281282367, "learning_rate": 7.977421020088916e-10, "loss": 0.6954, "step": 1928 }, { "epoch": 4.973531310522918, "grad_norm": 0.9030675678611226, "learning_rate": 5.860983056604763e-10, "loss": 0.717, "step": 1929 }, { "epoch": 4.976113621691414, "grad_norm": 0.8822396526894568, "learning_rate": 4.0701392711506307e-10, "loss": 0.6956, "step": 1930 }, { "epoch": 4.978695932859909, "grad_norm": 0.8758112231639331, "learning_rate": 2.60489549495091e-10, "loss": 0.6989, "step": 1931 }, { "epoch": 4.981278244028405, "grad_norm": 0.9008898902977136, "learning_rate": 1.4652564990336183e-10, "loss": 0.7116, "step": 1932 }, { "epoch": 4.983860555196901, "grad_norm": 0.8627458403348608, "learning_rate": 6.512259942192955e-11, "loss": 0.7134, "step": 1933 }, { "epoch": 4.9864428663653975, "grad_norm": 0.8843096004745279, "learning_rate": 1.6280663108769745e-11, "loss": 0.6674, "step": 1934 }, { "epoch": 4.989025177533893, "grad_norm": 0.8707183390390197, "learning_rate": 0.0, "loss": 0.6915, "step": 1935 }, { "epoch": 4.989025177533893, "step": 1935, "total_flos": 3.212141825011745e+18, "train_loss": 0.9760797875796178, "train_runtime": 14099.2726, "train_samples_per_second": 17.57, "train_steps_per_second": 0.137 } ], "logging_steps": 1, "max_steps": 1935, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.212141825011745e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }