{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999307718933887, "eval_steps": 145, "global_step": 1444, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "batch_num_effect_tokens": 2112, "batch_num_samples": 29, "batch_num_tokens": 16369, "epoch": 0.00138, "grad_norm": 2.2764828205108643, "learning_rate": 6.896551724137931e-08, "loss": 3.5781, "step": 1 }, { "batch_num_effect_tokens": 2693, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.00277, "grad_norm": 1.659691333770752, "learning_rate": 1.3793103448275863e-07, "loss": 3.6025, "step": 2 }, { "batch_num_effect_tokens": 2476, "batch_num_samples": 45, "batch_num_tokens": 16369, "epoch": 0.00415, "grad_norm": 2.881417989730835, "learning_rate": 2.0689655172413796e-07, "loss": 3.6162, "step": 3 }, { "batch_num_effect_tokens": 2781, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.00554, "grad_norm": 1.7475919723510742, "learning_rate": 2.7586206896551726e-07, "loss": 3.3613, "step": 4 }, { "batch_num_effect_tokens": 2415, "batch_num_samples": 29, "batch_num_tokens": 16274, "epoch": 0.00692, "grad_norm": 2.4939348697662354, "learning_rate": 3.4482758620689656e-07, "loss": 3.4355, "step": 5 }, { "batch_num_effect_tokens": 1823, "batch_num_samples": 29, "batch_num_tokens": 16383, "epoch": 0.00831, "grad_norm": 3.1579501628875732, "learning_rate": 4.137931034482759e-07, "loss": 3.8535, "step": 6 }, { "batch_num_effect_tokens": 2839, "batch_num_samples": 29, "batch_num_tokens": 16330, "epoch": 0.00969, "grad_norm": 2.3355987071990967, "learning_rate": 4.827586206896552e-07, "loss": 3.6318, "step": 7 }, { "batch_num_effect_tokens": 2847, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.01108, "grad_norm": 1.8724162578582764, "learning_rate": 5.517241379310345e-07, "loss": 3.3047, "step": 8 }, { "batch_num_effect_tokens": 2350, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.01246, "grad_norm": 2.7438619136810303, "learning_rate": 6.206896551724139e-07, "loss": 3.7861, "step": 9 }, { "batch_num_effect_tokens": 3533, "batch_num_samples": 42, "batch_num_tokens": 16381, "epoch": 0.01385, "grad_norm": 1.631403923034668, "learning_rate": 6.896551724137931e-07, "loss": 3.7012, "step": 10 }, { "batch_num_effect_tokens": 3955, "batch_num_samples": 46, "batch_num_tokens": 16362, "epoch": 0.01523, "grad_norm": 1.8381500244140625, "learning_rate": 7.586206896551725e-07, "loss": 3.3999, "step": 11 }, { "batch_num_effect_tokens": 2157, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.01661, "grad_norm": 2.100721597671509, "learning_rate": 8.275862068965518e-07, "loss": 3.7012, "step": 12 }, { "batch_num_effect_tokens": 3580, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.018, "grad_norm": 1.237741470336914, "learning_rate": 8.965517241379311e-07, "loss": 3.1348, "step": 13 }, { "batch_num_effect_tokens": 3021, "batch_num_samples": 45, "batch_num_tokens": 16384, "epoch": 0.01938, "grad_norm": 1.5811222791671753, "learning_rate": 9.655172413793103e-07, "loss": 3.2832, "step": 14 }, { "batch_num_effect_tokens": 3268, "batch_num_samples": 44, "batch_num_tokens": 16328, "epoch": 0.02077, "grad_norm": 1.3960686922073364, "learning_rate": 1.0344827586206898e-06, "loss": 3.3086, "step": 15 }, { "batch_num_effect_tokens": 2236, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.02215, "grad_norm": 1.4479105472564697, "learning_rate": 1.103448275862069e-06, "loss": 3.0532, "step": 16 }, { "batch_num_effect_tokens": 2201, "batch_num_samples": 34, "batch_num_tokens": 16260, "epoch": 0.02354, "grad_norm": 1.622763991355896, "learning_rate": 1.1724137931034483e-06, "loss": 3.0444, "step": 17 }, { "batch_num_effect_tokens": 2803, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.02492, "grad_norm": 1.3162716627120972, "learning_rate": 1.2413793103448277e-06, "loss": 3.0332, "step": 18 }, { "batch_num_effect_tokens": 2688, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.02631, "grad_norm": 1.3780149221420288, "learning_rate": 1.3103448275862072e-06, "loss": 2.749, "step": 19 }, { "batch_num_effect_tokens": 2660, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.02769, "grad_norm": 1.4238007068634033, "learning_rate": 1.3793103448275862e-06, "loss": 2.6338, "step": 20 }, { "batch_num_effect_tokens": 2235, "batch_num_samples": 35, "batch_num_tokens": 16278, "epoch": 0.02908, "grad_norm": 1.9952951669692993, "learning_rate": 1.4482758620689657e-06, "loss": 2.543, "step": 21 }, { "batch_num_effect_tokens": 2505, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.03046, "grad_norm": 1.8021754026412964, "learning_rate": 1.517241379310345e-06, "loss": 2.4307, "step": 22 }, { "batch_num_effect_tokens": 2511, "batch_num_samples": 39, "batch_num_tokens": 16322, "epoch": 0.03184, "grad_norm": 0.6236157417297363, "learning_rate": 1.5862068965517244e-06, "loss": 2.29, "step": 23 }, { "batch_num_effect_tokens": 2624, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.03323, "grad_norm": 0.6076152324676514, "learning_rate": 1.6551724137931037e-06, "loss": 2.3667, "step": 24 }, { "batch_num_effect_tokens": 2247, "batch_num_samples": 30, "batch_num_tokens": 16383, "epoch": 0.03461, "grad_norm": 0.6154396533966064, "learning_rate": 1.724137931034483e-06, "loss": 2.1719, "step": 25 }, { "batch_num_effect_tokens": 4037, "batch_num_samples": 47, "batch_num_tokens": 16384, "epoch": 0.036, "grad_norm": 0.5821129679679871, "learning_rate": 1.7931034482758622e-06, "loss": 2.4102, "step": 26 }, { "batch_num_effect_tokens": 2821, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.03738, "grad_norm": 0.5388163328170776, "learning_rate": 1.8620689655172416e-06, "loss": 2.2852, "step": 27 }, { "batch_num_effect_tokens": 2552, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.03877, "grad_norm": 0.5951407551765442, "learning_rate": 1.9310344827586207e-06, "loss": 2.2178, "step": 28 }, { "batch_num_effect_tokens": 2666, "batch_num_samples": 36, "batch_num_tokens": 16382, "epoch": 0.04015, "grad_norm": 0.5594526529312134, "learning_rate": 2.0000000000000003e-06, "loss": 2.1475, "step": 29 }, { "batch_num_effect_tokens": 2395, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.04154, "grad_norm": 0.5361707210540771, "learning_rate": 2.0689655172413796e-06, "loss": 2.2158, "step": 30 }, { "batch_num_effect_tokens": 2996, "batch_num_samples": 54, "batch_num_tokens": 16328, "epoch": 0.04292, "grad_norm": 0.5809456706047058, "learning_rate": 2.137931034482759e-06, "loss": 2.3921, "step": 31 }, { "batch_num_effect_tokens": 2077, "batch_num_samples": 30, "batch_num_tokens": 16289, "epoch": 0.04431, "grad_norm": 0.7051149606704712, "learning_rate": 2.206896551724138e-06, "loss": 2.5264, "step": 32 }, { "batch_num_effect_tokens": 1880, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.04569, "grad_norm": 0.6108770370483398, "learning_rate": 2.2758620689655173e-06, "loss": 2.2031, "step": 33 }, { "batch_num_effect_tokens": 2444, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.04708, "grad_norm": 0.5668604373931885, "learning_rate": 2.3448275862068966e-06, "loss": 2.1904, "step": 34 }, { "batch_num_effect_tokens": 2186, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.04846, "grad_norm": 0.5768396854400635, "learning_rate": 2.4137931034482762e-06, "loss": 1.998, "step": 35 }, { "batch_num_effect_tokens": 3436, "batch_num_samples": 37, "batch_num_tokens": 16383, "epoch": 0.04984, "grad_norm": 0.4890941083431244, "learning_rate": 2.4827586206896555e-06, "loss": 2.1143, "step": 36 }, { "batch_num_effect_tokens": 3014, "batch_num_samples": 57, "batch_num_tokens": 16334, "epoch": 0.05123, "grad_norm": 0.5648297667503357, "learning_rate": 2.5517241379310347e-06, "loss": 2.1157, "step": 37 }, { "batch_num_effect_tokens": 2420, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.05261, "grad_norm": 0.6032694578170776, "learning_rate": 2.6206896551724144e-06, "loss": 2.2603, "step": 38 }, { "batch_num_effect_tokens": 3079, "batch_num_samples": 29, "batch_num_tokens": 16350, "epoch": 0.054, "grad_norm": 0.5154021382331848, "learning_rate": 2.6896551724137932e-06, "loss": 2.1768, "step": 39 }, { "batch_num_effect_tokens": 3438, "batch_num_samples": 39, "batch_num_tokens": 16272, "epoch": 0.05538, "grad_norm": 0.5439137816429138, "learning_rate": 2.7586206896551725e-06, "loss": 2.373, "step": 40 }, { "batch_num_effect_tokens": 2670, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 0.05677, "grad_norm": 0.5452929139137268, "learning_rate": 2.827586206896552e-06, "loss": 2.9399, "step": 41 }, { "batch_num_effect_tokens": 2389, "batch_num_samples": 36, "batch_num_tokens": 16302, "epoch": 0.05815, "grad_norm": 0.5909498333930969, "learning_rate": 2.8965517241379314e-06, "loss": 2.1294, "step": 42 }, { "batch_num_effect_tokens": 3031, "batch_num_samples": 49, "batch_num_tokens": 16384, "epoch": 0.05954, "grad_norm": 0.47602686285972595, "learning_rate": 2.9655172413793102e-06, "loss": 2.0762, "step": 43 }, { "batch_num_effect_tokens": 2760, "batch_num_samples": 29, "batch_num_tokens": 16373, "epoch": 0.06092, "grad_norm": 0.5776339769363403, "learning_rate": 3.03448275862069e-06, "loss": 1.9316, "step": 44 }, { "batch_num_effect_tokens": 2269, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 0.06231, "grad_norm": 0.5549036860466003, "learning_rate": 3.103448275862069e-06, "loss": 1.9834, "step": 45 }, { "batch_num_effect_tokens": 2383, "batch_num_samples": 34, "batch_num_tokens": 16312, "epoch": 0.06369, "grad_norm": 0.5422660708427429, "learning_rate": 3.172413793103449e-06, "loss": 2.0283, "step": 46 }, { "batch_num_effect_tokens": 2693, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.06507, "grad_norm": 0.5310187935829163, "learning_rate": 3.2413793103448277e-06, "loss": 1.9814, "step": 47 }, { "batch_num_effect_tokens": 2759, "batch_num_samples": 35, "batch_num_tokens": 16356, "epoch": 0.06646, "grad_norm": 0.5831342935562134, "learning_rate": 3.3103448275862073e-06, "loss": 2.0859, "step": 48 }, { "batch_num_effect_tokens": 3657, "batch_num_samples": 48, "batch_num_tokens": 16384, "epoch": 0.06784, "grad_norm": 0.4723730683326721, "learning_rate": 3.3793103448275866e-06, "loss": 2.0947, "step": 49 }, { "batch_num_effect_tokens": 2161, "batch_num_samples": 31, "batch_num_tokens": 16343, "epoch": 0.06923, "grad_norm": 0.5221647024154663, "learning_rate": 3.448275862068966e-06, "loss": 1.9746, "step": 50 }, { "batch_num_effect_tokens": 2694, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.07061, "grad_norm": 0.4860016405582428, "learning_rate": 3.517241379310345e-06, "loss": 2.2134, "step": 51 }, { "batch_num_effect_tokens": 2090, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 0.072, "grad_norm": 0.5617069005966187, "learning_rate": 3.5862068965517243e-06, "loss": 2.1592, "step": 52 }, { "batch_num_effect_tokens": 2591, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 0.07338, "grad_norm": 0.48716187477111816, "learning_rate": 3.655172413793104e-06, "loss": 2.0322, "step": 53 }, { "batch_num_effect_tokens": 3229, "batch_num_samples": 47, "batch_num_tokens": 16331, "epoch": 0.07477, "grad_norm": 0.5877251625061035, "learning_rate": 3.7241379310344832e-06, "loss": 1.9639, "step": 54 }, { "batch_num_effect_tokens": 2511, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.07615, "grad_norm": 0.5444878935813904, "learning_rate": 3.793103448275862e-06, "loss": 2.1323, "step": 55 }, { "batch_num_effect_tokens": 2296, "batch_num_samples": 29, "batch_num_tokens": 16328, "epoch": 0.07754, "grad_norm": 0.541236162185669, "learning_rate": 3.862068965517241e-06, "loss": 2.0327, "step": 56 }, { "batch_num_effect_tokens": 3243, "batch_num_samples": 61, "batch_num_tokens": 16286, "epoch": 0.07892, "grad_norm": 0.5379409193992615, "learning_rate": 3.931034482758621e-06, "loss": 1.8599, "step": 57 }, { "batch_num_effect_tokens": 1940, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.0803, "grad_norm": 0.5343999266624451, "learning_rate": 4.000000000000001e-06, "loss": 2.0293, "step": 58 }, { "batch_num_effect_tokens": 2548, "batch_num_samples": 28, "batch_num_tokens": 16272, "epoch": 0.08169, "grad_norm": 0.539332389831543, "learning_rate": 4.0689655172413795e-06, "loss": 2.1377, "step": 59 }, { "batch_num_effect_tokens": 2235, "batch_num_samples": 30, "batch_num_tokens": 16305, "epoch": 0.08307, "grad_norm": 0.5037583708763123, "learning_rate": 4.137931034482759e-06, "loss": 2.0605, "step": 60 }, { "batch_num_effect_tokens": 2183, "batch_num_samples": 28, "batch_num_tokens": 16277, "epoch": 0.08446, "grad_norm": 0.4962354004383087, "learning_rate": 4.206896551724138e-06, "loss": 1.8213, "step": 61 }, { "batch_num_effect_tokens": 2062, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.08584, "grad_norm": 0.5549421906471252, "learning_rate": 4.275862068965518e-06, "loss": 2.1123, "step": 62 }, { "batch_num_effect_tokens": 2346, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.08723, "grad_norm": 0.5471904277801514, "learning_rate": 4.3448275862068965e-06, "loss": 2.0796, "step": 63 }, { "batch_num_effect_tokens": 1774, "batch_num_samples": 30, "batch_num_tokens": 16328, "epoch": 0.08861, "grad_norm": 0.5746720433235168, "learning_rate": 4.413793103448276e-06, "loss": 1.7998, "step": 64 }, { "batch_num_effect_tokens": 2487, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 0.09, "grad_norm": 0.49535098671913147, "learning_rate": 4.482758620689656e-06, "loss": 1.7563, "step": 65 }, { "batch_num_effect_tokens": 2375, "batch_num_samples": 28, "batch_num_tokens": 16322, "epoch": 0.09138, "grad_norm": 0.7682079076766968, "learning_rate": 4.551724137931035e-06, "loss": 2.0352, "step": 66 }, { "batch_num_effect_tokens": 2122, "batch_num_samples": 35, "batch_num_tokens": 16330, "epoch": 0.09277, "grad_norm": 0.5090745687484741, "learning_rate": 4.620689655172414e-06, "loss": 1.8838, "step": 67 }, { "batch_num_effect_tokens": 2553, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.09415, "grad_norm": 0.5062191486358643, "learning_rate": 4.689655172413793e-06, "loss": 2.1523, "step": 68 }, { "batch_num_effect_tokens": 2351, "batch_num_samples": 29, "batch_num_tokens": 16354, "epoch": 0.09553, "grad_norm": 0.5094422101974487, "learning_rate": 4.758620689655173e-06, "loss": 2.2979, "step": 69 }, { "batch_num_effect_tokens": 2621, "batch_num_samples": 33, "batch_num_tokens": 16280, "epoch": 0.09692, "grad_norm": 0.820463240146637, "learning_rate": 4.8275862068965525e-06, "loss": 2.1719, "step": 70 }, { "batch_num_effect_tokens": 3209, "batch_num_samples": 30, "batch_num_tokens": 16286, "epoch": 0.0983, "grad_norm": 0.4678884446620941, "learning_rate": 4.896551724137931e-06, "loss": 2.1519, "step": 71 }, { "batch_num_effect_tokens": 2595, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.09969, "grad_norm": 0.5287250876426697, "learning_rate": 4.965517241379311e-06, "loss": 2.3496, "step": 72 }, { "batch_num_effect_tokens": 4000, "batch_num_samples": 54, "batch_num_tokens": 16383, "epoch": 0.10107, "grad_norm": 0.5644830465316772, "learning_rate": 5.03448275862069e-06, "loss": 2.0151, "step": 73 }, { "batch_num_effect_tokens": 2605, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.10246, "grad_norm": 0.4621262848377228, "learning_rate": 5.1034482758620695e-06, "loss": 2.126, "step": 74 }, { "batch_num_effect_tokens": 3048, "batch_num_samples": 29, "batch_num_tokens": 16381, "epoch": 0.10384, "grad_norm": 0.5360795855522156, "learning_rate": 5.172413793103449e-06, "loss": 1.9854, "step": 75 }, { "batch_num_effect_tokens": 2397, "batch_num_samples": 29, "batch_num_tokens": 16340, "epoch": 0.10523, "grad_norm": 0.4859134554862976, "learning_rate": 5.241379310344829e-06, "loss": 1.9004, "step": 76 }, { "batch_num_effect_tokens": 2455, "batch_num_samples": 30, "batch_num_tokens": 16371, "epoch": 0.10661, "grad_norm": 0.4554426074028015, "learning_rate": 5.310344827586207e-06, "loss": 1.9277, "step": 77 }, { "batch_num_effect_tokens": 2562, "batch_num_samples": 37, "batch_num_tokens": 16382, "epoch": 0.108, "grad_norm": 0.503370463848114, "learning_rate": 5.3793103448275865e-06, "loss": 2.2236, "step": 78 }, { "batch_num_effect_tokens": 2587, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.10938, "grad_norm": 0.4693808853626251, "learning_rate": 5.448275862068966e-06, "loss": 2.0664, "step": 79 }, { "batch_num_effect_tokens": 2381, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.11076, "grad_norm": 0.4939959943294525, "learning_rate": 5.517241379310345e-06, "loss": 1.9824, "step": 80 }, { "batch_num_effect_tokens": 2516, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.11215, "grad_norm": 0.47511735558509827, "learning_rate": 5.586206896551725e-06, "loss": 2.064, "step": 81 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 32, "batch_num_tokens": 16288, "epoch": 0.11353, "grad_norm": 0.5066648721694946, "learning_rate": 5.655172413793104e-06, "loss": 2.2344, "step": 82 }, { "batch_num_effect_tokens": 2914, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.11492, "grad_norm": 0.4935562014579773, "learning_rate": 5.724137931034483e-06, "loss": 1.8584, "step": 83 }, { "batch_num_effect_tokens": 2637, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.1163, "grad_norm": 0.5244860053062439, "learning_rate": 5.793103448275863e-06, "loss": 2.0566, "step": 84 }, { "batch_num_effect_tokens": 2318, "batch_num_samples": 30, "batch_num_tokens": 16352, "epoch": 0.11769, "grad_norm": 0.5116328001022339, "learning_rate": 5.862068965517242e-06, "loss": 2.0049, "step": 85 }, { "batch_num_effect_tokens": 2452, "batch_num_samples": 30, "batch_num_tokens": 16297, "epoch": 0.11907, "grad_norm": 0.4940698742866516, "learning_rate": 5.9310344827586205e-06, "loss": 1.9824, "step": 86 }, { "batch_num_effect_tokens": 2211, "batch_num_samples": 29, "batch_num_tokens": 16326, "epoch": 0.12046, "grad_norm": 0.5889806747436523, "learning_rate": 6e-06, "loss": 2.1484, "step": 87 }, { "batch_num_effect_tokens": 2377, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.12184, "grad_norm": 0.5352203845977783, "learning_rate": 6.06896551724138e-06, "loss": 1.8984, "step": 88 }, { "batch_num_effect_tokens": 2740, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.12323, "grad_norm": 0.5443808436393738, "learning_rate": 6.1379310344827595e-06, "loss": 1.9795, "step": 89 }, { "batch_num_effect_tokens": 2486, "batch_num_samples": 28, "batch_num_tokens": 16367, "epoch": 0.12461, "grad_norm": 0.5316216945648193, "learning_rate": 6.206896551724138e-06, "loss": 2.0847, "step": 90 }, { "batch_num_effect_tokens": 2406, "batch_num_samples": 48, "batch_num_tokens": 16384, "epoch": 0.126, "grad_norm": 0.5168768167495728, "learning_rate": 6.275862068965518e-06, "loss": 2.1099, "step": 91 }, { "batch_num_effect_tokens": 2438, "batch_num_samples": 40, "batch_num_tokens": 16341, "epoch": 0.12738, "grad_norm": 0.47964486479759216, "learning_rate": 6.344827586206898e-06, "loss": 2.083, "step": 92 }, { "batch_num_effect_tokens": 3157, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.12876, "grad_norm": 0.4454560875892639, "learning_rate": 6.413793103448276e-06, "loss": 1.9355, "step": 93 }, { "batch_num_effect_tokens": 2652, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.13015, "grad_norm": 0.4772700369358063, "learning_rate": 6.482758620689655e-06, "loss": 1.8301, "step": 94 }, { "batch_num_effect_tokens": 2388, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.13153, "grad_norm": 0.50420743227005, "learning_rate": 6.551724137931035e-06, "loss": 2.0781, "step": 95 }, { "batch_num_effect_tokens": 2339, "batch_num_samples": 29, "batch_num_tokens": 16310, "epoch": 0.13292, "grad_norm": 0.48640191555023193, "learning_rate": 6.620689655172415e-06, "loss": 2.2715, "step": 96 }, { "batch_num_effect_tokens": 2156, "batch_num_samples": 34, "batch_num_tokens": 16317, "epoch": 0.1343, "grad_norm": 0.4719928205013275, "learning_rate": 6.6896551724137935e-06, "loss": 1.8389, "step": 97 }, { "batch_num_effect_tokens": 2247, "batch_num_samples": 31, "batch_num_tokens": 16362, "epoch": 0.13569, "grad_norm": 0.4806089699268341, "learning_rate": 6.758620689655173e-06, "loss": 1.9922, "step": 98 }, { "batch_num_effect_tokens": 2961, "batch_num_samples": 47, "batch_num_tokens": 16304, "epoch": 0.13707, "grad_norm": 0.5458834767341614, "learning_rate": 6.827586206896553e-06, "loss": 2.1948, "step": 99 }, { "batch_num_effect_tokens": 2666, "batch_num_samples": 29, "batch_num_tokens": 16380, "epoch": 0.13846, "grad_norm": 0.4964226186275482, "learning_rate": 6.896551724137932e-06, "loss": 2.2334, "step": 100 }, { "batch_num_effect_tokens": 2685, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.13984, "grad_norm": 0.5490928292274475, "learning_rate": 6.9655172413793105e-06, "loss": 1.9707, "step": 101 }, { "batch_num_effect_tokens": 2872, "batch_num_samples": 41, "batch_num_tokens": 16341, "epoch": 0.14123, "grad_norm": 0.5313333868980408, "learning_rate": 7.03448275862069e-06, "loss": 2.085, "step": 102 }, { "batch_num_effect_tokens": 1957, "batch_num_samples": 28, "batch_num_tokens": 16381, "epoch": 0.14261, "grad_norm": 0.6000615358352661, "learning_rate": 7.103448275862069e-06, "loss": 2.3184, "step": 103 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.14399, "grad_norm": 0.5101090669631958, "learning_rate": 7.172413793103449e-06, "loss": 2.0137, "step": 104 }, { "batch_num_effect_tokens": 1948, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.14538, "grad_norm": 0.5882840752601624, "learning_rate": 7.241379310344828e-06, "loss": 2.0161, "step": 105 }, { "batch_num_effect_tokens": 2582, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.14676, "grad_norm": 0.5262752771377563, "learning_rate": 7.310344827586208e-06, "loss": 2.2783, "step": 106 }, { "batch_num_effect_tokens": 2419, "batch_num_samples": 31, "batch_num_tokens": 16343, "epoch": 0.14815, "grad_norm": 0.5051460862159729, "learning_rate": 7.379310344827587e-06, "loss": 2.0137, "step": 107 }, { "batch_num_effect_tokens": 2726, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.14953, "grad_norm": 0.5460896492004395, "learning_rate": 7.4482758620689665e-06, "loss": 1.8896, "step": 108 }, { "batch_num_effect_tokens": 2503, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.15092, "grad_norm": 0.6584717035293579, "learning_rate": 7.517241379310345e-06, "loss": 1.9023, "step": 109 }, { "batch_num_effect_tokens": 2661, "batch_num_samples": 47, "batch_num_tokens": 16382, "epoch": 0.1523, "grad_norm": 0.46665823459625244, "learning_rate": 7.586206896551724e-06, "loss": 2.0454, "step": 110 }, { "batch_num_effect_tokens": 2941, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.15369, "grad_norm": 0.7147200107574463, "learning_rate": 7.655172413793104e-06, "loss": 1.9697, "step": 111 }, { "batch_num_effect_tokens": 2216, "batch_num_samples": 28, "batch_num_tokens": 16367, "epoch": 0.15507, "grad_norm": 0.5700640082359314, "learning_rate": 7.724137931034483e-06, "loss": 2.0166, "step": 112 }, { "batch_num_effect_tokens": 3642, "batch_num_samples": 42, "batch_num_tokens": 16383, "epoch": 0.15646, "grad_norm": 0.6275281310081482, "learning_rate": 7.793103448275863e-06, "loss": 1.9883, "step": 113 }, { "batch_num_effect_tokens": 2534, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.15784, "grad_norm": 0.44454583525657654, "learning_rate": 7.862068965517242e-06, "loss": 2.127, "step": 114 }, { "batch_num_effect_tokens": 2462, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.15922, "grad_norm": 0.485707551240921, "learning_rate": 7.93103448275862e-06, "loss": 2.1816, "step": 115 }, { "batch_num_effect_tokens": 2229, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.16061, "grad_norm": 0.4775547683238983, "learning_rate": 8.000000000000001e-06, "loss": 2.0205, "step": 116 }, { "batch_num_effect_tokens": 2848, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.16199, "grad_norm": 0.5129414796829224, "learning_rate": 8.06896551724138e-06, "loss": 2.1094, "step": 117 }, { "batch_num_effect_tokens": 2125, "batch_num_samples": 30, "batch_num_tokens": 16305, "epoch": 0.16338, "grad_norm": 0.5633395314216614, "learning_rate": 8.137931034482759e-06, "loss": 2.1504, "step": 118 }, { "batch_num_effect_tokens": 2581, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.16476, "grad_norm": 0.48365625739097595, "learning_rate": 8.206896551724138e-06, "loss": 2.04, "step": 119 }, { "batch_num_effect_tokens": 2301, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.16615, "grad_norm": 0.4541108310222626, "learning_rate": 8.275862068965518e-06, "loss": 2.168, "step": 120 }, { "batch_num_effect_tokens": 2501, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.16753, "grad_norm": 0.4856899380683899, "learning_rate": 8.344827586206897e-06, "loss": 1.915, "step": 121 }, { "batch_num_effect_tokens": 2470, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.16892, "grad_norm": 0.5057125687599182, "learning_rate": 8.413793103448276e-06, "loss": 2.021, "step": 122 }, { "batch_num_effect_tokens": 3214, "batch_num_samples": 39, "batch_num_tokens": 16383, "epoch": 0.1703, "grad_norm": 0.46762770414352417, "learning_rate": 8.482758620689656e-06, "loss": 2.2461, "step": 123 }, { "batch_num_effect_tokens": 2106, "batch_num_samples": 28, "batch_num_tokens": 16304, "epoch": 0.17169, "grad_norm": 0.4945511519908905, "learning_rate": 8.551724137931035e-06, "loss": 3.4336, "step": 124 }, { "batch_num_effect_tokens": 2092, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.17307, "grad_norm": 0.5720517039299011, "learning_rate": 8.620689655172414e-06, "loss": 2.2051, "step": 125 }, { "batch_num_effect_tokens": 2448, "batch_num_samples": 32, "batch_num_tokens": 16288, "epoch": 0.17445, "grad_norm": 0.4404791593551636, "learning_rate": 8.689655172413793e-06, "loss": 2.063, "step": 126 }, { "batch_num_effect_tokens": 2796, "batch_num_samples": 37, "batch_num_tokens": 16368, "epoch": 0.17584, "grad_norm": 0.4544350206851959, "learning_rate": 8.758620689655173e-06, "loss": 2.0322, "step": 127 }, { "batch_num_effect_tokens": 2496, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.17722, "grad_norm": 0.42977914214134216, "learning_rate": 8.827586206896552e-06, "loss": 1.9736, "step": 128 }, { "batch_num_effect_tokens": 2207, "batch_num_samples": 29, "batch_num_tokens": 16364, "epoch": 0.17861, "grad_norm": 0.4452879726886749, "learning_rate": 8.896551724137931e-06, "loss": 1.9536, "step": 129 }, { "batch_num_effect_tokens": 2371, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.17999, "grad_norm": 0.4837096035480499, "learning_rate": 8.965517241379312e-06, "loss": 2.1895, "step": 130 }, { "batch_num_effect_tokens": 2848, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.18138, "grad_norm": 0.4372546970844269, "learning_rate": 9.03448275862069e-06, "loss": 2.0605, "step": 131 }, { "batch_num_effect_tokens": 2753, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.18276, "grad_norm": 0.45657920837402344, "learning_rate": 9.10344827586207e-06, "loss": 2.1445, "step": 132 }, { "batch_num_effect_tokens": 2618, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.18415, "grad_norm": 0.4595434069633484, "learning_rate": 9.172413793103448e-06, "loss": 1.9824, "step": 133 }, { "batch_num_effect_tokens": 2358, "batch_num_samples": 30, "batch_num_tokens": 16380, "epoch": 0.18553, "grad_norm": 0.4178936183452606, "learning_rate": 9.241379310344829e-06, "loss": 1.9512, "step": 134 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.18692, "grad_norm": 0.5006194114685059, "learning_rate": 9.310344827586207e-06, "loss": 2.1602, "step": 135 }, { "batch_num_effect_tokens": 1797, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.1883, "grad_norm": 0.5093601942062378, "learning_rate": 9.379310344827586e-06, "loss": 2.0537, "step": 136 }, { "batch_num_effect_tokens": 2514, "batch_num_samples": 43, "batch_num_tokens": 16383, "epoch": 0.18969, "grad_norm": 0.5109336376190186, "learning_rate": 9.448275862068967e-06, "loss": 2.1318, "step": 137 }, { "batch_num_effect_tokens": 2472, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.19107, "grad_norm": 0.5214370489120483, "learning_rate": 9.517241379310346e-06, "loss": 1.8223, "step": 138 }, { "batch_num_effect_tokens": 2532, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.19245, "grad_norm": 0.5395306348800659, "learning_rate": 9.586206896551724e-06, "loss": 1.9702, "step": 139 }, { "batch_num_effect_tokens": 2877, "batch_num_samples": 54, "batch_num_tokens": 16326, "epoch": 0.19384, "grad_norm": 0.44825807213783264, "learning_rate": 9.655172413793105e-06, "loss": 1.9365, "step": 140 }, { "batch_num_effect_tokens": 2404, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.19522, "grad_norm": 0.5079715847969055, "learning_rate": 9.724137931034484e-06, "loss": 2.1123, "step": 141 }, { "batch_num_effect_tokens": 2510, "batch_num_samples": 29, "batch_num_tokens": 16312, "epoch": 0.19661, "grad_norm": 0.43037351965904236, "learning_rate": 9.793103448275863e-06, "loss": 1.9258, "step": 142 }, { "batch_num_effect_tokens": 2366, "batch_num_samples": 28, "batch_num_tokens": 16379, "epoch": 0.19799, "grad_norm": 0.4995596706867218, "learning_rate": 9.862068965517241e-06, "loss": 2.0566, "step": 143 }, { "batch_num_effect_tokens": 2435, "batch_num_samples": 40, "batch_num_tokens": 16303, "epoch": 0.19938, "grad_norm": 0.47196587920188904, "learning_rate": 9.931034482758622e-06, "loss": 2.3574, "step": 144 }, { "batch_num_effect_tokens": 2763, "batch_num_samples": 37, "batch_num_tokens": 16340, "epoch": 0.20076, "grad_norm": 0.42634639143943787, "learning_rate": 1e-05, "loss": 2.1133, "step": 145 }, { "batch_num_effect_tokens": 2763, "batch_num_samples": 37, "batch_num_tokens": 16340, "epoch": 0.20076, "eval_eval_loss": 0.5074756741523743, "eval_eval_runtime": 105.6646, "eval_eval_samples_per_second": 45.881, "eval_eval_steps_per_second": 2.868, "step": 145 }, { "batch_num_effect_tokens": 2253, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.20215, "grad_norm": 0.5233316421508789, "learning_rate": 9.999985377513126e-06, "loss": 2.3047, "step": 146 }, { "batch_num_effect_tokens": 2585, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.20353, "grad_norm": 0.39718231558799744, "learning_rate": 9.999941510138025e-06, "loss": 1.6958, "step": 147 }, { "batch_num_effect_tokens": 2596, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 0.20492, "grad_norm": 0.756216824054718, "learning_rate": 9.999868398131282e-06, "loss": 1.8635, "step": 148 }, { "batch_num_effect_tokens": 2701, "batch_num_samples": 31, "batch_num_tokens": 16372, "epoch": 0.2063, "grad_norm": 0.45411890745162964, "learning_rate": 9.999766041920525e-06, "loss": 2.2236, "step": 149 }, { "batch_num_effect_tokens": 2580, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 0.20768, "grad_norm": 0.46270617842674255, "learning_rate": 9.999634442104438e-06, "loss": 1.9619, "step": 150 }, { "batch_num_effect_tokens": 2576, "batch_num_samples": 32, "batch_num_tokens": 16288, "epoch": 0.20907, "grad_norm": 0.4114486575126648, "learning_rate": 9.999473599452746e-06, "loss": 1.9253, "step": 151 }, { "batch_num_effect_tokens": 2772, "batch_num_samples": 28, "batch_num_tokens": 16382, "epoch": 0.21045, "grad_norm": 0.5233032703399658, "learning_rate": 9.999283514906217e-06, "loss": 2.0537, "step": 152 }, { "batch_num_effect_tokens": 2320, "batch_num_samples": 33, "batch_num_tokens": 16302, "epoch": 0.21184, "grad_norm": 0.4910412132740021, "learning_rate": 9.999064189576653e-06, "loss": 2.0103, "step": 153 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.21322, "grad_norm": 0.4834710359573364, "learning_rate": 9.99881562474689e-06, "loss": 2.0957, "step": 154 }, { "batch_num_effect_tokens": 2527, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.21461, "grad_norm": 0.4427872896194458, "learning_rate": 9.99853782187078e-06, "loss": 2.0957, "step": 155 }, { "batch_num_effect_tokens": 2521, "batch_num_samples": 34, "batch_num_tokens": 16372, "epoch": 0.21599, "grad_norm": 0.4544258415699005, "learning_rate": 9.998230782573192e-06, "loss": 2.0781, "step": 156 }, { "batch_num_effect_tokens": 2647, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.21738, "grad_norm": 0.4290156960487366, "learning_rate": 9.997894508649995e-06, "loss": 1.9229, "step": 157 }, { "batch_num_effect_tokens": 2738, "batch_num_samples": 37, "batch_num_tokens": 16368, "epoch": 0.21876, "grad_norm": 0.45441320538520813, "learning_rate": 9.997529002068056e-06, "loss": 2.1396, "step": 158 }, { "batch_num_effect_tokens": 1929, "batch_num_samples": 29, "batch_num_tokens": 16378, "epoch": 0.22015, "grad_norm": 0.5086316466331482, "learning_rate": 9.99713426496522e-06, "loss": 1.9937, "step": 159 }, { "batch_num_effect_tokens": 2357, "batch_num_samples": 28, "batch_num_tokens": 16379, "epoch": 0.22153, "grad_norm": 0.44914594292640686, "learning_rate": 9.996710299650302e-06, "loss": 1.9297, "step": 160 }, { "batch_num_effect_tokens": 2521, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.22291, "grad_norm": 0.40938496589660645, "learning_rate": 9.996257108603073e-06, "loss": 2.0713, "step": 161 }, { "batch_num_effect_tokens": 3881, "batch_num_samples": 46, "batch_num_tokens": 16324, "epoch": 0.2243, "grad_norm": 0.42861175537109375, "learning_rate": 9.995774694474245e-06, "loss": 2.2588, "step": 162 }, { "batch_num_effect_tokens": 2819, "batch_num_samples": 45, "batch_num_tokens": 16295, "epoch": 0.22568, "grad_norm": 0.43471336364746094, "learning_rate": 9.995263060085456e-06, "loss": 2.0742, "step": 163 }, { "batch_num_effect_tokens": 2221, "batch_num_samples": 29, "batch_num_tokens": 16305, "epoch": 0.22707, "grad_norm": 0.4524233937263489, "learning_rate": 9.994722208429251e-06, "loss": 1.9204, "step": 164 }, { "batch_num_effect_tokens": 2555, "batch_num_samples": 34, "batch_num_tokens": 16342, "epoch": 0.22845, "grad_norm": 0.4341295659542084, "learning_rate": 9.994152142669073e-06, "loss": 2.0254, "step": 165 }, { "batch_num_effect_tokens": 2313, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 0.22984, "grad_norm": 0.46852561831474304, "learning_rate": 9.99355286613923e-06, "loss": 2.0625, "step": 166 }, { "batch_num_effect_tokens": 3072, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.23122, "grad_norm": 0.45277923345565796, "learning_rate": 9.992924382344887e-06, "loss": 2.0415, "step": 167 }, { "batch_num_effect_tokens": 1904, "batch_num_samples": 29, "batch_num_tokens": 16367, "epoch": 0.23261, "grad_norm": 0.4951099753379822, "learning_rate": 9.992266694962044e-06, "loss": 2.0884, "step": 168 }, { "batch_num_effect_tokens": 2215, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.23399, "grad_norm": 0.4218391478061676, "learning_rate": 9.991579807837511e-06, "loss": 2.0225, "step": 169 }, { "batch_num_effect_tokens": 1994, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.23538, "grad_norm": 0.47881779074668884, "learning_rate": 9.990863724988886e-06, "loss": 2.5679, "step": 170 }, { "batch_num_effect_tokens": 2347, "batch_num_samples": 35, "batch_num_tokens": 16278, "epoch": 0.23676, "grad_norm": 0.4324640929698944, "learning_rate": 9.990118450604535e-06, "loss": 1.9795, "step": 171 }, { "batch_num_effect_tokens": 2319, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.23814, "grad_norm": 0.4088183343410492, "learning_rate": 9.989343989043563e-06, "loss": 1.9297, "step": 172 }, { "batch_num_effect_tokens": 2155, "batch_num_samples": 33, "batch_num_tokens": 16279, "epoch": 0.23953, "grad_norm": 0.48555171489715576, "learning_rate": 9.988540344835794e-06, "loss": 2.0991, "step": 173 }, { "batch_num_effect_tokens": 2600, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.24091, "grad_norm": 0.38732707500457764, "learning_rate": 9.987707522681735e-06, "loss": 1.8057, "step": 174 }, { "batch_num_effect_tokens": 2621, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.2423, "grad_norm": 0.4354293942451477, "learning_rate": 9.98684552745256e-06, "loss": 2.1753, "step": 175 }, { "batch_num_effect_tokens": 1781, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.24368, "grad_norm": 0.5204740166664124, "learning_rate": 9.985954364190076e-06, "loss": 2.124, "step": 176 }, { "batch_num_effect_tokens": 2786, "batch_num_samples": 29, "batch_num_tokens": 16278, "epoch": 0.24507, "grad_norm": 0.43490099906921387, "learning_rate": 9.98503403810669e-06, "loss": 2.2119, "step": 177 }, { "batch_num_effect_tokens": 2339, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.24645, "grad_norm": 0.43543189764022827, "learning_rate": 9.984084554585387e-06, "loss": 1.8506, "step": 178 }, { "batch_num_effect_tokens": 1949, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.24784, "grad_norm": 0.497896283864975, "learning_rate": 9.98310591917969e-06, "loss": 1.9434, "step": 179 }, { "batch_num_effect_tokens": 2521, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.24922, "grad_norm": 0.44861480593681335, "learning_rate": 9.982098137613631e-06, "loss": 1.9668, "step": 180 }, { "batch_num_effect_tokens": 2375, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 0.25061, "grad_norm": 0.4383162558078766, "learning_rate": 9.98106121578172e-06, "loss": 1.7842, "step": 181 }, { "batch_num_effect_tokens": 2098, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.25199, "grad_norm": 0.4342973828315735, "learning_rate": 9.979995159748907e-06, "loss": 1.9043, "step": 182 }, { "batch_num_effect_tokens": 2957, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 0.25337, "grad_norm": 0.3884802758693695, "learning_rate": 9.978899975750548e-06, "loss": 2.0693, "step": 183 }, { "batch_num_effect_tokens": 2663, "batch_num_samples": 52, "batch_num_tokens": 16384, "epoch": 0.25476, "grad_norm": 0.48671188950538635, "learning_rate": 9.977775670192373e-06, "loss": 2.1494, "step": 184 }, { "batch_num_effect_tokens": 2298, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.25614, "grad_norm": 0.4435107707977295, "learning_rate": 9.976622249650432e-06, "loss": 2.1892, "step": 185 }, { "batch_num_effect_tokens": 3506, "batch_num_samples": 49, "batch_num_tokens": 16351, "epoch": 0.25753, "grad_norm": 0.5555258989334106, "learning_rate": 9.975439720871079e-06, "loss": 2.0781, "step": 186 }, { "batch_num_effect_tokens": 2500, "batch_num_samples": 37, "batch_num_tokens": 16368, "epoch": 0.25891, "grad_norm": 0.45029816031455994, "learning_rate": 9.97422809077092e-06, "loss": 2.126, "step": 187 }, { "batch_num_effect_tokens": 1535, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.2603, "grad_norm": 0.49058008193969727, "learning_rate": 9.972987366436772e-06, "loss": 2.0508, "step": 188 }, { "batch_num_effect_tokens": 2304, "batch_num_samples": 30, "batch_num_tokens": 16292, "epoch": 0.26168, "grad_norm": 0.47166702151298523, "learning_rate": 9.971717555125623e-06, "loss": 2.1328, "step": 189 }, { "batch_num_effect_tokens": 2609, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.26307, "grad_norm": 0.4227210581302643, "learning_rate": 9.970418664264596e-06, "loss": 1.9961, "step": 190 }, { "batch_num_effect_tokens": 3165, "batch_num_samples": 35, "batch_num_tokens": 16304, "epoch": 0.26445, "grad_norm": 0.4291287064552307, "learning_rate": 9.969090701450896e-06, "loss": 2.0713, "step": 191 }, { "batch_num_effect_tokens": 3392, "batch_num_samples": 33, "batch_num_tokens": 16311, "epoch": 0.26584, "grad_norm": 0.4046500027179718, "learning_rate": 9.96773367445177e-06, "loss": 1.8506, "step": 192 }, { "batch_num_effect_tokens": 2035, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.26722, "grad_norm": 0.4191986918449402, "learning_rate": 9.966347591204459e-06, "loss": 1.9399, "step": 193 }, { "batch_num_effect_tokens": 2762, "batch_num_samples": 28, "batch_num_tokens": 16379, "epoch": 0.26861, "grad_norm": 0.39693641662597656, "learning_rate": 9.964932459816161e-06, "loss": 2.043, "step": 194 }, { "batch_num_effect_tokens": 2649, "batch_num_samples": 34, "batch_num_tokens": 16362, "epoch": 0.26999, "grad_norm": 0.41024044156074524, "learning_rate": 9.963488288563972e-06, "loss": 2.1436, "step": 195 }, { "batch_num_effect_tokens": 2279, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.27137, "grad_norm": 0.44875651597976685, "learning_rate": 9.962015085894838e-06, "loss": 2.2217, "step": 196 }, { "batch_num_effect_tokens": 3123, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.27276, "grad_norm": 0.3715358376502991, "learning_rate": 9.960512860425517e-06, "loss": 2.1455, "step": 197 }, { "batch_num_effect_tokens": 2451, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.27414, "grad_norm": 0.41320058703422546, "learning_rate": 9.958981620942519e-06, "loss": 1.9556, "step": 198 }, { "batch_num_effect_tokens": 2724, "batch_num_samples": 28, "batch_num_tokens": 16337, "epoch": 0.27553, "grad_norm": 0.425836443901062, "learning_rate": 9.957421376402053e-06, "loss": 2.0146, "step": 199 }, { "batch_num_effect_tokens": 2182, "batch_num_samples": 29, "batch_num_tokens": 16317, "epoch": 0.27691, "grad_norm": 0.4488903880119324, "learning_rate": 9.955832135929978e-06, "loss": 1.9375, "step": 200 }, { "batch_num_effect_tokens": 2569, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.2783, "grad_norm": 0.4490876793861389, "learning_rate": 9.954213908821762e-06, "loss": 2.085, "step": 201 }, { "batch_num_effect_tokens": 2623, "batch_num_samples": 47, "batch_num_tokens": 16343, "epoch": 0.27968, "grad_norm": 0.43517589569091797, "learning_rate": 9.9525667045424e-06, "loss": 1.9648, "step": 202 }, { "batch_num_effect_tokens": 2316, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.28107, "grad_norm": 0.4525317847728729, "learning_rate": 9.950890532726382e-06, "loss": 2.0791, "step": 203 }, { "batch_num_effect_tokens": 1956, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.28245, "grad_norm": 0.48727837204933167, "learning_rate": 9.949185403177632e-06, "loss": 1.9429, "step": 204 }, { "batch_num_effect_tokens": 2584, "batch_num_samples": 28, "batch_num_tokens": 16368, "epoch": 0.28384, "grad_norm": 0.425095796585083, "learning_rate": 9.94745132586944e-06, "loss": 2.0518, "step": 205 }, { "batch_num_effect_tokens": 2662, "batch_num_samples": 38, "batch_num_tokens": 16370, "epoch": 0.28522, "grad_norm": 0.4351283013820648, "learning_rate": 9.945688310944415e-06, "loss": 2.1528, "step": 206 }, { "batch_num_effect_tokens": 2364, "batch_num_samples": 46, "batch_num_tokens": 16362, "epoch": 0.2866, "grad_norm": 0.4486139714717865, "learning_rate": 9.943896368714423e-06, "loss": 2.2578, "step": 207 }, { "batch_num_effect_tokens": 2094, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.28799, "grad_norm": 0.45044130086898804, "learning_rate": 9.942075509660527e-06, "loss": 2.062, "step": 208 }, { "batch_num_effect_tokens": 2023, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.28937, "grad_norm": 0.49800756573677063, "learning_rate": 9.940225744432919e-06, "loss": 1.7871, "step": 209 }, { "batch_num_effect_tokens": 2598, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.29076, "grad_norm": 0.457740843296051, "learning_rate": 9.938347083850866e-06, "loss": 1.957, "step": 210 }, { "batch_num_effect_tokens": 2144, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.29214, "grad_norm": 0.4548238217830658, "learning_rate": 9.936439538902644e-06, "loss": 1.9907, "step": 211 }, { "batch_num_effect_tokens": 2822, "batch_num_samples": 40, "batch_num_tokens": 16382, "epoch": 0.29353, "grad_norm": 0.444318026304245, "learning_rate": 9.934503120745476e-06, "loss": 2.0205, "step": 212 }, { "batch_num_effect_tokens": 2345, "batch_num_samples": 31, "batch_num_tokens": 16321, "epoch": 0.29491, "grad_norm": 0.4424176514148712, "learning_rate": 9.93253784070546e-06, "loss": 1.8076, "step": 213 }, { "batch_num_effect_tokens": 2511, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.2963, "grad_norm": 0.4655572175979614, "learning_rate": 9.93054371027751e-06, "loss": 2.083, "step": 214 }, { "batch_num_effect_tokens": 2388, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.29768, "grad_norm": 0.5405806303024292, "learning_rate": 9.92852074112528e-06, "loss": 1.999, "step": 215 }, { "batch_num_effect_tokens": 2478, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.29907, "grad_norm": 0.44077908992767334, "learning_rate": 9.926468945081109e-06, "loss": 1.9609, "step": 216 }, { "batch_num_effect_tokens": 2988, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 0.30045, "grad_norm": 0.42795538902282715, "learning_rate": 9.924388334145943e-06, "loss": 2.0645, "step": 217 }, { "batch_num_effect_tokens": 2304, "batch_num_samples": 31, "batch_num_tokens": 16321, "epoch": 0.30183, "grad_norm": 0.4741116166114807, "learning_rate": 9.922278920489262e-06, "loss": 2.1465, "step": 218 }, { "batch_num_effect_tokens": 2857, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.30322, "grad_norm": 0.4618435800075531, "learning_rate": 9.920140716449016e-06, "loss": 1.9062, "step": 219 }, { "batch_num_effect_tokens": 2150, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.3046, "grad_norm": 0.4802999496459961, "learning_rate": 9.917973734531549e-06, "loss": 2.0352, "step": 220 }, { "batch_num_effect_tokens": 3970, "batch_num_samples": 41, "batch_num_tokens": 16343, "epoch": 0.30599, "grad_norm": 0.35266441106796265, "learning_rate": 9.915777987411527e-06, "loss": 2.1631, "step": 221 }, { "batch_num_effect_tokens": 2342, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.30737, "grad_norm": 0.4226939082145691, "learning_rate": 9.913553487931865e-06, "loss": 1.9668, "step": 222 }, { "batch_num_effect_tokens": 2721, "batch_num_samples": 31, "batch_num_tokens": 16361, "epoch": 0.30876, "grad_norm": 0.47592079639434814, "learning_rate": 9.911300249103646e-06, "loss": 2.1426, "step": 223 }, { "batch_num_effect_tokens": 2209, "batch_num_samples": 31, "batch_num_tokens": 16377, "epoch": 0.31014, "grad_norm": 0.48503613471984863, "learning_rate": 9.909018284106054e-06, "loss": 2.0718, "step": 224 }, { "batch_num_effect_tokens": 2917, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.31153, "grad_norm": 0.40506455302238464, "learning_rate": 9.906707606286287e-06, "loss": 2.5908, "step": 225 }, { "batch_num_effect_tokens": 3690, "batch_num_samples": 45, "batch_num_tokens": 16332, "epoch": 0.31291, "grad_norm": 0.4393285810947418, "learning_rate": 9.904368229159494e-06, "loss": 1.8477, "step": 226 }, { "batch_num_effect_tokens": 2890, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.3143, "grad_norm": 0.3968466818332672, "learning_rate": 9.902000166408672e-06, "loss": 1.957, "step": 227 }, { "batch_num_effect_tokens": 2499, "batch_num_samples": 34, "batch_num_tokens": 16216, "epoch": 0.31568, "grad_norm": 0.42568477988243103, "learning_rate": 9.899603431884613e-06, "loss": 2.0264, "step": 228 }, { "batch_num_effect_tokens": 2727, "batch_num_samples": 36, "batch_num_tokens": 16373, "epoch": 0.31706, "grad_norm": 0.37732240557670593, "learning_rate": 9.897178039605803e-06, "loss": 1.9155, "step": 229 }, { "batch_num_effect_tokens": 2528, "batch_num_samples": 29, "batch_num_tokens": 16368, "epoch": 0.31845, "grad_norm": 0.47486749291419983, "learning_rate": 9.894724003758349e-06, "loss": 2.0303, "step": 230 }, { "batch_num_effect_tokens": 2206, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.31983, "grad_norm": 0.41266077756881714, "learning_rate": 9.892241338695892e-06, "loss": 2.0352, "step": 231 }, { "batch_num_effect_tokens": 2185, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.32122, "grad_norm": 0.4580242335796356, "learning_rate": 9.889730058939529e-06, "loss": 1.958, "step": 232 }, { "batch_num_effect_tokens": 3110, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 0.3226, "grad_norm": 0.3533010482788086, "learning_rate": 9.887190179177721e-06, "loss": 1.9673, "step": 233 }, { "batch_num_effect_tokens": 2385, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 0.32399, "grad_norm": 0.45556148886680603, "learning_rate": 9.884621714266212e-06, "loss": 2.0459, "step": 234 }, { "batch_num_effect_tokens": 2181, "batch_num_samples": 32, "batch_num_tokens": 16381, "epoch": 0.32537, "grad_norm": 0.45334944128990173, "learning_rate": 9.88202467922794e-06, "loss": 1.8232, "step": 235 }, { "batch_num_effect_tokens": 2548, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.32676, "grad_norm": 0.41539719700813293, "learning_rate": 9.879399089252947e-06, "loss": 2.1338, "step": 236 }, { "batch_num_effect_tokens": 2265, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.32814, "grad_norm": 0.4631437361240387, "learning_rate": 9.876744959698299e-06, "loss": 2.0938, "step": 237 }, { "batch_num_effect_tokens": 2328, "batch_num_samples": 29, "batch_num_tokens": 16354, "epoch": 0.32953, "grad_norm": 0.42043766379356384, "learning_rate": 9.874062306087983e-06, "loss": 1.8711, "step": 238 }, { "batch_num_effect_tokens": 2260, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.33091, "grad_norm": 0.40121302008628845, "learning_rate": 9.871351144112826e-06, "loss": 1.8496, "step": 239 }, { "batch_num_effect_tokens": 2671, "batch_num_samples": 33, "batch_num_tokens": 16357, "epoch": 0.33229, "grad_norm": 0.457048237323761, "learning_rate": 9.868611489630401e-06, "loss": 2.1758, "step": 240 }, { "batch_num_effect_tokens": 2015, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.33368, "grad_norm": 0.4228353798389435, "learning_rate": 9.865843358664933e-06, "loss": 2.1157, "step": 241 }, { "batch_num_effect_tokens": 2929, "batch_num_samples": 41, "batch_num_tokens": 16337, "epoch": 0.33506, "grad_norm": 0.3766017556190491, "learning_rate": 9.863046767407205e-06, "loss": 2.0737, "step": 242 }, { "batch_num_effect_tokens": 2397, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.33645, "grad_norm": 0.4442284107208252, "learning_rate": 9.860221732214463e-06, "loss": 1.9263, "step": 243 }, { "batch_num_effect_tokens": 2056, "batch_num_samples": 28, "batch_num_tokens": 16307, "epoch": 0.33783, "grad_norm": 0.5173854827880859, "learning_rate": 9.857368269610325e-06, "loss": 1.9814, "step": 244 }, { "batch_num_effect_tokens": 2992, "batch_num_samples": 70, "batch_num_tokens": 16238, "epoch": 0.33922, "grad_norm": 0.37581831216812134, "learning_rate": 9.854486396284678e-06, "loss": 1.9219, "step": 245 }, { "batch_num_effect_tokens": 2381, "batch_num_samples": 36, "batch_num_tokens": 16373, "epoch": 0.3406, "grad_norm": 0.48352372646331787, "learning_rate": 9.851576129093584e-06, "loss": 1.8809, "step": 246 }, { "batch_num_effect_tokens": 2154, "batch_num_samples": 33, "batch_num_tokens": 16357, "epoch": 0.34199, "grad_norm": 0.41893377900123596, "learning_rate": 9.848637485059183e-06, "loss": 2.0317, "step": 247 }, { "batch_num_effect_tokens": 2399, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.34337, "grad_norm": 0.4294080138206482, "learning_rate": 9.845670481369585e-06, "loss": 2.0322, "step": 248 }, { "batch_num_effect_tokens": 3105, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 0.34476, "grad_norm": 0.4336344301700592, "learning_rate": 9.842675135378779e-06, "loss": 1.9985, "step": 249 }, { "batch_num_effect_tokens": 2200, "batch_num_samples": 29, "batch_num_tokens": 16318, "epoch": 0.34614, "grad_norm": 0.4368569552898407, "learning_rate": 9.83965146460653e-06, "loss": 1.9434, "step": 250 }, { "batch_num_effect_tokens": 2879, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.34753, "grad_norm": 0.40224331617355347, "learning_rate": 9.836599486738271e-06, "loss": 1.8359, "step": 251 }, { "batch_num_effect_tokens": 2800, "batch_num_samples": 56, "batch_num_tokens": 16245, "epoch": 0.34891, "grad_norm": 0.3770677447319031, "learning_rate": 9.833519219625008e-06, "loss": 1.9521, "step": 252 }, { "batch_num_effect_tokens": 2373, "batch_num_samples": 28, "batch_num_tokens": 16277, "epoch": 0.35029, "grad_norm": 0.3883986473083496, "learning_rate": 9.830410681283203e-06, "loss": 1.9351, "step": 253 }, { "batch_num_effect_tokens": 3138, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.35168, "grad_norm": 0.37304437160491943, "learning_rate": 9.82727388989468e-06, "loss": 1.9443, "step": 254 }, { "batch_num_effect_tokens": 2596, "batch_num_samples": 32, "batch_num_tokens": 16332, "epoch": 0.35306, "grad_norm": 0.421138733625412, "learning_rate": 9.82410886380652e-06, "loss": 2.0508, "step": 255 }, { "batch_num_effect_tokens": 2480, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.35445, "grad_norm": 0.4281174838542938, "learning_rate": 9.820915621530939e-06, "loss": 2.1719, "step": 256 }, { "batch_num_effect_tokens": 2623, "batch_num_samples": 28, "batch_num_tokens": 16335, "epoch": 0.35583, "grad_norm": 0.4175695776939392, "learning_rate": 9.8176941817452e-06, "loss": 2.0923, "step": 257 }, { "batch_num_effect_tokens": 2150, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.35722, "grad_norm": 0.4575291872024536, "learning_rate": 9.814444563291478e-06, "loss": 2.0898, "step": 258 }, { "batch_num_effect_tokens": 2490, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.3586, "grad_norm": 0.4389287829399109, "learning_rate": 9.811166785176785e-06, "loss": 2.1099, "step": 259 }, { "batch_num_effect_tokens": 2503, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.35999, "grad_norm": 0.39610975980758667, "learning_rate": 9.807860866572822e-06, "loss": 1.9336, "step": 260 }, { "batch_num_effect_tokens": 2331, "batch_num_samples": 37, "batch_num_tokens": 16284, "epoch": 0.36137, "grad_norm": 0.4179364740848541, "learning_rate": 9.80452682681589e-06, "loss": 2.1733, "step": 261 }, { "batch_num_effect_tokens": 3065, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 0.36276, "grad_norm": 0.39875179529190063, "learning_rate": 9.80116468540677e-06, "loss": 2.1279, "step": 262 }, { "batch_num_effect_tokens": 2947, "batch_num_samples": 42, "batch_num_tokens": 16318, "epoch": 0.36414, "grad_norm": 0.4247397184371948, "learning_rate": 9.797774462010611e-06, "loss": 1.8511, "step": 263 }, { "batch_num_effect_tokens": 2148, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.36552, "grad_norm": 0.5324507355690002, "learning_rate": 9.794356176456813e-06, "loss": 1.8652, "step": 264 }, { "batch_num_effect_tokens": 2662, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 0.36691, "grad_norm": 0.4329901337623596, "learning_rate": 9.790909848738907e-06, "loss": 1.9775, "step": 265 }, { "batch_num_effect_tokens": 1877, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.36829, "grad_norm": 0.5041355490684509, "learning_rate": 9.787435499014446e-06, "loss": 2.0859, "step": 266 }, { "batch_num_effect_tokens": 3007, "batch_num_samples": 54, "batch_num_tokens": 16280, "epoch": 0.36968, "grad_norm": 0.3835192024707794, "learning_rate": 9.783933147604885e-06, "loss": 1.9097, "step": 267 }, { "batch_num_effect_tokens": 2601, "batch_num_samples": 34, "batch_num_tokens": 16352, "epoch": 0.37106, "grad_norm": 0.4445229470729828, "learning_rate": 9.780402814995458e-06, "loss": 1.9355, "step": 268 }, { "batch_num_effect_tokens": 2435, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.37245, "grad_norm": 0.3842845857143402, "learning_rate": 9.776844521835064e-06, "loss": 2.084, "step": 269 }, { "batch_num_effect_tokens": 1954, "batch_num_samples": 29, "batch_num_tokens": 16365, "epoch": 0.37383, "grad_norm": 0.46704360842704773, "learning_rate": 9.773258288936139e-06, "loss": 1.8672, "step": 270 }, { "batch_num_effect_tokens": 2573, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.37522, "grad_norm": 0.3944907784461975, "learning_rate": 9.76964413727454e-06, "loss": 1.8599, "step": 271 }, { "batch_num_effect_tokens": 2735, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.3766, "grad_norm": 0.4014173150062561, "learning_rate": 9.76600208798942e-06, "loss": 2.0811, "step": 272 }, { "batch_num_effect_tokens": 3052, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.37799, "grad_norm": 0.3884974420070648, "learning_rate": 9.76233216238311e-06, "loss": 1.9229, "step": 273 }, { "batch_num_effect_tokens": 2487, "batch_num_samples": 43, "batch_num_tokens": 16347, "epoch": 0.37937, "grad_norm": 0.44373294711112976, "learning_rate": 9.758634381920982e-06, "loss": 2.0371, "step": 274 }, { "batch_num_effect_tokens": 2662, "batch_num_samples": 33, "batch_num_tokens": 16334, "epoch": 0.38075, "grad_norm": 0.3910280764102936, "learning_rate": 9.754908768231337e-06, "loss": 1.959, "step": 275 }, { "batch_num_effect_tokens": 2548, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.38214, "grad_norm": 0.4032585024833679, "learning_rate": 9.751155343105269e-06, "loss": 2.1074, "step": 276 }, { "batch_num_effect_tokens": 3009, "batch_num_samples": 35, "batch_num_tokens": 16330, "epoch": 0.38352, "grad_norm": 0.4202347695827484, "learning_rate": 9.747374128496541e-06, "loss": 1.959, "step": 277 }, { "batch_num_effect_tokens": 2216, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.38491, "grad_norm": 0.4149520695209503, "learning_rate": 9.743565146521459e-06, "loss": 1.8999, "step": 278 }, { "batch_num_effect_tokens": 2014, "batch_num_samples": 29, "batch_num_tokens": 16322, "epoch": 0.38629, "grad_norm": 0.4332839548587799, "learning_rate": 9.739728419458738e-06, "loss": 1.7886, "step": 279 }, { "batch_num_effect_tokens": 2432, "batch_num_samples": 37, "batch_num_tokens": 16351, "epoch": 0.38768, "grad_norm": 0.4043715298175812, "learning_rate": 9.735863969749373e-06, "loss": 1.9941, "step": 280 }, { "batch_num_effect_tokens": 2167, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.38906, "grad_norm": 0.43381571769714355, "learning_rate": 9.731971819996513e-06, "loss": 1.957, "step": 281 }, { "batch_num_effect_tokens": 2500, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.39045, "grad_norm": 0.4764724373817444, "learning_rate": 9.728051992965316e-06, "loss": 2.2783, "step": 282 }, { "batch_num_effect_tokens": 2477, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.39183, "grad_norm": 0.4670235216617584, "learning_rate": 9.724104511582838e-06, "loss": 1.9507, "step": 283 }, { "batch_num_effect_tokens": 2065, "batch_num_samples": 34, "batch_num_tokens": 16382, "epoch": 0.39322, "grad_norm": 0.4099712371826172, "learning_rate": 9.720129398937871e-06, "loss": 2.0747, "step": 284 }, { "batch_num_effect_tokens": 1909, "batch_num_samples": 29, "batch_num_tokens": 16364, "epoch": 0.3946, "grad_norm": 0.4676312208175659, "learning_rate": 9.716126678280829e-06, "loss": 2.0518, "step": 285 }, { "batch_num_effect_tokens": 2810, "batch_num_samples": 30, "batch_num_tokens": 16383, "epoch": 0.39598, "grad_norm": 0.4476577043533325, "learning_rate": 9.712096373023603e-06, "loss": 1.9961, "step": 286 }, { "batch_num_effect_tokens": 2392, "batch_num_samples": 34, "batch_num_tokens": 16342, "epoch": 0.39737, "grad_norm": 0.4112434685230255, "learning_rate": 9.70803850673943e-06, "loss": 1.9873, "step": 287 }, { "batch_num_effect_tokens": 3178, "batch_num_samples": 29, "batch_num_tokens": 16369, "epoch": 0.39875, "grad_norm": 0.4206170439720154, "learning_rate": 9.703953103162748e-06, "loss": 2.167, "step": 288 }, { "batch_num_effect_tokens": 3044, "batch_num_samples": 30, "batch_num_tokens": 16271, "epoch": 0.40014, "grad_norm": 0.4196258783340454, "learning_rate": 9.699840186189061e-06, "loss": 2.2207, "step": 289 }, { "batch_num_effect_tokens": 2454, "batch_num_samples": 30, "batch_num_tokens": 16353, "epoch": 0.40152, "grad_norm": 0.4163075387477875, "learning_rate": 9.695699779874796e-06, "loss": 1.9355, "step": 290 }, { "batch_num_effect_tokens": 2454, "batch_num_samples": 30, "batch_num_tokens": 16353, "epoch": 0.40152, "eval_eval_loss": 0.4898250699043274, "eval_eval_runtime": 124.0708, "eval_eval_samples_per_second": 39.074, "eval_eval_steps_per_second": 2.442, "step": 290 }, { "batch_num_effect_tokens": 2933, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 0.40291, "grad_norm": 0.3759535551071167, "learning_rate": 9.691531908437171e-06, "loss": 2.0122, "step": 291 }, { "batch_num_effect_tokens": 2576, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 0.40429, "grad_norm": 0.44778937101364136, "learning_rate": 9.687336596254045e-06, "loss": 2.0427, "step": 292 }, { "batch_num_effect_tokens": 2210, "batch_num_samples": 30, "batch_num_tokens": 16324, "epoch": 0.40568, "grad_norm": 0.3930279314517975, "learning_rate": 9.683113867863772e-06, "loss": 1.9775, "step": 293 }, { "batch_num_effect_tokens": 2208, "batch_num_samples": 29, "batch_num_tokens": 16380, "epoch": 0.40706, "grad_norm": 0.4249440133571625, "learning_rate": 9.678863747965073e-06, "loss": 1.8184, "step": 294 }, { "batch_num_effect_tokens": 2289, "batch_num_samples": 35, "batch_num_tokens": 16382, "epoch": 0.40845, "grad_norm": 0.4432763457298279, "learning_rate": 9.674586261416874e-06, "loss": 1.9072, "step": 295 }, { "batch_num_effect_tokens": 2301, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.40983, "grad_norm": 0.49523136019706726, "learning_rate": 9.670281433238173e-06, "loss": 2.021, "step": 296 }, { "batch_num_effect_tokens": 2838, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.41121, "grad_norm": 0.4077904224395752, "learning_rate": 9.665949288607889e-06, "loss": 1.9795, "step": 297 }, { "batch_num_effect_tokens": 2342, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.4126, "grad_norm": 0.4294334948062897, "learning_rate": 9.66158985286471e-06, "loss": 1.8535, "step": 298 }, { "batch_num_effect_tokens": 2395, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.41398, "grad_norm": 0.4167822599411011, "learning_rate": 9.657203151506953e-06, "loss": 1.6934, "step": 299 }, { "batch_num_effect_tokens": 2733, "batch_num_samples": 33, "batch_num_tokens": 16383, "epoch": 0.41537, "grad_norm": 0.40432628989219666, "learning_rate": 9.652789210192412e-06, "loss": 2.123, "step": 300 }, { "batch_num_effect_tokens": 2251, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.41675, "grad_norm": 0.48329272866249084, "learning_rate": 9.648348054738208e-06, "loss": 2.1328, "step": 301 }, { "batch_num_effect_tokens": 2386, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.41814, "grad_norm": 0.4899674355983734, "learning_rate": 9.643879711120636e-06, "loss": 2.1675, "step": 302 }, { "batch_num_effect_tokens": 2666, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 0.41952, "grad_norm": 0.42604148387908936, "learning_rate": 9.639384205475012e-06, "loss": 1.9487, "step": 303 }, { "batch_num_effect_tokens": 2378, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 0.42091, "grad_norm": 0.4134483337402344, "learning_rate": 9.634861564095525e-06, "loss": 1.917, "step": 304 }, { "batch_num_effect_tokens": 2631, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.42229, "grad_norm": 0.42034024000167847, "learning_rate": 9.630311813435082e-06, "loss": 2.0049, "step": 305 }, { "batch_num_effect_tokens": 2507, "batch_num_samples": 32, "batch_num_tokens": 16354, "epoch": 0.42368, "grad_norm": 0.3883148729801178, "learning_rate": 9.62573498010515e-06, "loss": 1.8828, "step": 306 }, { "batch_num_effect_tokens": 2442, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.42506, "grad_norm": 0.41206690669059753, "learning_rate": 9.621131090875603e-06, "loss": 2.1611, "step": 307 }, { "batch_num_effect_tokens": 2496, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.42645, "grad_norm": 0.3996421992778778, "learning_rate": 9.616500172674568e-06, "loss": 2.0156, "step": 308 }, { "batch_num_effect_tokens": 2901, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.42783, "grad_norm": 0.36901751160621643, "learning_rate": 9.611842252588259e-06, "loss": 1.9141, "step": 309 }, { "batch_num_effect_tokens": 2211, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.42921, "grad_norm": 0.45314258337020874, "learning_rate": 9.607157357860823e-06, "loss": 1.9326, "step": 310 }, { "batch_num_effect_tokens": 2768, "batch_num_samples": 36, "batch_num_tokens": 16382, "epoch": 0.4306, "grad_norm": 0.37405553460121155, "learning_rate": 9.60244551589419e-06, "loss": 1.8496, "step": 311 }, { "batch_num_effect_tokens": 2519, "batch_num_samples": 32, "batch_num_tokens": 16354, "epoch": 0.43198, "grad_norm": 0.4137744903564453, "learning_rate": 9.597706754247895e-06, "loss": 1.9497, "step": 312 }, { "batch_num_effect_tokens": 2051, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.43337, "grad_norm": 0.45725569128990173, "learning_rate": 9.59294110063893e-06, "loss": 2.0298, "step": 313 }, { "batch_num_effect_tokens": 2389, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.43475, "grad_norm": 0.4542232155799866, "learning_rate": 9.588148582941583e-06, "loss": 2.0645, "step": 314 }, { "batch_num_effect_tokens": 1939, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 0.43614, "grad_norm": 0.46026375889778137, "learning_rate": 9.583329229187259e-06, "loss": 1.8193, "step": 315 }, { "batch_num_effect_tokens": 2469, "batch_num_samples": 28, "batch_num_tokens": 16277, "epoch": 0.43752, "grad_norm": 0.4066016674041748, "learning_rate": 9.578483067564335e-06, "loss": 2.1797, "step": 316 }, { "batch_num_effect_tokens": 2964, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.43891, "grad_norm": 0.379663348197937, "learning_rate": 9.573610126417985e-06, "loss": 2.1143, "step": 317 }, { "batch_num_effect_tokens": 2309, "batch_num_samples": 32, "batch_num_tokens": 16310, "epoch": 0.44029, "grad_norm": 0.4310319721698761, "learning_rate": 9.568710434250017e-06, "loss": 1.8003, "step": 318 }, { "batch_num_effect_tokens": 2542, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 0.44168, "grad_norm": 0.4266505241394043, "learning_rate": 9.563784019718704e-06, "loss": 2.0132, "step": 319 }, { "batch_num_effect_tokens": 1788, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.44306, "grad_norm": 0.483811616897583, "learning_rate": 9.558830911638616e-06, "loss": 1.9443, "step": 320 }, { "batch_num_effect_tokens": 2605, "batch_num_samples": 33, "batch_num_tokens": 16302, "epoch": 0.44444, "grad_norm": 0.4202471077442169, "learning_rate": 9.553851138980462e-06, "loss": 1.958, "step": 321 }, { "batch_num_effect_tokens": 1972, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.44583, "grad_norm": 0.4090140163898468, "learning_rate": 9.548844730870903e-06, "loss": 1.7783, "step": 322 }, { "batch_num_effect_tokens": 2196, "batch_num_samples": 29, "batch_num_tokens": 16298, "epoch": 0.44721, "grad_norm": 0.37841951847076416, "learning_rate": 9.543811716592391e-06, "loss": 1.9243, "step": 323 }, { "batch_num_effect_tokens": 2165, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.4486, "grad_norm": 0.4422109127044678, "learning_rate": 9.538752125583003e-06, "loss": 1.9756, "step": 324 }, { "batch_num_effect_tokens": 2841, "batch_num_samples": 46, "batch_num_tokens": 16284, "epoch": 0.44998, "grad_norm": 0.33119142055511475, "learning_rate": 9.533665987436262e-06, "loss": 1.8472, "step": 325 }, { "batch_num_effect_tokens": 2171, "batch_num_samples": 29, "batch_num_tokens": 16368, "epoch": 0.45137, "grad_norm": 0.43908819556236267, "learning_rate": 9.52855333190096e-06, "loss": 1.8965, "step": 326 }, { "batch_num_effect_tokens": 2327, "batch_num_samples": 31, "batch_num_tokens": 16383, "epoch": 0.45275, "grad_norm": 0.44274982810020447, "learning_rate": 9.523414188880994e-06, "loss": 2.1045, "step": 327 }, { "batch_num_effect_tokens": 2076, "batch_num_samples": 28, "batch_num_tokens": 16304, "epoch": 0.45414, "grad_norm": 0.456655353307724, "learning_rate": 9.518248588435185e-06, "loss": 1.9404, "step": 328 }, { "batch_num_effect_tokens": 2166, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.45552, "grad_norm": 0.39664626121520996, "learning_rate": 9.513056560777102e-06, "loss": 2.1226, "step": 329 }, { "batch_num_effect_tokens": 2174, "batch_num_samples": 31, "batch_num_tokens": 16289, "epoch": 0.45691, "grad_norm": 0.4169045686721802, "learning_rate": 9.507838136274887e-06, "loss": 1.7944, "step": 330 }, { "batch_num_effect_tokens": 1978, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.45829, "grad_norm": 0.4782629609107971, "learning_rate": 9.502593345451078e-06, "loss": 2.2871, "step": 331 }, { "batch_num_effect_tokens": 2410, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.45967, "grad_norm": 0.4343865215778351, "learning_rate": 9.49732221898243e-06, "loss": 1.9702, "step": 332 }, { "batch_num_effect_tokens": 2329, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.46106, "grad_norm": 0.4620456099510193, "learning_rate": 9.492024787699733e-06, "loss": 2.0566, "step": 333 }, { "batch_num_effect_tokens": 2670, "batch_num_samples": 45, "batch_num_tokens": 16384, "epoch": 0.46244, "grad_norm": 0.4406795799732208, "learning_rate": 9.486701082587635e-06, "loss": 1.9121, "step": 334 }, { "batch_num_effect_tokens": 2286, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.46383, "grad_norm": 0.46398094296455383, "learning_rate": 9.481351134784458e-06, "loss": 2.0664, "step": 335 }, { "batch_num_effect_tokens": 2389, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.46521, "grad_norm": 0.45239660143852234, "learning_rate": 9.475974975582021e-06, "loss": 2.0361, "step": 336 }, { "batch_num_effect_tokens": 1953, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.4666, "grad_norm": 0.4313308000564575, "learning_rate": 9.470572636425451e-06, "loss": 1.9053, "step": 337 }, { "batch_num_effect_tokens": 2380, "batch_num_samples": 30, "batch_num_tokens": 16290, "epoch": 0.46798, "grad_norm": 0.4705055058002472, "learning_rate": 9.465144148912997e-06, "loss": 1.9443, "step": 338 }, { "batch_num_effect_tokens": 3110, "batch_num_samples": 44, "batch_num_tokens": 16364, "epoch": 0.46937, "grad_norm": 0.3910743296146393, "learning_rate": 9.459689544795859e-06, "loss": 2.0137, "step": 339 }, { "batch_num_effect_tokens": 2200, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.47075, "grad_norm": 0.412332683801651, "learning_rate": 9.454208855977986e-06, "loss": 1.8252, "step": 340 }, { "batch_num_effect_tokens": 2045, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 0.47214, "grad_norm": 0.3931344151496887, "learning_rate": 9.448702114515897e-06, "loss": 2.04, "step": 341 }, { "batch_num_effect_tokens": 2877, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.47352, "grad_norm": 0.4233572781085968, "learning_rate": 9.443169352618498e-06, "loss": 2.1748, "step": 342 }, { "batch_num_effect_tokens": 2035, "batch_num_samples": 31, "batch_num_tokens": 16205, "epoch": 0.4749, "grad_norm": 0.4216836988925934, "learning_rate": 9.437610602646878e-06, "loss": 2.0352, "step": 343 }, { "batch_num_effect_tokens": 3122, "batch_num_samples": 45, "batch_num_tokens": 16382, "epoch": 0.47629, "grad_norm": 0.34738537669181824, "learning_rate": 9.43202589711414e-06, "loss": 1.7275, "step": 344 }, { "batch_num_effect_tokens": 2473, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.47767, "grad_norm": 0.4602767527103424, "learning_rate": 9.426415268685198e-06, "loss": 1.8398, "step": 345 }, { "batch_num_effect_tokens": 2525, "batch_num_samples": 28, "batch_num_tokens": 16304, "epoch": 0.47906, "grad_norm": 0.4130743741989136, "learning_rate": 9.420778750176588e-06, "loss": 1.9717, "step": 346 }, { "batch_num_effect_tokens": 2282, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.48044, "grad_norm": 0.41858863830566406, "learning_rate": 9.415116374556276e-06, "loss": 3.0293, "step": 347 }, { "batch_num_effect_tokens": 2526, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.48183, "grad_norm": 0.4118393361568451, "learning_rate": 9.409428174943468e-06, "loss": 1.917, "step": 348 }, { "batch_num_effect_tokens": 1905, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.48321, "grad_norm": 0.44558751583099365, "learning_rate": 9.403714184608411e-06, "loss": 2.0171, "step": 349 }, { "batch_num_effect_tokens": 2138, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.4846, "grad_norm": 0.4443334639072418, "learning_rate": 9.397974436972208e-06, "loss": 2.103, "step": 350 }, { "batch_num_effect_tokens": 2287, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.48598, "grad_norm": 0.45273807644844055, "learning_rate": 9.392208965606613e-06, "loss": 2.1514, "step": 351 }, { "batch_num_effect_tokens": 2174, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.48737, "grad_norm": 0.40412938594818115, "learning_rate": 9.386417804233836e-06, "loss": 2.0249, "step": 352 }, { "batch_num_effect_tokens": 3036, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.48875, "grad_norm": 0.43419134616851807, "learning_rate": 9.380600986726349e-06, "loss": 2.0479, "step": 353 }, { "batch_num_effect_tokens": 1985, "batch_num_samples": 29, "batch_num_tokens": 16379, "epoch": 0.49013, "grad_norm": 0.40396714210510254, "learning_rate": 9.374758547106689e-06, "loss": 1.6885, "step": 354 }, { "batch_num_effect_tokens": 1954, "batch_num_samples": 31, "batch_num_tokens": 16381, "epoch": 0.49152, "grad_norm": 0.4479818344116211, "learning_rate": 9.36889051954725e-06, "loss": 2.1064, "step": 355 }, { "batch_num_effect_tokens": 2819, "batch_num_samples": 34, "batch_num_tokens": 16377, "epoch": 0.4929, "grad_norm": 0.4053419232368469, "learning_rate": 9.362996938370103e-06, "loss": 1.9756, "step": 356 }, { "batch_num_effect_tokens": 2265, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.49429, "grad_norm": 0.4398326277732849, "learning_rate": 9.357077838046766e-06, "loss": 1.9531, "step": 357 }, { "batch_num_effect_tokens": 2921, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.49567, "grad_norm": 0.3626527488231659, "learning_rate": 9.351133253198027e-06, "loss": 2.0752, "step": 358 }, { "batch_num_effect_tokens": 2654, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.49706, "grad_norm": 0.3847633898258209, "learning_rate": 9.345163218593735e-06, "loss": 1.7324, "step": 359 }, { "batch_num_effect_tokens": 2180, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.49844, "grad_norm": 0.4464966356754303, "learning_rate": 9.339167769152588e-06, "loss": 2.0083, "step": 360 }, { "batch_num_effect_tokens": 1934, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.49983, "grad_norm": 0.3943168520927429, "learning_rate": 9.333146939941938e-06, "loss": 1.8916, "step": 361 }, { "batch_num_effect_tokens": 2505, "batch_num_samples": 29, "batch_num_tokens": 16364, "epoch": 0.50121, "grad_norm": 0.4155826270580292, "learning_rate": 9.327100766177585e-06, "loss": 2.2666, "step": 362 }, { "batch_num_effect_tokens": 2747, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.5026, "grad_norm": 0.416442334651947, "learning_rate": 9.321029283223567e-06, "loss": 2.0107, "step": 363 }, { "batch_num_effect_tokens": 2265, "batch_num_samples": 30, "batch_num_tokens": 16374, "epoch": 0.50398, "grad_norm": 0.4200302064418793, "learning_rate": 9.314932526591956e-06, "loss": 1.9302, "step": 364 }, { "batch_num_effect_tokens": 2532, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.50537, "grad_norm": 0.5051868557929993, "learning_rate": 9.30881053194265e-06, "loss": 2.001, "step": 365 }, { "batch_num_effect_tokens": 2481, "batch_num_samples": 48, "batch_num_tokens": 16312, "epoch": 0.50675, "grad_norm": 0.4330357611179352, "learning_rate": 9.302663335083161e-06, "loss": 1.8872, "step": 366 }, { "batch_num_effect_tokens": 2211, "batch_num_samples": 31, "batch_num_tokens": 16375, "epoch": 0.50813, "grad_norm": 0.4246368706226349, "learning_rate": 9.296490971968416e-06, "loss": 1.917, "step": 367 }, { "batch_num_effect_tokens": 2429, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.50952, "grad_norm": 0.41249212622642517, "learning_rate": 9.29029347870053e-06, "loss": 2.1074, "step": 368 }, { "batch_num_effect_tokens": 2268, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.5109, "grad_norm": 0.4310665428638458, "learning_rate": 9.28407089152861e-06, "loss": 1.9326, "step": 369 }, { "batch_num_effect_tokens": 3215, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.51229, "grad_norm": 0.3914966583251953, "learning_rate": 9.277823246848537e-06, "loss": 1.9341, "step": 370 }, { "batch_num_effect_tokens": 1790, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.51367, "grad_norm": 0.43119508028030396, "learning_rate": 9.27155058120275e-06, "loss": 1.979, "step": 371 }, { "batch_num_effect_tokens": 2241, "batch_num_samples": 31, "batch_num_tokens": 16361, "epoch": 0.51506, "grad_norm": 0.4054321050643921, "learning_rate": 9.26525293128004e-06, "loss": 1.9907, "step": 372 }, { "batch_num_effect_tokens": 2761, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.51644, "grad_norm": 0.4311511218547821, "learning_rate": 9.258930333915325e-06, "loss": 1.9668, "step": 373 }, { "batch_num_effect_tokens": 3167, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.51783, "grad_norm": 0.41182681918144226, "learning_rate": 9.252582826089447e-06, "loss": 2.1475, "step": 374 }, { "batch_num_effect_tokens": 2208, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.51921, "grad_norm": 0.5092533230781555, "learning_rate": 9.246210444928942e-06, "loss": 1.916, "step": 375 }, { "batch_num_effect_tokens": 3473, "batch_num_samples": 49, "batch_num_tokens": 16376, "epoch": 0.5206, "grad_norm": 0.3312603831291199, "learning_rate": 9.23981322770584e-06, "loss": 1.9839, "step": 376 }, { "batch_num_effect_tokens": 2277, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.52198, "grad_norm": 0.41733917593955994, "learning_rate": 9.233391211837423e-06, "loss": 1.7197, "step": 377 }, { "batch_num_effect_tokens": 4126, "batch_num_samples": 58, "batch_num_tokens": 16225, "epoch": 0.52336, "grad_norm": 0.31288862228393555, "learning_rate": 9.226944434886034e-06, "loss": 1.8389, "step": 378 }, { "batch_num_effect_tokens": 2219, "batch_num_samples": 31, "batch_num_tokens": 16272, "epoch": 0.52475, "grad_norm": 0.4391637444496155, "learning_rate": 9.220472934558838e-06, "loss": 2.0757, "step": 379 }, { "batch_num_effect_tokens": 2208, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.52613, "grad_norm": 0.4120720326900482, "learning_rate": 9.213976748707602e-06, "loss": 1.8794, "step": 380 }, { "batch_num_effect_tokens": 2137, "batch_num_samples": 32, "batch_num_tokens": 16376, "epoch": 0.52752, "grad_norm": 0.4928543269634247, "learning_rate": 9.207455915328487e-06, "loss": 1.916, "step": 381 }, { "batch_num_effect_tokens": 3329, "batch_num_samples": 57, "batch_num_tokens": 16276, "epoch": 0.5289, "grad_norm": 0.38053733110427856, "learning_rate": 9.20091047256181e-06, "loss": 1.9473, "step": 382 }, { "batch_num_effect_tokens": 2324, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.53029, "grad_norm": 0.42396068572998047, "learning_rate": 9.194340458691833e-06, "loss": 1.687, "step": 383 }, { "batch_num_effect_tokens": 2237, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.53167, "grad_norm": 0.4578258693218231, "learning_rate": 9.187745912146535e-06, "loss": 1.9639, "step": 384 }, { "batch_num_effect_tokens": 2701, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.53306, "grad_norm": 0.4379984736442566, "learning_rate": 9.181126871497378e-06, "loss": 1.8828, "step": 385 }, { "batch_num_effect_tokens": 3158, "batch_num_samples": 29, "batch_num_tokens": 16353, "epoch": 0.53444, "grad_norm": 0.388353556394577, "learning_rate": 9.174483375459102e-06, "loss": 1.8887, "step": 386 }, { "batch_num_effect_tokens": 2242, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.53583, "grad_norm": 0.4682246744632721, "learning_rate": 9.167815462889477e-06, "loss": 2.0723, "step": 387 }, { "batch_num_effect_tokens": 2525, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.53721, "grad_norm": 0.4079650044441223, "learning_rate": 9.161123172789091e-06, "loss": 1.9727, "step": 388 }, { "batch_num_effect_tokens": 3037, "batch_num_samples": 56, "batch_num_tokens": 16312, "epoch": 0.53859, "grad_norm": 0.3424369692802429, "learning_rate": 9.154406544301113e-06, "loss": 1.8857, "step": 389 }, { "batch_num_effect_tokens": 2613, "batch_num_samples": 31, "batch_num_tokens": 16332, "epoch": 0.53998, "grad_norm": 0.4263276755809784, "learning_rate": 9.147665616711065e-06, "loss": 1.9619, "step": 390 }, { "batch_num_effect_tokens": 2153, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.54136, "grad_norm": 0.4267057478427887, "learning_rate": 9.140900429446601e-06, "loss": 1.9902, "step": 391 }, { "batch_num_effect_tokens": 2642, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.54275, "grad_norm": 0.4078642725944519, "learning_rate": 9.134111022077263e-06, "loss": 1.9346, "step": 392 }, { "batch_num_effect_tokens": 2083, "batch_num_samples": 30, "batch_num_tokens": 16381, "epoch": 0.54413, "grad_norm": 0.4503459930419922, "learning_rate": 9.127297434314262e-06, "loss": 1.8809, "step": 393 }, { "batch_num_effect_tokens": 2572, "batch_num_samples": 29, "batch_num_tokens": 16310, "epoch": 0.54552, "grad_norm": 0.447170227766037, "learning_rate": 9.120459706010233e-06, "loss": 1.9397, "step": 394 }, { "batch_num_effect_tokens": 2207, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.5469, "grad_norm": 0.451015830039978, "learning_rate": 9.113597877159014e-06, "loss": 1.915, "step": 395 }, { "batch_num_effect_tokens": 2688, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.54829, "grad_norm": 0.3790961802005768, "learning_rate": 9.106711987895411e-06, "loss": 1.8867, "step": 396 }, { "batch_num_effect_tokens": 2481, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.54967, "grad_norm": 0.4053148627281189, "learning_rate": 9.099802078494947e-06, "loss": 2.0215, "step": 397 }, { "batch_num_effect_tokens": 2232, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.55106, "grad_norm": 0.417107492685318, "learning_rate": 9.092868189373651e-06, "loss": 1.9971, "step": 398 }, { "batch_num_effect_tokens": 2053, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.55244, "grad_norm": 0.3955957293510437, "learning_rate": 9.085910361087802e-06, "loss": 1.8682, "step": 399 }, { "batch_num_effect_tokens": 2283, "batch_num_samples": 29, "batch_num_tokens": 16335, "epoch": 0.55382, "grad_norm": 0.4278542101383209, "learning_rate": 9.0789286343337e-06, "loss": 1.8779, "step": 400 }, { "batch_num_effect_tokens": 3543, "batch_num_samples": 41, "batch_num_tokens": 16319, "epoch": 0.55521, "grad_norm": 0.3659515380859375, "learning_rate": 9.071923049947429e-06, "loss": 1.7993, "step": 401 }, { "batch_num_effect_tokens": 2405, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.55659, "grad_norm": 0.4441213011741638, "learning_rate": 9.064893648904617e-06, "loss": 2.002, "step": 402 }, { "batch_num_effect_tokens": 2291, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.55798, "grad_norm": 0.4033616781234741, "learning_rate": 9.057840472320192e-06, "loss": 1.9761, "step": 403 }, { "batch_num_effect_tokens": 2763, "batch_num_samples": 31, "batch_num_tokens": 16205, "epoch": 0.55936, "grad_norm": 0.4063546657562256, "learning_rate": 9.050763561448147e-06, "loss": 2.1704, "step": 404 }, { "batch_num_effect_tokens": 2688, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.56075, "grad_norm": 0.38846081495285034, "learning_rate": 9.043662957681297e-06, "loss": 1.9189, "step": 405 }, { "batch_num_effect_tokens": 3243, "batch_num_samples": 34, "batch_num_tokens": 16332, "epoch": 0.56213, "grad_norm": 0.400756299495697, "learning_rate": 9.036538702551037e-06, "loss": 2.0503, "step": 406 }, { "batch_num_effect_tokens": 3261, "batch_num_samples": 47, "batch_num_tokens": 16383, "epoch": 0.56352, "grad_norm": 0.3152129054069519, "learning_rate": 9.029390837727094e-06, "loss": 1.7764, "step": 407 }, { "batch_num_effect_tokens": 2266, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.5649, "grad_norm": 0.4328164756298065, "learning_rate": 9.022219405017296e-06, "loss": 1.979, "step": 408 }, { "batch_num_effect_tokens": 2168, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 0.56629, "grad_norm": 0.4132850170135498, "learning_rate": 9.015024446367315e-06, "loss": 1.9937, "step": 409 }, { "batch_num_effect_tokens": 2835, "batch_num_samples": 42, "batch_num_tokens": 16384, "epoch": 0.56767, "grad_norm": 0.4124335050582886, "learning_rate": 9.007806003860424e-06, "loss": 1.9141, "step": 410 }, { "batch_num_effect_tokens": 2453, "batch_num_samples": 33, "batch_num_tokens": 16311, "epoch": 0.56906, "grad_norm": 0.38374003767967224, "learning_rate": 9.000564119717256e-06, "loss": 1.9912, "step": 411 }, { "batch_num_effect_tokens": 2256, "batch_num_samples": 29, "batch_num_tokens": 16317, "epoch": 0.57044, "grad_norm": 0.42961955070495605, "learning_rate": 8.993298836295556e-06, "loss": 1.8506, "step": 412 }, { "batch_num_effect_tokens": 1997, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.57182, "grad_norm": 0.40292981266975403, "learning_rate": 8.986010196089925e-06, "loss": 1.8213, "step": 413 }, { "batch_num_effect_tokens": 2582, "batch_num_samples": 37, "batch_num_tokens": 16340, "epoch": 0.57321, "grad_norm": 0.41805610060691833, "learning_rate": 8.978698241731586e-06, "loss": 2.1348, "step": 414 }, { "batch_num_effect_tokens": 2422, "batch_num_samples": 31, "batch_num_tokens": 16373, "epoch": 0.57459, "grad_norm": 0.4349914789199829, "learning_rate": 8.971363015988115e-06, "loss": 1.9727, "step": 415 }, { "batch_num_effect_tokens": 2926, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.57598, "grad_norm": 0.37256374955177307, "learning_rate": 8.964004561763213e-06, "loss": 1.896, "step": 416 }, { "batch_num_effect_tokens": 2597, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.57736, "grad_norm": 0.40795406699180603, "learning_rate": 8.956622922096438e-06, "loss": 1.9551, "step": 417 }, { "batch_num_effect_tokens": 2355, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.57875, "grad_norm": 0.4033205211162567, "learning_rate": 8.949218140162965e-06, "loss": 1.8994, "step": 418 }, { "batch_num_effect_tokens": 2111, "batch_num_samples": 28, "batch_num_tokens": 16320, "epoch": 0.58013, "grad_norm": 0.4335424304008484, "learning_rate": 8.941790259273325e-06, "loss": 2.0215, "step": 419 }, { "batch_num_effect_tokens": 2579, "batch_num_samples": 32, "batch_num_tokens": 16332, "epoch": 0.58152, "grad_norm": 0.36366668343544006, "learning_rate": 8.934339322873149e-06, "loss": 2.0586, "step": 420 }, { "batch_num_effect_tokens": 2401, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.5829, "grad_norm": 0.41302305459976196, "learning_rate": 8.926865374542928e-06, "loss": 1.8975, "step": 421 }, { "batch_num_effect_tokens": 2690, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.58429, "grad_norm": 0.3420538902282715, "learning_rate": 8.919368457997747e-06, "loss": 1.8379, "step": 422 }, { "batch_num_effect_tokens": 2256, "batch_num_samples": 28, "batch_num_tokens": 16292, "epoch": 0.58567, "grad_norm": 0.3978654146194458, "learning_rate": 8.91184861708703e-06, "loss": 1.9482, "step": 423 }, { "batch_num_effect_tokens": 2328, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.58705, "grad_norm": 0.41041678190231323, "learning_rate": 8.904305895794292e-06, "loss": 1.9053, "step": 424 }, { "batch_num_effect_tokens": 2878, "batch_num_samples": 60, "batch_num_tokens": 16291, "epoch": 0.58844, "grad_norm": 0.4303063452243805, "learning_rate": 8.896740338236863e-06, "loss": 1.8276, "step": 425 }, { "batch_num_effect_tokens": 2591, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.58982, "grad_norm": 0.38216012716293335, "learning_rate": 8.889151988665654e-06, "loss": 1.8135, "step": 426 }, { "batch_num_effect_tokens": 2427, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.59121, "grad_norm": 0.3629734516143799, "learning_rate": 8.88154089146488e-06, "loss": 1.8618, "step": 427 }, { "batch_num_effect_tokens": 2273, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.59259, "grad_norm": 0.4155242145061493, "learning_rate": 8.873907091151808e-06, "loss": 1.8247, "step": 428 }, { "batch_num_effect_tokens": 4399, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 0.59398, "grad_norm": 0.32867759466171265, "learning_rate": 8.866250632376499e-06, "loss": 2.0176, "step": 429 }, { "batch_num_effect_tokens": 2254, "batch_num_samples": 33, "batch_num_tokens": 16381, "epoch": 0.59536, "grad_norm": 0.42962148785591125, "learning_rate": 8.858571559921539e-06, "loss": 2.0889, "step": 430 }, { "batch_num_effect_tokens": 2397, "batch_num_samples": 31, "batch_num_tokens": 16365, "epoch": 0.59675, "grad_norm": 0.4126598834991455, "learning_rate": 8.85086991870178e-06, "loss": 1.9644, "step": 431 }, { "batch_num_effect_tokens": 2716, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.59813, "grad_norm": 0.4046272039413452, "learning_rate": 8.843145753764083e-06, "loss": 2.0269, "step": 432 }, { "batch_num_effect_tokens": 1937, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.59952, "grad_norm": 0.4273347854614258, "learning_rate": 8.835399110287046e-06, "loss": 1.6895, "step": 433 }, { "batch_num_effect_tokens": 2025, "batch_num_samples": 33, "batch_num_tokens": 16279, "epoch": 0.6009, "grad_norm": 0.432579904794693, "learning_rate": 8.827630033580752e-06, "loss": 1.9131, "step": 434 }, { "batch_num_effect_tokens": 2432, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.60228, "grad_norm": 0.3724689781665802, "learning_rate": 8.819838569086482e-06, "loss": 1.9556, "step": 435 }, { "batch_num_effect_tokens": 2432, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.60228, "eval_eval_loss": 0.4801722466945648, "eval_eval_runtime": 105.4236, "eval_eval_samples_per_second": 45.986, "eval_eval_steps_per_second": 2.874, "step": 435 }, { "batch_num_effect_tokens": 2934, "batch_num_samples": 31, "batch_num_tokens": 16277, "epoch": 0.60367, "grad_norm": 0.383453369140625, "learning_rate": 8.812024762376477e-06, "loss": 1.9824, "step": 436 }, { "batch_num_effect_tokens": 2141, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.60505, "grad_norm": 0.4867875874042511, "learning_rate": 8.80418865915365e-06, "loss": 1.9453, "step": 437 }, { "batch_num_effect_tokens": 2558, "batch_num_samples": 29, "batch_num_tokens": 16343, "epoch": 0.60644, "grad_norm": 0.4723973870277405, "learning_rate": 8.796330305251326e-06, "loss": 2.1084, "step": 438 }, { "batch_num_effect_tokens": 2870, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.60782, "grad_norm": 0.452468603849411, "learning_rate": 8.788449746632976e-06, "loss": 1.9795, "step": 439 }, { "batch_num_effect_tokens": 2915, "batch_num_samples": 34, "batch_num_tokens": 16260, "epoch": 0.60921, "grad_norm": 0.41661083698272705, "learning_rate": 8.780547029391947e-06, "loss": 1.999, "step": 440 }, { "batch_num_effect_tokens": 2406, "batch_num_samples": 28, "batch_num_tokens": 16337, "epoch": 0.61059, "grad_norm": 0.4197576344013214, "learning_rate": 8.77262219975119e-06, "loss": 1.9023, "step": 441 }, { "batch_num_effect_tokens": 2716, "batch_num_samples": 29, "batch_num_tokens": 16298, "epoch": 0.61198, "grad_norm": 0.4145912826061249, "learning_rate": 8.764675304062992e-06, "loss": 1.8174, "step": 442 }, { "batch_num_effect_tokens": 2246, "batch_num_samples": 29, "batch_num_tokens": 16328, "epoch": 0.61336, "grad_norm": 0.43209904432296753, "learning_rate": 8.756706388808704e-06, "loss": 2.0972, "step": 443 }, { "batch_num_effect_tokens": 2943, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 0.61475, "grad_norm": 0.4133492708206177, "learning_rate": 8.748715500598472e-06, "loss": 1.9355, "step": 444 }, { "batch_num_effect_tokens": 2293, "batch_num_samples": 35, "batch_num_tokens": 16378, "epoch": 0.61613, "grad_norm": 0.39352476596832275, "learning_rate": 8.740702686170955e-06, "loss": 1.833, "step": 445 }, { "batch_num_effect_tokens": 2645, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.61751, "grad_norm": 0.4053751826286316, "learning_rate": 8.732667992393064e-06, "loss": 1.8828, "step": 446 }, { "batch_num_effect_tokens": 2623, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.6189, "grad_norm": 0.3661860227584839, "learning_rate": 8.724611466259682e-06, "loss": 1.9624, "step": 447 }, { "batch_num_effect_tokens": 2470, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.62028, "grad_norm": 0.4214949905872345, "learning_rate": 8.71653315489339e-06, "loss": 1.9277, "step": 448 }, { "batch_num_effect_tokens": 3520, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.62167, "grad_norm": 0.33281198143959045, "learning_rate": 8.708433105544183e-06, "loss": 1.8491, "step": 449 }, { "batch_num_effect_tokens": 2347, "batch_num_samples": 29, "batch_num_tokens": 16369, "epoch": 0.62305, "grad_norm": 0.39744654297828674, "learning_rate": 8.70031136558921e-06, "loss": 2.0229, "step": 450 }, { "batch_num_effect_tokens": 2030, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.62444, "grad_norm": 0.4560944139957428, "learning_rate": 8.692167982532487e-06, "loss": 1.9414, "step": 451 }, { "batch_num_effect_tokens": 3218, "batch_num_samples": 32, "batch_num_tokens": 16381, "epoch": 0.62582, "grad_norm": 0.3647509813308716, "learning_rate": 8.684003004004618e-06, "loss": 1.8643, "step": 452 }, { "batch_num_effect_tokens": 2277, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.62721, "grad_norm": 0.369740754365921, "learning_rate": 8.675816477762516e-06, "loss": 1.937, "step": 453 }, { "batch_num_effect_tokens": 2609, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.62859, "grad_norm": 0.4270611107349396, "learning_rate": 8.667608451689135e-06, "loss": 1.7744, "step": 454 }, { "batch_num_effect_tokens": 2599, "batch_num_samples": 28, "batch_num_tokens": 16367, "epoch": 0.62998, "grad_norm": 0.40497589111328125, "learning_rate": 8.659378973793173e-06, "loss": 2.1025, "step": 455 }, { "batch_num_effect_tokens": 3064, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.63136, "grad_norm": 0.411929726600647, "learning_rate": 8.651128092208805e-06, "loss": 2.0381, "step": 456 }, { "batch_num_effect_tokens": 3325, "batch_num_samples": 53, "batch_num_tokens": 16384, "epoch": 0.63274, "grad_norm": 0.3954614996910095, "learning_rate": 8.642855855195394e-06, "loss": 1.8633, "step": 457 }, { "batch_num_effect_tokens": 2675, "batch_num_samples": 31, "batch_num_tokens": 16340, "epoch": 0.63413, "grad_norm": 0.4267875552177429, "learning_rate": 8.634562311137209e-06, "loss": 1.9614, "step": 458 }, { "batch_num_effect_tokens": 2595, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.63551, "grad_norm": 0.4395170509815216, "learning_rate": 8.62624750854315e-06, "loss": 2.2441, "step": 459 }, { "batch_num_effect_tokens": 2609, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.6369, "grad_norm": 0.3935035169124603, "learning_rate": 8.617911496046446e-06, "loss": 1.918, "step": 460 }, { "batch_num_effect_tokens": 3330, "batch_num_samples": 49, "batch_num_tokens": 16351, "epoch": 0.63828, "grad_norm": 0.38984444737434387, "learning_rate": 8.609554322404396e-06, "loss": 1.915, "step": 461 }, { "batch_num_effect_tokens": 1992, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.63967, "grad_norm": 0.3799002170562744, "learning_rate": 8.601176036498066e-06, "loss": 1.8672, "step": 462 }, { "batch_num_effect_tokens": 2849, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.64105, "grad_norm": 0.36138293147087097, "learning_rate": 8.592776687332003e-06, "loss": 2.2695, "step": 463 }, { "batch_num_effect_tokens": 2688, "batch_num_samples": 41, "batch_num_tokens": 16337, "epoch": 0.64244, "grad_norm": 0.4214763939380646, "learning_rate": 8.584356324033955e-06, "loss": 2.0166, "step": 464 }, { "batch_num_effect_tokens": 2385, "batch_num_samples": 29, "batch_num_tokens": 16343, "epoch": 0.64382, "grad_norm": 0.43467044830322266, "learning_rate": 8.575914995854588e-06, "loss": 1.9912, "step": 465 }, { "batch_num_effect_tokens": 2135, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.64521, "grad_norm": 0.4968329071998596, "learning_rate": 8.567452752167183e-06, "loss": 1.7998, "step": 466 }, { "batch_num_effect_tokens": 3061, "batch_num_samples": 46, "batch_num_tokens": 16362, "epoch": 0.64659, "grad_norm": 0.3608763813972473, "learning_rate": 8.558969642467356e-06, "loss": 2.0073, "step": 467 }, { "batch_num_effect_tokens": 2096, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.64798, "grad_norm": 0.4326012134552002, "learning_rate": 8.550465716372777e-06, "loss": 1.9199, "step": 468 }, { "batch_num_effect_tokens": 2182, "batch_num_samples": 30, "batch_num_tokens": 16381, "epoch": 0.64936, "grad_norm": 0.4162071645259857, "learning_rate": 8.54194102362286e-06, "loss": 1.9019, "step": 469 }, { "batch_num_effect_tokens": 2478, "batch_num_samples": 29, "batch_num_tokens": 16346, "epoch": 0.65074, "grad_norm": 0.36361655592918396, "learning_rate": 8.533395614078492e-06, "loss": 1.7725, "step": 470 }, { "batch_num_effect_tokens": 2298, "batch_num_samples": 30, "batch_num_tokens": 16340, "epoch": 0.65213, "grad_norm": 0.42054933309555054, "learning_rate": 8.524829537721725e-06, "loss": 1.8618, "step": 471 }, { "batch_num_effect_tokens": 2222, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.65351, "grad_norm": 0.4147699475288391, "learning_rate": 8.516242844655498e-06, "loss": 1.877, "step": 472 }, { "batch_num_effect_tokens": 2051, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.6549, "grad_norm": 0.41126230359077454, "learning_rate": 8.507635585103333e-06, "loss": 2.0137, "step": 473 }, { "batch_num_effect_tokens": 1884, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.65628, "grad_norm": 0.4331945776939392, "learning_rate": 8.499007809409043e-06, "loss": 1.9365, "step": 474 }, { "batch_num_effect_tokens": 2939, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.65767, "grad_norm": 0.3952711224555969, "learning_rate": 8.490359568036446e-06, "loss": 1.9526, "step": 475 }, { "batch_num_effect_tokens": 3071, "batch_num_samples": 56, "batch_num_tokens": 16287, "epoch": 0.65905, "grad_norm": 0.372079998254776, "learning_rate": 8.48169091156906e-06, "loss": 2.0278, "step": 476 }, { "batch_num_effect_tokens": 1985, "batch_num_samples": 29, "batch_num_tokens": 16318, "epoch": 0.66044, "grad_norm": 0.3665173351764679, "learning_rate": 8.47300189070981e-06, "loss": 1.665, "step": 477 }, { "batch_num_effect_tokens": 2790, "batch_num_samples": 33, "batch_num_tokens": 16288, "epoch": 0.66182, "grad_norm": 0.3846535384654999, "learning_rate": 8.464292556280734e-06, "loss": 1.9092, "step": 478 }, { "batch_num_effect_tokens": 2104, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.66321, "grad_norm": 0.39883777499198914, "learning_rate": 8.455562959222682e-06, "loss": 1.9033, "step": 479 }, { "batch_num_effect_tokens": 2345, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.66459, "grad_norm": 0.40159615874290466, "learning_rate": 8.446813150595022e-06, "loss": 1.7139, "step": 480 }, { "batch_num_effect_tokens": 2226, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.66597, "grad_norm": 0.39109814167022705, "learning_rate": 8.43804318157534e-06, "loss": 2.0015, "step": 481 }, { "batch_num_effect_tokens": 2580, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 0.66736, "grad_norm": 0.4000070095062256, "learning_rate": 8.429253103459139e-06, "loss": 1.8726, "step": 482 }, { "batch_num_effect_tokens": 1582, "batch_num_samples": 28, "batch_num_tokens": 16307, "epoch": 0.66874, "grad_norm": 0.40034863352775574, "learning_rate": 8.42044296765954e-06, "loss": 1.6543, "step": 483 }, { "batch_num_effect_tokens": 2431, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.67013, "grad_norm": 0.4263266623020172, "learning_rate": 8.411612825706976e-06, "loss": 2.0234, "step": 484 }, { "batch_num_effect_tokens": 2298, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.67151, "grad_norm": 0.3968966603279114, "learning_rate": 8.402762729248907e-06, "loss": 1.8076, "step": 485 }, { "batch_num_effect_tokens": 2275, "batch_num_samples": 34, "batch_num_tokens": 16273, "epoch": 0.6729, "grad_norm": 0.3583934009075165, "learning_rate": 8.393892730049497e-06, "loss": 1.9287, "step": 486 }, { "batch_num_effect_tokens": 2399, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.67428, "grad_norm": 0.42251577973365784, "learning_rate": 8.385002879989328e-06, "loss": 1.8013, "step": 487 }, { "batch_num_effect_tokens": 3115, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 0.67567, "grad_norm": 0.3892873525619507, "learning_rate": 8.376093231065084e-06, "loss": 1.9136, "step": 488 }, { "batch_num_effect_tokens": 2892, "batch_num_samples": 43, "batch_num_tokens": 16383, "epoch": 0.67705, "grad_norm": 0.3951052725315094, "learning_rate": 8.367163835389253e-06, "loss": 1.9038, "step": 489 }, { "batch_num_effect_tokens": 3036, "batch_num_samples": 29, "batch_num_tokens": 16292, "epoch": 0.67844, "grad_norm": 0.4177972376346588, "learning_rate": 8.35821474518983e-06, "loss": 1.9214, "step": 490 }, { "batch_num_effect_tokens": 2694, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.67982, "grad_norm": 0.3772607445716858, "learning_rate": 8.349246012809991e-06, "loss": 1.8428, "step": 491 }, { "batch_num_effect_tokens": 2181, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.6812, "grad_norm": 0.38538944721221924, "learning_rate": 8.340257690707805e-06, "loss": 1.8052, "step": 492 }, { "batch_num_effect_tokens": 2293, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.68259, "grad_norm": 0.4401363134384155, "learning_rate": 8.331249831455921e-06, "loss": 2.0854, "step": 493 }, { "batch_num_effect_tokens": 2284, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.68397, "grad_norm": 0.45300909876823425, "learning_rate": 8.322222487741261e-06, "loss": 1.8545, "step": 494 }, { "batch_num_effect_tokens": 2399, "batch_num_samples": 31, "batch_num_tokens": 16267, "epoch": 0.68536, "grad_norm": 0.4435186982154846, "learning_rate": 8.313175712364712e-06, "loss": 2.1064, "step": 495 }, { "batch_num_effect_tokens": 2881, "batch_num_samples": 50, "batch_num_tokens": 16340, "epoch": 0.68674, "grad_norm": 0.35863468050956726, "learning_rate": 8.304109558240817e-06, "loss": 1.7803, "step": 496 }, { "batch_num_effect_tokens": 2422, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.68813, "grad_norm": 0.4114570617675781, "learning_rate": 8.29502407839746e-06, "loss": 2.1426, "step": 497 }, { "batch_num_effect_tokens": 3198, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.68951, "grad_norm": 0.38893741369247437, "learning_rate": 8.285919325975566e-06, "loss": 1.7578, "step": 498 }, { "batch_num_effect_tokens": 2241, "batch_num_samples": 42, "batch_num_tokens": 16384, "epoch": 0.6909, "grad_norm": 0.43610015511512756, "learning_rate": 8.276795354228785e-06, "loss": 1.8784, "step": 499 }, { "batch_num_effect_tokens": 2684, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.69228, "grad_norm": 0.39508140087127686, "learning_rate": 8.26765221652318e-06, "loss": 2.0229, "step": 500 }, { "batch_num_effect_tokens": 2396, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.69367, "grad_norm": 0.37650975584983826, "learning_rate": 8.258489966336915e-06, "loss": 1.7285, "step": 501 }, { "batch_num_effect_tokens": 2806, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.69505, "grad_norm": 0.394896537065506, "learning_rate": 8.249308657259943e-06, "loss": 2.0435, "step": 502 }, { "batch_num_effect_tokens": 2287, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 0.69643, "grad_norm": 0.39366012811660767, "learning_rate": 8.240108342993694e-06, "loss": 1.8149, "step": 503 }, { "batch_num_effect_tokens": 2537, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.69782, "grad_norm": 0.37748780846595764, "learning_rate": 8.230889077350755e-06, "loss": 2.0063, "step": 504 }, { "batch_num_effect_tokens": 2253, "batch_num_samples": 29, "batch_num_tokens": 16326, "epoch": 0.6992, "grad_norm": 0.4337398409843445, "learning_rate": 8.221650914254566e-06, "loss": 1.9824, "step": 505 }, { "batch_num_effect_tokens": 2953, "batch_num_samples": 45, "batch_num_tokens": 16316, "epoch": 0.70059, "grad_norm": 0.35707002878189087, "learning_rate": 8.21239390773909e-06, "loss": 1.9775, "step": 506 }, { "batch_num_effect_tokens": 3041, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 0.70197, "grad_norm": 0.3837774097919464, "learning_rate": 8.203118111948516e-06, "loss": 1.7051, "step": 507 }, { "batch_num_effect_tokens": 2255, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.70336, "grad_norm": 0.35609573125839233, "learning_rate": 8.193823581136919e-06, "loss": 1.792, "step": 508 }, { "batch_num_effect_tokens": 2345, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 0.70474, "grad_norm": 0.4175390303134918, "learning_rate": 8.184510369667962e-06, "loss": 1.8662, "step": 509 }, { "batch_num_effect_tokens": 2945, "batch_num_samples": 51, "batch_num_tokens": 16319, "epoch": 0.70613, "grad_norm": 0.37573444843292236, "learning_rate": 8.175178532014571e-06, "loss": 2.1194, "step": 510 }, { "batch_num_effect_tokens": 3101, "batch_num_samples": 47, "batch_num_tokens": 16343, "epoch": 0.70751, "grad_norm": 0.38517677783966064, "learning_rate": 8.165828122758615e-06, "loss": 1.7793, "step": 511 }, { "batch_num_effect_tokens": 2294, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.7089, "grad_norm": 0.42462700605392456, "learning_rate": 8.15645919659059e-06, "loss": 2.04, "step": 512 }, { "batch_num_effect_tokens": 3580, "batch_num_samples": 43, "batch_num_tokens": 16384, "epoch": 0.71028, "grad_norm": 0.40597039461135864, "learning_rate": 8.147071808309295e-06, "loss": 2.0605, "step": 513 }, { "batch_num_effect_tokens": 2899, "batch_num_samples": 38, "batch_num_tokens": 16312, "epoch": 0.71166, "grad_norm": 0.42046788334846497, "learning_rate": 8.137666012821514e-06, "loss": 1.8418, "step": 514 }, { "batch_num_effect_tokens": 2184, "batch_num_samples": 29, "batch_num_tokens": 16335, "epoch": 0.71305, "grad_norm": 0.4670950174331665, "learning_rate": 8.128241865141697e-06, "loss": 1.8672, "step": 515 }, { "batch_num_effect_tokens": 2576, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 0.71443, "grad_norm": 0.3992348611354828, "learning_rate": 8.118799420391632e-06, "loss": 1.9783, "step": 516 }, { "batch_num_effect_tokens": 2739, "batch_num_samples": 57, "batch_num_tokens": 16221, "epoch": 0.71582, "grad_norm": 0.360512912273407, "learning_rate": 8.109338733800132e-06, "loss": 1.9263, "step": 517 }, { "batch_num_effect_tokens": 2283, "batch_num_samples": 30, "batch_num_tokens": 16376, "epoch": 0.7172, "grad_norm": 0.39321085810661316, "learning_rate": 8.099859860702698e-06, "loss": 1.8721, "step": 518 }, { "batch_num_effect_tokens": 3061, "batch_num_samples": 30, "batch_num_tokens": 16316, "epoch": 0.71859, "grad_norm": 0.36936068534851074, "learning_rate": 8.090362856541218e-06, "loss": 1.979, "step": 519 }, { "batch_num_effect_tokens": 2143, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.71997, "grad_norm": 0.38815975189208984, "learning_rate": 8.080847776863609e-06, "loss": 1.8296, "step": 520 }, { "batch_num_effect_tokens": 2019, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.72136, "grad_norm": 0.41067779064178467, "learning_rate": 8.07131467732353e-06, "loss": 1.8662, "step": 521 }, { "batch_num_effect_tokens": 2633, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.72274, "grad_norm": 0.37291839718818665, "learning_rate": 8.061763613680024e-06, "loss": 2.0586, "step": 522 }, { "batch_num_effect_tokens": 2543, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.72413, "grad_norm": 0.39956262707710266, "learning_rate": 8.052194641797217e-06, "loss": 1.9141, "step": 523 }, { "batch_num_effect_tokens": 1842, "batch_num_samples": 29, "batch_num_tokens": 16312, "epoch": 0.72551, "grad_norm": 0.4637643098831177, "learning_rate": 8.042607817643974e-06, "loss": 1.7207, "step": 524 }, { "batch_num_effect_tokens": 2464, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.7269, "grad_norm": 0.41559848189353943, "learning_rate": 8.033003197293578e-06, "loss": 1.9136, "step": 525 }, { "batch_num_effect_tokens": 2498, "batch_num_samples": 32, "batch_num_tokens": 16332, "epoch": 0.72828, "grad_norm": 0.382429838180542, "learning_rate": 8.023380836923404e-06, "loss": 1.9883, "step": 526 }, { "batch_num_effect_tokens": 2311, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.72966, "grad_norm": 0.39352378249168396, "learning_rate": 8.013740792814589e-06, "loss": 3.8184, "step": 527 }, { "batch_num_effect_tokens": 2263, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.73105, "grad_norm": 0.43792155385017395, "learning_rate": 8.004083121351695e-06, "loss": 1.6157, "step": 528 }, { "batch_num_effect_tokens": 2610, "batch_num_samples": 29, "batch_num_tokens": 16301, "epoch": 0.73243, "grad_norm": 0.4139310121536255, "learning_rate": 7.994407879022397e-06, "loss": 2.0674, "step": 529 }, { "batch_num_effect_tokens": 2470, "batch_num_samples": 29, "batch_num_tokens": 16328, "epoch": 0.73382, "grad_norm": 0.4021367132663727, "learning_rate": 7.984715122417133e-06, "loss": 1.6807, "step": 530 }, { "batch_num_effect_tokens": 2518, "batch_num_samples": 35, "batch_num_tokens": 16330, "epoch": 0.7352, "grad_norm": 0.3984544575214386, "learning_rate": 7.975004908228787e-06, "loss": 1.999, "step": 531 }, { "batch_num_effect_tokens": 2101, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.73659, "grad_norm": 0.38105592131614685, "learning_rate": 7.965277293252354e-06, "loss": 1.7461, "step": 532 }, { "batch_num_effect_tokens": 2413, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.73797, "grad_norm": 0.41828256845474243, "learning_rate": 7.955532334384597e-06, "loss": 2.0166, "step": 533 }, { "batch_num_effect_tokens": 2653, "batch_num_samples": 53, "batch_num_tokens": 16344, "epoch": 0.73936, "grad_norm": 0.3413488268852234, "learning_rate": 7.945770088623735e-06, "loss": 2.0051, "step": 534 }, { "batch_num_effect_tokens": 2519, "batch_num_samples": 33, "batch_num_tokens": 16383, "epoch": 0.74074, "grad_norm": 0.39444324374198914, "learning_rate": 7.935990613069087e-06, "loss": 1.876, "step": 535 }, { "batch_num_effect_tokens": 2021, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.74213, "grad_norm": 0.47030386328697205, "learning_rate": 7.926193964920756e-06, "loss": 1.9878, "step": 536 }, { "batch_num_effect_tokens": 2839, "batch_num_samples": 33, "batch_num_tokens": 16294, "epoch": 0.74351, "grad_norm": 0.45225897431373596, "learning_rate": 7.916380201479287e-06, "loss": 2.2559, "step": 537 }, { "batch_num_effect_tokens": 2433, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.74489, "grad_norm": 0.4085559844970703, "learning_rate": 7.90654938014533e-06, "loss": 1.7427, "step": 538 }, { "batch_num_effect_tokens": 2250, "batch_num_samples": 28, "batch_num_tokens": 16288, "epoch": 0.74628, "grad_norm": 0.4955153167247772, "learning_rate": 7.896701558419306e-06, "loss": 2.0176, "step": 539 }, { "batch_num_effect_tokens": 2443, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.74766, "grad_norm": 0.3939432203769684, "learning_rate": 7.886836793901077e-06, "loss": 1.8027, "step": 540 }, { "batch_num_effect_tokens": 2130, "batch_num_samples": 28, "batch_num_tokens": 16379, "epoch": 0.74905, "grad_norm": 0.3650651276111603, "learning_rate": 7.876955144289594e-06, "loss": 1.9258, "step": 541 }, { "batch_num_effect_tokens": 2864, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.75043, "grad_norm": 0.3895401656627655, "learning_rate": 7.867056667382576e-06, "loss": 1.7056, "step": 542 }, { "batch_num_effect_tokens": 2942, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.75182, "grad_norm": 0.38527023792266846, "learning_rate": 7.85714142107616e-06, "loss": 1.9551, "step": 543 }, { "batch_num_effect_tokens": 3256, "batch_num_samples": 42, "batch_num_tokens": 16284, "epoch": 0.7532, "grad_norm": 0.34270429611206055, "learning_rate": 7.847209463364574e-06, "loss": 1.8291, "step": 544 }, { "batch_num_effect_tokens": 2572, "batch_num_samples": 30, "batch_num_tokens": 16287, "epoch": 0.75459, "grad_norm": 0.37214452028274536, "learning_rate": 7.837260852339782e-06, "loss": 1.918, "step": 545 }, { "batch_num_effect_tokens": 3230, "batch_num_samples": 61, "batch_num_tokens": 16253, "epoch": 0.75597, "grad_norm": 0.35212212800979614, "learning_rate": 7.827295646191161e-06, "loss": 1.7563, "step": 546 }, { "batch_num_effect_tokens": 2535, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.75736, "grad_norm": 0.42506250739097595, "learning_rate": 7.817313903205148e-06, "loss": 2.1743, "step": 547 }, { "batch_num_effect_tokens": 2435, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 0.75874, "grad_norm": 0.3594960570335388, "learning_rate": 7.807315681764907e-06, "loss": 1.6167, "step": 548 }, { "batch_num_effect_tokens": 2371, "batch_num_samples": 29, "batch_num_tokens": 16274, "epoch": 0.76012, "grad_norm": 0.4198870360851288, "learning_rate": 7.797301040349978e-06, "loss": 2.2578, "step": 549 }, { "batch_num_effect_tokens": 2460, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.76151, "grad_norm": 0.3992410898208618, "learning_rate": 7.78727003753595e-06, "loss": 2.043, "step": 550 }, { "batch_num_effect_tokens": 2208, "batch_num_samples": 30, "batch_num_tokens": 16328, "epoch": 0.76289, "grad_norm": 0.4332595467567444, "learning_rate": 7.777222731994107e-06, "loss": 1.8271, "step": 551 }, { "batch_num_effect_tokens": 2233, "batch_num_samples": 33, "batch_num_tokens": 16334, "epoch": 0.76428, "grad_norm": 0.41461411118507385, "learning_rate": 7.767159182491084e-06, "loss": 1.9277, "step": 552 }, { "batch_num_effect_tokens": 2263, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 0.76566, "grad_norm": 0.40060073137283325, "learning_rate": 7.757079447888529e-06, "loss": 1.8848, "step": 553 }, { "batch_num_effect_tokens": 2195, "batch_num_samples": 28, "batch_num_tokens": 16272, "epoch": 0.76705, "grad_norm": 0.40586423873901367, "learning_rate": 7.746983587142757e-06, "loss": 2.0293, "step": 554 }, { "batch_num_effect_tokens": 2600, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.76843, "grad_norm": 0.36526283621788025, "learning_rate": 7.736871659304404e-06, "loss": 1.8535, "step": 555 }, { "batch_num_effect_tokens": 2134, "batch_num_samples": 28, "batch_num_tokens": 16307, "epoch": 0.76982, "grad_norm": 0.4139673113822937, "learning_rate": 7.726743723518087e-06, "loss": 2.0688, "step": 556 }, { "batch_num_effect_tokens": 2404, "batch_num_samples": 30, "batch_num_tokens": 16287, "epoch": 0.7712, "grad_norm": 0.4116215407848358, "learning_rate": 7.716599839022044e-06, "loss": 1.9023, "step": 557 }, { "batch_num_effect_tokens": 2400, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.77259, "grad_norm": 0.4026041626930237, "learning_rate": 7.706440065147805e-06, "loss": 1.897, "step": 558 }, { "batch_num_effect_tokens": 2414, "batch_num_samples": 29, "batch_num_tokens": 16291, "epoch": 0.77397, "grad_norm": 0.3891029357910156, "learning_rate": 7.696264461319831e-06, "loss": 1.8799, "step": 559 }, { "batch_num_effect_tokens": 2482, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.77535, "grad_norm": 0.4471919536590576, "learning_rate": 7.686073087055179e-06, "loss": 1.9893, "step": 560 }, { "batch_num_effect_tokens": 2116, "batch_num_samples": 28, "batch_num_tokens": 16313, "epoch": 0.77674, "grad_norm": 0.4177844524383545, "learning_rate": 7.675866001963144e-06, "loss": 1.9644, "step": 561 }, { "batch_num_effect_tokens": 2621, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.77812, "grad_norm": 0.41075554490089417, "learning_rate": 7.66564326574491e-06, "loss": 1.9468, "step": 562 }, { "batch_num_effect_tokens": 2022, "batch_num_samples": 28, "batch_num_tokens": 16382, "epoch": 0.77951, "grad_norm": 0.4458828568458557, "learning_rate": 7.655404938193207e-06, "loss": 1.7832, "step": 563 }, { "batch_num_effect_tokens": 2309, "batch_num_samples": 30, "batch_num_tokens": 16328, "epoch": 0.78089, "grad_norm": 0.4780299663543701, "learning_rate": 7.645151079191962e-06, "loss": 2.124, "step": 564 }, { "batch_num_effect_tokens": 2362, "batch_num_samples": 29, "batch_num_tokens": 16279, "epoch": 0.78228, "grad_norm": 0.41119229793548584, "learning_rate": 7.634881748715941e-06, "loss": 2.0142, "step": 565 }, { "batch_num_effect_tokens": 2367, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.78366, "grad_norm": 0.4126008152961731, "learning_rate": 7.624597006830405e-06, "loss": 1.8438, "step": 566 }, { "batch_num_effect_tokens": 2996, "batch_num_samples": 49, "batch_num_tokens": 16384, "epoch": 0.78505, "grad_norm": 0.3986588418483734, "learning_rate": 7.614296913690756e-06, "loss": 1.9121, "step": 567 }, { "batch_num_effect_tokens": 2753, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 0.78643, "grad_norm": 0.3910169303417206, "learning_rate": 7.60398152954218e-06, "loss": 2.0332, "step": 568 }, { "batch_num_effect_tokens": 2423, "batch_num_samples": 29, "batch_num_tokens": 16367, "epoch": 0.78782, "grad_norm": 0.38262543082237244, "learning_rate": 7.593650914719311e-06, "loss": 1.8989, "step": 569 }, { "batch_num_effect_tokens": 2751, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.7892, "grad_norm": 0.3746867775917053, "learning_rate": 7.583305129645857e-06, "loss": 1.9214, "step": 570 }, { "batch_num_effect_tokens": 2774, "batch_num_samples": 28, "batch_num_tokens": 16371, "epoch": 0.79058, "grad_norm": 0.3766612410545349, "learning_rate": 7.572944234834261e-06, "loss": 1.8643, "step": 571 }, { "batch_num_effect_tokens": 2269, "batch_num_samples": 29, "batch_num_tokens": 16340, "epoch": 0.79197, "grad_norm": 0.4250185489654541, "learning_rate": 7.562568290885344e-06, "loss": 1.6016, "step": 572 }, { "batch_num_effect_tokens": 2354, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.79335, "grad_norm": 0.42661988735198975, "learning_rate": 7.552177358487944e-06, "loss": 1.8389, "step": 573 }, { "batch_num_effect_tokens": 2399, "batch_num_samples": 28, "batch_num_tokens": 16380, "epoch": 0.79474, "grad_norm": 0.4092441499233246, "learning_rate": 7.541771498418575e-06, "loss": 2.1836, "step": 574 }, { "batch_num_effect_tokens": 1976, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 0.79612, "grad_norm": 0.4167001247406006, "learning_rate": 7.531350771541055e-06, "loss": 1.7969, "step": 575 }, { "batch_num_effect_tokens": 2330, "batch_num_samples": 28, "batch_num_tokens": 16303, "epoch": 0.79751, "grad_norm": 0.4098563492298126, "learning_rate": 7.520915238806161e-06, "loss": 1.957, "step": 576 }, { "batch_num_effect_tokens": 2892, "batch_num_samples": 33, "batch_num_tokens": 16279, "epoch": 0.79889, "grad_norm": 0.40418657660484314, "learning_rate": 7.510464961251271e-06, "loss": 1.9922, "step": 577 }, { "batch_num_effect_tokens": 2222, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.80028, "grad_norm": 0.3807937800884247, "learning_rate": 7.500000000000001e-06, "loss": 1.8945, "step": 578 }, { "batch_num_effect_tokens": 3161, "batch_num_samples": 40, "batch_num_tokens": 16382, "epoch": 0.80166, "grad_norm": 0.35870012640953064, "learning_rate": 7.489520416261855e-06, "loss": 1.7446, "step": 579 }, { "batch_num_effect_tokens": 2351, "batch_num_samples": 43, "batch_num_tokens": 16384, "epoch": 0.80305, "grad_norm": 0.4318827688694, "learning_rate": 7.479026271331864e-06, "loss": 1.8193, "step": 580 }, { "batch_num_effect_tokens": 2351, "batch_num_samples": 43, "batch_num_tokens": 16384, "epoch": 0.80305, "eval_eval_loss": 0.4670772850513458, "eval_eval_runtime": 105.3186, "eval_eval_samples_per_second": 46.032, "eval_eval_steps_per_second": 2.877, "step": 580 }, { "batch_num_effect_tokens": 2300, "batch_num_samples": 29, "batch_num_tokens": 16380, "epoch": 0.80443, "grad_norm": 0.4100073277950287, "learning_rate": 7.468517626590229e-06, "loss": 1.8496, "step": 581 }, { "batch_num_effect_tokens": 2749, "batch_num_samples": 39, "batch_num_tokens": 16383, "epoch": 0.80582, "grad_norm": 0.41085439920425415, "learning_rate": 7.457994543501951e-06, "loss": 1.9434, "step": 582 }, { "batch_num_effect_tokens": 2590, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.8072, "grad_norm": 0.40914440155029297, "learning_rate": 7.447457083616494e-06, "loss": 1.8613, "step": 583 }, { "batch_num_effect_tokens": 2201, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.80858, "grad_norm": 0.44034504890441895, "learning_rate": 7.436905308567404e-06, "loss": 1.978, "step": 584 }, { "batch_num_effect_tokens": 2633, "batch_num_samples": 59, "batch_num_tokens": 16227, "epoch": 0.80997, "grad_norm": 0.33627304434776306, "learning_rate": 7.426339280071957e-06, "loss": 1.6929, "step": 585 }, { "batch_num_effect_tokens": 2243, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.81135, "grad_norm": 0.3881838917732239, "learning_rate": 7.415759059930799e-06, "loss": 2.0972, "step": 586 }, { "batch_num_effect_tokens": 3306, "batch_num_samples": 30, "batch_num_tokens": 16376, "epoch": 0.81274, "grad_norm": 0.3939763903617859, "learning_rate": 7.40516471002758e-06, "loss": 1.9053, "step": 587 }, { "batch_num_effect_tokens": 2451, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.81412, "grad_norm": 0.4667435884475708, "learning_rate": 7.394556292328601e-06, "loss": 1.9805, "step": 588 }, { "batch_num_effect_tokens": 2115, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.81551, "grad_norm": 0.42708465456962585, "learning_rate": 7.383933868882438e-06, "loss": 1.7896, "step": 589 }, { "batch_num_effect_tokens": 2613, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.81689, "grad_norm": 0.3875483572483063, "learning_rate": 7.373297501819591e-06, "loss": 1.8154, "step": 590 }, { "batch_num_effect_tokens": 2029, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.81828, "grad_norm": 0.38697579503059387, "learning_rate": 7.362647253352116e-06, "loss": 1.7021, "step": 591 }, { "batch_num_effect_tokens": 2502, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.81966, "grad_norm": 0.36124011874198914, "learning_rate": 7.351983185773259e-06, "loss": 1.6836, "step": 592 }, { "batch_num_effect_tokens": 2612, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.82105, "grad_norm": 0.4169224798679352, "learning_rate": 7.341305361457096e-06, "loss": 2.167, "step": 593 }, { "batch_num_effect_tokens": 2570, "batch_num_samples": 29, "batch_num_tokens": 16366, "epoch": 0.82243, "grad_norm": 0.432545006275177, "learning_rate": 7.330613842858165e-06, "loss": 2.0586, "step": 594 }, { "batch_num_effect_tokens": 2256, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.82381, "grad_norm": 0.3733733892440796, "learning_rate": 7.319908692511103e-06, "loss": 1.748, "step": 595 }, { "batch_num_effect_tokens": 2161, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 0.8252, "grad_norm": 0.4341174066066742, "learning_rate": 7.3091899730302765e-06, "loss": 1.9766, "step": 596 }, { "batch_num_effect_tokens": 2680, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 0.82658, "grad_norm": 0.41061729192733765, "learning_rate": 7.298457747109421e-06, "loss": 1.877, "step": 597 }, { "batch_num_effect_tokens": 2550, "batch_num_samples": 32, "batch_num_tokens": 16310, "epoch": 0.82797, "grad_norm": 0.3989335298538208, "learning_rate": 7.2877120775212685e-06, "loss": 1.9165, "step": 598 }, { "batch_num_effect_tokens": 2508, "batch_num_samples": 37, "batch_num_tokens": 16306, "epoch": 0.82935, "grad_norm": 0.41209736466407776, "learning_rate": 7.276953027117186e-06, "loss": 1.8838, "step": 599 }, { "batch_num_effect_tokens": 3056, "batch_num_samples": 30, "batch_num_tokens": 16289, "epoch": 0.83074, "grad_norm": 0.32505002617836, "learning_rate": 7.2661806588268015e-06, "loss": 1.8369, "step": 600 }, { "batch_num_effect_tokens": 2657, "batch_num_samples": 42, "batch_num_tokens": 16384, "epoch": 0.83212, "grad_norm": 0.4228053689002991, "learning_rate": 7.255395035657639e-06, "loss": 1.8594, "step": 601 }, { "batch_num_effect_tokens": 2164, "batch_num_samples": 30, "batch_num_tokens": 16286, "epoch": 0.83351, "grad_norm": 0.48827677965164185, "learning_rate": 7.244596220694754e-06, "loss": 1.8926, "step": 602 }, { "batch_num_effect_tokens": 2249, "batch_num_samples": 28, "batch_num_tokens": 16292, "epoch": 0.83489, "grad_norm": 0.40386247634887695, "learning_rate": 7.233784277100359e-06, "loss": 1.6367, "step": 603 }, { "batch_num_effect_tokens": 2515, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.83628, "grad_norm": 0.3992808759212494, "learning_rate": 7.222959268113452e-06, "loss": 1.8022, "step": 604 }, { "batch_num_effect_tokens": 2423, "batch_num_samples": 30, "batch_num_tokens": 16383, "epoch": 0.83766, "grad_norm": 0.4052756428718567, "learning_rate": 7.212121257049457e-06, "loss": 1.875, "step": 605 }, { "batch_num_effect_tokens": 2397, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.83904, "grad_norm": 0.40450066328048706, "learning_rate": 7.201270307299844e-06, "loss": 1.8916, "step": 606 }, { "batch_num_effect_tokens": 2075, "batch_num_samples": 31, "batch_num_tokens": 16361, "epoch": 0.84043, "grad_norm": 0.4483255445957184, "learning_rate": 7.190406482331757e-06, "loss": 1.8496, "step": 607 }, { "batch_num_effect_tokens": 2110, "batch_num_samples": 29, "batch_num_tokens": 16364, "epoch": 0.84181, "grad_norm": 0.40810394287109375, "learning_rate": 7.179529845687656e-06, "loss": 1.7666, "step": 608 }, { "batch_num_effect_tokens": 2740, "batch_num_samples": 34, "batch_num_tokens": 16332, "epoch": 0.8432, "grad_norm": 0.3372354209423065, "learning_rate": 7.168640460984929e-06, "loss": 1.6924, "step": 609 }, { "batch_num_effect_tokens": 2702, "batch_num_samples": 28, "batch_num_tokens": 16336, "epoch": 0.84458, "grad_norm": 0.40133601427078247, "learning_rate": 7.157738391915531e-06, "loss": 1.8677, "step": 610 }, { "batch_num_effect_tokens": 2697, "batch_num_samples": 41, "batch_num_tokens": 16346, "epoch": 0.84597, "grad_norm": 0.4036213159561157, "learning_rate": 7.146823702245606e-06, "loss": 1.9619, "step": 611 }, { "batch_num_effect_tokens": 2605, "batch_num_samples": 31, "batch_num_tokens": 16309, "epoch": 0.84735, "grad_norm": 0.3756750226020813, "learning_rate": 7.135896455815117e-06, "loss": 1.6523, "step": 612 }, { "batch_num_effect_tokens": 2462, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.84874, "grad_norm": 0.33733808994293213, "learning_rate": 7.124956716537471e-06, "loss": 1.7424, "step": 613 }, { "batch_num_effect_tokens": 2254, "batch_num_samples": 29, "batch_num_tokens": 16354, "epoch": 0.85012, "grad_norm": 0.4028259217739105, "learning_rate": 7.114004548399146e-06, "loss": 1.8799, "step": 614 }, { "batch_num_effect_tokens": 2288, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.85151, "grad_norm": 0.40169867873191833, "learning_rate": 7.103040015459315e-06, "loss": 1.876, "step": 615 }, { "batch_num_effect_tokens": 2575, "batch_num_samples": 29, "batch_num_tokens": 16369, "epoch": 0.85289, "grad_norm": 0.43061500787734985, "learning_rate": 7.0920631818494745e-06, "loss": 1.9902, "step": 616 }, { "batch_num_effect_tokens": 2544, "batch_num_samples": 39, "batch_num_tokens": 16352, "epoch": 0.85427, "grad_norm": 0.36419811844825745, "learning_rate": 7.081074111773066e-06, "loss": 1.8799, "step": 617 }, { "batch_num_effect_tokens": 2257, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.85566, "grad_norm": 0.38703206181526184, "learning_rate": 7.070072869505103e-06, "loss": 1.8291, "step": 618 }, { "batch_num_effect_tokens": 2348, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.85704, "grad_norm": 0.3858138620853424, "learning_rate": 7.059059519391794e-06, "loss": 1.9624, "step": 619 }, { "batch_num_effect_tokens": 3529, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.85843, "grad_norm": 0.3456350862979889, "learning_rate": 7.048034125850165e-06, "loss": 1.8813, "step": 620 }, { "batch_num_effect_tokens": 2425, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.85981, "grad_norm": 0.4138028621673584, "learning_rate": 7.036996753367686e-06, "loss": 1.8564, "step": 621 }, { "batch_num_effect_tokens": 2527, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 0.8612, "grad_norm": 0.4119785726070404, "learning_rate": 7.0259474665018915e-06, "loss": 2.0674, "step": 622 }, { "batch_num_effect_tokens": 2401, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.86258, "grad_norm": 0.3811158537864685, "learning_rate": 7.0148863298800005e-06, "loss": 1.8879, "step": 623 }, { "batch_num_effect_tokens": 2365, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.86397, "grad_norm": 0.4014529883861542, "learning_rate": 7.003813408198543e-06, "loss": 1.9238, "step": 624 }, { "batch_num_effect_tokens": 2676, "batch_num_samples": 29, "batch_num_tokens": 16369, "epoch": 0.86535, "grad_norm": 0.4392950236797333, "learning_rate": 6.992728766222982e-06, "loss": 1.8965, "step": 625 }, { "batch_num_effect_tokens": 2023, "batch_num_samples": 29, "batch_num_tokens": 16312, "epoch": 0.86674, "grad_norm": 0.41874659061431885, "learning_rate": 6.981632468787327e-06, "loss": 1.8843, "step": 626 }, { "batch_num_effect_tokens": 2205, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.86812, "grad_norm": 0.4948895275592804, "learning_rate": 6.970524580793766e-06, "loss": 1.9238, "step": 627 }, { "batch_num_effect_tokens": 2213, "batch_num_samples": 34, "batch_num_tokens": 16367, "epoch": 0.86951, "grad_norm": 0.44357216358184814, "learning_rate": 6.959405167212278e-06, "loss": 1.9697, "step": 628 }, { "batch_num_effect_tokens": 1837, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.87089, "grad_norm": 0.41323012113571167, "learning_rate": 6.948274293080252e-06, "loss": 1.9102, "step": 629 }, { "batch_num_effect_tokens": 4275, "batch_num_samples": 51, "batch_num_tokens": 16276, "epoch": 0.87227, "grad_norm": 0.34769535064697266, "learning_rate": 6.937132023502114e-06, "loss": 1.7773, "step": 630 }, { "batch_num_effect_tokens": 2320, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.87366, "grad_norm": 0.38741710782051086, "learning_rate": 6.925978423648941e-06, "loss": 1.9761, "step": 631 }, { "batch_num_effect_tokens": 3069, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.87504, "grad_norm": 0.35769712924957275, "learning_rate": 6.914813558758078e-06, "loss": 2.0537, "step": 632 }, { "batch_num_effect_tokens": 2892, "batch_num_samples": 41, "batch_num_tokens": 16269, "epoch": 0.87643, "grad_norm": 0.37169861793518066, "learning_rate": 6.903637494132762e-06, "loss": 1.9282, "step": 633 }, { "batch_num_effect_tokens": 2142, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.87781, "grad_norm": 0.46575066447257996, "learning_rate": 6.892450295141737e-06, "loss": 1.9766, "step": 634 }, { "batch_num_effect_tokens": 2304, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.8792, "grad_norm": 0.4226188659667969, "learning_rate": 6.881252027218872e-06, "loss": 2.042, "step": 635 }, { "batch_num_effect_tokens": 3147, "batch_num_samples": 41, "batch_num_tokens": 16269, "epoch": 0.88058, "grad_norm": 0.40171298384666443, "learning_rate": 6.870042755862775e-06, "loss": 2.0811, "step": 636 }, { "batch_num_effect_tokens": 1749, "batch_num_samples": 29, "batch_num_tokens": 16330, "epoch": 0.88197, "grad_norm": 0.4461555778980255, "learning_rate": 6.858822546636417e-06, "loss": 1.7231, "step": 637 }, { "batch_num_effect_tokens": 2590, "batch_num_samples": 38, "batch_num_tokens": 16341, "epoch": 0.88335, "grad_norm": 0.42748570442199707, "learning_rate": 6.847591465166741e-06, "loss": 1.8799, "step": 638 }, { "batch_num_effect_tokens": 2869, "batch_num_samples": 29, "batch_num_tokens": 16318, "epoch": 0.88474, "grad_norm": 0.4525657594203949, "learning_rate": 6.836349577144284e-06, "loss": 1.894, "step": 639 }, { "batch_num_effect_tokens": 2666, "batch_num_samples": 34, "batch_num_tokens": 16367, "epoch": 0.88612, "grad_norm": 0.4166469871997833, "learning_rate": 6.825096948322791e-06, "loss": 1.8403, "step": 640 }, { "batch_num_effect_tokens": 2278, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 0.8875, "grad_norm": 0.4384889602661133, "learning_rate": 6.81383364451883e-06, "loss": 2.0762, "step": 641 }, { "batch_num_effect_tokens": 1720, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.88889, "grad_norm": 0.4409150183200836, "learning_rate": 6.802559731611404e-06, "loss": 1.6897, "step": 642 }, { "batch_num_effect_tokens": 2389, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.89027, "grad_norm": 0.3527805209159851, "learning_rate": 6.7912752755415716e-06, "loss": 1.5752, "step": 643 }, { "batch_num_effect_tokens": 2782, "batch_num_samples": 41, "batch_num_tokens": 16343, "epoch": 0.89166, "grad_norm": 0.3914033770561218, "learning_rate": 6.779980342312056e-06, "loss": 1.9438, "step": 644 }, { "batch_num_effect_tokens": 2640, "batch_num_samples": 39, "batch_num_tokens": 16292, "epoch": 0.89304, "grad_norm": 0.3787635564804077, "learning_rate": 6.768674997986863e-06, "loss": 1.8965, "step": 645 }, { "batch_num_effect_tokens": 2516, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.89443, "grad_norm": 0.4216201603412628, "learning_rate": 6.757359308690889e-06, "loss": 1.9404, "step": 646 }, { "batch_num_effect_tokens": 2087, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.89581, "grad_norm": 0.41507163643836975, "learning_rate": 6.746033340609548e-06, "loss": 1.5952, "step": 647 }, { "batch_num_effect_tokens": 2192, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.8972, "grad_norm": 0.3958880305290222, "learning_rate": 6.734697159988362e-06, "loss": 1.8115, "step": 648 }, { "batch_num_effect_tokens": 2297, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 0.89858, "grad_norm": 0.4067823588848114, "learning_rate": 6.723350833132596e-06, "loss": 1.8379, "step": 649 }, { "batch_num_effect_tokens": 2485, "batch_num_samples": 30, "batch_num_tokens": 16286, "epoch": 0.89997, "grad_norm": 0.4240349531173706, "learning_rate": 6.711994426406853e-06, "loss": 1.7251, "step": 650 }, { "batch_num_effect_tokens": 2539, "batch_num_samples": 34, "batch_num_tokens": 16317, "epoch": 0.90135, "grad_norm": 0.3994867205619812, "learning_rate": 6.700628006234698e-06, "loss": 1.7349, "step": 651 }, { "batch_num_effect_tokens": 2751, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.90273, "grad_norm": 0.39579135179519653, "learning_rate": 6.689251639098261e-06, "loss": 1.8853, "step": 652 }, { "batch_num_effect_tokens": 2365, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 0.90412, "grad_norm": 0.39717814326286316, "learning_rate": 6.677865391537857e-06, "loss": 1.9067, "step": 653 }, { "batch_num_effect_tokens": 2607, "batch_num_samples": 34, "batch_num_tokens": 16305, "epoch": 0.9055, "grad_norm": 0.5426166653633118, "learning_rate": 6.666469330151585e-06, "loss": 1.9165, "step": 654 }, { "batch_num_effect_tokens": 2116, "batch_num_samples": 29, "batch_num_tokens": 16380, "epoch": 0.90689, "grad_norm": 0.4363274872303009, "learning_rate": 6.65506352159495e-06, "loss": 1.916, "step": 655 }, { "batch_num_effect_tokens": 3381, "batch_num_samples": 54, "batch_num_tokens": 16380, "epoch": 0.90827, "grad_norm": 0.333035945892334, "learning_rate": 6.643648032580466e-06, "loss": 1.8457, "step": 656 }, { "batch_num_effect_tokens": 2063, "batch_num_samples": 28, "batch_num_tokens": 16288, "epoch": 0.90966, "grad_norm": 0.4219987094402313, "learning_rate": 6.632222929877268e-06, "loss": 1.9429, "step": 657 }, { "batch_num_effect_tokens": 2421, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 0.91104, "grad_norm": 0.37440112233161926, "learning_rate": 6.620788280310722e-06, "loss": 1.959, "step": 658 }, { "batch_num_effect_tokens": 2599, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.91243, "grad_norm": 0.39123010635375977, "learning_rate": 6.609344150762035e-06, "loss": 1.8672, "step": 659 }, { "batch_num_effect_tokens": 2766, "batch_num_samples": 30, "batch_num_tokens": 16304, "epoch": 0.91381, "grad_norm": 0.4632302224636078, "learning_rate": 6.597890608167856e-06, "loss": 2.1416, "step": 660 }, { "batch_num_effect_tokens": 2498, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.9152, "grad_norm": 0.3887713849544525, "learning_rate": 6.586427719519901e-06, "loss": 2.0283, "step": 661 }, { "batch_num_effect_tokens": 2059, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.91658, "grad_norm": 0.46535584330558777, "learning_rate": 6.574955551864541e-06, "loss": 1.8779, "step": 662 }, { "batch_num_effect_tokens": 2409, "batch_num_samples": 33, "batch_num_tokens": 16302, "epoch": 0.91796, "grad_norm": 0.47468358278274536, "learning_rate": 6.563474172302429e-06, "loss": 2.0254, "step": 663 }, { "batch_num_effect_tokens": 2317, "batch_num_samples": 31, "batch_num_tokens": 16340, "epoch": 0.91935, "grad_norm": 0.39732304215431213, "learning_rate": 6.551983647988089e-06, "loss": 1.9932, "step": 664 }, { "batch_num_effect_tokens": 2636, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.92073, "grad_norm": 0.41821804642677307, "learning_rate": 6.54048404612954e-06, "loss": 1.8955, "step": 665 }, { "batch_num_effect_tokens": 2558, "batch_num_samples": 47, "batch_num_tokens": 16382, "epoch": 0.92212, "grad_norm": 0.40650078654289246, "learning_rate": 6.528975433987892e-06, "loss": 1.9731, "step": 666 }, { "batch_num_effect_tokens": 2186, "batch_num_samples": 34, "batch_num_tokens": 16380, "epoch": 0.9235, "grad_norm": 0.3741656243801117, "learning_rate": 6.517457878876958e-06, "loss": 1.9004, "step": 667 }, { "batch_num_effect_tokens": 2057, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 0.92489, "grad_norm": 0.3660355806350708, "learning_rate": 6.505931448162857e-06, "loss": 1.8345, "step": 668 }, { "batch_num_effect_tokens": 2245, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.92627, "grad_norm": 0.39942437410354614, "learning_rate": 6.4943962092636205e-06, "loss": 1.7939, "step": 669 }, { "batch_num_effect_tokens": 2458, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.92766, "grad_norm": 0.39698928594589233, "learning_rate": 6.4828522296488014e-06, "loss": 1.748, "step": 670 }, { "batch_num_effect_tokens": 2247, "batch_num_samples": 29, "batch_num_tokens": 16291, "epoch": 0.92904, "grad_norm": 0.42165908217430115, "learning_rate": 6.471299576839076e-06, "loss": 1.5928, "step": 671 }, { "batch_num_effect_tokens": 2449, "batch_num_samples": 41, "batch_num_tokens": 16277, "epoch": 0.93043, "grad_norm": 0.4098219573497772, "learning_rate": 6.45973831840585e-06, "loss": 1.9021, "step": 672 }, { "batch_num_effect_tokens": 2528, "batch_num_samples": 35, "batch_num_tokens": 16304, "epoch": 0.93181, "grad_norm": 0.39399582147598267, "learning_rate": 6.448168521970865e-06, "loss": 1.9644, "step": 673 }, { "batch_num_effect_tokens": 2580, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 0.93319, "grad_norm": 0.4311675727367401, "learning_rate": 6.4365902552057945e-06, "loss": 1.8081, "step": 674 }, { "batch_num_effect_tokens": 2398, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.93458, "grad_norm": 0.4256434738636017, "learning_rate": 6.4250035858318635e-06, "loss": 1.8232, "step": 675 }, { "batch_num_effect_tokens": 2844, "batch_num_samples": 28, "batch_num_tokens": 16371, "epoch": 0.93596, "grad_norm": 0.4233452081680298, "learning_rate": 6.41340858161944e-06, "loss": 1.7383, "step": 676 }, { "batch_num_effect_tokens": 2578, "batch_num_samples": 47, "batch_num_tokens": 16343, "epoch": 0.93735, "grad_norm": 0.4408589005470276, "learning_rate": 6.401805310387644e-06, "loss": 1.8311, "step": 677 }, { "batch_num_effect_tokens": 2136, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.93873, "grad_norm": 0.3957750201225281, "learning_rate": 6.390193840003945e-06, "loss": 1.8452, "step": 678 }, { "batch_num_effect_tokens": 2605, "batch_num_samples": 36, "batch_num_tokens": 16319, "epoch": 0.94012, "grad_norm": 0.40421879291534424, "learning_rate": 6.378574238383776e-06, "loss": 1.8032, "step": 679 }, { "batch_num_effect_tokens": 2730, "batch_num_samples": 28, "batch_num_tokens": 16292, "epoch": 0.9415, "grad_norm": 0.4138784408569336, "learning_rate": 6.366946573490124e-06, "loss": 1.8379, "step": 680 }, { "batch_num_effect_tokens": 2735, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.94289, "grad_norm": 0.3656238317489624, "learning_rate": 6.355310913333139e-06, "loss": 1.9121, "step": 681 }, { "batch_num_effect_tokens": 2171, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.94427, "grad_norm": 0.39593255519866943, "learning_rate": 6.343667325969736e-06, "loss": 1.8281, "step": 682 }, { "batch_num_effect_tokens": 1929, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.94566, "grad_norm": 0.41541507840156555, "learning_rate": 6.332015879503198e-06, "loss": 1.7808, "step": 683 }, { "batch_num_effect_tokens": 2754, "batch_num_samples": 66, "batch_num_tokens": 16211, "epoch": 0.94704, "grad_norm": 0.37788981199264526, "learning_rate": 6.320356642082774e-06, "loss": 2.0254, "step": 684 }, { "batch_num_effect_tokens": 2530, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.94843, "grad_norm": 0.378854364156723, "learning_rate": 6.3086896819032814e-06, "loss": 2.0586, "step": 685 }, { "batch_num_effect_tokens": 2496, "batch_num_samples": 30, "batch_num_tokens": 16292, "epoch": 0.94981, "grad_norm": 0.3876258134841919, "learning_rate": 6.2970150672047115e-06, "loss": 1.7222, "step": 686 }, { "batch_num_effect_tokens": 2181, "batch_num_samples": 34, "batch_num_tokens": 16367, "epoch": 0.95119, "grad_norm": 0.4277624785900116, "learning_rate": 6.2853328662718215e-06, "loss": 1.8838, "step": 687 }, { "batch_num_effect_tokens": 2743, "batch_num_samples": 34, "batch_num_tokens": 16312, "epoch": 0.95258, "grad_norm": 0.3742033541202545, "learning_rate": 6.273643147433743e-06, "loss": 2.0718, "step": 688 }, { "batch_num_effect_tokens": 2141, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.95396, "grad_norm": 0.4302824139595032, "learning_rate": 6.2619459790635835e-06, "loss": 1.8735, "step": 689 }, { "batch_num_effect_tokens": 2444, "batch_num_samples": 34, "batch_num_tokens": 16324, "epoch": 0.95535, "grad_norm": 0.39185622334480286, "learning_rate": 6.250241429578017e-06, "loss": 1.8433, "step": 690 }, { "batch_num_effect_tokens": 2850, "batch_num_samples": 60, "batch_num_tokens": 16224, "epoch": 0.95673, "grad_norm": 0.3810665011405945, "learning_rate": 6.238529567436892e-06, "loss": 1.9756, "step": 691 }, { "batch_num_effect_tokens": 2949, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 0.95812, "grad_norm": 0.4193708300590515, "learning_rate": 6.226810461142829e-06, "loss": 2.085, "step": 692 }, { "batch_num_effect_tokens": 2671, "batch_num_samples": 41, "batch_num_tokens": 16318, "epoch": 0.9595, "grad_norm": 0.3871364891529083, "learning_rate": 6.215084179240821e-06, "loss": 1.813, "step": 693 }, { "batch_num_effect_tokens": 2601, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.96089, "grad_norm": 0.38447120785713196, "learning_rate": 6.203350790317825e-06, "loss": 1.8867, "step": 694 }, { "batch_num_effect_tokens": 2765, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.96227, "grad_norm": 0.4351661205291748, "learning_rate": 6.191610363002376e-06, "loss": 1.9326, "step": 695 }, { "batch_num_effect_tokens": 2533, "batch_num_samples": 39, "batch_num_tokens": 16352, "epoch": 0.96366, "grad_norm": 0.4192517399787903, "learning_rate": 6.1798629659641676e-06, "loss": 2.0151, "step": 696 }, { "batch_num_effect_tokens": 2240, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.96504, "grad_norm": 0.4247339963912964, "learning_rate": 6.168108667913666e-06, "loss": 1.8408, "step": 697 }, { "batch_num_effect_tokens": 2296, "batch_num_samples": 35, "batch_num_tokens": 16382, "epoch": 0.96642, "grad_norm": 0.4531073570251465, "learning_rate": 6.156347537601698e-06, "loss": 1.8701, "step": 698 }, { "batch_num_effect_tokens": 2283, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.96781, "grad_norm": 0.42325282096862793, "learning_rate": 6.144579643819053e-06, "loss": 1.6885, "step": 699 }, { "batch_num_effect_tokens": 2803, "batch_num_samples": 29, "batch_num_tokens": 16312, "epoch": 0.96919, "grad_norm": 0.4002416133880615, "learning_rate": 6.1328050553960804e-06, "loss": 2.0508, "step": 700 }, { "batch_num_effect_tokens": 2095, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.97058, "grad_norm": 0.42198583483695984, "learning_rate": 6.1210238412022875e-06, "loss": 1.9219, "step": 701 }, { "batch_num_effect_tokens": 2726, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 0.97196, "grad_norm": 0.36841604113578796, "learning_rate": 6.10923607014593e-06, "loss": 1.6738, "step": 702 }, { "batch_num_effect_tokens": 2517, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.97335, "grad_norm": 0.4335607886314392, "learning_rate": 6.0974418111736235e-06, "loss": 1.8975, "step": 703 }, { "batch_num_effect_tokens": 2512, "batch_num_samples": 30, "batch_num_tokens": 16292, "epoch": 0.97473, "grad_norm": 0.3487960994243622, "learning_rate": 6.085641133269923e-06, "loss": 2.1431, "step": 704 }, { "batch_num_effect_tokens": 2567, "batch_num_samples": 63, "batch_num_tokens": 16189, "epoch": 0.97612, "grad_norm": 0.35212260484695435, "learning_rate": 6.073834105456934e-06, "loss": 1.7136, "step": 705 }, { "batch_num_effect_tokens": 2054, "batch_num_samples": 29, "batch_num_tokens": 16381, "epoch": 0.9775, "grad_norm": 0.3788537383079529, "learning_rate": 6.0620207967939e-06, "loss": 1.8027, "step": 706 }, { "batch_num_effect_tokens": 2530, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 0.97889, "grad_norm": 0.40114274621009827, "learning_rate": 6.0502012763768e-06, "loss": 1.8755, "step": 707 }, { "batch_num_effect_tokens": 2447, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.98027, "grad_norm": 0.45158132910728455, "learning_rate": 6.038375613337949e-06, "loss": 1.9822, "step": 708 }, { "batch_num_effect_tokens": 1964, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 0.98165, "grad_norm": 0.3929498493671417, "learning_rate": 6.026543876845586e-06, "loss": 1.7207, "step": 709 }, { "batch_num_effect_tokens": 2516, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 0.98304, "grad_norm": 0.4160457253456116, "learning_rate": 6.0147061361034765e-06, "loss": 2.0186, "step": 710 }, { "batch_num_effect_tokens": 2003, "batch_num_samples": 32, "batch_num_tokens": 16383, "epoch": 0.98442, "grad_norm": 0.4715445935726166, "learning_rate": 6.002862460350505e-06, "loss": 1.8125, "step": 711 }, { "batch_num_effect_tokens": 2495, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 0.98581, "grad_norm": 0.4053877592086792, "learning_rate": 5.9910129188602665e-06, "loss": 1.9692, "step": 712 }, { "batch_num_effect_tokens": 2878, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 0.98719, "grad_norm": 0.33761700987815857, "learning_rate": 5.979157580940669e-06, "loss": 1.6553, "step": 713 }, { "batch_num_effect_tokens": 3507, "batch_num_samples": 37, "batch_num_tokens": 16383, "epoch": 0.98858, "grad_norm": 0.32578060030937195, "learning_rate": 5.967296515933519e-06, "loss": 1.8076, "step": 714 }, { "batch_num_effect_tokens": 2731, "batch_num_samples": 35, "batch_num_tokens": 16356, "epoch": 0.98996, "grad_norm": 0.3874088227748871, "learning_rate": 5.955429793214129e-06, "loss": 1.9141, "step": 715 }, { "batch_num_effect_tokens": 2348, "batch_num_samples": 29, "batch_num_tokens": 16330, "epoch": 0.99135, "grad_norm": 0.4583422541618347, "learning_rate": 5.9435574821908914e-06, "loss": 1.8374, "step": 716 }, { "batch_num_effect_tokens": 3344, "batch_num_samples": 39, "batch_num_tokens": 16322, "epoch": 0.99273, "grad_norm": 0.38349786400794983, "learning_rate": 5.931679652304896e-06, "loss": 1.9365, "step": 717 }, { "batch_num_effect_tokens": 2485, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 0.99412, "grad_norm": 0.38204526901245117, "learning_rate": 5.919796373029504e-06, "loss": 1.9404, "step": 718 }, { "batch_num_effect_tokens": 3733, "batch_num_samples": 48, "batch_num_tokens": 16384, "epoch": 0.9955, "grad_norm": 0.3179651200771332, "learning_rate": 5.9079077138699555e-06, "loss": 1.731, "step": 719 }, { "batch_num_effect_tokens": 3070, "batch_num_samples": 29, "batch_num_tokens": 16317, "epoch": 0.99688, "grad_norm": 0.3780371844768524, "learning_rate": 5.896013744362954e-06, "loss": 1.9199, "step": 720 }, { "batch_num_effect_tokens": 2508, "batch_num_samples": 39, "batch_num_tokens": 16352, "epoch": 0.99827, "grad_norm": 0.39197346568107605, "learning_rate": 5.8841145340762665e-06, "loss": 1.7842, "step": 721 }, { "batch_num_effect_tokens": 2642, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 0.99965, "grad_norm": 0.40163227915763855, "learning_rate": 5.872210152608311e-06, "loss": 1.8389, "step": 722 }, { "batch_num_effect_tokens": 2621, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.00104, "grad_norm": 0.3585405945777893, "learning_rate": 5.8603006695877505e-06, "loss": 1.8545, "step": 723 }, { "batch_num_effect_tokens": 2042, "batch_num_samples": 31, "batch_num_tokens": 16319, "epoch": 1.00242, "grad_norm": 0.5187774896621704, "learning_rate": 5.8483861546730915e-06, "loss": 1.7297, "step": 724 }, { "batch_num_effect_tokens": 3803, "batch_num_samples": 44, "batch_num_tokens": 16364, "epoch": 1.00381, "grad_norm": 0.2973698079586029, "learning_rate": 5.836466677552266e-06, "loss": 1.748, "step": 725 }, { "batch_num_effect_tokens": 3803, "batch_num_samples": 44, "batch_num_tokens": 16364, "epoch": 1.00381, "eval_eval_loss": 0.45392751693725586, "eval_eval_runtime": 127.9142, "eval_eval_samples_per_second": 37.9, "eval_eval_steps_per_second": 2.369, "step": 725 }, { "batch_num_effect_tokens": 2289, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.00519, "grad_norm": 0.4149424135684967, "learning_rate": 5.824542307942236e-06, "loss": 1.77, "step": 726 }, { "batch_num_effect_tokens": 3230, "batch_num_samples": 40, "batch_num_tokens": 16376, "epoch": 1.00658, "grad_norm": 0.3944372832775116, "learning_rate": 5.812613115588575e-06, "loss": 1.5496, "step": 727 }, { "batch_num_effect_tokens": 2502, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.00796, "grad_norm": 0.4182272255420685, "learning_rate": 5.800679170265067e-06, "loss": 1.8477, "step": 728 }, { "batch_num_effect_tokens": 2390, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.00935, "grad_norm": 0.42660319805145264, "learning_rate": 5.788740541773296e-06, "loss": 1.8237, "step": 729 }, { "batch_num_effect_tokens": 3002, "batch_num_samples": 30, "batch_num_tokens": 16374, "epoch": 1.01073, "grad_norm": 0.4760968089103699, "learning_rate": 5.776797299942236e-06, "loss": 2.1982, "step": 730 }, { "batch_num_effect_tokens": 2416, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.01211, "grad_norm": 0.4034194350242615, "learning_rate": 5.764849514627848e-06, "loss": 1.9106, "step": 731 }, { "batch_num_effect_tokens": 2804, "batch_num_samples": 67, "batch_num_tokens": 16247, "epoch": 1.0135, "grad_norm": 0.3775666654109955, "learning_rate": 5.7528972557126625e-06, "loss": 1.8086, "step": 732 }, { "batch_num_effect_tokens": 2154, "batch_num_samples": 33, "batch_num_tokens": 16357, "epoch": 1.01488, "grad_norm": 0.31369370222091675, "learning_rate": 5.740940593105383e-06, "loss": 1.4443, "step": 733 }, { "batch_num_effect_tokens": 2123, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.01627, "grad_norm": 0.3986714780330658, "learning_rate": 5.7289795967404624e-06, "loss": 1.7358, "step": 734 }, { "batch_num_effect_tokens": 2670, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 1.01765, "grad_norm": 0.3507808446884155, "learning_rate": 5.717014336577709e-06, "loss": 1.4819, "step": 735 }, { "batch_num_effect_tokens": 2229, "batch_num_samples": 42, "batch_num_tokens": 16318, "epoch": 1.01904, "grad_norm": 0.4190760850906372, "learning_rate": 5.705044882601862e-06, "loss": 1.8735, "step": 736 }, { "batch_num_effect_tokens": 2705, "batch_num_samples": 52, "batch_num_tokens": 16384, "epoch": 1.02042, "grad_norm": 0.38787561655044556, "learning_rate": 5.693071304822203e-06, "loss": 1.6328, "step": 737 }, { "batch_num_effect_tokens": 1797, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.02181, "grad_norm": 0.3226073384284973, "learning_rate": 5.681093673272117e-06, "loss": 1.7295, "step": 738 }, { "batch_num_effect_tokens": 2641, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.02319, "grad_norm": 0.47867104411125183, "learning_rate": 5.6691120580087126e-06, "loss": 1.6006, "step": 739 }, { "batch_num_effect_tokens": 1992, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.02458, "grad_norm": 0.3119468688964844, "learning_rate": 5.657126529112393e-06, "loss": 1.4883, "step": 740 }, { "batch_num_effect_tokens": 2580, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 1.02596, "grad_norm": 0.3492470681667328, "learning_rate": 5.645137156686455e-06, "loss": 1.3928, "step": 741 }, { "batch_num_effect_tokens": 1937, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.02735, "grad_norm": 0.5438535213470459, "learning_rate": 5.6331440108566735e-06, "loss": 1.5923, "step": 742 }, { "batch_num_effect_tokens": 2537, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 1.02873, "grad_norm": 0.4800093472003937, "learning_rate": 5.621147161770898e-06, "loss": 1.7915, "step": 743 }, { "batch_num_effect_tokens": 2100, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.03011, "grad_norm": 0.4597349166870117, "learning_rate": 5.609146679598634e-06, "loss": 1.5664, "step": 744 }, { "batch_num_effect_tokens": 2097, "batch_num_samples": 28, "batch_num_tokens": 16381, "epoch": 1.0315, "grad_norm": 0.5356582999229431, "learning_rate": 5.597142634530639e-06, "loss": 1.8623, "step": 745 }, { "batch_num_effect_tokens": 2382, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 1.03288, "grad_norm": 0.4072340726852417, "learning_rate": 5.5851350967785075e-06, "loss": 1.7947, "step": 746 }, { "batch_num_effect_tokens": 2603, "batch_num_samples": 30, "batch_num_tokens": 16364, "epoch": 1.03427, "grad_norm": 0.45618703961372375, "learning_rate": 5.573124136574268e-06, "loss": 1.6592, "step": 747 }, { "batch_num_effect_tokens": 3774, "batch_num_samples": 40, "batch_num_tokens": 16300, "epoch": 1.03565, "grad_norm": 0.39137235283851624, "learning_rate": 5.561109824169962e-06, "loss": 1.9229, "step": 748 }, { "batch_num_effect_tokens": 2145, "batch_num_samples": 29, "batch_num_tokens": 16300, "epoch": 1.03704, "grad_norm": 0.4353331923484802, "learning_rate": 5.549092229837242e-06, "loss": 1.6499, "step": 749 }, { "batch_num_effect_tokens": 3201, "batch_num_samples": 30, "batch_num_tokens": 16304, "epoch": 1.03842, "grad_norm": 0.36786338686943054, "learning_rate": 5.53707142386695e-06, "loss": 1.9492, "step": 750 }, { "batch_num_effect_tokens": 3207, "batch_num_samples": 41, "batch_num_tokens": 16228, "epoch": 1.03981, "grad_norm": 0.45932990312576294, "learning_rate": 5.525047476568722e-06, "loss": 2.3945, "step": 751 }, { "batch_num_effect_tokens": 2628, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.04119, "grad_norm": 0.40223562717437744, "learning_rate": 5.5130204582705574e-06, "loss": 1.623, "step": 752 }, { "batch_num_effect_tokens": 2596, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 1.04258, "grad_norm": 0.3397735357284546, "learning_rate": 5.500990439318427e-06, "loss": 1.3716, "step": 753 }, { "batch_num_effect_tokens": 1885, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 1.04396, "grad_norm": 0.39209291338920593, "learning_rate": 5.488957490075846e-06, "loss": 1.6104, "step": 754 }, { "batch_num_effect_tokens": 2279, "batch_num_samples": 31, "batch_num_tokens": 16354, "epoch": 1.04534, "grad_norm": 0.441590815782547, "learning_rate": 5.476921680923474e-06, "loss": 1.772, "step": 755 }, { "batch_num_effect_tokens": 3209, "batch_num_samples": 30, "batch_num_tokens": 16286, "epoch": 1.04673, "grad_norm": 0.3218657076358795, "learning_rate": 5.464883082258692e-06, "loss": 1.8574, "step": 756 }, { "batch_num_effect_tokens": 2544, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.04811, "grad_norm": 0.3584845960140228, "learning_rate": 5.452841764495203e-06, "loss": 1.7207, "step": 757 }, { "batch_num_effect_tokens": 3724, "batch_num_samples": 45, "batch_num_tokens": 16384, "epoch": 1.0495, "grad_norm": 0.2936672568321228, "learning_rate": 5.440797798062611e-06, "loss": 1.5332, "step": 758 }, { "batch_num_effect_tokens": 2326, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.05088, "grad_norm": 0.3491065502166748, "learning_rate": 5.428751253406015e-06, "loss": 1.5029, "step": 759 }, { "batch_num_effect_tokens": 2251, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.05227, "grad_norm": 0.3571205139160156, "learning_rate": 5.416702200985585e-06, "loss": 1.7979, "step": 760 }, { "batch_num_effect_tokens": 2505, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 1.05365, "grad_norm": 0.4030872881412506, "learning_rate": 5.4046507112761714e-06, "loss": 1.8477, "step": 761 }, { "batch_num_effect_tokens": 2676, "batch_num_samples": 28, "batch_num_tokens": 16272, "epoch": 1.05504, "grad_norm": 0.44289737939834595, "learning_rate": 5.392596854766869e-06, "loss": 1.7949, "step": 762 }, { "batch_num_effect_tokens": 2130, "batch_num_samples": 35, "batch_num_tokens": 16278, "epoch": 1.05642, "grad_norm": 0.4669971466064453, "learning_rate": 5.380540701960627e-06, "loss": 1.9097, "step": 763 }, { "batch_num_effect_tokens": 2345, "batch_num_samples": 30, "batch_num_tokens": 16381, "epoch": 1.05781, "grad_norm": 0.44127482175827026, "learning_rate": 5.368482323373815e-06, "loss": 1.7817, "step": 764 }, { "batch_num_effect_tokens": 2048, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.05919, "grad_norm": 0.4340735077857971, "learning_rate": 5.35642178953583e-06, "loss": 1.584, "step": 765 }, { "batch_num_effect_tokens": 2975, "batch_num_samples": 65, "batch_num_tokens": 16238, "epoch": 1.06057, "grad_norm": 0.34205520153045654, "learning_rate": 5.344359170988668e-06, "loss": 1.498, "step": 766 }, { "batch_num_effect_tokens": 2198, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.06196, "grad_norm": 0.4290141761302948, "learning_rate": 5.332294538286523e-06, "loss": 1.4443, "step": 767 }, { "batch_num_effect_tokens": 2410, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.06334, "grad_norm": 0.4226868748664856, "learning_rate": 5.3202279619953675e-06, "loss": 1.7307, "step": 768 }, { "batch_num_effect_tokens": 3362, "batch_num_samples": 34, "batch_num_tokens": 16322, "epoch": 1.06473, "grad_norm": 0.3600834310054779, "learning_rate": 5.308159512692544e-06, "loss": 1.7847, "step": 769 }, { "batch_num_effect_tokens": 2054, "batch_num_samples": 32, "batch_num_tokens": 16354, "epoch": 1.06611, "grad_norm": 0.47318387031555176, "learning_rate": 5.296089260966347e-06, "loss": 1.7441, "step": 770 }, { "batch_num_effect_tokens": 2186, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.0675, "grad_norm": 0.39482608437538147, "learning_rate": 5.284017277415619e-06, "loss": 1.6016, "step": 771 }, { "batch_num_effect_tokens": 2381, "batch_num_samples": 36, "batch_num_tokens": 16373, "epoch": 1.06888, "grad_norm": 0.38093140721321106, "learning_rate": 5.2719436326493255e-06, "loss": 1.4893, "step": 772 }, { "batch_num_effect_tokens": 1793, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.07027, "grad_norm": 0.5038796663284302, "learning_rate": 5.259868397286154e-06, "loss": 1.3757, "step": 773 }, { "batch_num_effect_tokens": 3231, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.07165, "grad_norm": 0.37927088141441345, "learning_rate": 5.247791641954089e-06, "loss": 1.9248, "step": 774 }, { "batch_num_effect_tokens": 2555, "batch_num_samples": 34, "batch_num_tokens": 16342, "epoch": 1.07304, "grad_norm": 0.3317091763019562, "learning_rate": 5.235713437290012e-06, "loss": 1.3669, "step": 775 }, { "batch_num_effect_tokens": 2351, "batch_num_samples": 29, "batch_num_tokens": 16354, "epoch": 1.07442, "grad_norm": 0.3651438057422638, "learning_rate": 5.223633853939276e-06, "loss": 1.8257, "step": 776 }, { "batch_num_effect_tokens": 2229, "batch_num_samples": 37, "batch_num_tokens": 16381, "epoch": 1.0758, "grad_norm": 0.405533105134964, "learning_rate": 5.211552962555305e-06, "loss": 2.7515, "step": 777 }, { "batch_num_effect_tokens": 2338, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.07719, "grad_norm": 0.4433091878890991, "learning_rate": 5.199470833799164e-06, "loss": 1.3872, "step": 778 }, { "batch_num_effect_tokens": 2917, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.07857, "grad_norm": 0.3009926974773407, "learning_rate": 5.1873875383391655e-06, "loss": 1.3389, "step": 779 }, { "batch_num_effect_tokens": 2232, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.07996, "grad_norm": 0.275104284286499, "learning_rate": 5.175303146850439e-06, "loss": 1.501, "step": 780 }, { "batch_num_effect_tokens": 2247, "batch_num_samples": 29, "batch_num_tokens": 16291, "epoch": 1.08134, "grad_norm": 0.3017769753932953, "learning_rate": 5.1632177300145255e-06, "loss": 1.3145, "step": 781 }, { "batch_num_effect_tokens": 2253, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.08273, "grad_norm": 0.4704679250717163, "learning_rate": 5.151131358518966e-06, "loss": 1.4663, "step": 782 }, { "batch_num_effect_tokens": 2996, "batch_num_samples": 54, "batch_num_tokens": 16328, "epoch": 1.08411, "grad_norm": 0.45610228180885315, "learning_rate": 5.139044103056885e-06, "loss": 1.8208, "step": 783 }, { "batch_num_effect_tokens": 1864, "batch_num_samples": 28, "batch_num_tokens": 16372, "epoch": 1.0855, "grad_norm": 0.5263177752494812, "learning_rate": 5.126956034326573e-06, "loss": 1.8535, "step": 784 }, { "batch_num_effect_tokens": 3152, "batch_num_samples": 28, "batch_num_tokens": 16337, "epoch": 1.08688, "grad_norm": 0.46754708886146545, "learning_rate": 5.114867223031086e-06, "loss": 1.7769, "step": 785 }, { "batch_num_effect_tokens": 2974, "batch_num_samples": 35, "batch_num_tokens": 16330, "epoch": 1.08827, "grad_norm": 0.47855058312416077, "learning_rate": 5.102777739877812e-06, "loss": 1.709, "step": 786 }, { "batch_num_effect_tokens": 2605, "batch_num_samples": 36, "batch_num_tokens": 16319, "epoch": 1.08965, "grad_norm": 0.32186511158943176, "learning_rate": 5.090687655578078e-06, "loss": 1.6111, "step": 787 }, { "batch_num_effect_tokens": 2658, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 1.09103, "grad_norm": 0.46807661652565, "learning_rate": 5.078597040846723e-06, "loss": 1.7998, "step": 788 }, { "batch_num_effect_tokens": 2110, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.09242, "grad_norm": 0.6199424266815186, "learning_rate": 5.066505966401689e-06, "loss": 1.6865, "step": 789 }, { "batch_num_effect_tokens": 2195, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.0938, "grad_norm": 0.4762854278087616, "learning_rate": 5.054414502963605e-06, "loss": 1.6787, "step": 790 }, { "batch_num_effect_tokens": 2716, "batch_num_samples": 30, "batch_num_tokens": 16292, "epoch": 1.09519, "grad_norm": 0.5097974538803101, "learning_rate": 5.042322721255379e-06, "loss": 1.9941, "step": 791 }, { "batch_num_effect_tokens": 2267, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.09657, "grad_norm": 0.46803590655326843, "learning_rate": 5.030230692001779e-06, "loss": 1.7222, "step": 792 }, { "batch_num_effect_tokens": 2208, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.09796, "grad_norm": 0.31392091512680054, "learning_rate": 5.0181384859290215e-06, "loss": 1.385, "step": 793 }, { "batch_num_effect_tokens": 3157, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.09934, "grad_norm": 0.3446170687675476, "learning_rate": 5.006046173764353e-06, "loss": 1.4917, "step": 794 }, { "batch_num_effect_tokens": 3952, "batch_num_samples": 43, "batch_num_tokens": 16384, "epoch": 1.10073, "grad_norm": 0.33993691205978394, "learning_rate": 4.993953826235649e-06, "loss": 1.9702, "step": 795 }, { "batch_num_effect_tokens": 3356, "batch_num_samples": 43, "batch_num_tokens": 16382, "epoch": 1.10211, "grad_norm": 0.45004844665527344, "learning_rate": 4.981861514070979e-06, "loss": 1.5454, "step": 796 }, { "batch_num_effect_tokens": 2253, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.1035, "grad_norm": 0.3758144974708557, "learning_rate": 4.9697693079982215e-06, "loss": 1.5327, "step": 797 }, { "batch_num_effect_tokens": 3614, "batch_num_samples": 43, "batch_num_tokens": 16312, "epoch": 1.10488, "grad_norm": 0.4214804470539093, "learning_rate": 4.957677278744621e-06, "loss": 1.5034, "step": 798 }, { "batch_num_effect_tokens": 2222, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.10627, "grad_norm": 0.4832451343536377, "learning_rate": 4.945585497036396e-06, "loss": 1.8232, "step": 799 }, { "batch_num_effect_tokens": 4037, "batch_num_samples": 47, "batch_num_tokens": 16384, "epoch": 1.10765, "grad_norm": 0.3420540988445282, "learning_rate": 4.933494033598314e-06, "loss": 1.3403, "step": 800 }, { "batch_num_effect_tokens": 2241, "batch_num_samples": 31, "batch_num_tokens": 16361, "epoch": 1.10903, "grad_norm": 0.29597947001457214, "learning_rate": 4.9214029591532785e-06, "loss": 1.2751, "step": 801 }, { "batch_num_effect_tokens": 2358, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.11042, "grad_norm": 0.4686249792575836, "learning_rate": 4.909312344421923e-06, "loss": 1.7446, "step": 802 }, { "batch_num_effect_tokens": 2535, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.1118, "grad_norm": 0.49651825428009033, "learning_rate": 4.897222260122189e-06, "loss": 1.3359, "step": 803 }, { "batch_num_effect_tokens": 2445, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.11319, "grad_norm": 0.42531129717826843, "learning_rate": 4.885132776968915e-06, "loss": 1.8623, "step": 804 }, { "batch_num_effect_tokens": 2010, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.11457, "grad_norm": 0.425052672624588, "learning_rate": 4.873043965673427e-06, "loss": 1.4949, "step": 805 }, { "batch_num_effect_tokens": 1974, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.11596, "grad_norm": 0.5037321448326111, "learning_rate": 4.860955896943117e-06, "loss": 1.6772, "step": 806 }, { "batch_num_effect_tokens": 2702, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.11734, "grad_norm": 0.3394450545310974, "learning_rate": 4.848868641481036e-06, "loss": 1.6199, "step": 807 }, { "batch_num_effect_tokens": 3037, "batch_num_samples": 56, "batch_num_tokens": 16312, "epoch": 1.11873, "grad_norm": 0.27651503682136536, "learning_rate": 4.836782269985475e-06, "loss": 1.5671, "step": 808 }, { "batch_num_effect_tokens": 2270, "batch_num_samples": 28, "batch_num_tokens": 16292, "epoch": 1.12011, "grad_norm": 0.40340420603752136, "learning_rate": 4.824696853149564e-06, "loss": 2.0068, "step": 809 }, { "batch_num_effect_tokens": 2832, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.1215, "grad_norm": 0.4530698359012604, "learning_rate": 4.812612461660835e-06, "loss": 1.5215, "step": 810 }, { "batch_num_effect_tokens": 1892, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 1.12288, "grad_norm": 0.3873331546783447, "learning_rate": 4.800529166200837e-06, "loss": 1.8911, "step": 811 }, { "batch_num_effect_tokens": 3344, "batch_num_samples": 41, "batch_num_tokens": 16311, "epoch": 1.12426, "grad_norm": 0.4275394380092621, "learning_rate": 4.788447037444696e-06, "loss": 1.2529, "step": 812 }, { "batch_num_effect_tokens": 2200, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.12565, "grad_norm": 0.30582910776138306, "learning_rate": 4.776366146060725e-06, "loss": 1.363, "step": 813 }, { "batch_num_effect_tokens": 2391, "batch_num_samples": 32, "batch_num_tokens": 16288, "epoch": 1.12703, "grad_norm": 0.45373427867889404, "learning_rate": 4.76428656270999e-06, "loss": 1.6953, "step": 814 }, { "batch_num_effect_tokens": 2291, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.12842, "grad_norm": 0.30373871326446533, "learning_rate": 4.752208358045913e-06, "loss": 1.301, "step": 815 }, { "batch_num_effect_tokens": 3036, "batch_num_samples": 29, "batch_num_tokens": 16292, "epoch": 1.1298, "grad_norm": 0.34827721118927, "learning_rate": 4.740131602713849e-06, "loss": 1.5073, "step": 816 }, { "batch_num_effect_tokens": 2984, "batch_num_samples": 29, "batch_num_tokens": 16291, "epoch": 1.13119, "grad_norm": 0.43140915036201477, "learning_rate": 4.7280563673506745e-06, "loss": 1.8008, "step": 817 }, { "batch_num_effect_tokens": 1904, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.13257, "grad_norm": 0.5050660371780396, "learning_rate": 4.715982722584382e-06, "loss": 1.6475, "step": 818 }, { "batch_num_effect_tokens": 2430, "batch_num_samples": 29, "batch_num_tokens": 16304, "epoch": 1.13396, "grad_norm": 0.4796289801597595, "learning_rate": 4.703910739033653e-06, "loss": 1.4509, "step": 819 }, { "batch_num_effect_tokens": 1979, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.13534, "grad_norm": 0.5220789909362793, "learning_rate": 4.6918404873074574e-06, "loss": 1.8242, "step": 820 }, { "batch_num_effect_tokens": 2778, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.13673, "grad_norm": 0.4105257987976074, "learning_rate": 4.679772038004635e-06, "loss": 1.5215, "step": 821 }, { "batch_num_effect_tokens": 2481, "batch_num_samples": 48, "batch_num_tokens": 16312, "epoch": 1.13811, "grad_norm": 0.3332346975803375, "learning_rate": 4.667705461713478e-06, "loss": 1.5908, "step": 822 }, { "batch_num_effect_tokens": 2782, "batch_num_samples": 45, "batch_num_tokens": 16363, "epoch": 1.13949, "grad_norm": 0.35695531964302063, "learning_rate": 4.655640829011335e-06, "loss": 1.7957, "step": 823 }, { "batch_num_effect_tokens": 1930, "batch_num_samples": 28, "batch_num_tokens": 16304, "epoch": 1.14088, "grad_norm": 0.48681649565696716, "learning_rate": 4.643578210464171e-06, "loss": 1.6416, "step": 824 }, { "batch_num_effect_tokens": 2410, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.14226, "grad_norm": 0.3464092016220093, "learning_rate": 4.631517676626186e-06, "loss": 1.5872, "step": 825 }, { "batch_num_effect_tokens": 2076, "batch_num_samples": 28, "batch_num_tokens": 16304, "epoch": 1.14365, "grad_norm": 0.32844287157058716, "learning_rate": 4.619459298039373e-06, "loss": 1.6211, "step": 826 }, { "batch_num_effect_tokens": 2613, "batch_num_samples": 31, "batch_num_tokens": 16332, "epoch": 1.14503, "grad_norm": 0.3087644577026367, "learning_rate": 4.607403145233132e-06, "loss": 1.6445, "step": 827 }, { "batch_num_effect_tokens": 2683, "batch_num_samples": 44, "batch_num_tokens": 16328, "epoch": 1.14642, "grad_norm": 0.47463592886924744, "learning_rate": 4.595349288723832e-06, "loss": 1.8047, "step": 828 }, { "batch_num_effect_tokens": 2396, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.1478, "grad_norm": 0.4572930634021759, "learning_rate": 4.5832977990144165e-06, "loss": 1.4292, "step": 829 }, { "batch_num_effect_tokens": 2310, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.14919, "grad_norm": 0.4730936288833618, "learning_rate": 4.571248746593988e-06, "loss": 1.6807, "step": 830 }, { "batch_num_effect_tokens": 2167, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.15057, "grad_norm": 0.5189083814620972, "learning_rate": 4.559202201937389e-06, "loss": 1.2634, "step": 831 }, { "batch_num_effect_tokens": 2171, "batch_num_samples": 29, "batch_num_tokens": 16368, "epoch": 1.15196, "grad_norm": 0.3191162049770355, "learning_rate": 4.547158235504797e-06, "loss": 1.3682, "step": 832 }, { "batch_num_effect_tokens": 2235, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.15334, "grad_norm": 0.4910542070865631, "learning_rate": 4.535116917741308e-06, "loss": 1.626, "step": 833 }, { "batch_num_effect_tokens": 2478, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 1.15472, "grad_norm": 0.4647471308708191, "learning_rate": 4.523078319076528e-06, "loss": 1.6147, "step": 834 }, { "batch_num_effect_tokens": 2307, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 1.15611, "grad_norm": 0.5310738682746887, "learning_rate": 4.511042509924157e-06, "loss": 1.6802, "step": 835 }, { "batch_num_effect_tokens": 2552, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.15749, "grad_norm": 0.461700975894928, "learning_rate": 4.499009560681574e-06, "loss": 1.2095, "step": 836 }, { "batch_num_effect_tokens": 2167, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.15888, "grad_norm": 0.3190494179725647, "learning_rate": 4.486979541729445e-06, "loss": 1.4497, "step": 837 }, { "batch_num_effect_tokens": 3076, "batch_num_samples": 62, "batch_num_tokens": 16243, "epoch": 1.16026, "grad_norm": 0.4126320779323578, "learning_rate": 4.47495252343128e-06, "loss": 1.5537, "step": 838 }, { "batch_num_effect_tokens": 3017, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.16165, "grad_norm": 0.4071129560470581, "learning_rate": 4.4629285761330515e-06, "loss": 1.9038, "step": 839 }, { "batch_num_effect_tokens": 2682, "batch_num_samples": 34, "batch_num_tokens": 16294, "epoch": 1.16303, "grad_norm": 0.36762744188308716, "learning_rate": 4.450907770162758e-06, "loss": 1.6416, "step": 840 }, { "batch_num_effect_tokens": 2424, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.16442, "grad_norm": 0.46559593081474304, "learning_rate": 4.438890175830039e-06, "loss": 1.6306, "step": 841 }, { "batch_num_effect_tokens": 2870, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.1658, "grad_norm": 0.3541262745857239, "learning_rate": 4.426875863425733e-06, "loss": 1.3789, "step": 842 }, { "batch_num_effect_tokens": 3406, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.16719, "grad_norm": 0.40651848912239075, "learning_rate": 4.414864903221493e-06, "loss": 1.6445, "step": 843 }, { "batch_num_effect_tokens": 2395, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.16857, "grad_norm": 0.4820868968963623, "learning_rate": 4.402857365469364e-06, "loss": 1.4067, "step": 844 }, { "batch_num_effect_tokens": 3315, "batch_num_samples": 52, "batch_num_tokens": 16298, "epoch": 1.16996, "grad_norm": 0.36774498224258423, "learning_rate": 4.390853320401367e-06, "loss": 1.792, "step": 845 }, { "batch_num_effect_tokens": 2352, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.17134, "grad_norm": 0.38548794388771057, "learning_rate": 4.378852838229104e-06, "loss": 1.373, "step": 846 }, { "batch_num_effect_tokens": 2532, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.17272, "grad_norm": 0.4058912396430969, "learning_rate": 4.366855989143326e-06, "loss": 1.8174, "step": 847 }, { "batch_num_effect_tokens": 3374, "batch_num_samples": 53, "batch_num_tokens": 16312, "epoch": 1.17411, "grad_norm": 0.302566796541214, "learning_rate": 4.354862843313547e-06, "loss": 1.7661, "step": 848 }, { "batch_num_effect_tokens": 2595, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.17549, "grad_norm": 0.4759613573551178, "learning_rate": 4.342873470887609e-06, "loss": 1.6152, "step": 849 }, { "batch_num_effect_tokens": 2166, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.17688, "grad_norm": 0.3862478733062744, "learning_rate": 4.330887941991288e-06, "loss": 1.3936, "step": 850 }, { "batch_num_effect_tokens": 3509, "batch_num_samples": 50, "batch_num_tokens": 16276, "epoch": 1.17826, "grad_norm": 0.37599417567253113, "learning_rate": 4.318906326727886e-06, "loss": 1.6206, "step": 851 }, { "batch_num_effect_tokens": 2077, "batch_num_samples": 28, "batch_num_tokens": 16372, "epoch": 1.17965, "grad_norm": 0.5080588459968567, "learning_rate": 4.306928695177799e-06, "loss": 1.6885, "step": 852 }, { "batch_num_effect_tokens": 2738, "batch_num_samples": 37, "batch_num_tokens": 16368, "epoch": 1.18103, "grad_norm": 0.34032300114631653, "learning_rate": 4.294955117398139e-06, "loss": 1.4575, "step": 853 }, { "batch_num_effect_tokens": 2383, "batch_num_samples": 34, "batch_num_tokens": 16312, "epoch": 1.18242, "grad_norm": 0.3602158725261688, "learning_rate": 4.282985663422292e-06, "loss": 1.7559, "step": 854 }, { "batch_num_effect_tokens": 2256, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.1838, "grad_norm": 0.4284519553184509, "learning_rate": 4.271020403259539e-06, "loss": 1.5205, "step": 855 }, { "batch_num_effect_tokens": 3005, "batch_num_samples": 34, "batch_num_tokens": 16317, "epoch": 1.18519, "grad_norm": 0.36641523241996765, "learning_rate": 4.259059406894619e-06, "loss": 1.4036, "step": 856 }, { "batch_num_effect_tokens": 2646, "batch_num_samples": 41, "batch_num_tokens": 16187, "epoch": 1.18657, "grad_norm": 0.4413210451602936, "learning_rate": 4.247102744287338e-06, "loss": 1.8916, "step": 857 }, { "batch_num_effect_tokens": 2621, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.18795, "grad_norm": 0.33154457807540894, "learning_rate": 4.2351504853721545e-06, "loss": 1.6338, "step": 858 }, { "batch_num_effect_tokens": 3243, "batch_num_samples": 34, "batch_num_tokens": 16332, "epoch": 1.18934, "grad_norm": 0.3278566300868988, "learning_rate": 4.223202700057765e-06, "loss": 1.4326, "step": 859 }, { "batch_num_effect_tokens": 2433, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.19072, "grad_norm": 0.30693334341049194, "learning_rate": 4.211259458226706e-06, "loss": 1.6475, "step": 860 }, { "batch_num_effect_tokens": 2147, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 1.19211, "grad_norm": 0.4576500952243805, "learning_rate": 4.199320829734934e-06, "loss": 1.8813, "step": 861 }, { "batch_num_effect_tokens": 3317, "batch_num_samples": 31, "batch_num_tokens": 16378, "epoch": 1.19349, "grad_norm": 0.380047082901001, "learning_rate": 4.187386884411426e-06, "loss": 1.7808, "step": 862 }, { "batch_num_effect_tokens": 2224, "batch_num_samples": 30, "batch_num_tokens": 16323, "epoch": 1.19488, "grad_norm": 0.4584408402442932, "learning_rate": 4.175457692057765e-06, "loss": 1.627, "step": 863 }, { "batch_num_effect_tokens": 2621, "batch_num_samples": 33, "batch_num_tokens": 16280, "epoch": 1.19626, "grad_norm": 0.41681399941444397, "learning_rate": 4.163533322447734e-06, "loss": 1.7852, "step": 864 }, { "batch_num_effect_tokens": 2112, "batch_num_samples": 30, "batch_num_tokens": 16336, "epoch": 1.19765, "grad_norm": 0.46527349948883057, "learning_rate": 4.151613845326912e-06, "loss": 1.7158, "step": 865 }, { "batch_num_effect_tokens": 2748, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.19903, "grad_norm": 0.4118035137653351, "learning_rate": 4.13969933041225e-06, "loss": 1.5496, "step": 866 }, { "batch_num_effect_tokens": 3354, "batch_num_samples": 45, "batch_num_tokens": 16384, "epoch": 1.20042, "grad_norm": 0.4038017690181732, "learning_rate": 4.127789847391692e-06, "loss": 1.853, "step": 867 }, { "batch_num_effect_tokens": 2432, "batch_num_samples": 37, "batch_num_tokens": 16351, "epoch": 1.2018, "grad_norm": 0.3060235381126404, "learning_rate": 4.115885465923734e-06, "loss": 1.8469, "step": 868 }, { "batch_num_effect_tokens": 2873, "batch_num_samples": 34, "batch_num_tokens": 16381, "epoch": 1.20318, "grad_norm": 0.48481249809265137, "learning_rate": 4.103986255637048e-06, "loss": 1.4463, "step": 869 }, { "batch_num_effect_tokens": 3160, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.20457, "grad_norm": 0.42701026797294617, "learning_rate": 4.092092286130046e-06, "loss": 1.771, "step": 870 }, { "batch_num_effect_tokens": 3160, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.20457, "eval_eval_loss": 0.4476088583469391, "eval_eval_runtime": 139.0454, "eval_eval_samples_per_second": 34.866, "eval_eval_steps_per_second": 2.179, "step": 870 }, { "batch_num_effect_tokens": 2625, "batch_num_samples": 28, "batch_num_tokens": 16336, "epoch": 1.20595, "grad_norm": 0.4726986587047577, "learning_rate": 4.080203626970498e-06, "loss": 2.0474, "step": 871 }, { "batch_num_effect_tokens": 2976, "batch_num_samples": 31, "batch_num_tokens": 16376, "epoch": 1.20734, "grad_norm": 0.4470539391040802, "learning_rate": 4.0683203476951065e-06, "loss": 1.8535, "step": 872 }, { "batch_num_effect_tokens": 2750, "batch_num_samples": 39, "batch_num_tokens": 16322, "epoch": 1.20872, "grad_norm": 0.48510780930519104, "learning_rate": 4.056442517809109e-06, "loss": 1.407, "step": 873 }, { "batch_num_effect_tokens": 3235, "batch_num_samples": 43, "batch_num_tokens": 16384, "epoch": 1.21011, "grad_norm": 0.36881211400032043, "learning_rate": 4.044570206785874e-06, "loss": 1.8418, "step": 874 }, { "batch_num_effect_tokens": 2368, "batch_num_samples": 46, "batch_num_tokens": 16380, "epoch": 1.21149, "grad_norm": 0.4941299855709076, "learning_rate": 4.03270348406648e-06, "loss": 1.7603, "step": 875 }, { "batch_num_effect_tokens": 2095, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.21288, "grad_norm": 0.517447829246521, "learning_rate": 4.0208424190593325e-06, "loss": 1.6821, "step": 876 }, { "batch_num_effect_tokens": 2221, "batch_num_samples": 34, "batch_num_tokens": 16372, "epoch": 1.21426, "grad_norm": 0.4635030925273895, "learning_rate": 4.008987081139734e-06, "loss": 1.7026, "step": 877 }, { "batch_num_effect_tokens": 2497, "batch_num_samples": 48, "batch_num_tokens": 16383, "epoch": 1.21565, "grad_norm": 0.4337514042854309, "learning_rate": 3.9971375396494965e-06, "loss": 1.6475, "step": 878 }, { "batch_num_effect_tokens": 2245, "batch_num_samples": 29, "batch_num_tokens": 16383, "epoch": 1.21703, "grad_norm": 0.4064190089702606, "learning_rate": 3.985293863896525e-06, "loss": 1.6855, "step": 879 }, { "batch_num_effect_tokens": 2434, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.21841, "grad_norm": 0.4570545256137848, "learning_rate": 3.973456123154415e-06, "loss": 1.8125, "step": 880 }, { "batch_num_effect_tokens": 2731, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.2198, "grad_norm": 0.3595949113368988, "learning_rate": 3.961624386662053e-06, "loss": 1.7915, "step": 881 }, { "batch_num_effect_tokens": 1948, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.22118, "grad_norm": 0.3988320827484131, "learning_rate": 3.949798723623201e-06, "loss": 1.5713, "step": 882 }, { "batch_num_effect_tokens": 2238, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.22257, "grad_norm": 0.38882866501808167, "learning_rate": 3.937979203206103e-06, "loss": 1.6694, "step": 883 }, { "batch_num_effect_tokens": 2229, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.22395, "grad_norm": 0.3313361704349518, "learning_rate": 3.9261658945430675e-06, "loss": 1.439, "step": 884 }, { "batch_num_effect_tokens": 2268, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.22534, "grad_norm": 0.33022063970565796, "learning_rate": 3.9143588667300795e-06, "loss": 1.2952, "step": 885 }, { "batch_num_effect_tokens": 2511, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.22672, "grad_norm": 0.3449042737483978, "learning_rate": 3.90255818882638e-06, "loss": 1.6025, "step": 886 }, { "batch_num_effect_tokens": 2489, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.22811, "grad_norm": 0.4506860375404358, "learning_rate": 3.890763929854071e-06, "loss": 1.7852, "step": 887 }, { "batch_num_effect_tokens": 2424, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.22949, "grad_norm": 0.46795007586479187, "learning_rate": 3.878976158797715e-06, "loss": 1.8376, "step": 888 }, { "batch_num_effect_tokens": 4237, "batch_num_samples": 42, "batch_num_tokens": 16318, "epoch": 1.23088, "grad_norm": 0.2751588821411133, "learning_rate": 3.86719494460392e-06, "loss": 1.7563, "step": 889 }, { "batch_num_effect_tokens": 1934, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.23226, "grad_norm": 0.4576158821582794, "learning_rate": 3.8554203561809475e-06, "loss": 1.6025, "step": 890 }, { "batch_num_effect_tokens": 2929, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.23364, "grad_norm": 0.4638974666595459, "learning_rate": 3.843652462398303e-06, "loss": 1.8354, "step": 891 }, { "batch_num_effect_tokens": 2298, "batch_num_samples": 36, "batch_num_tokens": 16383, "epoch": 1.23503, "grad_norm": 0.48894640803337097, "learning_rate": 3.8318913320863355e-06, "loss": 1.9224, "step": 892 }, { "batch_num_effect_tokens": 2357, "batch_num_samples": 28, "batch_num_tokens": 16382, "epoch": 1.23641, "grad_norm": 0.46466749906539917, "learning_rate": 3.820137034035835e-06, "loss": 1.6339, "step": 893 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 1.2378, "grad_norm": 0.4093346893787384, "learning_rate": 3.808389636997626e-06, "loss": 1.6816, "step": 894 }, { "batch_num_effect_tokens": 2677, "batch_num_samples": 30, "batch_num_tokens": 16380, "epoch": 1.23918, "grad_norm": 0.44297659397125244, "learning_rate": 3.7966492096821773e-06, "loss": 1.834, "step": 895 }, { "batch_num_effect_tokens": 3041, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 1.24057, "grad_norm": 0.2949908971786499, "learning_rate": 3.7849158207591806e-06, "loss": 1.5688, "step": 896 }, { "batch_num_effect_tokens": 2183, "batch_num_samples": 30, "batch_num_tokens": 16373, "epoch": 1.24195, "grad_norm": 0.45977649092674255, "learning_rate": 3.7731895388571725e-06, "loss": 1.4233, "step": 897 }, { "batch_num_effect_tokens": 2341, "batch_num_samples": 28, "batch_num_tokens": 16355, "epoch": 1.24334, "grad_norm": 0.4384441077709198, "learning_rate": 3.761470432563109e-06, "loss": 1.8403, "step": 898 }, { "batch_num_effect_tokens": 2153, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.24472, "grad_norm": 0.3226861357688904, "learning_rate": 3.7497585704219845e-06, "loss": 1.3684, "step": 899 }, { "batch_num_effect_tokens": 2351, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.24611, "grad_norm": 0.4933060109615326, "learning_rate": 3.738054020936418e-06, "loss": 1.5859, "step": 900 }, { "batch_num_effect_tokens": 3344, "batch_num_samples": 49, "batch_num_tokens": 16351, "epoch": 1.24749, "grad_norm": 0.3795660436153412, "learning_rate": 3.7263568525662574e-06, "loss": 1.5332, "step": 901 }, { "batch_num_effect_tokens": 2514, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.24888, "grad_norm": 0.41072386503219604, "learning_rate": 3.71466713372818e-06, "loss": 1.9399, "step": 902 }, { "batch_num_effect_tokens": 2848, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.25026, "grad_norm": 0.3781811594963074, "learning_rate": 3.7029849327952897e-06, "loss": 1.4692, "step": 903 }, { "batch_num_effect_tokens": 2898, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 1.25164, "grad_norm": 0.40088632702827454, "learning_rate": 3.691310318096719e-06, "loss": 1.6533, "step": 904 }, { "batch_num_effect_tokens": 2033, "batch_num_samples": 29, "batch_num_tokens": 16330, "epoch": 1.25303, "grad_norm": 0.4224783480167389, "learning_rate": 3.6796433579172265e-06, "loss": 1.2573, "step": 905 }, { "batch_num_effect_tokens": 2024, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.25441, "grad_norm": 0.44476646184921265, "learning_rate": 3.6679841204968025e-06, "loss": 1.5322, "step": 906 }, { "batch_num_effect_tokens": 2547, "batch_num_samples": 40, "batch_num_tokens": 16266, "epoch": 1.2558, "grad_norm": 0.4267597496509552, "learning_rate": 3.6563326740302664e-06, "loss": 1.5366, "step": 907 }, { "batch_num_effect_tokens": 3138, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.25718, "grad_norm": 0.26509785652160645, "learning_rate": 3.6446890866668627e-06, "loss": 1.1592, "step": 908 }, { "batch_num_effect_tokens": 2415, "batch_num_samples": 35, "batch_num_tokens": 16378, "epoch": 1.25857, "grad_norm": 0.5130552649497986, "learning_rate": 3.6330534265098793e-06, "loss": 1.9756, "step": 909 }, { "batch_num_effect_tokens": 2508, "batch_num_samples": 37, "batch_num_tokens": 16306, "epoch": 1.25995, "grad_norm": 0.30271703004837036, "learning_rate": 3.621425761616224e-06, "loss": 1.3228, "step": 910 }, { "batch_num_effect_tokens": 2481, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.26134, "grad_norm": 0.4425184428691864, "learning_rate": 3.609806159996056e-06, "loss": 1.6492, "step": 911 }, { "batch_num_effect_tokens": 2135, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.26272, "grad_norm": 0.4634925127029419, "learning_rate": 3.5981946896123576e-06, "loss": 1.7944, "step": 912 }, { "batch_num_effect_tokens": 4399, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.26411, "grad_norm": 0.27736717462539673, "learning_rate": 3.5865914183805606e-06, "loss": 1.8379, "step": 913 }, { "batch_num_effect_tokens": 2185, "batch_num_samples": 29, "batch_num_tokens": 16335, "epoch": 1.26549, "grad_norm": 0.41590017080307007, "learning_rate": 3.574996414168137e-06, "loss": 1.6846, "step": 914 }, { "batch_num_effect_tokens": 2571, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.26687, "grad_norm": 0.4418192207813263, "learning_rate": 3.563409744794207e-06, "loss": 1.5161, "step": 915 }, { "batch_num_effect_tokens": 2479, "batch_num_samples": 33, "batch_num_tokens": 16288, "epoch": 1.26826, "grad_norm": 0.4411601126194, "learning_rate": 3.5518314780291384e-06, "loss": 1.9434, "step": 916 }, { "batch_num_effect_tokens": 3201, "batch_num_samples": 63, "batch_num_tokens": 16288, "epoch": 1.26964, "grad_norm": 0.3934594392776489, "learning_rate": 3.5402616815941504e-06, "loss": 1.5591, "step": 917 }, { "batch_num_effect_tokens": 2171, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.27103, "grad_norm": 0.4277961254119873, "learning_rate": 3.5287004231609245e-06, "loss": 1.6895, "step": 918 }, { "batch_num_effect_tokens": 2899, "batch_num_samples": 38, "batch_num_tokens": 16312, "epoch": 1.27241, "grad_norm": 0.33964699506759644, "learning_rate": 3.517147770351199e-06, "loss": 1.4412, "step": 919 }, { "batch_num_effect_tokens": 2817, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.2738, "grad_norm": 0.42467910051345825, "learning_rate": 3.505603790736381e-06, "loss": 1.6663, "step": 920 }, { "batch_num_effect_tokens": 2456, "batch_num_samples": 33, "batch_num_tokens": 16274, "epoch": 1.27518, "grad_norm": 0.4011613726615906, "learning_rate": 3.4940685518371444e-06, "loss": 1.5254, "step": 921 }, { "batch_num_effect_tokens": 2525, "batch_num_samples": 28, "batch_num_tokens": 16304, "epoch": 1.27657, "grad_norm": 0.3130694329738617, "learning_rate": 3.4825421211230437e-06, "loss": 1.7017, "step": 922 }, { "batch_num_effect_tokens": 2062, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.27795, "grad_norm": 0.38040339946746826, "learning_rate": 3.4710245660121107e-06, "loss": 1.4214, "step": 923 }, { "batch_num_effect_tokens": 2658, "batch_num_samples": 40, "batch_num_tokens": 16382, "epoch": 1.27934, "grad_norm": 0.42727428674697876, "learning_rate": 3.4595159538704613e-06, "loss": 1.854, "step": 924 }, { "batch_num_effect_tokens": 2540, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.28072, "grad_norm": 0.44577351212501526, "learning_rate": 3.448016352011914e-06, "loss": 1.9058, "step": 925 }, { "batch_num_effect_tokens": 2517, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.2821, "grad_norm": 0.36449262499809265, "learning_rate": 3.4365258276975734e-06, "loss": 1.2725, "step": 926 }, { "batch_num_effect_tokens": 2260, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.28349, "grad_norm": 0.38644781708717346, "learning_rate": 3.42504444813546e-06, "loss": 1.7329, "step": 927 }, { "batch_num_effect_tokens": 4350, "batch_num_samples": 43, "batch_num_tokens": 16312, "epoch": 1.28487, "grad_norm": 0.36335328221321106, "learning_rate": 3.4135722804801004e-06, "loss": 1.3799, "step": 928 }, { "batch_num_effect_tokens": 2296, "batch_num_samples": 33, "batch_num_tokens": 16311, "epoch": 1.28626, "grad_norm": 0.49158135056495667, "learning_rate": 3.4021093918321445e-06, "loss": 1.3491, "step": 929 }, { "batch_num_effect_tokens": 2640, "batch_num_samples": 39, "batch_num_tokens": 16292, "epoch": 1.28764, "grad_norm": 0.28753662109375, "learning_rate": 3.390655849237967e-06, "loss": 1.4658, "step": 930 }, { "batch_num_effect_tokens": 2350, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.28903, "grad_norm": 0.45340996980667114, "learning_rate": 3.379211719689278e-06, "loss": 1.7046, "step": 931 }, { "batch_num_effect_tokens": 2317, "batch_num_samples": 30, "batch_num_tokens": 16381, "epoch": 1.29041, "grad_norm": 0.4619886875152588, "learning_rate": 3.367777070122733e-06, "loss": 1.4995, "step": 932 }, { "batch_num_effect_tokens": 2233, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.2918, "grad_norm": 0.48352497816085815, "learning_rate": 3.356351967419535e-06, "loss": 1.4849, "step": 933 }, { "batch_num_effect_tokens": 2996, "batch_num_samples": 49, "batch_num_tokens": 16384, "epoch": 1.29318, "grad_norm": 0.3254553973674774, "learning_rate": 3.3449364784050515e-06, "loss": 1.6035, "step": 934 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 29, "batch_num_tokens": 16278, "epoch": 1.29457, "grad_norm": 0.46663203835487366, "learning_rate": 3.333530669848416e-06, "loss": 1.6562, "step": 935 }, { "batch_num_effect_tokens": 2735, "batch_num_samples": 49, "batch_num_tokens": 16349, "epoch": 1.29595, "grad_norm": 0.4088381230831146, "learning_rate": 3.3221346084621447e-06, "loss": 1.7729, "step": 936 }, { "batch_num_effect_tokens": 1922, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.29733, "grad_norm": 0.48249420523643494, "learning_rate": 3.310748360901741e-06, "loss": 1.7646, "step": 937 }, { "batch_num_effect_tokens": 2454, "batch_num_samples": 36, "batch_num_tokens": 16373, "epoch": 1.29872, "grad_norm": 0.5235016942024231, "learning_rate": 3.2993719937653037e-06, "loss": 1.9146, "step": 938 }, { "batch_num_effect_tokens": 1749, "batch_num_samples": 29, "batch_num_tokens": 16330, "epoch": 1.3001, "grad_norm": 0.2954041063785553, "learning_rate": 3.28800557359315e-06, "loss": 1.5688, "step": 939 }, { "batch_num_effect_tokens": 3079, "batch_num_samples": 29, "batch_num_tokens": 16350, "epoch": 1.30149, "grad_norm": 0.37800875306129456, "learning_rate": 3.276649166867406e-06, "loss": 1.6436, "step": 940 }, { "batch_num_effect_tokens": 2646, "batch_num_samples": 34, "batch_num_tokens": 16292, "epoch": 1.30287, "grad_norm": 0.4555777311325073, "learning_rate": 3.2653028400116395e-06, "loss": 1.8889, "step": 941 }, { "batch_num_effect_tokens": 2317, "batch_num_samples": 29, "batch_num_tokens": 16369, "epoch": 1.30426, "grad_norm": 0.4693097770214081, "learning_rate": 3.2539666593904534e-06, "loss": 1.4238, "step": 942 }, { "batch_num_effect_tokens": 2760, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.30564, "grad_norm": 0.40017247200012207, "learning_rate": 3.242640691309111e-06, "loss": 1.6177, "step": 943 }, { "batch_num_effect_tokens": 2670, "batch_num_samples": 38, "batch_num_tokens": 16341, "epoch": 1.30703, "grad_norm": 0.41712284088134766, "learning_rate": 3.23132500201314e-06, "loss": 2.0698, "step": 944 }, { "batch_num_effect_tokens": 2732, "batch_num_samples": 29, "batch_num_tokens": 16317, "epoch": 1.30841, "grad_norm": 0.371325820684433, "learning_rate": 3.2200196576879463e-06, "loss": 1.4053, "step": 945 }, { "batch_num_effect_tokens": 2576, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 1.3098, "grad_norm": 0.4886104166507721, "learning_rate": 3.20872472445843e-06, "loss": 1.7012, "step": 946 }, { "batch_num_effect_tokens": 2263, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.31118, "grad_norm": 0.27055272459983826, "learning_rate": 3.1974402683885963e-06, "loss": 1.4731, "step": 947 }, { "batch_num_effect_tokens": 2657, "batch_num_samples": 42, "batch_num_tokens": 16384, "epoch": 1.31256, "grad_norm": 0.32862618565559387, "learning_rate": 3.1861663554811707e-06, "loss": 1.5603, "step": 948 }, { "batch_num_effect_tokens": 1970, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.31395, "grad_norm": 0.4817477762699127, "learning_rate": 3.1749030516772084e-06, "loss": 1.6909, "step": 949 }, { "batch_num_effect_tokens": 2995, "batch_num_samples": 28, "batch_num_tokens": 16337, "epoch": 1.31533, "grad_norm": 0.3697652518749237, "learning_rate": 3.163650422855717e-06, "loss": 1.1365, "step": 950 }, { "batch_num_effect_tokens": 2831, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.31672, "grad_norm": 0.4033261239528656, "learning_rate": 3.1524085348332622e-06, "loss": 1.6616, "step": 951 }, { "batch_num_effect_tokens": 2539, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.3181, "grad_norm": 0.43878668546676636, "learning_rate": 3.1411774533635854e-06, "loss": 1.9097, "step": 952 }, { "batch_num_effect_tokens": 2397, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.31949, "grad_norm": 0.31448036432266235, "learning_rate": 3.1299572441372274e-06, "loss": 1.4751, "step": 953 }, { "batch_num_effect_tokens": 2204, "batch_num_samples": 34, "batch_num_tokens": 16291, "epoch": 1.32087, "grad_norm": 0.46613040566444397, "learning_rate": 3.11874797278113e-06, "loss": 1.7314, "step": 954 }, { "batch_num_effect_tokens": 2840, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.32226, "grad_norm": 0.3567422032356262, "learning_rate": 3.1075497048582635e-06, "loss": 1.7329, "step": 955 }, { "batch_num_effect_tokens": 2337, "batch_num_samples": 32, "batch_num_tokens": 16340, "epoch": 1.32364, "grad_norm": 0.4468025267124176, "learning_rate": 3.0963625058672384e-06, "loss": 1.7456, "step": 956 }, { "batch_num_effect_tokens": 2712, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.32503, "grad_norm": 0.3327140510082245, "learning_rate": 3.0851864412419236e-06, "loss": 1.5381, "step": 957 }, { "batch_num_effect_tokens": 1771, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.32641, "grad_norm": 0.44815269112586975, "learning_rate": 3.0740215763510617e-06, "loss": 1.5186, "step": 958 }, { "batch_num_effect_tokens": 2187, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.3278, "grad_norm": 0.5008686184883118, "learning_rate": 3.0628679764978875e-06, "loss": 1.5684, "step": 959 }, { "batch_num_effect_tokens": 3097, "batch_num_samples": 29, "batch_num_tokens": 16373, "epoch": 1.32918, "grad_norm": 0.46639010310173035, "learning_rate": 3.0517257069197497e-06, "loss": 1.6914, "step": 960 }, { "batch_num_effect_tokens": 2445, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.33056, "grad_norm": 0.5597130060195923, "learning_rate": 3.0405948327877233e-06, "loss": 1.7217, "step": 961 }, { "batch_num_effect_tokens": 2684, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.33195, "grad_norm": 0.31342458724975586, "learning_rate": 3.0294754192062346e-06, "loss": 1.6523, "step": 962 }, { "batch_num_effect_tokens": 2502, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.33333, "grad_norm": 0.31468817591667175, "learning_rate": 3.0183675312126737e-06, "loss": 1.6855, "step": 963 }, { "batch_num_effect_tokens": 2913, "batch_num_samples": 38, "batch_num_tokens": 16370, "epoch": 1.33472, "grad_norm": 0.4313669502735138, "learning_rate": 3.00727123377702e-06, "loss": 1.6729, "step": 964 }, { "batch_num_effect_tokens": 1997, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.3361, "grad_norm": 0.469485342502594, "learning_rate": 2.9961865918014575e-06, "loss": 1.5527, "step": 965 }, { "batch_num_effect_tokens": 3530, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.33749, "grad_norm": 0.43584564328193665, "learning_rate": 2.985113670120001e-06, "loss": 1.9014, "step": 966 }, { "batch_num_effect_tokens": 2677, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.33887, "grad_norm": 0.38530248403549194, "learning_rate": 2.9740525334981105e-06, "loss": 1.5991, "step": 967 }, { "batch_num_effect_tokens": 2030, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.34026, "grad_norm": 0.32546916604042053, "learning_rate": 2.963003246632315e-06, "loss": 1.5054, "step": 968 }, { "batch_num_effect_tokens": 2545, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.34164, "grad_norm": 0.49246978759765625, "learning_rate": 2.951965874149837e-06, "loss": 1.8369, "step": 969 }, { "batch_num_effect_tokens": 2502, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 1.34303, "grad_norm": 0.4359091818332672, "learning_rate": 2.9409404806082077e-06, "loss": 1.8379, "step": 970 }, { "batch_num_effect_tokens": 2508, "batch_num_samples": 33, "batch_num_tokens": 16380, "epoch": 1.34441, "grad_norm": 0.42566049098968506, "learning_rate": 2.9299271304948985e-06, "loss": 1.6943, "step": 971 }, { "batch_num_effect_tokens": 2405, "batch_num_samples": 34, "batch_num_tokens": 16294, "epoch": 1.34579, "grad_norm": 0.4611959755420685, "learning_rate": 2.918925888226935e-06, "loss": 1.3047, "step": 972 }, { "batch_num_effect_tokens": 2247, "batch_num_samples": 29, "batch_num_tokens": 16383, "epoch": 1.34718, "grad_norm": 0.4036383628845215, "learning_rate": 2.9079368181505263e-06, "loss": 1.6792, "step": 973 }, { "batch_num_effect_tokens": 2028, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.34856, "grad_norm": 0.4465937912464142, "learning_rate": 2.896959984540687e-06, "loss": 1.3398, "step": 974 }, { "batch_num_effect_tokens": 2837, "batch_num_samples": 29, "batch_num_tokens": 16298, "epoch": 1.34995, "grad_norm": 0.44683992862701416, "learning_rate": 2.885995451600855e-06, "loss": 1.8164, "step": 975 }, { "batch_num_effect_tokens": 1921, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.35133, "grad_norm": 0.502355694770813, "learning_rate": 2.8750432834625312e-06, "loss": 1.6018, "step": 976 }, { "batch_num_effect_tokens": 3036, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.35272, "grad_norm": 0.298296719789505, "learning_rate": 2.864103544184885e-06, "loss": 1.6064, "step": 977 }, { "batch_num_effect_tokens": 2300, "batch_num_samples": 29, "batch_num_tokens": 16380, "epoch": 1.3541, "grad_norm": 0.30645519495010376, "learning_rate": 2.8531762977543954e-06, "loss": 1.251, "step": 978 }, { "batch_num_effect_tokens": 1883, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.35549, "grad_norm": 0.3690178096294403, "learning_rate": 2.84226160808447e-06, "loss": 1.4941, "step": 979 }, { "batch_num_effect_tokens": 2373, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 1.35687, "grad_norm": 0.38422366976737976, "learning_rate": 2.831359539015073e-06, "loss": 1.5337, "step": 980 }, { "batch_num_effect_tokens": 2357, "batch_num_samples": 33, "batch_num_tokens": 16301, "epoch": 1.35826, "grad_norm": 0.3881143629550934, "learning_rate": 2.820470154312346e-06, "loss": 1.4199, "step": 981 }, { "batch_num_effect_tokens": 1939, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 1.35964, "grad_norm": 0.32559847831726074, "learning_rate": 2.809593517668243e-06, "loss": 1.4736, "step": 982 }, { "batch_num_effect_tokens": 1956, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.36102, "grad_norm": 0.33476418256759644, "learning_rate": 2.7987296927001597e-06, "loss": 1.616, "step": 983 }, { "batch_num_effect_tokens": 2839, "batch_num_samples": 29, "batch_num_tokens": 16330, "epoch": 1.36241, "grad_norm": 0.39361515641212463, "learning_rate": 2.7878787429505444e-06, "loss": 3.6904, "step": 984 }, { "batch_num_effect_tokens": 2240, "batch_num_samples": 30, "batch_num_tokens": 16283, "epoch": 1.36379, "grad_norm": 0.5338387489318848, "learning_rate": 2.777040731886549e-06, "loss": 1.8589, "step": 985 }, { "batch_num_effect_tokens": 2148, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.36518, "grad_norm": 0.4984688460826874, "learning_rate": 2.766215722899642e-06, "loss": 1.8804, "step": 986 }, { "batch_num_effect_tokens": 2233, "batch_num_samples": 33, "batch_num_tokens": 16334, "epoch": 1.36656, "grad_norm": 0.2897249758243561, "learning_rate": 2.7554037793052476e-06, "loss": 1.3452, "step": 987 }, { "batch_num_effect_tokens": 2506, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.36795, "grad_norm": 0.4737148582935333, "learning_rate": 2.744604964342364e-06, "loss": 1.7588, "step": 988 }, { "batch_num_effect_tokens": 2513, "batch_num_samples": 29, "batch_num_tokens": 16318, "epoch": 1.36933, "grad_norm": 0.5092816352844238, "learning_rate": 2.733819341173202e-06, "loss": 1.8828, "step": 989 }, { "batch_num_effect_tokens": 2725, "batch_num_samples": 31, "batch_num_tokens": 16205, "epoch": 1.37072, "grad_norm": 0.40438881516456604, "learning_rate": 2.7230469728828156e-06, "loss": 1.6777, "step": 990 }, { "batch_num_effect_tokens": 2506, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.3721, "grad_norm": 0.47437766194343567, "learning_rate": 2.7122879224787315e-06, "loss": 1.6055, "step": 991 }, { "batch_num_effect_tokens": 2504, "batch_num_samples": 34, "batch_num_tokens": 16307, "epoch": 1.37349, "grad_norm": 0.42637789249420166, "learning_rate": 2.701542252890581e-06, "loss": 1.5342, "step": 992 }, { "batch_num_effect_tokens": 1851, "batch_num_samples": 29, "batch_num_tokens": 16375, "epoch": 1.37487, "grad_norm": 0.5074117183685303, "learning_rate": 2.690810026969725e-06, "loss": 1.3657, "step": 993 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.37625, "grad_norm": 0.3262867033481598, "learning_rate": 2.6800913074888984e-06, "loss": 1.8843, "step": 994 }, { "batch_num_effect_tokens": 2358, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.37764, "grad_norm": 0.44876888394355774, "learning_rate": 2.6693861571418372e-06, "loss": 1.9443, "step": 995 }, { "batch_num_effect_tokens": 2319, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.37902, "grad_norm": 0.2832389175891876, "learning_rate": 2.6586946385429056e-06, "loss": 1.6992, "step": 996 }, { "batch_num_effect_tokens": 3153, "batch_num_samples": 39, "batch_num_tokens": 16352, "epoch": 1.38041, "grad_norm": 0.4189690351486206, "learning_rate": 2.648016814226742e-06, "loss": 1.7036, "step": 997 }, { "batch_num_effect_tokens": 2805, "batch_num_samples": 34, "batch_num_tokens": 16352, "epoch": 1.38179, "grad_norm": 0.4060484766960144, "learning_rate": 2.6373527466478843e-06, "loss": 1.7734, "step": 998 }, { "batch_num_effect_tokens": 2801, "batch_num_samples": 36, "batch_num_tokens": 16346, "epoch": 1.38318, "grad_norm": 0.8762073516845703, "learning_rate": 2.62670249818041e-06, "loss": 1.7598, "step": 999 }, { "batch_num_effect_tokens": 1888, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.38456, "grad_norm": 0.581211507320404, "learning_rate": 2.616066131117563e-06, "loss": 1.54, "step": 1000 }, { "batch_num_effect_tokens": 2721, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.38595, "grad_norm": 0.42715200781822205, "learning_rate": 2.6054437076713997e-06, "loss": 1.2588, "step": 1001 }, { "batch_num_effect_tokens": 2473, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.38733, "grad_norm": 0.33215662837028503, "learning_rate": 2.5948352899724206e-06, "loss": 1.7622, "step": 1002 }, { "batch_num_effect_tokens": 2560, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.38872, "grad_norm": 0.4188079237937927, "learning_rate": 2.5842409400692026e-06, "loss": 1.8574, "step": 1003 }, { "batch_num_effect_tokens": 2545, "batch_num_samples": 35, "batch_num_tokens": 16304, "epoch": 1.3901, "grad_norm": 0.46113309264183044, "learning_rate": 2.5736607199280457e-06, "loss": 1.7102, "step": 1004 }, { "batch_num_effect_tokens": 2578, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.39148, "grad_norm": 0.41554850339889526, "learning_rate": 2.5630946914325983e-06, "loss": 1.8774, "step": 1005 }, { "batch_num_effect_tokens": 3533, "batch_num_samples": 42, "batch_num_tokens": 16381, "epoch": 1.39287, "grad_norm": 0.4100356101989746, "learning_rate": 2.552542916383507e-06, "loss": 1.8496, "step": 1006 }, { "batch_num_effect_tokens": 1858, "batch_num_samples": 28, "batch_num_tokens": 16337, "epoch": 1.39425, "grad_norm": 0.48907506465911865, "learning_rate": 2.5420054564980497e-06, "loss": 1.3267, "step": 1007 }, { "batch_num_effect_tokens": 2532, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.39564, "grad_norm": 0.3441801965236664, "learning_rate": 2.5314823734097748e-06, "loss": 1.6035, "step": 1008 }, { "batch_num_effect_tokens": 3019, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.39702, "grad_norm": 0.395893394947052, "learning_rate": 2.5209737286681367e-06, "loss": 1.8779, "step": 1009 }, { "batch_num_effect_tokens": 2663, "batch_num_samples": 28, "batch_num_tokens": 16382, "epoch": 1.39841, "grad_norm": 0.35219889879226685, "learning_rate": 2.5104795837381457e-06, "loss": 1.52, "step": 1010 }, { "batch_num_effect_tokens": 2892, "batch_num_samples": 33, "batch_num_tokens": 16279, "epoch": 1.39979, "grad_norm": 0.3056180775165558, "learning_rate": 2.5000000000000015e-06, "loss": 1.1934, "step": 1011 }, { "batch_num_effect_tokens": 2285, "batch_num_samples": 29, "batch_num_tokens": 16343, "epoch": 1.40118, "grad_norm": 0.4565596878528595, "learning_rate": 2.4895350387487304e-06, "loss": 1.4209, "step": 1012 }, { "batch_num_effect_tokens": 2642, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.40256, "grad_norm": 0.32005468010902405, "learning_rate": 2.479084761193839e-06, "loss": 1.7222, "step": 1013 }, { "batch_num_effect_tokens": 2717, "batch_num_samples": 33, "batch_num_tokens": 16379, "epoch": 1.40395, "grad_norm": 0.4048800766468048, "learning_rate": 2.4686492284589447e-06, "loss": 1.7456, "step": 1014 }, { "batch_num_effect_tokens": 2386, "batch_num_samples": 34, "batch_num_tokens": 16317, "epoch": 1.40533, "grad_norm": 0.4155402183532715, "learning_rate": 2.4582285015814263e-06, "loss": 1.4043, "step": 1015 }, { "batch_num_effect_tokens": 2386, "batch_num_samples": 34, "batch_num_tokens": 16317, "epoch": 1.40533, "eval_eval_loss": 0.438742458820343, "eval_eval_runtime": 128.1493, "eval_eval_samples_per_second": 37.831, "eval_eval_steps_per_second": 2.364, "step": 1015 }, { "batch_num_effect_tokens": 2371, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.40672, "grad_norm": 0.31526005268096924, "learning_rate": 2.447822641512058e-06, "loss": 1.394, "step": 1016 }, { "batch_num_effect_tokens": 2470, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.4081, "grad_norm": 0.3880106508731842, "learning_rate": 2.4374317091146593e-06, "loss": 1.7803, "step": 1017 }, { "batch_num_effect_tokens": 2068, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.40948, "grad_norm": 0.46882250905036926, "learning_rate": 2.427055765165741e-06, "loss": 1.3398, "step": 1018 }, { "batch_num_effect_tokens": 2398, "batch_num_samples": 31, "batch_num_tokens": 16376, "epoch": 1.41087, "grad_norm": 0.4342593252658844, "learning_rate": 2.416694870354145e-06, "loss": 1.0212, "step": 1019 }, { "batch_num_effect_tokens": 2049, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.41225, "grad_norm": 0.5095953345298767, "learning_rate": 2.406349085280692e-06, "loss": 1.54, "step": 1020 }, { "batch_num_effect_tokens": 2893, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.41364, "grad_norm": 0.41362911462783813, "learning_rate": 2.396018470457821e-06, "loss": 1.8291, "step": 1021 }, { "batch_num_effect_tokens": 3001, "batch_num_samples": 59, "batch_num_tokens": 16280, "epoch": 1.41502, "grad_norm": 0.3915373384952545, "learning_rate": 2.385703086309247e-06, "loss": 1.6416, "step": 1022 }, { "batch_num_effect_tokens": 2828, "batch_num_samples": 40, "batch_num_tokens": 16382, "epoch": 1.41641, "grad_norm": 0.4672578275203705, "learning_rate": 2.3754029931695954e-06, "loss": 1.5889, "step": 1023 }, { "batch_num_effect_tokens": 2290, "batch_num_samples": 29, "batch_num_tokens": 16379, "epoch": 1.41779, "grad_norm": 0.5043079853057861, "learning_rate": 2.3651182512840604e-06, "loss": 1.5244, "step": 1024 }, { "batch_num_effect_tokens": 2472, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.41918, "grad_norm": 0.33650776743888855, "learning_rate": 2.3548489208080392e-06, "loss": 1.3931, "step": 1025 }, { "batch_num_effect_tokens": 2058, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.42056, "grad_norm": 0.443490594625473, "learning_rate": 2.3445950618067935e-06, "loss": 1.769, "step": 1026 }, { "batch_num_effect_tokens": 2052, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.42195, "grad_norm": 0.4616880416870117, "learning_rate": 2.3343567342550933e-06, "loss": 1.75, "step": 1027 }, { "batch_num_effect_tokens": 2578, "batch_num_samples": 47, "batch_num_tokens": 16343, "epoch": 1.42333, "grad_norm": 0.3548419177532196, "learning_rate": 2.3241339980368584e-06, "loss": 1.4885, "step": 1028 }, { "batch_num_effect_tokens": 2476, "batch_num_samples": 45, "batch_num_tokens": 16369, "epoch": 1.42471, "grad_norm": 0.435629665851593, "learning_rate": 2.313926912944821e-06, "loss": 1.6655, "step": 1029 }, { "batch_num_effect_tokens": 2496, "batch_num_samples": 30, "batch_num_tokens": 16292, "epoch": 1.4261, "grad_norm": 0.30846139788627625, "learning_rate": 2.3037355386801683e-06, "loss": 1.1548, "step": 1030 }, { "batch_num_effect_tokens": 2804, "batch_num_samples": 50, "batch_num_tokens": 16384, "epoch": 1.42748, "grad_norm": 0.38507959246635437, "learning_rate": 2.2935599348521974e-06, "loss": 1.6316, "step": 1031 }, { "batch_num_effect_tokens": 2952, "batch_num_samples": 31, "batch_num_tokens": 16351, "epoch": 1.42887, "grad_norm": 0.4047963321208954, "learning_rate": 2.283400160977959e-06, "loss": 1.3071, "step": 1032 }, { "batch_num_effect_tokens": 2803, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.43025, "grad_norm": 0.3774239718914032, "learning_rate": 2.2732562764819157e-06, "loss": 1.5283, "step": 1033 }, { "batch_num_effect_tokens": 1949, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.43164, "grad_norm": 0.4033801257610321, "learning_rate": 2.263128340695596e-06, "loss": 1.1174, "step": 1034 }, { "batch_num_effect_tokens": 2690, "batch_num_samples": 34, "batch_num_tokens": 16292, "epoch": 1.43302, "grad_norm": 0.4714415967464447, "learning_rate": 2.253016412857244e-06, "loss": 1.7295, "step": 1035 }, { "batch_num_effect_tokens": 3327, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.43441, "grad_norm": 0.4453805685043335, "learning_rate": 2.242920552111473e-06, "loss": 1.875, "step": 1036 }, { "batch_num_effect_tokens": 2508, "batch_num_samples": 39, "batch_num_tokens": 16352, "epoch": 1.43579, "grad_norm": 0.3264619708061218, "learning_rate": 2.232840817508918e-06, "loss": 1.4797, "step": 1037 }, { "batch_num_effect_tokens": 2630, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.43718, "grad_norm": 0.35641780495643616, "learning_rate": 2.222777268005894e-06, "loss": 1.5007, "step": 1038 }, { "batch_num_effect_tokens": 2045, "batch_num_samples": 29, "batch_num_tokens": 16298, "epoch": 1.43856, "grad_norm": 0.4689180254936218, "learning_rate": 2.212729962464051e-06, "loss": 1.48, "step": 1039 }, { "batch_num_effect_tokens": 2730, "batch_num_samples": 42, "batch_num_tokens": 16346, "epoch": 1.43994, "grad_norm": 0.4001019597053528, "learning_rate": 2.202698959650023e-06, "loss": 1.479, "step": 1040 }, { "batch_num_effect_tokens": 2059, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.44133, "grad_norm": 0.29887670278549194, "learning_rate": 2.1926843182350955e-06, "loss": 1.5645, "step": 1041 }, { "batch_num_effect_tokens": 2038, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.44271, "grad_norm": 0.49215254187583923, "learning_rate": 2.182686096794852e-06, "loss": 1.7939, "step": 1042 }, { "batch_num_effect_tokens": 2387, "batch_num_samples": 36, "batch_num_tokens": 16382, "epoch": 1.4441, "grad_norm": 0.4596003592014313, "learning_rate": 2.1727043538088406e-06, "loss": 1.8623, "step": 1043 }, { "batch_num_effect_tokens": 1954, "batch_num_samples": 31, "batch_num_tokens": 16381, "epoch": 1.44548, "grad_norm": 0.3227980434894562, "learning_rate": 2.162739147660219e-06, "loss": 1.4358, "step": 1044 }, { "batch_num_effect_tokens": 2145, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.44687, "grad_norm": 0.5007283687591553, "learning_rate": 2.1527905366354292e-06, "loss": 1.7495, "step": 1045 }, { "batch_num_effect_tokens": 2287, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.44825, "grad_norm": 0.3085324168205261, "learning_rate": 2.1428585789238416e-06, "loss": 1.4463, "step": 1046 }, { "batch_num_effect_tokens": 2150, "batch_num_samples": 28, "batch_num_tokens": 16367, "epoch": 1.44964, "grad_norm": 0.4851103127002716, "learning_rate": 2.1329433326174265e-06, "loss": 1.6372, "step": 1047 }, { "batch_num_effect_tokens": 3101, "batch_num_samples": 42, "batch_num_tokens": 16360, "epoch": 1.45102, "grad_norm": 0.4148053824901581, "learning_rate": 2.1230448557104087e-06, "loss": 1.353, "step": 1048 }, { "batch_num_effect_tokens": 2228, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.45241, "grad_norm": 0.5459240674972534, "learning_rate": 2.1131632060989255e-06, "loss": 1.8594, "step": 1049 }, { "batch_num_effect_tokens": 2373, "batch_num_samples": 29, "batch_num_tokens": 16290, "epoch": 1.45379, "grad_norm": 0.4294782876968384, "learning_rate": 2.103298441580694e-06, "loss": 1.9277, "step": 1050 }, { "batch_num_effect_tokens": 2560, "batch_num_samples": 50, "batch_num_tokens": 16298, "epoch": 1.45517, "grad_norm": 0.45906487107276917, "learning_rate": 2.093450619854671e-06, "loss": 1.7617, "step": 1051 }, { "batch_num_effect_tokens": 2482, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.45656, "grad_norm": 0.4132348895072937, "learning_rate": 2.083619798520715e-06, "loss": 1.4282, "step": 1052 }, { "batch_num_effect_tokens": 2576, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.45794, "grad_norm": 0.48784327507019043, "learning_rate": 2.0738060350792454e-06, "loss": 1.9016, "step": 1053 }, { "batch_num_effect_tokens": 3266, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.45933, "grad_norm": 0.4079829454421997, "learning_rate": 2.064009386930915e-06, "loss": 1.6836, "step": 1054 }, { "batch_num_effect_tokens": 2040, "batch_num_samples": 34, "batch_num_tokens": 16367, "epoch": 1.46071, "grad_norm": 0.44097012281417847, "learning_rate": 2.054229911376269e-06, "loss": 1.6016, "step": 1055 }, { "batch_num_effect_tokens": 3585, "batch_num_samples": 42, "batch_num_tokens": 16383, "epoch": 1.4621, "grad_norm": 0.4330642819404602, "learning_rate": 2.0444676656154037e-06, "loss": 1.8677, "step": 1056 }, { "batch_num_effect_tokens": 2450, "batch_num_samples": 28, "batch_num_tokens": 16277, "epoch": 1.46348, "grad_norm": 0.4121383726596832, "learning_rate": 2.0347227067476478e-06, "loss": 1.3215, "step": 1057 }, { "batch_num_effect_tokens": 2131, "batch_num_samples": 30, "batch_num_tokens": 16364, "epoch": 1.46487, "grad_norm": 0.4646347165107727, "learning_rate": 2.024995091771212e-06, "loss": 1.7773, "step": 1058 }, { "batch_num_effect_tokens": 2559, "batch_num_samples": 31, "batch_num_tokens": 16354, "epoch": 1.46625, "grad_norm": 0.42456355690956116, "learning_rate": 2.015284877582868e-06, "loss": 1.186, "step": 1059 }, { "batch_num_effect_tokens": 1942, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.46764, "grad_norm": 0.48479658365249634, "learning_rate": 2.005592120977606e-06, "loss": 1.7349, "step": 1060 }, { "batch_num_effect_tokens": 2257, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.46902, "grad_norm": 0.4582153856754303, "learning_rate": 1.9959168786483074e-06, "loss": 1.791, "step": 1061 }, { "batch_num_effect_tokens": 2763, "batch_num_samples": 37, "batch_num_tokens": 16340, "epoch": 1.4704, "grad_norm": 0.30545473098754883, "learning_rate": 1.9862592071854137e-06, "loss": 1.4775, "step": 1062 }, { "batch_num_effect_tokens": 3333, "batch_num_samples": 41, "batch_num_tokens": 16343, "epoch": 1.47179, "grad_norm": 0.37118247151374817, "learning_rate": 1.9766191630765964e-06, "loss": 1.6533, "step": 1063 }, { "batch_num_effect_tokens": 2645, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.47317, "grad_norm": 0.40963587164878845, "learning_rate": 1.9669968027064234e-06, "loss": 1.5537, "step": 1064 }, { "batch_num_effect_tokens": 2410, "batch_num_samples": 28, "batch_num_tokens": 16336, "epoch": 1.47456, "grad_norm": 0.41919150948524475, "learning_rate": 1.9573921823560273e-06, "loss": 1.6309, "step": 1065 }, { "batch_num_effect_tokens": 2322, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.47594, "grad_norm": 0.42133572697639465, "learning_rate": 1.9478053582027826e-06, "loss": 1.2935, "step": 1066 }, { "batch_num_effect_tokens": 2465, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.47733, "grad_norm": 0.4778744876384735, "learning_rate": 1.9382363863199773e-06, "loss": 1.8535, "step": 1067 }, { "batch_num_effect_tokens": 2865, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.47871, "grad_norm": 0.36298638582229614, "learning_rate": 1.9286853226764725e-06, "loss": 1.8096, "step": 1068 }, { "batch_num_effect_tokens": 2950, "batch_num_samples": 32, "batch_num_tokens": 16332, "epoch": 1.4801, "grad_norm": 0.4348921477794647, "learning_rate": 1.919152223136391e-06, "loss": 1.708, "step": 1069 }, { "batch_num_effect_tokens": 2375, "batch_num_samples": 28, "batch_num_tokens": 16322, "epoch": 1.48148, "grad_norm": 0.4186846911907196, "learning_rate": 1.9096371434587836e-06, "loss": 1.4092, "step": 1070 }, { "batch_num_effect_tokens": 2688, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.48287, "grad_norm": 0.3714967966079712, "learning_rate": 1.9001401392973018e-06, "loss": 1.3628, "step": 1071 }, { "batch_num_effect_tokens": 2726, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.48425, "grad_norm": 0.2763594090938568, "learning_rate": 1.8906612661998698e-06, "loss": 1.3496, "step": 1072 }, { "batch_num_effect_tokens": 2838, "batch_num_samples": 43, "batch_num_tokens": 16384, "epoch": 1.48564, "grad_norm": 0.3681391179561615, "learning_rate": 1.88120057960837e-06, "loss": 2.0007, "step": 1073 }, { "batch_num_effect_tokens": 2685, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.48702, "grad_norm": 0.3195376694202423, "learning_rate": 1.8717581348583052e-06, "loss": 1.3599, "step": 1074 }, { "batch_num_effect_tokens": 2451, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.4884, "grad_norm": 0.45888882875442505, "learning_rate": 1.8623339871784869e-06, "loss": 1.7485, "step": 1075 }, { "batch_num_effect_tokens": 2559, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.48979, "grad_norm": 0.4065034091472626, "learning_rate": 1.852928191690707e-06, "loss": 1.9551, "step": 1076 }, { "batch_num_effect_tokens": 2582, "batch_num_samples": 29, "batch_num_tokens": 16326, "epoch": 1.49117, "grad_norm": 0.3899232745170593, "learning_rate": 1.8435408034094116e-06, "loss": 1.481, "step": 1077 }, { "batch_num_effect_tokens": 1905, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.49256, "grad_norm": 0.2967417538166046, "learning_rate": 1.8341718772413852e-06, "loss": 1.6135, "step": 1078 }, { "batch_num_effect_tokens": 2549, "batch_num_samples": 29, "batch_num_tokens": 16335, "epoch": 1.49394, "grad_norm": 0.4529731869697571, "learning_rate": 1.8248214679854298e-06, "loss": 1.7056, "step": 1079 }, { "batch_num_effect_tokens": 3105, "batch_num_samples": 44, "batch_num_tokens": 16328, "epoch": 1.49533, "grad_norm": 0.38344278931617737, "learning_rate": 1.8154896303320402e-06, "loss": 1.6309, "step": 1080 }, { "batch_num_effect_tokens": 2389, "batch_num_samples": 30, "batch_num_tokens": 16358, "epoch": 1.49671, "grad_norm": 0.45120224356651306, "learning_rate": 1.8061764188630831e-06, "loss": 1.7812, "step": 1081 }, { "batch_num_effect_tokens": 2128, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.4981, "grad_norm": 0.41508033871650696, "learning_rate": 1.7968818880514855e-06, "loss": 1.6602, "step": 1082 }, { "batch_num_effect_tokens": 2118, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.49948, "grad_norm": 0.42764511704444885, "learning_rate": 1.78760609226091e-06, "loss": 1.6582, "step": 1083 }, { "batch_num_effect_tokens": 2367, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.50087, "grad_norm": 0.42199867963790894, "learning_rate": 1.7783490857454354e-06, "loss": 1.7383, "step": 1084 }, { "batch_num_effect_tokens": 3069, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.50225, "grad_norm": 0.29916369915008545, "learning_rate": 1.7691109226492448e-06, "loss": 1.2139, "step": 1085 }, { "batch_num_effect_tokens": 2892, "batch_num_samples": 43, "batch_num_tokens": 16383, "epoch": 1.50363, "grad_norm": 0.3048282861709595, "learning_rate": 1.7598916570063064e-06, "loss": 1.6353, "step": 1086 }, { "batch_num_effect_tokens": 2025, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.50502, "grad_norm": 0.4442513883113861, "learning_rate": 1.750691342740058e-06, "loss": 1.7227, "step": 1087 }, { "batch_num_effect_tokens": 2216, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.5064, "grad_norm": 0.4501620829105377, "learning_rate": 1.7415100336630858e-06, "loss": 1.6396, "step": 1088 }, { "batch_num_effect_tokens": 2352, "batch_num_samples": 29, "batch_num_tokens": 16354, "epoch": 1.50779, "grad_norm": 0.4907625615596771, "learning_rate": 1.732347783476822e-06, "loss": 1.4497, "step": 1089 }, { "batch_num_effect_tokens": 3061, "batch_num_samples": 30, "batch_num_tokens": 16316, "epoch": 1.50917, "grad_norm": 0.3096473813056946, "learning_rate": 1.7232046457712164e-06, "loss": 1.4673, "step": 1090 }, { "batch_num_effect_tokens": 2480, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.51056, "grad_norm": 0.4376479685306549, "learning_rate": 1.7140806740244354e-06, "loss": 1.728, "step": 1091 }, { "batch_num_effect_tokens": 2923, "batch_num_samples": 36, "batch_num_tokens": 16319, "epoch": 1.51194, "grad_norm": 0.4076370894908905, "learning_rate": 1.704975921602543e-06, "loss": 1.8242, "step": 1092 }, { "batch_num_effect_tokens": 2473, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.51333, "grad_norm": 0.43306341767311096, "learning_rate": 1.6958904417591853e-06, "loss": 1.3647, "step": 1093 }, { "batch_num_effect_tokens": 2476, "batch_num_samples": 38, "batch_num_tokens": 16381, "epoch": 1.51471, "grad_norm": 0.47494807839393616, "learning_rate": 1.686824287635288e-06, "loss": 1.4351, "step": 1094 }, { "batch_num_effect_tokens": 2423, "batch_num_samples": 35, "batch_num_tokens": 16304, "epoch": 1.5161, "grad_norm": 0.49817413091659546, "learning_rate": 1.6777775122587387e-06, "loss": 1.854, "step": 1095 }, { "batch_num_effect_tokens": 2326, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.51748, "grad_norm": 0.48580047488212585, "learning_rate": 1.668750168544081e-06, "loss": 1.5005, "step": 1096 }, { "batch_num_effect_tokens": 2401, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.51886, "grad_norm": 0.4156758785247803, "learning_rate": 1.6597423092921972e-06, "loss": 1.7275, "step": 1097 }, { "batch_num_effect_tokens": 2010, "batch_num_samples": 29, "batch_num_tokens": 16291, "epoch": 1.52025, "grad_norm": 0.39950811862945557, "learning_rate": 1.6507539871900109e-06, "loss": 1.4976, "step": 1098 }, { "batch_num_effect_tokens": 2794, "batch_num_samples": 34, "batch_num_tokens": 16362, "epoch": 1.52163, "grad_norm": 0.47784772515296936, "learning_rate": 1.641785254810172e-06, "loss": 1.5435, "step": 1099 }, { "batch_num_effect_tokens": 1582, "batch_num_samples": 28, "batch_num_tokens": 16307, "epoch": 1.52302, "grad_norm": 0.30023419857025146, "learning_rate": 1.6328361646107465e-06, "loss": 1.543, "step": 1100 }, { "batch_num_effect_tokens": 3743, "batch_num_samples": 41, "batch_num_tokens": 16344, "epoch": 1.5244, "grad_norm": 0.3596463203430176, "learning_rate": 1.6239067689349186e-06, "loss": 1.417, "step": 1101 }, { "batch_num_effect_tokens": 2013, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 1.52579, "grad_norm": 0.46858853101730347, "learning_rate": 1.6149971200106723e-06, "loss": 1.7285, "step": 1102 }, { "batch_num_effect_tokens": 2652, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.52717, "grad_norm": 0.39040011167526245, "learning_rate": 1.6061072699505037e-06, "loss": 1.7808, "step": 1103 }, { "batch_num_effect_tokens": 2609, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.52856, "grad_norm": 0.32134535908699036, "learning_rate": 1.597237270751096e-06, "loss": 1.4102, "step": 1104 }, { "batch_num_effect_tokens": 2563, "batch_num_samples": 38, "batch_num_tokens": 16370, "epoch": 1.52994, "grad_norm": 0.4110313057899475, "learning_rate": 1.5883871742930257e-06, "loss": 1.7041, "step": 1105 }, { "batch_num_effect_tokens": 2399, "batch_num_samples": 28, "batch_num_tokens": 16380, "epoch": 1.53133, "grad_norm": 0.33871206641197205, "learning_rate": 1.579557032340463e-06, "loss": 1.0488, "step": 1106 }, { "batch_num_effect_tokens": 3685, "batch_num_samples": 57, "batch_num_tokens": 16327, "epoch": 1.53271, "grad_norm": 0.3337045907974243, "learning_rate": 1.5707468965408618e-06, "loss": 1.5488, "step": 1107 }, { "batch_num_effect_tokens": 2122, "batch_num_samples": 35, "batch_num_tokens": 16330, "epoch": 1.53409, "grad_norm": 0.35013335943222046, "learning_rate": 1.561956818424661e-06, "loss": 1.4631, "step": 1108 }, { "batch_num_effect_tokens": 2301, "batch_num_samples": 32, "batch_num_tokens": 16354, "epoch": 1.53548, "grad_norm": 0.4086295962333679, "learning_rate": 1.553186849404979e-06, "loss": 1.6953, "step": 1109 }, { "batch_num_effect_tokens": 2388, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.53686, "grad_norm": 0.3341088593006134, "learning_rate": 1.544437040777319e-06, "loss": 1.5825, "step": 1110 }, { "batch_num_effect_tokens": 2234, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.53825, "grad_norm": 0.43831565976142883, "learning_rate": 1.5357074437192688e-06, "loss": 1.4082, "step": 1111 }, { "batch_num_effect_tokens": 2413, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.53963, "grad_norm": 0.33902063965797424, "learning_rate": 1.526998109290192e-06, "loss": 1.7402, "step": 1112 }, { "batch_num_effect_tokens": 2423, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.54102, "grad_norm": 0.4091523289680481, "learning_rate": 1.518309088430941e-06, "loss": 1.7559, "step": 1113 }, { "batch_num_effect_tokens": 2365, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.5424, "grad_norm": 0.315258264541626, "learning_rate": 1.5096404319635533e-06, "loss": 1.6023, "step": 1114 }, { "batch_num_effect_tokens": 2222, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.54379, "grad_norm": 0.5185776352882385, "learning_rate": 1.5009921905909575e-06, "loss": 1.542, "step": 1115 }, { "batch_num_effect_tokens": 2548, "batch_num_samples": 28, "batch_num_tokens": 16272, "epoch": 1.54517, "grad_norm": 0.3788343667984009, "learning_rate": 1.4923644148966682e-06, "loss": 1.6221, "step": 1116 }, { "batch_num_effect_tokens": 2368, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.54656, "grad_norm": 0.46828824281692505, "learning_rate": 1.483757155344503e-06, "loss": 1.7842, "step": 1117 }, { "batch_num_effect_tokens": 2563, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.54794, "grad_norm": 0.39705905318260193, "learning_rate": 1.4751704622782754e-06, "loss": 1.3989, "step": 1118 }, { "batch_num_effect_tokens": 2673, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.54933, "grad_norm": 0.43107208609580994, "learning_rate": 1.466604385921509e-06, "loss": 1.4355, "step": 1119 }, { "batch_num_effect_tokens": 2582, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.55071, "grad_norm": 0.45887160301208496, "learning_rate": 1.4580589763771413e-06, "loss": 1.3662, "step": 1120 }, { "batch_num_effect_tokens": 3653, "batch_num_samples": 51, "batch_num_tokens": 16276, "epoch": 1.55209, "grad_norm": 0.34625759720802307, "learning_rate": 1.4495342836272252e-06, "loss": 1.668, "step": 1121 }, { "batch_num_effect_tokens": 2028, "batch_num_samples": 29, "batch_num_tokens": 16292, "epoch": 1.55348, "grad_norm": 0.3987080752849579, "learning_rate": 1.4410303575326446e-06, "loss": 1.3586, "step": 1122 }, { "batch_num_effect_tokens": 2005, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.55486, "grad_norm": 0.490970641374588, "learning_rate": 1.432547247832819e-06, "loss": 1.5659, "step": 1123 }, { "batch_num_effect_tokens": 2005, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.55625, "grad_norm": 0.47662341594696045, "learning_rate": 1.4240850041454136e-06, "loss": 1.8315, "step": 1124 }, { "batch_num_effect_tokens": 2262, "batch_num_samples": 30, "batch_num_tokens": 16362, "epoch": 1.55763, "grad_norm": 0.4305259585380554, "learning_rate": 1.4156436759660447e-06, "loss": 1.4478, "step": 1125 }, { "batch_num_effect_tokens": 2365, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.55902, "grad_norm": 0.3333793580532074, "learning_rate": 1.4072233126679985e-06, "loss": 1.3164, "step": 1126 }, { "batch_num_effect_tokens": 2252, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.5604, "grad_norm": 0.512894868850708, "learning_rate": 1.3988239635019357e-06, "loss": 1.5088, "step": 1127 }, { "batch_num_effect_tokens": 2272, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.56179, "grad_norm": 0.43213048577308655, "learning_rate": 1.3904456775956044e-06, "loss": 1.5996, "step": 1128 }, { "batch_num_effect_tokens": 2710, "batch_num_samples": 30, "batch_num_tokens": 16374, "epoch": 1.56317, "grad_norm": 0.4284000098705292, "learning_rate": 1.3820885039535564e-06, "loss": 1.2861, "step": 1129 }, { "batch_num_effect_tokens": 2326, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.56456, "grad_norm": 0.390671968460083, "learning_rate": 1.3737524914568523e-06, "loss": 1.4326, "step": 1130 }, { "batch_num_effect_tokens": 2825, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.56594, "grad_norm": 0.3901415169239044, "learning_rate": 1.3654376888627918e-06, "loss": 1.3936, "step": 1131 }, { "batch_num_effect_tokens": 2144, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.56732, "grad_norm": 0.43685001134872437, "learning_rate": 1.3571441448046086e-06, "loss": 1.6631, "step": 1132 }, { "batch_num_effect_tokens": 2368, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.56871, "grad_norm": 0.4244330823421478, "learning_rate": 1.3488719077911965e-06, "loss": 1.6572, "step": 1133 }, { "batch_num_effect_tokens": 2573, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.57009, "grad_norm": 0.28905153274536133, "learning_rate": 1.340621026206828e-06, "loss": 0.9819, "step": 1134 }, { "batch_num_effect_tokens": 3177, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.57148, "grad_norm": 0.40248242020606995, "learning_rate": 1.3323915483108662e-06, "loss": 1.793, "step": 1135 }, { "batch_num_effect_tokens": 2278, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 1.57286, "grad_norm": 0.3519144654273987, "learning_rate": 1.3241835222374855e-06, "loss": 1.2715, "step": 1136 }, { "batch_num_effect_tokens": 1989, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 1.57425, "grad_norm": 0.49800989031791687, "learning_rate": 1.315996995995385e-06, "loss": 1.457, "step": 1137 }, { "batch_num_effect_tokens": 2566, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.57563, "grad_norm": 0.4932236671447754, "learning_rate": 1.3078320174675141e-06, "loss": 1.7183, "step": 1138 }, { "batch_num_effect_tokens": 2936, "batch_num_samples": 43, "batch_num_tokens": 16291, "epoch": 1.57702, "grad_norm": 0.4102242887020111, "learning_rate": 1.29968863441079e-06, "loss": 1.606, "step": 1139 }, { "batch_num_effect_tokens": 2556, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.5784, "grad_norm": 0.5302222371101379, "learning_rate": 1.2915668944558192e-06, "loss": 2.0117, "step": 1140 }, { "batch_num_effect_tokens": 2642, "batch_num_samples": 33, "batch_num_tokens": 16357, "epoch": 1.57979, "grad_norm": 0.4528213441371918, "learning_rate": 1.2834668451066118e-06, "loss": 1.5835, "step": 1141 }, { "batch_num_effect_tokens": 2497, "batch_num_samples": 38, "batch_num_tokens": 16370, "epoch": 1.58117, "grad_norm": 0.45395907759666443, "learning_rate": 1.275388533740317e-06, "loss": 1.8867, "step": 1142 }, { "batch_num_effect_tokens": 4965, "batch_num_samples": 47, "batch_num_tokens": 16381, "epoch": 1.58255, "grad_norm": 0.31275230646133423, "learning_rate": 1.2673320076069363e-06, "loss": 1.6904, "step": 1143 }, { "batch_num_effect_tokens": 2179, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.58394, "grad_norm": 0.4283202290534973, "learning_rate": 1.259297313829046e-06, "loss": 1.7344, "step": 1144 }, { "batch_num_effect_tokens": 2860, "batch_num_samples": 41, "batch_num_tokens": 16223, "epoch": 1.58532, "grad_norm": 0.4167380630970001, "learning_rate": 1.2512844994015304e-06, "loss": 1.9854, "step": 1145 }, { "batch_num_effect_tokens": 2559, "batch_num_samples": 49, "batch_num_tokens": 16334, "epoch": 1.58671, "grad_norm": 0.45225703716278076, "learning_rate": 1.2432936111912946e-06, "loss": 1.5564, "step": 1146 }, { "batch_num_effect_tokens": 2509, "batch_num_samples": 37, "batch_num_tokens": 16284, "epoch": 1.58809, "grad_norm": 0.42468562722206116, "learning_rate": 1.2353246959370086e-06, "loss": 1.5776, "step": 1147 }, { "batch_num_effect_tokens": 2353, "batch_num_samples": 31, "batch_num_tokens": 16349, "epoch": 1.58948, "grad_norm": 0.3788139522075653, "learning_rate": 1.2273778002488117e-06, "loss": 1.2266, "step": 1148 }, { "batch_num_effect_tokens": 2580, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 1.59086, "grad_norm": 0.32025089859962463, "learning_rate": 1.2194529706080543e-06, "loss": 1.333, "step": 1149 }, { "batch_num_effect_tokens": 2631, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.59225, "grad_norm": 0.3064563572406769, "learning_rate": 1.2115502533670253e-06, "loss": 1.1904, "step": 1150 }, { "batch_num_effect_tokens": 2437, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.59363, "grad_norm": 0.403896301984787, "learning_rate": 1.2036696947486748e-06, "loss": 1.8369, "step": 1151 }, { "batch_num_effect_tokens": 3173, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.59502, "grad_norm": 0.4884367883205414, "learning_rate": 1.1958113408463518e-06, "loss": 1.6167, "step": 1152 }, { "batch_num_effect_tokens": 2221, "batch_num_samples": 29, "batch_num_tokens": 16305, "epoch": 1.5964, "grad_norm": 0.3130112290382385, "learning_rate": 1.1879752376235231e-06, "loss": 1.4902, "step": 1153 }, { "batch_num_effect_tokens": 2911, "batch_num_samples": 29, "batch_num_tokens": 16301, "epoch": 1.59778, "grad_norm": 0.4447314143180847, "learning_rate": 1.1801614309135178e-06, "loss": 1.8047, "step": 1154 }, { "batch_num_effect_tokens": 2572, "batch_num_samples": 29, "batch_num_tokens": 16310, "epoch": 1.59917, "grad_norm": 0.3427513539791107, "learning_rate": 1.1723699664192507e-06, "loss": 1.6401, "step": 1155 }, { "batch_num_effect_tokens": 1813, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.60055, "grad_norm": 0.603606641292572, "learning_rate": 1.1646008897129546e-06, "loss": 1.7422, "step": 1156 }, { "batch_num_effect_tokens": 2226, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.60194, "grad_norm": 0.4188461899757385, "learning_rate": 1.1568542462359206e-06, "loss": 1.5762, "step": 1157 }, { "batch_num_effect_tokens": 2374, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.60332, "grad_norm": 0.45765987038612366, "learning_rate": 1.1491300812982216e-06, "loss": 1.7783, "step": 1158 }, { "batch_num_effect_tokens": 2359, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.60471, "grad_norm": 0.36140167713165283, "learning_rate": 1.1414284400784643e-06, "loss": 1.5664, "step": 1159 }, { "batch_num_effect_tokens": 2676, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.60609, "grad_norm": 0.39494019746780396, "learning_rate": 1.1337493676235023e-06, "loss": 1.793, "step": 1160 }, { "batch_num_effect_tokens": 2676, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.60609, "eval_eval_loss": 0.43317314982414246, "eval_eval_runtime": 105.4808, "eval_eval_samples_per_second": 45.961, "eval_eval_steps_per_second": 2.873, "step": 1160 }, { "batch_num_effect_tokens": 2354, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.60748, "grad_norm": 0.4546908140182495, "learning_rate": 1.1260929088481932e-06, "loss": 1.6494, "step": 1161 }, { "batch_num_effect_tokens": 2622, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.60886, "grad_norm": 0.4043464958667755, "learning_rate": 1.118459108535122e-06, "loss": 3.3174, "step": 1162 }, { "batch_num_effect_tokens": 2302, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.61025, "grad_norm": 0.44950321316719055, "learning_rate": 1.1108480113343478e-06, "loss": 1.6094, "step": 1163 }, { "batch_num_effect_tokens": 2283, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.61163, "grad_norm": 0.3290331959724426, "learning_rate": 1.1032596617631392e-06, "loss": 1.5283, "step": 1164 }, { "batch_num_effect_tokens": 2550, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 1.61301, "grad_norm": 0.45046529173851013, "learning_rate": 1.0956941042057106e-06, "loss": 1.4041, "step": 1165 }, { "batch_num_effect_tokens": 2320, "batch_num_samples": 33, "batch_num_tokens": 16302, "epoch": 1.6144, "grad_norm": 0.36152440309524536, "learning_rate": 1.0881513829129696e-06, "loss": 1.4438, "step": 1166 }, { "batch_num_effect_tokens": 2832, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.61578, "grad_norm": 0.4492241144180298, "learning_rate": 1.0806315420022535e-06, "loss": 1.3811, "step": 1167 }, { "batch_num_effect_tokens": 2035, "batch_num_samples": 31, "batch_num_tokens": 16205, "epoch": 1.61717, "grad_norm": 0.3149929642677307, "learning_rate": 1.0731346254570735e-06, "loss": 1.5059, "step": 1168 }, { "batch_num_effect_tokens": 2787, "batch_num_samples": 34, "batch_num_tokens": 16332, "epoch": 1.61855, "grad_norm": 0.4681971073150635, "learning_rate": 1.065660677126853e-06, "loss": 1.9077, "step": 1169 }, { "batch_num_effect_tokens": 2735, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.61994, "grad_norm": 0.3152649402618408, "learning_rate": 1.0582097407266772e-06, "loss": 1.5752, "step": 1170 }, { "batch_num_effect_tokens": 2021, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.62132, "grad_norm": 0.4782985746860504, "learning_rate": 1.0507818598370355e-06, "loss": 1.2739, "step": 1171 }, { "batch_num_effect_tokens": 2443, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.62271, "grad_norm": 0.29619184136390686, "learning_rate": 1.0433770779035618e-06, "loss": 1.1743, "step": 1172 }, { "batch_num_effect_tokens": 2221, "batch_num_samples": 29, "batch_num_tokens": 16369, "epoch": 1.62409, "grad_norm": 0.4740128815174103, "learning_rate": 1.0359954382367898e-06, "loss": 1.4043, "step": 1173 }, { "batch_num_effect_tokens": 3377, "batch_num_samples": 48, "batch_num_tokens": 16384, "epoch": 1.62548, "grad_norm": 0.35877525806427, "learning_rate": 1.0286369840118859e-06, "loss": 2.0303, "step": 1174 }, { "batch_num_effect_tokens": 2216, "batch_num_samples": 29, "batch_num_tokens": 16350, "epoch": 1.62686, "grad_norm": 0.45086777210235596, "learning_rate": 1.021301758268417e-06, "loss": 1.3184, "step": 1175 }, { "batch_num_effect_tokens": 2009, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.62825, "grad_norm": 0.396729439496994, "learning_rate": 1.013989803910076e-06, "loss": 1.4395, "step": 1176 }, { "batch_num_effect_tokens": 2043, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.62963, "grad_norm": 0.5198664665222168, "learning_rate": 1.006701163704445e-06, "loss": 1.9541, "step": 1177 }, { "batch_num_effect_tokens": 2687, "batch_num_samples": 49, "batch_num_tokens": 16384, "epoch": 1.63101, "grad_norm": 0.46582597494125366, "learning_rate": 9.994358802827437e-07, "loss": 1.7295, "step": 1178 }, { "batch_num_effect_tokens": 3788, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 1.6324, "grad_norm": 0.40241456031799316, "learning_rate": 9.921939961395767e-07, "loss": 1.6118, "step": 1179 }, { "batch_num_effect_tokens": 2209, "batch_num_samples": 33, "batch_num_tokens": 16357, "epoch": 1.63378, "grad_norm": 0.5667777061462402, "learning_rate": 9.849755536326866e-07, "loss": 1.5627, "step": 1180 }, { "batch_num_effect_tokens": 2144, "batch_num_samples": 29, "batch_num_tokens": 16340, "epoch": 1.63517, "grad_norm": 0.5858004689216614, "learning_rate": 9.777805949827046e-07, "loss": 1.2654, "step": 1181 }, { "batch_num_effect_tokens": 2674, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.63655, "grad_norm": 0.43740761280059814, "learning_rate": 9.706091622729065e-07, "loss": 1.5654, "step": 1182 }, { "batch_num_effect_tokens": 2579, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.63794, "grad_norm": 0.46260321140289307, "learning_rate": 9.63461297448966e-07, "loss": 1.8257, "step": 1183 }, { "batch_num_effect_tokens": 1957, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.63932, "grad_norm": 0.39956656098365784, "learning_rate": 9.563370423187046e-07, "loss": 1.4292, "step": 1184 }, { "batch_num_effect_tokens": 2528, "batch_num_samples": 29, "batch_num_tokens": 16368, "epoch": 1.64071, "grad_norm": 0.3191390335559845, "learning_rate": 9.492364385518554e-07, "loss": 1.1135, "step": 1185 }, { "batch_num_effect_tokens": 2242, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.64209, "grad_norm": 0.3295113146305084, "learning_rate": 9.421595276798084e-07, "loss": 1.5195, "step": 1186 }, { "batch_num_effect_tokens": 2502, "batch_num_samples": 38, "batch_num_tokens": 16352, "epoch": 1.64348, "grad_norm": 0.4700864851474762, "learning_rate": 9.351063510953845e-07, "loss": 1.5103, "step": 1187 }, { "batch_num_effect_tokens": 2399, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.64486, "grad_norm": 0.30828502774238586, "learning_rate": 9.280769500525716e-07, "loss": 1.9517, "step": 1188 }, { "batch_num_effect_tokens": 2758, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.64624, "grad_norm": 0.43134236335754395, "learning_rate": 9.210713656663023e-07, "loss": 1.7129, "step": 1189 }, { "batch_num_effect_tokens": 2227, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.64763, "grad_norm": 0.47804298996925354, "learning_rate": 9.14089638912199e-07, "loss": 1.5845, "step": 1190 }, { "batch_num_effect_tokens": 2317, "batch_num_samples": 31, "batch_num_tokens": 16340, "epoch": 1.64901, "grad_norm": 0.31185993552207947, "learning_rate": 9.071318106263499e-07, "loss": 1.6084, "step": 1191 }, { "batch_num_effect_tokens": 2151, "batch_num_samples": 30, "batch_num_tokens": 16383, "epoch": 1.6504, "grad_norm": 0.5235294103622437, "learning_rate": 9.001979215050544e-07, "loss": 1.8564, "step": 1192 }, { "batch_num_effect_tokens": 2003, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.65178, "grad_norm": 0.4882059097290039, "learning_rate": 8.932880121045911e-07, "loss": 1.4536, "step": 1193 }, { "batch_num_effect_tokens": 2871, "batch_num_samples": 33, "batch_num_tokens": 16334, "epoch": 1.65317, "grad_norm": 0.46826136112213135, "learning_rate": 8.864021228409853e-07, "loss": 1.9492, "step": 1194 }, { "batch_num_effect_tokens": 2316, "batch_num_samples": 33, "batch_num_tokens": 16380, "epoch": 1.65455, "grad_norm": 0.4635930061340332, "learning_rate": 8.795402939897679e-07, "loss": 1.6414, "step": 1195 }, { "batch_num_effect_tokens": 2875, "batch_num_samples": 47, "batch_num_tokens": 16274, "epoch": 1.65594, "grad_norm": 0.41375765204429626, "learning_rate": 8.727025656857407e-07, "loss": 1.8257, "step": 1196 }, { "batch_num_effect_tokens": 2490, "batch_num_samples": 34, "batch_num_tokens": 16260, "epoch": 1.65732, "grad_norm": 0.4509265720844269, "learning_rate": 8.658889779227376e-07, "loss": 1.6084, "step": 1197 }, { "batch_num_effect_tokens": 2198, "batch_num_samples": 30, "batch_num_tokens": 16305, "epoch": 1.65871, "grad_norm": 0.4588288962841034, "learning_rate": 8.590995705533994e-07, "loss": 1.7769, "step": 1198 }, { "batch_num_effect_tokens": 2157, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 1.66009, "grad_norm": 0.4390736222267151, "learning_rate": 8.523343832889358e-07, "loss": 1.3877, "step": 1199 }, { "batch_num_effect_tokens": 2425, "batch_num_samples": 34, "batch_num_tokens": 16294, "epoch": 1.66147, "grad_norm": 0.44500279426574707, "learning_rate": 8.455934556988888e-07, "loss": 1.8203, "step": 1200 }, { "batch_num_effect_tokens": 2134, "batch_num_samples": 31, "batch_num_tokens": 16267, "epoch": 1.66286, "grad_norm": 0.419421911239624, "learning_rate": 8.388768272109105e-07, "loss": 1.1108, "step": 1201 }, { "batch_num_effect_tokens": 2117, "batch_num_samples": 31, "batch_num_tokens": 16354, "epoch": 1.66424, "grad_norm": 0.47472044825553894, "learning_rate": 8.321845371105225e-07, "loss": 1.7129, "step": 1202 }, { "batch_num_effect_tokens": 2599, "batch_num_samples": 37, "batch_num_tokens": 16382, "epoch": 1.66563, "grad_norm": 0.4561558663845062, "learning_rate": 8.255166245408985e-07, "loss": 1.394, "step": 1203 }, { "batch_num_effect_tokens": 2432, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.66701, "grad_norm": 0.29153165221214294, "learning_rate": 8.188731285026219e-07, "loss": 1.4001, "step": 1204 }, { "batch_num_effect_tokens": 3267, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.6684, "grad_norm": 0.36731991171836853, "learning_rate": 8.122540878534679e-07, "loss": 1.5762, "step": 1205 }, { "batch_num_effect_tokens": 2237, "batch_num_samples": 38, "batch_num_tokens": 16283, "epoch": 1.66978, "grad_norm": 0.4525676667690277, "learning_rate": 8.056595413081675e-07, "loss": 1.8716, "step": 1206 }, { "batch_num_effect_tokens": 2453, "batch_num_samples": 33, "batch_num_tokens": 16311, "epoch": 1.67117, "grad_norm": 0.29530009627342224, "learning_rate": 7.99089527438191e-07, "loss": 1.6079, "step": 1207 }, { "batch_num_effect_tokens": 2524, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.67255, "grad_norm": 0.4480970501899719, "learning_rate": 7.925440846715154e-07, "loss": 1.6079, "step": 1208 }, { "batch_num_effect_tokens": 2304, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 1.67394, "grad_norm": 0.34074676036834717, "learning_rate": 7.860232512923993e-07, "loss": 1.6697, "step": 1209 }, { "batch_num_effect_tokens": 3040, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.67532, "grad_norm": 0.44745004177093506, "learning_rate": 7.795270654411635e-07, "loss": 1.5327, "step": 1210 }, { "batch_num_effect_tokens": 2718, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.6767, "grad_norm": 0.4064233899116516, "learning_rate": 7.73055565113966e-07, "loss": 1.4629, "step": 1211 }, { "batch_num_effect_tokens": 3636, "batch_num_samples": 42, "batch_num_tokens": 16384, "epoch": 1.67809, "grad_norm": 0.38418668508529663, "learning_rate": 7.666087881625778e-07, "loss": 1.5925, "step": 1212 }, { "batch_num_effect_tokens": 2257, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.67947, "grad_norm": 0.5256770253181458, "learning_rate": 7.601867722941642e-07, "loss": 1.7163, "step": 1213 }, { "batch_num_effect_tokens": 2704, "batch_num_samples": 32, "batch_num_tokens": 16354, "epoch": 1.68086, "grad_norm": 0.4288358986377716, "learning_rate": 7.537895550710583e-07, "loss": 1.8062, "step": 1214 }, { "batch_num_effect_tokens": 1960, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.68224, "grad_norm": 0.4311806261539459, "learning_rate": 7.474171739105551e-07, "loss": 1.3765, "step": 1215 }, { "batch_num_effect_tokens": 1842, "batch_num_samples": 29, "batch_num_tokens": 16312, "epoch": 1.68363, "grad_norm": 0.35490691661834717, "learning_rate": 7.410696660846761e-07, "loss": 1.4858, "step": 1216 }, { "batch_num_effect_tokens": 2388, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 1.68501, "grad_norm": 0.4309084415435791, "learning_rate": 7.34747068719962e-07, "loss": 1.7842, "step": 1217 }, { "batch_num_effect_tokens": 2481, "batch_num_samples": 31, "batch_num_tokens": 16340, "epoch": 1.6864, "grad_norm": 0.462055504322052, "learning_rate": 7.284494187972496e-07, "loss": 1.413, "step": 1218 }, { "batch_num_effect_tokens": 2466, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.68778, "grad_norm": 0.44760653376579285, "learning_rate": 7.22176753151464e-07, "loss": 1.4937, "step": 1219 }, { "batch_num_effect_tokens": 2345, "batch_num_samples": 33, "batch_num_tokens": 16383, "epoch": 1.68917, "grad_norm": 0.40922272205352783, "learning_rate": 7.15929108471391e-07, "loss": 1.5, "step": 1220 }, { "batch_num_effect_tokens": 2732, "batch_num_samples": 45, "batch_num_tokens": 16369, "epoch": 1.69055, "grad_norm": 0.40360745787620544, "learning_rate": 7.097065212994714e-07, "loss": 1.7085, "step": 1221 }, { "batch_num_effect_tokens": 2476, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.69193, "grad_norm": 0.4323672950267792, "learning_rate": 7.035090280315854e-07, "loss": 1.3052, "step": 1222 }, { "batch_num_effect_tokens": 2430, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.69332, "grad_norm": 0.48815345764160156, "learning_rate": 6.973366649168389e-07, "loss": 1.7935, "step": 1223 }, { "batch_num_effect_tokens": 2718, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.6947, "grad_norm": 0.3908276855945587, "learning_rate": 6.911894680573522e-07, "loss": 1.4883, "step": 1224 }, { "batch_num_effect_tokens": 2547, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.69609, "grad_norm": 0.41076523065567017, "learning_rate": 6.850674734080454e-07, "loss": 1.5625, "step": 1225 }, { "batch_num_effect_tokens": 2767, "batch_num_samples": 40, "batch_num_tokens": 16263, "epoch": 1.69747, "grad_norm": 0.39315274357795715, "learning_rate": 6.789707167764337e-07, "loss": 1.5488, "step": 1226 }, { "batch_num_effect_tokens": 2444, "batch_num_samples": 31, "batch_num_tokens": 16383, "epoch": 1.69886, "grad_norm": 0.48541685938835144, "learning_rate": 6.728992338224166e-07, "loss": 1.3406, "step": 1227 }, { "batch_num_effect_tokens": 2506, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.70024, "grad_norm": 0.3793267011642456, "learning_rate": 6.66853060058063e-07, "loss": 1.0039, "step": 1228 }, { "batch_num_effect_tokens": 3213, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.70163, "grad_norm": 0.4587978720664978, "learning_rate": 6.608322308474141e-07, "loss": 1.584, "step": 1229 }, { "batch_num_effect_tokens": 2112, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.70301, "grad_norm": 0.4160648286342621, "learning_rate": 6.548367814062656e-07, "loss": 1.5381, "step": 1230 }, { "batch_num_effect_tokens": 2650, "batch_num_samples": 33, "batch_num_tokens": 16380, "epoch": 1.7044, "grad_norm": 0.40201956033706665, "learning_rate": 6.488667468019727e-07, "loss": 1.5469, "step": 1231 }, { "batch_num_effect_tokens": 2389, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.70578, "grad_norm": 0.26922547817230225, "learning_rate": 6.429221619532349e-07, "loss": 1.4446, "step": 1232 }, { "batch_num_effect_tokens": 2675, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.70717, "grad_norm": 0.3887218236923218, "learning_rate": 6.370030616298989e-07, "loss": 1.3843, "step": 1233 }, { "batch_num_effect_tokens": 3010, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.70855, "grad_norm": 0.36124423146247864, "learning_rate": 6.31109480452749e-07, "loss": 1.7549, "step": 1234 }, { "batch_num_effect_tokens": 2657, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.70993, "grad_norm": 0.4302148222923279, "learning_rate": 6.252414528933126e-07, "loss": 1.5923, "step": 1235 }, { "batch_num_effect_tokens": 2505, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.71132, "grad_norm": 0.4031151235103607, "learning_rate": 6.193990132736527e-07, "loss": 1.6177, "step": 1236 }, { "batch_num_effect_tokens": 2629, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.7127, "grad_norm": 0.4232177436351776, "learning_rate": 6.135821957661658e-07, "loss": 1.5122, "step": 1237 }, { "batch_num_effect_tokens": 2025, "batch_num_samples": 28, "batch_num_tokens": 16277, "epoch": 1.71409, "grad_norm": 0.4423690438270569, "learning_rate": 6.077910343933879e-07, "loss": 1.6611, "step": 1238 }, { "batch_num_effect_tokens": 1884, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.71547, "grad_norm": 0.3095490038394928, "learning_rate": 6.020255630277916e-07, "loss": 1.4199, "step": 1239 }, { "batch_num_effect_tokens": 2844, "batch_num_samples": 28, "batch_num_tokens": 16371, "epoch": 1.71686, "grad_norm": 0.28403934836387634, "learning_rate": 5.962858153915896e-07, "loss": 1.7002, "step": 1240 }, { "batch_num_effect_tokens": 3059, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.71824, "grad_norm": 0.39329975843429565, "learning_rate": 5.905718250565351e-07, "loss": 1.6221, "step": 1241 }, { "batch_num_effect_tokens": 2869, "batch_num_samples": 29, "batch_num_tokens": 16368, "epoch": 1.71963, "grad_norm": 0.41262367367744446, "learning_rate": 5.848836254437251e-07, "loss": 1.6211, "step": 1242 }, { "batch_num_effect_tokens": 2822, "batch_num_samples": 46, "batch_num_tokens": 16384, "epoch": 1.72101, "grad_norm": 0.4423920214176178, "learning_rate": 5.792212498234134e-07, "loss": 1.5908, "step": 1243 }, { "batch_num_effect_tokens": 2828, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.7224, "grad_norm": 0.38751456141471863, "learning_rate": 5.735847313148024e-07, "loss": 1.6543, "step": 1244 }, { "batch_num_effect_tokens": 2155, "batch_num_samples": 33, "batch_num_tokens": 16279, "epoch": 1.72378, "grad_norm": 0.3441454768180847, "learning_rate": 5.67974102885861e-07, "loss": 1.5991, "step": 1245 }, { "batch_num_effect_tokens": 2643, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.72516, "grad_norm": 0.41788098216056824, "learning_rate": 5.623893973531225e-07, "loss": 1.4275, "step": 1246 }, { "batch_num_effect_tokens": 2978, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.72655, "grad_norm": 0.4316878914833069, "learning_rate": 5.568306473815044e-07, "loss": 1.8042, "step": 1247 }, { "batch_num_effect_tokens": 2970, "batch_num_samples": 36, "batch_num_tokens": 16292, "epoch": 1.72793, "grad_norm": 0.45934247970581055, "learning_rate": 5.512978854841028e-07, "loss": 1.6226, "step": 1248 }, { "batch_num_effect_tokens": 2420, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.72932, "grad_norm": 0.42678302526474, "learning_rate": 5.457911440220154e-07, "loss": 1.6978, "step": 1249 }, { "batch_num_effect_tokens": 2675, "batch_num_samples": 36, "batch_num_tokens": 16292, "epoch": 1.7307, "grad_norm": 0.42161425948143005, "learning_rate": 5.403104552041416e-07, "loss": 1.8662, "step": 1250 }, { "batch_num_effect_tokens": 2766, "batch_num_samples": 30, "batch_num_tokens": 16304, "epoch": 1.73209, "grad_norm": 0.46565961837768555, "learning_rate": 5.348558510870033e-07, "loss": 1.665, "step": 1251 }, { "batch_num_effect_tokens": 2385, "batch_num_samples": 31, "batch_num_tokens": 16298, "epoch": 1.73347, "grad_norm": 0.3356163501739502, "learning_rate": 5.294273635745517e-07, "loss": 1.5342, "step": 1252 }, { "batch_num_effect_tokens": 2809, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.73486, "grad_norm": 0.40509241819381714, "learning_rate": 5.240250244179801e-07, "loss": 1.2871, "step": 1253 }, { "batch_num_effect_tokens": 2287, "batch_num_samples": 31, "batch_num_tokens": 16340, "epoch": 1.73624, "grad_norm": 0.450457364320755, "learning_rate": 5.186488652155425e-07, "loss": 1.9863, "step": 1254 }, { "batch_num_effect_tokens": 2324, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.73763, "grad_norm": 0.4772484600543976, "learning_rate": 5.132989174123659e-07, "loss": 1.6689, "step": 1255 }, { "batch_num_effect_tokens": 2958, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.73901, "grad_norm": 0.38812053203582764, "learning_rate": 5.079752123002684e-07, "loss": 1.8105, "step": 1256 }, { "batch_num_effect_tokens": 2507, "batch_num_samples": 32, "batch_num_tokens": 16310, "epoch": 1.74039, "grad_norm": 0.39813515543937683, "learning_rate": 5.026777810175721e-07, "loss": 1.7612, "step": 1257 }, { "batch_num_effect_tokens": 2827, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.74178, "grad_norm": 0.3738177716732025, "learning_rate": 4.97406654548922e-07, "loss": 1.7969, "step": 1258 }, { "batch_num_effect_tokens": 2314, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.74316, "grad_norm": 0.45501065254211426, "learning_rate": 4.921618637251141e-07, "loss": 1.5796, "step": 1259 }, { "batch_num_effect_tokens": 2584, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.74455, "grad_norm": 0.45451661944389343, "learning_rate": 4.869434392228989e-07, "loss": 1.791, "step": 1260 }, { "batch_num_effect_tokens": 2474, "batch_num_samples": 31, "batch_num_tokens": 16377, "epoch": 1.74593, "grad_norm": 0.44398120045661926, "learning_rate": 4.817514115648164e-07, "loss": 1.6602, "step": 1261 }, { "batch_num_effect_tokens": 2043, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.74732, "grad_norm": 0.45313188433647156, "learning_rate": 4.765858111190053e-07, "loss": 1.6411, "step": 1262 }, { "batch_num_effect_tokens": 2570, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.7487, "grad_norm": 0.41603195667266846, "learning_rate": 4.7144666809903984e-07, "loss": 1.8652, "step": 1263 }, { "batch_num_effect_tokens": 2519, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.75009, "grad_norm": 0.4656577408313751, "learning_rate": 4.663340125637389e-07, "loss": 1.7432, "step": 1264 }, { "batch_num_effect_tokens": 3071, "batch_num_samples": 56, "batch_num_tokens": 16287, "epoch": 1.75147, "grad_norm": 0.319423109292984, "learning_rate": 4.612478744169968e-07, "loss": 1.3682, "step": 1265 }, { "batch_num_effect_tokens": 3826, "batch_num_samples": 47, "batch_num_tokens": 16383, "epoch": 1.75286, "grad_norm": 0.29101842641830444, "learning_rate": 4.561882834076098e-07, "loss": 1.4424, "step": 1266 }, { "batch_num_effect_tokens": 2621, "batch_num_samples": 49, "batch_num_tokens": 16381, "epoch": 1.75424, "grad_norm": 0.4804738759994507, "learning_rate": 4.511552691290988e-07, "loss": 1.7388, "step": 1267 }, { "batch_num_effect_tokens": 2459, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.75562, "grad_norm": 0.4182913303375244, "learning_rate": 4.4614886101953915e-07, "loss": 1.5127, "step": 1268 }, { "batch_num_effect_tokens": 2240, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.75701, "grad_norm": 0.3360429108142853, "learning_rate": 4.411690883613834e-07, "loss": 1.416, "step": 1269 }, { "batch_num_effect_tokens": 2578, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.75839, "grad_norm": 0.4191362261772156, "learning_rate": 4.362159802812971e-07, "loss": 1.667, "step": 1270 }, { "batch_num_effect_tokens": 2440, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.75978, "grad_norm": 0.41708049178123474, "learning_rate": 4.3128956574998436e-07, "loss": 1.4951, "step": 1271 }, { "batch_num_effect_tokens": 2538, "batch_num_samples": 28, "batch_num_tokens": 16368, "epoch": 1.76116, "grad_norm": 0.4429357051849365, "learning_rate": 4.2638987358201546e-07, "loss": 1.8672, "step": 1272 }, { "batch_num_effect_tokens": 2288, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.76255, "grad_norm": 0.30612295866012573, "learning_rate": 4.215169324356666e-07, "loss": 1.5464, "step": 1273 }, { "batch_num_effect_tokens": 2933, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.76393, "grad_norm": 0.28973227739334106, "learning_rate": 4.1667077081274153e-07, "loss": 1.5859, "step": 1274 }, { "batch_num_effect_tokens": 2362, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.76532, "grad_norm": 0.4510679841041565, "learning_rate": 4.118514170584187e-07, "loss": 1.6904, "step": 1275 }, { "batch_num_effect_tokens": 3567, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.7667, "grad_norm": 0.40006911754608154, "learning_rate": 4.070588993610697e-07, "loss": 1.9072, "step": 1276 }, { "batch_num_effect_tokens": 3153, "batch_num_samples": 43, "batch_num_tokens": 16347, "epoch": 1.76809, "grad_norm": 0.3724789023399353, "learning_rate": 4.022932457521067e-07, "loss": 1.3296, "step": 1277 }, { "batch_num_effect_tokens": 2745, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.76947, "grad_norm": 0.4504218101501465, "learning_rate": 3.975544841058121e-07, "loss": 1.1978, "step": 1278 }, { "batch_num_effect_tokens": 2587, "batch_num_samples": 31, "batch_num_tokens": 16343, "epoch": 1.77085, "grad_norm": 0.4496813714504242, "learning_rate": 3.928426421391773e-07, "loss": 1.3594, "step": 1279 }, { "batch_num_effect_tokens": 2462, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.77224, "grad_norm": 0.3559061288833618, "learning_rate": 3.881577474117432e-07, "loss": 1.5518, "step": 1280 }, { "batch_num_effect_tokens": 3189, "batch_num_samples": 41, "batch_num_tokens": 16307, "epoch": 1.77362, "grad_norm": 0.38277941942214966, "learning_rate": 3.8349982732543257e-07, "loss": 1.6592, "step": 1281 }, { "batch_num_effect_tokens": 1965, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.77501, "grad_norm": 0.4954379200935364, "learning_rate": 3.7886890912439633e-07, "loss": 1.2695, "step": 1282 }, { "batch_num_effect_tokens": 2282, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.77639, "grad_norm": 0.5293394327163696, "learning_rate": 3.7426501989485e-07, "loss": 1.4185, "step": 1283 }, { "batch_num_effect_tokens": 2073, "batch_num_samples": 29, "batch_num_tokens": 16368, "epoch": 1.77778, "grad_norm": 0.4304828643798828, "learning_rate": 3.696881865649193e-07, "loss": 1.5913, "step": 1284 }, { "batch_num_effect_tokens": 2481, "batch_num_samples": 37, "batch_num_tokens": 16373, "epoch": 1.77916, "grad_norm": 0.45321598649024963, "learning_rate": 3.651384359044774e-07, "loss": 1.9199, "step": 1285 }, { "batch_num_effect_tokens": 2831, "batch_num_samples": 30, "batch_num_tokens": 16304, "epoch": 1.78055, "grad_norm": 0.4435335099697113, "learning_rate": 3.6061579452498996e-07, "loss": 1.7031, "step": 1286 }, { "batch_num_effect_tokens": 2113, "batch_num_samples": 29, "batch_num_tokens": 16317, "epoch": 1.78193, "grad_norm": 0.4291432201862335, "learning_rate": 3.5612028887936576e-07, "loss": 1.1802, "step": 1287 }, { "batch_num_effect_tokens": 3062, "batch_num_samples": 28, "batch_num_tokens": 16381, "epoch": 1.78332, "grad_norm": 0.44784969091415405, "learning_rate": 3.516519452617922e-07, "loss": 1.7539, "step": 1288 }, { "batch_num_effect_tokens": 1971, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.7847, "grad_norm": 0.5027155876159668, "learning_rate": 3.4721078980758826e-07, "loss": 1.5884, "step": 1289 }, { "batch_num_effect_tokens": 2287, "batch_num_samples": 28, "batch_num_tokens": 16336, "epoch": 1.78609, "grad_norm": 0.5062171220779419, "learning_rate": 3.4279684849304716e-07, "loss": 1.6201, "step": 1290 }, { "batch_num_effect_tokens": 2848, "batch_num_samples": 41, "batch_num_tokens": 16337, "epoch": 1.78747, "grad_norm": 0.41088375449180603, "learning_rate": 3.3841014713529184e-07, "loss": 1.4468, "step": 1291 }, { "batch_num_effect_tokens": 2566, "batch_num_samples": 38, "batch_num_tokens": 16312, "epoch": 1.78885, "grad_norm": 0.43662628531455994, "learning_rate": 3.3405071139211266e-07, "loss": 1.9893, "step": 1292 }, { "batch_num_effect_tokens": 2727, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.79024, "grad_norm": 0.37984609603881836, "learning_rate": 3.297185667618269e-07, "loss": 1.8091, "step": 1293 }, { "batch_num_effect_tokens": 2070, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.79162, "grad_norm": 0.528954029083252, "learning_rate": 3.254137385831263e-07, "loss": 1.8774, "step": 1294 }, { "batch_num_effect_tokens": 2393, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.79301, "grad_norm": 0.45156311988830566, "learning_rate": 3.2113625203492813e-07, "loss": 1.6567, "step": 1295 }, { "batch_num_effect_tokens": 3329, "batch_num_samples": 57, "batch_num_tokens": 16276, "epoch": 1.79439, "grad_norm": 0.3008648753166199, "learning_rate": 3.1688613213622876e-07, "loss": 1.376, "step": 1296 }, { "batch_num_effect_tokens": 3091, "batch_num_samples": 39, "batch_num_tokens": 16384, "epoch": 1.79578, "grad_norm": 0.4396587312221527, "learning_rate": 3.1266340374595693e-07, "loss": 1.7715, "step": 1297 }, { "batch_num_effect_tokens": 1994, "batch_num_samples": 28, "batch_num_tokens": 16337, "epoch": 1.79716, "grad_norm": 0.4039454460144043, "learning_rate": 3.0846809156282906e-07, "loss": 1.3411, "step": 1298 }, { "batch_num_effect_tokens": 2395, "batch_num_samples": 28, "batch_num_tokens": 16371, "epoch": 1.79855, "grad_norm": 0.4478610157966614, "learning_rate": 3.0430022012520486e-07, "loss": 1.6475, "step": 1299 }, { "batch_num_effect_tokens": 2222, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.79993, "grad_norm": 0.3005165755748749, "learning_rate": 3.0015981381094073e-07, "loss": 0.918, "step": 1300 }, { "batch_num_effect_tokens": 2585, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.80132, "grad_norm": 0.4662596583366394, "learning_rate": 2.960468968372532e-07, "loss": 1.5518, "step": 1301 }, { "batch_num_effect_tokens": 2484, "batch_num_samples": 41, "batch_num_tokens": 16269, "epoch": 1.8027, "grad_norm": 0.40501898527145386, "learning_rate": 2.9196149326056967e-07, "loss": 1.7959, "step": 1302 }, { "batch_num_effect_tokens": 2572, "batch_num_samples": 30, "batch_num_tokens": 16287, "epoch": 1.80408, "grad_norm": 0.31610965728759766, "learning_rate": 2.8790362697639685e-07, "loss": 1.4722, "step": 1303 }, { "batch_num_effect_tokens": 2329, "batch_num_samples": 30, "batch_num_tokens": 16320, "epoch": 1.80547, "grad_norm": 0.4776735007762909, "learning_rate": 2.8387332171917247e-07, "loss": 1.6079, "step": 1304 }, { "batch_num_effect_tokens": 2168, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.80685, "grad_norm": 0.4749198853969574, "learning_rate": 2.798706010621305e-07, "loss": 1.6836, "step": 1305 }, { "batch_num_effect_tokens": 2168, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.80685, "eval_eval_loss": 0.4295859634876251, "eval_eval_runtime": 105.4353, "eval_eval_samples_per_second": 45.981, "eval_eval_steps_per_second": 2.874, "step": 1305 }, { "batch_num_effect_tokens": 2654, "batch_num_samples": 41, "batch_num_tokens": 16353, "epoch": 1.80824, "grad_norm": 0.45353972911834717, "learning_rate": 2.7589548841716274e-07, "loss": 1.6494, "step": 1306 }, { "batch_num_effect_tokens": 2397, "batch_num_samples": 29, "batch_num_tokens": 16340, "epoch": 1.80962, "grad_norm": 0.3457548916339874, "learning_rate": 2.7194800703468305e-07, "loss": 1.1616, "step": 1307 }, { "batch_num_effect_tokens": 2474, "batch_num_samples": 43, "batch_num_tokens": 16384, "epoch": 1.81101, "grad_norm": 0.4693632423877716, "learning_rate": 2.6802818000348894e-07, "loss": 1.4985, "step": 1308 }, { "batch_num_effect_tokens": 2932, "batch_num_samples": 48, "batch_num_tokens": 16384, "epoch": 1.81239, "grad_norm": 0.40080004930496216, "learning_rate": 2.64136030250628e-07, "loss": 1.7173, "step": 1309 }, { "batch_num_effect_tokens": 2098, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.81378, "grad_norm": 0.29731953144073486, "learning_rate": 2.602715805412637e-07, "loss": 1.1118, "step": 1310 }, { "batch_num_effect_tokens": 2478, "batch_num_samples": 33, "batch_num_tokens": 16381, "epoch": 1.81516, "grad_norm": 0.43773138523101807, "learning_rate": 2.564348534785416e-07, "loss": 1.7061, "step": 1311 }, { "batch_num_effect_tokens": 2653, "batch_num_samples": 30, "batch_num_tokens": 16362, "epoch": 1.81655, "grad_norm": 0.4009453356266022, "learning_rate": 2.526258715034602e-07, "loss": 1.5332, "step": 1312 }, { "batch_num_effect_tokens": 4198, "batch_num_samples": 43, "batch_num_tokens": 16382, "epoch": 1.81793, "grad_norm": 0.3337891697883606, "learning_rate": 2.4884465689473223e-07, "loss": 1.4312, "step": 1313 }, { "batch_num_effect_tokens": 2145, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.81931, "grad_norm": 0.3933491110801697, "learning_rate": 2.4509123176866376e-07, "loss": 1.4746, "step": 1314 }, { "batch_num_effect_tokens": 3580, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.8207, "grad_norm": 0.3803938627243042, "learning_rate": 2.4136561807901916e-07, "loss": 1.6191, "step": 1315 }, { "batch_num_effect_tokens": 2448, "batch_num_samples": 33, "batch_num_tokens": 16334, "epoch": 1.82208, "grad_norm": 0.4630650579929352, "learning_rate": 2.376678376168917e-07, "loss": 1.6885, "step": 1316 }, { "batch_num_effect_tokens": 3148, "batch_num_samples": 54, "batch_num_tokens": 16384, "epoch": 1.82347, "grad_norm": 0.351731538772583, "learning_rate": 2.3399791201058174e-07, "loss": 1.7432, "step": 1317 }, { "batch_num_effect_tokens": 2799, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.82485, "grad_norm": 0.4261532723903656, "learning_rate": 2.3035586272546207e-07, "loss": 1.6885, "step": 1318 }, { "batch_num_effect_tokens": 2677, "batch_num_samples": 34, "batch_num_tokens": 16317, "epoch": 1.82624, "grad_norm": 0.3717966377735138, "learning_rate": 2.2674171106386312e-07, "loss": 1.4004, "step": 1319 }, { "batch_num_effect_tokens": 2929, "batch_num_samples": 41, "batch_num_tokens": 16337, "epoch": 1.82762, "grad_norm": 0.302685409784317, "learning_rate": 2.2315547816493698e-07, "loss": 1.4565, "step": 1320 }, { "batch_num_effect_tokens": 2478, "batch_num_samples": 29, "batch_num_tokens": 16346, "epoch": 1.82901, "grad_norm": 0.24434752762317657, "learning_rate": 2.1959718500454196e-07, "loss": 1.2021, "step": 1321 }, { "batch_num_effect_tokens": 2597, "batch_num_samples": 37, "batch_num_tokens": 16383, "epoch": 1.83039, "grad_norm": 0.4066220223903656, "learning_rate": 2.1606685239511537e-07, "loss": 1.5728, "step": 1322 }, { "batch_num_effect_tokens": 2815, "batch_num_samples": 34, "batch_num_tokens": 16342, "epoch": 1.83178, "grad_norm": 0.47096335887908936, "learning_rate": 2.1256450098555426e-07, "loss": 1.396, "step": 1323 }, { "batch_num_effect_tokens": 2165, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.83316, "grad_norm": 0.6552850604057312, "learning_rate": 2.0909015126109488e-07, "loss": 1.6084, "step": 1324 }, { "batch_num_effect_tokens": 1922, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.83454, "grad_norm": 0.45447444915771484, "learning_rate": 2.0564382354318791e-07, "loss": 1.5085, "step": 1325 }, { "batch_num_effect_tokens": 3270, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.83593, "grad_norm": 0.3933018445968628, "learning_rate": 2.0222553798938836e-07, "loss": 1.6211, "step": 1326 }, { "batch_num_effect_tokens": 3262, "batch_num_samples": 46, "batch_num_tokens": 16286, "epoch": 1.83731, "grad_norm": 0.34016117453575134, "learning_rate": 1.988353145932298e-07, "loss": 1.5771, "step": 1327 }, { "batch_num_effect_tokens": 3559, "batch_num_samples": 38, "batch_num_tokens": 16277, "epoch": 1.8387, "grad_norm": 0.3652132451534271, "learning_rate": 1.954731731841114e-07, "loss": 1.4426, "step": 1328 }, { "batch_num_effect_tokens": 2334, "batch_num_samples": 29, "batch_num_tokens": 16310, "epoch": 1.84008, "grad_norm": 0.48497626185417175, "learning_rate": 1.9213913342717995e-07, "loss": 2.0366, "step": 1329 }, { "batch_num_effect_tokens": 2647, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.84147, "grad_norm": 0.3975164592266083, "learning_rate": 1.8883321482321583e-07, "loss": 1.75, "step": 1330 }, { "batch_num_effect_tokens": 2961, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.84285, "grad_norm": 0.40051260590553284, "learning_rate": 1.855554367085216e-07, "loss": 1.5605, "step": 1331 }, { "batch_num_effect_tokens": 2415, "batch_num_samples": 29, "batch_num_tokens": 16274, "epoch": 1.84424, "grad_norm": 0.42127829790115356, "learning_rate": 1.8230581825480264e-07, "loss": 1.5854, "step": 1332 }, { "batch_num_effect_tokens": 2548, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.84562, "grad_norm": 0.32183176279067993, "learning_rate": 1.7908437846906158e-07, "loss": 1.4448, "step": 1333 }, { "batch_num_effect_tokens": 2310, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.84701, "grad_norm": 0.4180091619491577, "learning_rate": 1.7589113619348174e-07, "loss": 1.5786, "step": 1334 }, { "batch_num_effect_tokens": 2072, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 1.84839, "grad_norm": 0.3970617651939392, "learning_rate": 1.7272611010532014e-07, "loss": 1.3838, "step": 1335 }, { "batch_num_effect_tokens": 1964, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.84978, "grad_norm": 0.30761829018592834, "learning_rate": 1.6958931871679908e-07, "loss": 1.2729, "step": 1336 }, { "batch_num_effect_tokens": 2440, "batch_num_samples": 28, "batch_num_tokens": 16352, "epoch": 1.85116, "grad_norm": 0.4379083514213562, "learning_rate": 1.664807803749935e-07, "loss": 1.5735, "step": 1337 }, { "batch_num_effect_tokens": 3598, "batch_num_samples": 48, "batch_num_tokens": 16383, "epoch": 1.85254, "grad_norm": 0.3583574593067169, "learning_rate": 1.6340051326172834e-07, "loss": 1.458, "step": 1338 }, { "batch_num_effect_tokens": 2576, "batch_num_samples": 32, "batch_num_tokens": 16288, "epoch": 1.85393, "grad_norm": 0.31333377957344055, "learning_rate": 1.603485353934703e-07, "loss": 1.2212, "step": 1339 }, { "batch_num_effect_tokens": 2499, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.85531, "grad_norm": 0.4568358361721039, "learning_rate": 1.5732486462122166e-07, "loss": 1.416, "step": 1340 }, { "batch_num_effect_tokens": 2629, "batch_num_samples": 35, "batch_num_tokens": 16278, "epoch": 1.8567, "grad_norm": 0.42600491642951965, "learning_rate": 1.5432951863041666e-07, "loss": 1.7529, "step": 1341 }, { "batch_num_effect_tokens": 2862, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.85808, "grad_norm": 0.4039027690887451, "learning_rate": 1.5136251494081822e-07, "loss": 1.7744, "step": 1342 }, { "batch_num_effect_tokens": 2666, "batch_num_samples": 58, "batch_num_tokens": 16288, "epoch": 1.85947, "grad_norm": 0.437547504901886, "learning_rate": 1.484238709064162e-07, "loss": 1.6021, "step": 1343 }, { "batch_num_effect_tokens": 2110, "batch_num_samples": 29, "batch_num_tokens": 16284, "epoch": 1.86085, "grad_norm": 0.38135555386543274, "learning_rate": 1.4551360371532264e-07, "loss": 1.4116, "step": 1344 }, { "batch_num_effect_tokens": 3152, "batch_num_samples": 44, "batch_num_tokens": 16384, "epoch": 1.86224, "grad_norm": 0.38640305399894714, "learning_rate": 1.4263173038967627e-07, "loss": 1.6948, "step": 1345 }, { "batch_num_effect_tokens": 2317, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 1.86362, "grad_norm": 0.4266186058521271, "learning_rate": 1.3977826778553805e-07, "loss": 1.542, "step": 1346 }, { "batch_num_effect_tokens": 2930, "batch_num_samples": 50, "batch_num_tokens": 16382, "epoch": 1.86501, "grad_norm": 0.39876288175582886, "learning_rate": 1.3695323259279748e-07, "loss": 1.7573, "step": 1347 }, { "batch_num_effect_tokens": 2493, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.86639, "grad_norm": 0.48447251319885254, "learning_rate": 1.3415664133506812e-07, "loss": 1.7529, "step": 1348 }, { "batch_num_effect_tokens": 2391, "batch_num_samples": 30, "batch_num_tokens": 16340, "epoch": 1.86777, "grad_norm": 0.4109746813774109, "learning_rate": 1.3138851036959998e-07, "loss": 1.5928, "step": 1349 }, { "batch_num_effect_tokens": 2526, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.86916, "grad_norm": 0.42457881569862366, "learning_rate": 1.2864885588717512e-07, "loss": 1.624, "step": 1350 }, { "batch_num_effect_tokens": 2572, "batch_num_samples": 29, "batch_num_tokens": 16356, "epoch": 1.87054, "grad_norm": 0.40381577610969543, "learning_rate": 1.2593769391201827e-07, "loss": 1.6152, "step": 1351 }, { "batch_num_effect_tokens": 2414, "batch_num_samples": 29, "batch_num_tokens": 16291, "epoch": 1.87193, "grad_norm": 0.2693682909011841, "learning_rate": 1.2325504030170243e-07, "loss": 1.4531, "step": 1352 }, { "batch_num_effect_tokens": 2318, "batch_num_samples": 31, "batch_num_tokens": 16321, "epoch": 1.87331, "grad_norm": 0.4909821152687073, "learning_rate": 1.206009107470535e-07, "loss": 1.8057, "step": 1353 }, { "batch_num_effect_tokens": 2015, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.8747, "grad_norm": 0.43076270818710327, "learning_rate": 1.1797532077206187e-07, "loss": 1.6768, "step": 1354 }, { "batch_num_effect_tokens": 2606, "batch_num_samples": 30, "batch_num_tokens": 16295, "epoch": 1.87608, "grad_norm": 0.43280109763145447, "learning_rate": 1.1537828573378929e-07, "loss": 1.8218, "step": 1355 }, { "batch_num_effect_tokens": 2368, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.87747, "grad_norm": 0.44611725211143494, "learning_rate": 1.1280982082228054e-07, "loss": 1.7119, "step": 1356 }, { "batch_num_effect_tokens": 2377, "batch_num_samples": 45, "batch_num_tokens": 16382, "epoch": 1.87885, "grad_norm": 0.4284569323062897, "learning_rate": 1.1026994106047296e-07, "loss": 1.9146, "step": 1357 }, { "batch_num_effect_tokens": 3051, "batch_num_samples": 29, "batch_num_tokens": 16356, "epoch": 1.88024, "grad_norm": 0.4025396406650543, "learning_rate": 1.0775866130410928e-07, "loss": 1.6182, "step": 1358 }, { "batch_num_effect_tokens": 3282, "batch_num_samples": 42, "batch_num_tokens": 16284, "epoch": 1.88162, "grad_norm": 0.3324005901813507, "learning_rate": 1.0527599624165275e-07, "loss": 1.7441, "step": 1359 }, { "batch_num_effect_tokens": 2697, "batch_num_samples": 41, "batch_num_tokens": 16346, "epoch": 1.883, "grad_norm": 0.3120698034763336, "learning_rate": 1.0282196039419823e-07, "loss": 1.1558, "step": 1360 }, { "batch_num_effect_tokens": 2198, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.88439, "grad_norm": 0.45892927050590515, "learning_rate": 1.0039656811538789e-07, "loss": 1.8909, "step": 1361 }, { "batch_num_effect_tokens": 2376, "batch_num_samples": 43, "batch_num_tokens": 16347, "epoch": 1.88577, "grad_norm": 0.47789859771728516, "learning_rate": 9.799983359132848e-08, "loss": 1.8931, "step": 1362 }, { "batch_num_effect_tokens": 2932, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.88716, "grad_norm": 0.42095983028411865, "learning_rate": 9.5631770840508e-08, "loss": 1.8535, "step": 1363 }, { "batch_num_effect_tokens": 2444, "batch_num_samples": 36, "batch_num_tokens": 16384, "epoch": 1.88854, "grad_norm": 0.4561365842819214, "learning_rate": 9.329239371371312e-08, "loss": 1.7251, "step": 1364 }, { "batch_num_effect_tokens": 2045, "batch_num_samples": 29, "batch_num_tokens": 16382, "epoch": 1.88993, "grad_norm": 0.3010897636413574, "learning_rate": 9.098171589394855e-08, "loss": 1.1938, "step": 1365 }, { "batch_num_effect_tokens": 2320, "batch_num_samples": 37, "batch_num_tokens": 16383, "epoch": 1.89131, "grad_norm": 0.416601300239563, "learning_rate": 8.869975089635552e-08, "loss": 1.4932, "step": 1366 }, { "batch_num_effect_tokens": 2541, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 1.8927, "grad_norm": 0.43347445130348206, "learning_rate": 8.644651206813625e-08, "loss": 1.2144, "step": 1367 }, { "batch_num_effect_tokens": 2628, "batch_num_samples": 31, "batch_num_tokens": 16214, "epoch": 1.89408, "grad_norm": 0.41827115416526794, "learning_rate": 8.422201258847351e-08, "loss": 1.5366, "step": 1368 }, { "batch_num_effect_tokens": 2114, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.89547, "grad_norm": 0.47529298067092896, "learning_rate": 8.202626546845172e-08, "loss": 1.7612, "step": 1369 }, { "batch_num_effect_tokens": 2331, "batch_num_samples": 37, "batch_num_tokens": 16284, "epoch": 1.89685, "grad_norm": 0.323431134223938, "learning_rate": 7.985928355098483e-08, "loss": 1.5742, "step": 1370 }, { "batch_num_effect_tokens": 2294, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.89823, "grad_norm": 0.31048235297203064, "learning_rate": 7.77210795107386e-08, "loss": 1.7563, "step": 1371 }, { "batch_num_effect_tokens": 2301, "batch_num_samples": 37, "batch_num_tokens": 16384, "epoch": 1.89962, "grad_norm": 0.3119829297065735, "learning_rate": 7.561166585405789e-08, "loss": 1.3948, "step": 1372 }, { "batch_num_effect_tokens": 2647, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.901, "grad_norm": 0.46493998169898987, "learning_rate": 7.353105491889112e-08, "loss": 1.6055, "step": 1373 }, { "batch_num_effect_tokens": 2236, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.90239, "grad_norm": 0.4489620327949524, "learning_rate": 7.147925887472096e-08, "loss": 1.5132, "step": 1374 }, { "batch_num_effect_tokens": 2555, "batch_num_samples": 28, "batch_num_tokens": 16307, "epoch": 1.90377, "grad_norm": 0.38894128799438477, "learning_rate": 6.945628972249208e-08, "loss": 1.5557, "step": 1375 }, { "batch_num_effect_tokens": 2611, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.90516, "grad_norm": 0.4380820691585541, "learning_rate": 6.746215929454014e-08, "loss": 1.6064, "step": 1376 }, { "batch_num_effect_tokens": 2433, "batch_num_samples": 31, "batch_num_tokens": 16234, "epoch": 1.90654, "grad_norm": 0.44756197929382324, "learning_rate": 6.549687925452408e-08, "loss": 1.6855, "step": 1377 }, { "batch_num_effect_tokens": 2156, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.90793, "grad_norm": 0.4736037254333496, "learning_rate": 6.356046109735614e-08, "loss": 1.6812, "step": 1378 }, { "batch_num_effect_tokens": 2485, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.90931, "grad_norm": 0.3162359893321991, "learning_rate": 6.165291614913527e-08, "loss": 1.2229, "step": 1379 }, { "batch_num_effect_tokens": 2222, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.9107, "grad_norm": 0.3159218430519104, "learning_rate": 5.977425556708327e-08, "loss": 1.7632, "step": 1380 }, { "batch_num_effect_tokens": 2588, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.91208, "grad_norm": 0.3981672525405884, "learning_rate": 5.7924490339474335e-08, "loss": 1.6343, "step": 1381 }, { "batch_num_effect_tokens": 4259, "batch_num_samples": 50, "batch_num_tokens": 16369, "epoch": 1.91346, "grad_norm": 0.42232558131217957, "learning_rate": 5.610363128557727e-08, "loss": 2.5764, "step": 1382 }, { "batch_num_effect_tokens": 2864, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.91485, "grad_norm": 0.29273852705955505, "learning_rate": 5.431168905558559e-08, "loss": 1.2611, "step": 1383 }, { "batch_num_effect_tokens": 2497, "batch_num_samples": 40, "batch_num_tokens": 16384, "epoch": 1.91623, "grad_norm": 0.5028719902038574, "learning_rate": 5.2548674130561974e-08, "loss": 1.4668, "step": 1384 }, { "batch_num_effect_tokens": 1929, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.91762, "grad_norm": 0.28879842162132263, "learning_rate": 5.081459682236889e-08, "loss": 1.178, "step": 1385 }, { "batch_num_effect_tokens": 2166, "batch_num_samples": 35, "batch_num_tokens": 16384, "epoch": 1.919, "grad_norm": 0.2848933935165405, "learning_rate": 4.910946727361754e-08, "loss": 1.3677, "step": 1386 }, { "batch_num_effect_tokens": 3436, "batch_num_samples": 37, "batch_num_tokens": 16383, "epoch": 1.92039, "grad_norm": 0.37695643305778503, "learning_rate": 4.743329545760122e-08, "loss": 1.6172, "step": 1387 }, { "batch_num_effect_tokens": 1860, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.92177, "grad_norm": 0.4672938883304596, "learning_rate": 4.578609117823873e-08, "loss": 1.7222, "step": 1388 }, { "batch_num_effect_tokens": 2354, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.92316, "grad_norm": 0.4956672489643097, "learning_rate": 4.4167864070021605e-08, "loss": 1.6177, "step": 1389 }, { "batch_num_effect_tokens": 2774, "batch_num_samples": 28, "batch_num_tokens": 16371, "epoch": 1.92454, "grad_norm": 0.30416035652160645, "learning_rate": 4.2578623597949174e-08, "loss": 1.3018, "step": 1390 }, { "batch_num_effect_tokens": 2940, "batch_num_samples": 45, "batch_num_tokens": 16332, "epoch": 1.92593, "grad_norm": 0.48157092928886414, "learning_rate": 4.1018379057482517e-08, "loss": 1.8633, "step": 1391 }, { "batch_num_effect_tokens": 2180, "batch_num_samples": 29, "batch_num_tokens": 16274, "epoch": 1.92731, "grad_norm": 0.40937340259552, "learning_rate": 3.94871395744828e-08, "loss": 1.5903, "step": 1392 }, { "batch_num_effect_tokens": 2258, "batch_num_samples": 30, "batch_num_tokens": 16271, "epoch": 1.9287, "grad_norm": 0.46866804361343384, "learning_rate": 3.7984914105162474e-08, "loss": 1.5891, "step": 1393 }, { "batch_num_effect_tokens": 3214, "batch_num_samples": 39, "batch_num_tokens": 16383, "epoch": 1.93008, "grad_norm": 0.341865599155426, "learning_rate": 3.651171143602972e-08, "loss": 1.7964, "step": 1394 }, { "batch_num_effect_tokens": 2650, "batch_num_samples": 36, "batch_num_tokens": 16292, "epoch": 1.93146, "grad_norm": 0.4382224380970001, "learning_rate": 3.5067540183839064e-08, "loss": 1.606, "step": 1395 }, { "batch_num_effect_tokens": 2302, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.93285, "grad_norm": 0.4038018584251404, "learning_rate": 3.365240879554144e-08, "loss": 1.71, "step": 1396 }, { "batch_num_effect_tokens": 2438, "batch_num_samples": 30, "batch_num_tokens": 16286, "epoch": 1.93423, "grad_norm": 0.47802260518074036, "learning_rate": 3.22663255482325e-08, "loss": 1.397, "step": 1397 }, { "batch_num_effect_tokens": 2168, "batch_num_samples": 31, "batch_num_tokens": 16382, "epoch": 1.93562, "grad_norm": 0.30471113324165344, "learning_rate": 3.090929854910552e-08, "loss": 1.5837, "step": 1398 }, { "batch_num_effect_tokens": 2660, "batch_num_samples": 48, "batch_num_tokens": 16312, "epoch": 1.937, "grad_norm": 0.4145640730857849, "learning_rate": 2.9581335735404672e-08, "loss": 1.7837, "step": 1399 }, { "batch_num_effect_tokens": 3072, "batch_num_samples": 42, "batch_num_tokens": 16384, "epoch": 1.93839, "grad_norm": 0.40897655487060547, "learning_rate": 2.828244487437737e-08, "loss": 1.4702, "step": 1400 }, { "batch_num_effect_tokens": 2590, "batch_num_samples": 29, "batch_num_tokens": 16273, "epoch": 1.93977, "grad_norm": 0.4332371652126312, "learning_rate": 2.7012633563229808e-08, "loss": 1.6792, "step": 1401 }, { "batch_num_effect_tokens": 2213, "batch_num_samples": 30, "batch_num_tokens": 16384, "epoch": 1.94116, "grad_norm": 0.4699755311012268, "learning_rate": 2.577190922908035e-08, "loss": 1.3296, "step": 1402 }, { "batch_num_effect_tokens": 2752, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.94254, "grad_norm": 0.43317264318466187, "learning_rate": 2.4560279128921226e-08, "loss": 1.6655, "step": 1403 }, { "batch_num_effect_tokens": 2311, "batch_num_samples": 30, "batch_num_tokens": 16287, "epoch": 1.94393, "grad_norm": 0.41715502738952637, "learning_rate": 2.337775034956913e-08, "loss": 1.6421, "step": 1404 }, { "batch_num_effect_tokens": 2896, "batch_num_samples": 41, "batch_num_tokens": 16384, "epoch": 1.94531, "grad_norm": 0.3833671808242798, "learning_rate": 2.222432980762912e-08, "loss": 1.4033, "step": 1405 }, { "batch_num_effect_tokens": 2173, "batch_num_samples": 28, "batch_num_tokens": 16365, "epoch": 1.94669, "grad_norm": 0.43336039781570435, "learning_rate": 2.1100024249451347e-08, "loss": 1.7769, "step": 1406 }, { "batch_num_effect_tokens": 3366, "batch_num_samples": 50, "batch_num_tokens": 16340, "epoch": 1.94808, "grad_norm": 0.4162695109844208, "learning_rate": 2.0004840251093284e-08, "loss": 1.7275, "step": 1407 }, { "batch_num_effect_tokens": 2213, "batch_num_samples": 34, "batch_num_tokens": 16367, "epoch": 1.94946, "grad_norm": 0.337968111038208, "learning_rate": 1.8938784218281435e-08, "loss": 1.0703, "step": 1408 }, { "batch_num_effect_tokens": 2156, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.95085, "grad_norm": 0.46727487444877625, "learning_rate": 1.790186238637026e-08, "loss": 1.7842, "step": 1409 }, { "batch_num_effect_tokens": 2530, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.95223, "grad_norm": 0.3034614622592926, "learning_rate": 1.689408082031163e-08, "loss": 1.6934, "step": 1410 }, { "batch_num_effect_tokens": 2477, "batch_num_samples": 39, "batch_num_tokens": 16374, "epoch": 1.95362, "grad_norm": 0.4455222487449646, "learning_rate": 1.5915445414613208e-08, "loss": 1.873, "step": 1411 }, { "batch_num_effect_tokens": 2797, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 1.955, "grad_norm": 0.44405028223991394, "learning_rate": 1.496596189331012e-08, "loss": 1.6484, "step": 1412 }, { "batch_num_effect_tokens": 2136, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.95639, "grad_norm": 0.2896287441253662, "learning_rate": 1.4045635809925018e-08, "loss": 1.2754, "step": 1413 }, { "batch_num_effect_tokens": 3453, "batch_num_samples": 38, "batch_num_tokens": 16384, "epoch": 1.95777, "grad_norm": 0.38841456174850464, "learning_rate": 1.3154472547440289e-08, "loss": 1.6353, "step": 1414 }, { "batch_num_effect_tokens": 2498, "batch_num_samples": 33, "batch_num_tokens": 16384, "epoch": 1.95916, "grad_norm": 0.4301367700099945, "learning_rate": 1.2292477318266438e-08, "loss": 1.7524, "step": 1415 }, { "batch_num_effect_tokens": 2311, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.96054, "grad_norm": 0.31144705414772034, "learning_rate": 1.1459655164208216e-08, "loss": 1.4004, "step": 1416 }, { "batch_num_effect_tokens": 1802, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.96192, "grad_norm": 0.44672027230262756, "learning_rate": 1.0656010956437979e-08, "loss": 1.5171, "step": 1417 }, { "batch_num_effect_tokens": 2306, "batch_num_samples": 32, "batch_num_tokens": 16384, "epoch": 1.96331, "grad_norm": 0.49117809534072876, "learning_rate": 9.881549395466262e-09, "loss": 1.5449, "step": 1418 }, { "batch_num_effect_tokens": 2447, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.96469, "grad_norm": 0.5049731135368347, "learning_rate": 9.136275011115137e-09, "loss": 1.6785, "step": 1419 }, { "batch_num_effect_tokens": 3304, "batch_num_samples": 30, "batch_num_tokens": 16340, "epoch": 1.96608, "grad_norm": 0.4054308235645294, "learning_rate": 8.42019216249046e-09, "loss": 1.6548, "step": 1420 }, { "batch_num_effect_tokens": 2444, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.96746, "grad_norm": 0.37163886427879333, "learning_rate": 7.733305037957439e-09, "loss": 1.5459, "step": 1421 }, { "batch_num_effect_tokens": 2051, "batch_num_samples": 36, "batch_num_tokens": 16346, "epoch": 1.96885, "grad_norm": 0.4467935562133789, "learning_rate": 7.0756176551145525e-09, "loss": 1.354, "step": 1422 }, { "batch_num_effect_tokens": 1991, "batch_num_samples": 29, "batch_num_tokens": 16384, "epoch": 1.97023, "grad_norm": 0.46855422854423523, "learning_rate": 6.447133860771893e-09, "loss": 1.3618, "step": 1423 }, { "batch_num_effect_tokens": 2670, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.97162, "grad_norm": 0.4507562220096588, "learning_rate": 5.8478573309284085e-09, "loss": 1.5864, "step": 1424 }, { "batch_num_effect_tokens": 3049, "batch_num_samples": 28, "batch_num_tokens": 16384, "epoch": 1.973, "grad_norm": 0.38302040100097656, "learning_rate": 5.2777915707491465e-09, "loss": 1.5195, "step": 1425 }, { "batch_num_effect_tokens": 2254, "batch_num_samples": 29, "batch_num_tokens": 16354, "epoch": 1.97439, "grad_norm": 0.2951429486274719, "learning_rate": 4.736939914545824e-09, "loss": 1.3491, "step": 1426 }, { "batch_num_effect_tokens": 2243, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.97577, "grad_norm": 0.43255022168159485, "learning_rate": 4.225305525756285e-09, "loss": 1.5986, "step": 1427 }, { "batch_num_effect_tokens": 2208, "batch_num_samples": 30, "batch_num_tokens": 16328, "epoch": 1.97715, "grad_norm": 0.3145412802696228, "learning_rate": 3.7428913969284055e-09, "loss": 1.3574, "step": 1428 }, { "batch_num_effect_tokens": 2005, "batch_num_samples": 29, "batch_num_tokens": 16352, "epoch": 1.97854, "grad_norm": 0.4780865013599396, "learning_rate": 3.289700349698999e-09, "loss": 1.8184, "step": 1429 }, { "batch_num_effect_tokens": 3725, "batch_num_samples": 48, "batch_num_tokens": 16352, "epoch": 1.97992, "grad_norm": 0.3435876667499542, "learning_rate": 2.8657350347810473e-09, "loss": 0.9832, "step": 1430 }, { "batch_num_effect_tokens": 2802, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.98131, "grad_norm": 0.42606887221336365, "learning_rate": 2.4709979319442743e-09, "loss": 1.5957, "step": 1431 }, { "batch_num_effect_tokens": 2151, "batch_num_samples": 28, "batch_num_tokens": 16383, "epoch": 1.98269, "grad_norm": 0.4298803508281708, "learning_rate": 2.1054913500051512e-09, "loss": 1.5195, "step": 1432 }, { "batch_num_effect_tokens": 2178, "batch_num_samples": 30, "batch_num_tokens": 16364, "epoch": 1.98408, "grad_norm": 0.43839871883392334, "learning_rate": 1.7692174268091334e-09, "loss": 1.4263, "step": 1433 }, { "batch_num_effect_tokens": 2025, "batch_num_samples": 33, "batch_num_tokens": 16279, "epoch": 1.98546, "grad_norm": 0.3629027307033539, "learning_rate": 1.4621781292201155e-09, "loss": 1.4614, "step": 1434 }, { "batch_num_effect_tokens": 2687, "batch_num_samples": 30, "batch_num_tokens": 16364, "epoch": 1.98685, "grad_norm": 0.4128134548664093, "learning_rate": 1.1843752531104368e-09, "loss": 1.6504, "step": 1435 }, { "batch_num_effect_tokens": 2854, "batch_num_samples": 30, "batch_num_tokens": 16343, "epoch": 1.98823, "grad_norm": 0.38200539350509644, "learning_rate": 9.358104233470055e-10, "loss": 1.8081, "step": 1436 }, { "batch_num_effect_tokens": 2069, "batch_num_samples": 28, "batch_num_tokens": 16378, "epoch": 1.98962, "grad_norm": 0.3999119699001312, "learning_rate": 7.164850937840806e-10, "loss": 1.3359, "step": 1437 }, { "batch_num_effect_tokens": 2309, "batch_num_samples": 31, "batch_num_tokens": 16384, "epoch": 1.991, "grad_norm": 0.43412071466445923, "learning_rate": 5.264005472549461e-10, "loss": 1.5327, "step": 1438 }, { "batch_num_effect_tokens": 2750, "batch_num_samples": 29, "batch_num_tokens": 16274, "epoch": 1.99238, "grad_norm": 0.4243752062320709, "learning_rate": 3.655578955624739e-10, "loss": 1.6328, "step": 1439 }, { "batch_num_effect_tokens": 3208, "batch_num_samples": 34, "batch_num_tokens": 16362, "epoch": 1.99377, "grad_norm": 0.36154425144195557, "learning_rate": 2.339580794752383e-10, "loss": 1.6782, "step": 1440 }, { "batch_num_effect_tokens": 2701, "batch_num_samples": 31, "batch_num_tokens": 16372, "epoch": 1.99515, "grad_norm": 0.3267018496990204, "learning_rate": 1.316018687191889e-10, "loss": 1.291, "step": 1441 }, { "batch_num_effect_tokens": 2974, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.99654, "grad_norm": 0.3666023015975952, "learning_rate": 5.848986197487528e-11, "loss": 1.4873, "step": 1442 }, { "batch_num_effect_tokens": 2422, "batch_num_samples": 44, "batch_num_tokens": 16383, "epoch": 1.99792, "grad_norm": 0.48749086260795593, "learning_rate": 1.4622486875226494e-11, "loss": 1.5444, "step": 1443 }, { "batch_num_effect_tokens": 2442, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.99931, "grad_norm": 0.3050819933414459, "learning_rate": 0.0, "loss": 1.1328, "step": 1444 }, { "batch_num_effect_tokens": 2442, "batch_num_samples": 34, "batch_num_tokens": 16384, "epoch": 1.99931, "eval_eval_loss": 0.4288076162338257, "eval_eval_runtime": 105.4941, "eval_eval_samples_per_second": 45.955, "eval_eval_steps_per_second": 2.872, "step": 1444 } ], "logging_steps": 1.0, "max_steps": 1444, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }