diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,70117 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.984008395592314, + "eval_steps": 1500, + "global_step": 10004, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019989505509607455, + "grad_norm": 12.8125, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.476, + "step": 1 + }, + { + "epoch": 0.00019989505509607455, + "eval_loss": 1.7460540533065796, + "eval_runtime": 595.8959, + "eval_samples_per_second": 3.588, + "eval_steps_per_second": 3.588, + "step": 1 + }, + { + "epoch": 0.0003997901101921491, + "grad_norm": 12.625, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.6568, + "step": 2 + }, + { + "epoch": 0.0005996851652882237, + "grad_norm": 13.125, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.5199, + "step": 3 + }, + { + "epoch": 0.0007995802203842982, + "grad_norm": 11.75, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.5059, + "step": 4 + }, + { + "epoch": 0.0009994752754803728, + "grad_norm": 10.125, + "learning_rate": 5.000000000000001e-07, + "loss": 1.568, + "step": 5 + }, + { + "epoch": 0.0011993703305764475, + "grad_norm": 10.5625, + "learning_rate": 6.000000000000001e-07, + "loss": 1.5257, + "step": 6 + }, + { + "epoch": 0.001399265385672522, + "grad_norm": 10.6875, + "learning_rate": 7.000000000000001e-07, + "loss": 1.4327, + "step": 7 + }, + { + "epoch": 0.0015991604407685964, + "grad_norm": 9.1875, + "learning_rate": 8.000000000000001e-07, + "loss": 1.5052, + "step": 8 + }, + { + "epoch": 0.001799055495864671, + "grad_norm": 9.5625, + "learning_rate": 9.000000000000001e-07, + "loss": 1.5233, + "step": 9 + }, + { + "epoch": 0.0019989505509607455, + "grad_norm": 8.875, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5549, + "step": 10 + }, + { + "epoch": 0.0021988456060568202, + "grad_norm": 7.4375, + "learning_rate": 1.1e-06, + "loss": 1.5139, + "step": 11 + }, + { + "epoch": 0.002398740661152895, + "grad_norm": 6.625, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.4168, + "step": 12 + }, + { + "epoch": 0.002598635716248969, + "grad_norm": 6.65625, + "learning_rate": 1.3e-06, + "loss": 1.452, + "step": 13 + }, + { + "epoch": 0.002798530771345044, + "grad_norm": 5.6875, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.384, + "step": 14 + }, + { + "epoch": 0.0029984258264411185, + "grad_norm": 5.1875, + "learning_rate": 1.5e-06, + "loss": 1.4018, + "step": 15 + }, + { + "epoch": 0.003198320881537193, + "grad_norm": 5.09375, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.3572, + "step": 16 + }, + { + "epoch": 0.0033982159366332675, + "grad_norm": 4.9375, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.3639, + "step": 17 + }, + { + "epoch": 0.003598110991729342, + "grad_norm": 4.84375, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.4174, + "step": 18 + }, + { + "epoch": 0.003798006046825417, + "grad_norm": 4.625, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.3139, + "step": 19 + }, + { + "epoch": 0.003997901101921491, + "grad_norm": 4.4375, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.468, + "step": 20 + }, + { + "epoch": 0.004197796157017566, + "grad_norm": 4.09375, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.3801, + "step": 21 + }, + { + "epoch": 0.0043976912121136405, + "grad_norm": 3.609375, + "learning_rate": 2.2e-06, + "loss": 1.3115, + "step": 22 + }, + { + "epoch": 0.004597586267209715, + "grad_norm": 3.96875, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.4058, + "step": 23 + }, + { + "epoch": 0.00479748132230579, + "grad_norm": 3.828125, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.3053, + "step": 24 + }, + { + "epoch": 0.004997376377401864, + "grad_norm": 3.671875, + "learning_rate": 2.5e-06, + "loss": 1.285, + "step": 25 + }, + { + "epoch": 0.005197271432497938, + "grad_norm": 3.8125, + "learning_rate": 2.6e-06, + "loss": 1.4823, + "step": 26 + }, + { + "epoch": 0.005397166487594013, + "grad_norm": 3.484375, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.3546, + "step": 27 + }, + { + "epoch": 0.005597061542690088, + "grad_norm": 3.53125, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.3501, + "step": 28 + }, + { + "epoch": 0.005796956597786162, + "grad_norm": 3.8125, + "learning_rate": 2.9e-06, + "loss": 1.3478, + "step": 29 + }, + { + "epoch": 0.005996851652882237, + "grad_norm": 3.28125, + "learning_rate": 3e-06, + "loss": 1.2747, + "step": 30 + }, + { + "epoch": 0.006196746707978312, + "grad_norm": 3.515625, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.4141, + "step": 31 + }, + { + "epoch": 0.006396641763074386, + "grad_norm": 3.3125, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.3632, + "step": 32 + }, + { + "epoch": 0.00659653681817046, + "grad_norm": 2.90625, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.2251, + "step": 33 + }, + { + "epoch": 0.006796431873266535, + "grad_norm": 3.03125, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.3892, + "step": 34 + }, + { + "epoch": 0.00699632692836261, + "grad_norm": 3.015625, + "learning_rate": 3.5e-06, + "loss": 1.3128, + "step": 35 + }, + { + "epoch": 0.007196221983458684, + "grad_norm": 2.765625, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.2615, + "step": 36 + }, + { + "epoch": 0.007396117038554759, + "grad_norm": 3.25, + "learning_rate": 3.7e-06, + "loss": 1.3802, + "step": 37 + }, + { + "epoch": 0.007596012093650834, + "grad_norm": 2.9375, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3061, + "step": 38 + }, + { + "epoch": 0.0077959071487469075, + "grad_norm": 2.828125, + "learning_rate": 3.900000000000001e-06, + "loss": 1.2044, + "step": 39 + }, + { + "epoch": 0.007995802203842982, + "grad_norm": 2.671875, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3376, + "step": 40 + }, + { + "epoch": 0.008195697258939058, + "grad_norm": 2.671875, + "learning_rate": 4.1e-06, + "loss": 1.3461, + "step": 41 + }, + { + "epoch": 0.008395592314035132, + "grad_norm": 2.53125, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.2514, + "step": 42 + }, + { + "epoch": 0.008595487369131205, + "grad_norm": 2.5625, + "learning_rate": 4.3e-06, + "loss": 1.2735, + "step": 43 + }, + { + "epoch": 0.008795382424227281, + "grad_norm": 2.578125, + "learning_rate": 4.4e-06, + "loss": 1.2665, + "step": 44 + }, + { + "epoch": 0.008995277479323355, + "grad_norm": 2.59375, + "learning_rate": 4.5e-06, + "loss": 1.3754, + "step": 45 + }, + { + "epoch": 0.00919517253441943, + "grad_norm": 2.578125, + "learning_rate": 4.600000000000001e-06, + "loss": 1.2434, + "step": 46 + }, + { + "epoch": 0.009395067589515504, + "grad_norm": 2.515625, + "learning_rate": 4.7e-06, + "loss": 1.3023, + "step": 47 + }, + { + "epoch": 0.00959496264461158, + "grad_norm": 2.4375, + "learning_rate": 4.800000000000001e-06, + "loss": 1.2093, + "step": 48 + }, + { + "epoch": 0.009794857699707653, + "grad_norm": 2.421875, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.3093, + "step": 49 + }, + { + "epoch": 0.009994752754803727, + "grad_norm": 2.5, + "learning_rate": 5e-06, + "loss": 1.299, + "step": 50 + }, + { + "epoch": 0.010194647809899803, + "grad_norm": 2.578125, + "learning_rate": 5.1e-06, + "loss": 1.2736, + "step": 51 + }, + { + "epoch": 0.010394542864995877, + "grad_norm": 2.546875, + "learning_rate": 5.2e-06, + "loss": 1.2607, + "step": 52 + }, + { + "epoch": 0.010594437920091952, + "grad_norm": 2.5, + "learning_rate": 5.300000000000001e-06, + "loss": 1.2765, + "step": 53 + }, + { + "epoch": 0.010794332975188026, + "grad_norm": 2.390625, + "learning_rate": 5.400000000000001e-06, + "loss": 1.2538, + "step": 54 + }, + { + "epoch": 0.010994228030284102, + "grad_norm": 2.328125, + "learning_rate": 5.500000000000001e-06, + "loss": 1.2165, + "step": 55 + }, + { + "epoch": 0.011194123085380175, + "grad_norm": 2.40625, + "learning_rate": 5.600000000000001e-06, + "loss": 1.2152, + "step": 56 + }, + { + "epoch": 0.01139401814047625, + "grad_norm": 2.375, + "learning_rate": 5.7e-06, + "loss": 1.2222, + "step": 57 + }, + { + "epoch": 0.011593913195572325, + "grad_norm": 2.375, + "learning_rate": 5.8e-06, + "loss": 1.2779, + "step": 58 + }, + { + "epoch": 0.011793808250668399, + "grad_norm": 2.375, + "learning_rate": 5.9e-06, + "loss": 1.239, + "step": 59 + }, + { + "epoch": 0.011993703305764474, + "grad_norm": 2.34375, + "learning_rate": 6e-06, + "loss": 1.3348, + "step": 60 + }, + { + "epoch": 0.012193598360860548, + "grad_norm": 2.28125, + "learning_rate": 6.1e-06, + "loss": 1.2105, + "step": 61 + }, + { + "epoch": 0.012393493415956624, + "grad_norm": 2.4375, + "learning_rate": 6.200000000000001e-06, + "loss": 1.2447, + "step": 62 + }, + { + "epoch": 0.012593388471052697, + "grad_norm": 2.453125, + "learning_rate": 6.300000000000001e-06, + "loss": 1.2812, + "step": 63 + }, + { + "epoch": 0.012793283526148771, + "grad_norm": 2.46875, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.1675, + "step": 64 + }, + { + "epoch": 0.012993178581244847, + "grad_norm": 2.390625, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.3107, + "step": 65 + }, + { + "epoch": 0.01319307363634092, + "grad_norm": 2.28125, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2161, + "step": 66 + }, + { + "epoch": 0.013392968691436996, + "grad_norm": 2.40625, + "learning_rate": 6.700000000000001e-06, + "loss": 1.2651, + "step": 67 + }, + { + "epoch": 0.01359286374653307, + "grad_norm": 2.4375, + "learning_rate": 6.800000000000001e-06, + "loss": 1.2281, + "step": 68 + }, + { + "epoch": 0.013792758801629145, + "grad_norm": 2.515625, + "learning_rate": 6.9e-06, + "loss": 1.3876, + "step": 69 + }, + { + "epoch": 0.01399265385672522, + "grad_norm": 2.578125, + "learning_rate": 7e-06, + "loss": 1.3301, + "step": 70 + }, + { + "epoch": 0.014192548911821293, + "grad_norm": 2.28125, + "learning_rate": 7.100000000000001e-06, + "loss": 1.234, + "step": 71 + }, + { + "epoch": 0.014392443966917369, + "grad_norm": 2.5, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.2861, + "step": 72 + }, + { + "epoch": 0.014592339022013442, + "grad_norm": 2.53125, + "learning_rate": 7.3e-06, + "loss": 1.2072, + "step": 73 + }, + { + "epoch": 0.014792234077109518, + "grad_norm": 2.484375, + "learning_rate": 7.4e-06, + "loss": 1.2894, + "step": 74 + }, + { + "epoch": 0.014992129132205592, + "grad_norm": 2.328125, + "learning_rate": 7.500000000000001e-06, + "loss": 1.1828, + "step": 75 + }, + { + "epoch": 0.015192024187301667, + "grad_norm": 2.34375, + "learning_rate": 7.600000000000001e-06, + "loss": 1.1208, + "step": 76 + }, + { + "epoch": 0.015391919242397741, + "grad_norm": 2.21875, + "learning_rate": 7.7e-06, + "loss": 1.1869, + "step": 77 + }, + { + "epoch": 0.015591814297493815, + "grad_norm": 2.453125, + "learning_rate": 7.800000000000002e-06, + "loss": 1.232, + "step": 78 + }, + { + "epoch": 0.01579170935258989, + "grad_norm": 2.3125, + "learning_rate": 7.9e-06, + "loss": 1.1701, + "step": 79 + }, + { + "epoch": 0.015991604407685964, + "grad_norm": 2.640625, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1675, + "step": 80 + }, + { + "epoch": 0.01619149946278204, + "grad_norm": 2.515625, + "learning_rate": 8.1e-06, + "loss": 1.3752, + "step": 81 + }, + { + "epoch": 0.016391394517878115, + "grad_norm": 2.453125, + "learning_rate": 8.2e-06, + "loss": 1.1496, + "step": 82 + }, + { + "epoch": 0.016591289572974188, + "grad_norm": 2.265625, + "learning_rate": 8.3e-06, + "loss": 1.1925, + "step": 83 + }, + { + "epoch": 0.016791184628070263, + "grad_norm": 2.5, + "learning_rate": 8.400000000000001e-06, + "loss": 1.2526, + "step": 84 + }, + { + "epoch": 0.01699107968316634, + "grad_norm": 2.296875, + "learning_rate": 8.5e-06, + "loss": 1.2725, + "step": 85 + }, + { + "epoch": 0.01719097473826241, + "grad_norm": 2.484375, + "learning_rate": 8.6e-06, + "loss": 1.2927, + "step": 86 + }, + { + "epoch": 0.017390869793358486, + "grad_norm": 2.375, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2508, + "step": 87 + }, + { + "epoch": 0.017590764848454562, + "grad_norm": 2.234375, + "learning_rate": 8.8e-06, + "loss": 1.164, + "step": 88 + }, + { + "epoch": 0.017790659903550637, + "grad_norm": 2.40625, + "learning_rate": 8.900000000000001e-06, + "loss": 1.2274, + "step": 89 + }, + { + "epoch": 0.01799055495864671, + "grad_norm": 2.234375, + "learning_rate": 9e-06, + "loss": 1.221, + "step": 90 + }, + { + "epoch": 0.018190450013742785, + "grad_norm": 2.3125, + "learning_rate": 9.100000000000001e-06, + "loss": 1.2804, + "step": 91 + }, + { + "epoch": 0.01839034506883886, + "grad_norm": 2.359375, + "learning_rate": 9.200000000000002e-06, + "loss": 1.1848, + "step": 92 + }, + { + "epoch": 0.018590240123934933, + "grad_norm": 2.3125, + "learning_rate": 9.3e-06, + "loss": 1.2262, + "step": 93 + }, + { + "epoch": 0.018790135179031008, + "grad_norm": 2.25, + "learning_rate": 9.4e-06, + "loss": 1.2016, + "step": 94 + }, + { + "epoch": 0.018990030234127084, + "grad_norm": 2.359375, + "learning_rate": 9.5e-06, + "loss": 1.2149, + "step": 95 + }, + { + "epoch": 0.01918992528922316, + "grad_norm": 2.578125, + "learning_rate": 9.600000000000001e-06, + "loss": 1.2341, + "step": 96 + }, + { + "epoch": 0.01938982034431923, + "grad_norm": 2.390625, + "learning_rate": 9.7e-06, + "loss": 1.2795, + "step": 97 + }, + { + "epoch": 0.019589715399415307, + "grad_norm": 2.21875, + "learning_rate": 9.800000000000001e-06, + "loss": 1.2028, + "step": 98 + }, + { + "epoch": 0.019789610454511383, + "grad_norm": 2.234375, + "learning_rate": 9.9e-06, + "loss": 1.1827, + "step": 99 + }, + { + "epoch": 0.019989505509607455, + "grad_norm": 2.328125, + "learning_rate": 1e-05, + "loss": 1.2169, + "step": 100 + }, + { + "epoch": 0.02018940056470353, + "grad_norm": 2.28125, + "learning_rate": 9.999999888950268e-06, + "loss": 1.1576, + "step": 101 + }, + { + "epoch": 0.020389295619799606, + "grad_norm": 2.234375, + "learning_rate": 9.999999555801075e-06, + "loss": 1.1927, + "step": 102 + }, + { + "epoch": 0.02058919067489568, + "grad_norm": 2.40625, + "learning_rate": 9.999999000552435e-06, + "loss": 1.1975, + "step": 103 + }, + { + "epoch": 0.020789085729991753, + "grad_norm": 2.390625, + "learning_rate": 9.999998223204373e-06, + "loss": 1.2024, + "step": 104 + }, + { + "epoch": 0.02098898078508783, + "grad_norm": 2.328125, + "learning_rate": 9.999997223756924e-06, + "loss": 1.267, + "step": 105 + }, + { + "epoch": 0.021188875840183904, + "grad_norm": 2.296875, + "learning_rate": 9.99999600221013e-06, + "loss": 1.2559, + "step": 106 + }, + { + "epoch": 0.021388770895279977, + "grad_norm": 2.40625, + "learning_rate": 9.999994558564052e-06, + "loss": 1.179, + "step": 107 + }, + { + "epoch": 0.021588665950376052, + "grad_norm": 2.34375, + "learning_rate": 9.999992892818746e-06, + "loss": 1.0606, + "step": 108 + }, + { + "epoch": 0.021788561005472128, + "grad_norm": 2.46875, + "learning_rate": 9.999991004974292e-06, + "loss": 1.1812, + "step": 109 + }, + { + "epoch": 0.021988456060568203, + "grad_norm": 2.265625, + "learning_rate": 9.999988895030772e-06, + "loss": 1.1652, + "step": 110 + }, + { + "epoch": 0.022188351115664275, + "grad_norm": 2.171875, + "learning_rate": 9.999986562988278e-06, + "loss": 1.1498, + "step": 111 + }, + { + "epoch": 0.02238824617076035, + "grad_norm": 2.359375, + "learning_rate": 9.999984008846914e-06, + "loss": 1.3272, + "step": 112 + }, + { + "epoch": 0.022588141225856426, + "grad_norm": 2.296875, + "learning_rate": 9.999981232606796e-06, + "loss": 1.2065, + "step": 113 + }, + { + "epoch": 0.0227880362809525, + "grad_norm": 2.296875, + "learning_rate": 9.999978234268047e-06, + "loss": 1.1944, + "step": 114 + }, + { + "epoch": 0.022987931336048574, + "grad_norm": 2.234375, + "learning_rate": 9.999975013830797e-06, + "loss": 1.1866, + "step": 115 + }, + { + "epoch": 0.02318782639114465, + "grad_norm": 2.1875, + "learning_rate": 9.99997157129519e-06, + "loss": 1.1906, + "step": 116 + }, + { + "epoch": 0.023387721446240725, + "grad_norm": 2.265625, + "learning_rate": 9.99996790666138e-06, + "loss": 1.1623, + "step": 117 + }, + { + "epoch": 0.023587616501336797, + "grad_norm": 2.53125, + "learning_rate": 9.99996401992953e-06, + "loss": 1.4069, + "step": 118 + }, + { + "epoch": 0.023787511556432873, + "grad_norm": 2.421875, + "learning_rate": 9.999959911099814e-06, + "loss": 1.1261, + "step": 119 + }, + { + "epoch": 0.02398740661152895, + "grad_norm": 2.515625, + "learning_rate": 9.999955580172411e-06, + "loss": 1.2397, + "step": 120 + }, + { + "epoch": 0.02418730166662502, + "grad_norm": 2.40625, + "learning_rate": 9.999951027147514e-06, + "loss": 1.3055, + "step": 121 + }, + { + "epoch": 0.024387196721721096, + "grad_norm": 2.484375, + "learning_rate": 9.999946252025329e-06, + "loss": 1.1119, + "step": 122 + }, + { + "epoch": 0.02458709177681717, + "grad_norm": 2.390625, + "learning_rate": 9.999941254806065e-06, + "loss": 1.2665, + "step": 123 + }, + { + "epoch": 0.024786986831913247, + "grad_norm": 2.328125, + "learning_rate": 9.999936035489943e-06, + "loss": 1.1914, + "step": 124 + }, + { + "epoch": 0.02498688188700932, + "grad_norm": 2.171875, + "learning_rate": 9.999930594077199e-06, + "loss": 1.2627, + "step": 125 + }, + { + "epoch": 0.025186776942105395, + "grad_norm": 2.453125, + "learning_rate": 9.99992493056807e-06, + "loss": 1.2579, + "step": 126 + }, + { + "epoch": 0.02538667199720147, + "grad_norm": 2.265625, + "learning_rate": 9.999919044962809e-06, + "loss": 1.2031, + "step": 127 + }, + { + "epoch": 0.025586567052297542, + "grad_norm": 2.296875, + "learning_rate": 9.999912937261679e-06, + "loss": 1.2567, + "step": 128 + }, + { + "epoch": 0.025786462107393618, + "grad_norm": 2.34375, + "learning_rate": 9.99990660746495e-06, + "loss": 1.1828, + "step": 129 + }, + { + "epoch": 0.025986357162489693, + "grad_norm": 2.203125, + "learning_rate": 9.999900055572904e-06, + "loss": 1.2224, + "step": 130 + }, + { + "epoch": 0.02618625221758577, + "grad_norm": 2.21875, + "learning_rate": 9.99989328158583e-06, + "loss": 1.1482, + "step": 131 + }, + { + "epoch": 0.02638614727268184, + "grad_norm": 2.359375, + "learning_rate": 9.999886285504033e-06, + "loss": 1.1253, + "step": 132 + }, + { + "epoch": 0.026586042327777917, + "grad_norm": 2.265625, + "learning_rate": 9.99987906732782e-06, + "loss": 1.1303, + "step": 133 + }, + { + "epoch": 0.026785937382873992, + "grad_norm": 2.359375, + "learning_rate": 9.999871627057511e-06, + "loss": 1.2155, + "step": 134 + }, + { + "epoch": 0.026985832437970064, + "grad_norm": 2.390625, + "learning_rate": 9.999863964693441e-06, + "loss": 1.1278, + "step": 135 + }, + { + "epoch": 0.02718572749306614, + "grad_norm": 2.40625, + "learning_rate": 9.999856080235947e-06, + "loss": 1.1915, + "step": 136 + }, + { + "epoch": 0.027385622548162215, + "grad_norm": 2.265625, + "learning_rate": 9.99984797368538e-06, + "loss": 1.1856, + "step": 137 + }, + { + "epoch": 0.02758551760325829, + "grad_norm": 2.265625, + "learning_rate": 9.9998396450421e-06, + "loss": 1.2587, + "step": 138 + }, + { + "epoch": 0.027785412658354363, + "grad_norm": 2.578125, + "learning_rate": 9.999831094306475e-06, + "loss": 1.2584, + "step": 139 + }, + { + "epoch": 0.02798530771345044, + "grad_norm": 2.359375, + "learning_rate": 9.999822321478889e-06, + "loss": 1.2475, + "step": 140 + }, + { + "epoch": 0.028185202768546514, + "grad_norm": 2.375, + "learning_rate": 9.999813326559728e-06, + "loss": 1.149, + "step": 141 + }, + { + "epoch": 0.028385097823642586, + "grad_norm": 3.03125, + "learning_rate": 9.999804109549397e-06, + "loss": 1.2239, + "step": 142 + }, + { + "epoch": 0.028584992878738662, + "grad_norm": 2.234375, + "learning_rate": 9.999794670448298e-06, + "loss": 1.1599, + "step": 143 + }, + { + "epoch": 0.028784887933834737, + "grad_norm": 2.328125, + "learning_rate": 9.999785009256853e-06, + "loss": 1.2722, + "step": 144 + }, + { + "epoch": 0.028984782988930813, + "grad_norm": 2.140625, + "learning_rate": 9.999775125975492e-06, + "loss": 1.1292, + "step": 145 + }, + { + "epoch": 0.029184678044026885, + "grad_norm": 2.171875, + "learning_rate": 9.999765020604655e-06, + "loss": 1.1786, + "step": 146 + }, + { + "epoch": 0.02938457309912296, + "grad_norm": 2.203125, + "learning_rate": 9.99975469314479e-06, + "loss": 1.1699, + "step": 147 + }, + { + "epoch": 0.029584468154219036, + "grad_norm": 2.328125, + "learning_rate": 9.999744143596354e-06, + "loss": 1.25, + "step": 148 + }, + { + "epoch": 0.029784363209315108, + "grad_norm": 2.28125, + "learning_rate": 9.99973337195982e-06, + "loss": 1.2246, + "step": 149 + }, + { + "epoch": 0.029984258264411184, + "grad_norm": 2.265625, + "learning_rate": 9.999722378235661e-06, + "loss": 1.173, + "step": 150 + }, + { + "epoch": 0.03018415331950726, + "grad_norm": 2.265625, + "learning_rate": 9.99971116242437e-06, + "loss": 1.1347, + "step": 151 + }, + { + "epoch": 0.030384048374603335, + "grad_norm": 2.109375, + "learning_rate": 9.99969972452644e-06, + "loss": 1.1403, + "step": 152 + }, + { + "epoch": 0.030583943429699407, + "grad_norm": 2.25, + "learning_rate": 9.999688064542385e-06, + "loss": 1.2141, + "step": 153 + }, + { + "epoch": 0.030783838484795482, + "grad_norm": 2.375, + "learning_rate": 9.99967618247272e-06, + "loss": 1.1917, + "step": 154 + }, + { + "epoch": 0.030983733539891558, + "grad_norm": 2.46875, + "learning_rate": 9.999664078317972e-06, + "loss": 1.1414, + "step": 155 + }, + { + "epoch": 0.03118362859498763, + "grad_norm": 2.265625, + "learning_rate": 9.999651752078681e-06, + "loss": 1.2256, + "step": 156 + }, + { + "epoch": 0.031383523650083706, + "grad_norm": 2.25, + "learning_rate": 9.999639203755392e-06, + "loss": 1.1904, + "step": 157 + }, + { + "epoch": 0.03158341870517978, + "grad_norm": 2.234375, + "learning_rate": 9.999626433348664e-06, + "loss": 1.2102, + "step": 158 + }, + { + "epoch": 0.03178331376027586, + "grad_norm": 2.28125, + "learning_rate": 9.999613440859064e-06, + "loss": 1.1549, + "step": 159 + }, + { + "epoch": 0.03198320881537193, + "grad_norm": 2.484375, + "learning_rate": 9.999600226287168e-06, + "loss": 1.1984, + "step": 160 + }, + { + "epoch": 0.032183103870468, + "grad_norm": 2.3125, + "learning_rate": 9.999586789633565e-06, + "loss": 1.1532, + "step": 161 + }, + { + "epoch": 0.03238299892556408, + "grad_norm": 2.40625, + "learning_rate": 9.99957313089885e-06, + "loss": 1.2764, + "step": 162 + }, + { + "epoch": 0.03258289398066015, + "grad_norm": 2.484375, + "learning_rate": 9.999559250083631e-06, + "loss": 1.2102, + "step": 163 + }, + { + "epoch": 0.03278278903575623, + "grad_norm": 2.265625, + "learning_rate": 9.999545147188523e-06, + "loss": 1.1573, + "step": 164 + }, + { + "epoch": 0.0329826840908523, + "grad_norm": 2.25, + "learning_rate": 9.999530822214154e-06, + "loss": 1.2516, + "step": 165 + }, + { + "epoch": 0.033182579145948375, + "grad_norm": 2.40625, + "learning_rate": 9.99951627516116e-06, + "loss": 1.2498, + "step": 166 + }, + { + "epoch": 0.033382474201044454, + "grad_norm": 2.265625, + "learning_rate": 9.999501506030187e-06, + "loss": 1.0588, + "step": 167 + }, + { + "epoch": 0.033582369256140526, + "grad_norm": 2.21875, + "learning_rate": 9.999486514821892e-06, + "loss": 1.1712, + "step": 168 + }, + { + "epoch": 0.0337822643112366, + "grad_norm": 2.3125, + "learning_rate": 9.999471301536938e-06, + "loss": 1.0943, + "step": 169 + }, + { + "epoch": 0.03398215936633268, + "grad_norm": 2.328125, + "learning_rate": 9.999455866176004e-06, + "loss": 1.1701, + "step": 170 + }, + { + "epoch": 0.03418205442142875, + "grad_norm": 2.1875, + "learning_rate": 9.999440208739775e-06, + "loss": 1.216, + "step": 171 + }, + { + "epoch": 0.03438194947652482, + "grad_norm": 2.25, + "learning_rate": 9.999424329228944e-06, + "loss": 1.1877, + "step": 172 + }, + { + "epoch": 0.0345818445316209, + "grad_norm": 2.1875, + "learning_rate": 9.999408227644221e-06, + "loss": 1.191, + "step": 173 + }, + { + "epoch": 0.03478173958671697, + "grad_norm": 2.3125, + "learning_rate": 9.999391903986316e-06, + "loss": 1.1894, + "step": 174 + }, + { + "epoch": 0.034981634641813045, + "grad_norm": 2.21875, + "learning_rate": 9.999375358255959e-06, + "loss": 1.2301, + "step": 175 + }, + { + "epoch": 0.035181529696909124, + "grad_norm": 2.25, + "learning_rate": 9.99935859045388e-06, + "loss": 1.1464, + "step": 176 + }, + { + "epoch": 0.035381424752005196, + "grad_norm": 2.265625, + "learning_rate": 9.999341600580827e-06, + "loss": 1.1863, + "step": 177 + }, + { + "epoch": 0.035581319807101275, + "grad_norm": 2.15625, + "learning_rate": 9.999324388637554e-06, + "loss": 1.1387, + "step": 178 + }, + { + "epoch": 0.03578121486219735, + "grad_norm": 2.34375, + "learning_rate": 9.999306954624826e-06, + "loss": 1.2182, + "step": 179 + }, + { + "epoch": 0.03598110991729342, + "grad_norm": 2.21875, + "learning_rate": 9.999289298543417e-06, + "loss": 1.1852, + "step": 180 + }, + { + "epoch": 0.0361810049723895, + "grad_norm": 2.46875, + "learning_rate": 9.999271420394112e-06, + "loss": 1.1018, + "step": 181 + }, + { + "epoch": 0.03638090002748557, + "grad_norm": 2.3125, + "learning_rate": 9.999253320177704e-06, + "loss": 1.3003, + "step": 182 + }, + { + "epoch": 0.03658079508258164, + "grad_norm": 2.25, + "learning_rate": 9.999234997894998e-06, + "loss": 1.15, + "step": 183 + }, + { + "epoch": 0.03678069013767772, + "grad_norm": 2.375, + "learning_rate": 9.999216453546805e-06, + "loss": 1.1683, + "step": 184 + }, + { + "epoch": 0.03698058519277379, + "grad_norm": 2.3125, + "learning_rate": 9.999197687133953e-06, + "loss": 1.1442, + "step": 185 + }, + { + "epoch": 0.037180480247869865, + "grad_norm": 2.421875, + "learning_rate": 9.999178698657273e-06, + "loss": 1.3008, + "step": 186 + }, + { + "epoch": 0.037380375302965944, + "grad_norm": 2.515625, + "learning_rate": 9.99915948811761e-06, + "loss": 1.1976, + "step": 187 + }, + { + "epoch": 0.037580270358062016, + "grad_norm": 2.1875, + "learning_rate": 9.999140055515817e-06, + "loss": 1.2784, + "step": 188 + }, + { + "epoch": 0.03778016541315809, + "grad_norm": 2.34375, + "learning_rate": 9.999120400852756e-06, + "loss": 1.251, + "step": 189 + }, + { + "epoch": 0.03798006046825417, + "grad_norm": 2.296875, + "learning_rate": 9.999100524129299e-06, + "loss": 1.2041, + "step": 190 + }, + { + "epoch": 0.03817995552335024, + "grad_norm": 2.234375, + "learning_rate": 9.999080425346333e-06, + "loss": 1.2354, + "step": 191 + }, + { + "epoch": 0.03837985057844632, + "grad_norm": 2.171875, + "learning_rate": 9.999060104504746e-06, + "loss": 1.1894, + "step": 192 + }, + { + "epoch": 0.03857974563354239, + "grad_norm": 2.3125, + "learning_rate": 9.999039561605445e-06, + "loss": 1.2374, + "step": 193 + }, + { + "epoch": 0.03877964068863846, + "grad_norm": 2.3125, + "learning_rate": 9.99901879664934e-06, + "loss": 1.1506, + "step": 194 + }, + { + "epoch": 0.03897953574373454, + "grad_norm": 2.40625, + "learning_rate": 9.998997809637354e-06, + "loss": 1.1861, + "step": 195 + }, + { + "epoch": 0.039179430798830614, + "grad_norm": 2.265625, + "learning_rate": 9.998976600570418e-06, + "loss": 1.1206, + "step": 196 + }, + { + "epoch": 0.039379325853926686, + "grad_norm": 2.40625, + "learning_rate": 9.998955169449479e-06, + "loss": 1.1798, + "step": 197 + }, + { + "epoch": 0.039579220909022765, + "grad_norm": 2.21875, + "learning_rate": 9.998933516275482e-06, + "loss": 1.1726, + "step": 198 + }, + { + "epoch": 0.03977911596411884, + "grad_norm": 2.40625, + "learning_rate": 9.998911641049393e-06, + "loss": 1.2349, + "step": 199 + }, + { + "epoch": 0.03997901101921491, + "grad_norm": 2.296875, + "learning_rate": 9.998889543772182e-06, + "loss": 1.0874, + "step": 200 + }, + { + "epoch": 0.04017890607431099, + "grad_norm": 2.484375, + "learning_rate": 9.998867224444832e-06, + "loss": 1.141, + "step": 201 + }, + { + "epoch": 0.04037880112940706, + "grad_norm": 2.359375, + "learning_rate": 9.998844683068335e-06, + "loss": 1.2468, + "step": 202 + }, + { + "epoch": 0.04057869618450313, + "grad_norm": 2.328125, + "learning_rate": 9.998821919643689e-06, + "loss": 1.0987, + "step": 203 + }, + { + "epoch": 0.04077859123959921, + "grad_norm": 2.265625, + "learning_rate": 9.998798934171908e-06, + "loss": 1.127, + "step": 204 + }, + { + "epoch": 0.040978486294695284, + "grad_norm": 2.375, + "learning_rate": 9.998775726654014e-06, + "loss": 1.1358, + "step": 205 + }, + { + "epoch": 0.04117838134979136, + "grad_norm": 2.1875, + "learning_rate": 9.998752297091033e-06, + "loss": 1.195, + "step": 206 + }, + { + "epoch": 0.041378276404887435, + "grad_norm": 2.296875, + "learning_rate": 9.99872864548401e-06, + "loss": 1.142, + "step": 207 + }, + { + "epoch": 0.04157817145998351, + "grad_norm": 2.1875, + "learning_rate": 9.998704771833996e-06, + "loss": 1.1762, + "step": 208 + }, + { + "epoch": 0.041778066515079586, + "grad_norm": 2.390625, + "learning_rate": 9.998680676142048e-06, + "loss": 1.162, + "step": 209 + }, + { + "epoch": 0.04197796157017566, + "grad_norm": 2.359375, + "learning_rate": 9.99865635840924e-06, + "loss": 1.2461, + "step": 210 + }, + { + "epoch": 0.04217785662527173, + "grad_norm": 2.3125, + "learning_rate": 9.998631818636648e-06, + "loss": 1.159, + "step": 211 + }, + { + "epoch": 0.04237775168036781, + "grad_norm": 2.421875, + "learning_rate": 9.998607056825367e-06, + "loss": 1.1312, + "step": 212 + }, + { + "epoch": 0.04257764673546388, + "grad_norm": 2.203125, + "learning_rate": 9.998582072976493e-06, + "loss": 1.1496, + "step": 213 + }, + { + "epoch": 0.04277754179055995, + "grad_norm": 2.1875, + "learning_rate": 9.998556867091137e-06, + "loss": 1.1292, + "step": 214 + }, + { + "epoch": 0.04297743684565603, + "grad_norm": 2.296875, + "learning_rate": 9.99853143917042e-06, + "loss": 1.1355, + "step": 215 + }, + { + "epoch": 0.043177331900752104, + "grad_norm": 2.390625, + "learning_rate": 9.998505789215469e-06, + "loss": 1.2097, + "step": 216 + }, + { + "epoch": 0.043377226955848176, + "grad_norm": 2.28125, + "learning_rate": 9.998479917227427e-06, + "loss": 1.2135, + "step": 217 + }, + { + "epoch": 0.043577122010944255, + "grad_norm": 2.25, + "learning_rate": 9.998453823207437e-06, + "loss": 1.1332, + "step": 218 + }, + { + "epoch": 0.04377701706604033, + "grad_norm": 2.390625, + "learning_rate": 9.998427507156665e-06, + "loss": 1.1971, + "step": 219 + }, + { + "epoch": 0.043976912121136406, + "grad_norm": 2.25, + "learning_rate": 9.998400969076276e-06, + "loss": 1.1738, + "step": 220 + }, + { + "epoch": 0.04417680717623248, + "grad_norm": 2.3125, + "learning_rate": 9.998374208967451e-06, + "loss": 1.2697, + "step": 221 + }, + { + "epoch": 0.04437670223132855, + "grad_norm": 2.421875, + "learning_rate": 9.998347226831375e-06, + "loss": 1.1589, + "step": 222 + }, + { + "epoch": 0.04457659728642463, + "grad_norm": 2.359375, + "learning_rate": 9.998320022669253e-06, + "loss": 1.2381, + "step": 223 + }, + { + "epoch": 0.0447764923415207, + "grad_norm": 2.1875, + "learning_rate": 9.998292596482286e-06, + "loss": 1.1596, + "step": 224 + }, + { + "epoch": 0.044976387396616774, + "grad_norm": 2.328125, + "learning_rate": 9.998264948271696e-06, + "loss": 1.2429, + "step": 225 + }, + { + "epoch": 0.04517628245171285, + "grad_norm": 2.1875, + "learning_rate": 9.998237078038712e-06, + "loss": 1.1608, + "step": 226 + }, + { + "epoch": 0.045376177506808925, + "grad_norm": 2.265625, + "learning_rate": 9.99820898578457e-06, + "loss": 1.1371, + "step": 227 + }, + { + "epoch": 0.045576072561905, + "grad_norm": 2.265625, + "learning_rate": 9.99818067151052e-06, + "loss": 1.1286, + "step": 228 + }, + { + "epoch": 0.045775967617001076, + "grad_norm": 2.28125, + "learning_rate": 9.998152135217816e-06, + "loss": 1.2278, + "step": 229 + }, + { + "epoch": 0.04597586267209715, + "grad_norm": 2.328125, + "learning_rate": 9.99812337690773e-06, + "loss": 1.1407, + "step": 230 + }, + { + "epoch": 0.04617575772719322, + "grad_norm": 2.4375, + "learning_rate": 9.998094396581538e-06, + "loss": 1.1912, + "step": 231 + }, + { + "epoch": 0.0463756527822893, + "grad_norm": 2.296875, + "learning_rate": 9.998065194240524e-06, + "loss": 1.252, + "step": 232 + }, + { + "epoch": 0.04657554783738537, + "grad_norm": 2.296875, + "learning_rate": 9.99803576988599e-06, + "loss": 1.166, + "step": 233 + }, + { + "epoch": 0.04677544289248145, + "grad_norm": 2.15625, + "learning_rate": 9.99800612351924e-06, + "loss": 1.1465, + "step": 234 + }, + { + "epoch": 0.04697533794757752, + "grad_norm": 2.25, + "learning_rate": 9.997976255141593e-06, + "loss": 1.23, + "step": 235 + }, + { + "epoch": 0.047175233002673594, + "grad_norm": 2.46875, + "learning_rate": 9.997946164754373e-06, + "loss": 1.1665, + "step": 236 + }, + { + "epoch": 0.04737512805776967, + "grad_norm": 2.28125, + "learning_rate": 9.99791585235892e-06, + "loss": 1.1391, + "step": 237 + }, + { + "epoch": 0.047575023112865746, + "grad_norm": 2.21875, + "learning_rate": 9.997885317956577e-06, + "loss": 1.152, + "step": 238 + }, + { + "epoch": 0.04777491816796182, + "grad_norm": 2.234375, + "learning_rate": 9.997854561548702e-06, + "loss": 1.153, + "step": 239 + }, + { + "epoch": 0.0479748132230579, + "grad_norm": 2.328125, + "learning_rate": 9.99782358313666e-06, + "loss": 1.189, + "step": 240 + }, + { + "epoch": 0.04817470827815397, + "grad_norm": 2.21875, + "learning_rate": 9.997792382721831e-06, + "loss": 1.1307, + "step": 241 + }, + { + "epoch": 0.04837460333325004, + "grad_norm": 2.09375, + "learning_rate": 9.997760960305598e-06, + "loss": 1.0662, + "step": 242 + }, + { + "epoch": 0.04857449838834612, + "grad_norm": 2.1875, + "learning_rate": 9.997729315889356e-06, + "loss": 1.0934, + "step": 243 + }, + { + "epoch": 0.04877439344344219, + "grad_norm": 2.125, + "learning_rate": 9.99769744947451e-06, + "loss": 1.2033, + "step": 244 + }, + { + "epoch": 0.048974288498538264, + "grad_norm": 2.1875, + "learning_rate": 9.99766536106248e-06, + "loss": 1.153, + "step": 245 + }, + { + "epoch": 0.04917418355363434, + "grad_norm": 2.234375, + "learning_rate": 9.997633050654687e-06, + "loss": 1.172, + "step": 246 + }, + { + "epoch": 0.049374078608730415, + "grad_norm": 2.40625, + "learning_rate": 9.997600518252568e-06, + "loss": 1.23, + "step": 247 + }, + { + "epoch": 0.049573973663826494, + "grad_norm": 2.328125, + "learning_rate": 9.997567763857566e-06, + "loss": 1.1873, + "step": 248 + }, + { + "epoch": 0.049773868718922566, + "grad_norm": 2.375, + "learning_rate": 9.997534787471139e-06, + "loss": 1.1962, + "step": 249 + }, + { + "epoch": 0.04997376377401864, + "grad_norm": 2.171875, + "learning_rate": 9.997501589094752e-06, + "loss": 1.1827, + "step": 250 + }, + { + "epoch": 0.05017365882911472, + "grad_norm": 2.140625, + "learning_rate": 9.997468168729876e-06, + "loss": 1.0892, + "step": 251 + }, + { + "epoch": 0.05037355388421079, + "grad_norm": 2.28125, + "learning_rate": 9.997434526377998e-06, + "loss": 1.1739, + "step": 252 + }, + { + "epoch": 0.05057344893930686, + "grad_norm": 2.234375, + "learning_rate": 9.997400662040613e-06, + "loss": 1.0799, + "step": 253 + }, + { + "epoch": 0.05077334399440294, + "grad_norm": 2.234375, + "learning_rate": 9.997366575719223e-06, + "loss": 1.1721, + "step": 254 + }, + { + "epoch": 0.05097323904949901, + "grad_norm": 2.25, + "learning_rate": 9.997332267415345e-06, + "loss": 1.2029, + "step": 255 + }, + { + "epoch": 0.051173134104595085, + "grad_norm": 2.15625, + "learning_rate": 9.9972977371305e-06, + "loss": 1.1735, + "step": 256 + }, + { + "epoch": 0.051373029159691164, + "grad_norm": 2.296875, + "learning_rate": 9.997262984866225e-06, + "loss": 1.1807, + "step": 257 + }, + { + "epoch": 0.051572924214787236, + "grad_norm": 2.453125, + "learning_rate": 9.997228010624061e-06, + "loss": 1.2564, + "step": 258 + }, + { + "epoch": 0.05177281926988331, + "grad_norm": 2.171875, + "learning_rate": 9.997192814405561e-06, + "loss": 1.1182, + "step": 259 + }, + { + "epoch": 0.05197271432497939, + "grad_norm": 2.1875, + "learning_rate": 9.99715739621229e-06, + "loss": 1.2241, + "step": 260 + }, + { + "epoch": 0.05217260938007546, + "grad_norm": 2.203125, + "learning_rate": 9.997121756045824e-06, + "loss": 1.1293, + "step": 261 + }, + { + "epoch": 0.05237250443517154, + "grad_norm": 2.265625, + "learning_rate": 9.997085893907742e-06, + "loss": 1.1547, + "step": 262 + }, + { + "epoch": 0.05257239949026761, + "grad_norm": 2.34375, + "learning_rate": 9.997049809799639e-06, + "loss": 1.1602, + "step": 263 + }, + { + "epoch": 0.05277229454536368, + "grad_norm": 2.3125, + "learning_rate": 9.997013503723116e-06, + "loss": 1.157, + "step": 264 + }, + { + "epoch": 0.05297218960045976, + "grad_norm": 2.421875, + "learning_rate": 9.996976975679786e-06, + "loss": 1.186, + "step": 265 + }, + { + "epoch": 0.05317208465555583, + "grad_norm": 2.1875, + "learning_rate": 9.996940225671275e-06, + "loss": 1.1212, + "step": 266 + }, + { + "epoch": 0.053371979710651905, + "grad_norm": 2.296875, + "learning_rate": 9.996903253699211e-06, + "loss": 1.1478, + "step": 267 + }, + { + "epoch": 0.053571874765747984, + "grad_norm": 2.1875, + "learning_rate": 9.996866059765238e-06, + "loss": 1.1511, + "step": 268 + }, + { + "epoch": 0.053771769820844056, + "grad_norm": 2.3125, + "learning_rate": 9.996828643871008e-06, + "loss": 1.119, + "step": 269 + }, + { + "epoch": 0.05397166487594013, + "grad_norm": 2.328125, + "learning_rate": 9.996791006018185e-06, + "loss": 1.1387, + "step": 270 + }, + { + "epoch": 0.05417155993103621, + "grad_norm": 2.359375, + "learning_rate": 9.996753146208438e-06, + "loss": 1.1503, + "step": 271 + }, + { + "epoch": 0.05437145498613228, + "grad_norm": 2.25, + "learning_rate": 9.996715064443449e-06, + "loss": 1.2386, + "step": 272 + }, + { + "epoch": 0.05457135004122835, + "grad_norm": 2.21875, + "learning_rate": 9.996676760724913e-06, + "loss": 1.1869, + "step": 273 + }, + { + "epoch": 0.05477124509632443, + "grad_norm": 2.3125, + "learning_rate": 9.996638235054527e-06, + "loss": 1.1408, + "step": 274 + }, + { + "epoch": 0.0549711401514205, + "grad_norm": 2.4375, + "learning_rate": 9.996599487434007e-06, + "loss": 1.2918, + "step": 275 + }, + { + "epoch": 0.05517103520651658, + "grad_norm": 2.328125, + "learning_rate": 9.996560517865069e-06, + "loss": 1.1994, + "step": 276 + }, + { + "epoch": 0.055370930261612654, + "grad_norm": 2.53125, + "learning_rate": 9.996521326349447e-06, + "loss": 1.2154, + "step": 277 + }, + { + "epoch": 0.055570825316708726, + "grad_norm": 2.234375, + "learning_rate": 9.996481912888881e-06, + "loss": 1.2332, + "step": 278 + }, + { + "epoch": 0.055770720371804805, + "grad_norm": 2.203125, + "learning_rate": 9.996442277485122e-06, + "loss": 1.1615, + "step": 279 + }, + { + "epoch": 0.05597061542690088, + "grad_norm": 2.28125, + "learning_rate": 9.99640242013993e-06, + "loss": 1.2607, + "step": 280 + }, + { + "epoch": 0.05617051048199695, + "grad_norm": 2.1875, + "learning_rate": 9.996362340855076e-06, + "loss": 1.1624, + "step": 281 + }, + { + "epoch": 0.05637040553709303, + "grad_norm": 2.203125, + "learning_rate": 9.996322039632343e-06, + "loss": 1.1052, + "step": 282 + }, + { + "epoch": 0.0565703005921891, + "grad_norm": 2.234375, + "learning_rate": 9.996281516473517e-06, + "loss": 1.1284, + "step": 283 + }, + { + "epoch": 0.05677019564728517, + "grad_norm": 2.171875, + "learning_rate": 9.9962407713804e-06, + "loss": 1.1039, + "step": 284 + }, + { + "epoch": 0.05697009070238125, + "grad_norm": 2.125, + "learning_rate": 9.9961998043548e-06, + "loss": 1.1431, + "step": 285 + }, + { + "epoch": 0.057169985757477323, + "grad_norm": 2.21875, + "learning_rate": 9.996158615398541e-06, + "loss": 1.1113, + "step": 286 + }, + { + "epoch": 0.057369880812573396, + "grad_norm": 2.5, + "learning_rate": 9.996117204513448e-06, + "loss": 1.1783, + "step": 287 + }, + { + "epoch": 0.057569775867669475, + "grad_norm": 2.15625, + "learning_rate": 9.996075571701363e-06, + "loss": 1.0627, + "step": 288 + }, + { + "epoch": 0.05776967092276555, + "grad_norm": 2.34375, + "learning_rate": 9.996033716964136e-06, + "loss": 1.1035, + "step": 289 + }, + { + "epoch": 0.057969565977861626, + "grad_norm": 2.15625, + "learning_rate": 9.995991640303622e-06, + "loss": 1.1161, + "step": 290 + }, + { + "epoch": 0.0581694610329577, + "grad_norm": 2.265625, + "learning_rate": 9.995949341721695e-06, + "loss": 1.1651, + "step": 291 + }, + { + "epoch": 0.05836935608805377, + "grad_norm": 2.1875, + "learning_rate": 9.995906821220231e-06, + "loss": 1.1721, + "step": 292 + }, + { + "epoch": 0.05856925114314985, + "grad_norm": 2.109375, + "learning_rate": 9.99586407880112e-06, + "loss": 1.1163, + "step": 293 + }, + { + "epoch": 0.05876914619824592, + "grad_norm": 2.328125, + "learning_rate": 9.99582111446626e-06, + "loss": 1.1774, + "step": 294 + }, + { + "epoch": 0.05896904125334199, + "grad_norm": 2.40625, + "learning_rate": 9.99577792821756e-06, + "loss": 1.2022, + "step": 295 + }, + { + "epoch": 0.05916893630843807, + "grad_norm": 2.25, + "learning_rate": 9.995734520056936e-06, + "loss": 1.1483, + "step": 296 + }, + { + "epoch": 0.059368831363534144, + "grad_norm": 2.3125, + "learning_rate": 9.995690889986321e-06, + "loss": 1.1886, + "step": 297 + }, + { + "epoch": 0.059568726418630216, + "grad_norm": 2.140625, + "learning_rate": 9.99564703800765e-06, + "loss": 1.0558, + "step": 298 + }, + { + "epoch": 0.059768621473726295, + "grad_norm": 2.296875, + "learning_rate": 9.99560296412287e-06, + "loss": 1.1211, + "step": 299 + }, + { + "epoch": 0.05996851652882237, + "grad_norm": 2.40625, + "learning_rate": 9.99555866833394e-06, + "loss": 1.1445, + "step": 300 + }, + { + "epoch": 0.06016841158391844, + "grad_norm": 2.203125, + "learning_rate": 9.995514150642827e-06, + "loss": 1.122, + "step": 301 + }, + { + "epoch": 0.06036830663901452, + "grad_norm": 2.109375, + "learning_rate": 9.995469411051511e-06, + "loss": 1.0816, + "step": 302 + }, + { + "epoch": 0.06056820169411059, + "grad_norm": 2.171875, + "learning_rate": 9.995424449561974e-06, + "loss": 1.2716, + "step": 303 + }, + { + "epoch": 0.06076809674920667, + "grad_norm": 2.296875, + "learning_rate": 9.99537926617622e-06, + "loss": 1.1145, + "step": 304 + }, + { + "epoch": 0.06096799180430274, + "grad_norm": 2.3125, + "learning_rate": 9.99533386089625e-06, + "loss": 1.112, + "step": 305 + }, + { + "epoch": 0.061167886859398814, + "grad_norm": 2.234375, + "learning_rate": 9.995288233724084e-06, + "loss": 1.1509, + "step": 306 + }, + { + "epoch": 0.06136778191449489, + "grad_norm": 2.265625, + "learning_rate": 9.995242384661748e-06, + "loss": 1.186, + "step": 307 + }, + { + "epoch": 0.061567676969590965, + "grad_norm": 2.171875, + "learning_rate": 9.995196313711279e-06, + "loss": 1.1512, + "step": 308 + }, + { + "epoch": 0.06176757202468704, + "grad_norm": 2.296875, + "learning_rate": 9.995150020874724e-06, + "loss": 1.136, + "step": 309 + }, + { + "epoch": 0.061967467079783116, + "grad_norm": 2.265625, + "learning_rate": 9.995103506154138e-06, + "loss": 1.2276, + "step": 310 + }, + { + "epoch": 0.06216736213487919, + "grad_norm": 2.328125, + "learning_rate": 9.995056769551587e-06, + "loss": 1.1692, + "step": 311 + }, + { + "epoch": 0.06236725718997526, + "grad_norm": 2.40625, + "learning_rate": 9.99500981106915e-06, + "loss": 1.3003, + "step": 312 + }, + { + "epoch": 0.06256715224507134, + "grad_norm": 2.25, + "learning_rate": 9.99496263070891e-06, + "loss": 1.0791, + "step": 313 + }, + { + "epoch": 0.06276704730016741, + "grad_norm": 2.3125, + "learning_rate": 9.99491522847296e-06, + "loss": 1.2507, + "step": 314 + }, + { + "epoch": 0.06296694235526348, + "grad_norm": 2.34375, + "learning_rate": 9.994867604363415e-06, + "loss": 1.3307, + "step": 315 + }, + { + "epoch": 0.06316683741035956, + "grad_norm": 2.265625, + "learning_rate": 9.994819758382379e-06, + "loss": 1.0752, + "step": 316 + }, + { + "epoch": 0.06336673246545564, + "grad_norm": 2.203125, + "learning_rate": 9.994771690531986e-06, + "loss": 1.137, + "step": 317 + }, + { + "epoch": 0.06356662752055171, + "grad_norm": 2.234375, + "learning_rate": 9.994723400814367e-06, + "loss": 1.1592, + "step": 318 + }, + { + "epoch": 0.06376652257564779, + "grad_norm": 2.1875, + "learning_rate": 9.994674889231668e-06, + "loss": 1.1348, + "step": 319 + }, + { + "epoch": 0.06396641763074386, + "grad_norm": 2.375, + "learning_rate": 9.994626155786044e-06, + "loss": 1.1495, + "step": 320 + }, + { + "epoch": 0.06416631268583993, + "grad_norm": 2.40625, + "learning_rate": 9.994577200479659e-06, + "loss": 1.1524, + "step": 321 + }, + { + "epoch": 0.064366207740936, + "grad_norm": 2.78125, + "learning_rate": 9.994528023314689e-06, + "loss": 1.1618, + "step": 322 + }, + { + "epoch": 0.06456610279603209, + "grad_norm": 2.4375, + "learning_rate": 9.994478624293317e-06, + "loss": 1.1259, + "step": 323 + }, + { + "epoch": 0.06476599785112816, + "grad_norm": 2.1875, + "learning_rate": 9.994429003417739e-06, + "loss": 1.2054, + "step": 324 + }, + { + "epoch": 0.06496589290622423, + "grad_norm": 2.203125, + "learning_rate": 9.994379160690156e-06, + "loss": 1.164, + "step": 325 + }, + { + "epoch": 0.0651657879613203, + "grad_norm": 2.28125, + "learning_rate": 9.994329096112786e-06, + "loss": 1.1282, + "step": 326 + }, + { + "epoch": 0.06536568301641638, + "grad_norm": 2.140625, + "learning_rate": 9.994278809687849e-06, + "loss": 1.1085, + "step": 327 + }, + { + "epoch": 0.06556557807151246, + "grad_norm": 2.28125, + "learning_rate": 9.994228301417584e-06, + "loss": 1.1176, + "step": 328 + }, + { + "epoch": 0.06576547312660853, + "grad_norm": 2.265625, + "learning_rate": 9.994177571304228e-06, + "loss": 1.1428, + "step": 329 + }, + { + "epoch": 0.0659653681817046, + "grad_norm": 2.0625, + "learning_rate": 9.99412661935004e-06, + "loss": 1.0369, + "step": 330 + }, + { + "epoch": 0.06616526323680068, + "grad_norm": 2.328125, + "learning_rate": 9.99407544555728e-06, + "loss": 1.1183, + "step": 331 + }, + { + "epoch": 0.06636515829189675, + "grad_norm": 2.21875, + "learning_rate": 9.994024049928222e-06, + "loss": 1.2004, + "step": 332 + }, + { + "epoch": 0.06656505334699282, + "grad_norm": 2.328125, + "learning_rate": 9.99397243246515e-06, + "loss": 1.0809, + "step": 333 + }, + { + "epoch": 0.06676494840208891, + "grad_norm": 2.21875, + "learning_rate": 9.993920593170355e-06, + "loss": 1.153, + "step": 334 + }, + { + "epoch": 0.06696484345718498, + "grad_norm": 2.3125, + "learning_rate": 9.99386853204614e-06, + "loss": 1.1371, + "step": 335 + }, + { + "epoch": 0.06716473851228105, + "grad_norm": 2.25, + "learning_rate": 9.993816249094818e-06, + "loss": 1.0658, + "step": 336 + }, + { + "epoch": 0.06736463356737712, + "grad_norm": 2.3125, + "learning_rate": 9.993763744318711e-06, + "loss": 1.1894, + "step": 337 + }, + { + "epoch": 0.0675645286224732, + "grad_norm": 2.28125, + "learning_rate": 9.993711017720155e-06, + "loss": 1.1231, + "step": 338 + }, + { + "epoch": 0.06776442367756928, + "grad_norm": 2.234375, + "learning_rate": 9.993658069301487e-06, + "loss": 1.1764, + "step": 339 + }, + { + "epoch": 0.06796431873266535, + "grad_norm": 2.359375, + "learning_rate": 9.993604899065061e-06, + "loss": 1.1511, + "step": 340 + }, + { + "epoch": 0.06816421378776143, + "grad_norm": 2.234375, + "learning_rate": 9.993551507013239e-06, + "loss": 1.1341, + "step": 341 + }, + { + "epoch": 0.0683641088428575, + "grad_norm": 2.46875, + "learning_rate": 9.993497893148391e-06, + "loss": 1.1729, + "step": 342 + }, + { + "epoch": 0.06856400389795357, + "grad_norm": 2.25, + "learning_rate": 9.993444057472902e-06, + "loss": 1.2644, + "step": 343 + }, + { + "epoch": 0.06876389895304964, + "grad_norm": 2.3125, + "learning_rate": 9.99338999998916e-06, + "loss": 1.1547, + "step": 344 + }, + { + "epoch": 0.06896379400814573, + "grad_norm": 2.171875, + "learning_rate": 9.99333572069957e-06, + "loss": 1.0723, + "step": 345 + }, + { + "epoch": 0.0691636890632418, + "grad_norm": 2.171875, + "learning_rate": 9.993281219606537e-06, + "loss": 1.1183, + "step": 346 + }, + { + "epoch": 0.06936358411833787, + "grad_norm": 2.4375, + "learning_rate": 9.993226496712488e-06, + "loss": 1.1073, + "step": 347 + }, + { + "epoch": 0.06956347917343395, + "grad_norm": 2.25, + "learning_rate": 9.993171552019849e-06, + "loss": 1.1318, + "step": 348 + }, + { + "epoch": 0.06976337422853002, + "grad_norm": 2.234375, + "learning_rate": 9.993116385531064e-06, + "loss": 1.202, + "step": 349 + }, + { + "epoch": 0.06996326928362609, + "grad_norm": 2.15625, + "learning_rate": 9.993060997248582e-06, + "loss": 1.1376, + "step": 350 + }, + { + "epoch": 0.07016316433872218, + "grad_norm": 2.203125, + "learning_rate": 9.993005387174865e-06, + "loss": 1.1328, + "step": 351 + }, + { + "epoch": 0.07036305939381825, + "grad_norm": 2.171875, + "learning_rate": 9.99294955531238e-06, + "loss": 1.2125, + "step": 352 + }, + { + "epoch": 0.07056295444891432, + "grad_norm": 2.375, + "learning_rate": 9.992893501663613e-06, + "loss": 1.2495, + "step": 353 + }, + { + "epoch": 0.07076284950401039, + "grad_norm": 2.171875, + "learning_rate": 9.992837226231046e-06, + "loss": 1.0926, + "step": 354 + }, + { + "epoch": 0.07096274455910646, + "grad_norm": 2.265625, + "learning_rate": 9.992780729017184e-06, + "loss": 1.1878, + "step": 355 + }, + { + "epoch": 0.07116263961420255, + "grad_norm": 2.25, + "learning_rate": 9.992724010024536e-06, + "loss": 1.2096, + "step": 356 + }, + { + "epoch": 0.07136253466929862, + "grad_norm": 2.28125, + "learning_rate": 9.99266706925562e-06, + "loss": 1.1596, + "step": 357 + }, + { + "epoch": 0.0715624297243947, + "grad_norm": 2.421875, + "learning_rate": 9.992609906712967e-06, + "loss": 1.1814, + "step": 358 + }, + { + "epoch": 0.07176232477949077, + "grad_norm": 2.265625, + "learning_rate": 9.992552522399112e-06, + "loss": 1.213, + "step": 359 + }, + { + "epoch": 0.07196221983458684, + "grad_norm": 2.21875, + "learning_rate": 9.99249491631661e-06, + "loss": 1.1729, + "step": 360 + }, + { + "epoch": 0.07216211488968291, + "grad_norm": 2.203125, + "learning_rate": 9.992437088468016e-06, + "loss": 1.058, + "step": 361 + }, + { + "epoch": 0.072362009944779, + "grad_norm": 2.15625, + "learning_rate": 9.992379038855902e-06, + "loss": 1.131, + "step": 362 + }, + { + "epoch": 0.07256190499987507, + "grad_norm": 2.1875, + "learning_rate": 9.992320767482842e-06, + "loss": 1.0648, + "step": 363 + }, + { + "epoch": 0.07276180005497114, + "grad_norm": 2.53125, + "learning_rate": 9.992262274351427e-06, + "loss": 1.1781, + "step": 364 + }, + { + "epoch": 0.07296169511006721, + "grad_norm": 2.625, + "learning_rate": 9.992203559464256e-06, + "loss": 1.1641, + "step": 365 + }, + { + "epoch": 0.07316159016516328, + "grad_norm": 2.40625, + "learning_rate": 9.992144622823933e-06, + "loss": 1.144, + "step": 366 + }, + { + "epoch": 0.07336148522025937, + "grad_norm": 2.203125, + "learning_rate": 9.992085464433084e-06, + "loss": 1.0778, + "step": 367 + }, + { + "epoch": 0.07356138027535544, + "grad_norm": 2.1875, + "learning_rate": 9.992026084294328e-06, + "loss": 1.1035, + "step": 368 + }, + { + "epoch": 0.07376127533045151, + "grad_norm": 2.25, + "learning_rate": 9.991966482410307e-06, + "loss": 1.1741, + "step": 369 + }, + { + "epoch": 0.07396117038554759, + "grad_norm": 2.28125, + "learning_rate": 9.99190665878367e-06, + "loss": 1.1478, + "step": 370 + }, + { + "epoch": 0.07416106544064366, + "grad_norm": 2.34375, + "learning_rate": 9.99184661341707e-06, + "loss": 1.1733, + "step": 371 + }, + { + "epoch": 0.07436096049573973, + "grad_norm": 2.328125, + "learning_rate": 9.991786346313178e-06, + "loss": 1.2256, + "step": 372 + }, + { + "epoch": 0.07456085555083582, + "grad_norm": 2.1875, + "learning_rate": 9.991725857474668e-06, + "loss": 1.1197, + "step": 373 + }, + { + "epoch": 0.07476075060593189, + "grad_norm": 2.234375, + "learning_rate": 9.99166514690423e-06, + "loss": 1.0933, + "step": 374 + }, + { + "epoch": 0.07496064566102796, + "grad_norm": 2.234375, + "learning_rate": 9.99160421460456e-06, + "loss": 1.1109, + "step": 375 + }, + { + "epoch": 0.07516054071612403, + "grad_norm": 2.1875, + "learning_rate": 9.991543060578363e-06, + "loss": 1.1302, + "step": 376 + }, + { + "epoch": 0.0753604357712201, + "grad_norm": 2.21875, + "learning_rate": 9.991481684828356e-06, + "loss": 1.0812, + "step": 377 + }, + { + "epoch": 0.07556033082631618, + "grad_norm": 2.21875, + "learning_rate": 9.991420087357266e-06, + "loss": 1.1587, + "step": 378 + }, + { + "epoch": 0.07576022588141226, + "grad_norm": 2.203125, + "learning_rate": 9.991358268167828e-06, + "loss": 1.0641, + "step": 379 + }, + { + "epoch": 0.07596012093650834, + "grad_norm": 2.296875, + "learning_rate": 9.99129622726279e-06, + "loss": 1.0763, + "step": 380 + }, + { + "epoch": 0.07616001599160441, + "grad_norm": 2.296875, + "learning_rate": 9.991233964644906e-06, + "loss": 1.2373, + "step": 381 + }, + { + "epoch": 0.07635991104670048, + "grad_norm": 2.375, + "learning_rate": 9.991171480316944e-06, + "loss": 1.2241, + "step": 382 + }, + { + "epoch": 0.07655980610179655, + "grad_norm": 2.390625, + "learning_rate": 9.991108774281676e-06, + "loss": 1.2159, + "step": 383 + }, + { + "epoch": 0.07675970115689264, + "grad_norm": 2.25, + "learning_rate": 9.99104584654189e-06, + "loss": 1.0728, + "step": 384 + }, + { + "epoch": 0.07695959621198871, + "grad_norm": 2.078125, + "learning_rate": 9.99098269710038e-06, + "loss": 1.0239, + "step": 385 + }, + { + "epoch": 0.07715949126708478, + "grad_norm": 2.125, + "learning_rate": 9.990919325959952e-06, + "loss": 1.1313, + "step": 386 + }, + { + "epoch": 0.07735938632218085, + "grad_norm": 2.140625, + "learning_rate": 9.990855733123421e-06, + "loss": 1.0394, + "step": 387 + }, + { + "epoch": 0.07755928137727693, + "grad_norm": 2.140625, + "learning_rate": 9.99079191859361e-06, + "loss": 1.1339, + "step": 388 + }, + { + "epoch": 0.077759176432373, + "grad_norm": 2.234375, + "learning_rate": 9.990727882373357e-06, + "loss": 1.1582, + "step": 389 + }, + { + "epoch": 0.07795907148746908, + "grad_norm": 2.375, + "learning_rate": 9.990663624465504e-06, + "loss": 1.1049, + "step": 390 + }, + { + "epoch": 0.07815896654256516, + "grad_norm": 2.34375, + "learning_rate": 9.990599144872905e-06, + "loss": 1.1503, + "step": 391 + }, + { + "epoch": 0.07835886159766123, + "grad_norm": 2.078125, + "learning_rate": 9.990534443598425e-06, + "loss": 1.1675, + "step": 392 + }, + { + "epoch": 0.0785587566527573, + "grad_norm": 2.109375, + "learning_rate": 9.99046952064494e-06, + "loss": 1.084, + "step": 393 + }, + { + "epoch": 0.07875865170785337, + "grad_norm": 2.234375, + "learning_rate": 9.99040437601533e-06, + "loss": 1.1882, + "step": 394 + }, + { + "epoch": 0.07895854676294946, + "grad_norm": 2.265625, + "learning_rate": 9.99033900971249e-06, + "loss": 1.0875, + "step": 395 + }, + { + "epoch": 0.07915844181804553, + "grad_norm": 2.109375, + "learning_rate": 9.990273421739325e-06, + "loss": 1.0732, + "step": 396 + }, + { + "epoch": 0.0793583368731416, + "grad_norm": 2.25, + "learning_rate": 9.990207612098748e-06, + "loss": 1.2069, + "step": 397 + }, + { + "epoch": 0.07955823192823767, + "grad_norm": 2.171875, + "learning_rate": 9.990141580793682e-06, + "loss": 1.1131, + "step": 398 + }, + { + "epoch": 0.07975812698333375, + "grad_norm": 2.109375, + "learning_rate": 9.990075327827058e-06, + "loss": 1.0508, + "step": 399 + }, + { + "epoch": 0.07995802203842982, + "grad_norm": 2.3125, + "learning_rate": 9.990008853201823e-06, + "loss": 1.1659, + "step": 400 + }, + { + "epoch": 0.0801579170935259, + "grad_norm": 2.140625, + "learning_rate": 9.989942156920926e-06, + "loss": 1.1182, + "step": 401 + }, + { + "epoch": 0.08035781214862198, + "grad_norm": 2.25, + "learning_rate": 9.989875238987333e-06, + "loss": 1.1216, + "step": 402 + }, + { + "epoch": 0.08055770720371805, + "grad_norm": 2.265625, + "learning_rate": 9.989808099404015e-06, + "loss": 1.104, + "step": 403 + }, + { + "epoch": 0.08075760225881412, + "grad_norm": 2.25, + "learning_rate": 9.989740738173953e-06, + "loss": 1.1707, + "step": 404 + }, + { + "epoch": 0.08095749731391019, + "grad_norm": 2.15625, + "learning_rate": 9.989673155300141e-06, + "loss": 1.1789, + "step": 405 + }, + { + "epoch": 0.08115739236900626, + "grad_norm": 2.265625, + "learning_rate": 9.98960535078558e-06, + "loss": 1.2305, + "step": 406 + }, + { + "epoch": 0.08135728742410235, + "grad_norm": 2.25, + "learning_rate": 9.989537324633283e-06, + "loss": 1.0805, + "step": 407 + }, + { + "epoch": 0.08155718247919842, + "grad_norm": 2.28125, + "learning_rate": 9.98946907684627e-06, + "loss": 1.2271, + "step": 408 + }, + { + "epoch": 0.0817570775342945, + "grad_norm": 2.265625, + "learning_rate": 9.989400607427574e-06, + "loss": 1.0816, + "step": 409 + }, + { + "epoch": 0.08195697258939057, + "grad_norm": 2.25, + "learning_rate": 9.989331916380233e-06, + "loss": 1.1939, + "step": 410 + }, + { + "epoch": 0.08215686764448664, + "grad_norm": 2.078125, + "learning_rate": 9.989263003707306e-06, + "loss": 1.0495, + "step": 411 + }, + { + "epoch": 0.08235676269958273, + "grad_norm": 1.984375, + "learning_rate": 9.989193869411846e-06, + "loss": 1.0046, + "step": 412 + }, + { + "epoch": 0.0825566577546788, + "grad_norm": 2.34375, + "learning_rate": 9.989124513496929e-06, + "loss": 1.1798, + "step": 413 + }, + { + "epoch": 0.08275655280977487, + "grad_norm": 2.1875, + "learning_rate": 9.989054935965631e-06, + "loss": 1.1185, + "step": 414 + }, + { + "epoch": 0.08295644786487094, + "grad_norm": 2.234375, + "learning_rate": 9.98898513682105e-06, + "loss": 1.142, + "step": 415 + }, + { + "epoch": 0.08315634291996701, + "grad_norm": 2.28125, + "learning_rate": 9.98891511606628e-06, + "loss": 1.2098, + "step": 416 + }, + { + "epoch": 0.08335623797506309, + "grad_norm": 2.5625, + "learning_rate": 9.988844873704433e-06, + "loss": 1.1389, + "step": 417 + }, + { + "epoch": 0.08355613303015917, + "grad_norm": 2.1875, + "learning_rate": 9.988774409738628e-06, + "loss": 1.1318, + "step": 418 + }, + { + "epoch": 0.08375602808525524, + "grad_norm": 2.390625, + "learning_rate": 9.988703724172e-06, + "loss": 1.1519, + "step": 419 + }, + { + "epoch": 0.08395592314035132, + "grad_norm": 2.28125, + "learning_rate": 9.988632817007683e-06, + "loss": 1.1853, + "step": 420 + }, + { + "epoch": 0.08415581819544739, + "grad_norm": 2.109375, + "learning_rate": 9.98856168824883e-06, + "loss": 1.0061, + "step": 421 + }, + { + "epoch": 0.08435571325054346, + "grad_norm": 2.078125, + "learning_rate": 9.9884903378986e-06, + "loss": 1.1319, + "step": 422 + }, + { + "epoch": 0.08455560830563955, + "grad_norm": 2.3125, + "learning_rate": 9.988418765960161e-06, + "loss": 1.1209, + "step": 423 + }, + { + "epoch": 0.08475550336073562, + "grad_norm": 2.125, + "learning_rate": 9.988346972436693e-06, + "loss": 1.1002, + "step": 424 + }, + { + "epoch": 0.08495539841583169, + "grad_norm": 2.203125, + "learning_rate": 9.988274957331385e-06, + "loss": 1.0286, + "step": 425 + }, + { + "epoch": 0.08515529347092776, + "grad_norm": 2.109375, + "learning_rate": 9.988202720647438e-06, + "loss": 0.9752, + "step": 426 + }, + { + "epoch": 0.08535518852602383, + "grad_norm": 2.234375, + "learning_rate": 9.988130262388058e-06, + "loss": 1.131, + "step": 427 + }, + { + "epoch": 0.0855550835811199, + "grad_norm": 2.21875, + "learning_rate": 9.988057582556465e-06, + "loss": 1.1002, + "step": 428 + }, + { + "epoch": 0.08575497863621599, + "grad_norm": 2.375, + "learning_rate": 9.987984681155884e-06, + "loss": 1.2404, + "step": 429 + }, + { + "epoch": 0.08595487369131206, + "grad_norm": 2.25, + "learning_rate": 9.98791155818956e-06, + "loss": 1.13, + "step": 430 + }, + { + "epoch": 0.08615476874640814, + "grad_norm": 2.203125, + "learning_rate": 9.987838213660736e-06, + "loss": 1.1198, + "step": 431 + }, + { + "epoch": 0.08635466380150421, + "grad_norm": 2.21875, + "learning_rate": 9.987764647572671e-06, + "loss": 1.1485, + "step": 432 + }, + { + "epoch": 0.08655455885660028, + "grad_norm": 2.21875, + "learning_rate": 9.987690859928633e-06, + "loss": 1.1426, + "step": 433 + }, + { + "epoch": 0.08675445391169635, + "grad_norm": 2.265625, + "learning_rate": 9.987616850731899e-06, + "loss": 1.1378, + "step": 434 + }, + { + "epoch": 0.08695434896679244, + "grad_norm": 2.15625, + "learning_rate": 9.987542619985758e-06, + "loss": 1.2654, + "step": 435 + }, + { + "epoch": 0.08715424402188851, + "grad_norm": 2.328125, + "learning_rate": 9.987468167693507e-06, + "loss": 1.1183, + "step": 436 + }, + { + "epoch": 0.08735413907698458, + "grad_norm": 2.140625, + "learning_rate": 9.987393493858453e-06, + "loss": 1.1122, + "step": 437 + }, + { + "epoch": 0.08755403413208065, + "grad_norm": 2.390625, + "learning_rate": 9.98731859848391e-06, + "loss": 1.1731, + "step": 438 + }, + { + "epoch": 0.08775392918717673, + "grad_norm": 2.171875, + "learning_rate": 9.98724348157321e-06, + "loss": 1.0737, + "step": 439 + }, + { + "epoch": 0.08795382424227281, + "grad_norm": 2.21875, + "learning_rate": 9.987168143129687e-06, + "loss": 1.1597, + "step": 440 + }, + { + "epoch": 0.08815371929736888, + "grad_norm": 2.28125, + "learning_rate": 9.987092583156688e-06, + "loss": 1.1576, + "step": 441 + }, + { + "epoch": 0.08835361435246496, + "grad_norm": 2.1875, + "learning_rate": 9.987016801657568e-06, + "loss": 1.0758, + "step": 442 + }, + { + "epoch": 0.08855350940756103, + "grad_norm": 2.171875, + "learning_rate": 9.986940798635694e-06, + "loss": 1.1427, + "step": 443 + }, + { + "epoch": 0.0887534044626571, + "grad_norm": 2.203125, + "learning_rate": 9.986864574094443e-06, + "loss": 1.1078, + "step": 444 + }, + { + "epoch": 0.08895329951775317, + "grad_norm": 2.171875, + "learning_rate": 9.986788128037202e-06, + "loss": 1.1726, + "step": 445 + }, + { + "epoch": 0.08915319457284926, + "grad_norm": 2.296875, + "learning_rate": 9.986711460467362e-06, + "loss": 1.158, + "step": 446 + }, + { + "epoch": 0.08935308962794533, + "grad_norm": 2.203125, + "learning_rate": 9.986634571388333e-06, + "loss": 1.211, + "step": 447 + }, + { + "epoch": 0.0895529846830414, + "grad_norm": 2.03125, + "learning_rate": 9.986557460803527e-06, + "loss": 1.0706, + "step": 448 + }, + { + "epoch": 0.08975287973813748, + "grad_norm": 2.171875, + "learning_rate": 9.986480128716374e-06, + "loss": 1.0745, + "step": 449 + }, + { + "epoch": 0.08995277479323355, + "grad_norm": 2.28125, + "learning_rate": 9.986402575130305e-06, + "loss": 1.1669, + "step": 450 + }, + { + "epoch": 0.09015266984832963, + "grad_norm": 2.15625, + "learning_rate": 9.986324800048767e-06, + "loss": 1.0876, + "step": 451 + }, + { + "epoch": 0.0903525649034257, + "grad_norm": 2.25, + "learning_rate": 9.986246803475213e-06, + "loss": 1.2014, + "step": 452 + }, + { + "epoch": 0.09055245995852178, + "grad_norm": 2.484375, + "learning_rate": 9.986168585413108e-06, + "loss": 1.2562, + "step": 453 + }, + { + "epoch": 0.09075235501361785, + "grad_norm": 2.203125, + "learning_rate": 9.98609014586593e-06, + "loss": 1.1761, + "step": 454 + }, + { + "epoch": 0.09095225006871392, + "grad_norm": 2.140625, + "learning_rate": 9.986011484837157e-06, + "loss": 1.1152, + "step": 455 + }, + { + "epoch": 0.09115214512381, + "grad_norm": 2.0625, + "learning_rate": 9.985932602330287e-06, + "loss": 1.047, + "step": 456 + }, + { + "epoch": 0.09135204017890608, + "grad_norm": 2.09375, + "learning_rate": 9.985853498348823e-06, + "loss": 1.1492, + "step": 457 + }, + { + "epoch": 0.09155193523400215, + "grad_norm": 2.25, + "learning_rate": 9.985774172896281e-06, + "loss": 1.1138, + "step": 458 + }, + { + "epoch": 0.09175183028909822, + "grad_norm": 2.265625, + "learning_rate": 9.985694625976181e-06, + "loss": 1.106, + "step": 459 + }, + { + "epoch": 0.0919517253441943, + "grad_norm": 2.3125, + "learning_rate": 9.985614857592058e-06, + "loss": 1.1019, + "step": 460 + }, + { + "epoch": 0.09215162039929037, + "grad_norm": 2.40625, + "learning_rate": 9.985534867747457e-06, + "loss": 1.1848, + "step": 461 + }, + { + "epoch": 0.09235151545438644, + "grad_norm": 2.171875, + "learning_rate": 9.985454656445928e-06, + "loss": 1.1093, + "step": 462 + }, + { + "epoch": 0.09255141050948253, + "grad_norm": 2.21875, + "learning_rate": 9.985374223691039e-06, + "loss": 1.1284, + "step": 463 + }, + { + "epoch": 0.0927513055645786, + "grad_norm": 2.3125, + "learning_rate": 9.985293569486356e-06, + "loss": 1.2787, + "step": 464 + }, + { + "epoch": 0.09295120061967467, + "grad_norm": 2.171875, + "learning_rate": 9.985212693835465e-06, + "loss": 1.1167, + "step": 465 + }, + { + "epoch": 0.09315109567477074, + "grad_norm": 2.0625, + "learning_rate": 9.985131596741959e-06, + "loss": 1.158, + "step": 466 + }, + { + "epoch": 0.09335099072986681, + "grad_norm": 2.1875, + "learning_rate": 9.985050278209442e-06, + "loss": 1.2403, + "step": 467 + }, + { + "epoch": 0.0935508857849629, + "grad_norm": 2.234375, + "learning_rate": 9.984968738241522e-06, + "loss": 1.1573, + "step": 468 + }, + { + "epoch": 0.09375078084005897, + "grad_norm": 2.078125, + "learning_rate": 9.984886976841824e-06, + "loss": 1.0895, + "step": 469 + }, + { + "epoch": 0.09395067589515504, + "grad_norm": 2.109375, + "learning_rate": 9.98480499401398e-06, + "loss": 1.0453, + "step": 470 + }, + { + "epoch": 0.09415057095025112, + "grad_norm": 2.34375, + "learning_rate": 9.984722789761627e-06, + "loss": 1.2329, + "step": 471 + }, + { + "epoch": 0.09435046600534719, + "grad_norm": 2.28125, + "learning_rate": 9.984640364088422e-06, + "loss": 1.213, + "step": 472 + }, + { + "epoch": 0.09455036106044326, + "grad_norm": 2.234375, + "learning_rate": 9.984557716998026e-06, + "loss": 1.0911, + "step": 473 + }, + { + "epoch": 0.09475025611553935, + "grad_norm": 2.21875, + "learning_rate": 9.984474848494107e-06, + "loss": 1.0302, + "step": 474 + }, + { + "epoch": 0.09495015117063542, + "grad_norm": 2.1875, + "learning_rate": 9.984391758580347e-06, + "loss": 1.0809, + "step": 475 + }, + { + "epoch": 0.09515004622573149, + "grad_norm": 2.25, + "learning_rate": 9.984308447260437e-06, + "loss": 1.1552, + "step": 476 + }, + { + "epoch": 0.09534994128082756, + "grad_norm": 2.25, + "learning_rate": 9.98422491453808e-06, + "loss": 1.1293, + "step": 477 + }, + { + "epoch": 0.09554983633592364, + "grad_norm": 2.3125, + "learning_rate": 9.984141160416985e-06, + "loss": 1.0692, + "step": 478 + }, + { + "epoch": 0.09574973139101972, + "grad_norm": 2.375, + "learning_rate": 9.98405718490087e-06, + "loss": 1.2213, + "step": 479 + }, + { + "epoch": 0.0959496264461158, + "grad_norm": 2.0625, + "learning_rate": 9.983972987993468e-06, + "loss": 1.1687, + "step": 480 + }, + { + "epoch": 0.09614952150121187, + "grad_norm": 2.203125, + "learning_rate": 9.983888569698519e-06, + "loss": 1.1588, + "step": 481 + }, + { + "epoch": 0.09634941655630794, + "grad_norm": 2.1875, + "learning_rate": 9.983803930019771e-06, + "loss": 1.1676, + "step": 482 + }, + { + "epoch": 0.09654931161140401, + "grad_norm": 2.234375, + "learning_rate": 9.983719068960985e-06, + "loss": 1.1553, + "step": 483 + }, + { + "epoch": 0.09674920666650008, + "grad_norm": 2.28125, + "learning_rate": 9.983633986525932e-06, + "loss": 1.2282, + "step": 484 + }, + { + "epoch": 0.09694910172159617, + "grad_norm": 2.28125, + "learning_rate": 9.983548682718388e-06, + "loss": 1.17, + "step": 485 + }, + { + "epoch": 0.09714899677669224, + "grad_norm": 2.359375, + "learning_rate": 9.983463157542142e-06, + "loss": 1.1835, + "step": 486 + }, + { + "epoch": 0.09734889183178831, + "grad_norm": 2.140625, + "learning_rate": 9.983377411000996e-06, + "loss": 1.1484, + "step": 487 + }, + { + "epoch": 0.09754878688688438, + "grad_norm": 2.25, + "learning_rate": 9.983291443098759e-06, + "loss": 1.1656, + "step": 488 + }, + { + "epoch": 0.09774868194198046, + "grad_norm": 2.234375, + "learning_rate": 9.983205253839247e-06, + "loss": 1.1873, + "step": 489 + }, + { + "epoch": 0.09794857699707653, + "grad_norm": 2.09375, + "learning_rate": 9.98311884322629e-06, + "loss": 1.0371, + "step": 490 + }, + { + "epoch": 0.09814847205217261, + "grad_norm": 2.15625, + "learning_rate": 9.983032211263725e-06, + "loss": 1.1887, + "step": 491 + }, + { + "epoch": 0.09834836710726869, + "grad_norm": 2.234375, + "learning_rate": 9.982945357955406e-06, + "loss": 1.0655, + "step": 492 + }, + { + "epoch": 0.09854826216236476, + "grad_norm": 2.296875, + "learning_rate": 9.982858283305181e-06, + "loss": 1.1893, + "step": 493 + }, + { + "epoch": 0.09874815721746083, + "grad_norm": 2.171875, + "learning_rate": 9.982770987316926e-06, + "loss": 1.1584, + "step": 494 + }, + { + "epoch": 0.0989480522725569, + "grad_norm": 2.109375, + "learning_rate": 9.982683469994515e-06, + "loss": 1.1144, + "step": 495 + }, + { + "epoch": 0.09914794732765299, + "grad_norm": 2.15625, + "learning_rate": 9.982595731341838e-06, + "loss": 1.153, + "step": 496 + }, + { + "epoch": 0.09934784238274906, + "grad_norm": 2.234375, + "learning_rate": 9.982507771362789e-06, + "loss": 1.0859, + "step": 497 + }, + { + "epoch": 0.09954773743784513, + "grad_norm": 2.15625, + "learning_rate": 9.982419590061277e-06, + "loss": 1.1539, + "step": 498 + }, + { + "epoch": 0.0997476324929412, + "grad_norm": 2.3125, + "learning_rate": 9.98233118744122e-06, + "loss": 1.1645, + "step": 499 + }, + { + "epoch": 0.09994752754803728, + "grad_norm": 2.25, + "learning_rate": 9.982242563506543e-06, + "loss": 1.2078, + "step": 500 + }, + { + "epoch": 0.10014742260313335, + "grad_norm": 2.203125, + "learning_rate": 9.982153718261183e-06, + "loss": 1.1374, + "step": 501 + }, + { + "epoch": 0.10034731765822943, + "grad_norm": 2.203125, + "learning_rate": 9.982064651709088e-06, + "loss": 1.1488, + "step": 502 + }, + { + "epoch": 0.1005472127133255, + "grad_norm": 2.3125, + "learning_rate": 9.981975363854212e-06, + "loss": 1.11, + "step": 503 + }, + { + "epoch": 0.10074710776842158, + "grad_norm": 2.234375, + "learning_rate": 9.981885854700524e-06, + "loss": 1.1718, + "step": 504 + }, + { + "epoch": 0.10094700282351765, + "grad_norm": 2.28125, + "learning_rate": 9.981796124251999e-06, + "loss": 1.1763, + "step": 505 + }, + { + "epoch": 0.10114689787861372, + "grad_norm": 2.109375, + "learning_rate": 9.98170617251262e-06, + "loss": 1.103, + "step": 506 + }, + { + "epoch": 0.10134679293370981, + "grad_norm": 2.421875, + "learning_rate": 9.981615999486386e-06, + "loss": 1.1321, + "step": 507 + }, + { + "epoch": 0.10154668798880588, + "grad_norm": 2.265625, + "learning_rate": 9.981525605177301e-06, + "loss": 1.1236, + "step": 508 + }, + { + "epoch": 0.10174658304390195, + "grad_norm": 2.125, + "learning_rate": 9.981434989589382e-06, + "loss": 1.0633, + "step": 509 + }, + { + "epoch": 0.10194647809899803, + "grad_norm": 2.25, + "learning_rate": 9.981344152726651e-06, + "loss": 1.2384, + "step": 510 + }, + { + "epoch": 0.1021463731540941, + "grad_norm": 2.21875, + "learning_rate": 9.981253094593147e-06, + "loss": 1.0622, + "step": 511 + }, + { + "epoch": 0.10234626820919017, + "grad_norm": 2.34375, + "learning_rate": 9.98116181519291e-06, + "loss": 1.1921, + "step": 512 + }, + { + "epoch": 0.10254616326428626, + "grad_norm": 2.234375, + "learning_rate": 9.981070314529998e-06, + "loss": 1.0312, + "step": 513 + }, + { + "epoch": 0.10274605831938233, + "grad_norm": 2.40625, + "learning_rate": 9.980978592608475e-06, + "loss": 1.1928, + "step": 514 + }, + { + "epoch": 0.1029459533744784, + "grad_norm": 2.3125, + "learning_rate": 9.980886649432413e-06, + "loss": 1.2021, + "step": 515 + }, + { + "epoch": 0.10314584842957447, + "grad_norm": 2.296875, + "learning_rate": 9.9807944850059e-06, + "loss": 1.1377, + "step": 516 + }, + { + "epoch": 0.10334574348467054, + "grad_norm": 2.203125, + "learning_rate": 9.980702099333029e-06, + "loss": 1.0915, + "step": 517 + }, + { + "epoch": 0.10354563853976662, + "grad_norm": 2.234375, + "learning_rate": 9.980609492417901e-06, + "loss": 1.1397, + "step": 518 + }, + { + "epoch": 0.1037455335948627, + "grad_norm": 2.15625, + "learning_rate": 9.980516664264632e-06, + "loss": 1.0924, + "step": 519 + }, + { + "epoch": 0.10394542864995877, + "grad_norm": 2.234375, + "learning_rate": 9.980423614877344e-06, + "loss": 1.2099, + "step": 520 + }, + { + "epoch": 0.10414532370505485, + "grad_norm": 2.21875, + "learning_rate": 9.980330344260172e-06, + "loss": 1.124, + "step": 521 + }, + { + "epoch": 0.10434521876015092, + "grad_norm": 2.28125, + "learning_rate": 9.980236852417256e-06, + "loss": 1.1106, + "step": 522 + }, + { + "epoch": 0.10454511381524699, + "grad_norm": 2.203125, + "learning_rate": 9.980143139352753e-06, + "loss": 1.1668, + "step": 523 + }, + { + "epoch": 0.10474500887034308, + "grad_norm": 2.078125, + "learning_rate": 9.980049205070824e-06, + "loss": 1.0663, + "step": 524 + }, + { + "epoch": 0.10494490392543915, + "grad_norm": 2.203125, + "learning_rate": 9.979955049575639e-06, + "loss": 1.1818, + "step": 525 + }, + { + "epoch": 0.10514479898053522, + "grad_norm": 2.25, + "learning_rate": 9.979860672871384e-06, + "loss": 1.1244, + "step": 526 + }, + { + "epoch": 0.10534469403563129, + "grad_norm": 2.21875, + "learning_rate": 9.979766074962249e-06, + "loss": 1.1311, + "step": 527 + }, + { + "epoch": 0.10554458909072736, + "grad_norm": 2.1875, + "learning_rate": 9.979671255852437e-06, + "loss": 1.1327, + "step": 528 + }, + { + "epoch": 0.10574448414582344, + "grad_norm": 2.25, + "learning_rate": 9.979576215546161e-06, + "loss": 1.1418, + "step": 529 + }, + { + "epoch": 0.10594437920091952, + "grad_norm": 2.125, + "learning_rate": 9.97948095404764e-06, + "loss": 1.1011, + "step": 530 + }, + { + "epoch": 0.1061442742560156, + "grad_norm": 2.21875, + "learning_rate": 9.979385471361108e-06, + "loss": 1.0591, + "step": 531 + }, + { + "epoch": 0.10634416931111167, + "grad_norm": 2.3125, + "learning_rate": 9.979289767490803e-06, + "loss": 1.1741, + "step": 532 + }, + { + "epoch": 0.10654406436620774, + "grad_norm": 2.171875, + "learning_rate": 9.97919384244098e-06, + "loss": 1.1312, + "step": 533 + }, + { + "epoch": 0.10674395942130381, + "grad_norm": 2.265625, + "learning_rate": 9.979097696215898e-06, + "loss": 1.1005, + "step": 534 + }, + { + "epoch": 0.1069438544763999, + "grad_norm": 2.1875, + "learning_rate": 9.979001328819828e-06, + "loss": 1.0767, + "step": 535 + }, + { + "epoch": 0.10714374953149597, + "grad_norm": 2.421875, + "learning_rate": 9.978904740257051e-06, + "loss": 1.2502, + "step": 536 + }, + { + "epoch": 0.10734364458659204, + "grad_norm": 2.15625, + "learning_rate": 9.978807930531857e-06, + "loss": 1.0616, + "step": 537 + }, + { + "epoch": 0.10754353964168811, + "grad_norm": 2.203125, + "learning_rate": 9.978710899648547e-06, + "loss": 1.1741, + "step": 538 + }, + { + "epoch": 0.10774343469678418, + "grad_norm": 2.28125, + "learning_rate": 9.978613647611429e-06, + "loss": 1.1575, + "step": 539 + }, + { + "epoch": 0.10794332975188026, + "grad_norm": 2.3125, + "learning_rate": 9.978516174424826e-06, + "loss": 1.2128, + "step": 540 + }, + { + "epoch": 0.10814322480697634, + "grad_norm": 1.9765625, + "learning_rate": 9.978418480093065e-06, + "loss": 1.0435, + "step": 541 + }, + { + "epoch": 0.10834311986207242, + "grad_norm": 2.28125, + "learning_rate": 9.97832056462049e-06, + "loss": 1.1376, + "step": 542 + }, + { + "epoch": 0.10854301491716849, + "grad_norm": 2.1875, + "learning_rate": 9.978222428011444e-06, + "loss": 1.2298, + "step": 543 + }, + { + "epoch": 0.10874290997226456, + "grad_norm": 2.21875, + "learning_rate": 9.97812407027029e-06, + "loss": 1.168, + "step": 544 + }, + { + "epoch": 0.10894280502736063, + "grad_norm": 2.15625, + "learning_rate": 9.978025491401397e-06, + "loss": 1.1013, + "step": 545 + }, + { + "epoch": 0.1091427000824567, + "grad_norm": 2.234375, + "learning_rate": 9.977926691409143e-06, + "loss": 1.194, + "step": 546 + }, + { + "epoch": 0.10934259513755279, + "grad_norm": 2.171875, + "learning_rate": 9.977827670297917e-06, + "loss": 1.0775, + "step": 547 + }, + { + "epoch": 0.10954249019264886, + "grad_norm": 2.140625, + "learning_rate": 9.977728428072118e-06, + "loss": 1.1678, + "step": 548 + }, + { + "epoch": 0.10974238524774493, + "grad_norm": 2.171875, + "learning_rate": 9.977628964736153e-06, + "loss": 1.0741, + "step": 549 + }, + { + "epoch": 0.109942280302841, + "grad_norm": 2.3125, + "learning_rate": 9.977529280294442e-06, + "loss": 1.2779, + "step": 550 + }, + { + "epoch": 0.11014217535793708, + "grad_norm": 2.3125, + "learning_rate": 9.977429374751411e-06, + "loss": 1.2253, + "step": 551 + }, + { + "epoch": 0.11034207041303316, + "grad_norm": 2.09375, + "learning_rate": 9.9773292481115e-06, + "loss": 1.065, + "step": 552 + }, + { + "epoch": 0.11054196546812924, + "grad_norm": 2.125, + "learning_rate": 9.977228900379155e-06, + "loss": 1.154, + "step": 553 + }, + { + "epoch": 0.11074186052322531, + "grad_norm": 2.359375, + "learning_rate": 9.977128331558834e-06, + "loss": 1.1185, + "step": 554 + }, + { + "epoch": 0.11094175557832138, + "grad_norm": 2.359375, + "learning_rate": 9.977027541655003e-06, + "loss": 1.1429, + "step": 555 + }, + { + "epoch": 0.11114165063341745, + "grad_norm": 2.25, + "learning_rate": 9.97692653067214e-06, + "loss": 1.1406, + "step": 556 + }, + { + "epoch": 0.11134154568851352, + "grad_norm": 2.234375, + "learning_rate": 9.976825298614734e-06, + "loss": 1.1413, + "step": 557 + }, + { + "epoch": 0.11154144074360961, + "grad_norm": 2.125, + "learning_rate": 9.976723845487278e-06, + "loss": 1.0157, + "step": 558 + }, + { + "epoch": 0.11174133579870568, + "grad_norm": 2.40625, + "learning_rate": 9.97662217129428e-06, + "loss": 1.2004, + "step": 559 + }, + { + "epoch": 0.11194123085380175, + "grad_norm": 2.3125, + "learning_rate": 9.97652027604026e-06, + "loss": 1.159, + "step": 560 + }, + { + "epoch": 0.11214112590889783, + "grad_norm": 2.171875, + "learning_rate": 9.976418159729737e-06, + "loss": 1.0831, + "step": 561 + }, + { + "epoch": 0.1123410209639939, + "grad_norm": 2.140625, + "learning_rate": 9.976315822367254e-06, + "loss": 1.1762, + "step": 562 + }, + { + "epoch": 0.11254091601908998, + "grad_norm": 2.203125, + "learning_rate": 9.97621326395735e-06, + "loss": 1.1847, + "step": 563 + }, + { + "epoch": 0.11274081107418606, + "grad_norm": 2.15625, + "learning_rate": 9.976110484504587e-06, + "loss": 1.0756, + "step": 564 + }, + { + "epoch": 0.11294070612928213, + "grad_norm": 2.234375, + "learning_rate": 9.976007484013528e-06, + "loss": 1.0894, + "step": 565 + }, + { + "epoch": 0.1131406011843782, + "grad_norm": 2.1875, + "learning_rate": 9.975904262488747e-06, + "loss": 1.0933, + "step": 566 + }, + { + "epoch": 0.11334049623947427, + "grad_norm": 2.21875, + "learning_rate": 9.97580081993483e-06, + "loss": 1.0876, + "step": 567 + }, + { + "epoch": 0.11354039129457034, + "grad_norm": 2.28125, + "learning_rate": 9.975697156356372e-06, + "loss": 1.0503, + "step": 568 + }, + { + "epoch": 0.11374028634966643, + "grad_norm": 2.140625, + "learning_rate": 9.975593271757977e-06, + "loss": 1.0898, + "step": 569 + }, + { + "epoch": 0.1139401814047625, + "grad_norm": 2.171875, + "learning_rate": 9.975489166144262e-06, + "loss": 1.1363, + "step": 570 + }, + { + "epoch": 0.11414007645985857, + "grad_norm": 2.15625, + "learning_rate": 9.975384839519849e-06, + "loss": 1.1002, + "step": 571 + }, + { + "epoch": 0.11433997151495465, + "grad_norm": 2.140625, + "learning_rate": 9.975280291889373e-06, + "loss": 1.0651, + "step": 572 + }, + { + "epoch": 0.11453986657005072, + "grad_norm": 2.234375, + "learning_rate": 9.975175523257477e-06, + "loss": 1.0237, + "step": 573 + }, + { + "epoch": 0.11473976162514679, + "grad_norm": 2.28125, + "learning_rate": 9.975070533628817e-06, + "loss": 1.072, + "step": 574 + }, + { + "epoch": 0.11493965668024288, + "grad_norm": 2.265625, + "learning_rate": 9.974965323008055e-06, + "loss": 1.0854, + "step": 575 + }, + { + "epoch": 0.11513955173533895, + "grad_norm": 2.4375, + "learning_rate": 9.974859891399863e-06, + "loss": 1.1208, + "step": 576 + }, + { + "epoch": 0.11533944679043502, + "grad_norm": 2.1875, + "learning_rate": 9.974754238808927e-06, + "loss": 1.1068, + "step": 577 + }, + { + "epoch": 0.1155393418455311, + "grad_norm": 2.125, + "learning_rate": 9.974648365239938e-06, + "loss": 1.0612, + "step": 578 + }, + { + "epoch": 0.11573923690062717, + "grad_norm": 2.21875, + "learning_rate": 9.974542270697602e-06, + "loss": 1.1479, + "step": 579 + }, + { + "epoch": 0.11593913195572325, + "grad_norm": 2.125, + "learning_rate": 9.974435955186628e-06, + "loss": 1.1138, + "step": 580 + }, + { + "epoch": 0.11613902701081932, + "grad_norm": 2.234375, + "learning_rate": 9.974329418711742e-06, + "loss": 1.1786, + "step": 581 + }, + { + "epoch": 0.1163389220659154, + "grad_norm": 2.21875, + "learning_rate": 9.974222661277672e-06, + "loss": 1.1402, + "step": 582 + }, + { + "epoch": 0.11653881712101147, + "grad_norm": 2.265625, + "learning_rate": 9.974115682889164e-06, + "loss": 1.0779, + "step": 583 + }, + { + "epoch": 0.11673871217610754, + "grad_norm": 2.203125, + "learning_rate": 9.97400848355097e-06, + "loss": 1.1407, + "step": 584 + }, + { + "epoch": 0.11693860723120361, + "grad_norm": 2.515625, + "learning_rate": 9.973901063267848e-06, + "loss": 1.1509, + "step": 585 + }, + { + "epoch": 0.1171385022862997, + "grad_norm": 2.234375, + "learning_rate": 9.973793422044573e-06, + "loss": 1.1145, + "step": 586 + }, + { + "epoch": 0.11733839734139577, + "grad_norm": 2.3125, + "learning_rate": 9.973685559885927e-06, + "loss": 1.1153, + "step": 587 + }, + { + "epoch": 0.11753829239649184, + "grad_norm": 2.25, + "learning_rate": 9.973577476796697e-06, + "loss": 1.1684, + "step": 588 + }, + { + "epoch": 0.11773818745158791, + "grad_norm": 2.234375, + "learning_rate": 9.973469172781688e-06, + "loss": 1.1161, + "step": 589 + }, + { + "epoch": 0.11793808250668399, + "grad_norm": 2.265625, + "learning_rate": 9.973360647845708e-06, + "loss": 1.1282, + "step": 590 + }, + { + "epoch": 0.11813797756178007, + "grad_norm": 2.203125, + "learning_rate": 9.973251901993582e-06, + "loss": 1.0917, + "step": 591 + }, + { + "epoch": 0.11833787261687614, + "grad_norm": 2.296875, + "learning_rate": 9.973142935230135e-06, + "loss": 1.222, + "step": 592 + }, + { + "epoch": 0.11853776767197222, + "grad_norm": 2.25, + "learning_rate": 9.97303374756021e-06, + "loss": 1.1674, + "step": 593 + }, + { + "epoch": 0.11873766272706829, + "grad_norm": 2.171875, + "learning_rate": 9.972924338988658e-06, + "loss": 1.095, + "step": 594 + }, + { + "epoch": 0.11893755778216436, + "grad_norm": 2.140625, + "learning_rate": 9.972814709520339e-06, + "loss": 1.0656, + "step": 595 + }, + { + "epoch": 0.11913745283726043, + "grad_norm": 2.265625, + "learning_rate": 9.97270485916012e-06, + "loss": 1.0735, + "step": 596 + }, + { + "epoch": 0.11933734789235652, + "grad_norm": 2.109375, + "learning_rate": 9.972594787912884e-06, + "loss": 0.9428, + "step": 597 + }, + { + "epoch": 0.11953724294745259, + "grad_norm": 2.046875, + "learning_rate": 9.972484495783518e-06, + "loss": 1.092, + "step": 598 + }, + { + "epoch": 0.11973713800254866, + "grad_norm": 2.125, + "learning_rate": 9.972373982776922e-06, + "loss": 1.0373, + "step": 599 + }, + { + "epoch": 0.11993703305764473, + "grad_norm": 2.15625, + "learning_rate": 9.972263248898004e-06, + "loss": 1.1532, + "step": 600 + }, + { + "epoch": 0.1201369281127408, + "grad_norm": 2.234375, + "learning_rate": 9.972152294151682e-06, + "loss": 1.0678, + "step": 601 + }, + { + "epoch": 0.12033682316783688, + "grad_norm": 2.171875, + "learning_rate": 9.972041118542889e-06, + "loss": 1.0426, + "step": 602 + }, + { + "epoch": 0.12053671822293296, + "grad_norm": 2.171875, + "learning_rate": 9.97192972207656e-06, + "loss": 1.1312, + "step": 603 + }, + { + "epoch": 0.12073661327802904, + "grad_norm": 2.125, + "learning_rate": 9.971818104757643e-06, + "loss": 1.0671, + "step": 604 + }, + { + "epoch": 0.12093650833312511, + "grad_norm": 2.21875, + "learning_rate": 9.971706266591097e-06, + "loss": 1.1811, + "step": 605 + }, + { + "epoch": 0.12113640338822118, + "grad_norm": 2.1875, + "learning_rate": 9.97159420758189e-06, + "loss": 1.1151, + "step": 606 + }, + { + "epoch": 0.12133629844331725, + "grad_norm": 2.0625, + "learning_rate": 9.971481927734998e-06, + "loss": 1.0453, + "step": 607 + }, + { + "epoch": 0.12153619349841334, + "grad_norm": 2.125, + "learning_rate": 9.97136942705541e-06, + "loss": 1.0407, + "step": 608 + }, + { + "epoch": 0.12173608855350941, + "grad_norm": 2.078125, + "learning_rate": 9.971256705548125e-06, + "loss": 1.0148, + "step": 609 + }, + { + "epoch": 0.12193598360860548, + "grad_norm": 2.21875, + "learning_rate": 9.971143763218145e-06, + "loss": 1.1297, + "step": 610 + }, + { + "epoch": 0.12213587866370156, + "grad_norm": 2.078125, + "learning_rate": 9.971030600070493e-06, + "loss": 1.0892, + "step": 611 + }, + { + "epoch": 0.12233577371879763, + "grad_norm": 2.25, + "learning_rate": 9.970917216110192e-06, + "loss": 1.1809, + "step": 612 + }, + { + "epoch": 0.1225356687738937, + "grad_norm": 2.1875, + "learning_rate": 9.970803611342278e-06, + "loss": 1.1111, + "step": 613 + }, + { + "epoch": 0.12273556382898979, + "grad_norm": 2.203125, + "learning_rate": 9.970689785771798e-06, + "loss": 1.0206, + "step": 614 + }, + { + "epoch": 0.12293545888408586, + "grad_norm": 2.359375, + "learning_rate": 9.97057573940381e-06, + "loss": 1.2189, + "step": 615 + }, + { + "epoch": 0.12313535393918193, + "grad_norm": 2.234375, + "learning_rate": 9.970461472243378e-06, + "loss": 1.114, + "step": 616 + }, + { + "epoch": 0.123335248994278, + "grad_norm": 2.09375, + "learning_rate": 9.97034698429558e-06, + "loss": 1.0699, + "step": 617 + }, + { + "epoch": 0.12353514404937407, + "grad_norm": 2.109375, + "learning_rate": 9.970232275565497e-06, + "loss": 1.1188, + "step": 618 + }, + { + "epoch": 0.12373503910447016, + "grad_norm": 2.234375, + "learning_rate": 9.970117346058229e-06, + "loss": 1.138, + "step": 619 + }, + { + "epoch": 0.12393493415956623, + "grad_norm": 2.265625, + "learning_rate": 9.970002195778879e-06, + "loss": 1.2651, + "step": 620 + }, + { + "epoch": 0.1241348292146623, + "grad_norm": 2.265625, + "learning_rate": 9.969886824732561e-06, + "loss": 1.1851, + "step": 621 + }, + { + "epoch": 0.12433472426975838, + "grad_norm": 2.21875, + "learning_rate": 9.969771232924404e-06, + "loss": 1.0908, + "step": 622 + }, + { + "epoch": 0.12453461932485445, + "grad_norm": 2.21875, + "learning_rate": 9.969655420359537e-06, + "loss": 1.0961, + "step": 623 + }, + { + "epoch": 0.12473451437995052, + "grad_norm": 2.21875, + "learning_rate": 9.969539387043106e-06, + "loss": 1.0848, + "step": 624 + }, + { + "epoch": 0.1249344094350466, + "grad_norm": 2.125, + "learning_rate": 9.969423132980269e-06, + "loss": 1.1229, + "step": 625 + }, + { + "epoch": 0.12513430449014268, + "grad_norm": 2.140625, + "learning_rate": 9.969306658176185e-06, + "loss": 1.1251, + "step": 626 + }, + { + "epoch": 0.12533419954523875, + "grad_norm": 2.140625, + "learning_rate": 9.969189962636032e-06, + "loss": 1.0888, + "step": 627 + }, + { + "epoch": 0.12553409460033482, + "grad_norm": 2.265625, + "learning_rate": 9.96907304636499e-06, + "loss": 1.1758, + "step": 628 + }, + { + "epoch": 0.1257339896554309, + "grad_norm": 2.21875, + "learning_rate": 9.968955909368256e-06, + "loss": 1.2216, + "step": 629 + }, + { + "epoch": 0.12593388471052697, + "grad_norm": 2.15625, + "learning_rate": 9.96883855165103e-06, + "loss": 1.1126, + "step": 630 + }, + { + "epoch": 0.12613377976562304, + "grad_norm": 2.234375, + "learning_rate": 9.968720973218525e-06, + "loss": 1.0975, + "step": 631 + }, + { + "epoch": 0.1263336748207191, + "grad_norm": 2.328125, + "learning_rate": 9.968603174075967e-06, + "loss": 1.1615, + "step": 632 + }, + { + "epoch": 0.1265335698758152, + "grad_norm": 2.1875, + "learning_rate": 9.968485154228584e-06, + "loss": 1.0805, + "step": 633 + }, + { + "epoch": 0.12673346493091128, + "grad_norm": 2.390625, + "learning_rate": 9.968366913681624e-06, + "loss": 1.0741, + "step": 634 + }, + { + "epoch": 0.12693335998600735, + "grad_norm": 2.34375, + "learning_rate": 9.968248452440335e-06, + "loss": 1.1183, + "step": 635 + }, + { + "epoch": 0.12713325504110343, + "grad_norm": 2.140625, + "learning_rate": 9.96812977050998e-06, + "loss": 1.069, + "step": 636 + }, + { + "epoch": 0.1273331500961995, + "grad_norm": 2.125, + "learning_rate": 9.968010867895832e-06, + "loss": 1.1027, + "step": 637 + }, + { + "epoch": 0.12753304515129557, + "grad_norm": 2.265625, + "learning_rate": 9.967891744603173e-06, + "loss": 1.1573, + "step": 638 + }, + { + "epoch": 0.12773294020639164, + "grad_norm": 2.28125, + "learning_rate": 9.967772400637292e-06, + "loss": 1.1458, + "step": 639 + }, + { + "epoch": 0.12793283526148772, + "grad_norm": 2.21875, + "learning_rate": 9.96765283600349e-06, + "loss": 1.1447, + "step": 640 + }, + { + "epoch": 0.1281327303165838, + "grad_norm": 2.296875, + "learning_rate": 9.967533050707081e-06, + "loss": 1.1196, + "step": 641 + }, + { + "epoch": 0.12833262537167986, + "grad_norm": 2.171875, + "learning_rate": 9.967413044753385e-06, + "loss": 1.1622, + "step": 642 + }, + { + "epoch": 0.12853252042677593, + "grad_norm": 2.15625, + "learning_rate": 9.967292818147731e-06, + "loss": 1.1221, + "step": 643 + }, + { + "epoch": 0.128732415481872, + "grad_norm": 2.109375, + "learning_rate": 9.967172370895462e-06, + "loss": 1.0577, + "step": 644 + }, + { + "epoch": 0.1289323105369681, + "grad_norm": 2.125, + "learning_rate": 9.967051703001926e-06, + "loss": 0.9921, + "step": 645 + }, + { + "epoch": 0.12913220559206418, + "grad_norm": 2.265625, + "learning_rate": 9.966930814472484e-06, + "loss": 1.2149, + "step": 646 + }, + { + "epoch": 0.12933210064716025, + "grad_norm": 2.28125, + "learning_rate": 9.966809705312506e-06, + "loss": 1.1856, + "step": 647 + }, + { + "epoch": 0.12953199570225632, + "grad_norm": 2.171875, + "learning_rate": 9.96668837552737e-06, + "loss": 1.1351, + "step": 648 + }, + { + "epoch": 0.1297318907573524, + "grad_norm": 2.171875, + "learning_rate": 9.966566825122467e-06, + "loss": 1.059, + "step": 649 + }, + { + "epoch": 0.12993178581244846, + "grad_norm": 2.390625, + "learning_rate": 9.966445054103198e-06, + "loss": 1.1092, + "step": 650 + }, + { + "epoch": 0.13013168086754454, + "grad_norm": 2.296875, + "learning_rate": 9.96632306247497e-06, + "loss": 1.126, + "step": 651 + }, + { + "epoch": 0.1303315759226406, + "grad_norm": 2.1875, + "learning_rate": 9.9662008502432e-06, + "loss": 1.1809, + "step": 652 + }, + { + "epoch": 0.13053147097773668, + "grad_norm": 2.15625, + "learning_rate": 9.96607841741332e-06, + "loss": 1.1616, + "step": 653 + }, + { + "epoch": 0.13073136603283275, + "grad_norm": 2.265625, + "learning_rate": 9.965955763990765e-06, + "loss": 1.0585, + "step": 654 + }, + { + "epoch": 0.13093126108792882, + "grad_norm": 2.203125, + "learning_rate": 9.965832889980987e-06, + "loss": 1.0355, + "step": 655 + }, + { + "epoch": 0.13113115614302492, + "grad_norm": 2.25, + "learning_rate": 9.965709795389441e-06, + "loss": 1.2128, + "step": 656 + }, + { + "epoch": 0.131331051198121, + "grad_norm": 2.265625, + "learning_rate": 9.965586480221599e-06, + "loss": 1.1607, + "step": 657 + }, + { + "epoch": 0.13153094625321707, + "grad_norm": 2.46875, + "learning_rate": 9.965462944482935e-06, + "loss": 1.2712, + "step": 658 + }, + { + "epoch": 0.13173084130831314, + "grad_norm": 2.28125, + "learning_rate": 9.965339188178936e-06, + "loss": 1.143, + "step": 659 + }, + { + "epoch": 0.1319307363634092, + "grad_norm": 2.296875, + "learning_rate": 9.965215211315103e-06, + "loss": 1.1531, + "step": 660 + }, + { + "epoch": 0.13213063141850528, + "grad_norm": 2.15625, + "learning_rate": 9.965091013896937e-06, + "loss": 1.0669, + "step": 661 + }, + { + "epoch": 0.13233052647360136, + "grad_norm": 2.125, + "learning_rate": 9.964966595929961e-06, + "loss": 1.0308, + "step": 662 + }, + { + "epoch": 0.13253042152869743, + "grad_norm": 2.265625, + "learning_rate": 9.9648419574197e-06, + "loss": 1.1347, + "step": 663 + }, + { + "epoch": 0.1327303165837935, + "grad_norm": 2.078125, + "learning_rate": 9.964717098371687e-06, + "loss": 1.1258, + "step": 664 + }, + { + "epoch": 0.13293021163888957, + "grad_norm": 2.125, + "learning_rate": 9.964592018791473e-06, + "loss": 1.2204, + "step": 665 + }, + { + "epoch": 0.13313010669398564, + "grad_norm": 2.078125, + "learning_rate": 9.964466718684609e-06, + "loss": 1.1216, + "step": 666 + }, + { + "epoch": 0.13333000174908174, + "grad_norm": 2.1875, + "learning_rate": 9.964341198056665e-06, + "loss": 1.1338, + "step": 667 + }, + { + "epoch": 0.13352989680417782, + "grad_norm": 2.21875, + "learning_rate": 9.964215456913215e-06, + "loss": 1.1389, + "step": 668 + }, + { + "epoch": 0.1337297918592739, + "grad_norm": 2.25, + "learning_rate": 9.964089495259846e-06, + "loss": 1.2297, + "step": 669 + }, + { + "epoch": 0.13392968691436996, + "grad_norm": 2.1875, + "learning_rate": 9.96396331310215e-06, + "loss": 1.1715, + "step": 670 + }, + { + "epoch": 0.13412958196946603, + "grad_norm": 2.046875, + "learning_rate": 9.963836910445735e-06, + "loss": 1.0458, + "step": 671 + }, + { + "epoch": 0.1343294770245621, + "grad_norm": 2.25, + "learning_rate": 9.963710287296213e-06, + "loss": 1.1225, + "step": 672 + }, + { + "epoch": 0.13452937207965818, + "grad_norm": 2.203125, + "learning_rate": 9.963583443659213e-06, + "loss": 1.1095, + "step": 673 + }, + { + "epoch": 0.13472926713475425, + "grad_norm": 2.140625, + "learning_rate": 9.963456379540364e-06, + "loss": 1.1456, + "step": 674 + }, + { + "epoch": 0.13492916218985032, + "grad_norm": 2.1875, + "learning_rate": 9.963329094945313e-06, + "loss": 1.1126, + "step": 675 + }, + { + "epoch": 0.1351290572449464, + "grad_norm": 2.234375, + "learning_rate": 9.963201589879715e-06, + "loss": 1.0257, + "step": 676 + }, + { + "epoch": 0.13532895230004247, + "grad_norm": 2.09375, + "learning_rate": 9.963073864349232e-06, + "loss": 1.0276, + "step": 677 + }, + { + "epoch": 0.13552884735513857, + "grad_norm": 2.21875, + "learning_rate": 9.962945918359537e-06, + "loss": 1.031, + "step": 678 + }, + { + "epoch": 0.13572874241023464, + "grad_norm": 2.125, + "learning_rate": 9.962817751916316e-06, + "loss": 1.0606, + "step": 679 + }, + { + "epoch": 0.1359286374653307, + "grad_norm": 2.078125, + "learning_rate": 9.962689365025259e-06, + "loss": 1.0507, + "step": 680 + }, + { + "epoch": 0.13612853252042678, + "grad_norm": 2.296875, + "learning_rate": 9.962560757692072e-06, + "loss": 1.2604, + "step": 681 + }, + { + "epoch": 0.13632842757552285, + "grad_norm": 2.125, + "learning_rate": 9.962431929922464e-06, + "loss": 1.1347, + "step": 682 + }, + { + "epoch": 0.13652832263061893, + "grad_norm": 2.171875, + "learning_rate": 9.962302881722162e-06, + "loss": 1.0511, + "step": 683 + }, + { + "epoch": 0.136728217685715, + "grad_norm": 2.34375, + "learning_rate": 9.962173613096895e-06, + "loss": 1.0503, + "step": 684 + }, + { + "epoch": 0.13692811274081107, + "grad_norm": 2.140625, + "learning_rate": 9.962044124052406e-06, + "loss": 1.0673, + "step": 685 + }, + { + "epoch": 0.13712800779590714, + "grad_norm": 2.34375, + "learning_rate": 9.961914414594447e-06, + "loss": 1.2253, + "step": 686 + }, + { + "epoch": 0.13732790285100321, + "grad_norm": 2.15625, + "learning_rate": 9.96178448472878e-06, + "loss": 1.0849, + "step": 687 + }, + { + "epoch": 0.13752779790609929, + "grad_norm": 2.0625, + "learning_rate": 9.961654334461175e-06, + "loss": 1.0873, + "step": 688 + }, + { + "epoch": 0.13772769296119539, + "grad_norm": 2.078125, + "learning_rate": 9.961523963797415e-06, + "loss": 1.1177, + "step": 689 + }, + { + "epoch": 0.13792758801629146, + "grad_norm": 2.25, + "learning_rate": 9.961393372743291e-06, + "loss": 1.1586, + "step": 690 + }, + { + "epoch": 0.13812748307138753, + "grad_norm": 2.296875, + "learning_rate": 9.961262561304604e-06, + "loss": 1.1439, + "step": 691 + }, + { + "epoch": 0.1383273781264836, + "grad_norm": 2.296875, + "learning_rate": 9.961131529487161e-06, + "loss": 1.143, + "step": 692 + }, + { + "epoch": 0.13852727318157967, + "grad_norm": 2.203125, + "learning_rate": 9.961000277296788e-06, + "loss": 1.121, + "step": 693 + }, + { + "epoch": 0.13872716823667575, + "grad_norm": 2.203125, + "learning_rate": 9.960868804739312e-06, + "loss": 1.1314, + "step": 694 + }, + { + "epoch": 0.13892706329177182, + "grad_norm": 2.25, + "learning_rate": 9.960737111820572e-06, + "loss": 1.1114, + "step": 695 + }, + { + "epoch": 0.1391269583468679, + "grad_norm": 2.28125, + "learning_rate": 9.96060519854642e-06, + "loss": 1.22, + "step": 696 + }, + { + "epoch": 0.13932685340196396, + "grad_norm": 2.28125, + "learning_rate": 9.960473064922716e-06, + "loss": 1.2051, + "step": 697 + }, + { + "epoch": 0.13952674845706003, + "grad_norm": 2.15625, + "learning_rate": 9.960340710955327e-06, + "loss": 1.0331, + "step": 698 + }, + { + "epoch": 0.1397266435121561, + "grad_norm": 2.21875, + "learning_rate": 9.960208136650137e-06, + "loss": 1.1898, + "step": 699 + }, + { + "epoch": 0.13992653856725218, + "grad_norm": 2.265625, + "learning_rate": 9.960075342013027e-06, + "loss": 1.1642, + "step": 700 + }, + { + "epoch": 0.14012643362234828, + "grad_norm": 2.46875, + "learning_rate": 9.959942327049901e-06, + "loss": 1.1635, + "step": 701 + }, + { + "epoch": 0.14032632867744435, + "grad_norm": 2.21875, + "learning_rate": 9.959809091766667e-06, + "loss": 1.1644, + "step": 702 + }, + { + "epoch": 0.14052622373254042, + "grad_norm": 2.234375, + "learning_rate": 9.959675636169242e-06, + "loss": 1.1439, + "step": 703 + }, + { + "epoch": 0.1407261187876365, + "grad_norm": 2.1875, + "learning_rate": 9.959541960263557e-06, + "loss": 1.0681, + "step": 704 + }, + { + "epoch": 0.14092601384273257, + "grad_norm": 2.15625, + "learning_rate": 9.959408064055547e-06, + "loss": 1.1353, + "step": 705 + }, + { + "epoch": 0.14112590889782864, + "grad_norm": 2.109375, + "learning_rate": 9.95927394755116e-06, + "loss": 1.1704, + "step": 706 + }, + { + "epoch": 0.1413258039529247, + "grad_norm": 2.234375, + "learning_rate": 9.959139610756353e-06, + "loss": 1.2734, + "step": 707 + }, + { + "epoch": 0.14152569900802078, + "grad_norm": 2.171875, + "learning_rate": 9.959005053677096e-06, + "loss": 1.1302, + "step": 708 + }, + { + "epoch": 0.14172559406311686, + "grad_norm": 2.21875, + "learning_rate": 9.958870276319364e-06, + "loss": 1.1517, + "step": 709 + }, + { + "epoch": 0.14192548911821293, + "grad_norm": 2.03125, + "learning_rate": 9.958735278689143e-06, + "loss": 1.0484, + "step": 710 + }, + { + "epoch": 0.142125384173309, + "grad_norm": 2.3125, + "learning_rate": 9.958600060792429e-06, + "loss": 1.1174, + "step": 711 + }, + { + "epoch": 0.1423252792284051, + "grad_norm": 2.203125, + "learning_rate": 9.958464622635233e-06, + "loss": 1.1346, + "step": 712 + }, + { + "epoch": 0.14252517428350117, + "grad_norm": 2.171875, + "learning_rate": 9.958328964223566e-06, + "loss": 1.1224, + "step": 713 + }, + { + "epoch": 0.14272506933859724, + "grad_norm": 2.234375, + "learning_rate": 9.958193085563456e-06, + "loss": 1.1046, + "step": 714 + }, + { + "epoch": 0.14292496439369332, + "grad_norm": 2.1875, + "learning_rate": 9.95805698666094e-06, + "loss": 1.0429, + "step": 715 + }, + { + "epoch": 0.1431248594487894, + "grad_norm": 2.25, + "learning_rate": 9.957920667522063e-06, + "loss": 1.2122, + "step": 716 + }, + { + "epoch": 0.14332475450388546, + "grad_norm": 2.15625, + "learning_rate": 9.957784128152877e-06, + "loss": 1.1542, + "step": 717 + }, + { + "epoch": 0.14352464955898153, + "grad_norm": 2.21875, + "learning_rate": 9.957647368559451e-06, + "loss": 1.1105, + "step": 718 + }, + { + "epoch": 0.1437245446140776, + "grad_norm": 2.203125, + "learning_rate": 9.957510388747858e-06, + "loss": 1.2298, + "step": 719 + }, + { + "epoch": 0.14392443966917368, + "grad_norm": 2.125, + "learning_rate": 9.957373188724184e-06, + "loss": 1.0803, + "step": 720 + }, + { + "epoch": 0.14412433472426975, + "grad_norm": 2.109375, + "learning_rate": 9.95723576849452e-06, + "loss": 1.0925, + "step": 721 + }, + { + "epoch": 0.14432422977936582, + "grad_norm": 2.3125, + "learning_rate": 9.957098128064974e-06, + "loss": 1.1767, + "step": 722 + }, + { + "epoch": 0.14452412483446192, + "grad_norm": 2.15625, + "learning_rate": 9.95696026744166e-06, + "loss": 1.0755, + "step": 723 + }, + { + "epoch": 0.144724019889558, + "grad_norm": 2.171875, + "learning_rate": 9.956822186630697e-06, + "loss": 1.1333, + "step": 724 + }, + { + "epoch": 0.14492391494465406, + "grad_norm": 2.234375, + "learning_rate": 9.956683885638225e-06, + "loss": 1.1836, + "step": 725 + }, + { + "epoch": 0.14512380999975014, + "grad_norm": 2.234375, + "learning_rate": 9.956545364470383e-06, + "loss": 1.1527, + "step": 726 + }, + { + "epoch": 0.1453237050548462, + "grad_norm": 2.03125, + "learning_rate": 9.956406623133326e-06, + "loss": 1.0995, + "step": 727 + }, + { + "epoch": 0.14552360010994228, + "grad_norm": 2.21875, + "learning_rate": 9.956267661633215e-06, + "loss": 1.1751, + "step": 728 + }, + { + "epoch": 0.14572349516503835, + "grad_norm": 2.234375, + "learning_rate": 9.956128479976223e-06, + "loss": 1.2085, + "step": 729 + }, + { + "epoch": 0.14592339022013442, + "grad_norm": 2.171875, + "learning_rate": 9.955989078168535e-06, + "loss": 1.1981, + "step": 730 + }, + { + "epoch": 0.1461232852752305, + "grad_norm": 2.0625, + "learning_rate": 9.955849456216343e-06, + "loss": 1.109, + "step": 731 + }, + { + "epoch": 0.14632318033032657, + "grad_norm": 2.171875, + "learning_rate": 9.955709614125844e-06, + "loss": 1.1483, + "step": 732 + }, + { + "epoch": 0.14652307538542264, + "grad_norm": 2.15625, + "learning_rate": 9.955569551903255e-06, + "loss": 1.1024, + "step": 733 + }, + { + "epoch": 0.14672297044051874, + "grad_norm": 2.046875, + "learning_rate": 9.955429269554795e-06, + "loss": 1.0162, + "step": 734 + }, + { + "epoch": 0.1469228654956148, + "grad_norm": 2.125, + "learning_rate": 9.955288767086698e-06, + "loss": 1.1791, + "step": 735 + }, + { + "epoch": 0.14712276055071088, + "grad_norm": 2.1875, + "learning_rate": 9.9551480445052e-06, + "loss": 1.156, + "step": 736 + }, + { + "epoch": 0.14732265560580696, + "grad_norm": 2.1875, + "learning_rate": 9.955007101816558e-06, + "loss": 1.1466, + "step": 737 + }, + { + "epoch": 0.14752255066090303, + "grad_norm": 2.03125, + "learning_rate": 9.954865939027028e-06, + "loss": 1.0374, + "step": 738 + }, + { + "epoch": 0.1477224457159991, + "grad_norm": 2.0625, + "learning_rate": 9.954724556142884e-06, + "loss": 1.0774, + "step": 739 + }, + { + "epoch": 0.14792234077109517, + "grad_norm": 2.1875, + "learning_rate": 9.954582953170403e-06, + "loss": 1.0355, + "step": 740 + }, + { + "epoch": 0.14812223582619125, + "grad_norm": 2.21875, + "learning_rate": 9.954441130115876e-06, + "loss": 1.1776, + "step": 741 + }, + { + "epoch": 0.14832213088128732, + "grad_norm": 2.203125, + "learning_rate": 9.954299086985604e-06, + "loss": 1.1553, + "step": 742 + }, + { + "epoch": 0.1485220259363834, + "grad_norm": 2.171875, + "learning_rate": 9.954156823785896e-06, + "loss": 1.0897, + "step": 743 + }, + { + "epoch": 0.14872192099147946, + "grad_norm": 2.171875, + "learning_rate": 9.95401434052307e-06, + "loss": 1.141, + "step": 744 + }, + { + "epoch": 0.14892181604657556, + "grad_norm": 2.125, + "learning_rate": 9.953871637203456e-06, + "loss": 1.0888, + "step": 745 + }, + { + "epoch": 0.14912171110167163, + "grad_norm": 2.109375, + "learning_rate": 9.953728713833395e-06, + "loss": 1.0699, + "step": 746 + }, + { + "epoch": 0.1493216061567677, + "grad_norm": 2.078125, + "learning_rate": 9.95358557041923e-06, + "loss": 1.0551, + "step": 747 + }, + { + "epoch": 0.14952150121186378, + "grad_norm": 2.203125, + "learning_rate": 9.953442206967327e-06, + "loss": 1.0822, + "step": 748 + }, + { + "epoch": 0.14972139626695985, + "grad_norm": 2.078125, + "learning_rate": 9.953298623484049e-06, + "loss": 1.0417, + "step": 749 + }, + { + "epoch": 0.14992129132205592, + "grad_norm": 2.296875, + "learning_rate": 9.953154819975773e-06, + "loss": 1.2122, + "step": 750 + }, + { + "epoch": 0.150121186377152, + "grad_norm": 2.125, + "learning_rate": 9.95301079644889e-06, + "loss": 1.0791, + "step": 751 + }, + { + "epoch": 0.15032108143224807, + "grad_norm": 2.25, + "learning_rate": 9.952866552909797e-06, + "loss": 1.1611, + "step": 752 + }, + { + "epoch": 0.15052097648734414, + "grad_norm": 2.265625, + "learning_rate": 9.9527220893649e-06, + "loss": 1.1831, + "step": 753 + }, + { + "epoch": 0.1507208715424402, + "grad_norm": 2.21875, + "learning_rate": 9.952577405820618e-06, + "loss": 1.1212, + "step": 754 + }, + { + "epoch": 0.15092076659753628, + "grad_norm": 2.1875, + "learning_rate": 9.952432502283378e-06, + "loss": 1.0424, + "step": 755 + }, + { + "epoch": 0.15112066165263235, + "grad_norm": 2.0625, + "learning_rate": 9.952287378759613e-06, + "loss": 1.0381, + "step": 756 + }, + { + "epoch": 0.15132055670772845, + "grad_norm": 2.1875, + "learning_rate": 9.95214203525577e-06, + "loss": 1.1073, + "step": 757 + }, + { + "epoch": 0.15152045176282453, + "grad_norm": 2.203125, + "learning_rate": 9.951996471778308e-06, + "loss": 1.092, + "step": 758 + }, + { + "epoch": 0.1517203468179206, + "grad_norm": 2.390625, + "learning_rate": 9.951850688333693e-06, + "loss": 1.1301, + "step": 759 + }, + { + "epoch": 0.15192024187301667, + "grad_norm": 2.234375, + "learning_rate": 9.951704684928398e-06, + "loss": 1.123, + "step": 760 + }, + { + "epoch": 0.15212013692811274, + "grad_norm": 2.25, + "learning_rate": 9.95155846156891e-06, + "loss": 1.0566, + "step": 761 + }, + { + "epoch": 0.15232003198320881, + "grad_norm": 2.203125, + "learning_rate": 9.951412018261724e-06, + "loss": 1.0931, + "step": 762 + }, + { + "epoch": 0.1525199270383049, + "grad_norm": 2.0625, + "learning_rate": 9.951265355013345e-06, + "loss": 0.9994, + "step": 763 + }, + { + "epoch": 0.15271982209340096, + "grad_norm": 2.21875, + "learning_rate": 9.951118471830287e-06, + "loss": 1.0814, + "step": 764 + }, + { + "epoch": 0.15291971714849703, + "grad_norm": 2.3125, + "learning_rate": 9.950971368719077e-06, + "loss": 1.1147, + "step": 765 + }, + { + "epoch": 0.1531196122035931, + "grad_norm": 2.109375, + "learning_rate": 9.950824045686247e-06, + "loss": 1.0999, + "step": 766 + }, + { + "epoch": 0.15331950725868917, + "grad_norm": 2.265625, + "learning_rate": 9.95067650273834e-06, + "loss": 1.1821, + "step": 767 + }, + { + "epoch": 0.15351940231378527, + "grad_norm": 2.046875, + "learning_rate": 9.950528739881915e-06, + "loss": 1.1256, + "step": 768 + }, + { + "epoch": 0.15371929736888135, + "grad_norm": 2.03125, + "learning_rate": 9.95038075712353e-06, + "loss": 1.1463, + "step": 769 + }, + { + "epoch": 0.15391919242397742, + "grad_norm": 2.1875, + "learning_rate": 9.950232554469759e-06, + "loss": 1.0449, + "step": 770 + }, + { + "epoch": 0.1541190874790735, + "grad_norm": 2.15625, + "learning_rate": 9.950084131927188e-06, + "loss": 1.0059, + "step": 771 + }, + { + "epoch": 0.15431898253416956, + "grad_norm": 2.109375, + "learning_rate": 9.949935489502409e-06, + "loss": 1.0697, + "step": 772 + }, + { + "epoch": 0.15451887758926564, + "grad_norm": 2.28125, + "learning_rate": 9.949786627202023e-06, + "loss": 1.1426, + "step": 773 + }, + { + "epoch": 0.1547187726443617, + "grad_norm": 2.15625, + "learning_rate": 9.949637545032644e-06, + "loss": 1.2014, + "step": 774 + }, + { + "epoch": 0.15491866769945778, + "grad_norm": 2.15625, + "learning_rate": 9.949488243000895e-06, + "loss": 1.0977, + "step": 775 + }, + { + "epoch": 0.15511856275455385, + "grad_norm": 2.109375, + "learning_rate": 9.949338721113406e-06, + "loss": 1.0963, + "step": 776 + }, + { + "epoch": 0.15531845780964992, + "grad_norm": 2.1875, + "learning_rate": 9.94918897937682e-06, + "loss": 1.0902, + "step": 777 + }, + { + "epoch": 0.155518352864746, + "grad_norm": 2.0625, + "learning_rate": 9.949039017797788e-06, + "loss": 1.0116, + "step": 778 + }, + { + "epoch": 0.1557182479198421, + "grad_norm": 2.203125, + "learning_rate": 9.948888836382971e-06, + "loss": 1.1572, + "step": 779 + }, + { + "epoch": 0.15591814297493817, + "grad_norm": 2.1875, + "learning_rate": 9.948738435139042e-06, + "loss": 1.0033, + "step": 780 + }, + { + "epoch": 0.15611803803003424, + "grad_norm": 2.3125, + "learning_rate": 9.948587814072679e-06, + "loss": 0.9917, + "step": 781 + }, + { + "epoch": 0.1563179330851303, + "grad_norm": 2.125, + "learning_rate": 9.948436973190574e-06, + "loss": 1.1254, + "step": 782 + }, + { + "epoch": 0.15651782814022638, + "grad_norm": 2.1875, + "learning_rate": 9.948285912499427e-06, + "loss": 1.04, + "step": 783 + }, + { + "epoch": 0.15671772319532246, + "grad_norm": 2.171875, + "learning_rate": 9.948134632005948e-06, + "loss": 1.1461, + "step": 784 + }, + { + "epoch": 0.15691761825041853, + "grad_norm": 2.140625, + "learning_rate": 9.947983131716858e-06, + "loss": 1.1382, + "step": 785 + }, + { + "epoch": 0.1571175133055146, + "grad_norm": 2.140625, + "learning_rate": 9.947831411638884e-06, + "loss": 1.0689, + "step": 786 + }, + { + "epoch": 0.15731740836061067, + "grad_norm": 2.21875, + "learning_rate": 9.947679471778768e-06, + "loss": 1.0443, + "step": 787 + }, + { + "epoch": 0.15751730341570674, + "grad_norm": 2.078125, + "learning_rate": 9.947527312143259e-06, + "loss": 1.0321, + "step": 788 + }, + { + "epoch": 0.15771719847080282, + "grad_norm": 2.34375, + "learning_rate": 9.947374932739115e-06, + "loss": 1.233, + "step": 789 + }, + { + "epoch": 0.15791709352589892, + "grad_norm": 2.265625, + "learning_rate": 9.947222333573105e-06, + "loss": 1.1441, + "step": 790 + }, + { + "epoch": 0.158116988580995, + "grad_norm": 2.078125, + "learning_rate": 9.947069514652006e-06, + "loss": 1.055, + "step": 791 + }, + { + "epoch": 0.15831688363609106, + "grad_norm": 2.296875, + "learning_rate": 9.94691647598261e-06, + "loss": 1.1694, + "step": 792 + }, + { + "epoch": 0.15851677869118713, + "grad_norm": 2.078125, + "learning_rate": 9.946763217571712e-06, + "loss": 0.9969, + "step": 793 + }, + { + "epoch": 0.1587166737462832, + "grad_norm": 2.140625, + "learning_rate": 9.946609739426119e-06, + "loss": 1.1176, + "step": 794 + }, + { + "epoch": 0.15891656880137928, + "grad_norm": 2.296875, + "learning_rate": 9.94645604155265e-06, + "loss": 1.1539, + "step": 795 + }, + { + "epoch": 0.15911646385647535, + "grad_norm": 2.09375, + "learning_rate": 9.94630212395813e-06, + "loss": 1.1059, + "step": 796 + }, + { + "epoch": 0.15931635891157142, + "grad_norm": 2.15625, + "learning_rate": 9.9461479866494e-06, + "loss": 1.1263, + "step": 797 + }, + { + "epoch": 0.1595162539666675, + "grad_norm": 2.125, + "learning_rate": 9.945993629633305e-06, + "loss": 1.1279, + "step": 798 + }, + { + "epoch": 0.15971614902176356, + "grad_norm": 2.234375, + "learning_rate": 9.945839052916702e-06, + "loss": 1.0915, + "step": 799 + }, + { + "epoch": 0.15991604407685964, + "grad_norm": 2.359375, + "learning_rate": 9.945684256506454e-06, + "loss": 1.2039, + "step": 800 + }, + { + "epoch": 0.16011593913195574, + "grad_norm": 2.265625, + "learning_rate": 9.945529240409442e-06, + "loss": 1.1241, + "step": 801 + }, + { + "epoch": 0.1603158341870518, + "grad_norm": 2.28125, + "learning_rate": 9.945374004632547e-06, + "loss": 1.1659, + "step": 802 + }, + { + "epoch": 0.16051572924214788, + "grad_norm": 2.265625, + "learning_rate": 9.945218549182668e-06, + "loss": 1.1306, + "step": 803 + }, + { + "epoch": 0.16071562429724395, + "grad_norm": 2.3125, + "learning_rate": 9.945062874066709e-06, + "loss": 1.0898, + "step": 804 + }, + { + "epoch": 0.16091551935234003, + "grad_norm": 2.25, + "learning_rate": 9.944906979291587e-06, + "loss": 1.1502, + "step": 805 + }, + { + "epoch": 0.1611154144074361, + "grad_norm": 2.25, + "learning_rate": 9.944750864864224e-06, + "loss": 1.1441, + "step": 806 + }, + { + "epoch": 0.16131530946253217, + "grad_norm": 2.09375, + "learning_rate": 9.944594530791553e-06, + "loss": 1.0863, + "step": 807 + }, + { + "epoch": 0.16151520451762824, + "grad_norm": 2.078125, + "learning_rate": 9.944437977080525e-06, + "loss": 1.1074, + "step": 808 + }, + { + "epoch": 0.1617150995727243, + "grad_norm": 2.03125, + "learning_rate": 9.944281203738087e-06, + "loss": 1.0624, + "step": 809 + }, + { + "epoch": 0.16191499462782039, + "grad_norm": 2.203125, + "learning_rate": 9.944124210771209e-06, + "loss": 1.2287, + "step": 810 + }, + { + "epoch": 0.16211488968291646, + "grad_norm": 2.25, + "learning_rate": 9.94396699818686e-06, + "loss": 1.1621, + "step": 811 + }, + { + "epoch": 0.16231478473801253, + "grad_norm": 2.265625, + "learning_rate": 9.943809565992024e-06, + "loss": 1.1585, + "step": 812 + }, + { + "epoch": 0.16251467979310863, + "grad_norm": 2.15625, + "learning_rate": 9.943651914193697e-06, + "loss": 1.0682, + "step": 813 + }, + { + "epoch": 0.1627145748482047, + "grad_norm": 2.15625, + "learning_rate": 9.943494042798878e-06, + "loss": 1.101, + "step": 814 + }, + { + "epoch": 0.16291446990330077, + "grad_norm": 2.3125, + "learning_rate": 9.943335951814583e-06, + "loss": 1.1353, + "step": 815 + }, + { + "epoch": 0.16311436495839685, + "grad_norm": 2.140625, + "learning_rate": 9.943177641247833e-06, + "loss": 1.0897, + "step": 816 + }, + { + "epoch": 0.16331426001349292, + "grad_norm": 5.25, + "learning_rate": 9.94301911110566e-06, + "loss": 0.9894, + "step": 817 + }, + { + "epoch": 0.163514155068589, + "grad_norm": 2.234375, + "learning_rate": 9.942860361395106e-06, + "loss": 1.095, + "step": 818 + }, + { + "epoch": 0.16371405012368506, + "grad_norm": 2.328125, + "learning_rate": 9.942701392123222e-06, + "loss": 1.1018, + "step": 819 + }, + { + "epoch": 0.16391394517878113, + "grad_norm": 2.125, + "learning_rate": 9.942542203297072e-06, + "loss": 1.1963, + "step": 820 + }, + { + "epoch": 0.1641138402338772, + "grad_norm": 2.09375, + "learning_rate": 9.942382794923723e-06, + "loss": 1.0891, + "step": 821 + }, + { + "epoch": 0.16431373528897328, + "grad_norm": 2.1875, + "learning_rate": 9.94222316701026e-06, + "loss": 1.1304, + "step": 822 + }, + { + "epoch": 0.16451363034406935, + "grad_norm": 2.125, + "learning_rate": 9.942063319563769e-06, + "loss": 1.0862, + "step": 823 + }, + { + "epoch": 0.16471352539916545, + "grad_norm": 3.203125, + "learning_rate": 9.941903252591356e-06, + "loss": 1.1904, + "step": 824 + }, + { + "epoch": 0.16491342045426152, + "grad_norm": 2.09375, + "learning_rate": 9.941742966100128e-06, + "loss": 1.1039, + "step": 825 + }, + { + "epoch": 0.1651133155093576, + "grad_norm": 2.296875, + "learning_rate": 9.941582460097203e-06, + "loss": 1.0461, + "step": 826 + }, + { + "epoch": 0.16531321056445367, + "grad_norm": 2.328125, + "learning_rate": 9.941421734589715e-06, + "loss": 1.0681, + "step": 827 + }, + { + "epoch": 0.16551310561954974, + "grad_norm": 2.03125, + "learning_rate": 9.9412607895848e-06, + "loss": 1.0501, + "step": 828 + }, + { + "epoch": 0.1657130006746458, + "grad_norm": 2.3125, + "learning_rate": 9.94109962508961e-06, + "loss": 1.1685, + "step": 829 + }, + { + "epoch": 0.16591289572974188, + "grad_norm": 2.3125, + "learning_rate": 9.9409382411113e-06, + "loss": 1.0765, + "step": 830 + }, + { + "epoch": 0.16611279078483795, + "grad_norm": 2.109375, + "learning_rate": 9.940776637657044e-06, + "loss": 1.0488, + "step": 831 + }, + { + "epoch": 0.16631268583993403, + "grad_norm": 2.046875, + "learning_rate": 9.940614814734015e-06, + "loss": 1.0233, + "step": 832 + }, + { + "epoch": 0.1665125808950301, + "grad_norm": 2.125, + "learning_rate": 9.940452772349405e-06, + "loss": 1.1514, + "step": 833 + }, + { + "epoch": 0.16671247595012617, + "grad_norm": 2.0625, + "learning_rate": 9.940290510510411e-06, + "loss": 1.0156, + "step": 834 + }, + { + "epoch": 0.16691237100522227, + "grad_norm": 2.1875, + "learning_rate": 9.940128029224239e-06, + "loss": 1.0412, + "step": 835 + }, + { + "epoch": 0.16711226606031834, + "grad_norm": 2.109375, + "learning_rate": 9.939965328498107e-06, + "loss": 1.1049, + "step": 836 + }, + { + "epoch": 0.16731216111541442, + "grad_norm": 2.328125, + "learning_rate": 9.939802408339244e-06, + "loss": 1.173, + "step": 837 + }, + { + "epoch": 0.1675120561705105, + "grad_norm": 2.3125, + "learning_rate": 9.939639268754886e-06, + "loss": 1.1309, + "step": 838 + }, + { + "epoch": 0.16771195122560656, + "grad_norm": 2.15625, + "learning_rate": 9.939475909752278e-06, + "loss": 1.0505, + "step": 839 + }, + { + "epoch": 0.16791184628070263, + "grad_norm": 2.296875, + "learning_rate": 9.939312331338678e-06, + "loss": 1.0639, + "step": 840 + }, + { + "epoch": 0.1681117413357987, + "grad_norm": 2.03125, + "learning_rate": 9.939148533521353e-06, + "loss": 1.0412, + "step": 841 + }, + { + "epoch": 0.16831163639089478, + "grad_norm": 2.15625, + "learning_rate": 9.938984516307575e-06, + "loss": 1.063, + "step": 842 + }, + { + "epoch": 0.16851153144599085, + "grad_norm": 2.109375, + "learning_rate": 9.938820279704635e-06, + "loss": 1.1225, + "step": 843 + }, + { + "epoch": 0.16871142650108692, + "grad_norm": 2.0625, + "learning_rate": 9.938655823719823e-06, + "loss": 1.1079, + "step": 844 + }, + { + "epoch": 0.168911321556183, + "grad_norm": 2.1875, + "learning_rate": 9.938491148360448e-06, + "loss": 1.1533, + "step": 845 + }, + { + "epoch": 0.1691112166112791, + "grad_norm": 2.28125, + "learning_rate": 9.938326253633825e-06, + "loss": 1.1815, + "step": 846 + }, + { + "epoch": 0.16931111166637516, + "grad_norm": 2.15625, + "learning_rate": 9.938161139547276e-06, + "loss": 1.0423, + "step": 847 + }, + { + "epoch": 0.16951100672147124, + "grad_norm": 2.0625, + "learning_rate": 9.937995806108135e-06, + "loss": 1.1081, + "step": 848 + }, + { + "epoch": 0.1697109017765673, + "grad_norm": 2.1875, + "learning_rate": 9.93783025332375e-06, + "loss": 1.0819, + "step": 849 + }, + { + "epoch": 0.16991079683166338, + "grad_norm": 2.21875, + "learning_rate": 9.937664481201472e-06, + "loss": 1.0859, + "step": 850 + }, + { + "epoch": 0.17011069188675945, + "grad_norm": 2.171875, + "learning_rate": 9.937498489748665e-06, + "loss": 1.1142, + "step": 851 + }, + { + "epoch": 0.17031058694185552, + "grad_norm": 2.3125, + "learning_rate": 9.937332278972703e-06, + "loss": 1.1683, + "step": 852 + }, + { + "epoch": 0.1705104819969516, + "grad_norm": 2.109375, + "learning_rate": 9.937165848880968e-06, + "loss": 1.0265, + "step": 853 + }, + { + "epoch": 0.17071037705204767, + "grad_norm": 2.15625, + "learning_rate": 9.936999199480854e-06, + "loss": 1.0466, + "step": 854 + }, + { + "epoch": 0.17091027210714374, + "grad_norm": 2.140625, + "learning_rate": 9.936832330779761e-06, + "loss": 1.0995, + "step": 855 + }, + { + "epoch": 0.1711101671622398, + "grad_norm": 2.21875, + "learning_rate": 9.936665242785105e-06, + "loss": 1.256, + "step": 856 + }, + { + "epoch": 0.1713100622173359, + "grad_norm": 2.15625, + "learning_rate": 9.936497935504306e-06, + "loss": 1.0735, + "step": 857 + }, + { + "epoch": 0.17150995727243198, + "grad_norm": 2.21875, + "learning_rate": 9.936330408944794e-06, + "loss": 1.1574, + "step": 858 + }, + { + "epoch": 0.17170985232752806, + "grad_norm": 2.125, + "learning_rate": 9.936162663114014e-06, + "loss": 1.0946, + "step": 859 + }, + { + "epoch": 0.17190974738262413, + "grad_norm": 2.4375, + "learning_rate": 9.935994698019416e-06, + "loss": 1.1149, + "step": 860 + }, + { + "epoch": 0.1721096424377202, + "grad_norm": 2.09375, + "learning_rate": 9.93582651366846e-06, + "loss": 1.0987, + "step": 861 + }, + { + "epoch": 0.17230953749281627, + "grad_norm": 2.125, + "learning_rate": 9.935658110068618e-06, + "loss": 1.0275, + "step": 862 + }, + { + "epoch": 0.17250943254791234, + "grad_norm": 2.203125, + "learning_rate": 9.93548948722737e-06, + "loss": 1.0396, + "step": 863 + }, + { + "epoch": 0.17270932760300842, + "grad_norm": 2.15625, + "learning_rate": 9.935320645152205e-06, + "loss": 1.0945, + "step": 864 + }, + { + "epoch": 0.1729092226581045, + "grad_norm": 2.171875, + "learning_rate": 9.935151583850624e-06, + "loss": 1.1701, + "step": 865 + }, + { + "epoch": 0.17310911771320056, + "grad_norm": 2.171875, + "learning_rate": 9.934982303330138e-06, + "loss": 1.0033, + "step": 866 + }, + { + "epoch": 0.17330901276829663, + "grad_norm": 2.21875, + "learning_rate": 9.934812803598265e-06, + "loss": 1.027, + "step": 867 + }, + { + "epoch": 0.1735089078233927, + "grad_norm": 2.140625, + "learning_rate": 9.934643084662533e-06, + "loss": 1.1306, + "step": 868 + }, + { + "epoch": 0.1737088028784888, + "grad_norm": 2.28125, + "learning_rate": 9.934473146530483e-06, + "loss": 1.1985, + "step": 869 + }, + { + "epoch": 0.17390869793358488, + "grad_norm": 2.234375, + "learning_rate": 9.934302989209663e-06, + "loss": 1.2504, + "step": 870 + }, + { + "epoch": 0.17410859298868095, + "grad_norm": 2.109375, + "learning_rate": 9.934132612707631e-06, + "loss": 1.0985, + "step": 871 + }, + { + "epoch": 0.17430848804377702, + "grad_norm": 2.1875, + "learning_rate": 9.933962017031957e-06, + "loss": 1.1471, + "step": 872 + }, + { + "epoch": 0.1745083830988731, + "grad_norm": 2.296875, + "learning_rate": 9.933791202190215e-06, + "loss": 1.0599, + "step": 873 + }, + { + "epoch": 0.17470827815396917, + "grad_norm": 2.203125, + "learning_rate": 9.933620168189995e-06, + "loss": 1.162, + "step": 874 + }, + { + "epoch": 0.17490817320906524, + "grad_norm": 2.125, + "learning_rate": 9.933448915038895e-06, + "loss": 1.083, + "step": 875 + }, + { + "epoch": 0.1751080682641613, + "grad_norm": 2.15625, + "learning_rate": 9.93327744274452e-06, + "loss": 1.116, + "step": 876 + }, + { + "epoch": 0.17530796331925738, + "grad_norm": 2.0625, + "learning_rate": 9.933105751314489e-06, + "loss": 0.98, + "step": 877 + }, + { + "epoch": 0.17550785837435345, + "grad_norm": 2.109375, + "learning_rate": 9.932933840756428e-06, + "loss": 1.0147, + "step": 878 + }, + { + "epoch": 0.17570775342944953, + "grad_norm": 2.140625, + "learning_rate": 9.93276171107797e-06, + "loss": 1.0976, + "step": 879 + }, + { + "epoch": 0.17590764848454563, + "grad_norm": 2.1875, + "learning_rate": 9.932589362286766e-06, + "loss": 1.0766, + "step": 880 + }, + { + "epoch": 0.1761075435396417, + "grad_norm": 2.0625, + "learning_rate": 9.932416794390467e-06, + "loss": 1.0134, + "step": 881 + }, + { + "epoch": 0.17630743859473777, + "grad_norm": 2.015625, + "learning_rate": 9.932244007396742e-06, + "loss": 1.0746, + "step": 882 + }, + { + "epoch": 0.17650733364983384, + "grad_norm": 2.15625, + "learning_rate": 9.932071001313265e-06, + "loss": 1.066, + "step": 883 + }, + { + "epoch": 0.17670722870492991, + "grad_norm": 2.203125, + "learning_rate": 9.931897776147724e-06, + "loss": 1.1709, + "step": 884 + }, + { + "epoch": 0.17690712376002599, + "grad_norm": 2.3125, + "learning_rate": 9.931724331907806e-06, + "loss": 1.15, + "step": 885 + }, + { + "epoch": 0.17710701881512206, + "grad_norm": 2.34375, + "learning_rate": 9.931550668601222e-06, + "loss": 1.1492, + "step": 886 + }, + { + "epoch": 0.17730691387021813, + "grad_norm": 2.265625, + "learning_rate": 9.931376786235684e-06, + "loss": 1.1116, + "step": 887 + }, + { + "epoch": 0.1775068089253142, + "grad_norm": 2.4375, + "learning_rate": 9.931202684818914e-06, + "loss": 1.1235, + "step": 888 + }, + { + "epoch": 0.17770670398041027, + "grad_norm": 2.125, + "learning_rate": 9.931028364358651e-06, + "loss": 1.0914, + "step": 889 + }, + { + "epoch": 0.17790659903550635, + "grad_norm": 2.21875, + "learning_rate": 9.930853824862632e-06, + "loss": 1.1155, + "step": 890 + }, + { + "epoch": 0.17810649409060245, + "grad_norm": 2.046875, + "learning_rate": 9.930679066338613e-06, + "loss": 1.0528, + "step": 891 + }, + { + "epoch": 0.17830638914569852, + "grad_norm": 2.109375, + "learning_rate": 9.930504088794356e-06, + "loss": 1.1009, + "step": 892 + }, + { + "epoch": 0.1785062842007946, + "grad_norm": 2.15625, + "learning_rate": 9.930328892237636e-06, + "loss": 1.1056, + "step": 893 + }, + { + "epoch": 0.17870617925589066, + "grad_norm": 2.09375, + "learning_rate": 9.930153476676231e-06, + "loss": 1.0138, + "step": 894 + }, + { + "epoch": 0.17890607431098673, + "grad_norm": 2.28125, + "learning_rate": 9.929977842117935e-06, + "loss": 1.0252, + "step": 895 + }, + { + "epoch": 0.1791059693660828, + "grad_norm": 2.140625, + "learning_rate": 9.92980198857055e-06, + "loss": 1.0429, + "step": 896 + }, + { + "epoch": 0.17930586442117888, + "grad_norm": 2.109375, + "learning_rate": 9.92962591604189e-06, + "loss": 1.0689, + "step": 897 + }, + { + "epoch": 0.17950575947627495, + "grad_norm": 2.125, + "learning_rate": 9.929449624539772e-06, + "loss": 1.0321, + "step": 898 + }, + { + "epoch": 0.17970565453137102, + "grad_norm": 2.125, + "learning_rate": 9.929273114072027e-06, + "loss": 1.0742, + "step": 899 + }, + { + "epoch": 0.1799055495864671, + "grad_norm": 2.078125, + "learning_rate": 9.929096384646498e-06, + "loss": 1.1502, + "step": 900 + }, + { + "epoch": 0.18010544464156317, + "grad_norm": 2.125, + "learning_rate": 9.928919436271032e-06, + "loss": 1.1121, + "step": 901 + }, + { + "epoch": 0.18030533969665927, + "grad_norm": 2.140625, + "learning_rate": 9.928742268953493e-06, + "loss": 1.1333, + "step": 902 + }, + { + "epoch": 0.18050523475175534, + "grad_norm": 2.125, + "learning_rate": 9.928564882701749e-06, + "loss": 1.1531, + "step": 903 + }, + { + "epoch": 0.1807051298068514, + "grad_norm": 2.28125, + "learning_rate": 9.928387277523676e-06, + "loss": 1.1334, + "step": 904 + }, + { + "epoch": 0.18090502486194748, + "grad_norm": 2.140625, + "learning_rate": 9.92820945342717e-06, + "loss": 1.2134, + "step": 905 + }, + { + "epoch": 0.18110491991704356, + "grad_norm": 2.25, + "learning_rate": 9.928031410420125e-06, + "loss": 1.0337, + "step": 906 + }, + { + "epoch": 0.18130481497213963, + "grad_norm": 2.1875, + "learning_rate": 9.927853148510451e-06, + "loss": 1.0733, + "step": 907 + }, + { + "epoch": 0.1815047100272357, + "grad_norm": 2.40625, + "learning_rate": 9.92767466770607e-06, + "loss": 1.065, + "step": 908 + }, + { + "epoch": 0.18170460508233177, + "grad_norm": 2.109375, + "learning_rate": 9.927495968014903e-06, + "loss": 1.1061, + "step": 909 + }, + { + "epoch": 0.18190450013742784, + "grad_norm": 2.140625, + "learning_rate": 9.92731704944489e-06, + "loss": 1.1142, + "step": 910 + }, + { + "epoch": 0.18210439519252392, + "grad_norm": 2.109375, + "learning_rate": 9.92713791200398e-06, + "loss": 1.0194, + "step": 911 + }, + { + "epoch": 0.18230429024762, + "grad_norm": 2.1875, + "learning_rate": 9.926958555700134e-06, + "loss": 1.1316, + "step": 912 + }, + { + "epoch": 0.1825041853027161, + "grad_norm": 2.078125, + "learning_rate": 9.926778980541314e-06, + "loss": 1.0904, + "step": 913 + }, + { + "epoch": 0.18270408035781216, + "grad_norm": 2.109375, + "learning_rate": 9.926599186535496e-06, + "loss": 1.1301, + "step": 914 + }, + { + "epoch": 0.18290397541290823, + "grad_norm": 2.015625, + "learning_rate": 9.92641917369067e-06, + "loss": 1.0447, + "step": 915 + }, + { + "epoch": 0.1831038704680043, + "grad_norm": 2.140625, + "learning_rate": 9.92623894201483e-06, + "loss": 1.0149, + "step": 916 + }, + { + "epoch": 0.18330376552310038, + "grad_norm": 2.09375, + "learning_rate": 9.926058491515982e-06, + "loss": 1.1744, + "step": 917 + }, + { + "epoch": 0.18350366057819645, + "grad_norm": 2.125, + "learning_rate": 9.92587782220214e-06, + "loss": 1.0773, + "step": 918 + }, + { + "epoch": 0.18370355563329252, + "grad_norm": 2.15625, + "learning_rate": 9.925696934081335e-06, + "loss": 1.0894, + "step": 919 + }, + { + "epoch": 0.1839034506883886, + "grad_norm": 2.078125, + "learning_rate": 9.925515827161596e-06, + "loss": 1.1613, + "step": 920 + }, + { + "epoch": 0.18410334574348466, + "grad_norm": 2.171875, + "learning_rate": 9.925334501450972e-06, + "loss": 1.1258, + "step": 921 + }, + { + "epoch": 0.18430324079858074, + "grad_norm": 2.078125, + "learning_rate": 9.925152956957513e-06, + "loss": 1.0427, + "step": 922 + }, + { + "epoch": 0.1845031358536768, + "grad_norm": 2.21875, + "learning_rate": 9.924971193689287e-06, + "loss": 1.0592, + "step": 923 + }, + { + "epoch": 0.18470303090877288, + "grad_norm": 2.125, + "learning_rate": 9.924789211654367e-06, + "loss": 1.1031, + "step": 924 + }, + { + "epoch": 0.18490292596386898, + "grad_norm": 2.0625, + "learning_rate": 9.924607010860833e-06, + "loss": 1.077, + "step": 925 + }, + { + "epoch": 0.18510282101896505, + "grad_norm": 2.25, + "learning_rate": 9.924424591316785e-06, + "loss": 1.1194, + "step": 926 + }, + { + "epoch": 0.18530271607406112, + "grad_norm": 2.125, + "learning_rate": 9.924241953030323e-06, + "loss": 1.0626, + "step": 927 + }, + { + "epoch": 0.1855026111291572, + "grad_norm": 2.171875, + "learning_rate": 9.924059096009556e-06, + "loss": 1.1046, + "step": 928 + }, + { + "epoch": 0.18570250618425327, + "grad_norm": 2.265625, + "learning_rate": 9.923876020262613e-06, + "loss": 1.1724, + "step": 929 + }, + { + "epoch": 0.18590240123934934, + "grad_norm": 2.28125, + "learning_rate": 9.923692725797622e-06, + "loss": 1.1293, + "step": 930 + }, + { + "epoch": 0.1861022962944454, + "grad_norm": 2.171875, + "learning_rate": 9.923509212622726e-06, + "loss": 1.1245, + "step": 931 + }, + { + "epoch": 0.18630219134954148, + "grad_norm": 2.171875, + "learning_rate": 9.923325480746077e-06, + "loss": 1.0509, + "step": 932 + }, + { + "epoch": 0.18650208640463756, + "grad_norm": 2.234375, + "learning_rate": 9.923141530175835e-06, + "loss": 1.0779, + "step": 933 + }, + { + "epoch": 0.18670198145973363, + "grad_norm": 2.125, + "learning_rate": 9.922957360920173e-06, + "loss": 1.1274, + "step": 934 + }, + { + "epoch": 0.1869018765148297, + "grad_norm": 2.140625, + "learning_rate": 9.922772972987271e-06, + "loss": 1.1678, + "step": 935 + }, + { + "epoch": 0.1871017715699258, + "grad_norm": 2.140625, + "learning_rate": 9.922588366385319e-06, + "loss": 1.1291, + "step": 936 + }, + { + "epoch": 0.18730166662502187, + "grad_norm": 2.234375, + "learning_rate": 9.922403541122516e-06, + "loss": 0.9955, + "step": 937 + }, + { + "epoch": 0.18750156168011795, + "grad_norm": 2.234375, + "learning_rate": 9.922218497207075e-06, + "loss": 1.0822, + "step": 938 + }, + { + "epoch": 0.18770145673521402, + "grad_norm": 2.140625, + "learning_rate": 9.922033234647213e-06, + "loss": 1.0365, + "step": 939 + }, + { + "epoch": 0.1879013517903101, + "grad_norm": 2.03125, + "learning_rate": 9.921847753451162e-06, + "loss": 1.0929, + "step": 940 + }, + { + "epoch": 0.18810124684540616, + "grad_norm": 2.34375, + "learning_rate": 9.92166205362716e-06, + "loss": 1.1478, + "step": 941 + }, + { + "epoch": 0.18830114190050223, + "grad_norm": 2.0625, + "learning_rate": 9.921476135183452e-06, + "loss": 1.037, + "step": 942 + }, + { + "epoch": 0.1885010369555983, + "grad_norm": 2.09375, + "learning_rate": 9.921289998128303e-06, + "loss": 1.0561, + "step": 943 + }, + { + "epoch": 0.18870093201069438, + "grad_norm": 2.3125, + "learning_rate": 9.921103642469976e-06, + "loss": 1.0575, + "step": 944 + }, + { + "epoch": 0.18890082706579045, + "grad_norm": 2.109375, + "learning_rate": 9.92091706821675e-06, + "loss": 1.1551, + "step": 945 + }, + { + "epoch": 0.18910072212088652, + "grad_norm": 2.15625, + "learning_rate": 9.920730275376915e-06, + "loss": 1.09, + "step": 946 + }, + { + "epoch": 0.18930061717598262, + "grad_norm": 2.15625, + "learning_rate": 9.920543263958767e-06, + "loss": 1.0898, + "step": 947 + }, + { + "epoch": 0.1895005122310787, + "grad_norm": 2.484375, + "learning_rate": 9.920356033970613e-06, + "loss": 1.1994, + "step": 948 + }, + { + "epoch": 0.18970040728617477, + "grad_norm": 2.296875, + "learning_rate": 9.920168585420768e-06, + "loss": 1.2403, + "step": 949 + }, + { + "epoch": 0.18990030234127084, + "grad_norm": 2.125, + "learning_rate": 9.91998091831756e-06, + "loss": 1.2184, + "step": 950 + }, + { + "epoch": 0.1901001973963669, + "grad_norm": 2.28125, + "learning_rate": 9.919793032669324e-06, + "loss": 1.2009, + "step": 951 + }, + { + "epoch": 0.19030009245146298, + "grad_norm": 2.078125, + "learning_rate": 9.91960492848441e-06, + "loss": 1.0286, + "step": 952 + }, + { + "epoch": 0.19049998750655905, + "grad_norm": 2.21875, + "learning_rate": 9.91941660577117e-06, + "loss": 1.0413, + "step": 953 + }, + { + "epoch": 0.19069988256165513, + "grad_norm": 2.15625, + "learning_rate": 9.919228064537968e-06, + "loss": 1.0637, + "step": 954 + }, + { + "epoch": 0.1908997776167512, + "grad_norm": 2.3125, + "learning_rate": 9.91903930479318e-06, + "loss": 1.206, + "step": 955 + }, + { + "epoch": 0.19109967267184727, + "grad_norm": 2.046875, + "learning_rate": 9.918850326545195e-06, + "loss": 1.0584, + "step": 956 + }, + { + "epoch": 0.19129956772694334, + "grad_norm": 2.15625, + "learning_rate": 9.918661129802402e-06, + "loss": 1.1496, + "step": 957 + }, + { + "epoch": 0.19149946278203944, + "grad_norm": 2.046875, + "learning_rate": 9.918471714573205e-06, + "loss": 1.0116, + "step": 958 + }, + { + "epoch": 0.19169935783713551, + "grad_norm": 2.21875, + "learning_rate": 9.918282080866022e-06, + "loss": 1.1686, + "step": 959 + }, + { + "epoch": 0.1918992528922316, + "grad_norm": 2.265625, + "learning_rate": 9.918092228689276e-06, + "loss": 1.0539, + "step": 960 + }, + { + "epoch": 0.19209914794732766, + "grad_norm": 2.125, + "learning_rate": 9.917902158051395e-06, + "loss": 1.1964, + "step": 961 + }, + { + "epoch": 0.19229904300242373, + "grad_norm": 2.203125, + "learning_rate": 9.917711868960826e-06, + "loss": 1.1666, + "step": 962 + }, + { + "epoch": 0.1924989380575198, + "grad_norm": 2.109375, + "learning_rate": 9.917521361426023e-06, + "loss": 1.0984, + "step": 963 + }, + { + "epoch": 0.19269883311261587, + "grad_norm": 2.140625, + "learning_rate": 9.917330635455445e-06, + "loss": 1.137, + "step": 964 + }, + { + "epoch": 0.19289872816771195, + "grad_norm": 2.0625, + "learning_rate": 9.917139691057565e-06, + "loss": 1.084, + "step": 965 + }, + { + "epoch": 0.19309862322280802, + "grad_norm": 1.9921875, + "learning_rate": 9.916948528240865e-06, + "loss": 1.056, + "step": 966 + }, + { + "epoch": 0.1932985182779041, + "grad_norm": 2.09375, + "learning_rate": 9.916757147013837e-06, + "loss": 1.0956, + "step": 967 + }, + { + "epoch": 0.19349841333300016, + "grad_norm": 2.25, + "learning_rate": 9.916565547384981e-06, + "loss": 1.1427, + "step": 968 + }, + { + "epoch": 0.19369830838809626, + "grad_norm": 2.046875, + "learning_rate": 9.91637372936281e-06, + "loss": 1.0469, + "step": 969 + }, + { + "epoch": 0.19389820344319234, + "grad_norm": 1.9765625, + "learning_rate": 9.916181692955841e-06, + "loss": 1.0015, + "step": 970 + }, + { + "epoch": 0.1940980984982884, + "grad_norm": 2.375, + "learning_rate": 9.915989438172608e-06, + "loss": 1.1941, + "step": 971 + }, + { + "epoch": 0.19429799355338448, + "grad_norm": 2.328125, + "learning_rate": 9.915796965021648e-06, + "loss": 1.109, + "step": 972 + }, + { + "epoch": 0.19449788860848055, + "grad_norm": 2.140625, + "learning_rate": 9.915604273511514e-06, + "loss": 1.0788, + "step": 973 + }, + { + "epoch": 0.19469778366357662, + "grad_norm": 2.015625, + "learning_rate": 9.915411363650762e-06, + "loss": 1.0593, + "step": 974 + }, + { + "epoch": 0.1948976787186727, + "grad_norm": 2.3125, + "learning_rate": 9.915218235447962e-06, + "loss": 1.1721, + "step": 975 + }, + { + "epoch": 0.19509757377376877, + "grad_norm": 2.328125, + "learning_rate": 9.915024888911692e-06, + "loss": 1.1182, + "step": 976 + }, + { + "epoch": 0.19529746882886484, + "grad_norm": 2.171875, + "learning_rate": 9.914831324050542e-06, + "loss": 1.1052, + "step": 977 + }, + { + "epoch": 0.1954973638839609, + "grad_norm": 2.140625, + "learning_rate": 9.914637540873112e-06, + "loss": 1.1633, + "step": 978 + }, + { + "epoch": 0.19569725893905698, + "grad_norm": 2.046875, + "learning_rate": 9.914443539388003e-06, + "loss": 1.1462, + "step": 979 + }, + { + "epoch": 0.19589715399415306, + "grad_norm": 2.21875, + "learning_rate": 9.914249319603839e-06, + "loss": 1.0912, + "step": 980 + }, + { + "epoch": 0.19609704904924916, + "grad_norm": 2.078125, + "learning_rate": 9.914054881529245e-06, + "loss": 1.0723, + "step": 981 + }, + { + "epoch": 0.19629694410434523, + "grad_norm": 2.125, + "learning_rate": 9.91386022517286e-06, + "loss": 1.1248, + "step": 982 + }, + { + "epoch": 0.1964968391594413, + "grad_norm": 2.109375, + "learning_rate": 9.913665350543324e-06, + "loss": 1.0391, + "step": 983 + }, + { + "epoch": 0.19669673421453737, + "grad_norm": 2.234375, + "learning_rate": 9.913470257649303e-06, + "loss": 1.1673, + "step": 984 + }, + { + "epoch": 0.19689662926963344, + "grad_norm": 2.078125, + "learning_rate": 9.913274946499453e-06, + "loss": 1.1034, + "step": 985 + }, + { + "epoch": 0.19709652432472952, + "grad_norm": 2.140625, + "learning_rate": 9.913079417102458e-06, + "loss": 1.1082, + "step": 986 + }, + { + "epoch": 0.1972964193798256, + "grad_norm": 2.265625, + "learning_rate": 9.912883669467e-06, + "loss": 1.1261, + "step": 987 + }, + { + "epoch": 0.19749631443492166, + "grad_norm": 2.21875, + "learning_rate": 9.912687703601774e-06, + "loss": 1.1113, + "step": 988 + }, + { + "epoch": 0.19769620949001773, + "grad_norm": 2.078125, + "learning_rate": 9.912491519515484e-06, + "loss": 1.1534, + "step": 989 + }, + { + "epoch": 0.1978961045451138, + "grad_norm": 2.234375, + "learning_rate": 9.912295117216844e-06, + "loss": 1.0584, + "step": 990 + }, + { + "epoch": 0.19809599960020988, + "grad_norm": 2.109375, + "learning_rate": 9.912098496714582e-06, + "loss": 1.0466, + "step": 991 + }, + { + "epoch": 0.19829589465530598, + "grad_norm": 2.109375, + "learning_rate": 9.911901658017428e-06, + "loss": 1.0825, + "step": 992 + }, + { + "epoch": 0.19849578971040205, + "grad_norm": 2.109375, + "learning_rate": 9.911704601134127e-06, + "loss": 1.1456, + "step": 993 + }, + { + "epoch": 0.19869568476549812, + "grad_norm": 2.296875, + "learning_rate": 9.911507326073433e-06, + "loss": 1.1568, + "step": 994 + }, + { + "epoch": 0.1988955798205942, + "grad_norm": 2.15625, + "learning_rate": 9.911309832844108e-06, + "loss": 1.1335, + "step": 995 + }, + { + "epoch": 0.19909547487569026, + "grad_norm": 2.078125, + "learning_rate": 9.911112121454925e-06, + "loss": 1.0891, + "step": 996 + }, + { + "epoch": 0.19929536993078634, + "grad_norm": 2.21875, + "learning_rate": 9.910914191914664e-06, + "loss": 1.1071, + "step": 997 + }, + { + "epoch": 0.1994952649858824, + "grad_norm": 2.171875, + "learning_rate": 9.910716044232122e-06, + "loss": 1.0985, + "step": 998 + }, + { + "epoch": 0.19969516004097848, + "grad_norm": 2.015625, + "learning_rate": 9.910517678416097e-06, + "loss": 1.055, + "step": 999 + }, + { + "epoch": 0.19989505509607455, + "grad_norm": 2.03125, + "learning_rate": 9.9103190944754e-06, + "loss": 1.092, + "step": 1000 + }, + { + "epoch": 0.20009495015117063, + "grad_norm": 2.125, + "learning_rate": 9.910120292418855e-06, + "loss": 1.1513, + "step": 1001 + }, + { + "epoch": 0.2002948452062667, + "grad_norm": 2.3125, + "learning_rate": 9.909921272255289e-06, + "loss": 1.0961, + "step": 1002 + }, + { + "epoch": 0.2004947402613628, + "grad_norm": 2.171875, + "learning_rate": 9.909722033993546e-06, + "loss": 1.1312, + "step": 1003 + }, + { + "epoch": 0.20069463531645887, + "grad_norm": 2.0625, + "learning_rate": 9.909522577642474e-06, + "loss": 1.088, + "step": 1004 + }, + { + "epoch": 0.20089453037155494, + "grad_norm": 2.140625, + "learning_rate": 9.909322903210934e-06, + "loss": 1.0347, + "step": 1005 + }, + { + "epoch": 0.201094425426651, + "grad_norm": 2.234375, + "learning_rate": 9.909123010707793e-06, + "loss": 1.097, + "step": 1006 + }, + { + "epoch": 0.20129432048174709, + "grad_norm": 2.15625, + "learning_rate": 9.908922900141935e-06, + "loss": 1.158, + "step": 1007 + }, + { + "epoch": 0.20149421553684316, + "grad_norm": 2.09375, + "learning_rate": 9.908722571522244e-06, + "loss": 1.1167, + "step": 1008 + }, + { + "epoch": 0.20169411059193923, + "grad_norm": 2.015625, + "learning_rate": 9.90852202485762e-06, + "loss": 1.0482, + "step": 1009 + }, + { + "epoch": 0.2018940056470353, + "grad_norm": 2.03125, + "learning_rate": 9.908321260156975e-06, + "loss": 1.0526, + "step": 1010 + }, + { + "epoch": 0.20209390070213137, + "grad_norm": 2.21875, + "learning_rate": 9.908120277429224e-06, + "loss": 1.1366, + "step": 1011 + }, + { + "epoch": 0.20229379575722745, + "grad_norm": 2.09375, + "learning_rate": 9.90791907668329e-06, + "loss": 1.0403, + "step": 1012 + }, + { + "epoch": 0.20249369081232352, + "grad_norm": 2.140625, + "learning_rate": 9.907717657928117e-06, + "loss": 1.0606, + "step": 1013 + }, + { + "epoch": 0.20269358586741962, + "grad_norm": 2.390625, + "learning_rate": 9.907516021172652e-06, + "loss": 1.0486, + "step": 1014 + }, + { + "epoch": 0.2028934809225157, + "grad_norm": 2.1875, + "learning_rate": 9.907314166425847e-06, + "loss": 1.1866, + "step": 1015 + }, + { + "epoch": 0.20309337597761176, + "grad_norm": 2.078125, + "learning_rate": 9.907112093696672e-06, + "loss": 1.0322, + "step": 1016 + }, + { + "epoch": 0.20329327103270783, + "grad_norm": 2.109375, + "learning_rate": 9.906909802994101e-06, + "loss": 1.1012, + "step": 1017 + }, + { + "epoch": 0.2034931660878039, + "grad_norm": 2.015625, + "learning_rate": 9.90670729432712e-06, + "loss": 1.0504, + "step": 1018 + }, + { + "epoch": 0.20369306114289998, + "grad_norm": 2.078125, + "learning_rate": 9.906504567704727e-06, + "loss": 1.0983, + "step": 1019 + }, + { + "epoch": 0.20389295619799605, + "grad_norm": 2.125, + "learning_rate": 9.906301623135925e-06, + "loss": 1.1327, + "step": 1020 + }, + { + "epoch": 0.20409285125309212, + "grad_norm": 2.109375, + "learning_rate": 9.906098460629728e-06, + "loss": 1.0782, + "step": 1021 + }, + { + "epoch": 0.2042927463081882, + "grad_norm": 2.078125, + "learning_rate": 9.905895080195161e-06, + "loss": 1.0306, + "step": 1022 + }, + { + "epoch": 0.20449264136328427, + "grad_norm": 2.15625, + "learning_rate": 9.90569148184126e-06, + "loss": 1.0627, + "step": 1023 + }, + { + "epoch": 0.20469253641838034, + "grad_norm": 1.9921875, + "learning_rate": 9.905487665577067e-06, + "loss": 1.0469, + "step": 1024 + }, + { + "epoch": 0.20489243147347644, + "grad_norm": 2.109375, + "learning_rate": 9.905283631411635e-06, + "loss": 0.9674, + "step": 1025 + }, + { + "epoch": 0.2050923265285725, + "grad_norm": 2.1875, + "learning_rate": 9.905079379354028e-06, + "loss": 1.1038, + "step": 1026 + }, + { + "epoch": 0.20529222158366858, + "grad_norm": 2.171875, + "learning_rate": 9.904874909413318e-06, + "loss": 1.0563, + "step": 1027 + }, + { + "epoch": 0.20549211663876465, + "grad_norm": 2.09375, + "learning_rate": 9.90467022159859e-06, + "loss": 1.079, + "step": 1028 + }, + { + "epoch": 0.20569201169386073, + "grad_norm": 2.125, + "learning_rate": 9.904465315918934e-06, + "loss": 1.0519, + "step": 1029 + }, + { + "epoch": 0.2058919067489568, + "grad_norm": 2.328125, + "learning_rate": 9.904260192383452e-06, + "loss": 1.1527, + "step": 1030 + }, + { + "epoch": 0.20609180180405287, + "grad_norm": 2.140625, + "learning_rate": 9.904054851001257e-06, + "loss": 1.0425, + "step": 1031 + }, + { + "epoch": 0.20629169685914894, + "grad_norm": 2.21875, + "learning_rate": 9.903849291781468e-06, + "loss": 1.0826, + "step": 1032 + }, + { + "epoch": 0.20649159191424502, + "grad_norm": 2.28125, + "learning_rate": 9.903643514733218e-06, + "loss": 1.0684, + "step": 1033 + }, + { + "epoch": 0.2066914869693411, + "grad_norm": 2.1875, + "learning_rate": 9.903437519865648e-06, + "loss": 1.0938, + "step": 1034 + }, + { + "epoch": 0.20689138202443716, + "grad_norm": 2.15625, + "learning_rate": 9.903231307187906e-06, + "loss": 1.1274, + "step": 1035 + }, + { + "epoch": 0.20709127707953323, + "grad_norm": 2.171875, + "learning_rate": 9.903024876709154e-06, + "loss": 1.1545, + "step": 1036 + }, + { + "epoch": 0.20729117213462933, + "grad_norm": 2.015625, + "learning_rate": 9.902818228438557e-06, + "loss": 1.0743, + "step": 1037 + }, + { + "epoch": 0.2074910671897254, + "grad_norm": 2.171875, + "learning_rate": 9.902611362385302e-06, + "loss": 1.0972, + "step": 1038 + }, + { + "epoch": 0.20769096224482148, + "grad_norm": 4.625, + "learning_rate": 9.902404278558573e-06, + "loss": 1.0203, + "step": 1039 + }, + { + "epoch": 0.20789085729991755, + "grad_norm": 2.09375, + "learning_rate": 9.902196976967568e-06, + "loss": 1.0272, + "step": 1040 + }, + { + "epoch": 0.20809075235501362, + "grad_norm": 2.328125, + "learning_rate": 9.901989457621497e-06, + "loss": 1.2083, + "step": 1041 + }, + { + "epoch": 0.2082906474101097, + "grad_norm": 2.03125, + "learning_rate": 9.90178172052958e-06, + "loss": 1.0203, + "step": 1042 + }, + { + "epoch": 0.20849054246520576, + "grad_norm": 2.140625, + "learning_rate": 9.90157376570104e-06, + "loss": 1.1863, + "step": 1043 + }, + { + "epoch": 0.20869043752030184, + "grad_norm": 2.1875, + "learning_rate": 9.901365593145119e-06, + "loss": 1.2206, + "step": 1044 + }, + { + "epoch": 0.2088903325753979, + "grad_norm": 2.28125, + "learning_rate": 9.90115720287106e-06, + "loss": 1.1041, + "step": 1045 + }, + { + "epoch": 0.20909022763049398, + "grad_norm": 2.1875, + "learning_rate": 9.900948594888122e-06, + "loss": 1.1631, + "step": 1046 + }, + { + "epoch": 0.20929012268559005, + "grad_norm": 2.15625, + "learning_rate": 9.900739769205571e-06, + "loss": 1.0053, + "step": 1047 + }, + { + "epoch": 0.20949001774068615, + "grad_norm": 2.28125, + "learning_rate": 9.900530725832685e-06, + "loss": 1.0736, + "step": 1048 + }, + { + "epoch": 0.20968991279578222, + "grad_norm": 2.171875, + "learning_rate": 9.900321464778745e-06, + "loss": 1.1255, + "step": 1049 + }, + { + "epoch": 0.2098898078508783, + "grad_norm": 2.234375, + "learning_rate": 9.90011198605305e-06, + "loss": 1.0491, + "step": 1050 + }, + { + "epoch": 0.21008970290597437, + "grad_norm": 2.15625, + "learning_rate": 9.899902289664902e-06, + "loss": 1.0678, + "step": 1051 + }, + { + "epoch": 0.21028959796107044, + "grad_norm": 2.078125, + "learning_rate": 9.899692375623619e-06, + "loss": 1.0399, + "step": 1052 + }, + { + "epoch": 0.2104894930161665, + "grad_norm": 2.203125, + "learning_rate": 9.899482243938526e-06, + "loss": 1.0988, + "step": 1053 + }, + { + "epoch": 0.21068938807126258, + "grad_norm": 2.0625, + "learning_rate": 9.89927189461895e-06, + "loss": 1.0065, + "step": 1054 + }, + { + "epoch": 0.21088928312635866, + "grad_norm": 2.171875, + "learning_rate": 9.899061327674245e-06, + "loss": 1.0968, + "step": 1055 + }, + { + "epoch": 0.21108917818145473, + "grad_norm": 2.1875, + "learning_rate": 9.898850543113756e-06, + "loss": 1.1262, + "step": 1056 + }, + { + "epoch": 0.2112890732365508, + "grad_norm": 2.09375, + "learning_rate": 9.89863954094685e-06, + "loss": 0.9881, + "step": 1057 + }, + { + "epoch": 0.21148896829164687, + "grad_norm": 2.140625, + "learning_rate": 9.8984283211829e-06, + "loss": 1.1341, + "step": 1058 + }, + { + "epoch": 0.21168886334674297, + "grad_norm": 2.109375, + "learning_rate": 9.898216883831284e-06, + "loss": 1.0557, + "step": 1059 + }, + { + "epoch": 0.21188875840183904, + "grad_norm": 2.125, + "learning_rate": 9.898005228901399e-06, + "loss": 1.1488, + "step": 1060 + }, + { + "epoch": 0.21208865345693512, + "grad_norm": 2.21875, + "learning_rate": 9.897793356402646e-06, + "loss": 1.1246, + "step": 1061 + }, + { + "epoch": 0.2122885485120312, + "grad_norm": 2.171875, + "learning_rate": 9.897581266344434e-06, + "loss": 1.1322, + "step": 1062 + }, + { + "epoch": 0.21248844356712726, + "grad_norm": 2.21875, + "learning_rate": 9.897368958736185e-06, + "loss": 1.0576, + "step": 1063 + }, + { + "epoch": 0.21268833862222333, + "grad_norm": 2.0625, + "learning_rate": 9.89715643358733e-06, + "loss": 1.0027, + "step": 1064 + }, + { + "epoch": 0.2128882336773194, + "grad_norm": 1.953125, + "learning_rate": 9.89694369090731e-06, + "loss": 1.091, + "step": 1065 + }, + { + "epoch": 0.21308812873241548, + "grad_norm": 2.125, + "learning_rate": 9.896730730705574e-06, + "loss": 1.0713, + "step": 1066 + }, + { + "epoch": 0.21328802378751155, + "grad_norm": 2.25, + "learning_rate": 9.896517552991581e-06, + "loss": 1.1078, + "step": 1067 + }, + { + "epoch": 0.21348791884260762, + "grad_norm": 2.25, + "learning_rate": 9.896304157774802e-06, + "loss": 1.1173, + "step": 1068 + }, + { + "epoch": 0.2136878138977037, + "grad_norm": 2.125, + "learning_rate": 9.896090545064717e-06, + "loss": 1.0837, + "step": 1069 + }, + { + "epoch": 0.2138877089527998, + "grad_norm": 2.28125, + "learning_rate": 9.895876714870809e-06, + "loss": 1.1325, + "step": 1070 + }, + { + "epoch": 0.21408760400789587, + "grad_norm": 2.25, + "learning_rate": 9.895662667202583e-06, + "loss": 1.0625, + "step": 1071 + }, + { + "epoch": 0.21428749906299194, + "grad_norm": 2.0, + "learning_rate": 9.895448402069543e-06, + "loss": 1.0352, + "step": 1072 + }, + { + "epoch": 0.214487394118088, + "grad_norm": 2.15625, + "learning_rate": 9.895233919481208e-06, + "loss": 1.1547, + "step": 1073 + }, + { + "epoch": 0.21468728917318408, + "grad_norm": 2.140625, + "learning_rate": 9.895019219447107e-06, + "loss": 1.1479, + "step": 1074 + }, + { + "epoch": 0.21488718422828015, + "grad_norm": 2.0625, + "learning_rate": 9.894804301976773e-06, + "loss": 0.9864, + "step": 1075 + }, + { + "epoch": 0.21508707928337623, + "grad_norm": 1.9921875, + "learning_rate": 9.894589167079754e-06, + "loss": 1.0687, + "step": 1076 + }, + { + "epoch": 0.2152869743384723, + "grad_norm": 2.15625, + "learning_rate": 9.894373814765609e-06, + "loss": 1.063, + "step": 1077 + }, + { + "epoch": 0.21548686939356837, + "grad_norm": 2.09375, + "learning_rate": 9.894158245043902e-06, + "loss": 1.114, + "step": 1078 + }, + { + "epoch": 0.21568676444866444, + "grad_norm": 2.046875, + "learning_rate": 9.893942457924206e-06, + "loss": 1.1266, + "step": 1079 + }, + { + "epoch": 0.21588665950376051, + "grad_norm": 2.203125, + "learning_rate": 9.89372645341611e-06, + "loss": 1.0792, + "step": 1080 + }, + { + "epoch": 0.21608655455885661, + "grad_norm": 2.1875, + "learning_rate": 9.893510231529209e-06, + "loss": 1.1263, + "step": 1081 + }, + { + "epoch": 0.21628644961395269, + "grad_norm": 2.078125, + "learning_rate": 9.893293792273104e-06, + "loss": 1.0622, + "step": 1082 + }, + { + "epoch": 0.21648634466904876, + "grad_norm": 2.21875, + "learning_rate": 9.893077135657413e-06, + "loss": 1.1443, + "step": 1083 + }, + { + "epoch": 0.21668623972414483, + "grad_norm": 2.125, + "learning_rate": 9.892860261691756e-06, + "loss": 1.0848, + "step": 1084 + }, + { + "epoch": 0.2168861347792409, + "grad_norm": 2.171875, + "learning_rate": 9.892643170385771e-06, + "loss": 1.0369, + "step": 1085 + }, + { + "epoch": 0.21708602983433697, + "grad_norm": 2.078125, + "learning_rate": 9.8924258617491e-06, + "loss": 1.0839, + "step": 1086 + }, + { + "epoch": 0.21728592488943305, + "grad_norm": 1.96875, + "learning_rate": 9.892208335791392e-06, + "loss": 1.0948, + "step": 1087 + }, + { + "epoch": 0.21748581994452912, + "grad_norm": 2.203125, + "learning_rate": 9.891990592522314e-06, + "loss": 1.0512, + "step": 1088 + }, + { + "epoch": 0.2176857149996252, + "grad_norm": 2.0625, + "learning_rate": 9.891772631951535e-06, + "loss": 1.0416, + "step": 1089 + }, + { + "epoch": 0.21788561005472126, + "grad_norm": 2.0625, + "learning_rate": 9.891554454088738e-06, + "loss": 1.1323, + "step": 1090 + }, + { + "epoch": 0.21808550510981733, + "grad_norm": 2.125, + "learning_rate": 9.891336058943617e-06, + "loss": 1.1509, + "step": 1091 + }, + { + "epoch": 0.2182854001649134, + "grad_norm": 2.1875, + "learning_rate": 9.891117446525869e-06, + "loss": 1.0703, + "step": 1092 + }, + { + "epoch": 0.2184852952200095, + "grad_norm": 2.21875, + "learning_rate": 9.890898616845206e-06, + "loss": 1.1602, + "step": 1093 + }, + { + "epoch": 0.21868519027510558, + "grad_norm": 2.234375, + "learning_rate": 9.890679569911349e-06, + "loss": 1.0828, + "step": 1094 + }, + { + "epoch": 0.21888508533020165, + "grad_norm": 2.0, + "learning_rate": 9.890460305734028e-06, + "loss": 1.0106, + "step": 1095 + }, + { + "epoch": 0.21908498038529772, + "grad_norm": 2.0625, + "learning_rate": 9.890240824322983e-06, + "loss": 1.1057, + "step": 1096 + }, + { + "epoch": 0.2192848754403938, + "grad_norm": 2.140625, + "learning_rate": 9.890021125687962e-06, + "loss": 1.1892, + "step": 1097 + }, + { + "epoch": 0.21948477049548987, + "grad_norm": 2.03125, + "learning_rate": 9.889801209838725e-06, + "loss": 1.0643, + "step": 1098 + }, + { + "epoch": 0.21968466555058594, + "grad_norm": 2.234375, + "learning_rate": 9.889581076785042e-06, + "loss": 1.1837, + "step": 1099 + }, + { + "epoch": 0.219884560605682, + "grad_norm": 2.703125, + "learning_rate": 9.889360726536687e-06, + "loss": 1.2431, + "step": 1100 + }, + { + "epoch": 0.22008445566077808, + "grad_norm": 2.28125, + "learning_rate": 9.889140159103454e-06, + "loss": 1.1512, + "step": 1101 + }, + { + "epoch": 0.22028435071587416, + "grad_norm": 2.34375, + "learning_rate": 9.888919374495134e-06, + "loss": 1.1054, + "step": 1102 + }, + { + "epoch": 0.22048424577097023, + "grad_norm": 2.046875, + "learning_rate": 9.88869837272154e-06, + "loss": 1.0885, + "step": 1103 + }, + { + "epoch": 0.22068414082606633, + "grad_norm": 2.1875, + "learning_rate": 9.888477153792486e-06, + "loss": 1.0958, + "step": 1104 + }, + { + "epoch": 0.2208840358811624, + "grad_norm": 2.265625, + "learning_rate": 9.888255717717798e-06, + "loss": 1.1226, + "step": 1105 + }, + { + "epoch": 0.22108393093625847, + "grad_norm": 2.140625, + "learning_rate": 9.888034064507314e-06, + "loss": 1.1548, + "step": 1106 + }, + { + "epoch": 0.22128382599135454, + "grad_norm": 2.078125, + "learning_rate": 9.887812194170878e-06, + "loss": 1.0003, + "step": 1107 + }, + { + "epoch": 0.22148372104645062, + "grad_norm": 2.1875, + "learning_rate": 9.887590106718348e-06, + "loss": 1.0862, + "step": 1108 + }, + { + "epoch": 0.2216836161015467, + "grad_norm": 2.046875, + "learning_rate": 9.887367802159587e-06, + "loss": 1.0139, + "step": 1109 + }, + { + "epoch": 0.22188351115664276, + "grad_norm": 2.3125, + "learning_rate": 9.887145280504468e-06, + "loss": 1.072, + "step": 1110 + }, + { + "epoch": 0.22208340621173883, + "grad_norm": 2.125, + "learning_rate": 9.88692254176288e-06, + "loss": 0.973, + "step": 1111 + }, + { + "epoch": 0.2222833012668349, + "grad_norm": 2.265625, + "learning_rate": 9.886699585944715e-06, + "loss": 1.1539, + "step": 1112 + }, + { + "epoch": 0.22248319632193098, + "grad_norm": 2.03125, + "learning_rate": 9.886476413059874e-06, + "loss": 0.9878, + "step": 1113 + }, + { + "epoch": 0.22268309137702705, + "grad_norm": 2.234375, + "learning_rate": 9.886253023118276e-06, + "loss": 1.1039, + "step": 1114 + }, + { + "epoch": 0.22288298643212315, + "grad_norm": 2.1875, + "learning_rate": 9.886029416129837e-06, + "loss": 1.0656, + "step": 1115 + }, + { + "epoch": 0.22308288148721922, + "grad_norm": 2.140625, + "learning_rate": 9.885805592104494e-06, + "loss": 1.0074, + "step": 1116 + }, + { + "epoch": 0.2232827765423153, + "grad_norm": 2.28125, + "learning_rate": 9.88558155105219e-06, + "loss": 1.1455, + "step": 1117 + }, + { + "epoch": 0.22348267159741136, + "grad_norm": 2.140625, + "learning_rate": 9.885357292982873e-06, + "loss": 1.1776, + "step": 1118 + }, + { + "epoch": 0.22368256665250744, + "grad_norm": 2.125, + "learning_rate": 9.885132817906509e-06, + "loss": 1.1606, + "step": 1119 + }, + { + "epoch": 0.2238824617076035, + "grad_norm": 2.171875, + "learning_rate": 9.884908125833066e-06, + "loss": 1.1144, + "step": 1120 + }, + { + "epoch": 0.22408235676269958, + "grad_norm": 2.109375, + "learning_rate": 9.884683216772527e-06, + "loss": 1.1148, + "step": 1121 + }, + { + "epoch": 0.22428225181779565, + "grad_norm": 2.140625, + "learning_rate": 9.88445809073488e-06, + "loss": 1.0931, + "step": 1122 + }, + { + "epoch": 0.22448214687289172, + "grad_norm": 2.140625, + "learning_rate": 9.884232747730125e-06, + "loss": 1.1479, + "step": 1123 + }, + { + "epoch": 0.2246820419279878, + "grad_norm": 2.078125, + "learning_rate": 9.884007187768275e-06, + "loss": 0.9995, + "step": 1124 + }, + { + "epoch": 0.22488193698308387, + "grad_norm": 2.234375, + "learning_rate": 9.883781410859347e-06, + "loss": 1.1488, + "step": 1125 + }, + { + "epoch": 0.22508183203817997, + "grad_norm": 2.140625, + "learning_rate": 9.88355541701337e-06, + "loss": 1.0807, + "step": 1126 + }, + { + "epoch": 0.22528172709327604, + "grad_norm": 2.09375, + "learning_rate": 9.883329206240383e-06, + "loss": 1.0489, + "step": 1127 + }, + { + "epoch": 0.2254816221483721, + "grad_norm": 2.03125, + "learning_rate": 9.883102778550434e-06, + "loss": 1.0662, + "step": 1128 + }, + { + "epoch": 0.22568151720346818, + "grad_norm": 2.171875, + "learning_rate": 9.882876133953582e-06, + "loss": 1.1357, + "step": 1129 + }, + { + "epoch": 0.22588141225856426, + "grad_norm": 2.09375, + "learning_rate": 9.882649272459892e-06, + "loss": 1.0505, + "step": 1130 + }, + { + "epoch": 0.22608130731366033, + "grad_norm": 2.125, + "learning_rate": 9.882422194079444e-06, + "loss": 1.1083, + "step": 1131 + }, + { + "epoch": 0.2262812023687564, + "grad_norm": 2.171875, + "learning_rate": 9.882194898822324e-06, + "loss": 1.1381, + "step": 1132 + }, + { + "epoch": 0.22648109742385247, + "grad_norm": 2.140625, + "learning_rate": 9.881967386698627e-06, + "loss": 1.0367, + "step": 1133 + }, + { + "epoch": 0.22668099247894855, + "grad_norm": 2.328125, + "learning_rate": 9.881739657718462e-06, + "loss": 1.2138, + "step": 1134 + }, + { + "epoch": 0.22688088753404462, + "grad_norm": 2.125, + "learning_rate": 9.881511711891941e-06, + "loss": 1.1267, + "step": 1135 + }, + { + "epoch": 0.2270807825891407, + "grad_norm": 2.21875, + "learning_rate": 9.88128354922919e-06, + "loss": 1.0884, + "step": 1136 + }, + { + "epoch": 0.2272806776442368, + "grad_norm": 2.140625, + "learning_rate": 9.881055169740347e-06, + "loss": 1.1466, + "step": 1137 + }, + { + "epoch": 0.22748057269933286, + "grad_norm": 2.0625, + "learning_rate": 9.880826573435555e-06, + "loss": 1.16, + "step": 1138 + }, + { + "epoch": 0.22768046775442893, + "grad_norm": 2.296875, + "learning_rate": 9.880597760324966e-06, + "loss": 1.1587, + "step": 1139 + }, + { + "epoch": 0.227880362809525, + "grad_norm": 2.109375, + "learning_rate": 9.880368730418749e-06, + "loss": 1.1319, + "step": 1140 + }, + { + "epoch": 0.22808025786462108, + "grad_norm": 2.171875, + "learning_rate": 9.880139483727071e-06, + "loss": 1.1318, + "step": 1141 + }, + { + "epoch": 0.22828015291971715, + "grad_norm": 2.0625, + "learning_rate": 9.879910020260119e-06, + "loss": 1.0074, + "step": 1142 + }, + { + "epoch": 0.22848004797481322, + "grad_norm": 2.21875, + "learning_rate": 9.879680340028087e-06, + "loss": 1.0707, + "step": 1143 + }, + { + "epoch": 0.2286799430299093, + "grad_norm": 2.140625, + "learning_rate": 9.879450443041172e-06, + "loss": 1.0146, + "step": 1144 + }, + { + "epoch": 0.22887983808500537, + "grad_norm": 2.203125, + "learning_rate": 9.879220329309591e-06, + "loss": 1.1047, + "step": 1145 + }, + { + "epoch": 0.22907973314010144, + "grad_norm": 2.234375, + "learning_rate": 9.878989998843565e-06, + "loss": 1.0841, + "step": 1146 + }, + { + "epoch": 0.2292796281951975, + "grad_norm": 2.1875, + "learning_rate": 9.878759451653323e-06, + "loss": 1.1131, + "step": 1147 + }, + { + "epoch": 0.22947952325029358, + "grad_norm": 2.140625, + "learning_rate": 9.87852868774911e-06, + "loss": 1.1771, + "step": 1148 + }, + { + "epoch": 0.22967941830538968, + "grad_norm": 2.25, + "learning_rate": 9.878297707141172e-06, + "loss": 1.1596, + "step": 1149 + }, + { + "epoch": 0.22987931336048575, + "grad_norm": 2.203125, + "learning_rate": 9.87806650983977e-06, + "loss": 1.1953, + "step": 1150 + }, + { + "epoch": 0.23007920841558183, + "grad_norm": 2.109375, + "learning_rate": 9.877835095855174e-06, + "loss": 1.0653, + "step": 1151 + }, + { + "epoch": 0.2302791034706779, + "grad_norm": 2.1875, + "learning_rate": 9.877603465197667e-06, + "loss": 1.0236, + "step": 1152 + }, + { + "epoch": 0.23047899852577397, + "grad_norm": 2.171875, + "learning_rate": 9.877371617877533e-06, + "loss": 1.1288, + "step": 1153 + }, + { + "epoch": 0.23067889358087004, + "grad_norm": 2.109375, + "learning_rate": 9.877139553905072e-06, + "loss": 1.0238, + "step": 1154 + }, + { + "epoch": 0.23087878863596611, + "grad_norm": 2.296875, + "learning_rate": 9.876907273290594e-06, + "loss": 1.0817, + "step": 1155 + }, + { + "epoch": 0.2310786836910622, + "grad_norm": 2.1875, + "learning_rate": 9.876674776044417e-06, + "loss": 1.1418, + "step": 1156 + }, + { + "epoch": 0.23127857874615826, + "grad_norm": 2.09375, + "learning_rate": 9.876442062176866e-06, + "loss": 1.1305, + "step": 1157 + }, + { + "epoch": 0.23147847380125433, + "grad_norm": 2.03125, + "learning_rate": 9.87620913169828e-06, + "loss": 1.0216, + "step": 1158 + }, + { + "epoch": 0.2316783688563504, + "grad_norm": 2.140625, + "learning_rate": 9.875975984619004e-06, + "loss": 1.0736, + "step": 1159 + }, + { + "epoch": 0.2318782639114465, + "grad_norm": 2.171875, + "learning_rate": 9.875742620949395e-06, + "loss": 1.0892, + "step": 1160 + }, + { + "epoch": 0.23207815896654257, + "grad_norm": 2.1875, + "learning_rate": 9.875509040699821e-06, + "loss": 1.0612, + "step": 1161 + }, + { + "epoch": 0.23227805402163865, + "grad_norm": 2.109375, + "learning_rate": 9.875275243880657e-06, + "loss": 1.1469, + "step": 1162 + }, + { + "epoch": 0.23247794907673472, + "grad_norm": 2.15625, + "learning_rate": 9.875041230502286e-06, + "loss": 1.1195, + "step": 1163 + }, + { + "epoch": 0.2326778441318308, + "grad_norm": 2.125, + "learning_rate": 9.874807000575105e-06, + "loss": 1.0738, + "step": 1164 + }, + { + "epoch": 0.23287773918692686, + "grad_norm": 2.15625, + "learning_rate": 9.874572554109517e-06, + "loss": 1.0748, + "step": 1165 + }, + { + "epoch": 0.23307763424202294, + "grad_norm": 2.125, + "learning_rate": 9.874337891115938e-06, + "loss": 1.0728, + "step": 1166 + }, + { + "epoch": 0.233277529297119, + "grad_norm": 2.0625, + "learning_rate": 9.874103011604788e-06, + "loss": 1.0373, + "step": 1167 + }, + { + "epoch": 0.23347742435221508, + "grad_norm": 2.1875, + "learning_rate": 9.873867915586504e-06, + "loss": 1.0788, + "step": 1168 + }, + { + "epoch": 0.23367731940731115, + "grad_norm": 2.15625, + "learning_rate": 9.873632603071528e-06, + "loss": 1.1423, + "step": 1169 + }, + { + "epoch": 0.23387721446240722, + "grad_norm": 2.265625, + "learning_rate": 9.873397074070312e-06, + "loss": 1.1078, + "step": 1170 + }, + { + "epoch": 0.23407710951750332, + "grad_norm": 2.140625, + "learning_rate": 9.873161328593319e-06, + "loss": 1.1371, + "step": 1171 + }, + { + "epoch": 0.2342770045725994, + "grad_norm": 2.03125, + "learning_rate": 9.87292536665102e-06, + "loss": 1.0824, + "step": 1172 + }, + { + "epoch": 0.23447689962769547, + "grad_norm": 2.328125, + "learning_rate": 9.872689188253895e-06, + "loss": 1.191, + "step": 1173 + }, + { + "epoch": 0.23467679468279154, + "grad_norm": 2.0, + "learning_rate": 9.872452793412439e-06, + "loss": 1.0575, + "step": 1174 + }, + { + "epoch": 0.2348766897378876, + "grad_norm": 2.109375, + "learning_rate": 9.872216182137148e-06, + "loss": 1.1062, + "step": 1175 + }, + { + "epoch": 0.23507658479298368, + "grad_norm": 2.203125, + "learning_rate": 9.871979354438539e-06, + "loss": 1.1298, + "step": 1176 + }, + { + "epoch": 0.23527647984807976, + "grad_norm": 2.28125, + "learning_rate": 9.871742310327124e-06, + "loss": 1.1012, + "step": 1177 + }, + { + "epoch": 0.23547637490317583, + "grad_norm": 2.15625, + "learning_rate": 9.871505049813436e-06, + "loss": 1.0347, + "step": 1178 + }, + { + "epoch": 0.2356762699582719, + "grad_norm": 2.09375, + "learning_rate": 9.871267572908015e-06, + "loss": 1.0454, + "step": 1179 + }, + { + "epoch": 0.23587616501336797, + "grad_norm": 2.140625, + "learning_rate": 9.871029879621408e-06, + "loss": 1.1042, + "step": 1180 + }, + { + "epoch": 0.23607606006846404, + "grad_norm": 2.15625, + "learning_rate": 9.870791969964173e-06, + "loss": 1.0733, + "step": 1181 + }, + { + "epoch": 0.23627595512356014, + "grad_norm": 2.21875, + "learning_rate": 9.870553843946879e-06, + "loss": 1.0674, + "step": 1182 + }, + { + "epoch": 0.23647585017865622, + "grad_norm": 2.203125, + "learning_rate": 9.870315501580106e-06, + "loss": 1.0334, + "step": 1183 + }, + { + "epoch": 0.2366757452337523, + "grad_norm": 2.109375, + "learning_rate": 9.870076942874435e-06, + "loss": 1.0535, + "step": 1184 + }, + { + "epoch": 0.23687564028884836, + "grad_norm": 2.1875, + "learning_rate": 9.86983816784047e-06, + "loss": 1.093, + "step": 1185 + }, + { + "epoch": 0.23707553534394443, + "grad_norm": 2.234375, + "learning_rate": 9.869599176488812e-06, + "loss": 1.0887, + "step": 1186 + }, + { + "epoch": 0.2372754303990405, + "grad_norm": 2.21875, + "learning_rate": 9.869359968830078e-06, + "loss": 1.0941, + "step": 1187 + }, + { + "epoch": 0.23747532545413658, + "grad_norm": 2.15625, + "learning_rate": 9.869120544874895e-06, + "loss": 1.0711, + "step": 1188 + }, + { + "epoch": 0.23767522050923265, + "grad_norm": 2.0625, + "learning_rate": 9.868880904633898e-06, + "loss": 1.2008, + "step": 1189 + }, + { + "epoch": 0.23787511556432872, + "grad_norm": 2.1875, + "learning_rate": 9.86864104811773e-06, + "loss": 1.1192, + "step": 1190 + }, + { + "epoch": 0.2380750106194248, + "grad_norm": 2.15625, + "learning_rate": 9.868400975337046e-06, + "loss": 1.0715, + "step": 1191 + }, + { + "epoch": 0.23827490567452086, + "grad_norm": 2.296875, + "learning_rate": 9.868160686302513e-06, + "loss": 1.1605, + "step": 1192 + }, + { + "epoch": 0.23847480072961696, + "grad_norm": 2.1875, + "learning_rate": 9.867920181024802e-06, + "loss": 1.1909, + "step": 1193 + }, + { + "epoch": 0.23867469578471304, + "grad_norm": 1.984375, + "learning_rate": 9.867679459514596e-06, + "loss": 1.0285, + "step": 1194 + }, + { + "epoch": 0.2388745908398091, + "grad_norm": 2.125, + "learning_rate": 9.867438521782586e-06, + "loss": 1.1216, + "step": 1195 + }, + { + "epoch": 0.23907448589490518, + "grad_norm": 2.078125, + "learning_rate": 9.867197367839481e-06, + "loss": 1.0404, + "step": 1196 + }, + { + "epoch": 0.23927438095000125, + "grad_norm": 2.171875, + "learning_rate": 9.866955997695984e-06, + "loss": 1.1369, + "step": 1197 + }, + { + "epoch": 0.23947427600509733, + "grad_norm": 2.15625, + "learning_rate": 9.866714411362825e-06, + "loss": 1.0819, + "step": 1198 + }, + { + "epoch": 0.2396741710601934, + "grad_norm": 2.09375, + "learning_rate": 9.86647260885073e-06, + "loss": 1.0858, + "step": 1199 + }, + { + "epoch": 0.23987406611528947, + "grad_norm": 2.15625, + "learning_rate": 9.866230590170442e-06, + "loss": 1.081, + "step": 1200 + }, + { + "epoch": 0.24007396117038554, + "grad_norm": 2.078125, + "learning_rate": 9.86598835533271e-06, + "loss": 1.0809, + "step": 1201 + }, + { + "epoch": 0.2402738562254816, + "grad_norm": 2.1875, + "learning_rate": 9.865745904348296e-06, + "loss": 1.1058, + "step": 1202 + }, + { + "epoch": 0.24047375128057769, + "grad_norm": 2.0625, + "learning_rate": 9.86550323722797e-06, + "loss": 1.1048, + "step": 1203 + }, + { + "epoch": 0.24067364633567376, + "grad_norm": 2.109375, + "learning_rate": 9.865260353982506e-06, + "loss": 1.0469, + "step": 1204 + }, + { + "epoch": 0.24087354139076986, + "grad_norm": 2.203125, + "learning_rate": 9.8650172546227e-06, + "loss": 1.082, + "step": 1205 + }, + { + "epoch": 0.24107343644586593, + "grad_norm": 2.0625, + "learning_rate": 9.864773939159346e-06, + "loss": 1.1258, + "step": 1206 + }, + { + "epoch": 0.241273331500962, + "grad_norm": 2.3125, + "learning_rate": 9.864530407603253e-06, + "loss": 1.1412, + "step": 1207 + }, + { + "epoch": 0.24147322655605807, + "grad_norm": 2.1875, + "learning_rate": 9.86428665996524e-06, + "loss": 0.947, + "step": 1208 + }, + { + "epoch": 0.24167312161115415, + "grad_norm": 2.109375, + "learning_rate": 9.864042696256132e-06, + "loss": 1.1225, + "step": 1209 + }, + { + "epoch": 0.24187301666625022, + "grad_norm": 2.109375, + "learning_rate": 9.863798516486767e-06, + "loss": 1.1101, + "step": 1210 + }, + { + "epoch": 0.2420729117213463, + "grad_norm": 2.0625, + "learning_rate": 9.86355412066799e-06, + "loss": 1.0778, + "step": 1211 + }, + { + "epoch": 0.24227280677644236, + "grad_norm": 2.203125, + "learning_rate": 9.86330950881066e-06, + "loss": 1.151, + "step": 1212 + }, + { + "epoch": 0.24247270183153843, + "grad_norm": 2.125, + "learning_rate": 9.863064680925643e-06, + "loss": 1.0395, + "step": 1213 + }, + { + "epoch": 0.2426725968866345, + "grad_norm": 2.09375, + "learning_rate": 9.86281963702381e-06, + "loss": 1.03, + "step": 1214 + }, + { + "epoch": 0.24287249194173058, + "grad_norm": 2.1875, + "learning_rate": 9.862574377116048e-06, + "loss": 1.0622, + "step": 1215 + }, + { + "epoch": 0.24307238699682668, + "grad_norm": 2.09375, + "learning_rate": 9.862328901213253e-06, + "loss": 1.0321, + "step": 1216 + }, + { + "epoch": 0.24327228205192275, + "grad_norm": 2.078125, + "learning_rate": 9.862083209326326e-06, + "loss": 1.1011, + "step": 1217 + }, + { + "epoch": 0.24347217710701882, + "grad_norm": 2.078125, + "learning_rate": 9.861837301466182e-06, + "loss": 1.1446, + "step": 1218 + }, + { + "epoch": 0.2436720721621149, + "grad_norm": 2.21875, + "learning_rate": 9.861591177643744e-06, + "loss": 1.1048, + "step": 1219 + }, + { + "epoch": 0.24387196721721097, + "grad_norm": 2.09375, + "learning_rate": 9.861344837869947e-06, + "loss": 1.1738, + "step": 1220 + }, + { + "epoch": 0.24407186227230704, + "grad_norm": 2.15625, + "learning_rate": 9.86109828215573e-06, + "loss": 1.0178, + "step": 1221 + }, + { + "epoch": 0.2442717573274031, + "grad_norm": 2.140625, + "learning_rate": 9.860851510512046e-06, + "loss": 1.0646, + "step": 1222 + }, + { + "epoch": 0.24447165238249918, + "grad_norm": 2.109375, + "learning_rate": 9.860604522949859e-06, + "loss": 1.0765, + "step": 1223 + }, + { + "epoch": 0.24467154743759525, + "grad_norm": 2.1875, + "learning_rate": 9.860357319480137e-06, + "loss": 1.1155, + "step": 1224 + }, + { + "epoch": 0.24487144249269133, + "grad_norm": 2.25, + "learning_rate": 9.860109900113861e-06, + "loss": 1.1207, + "step": 1225 + }, + { + "epoch": 0.2450713375477874, + "grad_norm": 2.265625, + "learning_rate": 9.859862264862023e-06, + "loss": 1.1647, + "step": 1226 + }, + { + "epoch": 0.2452712326028835, + "grad_norm": 2.09375, + "learning_rate": 9.859614413735623e-06, + "loss": 1.0827, + "step": 1227 + }, + { + "epoch": 0.24547112765797957, + "grad_norm": 2.140625, + "learning_rate": 9.85936634674567e-06, + "loss": 1.0661, + "step": 1228 + }, + { + "epoch": 0.24567102271307564, + "grad_norm": 1.96875, + "learning_rate": 9.859118063903182e-06, + "loss": 1.1036, + "step": 1229 + }, + { + "epoch": 0.24587091776817172, + "grad_norm": 2.109375, + "learning_rate": 9.858869565219189e-06, + "loss": 1.0841, + "step": 1230 + }, + { + "epoch": 0.2460708128232678, + "grad_norm": 2.09375, + "learning_rate": 9.85862085070473e-06, + "loss": 1.0565, + "step": 1231 + }, + { + "epoch": 0.24627070787836386, + "grad_norm": 2.1875, + "learning_rate": 9.85837192037085e-06, + "loss": 1.1141, + "step": 1232 + }, + { + "epoch": 0.24647060293345993, + "grad_norm": 2.015625, + "learning_rate": 9.858122774228609e-06, + "loss": 1.0912, + "step": 1233 + }, + { + "epoch": 0.246670497988556, + "grad_norm": 1.9921875, + "learning_rate": 9.857873412289071e-06, + "loss": 0.9702, + "step": 1234 + }, + { + "epoch": 0.24687039304365208, + "grad_norm": 2.21875, + "learning_rate": 9.85762383456332e-06, + "loss": 1.1655, + "step": 1235 + }, + { + "epoch": 0.24707028809874815, + "grad_norm": 2.109375, + "learning_rate": 9.857374041062433e-06, + "loss": 1.0594, + "step": 1236 + }, + { + "epoch": 0.24727018315384422, + "grad_norm": 2.09375, + "learning_rate": 9.85712403179751e-06, + "loss": 1.0415, + "step": 1237 + }, + { + "epoch": 0.24747007820894032, + "grad_norm": 2.0625, + "learning_rate": 9.856873806779656e-06, + "loss": 1.0599, + "step": 1238 + }, + { + "epoch": 0.2476699732640364, + "grad_norm": 2.125, + "learning_rate": 9.856623366019988e-06, + "loss": 1.0679, + "step": 1239 + }, + { + "epoch": 0.24786986831913246, + "grad_norm": 2.265625, + "learning_rate": 9.85637270952963e-06, + "loss": 1.1581, + "step": 1240 + }, + { + "epoch": 0.24806976337422854, + "grad_norm": 1.984375, + "learning_rate": 9.856121837319712e-06, + "loss": 0.9953, + "step": 1241 + }, + { + "epoch": 0.2482696584293246, + "grad_norm": 2.125, + "learning_rate": 9.855870749401381e-06, + "loss": 1.0956, + "step": 1242 + }, + { + "epoch": 0.24846955348442068, + "grad_norm": 2.328125, + "learning_rate": 9.855619445785791e-06, + "loss": 1.0473, + "step": 1243 + }, + { + "epoch": 0.24866944853951675, + "grad_norm": 2.09375, + "learning_rate": 9.855367926484103e-06, + "loss": 1.1015, + "step": 1244 + }, + { + "epoch": 0.24886934359461282, + "grad_norm": 2.1875, + "learning_rate": 9.85511619150749e-06, + "loss": 1.1346, + "step": 1245 + }, + { + "epoch": 0.2490692386497089, + "grad_norm": 2.28125, + "learning_rate": 9.854864240867137e-06, + "loss": 1.1165, + "step": 1246 + }, + { + "epoch": 0.24926913370480497, + "grad_norm": 2.25, + "learning_rate": 9.85461207457423e-06, + "loss": 1.1865, + "step": 1247 + }, + { + "epoch": 0.24946902875990104, + "grad_norm": 2.171875, + "learning_rate": 9.854359692639974e-06, + "loss": 1.0496, + "step": 1248 + }, + { + "epoch": 0.24966892381499714, + "grad_norm": 2.140625, + "learning_rate": 9.854107095075578e-06, + "loss": 1.0782, + "step": 1249 + }, + { + "epoch": 0.2498688188700932, + "grad_norm": 2.0625, + "learning_rate": 9.853854281892265e-06, + "loss": 1.0533, + "step": 1250 + }, + { + "epoch": 0.25006871392518926, + "grad_norm": 2.140625, + "learning_rate": 9.853601253101262e-06, + "loss": 1.1048, + "step": 1251 + }, + { + "epoch": 0.25026860898028536, + "grad_norm": 2.09375, + "learning_rate": 9.85334800871381e-06, + "loss": 1.0275, + "step": 1252 + }, + { + "epoch": 0.2504685040353814, + "grad_norm": 2.125, + "learning_rate": 9.853094548741158e-06, + "loss": 1.1582, + "step": 1253 + }, + { + "epoch": 0.2506683990904775, + "grad_norm": 2.125, + "learning_rate": 9.852840873194565e-06, + "loss": 1.0982, + "step": 1254 + }, + { + "epoch": 0.2508682941455736, + "grad_norm": 2.171875, + "learning_rate": 9.852586982085298e-06, + "loss": 1.0824, + "step": 1255 + }, + { + "epoch": 0.25106818920066964, + "grad_norm": 2.203125, + "learning_rate": 9.852332875424636e-06, + "loss": 1.1468, + "step": 1256 + }, + { + "epoch": 0.25126808425576574, + "grad_norm": 2.109375, + "learning_rate": 9.852078553223865e-06, + "loss": 1.0221, + "step": 1257 + }, + { + "epoch": 0.2514679793108618, + "grad_norm": 2.140625, + "learning_rate": 9.851824015494284e-06, + "loss": 1.049, + "step": 1258 + }, + { + "epoch": 0.2516678743659579, + "grad_norm": 2.203125, + "learning_rate": 9.851569262247198e-06, + "loss": 1.1261, + "step": 1259 + }, + { + "epoch": 0.25186776942105393, + "grad_norm": 2.046875, + "learning_rate": 9.851314293493923e-06, + "loss": 1.1281, + "step": 1260 + }, + { + "epoch": 0.25206766447615003, + "grad_norm": 2.09375, + "learning_rate": 9.851059109245785e-06, + "loss": 1.022, + "step": 1261 + }, + { + "epoch": 0.2522675595312461, + "grad_norm": 2.140625, + "learning_rate": 9.850803709514121e-06, + "loss": 1.1081, + "step": 1262 + }, + { + "epoch": 0.2524674545863422, + "grad_norm": 2.09375, + "learning_rate": 9.850548094310273e-06, + "loss": 1.0569, + "step": 1263 + }, + { + "epoch": 0.2526673496414382, + "grad_norm": 2.078125, + "learning_rate": 9.850292263645597e-06, + "loss": 1.0977, + "step": 1264 + }, + { + "epoch": 0.2528672446965343, + "grad_norm": 2.265625, + "learning_rate": 9.850036217531457e-06, + "loss": 1.06, + "step": 1265 + }, + { + "epoch": 0.2530671397516304, + "grad_norm": 2.046875, + "learning_rate": 9.849779955979226e-06, + "loss": 1.0829, + "step": 1266 + }, + { + "epoch": 0.25326703480672647, + "grad_norm": 2.171875, + "learning_rate": 9.849523479000287e-06, + "loss": 1.1455, + "step": 1267 + }, + { + "epoch": 0.25346692986182257, + "grad_norm": 2.265625, + "learning_rate": 9.849266786606033e-06, + "loss": 1.1681, + "step": 1268 + }, + { + "epoch": 0.2536668249169186, + "grad_norm": 2.078125, + "learning_rate": 9.849009878807867e-06, + "loss": 1.07, + "step": 1269 + }, + { + "epoch": 0.2538667199720147, + "grad_norm": 2.25, + "learning_rate": 9.848752755617201e-06, + "loss": 1.1013, + "step": 1270 + }, + { + "epoch": 0.25406661502711075, + "grad_norm": 2.171875, + "learning_rate": 9.848495417045454e-06, + "loss": 1.1068, + "step": 1271 + }, + { + "epoch": 0.25426651008220685, + "grad_norm": 2.1875, + "learning_rate": 9.848237863104057e-06, + "loss": 0.9974, + "step": 1272 + }, + { + "epoch": 0.2544664051373029, + "grad_norm": 2.21875, + "learning_rate": 9.847980093804455e-06, + "loss": 1.1419, + "step": 1273 + }, + { + "epoch": 0.254666300192399, + "grad_norm": 2.09375, + "learning_rate": 9.847722109158094e-06, + "loss": 1.0713, + "step": 1274 + }, + { + "epoch": 0.25486619524749504, + "grad_norm": 2.234375, + "learning_rate": 9.847463909176433e-06, + "loss": 1.0939, + "step": 1275 + }, + { + "epoch": 0.25506609030259114, + "grad_norm": 2.234375, + "learning_rate": 9.847205493870944e-06, + "loss": 1.1118, + "step": 1276 + }, + { + "epoch": 0.25526598535768724, + "grad_norm": 2.046875, + "learning_rate": 9.846946863253104e-06, + "loss": 1.052, + "step": 1277 + }, + { + "epoch": 0.2554658804127833, + "grad_norm": 2.125, + "learning_rate": 9.846688017334405e-06, + "loss": 1.0839, + "step": 1278 + }, + { + "epoch": 0.2556657754678794, + "grad_norm": 2.1875, + "learning_rate": 9.846428956126338e-06, + "loss": 1.1021, + "step": 1279 + }, + { + "epoch": 0.25586567052297543, + "grad_norm": 2.109375, + "learning_rate": 9.846169679640417e-06, + "loss": 1.011, + "step": 1280 + }, + { + "epoch": 0.25606556557807153, + "grad_norm": 2.203125, + "learning_rate": 9.845910187888155e-06, + "loss": 1.0385, + "step": 1281 + }, + { + "epoch": 0.2562654606331676, + "grad_norm": 2.234375, + "learning_rate": 9.84565048088108e-06, + "loss": 1.1017, + "step": 1282 + }, + { + "epoch": 0.2564653556882637, + "grad_norm": 2.125, + "learning_rate": 9.84539055863073e-06, + "loss": 1.1327, + "step": 1283 + }, + { + "epoch": 0.2566652507433597, + "grad_norm": 2.09375, + "learning_rate": 9.845130421148646e-06, + "loss": 1.1224, + "step": 1284 + }, + { + "epoch": 0.2568651457984558, + "grad_norm": 2.046875, + "learning_rate": 9.844870068446389e-06, + "loss": 1.1337, + "step": 1285 + }, + { + "epoch": 0.25706504085355186, + "grad_norm": 2.1875, + "learning_rate": 9.84460950053552e-06, + "loss": 1.0371, + "step": 1286 + }, + { + "epoch": 0.25726493590864796, + "grad_norm": 2.171875, + "learning_rate": 9.844348717427614e-06, + "loss": 1.1558, + "step": 1287 + }, + { + "epoch": 0.257464830963744, + "grad_norm": 2.015625, + "learning_rate": 9.844087719134254e-06, + "loss": 1.0198, + "step": 1288 + }, + { + "epoch": 0.2576647260188401, + "grad_norm": 2.1875, + "learning_rate": 9.843826505667038e-06, + "loss": 1.1347, + "step": 1289 + }, + { + "epoch": 0.2578646210739362, + "grad_norm": 2.125, + "learning_rate": 9.843565077037563e-06, + "loss": 1.2295, + "step": 1290 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 2.109375, + "learning_rate": 9.843303433257447e-06, + "loss": 1.1543, + "step": 1291 + }, + { + "epoch": 0.25826441118412835, + "grad_norm": 2.140625, + "learning_rate": 9.843041574338307e-06, + "loss": 1.1631, + "step": 1292 + }, + { + "epoch": 0.2584643062392244, + "grad_norm": 2.25, + "learning_rate": 9.84277950029178e-06, + "loss": 1.0916, + "step": 1293 + }, + { + "epoch": 0.2586642012943205, + "grad_norm": 2.0625, + "learning_rate": 9.842517211129504e-06, + "loss": 1.1489, + "step": 1294 + }, + { + "epoch": 0.25886409634941654, + "grad_norm": 2.078125, + "learning_rate": 9.842254706863131e-06, + "loss": 0.9759, + "step": 1295 + }, + { + "epoch": 0.25906399140451264, + "grad_norm": 2.109375, + "learning_rate": 9.84199198750432e-06, + "loss": 1.0633, + "step": 1296 + }, + { + "epoch": 0.2592638864596087, + "grad_norm": 2.171875, + "learning_rate": 9.841729053064744e-06, + "loss": 1.0705, + "step": 1297 + }, + { + "epoch": 0.2594637815147048, + "grad_norm": 2.203125, + "learning_rate": 9.84146590355608e-06, + "loss": 1.1205, + "step": 1298 + }, + { + "epoch": 0.2596636765698008, + "grad_norm": 2.203125, + "learning_rate": 9.841202538990016e-06, + "loss": 1.1211, + "step": 1299 + }, + { + "epoch": 0.2598635716248969, + "grad_norm": 2.15625, + "learning_rate": 9.840938959378254e-06, + "loss": 1.1488, + "step": 1300 + }, + { + "epoch": 0.260063466679993, + "grad_norm": 2.203125, + "learning_rate": 9.8406751647325e-06, + "loss": 1.1001, + "step": 1301 + }, + { + "epoch": 0.26026336173508907, + "grad_norm": 2.046875, + "learning_rate": 9.840411155064472e-06, + "loss": 1.0636, + "step": 1302 + }, + { + "epoch": 0.26046325679018517, + "grad_norm": 2.078125, + "learning_rate": 9.840146930385897e-06, + "loss": 1.0858, + "step": 1303 + }, + { + "epoch": 0.2606631518452812, + "grad_norm": 2.078125, + "learning_rate": 9.839882490708512e-06, + "loss": 0.9908, + "step": 1304 + }, + { + "epoch": 0.2608630469003773, + "grad_norm": 2.171875, + "learning_rate": 9.839617836044065e-06, + "loss": 1.1681, + "step": 1305 + }, + { + "epoch": 0.26106294195547336, + "grad_norm": 2.140625, + "learning_rate": 9.83935296640431e-06, + "loss": 1.1194, + "step": 1306 + }, + { + "epoch": 0.26126283701056946, + "grad_norm": 2.1875, + "learning_rate": 9.839087881801012e-06, + "loss": 1.1615, + "step": 1307 + }, + { + "epoch": 0.2614627320656655, + "grad_norm": 2.109375, + "learning_rate": 9.83882258224595e-06, + "loss": 1.0381, + "step": 1308 + }, + { + "epoch": 0.2616626271207616, + "grad_norm": 2.125, + "learning_rate": 9.838557067750903e-06, + "loss": 1.1391, + "step": 1309 + }, + { + "epoch": 0.26186252217585765, + "grad_norm": 2.28125, + "learning_rate": 9.83829133832767e-06, + "loss": 1.1343, + "step": 1310 + }, + { + "epoch": 0.26206241723095375, + "grad_norm": 2.125, + "learning_rate": 9.838025393988051e-06, + "loss": 1.0177, + "step": 1311 + }, + { + "epoch": 0.26226231228604985, + "grad_norm": 2.3125, + "learning_rate": 9.83775923474386e-06, + "loss": 1.1457, + "step": 1312 + }, + { + "epoch": 0.2624622073411459, + "grad_norm": 2.140625, + "learning_rate": 9.837492860606923e-06, + "loss": 1.1076, + "step": 1313 + }, + { + "epoch": 0.262662102396242, + "grad_norm": 2.15625, + "learning_rate": 9.837226271589067e-06, + "loss": 1.0448, + "step": 1314 + }, + { + "epoch": 0.26286199745133804, + "grad_norm": 2.15625, + "learning_rate": 9.836959467702139e-06, + "loss": 1.0799, + "step": 1315 + }, + { + "epoch": 0.26306189250643414, + "grad_norm": 2.0625, + "learning_rate": 9.836692448957987e-06, + "loss": 1.0002, + "step": 1316 + }, + { + "epoch": 0.2632617875615302, + "grad_norm": 2.046875, + "learning_rate": 9.836425215368472e-06, + "loss": 1.1429, + "step": 1317 + }, + { + "epoch": 0.2634616826166263, + "grad_norm": 2.171875, + "learning_rate": 9.836157766945467e-06, + "loss": 1.0986, + "step": 1318 + }, + { + "epoch": 0.2636615776717223, + "grad_norm": 2.25, + "learning_rate": 9.835890103700849e-06, + "loss": 1.1768, + "step": 1319 + }, + { + "epoch": 0.2638614727268184, + "grad_norm": 2.171875, + "learning_rate": 9.83562222564651e-06, + "loss": 1.0491, + "step": 1320 + }, + { + "epoch": 0.26406136778191447, + "grad_norm": 2.0625, + "learning_rate": 9.835354132794349e-06, + "loss": 1.0405, + "step": 1321 + }, + { + "epoch": 0.26426126283701057, + "grad_norm": 2.1875, + "learning_rate": 9.835085825156274e-06, + "loss": 1.1763, + "step": 1322 + }, + { + "epoch": 0.26446115789210667, + "grad_norm": 2.078125, + "learning_rate": 9.834817302744201e-06, + "loss": 1.11, + "step": 1323 + }, + { + "epoch": 0.2646610529472027, + "grad_norm": 2.140625, + "learning_rate": 9.834548565570062e-06, + "loss": 1.1882, + "step": 1324 + }, + { + "epoch": 0.2648609480022988, + "grad_norm": 2.171875, + "learning_rate": 9.834279613645791e-06, + "loss": 1.1312, + "step": 1325 + }, + { + "epoch": 0.26506084305739486, + "grad_norm": 2.375, + "learning_rate": 9.834010446983335e-06, + "loss": 1.1019, + "step": 1326 + }, + { + "epoch": 0.26526073811249096, + "grad_norm": 2.109375, + "learning_rate": 9.833741065594652e-06, + "loss": 1.1003, + "step": 1327 + }, + { + "epoch": 0.265460633167587, + "grad_norm": 2.15625, + "learning_rate": 9.833471469491708e-06, + "loss": 1.0937, + "step": 1328 + }, + { + "epoch": 0.2656605282226831, + "grad_norm": 2.171875, + "learning_rate": 9.833201658686478e-06, + "loss": 1.0801, + "step": 1329 + }, + { + "epoch": 0.26586042327777915, + "grad_norm": 2.140625, + "learning_rate": 9.832931633190943e-06, + "loss": 1.0232, + "step": 1330 + }, + { + "epoch": 0.26606031833287525, + "grad_norm": 2.15625, + "learning_rate": 9.832661393017104e-06, + "loss": 1.1369, + "step": 1331 + }, + { + "epoch": 0.2662602133879713, + "grad_norm": 2.09375, + "learning_rate": 9.83239093817696e-06, + "loss": 0.9929, + "step": 1332 + }, + { + "epoch": 0.2664601084430674, + "grad_norm": 2.1875, + "learning_rate": 9.83212026868253e-06, + "loss": 1.1219, + "step": 1333 + }, + { + "epoch": 0.2666600034981635, + "grad_norm": 2.0625, + "learning_rate": 9.831849384545831e-06, + "loss": 1.0917, + "step": 1334 + }, + { + "epoch": 0.26685989855325953, + "grad_norm": 2.203125, + "learning_rate": 9.8315782857789e-06, + "loss": 1.2081, + "step": 1335 + }, + { + "epoch": 0.26705979360835563, + "grad_norm": 2.171875, + "learning_rate": 9.831306972393778e-06, + "loss": 1.0511, + "step": 1336 + }, + { + "epoch": 0.2672596886634517, + "grad_norm": 2.09375, + "learning_rate": 9.831035444402514e-06, + "loss": 1.1524, + "step": 1337 + }, + { + "epoch": 0.2674595837185478, + "grad_norm": 1.9765625, + "learning_rate": 9.830763701817173e-06, + "loss": 1.0655, + "step": 1338 + }, + { + "epoch": 0.2676594787736438, + "grad_norm": 2.171875, + "learning_rate": 9.830491744649824e-06, + "loss": 1.1691, + "step": 1339 + }, + { + "epoch": 0.2678593738287399, + "grad_norm": 2.078125, + "learning_rate": 9.830219572912546e-06, + "loss": 1.0408, + "step": 1340 + }, + { + "epoch": 0.26805926888383597, + "grad_norm": 2.171875, + "learning_rate": 9.829947186617432e-06, + "loss": 1.0452, + "step": 1341 + }, + { + "epoch": 0.26825916393893207, + "grad_norm": 2.15625, + "learning_rate": 9.82967458577658e-06, + "loss": 1.1355, + "step": 1342 + }, + { + "epoch": 0.2684590589940281, + "grad_norm": 2.125, + "learning_rate": 9.829401770402099e-06, + "loss": 1.1821, + "step": 1343 + }, + { + "epoch": 0.2686589540491242, + "grad_norm": 2.203125, + "learning_rate": 9.829128740506107e-06, + "loss": 1.0181, + "step": 1344 + }, + { + "epoch": 0.2688588491042203, + "grad_norm": 2.109375, + "learning_rate": 9.828855496100733e-06, + "loss": 1.1149, + "step": 1345 + }, + { + "epoch": 0.26905874415931635, + "grad_norm": 2.125, + "learning_rate": 9.828582037198111e-06, + "loss": 1.0257, + "step": 1346 + }, + { + "epoch": 0.26925863921441245, + "grad_norm": 2.171875, + "learning_rate": 9.828308363810392e-06, + "loss": 1.1245, + "step": 1347 + }, + { + "epoch": 0.2694585342695085, + "grad_norm": 2.0625, + "learning_rate": 9.828034475949732e-06, + "loss": 1.066, + "step": 1348 + }, + { + "epoch": 0.2696584293246046, + "grad_norm": 2.234375, + "learning_rate": 9.827760373628295e-06, + "loss": 1.1373, + "step": 1349 + }, + { + "epoch": 0.26985832437970064, + "grad_norm": 2.25, + "learning_rate": 9.82748605685826e-06, + "loss": 1.2466, + "step": 1350 + }, + { + "epoch": 0.27005821943479674, + "grad_norm": 2.078125, + "learning_rate": 9.827211525651808e-06, + "loss": 1.1199, + "step": 1351 + }, + { + "epoch": 0.2702581144898928, + "grad_norm": 1.9921875, + "learning_rate": 9.826936780021134e-06, + "loss": 0.9884, + "step": 1352 + }, + { + "epoch": 0.2704580095449889, + "grad_norm": 2.046875, + "learning_rate": 9.826661819978446e-06, + "loss": 1.0505, + "step": 1353 + }, + { + "epoch": 0.27065790460008493, + "grad_norm": 2.09375, + "learning_rate": 9.826386645535955e-06, + "loss": 1.048, + "step": 1354 + }, + { + "epoch": 0.27085779965518103, + "grad_norm": 2.25, + "learning_rate": 9.826111256705885e-06, + "loss": 1.2033, + "step": 1355 + }, + { + "epoch": 0.27105769471027713, + "grad_norm": 2.09375, + "learning_rate": 9.825835653500468e-06, + "loss": 1.0404, + "step": 1356 + }, + { + "epoch": 0.2712575897653732, + "grad_norm": 2.078125, + "learning_rate": 9.825559835931948e-06, + "loss": 1.077, + "step": 1357 + }, + { + "epoch": 0.2714574848204693, + "grad_norm": 2.28125, + "learning_rate": 9.825283804012573e-06, + "loss": 1.1831, + "step": 1358 + }, + { + "epoch": 0.2716573798755653, + "grad_norm": 2.015625, + "learning_rate": 9.825007557754608e-06, + "loss": 0.9796, + "step": 1359 + }, + { + "epoch": 0.2718572749306614, + "grad_norm": 2.4375, + "learning_rate": 9.824731097170323e-06, + "loss": 1.0301, + "step": 1360 + }, + { + "epoch": 0.27205716998575746, + "grad_norm": 2.3125, + "learning_rate": 9.824454422271999e-06, + "loss": 1.1681, + "step": 1361 + }, + { + "epoch": 0.27225706504085356, + "grad_norm": 2.140625, + "learning_rate": 9.824177533071922e-06, + "loss": 1.0397, + "step": 1362 + }, + { + "epoch": 0.2724569600959496, + "grad_norm": 2.1875, + "learning_rate": 9.823900429582396e-06, + "loss": 1.0641, + "step": 1363 + }, + { + "epoch": 0.2726568551510457, + "grad_norm": 2.15625, + "learning_rate": 9.823623111815728e-06, + "loss": 1.0351, + "step": 1364 + }, + { + "epoch": 0.27285675020614175, + "grad_norm": 2.078125, + "learning_rate": 9.823345579784236e-06, + "loss": 1.076, + "step": 1365 + }, + { + "epoch": 0.27305664526123785, + "grad_norm": 2.140625, + "learning_rate": 9.823067833500248e-06, + "loss": 1.0386, + "step": 1366 + }, + { + "epoch": 0.27325654031633395, + "grad_norm": 2.09375, + "learning_rate": 9.822789872976105e-06, + "loss": 1.1059, + "step": 1367 + }, + { + "epoch": 0.27345643537143, + "grad_norm": 2.25, + "learning_rate": 9.822511698224147e-06, + "loss": 1.1233, + "step": 1368 + }, + { + "epoch": 0.2736563304265261, + "grad_norm": 2.3125, + "learning_rate": 9.822233309256738e-06, + "loss": 1.1211, + "step": 1369 + }, + { + "epoch": 0.27385622548162214, + "grad_norm": 2.109375, + "learning_rate": 9.821954706086237e-06, + "loss": 1.0413, + "step": 1370 + }, + { + "epoch": 0.27405612053671824, + "grad_norm": 2.15625, + "learning_rate": 9.821675888725025e-06, + "loss": 1.0646, + "step": 1371 + }, + { + "epoch": 0.2742560155918143, + "grad_norm": 2.15625, + "learning_rate": 9.821396857185484e-06, + "loss": 1.0904, + "step": 1372 + }, + { + "epoch": 0.2744559106469104, + "grad_norm": 2.3125, + "learning_rate": 9.821117611480011e-06, + "loss": 1.0808, + "step": 1373 + }, + { + "epoch": 0.27465580570200643, + "grad_norm": 2.125, + "learning_rate": 9.820838151621008e-06, + "loss": 1.0683, + "step": 1374 + }, + { + "epoch": 0.27485570075710253, + "grad_norm": 2.21875, + "learning_rate": 9.820558477620888e-06, + "loss": 1.198, + "step": 1375 + }, + { + "epoch": 0.27505559581219857, + "grad_norm": 2.1875, + "learning_rate": 9.820278589492076e-06, + "loss": 0.9376, + "step": 1376 + }, + { + "epoch": 0.27525549086729467, + "grad_norm": 2.078125, + "learning_rate": 9.819998487247004e-06, + "loss": 1.0398, + "step": 1377 + }, + { + "epoch": 0.27545538592239077, + "grad_norm": 2.03125, + "learning_rate": 9.819718170898116e-06, + "loss": 1.0014, + "step": 1378 + }, + { + "epoch": 0.2756552809774868, + "grad_norm": 2.171875, + "learning_rate": 9.819437640457858e-06, + "loss": 1.0894, + "step": 1379 + }, + { + "epoch": 0.2758551760325829, + "grad_norm": 2.109375, + "learning_rate": 9.819156895938697e-06, + "loss": 1.0558, + "step": 1380 + }, + { + "epoch": 0.27605507108767896, + "grad_norm": 2.0625, + "learning_rate": 9.8188759373531e-06, + "loss": 1.0198, + "step": 1381 + }, + { + "epoch": 0.27625496614277506, + "grad_norm": 2.296875, + "learning_rate": 9.81859476471355e-06, + "loss": 1.0753, + "step": 1382 + }, + { + "epoch": 0.2764548611978711, + "grad_norm": 2.125, + "learning_rate": 9.818313378032535e-06, + "loss": 1.1915, + "step": 1383 + }, + { + "epoch": 0.2766547562529672, + "grad_norm": 2.234375, + "learning_rate": 9.818031777322554e-06, + "loss": 1.081, + "step": 1384 + }, + { + "epoch": 0.27685465130806325, + "grad_norm": 2.140625, + "learning_rate": 9.817749962596115e-06, + "loss": 1.0739, + "step": 1385 + }, + { + "epoch": 0.27705454636315935, + "grad_norm": 2.171875, + "learning_rate": 9.817467933865739e-06, + "loss": 1.1438, + "step": 1386 + }, + { + "epoch": 0.2772544414182554, + "grad_norm": 2.125, + "learning_rate": 9.81718569114395e-06, + "loss": 1.0289, + "step": 1387 + }, + { + "epoch": 0.2774543364733515, + "grad_norm": 2.09375, + "learning_rate": 9.81690323444329e-06, + "loss": 1.1005, + "step": 1388 + }, + { + "epoch": 0.2776542315284476, + "grad_norm": 2.265625, + "learning_rate": 9.8166205637763e-06, + "loss": 1.0965, + "step": 1389 + }, + { + "epoch": 0.27785412658354364, + "grad_norm": 2.203125, + "learning_rate": 9.81633767915554e-06, + "loss": 1.1429, + "step": 1390 + }, + { + "epoch": 0.27805402163863974, + "grad_norm": 2.046875, + "learning_rate": 9.816054580593575e-06, + "loss": 1.0227, + "step": 1391 + }, + { + "epoch": 0.2782539166937358, + "grad_norm": 2.125, + "learning_rate": 9.81577126810298e-06, + "loss": 1.1028, + "step": 1392 + }, + { + "epoch": 0.2784538117488319, + "grad_norm": 2.296875, + "learning_rate": 9.815487741696339e-06, + "loss": 1.244, + "step": 1393 + }, + { + "epoch": 0.2786537068039279, + "grad_norm": 2.1875, + "learning_rate": 9.815204001386245e-06, + "loss": 1.0981, + "step": 1394 + }, + { + "epoch": 0.278853601859024, + "grad_norm": 2.0625, + "learning_rate": 9.814920047185306e-06, + "loss": 1.1177, + "step": 1395 + }, + { + "epoch": 0.27905349691412007, + "grad_norm": 2.09375, + "learning_rate": 9.814635879106134e-06, + "loss": 1.1771, + "step": 1396 + }, + { + "epoch": 0.27925339196921617, + "grad_norm": 2.234375, + "learning_rate": 9.814351497161348e-06, + "loss": 1.1397, + "step": 1397 + }, + { + "epoch": 0.2794532870243122, + "grad_norm": 2.296875, + "learning_rate": 9.814066901363584e-06, + "loss": 1.0615, + "step": 1398 + }, + { + "epoch": 0.2796531820794083, + "grad_norm": 2.15625, + "learning_rate": 9.81378209172548e-06, + "loss": 1.0988, + "step": 1399 + }, + { + "epoch": 0.27985307713450436, + "grad_norm": 2.09375, + "learning_rate": 9.813497068259692e-06, + "loss": 1.0661, + "step": 1400 + }, + { + "epoch": 0.28005297218960046, + "grad_norm": 2.03125, + "learning_rate": 9.813211830978879e-06, + "loss": 1.1077, + "step": 1401 + }, + { + "epoch": 0.28025286724469656, + "grad_norm": 2.171875, + "learning_rate": 9.812926379895708e-06, + "loss": 1.122, + "step": 1402 + }, + { + "epoch": 0.2804527622997926, + "grad_norm": 2.171875, + "learning_rate": 9.812640715022863e-06, + "loss": 1.173, + "step": 1403 + }, + { + "epoch": 0.2806526573548887, + "grad_norm": 2.078125, + "learning_rate": 9.812354836373031e-06, + "loss": 1.0561, + "step": 1404 + }, + { + "epoch": 0.28085255240998475, + "grad_norm": 2.109375, + "learning_rate": 9.812068743958912e-06, + "loss": 1.1136, + "step": 1405 + }, + { + "epoch": 0.28105244746508085, + "grad_norm": 2.125, + "learning_rate": 9.811782437793211e-06, + "loss": 1.0279, + "step": 1406 + }, + { + "epoch": 0.2812523425201769, + "grad_norm": 2.15625, + "learning_rate": 9.81149591788865e-06, + "loss": 1.0633, + "step": 1407 + }, + { + "epoch": 0.281452237575273, + "grad_norm": 2.171875, + "learning_rate": 9.811209184257953e-06, + "loss": 1.1056, + "step": 1408 + }, + { + "epoch": 0.28165213263036903, + "grad_norm": 2.125, + "learning_rate": 9.81092223691386e-06, + "loss": 1.0893, + "step": 1409 + }, + { + "epoch": 0.28185202768546513, + "grad_norm": 2.125, + "learning_rate": 9.810635075869113e-06, + "loss": 0.9967, + "step": 1410 + }, + { + "epoch": 0.2820519227405612, + "grad_norm": 2.09375, + "learning_rate": 9.81034770113647e-06, + "loss": 1.0597, + "step": 1411 + }, + { + "epoch": 0.2822518177956573, + "grad_norm": 2.140625, + "learning_rate": 9.810060112728696e-06, + "loss": 0.9984, + "step": 1412 + }, + { + "epoch": 0.2824517128507534, + "grad_norm": 2.09375, + "learning_rate": 9.809772310658567e-06, + "loss": 1.0279, + "step": 1413 + }, + { + "epoch": 0.2826516079058494, + "grad_norm": 2.234375, + "learning_rate": 9.809484294938864e-06, + "loss": 1.1348, + "step": 1414 + }, + { + "epoch": 0.2828515029609455, + "grad_norm": 2.140625, + "learning_rate": 9.809196065582383e-06, + "loss": 1.0826, + "step": 1415 + }, + { + "epoch": 0.28305139801604157, + "grad_norm": 2.171875, + "learning_rate": 9.808907622601926e-06, + "loss": 1.0565, + "step": 1416 + }, + { + "epoch": 0.28325129307113767, + "grad_norm": 2.140625, + "learning_rate": 9.808618966010306e-06, + "loss": 1.1655, + "step": 1417 + }, + { + "epoch": 0.2834511881262337, + "grad_norm": 2.21875, + "learning_rate": 9.808330095820346e-06, + "loss": 1.1342, + "step": 1418 + }, + { + "epoch": 0.2836510831813298, + "grad_norm": 2.109375, + "learning_rate": 9.808041012044875e-06, + "loss": 1.07, + "step": 1419 + }, + { + "epoch": 0.28385097823642585, + "grad_norm": 2.21875, + "learning_rate": 9.807751714696737e-06, + "loss": 1.0354, + "step": 1420 + }, + { + "epoch": 0.28405087329152195, + "grad_norm": 2.125, + "learning_rate": 9.807462203788782e-06, + "loss": 1.0195, + "step": 1421 + }, + { + "epoch": 0.284250768346618, + "grad_norm": 2.109375, + "learning_rate": 9.807172479333868e-06, + "loss": 1.1112, + "step": 1422 + }, + { + "epoch": 0.2844506634017141, + "grad_norm": 2.046875, + "learning_rate": 9.806882541344867e-06, + "loss": 1.0059, + "step": 1423 + }, + { + "epoch": 0.2846505584568102, + "grad_norm": 2.265625, + "learning_rate": 9.806592389834654e-06, + "loss": 1.1527, + "step": 1424 + }, + { + "epoch": 0.28485045351190624, + "grad_norm": 2.15625, + "learning_rate": 9.806302024816124e-06, + "loss": 1.0392, + "step": 1425 + }, + { + "epoch": 0.28505034856700234, + "grad_norm": 2.203125, + "learning_rate": 9.806011446302169e-06, + "loss": 1.1161, + "step": 1426 + }, + { + "epoch": 0.2852502436220984, + "grad_norm": 2.1875, + "learning_rate": 9.8057206543057e-06, + "loss": 1.0734, + "step": 1427 + }, + { + "epoch": 0.2854501386771945, + "grad_norm": 2.109375, + "learning_rate": 9.805429648839633e-06, + "loss": 1.0341, + "step": 1428 + }, + { + "epoch": 0.28565003373229053, + "grad_norm": 2.109375, + "learning_rate": 9.805138429916894e-06, + "loss": 1.1391, + "step": 1429 + }, + { + "epoch": 0.28584992878738663, + "grad_norm": 2.09375, + "learning_rate": 9.80484699755042e-06, + "loss": 1.1245, + "step": 1430 + }, + { + "epoch": 0.2860498238424827, + "grad_norm": 2.125, + "learning_rate": 9.804555351753153e-06, + "loss": 1.1387, + "step": 1431 + }, + { + "epoch": 0.2862497188975788, + "grad_norm": 2.75, + "learning_rate": 9.804263492538054e-06, + "loss": 1.0452, + "step": 1432 + }, + { + "epoch": 0.2864496139526748, + "grad_norm": 2.078125, + "learning_rate": 9.80397141991808e-06, + "loss": 1.0985, + "step": 1433 + }, + { + "epoch": 0.2866495090077709, + "grad_norm": 2.09375, + "learning_rate": 9.80367913390621e-06, + "loss": 0.9788, + "step": 1434 + }, + { + "epoch": 0.286849404062867, + "grad_norm": 2.09375, + "learning_rate": 9.803386634515427e-06, + "loss": 1.053, + "step": 1435 + }, + { + "epoch": 0.28704929911796306, + "grad_norm": 2.34375, + "learning_rate": 9.803093921758721e-06, + "loss": 1.109, + "step": 1436 + }, + { + "epoch": 0.28724919417305916, + "grad_norm": 2.265625, + "learning_rate": 9.802800995649098e-06, + "loss": 1.2099, + "step": 1437 + }, + { + "epoch": 0.2874490892281552, + "grad_norm": 2.03125, + "learning_rate": 9.802507856199567e-06, + "loss": 1.0597, + "step": 1438 + }, + { + "epoch": 0.2876489842832513, + "grad_norm": 2.171875, + "learning_rate": 9.802214503423149e-06, + "loss": 1.0489, + "step": 1439 + }, + { + "epoch": 0.28784887933834735, + "grad_norm": 2.140625, + "learning_rate": 9.801920937332876e-06, + "loss": 1.1361, + "step": 1440 + }, + { + "epoch": 0.28804877439344345, + "grad_norm": 2.09375, + "learning_rate": 9.801627157941788e-06, + "loss": 1.0418, + "step": 1441 + }, + { + "epoch": 0.2882486694485395, + "grad_norm": 2.21875, + "learning_rate": 9.801333165262936e-06, + "loss": 1.2393, + "step": 1442 + }, + { + "epoch": 0.2884485645036356, + "grad_norm": 2.1875, + "learning_rate": 9.801038959309376e-06, + "loss": 1.1047, + "step": 1443 + }, + { + "epoch": 0.28864845955873164, + "grad_norm": 2.140625, + "learning_rate": 9.800744540094178e-06, + "loss": 0.9954, + "step": 1444 + }, + { + "epoch": 0.28884835461382774, + "grad_norm": 2.1875, + "learning_rate": 9.80044990763042e-06, + "loss": 1.1797, + "step": 1445 + }, + { + "epoch": 0.28904824966892384, + "grad_norm": 2.234375, + "learning_rate": 9.800155061931192e-06, + "loss": 1.0957, + "step": 1446 + }, + { + "epoch": 0.2892481447240199, + "grad_norm": 2.046875, + "learning_rate": 9.799860003009587e-06, + "loss": 1.1394, + "step": 1447 + }, + { + "epoch": 0.289448039779116, + "grad_norm": 2.140625, + "learning_rate": 9.799564730878713e-06, + "loss": 1.0629, + "step": 1448 + }, + { + "epoch": 0.28964793483421203, + "grad_norm": 2.296875, + "learning_rate": 9.799269245551688e-06, + "loss": 1.1368, + "step": 1449 + }, + { + "epoch": 0.28984782988930813, + "grad_norm": 2.140625, + "learning_rate": 9.798973547041633e-06, + "loss": 1.0617, + "step": 1450 + }, + { + "epoch": 0.2900477249444042, + "grad_norm": 2.09375, + "learning_rate": 9.79867763536169e-06, + "loss": 1.0883, + "step": 1451 + }, + { + "epoch": 0.2902476199995003, + "grad_norm": 2.140625, + "learning_rate": 9.798381510524995e-06, + "loss": 1.042, + "step": 1452 + }, + { + "epoch": 0.2904475150545963, + "grad_norm": 2.265625, + "learning_rate": 9.798085172544707e-06, + "loss": 1.1617, + "step": 1453 + }, + { + "epoch": 0.2906474101096924, + "grad_norm": 2.015625, + "learning_rate": 9.797788621433987e-06, + "loss": 1.0817, + "step": 1454 + }, + { + "epoch": 0.29084730516478846, + "grad_norm": 2.234375, + "learning_rate": 9.797491857206009e-06, + "loss": 1.2027, + "step": 1455 + }, + { + "epoch": 0.29104720021988456, + "grad_norm": 1.9453125, + "learning_rate": 9.797194879873956e-06, + "loss": 1.1056, + "step": 1456 + }, + { + "epoch": 0.29124709527498066, + "grad_norm": 2.296875, + "learning_rate": 9.796897689451019e-06, + "loss": 1.1966, + "step": 1457 + }, + { + "epoch": 0.2914469903300767, + "grad_norm": 2.078125, + "learning_rate": 9.7966002859504e-06, + "loss": 1.1577, + "step": 1458 + }, + { + "epoch": 0.2916468853851728, + "grad_norm": 2.0625, + "learning_rate": 9.796302669385307e-06, + "loss": 1.0255, + "step": 1459 + }, + { + "epoch": 0.29184678044026885, + "grad_norm": 2.0625, + "learning_rate": 9.796004839768962e-06, + "loss": 1.1133, + "step": 1460 + }, + { + "epoch": 0.29204667549536495, + "grad_norm": 2.15625, + "learning_rate": 9.795706797114593e-06, + "loss": 1.0865, + "step": 1461 + }, + { + "epoch": 0.292246570550461, + "grad_norm": 2.0625, + "learning_rate": 9.795408541435443e-06, + "loss": 1.0332, + "step": 1462 + }, + { + "epoch": 0.2924464656055571, + "grad_norm": 2.109375, + "learning_rate": 9.795110072744756e-06, + "loss": 1.1121, + "step": 1463 + }, + { + "epoch": 0.29264636066065314, + "grad_norm": 2.21875, + "learning_rate": 9.794811391055793e-06, + "loss": 1.0971, + "step": 1464 + }, + { + "epoch": 0.29284625571574924, + "grad_norm": 2.046875, + "learning_rate": 9.79451249638182e-06, + "loss": 1.0235, + "step": 1465 + }, + { + "epoch": 0.2930461507708453, + "grad_norm": 2.078125, + "learning_rate": 9.79421338873611e-06, + "loss": 1.0476, + "step": 1466 + }, + { + "epoch": 0.2932460458259414, + "grad_norm": 2.171875, + "learning_rate": 9.793914068131959e-06, + "loss": 1.1475, + "step": 1467 + }, + { + "epoch": 0.2934459408810375, + "grad_norm": 2.328125, + "learning_rate": 9.793614534582653e-06, + "loss": 1.1651, + "step": 1468 + }, + { + "epoch": 0.2936458359361335, + "grad_norm": 2.15625, + "learning_rate": 9.793314788101502e-06, + "loss": 1.1456, + "step": 1469 + }, + { + "epoch": 0.2938457309912296, + "grad_norm": 2.109375, + "learning_rate": 9.793014828701822e-06, + "loss": 1.0554, + "step": 1470 + }, + { + "epoch": 0.29404562604632567, + "grad_norm": 2.21875, + "learning_rate": 9.792714656396934e-06, + "loss": 1.0414, + "step": 1471 + }, + { + "epoch": 0.29424552110142177, + "grad_norm": 2.09375, + "learning_rate": 9.792414271200173e-06, + "loss": 1.0277, + "step": 1472 + }, + { + "epoch": 0.2944454161565178, + "grad_norm": 2.171875, + "learning_rate": 9.79211367312488e-06, + "loss": 1.1543, + "step": 1473 + }, + { + "epoch": 0.2946453112116139, + "grad_norm": 2.21875, + "learning_rate": 9.791812862184413e-06, + "loss": 1.1505, + "step": 1474 + }, + { + "epoch": 0.29484520626670996, + "grad_norm": 2.375, + "learning_rate": 9.791511838392128e-06, + "loss": 1.0876, + "step": 1475 + }, + { + "epoch": 0.29504510132180606, + "grad_norm": 2.09375, + "learning_rate": 9.7912106017614e-06, + "loss": 1.1306, + "step": 1476 + }, + { + "epoch": 0.2952449963769021, + "grad_norm": 2.0625, + "learning_rate": 9.790909152305609e-06, + "loss": 1.0793, + "step": 1477 + }, + { + "epoch": 0.2954448914319982, + "grad_norm": 2.1875, + "learning_rate": 9.790607490038145e-06, + "loss": 1.0108, + "step": 1478 + }, + { + "epoch": 0.2956447864870943, + "grad_norm": 2.125, + "learning_rate": 9.790305614972407e-06, + "loss": 1.0751, + "step": 1479 + }, + { + "epoch": 0.29584468154219035, + "grad_norm": 2.0, + "learning_rate": 9.790003527121806e-06, + "loss": 1.0636, + "step": 1480 + }, + { + "epoch": 0.29604457659728645, + "grad_norm": 2.03125, + "learning_rate": 9.78970122649976e-06, + "loss": 1.1285, + "step": 1481 + }, + { + "epoch": 0.2962444716523825, + "grad_norm": 2.203125, + "learning_rate": 9.789398713119696e-06, + "loss": 1.0562, + "step": 1482 + }, + { + "epoch": 0.2964443667074786, + "grad_norm": 2.140625, + "learning_rate": 9.789095986995052e-06, + "loss": 1.1148, + "step": 1483 + }, + { + "epoch": 0.29664426176257463, + "grad_norm": 2.09375, + "learning_rate": 9.788793048139277e-06, + "loss": 1.0267, + "step": 1484 + }, + { + "epoch": 0.29684415681767073, + "grad_norm": 2.140625, + "learning_rate": 9.788489896565827e-06, + "loss": 1.0022, + "step": 1485 + }, + { + "epoch": 0.2970440518727668, + "grad_norm": 2.21875, + "learning_rate": 9.788186532288166e-06, + "loss": 1.0756, + "step": 1486 + }, + { + "epoch": 0.2972439469278629, + "grad_norm": 2.21875, + "learning_rate": 9.787882955319771e-06, + "loss": 1.1432, + "step": 1487 + }, + { + "epoch": 0.2974438419829589, + "grad_norm": 2.3125, + "learning_rate": 9.787579165674129e-06, + "loss": 1.1975, + "step": 1488 + }, + { + "epoch": 0.297643737038055, + "grad_norm": 2.171875, + "learning_rate": 9.787275163364729e-06, + "loss": 1.087, + "step": 1489 + }, + { + "epoch": 0.2978436320931511, + "grad_norm": 2.09375, + "learning_rate": 9.786970948405077e-06, + "loss": 1.0913, + "step": 1490 + }, + { + "epoch": 0.29804352714824717, + "grad_norm": 2.15625, + "learning_rate": 9.786666520808688e-06, + "loss": 1.0329, + "step": 1491 + }, + { + "epoch": 0.29824342220334327, + "grad_norm": 2.03125, + "learning_rate": 9.786361880589084e-06, + "loss": 1.1613, + "step": 1492 + }, + { + "epoch": 0.2984433172584393, + "grad_norm": 2.0625, + "learning_rate": 9.786057027759796e-06, + "loss": 1.0602, + "step": 1493 + }, + { + "epoch": 0.2986432123135354, + "grad_norm": 2.140625, + "learning_rate": 9.785751962334365e-06, + "loss": 1.125, + "step": 1494 + }, + { + "epoch": 0.29884310736863146, + "grad_norm": 2.171875, + "learning_rate": 9.785446684326345e-06, + "loss": 1.1915, + "step": 1495 + }, + { + "epoch": 0.29904300242372756, + "grad_norm": 2.03125, + "learning_rate": 9.785141193749292e-06, + "loss": 1.1184, + "step": 1496 + }, + { + "epoch": 0.2992428974788236, + "grad_norm": 2.015625, + "learning_rate": 9.78483549061678e-06, + "loss": 1.0091, + "step": 1497 + }, + { + "epoch": 0.2994427925339197, + "grad_norm": 2.03125, + "learning_rate": 9.784529574942385e-06, + "loss": 0.9844, + "step": 1498 + }, + { + "epoch": 0.29964268758901574, + "grad_norm": 2.15625, + "learning_rate": 9.784223446739698e-06, + "loss": 1.1051, + "step": 1499 + }, + { + "epoch": 0.29984258264411184, + "grad_norm": 2.109375, + "learning_rate": 9.783917106022316e-06, + "loss": 1.1233, + "step": 1500 + }, + { + "epoch": 0.29984258264411184, + "eval_loss": 0.954179048538208, + "eval_runtime": 595.9323, + "eval_samples_per_second": 3.588, + "eval_steps_per_second": 3.588, + "step": 1500 + }, + { + "epoch": 0.30004247769920794, + "grad_norm": 2.125, + "learning_rate": 9.783610552803849e-06, + "loss": 1.0654, + "step": 1501 + }, + { + "epoch": 0.300242372754304, + "grad_norm": 2.171875, + "learning_rate": 9.78330378709791e-06, + "loss": 1.0456, + "step": 1502 + }, + { + "epoch": 0.3004422678094001, + "grad_norm": 2.203125, + "learning_rate": 9.782996808918128e-06, + "loss": 1.1971, + "step": 1503 + }, + { + "epoch": 0.30064216286449613, + "grad_norm": 2.234375, + "learning_rate": 9.782689618278139e-06, + "loss": 1.1156, + "step": 1504 + }, + { + "epoch": 0.30084205791959223, + "grad_norm": 2.203125, + "learning_rate": 9.782382215191589e-06, + "loss": 1.1085, + "step": 1505 + }, + { + "epoch": 0.3010419529746883, + "grad_norm": 2.21875, + "learning_rate": 9.782074599672131e-06, + "loss": 1.0369, + "step": 1506 + }, + { + "epoch": 0.3012418480297844, + "grad_norm": 2.09375, + "learning_rate": 9.78176677173343e-06, + "loss": 1.2029, + "step": 1507 + }, + { + "epoch": 0.3014417430848804, + "grad_norm": 2.140625, + "learning_rate": 9.78145873138916e-06, + "loss": 1.1371, + "step": 1508 + }, + { + "epoch": 0.3016416381399765, + "grad_norm": 2.078125, + "learning_rate": 9.781150478653003e-06, + "loss": 1.1452, + "step": 1509 + }, + { + "epoch": 0.30184153319507256, + "grad_norm": 2.15625, + "learning_rate": 9.780842013538652e-06, + "loss": 1.0721, + "step": 1510 + }, + { + "epoch": 0.30204142825016866, + "grad_norm": 2.1875, + "learning_rate": 9.78053333605981e-06, + "loss": 1.1146, + "step": 1511 + }, + { + "epoch": 0.3022413233052647, + "grad_norm": 2.125, + "learning_rate": 9.780224446230188e-06, + "loss": 1.1378, + "step": 1512 + }, + { + "epoch": 0.3024412183603608, + "grad_norm": 2.21875, + "learning_rate": 9.779915344063506e-06, + "loss": 1.0507, + "step": 1513 + }, + { + "epoch": 0.3026411134154569, + "grad_norm": 2.171875, + "learning_rate": 9.779606029573496e-06, + "loss": 1.1384, + "step": 1514 + }, + { + "epoch": 0.30284100847055295, + "grad_norm": 2.203125, + "learning_rate": 9.779296502773896e-06, + "loss": 1.0204, + "step": 1515 + }, + { + "epoch": 0.30304090352564905, + "grad_norm": 2.125, + "learning_rate": 9.778986763678455e-06, + "loss": 1.0589, + "step": 1516 + }, + { + "epoch": 0.3032407985807451, + "grad_norm": 2.171875, + "learning_rate": 9.778676812300935e-06, + "loss": 1.11, + "step": 1517 + }, + { + "epoch": 0.3034406936358412, + "grad_norm": 2.125, + "learning_rate": 9.778366648655098e-06, + "loss": 1.0446, + "step": 1518 + }, + { + "epoch": 0.30364058869093724, + "grad_norm": 2.21875, + "learning_rate": 9.778056272754728e-06, + "loss": 1.0326, + "step": 1519 + }, + { + "epoch": 0.30384048374603334, + "grad_norm": 2.109375, + "learning_rate": 9.777745684613606e-06, + "loss": 1.0558, + "step": 1520 + }, + { + "epoch": 0.3040403788011294, + "grad_norm": 2.28125, + "learning_rate": 9.777434884245533e-06, + "loss": 1.0122, + "step": 1521 + }, + { + "epoch": 0.3042402738562255, + "grad_norm": 2.21875, + "learning_rate": 9.77712387166431e-06, + "loss": 1.0617, + "step": 1522 + }, + { + "epoch": 0.30444016891132153, + "grad_norm": 2.171875, + "learning_rate": 9.776812646883758e-06, + "loss": 1.1102, + "step": 1523 + }, + { + "epoch": 0.30464006396641763, + "grad_norm": 2.109375, + "learning_rate": 9.776501209917697e-06, + "loss": 1.1396, + "step": 1524 + }, + { + "epoch": 0.30483995902151373, + "grad_norm": 2.015625, + "learning_rate": 9.776189560779963e-06, + "loss": 0.9904, + "step": 1525 + }, + { + "epoch": 0.3050398540766098, + "grad_norm": 2.125, + "learning_rate": 9.775877699484397e-06, + "loss": 1.0014, + "step": 1526 + }, + { + "epoch": 0.3052397491317059, + "grad_norm": 2.296875, + "learning_rate": 9.775565626044856e-06, + "loss": 1.0889, + "step": 1527 + }, + { + "epoch": 0.3054396441868019, + "grad_norm": 2.109375, + "learning_rate": 9.775253340475199e-06, + "loss": 1.0932, + "step": 1528 + }, + { + "epoch": 0.305639539241898, + "grad_norm": 2.125, + "learning_rate": 9.774940842789298e-06, + "loss": 1.0992, + "step": 1529 + }, + { + "epoch": 0.30583943429699406, + "grad_norm": 2.09375, + "learning_rate": 9.774628133001037e-06, + "loss": 1.0882, + "step": 1530 + }, + { + "epoch": 0.30603932935209016, + "grad_norm": 2.203125, + "learning_rate": 9.7743152111243e-06, + "loss": 1.1073, + "step": 1531 + }, + { + "epoch": 0.3062392244071862, + "grad_norm": 2.09375, + "learning_rate": 9.774002077172994e-06, + "loss": 1.0974, + "step": 1532 + }, + { + "epoch": 0.3064391194622823, + "grad_norm": 2.109375, + "learning_rate": 9.773688731161027e-06, + "loss": 1.029, + "step": 1533 + }, + { + "epoch": 0.30663901451737835, + "grad_norm": 2.09375, + "learning_rate": 9.773375173102315e-06, + "loss": 1.1065, + "step": 1534 + }, + { + "epoch": 0.30683890957247445, + "grad_norm": 2.171875, + "learning_rate": 9.773061403010786e-06, + "loss": 1.0404, + "step": 1535 + }, + { + "epoch": 0.30703880462757055, + "grad_norm": 2.1875, + "learning_rate": 9.772747420900381e-06, + "loss": 1.0275, + "step": 1536 + }, + { + "epoch": 0.3072386996826666, + "grad_norm": 2.21875, + "learning_rate": 9.772433226785045e-06, + "loss": 1.0852, + "step": 1537 + }, + { + "epoch": 0.3074385947377627, + "grad_norm": 2.125, + "learning_rate": 9.772118820678735e-06, + "loss": 1.0726, + "step": 1538 + }, + { + "epoch": 0.30763848979285874, + "grad_norm": 2.15625, + "learning_rate": 9.771804202595417e-06, + "loss": 1.1003, + "step": 1539 + }, + { + "epoch": 0.30783838484795484, + "grad_norm": 2.28125, + "learning_rate": 9.771489372549064e-06, + "loss": 1.0986, + "step": 1540 + }, + { + "epoch": 0.3080382799030509, + "grad_norm": 2.109375, + "learning_rate": 9.771174330553665e-06, + "loss": 1.0746, + "step": 1541 + }, + { + "epoch": 0.308238174958147, + "grad_norm": 2.125, + "learning_rate": 9.770859076623211e-06, + "loss": 0.9782, + "step": 1542 + }, + { + "epoch": 0.308438070013243, + "grad_norm": 2.140625, + "learning_rate": 9.770543610771706e-06, + "loss": 1.1435, + "step": 1543 + }, + { + "epoch": 0.3086379650683391, + "grad_norm": 2.046875, + "learning_rate": 9.770227933013163e-06, + "loss": 1.0373, + "step": 1544 + }, + { + "epoch": 0.30883786012343517, + "grad_norm": 2.046875, + "learning_rate": 9.769912043361606e-06, + "loss": 1.0527, + "step": 1545 + }, + { + "epoch": 0.30903775517853127, + "grad_norm": 2.28125, + "learning_rate": 9.769595941831066e-06, + "loss": 1.185, + "step": 1546 + }, + { + "epoch": 0.30923765023362737, + "grad_norm": 2.078125, + "learning_rate": 9.76927962843558e-06, + "loss": 1.0581, + "step": 1547 + }, + { + "epoch": 0.3094375452887234, + "grad_norm": 2.15625, + "learning_rate": 9.768963103189206e-06, + "loss": 1.1129, + "step": 1548 + }, + { + "epoch": 0.3096374403438195, + "grad_norm": 2.125, + "learning_rate": 9.768646366105997e-06, + "loss": 1.1202, + "step": 1549 + }, + { + "epoch": 0.30983733539891556, + "grad_norm": 2.125, + "learning_rate": 9.768329417200029e-06, + "loss": 1.1176, + "step": 1550 + }, + { + "epoch": 0.31003723045401166, + "grad_norm": 2.078125, + "learning_rate": 9.768012256485376e-06, + "loss": 1.0759, + "step": 1551 + }, + { + "epoch": 0.3102371255091077, + "grad_norm": 2.234375, + "learning_rate": 9.767694883976128e-06, + "loss": 1.1666, + "step": 1552 + }, + { + "epoch": 0.3104370205642038, + "grad_norm": 1.921875, + "learning_rate": 9.767377299686382e-06, + "loss": 1.0544, + "step": 1553 + }, + { + "epoch": 0.31063691561929985, + "grad_norm": 2.1875, + "learning_rate": 9.767059503630247e-06, + "loss": 1.1891, + "step": 1554 + }, + { + "epoch": 0.31083681067439595, + "grad_norm": 2.234375, + "learning_rate": 9.766741495821838e-06, + "loss": 1.1982, + "step": 1555 + }, + { + "epoch": 0.311036705729492, + "grad_norm": 2.21875, + "learning_rate": 9.76642327627528e-06, + "loss": 1.0769, + "step": 1556 + }, + { + "epoch": 0.3112366007845881, + "grad_norm": 2.203125, + "learning_rate": 9.766104845004709e-06, + "loss": 1.1212, + "step": 1557 + }, + { + "epoch": 0.3114364958396842, + "grad_norm": 2.296875, + "learning_rate": 9.76578620202427e-06, + "loss": 1.0862, + "step": 1558 + }, + { + "epoch": 0.31163639089478024, + "grad_norm": 2.03125, + "learning_rate": 9.765467347348116e-06, + "loss": 1.0631, + "step": 1559 + }, + { + "epoch": 0.31183628594987634, + "grad_norm": 2.046875, + "learning_rate": 9.765148280990412e-06, + "loss": 1.0448, + "step": 1560 + }, + { + "epoch": 0.3120361810049724, + "grad_norm": 2.109375, + "learning_rate": 9.76482900296533e-06, + "loss": 1.0488, + "step": 1561 + }, + { + "epoch": 0.3122360760600685, + "grad_norm": 2.171875, + "learning_rate": 9.764509513287054e-06, + "loss": 1.0261, + "step": 1562 + }, + { + "epoch": 0.3124359711151645, + "grad_norm": 2.125, + "learning_rate": 9.764189811969773e-06, + "loss": 1.0469, + "step": 1563 + }, + { + "epoch": 0.3126358661702606, + "grad_norm": 2.15625, + "learning_rate": 9.763869899027689e-06, + "loss": 1.0626, + "step": 1564 + }, + { + "epoch": 0.31283576122535667, + "grad_norm": 2.03125, + "learning_rate": 9.763549774475014e-06, + "loss": 1.0377, + "step": 1565 + }, + { + "epoch": 0.31303565628045277, + "grad_norm": 2.0625, + "learning_rate": 9.763229438325968e-06, + "loss": 0.9987, + "step": 1566 + }, + { + "epoch": 0.3132355513355488, + "grad_norm": 2.125, + "learning_rate": 9.762908890594777e-06, + "loss": 1.0444, + "step": 1567 + }, + { + "epoch": 0.3134354463906449, + "grad_norm": 2.109375, + "learning_rate": 9.762588131295681e-06, + "loss": 1.1318, + "step": 1568 + }, + { + "epoch": 0.313635341445741, + "grad_norm": 2.140625, + "learning_rate": 9.76226716044293e-06, + "loss": 1.0587, + "step": 1569 + }, + { + "epoch": 0.31383523650083706, + "grad_norm": 2.234375, + "learning_rate": 9.761945978050782e-06, + "loss": 1.1694, + "step": 1570 + }, + { + "epoch": 0.31403513155593316, + "grad_norm": 2.09375, + "learning_rate": 9.7616245841335e-06, + "loss": 1.0964, + "step": 1571 + }, + { + "epoch": 0.3142350266110292, + "grad_norm": 2.046875, + "learning_rate": 9.761302978705364e-06, + "loss": 1.0968, + "step": 1572 + }, + { + "epoch": 0.3144349216661253, + "grad_norm": 2.1875, + "learning_rate": 9.760981161780657e-06, + "loss": 1.0919, + "step": 1573 + }, + { + "epoch": 0.31463481672122134, + "grad_norm": 2.109375, + "learning_rate": 9.760659133373675e-06, + "loss": 1.0793, + "step": 1574 + }, + { + "epoch": 0.31483471177631744, + "grad_norm": 2.0625, + "learning_rate": 9.760336893498724e-06, + "loss": 1.1369, + "step": 1575 + }, + { + "epoch": 0.3150346068314135, + "grad_norm": 2.21875, + "learning_rate": 9.760014442170116e-06, + "loss": 1.2367, + "step": 1576 + }, + { + "epoch": 0.3152345018865096, + "grad_norm": 2.109375, + "learning_rate": 9.759691779402175e-06, + "loss": 1.0322, + "step": 1577 + }, + { + "epoch": 0.31543439694160563, + "grad_norm": 2.40625, + "learning_rate": 9.759368905209234e-06, + "loss": 1.2766, + "step": 1578 + }, + { + "epoch": 0.31563429199670173, + "grad_norm": 2.0625, + "learning_rate": 9.759045819605635e-06, + "loss": 1.1224, + "step": 1579 + }, + { + "epoch": 0.31583418705179783, + "grad_norm": 2.125, + "learning_rate": 9.758722522605727e-06, + "loss": 1.1049, + "step": 1580 + }, + { + "epoch": 0.3160340821068939, + "grad_norm": 2.1875, + "learning_rate": 9.758399014223874e-06, + "loss": 1.1412, + "step": 1581 + }, + { + "epoch": 0.31623397716199, + "grad_norm": 2.078125, + "learning_rate": 9.758075294474445e-06, + "loss": 1.0146, + "step": 1582 + }, + { + "epoch": 0.316433872217086, + "grad_norm": 2.125, + "learning_rate": 9.75775136337182e-06, + "loss": 1.0551, + "step": 1583 + }, + { + "epoch": 0.3166337672721821, + "grad_norm": 2.3125, + "learning_rate": 9.757427220930387e-06, + "loss": 1.1061, + "step": 1584 + }, + { + "epoch": 0.31683366232727816, + "grad_norm": 2.09375, + "learning_rate": 9.757102867164544e-06, + "loss": 1.0484, + "step": 1585 + }, + { + "epoch": 0.31703355738237426, + "grad_norm": 2.625, + "learning_rate": 9.756778302088701e-06, + "loss": 1.0879, + "step": 1586 + }, + { + "epoch": 0.3172334524374703, + "grad_norm": 2.09375, + "learning_rate": 9.756453525717274e-06, + "loss": 1.0491, + "step": 1587 + }, + { + "epoch": 0.3174333474925664, + "grad_norm": 2.125, + "learning_rate": 9.75612853806469e-06, + "loss": 1.049, + "step": 1588 + }, + { + "epoch": 0.31763324254766245, + "grad_norm": 2.078125, + "learning_rate": 9.755803339145382e-06, + "loss": 1.1043, + "step": 1589 + }, + { + "epoch": 0.31783313760275855, + "grad_norm": 2.03125, + "learning_rate": 9.755477928973797e-06, + "loss": 1.0747, + "step": 1590 + }, + { + "epoch": 0.31803303265785465, + "grad_norm": 2.171875, + "learning_rate": 9.755152307564393e-06, + "loss": 1.031, + "step": 1591 + }, + { + "epoch": 0.3182329277129507, + "grad_norm": 2.171875, + "learning_rate": 9.75482647493163e-06, + "loss": 1.1383, + "step": 1592 + }, + { + "epoch": 0.3184328227680468, + "grad_norm": 2.1875, + "learning_rate": 9.754500431089984e-06, + "loss": 1.1015, + "step": 1593 + }, + { + "epoch": 0.31863271782314284, + "grad_norm": 2.09375, + "learning_rate": 9.754174176053936e-06, + "loss": 1.083, + "step": 1594 + }, + { + "epoch": 0.31883261287823894, + "grad_norm": 2.140625, + "learning_rate": 9.75384770983798e-06, + "loss": 1.004, + "step": 1595 + }, + { + "epoch": 0.319032507933335, + "grad_norm": 1.984375, + "learning_rate": 9.753521032456615e-06, + "loss": 1.0074, + "step": 1596 + }, + { + "epoch": 0.3192324029884311, + "grad_norm": 2.0625, + "learning_rate": 9.753194143924354e-06, + "loss": 1.1477, + "step": 1597 + }, + { + "epoch": 0.31943229804352713, + "grad_norm": 2.1875, + "learning_rate": 9.752867044255716e-06, + "loss": 1.0784, + "step": 1598 + }, + { + "epoch": 0.31963219309862323, + "grad_norm": 2.1875, + "learning_rate": 9.752539733465231e-06, + "loss": 1.0531, + "step": 1599 + }, + { + "epoch": 0.3198320881537193, + "grad_norm": 2.09375, + "learning_rate": 9.75221221156744e-06, + "loss": 1.08, + "step": 1600 + }, + { + "epoch": 0.3200319832088154, + "grad_norm": 2.09375, + "learning_rate": 9.75188447857689e-06, + "loss": 1.093, + "step": 1601 + }, + { + "epoch": 0.3202318782639115, + "grad_norm": 2.078125, + "learning_rate": 9.75155653450814e-06, + "loss": 1.1182, + "step": 1602 + }, + { + "epoch": 0.3204317733190075, + "grad_norm": 2.125, + "learning_rate": 9.751228379375754e-06, + "loss": 1.058, + "step": 1603 + }, + { + "epoch": 0.3206316683741036, + "grad_norm": 2.046875, + "learning_rate": 9.750900013194312e-06, + "loss": 1.0517, + "step": 1604 + }, + { + "epoch": 0.32083156342919966, + "grad_norm": 2.015625, + "learning_rate": 9.750571435978399e-06, + "loss": 0.9927, + "step": 1605 + }, + { + "epoch": 0.32103145848429576, + "grad_norm": 2.140625, + "learning_rate": 9.750242647742609e-06, + "loss": 1.1342, + "step": 1606 + }, + { + "epoch": 0.3212313535393918, + "grad_norm": 2.234375, + "learning_rate": 9.74991364850155e-06, + "loss": 1.0087, + "step": 1607 + }, + { + "epoch": 0.3214312485944879, + "grad_norm": 2.078125, + "learning_rate": 9.749584438269833e-06, + "loss": 1.1029, + "step": 1608 + }, + { + "epoch": 0.32163114364958395, + "grad_norm": 2.1875, + "learning_rate": 9.749255017062081e-06, + "loss": 1.1581, + "step": 1609 + }, + { + "epoch": 0.32183103870468005, + "grad_norm": 2.109375, + "learning_rate": 9.74892538489293e-06, + "loss": 1.0755, + "step": 1610 + }, + { + "epoch": 0.3220309337597761, + "grad_norm": 2.203125, + "learning_rate": 9.748595541777021e-06, + "loss": 1.1912, + "step": 1611 + }, + { + "epoch": 0.3222308288148722, + "grad_norm": 2.203125, + "learning_rate": 9.748265487729003e-06, + "loss": 1.165, + "step": 1612 + }, + { + "epoch": 0.3224307238699683, + "grad_norm": 2.078125, + "learning_rate": 9.747935222763542e-06, + "loss": 0.9882, + "step": 1613 + }, + { + "epoch": 0.32263061892506434, + "grad_norm": 2.09375, + "learning_rate": 9.747604746895303e-06, + "loss": 1.0114, + "step": 1614 + }, + { + "epoch": 0.32283051398016044, + "grad_norm": 2.125, + "learning_rate": 9.747274060138971e-06, + "loss": 1.0804, + "step": 1615 + }, + { + "epoch": 0.3230304090352565, + "grad_norm": 2.015625, + "learning_rate": 9.74694316250923e-06, + "loss": 0.9537, + "step": 1616 + }, + { + "epoch": 0.3232303040903526, + "grad_norm": 2.140625, + "learning_rate": 9.74661205402078e-06, + "loss": 1.1196, + "step": 1617 + }, + { + "epoch": 0.3234301991454486, + "grad_norm": 2.09375, + "learning_rate": 9.746280734688332e-06, + "loss": 1.01, + "step": 1618 + }, + { + "epoch": 0.3236300942005447, + "grad_norm": 2.03125, + "learning_rate": 9.7459492045266e-06, + "loss": 1.1002, + "step": 1619 + }, + { + "epoch": 0.32382998925564077, + "grad_norm": 2.09375, + "learning_rate": 9.74561746355031e-06, + "loss": 1.0775, + "step": 1620 + }, + { + "epoch": 0.32402988431073687, + "grad_norm": 2.21875, + "learning_rate": 9.7452855117742e-06, + "loss": 1.1096, + "step": 1621 + }, + { + "epoch": 0.3242297793658329, + "grad_norm": 2.15625, + "learning_rate": 9.744953349213016e-06, + "loss": 1.0339, + "step": 1622 + }, + { + "epoch": 0.324429674420929, + "grad_norm": 2.125, + "learning_rate": 9.74462097588151e-06, + "loss": 1.0775, + "step": 1623 + }, + { + "epoch": 0.32462956947602506, + "grad_norm": 2.046875, + "learning_rate": 9.744288391794446e-06, + "loss": 1.091, + "step": 1624 + }, + { + "epoch": 0.32482946453112116, + "grad_norm": 2.09375, + "learning_rate": 9.743955596966597e-06, + "loss": 1.037, + "step": 1625 + }, + { + "epoch": 0.32502935958621726, + "grad_norm": 2.125, + "learning_rate": 9.743622591412749e-06, + "loss": 1.0736, + "step": 1626 + }, + { + "epoch": 0.3252292546413133, + "grad_norm": 2.046875, + "learning_rate": 9.743289375147693e-06, + "loss": 1.0584, + "step": 1627 + }, + { + "epoch": 0.3254291496964094, + "grad_norm": 2.125, + "learning_rate": 9.742955948186228e-06, + "loss": 1.0357, + "step": 1628 + }, + { + "epoch": 0.32562904475150545, + "grad_norm": 2.109375, + "learning_rate": 9.742622310543165e-06, + "loss": 1.0085, + "step": 1629 + }, + { + "epoch": 0.32582893980660155, + "grad_norm": 2.15625, + "learning_rate": 9.742288462233329e-06, + "loss": 1.0853, + "step": 1630 + }, + { + "epoch": 0.3260288348616976, + "grad_norm": 2.078125, + "learning_rate": 9.741954403271543e-06, + "loss": 1.0205, + "step": 1631 + }, + { + "epoch": 0.3262287299167937, + "grad_norm": 2.109375, + "learning_rate": 9.741620133672651e-06, + "loss": 1.0081, + "step": 1632 + }, + { + "epoch": 0.32642862497188974, + "grad_norm": 2.140625, + "learning_rate": 9.741285653451497e-06, + "loss": 1.0474, + "step": 1633 + }, + { + "epoch": 0.32662852002698584, + "grad_norm": 2.1875, + "learning_rate": 9.740950962622943e-06, + "loss": 1.1232, + "step": 1634 + }, + { + "epoch": 0.3268284150820819, + "grad_norm": 2.125, + "learning_rate": 9.740616061201852e-06, + "loss": 1.0578, + "step": 1635 + }, + { + "epoch": 0.327028310137178, + "grad_norm": 2.1875, + "learning_rate": 9.740280949203102e-06, + "loss": 1.1024, + "step": 1636 + }, + { + "epoch": 0.3272282051922741, + "grad_norm": 2.078125, + "learning_rate": 9.739945626641579e-06, + "loss": 1.1667, + "step": 1637 + }, + { + "epoch": 0.3274281002473701, + "grad_norm": 2.140625, + "learning_rate": 9.739610093532176e-06, + "loss": 1.0927, + "step": 1638 + }, + { + "epoch": 0.3276279953024662, + "grad_norm": 2.125, + "learning_rate": 9.739274349889802e-06, + "loss": 1.0491, + "step": 1639 + }, + { + "epoch": 0.32782789035756227, + "grad_norm": 2.15625, + "learning_rate": 9.738938395729364e-06, + "loss": 0.9977, + "step": 1640 + }, + { + "epoch": 0.32802778541265837, + "grad_norm": 2.03125, + "learning_rate": 9.738602231065793e-06, + "loss": 0.9795, + "step": 1641 + }, + { + "epoch": 0.3282276804677544, + "grad_norm": 2.203125, + "learning_rate": 9.738265855914014e-06, + "loss": 1.1082, + "step": 1642 + }, + { + "epoch": 0.3284275755228505, + "grad_norm": 2.265625, + "learning_rate": 9.73792927028897e-06, + "loss": 1.046, + "step": 1643 + }, + { + "epoch": 0.32862747057794656, + "grad_norm": 2.03125, + "learning_rate": 9.737592474205617e-06, + "loss": 1.0181, + "step": 1644 + }, + { + "epoch": 0.32882736563304266, + "grad_norm": 2.125, + "learning_rate": 9.73725546767891e-06, + "loss": 1.0592, + "step": 1645 + }, + { + "epoch": 0.3290272606881387, + "grad_norm": 2.09375, + "learning_rate": 9.736918250723823e-06, + "loss": 1.0517, + "step": 1646 + }, + { + "epoch": 0.3292271557432348, + "grad_norm": 2.21875, + "learning_rate": 9.736580823355333e-06, + "loss": 1.0778, + "step": 1647 + }, + { + "epoch": 0.3294270507983309, + "grad_norm": 2.109375, + "learning_rate": 9.736243185588428e-06, + "loss": 1.038, + "step": 1648 + }, + { + "epoch": 0.32962694585342694, + "grad_norm": 2.078125, + "learning_rate": 9.735905337438107e-06, + "loss": 1.0171, + "step": 1649 + }, + { + "epoch": 0.32982684090852304, + "grad_norm": 2.1875, + "learning_rate": 9.735567278919376e-06, + "loss": 1.1386, + "step": 1650 + }, + { + "epoch": 0.3300267359636191, + "grad_norm": 2.15625, + "learning_rate": 9.735229010047253e-06, + "loss": 1.0989, + "step": 1651 + }, + { + "epoch": 0.3302266310187152, + "grad_norm": 2.03125, + "learning_rate": 9.734890530836763e-06, + "loss": 1.0239, + "step": 1652 + }, + { + "epoch": 0.33042652607381123, + "grad_norm": 2.0625, + "learning_rate": 9.734551841302941e-06, + "loss": 1.0697, + "step": 1653 + }, + { + "epoch": 0.33062642112890733, + "grad_norm": 2.15625, + "learning_rate": 9.734212941460833e-06, + "loss": 1.1375, + "step": 1654 + }, + { + "epoch": 0.3308263161840034, + "grad_norm": 2.15625, + "learning_rate": 9.73387383132549e-06, + "loss": 0.9819, + "step": 1655 + }, + { + "epoch": 0.3310262112390995, + "grad_norm": 1.9375, + "learning_rate": 9.733534510911977e-06, + "loss": 0.9949, + "step": 1656 + }, + { + "epoch": 0.3312261062941955, + "grad_norm": 2.03125, + "learning_rate": 9.733194980235367e-06, + "loss": 1.0075, + "step": 1657 + }, + { + "epoch": 0.3314260013492916, + "grad_norm": 2.390625, + "learning_rate": 9.732855239310743e-06, + "loss": 1.1145, + "step": 1658 + }, + { + "epoch": 0.3316258964043877, + "grad_norm": 2.25, + "learning_rate": 9.732515288153193e-06, + "loss": 1.0824, + "step": 1659 + }, + { + "epoch": 0.33182579145948377, + "grad_norm": 2.265625, + "learning_rate": 9.732175126777821e-06, + "loss": 1.1481, + "step": 1660 + }, + { + "epoch": 0.33202568651457987, + "grad_norm": 2.03125, + "learning_rate": 9.731834755199734e-06, + "loss": 0.9968, + "step": 1661 + }, + { + "epoch": 0.3322255815696759, + "grad_norm": 2.1875, + "learning_rate": 9.731494173434053e-06, + "loss": 1.053, + "step": 1662 + }, + { + "epoch": 0.332425476624772, + "grad_norm": 1.9453125, + "learning_rate": 9.731153381495905e-06, + "loss": 1.0219, + "step": 1663 + }, + { + "epoch": 0.33262537167986805, + "grad_norm": 2.09375, + "learning_rate": 9.730812379400432e-06, + "loss": 1.0924, + "step": 1664 + }, + { + "epoch": 0.33282526673496415, + "grad_norm": 2.203125, + "learning_rate": 9.730471167162776e-06, + "loss": 1.0849, + "step": 1665 + }, + { + "epoch": 0.3330251617900602, + "grad_norm": 2.125, + "learning_rate": 9.730129744798096e-06, + "loss": 1.2105, + "step": 1666 + }, + { + "epoch": 0.3332250568451563, + "grad_norm": 2.046875, + "learning_rate": 9.729788112321558e-06, + "loss": 1.0258, + "step": 1667 + }, + { + "epoch": 0.33342495190025234, + "grad_norm": 2.109375, + "learning_rate": 9.729446269748338e-06, + "loss": 1.1197, + "step": 1668 + }, + { + "epoch": 0.33362484695534844, + "grad_norm": 2.09375, + "learning_rate": 9.729104217093618e-06, + "loss": 1.0676, + "step": 1669 + }, + { + "epoch": 0.33382474201044454, + "grad_norm": 2.09375, + "learning_rate": 9.728761954372597e-06, + "loss": 1.0351, + "step": 1670 + }, + { + "epoch": 0.3340246370655406, + "grad_norm": 2.25, + "learning_rate": 9.728419481600472e-06, + "loss": 1.1705, + "step": 1671 + }, + { + "epoch": 0.3342245321206367, + "grad_norm": 2.09375, + "learning_rate": 9.728076798792461e-06, + "loss": 1.1647, + "step": 1672 + }, + { + "epoch": 0.33442442717573273, + "grad_norm": 2.078125, + "learning_rate": 9.727733905963783e-06, + "loss": 1.006, + "step": 1673 + }, + { + "epoch": 0.33462432223082883, + "grad_norm": 2.109375, + "learning_rate": 9.727390803129668e-06, + "loss": 1.0953, + "step": 1674 + }, + { + "epoch": 0.3348242172859249, + "grad_norm": 2.171875, + "learning_rate": 9.72704749030536e-06, + "loss": 1.0583, + "step": 1675 + }, + { + "epoch": 0.335024112341021, + "grad_norm": 2.1875, + "learning_rate": 9.726703967506107e-06, + "loss": 1.0887, + "step": 1676 + }, + { + "epoch": 0.335224007396117, + "grad_norm": 2.125, + "learning_rate": 9.726360234747168e-06, + "loss": 0.9653, + "step": 1677 + }, + { + "epoch": 0.3354239024512131, + "grad_norm": 2.125, + "learning_rate": 9.726016292043814e-06, + "loss": 0.9459, + "step": 1678 + }, + { + "epoch": 0.33562379750630916, + "grad_norm": 2.140625, + "learning_rate": 9.725672139411319e-06, + "loss": 1.0816, + "step": 1679 + }, + { + "epoch": 0.33582369256140526, + "grad_norm": 2.109375, + "learning_rate": 9.725327776864974e-06, + "loss": 1.0573, + "step": 1680 + }, + { + "epoch": 0.33602358761650136, + "grad_norm": 2.25, + "learning_rate": 9.724983204420073e-06, + "loss": 1.0713, + "step": 1681 + }, + { + "epoch": 0.3362234826715974, + "grad_norm": 2.140625, + "learning_rate": 9.724638422091922e-06, + "loss": 1.0342, + "step": 1682 + }, + { + "epoch": 0.3364233777266935, + "grad_norm": 2.109375, + "learning_rate": 9.724293429895836e-06, + "loss": 1.111, + "step": 1683 + }, + { + "epoch": 0.33662327278178955, + "grad_norm": 2.15625, + "learning_rate": 9.723948227847145e-06, + "loss": 0.9647, + "step": 1684 + }, + { + "epoch": 0.33682316783688565, + "grad_norm": 2.28125, + "learning_rate": 9.723602815961173e-06, + "loss": 1.0749, + "step": 1685 + }, + { + "epoch": 0.3370230628919817, + "grad_norm": 2.125, + "learning_rate": 9.723257194253272e-06, + "loss": 1.0911, + "step": 1686 + }, + { + "epoch": 0.3372229579470778, + "grad_norm": 2.09375, + "learning_rate": 9.72291136273879e-06, + "loss": 1.0592, + "step": 1687 + }, + { + "epoch": 0.33742285300217384, + "grad_norm": 2.03125, + "learning_rate": 9.722565321433088e-06, + "loss": 1.043, + "step": 1688 + }, + { + "epoch": 0.33762274805726994, + "grad_norm": 2.03125, + "learning_rate": 9.722219070351541e-06, + "loss": 1.1058, + "step": 1689 + }, + { + "epoch": 0.337822643112366, + "grad_norm": 2.15625, + "learning_rate": 9.721872609509526e-06, + "loss": 1.1402, + "step": 1690 + }, + { + "epoch": 0.3380225381674621, + "grad_norm": 2.140625, + "learning_rate": 9.721525938922434e-06, + "loss": 1.1047, + "step": 1691 + }, + { + "epoch": 0.3382224332225582, + "grad_norm": 2.09375, + "learning_rate": 9.721179058605664e-06, + "loss": 1.0193, + "step": 1692 + }, + { + "epoch": 0.3384223282776542, + "grad_norm": 2.140625, + "learning_rate": 9.720831968574625e-06, + "loss": 1.0501, + "step": 1693 + }, + { + "epoch": 0.3386222233327503, + "grad_norm": 2.28125, + "learning_rate": 9.720484668844734e-06, + "loss": 1.0406, + "step": 1694 + }, + { + "epoch": 0.33882211838784637, + "grad_norm": 2.0625, + "learning_rate": 9.720137159431418e-06, + "loss": 1.0744, + "step": 1695 + }, + { + "epoch": 0.33902201344294247, + "grad_norm": 2.125, + "learning_rate": 9.719789440350113e-06, + "loss": 1.0918, + "step": 1696 + }, + { + "epoch": 0.3392219084980385, + "grad_norm": 2.15625, + "learning_rate": 9.719441511616266e-06, + "loss": 1.0955, + "step": 1697 + }, + { + "epoch": 0.3394218035531346, + "grad_norm": 2.0625, + "learning_rate": 9.719093373245331e-06, + "loss": 1.0195, + "step": 1698 + }, + { + "epoch": 0.33962169860823066, + "grad_norm": 2.171875, + "learning_rate": 9.718745025252773e-06, + "loss": 1.0618, + "step": 1699 + }, + { + "epoch": 0.33982159366332676, + "grad_norm": 2.140625, + "learning_rate": 9.718396467654064e-06, + "loss": 1.1286, + "step": 1700 + }, + { + "epoch": 0.3400214887184228, + "grad_norm": 2.03125, + "learning_rate": 9.718047700464688e-06, + "loss": 1.0813, + "step": 1701 + }, + { + "epoch": 0.3402213837735189, + "grad_norm": 2.125, + "learning_rate": 9.717698723700137e-06, + "loss": 1.0753, + "step": 1702 + }, + { + "epoch": 0.340421278828615, + "grad_norm": 2.21875, + "learning_rate": 9.717349537375913e-06, + "loss": 1.1358, + "step": 1703 + }, + { + "epoch": 0.34062117388371105, + "grad_norm": 2.171875, + "learning_rate": 9.717000141507527e-06, + "loss": 0.9364, + "step": 1704 + }, + { + "epoch": 0.34082106893880715, + "grad_norm": 2.078125, + "learning_rate": 9.716650536110496e-06, + "loss": 1.0648, + "step": 1705 + }, + { + "epoch": 0.3410209639939032, + "grad_norm": 2.140625, + "learning_rate": 9.716300721200354e-06, + "loss": 1.0959, + "step": 1706 + }, + { + "epoch": 0.3412208590489993, + "grad_norm": 2.1875, + "learning_rate": 9.715950696792638e-06, + "loss": 1.009, + "step": 1707 + }, + { + "epoch": 0.34142075410409534, + "grad_norm": 2.390625, + "learning_rate": 9.715600462902895e-06, + "loss": 1.051, + "step": 1708 + }, + { + "epoch": 0.34162064915919144, + "grad_norm": 2.109375, + "learning_rate": 9.715250019546683e-06, + "loss": 0.999, + "step": 1709 + }, + { + "epoch": 0.3418205442142875, + "grad_norm": 1.9921875, + "learning_rate": 9.714899366739569e-06, + "loss": 0.9646, + "step": 1710 + }, + { + "epoch": 0.3420204392693836, + "grad_norm": 2.0625, + "learning_rate": 9.714548504497128e-06, + "loss": 1.0364, + "step": 1711 + }, + { + "epoch": 0.3422203343244796, + "grad_norm": 2.09375, + "learning_rate": 9.714197432834947e-06, + "loss": 1.0647, + "step": 1712 + }, + { + "epoch": 0.3424202293795757, + "grad_norm": 2.0625, + "learning_rate": 9.713846151768618e-06, + "loss": 1.12, + "step": 1713 + }, + { + "epoch": 0.3426201244346718, + "grad_norm": 1.9921875, + "learning_rate": 9.713494661313746e-06, + "loss": 1.033, + "step": 1714 + }, + { + "epoch": 0.34282001948976787, + "grad_norm": 2.109375, + "learning_rate": 9.713142961485947e-06, + "loss": 1.0881, + "step": 1715 + }, + { + "epoch": 0.34301991454486397, + "grad_norm": 2.21875, + "learning_rate": 9.71279105230084e-06, + "loss": 1.1303, + "step": 1716 + }, + { + "epoch": 0.34321980959996, + "grad_norm": 2.109375, + "learning_rate": 9.712438933774057e-06, + "loss": 1.1371, + "step": 1717 + }, + { + "epoch": 0.3434197046550561, + "grad_norm": 2.0625, + "learning_rate": 9.71208660592124e-06, + "loss": 0.968, + "step": 1718 + }, + { + "epoch": 0.34361959971015216, + "grad_norm": 2.1875, + "learning_rate": 9.71173406875804e-06, + "loss": 1.0274, + "step": 1719 + }, + { + "epoch": 0.34381949476524826, + "grad_norm": 2.25, + "learning_rate": 9.711381322300117e-06, + "loss": 0.9893, + "step": 1720 + }, + { + "epoch": 0.3440193898203443, + "grad_norm": 2.296875, + "learning_rate": 9.711028366563138e-06, + "loss": 1.171, + "step": 1721 + }, + { + "epoch": 0.3442192848754404, + "grad_norm": 2.140625, + "learning_rate": 9.71067520156278e-06, + "loss": 1.1283, + "step": 1722 + }, + { + "epoch": 0.34441917993053645, + "grad_norm": 2.078125, + "learning_rate": 9.710321827314735e-06, + "loss": 1.0001, + "step": 1723 + }, + { + "epoch": 0.34461907498563255, + "grad_norm": 2.40625, + "learning_rate": 9.709968243834698e-06, + "loss": 1.0845, + "step": 1724 + }, + { + "epoch": 0.34481897004072865, + "grad_norm": 2.125, + "learning_rate": 9.709614451138373e-06, + "loss": 1.0823, + "step": 1725 + }, + { + "epoch": 0.3450188650958247, + "grad_norm": 2.015625, + "learning_rate": 9.709260449241478e-06, + "loss": 1.058, + "step": 1726 + }, + { + "epoch": 0.3452187601509208, + "grad_norm": 2.15625, + "learning_rate": 9.708906238159736e-06, + "loss": 1.1112, + "step": 1727 + }, + { + "epoch": 0.34541865520601683, + "grad_norm": 2.140625, + "learning_rate": 9.708551817908883e-06, + "loss": 1.1158, + "step": 1728 + }, + { + "epoch": 0.34561855026111293, + "grad_norm": 2.21875, + "learning_rate": 9.70819718850466e-06, + "loss": 1.0908, + "step": 1729 + }, + { + "epoch": 0.345818445316209, + "grad_norm": 2.3125, + "learning_rate": 9.707842349962821e-06, + "loss": 1.1913, + "step": 1730 + }, + { + "epoch": 0.3460183403713051, + "grad_norm": 2.09375, + "learning_rate": 9.707487302299128e-06, + "loss": 1.0317, + "step": 1731 + }, + { + "epoch": 0.3462182354264011, + "grad_norm": 2.203125, + "learning_rate": 9.707132045529352e-06, + "loss": 1.1217, + "step": 1732 + }, + { + "epoch": 0.3464181304814972, + "grad_norm": 2.015625, + "learning_rate": 9.706776579669272e-06, + "loss": 1.0175, + "step": 1733 + }, + { + "epoch": 0.34661802553659327, + "grad_norm": 2.140625, + "learning_rate": 9.70642090473468e-06, + "loss": 1.1629, + "step": 1734 + }, + { + "epoch": 0.34681792059168937, + "grad_norm": 2.09375, + "learning_rate": 9.706065020741375e-06, + "loss": 1.0368, + "step": 1735 + }, + { + "epoch": 0.3470178156467854, + "grad_norm": 2.078125, + "learning_rate": 9.705708927705163e-06, + "loss": 1.0294, + "step": 1736 + }, + { + "epoch": 0.3472177107018815, + "grad_norm": 2.140625, + "learning_rate": 9.705352625641863e-06, + "loss": 1.0309, + "step": 1737 + }, + { + "epoch": 0.3474176057569776, + "grad_norm": 2.171875, + "learning_rate": 9.704996114567302e-06, + "loss": 1.079, + "step": 1738 + }, + { + "epoch": 0.34761750081207365, + "grad_norm": 2.03125, + "learning_rate": 9.704639394497317e-06, + "loss": 1.0315, + "step": 1739 + }, + { + "epoch": 0.34781739586716975, + "grad_norm": 2.390625, + "learning_rate": 9.704282465447752e-06, + "loss": 1.1318, + "step": 1740 + }, + { + "epoch": 0.3480172909222658, + "grad_norm": 2.140625, + "learning_rate": 9.703925327434462e-06, + "loss": 1.0676, + "step": 1741 + }, + { + "epoch": 0.3482171859773619, + "grad_norm": 2.140625, + "learning_rate": 9.703567980473311e-06, + "loss": 1.0814, + "step": 1742 + }, + { + "epoch": 0.34841708103245794, + "grad_norm": 2.171875, + "learning_rate": 9.703210424580174e-06, + "loss": 1.0813, + "step": 1743 + }, + { + "epoch": 0.34861697608755404, + "grad_norm": 2.203125, + "learning_rate": 9.702852659770932e-06, + "loss": 1.1354, + "step": 1744 + }, + { + "epoch": 0.3488168711426501, + "grad_norm": 2.125, + "learning_rate": 9.702494686061477e-06, + "loss": 0.9704, + "step": 1745 + }, + { + "epoch": 0.3490167661977462, + "grad_norm": 2.125, + "learning_rate": 9.70213650346771e-06, + "loss": 1.0792, + "step": 1746 + }, + { + "epoch": 0.34921666125284223, + "grad_norm": 2.1875, + "learning_rate": 9.70177811200554e-06, + "loss": 1.0346, + "step": 1747 + }, + { + "epoch": 0.34941655630793833, + "grad_norm": 2.203125, + "learning_rate": 9.701419511690891e-06, + "loss": 1.1517, + "step": 1748 + }, + { + "epoch": 0.34961645136303443, + "grad_norm": 2.09375, + "learning_rate": 9.701060702539689e-06, + "loss": 1.1122, + "step": 1749 + }, + { + "epoch": 0.3498163464181305, + "grad_norm": 2.296875, + "learning_rate": 9.700701684567872e-06, + "loss": 0.995, + "step": 1750 + }, + { + "epoch": 0.3500162414732266, + "grad_norm": 2.03125, + "learning_rate": 9.700342457791387e-06, + "loss": 1.0151, + "step": 1751 + }, + { + "epoch": 0.3502161365283226, + "grad_norm": 2.15625, + "learning_rate": 9.699983022226194e-06, + "loss": 1.1168, + "step": 1752 + }, + { + "epoch": 0.3504160315834187, + "grad_norm": 2.046875, + "learning_rate": 9.699623377888256e-06, + "loss": 1.0673, + "step": 1753 + }, + { + "epoch": 0.35061592663851476, + "grad_norm": 1.984375, + "learning_rate": 9.699263524793551e-06, + "loss": 1.0326, + "step": 1754 + }, + { + "epoch": 0.35081582169361086, + "grad_norm": 2.15625, + "learning_rate": 9.69890346295806e-06, + "loss": 1.1526, + "step": 1755 + }, + { + "epoch": 0.3510157167487069, + "grad_norm": 2.15625, + "learning_rate": 9.69854319239778e-06, + "loss": 1.101, + "step": 1756 + }, + { + "epoch": 0.351215611803803, + "grad_norm": 2.171875, + "learning_rate": 9.698182713128714e-06, + "loss": 1.1748, + "step": 1757 + }, + { + "epoch": 0.35141550685889905, + "grad_norm": 2.234375, + "learning_rate": 9.697822025166874e-06, + "loss": 1.1522, + "step": 1758 + }, + { + "epoch": 0.35161540191399515, + "grad_norm": 2.25, + "learning_rate": 9.697461128528281e-06, + "loss": 1.0798, + "step": 1759 + }, + { + "epoch": 0.35181529696909125, + "grad_norm": 2.140625, + "learning_rate": 9.697100023228966e-06, + "loss": 1.0741, + "step": 1760 + }, + { + "epoch": 0.3520151920241873, + "grad_norm": 2.03125, + "learning_rate": 9.69673870928497e-06, + "loss": 1.0234, + "step": 1761 + }, + { + "epoch": 0.3522150870792834, + "grad_norm": 2.140625, + "learning_rate": 9.69637718671234e-06, + "loss": 1.1072, + "step": 1762 + }, + { + "epoch": 0.35241498213437944, + "grad_norm": 2.140625, + "learning_rate": 9.696015455527139e-06, + "loss": 1.0557, + "step": 1763 + }, + { + "epoch": 0.35261487718947554, + "grad_norm": 2.203125, + "learning_rate": 9.695653515745433e-06, + "loss": 1.0192, + "step": 1764 + }, + { + "epoch": 0.3528147722445716, + "grad_norm": 2.1875, + "learning_rate": 9.6952913673833e-06, + "loss": 1.0337, + "step": 1765 + }, + { + "epoch": 0.3530146672996677, + "grad_norm": 2.0625, + "learning_rate": 9.694929010456824e-06, + "loss": 1.0215, + "step": 1766 + }, + { + "epoch": 0.35321456235476373, + "grad_norm": 2.140625, + "learning_rate": 9.694566444982104e-06, + "loss": 1.1584, + "step": 1767 + }, + { + "epoch": 0.35341445740985983, + "grad_norm": 2.03125, + "learning_rate": 9.694203670975244e-06, + "loss": 1.0854, + "step": 1768 + }, + { + "epoch": 0.3536143524649559, + "grad_norm": 2.15625, + "learning_rate": 9.693840688452358e-06, + "loss": 1.1391, + "step": 1769 + }, + { + "epoch": 0.35381424752005197, + "grad_norm": 2.25, + "learning_rate": 9.69347749742957e-06, + "loss": 0.9615, + "step": 1770 + }, + { + "epoch": 0.35401414257514807, + "grad_norm": 2.265625, + "learning_rate": 9.693114097923013e-06, + "loss": 1.0806, + "step": 1771 + }, + { + "epoch": 0.3542140376302441, + "grad_norm": 2.046875, + "learning_rate": 9.692750489948829e-06, + "loss": 1.1137, + "step": 1772 + }, + { + "epoch": 0.3544139326853402, + "grad_norm": 2.09375, + "learning_rate": 9.692386673523168e-06, + "loss": 1.004, + "step": 1773 + }, + { + "epoch": 0.35461382774043626, + "grad_norm": 2.203125, + "learning_rate": 9.692022648662193e-06, + "loss": 1.0561, + "step": 1774 + }, + { + "epoch": 0.35481372279553236, + "grad_norm": 2.171875, + "learning_rate": 9.691658415382073e-06, + "loss": 1.003, + "step": 1775 + }, + { + "epoch": 0.3550136178506284, + "grad_norm": 2.015625, + "learning_rate": 9.691293973698988e-06, + "loss": 1.0291, + "step": 1776 + }, + { + "epoch": 0.3552135129057245, + "grad_norm": 2.046875, + "learning_rate": 9.690929323629126e-06, + "loss": 1.0943, + "step": 1777 + }, + { + "epoch": 0.35541340796082055, + "grad_norm": 2.125, + "learning_rate": 9.690564465188684e-06, + "loss": 1.1177, + "step": 1778 + }, + { + "epoch": 0.35561330301591665, + "grad_norm": 2.09375, + "learning_rate": 9.69019939839387e-06, + "loss": 1.0692, + "step": 1779 + }, + { + "epoch": 0.3558131980710127, + "grad_norm": 2.09375, + "learning_rate": 9.689834123260896e-06, + "loss": 1.1017, + "step": 1780 + }, + { + "epoch": 0.3560130931261088, + "grad_norm": 2.25, + "learning_rate": 9.689468639805996e-06, + "loss": 1.0774, + "step": 1781 + }, + { + "epoch": 0.3562129881812049, + "grad_norm": 2.03125, + "learning_rate": 9.689102948045398e-06, + "loss": 1.0658, + "step": 1782 + }, + { + "epoch": 0.35641288323630094, + "grad_norm": 2.078125, + "learning_rate": 9.688737047995349e-06, + "loss": 1.0039, + "step": 1783 + }, + { + "epoch": 0.35661277829139704, + "grad_norm": 2.078125, + "learning_rate": 9.688370939672099e-06, + "loss": 1.0924, + "step": 1784 + }, + { + "epoch": 0.3568126733464931, + "grad_norm": 2.15625, + "learning_rate": 9.688004623091914e-06, + "loss": 1.1315, + "step": 1785 + }, + { + "epoch": 0.3570125684015892, + "grad_norm": 2.25, + "learning_rate": 9.687638098271064e-06, + "loss": 1.1243, + "step": 1786 + }, + { + "epoch": 0.3572124634566852, + "grad_norm": 2.140625, + "learning_rate": 9.68727136522583e-06, + "loss": 1.0848, + "step": 1787 + }, + { + "epoch": 0.3574123585117813, + "grad_norm": 2.203125, + "learning_rate": 9.686904423972502e-06, + "loss": 1.0546, + "step": 1788 + }, + { + "epoch": 0.35761225356687737, + "grad_norm": 2.140625, + "learning_rate": 9.68653727452738e-06, + "loss": 1.0223, + "step": 1789 + }, + { + "epoch": 0.35781214862197347, + "grad_norm": 2.046875, + "learning_rate": 9.686169916906773e-06, + "loss": 1.0758, + "step": 1790 + }, + { + "epoch": 0.3580120436770695, + "grad_norm": 2.078125, + "learning_rate": 9.685802351127e-06, + "loss": 1.0396, + "step": 1791 + }, + { + "epoch": 0.3582119387321656, + "grad_norm": 2.21875, + "learning_rate": 9.685434577204387e-06, + "loss": 1.0373, + "step": 1792 + }, + { + "epoch": 0.3584118337872617, + "grad_norm": 2.03125, + "learning_rate": 9.68506659515527e-06, + "loss": 1.0745, + "step": 1793 + }, + { + "epoch": 0.35861172884235776, + "grad_norm": 2.25, + "learning_rate": 9.684698404995995e-06, + "loss": 1.0851, + "step": 1794 + }, + { + "epoch": 0.35881162389745386, + "grad_norm": 2.171875, + "learning_rate": 9.684330006742916e-06, + "loss": 1.0869, + "step": 1795 + }, + { + "epoch": 0.3590115189525499, + "grad_norm": 2.328125, + "learning_rate": 9.6839614004124e-06, + "loss": 1.1189, + "step": 1796 + }, + { + "epoch": 0.359211414007646, + "grad_norm": 2.09375, + "learning_rate": 9.68359258602082e-06, + "loss": 1.0061, + "step": 1797 + }, + { + "epoch": 0.35941130906274205, + "grad_norm": 2.3125, + "learning_rate": 9.683223563584556e-06, + "loss": 1.0797, + "step": 1798 + }, + { + "epoch": 0.35961120411783815, + "grad_norm": 2.046875, + "learning_rate": 9.682854333120002e-06, + "loss": 1.0724, + "step": 1799 + }, + { + "epoch": 0.3598110991729342, + "grad_norm": 2.046875, + "learning_rate": 9.682484894643558e-06, + "loss": 1.0145, + "step": 1800 + }, + { + "epoch": 0.3600109942280303, + "grad_norm": 2.359375, + "learning_rate": 9.682115248171635e-06, + "loss": 1.1822, + "step": 1801 + }, + { + "epoch": 0.36021088928312633, + "grad_norm": 2.203125, + "learning_rate": 9.681745393720654e-06, + "loss": 1.1737, + "step": 1802 + }, + { + "epoch": 0.36041078433822243, + "grad_norm": 2.203125, + "learning_rate": 9.68137533130704e-06, + "loss": 1.1308, + "step": 1803 + }, + { + "epoch": 0.36061067939331853, + "grad_norm": 2.140625, + "learning_rate": 9.681005060947237e-06, + "loss": 0.9999, + "step": 1804 + }, + { + "epoch": 0.3608105744484146, + "grad_norm": 1.984375, + "learning_rate": 9.680634582657688e-06, + "loss": 1.0311, + "step": 1805 + }, + { + "epoch": 0.3610104695035107, + "grad_norm": 2.234375, + "learning_rate": 9.68026389645485e-06, + "loss": 1.0794, + "step": 1806 + }, + { + "epoch": 0.3612103645586067, + "grad_norm": 2.203125, + "learning_rate": 9.67989300235519e-06, + "loss": 1.0, + "step": 1807 + }, + { + "epoch": 0.3614102596137028, + "grad_norm": 2.0625, + "learning_rate": 9.679521900375183e-06, + "loss": 1.0749, + "step": 1808 + }, + { + "epoch": 0.36161015466879887, + "grad_norm": 2.109375, + "learning_rate": 9.679150590531312e-06, + "loss": 1.1228, + "step": 1809 + }, + { + "epoch": 0.36181004972389497, + "grad_norm": 2.125, + "learning_rate": 9.678779072840072e-06, + "loss": 1.0755, + "step": 1810 + }, + { + "epoch": 0.362009944778991, + "grad_norm": 2.234375, + "learning_rate": 9.678407347317967e-06, + "loss": 1.1757, + "step": 1811 + }, + { + "epoch": 0.3622098398340871, + "grad_norm": 2.09375, + "learning_rate": 9.678035413981505e-06, + "loss": 0.97, + "step": 1812 + }, + { + "epoch": 0.36240973488918315, + "grad_norm": 2.21875, + "learning_rate": 9.677663272847211e-06, + "loss": 0.9848, + "step": 1813 + }, + { + "epoch": 0.36260962994427925, + "grad_norm": 2.125, + "learning_rate": 9.677290923931613e-06, + "loss": 1.1265, + "step": 1814 + }, + { + "epoch": 0.36280952499937535, + "grad_norm": 2.21875, + "learning_rate": 9.676918367251251e-06, + "loss": 1.0995, + "step": 1815 + }, + { + "epoch": 0.3630094200544714, + "grad_norm": 2.0625, + "learning_rate": 9.676545602822675e-06, + "loss": 1.0715, + "step": 1816 + }, + { + "epoch": 0.3632093151095675, + "grad_norm": 2.125, + "learning_rate": 9.676172630662446e-06, + "loss": 1.0607, + "step": 1817 + }, + { + "epoch": 0.36340921016466354, + "grad_norm": 2.046875, + "learning_rate": 9.675799450787125e-06, + "loss": 1.0595, + "step": 1818 + }, + { + "epoch": 0.36360910521975964, + "grad_norm": 2.140625, + "learning_rate": 9.675426063213291e-06, + "loss": 1.0687, + "step": 1819 + }, + { + "epoch": 0.3638090002748557, + "grad_norm": 2.109375, + "learning_rate": 9.675052467957533e-06, + "loss": 1.0903, + "step": 1820 + }, + { + "epoch": 0.3640088953299518, + "grad_norm": 2.171875, + "learning_rate": 9.674678665036443e-06, + "loss": 1.1392, + "step": 1821 + }, + { + "epoch": 0.36420879038504783, + "grad_norm": 2.40625, + "learning_rate": 9.674304654466625e-06, + "loss": 1.0846, + "step": 1822 + }, + { + "epoch": 0.36440868544014393, + "grad_norm": 2.296875, + "learning_rate": 9.673930436264695e-06, + "loss": 1.1428, + "step": 1823 + }, + { + "epoch": 0.36460858049524, + "grad_norm": 2.265625, + "learning_rate": 9.673556010447274e-06, + "loss": 1.0965, + "step": 1824 + }, + { + "epoch": 0.3648084755503361, + "grad_norm": 2.046875, + "learning_rate": 9.673181377030993e-06, + "loss": 1.0701, + "step": 1825 + }, + { + "epoch": 0.3650083706054322, + "grad_norm": 2.03125, + "learning_rate": 9.672806536032495e-06, + "loss": 1.0332, + "step": 1826 + }, + { + "epoch": 0.3652082656605282, + "grad_norm": 1.9765625, + "learning_rate": 9.67243148746843e-06, + "loss": 1.0372, + "step": 1827 + }, + { + "epoch": 0.3654081607156243, + "grad_norm": 2.234375, + "learning_rate": 9.672056231355455e-06, + "loss": 1.1678, + "step": 1828 + }, + { + "epoch": 0.36560805577072036, + "grad_norm": 2.078125, + "learning_rate": 9.671680767710244e-06, + "loss": 1.0088, + "step": 1829 + }, + { + "epoch": 0.36580795082581646, + "grad_norm": 2.21875, + "learning_rate": 9.671305096549473e-06, + "loss": 1.1343, + "step": 1830 + }, + { + "epoch": 0.3660078458809125, + "grad_norm": 2.234375, + "learning_rate": 9.670929217889825e-06, + "loss": 0.9761, + "step": 1831 + }, + { + "epoch": 0.3662077409360086, + "grad_norm": 2.125, + "learning_rate": 9.670553131748003e-06, + "loss": 1.0866, + "step": 1832 + }, + { + "epoch": 0.36640763599110465, + "grad_norm": 2.25, + "learning_rate": 9.670176838140708e-06, + "loss": 1.1274, + "step": 1833 + }, + { + "epoch": 0.36660753104620075, + "grad_norm": 2.09375, + "learning_rate": 9.669800337084658e-06, + "loss": 1.1767, + "step": 1834 + }, + { + "epoch": 0.3668074261012968, + "grad_norm": 2.25, + "learning_rate": 9.669423628596574e-06, + "loss": 1.0194, + "step": 1835 + }, + { + "epoch": 0.3670073211563929, + "grad_norm": 2.0625, + "learning_rate": 9.669046712693192e-06, + "loss": 0.9397, + "step": 1836 + }, + { + "epoch": 0.367207216211489, + "grad_norm": 2.1875, + "learning_rate": 9.668669589391256e-06, + "loss": 1.0062, + "step": 1837 + }, + { + "epoch": 0.36740711126658504, + "grad_norm": 2.078125, + "learning_rate": 9.668292258707513e-06, + "loss": 1.0522, + "step": 1838 + }, + { + "epoch": 0.36760700632168114, + "grad_norm": 2.15625, + "learning_rate": 9.667914720658726e-06, + "loss": 1.0841, + "step": 1839 + }, + { + "epoch": 0.3678069013767772, + "grad_norm": 2.078125, + "learning_rate": 9.667536975261667e-06, + "loss": 1.1204, + "step": 1840 + }, + { + "epoch": 0.3680067964318733, + "grad_norm": 2.265625, + "learning_rate": 9.667159022533115e-06, + "loss": 1.0638, + "step": 1841 + }, + { + "epoch": 0.36820669148696933, + "grad_norm": 2.234375, + "learning_rate": 9.666780862489856e-06, + "loss": 1.035, + "step": 1842 + }, + { + "epoch": 0.36840658654206543, + "grad_norm": 2.125, + "learning_rate": 9.66640249514869e-06, + "loss": 1.2266, + "step": 1843 + }, + { + "epoch": 0.3686064815971615, + "grad_norm": 2.171875, + "learning_rate": 9.666023920526423e-06, + "loss": 1.0598, + "step": 1844 + }, + { + "epoch": 0.3688063766522576, + "grad_norm": 2.125, + "learning_rate": 9.665645138639872e-06, + "loss": 1.0532, + "step": 1845 + }, + { + "epoch": 0.3690062717073536, + "grad_norm": 2.1875, + "learning_rate": 9.665266149505863e-06, + "loss": 1.0488, + "step": 1846 + }, + { + "epoch": 0.3692061667624497, + "grad_norm": 2.078125, + "learning_rate": 9.664886953141228e-06, + "loss": 1.0693, + "step": 1847 + }, + { + "epoch": 0.36940606181754576, + "grad_norm": 2.125, + "learning_rate": 9.664507549562814e-06, + "loss": 1.016, + "step": 1848 + }, + { + "epoch": 0.36960595687264186, + "grad_norm": 2.140625, + "learning_rate": 9.664127938787473e-06, + "loss": 1.0602, + "step": 1849 + }, + { + "epoch": 0.36980585192773796, + "grad_norm": 2.265625, + "learning_rate": 9.663748120832069e-06, + "loss": 1.1527, + "step": 1850 + }, + { + "epoch": 0.370005746982834, + "grad_norm": 2.125, + "learning_rate": 9.663368095713468e-06, + "loss": 0.9813, + "step": 1851 + }, + { + "epoch": 0.3702056420379301, + "grad_norm": 2.09375, + "learning_rate": 9.662987863448556e-06, + "loss": 1.065, + "step": 1852 + }, + { + "epoch": 0.37040553709302615, + "grad_norm": 2.296875, + "learning_rate": 9.662607424054221e-06, + "loss": 1.0928, + "step": 1853 + }, + { + "epoch": 0.37060543214812225, + "grad_norm": 2.125, + "learning_rate": 9.662226777547363e-06, + "loss": 1.0619, + "step": 1854 + }, + { + "epoch": 0.3708053272032183, + "grad_norm": 2.125, + "learning_rate": 9.661845923944888e-06, + "loss": 1.1729, + "step": 1855 + }, + { + "epoch": 0.3710052222583144, + "grad_norm": 2.15625, + "learning_rate": 9.661464863263717e-06, + "loss": 1.0977, + "step": 1856 + }, + { + "epoch": 0.37120511731341044, + "grad_norm": 2.25, + "learning_rate": 9.661083595520772e-06, + "loss": 1.0552, + "step": 1857 + }, + { + "epoch": 0.37140501236850654, + "grad_norm": 2.15625, + "learning_rate": 9.660702120732992e-06, + "loss": 1.1044, + "step": 1858 + }, + { + "epoch": 0.3716049074236026, + "grad_norm": 2.09375, + "learning_rate": 9.660320438917323e-06, + "loss": 1.0601, + "step": 1859 + }, + { + "epoch": 0.3718048024786987, + "grad_norm": 2.09375, + "learning_rate": 9.659938550090717e-06, + "loss": 1.0558, + "step": 1860 + }, + { + "epoch": 0.3720046975337948, + "grad_norm": 2.140625, + "learning_rate": 9.659556454270139e-06, + "loss": 1.0663, + "step": 1861 + }, + { + "epoch": 0.3722045925888908, + "grad_norm": 2.0625, + "learning_rate": 9.659174151472562e-06, + "loss": 1.0182, + "step": 1862 + }, + { + "epoch": 0.3724044876439869, + "grad_norm": 2.015625, + "learning_rate": 9.658791641714965e-06, + "loss": 1.0283, + "step": 1863 + }, + { + "epoch": 0.37260438269908297, + "grad_norm": 2.125, + "learning_rate": 9.65840892501434e-06, + "loss": 1.0972, + "step": 1864 + }, + { + "epoch": 0.37280427775417907, + "grad_norm": 2.171875, + "learning_rate": 9.65802600138769e-06, + "loss": 1.0751, + "step": 1865 + }, + { + "epoch": 0.3730041728092751, + "grad_norm": 2.046875, + "learning_rate": 9.65764287085202e-06, + "loss": 1.076, + "step": 1866 + }, + { + "epoch": 0.3732040678643712, + "grad_norm": 2.0625, + "learning_rate": 9.657259533424355e-06, + "loss": 1.0158, + "step": 1867 + }, + { + "epoch": 0.37340396291946726, + "grad_norm": 2.125, + "learning_rate": 9.656875989121715e-06, + "loss": 1.0534, + "step": 1868 + }, + { + "epoch": 0.37360385797456336, + "grad_norm": 1.9609375, + "learning_rate": 9.656492237961143e-06, + "loss": 1.062, + "step": 1869 + }, + { + "epoch": 0.3738037530296594, + "grad_norm": 2.140625, + "learning_rate": 9.656108279959684e-06, + "loss": 1.111, + "step": 1870 + }, + { + "epoch": 0.3740036480847555, + "grad_norm": 2.1875, + "learning_rate": 9.65572411513439e-06, + "loss": 1.1052, + "step": 1871 + }, + { + "epoch": 0.3742035431398516, + "grad_norm": 2.390625, + "learning_rate": 9.65533974350233e-06, + "loss": 1.0701, + "step": 1872 + }, + { + "epoch": 0.37440343819494765, + "grad_norm": 2.09375, + "learning_rate": 9.654955165080575e-06, + "loss": 1.0559, + "step": 1873 + }, + { + "epoch": 0.37460333325004375, + "grad_norm": 1.890625, + "learning_rate": 9.65457037988621e-06, + "loss": 0.9771, + "step": 1874 + }, + { + "epoch": 0.3748032283051398, + "grad_norm": 2.28125, + "learning_rate": 9.654185387936323e-06, + "loss": 1.0589, + "step": 1875 + }, + { + "epoch": 0.3750031233602359, + "grad_norm": 2.171875, + "learning_rate": 9.653800189248022e-06, + "loss": 1.0885, + "step": 1876 + }, + { + "epoch": 0.37520301841533193, + "grad_norm": 2.03125, + "learning_rate": 9.65341478383841e-06, + "loss": 0.9952, + "step": 1877 + }, + { + "epoch": 0.37540291347042803, + "grad_norm": 2.171875, + "learning_rate": 9.653029171724612e-06, + "loss": 1.0742, + "step": 1878 + }, + { + "epoch": 0.3756028085255241, + "grad_norm": 2.046875, + "learning_rate": 9.652643352923756e-06, + "loss": 1.0288, + "step": 1879 + }, + { + "epoch": 0.3758027035806202, + "grad_norm": 2.078125, + "learning_rate": 9.652257327452978e-06, + "loss": 1.1885, + "step": 1880 + }, + { + "epoch": 0.3760025986357162, + "grad_norm": 2.109375, + "learning_rate": 9.651871095329425e-06, + "loss": 1.1295, + "step": 1881 + }, + { + "epoch": 0.3762024936908123, + "grad_norm": 2.203125, + "learning_rate": 9.651484656570257e-06, + "loss": 1.0479, + "step": 1882 + }, + { + "epoch": 0.3764023887459084, + "grad_norm": 2.09375, + "learning_rate": 9.651098011192638e-06, + "loss": 1.0396, + "step": 1883 + }, + { + "epoch": 0.37660228380100447, + "grad_norm": 2.21875, + "learning_rate": 9.65071115921374e-06, + "loss": 1.1473, + "step": 1884 + }, + { + "epoch": 0.37680217885610057, + "grad_norm": 2.109375, + "learning_rate": 9.65032410065075e-06, + "loss": 1.0292, + "step": 1885 + }, + { + "epoch": 0.3770020739111966, + "grad_norm": 2.109375, + "learning_rate": 9.64993683552086e-06, + "loss": 1.1086, + "step": 1886 + }, + { + "epoch": 0.3772019689662927, + "grad_norm": 2.109375, + "learning_rate": 9.649549363841273e-06, + "loss": 0.9923, + "step": 1887 + }, + { + "epoch": 0.37740186402138876, + "grad_norm": 2.140625, + "learning_rate": 9.649161685629199e-06, + "loss": 1.0775, + "step": 1888 + }, + { + "epoch": 0.37760175907648486, + "grad_norm": 2.21875, + "learning_rate": 9.64877380090186e-06, + "loss": 1.1234, + "step": 1889 + }, + { + "epoch": 0.3778016541315809, + "grad_norm": 2.0, + "learning_rate": 9.648385709676485e-06, + "loss": 1.0188, + "step": 1890 + }, + { + "epoch": 0.378001549186677, + "grad_norm": 2.140625, + "learning_rate": 9.647997411970313e-06, + "loss": 0.9725, + "step": 1891 + }, + { + "epoch": 0.37820144424177304, + "grad_norm": 2.171875, + "learning_rate": 9.647608907800593e-06, + "loss": 1.0811, + "step": 1892 + }, + { + "epoch": 0.37840133929686914, + "grad_norm": 2.109375, + "learning_rate": 9.647220197184582e-06, + "loss": 0.9877, + "step": 1893 + }, + { + "epoch": 0.37860123435196524, + "grad_norm": 2.140625, + "learning_rate": 9.646831280139544e-06, + "loss": 1.1433, + "step": 1894 + }, + { + "epoch": 0.3788011294070613, + "grad_norm": 2.140625, + "learning_rate": 9.646442156682758e-06, + "loss": 1.1211, + "step": 1895 + }, + { + "epoch": 0.3790010244621574, + "grad_norm": 2.140625, + "learning_rate": 9.646052826831509e-06, + "loss": 0.996, + "step": 1896 + }, + { + "epoch": 0.37920091951725343, + "grad_norm": 2.109375, + "learning_rate": 9.64566329060309e-06, + "loss": 1.088, + "step": 1897 + }, + { + "epoch": 0.37940081457234953, + "grad_norm": 2.171875, + "learning_rate": 9.6452735480148e-06, + "loss": 1.0102, + "step": 1898 + }, + { + "epoch": 0.3796007096274456, + "grad_norm": 2.140625, + "learning_rate": 9.644883599083959e-06, + "loss": 1.072, + "step": 1899 + }, + { + "epoch": 0.3798006046825417, + "grad_norm": 2.078125, + "learning_rate": 9.644493443827883e-06, + "loss": 1.113, + "step": 1900 + }, + { + "epoch": 0.3800004997376377, + "grad_norm": 2.28125, + "learning_rate": 9.644103082263904e-06, + "loss": 1.0365, + "step": 1901 + }, + { + "epoch": 0.3802003947927338, + "grad_norm": 2.171875, + "learning_rate": 9.643712514409362e-06, + "loss": 1.1374, + "step": 1902 + }, + { + "epoch": 0.38040028984782986, + "grad_norm": 2.03125, + "learning_rate": 9.643321740281606e-06, + "loss": 1.091, + "step": 1903 + }, + { + "epoch": 0.38060018490292596, + "grad_norm": 2.171875, + "learning_rate": 9.642930759897995e-06, + "loss": 1.1503, + "step": 1904 + }, + { + "epoch": 0.38080007995802206, + "grad_norm": 2.1875, + "learning_rate": 9.642539573275895e-06, + "loss": 1.1326, + "step": 1905 + }, + { + "epoch": 0.3809999750131181, + "grad_norm": 1.96875, + "learning_rate": 9.642148180432685e-06, + "loss": 1.0329, + "step": 1906 + }, + { + "epoch": 0.3811998700682142, + "grad_norm": 2.109375, + "learning_rate": 9.641756581385745e-06, + "loss": 1.1499, + "step": 1907 + }, + { + "epoch": 0.38139976512331025, + "grad_norm": 2.265625, + "learning_rate": 9.641364776152477e-06, + "loss": 1.1574, + "step": 1908 + }, + { + "epoch": 0.38159966017840635, + "grad_norm": 2.078125, + "learning_rate": 9.64097276475028e-06, + "loss": 1.0513, + "step": 1909 + }, + { + "epoch": 0.3817995552335024, + "grad_norm": 2.109375, + "learning_rate": 9.640580547196568e-06, + "loss": 1.0772, + "step": 1910 + }, + { + "epoch": 0.3819994502885985, + "grad_norm": 2.15625, + "learning_rate": 9.640188123508764e-06, + "loss": 1.0826, + "step": 1911 + }, + { + "epoch": 0.38219934534369454, + "grad_norm": 2.171875, + "learning_rate": 9.639795493704299e-06, + "loss": 1.1318, + "step": 1912 + }, + { + "epoch": 0.38239924039879064, + "grad_norm": 2.109375, + "learning_rate": 9.639402657800613e-06, + "loss": 1.0901, + "step": 1913 + }, + { + "epoch": 0.3825991354538867, + "grad_norm": 2.109375, + "learning_rate": 9.639009615815158e-06, + "loss": 1.07, + "step": 1914 + }, + { + "epoch": 0.3827990305089828, + "grad_norm": 2.0, + "learning_rate": 9.63861636776539e-06, + "loss": 1.103, + "step": 1915 + }, + { + "epoch": 0.3829989255640789, + "grad_norm": 2.15625, + "learning_rate": 9.63822291366878e-06, + "loss": 1.1312, + "step": 1916 + }, + { + "epoch": 0.38319882061917493, + "grad_norm": 2.078125, + "learning_rate": 9.637829253542801e-06, + "loss": 1.0867, + "step": 1917 + }, + { + "epoch": 0.38339871567427103, + "grad_norm": 2.0625, + "learning_rate": 9.637435387404943e-06, + "loss": 1.1388, + "step": 1918 + }, + { + "epoch": 0.3835986107293671, + "grad_norm": 2.15625, + "learning_rate": 9.637041315272702e-06, + "loss": 1.1109, + "step": 1919 + }, + { + "epoch": 0.3837985057844632, + "grad_norm": 2.015625, + "learning_rate": 9.63664703716358e-06, + "loss": 0.9584, + "step": 1920 + }, + { + "epoch": 0.3839984008395592, + "grad_norm": 2.078125, + "learning_rate": 9.636252553095091e-06, + "loss": 1.1159, + "step": 1921 + }, + { + "epoch": 0.3841982958946553, + "grad_norm": 2.171875, + "learning_rate": 9.635857863084758e-06, + "loss": 1.0773, + "step": 1922 + }, + { + "epoch": 0.38439819094975136, + "grad_norm": 2.15625, + "learning_rate": 9.635462967150116e-06, + "loss": 1.0794, + "step": 1923 + }, + { + "epoch": 0.38459808600484746, + "grad_norm": 2.109375, + "learning_rate": 9.635067865308703e-06, + "loss": 1.0855, + "step": 1924 + }, + { + "epoch": 0.3847979810599435, + "grad_norm": 2.140625, + "learning_rate": 9.634672557578072e-06, + "loss": 1.0506, + "step": 1925 + }, + { + "epoch": 0.3849978761150396, + "grad_norm": 2.125, + "learning_rate": 9.63427704397578e-06, + "loss": 0.9756, + "step": 1926 + }, + { + "epoch": 0.3851977711701357, + "grad_norm": 2.140625, + "learning_rate": 9.633881324519397e-06, + "loss": 1.0076, + "step": 1927 + }, + { + "epoch": 0.38539766622523175, + "grad_norm": 2.296875, + "learning_rate": 9.633485399226499e-06, + "loss": 1.0761, + "step": 1928 + }, + { + "epoch": 0.38559756128032785, + "grad_norm": 2.078125, + "learning_rate": 9.633089268114675e-06, + "loss": 1.0176, + "step": 1929 + }, + { + "epoch": 0.3857974563354239, + "grad_norm": 2.078125, + "learning_rate": 9.632692931201522e-06, + "loss": 1.0506, + "step": 1930 + }, + { + "epoch": 0.38599735139052, + "grad_norm": 2.046875, + "learning_rate": 9.632296388504641e-06, + "loss": 1.0752, + "step": 1931 + }, + { + "epoch": 0.38619724644561604, + "grad_norm": 2.1875, + "learning_rate": 9.631899640041652e-06, + "loss": 1.1649, + "step": 1932 + }, + { + "epoch": 0.38639714150071214, + "grad_norm": 2.328125, + "learning_rate": 9.631502685830175e-06, + "loss": 1.0854, + "step": 1933 + }, + { + "epoch": 0.3865970365558082, + "grad_norm": 2.078125, + "learning_rate": 9.631105525887841e-06, + "loss": 1.0688, + "step": 1934 + }, + { + "epoch": 0.3867969316109043, + "grad_norm": 2.09375, + "learning_rate": 9.630708160232297e-06, + "loss": 1.0149, + "step": 1935 + }, + { + "epoch": 0.3869968266660003, + "grad_norm": 2.21875, + "learning_rate": 9.63031058888119e-06, + "loss": 1.0706, + "step": 1936 + }, + { + "epoch": 0.3871967217210964, + "grad_norm": 2.0625, + "learning_rate": 9.629912811852181e-06, + "loss": 0.984, + "step": 1937 + }, + { + "epoch": 0.3873966167761925, + "grad_norm": 2.140625, + "learning_rate": 9.62951482916294e-06, + "loss": 1.07, + "step": 1938 + }, + { + "epoch": 0.38759651183128857, + "grad_norm": 2.265625, + "learning_rate": 9.629116640831144e-06, + "loss": 1.1219, + "step": 1939 + }, + { + "epoch": 0.38779640688638467, + "grad_norm": 2.09375, + "learning_rate": 9.628718246874482e-06, + "loss": 1.1367, + "step": 1940 + }, + { + "epoch": 0.3879963019414807, + "grad_norm": 2.078125, + "learning_rate": 9.628319647310648e-06, + "loss": 1.0883, + "step": 1941 + }, + { + "epoch": 0.3881961969965768, + "grad_norm": 2.171875, + "learning_rate": 9.627920842157352e-06, + "loss": 1.1226, + "step": 1942 + }, + { + "epoch": 0.38839609205167286, + "grad_norm": 2.21875, + "learning_rate": 9.627521831432308e-06, + "loss": 1.1547, + "step": 1943 + }, + { + "epoch": 0.38859598710676896, + "grad_norm": 1.9453125, + "learning_rate": 9.627122615153234e-06, + "loss": 1.0724, + "step": 1944 + }, + { + "epoch": 0.388795882161865, + "grad_norm": 2.1875, + "learning_rate": 9.62672319333787e-06, + "loss": 1.1552, + "step": 1945 + }, + { + "epoch": 0.3889957772169611, + "grad_norm": 2.046875, + "learning_rate": 9.626323566003955e-06, + "loss": 1.1128, + "step": 1946 + }, + { + "epoch": 0.38919567227205715, + "grad_norm": 2.265625, + "learning_rate": 9.625923733169242e-06, + "loss": 1.1447, + "step": 1947 + }, + { + "epoch": 0.38939556732715325, + "grad_norm": 2.03125, + "learning_rate": 9.62552369485149e-06, + "loss": 1.0015, + "step": 1948 + }, + { + "epoch": 0.38959546238224935, + "grad_norm": 2.21875, + "learning_rate": 9.625123451068472e-06, + "loss": 1.267, + "step": 1949 + }, + { + "epoch": 0.3897953574373454, + "grad_norm": 2.234375, + "learning_rate": 9.62472300183796e-06, + "loss": 1.0929, + "step": 1950 + }, + { + "epoch": 0.3899952524924415, + "grad_norm": 2.046875, + "learning_rate": 9.62432234717775e-06, + "loss": 0.9937, + "step": 1951 + }, + { + "epoch": 0.39019514754753754, + "grad_norm": 2.078125, + "learning_rate": 9.623921487105634e-06, + "loss": 1.1026, + "step": 1952 + }, + { + "epoch": 0.39039504260263364, + "grad_norm": 2.203125, + "learning_rate": 9.62352042163942e-06, + "loss": 1.0733, + "step": 1953 + }, + { + "epoch": 0.3905949376577297, + "grad_norm": 2.171875, + "learning_rate": 9.623119150796923e-06, + "loss": 1.0681, + "step": 1954 + }, + { + "epoch": 0.3907948327128258, + "grad_norm": 2.078125, + "learning_rate": 9.622717674595965e-06, + "loss": 1.0314, + "step": 1955 + }, + { + "epoch": 0.3909947277679218, + "grad_norm": 2.234375, + "learning_rate": 9.622315993054384e-06, + "loss": 1.1474, + "step": 1956 + }, + { + "epoch": 0.3911946228230179, + "grad_norm": 2.046875, + "learning_rate": 9.621914106190019e-06, + "loss": 1.0736, + "step": 1957 + }, + { + "epoch": 0.39139451787811397, + "grad_norm": 2.125, + "learning_rate": 9.621512014020722e-06, + "loss": 1.0334, + "step": 1958 + }, + { + "epoch": 0.39159441293321007, + "grad_norm": 2.15625, + "learning_rate": 9.621109716564357e-06, + "loss": 1.1237, + "step": 1959 + }, + { + "epoch": 0.3917943079883061, + "grad_norm": 2.09375, + "learning_rate": 9.620707213838789e-06, + "loss": 1.0922, + "step": 1960 + }, + { + "epoch": 0.3919942030434022, + "grad_norm": 2.109375, + "learning_rate": 9.620304505861902e-06, + "loss": 1.0834, + "step": 1961 + }, + { + "epoch": 0.3921940980984983, + "grad_norm": 2.015625, + "learning_rate": 9.619901592651582e-06, + "loss": 1.0662, + "step": 1962 + }, + { + "epoch": 0.39239399315359436, + "grad_norm": 2.0625, + "learning_rate": 9.619498474225729e-06, + "loss": 1.0044, + "step": 1963 + }, + { + "epoch": 0.39259388820869046, + "grad_norm": 2.171875, + "learning_rate": 9.619095150602243e-06, + "loss": 1.0423, + "step": 1964 + }, + { + "epoch": 0.3927937832637865, + "grad_norm": 2.15625, + "learning_rate": 9.618691621799047e-06, + "loss": 1.0499, + "step": 1965 + }, + { + "epoch": 0.3929936783188826, + "grad_norm": 2.09375, + "learning_rate": 9.618287887834062e-06, + "loss": 0.9927, + "step": 1966 + }, + { + "epoch": 0.39319357337397864, + "grad_norm": 2.109375, + "learning_rate": 9.617883948725219e-06, + "loss": 0.9855, + "step": 1967 + }, + { + "epoch": 0.39339346842907474, + "grad_norm": 2.25, + "learning_rate": 9.617479804490468e-06, + "loss": 1.0272, + "step": 1968 + }, + { + "epoch": 0.3935933634841708, + "grad_norm": 2.171875, + "learning_rate": 9.617075455147757e-06, + "loss": 1.1507, + "step": 1969 + }, + { + "epoch": 0.3937932585392669, + "grad_norm": 2.0625, + "learning_rate": 9.616670900715046e-06, + "loss": 1.0953, + "step": 1970 + }, + { + "epoch": 0.39399315359436293, + "grad_norm": 2.109375, + "learning_rate": 9.616266141210306e-06, + "loss": 1.0569, + "step": 1971 + }, + { + "epoch": 0.39419304864945903, + "grad_norm": 2.125, + "learning_rate": 9.615861176651519e-06, + "loss": 1.1121, + "step": 1972 + }, + { + "epoch": 0.39439294370455513, + "grad_norm": 2.078125, + "learning_rate": 9.61545600705667e-06, + "loss": 1.1301, + "step": 1973 + }, + { + "epoch": 0.3945928387596512, + "grad_norm": 2.203125, + "learning_rate": 9.615050632443759e-06, + "loss": 1.0539, + "step": 1974 + }, + { + "epoch": 0.3947927338147473, + "grad_norm": 2.0625, + "learning_rate": 9.614645052830791e-06, + "loss": 1.0629, + "step": 1975 + }, + { + "epoch": 0.3949926288698433, + "grad_norm": 2.015625, + "learning_rate": 9.614239268235783e-06, + "loss": 1.0353, + "step": 1976 + }, + { + "epoch": 0.3951925239249394, + "grad_norm": 2.34375, + "learning_rate": 9.613833278676762e-06, + "loss": 1.1782, + "step": 1977 + }, + { + "epoch": 0.39539241898003546, + "grad_norm": 2.25, + "learning_rate": 9.613427084171755e-06, + "loss": 1.0323, + "step": 1978 + }, + { + "epoch": 0.39559231403513156, + "grad_norm": 2.109375, + "learning_rate": 9.613020684738813e-06, + "loss": 1.146, + "step": 1979 + }, + { + "epoch": 0.3957922090902276, + "grad_norm": 2.046875, + "learning_rate": 9.612614080395983e-06, + "loss": 1.0603, + "step": 1980 + }, + { + "epoch": 0.3959921041453237, + "grad_norm": 2.046875, + "learning_rate": 9.612207271161328e-06, + "loss": 1.0065, + "step": 1981 + }, + { + "epoch": 0.39619199920041975, + "grad_norm": 2.21875, + "learning_rate": 9.61180025705292e-06, + "loss": 1.0248, + "step": 1982 + }, + { + "epoch": 0.39639189425551585, + "grad_norm": 2.234375, + "learning_rate": 9.611393038088839e-06, + "loss": 1.0946, + "step": 1983 + }, + { + "epoch": 0.39659178931061195, + "grad_norm": 2.25, + "learning_rate": 9.610985614287168e-06, + "loss": 1.049, + "step": 1984 + }, + { + "epoch": 0.396791684365708, + "grad_norm": 2.09375, + "learning_rate": 9.61057798566601e-06, + "loss": 1.0451, + "step": 1985 + }, + { + "epoch": 0.3969915794208041, + "grad_norm": 2.1875, + "learning_rate": 9.61017015224347e-06, + "loss": 1.1273, + "step": 1986 + }, + { + "epoch": 0.39719147447590014, + "grad_norm": 2.109375, + "learning_rate": 9.609762114037665e-06, + "loss": 1.1022, + "step": 1987 + }, + { + "epoch": 0.39739136953099624, + "grad_norm": 2.234375, + "learning_rate": 9.609353871066719e-06, + "loss": 0.9584, + "step": 1988 + }, + { + "epoch": 0.3975912645860923, + "grad_norm": 2.25, + "learning_rate": 9.608945423348766e-06, + "loss": 1.1338, + "step": 1989 + }, + { + "epoch": 0.3977911596411884, + "grad_norm": 2.265625, + "learning_rate": 9.60853677090195e-06, + "loss": 1.0808, + "step": 1990 + }, + { + "epoch": 0.39799105469628443, + "grad_norm": 2.171875, + "learning_rate": 9.608127913744421e-06, + "loss": 1.0717, + "step": 1991 + }, + { + "epoch": 0.39819094975138053, + "grad_norm": 2.046875, + "learning_rate": 9.607718851894346e-06, + "loss": 1.066, + "step": 1992 + }, + { + "epoch": 0.3983908448064766, + "grad_norm": 2.171875, + "learning_rate": 9.607309585369889e-06, + "loss": 1.1569, + "step": 1993 + }, + { + "epoch": 0.3985907398615727, + "grad_norm": 2.171875, + "learning_rate": 9.606900114189231e-06, + "loss": 1.0631, + "step": 1994 + }, + { + "epoch": 0.3987906349166688, + "grad_norm": 2.09375, + "learning_rate": 9.606490438370563e-06, + "loss": 1.0811, + "step": 1995 + }, + { + "epoch": 0.3989905299717648, + "grad_norm": 2.203125, + "learning_rate": 9.606080557932084e-06, + "loss": 1.15, + "step": 1996 + }, + { + "epoch": 0.3991904250268609, + "grad_norm": 2.140625, + "learning_rate": 9.605670472891998e-06, + "loss": 1.0575, + "step": 1997 + }, + { + "epoch": 0.39939032008195696, + "grad_norm": 2.0625, + "learning_rate": 9.605260183268519e-06, + "loss": 0.9726, + "step": 1998 + }, + { + "epoch": 0.39959021513705306, + "grad_norm": 2.8125, + "learning_rate": 9.604849689079875e-06, + "loss": 1.0419, + "step": 1999 + }, + { + "epoch": 0.3997901101921491, + "grad_norm": 2.109375, + "learning_rate": 9.604438990344303e-06, + "loss": 1.0341, + "step": 2000 + }, + { + "epoch": 0.3999900052472452, + "grad_norm": 2.21875, + "learning_rate": 9.60402808708004e-06, + "loss": 1.0606, + "step": 2001 + }, + { + "epoch": 0.40018990030234125, + "grad_norm": 2.0625, + "learning_rate": 9.60361697930534e-06, + "loss": 1.011, + "step": 2002 + }, + { + "epoch": 0.40038979535743735, + "grad_norm": 2.09375, + "learning_rate": 9.603205667038468e-06, + "loss": 1.0795, + "step": 2003 + }, + { + "epoch": 0.4005896904125334, + "grad_norm": 2.171875, + "learning_rate": 9.60279415029769e-06, + "loss": 0.9848, + "step": 2004 + }, + { + "epoch": 0.4007895854676295, + "grad_norm": 1.96875, + "learning_rate": 9.60238242910129e-06, + "loss": 0.9808, + "step": 2005 + }, + { + "epoch": 0.4009894805227256, + "grad_norm": 2.03125, + "learning_rate": 9.601970503467551e-06, + "loss": 1.0222, + "step": 2006 + }, + { + "epoch": 0.40118937557782164, + "grad_norm": 2.15625, + "learning_rate": 9.601558373414776e-06, + "loss": 1.0831, + "step": 2007 + }, + { + "epoch": 0.40138927063291774, + "grad_norm": 2.015625, + "learning_rate": 9.601146038961267e-06, + "loss": 0.9672, + "step": 2008 + }, + { + "epoch": 0.4015891656880138, + "grad_norm": 2.0625, + "learning_rate": 9.600733500125345e-06, + "loss": 1.0311, + "step": 2009 + }, + { + "epoch": 0.4017890607431099, + "grad_norm": 2.140625, + "learning_rate": 9.600320756925332e-06, + "loss": 1.1057, + "step": 2010 + }, + { + "epoch": 0.4019889557982059, + "grad_norm": 2.296875, + "learning_rate": 9.59990780937956e-06, + "loss": 1.1822, + "step": 2011 + }, + { + "epoch": 0.402188850853302, + "grad_norm": 2.1875, + "learning_rate": 9.599494657506376e-06, + "loss": 1.1193, + "step": 2012 + }, + { + "epoch": 0.40238874590839807, + "grad_norm": 2.078125, + "learning_rate": 9.599081301324132e-06, + "loss": 1.1015, + "step": 2013 + }, + { + "epoch": 0.40258864096349417, + "grad_norm": 1.9921875, + "learning_rate": 9.598667740851187e-06, + "loss": 1.0014, + "step": 2014 + }, + { + "epoch": 0.4027885360185902, + "grad_norm": 2.125, + "learning_rate": 9.59825397610591e-06, + "loss": 1.0768, + "step": 2015 + }, + { + "epoch": 0.4029884310736863, + "grad_norm": 3.21875, + "learning_rate": 9.597840007106685e-06, + "loss": 1.0659, + "step": 2016 + }, + { + "epoch": 0.4031883261287824, + "grad_norm": 2.109375, + "learning_rate": 9.597425833871896e-06, + "loss": 1.0412, + "step": 2017 + }, + { + "epoch": 0.40338822118387846, + "grad_norm": 2.0625, + "learning_rate": 9.597011456419943e-06, + "loss": 1.0398, + "step": 2018 + }, + { + "epoch": 0.40358811623897456, + "grad_norm": 2.1875, + "learning_rate": 9.596596874769232e-06, + "loss": 1.1117, + "step": 2019 + }, + { + "epoch": 0.4037880112940706, + "grad_norm": 2.0, + "learning_rate": 9.59618208893818e-06, + "loss": 1.0276, + "step": 2020 + }, + { + "epoch": 0.4039879063491667, + "grad_norm": 2.171875, + "learning_rate": 9.59576709894521e-06, + "loss": 1.1292, + "step": 2021 + }, + { + "epoch": 0.40418780140426275, + "grad_norm": 2.015625, + "learning_rate": 9.595351904808757e-06, + "loss": 1.0813, + "step": 2022 + }, + { + "epoch": 0.40438769645935885, + "grad_norm": 2.234375, + "learning_rate": 9.594936506547262e-06, + "loss": 1.1305, + "step": 2023 + }, + { + "epoch": 0.4045875915144549, + "grad_norm": 2.1875, + "learning_rate": 9.594520904179176e-06, + "loss": 1.1371, + "step": 2024 + }, + { + "epoch": 0.404787486569551, + "grad_norm": 2.03125, + "learning_rate": 9.594105097722966e-06, + "loss": 0.9341, + "step": 2025 + }, + { + "epoch": 0.40498738162464704, + "grad_norm": 2.09375, + "learning_rate": 9.593689087197096e-06, + "loss": 1.1288, + "step": 2026 + }, + { + "epoch": 0.40518727667974314, + "grad_norm": 2.203125, + "learning_rate": 9.59327287262005e-06, + "loss": 1.0997, + "step": 2027 + }, + { + "epoch": 0.40538717173483924, + "grad_norm": 2.09375, + "learning_rate": 9.592856454010309e-06, + "loss": 1.0629, + "step": 2028 + }, + { + "epoch": 0.4055870667899353, + "grad_norm": 2.0625, + "learning_rate": 9.592439831386378e-06, + "loss": 1.0659, + "step": 2029 + }, + { + "epoch": 0.4057869618450314, + "grad_norm": 2.1875, + "learning_rate": 9.59202300476676e-06, + "loss": 1.1291, + "step": 2030 + }, + { + "epoch": 0.4059868569001274, + "grad_norm": 2.046875, + "learning_rate": 9.59160597416997e-06, + "loss": 1.0006, + "step": 2031 + }, + { + "epoch": 0.4061867519552235, + "grad_norm": 2.15625, + "learning_rate": 9.591188739614534e-06, + "loss": 1.1197, + "step": 2032 + }, + { + "epoch": 0.40638664701031957, + "grad_norm": 2.21875, + "learning_rate": 9.590771301118983e-06, + "loss": 1.1276, + "step": 2033 + }, + { + "epoch": 0.40658654206541567, + "grad_norm": 2.140625, + "learning_rate": 9.590353658701863e-06, + "loss": 1.0585, + "step": 2034 + }, + { + "epoch": 0.4067864371205117, + "grad_norm": 2.125, + "learning_rate": 9.589935812381722e-06, + "loss": 1.0337, + "step": 2035 + }, + { + "epoch": 0.4069863321756078, + "grad_norm": 2.078125, + "learning_rate": 9.589517762177122e-06, + "loss": 1.0947, + "step": 2036 + }, + { + "epoch": 0.40718622723070386, + "grad_norm": 2.296875, + "learning_rate": 9.589099508106637e-06, + "loss": 1.167, + "step": 2037 + }, + { + "epoch": 0.40738612228579996, + "grad_norm": 2.21875, + "learning_rate": 9.588681050188837e-06, + "loss": 1.1224, + "step": 2038 + }, + { + "epoch": 0.40758601734089606, + "grad_norm": 2.125, + "learning_rate": 9.588262388442317e-06, + "loss": 1.0462, + "step": 2039 + }, + { + "epoch": 0.4077859123959921, + "grad_norm": 2.265625, + "learning_rate": 9.587843522885674e-06, + "loss": 1.0847, + "step": 2040 + }, + { + "epoch": 0.4079858074510882, + "grad_norm": 2.140625, + "learning_rate": 9.587424453537508e-06, + "loss": 1.0729, + "step": 2041 + }, + { + "epoch": 0.40818570250618424, + "grad_norm": 2.125, + "learning_rate": 9.587005180416439e-06, + "loss": 1.0238, + "step": 2042 + }, + { + "epoch": 0.40838559756128034, + "grad_norm": 2.09375, + "learning_rate": 9.586585703541092e-06, + "loss": 1.0113, + "step": 2043 + }, + { + "epoch": 0.4085854926163764, + "grad_norm": 2.125, + "learning_rate": 9.586166022930095e-06, + "loss": 1.0278, + "step": 2044 + }, + { + "epoch": 0.4087853876714725, + "grad_norm": 2.109375, + "learning_rate": 9.585746138602095e-06, + "loss": 1.0631, + "step": 2045 + }, + { + "epoch": 0.40898528272656853, + "grad_norm": 2.28125, + "learning_rate": 9.58532605057574e-06, + "loss": 1.1036, + "step": 2046 + }, + { + "epoch": 0.40918517778166463, + "grad_norm": 2.234375, + "learning_rate": 9.584905758869691e-06, + "loss": 1.1368, + "step": 2047 + }, + { + "epoch": 0.4093850728367607, + "grad_norm": 2.265625, + "learning_rate": 9.584485263502619e-06, + "loss": 1.1605, + "step": 2048 + }, + { + "epoch": 0.4095849678918568, + "grad_norm": 2.203125, + "learning_rate": 9.5840645644932e-06, + "loss": 1.0755, + "step": 2049 + }, + { + "epoch": 0.4097848629469529, + "grad_norm": 2.15625, + "learning_rate": 9.583643661860125e-06, + "loss": 1.0774, + "step": 2050 + }, + { + "epoch": 0.4099847580020489, + "grad_norm": 2.0625, + "learning_rate": 9.583222555622087e-06, + "loss": 0.9607, + "step": 2051 + }, + { + "epoch": 0.410184653057145, + "grad_norm": 2.125, + "learning_rate": 9.582801245797793e-06, + "loss": 1.1346, + "step": 2052 + }, + { + "epoch": 0.41038454811224107, + "grad_norm": 2.078125, + "learning_rate": 9.582379732405955e-06, + "loss": 1.0721, + "step": 2053 + }, + { + "epoch": 0.41058444316733717, + "grad_norm": 2.171875, + "learning_rate": 9.5819580154653e-06, + "loss": 1.1277, + "step": 2054 + }, + { + "epoch": 0.4107843382224332, + "grad_norm": 2.15625, + "learning_rate": 9.581536094994562e-06, + "loss": 1.0082, + "step": 2055 + }, + { + "epoch": 0.4109842332775293, + "grad_norm": 2.125, + "learning_rate": 9.581113971012476e-06, + "loss": 1.0861, + "step": 2056 + }, + { + "epoch": 0.41118412833262535, + "grad_norm": 2.15625, + "learning_rate": 9.580691643537798e-06, + "loss": 1.1013, + "step": 2057 + }, + { + "epoch": 0.41138402338772145, + "grad_norm": 2.125, + "learning_rate": 9.580269112589287e-06, + "loss": 1.0299, + "step": 2058 + }, + { + "epoch": 0.4115839184428175, + "grad_norm": 2.203125, + "learning_rate": 9.579846378185712e-06, + "loss": 1.0958, + "step": 2059 + }, + { + "epoch": 0.4117838134979136, + "grad_norm": 2.09375, + "learning_rate": 9.579423440345847e-06, + "loss": 1.1165, + "step": 2060 + }, + { + "epoch": 0.4119837085530097, + "grad_norm": 2.1875, + "learning_rate": 9.579000299088485e-06, + "loss": 1.0798, + "step": 2061 + }, + { + "epoch": 0.41218360360810574, + "grad_norm": 2.140625, + "learning_rate": 9.578576954432416e-06, + "loss": 1.0372, + "step": 2062 + }, + { + "epoch": 0.41238349866320184, + "grad_norm": 2.15625, + "learning_rate": 9.57815340639645e-06, + "loss": 1.0837, + "step": 2063 + }, + { + "epoch": 0.4125833937182979, + "grad_norm": 2.046875, + "learning_rate": 9.577729654999396e-06, + "loss": 1.0542, + "step": 2064 + }, + { + "epoch": 0.412783288773394, + "grad_norm": 2.046875, + "learning_rate": 9.577305700260084e-06, + "loss": 1.1162, + "step": 2065 + }, + { + "epoch": 0.41298318382849003, + "grad_norm": 2.09375, + "learning_rate": 9.576881542197337e-06, + "loss": 1.0897, + "step": 2066 + }, + { + "epoch": 0.41318307888358613, + "grad_norm": 2.234375, + "learning_rate": 9.576457180830004e-06, + "loss": 1.16, + "step": 2067 + }, + { + "epoch": 0.4133829739386822, + "grad_norm": 2.15625, + "learning_rate": 9.576032616176932e-06, + "loss": 1.0592, + "step": 2068 + }, + { + "epoch": 0.4135828689937783, + "grad_norm": 2.0625, + "learning_rate": 9.575607848256979e-06, + "loss": 1.0421, + "step": 2069 + }, + { + "epoch": 0.4137827640488743, + "grad_norm": 2.078125, + "learning_rate": 9.575182877089014e-06, + "loss": 1.0748, + "step": 2070 + }, + { + "epoch": 0.4139826591039704, + "grad_norm": 2.21875, + "learning_rate": 9.574757702691912e-06, + "loss": 1.1479, + "step": 2071 + }, + { + "epoch": 0.41418255415906646, + "grad_norm": 2.0625, + "learning_rate": 9.574332325084564e-06, + "loss": 0.9352, + "step": 2072 + }, + { + "epoch": 0.41438244921416256, + "grad_norm": 2.109375, + "learning_rate": 9.573906744285862e-06, + "loss": 1.0908, + "step": 2073 + }, + { + "epoch": 0.41458234426925866, + "grad_norm": 2.28125, + "learning_rate": 9.573480960314711e-06, + "loss": 1.0818, + "step": 2074 + }, + { + "epoch": 0.4147822393243547, + "grad_norm": 2.140625, + "learning_rate": 9.573054973190023e-06, + "loss": 1.0848, + "step": 2075 + }, + { + "epoch": 0.4149821343794508, + "grad_norm": 2.125, + "learning_rate": 9.572628782930724e-06, + "loss": 1.0214, + "step": 2076 + }, + { + "epoch": 0.41518202943454685, + "grad_norm": 2.078125, + "learning_rate": 9.572202389555741e-06, + "loss": 1.1551, + "step": 2077 + }, + { + "epoch": 0.41538192448964295, + "grad_norm": 2.34375, + "learning_rate": 9.571775793084017e-06, + "loss": 1.0506, + "step": 2078 + }, + { + "epoch": 0.415581819544739, + "grad_norm": 2.078125, + "learning_rate": 9.5713489935345e-06, + "loss": 1.0266, + "step": 2079 + }, + { + "epoch": 0.4157817145998351, + "grad_norm": 2.078125, + "learning_rate": 9.57092199092615e-06, + "loss": 1.0646, + "step": 2080 + }, + { + "epoch": 0.41598160965493114, + "grad_norm": 2.15625, + "learning_rate": 9.570494785277931e-06, + "loss": 1.0513, + "step": 2081 + }, + { + "epoch": 0.41618150471002724, + "grad_norm": 2.078125, + "learning_rate": 9.570067376608826e-06, + "loss": 1.0574, + "step": 2082 + }, + { + "epoch": 0.4163813997651233, + "grad_norm": 2.140625, + "learning_rate": 9.569639764937813e-06, + "loss": 1.0695, + "step": 2083 + }, + { + "epoch": 0.4165812948202194, + "grad_norm": 2.171875, + "learning_rate": 9.56921195028389e-06, + "loss": 1.1231, + "step": 2084 + }, + { + "epoch": 0.4167811898753155, + "grad_norm": 2.21875, + "learning_rate": 9.56878393266606e-06, + "loss": 1.135, + "step": 2085 + }, + { + "epoch": 0.4169810849304115, + "grad_norm": 2.140625, + "learning_rate": 9.568355712103336e-06, + "loss": 1.1205, + "step": 2086 + }, + { + "epoch": 0.4171809799855076, + "grad_norm": 2.21875, + "learning_rate": 9.56792728861474e-06, + "loss": 1.1319, + "step": 2087 + }, + { + "epoch": 0.41738087504060367, + "grad_norm": 2.109375, + "learning_rate": 9.5674986622193e-06, + "loss": 1.0088, + "step": 2088 + }, + { + "epoch": 0.41758077009569977, + "grad_norm": 2.046875, + "learning_rate": 9.567069832936058e-06, + "loss": 1.076, + "step": 2089 + }, + { + "epoch": 0.4177806651507958, + "grad_norm": 2.21875, + "learning_rate": 9.566640800784061e-06, + "loss": 1.1271, + "step": 2090 + }, + { + "epoch": 0.4179805602058919, + "grad_norm": 2.09375, + "learning_rate": 9.56621156578237e-06, + "loss": 1.0794, + "step": 2091 + }, + { + "epoch": 0.41818045526098796, + "grad_norm": 2.03125, + "learning_rate": 9.565782127950047e-06, + "loss": 1.0835, + "step": 2092 + }, + { + "epoch": 0.41838035031608406, + "grad_norm": 2.28125, + "learning_rate": 9.56535248730617e-06, + "loss": 1.1816, + "step": 2093 + }, + { + "epoch": 0.4185802453711801, + "grad_norm": 2.171875, + "learning_rate": 9.564922643869822e-06, + "loss": 1.0854, + "step": 2094 + }, + { + "epoch": 0.4187801404262762, + "grad_norm": 2.203125, + "learning_rate": 9.5644925976601e-06, + "loss": 1.0796, + "step": 2095 + }, + { + "epoch": 0.4189800354813723, + "grad_norm": 2.328125, + "learning_rate": 9.564062348696103e-06, + "loss": 1.1448, + "step": 2096 + }, + { + "epoch": 0.41917993053646835, + "grad_norm": 2.265625, + "learning_rate": 9.563631896996943e-06, + "loss": 1.0599, + "step": 2097 + }, + { + "epoch": 0.41937982559156445, + "grad_norm": 2.078125, + "learning_rate": 9.563201242581743e-06, + "loss": 1.0835, + "step": 2098 + }, + { + "epoch": 0.4195797206466605, + "grad_norm": 2.09375, + "learning_rate": 9.562770385469631e-06, + "loss": 1.1198, + "step": 2099 + }, + { + "epoch": 0.4197796157017566, + "grad_norm": 2.125, + "learning_rate": 9.562339325679747e-06, + "loss": 1.0479, + "step": 2100 + }, + { + "epoch": 0.41997951075685264, + "grad_norm": 2.125, + "learning_rate": 9.561908063231234e-06, + "loss": 1.1437, + "step": 2101 + }, + { + "epoch": 0.42017940581194874, + "grad_norm": 2.171875, + "learning_rate": 9.561476598143255e-06, + "loss": 1.0515, + "step": 2102 + }, + { + "epoch": 0.4203793008670448, + "grad_norm": 2.28125, + "learning_rate": 9.56104493043497e-06, + "loss": 1.0693, + "step": 2103 + }, + { + "epoch": 0.4205791959221409, + "grad_norm": 2.125, + "learning_rate": 9.560613060125558e-06, + "loss": 1.1833, + "step": 2104 + }, + { + "epoch": 0.4207790909772369, + "grad_norm": 1.9453125, + "learning_rate": 9.560180987234202e-06, + "loss": 1.058, + "step": 2105 + }, + { + "epoch": 0.420978986032333, + "grad_norm": 2.234375, + "learning_rate": 9.559748711780092e-06, + "loss": 1.0376, + "step": 2106 + }, + { + "epoch": 0.4211788810874291, + "grad_norm": 2.171875, + "learning_rate": 9.559316233782432e-06, + "loss": 1.096, + "step": 2107 + }, + { + "epoch": 0.42137877614252517, + "grad_norm": 2.015625, + "learning_rate": 9.558883553260431e-06, + "loss": 1.059, + "step": 2108 + }, + { + "epoch": 0.42157867119762127, + "grad_norm": 2.0625, + "learning_rate": 9.55845067023331e-06, + "loss": 1.0785, + "step": 2109 + }, + { + "epoch": 0.4217785662527173, + "grad_norm": 2.0, + "learning_rate": 9.558017584720298e-06, + "loss": 1.014, + "step": 2110 + }, + { + "epoch": 0.4219784613078134, + "grad_norm": 2.25, + "learning_rate": 9.55758429674063e-06, + "loss": 1.0571, + "step": 2111 + }, + { + "epoch": 0.42217835636290946, + "grad_norm": 2.140625, + "learning_rate": 9.557150806313555e-06, + "loss": 1.0677, + "step": 2112 + }, + { + "epoch": 0.42237825141800556, + "grad_norm": 2.0625, + "learning_rate": 9.55671711345833e-06, + "loss": 1.0337, + "step": 2113 + }, + { + "epoch": 0.4225781464731016, + "grad_norm": 2.078125, + "learning_rate": 9.556283218194214e-06, + "loss": 1.0861, + "step": 2114 + }, + { + "epoch": 0.4227780415281977, + "grad_norm": 2.171875, + "learning_rate": 9.555849120540486e-06, + "loss": 1.0313, + "step": 2115 + }, + { + "epoch": 0.42297793658329375, + "grad_norm": 2.015625, + "learning_rate": 9.555414820516427e-06, + "loss": 0.9798, + "step": 2116 + }, + { + "epoch": 0.42317783163838985, + "grad_norm": 2.03125, + "learning_rate": 9.554980318141327e-06, + "loss": 1.1204, + "step": 2117 + }, + { + "epoch": 0.42337772669348595, + "grad_norm": 2.203125, + "learning_rate": 9.554545613434489e-06, + "loss": 1.0616, + "step": 2118 + }, + { + "epoch": 0.423577621748582, + "grad_norm": 2.015625, + "learning_rate": 9.55411070641522e-06, + "loss": 1.0907, + "step": 2119 + }, + { + "epoch": 0.4237775168036781, + "grad_norm": 2.390625, + "learning_rate": 9.55367559710284e-06, + "loss": 1.1348, + "step": 2120 + }, + { + "epoch": 0.42397741185877413, + "grad_norm": 2.078125, + "learning_rate": 9.553240285516676e-06, + "loss": 0.9642, + "step": 2121 + }, + { + "epoch": 0.42417730691387023, + "grad_norm": 2.171875, + "learning_rate": 9.552804771676067e-06, + "loss": 1.1451, + "step": 2122 + }, + { + "epoch": 0.4243772019689663, + "grad_norm": 2.140625, + "learning_rate": 9.552369055600354e-06, + "loss": 1.0617, + "step": 2123 + }, + { + "epoch": 0.4245770970240624, + "grad_norm": 2.109375, + "learning_rate": 9.551933137308895e-06, + "loss": 1.111, + "step": 2124 + }, + { + "epoch": 0.4247769920791584, + "grad_norm": 2.09375, + "learning_rate": 9.551497016821051e-06, + "loss": 1.0553, + "step": 2125 + }, + { + "epoch": 0.4249768871342545, + "grad_norm": 2.0625, + "learning_rate": 9.551060694156197e-06, + "loss": 1.1344, + "step": 2126 + }, + { + "epoch": 0.42517678218935057, + "grad_norm": 2.140625, + "learning_rate": 9.550624169333713e-06, + "loss": 0.9748, + "step": 2127 + }, + { + "epoch": 0.42537667724444667, + "grad_norm": 2.140625, + "learning_rate": 9.550187442372987e-06, + "loss": 1.0298, + "step": 2128 + }, + { + "epoch": 0.42557657229954277, + "grad_norm": 2.015625, + "learning_rate": 9.549750513293422e-06, + "loss": 1.0584, + "step": 2129 + }, + { + "epoch": 0.4257764673546388, + "grad_norm": 2.125, + "learning_rate": 9.549313382114427e-06, + "loss": 1.0891, + "step": 2130 + }, + { + "epoch": 0.4259763624097349, + "grad_norm": 2.421875, + "learning_rate": 9.548876048855417e-06, + "loss": 1.1657, + "step": 2131 + }, + { + "epoch": 0.42617625746483095, + "grad_norm": 2.171875, + "learning_rate": 9.548438513535819e-06, + "loss": 1.0, + "step": 2132 + }, + { + "epoch": 0.42637615251992705, + "grad_norm": 2.234375, + "learning_rate": 9.548000776175066e-06, + "loss": 1.2088, + "step": 2133 + }, + { + "epoch": 0.4265760475750231, + "grad_norm": 2.1875, + "learning_rate": 9.547562836792606e-06, + "loss": 1.1251, + "step": 2134 + }, + { + "epoch": 0.4267759426301192, + "grad_norm": 2.171875, + "learning_rate": 9.547124695407888e-06, + "loss": 1.1375, + "step": 2135 + }, + { + "epoch": 0.42697583768521524, + "grad_norm": 2.171875, + "learning_rate": 9.546686352040379e-06, + "loss": 1.0466, + "step": 2136 + }, + { + "epoch": 0.42717573274031134, + "grad_norm": 2.203125, + "learning_rate": 9.546247806709548e-06, + "loss": 1.0535, + "step": 2137 + }, + { + "epoch": 0.4273756277954074, + "grad_norm": 2.140625, + "learning_rate": 9.545809059434874e-06, + "loss": 1.0398, + "step": 2138 + }, + { + "epoch": 0.4275755228505035, + "grad_norm": 2.234375, + "learning_rate": 9.545370110235847e-06, + "loss": 1.0722, + "step": 2139 + }, + { + "epoch": 0.4277754179055996, + "grad_norm": 2.09375, + "learning_rate": 9.544930959131967e-06, + "loss": 1.0896, + "step": 2140 + }, + { + "epoch": 0.42797531296069563, + "grad_norm": 2.09375, + "learning_rate": 9.544491606142737e-06, + "loss": 1.0248, + "step": 2141 + }, + { + "epoch": 0.42817520801579173, + "grad_norm": 2.03125, + "learning_rate": 9.544052051287675e-06, + "loss": 1.0491, + "step": 2142 + }, + { + "epoch": 0.4283751030708878, + "grad_norm": 2.125, + "learning_rate": 9.543612294586306e-06, + "loss": 1.0873, + "step": 2143 + }, + { + "epoch": 0.4285749981259839, + "grad_norm": 2.140625, + "learning_rate": 9.543172336058166e-06, + "loss": 1.0051, + "step": 2144 + }, + { + "epoch": 0.4287748931810799, + "grad_norm": 2.234375, + "learning_rate": 9.542732175722796e-06, + "loss": 1.1224, + "step": 2145 + }, + { + "epoch": 0.428974788236176, + "grad_norm": 2.171875, + "learning_rate": 9.542291813599746e-06, + "loss": 1.1181, + "step": 2146 + }, + { + "epoch": 0.42917468329127206, + "grad_norm": 2.03125, + "learning_rate": 9.541851249708581e-06, + "loss": 1.0761, + "step": 2147 + }, + { + "epoch": 0.42937457834636816, + "grad_norm": 2.03125, + "learning_rate": 9.541410484068867e-06, + "loss": 1.0671, + "step": 2148 + }, + { + "epoch": 0.4295744734014642, + "grad_norm": 2.234375, + "learning_rate": 9.540969516700185e-06, + "loss": 1.1324, + "step": 2149 + }, + { + "epoch": 0.4297743684565603, + "grad_norm": 2.21875, + "learning_rate": 9.540528347622123e-06, + "loss": 1.0483, + "step": 2150 + }, + { + "epoch": 0.4299742635116564, + "grad_norm": 2.15625, + "learning_rate": 9.540086976854274e-06, + "loss": 1.0794, + "step": 2151 + }, + { + "epoch": 0.43017415856675245, + "grad_norm": 2.1875, + "learning_rate": 9.539645404416249e-06, + "loss": 1.108, + "step": 2152 + }, + { + "epoch": 0.43037405362184855, + "grad_norm": 2.03125, + "learning_rate": 9.53920363032766e-06, + "loss": 0.9816, + "step": 2153 + }, + { + "epoch": 0.4305739486769446, + "grad_norm": 2.078125, + "learning_rate": 9.538761654608128e-06, + "loss": 1.0624, + "step": 2154 + }, + { + "epoch": 0.4307738437320407, + "grad_norm": 2.0, + "learning_rate": 9.53831947727729e-06, + "loss": 1.0004, + "step": 2155 + }, + { + "epoch": 0.43097373878713674, + "grad_norm": 2.109375, + "learning_rate": 9.537877098354787e-06, + "loss": 1.0317, + "step": 2156 + }, + { + "epoch": 0.43117363384223284, + "grad_norm": 2.1875, + "learning_rate": 9.537434517860265e-06, + "loss": 1.058, + "step": 2157 + }, + { + "epoch": 0.4313735288973289, + "grad_norm": 2.078125, + "learning_rate": 9.536991735813388e-06, + "loss": 1.0414, + "step": 2158 + }, + { + "epoch": 0.431573423952425, + "grad_norm": 2.1875, + "learning_rate": 9.536548752233822e-06, + "loss": 1.1674, + "step": 2159 + }, + { + "epoch": 0.43177331900752103, + "grad_norm": 2.109375, + "learning_rate": 9.536105567141246e-06, + "loss": 1.0639, + "step": 2160 + }, + { + "epoch": 0.43197321406261713, + "grad_norm": 2.15625, + "learning_rate": 9.535662180555342e-06, + "loss": 1.0177, + "step": 2161 + }, + { + "epoch": 0.43217310911771323, + "grad_norm": 2.171875, + "learning_rate": 9.535218592495812e-06, + "loss": 1.145, + "step": 2162 + }, + { + "epoch": 0.43237300417280927, + "grad_norm": 2.15625, + "learning_rate": 9.534774802982356e-06, + "loss": 1.0808, + "step": 2163 + }, + { + "epoch": 0.43257289922790537, + "grad_norm": 2.140625, + "learning_rate": 9.534330812034686e-06, + "loss": 1.0745, + "step": 2164 + }, + { + "epoch": 0.4327727942830014, + "grad_norm": 2.046875, + "learning_rate": 9.533886619672527e-06, + "loss": 0.9411, + "step": 2165 + }, + { + "epoch": 0.4329726893380975, + "grad_norm": 2.078125, + "learning_rate": 9.533442225915607e-06, + "loss": 1.0548, + "step": 2166 + }, + { + "epoch": 0.43317258439319356, + "grad_norm": 2.0625, + "learning_rate": 9.532997630783669e-06, + "loss": 1.0255, + "step": 2167 + }, + { + "epoch": 0.43337247944828966, + "grad_norm": 2.25, + "learning_rate": 9.53255283429646e-06, + "loss": 1.0143, + "step": 2168 + }, + { + "epoch": 0.4335723745033857, + "grad_norm": 2.21875, + "learning_rate": 9.532107836473739e-06, + "loss": 1.0807, + "step": 2169 + }, + { + "epoch": 0.4337722695584818, + "grad_norm": 2.203125, + "learning_rate": 9.53166263733527e-06, + "loss": 1.1443, + "step": 2170 + }, + { + "epoch": 0.43397216461357785, + "grad_norm": 2.25, + "learning_rate": 9.531217236900832e-06, + "loss": 1.1225, + "step": 2171 + }, + { + "epoch": 0.43417205966867395, + "grad_norm": 2.171875, + "learning_rate": 9.530771635190208e-06, + "loss": 1.0479, + "step": 2172 + }, + { + "epoch": 0.43437195472377005, + "grad_norm": 2.078125, + "learning_rate": 9.53032583222319e-06, + "loss": 1.0765, + "step": 2173 + }, + { + "epoch": 0.4345718497788661, + "grad_norm": 2.203125, + "learning_rate": 9.529879828019586e-06, + "loss": 1.1234, + "step": 2174 + }, + { + "epoch": 0.4347717448339622, + "grad_norm": 2.125, + "learning_rate": 9.5294336225992e-06, + "loss": 1.088, + "step": 2175 + }, + { + "epoch": 0.43497163988905824, + "grad_norm": 2.09375, + "learning_rate": 9.528987215981859e-06, + "loss": 1.065, + "step": 2176 + }, + { + "epoch": 0.43517153494415434, + "grad_norm": 2.109375, + "learning_rate": 9.52854060818739e-06, + "loss": 1.0823, + "step": 2177 + }, + { + "epoch": 0.4353714299992504, + "grad_norm": 2.109375, + "learning_rate": 9.528093799235629e-06, + "loss": 1.0597, + "step": 2178 + }, + { + "epoch": 0.4355713250543465, + "grad_norm": 2.140625, + "learning_rate": 9.527646789146427e-06, + "loss": 1.0214, + "step": 2179 + }, + { + "epoch": 0.4357712201094425, + "grad_norm": 2.171875, + "learning_rate": 9.527199577939636e-06, + "loss": 1.0689, + "step": 2180 + }, + { + "epoch": 0.4359711151645386, + "grad_norm": 2.078125, + "learning_rate": 9.526752165635125e-06, + "loss": 1.0881, + "step": 2181 + }, + { + "epoch": 0.43617101021963467, + "grad_norm": 1.9765625, + "learning_rate": 9.526304552252766e-06, + "loss": 0.9681, + "step": 2182 + }, + { + "epoch": 0.43637090527473077, + "grad_norm": 2.1875, + "learning_rate": 9.52585673781244e-06, + "loss": 1.0983, + "step": 2183 + }, + { + "epoch": 0.4365708003298268, + "grad_norm": 2.0625, + "learning_rate": 9.525408722334045e-06, + "loss": 1.1078, + "step": 2184 + }, + { + "epoch": 0.4367706953849229, + "grad_norm": 2.015625, + "learning_rate": 9.524960505837475e-06, + "loss": 0.992, + "step": 2185 + }, + { + "epoch": 0.436970590440019, + "grad_norm": 1.96875, + "learning_rate": 9.524512088342642e-06, + "loss": 1.0459, + "step": 2186 + }, + { + "epoch": 0.43717048549511506, + "grad_norm": 2.078125, + "learning_rate": 9.524063469869467e-06, + "loss": 1.0761, + "step": 2187 + }, + { + "epoch": 0.43737038055021116, + "grad_norm": 2.171875, + "learning_rate": 9.523614650437876e-06, + "loss": 1.1383, + "step": 2188 + }, + { + "epoch": 0.4375702756053072, + "grad_norm": 2.171875, + "learning_rate": 9.523165630067804e-06, + "loss": 1.0518, + "step": 2189 + }, + { + "epoch": 0.4377701706604033, + "grad_norm": 2.109375, + "learning_rate": 9.522716408779198e-06, + "loss": 1.0729, + "step": 2190 + }, + { + "epoch": 0.43797006571549935, + "grad_norm": 2.15625, + "learning_rate": 9.522266986592012e-06, + "loss": 1.0924, + "step": 2191 + }, + { + "epoch": 0.43816996077059545, + "grad_norm": 2.25, + "learning_rate": 9.521817363526211e-06, + "loss": 1.0138, + "step": 2192 + }, + { + "epoch": 0.4383698558256915, + "grad_norm": 2.0625, + "learning_rate": 9.521367539601766e-06, + "loss": 1.0787, + "step": 2193 + }, + { + "epoch": 0.4385697508807876, + "grad_norm": 1.9921875, + "learning_rate": 9.520917514838657e-06, + "loss": 0.9883, + "step": 2194 + }, + { + "epoch": 0.43876964593588363, + "grad_norm": 2.03125, + "learning_rate": 9.520467289256874e-06, + "loss": 0.9786, + "step": 2195 + }, + { + "epoch": 0.43896954099097973, + "grad_norm": 2.109375, + "learning_rate": 9.520016862876416e-06, + "loss": 1.03, + "step": 2196 + }, + { + "epoch": 0.43916943604607583, + "grad_norm": 2.140625, + "learning_rate": 9.519566235717295e-06, + "loss": 1.0743, + "step": 2197 + }, + { + "epoch": 0.4393693311011719, + "grad_norm": 2.09375, + "learning_rate": 9.519115407799523e-06, + "loss": 1.0433, + "step": 2198 + }, + { + "epoch": 0.439569226156268, + "grad_norm": 2.140625, + "learning_rate": 9.518664379143125e-06, + "loss": 1.0033, + "step": 2199 + }, + { + "epoch": 0.439769121211364, + "grad_norm": 2.0, + "learning_rate": 9.518213149768141e-06, + "loss": 1.0444, + "step": 2200 + }, + { + "epoch": 0.4399690162664601, + "grad_norm": 2.046875, + "learning_rate": 9.517761719694609e-06, + "loss": 1.0285, + "step": 2201 + }, + { + "epoch": 0.44016891132155617, + "grad_norm": 2.28125, + "learning_rate": 9.517310088942585e-06, + "loss": 1.1743, + "step": 2202 + }, + { + "epoch": 0.44036880637665227, + "grad_norm": 1.9921875, + "learning_rate": 9.516858257532128e-06, + "loss": 0.973, + "step": 2203 + }, + { + "epoch": 0.4405687014317483, + "grad_norm": 2.234375, + "learning_rate": 9.516406225483312e-06, + "loss": 1.074, + "step": 2204 + }, + { + "epoch": 0.4407685964868444, + "grad_norm": 2.046875, + "learning_rate": 9.515953992816215e-06, + "loss": 1.0519, + "step": 2205 + }, + { + "epoch": 0.44096849154194045, + "grad_norm": 2.125, + "learning_rate": 9.515501559550919e-06, + "loss": 1.0592, + "step": 2206 + }, + { + "epoch": 0.44116838659703655, + "grad_norm": 2.09375, + "learning_rate": 9.51504892570753e-06, + "loss": 1.0736, + "step": 2207 + }, + { + "epoch": 0.44136828165213265, + "grad_norm": 2.078125, + "learning_rate": 9.514596091306148e-06, + "loss": 1.1223, + "step": 2208 + }, + { + "epoch": 0.4415681767072287, + "grad_norm": 2.09375, + "learning_rate": 9.514143056366892e-06, + "loss": 1.0031, + "step": 2209 + }, + { + "epoch": 0.4417680717623248, + "grad_norm": 2.0, + "learning_rate": 9.513689820909882e-06, + "loss": 1.0427, + "step": 2210 + }, + { + "epoch": 0.44196796681742084, + "grad_norm": 2.078125, + "learning_rate": 9.51323638495525e-06, + "loss": 1.0908, + "step": 2211 + }, + { + "epoch": 0.44216786187251694, + "grad_norm": 2.0, + "learning_rate": 9.512782748523143e-06, + "loss": 0.9979, + "step": 2212 + }, + { + "epoch": 0.442367756927613, + "grad_norm": 2.140625, + "learning_rate": 9.512328911633708e-06, + "loss": 1.104, + "step": 2213 + }, + { + "epoch": 0.4425676519827091, + "grad_norm": 2.171875, + "learning_rate": 9.511874874307103e-06, + "loss": 1.0095, + "step": 2214 + }, + { + "epoch": 0.44276754703780513, + "grad_norm": 2.09375, + "learning_rate": 9.5114206365635e-06, + "loss": 1.1195, + "step": 2215 + }, + { + "epoch": 0.44296744209290123, + "grad_norm": 2.109375, + "learning_rate": 9.510966198423073e-06, + "loss": 1.0611, + "step": 2216 + }, + { + "epoch": 0.4431673371479973, + "grad_norm": 2.140625, + "learning_rate": 9.51051155990601e-06, + "loss": 1.0874, + "step": 2217 + }, + { + "epoch": 0.4433672322030934, + "grad_norm": 2.109375, + "learning_rate": 9.510056721032504e-06, + "loss": 1.1258, + "step": 2218 + }, + { + "epoch": 0.4435671272581895, + "grad_norm": 2.0625, + "learning_rate": 9.509601681822761e-06, + "loss": 1.0456, + "step": 2219 + }, + { + "epoch": 0.4437670223132855, + "grad_norm": 2.125, + "learning_rate": 9.509146442296992e-06, + "loss": 1.048, + "step": 2220 + }, + { + "epoch": 0.4439669173683816, + "grad_norm": 2.109375, + "learning_rate": 9.508691002475421e-06, + "loss": 1.1, + "step": 2221 + }, + { + "epoch": 0.44416681242347766, + "grad_norm": 2.140625, + "learning_rate": 9.508235362378278e-06, + "loss": 1.1121, + "step": 2222 + }, + { + "epoch": 0.44436670747857376, + "grad_norm": 2.125, + "learning_rate": 9.507779522025799e-06, + "loss": 0.9907, + "step": 2223 + }, + { + "epoch": 0.4445666025336698, + "grad_norm": 2.265625, + "learning_rate": 9.507323481438236e-06, + "loss": 1.071, + "step": 2224 + }, + { + "epoch": 0.4447664975887659, + "grad_norm": 2.09375, + "learning_rate": 9.506867240635847e-06, + "loss": 1.0096, + "step": 2225 + }, + { + "epoch": 0.44496639264386195, + "grad_norm": 2.125, + "learning_rate": 9.506410799638894e-06, + "loss": 1.1819, + "step": 2226 + }, + { + "epoch": 0.44516628769895805, + "grad_norm": 2.09375, + "learning_rate": 9.505954158467656e-06, + "loss": 1.1682, + "step": 2227 + }, + { + "epoch": 0.4453661827540541, + "grad_norm": 2.046875, + "learning_rate": 9.505497317142416e-06, + "loss": 1.0369, + "step": 2228 + }, + { + "epoch": 0.4455660778091502, + "grad_norm": 2.09375, + "learning_rate": 9.505040275683465e-06, + "loss": 1.1164, + "step": 2229 + }, + { + "epoch": 0.4457659728642463, + "grad_norm": 2.21875, + "learning_rate": 9.504583034111108e-06, + "loss": 1.1338, + "step": 2230 + }, + { + "epoch": 0.44596586791934234, + "grad_norm": 2.09375, + "learning_rate": 9.504125592445653e-06, + "loss": 1.074, + "step": 2231 + }, + { + "epoch": 0.44616576297443844, + "grad_norm": 2.140625, + "learning_rate": 9.50366795070742e-06, + "loss": 1.0827, + "step": 2232 + }, + { + "epoch": 0.4463656580295345, + "grad_norm": 2.125, + "learning_rate": 9.503210108916736e-06, + "loss": 0.9933, + "step": 2233 + }, + { + "epoch": 0.4465655530846306, + "grad_norm": 2.140625, + "learning_rate": 9.502752067093942e-06, + "loss": 1.0505, + "step": 2234 + }, + { + "epoch": 0.44676544813972663, + "grad_norm": 2.09375, + "learning_rate": 9.50229382525938e-06, + "loss": 1.0627, + "step": 2235 + }, + { + "epoch": 0.44696534319482273, + "grad_norm": 2.15625, + "learning_rate": 9.50183538343341e-06, + "loss": 1.0896, + "step": 2236 + }, + { + "epoch": 0.4471652382499188, + "grad_norm": 2.21875, + "learning_rate": 9.501376741636392e-06, + "loss": 1.0012, + "step": 2237 + }, + { + "epoch": 0.4473651333050149, + "grad_norm": 2.109375, + "learning_rate": 9.5009178998887e-06, + "loss": 1.1222, + "step": 2238 + }, + { + "epoch": 0.4475650283601109, + "grad_norm": 2.234375, + "learning_rate": 9.500458858210714e-06, + "loss": 1.1579, + "step": 2239 + }, + { + "epoch": 0.447764923415207, + "grad_norm": 2.21875, + "learning_rate": 9.499999616622828e-06, + "loss": 1.105, + "step": 2240 + }, + { + "epoch": 0.4479648184703031, + "grad_norm": 2.109375, + "learning_rate": 9.499540175145438e-06, + "loss": 1.0628, + "step": 2241 + }, + { + "epoch": 0.44816471352539916, + "grad_norm": 2.015625, + "learning_rate": 9.499080533798956e-06, + "loss": 0.9828, + "step": 2242 + }, + { + "epoch": 0.44836460858049526, + "grad_norm": 2.171875, + "learning_rate": 9.498620692603797e-06, + "loss": 1.0454, + "step": 2243 + }, + { + "epoch": 0.4485645036355913, + "grad_norm": 2.21875, + "learning_rate": 9.498160651580387e-06, + "loss": 1.063, + "step": 2244 + }, + { + "epoch": 0.4487643986906874, + "grad_norm": 2.0, + "learning_rate": 9.49770041074916e-06, + "loss": 1.0674, + "step": 2245 + }, + { + "epoch": 0.44896429374578345, + "grad_norm": 2.15625, + "learning_rate": 9.497239970130561e-06, + "loss": 0.9574, + "step": 2246 + }, + { + "epoch": 0.44916418880087955, + "grad_norm": 2.171875, + "learning_rate": 9.496779329745045e-06, + "loss": 1.0144, + "step": 2247 + }, + { + "epoch": 0.4493640838559756, + "grad_norm": 2.25, + "learning_rate": 9.49631848961307e-06, + "loss": 1.1661, + "step": 2248 + }, + { + "epoch": 0.4495639789110717, + "grad_norm": 2.109375, + "learning_rate": 9.495857449755109e-06, + "loss": 0.9927, + "step": 2249 + }, + { + "epoch": 0.44976387396616774, + "grad_norm": 2.125, + "learning_rate": 9.495396210191639e-06, + "loss": 1.0636, + "step": 2250 + }, + { + "epoch": 0.44996376902126384, + "grad_norm": 2.109375, + "learning_rate": 9.49493477094315e-06, + "loss": 1.0666, + "step": 2251 + }, + { + "epoch": 0.45016366407635994, + "grad_norm": 2.09375, + "learning_rate": 9.494473132030137e-06, + "loss": 1.0508, + "step": 2252 + }, + { + "epoch": 0.450363559131456, + "grad_norm": 2.125, + "learning_rate": 9.494011293473109e-06, + "loss": 1.0578, + "step": 2253 + }, + { + "epoch": 0.4505634541865521, + "grad_norm": 2.15625, + "learning_rate": 9.49354925529258e-06, + "loss": 1.0574, + "step": 2254 + }, + { + "epoch": 0.4507633492416481, + "grad_norm": 2.140625, + "learning_rate": 9.493087017509072e-06, + "loss": 1.1154, + "step": 2255 + }, + { + "epoch": 0.4509632442967442, + "grad_norm": 2.1875, + "learning_rate": 9.49262458014312e-06, + "loss": 1.1417, + "step": 2256 + }, + { + "epoch": 0.45116313935184027, + "grad_norm": 2.046875, + "learning_rate": 9.492161943215262e-06, + "loss": 1.1611, + "step": 2257 + }, + { + "epoch": 0.45136303440693637, + "grad_norm": 2.03125, + "learning_rate": 9.491699106746051e-06, + "loss": 1.0836, + "step": 2258 + }, + { + "epoch": 0.4515629294620324, + "grad_norm": 2.125, + "learning_rate": 9.491236070756045e-06, + "loss": 1.1178, + "step": 2259 + }, + { + "epoch": 0.4517628245171285, + "grad_norm": 2.109375, + "learning_rate": 9.490772835265814e-06, + "loss": 1.0072, + "step": 2260 + }, + { + "epoch": 0.45196271957222456, + "grad_norm": 2.171875, + "learning_rate": 9.490309400295932e-06, + "loss": 1.0635, + "step": 2261 + }, + { + "epoch": 0.45216261462732066, + "grad_norm": 2.1875, + "learning_rate": 9.489845765866986e-06, + "loss": 0.9716, + "step": 2262 + }, + { + "epoch": 0.45236250968241676, + "grad_norm": 2.09375, + "learning_rate": 9.48938193199957e-06, + "loss": 1.0294, + "step": 2263 + }, + { + "epoch": 0.4525624047375128, + "grad_norm": 2.28125, + "learning_rate": 9.48891789871429e-06, + "loss": 1.0198, + "step": 2264 + }, + { + "epoch": 0.4527622997926089, + "grad_norm": 2.046875, + "learning_rate": 9.488453666031755e-06, + "loss": 1.0776, + "step": 2265 + }, + { + "epoch": 0.45296219484770495, + "grad_norm": 2.0, + "learning_rate": 9.487989233972587e-06, + "loss": 1.0584, + "step": 2266 + }, + { + "epoch": 0.45316208990280105, + "grad_norm": 2.109375, + "learning_rate": 9.487524602557417e-06, + "loss": 1.0575, + "step": 2267 + }, + { + "epoch": 0.4533619849578971, + "grad_norm": 2.125, + "learning_rate": 9.487059771806883e-06, + "loss": 1.0741, + "step": 2268 + }, + { + "epoch": 0.4535618800129932, + "grad_norm": 2.15625, + "learning_rate": 9.486594741741634e-06, + "loss": 1.0547, + "step": 2269 + }, + { + "epoch": 0.45376177506808923, + "grad_norm": 2.21875, + "learning_rate": 9.486129512382327e-06, + "loss": 1.0859, + "step": 2270 + }, + { + "epoch": 0.45396167012318533, + "grad_norm": 2.03125, + "learning_rate": 9.485664083749623e-06, + "loss": 1.1005, + "step": 2271 + }, + { + "epoch": 0.4541615651782814, + "grad_norm": 2.21875, + "learning_rate": 9.485198455864203e-06, + "loss": 1.2026, + "step": 2272 + }, + { + "epoch": 0.4543614602333775, + "grad_norm": 2.203125, + "learning_rate": 9.484732628746744e-06, + "loss": 1.0519, + "step": 2273 + }, + { + "epoch": 0.4545613552884736, + "grad_norm": 2.21875, + "learning_rate": 9.484266602417942e-06, + "loss": 1.0333, + "step": 2274 + }, + { + "epoch": 0.4547612503435696, + "grad_norm": 2.15625, + "learning_rate": 9.483800376898496e-06, + "loss": 1.1083, + "step": 2275 + }, + { + "epoch": 0.4549611453986657, + "grad_norm": 2.125, + "learning_rate": 9.483333952209118e-06, + "loss": 1.071, + "step": 2276 + }, + { + "epoch": 0.45516104045376177, + "grad_norm": 2.03125, + "learning_rate": 9.482867328370521e-06, + "loss": 0.9757, + "step": 2277 + }, + { + "epoch": 0.45536093550885787, + "grad_norm": 2.265625, + "learning_rate": 9.482400505403439e-06, + "loss": 1.1338, + "step": 2278 + }, + { + "epoch": 0.4555608305639539, + "grad_norm": 2.15625, + "learning_rate": 9.481933483328604e-06, + "loss": 1.1481, + "step": 2279 + }, + { + "epoch": 0.45576072561905, + "grad_norm": 2.125, + "learning_rate": 9.481466262166763e-06, + "loss": 0.9722, + "step": 2280 + }, + { + "epoch": 0.45596062067414606, + "grad_norm": 2.09375, + "learning_rate": 9.480998841938668e-06, + "loss": 1.0343, + "step": 2281 + }, + { + "epoch": 0.45616051572924216, + "grad_norm": 2.171875, + "learning_rate": 9.480531222665084e-06, + "loss": 1.089, + "step": 2282 + }, + { + "epoch": 0.4563604107843382, + "grad_norm": 2.03125, + "learning_rate": 9.480063404366781e-06, + "loss": 1.1038, + "step": 2283 + }, + { + "epoch": 0.4565603058394343, + "grad_norm": 2.15625, + "learning_rate": 9.479595387064542e-06, + "loss": 1.0458, + "step": 2284 + }, + { + "epoch": 0.4567602008945304, + "grad_norm": 2.3125, + "learning_rate": 9.479127170779151e-06, + "loss": 1.1301, + "step": 2285 + }, + { + "epoch": 0.45696009594962644, + "grad_norm": 2.203125, + "learning_rate": 9.478658755531413e-06, + "loss": 1.1147, + "step": 2286 + }, + { + "epoch": 0.45715999100472254, + "grad_norm": 2.15625, + "learning_rate": 9.47819014134213e-06, + "loss": 1.0767, + "step": 2287 + }, + { + "epoch": 0.4573598860598186, + "grad_norm": 2.171875, + "learning_rate": 9.47772132823212e-06, + "loss": 1.1587, + "step": 2288 + }, + { + "epoch": 0.4575597811149147, + "grad_norm": 2.09375, + "learning_rate": 9.477252316222204e-06, + "loss": 1.1138, + "step": 2289 + }, + { + "epoch": 0.45775967617001073, + "grad_norm": 2.125, + "learning_rate": 9.47678310533322e-06, + "loss": 0.9616, + "step": 2290 + }, + { + "epoch": 0.45795957122510683, + "grad_norm": 2.015625, + "learning_rate": 9.47631369558601e-06, + "loss": 1.029, + "step": 2291 + }, + { + "epoch": 0.4581594662802029, + "grad_norm": 2.265625, + "learning_rate": 9.475844087001423e-06, + "loss": 1.0689, + "step": 2292 + }, + { + "epoch": 0.458359361335299, + "grad_norm": 2.25, + "learning_rate": 9.475374279600317e-06, + "loss": 1.1476, + "step": 2293 + }, + { + "epoch": 0.458559256390395, + "grad_norm": 2.21875, + "learning_rate": 9.474904273403567e-06, + "loss": 1.0776, + "step": 2294 + }, + { + "epoch": 0.4587591514454911, + "grad_norm": 2.15625, + "learning_rate": 9.474434068432046e-06, + "loss": 1.0983, + "step": 2295 + }, + { + "epoch": 0.45895904650058716, + "grad_norm": 2.234375, + "learning_rate": 9.47396366470664e-06, + "loss": 1.0944, + "step": 2296 + }, + { + "epoch": 0.45915894155568326, + "grad_norm": 2.109375, + "learning_rate": 9.473493062248247e-06, + "loss": 1.0984, + "step": 2297 + }, + { + "epoch": 0.45935883661077936, + "grad_norm": 2.09375, + "learning_rate": 9.473022261077771e-06, + "loss": 0.881, + "step": 2298 + }, + { + "epoch": 0.4595587316658754, + "grad_norm": 1.8984375, + "learning_rate": 9.472551261216124e-06, + "loss": 0.9679, + "step": 2299 + }, + { + "epoch": 0.4597586267209715, + "grad_norm": 2.265625, + "learning_rate": 9.472080062684225e-06, + "loss": 1.0331, + "step": 2300 + }, + { + "epoch": 0.45995852177606755, + "grad_norm": 2.171875, + "learning_rate": 9.471608665503008e-06, + "loss": 1.1127, + "step": 2301 + }, + { + "epoch": 0.46015841683116365, + "grad_norm": 2.03125, + "learning_rate": 9.471137069693415e-06, + "loss": 1.1487, + "step": 2302 + }, + { + "epoch": 0.4603583118862597, + "grad_norm": 2.09375, + "learning_rate": 9.470665275276387e-06, + "loss": 1.0939, + "step": 2303 + }, + { + "epoch": 0.4605582069413558, + "grad_norm": 2.171875, + "learning_rate": 9.470193282272886e-06, + "loss": 1.0531, + "step": 2304 + }, + { + "epoch": 0.46075810199645184, + "grad_norm": 2.15625, + "learning_rate": 9.469721090703879e-06, + "loss": 1.0568, + "step": 2305 + }, + { + "epoch": 0.46095799705154794, + "grad_norm": 2.15625, + "learning_rate": 9.469248700590336e-06, + "loss": 1.0505, + "step": 2306 + }, + { + "epoch": 0.461157892106644, + "grad_norm": 2.09375, + "learning_rate": 9.468776111953243e-06, + "loss": 1.0995, + "step": 2307 + }, + { + "epoch": 0.4613577871617401, + "grad_norm": 2.1875, + "learning_rate": 9.468303324813595e-06, + "loss": 1.1443, + "step": 2308 + }, + { + "epoch": 0.4615576822168362, + "grad_norm": 2.15625, + "learning_rate": 9.467830339192387e-06, + "loss": 1.101, + "step": 2309 + }, + { + "epoch": 0.46175757727193223, + "grad_norm": 2.09375, + "learning_rate": 9.467357155110636e-06, + "loss": 1.08, + "step": 2310 + }, + { + "epoch": 0.46195747232702833, + "grad_norm": 2.234375, + "learning_rate": 9.466883772589355e-06, + "loss": 1.0662, + "step": 2311 + }, + { + "epoch": 0.4621573673821244, + "grad_norm": 2.171875, + "learning_rate": 9.466410191649575e-06, + "loss": 1.0828, + "step": 2312 + }, + { + "epoch": 0.4623572624372205, + "grad_norm": 2.125, + "learning_rate": 9.46593641231233e-06, + "loss": 1.0318, + "step": 2313 + }, + { + "epoch": 0.4625571574923165, + "grad_norm": 2.109375, + "learning_rate": 9.465462434598669e-06, + "loss": 1.0425, + "step": 2314 + }, + { + "epoch": 0.4627570525474126, + "grad_norm": 2.03125, + "learning_rate": 9.464988258529642e-06, + "loss": 1.0506, + "step": 2315 + }, + { + "epoch": 0.46295694760250866, + "grad_norm": 2.125, + "learning_rate": 9.464513884126312e-06, + "loss": 1.1342, + "step": 2316 + }, + { + "epoch": 0.46315684265760476, + "grad_norm": 2.015625, + "learning_rate": 9.464039311409753e-06, + "loss": 1.0277, + "step": 2317 + }, + { + "epoch": 0.4633567377127008, + "grad_norm": 2.0625, + "learning_rate": 9.463564540401046e-06, + "loss": 1.0571, + "step": 2318 + }, + { + "epoch": 0.4635566327677969, + "grad_norm": 2.15625, + "learning_rate": 9.463089571121278e-06, + "loss": 1.1149, + "step": 2319 + }, + { + "epoch": 0.463756527822893, + "grad_norm": 2.078125, + "learning_rate": 9.462614403591548e-06, + "loss": 1.0607, + "step": 2320 + }, + { + "epoch": 0.46395642287798905, + "grad_norm": 2.0625, + "learning_rate": 9.462139037832963e-06, + "loss": 1.0649, + "step": 2321 + }, + { + "epoch": 0.46415631793308515, + "grad_norm": 2.375, + "learning_rate": 9.461663473866638e-06, + "loss": 1.1467, + "step": 2322 + }, + { + "epoch": 0.4643562129881812, + "grad_norm": 2.140625, + "learning_rate": 9.461187711713697e-06, + "loss": 1.1511, + "step": 2323 + }, + { + "epoch": 0.4645561080432773, + "grad_norm": 2.1875, + "learning_rate": 9.460711751395276e-06, + "loss": 1.0512, + "step": 2324 + }, + { + "epoch": 0.46475600309837334, + "grad_norm": 2.125, + "learning_rate": 9.460235592932515e-06, + "loss": 1.0769, + "step": 2325 + }, + { + "epoch": 0.46495589815346944, + "grad_norm": 2.140625, + "learning_rate": 9.459759236346565e-06, + "loss": 1.1492, + "step": 2326 + }, + { + "epoch": 0.4651557932085655, + "grad_norm": 2.140625, + "learning_rate": 9.459282681658585e-06, + "loss": 1.0587, + "step": 2327 + }, + { + "epoch": 0.4653556882636616, + "grad_norm": 2.140625, + "learning_rate": 9.458805928889747e-06, + "loss": 1.0467, + "step": 2328 + }, + { + "epoch": 0.4655555833187576, + "grad_norm": 2.296875, + "learning_rate": 9.458328978061225e-06, + "loss": 1.0781, + "step": 2329 + }, + { + "epoch": 0.4657554783738537, + "grad_norm": 2.21875, + "learning_rate": 9.457851829194205e-06, + "loss": 1.0948, + "step": 2330 + }, + { + "epoch": 0.4659553734289498, + "grad_norm": 2.171875, + "learning_rate": 9.457374482309885e-06, + "loss": 1.141, + "step": 2331 + }, + { + "epoch": 0.46615526848404587, + "grad_norm": 2.15625, + "learning_rate": 9.456896937429465e-06, + "loss": 0.9686, + "step": 2332 + }, + { + "epoch": 0.46635516353914197, + "grad_norm": 2.125, + "learning_rate": 9.456419194574158e-06, + "loss": 1.0933, + "step": 2333 + }, + { + "epoch": 0.466555058594238, + "grad_norm": 2.125, + "learning_rate": 9.455941253765188e-06, + "loss": 1.048, + "step": 2334 + }, + { + "epoch": 0.4667549536493341, + "grad_norm": 2.140625, + "learning_rate": 9.455463115023783e-06, + "loss": 1.105, + "step": 2335 + }, + { + "epoch": 0.46695484870443016, + "grad_norm": 1.9453125, + "learning_rate": 9.45498477837118e-06, + "loss": 1.0048, + "step": 2336 + }, + { + "epoch": 0.46715474375952626, + "grad_norm": 2.171875, + "learning_rate": 9.454506243828633e-06, + "loss": 0.9989, + "step": 2337 + }, + { + "epoch": 0.4673546388146223, + "grad_norm": 2.25, + "learning_rate": 9.454027511417392e-06, + "loss": 1.0726, + "step": 2338 + }, + { + "epoch": 0.4675545338697184, + "grad_norm": 2.046875, + "learning_rate": 9.453548581158726e-06, + "loss": 1.1318, + "step": 2339 + }, + { + "epoch": 0.46775442892481445, + "grad_norm": 2.046875, + "learning_rate": 9.453069453073906e-06, + "loss": 1.064, + "step": 2340 + }, + { + "epoch": 0.46795432397991055, + "grad_norm": 2.015625, + "learning_rate": 9.452590127184217e-06, + "loss": 1.0699, + "step": 2341 + }, + { + "epoch": 0.46815421903500665, + "grad_norm": 2.09375, + "learning_rate": 9.45211060351095e-06, + "loss": 1.0564, + "step": 2342 + }, + { + "epoch": 0.4683541140901027, + "grad_norm": 2.1875, + "learning_rate": 9.451630882075407e-06, + "loss": 1.086, + "step": 2343 + }, + { + "epoch": 0.4685540091451988, + "grad_norm": 2.21875, + "learning_rate": 9.451150962898894e-06, + "loss": 1.1009, + "step": 2344 + }, + { + "epoch": 0.46875390420029484, + "grad_norm": 2.21875, + "learning_rate": 9.450670846002732e-06, + "loss": 1.1218, + "step": 2345 + }, + { + "epoch": 0.46895379925539094, + "grad_norm": 2.203125, + "learning_rate": 9.450190531408245e-06, + "loss": 1.067, + "step": 2346 + }, + { + "epoch": 0.469153694310487, + "grad_norm": 2.0625, + "learning_rate": 9.44971001913677e-06, + "loss": 1.0177, + "step": 2347 + }, + { + "epoch": 0.4693535893655831, + "grad_norm": 2.03125, + "learning_rate": 9.449229309209654e-06, + "loss": 0.9842, + "step": 2348 + }, + { + "epoch": 0.4695534844206791, + "grad_norm": 2.25, + "learning_rate": 9.448748401648244e-06, + "loss": 1.0315, + "step": 2349 + }, + { + "epoch": 0.4697533794757752, + "grad_norm": 2.234375, + "learning_rate": 9.448267296473905e-06, + "loss": 1.1507, + "step": 2350 + }, + { + "epoch": 0.46995327453087127, + "grad_norm": 1.9609375, + "learning_rate": 9.44778599370801e-06, + "loss": 1.0273, + "step": 2351 + }, + { + "epoch": 0.47015316958596737, + "grad_norm": 2.125, + "learning_rate": 9.447304493371934e-06, + "loss": 1.0988, + "step": 2352 + }, + { + "epoch": 0.47035306464106347, + "grad_norm": 2.03125, + "learning_rate": 9.44682279548707e-06, + "loss": 1.1306, + "step": 2353 + }, + { + "epoch": 0.4705529596961595, + "grad_norm": 2.015625, + "learning_rate": 9.44634090007481e-06, + "loss": 0.9867, + "step": 2354 + }, + { + "epoch": 0.4707528547512556, + "grad_norm": 2.046875, + "learning_rate": 9.445858807156563e-06, + "loss": 1.0614, + "step": 2355 + }, + { + "epoch": 0.47095274980635166, + "grad_norm": 2.015625, + "learning_rate": 9.445376516753743e-06, + "loss": 0.9955, + "step": 2356 + }, + { + "epoch": 0.47115264486144776, + "grad_norm": 2.1875, + "learning_rate": 9.444894028887773e-06, + "loss": 1.0919, + "step": 2357 + }, + { + "epoch": 0.4713525399165438, + "grad_norm": 2.078125, + "learning_rate": 9.444411343580083e-06, + "loss": 0.9668, + "step": 2358 + }, + { + "epoch": 0.4715524349716399, + "grad_norm": 2.015625, + "learning_rate": 9.443928460852118e-06, + "loss": 1.089, + "step": 2359 + }, + { + "epoch": 0.47175233002673594, + "grad_norm": 2.375, + "learning_rate": 9.443445380725324e-06, + "loss": 1.1675, + "step": 2360 + }, + { + "epoch": 0.47195222508183204, + "grad_norm": 2.203125, + "learning_rate": 9.442962103221161e-06, + "loss": 1.0798, + "step": 2361 + }, + { + "epoch": 0.4721521201369281, + "grad_norm": 2.109375, + "learning_rate": 9.442478628361098e-06, + "loss": 1.0142, + "step": 2362 + }, + { + "epoch": 0.4723520151920242, + "grad_norm": 2.09375, + "learning_rate": 9.441994956166607e-06, + "loss": 1.0302, + "step": 2363 + }, + { + "epoch": 0.4725519102471203, + "grad_norm": 2.03125, + "learning_rate": 9.441511086659175e-06, + "loss": 1.0224, + "step": 2364 + }, + { + "epoch": 0.47275180530221633, + "grad_norm": 2.1875, + "learning_rate": 9.441027019860294e-06, + "loss": 1.013, + "step": 2365 + }, + { + "epoch": 0.47295170035731243, + "grad_norm": 2.234375, + "learning_rate": 9.440542755791467e-06, + "loss": 1.1531, + "step": 2366 + }, + { + "epoch": 0.4731515954124085, + "grad_norm": 2.140625, + "learning_rate": 9.440058294474206e-06, + "loss": 1.1263, + "step": 2367 + }, + { + "epoch": 0.4733514904675046, + "grad_norm": 2.171875, + "learning_rate": 9.439573635930029e-06, + "loss": 1.2652, + "step": 2368 + }, + { + "epoch": 0.4735513855226006, + "grad_norm": 2.03125, + "learning_rate": 9.439088780180465e-06, + "loss": 1.0479, + "step": 2369 + }, + { + "epoch": 0.4737512805776967, + "grad_norm": 2.015625, + "learning_rate": 9.438603727247053e-06, + "loss": 0.9835, + "step": 2370 + }, + { + "epoch": 0.47395117563279276, + "grad_norm": 2.15625, + "learning_rate": 9.438118477151336e-06, + "loss": 1.105, + "step": 2371 + }, + { + "epoch": 0.47415107068788886, + "grad_norm": 2.109375, + "learning_rate": 9.43763302991487e-06, + "loss": 1.094, + "step": 2372 + }, + { + "epoch": 0.4743509657429849, + "grad_norm": 2.1875, + "learning_rate": 9.43714738555922e-06, + "loss": 1.0631, + "step": 2373 + }, + { + "epoch": 0.474550860798081, + "grad_norm": 2.0625, + "learning_rate": 9.436661544105958e-06, + "loss": 1.0402, + "step": 2374 + }, + { + "epoch": 0.4747507558531771, + "grad_norm": 2.109375, + "learning_rate": 9.436175505576663e-06, + "loss": 1.0733, + "step": 2375 + }, + { + "epoch": 0.47495065090827315, + "grad_norm": 2.046875, + "learning_rate": 9.435689269992924e-06, + "loss": 0.9708, + "step": 2376 + }, + { + "epoch": 0.47515054596336925, + "grad_norm": 2.21875, + "learning_rate": 9.435202837376344e-06, + "loss": 1.0567, + "step": 2377 + }, + { + "epoch": 0.4753504410184653, + "grad_norm": 2.078125, + "learning_rate": 9.434716207748527e-06, + "loss": 1.0168, + "step": 2378 + }, + { + "epoch": 0.4755503360735614, + "grad_norm": 2.140625, + "learning_rate": 9.434229381131088e-06, + "loss": 1.0316, + "step": 2379 + }, + { + "epoch": 0.47575023112865744, + "grad_norm": 2.078125, + "learning_rate": 9.433742357545655e-06, + "loss": 1.0635, + "step": 2380 + }, + { + "epoch": 0.47595012618375354, + "grad_norm": 2.0625, + "learning_rate": 9.433255137013861e-06, + "loss": 1.0482, + "step": 2381 + }, + { + "epoch": 0.4761500212388496, + "grad_norm": 2.15625, + "learning_rate": 9.432767719557345e-06, + "loss": 1.0387, + "step": 2382 + }, + { + "epoch": 0.4763499162939457, + "grad_norm": 2.140625, + "learning_rate": 9.432280105197761e-06, + "loss": 1.0403, + "step": 2383 + }, + { + "epoch": 0.47654981134904173, + "grad_norm": 2.328125, + "learning_rate": 9.43179229395677e-06, + "loss": 1.186, + "step": 2384 + }, + { + "epoch": 0.47674970640413783, + "grad_norm": 2.125, + "learning_rate": 9.431304285856037e-06, + "loss": 1.0996, + "step": 2385 + }, + { + "epoch": 0.47694960145923393, + "grad_norm": 2.15625, + "learning_rate": 9.43081608091724e-06, + "loss": 1.0019, + "step": 2386 + }, + { + "epoch": 0.47714949651433, + "grad_norm": 2.0625, + "learning_rate": 9.430327679162068e-06, + "loss": 1.0486, + "step": 2387 + }, + { + "epoch": 0.4773493915694261, + "grad_norm": 2.03125, + "learning_rate": 9.429839080612213e-06, + "loss": 1.0941, + "step": 2388 + }, + { + "epoch": 0.4775492866245221, + "grad_norm": 2.0625, + "learning_rate": 9.429350285289378e-06, + "loss": 1.1118, + "step": 2389 + }, + { + "epoch": 0.4777491816796182, + "grad_norm": 2.21875, + "learning_rate": 9.428861293215278e-06, + "loss": 1.0182, + "step": 2390 + }, + { + "epoch": 0.47794907673471426, + "grad_norm": 2.234375, + "learning_rate": 9.428372104411632e-06, + "loss": 1.0864, + "step": 2391 + }, + { + "epoch": 0.47814897178981036, + "grad_norm": 2.09375, + "learning_rate": 9.427882718900168e-06, + "loss": 1.0834, + "step": 2392 + }, + { + "epoch": 0.4783488668449064, + "grad_norm": 2.078125, + "learning_rate": 9.42739313670263e-06, + "loss": 1.0204, + "step": 2393 + }, + { + "epoch": 0.4785487619000025, + "grad_norm": 2.0625, + "learning_rate": 9.42690335784076e-06, + "loss": 0.9799, + "step": 2394 + }, + { + "epoch": 0.47874865695509855, + "grad_norm": 2.25, + "learning_rate": 9.426413382336314e-06, + "loss": 1.16, + "step": 2395 + }, + { + "epoch": 0.47894855201019465, + "grad_norm": 2.1875, + "learning_rate": 9.42592321021106e-06, + "loss": 1.1539, + "step": 2396 + }, + { + "epoch": 0.4791484470652907, + "grad_norm": 2.078125, + "learning_rate": 9.42543284148677e-06, + "loss": 1.108, + "step": 2397 + }, + { + "epoch": 0.4793483421203868, + "grad_norm": 2.09375, + "learning_rate": 9.424942276185226e-06, + "loss": 1.0172, + "step": 2398 + }, + { + "epoch": 0.4795482371754829, + "grad_norm": 2.109375, + "learning_rate": 9.424451514328218e-06, + "loss": 1.0551, + "step": 2399 + }, + { + "epoch": 0.47974813223057894, + "grad_norm": 2.171875, + "learning_rate": 9.423960555937546e-06, + "loss": 1.054, + "step": 2400 + }, + { + "epoch": 0.47994802728567504, + "grad_norm": 1.953125, + "learning_rate": 9.423469401035019e-06, + "loss": 0.9714, + "step": 2401 + }, + { + "epoch": 0.4801479223407711, + "grad_norm": 2.15625, + "learning_rate": 9.422978049642456e-06, + "loss": 1.084, + "step": 2402 + }, + { + "epoch": 0.4803478173958672, + "grad_norm": 2.171875, + "learning_rate": 9.422486501781676e-06, + "loss": 1.0797, + "step": 2403 + }, + { + "epoch": 0.4805477124509632, + "grad_norm": 2.078125, + "learning_rate": 9.421994757474522e-06, + "loss": 1.0522, + "step": 2404 + }, + { + "epoch": 0.4807476075060593, + "grad_norm": 2.171875, + "learning_rate": 9.421502816742829e-06, + "loss": 1.1385, + "step": 2405 + }, + { + "epoch": 0.48094750256115537, + "grad_norm": 2.0625, + "learning_rate": 9.421010679608455e-06, + "loss": 1.0394, + "step": 2406 + }, + { + "epoch": 0.48114739761625147, + "grad_norm": 2.09375, + "learning_rate": 9.42051834609326e-06, + "loss": 1.0059, + "step": 2407 + }, + { + "epoch": 0.4813472926713475, + "grad_norm": 2.078125, + "learning_rate": 9.420025816219111e-06, + "loss": 1.1555, + "step": 2408 + }, + { + "epoch": 0.4815471877264436, + "grad_norm": 2.140625, + "learning_rate": 9.419533090007888e-06, + "loss": 1.0863, + "step": 2409 + }, + { + "epoch": 0.4817470827815397, + "grad_norm": 2.1875, + "learning_rate": 9.419040167481477e-06, + "loss": 1.1842, + "step": 2410 + }, + { + "epoch": 0.48194697783663576, + "grad_norm": 2.15625, + "learning_rate": 9.418547048661772e-06, + "loss": 1.1178, + "step": 2411 + }, + { + "epoch": 0.48214687289173186, + "grad_norm": 2.109375, + "learning_rate": 9.418053733570682e-06, + "loss": 1.0866, + "step": 2412 + }, + { + "epoch": 0.4823467679468279, + "grad_norm": 2.078125, + "learning_rate": 9.417560222230115e-06, + "loss": 1.1088, + "step": 2413 + }, + { + "epoch": 0.482546663001924, + "grad_norm": 2.15625, + "learning_rate": 9.417066514661995e-06, + "loss": 1.0889, + "step": 2414 + }, + { + "epoch": 0.48274655805702005, + "grad_norm": 2.125, + "learning_rate": 9.416572610888253e-06, + "loss": 1.0295, + "step": 2415 + }, + { + "epoch": 0.48294645311211615, + "grad_norm": 2.0, + "learning_rate": 9.416078510930827e-06, + "loss": 1.0452, + "step": 2416 + }, + { + "epoch": 0.4831463481672122, + "grad_norm": 2.171875, + "learning_rate": 9.415584214811665e-06, + "loss": 1.0287, + "step": 2417 + }, + { + "epoch": 0.4833462432223083, + "grad_norm": 2.109375, + "learning_rate": 9.415089722552723e-06, + "loss": 1.0204, + "step": 2418 + }, + { + "epoch": 0.48354613827740434, + "grad_norm": 2.0625, + "learning_rate": 9.414595034175968e-06, + "loss": 1.031, + "step": 2419 + }, + { + "epoch": 0.48374603333250044, + "grad_norm": 2.1875, + "learning_rate": 9.414100149703373e-06, + "loss": 1.1251, + "step": 2420 + }, + { + "epoch": 0.48394592838759654, + "grad_norm": 1.9921875, + "learning_rate": 9.413605069156921e-06, + "loss": 1.0836, + "step": 2421 + }, + { + "epoch": 0.4841458234426926, + "grad_norm": 2.015625, + "learning_rate": 9.413109792558603e-06, + "loss": 1.0684, + "step": 2422 + }, + { + "epoch": 0.4843457184977887, + "grad_norm": 2.109375, + "learning_rate": 9.41261431993042e-06, + "loss": 1.091, + "step": 2423 + }, + { + "epoch": 0.4845456135528847, + "grad_norm": 2.234375, + "learning_rate": 9.41211865129438e-06, + "loss": 1.1022, + "step": 2424 + }, + { + "epoch": 0.4847455086079808, + "grad_norm": 2.1875, + "learning_rate": 9.411622786672499e-06, + "loss": 1.1228, + "step": 2425 + }, + { + "epoch": 0.48494540366307687, + "grad_norm": 2.171875, + "learning_rate": 9.411126726086807e-06, + "loss": 1.1744, + "step": 2426 + }, + { + "epoch": 0.48514529871817297, + "grad_norm": 2.03125, + "learning_rate": 9.410630469559336e-06, + "loss": 1.0278, + "step": 2427 + }, + { + "epoch": 0.485345193773269, + "grad_norm": 1.96875, + "learning_rate": 9.41013401711213e-06, + "loss": 1.0577, + "step": 2428 + }, + { + "epoch": 0.4855450888283651, + "grad_norm": 2.109375, + "learning_rate": 9.409637368767244e-06, + "loss": 1.0304, + "step": 2429 + }, + { + "epoch": 0.48574498388346116, + "grad_norm": 2.09375, + "learning_rate": 9.409140524546736e-06, + "loss": 0.9207, + "step": 2430 + }, + { + "epoch": 0.48594487893855726, + "grad_norm": 2.109375, + "learning_rate": 9.408643484472676e-06, + "loss": 1.0671, + "step": 2431 + }, + { + "epoch": 0.48614477399365336, + "grad_norm": 2.09375, + "learning_rate": 9.408146248567143e-06, + "loss": 1.044, + "step": 2432 + }, + { + "epoch": 0.4863446690487494, + "grad_norm": 2.25, + "learning_rate": 9.407648816852226e-06, + "loss": 1.0707, + "step": 2433 + }, + { + "epoch": 0.4865445641038455, + "grad_norm": 2.015625, + "learning_rate": 9.407151189350019e-06, + "loss": 1.0925, + "step": 2434 + }, + { + "epoch": 0.48674445915894154, + "grad_norm": 2.125, + "learning_rate": 9.406653366082626e-06, + "loss": 1.0222, + "step": 2435 + }, + { + "epoch": 0.48694435421403764, + "grad_norm": 2.140625, + "learning_rate": 9.406155347072162e-06, + "loss": 1.0815, + "step": 2436 + }, + { + "epoch": 0.4871442492691337, + "grad_norm": 2.25, + "learning_rate": 9.405657132340746e-06, + "loss": 1.0399, + "step": 2437 + }, + { + "epoch": 0.4873441443242298, + "grad_norm": 2.125, + "learning_rate": 9.405158721910514e-06, + "loss": 1.0846, + "step": 2438 + }, + { + "epoch": 0.48754403937932583, + "grad_norm": 2.09375, + "learning_rate": 9.4046601158036e-06, + "loss": 1.0979, + "step": 2439 + }, + { + "epoch": 0.48774393443442193, + "grad_norm": 2.28125, + "learning_rate": 9.404161314042155e-06, + "loss": 1.0644, + "step": 2440 + }, + { + "epoch": 0.487943829489518, + "grad_norm": 2.03125, + "learning_rate": 9.403662316648335e-06, + "loss": 1.0271, + "step": 2441 + }, + { + "epoch": 0.4881437245446141, + "grad_norm": 2.109375, + "learning_rate": 9.403163123644303e-06, + "loss": 1.0608, + "step": 2442 + }, + { + "epoch": 0.4883436195997102, + "grad_norm": 2.125, + "learning_rate": 9.402663735052238e-06, + "loss": 1.0577, + "step": 2443 + }, + { + "epoch": 0.4885435146548062, + "grad_norm": 2.15625, + "learning_rate": 9.402164150894318e-06, + "loss": 1.1031, + "step": 2444 + }, + { + "epoch": 0.4887434097099023, + "grad_norm": 2.296875, + "learning_rate": 9.401664371192738e-06, + "loss": 0.9917, + "step": 2445 + }, + { + "epoch": 0.48894330476499837, + "grad_norm": 2.234375, + "learning_rate": 9.401164395969697e-06, + "loss": 1.1124, + "step": 2446 + }, + { + "epoch": 0.48914319982009447, + "grad_norm": 2.09375, + "learning_rate": 9.400664225247402e-06, + "loss": 1.1387, + "step": 2447 + }, + { + "epoch": 0.4893430948751905, + "grad_norm": 2.09375, + "learning_rate": 9.400163859048073e-06, + "loss": 1.0916, + "step": 2448 + }, + { + "epoch": 0.4895429899302866, + "grad_norm": 2.0625, + "learning_rate": 9.399663297393937e-06, + "loss": 1.0525, + "step": 2449 + }, + { + "epoch": 0.48974288498538265, + "grad_norm": 2.203125, + "learning_rate": 9.399162540307225e-06, + "loss": 1.0592, + "step": 2450 + }, + { + "epoch": 0.48994278004047875, + "grad_norm": 2.046875, + "learning_rate": 9.398661587810183e-06, + "loss": 1.0823, + "step": 2451 + }, + { + "epoch": 0.4901426750955748, + "grad_norm": 2.140625, + "learning_rate": 9.398160439925064e-06, + "loss": 0.9808, + "step": 2452 + }, + { + "epoch": 0.4903425701506709, + "grad_norm": 2.25, + "learning_rate": 9.397659096674128e-06, + "loss": 1.1753, + "step": 2453 + }, + { + "epoch": 0.490542465205767, + "grad_norm": 2.15625, + "learning_rate": 9.397157558079644e-06, + "loss": 1.1111, + "step": 2454 + }, + { + "epoch": 0.49074236026086304, + "grad_norm": 2.15625, + "learning_rate": 9.39665582416389e-06, + "loss": 0.9791, + "step": 2455 + }, + { + "epoch": 0.49094225531595914, + "grad_norm": 2.046875, + "learning_rate": 9.396153894949155e-06, + "loss": 1.0467, + "step": 2456 + }, + { + "epoch": 0.4911421503710552, + "grad_norm": 2.125, + "learning_rate": 9.395651770457735e-06, + "loss": 1.0843, + "step": 2457 + }, + { + "epoch": 0.4913420454261513, + "grad_norm": 2.140625, + "learning_rate": 9.39514945071193e-06, + "loss": 1.0924, + "step": 2458 + }, + { + "epoch": 0.49154194048124733, + "grad_norm": 2.265625, + "learning_rate": 9.394646935734057e-06, + "loss": 1.1448, + "step": 2459 + }, + { + "epoch": 0.49174183553634343, + "grad_norm": 2.25, + "learning_rate": 9.394144225546436e-06, + "loss": 1.0852, + "step": 2460 + }, + { + "epoch": 0.4919417305914395, + "grad_norm": 2.25, + "learning_rate": 9.393641320171398e-06, + "loss": 1.1342, + "step": 2461 + }, + { + "epoch": 0.4921416256465356, + "grad_norm": 2.1875, + "learning_rate": 9.393138219631283e-06, + "loss": 1.0038, + "step": 2462 + }, + { + "epoch": 0.4923415207016316, + "grad_norm": 2.125, + "learning_rate": 9.392634923948437e-06, + "loss": 0.9803, + "step": 2463 + }, + { + "epoch": 0.4925414157567277, + "grad_norm": 2.125, + "learning_rate": 9.392131433145216e-06, + "loss": 1.0426, + "step": 2464 + }, + { + "epoch": 0.4927413108118238, + "grad_norm": 2.109375, + "learning_rate": 9.391627747243986e-06, + "loss": 1.0738, + "step": 2465 + }, + { + "epoch": 0.49294120586691986, + "grad_norm": 2.0, + "learning_rate": 9.391123866267121e-06, + "loss": 0.9984, + "step": 2466 + }, + { + "epoch": 0.49314110092201596, + "grad_norm": 2.125, + "learning_rate": 9.390619790237003e-06, + "loss": 1.0881, + "step": 2467 + }, + { + "epoch": 0.493340995977112, + "grad_norm": 2.078125, + "learning_rate": 9.390115519176022e-06, + "loss": 1.0613, + "step": 2468 + }, + { + "epoch": 0.4935408910322081, + "grad_norm": 2.109375, + "learning_rate": 9.38961105310658e-06, + "loss": 1.0991, + "step": 2469 + }, + { + "epoch": 0.49374078608730415, + "grad_norm": 2.1875, + "learning_rate": 9.389106392051083e-06, + "loss": 1.0358, + "step": 2470 + }, + { + "epoch": 0.49394068114240025, + "grad_norm": 2.125, + "learning_rate": 9.388601536031949e-06, + "loss": 1.0046, + "step": 2471 + }, + { + "epoch": 0.4941405761974963, + "grad_norm": 2.171875, + "learning_rate": 9.388096485071603e-06, + "loss": 1.0651, + "step": 2472 + }, + { + "epoch": 0.4943404712525924, + "grad_norm": 2.046875, + "learning_rate": 9.387591239192479e-06, + "loss": 1.0863, + "step": 2473 + }, + { + "epoch": 0.49454036630768844, + "grad_norm": 2.265625, + "learning_rate": 9.387085798417021e-06, + "loss": 1.1749, + "step": 2474 + }, + { + "epoch": 0.49474026136278454, + "grad_norm": 1.9765625, + "learning_rate": 9.386580162767682e-06, + "loss": 1.025, + "step": 2475 + }, + { + "epoch": 0.49494015641788064, + "grad_norm": 2.0625, + "learning_rate": 9.38607433226692e-06, + "loss": 1.0639, + "step": 2476 + }, + { + "epoch": 0.4951400514729767, + "grad_norm": 2.265625, + "learning_rate": 9.385568306937204e-06, + "loss": 1.1536, + "step": 2477 + }, + { + "epoch": 0.4953399465280728, + "grad_norm": 2.03125, + "learning_rate": 9.385062086801013e-06, + "loss": 1.0838, + "step": 2478 + }, + { + "epoch": 0.4955398415831688, + "grad_norm": 1.9765625, + "learning_rate": 9.384555671880834e-06, + "loss": 1.0367, + "step": 2479 + }, + { + "epoch": 0.4957397366382649, + "grad_norm": 1.984375, + "learning_rate": 9.384049062199157e-06, + "loss": 1.0263, + "step": 2480 + }, + { + "epoch": 0.49593963169336097, + "grad_norm": 1.9375, + "learning_rate": 9.383542257778491e-06, + "loss": 0.9039, + "step": 2481 + }, + { + "epoch": 0.49613952674845707, + "grad_norm": 2.15625, + "learning_rate": 9.383035258641345e-06, + "loss": 1.0273, + "step": 2482 + }, + { + "epoch": 0.4963394218035531, + "grad_norm": 2.109375, + "learning_rate": 9.382528064810242e-06, + "loss": 1.0644, + "step": 2483 + }, + { + "epoch": 0.4965393168586492, + "grad_norm": 2.09375, + "learning_rate": 9.382020676307708e-06, + "loss": 1.1447, + "step": 2484 + }, + { + "epoch": 0.49673921191374526, + "grad_norm": 2.25, + "learning_rate": 9.381513093156286e-06, + "loss": 1.0842, + "step": 2485 + }, + { + "epoch": 0.49693910696884136, + "grad_norm": 2.265625, + "learning_rate": 9.381005315378519e-06, + "loss": 1.1256, + "step": 2486 + }, + { + "epoch": 0.49713900202393746, + "grad_norm": 2.09375, + "learning_rate": 9.380497342996966e-06, + "loss": 1.0525, + "step": 2487 + }, + { + "epoch": 0.4973388970790335, + "grad_norm": 2.046875, + "learning_rate": 9.379989176034187e-06, + "loss": 1.0753, + "step": 2488 + }, + { + "epoch": 0.4975387921341296, + "grad_norm": 2.140625, + "learning_rate": 9.379480814512756e-06, + "loss": 1.0823, + "step": 2489 + }, + { + "epoch": 0.49773868718922565, + "grad_norm": 2.125, + "learning_rate": 9.378972258455256e-06, + "loss": 1.0258, + "step": 2490 + }, + { + "epoch": 0.49793858224432175, + "grad_norm": 2.09375, + "learning_rate": 9.378463507884276e-06, + "loss": 1.1388, + "step": 2491 + }, + { + "epoch": 0.4981384772994178, + "grad_norm": 2.140625, + "learning_rate": 9.377954562822416e-06, + "loss": 1.1564, + "step": 2492 + }, + { + "epoch": 0.4983383723545139, + "grad_norm": 2.21875, + "learning_rate": 9.37744542329228e-06, + "loss": 1.1278, + "step": 2493 + }, + { + "epoch": 0.49853826740960994, + "grad_norm": 2.046875, + "learning_rate": 9.376936089316487e-06, + "loss": 1.0647, + "step": 2494 + }, + { + "epoch": 0.49873816246470604, + "grad_norm": 2.1875, + "learning_rate": 9.376426560917659e-06, + "loss": 1.1887, + "step": 2495 + }, + { + "epoch": 0.4989380575198021, + "grad_norm": 2.03125, + "learning_rate": 9.37591683811843e-06, + "loss": 1.0494, + "step": 2496 + }, + { + "epoch": 0.4991379525748982, + "grad_norm": 2.078125, + "learning_rate": 9.375406920941444e-06, + "loss": 1.09, + "step": 2497 + }, + { + "epoch": 0.4993378476299943, + "grad_norm": 2.03125, + "learning_rate": 9.37489680940935e-06, + "loss": 1.0159, + "step": 2498 + }, + { + "epoch": 0.4995377426850903, + "grad_norm": 2.09375, + "learning_rate": 9.374386503544805e-06, + "loss": 1.0739, + "step": 2499 + }, + { + "epoch": 0.4997376377401864, + "grad_norm": 2.03125, + "learning_rate": 9.37387600337048e-06, + "loss": 1.0217, + "step": 2500 + }, + { + "epoch": 0.49993753279528247, + "grad_norm": 2.109375, + "learning_rate": 9.373365308909052e-06, + "loss": 1.0345, + "step": 2501 + }, + { + "epoch": 0.5001374278503785, + "grad_norm": 2.1875, + "learning_rate": 9.372854420183201e-06, + "loss": 1.0266, + "step": 2502 + }, + { + "epoch": 0.5003373229054746, + "grad_norm": 2.265625, + "learning_rate": 9.372343337215627e-06, + "loss": 1.1541, + "step": 2503 + }, + { + "epoch": 0.5005372179605707, + "grad_norm": 2.109375, + "learning_rate": 9.371832060029027e-06, + "loss": 1.0033, + "step": 2504 + }, + { + "epoch": 0.5007371130156668, + "grad_norm": 2.03125, + "learning_rate": 9.371320588646113e-06, + "loss": 1.0522, + "step": 2505 + }, + { + "epoch": 0.5009370080707628, + "grad_norm": 2.1875, + "learning_rate": 9.370808923089606e-06, + "loss": 1.1124, + "step": 2506 + }, + { + "epoch": 0.5011369031258589, + "grad_norm": 1.953125, + "learning_rate": 9.370297063382235e-06, + "loss": 1.0216, + "step": 2507 + }, + { + "epoch": 0.501336798180955, + "grad_norm": 2.046875, + "learning_rate": 9.369785009546732e-06, + "loss": 1.0626, + "step": 2508 + }, + { + "epoch": 0.5015366932360511, + "grad_norm": 2.140625, + "learning_rate": 9.369272761605848e-06, + "loss": 1.076, + "step": 2509 + }, + { + "epoch": 0.5017365882911472, + "grad_norm": 2.15625, + "learning_rate": 9.368760319582334e-06, + "loss": 1.0161, + "step": 2510 + }, + { + "epoch": 0.5019364833462432, + "grad_norm": 2.140625, + "learning_rate": 9.368247683498952e-06, + "loss": 1.1571, + "step": 2511 + }, + { + "epoch": 0.5021363784013393, + "grad_norm": 2.34375, + "learning_rate": 9.367734853378476e-06, + "loss": 1.0536, + "step": 2512 + }, + { + "epoch": 0.5023362734564354, + "grad_norm": 2.015625, + "learning_rate": 9.367221829243685e-06, + "loss": 0.9218, + "step": 2513 + }, + { + "epoch": 0.5025361685115315, + "grad_norm": 2.40625, + "learning_rate": 9.366708611117366e-06, + "loss": 0.9943, + "step": 2514 + }, + { + "epoch": 0.5027360635666275, + "grad_norm": 2.0625, + "learning_rate": 9.366195199022315e-06, + "loss": 1.0951, + "step": 2515 + }, + { + "epoch": 0.5029359586217236, + "grad_norm": 2.15625, + "learning_rate": 9.365681592981341e-06, + "loss": 1.0603, + "step": 2516 + }, + { + "epoch": 0.5031358536768197, + "grad_norm": 2.203125, + "learning_rate": 9.365167793017258e-06, + "loss": 1.0842, + "step": 2517 + }, + { + "epoch": 0.5033357487319158, + "grad_norm": 2.140625, + "learning_rate": 9.364653799152887e-06, + "loss": 1.1236, + "step": 2518 + }, + { + "epoch": 0.5035356437870118, + "grad_norm": 2.078125, + "learning_rate": 9.36413961141106e-06, + "loss": 0.9942, + "step": 2519 + }, + { + "epoch": 0.5037355388421079, + "grad_norm": 2.203125, + "learning_rate": 9.363625229814617e-06, + "loss": 1.0299, + "step": 2520 + }, + { + "epoch": 0.503935433897204, + "grad_norm": 2.0625, + "learning_rate": 9.363110654386409e-06, + "loss": 0.9862, + "step": 2521 + }, + { + "epoch": 0.5041353289523001, + "grad_norm": 2.03125, + "learning_rate": 9.36259588514929e-06, + "loss": 1.0143, + "step": 2522 + }, + { + "epoch": 0.5043352240073962, + "grad_norm": 2.09375, + "learning_rate": 9.36208092212613e-06, + "loss": 1.0184, + "step": 2523 + }, + { + "epoch": 0.5045351190624922, + "grad_norm": 2.015625, + "learning_rate": 9.361565765339799e-06, + "loss": 1.0772, + "step": 2524 + }, + { + "epoch": 0.5047350141175883, + "grad_norm": 2.125, + "learning_rate": 9.361050414813184e-06, + "loss": 1.0264, + "step": 2525 + }, + { + "epoch": 0.5049349091726844, + "grad_norm": 2.15625, + "learning_rate": 9.360534870569175e-06, + "loss": 1.0159, + "step": 2526 + }, + { + "epoch": 0.5051348042277805, + "grad_norm": 2.21875, + "learning_rate": 9.360019132630672e-06, + "loss": 1.1623, + "step": 2527 + }, + { + "epoch": 0.5053346992828764, + "grad_norm": 2.140625, + "learning_rate": 9.359503201020587e-06, + "loss": 0.9668, + "step": 2528 + }, + { + "epoch": 0.5055345943379725, + "grad_norm": 2.015625, + "learning_rate": 9.358987075761834e-06, + "loss": 1.0134, + "step": 2529 + }, + { + "epoch": 0.5057344893930686, + "grad_norm": 2.203125, + "learning_rate": 9.35847075687734e-06, + "loss": 1.116, + "step": 2530 + }, + { + "epoch": 0.5059343844481647, + "grad_norm": 1.96875, + "learning_rate": 9.35795424439004e-06, + "loss": 1.011, + "step": 2531 + }, + { + "epoch": 0.5061342795032608, + "grad_norm": 2.1875, + "learning_rate": 9.35743753832288e-06, + "loss": 1.0904, + "step": 2532 + }, + { + "epoch": 0.5063341745583568, + "grad_norm": 2.125, + "learning_rate": 9.356920638698809e-06, + "loss": 1.0521, + "step": 2533 + }, + { + "epoch": 0.5065340696134529, + "grad_norm": 2.046875, + "learning_rate": 9.35640354554079e-06, + "loss": 0.9193, + "step": 2534 + }, + { + "epoch": 0.506733964668549, + "grad_norm": 2.078125, + "learning_rate": 9.355886258871786e-06, + "loss": 1.0674, + "step": 2535 + }, + { + "epoch": 0.5069338597236451, + "grad_norm": 2.0, + "learning_rate": 9.355368778714784e-06, + "loss": 1.0886, + "step": 2536 + }, + { + "epoch": 0.5071337547787411, + "grad_norm": 2.078125, + "learning_rate": 9.354851105092765e-06, + "loss": 1.099, + "step": 2537 + }, + { + "epoch": 0.5073336498338372, + "grad_norm": 2.109375, + "learning_rate": 9.354333238028726e-06, + "loss": 1.1305, + "step": 2538 + }, + { + "epoch": 0.5075335448889333, + "grad_norm": 2.0625, + "learning_rate": 9.353815177545666e-06, + "loss": 1.0836, + "step": 2539 + }, + { + "epoch": 0.5077334399440294, + "grad_norm": 2.140625, + "learning_rate": 9.353296923666605e-06, + "loss": 1.0657, + "step": 2540 + }, + { + "epoch": 0.5079333349991254, + "grad_norm": 1.96875, + "learning_rate": 9.352778476414556e-06, + "loss": 1.0458, + "step": 2541 + }, + { + "epoch": 0.5081332300542215, + "grad_norm": 2.046875, + "learning_rate": 9.352259835812556e-06, + "loss": 1.059, + "step": 2542 + }, + { + "epoch": 0.5083331251093176, + "grad_norm": 2.140625, + "learning_rate": 9.351741001883636e-06, + "loss": 1.1243, + "step": 2543 + }, + { + "epoch": 0.5085330201644137, + "grad_norm": 2.296875, + "learning_rate": 9.351221974650846e-06, + "loss": 1.1725, + "step": 2544 + }, + { + "epoch": 0.5087329152195098, + "grad_norm": 2.0, + "learning_rate": 9.350702754137242e-06, + "loss": 1.0211, + "step": 2545 + }, + { + "epoch": 0.5089328102746058, + "grad_norm": 2.34375, + "learning_rate": 9.350183340365884e-06, + "loss": 1.0711, + "step": 2546 + }, + { + "epoch": 0.5091327053297019, + "grad_norm": 1.984375, + "learning_rate": 9.349663733359848e-06, + "loss": 1.0382, + "step": 2547 + }, + { + "epoch": 0.509332600384798, + "grad_norm": 2.1875, + "learning_rate": 9.349143933142214e-06, + "loss": 1.1329, + "step": 2548 + }, + { + "epoch": 0.5095324954398941, + "grad_norm": 2.328125, + "learning_rate": 9.34862393973607e-06, + "loss": 1.0075, + "step": 2549 + }, + { + "epoch": 0.5097323904949901, + "grad_norm": 2.21875, + "learning_rate": 9.348103753164515e-06, + "loss": 1.0336, + "step": 2550 + }, + { + "epoch": 0.5099322855500862, + "grad_norm": 2.171875, + "learning_rate": 9.347583373450657e-06, + "loss": 1.1463, + "step": 2551 + }, + { + "epoch": 0.5101321806051823, + "grad_norm": 2.078125, + "learning_rate": 9.347062800617609e-06, + "loss": 1.0115, + "step": 2552 + }, + { + "epoch": 0.5103320756602784, + "grad_norm": 2.09375, + "learning_rate": 9.346542034688495e-06, + "loss": 1.1991, + "step": 2553 + }, + { + "epoch": 0.5105319707153745, + "grad_norm": 2.046875, + "learning_rate": 9.346021075686448e-06, + "loss": 0.978, + "step": 2554 + }, + { + "epoch": 0.5107318657704705, + "grad_norm": 2.1875, + "learning_rate": 9.345499923634612e-06, + "loss": 1.0984, + "step": 2555 + }, + { + "epoch": 0.5109317608255666, + "grad_norm": 1.9921875, + "learning_rate": 9.34497857855613e-06, + "loss": 1.0057, + "step": 2556 + }, + { + "epoch": 0.5111316558806627, + "grad_norm": 2.15625, + "learning_rate": 9.344457040474164e-06, + "loss": 1.1259, + "step": 2557 + }, + { + "epoch": 0.5113315509357588, + "grad_norm": 1.9609375, + "learning_rate": 9.343935309411882e-06, + "loss": 0.9573, + "step": 2558 + }, + { + "epoch": 0.5115314459908548, + "grad_norm": 2.015625, + "learning_rate": 9.343413385392457e-06, + "loss": 1.0034, + "step": 2559 + }, + { + "epoch": 0.5117313410459509, + "grad_norm": 2.15625, + "learning_rate": 9.342891268439071e-06, + "loss": 1.0915, + "step": 2560 + }, + { + "epoch": 0.511931236101047, + "grad_norm": 2.015625, + "learning_rate": 9.342368958574921e-06, + "loss": 1.0353, + "step": 2561 + }, + { + "epoch": 0.5121311311561431, + "grad_norm": 2.125, + "learning_rate": 9.341846455823206e-06, + "loss": 1.1109, + "step": 2562 + }, + { + "epoch": 0.512331026211239, + "grad_norm": 2.171875, + "learning_rate": 9.341323760207135e-06, + "loss": 1.1072, + "step": 2563 + }, + { + "epoch": 0.5125309212663351, + "grad_norm": 2.109375, + "learning_rate": 9.340800871749925e-06, + "loss": 1.0226, + "step": 2564 + }, + { + "epoch": 0.5127308163214312, + "grad_norm": 2.046875, + "learning_rate": 9.340277790474804e-06, + "loss": 1.0575, + "step": 2565 + }, + { + "epoch": 0.5129307113765273, + "grad_norm": 2.171875, + "learning_rate": 9.33975451640501e-06, + "loss": 1.0841, + "step": 2566 + }, + { + "epoch": 0.5131306064316234, + "grad_norm": 2.125, + "learning_rate": 9.339231049563779e-06, + "loss": 1.141, + "step": 2567 + }, + { + "epoch": 0.5133305014867194, + "grad_norm": 2.25, + "learning_rate": 9.33870738997437e-06, + "loss": 1.1355, + "step": 2568 + }, + { + "epoch": 0.5135303965418155, + "grad_norm": 2.171875, + "learning_rate": 9.338183537660043e-06, + "loss": 1.0538, + "step": 2569 + }, + { + "epoch": 0.5137302915969116, + "grad_norm": 2.234375, + "learning_rate": 9.337659492644067e-06, + "loss": 1.1484, + "step": 2570 + }, + { + "epoch": 0.5139301866520077, + "grad_norm": 2.03125, + "learning_rate": 9.337135254949719e-06, + "loss": 1.0157, + "step": 2571 + }, + { + "epoch": 0.5141300817071037, + "grad_norm": 2.203125, + "learning_rate": 9.336610824600288e-06, + "loss": 1.0814, + "step": 2572 + }, + { + "epoch": 0.5143299767621998, + "grad_norm": 2.15625, + "learning_rate": 9.336086201619065e-06, + "loss": 0.9951, + "step": 2573 + }, + { + "epoch": 0.5145298718172959, + "grad_norm": 2.109375, + "learning_rate": 9.335561386029356e-06, + "loss": 1.0877, + "step": 2574 + }, + { + "epoch": 0.514729766872392, + "grad_norm": 2.15625, + "learning_rate": 9.335036377854474e-06, + "loss": 1.0248, + "step": 2575 + }, + { + "epoch": 0.514929661927488, + "grad_norm": 2.046875, + "learning_rate": 9.334511177117739e-06, + "loss": 1.0022, + "step": 2576 + }, + { + "epoch": 0.5151295569825841, + "grad_norm": 2.0625, + "learning_rate": 9.333985783842482e-06, + "loss": 1.0916, + "step": 2577 + }, + { + "epoch": 0.5153294520376802, + "grad_norm": 1.9296875, + "learning_rate": 9.333460198052036e-06, + "loss": 1.0654, + "step": 2578 + }, + { + "epoch": 0.5155293470927763, + "grad_norm": 2.171875, + "learning_rate": 9.332934419769752e-06, + "loss": 1.1317, + "step": 2579 + }, + { + "epoch": 0.5157292421478724, + "grad_norm": 2.0625, + "learning_rate": 9.332408449018987e-06, + "loss": 1.0809, + "step": 2580 + }, + { + "epoch": 0.5159291372029684, + "grad_norm": 2.03125, + "learning_rate": 9.331882285823098e-06, + "loss": 1.1266, + "step": 2581 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 1.921875, + "learning_rate": 9.33135593020546e-06, + "loss": 1.0306, + "step": 2582 + }, + { + "epoch": 0.5163289273131606, + "grad_norm": 2.140625, + "learning_rate": 9.330829382189456e-06, + "loss": 1.1622, + "step": 2583 + }, + { + "epoch": 0.5165288223682567, + "grad_norm": 2.0625, + "learning_rate": 9.330302641798473e-06, + "loss": 1.014, + "step": 2584 + }, + { + "epoch": 0.5167287174233527, + "grad_norm": 2.015625, + "learning_rate": 9.32977570905591e-06, + "loss": 1.0604, + "step": 2585 + }, + { + "epoch": 0.5169286124784488, + "grad_norm": 2.015625, + "learning_rate": 9.329248583985171e-06, + "loss": 1.0563, + "step": 2586 + }, + { + "epoch": 0.5171285075335449, + "grad_norm": 2.109375, + "learning_rate": 9.328721266609673e-06, + "loss": 1.1403, + "step": 2587 + }, + { + "epoch": 0.517328402588641, + "grad_norm": 1.9921875, + "learning_rate": 9.328193756952837e-06, + "loss": 1.0036, + "step": 2588 + }, + { + "epoch": 0.5175282976437371, + "grad_norm": 2.046875, + "learning_rate": 9.3276660550381e-06, + "loss": 0.8862, + "step": 2589 + }, + { + "epoch": 0.5177281926988331, + "grad_norm": 2.140625, + "learning_rate": 9.327138160888897e-06, + "loss": 1.2041, + "step": 2590 + }, + { + "epoch": 0.5179280877539292, + "grad_norm": 2.078125, + "learning_rate": 9.32661007452868e-06, + "loss": 1.0425, + "step": 2591 + }, + { + "epoch": 0.5181279828090253, + "grad_norm": 2.109375, + "learning_rate": 9.326081795980903e-06, + "loss": 1.0795, + "step": 2592 + }, + { + "epoch": 0.5183278778641214, + "grad_norm": 2.421875, + "learning_rate": 9.325553325269036e-06, + "loss": 1.0518, + "step": 2593 + }, + { + "epoch": 0.5185277729192174, + "grad_norm": 2.140625, + "learning_rate": 9.325024662416553e-06, + "loss": 0.9939, + "step": 2594 + }, + { + "epoch": 0.5187276679743135, + "grad_norm": 2.171875, + "learning_rate": 9.324495807446935e-06, + "loss": 0.9843, + "step": 2595 + }, + { + "epoch": 0.5189275630294096, + "grad_norm": 2.1875, + "learning_rate": 9.323966760383679e-06, + "loss": 1.183, + "step": 2596 + }, + { + "epoch": 0.5191274580845057, + "grad_norm": 2.15625, + "learning_rate": 9.323437521250278e-06, + "loss": 1.0339, + "step": 2597 + }, + { + "epoch": 0.5193273531396017, + "grad_norm": 2.375, + "learning_rate": 9.322908090070243e-06, + "loss": 1.0489, + "step": 2598 + }, + { + "epoch": 0.5195272481946978, + "grad_norm": 2.09375, + "learning_rate": 9.322378466867095e-06, + "loss": 1.2156, + "step": 2599 + }, + { + "epoch": 0.5197271432497939, + "grad_norm": 2.046875, + "learning_rate": 9.321848651664357e-06, + "loss": 1.0942, + "step": 2600 + }, + { + "epoch": 0.51992703830489, + "grad_norm": 2.140625, + "learning_rate": 9.321318644485561e-06, + "loss": 0.9874, + "step": 2601 + }, + { + "epoch": 0.520126933359986, + "grad_norm": 2.015625, + "learning_rate": 9.320788445354255e-06, + "loss": 0.9747, + "step": 2602 + }, + { + "epoch": 0.520326828415082, + "grad_norm": 2.078125, + "learning_rate": 9.320258054293987e-06, + "loss": 1.0756, + "step": 2603 + }, + { + "epoch": 0.5205267234701781, + "grad_norm": 2.234375, + "learning_rate": 9.319727471328318e-06, + "loss": 1.1056, + "step": 2604 + }, + { + "epoch": 0.5207266185252742, + "grad_norm": 2.125, + "learning_rate": 9.319196696480814e-06, + "loss": 1.0712, + "step": 2605 + }, + { + "epoch": 0.5209265135803703, + "grad_norm": 2.15625, + "learning_rate": 9.318665729775056e-06, + "loss": 1.1167, + "step": 2606 + }, + { + "epoch": 0.5211264086354663, + "grad_norm": 2.09375, + "learning_rate": 9.318134571234626e-06, + "loss": 1.0241, + "step": 2607 + }, + { + "epoch": 0.5213263036905624, + "grad_norm": 2.046875, + "learning_rate": 9.317603220883121e-06, + "loss": 1.0953, + "step": 2608 + }, + { + "epoch": 0.5215261987456585, + "grad_norm": 1.9609375, + "learning_rate": 9.317071678744143e-06, + "loss": 0.9876, + "step": 2609 + }, + { + "epoch": 0.5217260938007546, + "grad_norm": 2.03125, + "learning_rate": 9.3165399448413e-06, + "loss": 1.016, + "step": 2610 + }, + { + "epoch": 0.5219259888558507, + "grad_norm": 2.078125, + "learning_rate": 9.316008019198216e-06, + "loss": 0.9751, + "step": 2611 + }, + { + "epoch": 0.5221258839109467, + "grad_norm": 2.171875, + "learning_rate": 9.315475901838514e-06, + "loss": 1.0808, + "step": 2612 + }, + { + "epoch": 0.5223257789660428, + "grad_norm": 2.109375, + "learning_rate": 9.314943592785834e-06, + "loss": 1.0874, + "step": 2613 + }, + { + "epoch": 0.5225256740211389, + "grad_norm": 2.171875, + "learning_rate": 9.314411092063822e-06, + "loss": 1.1452, + "step": 2614 + }, + { + "epoch": 0.522725569076235, + "grad_norm": 2.046875, + "learning_rate": 9.313878399696127e-06, + "loss": 1.0408, + "step": 2615 + }, + { + "epoch": 0.522925464131331, + "grad_norm": 2.078125, + "learning_rate": 9.313345515706417e-06, + "loss": 0.955, + "step": 2616 + }, + { + "epoch": 0.5231253591864271, + "grad_norm": 2.015625, + "learning_rate": 9.31281244011836e-06, + "loss": 0.9659, + "step": 2617 + }, + { + "epoch": 0.5233252542415232, + "grad_norm": 2.234375, + "learning_rate": 9.312279172955634e-06, + "loss": 1.0329, + "step": 2618 + }, + { + "epoch": 0.5235251492966193, + "grad_norm": 2.046875, + "learning_rate": 9.311745714241926e-06, + "loss": 1.0165, + "step": 2619 + }, + { + "epoch": 0.5237250443517153, + "grad_norm": 2.203125, + "learning_rate": 9.311212064000936e-06, + "loss": 1.134, + "step": 2620 + }, + { + "epoch": 0.5239249394068114, + "grad_norm": 2.171875, + "learning_rate": 9.310678222256367e-06, + "loss": 1.1248, + "step": 2621 + }, + { + "epoch": 0.5241248344619075, + "grad_norm": 2.203125, + "learning_rate": 9.31014418903193e-06, + "loss": 1.0595, + "step": 2622 + }, + { + "epoch": 0.5243247295170036, + "grad_norm": 2.234375, + "learning_rate": 9.30960996435135e-06, + "loss": 1.0898, + "step": 2623 + }, + { + "epoch": 0.5245246245720997, + "grad_norm": 2.21875, + "learning_rate": 9.309075548238355e-06, + "loss": 1.1028, + "step": 2624 + }, + { + "epoch": 0.5247245196271957, + "grad_norm": 2.078125, + "learning_rate": 9.308540940716685e-06, + "loss": 1.0883, + "step": 2625 + }, + { + "epoch": 0.5249244146822918, + "grad_norm": 2.015625, + "learning_rate": 9.308006141810086e-06, + "loss": 0.973, + "step": 2626 + }, + { + "epoch": 0.5251243097373879, + "grad_norm": 2.03125, + "learning_rate": 9.307471151542315e-06, + "loss": 1.081, + "step": 2627 + }, + { + "epoch": 0.525324204792484, + "grad_norm": 2.078125, + "learning_rate": 9.306935969937135e-06, + "loss": 1.0576, + "step": 2628 + }, + { + "epoch": 0.52552409984758, + "grad_norm": 2.125, + "learning_rate": 9.30640059701832e-06, + "loss": 1.1406, + "step": 2629 + }, + { + "epoch": 0.5257239949026761, + "grad_norm": 2.21875, + "learning_rate": 9.30586503280965e-06, + "loss": 1.162, + "step": 2630 + }, + { + "epoch": 0.5259238899577722, + "grad_norm": 2.09375, + "learning_rate": 9.305329277334914e-06, + "loss": 1.106, + "step": 2631 + }, + { + "epoch": 0.5261237850128683, + "grad_norm": 2.109375, + "learning_rate": 9.304793330617912e-06, + "loss": 1.0658, + "step": 2632 + }, + { + "epoch": 0.5263236800679644, + "grad_norm": 2.0625, + "learning_rate": 9.304257192682449e-06, + "loss": 1.0368, + "step": 2633 + }, + { + "epoch": 0.5265235751230604, + "grad_norm": 1.984375, + "learning_rate": 9.303720863552343e-06, + "loss": 1.0142, + "step": 2634 + }, + { + "epoch": 0.5267234701781565, + "grad_norm": 2.25, + "learning_rate": 9.303184343251415e-06, + "loss": 1.0655, + "step": 2635 + }, + { + "epoch": 0.5269233652332526, + "grad_norm": 2.125, + "learning_rate": 9.302647631803498e-06, + "loss": 1.0741, + "step": 2636 + }, + { + "epoch": 0.5271232602883487, + "grad_norm": 2.109375, + "learning_rate": 9.302110729232432e-06, + "loss": 1.0317, + "step": 2637 + }, + { + "epoch": 0.5273231553434446, + "grad_norm": 2.140625, + "learning_rate": 9.301573635562068e-06, + "loss": 1.0812, + "step": 2638 + }, + { + "epoch": 0.5275230503985407, + "grad_norm": 1.9921875, + "learning_rate": 9.301036350816264e-06, + "loss": 1.0591, + "step": 2639 + }, + { + "epoch": 0.5277229454536368, + "grad_norm": 2.171875, + "learning_rate": 9.300498875018882e-06, + "loss": 1.0668, + "step": 2640 + }, + { + "epoch": 0.527922840508733, + "grad_norm": 2.171875, + "learning_rate": 9.299961208193801e-06, + "loss": 1.1023, + "step": 2641 + }, + { + "epoch": 0.5281227355638289, + "grad_norm": 2.09375, + "learning_rate": 9.299423350364903e-06, + "loss": 1.0111, + "step": 2642 + }, + { + "epoch": 0.528322630618925, + "grad_norm": 2.078125, + "learning_rate": 9.298885301556075e-06, + "loss": 1.1069, + "step": 2643 + }, + { + "epoch": 0.5285225256740211, + "grad_norm": 1.96875, + "learning_rate": 9.298347061791224e-06, + "loss": 0.9699, + "step": 2644 + }, + { + "epoch": 0.5287224207291172, + "grad_norm": 2.109375, + "learning_rate": 9.297808631094257e-06, + "loss": 1.0813, + "step": 2645 + }, + { + "epoch": 0.5289223157842133, + "grad_norm": 2.0625, + "learning_rate": 9.297270009489088e-06, + "loss": 0.9616, + "step": 2646 + }, + { + "epoch": 0.5291222108393093, + "grad_norm": 2.0625, + "learning_rate": 9.296731196999643e-06, + "loss": 1.0316, + "step": 2647 + }, + { + "epoch": 0.5293221058944054, + "grad_norm": 1.9609375, + "learning_rate": 9.296192193649857e-06, + "loss": 0.9889, + "step": 2648 + }, + { + "epoch": 0.5295220009495015, + "grad_norm": 2.09375, + "learning_rate": 9.295652999463675e-06, + "loss": 0.9873, + "step": 2649 + }, + { + "epoch": 0.5297218960045976, + "grad_norm": 2.125, + "learning_rate": 9.295113614465045e-06, + "loss": 1.1143, + "step": 2650 + }, + { + "epoch": 0.5299217910596936, + "grad_norm": 2.0625, + "learning_rate": 9.294574038677926e-06, + "loss": 1.0498, + "step": 2651 + }, + { + "epoch": 0.5301216861147897, + "grad_norm": 2.140625, + "learning_rate": 9.294034272126286e-06, + "loss": 1.0765, + "step": 2652 + }, + { + "epoch": 0.5303215811698858, + "grad_norm": 2.140625, + "learning_rate": 9.293494314834105e-06, + "loss": 1.0264, + "step": 2653 + }, + { + "epoch": 0.5305214762249819, + "grad_norm": 2.046875, + "learning_rate": 9.292954166825363e-06, + "loss": 1.0044, + "step": 2654 + }, + { + "epoch": 0.530721371280078, + "grad_norm": 2.03125, + "learning_rate": 9.292413828124056e-06, + "loss": 1.0188, + "step": 2655 + }, + { + "epoch": 0.530921266335174, + "grad_norm": 1.984375, + "learning_rate": 9.291873298754187e-06, + "loss": 1.0454, + "step": 2656 + }, + { + "epoch": 0.5311211613902701, + "grad_norm": 2.25, + "learning_rate": 9.291332578739762e-06, + "loss": 1.067, + "step": 2657 + }, + { + "epoch": 0.5313210564453662, + "grad_norm": 2.140625, + "learning_rate": 9.290791668104802e-06, + "loss": 1.0037, + "step": 2658 + }, + { + "epoch": 0.5315209515004623, + "grad_norm": 2.125, + "learning_rate": 9.290250566873335e-06, + "loss": 1.0557, + "step": 2659 + }, + { + "epoch": 0.5317208465555583, + "grad_norm": 2.0625, + "learning_rate": 9.289709275069396e-06, + "loss": 1.0282, + "step": 2660 + }, + { + "epoch": 0.5319207416106544, + "grad_norm": 2.109375, + "learning_rate": 9.28916779271703e-06, + "loss": 1.0812, + "step": 2661 + }, + { + "epoch": 0.5321206366657505, + "grad_norm": 2.0625, + "learning_rate": 9.288626119840287e-06, + "loss": 1.0249, + "step": 2662 + }, + { + "epoch": 0.5323205317208466, + "grad_norm": 2.125, + "learning_rate": 9.288084256463233e-06, + "loss": 1.0339, + "step": 2663 + }, + { + "epoch": 0.5325204267759426, + "grad_norm": 1.9765625, + "learning_rate": 9.287542202609932e-06, + "loss": 1.0646, + "step": 2664 + }, + { + "epoch": 0.5327203218310387, + "grad_norm": 2.234375, + "learning_rate": 9.286999958304464e-06, + "loss": 1.0251, + "step": 2665 + }, + { + "epoch": 0.5329202168861348, + "grad_norm": 2.09375, + "learning_rate": 9.286457523570915e-06, + "loss": 1.1246, + "step": 2666 + }, + { + "epoch": 0.5331201119412309, + "grad_norm": 2.109375, + "learning_rate": 9.285914898433384e-06, + "loss": 1.0116, + "step": 2667 + }, + { + "epoch": 0.533320006996327, + "grad_norm": 2.078125, + "learning_rate": 9.285372082915968e-06, + "loss": 1.0878, + "step": 2668 + }, + { + "epoch": 0.533519902051423, + "grad_norm": 2.234375, + "learning_rate": 9.284829077042784e-06, + "loss": 1.0951, + "step": 2669 + }, + { + "epoch": 0.5337197971065191, + "grad_norm": 2.09375, + "learning_rate": 9.284285880837947e-06, + "loss": 1.1074, + "step": 2670 + }, + { + "epoch": 0.5339196921616152, + "grad_norm": 2.15625, + "learning_rate": 9.283742494325591e-06, + "loss": 1.0042, + "step": 2671 + }, + { + "epoch": 0.5341195872167113, + "grad_norm": 2.140625, + "learning_rate": 9.28319891752985e-06, + "loss": 1.064, + "step": 2672 + }, + { + "epoch": 0.5343194822718073, + "grad_norm": 2.1875, + "learning_rate": 9.282655150474871e-06, + "loss": 0.9645, + "step": 2673 + }, + { + "epoch": 0.5345193773269034, + "grad_norm": 2.140625, + "learning_rate": 9.282111193184806e-06, + "loss": 1.0087, + "step": 2674 + }, + { + "epoch": 0.5347192723819995, + "grad_norm": 2.140625, + "learning_rate": 9.281567045683822e-06, + "loss": 1.1043, + "step": 2675 + }, + { + "epoch": 0.5349191674370956, + "grad_norm": 2.15625, + "learning_rate": 9.281022707996085e-06, + "loss": 1.0959, + "step": 2676 + }, + { + "epoch": 0.5351190624921915, + "grad_norm": 2.1875, + "learning_rate": 9.280478180145778e-06, + "loss": 1.1068, + "step": 2677 + }, + { + "epoch": 0.5353189575472876, + "grad_norm": 2.21875, + "learning_rate": 9.279933462157088e-06, + "loss": 1.129, + "step": 2678 + }, + { + "epoch": 0.5355188526023837, + "grad_norm": 2.34375, + "learning_rate": 9.279388554054207e-06, + "loss": 0.9898, + "step": 2679 + }, + { + "epoch": 0.5357187476574798, + "grad_norm": 2.15625, + "learning_rate": 9.278843455861346e-06, + "loss": 1.0103, + "step": 2680 + }, + { + "epoch": 0.5359186427125759, + "grad_norm": 2.5, + "learning_rate": 9.278298167602716e-06, + "loss": 1.1809, + "step": 2681 + }, + { + "epoch": 0.5361185377676719, + "grad_norm": 2.078125, + "learning_rate": 9.277752689302537e-06, + "loss": 0.9956, + "step": 2682 + }, + { + "epoch": 0.536318432822768, + "grad_norm": 2.21875, + "learning_rate": 9.277207020985042e-06, + "loss": 1.0903, + "step": 2683 + }, + { + "epoch": 0.5365183278778641, + "grad_norm": 1.9921875, + "learning_rate": 9.276661162674467e-06, + "loss": 0.9674, + "step": 2684 + }, + { + "epoch": 0.5367182229329602, + "grad_norm": 2.09375, + "learning_rate": 9.27611511439506e-06, + "loss": 1.0355, + "step": 2685 + }, + { + "epoch": 0.5369181179880562, + "grad_norm": 2.0625, + "learning_rate": 9.275568876171078e-06, + "loss": 0.9648, + "step": 2686 + }, + { + "epoch": 0.5371180130431523, + "grad_norm": 2.15625, + "learning_rate": 9.275022448026782e-06, + "loss": 0.9565, + "step": 2687 + }, + { + "epoch": 0.5373179080982484, + "grad_norm": 2.140625, + "learning_rate": 9.274475829986444e-06, + "loss": 1.0516, + "step": 2688 + }, + { + "epoch": 0.5375178031533445, + "grad_norm": 2.09375, + "learning_rate": 9.273929022074348e-06, + "loss": 1.0077, + "step": 2689 + }, + { + "epoch": 0.5377176982084406, + "grad_norm": 2.0625, + "learning_rate": 9.273382024314781e-06, + "loss": 1.0389, + "step": 2690 + }, + { + "epoch": 0.5379175932635366, + "grad_norm": 2.140625, + "learning_rate": 9.272834836732039e-06, + "loss": 0.9891, + "step": 2691 + }, + { + "epoch": 0.5381174883186327, + "grad_norm": 2.0625, + "learning_rate": 9.272287459350432e-06, + "loss": 1.0165, + "step": 2692 + }, + { + "epoch": 0.5383173833737288, + "grad_norm": 2.03125, + "learning_rate": 9.271739892194272e-06, + "loss": 0.9821, + "step": 2693 + }, + { + "epoch": 0.5385172784288249, + "grad_norm": 2.0625, + "learning_rate": 9.271192135287882e-06, + "loss": 1.102, + "step": 2694 + }, + { + "epoch": 0.5387171734839209, + "grad_norm": 1.984375, + "learning_rate": 9.270644188655594e-06, + "loss": 0.9087, + "step": 2695 + }, + { + "epoch": 0.538917068539017, + "grad_norm": 2.078125, + "learning_rate": 9.270096052321747e-06, + "loss": 1.1452, + "step": 2696 + }, + { + "epoch": 0.5391169635941131, + "grad_norm": 2.140625, + "learning_rate": 9.269547726310688e-06, + "loss": 1.0612, + "step": 2697 + }, + { + "epoch": 0.5393168586492092, + "grad_norm": 2.140625, + "learning_rate": 9.268999210646777e-06, + "loss": 0.9784, + "step": 2698 + }, + { + "epoch": 0.5395167537043052, + "grad_norm": 2.09375, + "learning_rate": 9.268450505354375e-06, + "loss": 0.9654, + "step": 2699 + }, + { + "epoch": 0.5397166487594013, + "grad_norm": 2.171875, + "learning_rate": 9.267901610457859e-06, + "loss": 1.121, + "step": 2700 + }, + { + "epoch": 0.5399165438144974, + "grad_norm": 2.109375, + "learning_rate": 9.26735252598161e-06, + "loss": 1.073, + "step": 2701 + }, + { + "epoch": 0.5401164388695935, + "grad_norm": 2.265625, + "learning_rate": 9.266803251950017e-06, + "loss": 1.082, + "step": 2702 + }, + { + "epoch": 0.5403163339246896, + "grad_norm": 2.09375, + "learning_rate": 9.266253788387479e-06, + "loss": 1.0063, + "step": 2703 + }, + { + "epoch": 0.5405162289797856, + "grad_norm": 2.21875, + "learning_rate": 9.2657041353184e-06, + "loss": 1.1217, + "step": 2704 + }, + { + "epoch": 0.5407161240348817, + "grad_norm": 2.1875, + "learning_rate": 9.265154292767204e-06, + "loss": 0.9752, + "step": 2705 + }, + { + "epoch": 0.5409160190899778, + "grad_norm": 2.015625, + "learning_rate": 9.264604260758307e-06, + "loss": 1.02, + "step": 2706 + }, + { + "epoch": 0.5411159141450739, + "grad_norm": 1.9609375, + "learning_rate": 9.264054039316146e-06, + "loss": 1.0283, + "step": 2707 + }, + { + "epoch": 0.5413158092001699, + "grad_norm": 2.1875, + "learning_rate": 9.263503628465159e-06, + "loss": 1.1819, + "step": 2708 + }, + { + "epoch": 0.541515704255266, + "grad_norm": 2.015625, + "learning_rate": 9.262953028229794e-06, + "loss": 1.0463, + "step": 2709 + }, + { + "epoch": 0.5417155993103621, + "grad_norm": 2.03125, + "learning_rate": 9.262402238634514e-06, + "loss": 1.2011, + "step": 2710 + }, + { + "epoch": 0.5419154943654582, + "grad_norm": 2.0, + "learning_rate": 9.261851259703782e-06, + "loss": 0.9946, + "step": 2711 + }, + { + "epoch": 0.5421153894205543, + "grad_norm": 2.03125, + "learning_rate": 9.261300091462071e-06, + "loss": 1.0925, + "step": 2712 + }, + { + "epoch": 0.5423152844756502, + "grad_norm": 2.0625, + "learning_rate": 9.260748733933865e-06, + "loss": 1.0614, + "step": 2713 + }, + { + "epoch": 0.5425151795307463, + "grad_norm": 2.09375, + "learning_rate": 9.260197187143656e-06, + "loss": 1.0966, + "step": 2714 + }, + { + "epoch": 0.5427150745858424, + "grad_norm": 2.125, + "learning_rate": 9.259645451115941e-06, + "loss": 1.1616, + "step": 2715 + }, + { + "epoch": 0.5429149696409385, + "grad_norm": 2.109375, + "learning_rate": 9.259093525875232e-06, + "loss": 1.1168, + "step": 2716 + }, + { + "epoch": 0.5431148646960345, + "grad_norm": 1.9375, + "learning_rate": 9.258541411446042e-06, + "loss": 1.0294, + "step": 2717 + }, + { + "epoch": 0.5433147597511306, + "grad_norm": 1.96875, + "learning_rate": 9.257989107852898e-06, + "loss": 1.0106, + "step": 2718 + }, + { + "epoch": 0.5435146548062267, + "grad_norm": 2.078125, + "learning_rate": 9.257436615120335e-06, + "loss": 0.9188, + "step": 2719 + }, + { + "epoch": 0.5437145498613228, + "grad_norm": 2.140625, + "learning_rate": 9.256883933272888e-06, + "loss": 1.0516, + "step": 2720 + }, + { + "epoch": 0.5439144449164188, + "grad_norm": 2.0625, + "learning_rate": 9.256331062335114e-06, + "loss": 1.1339, + "step": 2721 + }, + { + "epoch": 0.5441143399715149, + "grad_norm": 2.0625, + "learning_rate": 9.255778002331569e-06, + "loss": 1.0622, + "step": 2722 + }, + { + "epoch": 0.544314235026611, + "grad_norm": 2.21875, + "learning_rate": 9.255224753286818e-06, + "loss": 1.0338, + "step": 2723 + }, + { + "epoch": 0.5445141300817071, + "grad_norm": 2.0, + "learning_rate": 9.25467131522544e-06, + "loss": 1.0663, + "step": 2724 + }, + { + "epoch": 0.5447140251368032, + "grad_norm": 2.046875, + "learning_rate": 9.254117688172014e-06, + "loss": 1.0436, + "step": 2725 + }, + { + "epoch": 0.5449139201918992, + "grad_norm": 2.125, + "learning_rate": 9.253563872151136e-06, + "loss": 1.0577, + "step": 2726 + }, + { + "epoch": 0.5451138152469953, + "grad_norm": 2.09375, + "learning_rate": 9.253009867187406e-06, + "loss": 1.0489, + "step": 2727 + }, + { + "epoch": 0.5453137103020914, + "grad_norm": 2.046875, + "learning_rate": 9.252455673305431e-06, + "loss": 0.9571, + "step": 2728 + }, + { + "epoch": 0.5455136053571875, + "grad_norm": 2.109375, + "learning_rate": 9.251901290529829e-06, + "loss": 1.1165, + "step": 2729 + }, + { + "epoch": 0.5457135004122835, + "grad_norm": 2.109375, + "learning_rate": 9.251346718885226e-06, + "loss": 1.0281, + "step": 2730 + }, + { + "epoch": 0.5459133954673796, + "grad_norm": 2.015625, + "learning_rate": 9.250791958396255e-06, + "loss": 1.0421, + "step": 2731 + }, + { + "epoch": 0.5461132905224757, + "grad_norm": 2.109375, + "learning_rate": 9.25023700908756e-06, + "loss": 1.0919, + "step": 2732 + }, + { + "epoch": 0.5463131855775718, + "grad_norm": 2.015625, + "learning_rate": 9.249681870983792e-06, + "loss": 1.0162, + "step": 2733 + }, + { + "epoch": 0.5465130806326679, + "grad_norm": 2.0625, + "learning_rate": 9.249126544109608e-06, + "loss": 1.0894, + "step": 2734 + }, + { + "epoch": 0.5467129756877639, + "grad_norm": 2.0625, + "learning_rate": 9.248571028489678e-06, + "loss": 1.039, + "step": 2735 + }, + { + "epoch": 0.54691287074286, + "grad_norm": 2.1875, + "learning_rate": 9.248015324148673e-06, + "loss": 1.1551, + "step": 2736 + }, + { + "epoch": 0.5471127657979561, + "grad_norm": 2.09375, + "learning_rate": 9.247459431111285e-06, + "loss": 1.0632, + "step": 2737 + }, + { + "epoch": 0.5473126608530522, + "grad_norm": 2.109375, + "learning_rate": 9.246903349402201e-06, + "loss": 1.1491, + "step": 2738 + }, + { + "epoch": 0.5475125559081482, + "grad_norm": 2.296875, + "learning_rate": 9.246347079046124e-06, + "loss": 1.0886, + "step": 2739 + }, + { + "epoch": 0.5477124509632443, + "grad_norm": 2.109375, + "learning_rate": 9.245790620067763e-06, + "loss": 1.0163, + "step": 2740 + }, + { + "epoch": 0.5479123460183404, + "grad_norm": 2.140625, + "learning_rate": 9.245233972491836e-06, + "loss": 1.1689, + "step": 2741 + }, + { + "epoch": 0.5481122410734365, + "grad_norm": 1.9609375, + "learning_rate": 9.24467713634307e-06, + "loss": 0.9898, + "step": 2742 + }, + { + "epoch": 0.5483121361285325, + "grad_norm": 2.046875, + "learning_rate": 9.2441201116462e-06, + "loss": 1.0118, + "step": 2743 + }, + { + "epoch": 0.5485120311836286, + "grad_norm": 2.03125, + "learning_rate": 9.243562898425967e-06, + "loss": 1.0321, + "step": 2744 + }, + { + "epoch": 0.5487119262387247, + "grad_norm": 2.21875, + "learning_rate": 9.243005496707123e-06, + "loss": 1.1106, + "step": 2745 + }, + { + "epoch": 0.5489118212938208, + "grad_norm": 2.203125, + "learning_rate": 9.242447906514428e-06, + "loss": 1.123, + "step": 2746 + }, + { + "epoch": 0.5491117163489169, + "grad_norm": 2.171875, + "learning_rate": 9.24189012787265e-06, + "loss": 1.0215, + "step": 2747 + }, + { + "epoch": 0.5493116114040129, + "grad_norm": 2.046875, + "learning_rate": 9.241332160806566e-06, + "loss": 1.0798, + "step": 2748 + }, + { + "epoch": 0.549511506459109, + "grad_norm": 2.0625, + "learning_rate": 9.24077400534096e-06, + "loss": 1.0293, + "step": 2749 + }, + { + "epoch": 0.5497114015142051, + "grad_norm": 2.09375, + "learning_rate": 9.240215661500625e-06, + "loss": 1.129, + "step": 2750 + }, + { + "epoch": 0.5499112965693012, + "grad_norm": 2.453125, + "learning_rate": 9.239657129310364e-06, + "loss": 1.1994, + "step": 2751 + }, + { + "epoch": 0.5501111916243971, + "grad_norm": 2.1875, + "learning_rate": 9.239098408794986e-06, + "loss": 1.0474, + "step": 2752 + }, + { + "epoch": 0.5503110866794932, + "grad_norm": 2.015625, + "learning_rate": 9.23853949997931e-06, + "loss": 1.0514, + "step": 2753 + }, + { + "epoch": 0.5505109817345893, + "grad_norm": 2.140625, + "learning_rate": 9.23798040288816e-06, + "loss": 1.0656, + "step": 2754 + }, + { + "epoch": 0.5507108767896854, + "grad_norm": 2.125, + "learning_rate": 9.237421117546375e-06, + "loss": 1.0498, + "step": 2755 + }, + { + "epoch": 0.5509107718447815, + "grad_norm": 2.015625, + "learning_rate": 9.236861643978797e-06, + "loss": 0.9487, + "step": 2756 + }, + { + "epoch": 0.5511106668998775, + "grad_norm": 2.15625, + "learning_rate": 9.236301982210276e-06, + "loss": 1.0643, + "step": 2757 + }, + { + "epoch": 0.5513105619549736, + "grad_norm": 2.078125, + "learning_rate": 9.235742132265676e-06, + "loss": 1.0824, + "step": 2758 + }, + { + "epoch": 0.5515104570100697, + "grad_norm": 2.078125, + "learning_rate": 9.235182094169859e-06, + "loss": 1.0396, + "step": 2759 + }, + { + "epoch": 0.5517103520651658, + "grad_norm": 2.15625, + "learning_rate": 9.234621867947709e-06, + "loss": 1.1565, + "step": 2760 + }, + { + "epoch": 0.5519102471202618, + "grad_norm": 2.0, + "learning_rate": 9.234061453624106e-06, + "loss": 0.9588, + "step": 2761 + }, + { + "epoch": 0.5521101421753579, + "grad_norm": 2.34375, + "learning_rate": 9.233500851223945e-06, + "loss": 1.0023, + "step": 2762 + }, + { + "epoch": 0.552310037230454, + "grad_norm": 2.125, + "learning_rate": 9.23294006077213e-06, + "loss": 1.1064, + "step": 2763 + }, + { + "epoch": 0.5525099322855501, + "grad_norm": 2.125, + "learning_rate": 9.232379082293568e-06, + "loss": 1.1066, + "step": 2764 + }, + { + "epoch": 0.5527098273406461, + "grad_norm": 2.078125, + "learning_rate": 9.23181791581318e-06, + "loss": 1.0331, + "step": 2765 + }, + { + "epoch": 0.5529097223957422, + "grad_norm": 2.125, + "learning_rate": 9.231256561355892e-06, + "loss": 0.9992, + "step": 2766 + }, + { + "epoch": 0.5531096174508383, + "grad_norm": 2.171875, + "learning_rate": 9.23069501894664e-06, + "loss": 1.0246, + "step": 2767 + }, + { + "epoch": 0.5533095125059344, + "grad_norm": 2.171875, + "learning_rate": 9.230133288610366e-06, + "loss": 0.978, + "step": 2768 + }, + { + "epoch": 0.5535094075610305, + "grad_norm": 2.078125, + "learning_rate": 9.229571370372023e-06, + "loss": 1.0375, + "step": 2769 + }, + { + "epoch": 0.5537093026161265, + "grad_norm": 2.09375, + "learning_rate": 9.229009264256574e-06, + "loss": 1.0868, + "step": 2770 + }, + { + "epoch": 0.5539091976712226, + "grad_norm": 2.140625, + "learning_rate": 9.228446970288983e-06, + "loss": 1.1305, + "step": 2771 + }, + { + "epoch": 0.5541090927263187, + "grad_norm": 2.078125, + "learning_rate": 9.22788448849423e-06, + "loss": 1.0305, + "step": 2772 + }, + { + "epoch": 0.5543089877814148, + "grad_norm": 2.0625, + "learning_rate": 9.2273218188973e-06, + "loss": 1.1207, + "step": 2773 + }, + { + "epoch": 0.5545088828365108, + "grad_norm": 2.125, + "learning_rate": 9.226758961523185e-06, + "loss": 1.0737, + "step": 2774 + }, + { + "epoch": 0.5547087778916069, + "grad_norm": 2.03125, + "learning_rate": 9.226195916396888e-06, + "loss": 1.1121, + "step": 2775 + }, + { + "epoch": 0.554908672946703, + "grad_norm": 2.140625, + "learning_rate": 9.225632683543421e-06, + "loss": 1.1163, + "step": 2776 + }, + { + "epoch": 0.5551085680017991, + "grad_norm": 2.140625, + "learning_rate": 9.2250692629878e-06, + "loss": 1.1029, + "step": 2777 + }, + { + "epoch": 0.5553084630568952, + "grad_norm": 2.0625, + "learning_rate": 9.224505654755055e-06, + "loss": 1.0431, + "step": 2778 + }, + { + "epoch": 0.5555083581119912, + "grad_norm": 2.046875, + "learning_rate": 9.22394185887022e-06, + "loss": 1.0452, + "step": 2779 + }, + { + "epoch": 0.5557082531670873, + "grad_norm": 2.078125, + "learning_rate": 9.22337787535834e-06, + "loss": 0.9932, + "step": 2780 + }, + { + "epoch": 0.5559081482221834, + "grad_norm": 2.15625, + "learning_rate": 9.222813704244464e-06, + "loss": 1.145, + "step": 2781 + }, + { + "epoch": 0.5561080432772795, + "grad_norm": 2.140625, + "learning_rate": 9.222249345553654e-06, + "loss": 1.0276, + "step": 2782 + }, + { + "epoch": 0.5563079383323755, + "grad_norm": 2.046875, + "learning_rate": 9.221684799310979e-06, + "loss": 0.9823, + "step": 2783 + }, + { + "epoch": 0.5565078333874716, + "grad_norm": 2.109375, + "learning_rate": 9.221120065541517e-06, + "loss": 1.0601, + "step": 2784 + }, + { + "epoch": 0.5567077284425677, + "grad_norm": 2.078125, + "learning_rate": 9.220555144270353e-06, + "loss": 1.0849, + "step": 2785 + }, + { + "epoch": 0.5569076234976638, + "grad_norm": 2.03125, + "learning_rate": 9.219990035522579e-06, + "loss": 0.9661, + "step": 2786 + }, + { + "epoch": 0.5571075185527598, + "grad_norm": 2.09375, + "learning_rate": 9.2194247393233e-06, + "loss": 1.128, + "step": 2787 + }, + { + "epoch": 0.5573074136078559, + "grad_norm": 1.9609375, + "learning_rate": 9.218859255697622e-06, + "loss": 1.0929, + "step": 2788 + }, + { + "epoch": 0.557507308662952, + "grad_norm": 2.25, + "learning_rate": 9.218293584670667e-06, + "loss": 0.9534, + "step": 2789 + }, + { + "epoch": 0.557707203718048, + "grad_norm": 2.078125, + "learning_rate": 9.217727726267564e-06, + "loss": 1.1144, + "step": 2790 + }, + { + "epoch": 0.5579070987731441, + "grad_norm": 2.125, + "learning_rate": 9.217161680513445e-06, + "loss": 1.0807, + "step": 2791 + }, + { + "epoch": 0.5581069938282401, + "grad_norm": 2.171875, + "learning_rate": 9.216595447433454e-06, + "loss": 1.1086, + "step": 2792 + }, + { + "epoch": 0.5583068888833362, + "grad_norm": 2.109375, + "learning_rate": 9.216029027052744e-06, + "loss": 1.0106, + "step": 2793 + }, + { + "epoch": 0.5585067839384323, + "grad_norm": 2.328125, + "learning_rate": 9.215462419396474e-06, + "loss": 1.0926, + "step": 2794 + }, + { + "epoch": 0.5587066789935284, + "grad_norm": 2.078125, + "learning_rate": 9.214895624489813e-06, + "loss": 1.0694, + "step": 2795 + }, + { + "epoch": 0.5589065740486244, + "grad_norm": 2.0625, + "learning_rate": 9.21432864235794e-06, + "loss": 1.0344, + "step": 2796 + }, + { + "epoch": 0.5591064691037205, + "grad_norm": 2.125, + "learning_rate": 9.213761473026039e-06, + "loss": 1.1513, + "step": 2797 + }, + { + "epoch": 0.5593063641588166, + "grad_norm": 2.015625, + "learning_rate": 9.213194116519301e-06, + "loss": 1.0874, + "step": 2798 + }, + { + "epoch": 0.5595062592139127, + "grad_norm": 2.21875, + "learning_rate": 9.212626572862933e-06, + "loss": 1.0074, + "step": 2799 + }, + { + "epoch": 0.5597061542690087, + "grad_norm": 2.15625, + "learning_rate": 9.212058842082141e-06, + "loss": 1.0354, + "step": 2800 + }, + { + "epoch": 0.5599060493241048, + "grad_norm": 2.09375, + "learning_rate": 9.211490924202147e-06, + "loss": 0.9803, + "step": 2801 + }, + { + "epoch": 0.5601059443792009, + "grad_norm": 2.015625, + "learning_rate": 9.210922819248173e-06, + "loss": 1.0824, + "step": 2802 + }, + { + "epoch": 0.560305839434297, + "grad_norm": 1.9921875, + "learning_rate": 9.21035452724546e-06, + "loss": 1.1036, + "step": 2803 + }, + { + "epoch": 0.5605057344893931, + "grad_norm": 2.140625, + "learning_rate": 9.209786048219247e-06, + "loss": 1.1111, + "step": 2804 + }, + { + "epoch": 0.5607056295444891, + "grad_norm": 2.09375, + "learning_rate": 9.209217382194788e-06, + "loss": 1.1009, + "step": 2805 + }, + { + "epoch": 0.5609055245995852, + "grad_norm": 2.71875, + "learning_rate": 9.208648529197342e-06, + "loss": 1.2159, + "step": 2806 + }, + { + "epoch": 0.5611054196546813, + "grad_norm": 2.15625, + "learning_rate": 9.208079489252178e-06, + "loss": 1.0909, + "step": 2807 + }, + { + "epoch": 0.5613053147097774, + "grad_norm": 2.125, + "learning_rate": 9.207510262384573e-06, + "loss": 1.0151, + "step": 2808 + }, + { + "epoch": 0.5615052097648734, + "grad_norm": 1.9921875, + "learning_rate": 9.20694084861981e-06, + "loss": 1.0352, + "step": 2809 + }, + { + "epoch": 0.5617051048199695, + "grad_norm": 2.09375, + "learning_rate": 9.206371247983186e-06, + "loss": 1.0656, + "step": 2810 + }, + { + "epoch": 0.5619049998750656, + "grad_norm": 2.203125, + "learning_rate": 9.205801460499999e-06, + "loss": 1.1071, + "step": 2811 + }, + { + "epoch": 0.5621048949301617, + "grad_norm": 2.03125, + "learning_rate": 9.205231486195561e-06, + "loss": 1.0241, + "step": 2812 + }, + { + "epoch": 0.5623047899852578, + "grad_norm": 2.140625, + "learning_rate": 9.204661325095189e-06, + "loss": 1.1143, + "step": 2813 + }, + { + "epoch": 0.5625046850403538, + "grad_norm": 2.078125, + "learning_rate": 9.20409097722421e-06, + "loss": 1.1146, + "step": 2814 + }, + { + "epoch": 0.5627045800954499, + "grad_norm": 2.0, + "learning_rate": 9.203520442607958e-06, + "loss": 1.01, + "step": 2815 + }, + { + "epoch": 0.562904475150546, + "grad_norm": 1.9609375, + "learning_rate": 9.202949721271779e-06, + "loss": 0.963, + "step": 2816 + }, + { + "epoch": 0.5631043702056421, + "grad_norm": 2.046875, + "learning_rate": 9.20237881324102e-06, + "loss": 1.0704, + "step": 2817 + }, + { + "epoch": 0.5633042652607381, + "grad_norm": 2.140625, + "learning_rate": 9.201807718541046e-06, + "loss": 1.0934, + "step": 2818 + }, + { + "epoch": 0.5635041603158342, + "grad_norm": 2.109375, + "learning_rate": 9.201236437197219e-06, + "loss": 1.1211, + "step": 2819 + }, + { + "epoch": 0.5637040553709303, + "grad_norm": 2.09375, + "learning_rate": 9.200664969234919e-06, + "loss": 1.0596, + "step": 2820 + }, + { + "epoch": 0.5639039504260264, + "grad_norm": 2.109375, + "learning_rate": 9.20009331467953e-06, + "loss": 1.0909, + "step": 2821 + }, + { + "epoch": 0.5641038454811224, + "grad_norm": 2.171875, + "learning_rate": 9.199521473556444e-06, + "loss": 1.0574, + "step": 2822 + }, + { + "epoch": 0.5643037405362185, + "grad_norm": 2.015625, + "learning_rate": 9.198949445891064e-06, + "loss": 1.0427, + "step": 2823 + }, + { + "epoch": 0.5645036355913146, + "grad_norm": 1.984375, + "learning_rate": 9.198377231708795e-06, + "loss": 1.0195, + "step": 2824 + }, + { + "epoch": 0.5647035306464107, + "grad_norm": 2.25, + "learning_rate": 9.197804831035061e-06, + "loss": 1.1618, + "step": 2825 + }, + { + "epoch": 0.5649034257015068, + "grad_norm": 2.15625, + "learning_rate": 9.197232243895285e-06, + "loss": 1.0371, + "step": 2826 + }, + { + "epoch": 0.5651033207566027, + "grad_norm": 2.140625, + "learning_rate": 9.196659470314899e-06, + "loss": 1.0595, + "step": 2827 + }, + { + "epoch": 0.5653032158116988, + "grad_norm": 2.0625, + "learning_rate": 9.196086510319348e-06, + "loss": 1.014, + "step": 2828 + }, + { + "epoch": 0.565503110866795, + "grad_norm": 2.234375, + "learning_rate": 9.195513363934083e-06, + "loss": 1.1328, + "step": 2829 + }, + { + "epoch": 0.565703005921891, + "grad_norm": 2.09375, + "learning_rate": 9.194940031184562e-06, + "loss": 1.0148, + "step": 2830 + }, + { + "epoch": 0.565902900976987, + "grad_norm": 2.15625, + "learning_rate": 9.194366512096252e-06, + "loss": 1.1294, + "step": 2831 + }, + { + "epoch": 0.5661027960320831, + "grad_norm": 2.203125, + "learning_rate": 9.193792806694631e-06, + "loss": 1.0612, + "step": 2832 + }, + { + "epoch": 0.5663026910871792, + "grad_norm": 2.015625, + "learning_rate": 9.19321891500518e-06, + "loss": 1.0499, + "step": 2833 + }, + { + "epoch": 0.5665025861422753, + "grad_norm": 2.140625, + "learning_rate": 9.192644837053394e-06, + "loss": 1.1479, + "step": 2834 + }, + { + "epoch": 0.5667024811973714, + "grad_norm": 2.125, + "learning_rate": 9.192070572864772e-06, + "loss": 1.0428, + "step": 2835 + }, + { + "epoch": 0.5669023762524674, + "grad_norm": 2.03125, + "learning_rate": 9.191496122464822e-06, + "loss": 1.0941, + "step": 2836 + }, + { + "epoch": 0.5671022713075635, + "grad_norm": 2.046875, + "learning_rate": 9.190921485879064e-06, + "loss": 1.0604, + "step": 2837 + }, + { + "epoch": 0.5673021663626596, + "grad_norm": 2.109375, + "learning_rate": 9.19034666313302e-06, + "loss": 1.1028, + "step": 2838 + }, + { + "epoch": 0.5675020614177557, + "grad_norm": 2.09375, + "learning_rate": 9.189771654252221e-06, + "loss": 1.0805, + "step": 2839 + }, + { + "epoch": 0.5677019564728517, + "grad_norm": 2.234375, + "learning_rate": 9.189196459262217e-06, + "loss": 1.1518, + "step": 2840 + }, + { + "epoch": 0.5679018515279478, + "grad_norm": 2.3125, + "learning_rate": 9.188621078188554e-06, + "loss": 1.1624, + "step": 2841 + }, + { + "epoch": 0.5681017465830439, + "grad_norm": 2.140625, + "learning_rate": 9.188045511056787e-06, + "loss": 1.0701, + "step": 2842 + }, + { + "epoch": 0.56830164163814, + "grad_norm": 2.21875, + "learning_rate": 9.187469757892489e-06, + "loss": 1.1878, + "step": 2843 + }, + { + "epoch": 0.568501536693236, + "grad_norm": 2.03125, + "learning_rate": 9.18689381872123e-06, + "loss": 1.0141, + "step": 2844 + }, + { + "epoch": 0.5687014317483321, + "grad_norm": 1.9453125, + "learning_rate": 9.186317693568594e-06, + "loss": 0.999, + "step": 2845 + }, + { + "epoch": 0.5689013268034282, + "grad_norm": 2.140625, + "learning_rate": 9.185741382460173e-06, + "loss": 1.0444, + "step": 2846 + }, + { + "epoch": 0.5691012218585243, + "grad_norm": 2.125, + "learning_rate": 9.18516488542157e-06, + "loss": 1.078, + "step": 2847 + }, + { + "epoch": 0.5693011169136204, + "grad_norm": 2.15625, + "learning_rate": 9.184588202478387e-06, + "loss": 1.1302, + "step": 2848 + }, + { + "epoch": 0.5695010119687164, + "grad_norm": 2.03125, + "learning_rate": 9.184011333656243e-06, + "loss": 1.0853, + "step": 2849 + }, + { + "epoch": 0.5697009070238125, + "grad_norm": 2.09375, + "learning_rate": 9.183434278980763e-06, + "loss": 1.0982, + "step": 2850 + }, + { + "epoch": 0.5699008020789086, + "grad_norm": 2.140625, + "learning_rate": 9.18285703847758e-06, + "loss": 1.1812, + "step": 2851 + }, + { + "epoch": 0.5701006971340047, + "grad_norm": 2.09375, + "learning_rate": 9.182279612172332e-06, + "loss": 1.0367, + "step": 2852 + }, + { + "epoch": 0.5703005921891007, + "grad_norm": 2.03125, + "learning_rate": 9.18170200009067e-06, + "loss": 1.0112, + "step": 2853 + }, + { + "epoch": 0.5705004872441968, + "grad_norm": 2.171875, + "learning_rate": 9.181124202258254e-06, + "loss": 0.9669, + "step": 2854 + }, + { + "epoch": 0.5707003822992929, + "grad_norm": 1.9921875, + "learning_rate": 9.180546218700748e-06, + "loss": 0.9886, + "step": 2855 + }, + { + "epoch": 0.570900277354389, + "grad_norm": 2.078125, + "learning_rate": 9.179968049443823e-06, + "loss": 1.0388, + "step": 2856 + }, + { + "epoch": 0.5711001724094851, + "grad_norm": 2.015625, + "learning_rate": 9.179389694513166e-06, + "loss": 1.0461, + "step": 2857 + }, + { + "epoch": 0.5713000674645811, + "grad_norm": 2.15625, + "learning_rate": 9.178811153934463e-06, + "loss": 1.0419, + "step": 2858 + }, + { + "epoch": 0.5714999625196772, + "grad_norm": 2.125, + "learning_rate": 9.178232427733418e-06, + "loss": 1.0207, + "step": 2859 + }, + { + "epoch": 0.5716998575747733, + "grad_norm": 2.046875, + "learning_rate": 9.177653515935732e-06, + "loss": 1.052, + "step": 2860 + }, + { + "epoch": 0.5718997526298694, + "grad_norm": 2.078125, + "learning_rate": 9.177074418567124e-06, + "loss": 1.0429, + "step": 2861 + }, + { + "epoch": 0.5720996476849654, + "grad_norm": 2.015625, + "learning_rate": 9.176495135653315e-06, + "loss": 0.9709, + "step": 2862 + }, + { + "epoch": 0.5722995427400615, + "grad_norm": 2.234375, + "learning_rate": 9.175915667220039e-06, + "loss": 1.084, + "step": 2863 + }, + { + "epoch": 0.5724994377951576, + "grad_norm": 1.9609375, + "learning_rate": 9.175336013293037e-06, + "loss": 1.029, + "step": 2864 + }, + { + "epoch": 0.5726993328502537, + "grad_norm": 2.0, + "learning_rate": 9.174756173898053e-06, + "loss": 1.0011, + "step": 2865 + }, + { + "epoch": 0.5728992279053496, + "grad_norm": 2.125, + "learning_rate": 9.174176149060847e-06, + "loss": 1.1444, + "step": 2866 + }, + { + "epoch": 0.5730991229604457, + "grad_norm": 2.0625, + "learning_rate": 9.173595938807181e-06, + "loss": 1.0194, + "step": 2867 + }, + { + "epoch": 0.5732990180155418, + "grad_norm": 2.140625, + "learning_rate": 9.173015543162828e-06, + "loss": 1.0937, + "step": 2868 + }, + { + "epoch": 0.5734989130706379, + "grad_norm": 2.109375, + "learning_rate": 9.172434962153573e-06, + "loss": 1.0366, + "step": 2869 + }, + { + "epoch": 0.573698808125734, + "grad_norm": 2.171875, + "learning_rate": 9.171854195805202e-06, + "loss": 1.0427, + "step": 2870 + }, + { + "epoch": 0.57389870318083, + "grad_norm": 2.234375, + "learning_rate": 9.171273244143512e-06, + "loss": 1.0721, + "step": 2871 + }, + { + "epoch": 0.5740985982359261, + "grad_norm": 2.046875, + "learning_rate": 9.170692107194311e-06, + "loss": 1.0075, + "step": 2872 + }, + { + "epoch": 0.5742984932910222, + "grad_norm": 2.09375, + "learning_rate": 9.170110784983413e-06, + "loss": 1.0556, + "step": 2873 + }, + { + "epoch": 0.5744983883461183, + "grad_norm": 2.0625, + "learning_rate": 9.169529277536637e-06, + "loss": 1.0713, + "step": 2874 + }, + { + "epoch": 0.5746982834012143, + "grad_norm": 2.03125, + "learning_rate": 9.168947584879818e-06, + "loss": 0.9023, + "step": 2875 + }, + { + "epoch": 0.5748981784563104, + "grad_norm": 2.15625, + "learning_rate": 9.16836570703879e-06, + "loss": 1.1496, + "step": 2876 + }, + { + "epoch": 0.5750980735114065, + "grad_norm": 2.09375, + "learning_rate": 9.167783644039405e-06, + "loss": 1.145, + "step": 2877 + }, + { + "epoch": 0.5752979685665026, + "grad_norm": 2.078125, + "learning_rate": 9.167201395907515e-06, + "loss": 0.9597, + "step": 2878 + }, + { + "epoch": 0.5754978636215987, + "grad_norm": 2.109375, + "learning_rate": 9.166618962668983e-06, + "loss": 1.0683, + "step": 2879 + }, + { + "epoch": 0.5756977586766947, + "grad_norm": 1.9296875, + "learning_rate": 9.166036344349683e-06, + "loss": 0.936, + "step": 2880 + }, + { + "epoch": 0.5758976537317908, + "grad_norm": 2.0625, + "learning_rate": 9.165453540975492e-06, + "loss": 1.0168, + "step": 2881 + }, + { + "epoch": 0.5760975487868869, + "grad_norm": 2.015625, + "learning_rate": 9.1648705525723e-06, + "loss": 1.0268, + "step": 2882 + }, + { + "epoch": 0.576297443841983, + "grad_norm": 2.09375, + "learning_rate": 9.164287379166004e-06, + "loss": 1.0488, + "step": 2883 + }, + { + "epoch": 0.576497338897079, + "grad_norm": 2.09375, + "learning_rate": 9.163704020782507e-06, + "loss": 1.0319, + "step": 2884 + }, + { + "epoch": 0.5766972339521751, + "grad_norm": 2.03125, + "learning_rate": 9.163120477447722e-06, + "loss": 1.0859, + "step": 2885 + }, + { + "epoch": 0.5768971290072712, + "grad_norm": 2.140625, + "learning_rate": 9.16253674918757e-06, + "loss": 1.0748, + "step": 2886 + }, + { + "epoch": 0.5770970240623673, + "grad_norm": 2.03125, + "learning_rate": 9.161952836027978e-06, + "loss": 1.0619, + "step": 2887 + }, + { + "epoch": 0.5772969191174633, + "grad_norm": 2.046875, + "learning_rate": 9.16136873799489e-06, + "loss": 1.0363, + "step": 2888 + }, + { + "epoch": 0.5774968141725594, + "grad_norm": 1.9453125, + "learning_rate": 9.160784455114245e-06, + "loss": 0.9707, + "step": 2889 + }, + { + "epoch": 0.5776967092276555, + "grad_norm": 2.078125, + "learning_rate": 9.160199987411997e-06, + "loss": 1.1169, + "step": 2890 + }, + { + "epoch": 0.5778966042827516, + "grad_norm": 2.09375, + "learning_rate": 9.159615334914112e-06, + "loss": 1.0517, + "step": 2891 + }, + { + "epoch": 0.5780964993378477, + "grad_norm": 2.1875, + "learning_rate": 9.159030497646558e-06, + "loss": 1.1246, + "step": 2892 + }, + { + "epoch": 0.5782963943929437, + "grad_norm": 2.015625, + "learning_rate": 9.158445475635312e-06, + "loss": 0.9835, + "step": 2893 + }, + { + "epoch": 0.5784962894480398, + "grad_norm": 2.0625, + "learning_rate": 9.157860268906364e-06, + "loss": 0.9527, + "step": 2894 + }, + { + "epoch": 0.5786961845031359, + "grad_norm": 2.171875, + "learning_rate": 9.157274877485706e-06, + "loss": 1.0521, + "step": 2895 + }, + { + "epoch": 0.578896079558232, + "grad_norm": 2.109375, + "learning_rate": 9.156689301399342e-06, + "loss": 1.0526, + "step": 2896 + }, + { + "epoch": 0.579095974613328, + "grad_norm": 2.078125, + "learning_rate": 9.156103540673282e-06, + "loss": 1.0322, + "step": 2897 + }, + { + "epoch": 0.5792958696684241, + "grad_norm": 2.234375, + "learning_rate": 9.15551759533355e-06, + "loss": 1.0069, + "step": 2898 + }, + { + "epoch": 0.5794957647235202, + "grad_norm": 2.296875, + "learning_rate": 9.154931465406167e-06, + "loss": 1.1428, + "step": 2899 + }, + { + "epoch": 0.5796956597786163, + "grad_norm": 2.609375, + "learning_rate": 9.154345150917173e-06, + "loss": 1.1547, + "step": 2900 + }, + { + "epoch": 0.5798955548337122, + "grad_norm": 2.0625, + "learning_rate": 9.15375865189261e-06, + "loss": 1.1787, + "step": 2901 + }, + { + "epoch": 0.5800954498888083, + "grad_norm": 2.046875, + "learning_rate": 9.153171968358534e-06, + "loss": 1.0503, + "step": 2902 + }, + { + "epoch": 0.5802953449439044, + "grad_norm": 2.03125, + "learning_rate": 9.152585100341e-06, + "loss": 1.0329, + "step": 2903 + }, + { + "epoch": 0.5804952399990005, + "grad_norm": 2.078125, + "learning_rate": 9.151998047866082e-06, + "loss": 1.0738, + "step": 2904 + }, + { + "epoch": 0.5806951350540966, + "grad_norm": 2.109375, + "learning_rate": 9.151410810959853e-06, + "loss": 1.0141, + "step": 2905 + }, + { + "epoch": 0.5808950301091926, + "grad_norm": 2.0625, + "learning_rate": 9.150823389648398e-06, + "loss": 1.1399, + "step": 2906 + }, + { + "epoch": 0.5810949251642887, + "grad_norm": 2.046875, + "learning_rate": 9.150235783957812e-06, + "loss": 1.0416, + "step": 2907 + }, + { + "epoch": 0.5812948202193848, + "grad_norm": 2.046875, + "learning_rate": 9.149647993914196e-06, + "loss": 1.0442, + "step": 2908 + }, + { + "epoch": 0.5814947152744809, + "grad_norm": 2.140625, + "learning_rate": 9.149060019543661e-06, + "loss": 1.0853, + "step": 2909 + }, + { + "epoch": 0.5816946103295769, + "grad_norm": 2.046875, + "learning_rate": 9.148471860872322e-06, + "loss": 0.9844, + "step": 2910 + }, + { + "epoch": 0.581894505384673, + "grad_norm": 2.1875, + "learning_rate": 9.147883517926303e-06, + "loss": 1.0472, + "step": 2911 + }, + { + "epoch": 0.5820944004397691, + "grad_norm": 2.171875, + "learning_rate": 9.147294990731746e-06, + "loss": 1.1004, + "step": 2912 + }, + { + "epoch": 0.5822942954948652, + "grad_norm": 2.140625, + "learning_rate": 9.146706279314786e-06, + "loss": 1.075, + "step": 2913 + }, + { + "epoch": 0.5824941905499613, + "grad_norm": 2.21875, + "learning_rate": 9.146117383701575e-06, + "loss": 0.9788, + "step": 2914 + }, + { + "epoch": 0.5826940856050573, + "grad_norm": 2.125, + "learning_rate": 9.145528303918273e-06, + "loss": 1.0786, + "step": 2915 + }, + { + "epoch": 0.5828939806601534, + "grad_norm": 2.171875, + "learning_rate": 9.144939039991047e-06, + "loss": 1.0987, + "step": 2916 + }, + { + "epoch": 0.5830938757152495, + "grad_norm": 2.0625, + "learning_rate": 9.144349591946072e-06, + "loss": 1.0612, + "step": 2917 + }, + { + "epoch": 0.5832937707703456, + "grad_norm": 2.09375, + "learning_rate": 9.143759959809531e-06, + "loss": 1.1392, + "step": 2918 + }, + { + "epoch": 0.5834936658254416, + "grad_norm": 2.0, + "learning_rate": 9.143170143607613e-06, + "loss": 1.0307, + "step": 2919 + }, + { + "epoch": 0.5836935608805377, + "grad_norm": 2.203125, + "learning_rate": 9.142580143366523e-06, + "loss": 1.1968, + "step": 2920 + }, + { + "epoch": 0.5838934559356338, + "grad_norm": 2.15625, + "learning_rate": 9.141989959112464e-06, + "loss": 1.0975, + "step": 2921 + }, + { + "epoch": 0.5840933509907299, + "grad_norm": 2.078125, + "learning_rate": 9.141399590871652e-06, + "loss": 1.0722, + "step": 2922 + }, + { + "epoch": 0.5842932460458259, + "grad_norm": 2.171875, + "learning_rate": 9.140809038670314e-06, + "loss": 1.0535, + "step": 2923 + }, + { + "epoch": 0.584493141100922, + "grad_norm": 2.078125, + "learning_rate": 9.14021830253468e-06, + "loss": 1.1045, + "step": 2924 + }, + { + "epoch": 0.5846930361560181, + "grad_norm": 2.109375, + "learning_rate": 9.139627382490992e-06, + "loss": 1.0238, + "step": 2925 + }, + { + "epoch": 0.5848929312111142, + "grad_norm": 2.15625, + "learning_rate": 9.1390362785655e-06, + "loss": 1.1183, + "step": 2926 + }, + { + "epoch": 0.5850928262662103, + "grad_norm": 2.109375, + "learning_rate": 9.138444990784455e-06, + "loss": 1.0808, + "step": 2927 + }, + { + "epoch": 0.5852927213213063, + "grad_norm": 2.1875, + "learning_rate": 9.137853519174124e-06, + "loss": 1.0787, + "step": 2928 + }, + { + "epoch": 0.5854926163764024, + "grad_norm": 2.046875, + "learning_rate": 9.137261863760784e-06, + "loss": 1.0622, + "step": 2929 + }, + { + "epoch": 0.5856925114314985, + "grad_norm": 2.1875, + "learning_rate": 9.136670024570715e-06, + "loss": 1.0653, + "step": 2930 + }, + { + "epoch": 0.5858924064865946, + "grad_norm": 2.125, + "learning_rate": 9.136078001630205e-06, + "loss": 1.1184, + "step": 2931 + }, + { + "epoch": 0.5860923015416906, + "grad_norm": 1.9375, + "learning_rate": 9.13548579496555e-06, + "loss": 0.9863, + "step": 2932 + }, + { + "epoch": 0.5862921965967867, + "grad_norm": 2.09375, + "learning_rate": 9.13489340460306e-06, + "loss": 1.1144, + "step": 2933 + }, + { + "epoch": 0.5864920916518828, + "grad_norm": 2.15625, + "learning_rate": 9.134300830569046e-06, + "loss": 1.1486, + "step": 2934 + }, + { + "epoch": 0.5866919867069789, + "grad_norm": 2.203125, + "learning_rate": 9.133708072889828e-06, + "loss": 1.0694, + "step": 2935 + }, + { + "epoch": 0.586891881762075, + "grad_norm": 2.078125, + "learning_rate": 9.133115131591744e-06, + "loss": 1.1313, + "step": 2936 + }, + { + "epoch": 0.587091776817171, + "grad_norm": 2.0625, + "learning_rate": 9.132522006701123e-06, + "loss": 0.9837, + "step": 2937 + }, + { + "epoch": 0.587291671872267, + "grad_norm": 2.265625, + "learning_rate": 9.131928698244317e-06, + "loss": 1.2114, + "step": 2938 + }, + { + "epoch": 0.5874915669273632, + "grad_norm": 2.125, + "learning_rate": 9.13133520624768e-06, + "loss": 0.9378, + "step": 2939 + }, + { + "epoch": 0.5876914619824593, + "grad_norm": 2.03125, + "learning_rate": 9.130741530737573e-06, + "loss": 1.0948, + "step": 2940 + }, + { + "epoch": 0.5878913570375552, + "grad_norm": 2.046875, + "learning_rate": 9.130147671740371e-06, + "loss": 1.0223, + "step": 2941 + }, + { + "epoch": 0.5880912520926513, + "grad_norm": 2.078125, + "learning_rate": 9.129553629282448e-06, + "loss": 1.1147, + "step": 2942 + }, + { + "epoch": 0.5882911471477474, + "grad_norm": 2.109375, + "learning_rate": 9.128959403390195e-06, + "loss": 1.0553, + "step": 2943 + }, + { + "epoch": 0.5884910422028435, + "grad_norm": 2.0625, + "learning_rate": 9.128364994090007e-06, + "loss": 1.0579, + "step": 2944 + }, + { + "epoch": 0.5886909372579395, + "grad_norm": 2.1875, + "learning_rate": 9.127770401408284e-06, + "loss": 1.097, + "step": 2945 + }, + { + "epoch": 0.5888908323130356, + "grad_norm": 2.125, + "learning_rate": 9.127175625371443e-06, + "loss": 1.0519, + "step": 2946 + }, + { + "epoch": 0.5890907273681317, + "grad_norm": 2.15625, + "learning_rate": 9.126580666005901e-06, + "loss": 1.0373, + "step": 2947 + }, + { + "epoch": 0.5892906224232278, + "grad_norm": 2.125, + "learning_rate": 9.125985523338088e-06, + "loss": 1.0299, + "step": 2948 + }, + { + "epoch": 0.5894905174783239, + "grad_norm": 2.046875, + "learning_rate": 9.125390197394437e-06, + "loss": 1.0801, + "step": 2949 + }, + { + "epoch": 0.5896904125334199, + "grad_norm": 2.078125, + "learning_rate": 9.124794688201394e-06, + "loss": 1.1661, + "step": 2950 + }, + { + "epoch": 0.589890307588516, + "grad_norm": 2.015625, + "learning_rate": 9.124198995785414e-06, + "loss": 1.1123, + "step": 2951 + }, + { + "epoch": 0.5900902026436121, + "grad_norm": 2.0, + "learning_rate": 9.123603120172952e-06, + "loss": 0.9848, + "step": 2952 + }, + { + "epoch": 0.5902900976987082, + "grad_norm": 2.171875, + "learning_rate": 9.123007061390481e-06, + "loss": 0.9996, + "step": 2953 + }, + { + "epoch": 0.5904899927538042, + "grad_norm": 2.0625, + "learning_rate": 9.122410819464476e-06, + "loss": 1.1073, + "step": 2954 + }, + { + "epoch": 0.5906898878089003, + "grad_norm": 1.984375, + "learning_rate": 9.121814394421423e-06, + "loss": 0.9456, + "step": 2955 + }, + { + "epoch": 0.5908897828639964, + "grad_norm": 2.046875, + "learning_rate": 9.121217786287815e-06, + "loss": 1.0559, + "step": 2956 + }, + { + "epoch": 0.5910896779190925, + "grad_norm": 2.09375, + "learning_rate": 9.120620995090154e-06, + "loss": 0.9933, + "step": 2957 + }, + { + "epoch": 0.5912895729741886, + "grad_norm": 1.9765625, + "learning_rate": 9.120024020854947e-06, + "loss": 1.0785, + "step": 2958 + }, + { + "epoch": 0.5914894680292846, + "grad_norm": 2.140625, + "learning_rate": 9.119426863608713e-06, + "loss": 1.1198, + "step": 2959 + }, + { + "epoch": 0.5916893630843807, + "grad_norm": 1.984375, + "learning_rate": 9.11882952337798e-06, + "loss": 0.9539, + "step": 2960 + }, + { + "epoch": 0.5918892581394768, + "grad_norm": 1.8671875, + "learning_rate": 9.118232000189277e-06, + "loss": 0.9188, + "step": 2961 + }, + { + "epoch": 0.5920891531945729, + "grad_norm": 2.109375, + "learning_rate": 9.117634294069148e-06, + "loss": 1.1255, + "step": 2962 + }, + { + "epoch": 0.5922890482496689, + "grad_norm": 2.046875, + "learning_rate": 9.117036405044146e-06, + "loss": 1.1125, + "step": 2963 + }, + { + "epoch": 0.592488943304765, + "grad_norm": 2.078125, + "learning_rate": 9.116438333140825e-06, + "loss": 1.0321, + "step": 2964 + }, + { + "epoch": 0.5926888383598611, + "grad_norm": 2.171875, + "learning_rate": 9.115840078385753e-06, + "loss": 1.0814, + "step": 2965 + }, + { + "epoch": 0.5928887334149572, + "grad_norm": 2.078125, + "learning_rate": 9.115241640805506e-06, + "loss": 1.0344, + "step": 2966 + }, + { + "epoch": 0.5930886284700532, + "grad_norm": 2.109375, + "learning_rate": 9.114643020426662e-06, + "loss": 1.0181, + "step": 2967 + }, + { + "epoch": 0.5932885235251493, + "grad_norm": 2.109375, + "learning_rate": 9.114044217275816e-06, + "loss": 1.0138, + "step": 2968 + }, + { + "epoch": 0.5934884185802454, + "grad_norm": 2.25, + "learning_rate": 9.113445231379565e-06, + "loss": 1.1072, + "step": 2969 + }, + { + "epoch": 0.5936883136353415, + "grad_norm": 2.078125, + "learning_rate": 9.112846062764516e-06, + "loss": 0.9869, + "step": 2970 + }, + { + "epoch": 0.5938882086904376, + "grad_norm": 2.109375, + "learning_rate": 9.112246711457284e-06, + "loss": 0.998, + "step": 2971 + }, + { + "epoch": 0.5940881037455336, + "grad_norm": 2.203125, + "learning_rate": 9.111647177484493e-06, + "loss": 1.0892, + "step": 2972 + }, + { + "epoch": 0.5942879988006297, + "grad_norm": 2.3125, + "learning_rate": 9.111047460872773e-06, + "loss": 1.1074, + "step": 2973 + }, + { + "epoch": 0.5944878938557258, + "grad_norm": 2.078125, + "learning_rate": 9.110447561648766e-06, + "loss": 1.0628, + "step": 2974 + }, + { + "epoch": 0.5946877889108219, + "grad_norm": 2.0625, + "learning_rate": 9.109847479839114e-06, + "loss": 1.0733, + "step": 2975 + }, + { + "epoch": 0.5948876839659178, + "grad_norm": 1.984375, + "learning_rate": 9.109247215470478e-06, + "loss": 1.0749, + "step": 2976 + }, + { + "epoch": 0.595087579021014, + "grad_norm": 2.078125, + "learning_rate": 9.108646768569518e-06, + "loss": 0.9976, + "step": 2977 + }, + { + "epoch": 0.59528747407611, + "grad_norm": 2.125, + "learning_rate": 9.108046139162908e-06, + "loss": 1.0781, + "step": 2978 + }, + { + "epoch": 0.5954873691312061, + "grad_norm": 2.046875, + "learning_rate": 9.107445327277327e-06, + "loss": 1.1034, + "step": 2979 + }, + { + "epoch": 0.5956872641863022, + "grad_norm": 1.953125, + "learning_rate": 9.106844332939464e-06, + "loss": 0.968, + "step": 2980 + }, + { + "epoch": 0.5958871592413982, + "grad_norm": 2.09375, + "learning_rate": 9.106243156176015e-06, + "loss": 1.0788, + "step": 2981 + }, + { + "epoch": 0.5960870542964943, + "grad_norm": 1.96875, + "learning_rate": 9.105641797013682e-06, + "loss": 1.058, + "step": 2982 + }, + { + "epoch": 0.5962869493515904, + "grad_norm": 1.96875, + "learning_rate": 9.10504025547918e-06, + "loss": 1.0087, + "step": 2983 + }, + { + "epoch": 0.5964868444066865, + "grad_norm": 2.21875, + "learning_rate": 9.104438531599227e-06, + "loss": 1.013, + "step": 2984 + }, + { + "epoch": 0.5966867394617825, + "grad_norm": 2.046875, + "learning_rate": 9.103836625400554e-06, + "loss": 0.9962, + "step": 2985 + }, + { + "epoch": 0.5968866345168786, + "grad_norm": 2.15625, + "learning_rate": 9.103234536909895e-06, + "loss": 1.0629, + "step": 2986 + }, + { + "epoch": 0.5970865295719747, + "grad_norm": 2.09375, + "learning_rate": 9.102632266153997e-06, + "loss": 1.1041, + "step": 2987 + }, + { + "epoch": 0.5972864246270708, + "grad_norm": 2.015625, + "learning_rate": 9.102029813159613e-06, + "loss": 0.9685, + "step": 2988 + }, + { + "epoch": 0.5974863196821668, + "grad_norm": 2.03125, + "learning_rate": 9.101427177953502e-06, + "loss": 0.9442, + "step": 2989 + }, + { + "epoch": 0.5976862147372629, + "grad_norm": 2.171875, + "learning_rate": 9.100824360562432e-06, + "loss": 0.9826, + "step": 2990 + }, + { + "epoch": 0.597886109792359, + "grad_norm": 2.03125, + "learning_rate": 9.100221361013185e-06, + "loss": 0.9846, + "step": 2991 + }, + { + "epoch": 0.5980860048474551, + "grad_norm": 2.140625, + "learning_rate": 9.099618179332541e-06, + "loss": 1.1631, + "step": 2992 + }, + { + "epoch": 0.5982858999025512, + "grad_norm": 2.171875, + "learning_rate": 9.099014815547296e-06, + "loss": 1.1305, + "step": 2993 + }, + { + "epoch": 0.5984857949576472, + "grad_norm": 2.0625, + "learning_rate": 9.098411269684251e-06, + "loss": 0.9717, + "step": 2994 + }, + { + "epoch": 0.5986856900127433, + "grad_norm": 2.21875, + "learning_rate": 9.097807541770214e-06, + "loss": 1.1103, + "step": 2995 + }, + { + "epoch": 0.5988855850678394, + "grad_norm": 2.046875, + "learning_rate": 9.097203631832006e-06, + "loss": 1.0348, + "step": 2996 + }, + { + "epoch": 0.5990854801229355, + "grad_norm": 2.265625, + "learning_rate": 9.096599539896447e-06, + "loss": 1.1009, + "step": 2997 + }, + { + "epoch": 0.5992853751780315, + "grad_norm": 2.109375, + "learning_rate": 9.095995265990375e-06, + "loss": 1.0759, + "step": 2998 + }, + { + "epoch": 0.5994852702331276, + "grad_norm": 2.15625, + "learning_rate": 9.095390810140633e-06, + "loss": 1.0881, + "step": 2999 + }, + { + "epoch": 0.5996851652882237, + "grad_norm": 2.125, + "learning_rate": 9.094786172374066e-06, + "loss": 1.0333, + "step": 3000 + }, + { + "epoch": 0.5996851652882237, + "eval_loss": 0.9266427159309387, + "eval_runtime": 594.7722, + "eval_samples_per_second": 3.595, + "eval_steps_per_second": 3.595, + "step": 3000 + }, + { + "epoch": 0.5998850603433198, + "grad_norm": 1.9921875, + "learning_rate": 9.094181352717535e-06, + "loss": 1.0592, + "step": 3001 + }, + { + "epoch": 0.6000849553984159, + "grad_norm": 2.234375, + "learning_rate": 9.093576351197907e-06, + "loss": 1.1619, + "step": 3002 + }, + { + "epoch": 0.6002848504535119, + "grad_norm": 2.15625, + "learning_rate": 9.092971167842053e-06, + "loss": 1.0548, + "step": 3003 + }, + { + "epoch": 0.600484745508608, + "grad_norm": 2.328125, + "learning_rate": 9.092365802676858e-06, + "loss": 1.1491, + "step": 3004 + }, + { + "epoch": 0.6006846405637041, + "grad_norm": 2.140625, + "learning_rate": 9.091760255729212e-06, + "loss": 1.0809, + "step": 3005 + }, + { + "epoch": 0.6008845356188002, + "grad_norm": 2.078125, + "learning_rate": 9.09115452702601e-06, + "loss": 1.0501, + "step": 3006 + }, + { + "epoch": 0.6010844306738962, + "grad_norm": 2.234375, + "learning_rate": 9.090548616594163e-06, + "loss": 1.0777, + "step": 3007 + }, + { + "epoch": 0.6012843257289923, + "grad_norm": 2.1875, + "learning_rate": 9.089942524460582e-06, + "loss": 1.0544, + "step": 3008 + }, + { + "epoch": 0.6014842207840884, + "grad_norm": 2.0625, + "learning_rate": 9.08933625065219e-06, + "loss": 1.0639, + "step": 3009 + }, + { + "epoch": 0.6016841158391845, + "grad_norm": 2.046875, + "learning_rate": 9.088729795195921e-06, + "loss": 1.0678, + "step": 3010 + }, + { + "epoch": 0.6018840108942805, + "grad_norm": 2.140625, + "learning_rate": 9.08812315811871e-06, + "loss": 1.0587, + "step": 3011 + }, + { + "epoch": 0.6020839059493766, + "grad_norm": 2.0625, + "learning_rate": 9.087516339447504e-06, + "loss": 1.0833, + "step": 3012 + }, + { + "epoch": 0.6022838010044727, + "grad_norm": 2.28125, + "learning_rate": 9.08690933920926e-06, + "loss": 1.1271, + "step": 3013 + }, + { + "epoch": 0.6024836960595688, + "grad_norm": 1.9453125, + "learning_rate": 9.08630215743094e-06, + "loss": 0.9931, + "step": 3014 + }, + { + "epoch": 0.6026835911146649, + "grad_norm": 2.25, + "learning_rate": 9.085694794139514e-06, + "loss": 1.0981, + "step": 3015 + }, + { + "epoch": 0.6028834861697608, + "grad_norm": 2.015625, + "learning_rate": 9.085087249361961e-06, + "loss": 1.0819, + "step": 3016 + }, + { + "epoch": 0.6030833812248569, + "grad_norm": 2.03125, + "learning_rate": 9.08447952312527e-06, + "loss": 0.9643, + "step": 3017 + }, + { + "epoch": 0.603283276279953, + "grad_norm": 2.15625, + "learning_rate": 9.083871615456433e-06, + "loss": 1.0773, + "step": 3018 + }, + { + "epoch": 0.6034831713350491, + "grad_norm": 2.046875, + "learning_rate": 9.083263526382457e-06, + "loss": 0.9463, + "step": 3019 + }, + { + "epoch": 0.6036830663901451, + "grad_norm": 2.21875, + "learning_rate": 9.08265525593035e-06, + "loss": 1.0375, + "step": 3020 + }, + { + "epoch": 0.6038829614452412, + "grad_norm": 1.9765625, + "learning_rate": 9.082046804127133e-06, + "loss": 0.9687, + "step": 3021 + }, + { + "epoch": 0.6040828565003373, + "grad_norm": 2.140625, + "learning_rate": 9.081438170999833e-06, + "loss": 1.1131, + "step": 3022 + }, + { + "epoch": 0.6042827515554334, + "grad_norm": 2.0625, + "learning_rate": 9.080829356575484e-06, + "loss": 0.963, + "step": 3023 + }, + { + "epoch": 0.6044826466105294, + "grad_norm": 2.140625, + "learning_rate": 9.080220360881133e-06, + "loss": 1.1095, + "step": 3024 + }, + { + "epoch": 0.6046825416656255, + "grad_norm": 1.9921875, + "learning_rate": 9.079611183943828e-06, + "loss": 1.0107, + "step": 3025 + }, + { + "epoch": 0.6048824367207216, + "grad_norm": 2.171875, + "learning_rate": 9.079001825790632e-06, + "loss": 1.0581, + "step": 3026 + }, + { + "epoch": 0.6050823317758177, + "grad_norm": 2.203125, + "learning_rate": 9.078392286448607e-06, + "loss": 1.1935, + "step": 3027 + }, + { + "epoch": 0.6052822268309138, + "grad_norm": 2.09375, + "learning_rate": 9.077782565944836e-06, + "loss": 1.0421, + "step": 3028 + }, + { + "epoch": 0.6054821218860098, + "grad_norm": 2.25, + "learning_rate": 9.077172664306396e-06, + "loss": 1.1599, + "step": 3029 + }, + { + "epoch": 0.6056820169411059, + "grad_norm": 2.234375, + "learning_rate": 9.076562581560384e-06, + "loss": 1.0622, + "step": 3030 + }, + { + "epoch": 0.605881911996202, + "grad_norm": 2.203125, + "learning_rate": 9.075952317733894e-06, + "loss": 1.0438, + "step": 3031 + }, + { + "epoch": 0.6060818070512981, + "grad_norm": 2.0625, + "learning_rate": 9.07534187285404e-06, + "loss": 0.9847, + "step": 3032 + }, + { + "epoch": 0.6062817021063941, + "grad_norm": 2.140625, + "learning_rate": 9.074731246947936e-06, + "loss": 1.0763, + "step": 3033 + }, + { + "epoch": 0.6064815971614902, + "grad_norm": 2.0, + "learning_rate": 9.074120440042705e-06, + "loss": 0.954, + "step": 3034 + }, + { + "epoch": 0.6066814922165863, + "grad_norm": 2.0625, + "learning_rate": 9.073509452165476e-06, + "loss": 1.0371, + "step": 3035 + }, + { + "epoch": 0.6068813872716824, + "grad_norm": 2.078125, + "learning_rate": 9.072898283343395e-06, + "loss": 1.0441, + "step": 3036 + }, + { + "epoch": 0.6070812823267785, + "grad_norm": 2.328125, + "learning_rate": 9.072286933603607e-06, + "loss": 1.1834, + "step": 3037 + }, + { + "epoch": 0.6072811773818745, + "grad_norm": 2.015625, + "learning_rate": 9.071675402973268e-06, + "loss": 1.1353, + "step": 3038 + }, + { + "epoch": 0.6074810724369706, + "grad_norm": 2.09375, + "learning_rate": 9.071063691479542e-06, + "loss": 1.0175, + "step": 3039 + }, + { + "epoch": 0.6076809674920667, + "grad_norm": 2.046875, + "learning_rate": 9.070451799149604e-06, + "loss": 1.003, + "step": 3040 + }, + { + "epoch": 0.6078808625471628, + "grad_norm": 2.421875, + "learning_rate": 9.069839726010629e-06, + "loss": 1.2073, + "step": 3041 + }, + { + "epoch": 0.6080807576022588, + "grad_norm": 2.265625, + "learning_rate": 9.06922747208981e-06, + "loss": 1.0623, + "step": 3042 + }, + { + "epoch": 0.6082806526573549, + "grad_norm": 2.046875, + "learning_rate": 9.068615037414339e-06, + "loss": 1.0947, + "step": 3043 + }, + { + "epoch": 0.608480547712451, + "grad_norm": 2.03125, + "learning_rate": 9.068002422011426e-06, + "loss": 0.8832, + "step": 3044 + }, + { + "epoch": 0.6086804427675471, + "grad_norm": 2.0625, + "learning_rate": 9.067389625908277e-06, + "loss": 1.0155, + "step": 3045 + }, + { + "epoch": 0.6088803378226431, + "grad_norm": 2.265625, + "learning_rate": 9.066776649132116e-06, + "loss": 1.1535, + "step": 3046 + }, + { + "epoch": 0.6090802328777392, + "grad_norm": 2.171875, + "learning_rate": 9.06616349171017e-06, + "loss": 1.0368, + "step": 3047 + }, + { + "epoch": 0.6092801279328353, + "grad_norm": 2.234375, + "learning_rate": 9.065550153669676e-06, + "loss": 1.1035, + "step": 3048 + }, + { + "epoch": 0.6094800229879314, + "grad_norm": 2.0, + "learning_rate": 9.06493663503788e-06, + "loss": 1.0604, + "step": 3049 + }, + { + "epoch": 0.6096799180430275, + "grad_norm": 2.03125, + "learning_rate": 9.064322935842032e-06, + "loss": 0.999, + "step": 3050 + }, + { + "epoch": 0.6098798130981234, + "grad_norm": 2.046875, + "learning_rate": 9.063709056109393e-06, + "loss": 1.0718, + "step": 3051 + }, + { + "epoch": 0.6100797081532195, + "grad_norm": 2.03125, + "learning_rate": 9.063094995867232e-06, + "loss": 1.0119, + "step": 3052 + }, + { + "epoch": 0.6102796032083156, + "grad_norm": 2.0625, + "learning_rate": 9.062480755142824e-06, + "loss": 0.9728, + "step": 3053 + }, + { + "epoch": 0.6104794982634117, + "grad_norm": 2.1875, + "learning_rate": 9.061866333963455e-06, + "loss": 1.0766, + "step": 3054 + }, + { + "epoch": 0.6106793933185077, + "grad_norm": 2.109375, + "learning_rate": 9.06125173235642e-06, + "loss": 1.0861, + "step": 3055 + }, + { + "epoch": 0.6108792883736038, + "grad_norm": 2.0, + "learning_rate": 9.060636950349015e-06, + "loss": 1.0806, + "step": 3056 + }, + { + "epoch": 0.6110791834286999, + "grad_norm": 2.0, + "learning_rate": 9.06002198796855e-06, + "loss": 1.0579, + "step": 3057 + }, + { + "epoch": 0.611279078483796, + "grad_norm": 2.1875, + "learning_rate": 9.059406845242343e-06, + "loss": 1.0949, + "step": 3058 + }, + { + "epoch": 0.6114789735388921, + "grad_norm": 2.0625, + "learning_rate": 9.058791522197717e-06, + "loss": 0.9956, + "step": 3059 + }, + { + "epoch": 0.6116788685939881, + "grad_norm": 1.9921875, + "learning_rate": 9.058176018862004e-06, + "loss": 1.0662, + "step": 3060 + }, + { + "epoch": 0.6118787636490842, + "grad_norm": 1.984375, + "learning_rate": 9.057560335262546e-06, + "loss": 1.0139, + "step": 3061 + }, + { + "epoch": 0.6120786587041803, + "grad_norm": 2.078125, + "learning_rate": 9.056944471426692e-06, + "loss": 1.0547, + "step": 3062 + }, + { + "epoch": 0.6122785537592764, + "grad_norm": 2.09375, + "learning_rate": 9.056328427381798e-06, + "loss": 1.1331, + "step": 3063 + }, + { + "epoch": 0.6124784488143724, + "grad_norm": 2.203125, + "learning_rate": 9.055712203155226e-06, + "loss": 1.0904, + "step": 3064 + }, + { + "epoch": 0.6126783438694685, + "grad_norm": 2.09375, + "learning_rate": 9.055095798774353e-06, + "loss": 1.0879, + "step": 3065 + }, + { + "epoch": 0.6128782389245646, + "grad_norm": 2.0, + "learning_rate": 9.05447921426656e-06, + "loss": 1.0473, + "step": 3066 + }, + { + "epoch": 0.6130781339796607, + "grad_norm": 1.9453125, + "learning_rate": 9.05386244965923e-06, + "loss": 1.0045, + "step": 3067 + }, + { + "epoch": 0.6132780290347567, + "grad_norm": 2.078125, + "learning_rate": 9.053245504979764e-06, + "loss": 0.9839, + "step": 3068 + }, + { + "epoch": 0.6134779240898528, + "grad_norm": 2.09375, + "learning_rate": 9.052628380255565e-06, + "loss": 1.0137, + "step": 3069 + }, + { + "epoch": 0.6136778191449489, + "grad_norm": 1.9375, + "learning_rate": 9.052011075514049e-06, + "loss": 0.985, + "step": 3070 + }, + { + "epoch": 0.613877714200045, + "grad_norm": 2.0625, + "learning_rate": 9.051393590782631e-06, + "loss": 0.9909, + "step": 3071 + }, + { + "epoch": 0.6140776092551411, + "grad_norm": 1.9609375, + "learning_rate": 9.050775926088743e-06, + "loss": 1.0162, + "step": 3072 + }, + { + "epoch": 0.6142775043102371, + "grad_norm": 2.046875, + "learning_rate": 9.050158081459821e-06, + "loss": 1.0008, + "step": 3073 + }, + { + "epoch": 0.6144773993653332, + "grad_norm": 1.953125, + "learning_rate": 9.049540056923309e-06, + "loss": 1.0254, + "step": 3074 + }, + { + "epoch": 0.6146772944204293, + "grad_norm": 2.078125, + "learning_rate": 9.048921852506662e-06, + "loss": 1.0035, + "step": 3075 + }, + { + "epoch": 0.6148771894755254, + "grad_norm": 2.09375, + "learning_rate": 9.048303468237337e-06, + "loss": 1.0047, + "step": 3076 + }, + { + "epoch": 0.6150770845306214, + "grad_norm": 2.25, + "learning_rate": 9.047684904142806e-06, + "loss": 1.0216, + "step": 3077 + }, + { + "epoch": 0.6152769795857175, + "grad_norm": 2.203125, + "learning_rate": 9.047066160250542e-06, + "loss": 1.0792, + "step": 3078 + }, + { + "epoch": 0.6154768746408136, + "grad_norm": 1.96875, + "learning_rate": 9.046447236588032e-06, + "loss": 0.9194, + "step": 3079 + }, + { + "epoch": 0.6156767696959097, + "grad_norm": 2.21875, + "learning_rate": 9.045828133182769e-06, + "loss": 1.0588, + "step": 3080 + }, + { + "epoch": 0.6158766647510058, + "grad_norm": 2.03125, + "learning_rate": 9.045208850062252e-06, + "loss": 0.926, + "step": 3081 + }, + { + "epoch": 0.6160765598061018, + "grad_norm": 3.609375, + "learning_rate": 9.044589387253988e-06, + "loss": 1.0148, + "step": 3082 + }, + { + "epoch": 0.6162764548611979, + "grad_norm": 2.203125, + "learning_rate": 9.043969744785498e-06, + "loss": 1.0506, + "step": 3083 + }, + { + "epoch": 0.616476349916294, + "grad_norm": 2.015625, + "learning_rate": 9.043349922684302e-06, + "loss": 1.0651, + "step": 3084 + }, + { + "epoch": 0.6166762449713901, + "grad_norm": 2.234375, + "learning_rate": 9.042729920977936e-06, + "loss": 1.0557, + "step": 3085 + }, + { + "epoch": 0.616876140026486, + "grad_norm": 2.09375, + "learning_rate": 9.042109739693938e-06, + "loss": 1.0763, + "step": 3086 + }, + { + "epoch": 0.6170760350815822, + "grad_norm": 2.09375, + "learning_rate": 9.041489378859856e-06, + "loss": 1.0005, + "step": 3087 + }, + { + "epoch": 0.6172759301366783, + "grad_norm": 2.109375, + "learning_rate": 9.040868838503247e-06, + "loss": 1.0142, + "step": 3088 + }, + { + "epoch": 0.6174758251917744, + "grad_norm": 2.046875, + "learning_rate": 9.040248118651677e-06, + "loss": 1.0612, + "step": 3089 + }, + { + "epoch": 0.6176757202468703, + "grad_norm": 2.046875, + "learning_rate": 9.039627219332718e-06, + "loss": 1.0389, + "step": 3090 + }, + { + "epoch": 0.6178756153019664, + "grad_norm": 2.0, + "learning_rate": 9.03900614057395e-06, + "loss": 0.9776, + "step": 3091 + }, + { + "epoch": 0.6180755103570625, + "grad_norm": 2.125, + "learning_rate": 9.038384882402957e-06, + "loss": 0.9351, + "step": 3092 + }, + { + "epoch": 0.6182754054121586, + "grad_norm": 2.09375, + "learning_rate": 9.037763444847342e-06, + "loss": 1.0853, + "step": 3093 + }, + { + "epoch": 0.6184753004672547, + "grad_norm": 2.15625, + "learning_rate": 9.037141827934705e-06, + "loss": 1.0996, + "step": 3094 + }, + { + "epoch": 0.6186751955223507, + "grad_norm": 2.140625, + "learning_rate": 9.036520031692658e-06, + "loss": 0.9808, + "step": 3095 + }, + { + "epoch": 0.6188750905774468, + "grad_norm": 2.15625, + "learning_rate": 9.035898056148824e-06, + "loss": 1.0767, + "step": 3096 + }, + { + "epoch": 0.6190749856325429, + "grad_norm": 2.125, + "learning_rate": 9.035275901330828e-06, + "loss": 0.9893, + "step": 3097 + }, + { + "epoch": 0.619274880687639, + "grad_norm": 2.71875, + "learning_rate": 9.03465356726631e-06, + "loss": 1.0129, + "step": 3098 + }, + { + "epoch": 0.619474775742735, + "grad_norm": 2.015625, + "learning_rate": 9.034031053982909e-06, + "loss": 1.0106, + "step": 3099 + }, + { + "epoch": 0.6196746707978311, + "grad_norm": 2.046875, + "learning_rate": 9.03340836150828e-06, + "loss": 0.9799, + "step": 3100 + }, + { + "epoch": 0.6198745658529272, + "grad_norm": 2.03125, + "learning_rate": 9.032785489870084e-06, + "loss": 0.9461, + "step": 3101 + }, + { + "epoch": 0.6200744609080233, + "grad_norm": 2.109375, + "learning_rate": 9.032162439095984e-06, + "loss": 1.1415, + "step": 3102 + }, + { + "epoch": 0.6202743559631194, + "grad_norm": 2.21875, + "learning_rate": 9.031539209213662e-06, + "loss": 1.1099, + "step": 3103 + }, + { + "epoch": 0.6204742510182154, + "grad_norm": 2.09375, + "learning_rate": 9.030915800250797e-06, + "loss": 1.03, + "step": 3104 + }, + { + "epoch": 0.6206741460733115, + "grad_norm": 2.09375, + "learning_rate": 9.030292212235083e-06, + "loss": 1.0174, + "step": 3105 + }, + { + "epoch": 0.6208740411284076, + "grad_norm": 2.171875, + "learning_rate": 9.02966844519422e-06, + "loss": 1.0381, + "step": 3106 + }, + { + "epoch": 0.6210739361835037, + "grad_norm": 2.09375, + "learning_rate": 9.029044499155914e-06, + "loss": 0.9533, + "step": 3107 + }, + { + "epoch": 0.6212738312385997, + "grad_norm": 2.125, + "learning_rate": 9.028420374147885e-06, + "loss": 0.9208, + "step": 3108 + }, + { + "epoch": 0.6214737262936958, + "grad_norm": 2.140625, + "learning_rate": 9.02779607019785e-06, + "loss": 1.0589, + "step": 3109 + }, + { + "epoch": 0.6216736213487919, + "grad_norm": 2.09375, + "learning_rate": 9.027171587333543e-06, + "loss": 1.0516, + "step": 3110 + }, + { + "epoch": 0.621873516403888, + "grad_norm": 2.03125, + "learning_rate": 9.026546925582707e-06, + "loss": 1.038, + "step": 3111 + }, + { + "epoch": 0.622073411458984, + "grad_norm": 2.046875, + "learning_rate": 9.025922084973084e-06, + "loss": 1.0842, + "step": 3112 + }, + { + "epoch": 0.6222733065140801, + "grad_norm": 2.171875, + "learning_rate": 9.025297065532435e-06, + "loss": 1.1177, + "step": 3113 + }, + { + "epoch": 0.6224732015691762, + "grad_norm": 2.25, + "learning_rate": 9.024671867288518e-06, + "loss": 1.1309, + "step": 3114 + }, + { + "epoch": 0.6226730966242723, + "grad_norm": 2.203125, + "learning_rate": 9.024046490269107e-06, + "loss": 1.1373, + "step": 3115 + }, + { + "epoch": 0.6228729916793684, + "grad_norm": 2.078125, + "learning_rate": 9.023420934501981e-06, + "loss": 1.0819, + "step": 3116 + }, + { + "epoch": 0.6230728867344644, + "grad_norm": 2.09375, + "learning_rate": 9.022795200014927e-06, + "loss": 0.9935, + "step": 3117 + }, + { + "epoch": 0.6232727817895605, + "grad_norm": 2.015625, + "learning_rate": 9.022169286835737e-06, + "loss": 0.9575, + "step": 3118 + }, + { + "epoch": 0.6234726768446566, + "grad_norm": 2.125, + "learning_rate": 9.02154319499222e-06, + "loss": 1.0777, + "step": 3119 + }, + { + "epoch": 0.6236725718997527, + "grad_norm": 2.109375, + "learning_rate": 9.020916924512183e-06, + "loss": 1.0401, + "step": 3120 + }, + { + "epoch": 0.6238724669548487, + "grad_norm": 2.046875, + "learning_rate": 9.020290475423447e-06, + "loss": 0.9766, + "step": 3121 + }, + { + "epoch": 0.6240723620099448, + "grad_norm": 2.09375, + "learning_rate": 9.019663847753837e-06, + "loss": 1.0157, + "step": 3122 + }, + { + "epoch": 0.6242722570650409, + "grad_norm": 2.265625, + "learning_rate": 9.019037041531187e-06, + "loss": 0.9897, + "step": 3123 + }, + { + "epoch": 0.624472152120137, + "grad_norm": 2.109375, + "learning_rate": 9.01841005678334e-06, + "loss": 0.974, + "step": 3124 + }, + { + "epoch": 0.624672047175233, + "grad_norm": 2.078125, + "learning_rate": 9.017782893538149e-06, + "loss": 1.0121, + "step": 3125 + }, + { + "epoch": 0.624871942230329, + "grad_norm": 2.046875, + "learning_rate": 9.01715555182347e-06, + "loss": 1.0159, + "step": 3126 + }, + { + "epoch": 0.6250718372854251, + "grad_norm": 2.09375, + "learning_rate": 9.016528031667173e-06, + "loss": 1.0378, + "step": 3127 + }, + { + "epoch": 0.6252717323405212, + "grad_norm": 2.0625, + "learning_rate": 9.015900333097127e-06, + "loss": 1.0614, + "step": 3128 + }, + { + "epoch": 0.6254716273956173, + "grad_norm": 2.09375, + "learning_rate": 9.015272456141218e-06, + "loss": 1.041, + "step": 3129 + }, + { + "epoch": 0.6256715224507133, + "grad_norm": 2.015625, + "learning_rate": 9.014644400827336e-06, + "loss": 1.0148, + "step": 3130 + }, + { + "epoch": 0.6258714175058094, + "grad_norm": 2.171875, + "learning_rate": 9.014016167183378e-06, + "loss": 1.0268, + "step": 3131 + }, + { + "epoch": 0.6260713125609055, + "grad_norm": 2.1875, + "learning_rate": 9.013387755237251e-06, + "loss": 0.9141, + "step": 3132 + }, + { + "epoch": 0.6262712076160016, + "grad_norm": 2.203125, + "learning_rate": 9.012759165016867e-06, + "loss": 1.0393, + "step": 3133 + }, + { + "epoch": 0.6264711026710976, + "grad_norm": 2.140625, + "learning_rate": 9.01213039655015e-06, + "loss": 1.0668, + "step": 3134 + }, + { + "epoch": 0.6266709977261937, + "grad_norm": 2.09375, + "learning_rate": 9.01150144986503e-06, + "loss": 1.0435, + "step": 3135 + }, + { + "epoch": 0.6268708927812898, + "grad_norm": 2.21875, + "learning_rate": 9.010872324989444e-06, + "loss": 1.0377, + "step": 3136 + }, + { + "epoch": 0.6270707878363859, + "grad_norm": 2.171875, + "learning_rate": 9.010243021951338e-06, + "loss": 1.021, + "step": 3137 + }, + { + "epoch": 0.627270682891482, + "grad_norm": 2.0, + "learning_rate": 9.009613540778666e-06, + "loss": 0.9202, + "step": 3138 + }, + { + "epoch": 0.627470577946578, + "grad_norm": 2.21875, + "learning_rate": 9.008983881499387e-06, + "loss": 1.0746, + "step": 3139 + }, + { + "epoch": 0.6276704730016741, + "grad_norm": 1.9609375, + "learning_rate": 9.008354044141471e-06, + "loss": 0.9922, + "step": 3140 + }, + { + "epoch": 0.6278703680567702, + "grad_norm": 2.15625, + "learning_rate": 9.0077240287329e-06, + "loss": 1.1474, + "step": 3141 + }, + { + "epoch": 0.6280702631118663, + "grad_norm": 2.0, + "learning_rate": 9.007093835301652e-06, + "loss": 1.0599, + "step": 3142 + }, + { + "epoch": 0.6282701581669623, + "grad_norm": 2.0625, + "learning_rate": 9.006463463875728e-06, + "loss": 1.044, + "step": 3143 + }, + { + "epoch": 0.6284700532220584, + "grad_norm": 2.0, + "learning_rate": 9.005832914483121e-06, + "loss": 0.971, + "step": 3144 + }, + { + "epoch": 0.6286699482771545, + "grad_norm": 1.9765625, + "learning_rate": 9.005202187151845e-06, + "loss": 1.0, + "step": 3145 + }, + { + "epoch": 0.6288698433322506, + "grad_norm": 2.0, + "learning_rate": 9.004571281909918e-06, + "loss": 1.0634, + "step": 3146 + }, + { + "epoch": 0.6290697383873466, + "grad_norm": 2.109375, + "learning_rate": 9.00394019878536e-06, + "loss": 0.9944, + "step": 3147 + }, + { + "epoch": 0.6292696334424427, + "grad_norm": 2.1875, + "learning_rate": 9.003308937806206e-06, + "loss": 1.077, + "step": 3148 + }, + { + "epoch": 0.6294695284975388, + "grad_norm": 2.140625, + "learning_rate": 9.002677499000496e-06, + "loss": 1.1407, + "step": 3149 + }, + { + "epoch": 0.6296694235526349, + "grad_norm": 2.046875, + "learning_rate": 9.002045882396279e-06, + "loss": 1.0637, + "step": 3150 + }, + { + "epoch": 0.629869318607731, + "grad_norm": 2.171875, + "learning_rate": 9.001414088021612e-06, + "loss": 1.0534, + "step": 3151 + }, + { + "epoch": 0.630069213662827, + "grad_norm": 1.9296875, + "learning_rate": 9.00078211590456e-06, + "loss": 1.0054, + "step": 3152 + }, + { + "epoch": 0.6302691087179231, + "grad_norm": 2.015625, + "learning_rate": 9.000149966073192e-06, + "loss": 0.9175, + "step": 3153 + }, + { + "epoch": 0.6304690037730192, + "grad_norm": 2.125, + "learning_rate": 8.99951763855559e-06, + "loss": 1.1064, + "step": 3154 + }, + { + "epoch": 0.6306688988281153, + "grad_norm": 2.0625, + "learning_rate": 8.998885133379842e-06, + "loss": 1.07, + "step": 3155 + }, + { + "epoch": 0.6308687938832113, + "grad_norm": 2.125, + "learning_rate": 8.998252450574044e-06, + "loss": 1.0637, + "step": 3156 + }, + { + "epoch": 0.6310686889383074, + "grad_norm": 2.078125, + "learning_rate": 8.997619590166298e-06, + "loss": 1.1576, + "step": 3157 + }, + { + "epoch": 0.6312685839934035, + "grad_norm": 2.09375, + "learning_rate": 8.996986552184716e-06, + "loss": 1.0527, + "step": 3158 + }, + { + "epoch": 0.6314684790484996, + "grad_norm": 2.03125, + "learning_rate": 8.996353336657421e-06, + "loss": 1.0808, + "step": 3159 + }, + { + "epoch": 0.6316683741035957, + "grad_norm": 2.21875, + "learning_rate": 8.995719943612535e-06, + "loss": 1.2639, + "step": 3160 + }, + { + "epoch": 0.6318682691586917, + "grad_norm": 1.9140625, + "learning_rate": 8.995086373078197e-06, + "loss": 0.948, + "step": 3161 + }, + { + "epoch": 0.6320681642137878, + "grad_norm": 1.921875, + "learning_rate": 8.99445262508255e-06, + "loss": 0.9227, + "step": 3162 + }, + { + "epoch": 0.6322680592688839, + "grad_norm": 2.03125, + "learning_rate": 8.993818699653742e-06, + "loss": 1.0668, + "step": 3163 + }, + { + "epoch": 0.63246795432398, + "grad_norm": 2.203125, + "learning_rate": 8.993184596819935e-06, + "loss": 1.103, + "step": 3164 + }, + { + "epoch": 0.6326678493790759, + "grad_norm": 2.125, + "learning_rate": 8.992550316609294e-06, + "loss": 1.0526, + "step": 3165 + }, + { + "epoch": 0.632867744434172, + "grad_norm": 2.3125, + "learning_rate": 8.991915859049996e-06, + "loss": 1.105, + "step": 3166 + }, + { + "epoch": 0.6330676394892681, + "grad_norm": 2.03125, + "learning_rate": 8.99128122417022e-06, + "loss": 0.9941, + "step": 3167 + }, + { + "epoch": 0.6332675345443642, + "grad_norm": 2.1875, + "learning_rate": 8.990646411998161e-06, + "loss": 1.0805, + "step": 3168 + }, + { + "epoch": 0.6334674295994602, + "grad_norm": 1.96875, + "learning_rate": 8.990011422562012e-06, + "loss": 0.986, + "step": 3169 + }, + { + "epoch": 0.6336673246545563, + "grad_norm": 2.171875, + "learning_rate": 8.989376255889982e-06, + "loss": 1.0714, + "step": 3170 + }, + { + "epoch": 0.6338672197096524, + "grad_norm": 2.125, + "learning_rate": 8.988740912010285e-06, + "loss": 1.1068, + "step": 3171 + }, + { + "epoch": 0.6340671147647485, + "grad_norm": 2.109375, + "learning_rate": 8.988105390951143e-06, + "loss": 1.0174, + "step": 3172 + }, + { + "epoch": 0.6342670098198446, + "grad_norm": 2.140625, + "learning_rate": 8.987469692740787e-06, + "loss": 1.1122, + "step": 3173 + }, + { + "epoch": 0.6344669048749406, + "grad_norm": 2.03125, + "learning_rate": 8.98683381740745e-06, + "loss": 1.0848, + "step": 3174 + }, + { + "epoch": 0.6346667999300367, + "grad_norm": 2.21875, + "learning_rate": 8.986197764979382e-06, + "loss": 1.025, + "step": 3175 + }, + { + "epoch": 0.6348666949851328, + "grad_norm": 1.9609375, + "learning_rate": 8.985561535484836e-06, + "loss": 0.9516, + "step": 3176 + }, + { + "epoch": 0.6350665900402289, + "grad_norm": 2.21875, + "learning_rate": 8.984925128952072e-06, + "loss": 1.1565, + "step": 3177 + }, + { + "epoch": 0.6352664850953249, + "grad_norm": 2.171875, + "learning_rate": 8.984288545409358e-06, + "loss": 1.0739, + "step": 3178 + }, + { + "epoch": 0.635466380150421, + "grad_norm": 2.078125, + "learning_rate": 8.983651784884974e-06, + "loss": 1.0221, + "step": 3179 + }, + { + "epoch": 0.6356662752055171, + "grad_norm": 1.96875, + "learning_rate": 8.983014847407202e-06, + "loss": 1.0373, + "step": 3180 + }, + { + "epoch": 0.6358661702606132, + "grad_norm": 2.171875, + "learning_rate": 8.982377733004338e-06, + "loss": 1.0651, + "step": 3181 + }, + { + "epoch": 0.6360660653157093, + "grad_norm": 2.09375, + "learning_rate": 8.981740441704677e-06, + "loss": 1.1086, + "step": 3182 + }, + { + "epoch": 0.6362659603708053, + "grad_norm": 2.234375, + "learning_rate": 8.981102973536533e-06, + "loss": 1.1332, + "step": 3183 + }, + { + "epoch": 0.6364658554259014, + "grad_norm": 2.171875, + "learning_rate": 8.98046532852822e-06, + "loss": 1.0953, + "step": 3184 + }, + { + "epoch": 0.6366657504809975, + "grad_norm": 2.171875, + "learning_rate": 8.97982750670806e-06, + "loss": 1.0265, + "step": 3185 + }, + { + "epoch": 0.6368656455360936, + "grad_norm": 2.03125, + "learning_rate": 8.979189508104391e-06, + "loss": 1.089, + "step": 3186 + }, + { + "epoch": 0.6370655405911896, + "grad_norm": 2.171875, + "learning_rate": 8.978551332745546e-06, + "loss": 1.0756, + "step": 3187 + }, + { + "epoch": 0.6372654356462857, + "grad_norm": 2.171875, + "learning_rate": 8.977912980659878e-06, + "loss": 1.0064, + "step": 3188 + }, + { + "epoch": 0.6374653307013818, + "grad_norm": 2.109375, + "learning_rate": 8.97727445187574e-06, + "loss": 1.0146, + "step": 3189 + }, + { + "epoch": 0.6376652257564779, + "grad_norm": 1.953125, + "learning_rate": 8.976635746421493e-06, + "loss": 1.0238, + "step": 3190 + }, + { + "epoch": 0.6378651208115739, + "grad_norm": 2.046875, + "learning_rate": 8.975996864325514e-06, + "loss": 0.9988, + "step": 3191 + }, + { + "epoch": 0.63806501586667, + "grad_norm": 2.09375, + "learning_rate": 8.975357805616176e-06, + "loss": 1.0451, + "step": 3192 + }, + { + "epoch": 0.6382649109217661, + "grad_norm": 1.984375, + "learning_rate": 8.974718570321873e-06, + "loss": 1.0624, + "step": 3193 + }, + { + "epoch": 0.6384648059768622, + "grad_norm": 2.15625, + "learning_rate": 8.974079158470991e-06, + "loss": 0.9772, + "step": 3194 + }, + { + "epoch": 0.6386647010319583, + "grad_norm": 2.125, + "learning_rate": 8.97343957009194e-06, + "loss": 1.0772, + "step": 3195 + }, + { + "epoch": 0.6388645960870543, + "grad_norm": 1.953125, + "learning_rate": 8.972799805213125e-06, + "loss": 0.962, + "step": 3196 + }, + { + "epoch": 0.6390644911421504, + "grad_norm": 2.1875, + "learning_rate": 8.97215986386297e-06, + "loss": 1.1486, + "step": 3197 + }, + { + "epoch": 0.6392643861972465, + "grad_norm": 2.125, + "learning_rate": 8.971519746069897e-06, + "loss": 1.0017, + "step": 3198 + }, + { + "epoch": 0.6394642812523426, + "grad_norm": 2.109375, + "learning_rate": 8.970879451862341e-06, + "loss": 1.0565, + "step": 3199 + }, + { + "epoch": 0.6396641763074385, + "grad_norm": 2.109375, + "learning_rate": 8.970238981268745e-06, + "loss": 1.0777, + "step": 3200 + }, + { + "epoch": 0.6398640713625346, + "grad_norm": 2.015625, + "learning_rate": 8.969598334317556e-06, + "loss": 0.9181, + "step": 3201 + }, + { + "epoch": 0.6400639664176307, + "grad_norm": 2.125, + "learning_rate": 8.968957511037233e-06, + "loss": 1.0161, + "step": 3202 + }, + { + "epoch": 0.6402638614727268, + "grad_norm": 2.09375, + "learning_rate": 8.968316511456241e-06, + "loss": 1.1003, + "step": 3203 + }, + { + "epoch": 0.640463756527823, + "grad_norm": 2.03125, + "learning_rate": 8.967675335603055e-06, + "loss": 1.0365, + "step": 3204 + }, + { + "epoch": 0.6406636515829189, + "grad_norm": 2.0625, + "learning_rate": 8.967033983506153e-06, + "loss": 0.9613, + "step": 3205 + }, + { + "epoch": 0.640863546638015, + "grad_norm": 2.0, + "learning_rate": 8.966392455194026e-06, + "loss": 1.041, + "step": 3206 + }, + { + "epoch": 0.6410634416931111, + "grad_norm": 2.171875, + "learning_rate": 8.965750750695168e-06, + "loss": 1.0774, + "step": 3207 + }, + { + "epoch": 0.6412633367482072, + "grad_norm": 2.078125, + "learning_rate": 8.965108870038088e-06, + "loss": 1.0497, + "step": 3208 + }, + { + "epoch": 0.6414632318033032, + "grad_norm": 2.125, + "learning_rate": 8.964466813251294e-06, + "loss": 1.1386, + "step": 3209 + }, + { + "epoch": 0.6416631268583993, + "grad_norm": 2.03125, + "learning_rate": 8.963824580363307e-06, + "loss": 1.098, + "step": 3210 + }, + { + "epoch": 0.6418630219134954, + "grad_norm": 2.15625, + "learning_rate": 8.963182171402656e-06, + "loss": 1.1158, + "step": 3211 + }, + { + "epoch": 0.6420629169685915, + "grad_norm": 2.140625, + "learning_rate": 8.962539586397876e-06, + "loss": 1.1141, + "step": 3212 + }, + { + "epoch": 0.6422628120236875, + "grad_norm": 1.9453125, + "learning_rate": 8.961896825377512e-06, + "loss": 1.0352, + "step": 3213 + }, + { + "epoch": 0.6424627070787836, + "grad_norm": 2.09375, + "learning_rate": 8.961253888370113e-06, + "loss": 0.9783, + "step": 3214 + }, + { + "epoch": 0.6426626021338797, + "grad_norm": 2.03125, + "learning_rate": 8.960610775404239e-06, + "loss": 0.9919, + "step": 3215 + }, + { + "epoch": 0.6428624971889758, + "grad_norm": 2.046875, + "learning_rate": 8.95996748650846e-06, + "loss": 0.9799, + "step": 3216 + }, + { + "epoch": 0.6430623922440719, + "grad_norm": 2.015625, + "learning_rate": 8.959324021711345e-06, + "loss": 0.9817, + "step": 3217 + }, + { + "epoch": 0.6432622872991679, + "grad_norm": 2.109375, + "learning_rate": 8.95868038104148e-06, + "loss": 1.13, + "step": 3218 + }, + { + "epoch": 0.643462182354264, + "grad_norm": 2.203125, + "learning_rate": 8.958036564527457e-06, + "loss": 1.1042, + "step": 3219 + }, + { + "epoch": 0.6436620774093601, + "grad_norm": 2.109375, + "learning_rate": 8.957392572197871e-06, + "loss": 0.9991, + "step": 3220 + }, + { + "epoch": 0.6438619724644562, + "grad_norm": 2.0625, + "learning_rate": 8.95674840408133e-06, + "loss": 0.9395, + "step": 3221 + }, + { + "epoch": 0.6440618675195522, + "grad_norm": 2.140625, + "learning_rate": 8.95610406020645e-06, + "loss": 1.0837, + "step": 3222 + }, + { + "epoch": 0.6442617625746483, + "grad_norm": 2.0625, + "learning_rate": 8.955459540601847e-06, + "loss": 1.0176, + "step": 3223 + }, + { + "epoch": 0.6444616576297444, + "grad_norm": 2.0625, + "learning_rate": 8.954814845296153e-06, + "loss": 1.0846, + "step": 3224 + }, + { + "epoch": 0.6446615526848405, + "grad_norm": 2.09375, + "learning_rate": 8.95416997431801e-06, + "loss": 0.9247, + "step": 3225 + }, + { + "epoch": 0.6448614477399366, + "grad_norm": 2.03125, + "learning_rate": 8.953524927696056e-06, + "loss": 1.0291, + "step": 3226 + }, + { + "epoch": 0.6450613427950326, + "grad_norm": 2.125, + "learning_rate": 8.952879705458949e-06, + "loss": 1.0674, + "step": 3227 + }, + { + "epoch": 0.6452612378501287, + "grad_norm": 2.125, + "learning_rate": 8.952234307635346e-06, + "loss": 1.0894, + "step": 3228 + }, + { + "epoch": 0.6454611329052248, + "grad_norm": 2.1875, + "learning_rate": 8.951588734253917e-06, + "loss": 1.023, + "step": 3229 + }, + { + "epoch": 0.6456610279603209, + "grad_norm": 1.9609375, + "learning_rate": 8.950942985343339e-06, + "loss": 0.9906, + "step": 3230 + }, + { + "epoch": 0.6458609230154169, + "grad_norm": 2.015625, + "learning_rate": 8.950297060932294e-06, + "loss": 0.9879, + "step": 3231 + }, + { + "epoch": 0.646060818070513, + "grad_norm": 2.109375, + "learning_rate": 8.949650961049479e-06, + "loss": 1.0749, + "step": 3232 + }, + { + "epoch": 0.6462607131256091, + "grad_norm": 2.125, + "learning_rate": 8.949004685723587e-06, + "loss": 1.0911, + "step": 3233 + }, + { + "epoch": 0.6464606081807052, + "grad_norm": 2.140625, + "learning_rate": 8.94835823498333e-06, + "loss": 0.9759, + "step": 3234 + }, + { + "epoch": 0.6466605032358012, + "grad_norm": 2.109375, + "learning_rate": 8.94771160885742e-06, + "loss": 1.0999, + "step": 3235 + }, + { + "epoch": 0.6468603982908973, + "grad_norm": 1.953125, + "learning_rate": 8.947064807374586e-06, + "loss": 0.9683, + "step": 3236 + }, + { + "epoch": 0.6470602933459934, + "grad_norm": 2.09375, + "learning_rate": 8.946417830563551e-06, + "loss": 1.0466, + "step": 3237 + }, + { + "epoch": 0.6472601884010895, + "grad_norm": 2.09375, + "learning_rate": 8.94577067845306e-06, + "loss": 1.0536, + "step": 3238 + }, + { + "epoch": 0.6474600834561856, + "grad_norm": 2.203125, + "learning_rate": 8.945123351071856e-06, + "loss": 1.0053, + "step": 3239 + }, + { + "epoch": 0.6476599785112815, + "grad_norm": 2.109375, + "learning_rate": 8.944475848448692e-06, + "loss": 1.0653, + "step": 3240 + }, + { + "epoch": 0.6478598735663776, + "grad_norm": 2.15625, + "learning_rate": 8.943828170612335e-06, + "loss": 1.0131, + "step": 3241 + }, + { + "epoch": 0.6480597686214737, + "grad_norm": 2.109375, + "learning_rate": 8.94318031759155e-06, + "loss": 1.125, + "step": 3242 + }, + { + "epoch": 0.6482596636765698, + "grad_norm": 2.0, + "learning_rate": 8.942532289415117e-06, + "loss": 1.025, + "step": 3243 + }, + { + "epoch": 0.6484595587316658, + "grad_norm": 2.109375, + "learning_rate": 8.941884086111824e-06, + "loss": 1.0884, + "step": 3244 + }, + { + "epoch": 0.6486594537867619, + "grad_norm": 2.140625, + "learning_rate": 8.941235707710457e-06, + "loss": 1.0583, + "step": 3245 + }, + { + "epoch": 0.648859348841858, + "grad_norm": 2.078125, + "learning_rate": 8.940587154239822e-06, + "loss": 1.0255, + "step": 3246 + }, + { + "epoch": 0.6490592438969541, + "grad_norm": 2.125, + "learning_rate": 8.939938425728725e-06, + "loss": 1.1508, + "step": 3247 + }, + { + "epoch": 0.6492591389520501, + "grad_norm": 2.0625, + "learning_rate": 8.939289522205986e-06, + "loss": 1.1196, + "step": 3248 + }, + { + "epoch": 0.6494590340071462, + "grad_norm": 2.03125, + "learning_rate": 8.938640443700426e-06, + "loss": 0.9891, + "step": 3249 + }, + { + "epoch": 0.6496589290622423, + "grad_norm": 2.125, + "learning_rate": 8.937991190240878e-06, + "loss": 1.0844, + "step": 3250 + }, + { + "epoch": 0.6498588241173384, + "grad_norm": 2.140625, + "learning_rate": 8.937341761856184e-06, + "loss": 1.0354, + "step": 3251 + }, + { + "epoch": 0.6500587191724345, + "grad_norm": 2.03125, + "learning_rate": 8.936692158575186e-06, + "loss": 1.0588, + "step": 3252 + }, + { + "epoch": 0.6502586142275305, + "grad_norm": 2.078125, + "learning_rate": 8.936042380426746e-06, + "loss": 0.9619, + "step": 3253 + }, + { + "epoch": 0.6504585092826266, + "grad_norm": 1.9765625, + "learning_rate": 8.935392427439723e-06, + "loss": 0.9899, + "step": 3254 + }, + { + "epoch": 0.6506584043377227, + "grad_norm": 2.21875, + "learning_rate": 8.934742299642987e-06, + "loss": 1.1772, + "step": 3255 + }, + { + "epoch": 0.6508582993928188, + "grad_norm": 2.171875, + "learning_rate": 8.93409199706542e-06, + "loss": 1.025, + "step": 3256 + }, + { + "epoch": 0.6510581944479148, + "grad_norm": 2.09375, + "learning_rate": 8.933441519735907e-06, + "loss": 1.0423, + "step": 3257 + }, + { + "epoch": 0.6512580895030109, + "grad_norm": 2.125, + "learning_rate": 8.932790867683339e-06, + "loss": 1.0719, + "step": 3258 + }, + { + "epoch": 0.651457984558107, + "grad_norm": 2.109375, + "learning_rate": 8.932140040936623e-06, + "loss": 1.0208, + "step": 3259 + }, + { + "epoch": 0.6516578796132031, + "grad_norm": 2.078125, + "learning_rate": 8.931489039524667e-06, + "loss": 1.0917, + "step": 3260 + }, + { + "epoch": 0.6518577746682992, + "grad_norm": 2.109375, + "learning_rate": 8.930837863476386e-06, + "loss": 1.0357, + "step": 3261 + }, + { + "epoch": 0.6520576697233952, + "grad_norm": 2.0625, + "learning_rate": 8.930186512820707e-06, + "loss": 1.02, + "step": 3262 + }, + { + "epoch": 0.6522575647784913, + "grad_norm": 2.109375, + "learning_rate": 8.929534987586565e-06, + "loss": 1.1055, + "step": 3263 + }, + { + "epoch": 0.6524574598335874, + "grad_norm": 2.046875, + "learning_rate": 8.928883287802897e-06, + "loss": 1.0013, + "step": 3264 + }, + { + "epoch": 0.6526573548886835, + "grad_norm": 1.9921875, + "learning_rate": 8.928231413498652e-06, + "loss": 0.9495, + "step": 3265 + }, + { + "epoch": 0.6528572499437795, + "grad_norm": 1.9375, + "learning_rate": 8.92757936470279e-06, + "loss": 0.9701, + "step": 3266 + }, + { + "epoch": 0.6530571449988756, + "grad_norm": 2.046875, + "learning_rate": 8.92692714144427e-06, + "loss": 1.0784, + "step": 3267 + }, + { + "epoch": 0.6532570400539717, + "grad_norm": 2.140625, + "learning_rate": 8.926274743752065e-06, + "loss": 1.0928, + "step": 3268 + }, + { + "epoch": 0.6534569351090678, + "grad_norm": 2.0625, + "learning_rate": 8.925622171655157e-06, + "loss": 1.0559, + "step": 3269 + }, + { + "epoch": 0.6536568301641638, + "grad_norm": 2.15625, + "learning_rate": 8.92496942518253e-06, + "loss": 0.9591, + "step": 3270 + }, + { + "epoch": 0.6538567252192599, + "grad_norm": 2.0625, + "learning_rate": 8.924316504363182e-06, + "loss": 1.0161, + "step": 3271 + }, + { + "epoch": 0.654056620274356, + "grad_norm": 2.09375, + "learning_rate": 8.923663409226112e-06, + "loss": 1.0892, + "step": 3272 + }, + { + "epoch": 0.6542565153294521, + "grad_norm": 2.046875, + "learning_rate": 8.923010139800335e-06, + "loss": 1.0097, + "step": 3273 + }, + { + "epoch": 0.6544564103845482, + "grad_norm": 2.328125, + "learning_rate": 8.922356696114865e-06, + "loss": 1.0528, + "step": 3274 + }, + { + "epoch": 0.6546563054396441, + "grad_norm": 2.15625, + "learning_rate": 8.921703078198728e-06, + "loss": 1.0536, + "step": 3275 + }, + { + "epoch": 0.6548562004947402, + "grad_norm": 2.15625, + "learning_rate": 8.92104928608096e-06, + "loss": 1.0184, + "step": 3276 + }, + { + "epoch": 0.6550560955498363, + "grad_norm": 2.1875, + "learning_rate": 8.920395319790604e-06, + "loss": 0.9432, + "step": 3277 + }, + { + "epoch": 0.6552559906049324, + "grad_norm": 2.265625, + "learning_rate": 8.919741179356705e-06, + "loss": 1.126, + "step": 3278 + }, + { + "epoch": 0.6554558856600284, + "grad_norm": 2.171875, + "learning_rate": 8.919086864808319e-06, + "loss": 1.0793, + "step": 3279 + }, + { + "epoch": 0.6556557807151245, + "grad_norm": 2.125, + "learning_rate": 8.918432376174516e-06, + "loss": 1.0164, + "step": 3280 + }, + { + "epoch": 0.6558556757702206, + "grad_norm": 2.109375, + "learning_rate": 8.917777713484366e-06, + "loss": 1.0069, + "step": 3281 + }, + { + "epoch": 0.6560555708253167, + "grad_norm": 2.0, + "learning_rate": 8.917122876766946e-06, + "loss": 1.0158, + "step": 3282 + }, + { + "epoch": 0.6562554658804128, + "grad_norm": 2.078125, + "learning_rate": 8.916467866051347e-06, + "loss": 1.0721, + "step": 3283 + }, + { + "epoch": 0.6564553609355088, + "grad_norm": 2.125, + "learning_rate": 8.915812681366665e-06, + "loss": 1.0576, + "step": 3284 + }, + { + "epoch": 0.6566552559906049, + "grad_norm": 2.28125, + "learning_rate": 8.915157322742e-06, + "loss": 1.0276, + "step": 3285 + }, + { + "epoch": 0.656855151045701, + "grad_norm": 2.09375, + "learning_rate": 8.914501790206466e-06, + "loss": 0.9683, + "step": 3286 + }, + { + "epoch": 0.6570550461007971, + "grad_norm": 2.078125, + "learning_rate": 8.913846083789179e-06, + "loss": 0.9412, + "step": 3287 + }, + { + "epoch": 0.6572549411558931, + "grad_norm": 2.0625, + "learning_rate": 8.91319020351927e-06, + "loss": 1.0381, + "step": 3288 + }, + { + "epoch": 0.6574548362109892, + "grad_norm": 2.09375, + "learning_rate": 8.912534149425868e-06, + "loss": 1.049, + "step": 3289 + }, + { + "epoch": 0.6576547312660853, + "grad_norm": 2.15625, + "learning_rate": 8.911877921538117e-06, + "loss": 1.0618, + "step": 3290 + }, + { + "epoch": 0.6578546263211814, + "grad_norm": 2.0625, + "learning_rate": 8.911221519885167e-06, + "loss": 1.0587, + "step": 3291 + }, + { + "epoch": 0.6580545213762774, + "grad_norm": 2.09375, + "learning_rate": 8.910564944496174e-06, + "loss": 0.9946, + "step": 3292 + }, + { + "epoch": 0.6582544164313735, + "grad_norm": 2.109375, + "learning_rate": 8.909908195400305e-06, + "loss": 1.0538, + "step": 3293 + }, + { + "epoch": 0.6584543114864696, + "grad_norm": 2.0625, + "learning_rate": 8.909251272626731e-06, + "loss": 1.0824, + "step": 3294 + }, + { + "epoch": 0.6586542065415657, + "grad_norm": 2.046875, + "learning_rate": 8.908594176204632e-06, + "loss": 1.0578, + "step": 3295 + }, + { + "epoch": 0.6588541015966618, + "grad_norm": 2.125, + "learning_rate": 8.9079369061632e-06, + "loss": 1.0767, + "step": 3296 + }, + { + "epoch": 0.6590539966517578, + "grad_norm": 2.0625, + "learning_rate": 8.907279462531625e-06, + "loss": 1.0766, + "step": 3297 + }, + { + "epoch": 0.6592538917068539, + "grad_norm": 2.09375, + "learning_rate": 8.906621845339115e-06, + "loss": 0.9306, + "step": 3298 + }, + { + "epoch": 0.65945378676195, + "grad_norm": 2.0, + "learning_rate": 8.90596405461488e-06, + "loss": 0.8914, + "step": 3299 + }, + { + "epoch": 0.6596536818170461, + "grad_norm": 2.21875, + "learning_rate": 8.905306090388137e-06, + "loss": 1.0862, + "step": 3300 + }, + { + "epoch": 0.6598535768721421, + "grad_norm": 2.0, + "learning_rate": 8.904647952688117e-06, + "loss": 0.9762, + "step": 3301 + }, + { + "epoch": 0.6600534719272382, + "grad_norm": 2.09375, + "learning_rate": 8.903989641544052e-06, + "loss": 1.0451, + "step": 3302 + }, + { + "epoch": 0.6602533669823343, + "grad_norm": 2.125, + "learning_rate": 8.903331156985181e-06, + "loss": 1.044, + "step": 3303 + }, + { + "epoch": 0.6604532620374304, + "grad_norm": 2.078125, + "learning_rate": 8.902672499040759e-06, + "loss": 1.0062, + "step": 3304 + }, + { + "epoch": 0.6606531570925265, + "grad_norm": 2.078125, + "learning_rate": 8.902013667740043e-06, + "loss": 1.0294, + "step": 3305 + }, + { + "epoch": 0.6608530521476225, + "grad_norm": 2.125, + "learning_rate": 8.901354663112294e-06, + "loss": 1.0725, + "step": 3306 + }, + { + "epoch": 0.6610529472027186, + "grad_norm": 2.171875, + "learning_rate": 8.900695485186788e-06, + "loss": 1.0517, + "step": 3307 + }, + { + "epoch": 0.6612528422578147, + "grad_norm": 2.03125, + "learning_rate": 8.900036133992807e-06, + "loss": 0.9954, + "step": 3308 + }, + { + "epoch": 0.6614527373129108, + "grad_norm": 1.9921875, + "learning_rate": 8.899376609559636e-06, + "loss": 1.0294, + "step": 3309 + }, + { + "epoch": 0.6616526323680068, + "grad_norm": 2.21875, + "learning_rate": 8.898716911916571e-06, + "loss": 1.0569, + "step": 3310 + }, + { + "epoch": 0.6618525274231029, + "grad_norm": 2.09375, + "learning_rate": 8.89805704109292e-06, + "loss": 0.9953, + "step": 3311 + }, + { + "epoch": 0.662052422478199, + "grad_norm": 2.0, + "learning_rate": 8.897396997117991e-06, + "loss": 1.005, + "step": 3312 + }, + { + "epoch": 0.662252317533295, + "grad_norm": 2.15625, + "learning_rate": 8.896736780021102e-06, + "loss": 1.0816, + "step": 3313 + }, + { + "epoch": 0.662452212588391, + "grad_norm": 1.9921875, + "learning_rate": 8.896076389831583e-06, + "loss": 0.9675, + "step": 3314 + }, + { + "epoch": 0.6626521076434871, + "grad_norm": 2.234375, + "learning_rate": 8.895415826578766e-06, + "loss": 1.1988, + "step": 3315 + }, + { + "epoch": 0.6628520026985832, + "grad_norm": 2.140625, + "learning_rate": 8.894755090291996e-06, + "loss": 1.1341, + "step": 3316 + }, + { + "epoch": 0.6630518977536793, + "grad_norm": 2.109375, + "learning_rate": 8.89409418100062e-06, + "loss": 1.0202, + "step": 3317 + }, + { + "epoch": 0.6632517928087754, + "grad_norm": 2.265625, + "learning_rate": 8.893433098733995e-06, + "loss": 1.0561, + "step": 3318 + }, + { + "epoch": 0.6634516878638714, + "grad_norm": 2.203125, + "learning_rate": 8.892771843521487e-06, + "loss": 1.1008, + "step": 3319 + }, + { + "epoch": 0.6636515829189675, + "grad_norm": 2.109375, + "learning_rate": 8.89211041539247e-06, + "loss": 1.0711, + "step": 3320 + }, + { + "epoch": 0.6638514779740636, + "grad_norm": 2.078125, + "learning_rate": 8.891448814376326e-06, + "loss": 1.0149, + "step": 3321 + }, + { + "epoch": 0.6640513730291597, + "grad_norm": 2.046875, + "learning_rate": 8.89078704050244e-06, + "loss": 0.9886, + "step": 3322 + }, + { + "epoch": 0.6642512680842557, + "grad_norm": 2.140625, + "learning_rate": 8.890125093800208e-06, + "loss": 1.0175, + "step": 3323 + }, + { + "epoch": 0.6644511631393518, + "grad_norm": 2.015625, + "learning_rate": 8.889462974299037e-06, + "loss": 1.0867, + "step": 3324 + }, + { + "epoch": 0.6646510581944479, + "grad_norm": 2.0625, + "learning_rate": 8.888800682028334e-06, + "loss": 1.0379, + "step": 3325 + }, + { + "epoch": 0.664850953249544, + "grad_norm": 2.09375, + "learning_rate": 8.888138217017521e-06, + "loss": 1.0689, + "step": 3326 + }, + { + "epoch": 0.6650508483046401, + "grad_norm": 2.109375, + "learning_rate": 8.887475579296025e-06, + "loss": 1.0328, + "step": 3327 + }, + { + "epoch": 0.6652507433597361, + "grad_norm": 1.9296875, + "learning_rate": 8.886812768893277e-06, + "loss": 1.0361, + "step": 3328 + }, + { + "epoch": 0.6654506384148322, + "grad_norm": 2.15625, + "learning_rate": 8.886149785838722e-06, + "loss": 1.0431, + "step": 3329 + }, + { + "epoch": 0.6656505334699283, + "grad_norm": 2.1875, + "learning_rate": 8.885486630161808e-06, + "loss": 1.0075, + "step": 3330 + }, + { + "epoch": 0.6658504285250244, + "grad_norm": 2.09375, + "learning_rate": 8.884823301891993e-06, + "loss": 1.0585, + "step": 3331 + }, + { + "epoch": 0.6660503235801204, + "grad_norm": 2.0625, + "learning_rate": 8.884159801058743e-06, + "loss": 1.0838, + "step": 3332 + }, + { + "epoch": 0.6662502186352165, + "grad_norm": 2.140625, + "learning_rate": 8.88349612769153e-06, + "loss": 1.0646, + "step": 3333 + }, + { + "epoch": 0.6664501136903126, + "grad_norm": 1.96875, + "learning_rate": 8.88283228181983e-06, + "loss": 0.8971, + "step": 3334 + }, + { + "epoch": 0.6666500087454087, + "grad_norm": 2.109375, + "learning_rate": 8.882168263473137e-06, + "loss": 1.0699, + "step": 3335 + }, + { + "epoch": 0.6668499038005047, + "grad_norm": 2.109375, + "learning_rate": 8.881504072680945e-06, + "loss": 0.9898, + "step": 3336 + }, + { + "epoch": 0.6670497988556008, + "grad_norm": 2.09375, + "learning_rate": 8.880839709472755e-06, + "loss": 1.0643, + "step": 3337 + }, + { + "epoch": 0.6672496939106969, + "grad_norm": 2.234375, + "learning_rate": 8.88017517387808e-06, + "loss": 1.1073, + "step": 3338 + }, + { + "epoch": 0.667449588965793, + "grad_norm": 2.15625, + "learning_rate": 8.87951046592644e-06, + "loss": 1.0848, + "step": 3339 + }, + { + "epoch": 0.6676494840208891, + "grad_norm": 2.078125, + "learning_rate": 8.878845585647357e-06, + "loss": 1.0101, + "step": 3340 + }, + { + "epoch": 0.6678493790759851, + "grad_norm": 2.109375, + "learning_rate": 8.87818053307037e-06, + "loss": 1.055, + "step": 3341 + }, + { + "epoch": 0.6680492741310812, + "grad_norm": 1.9375, + "learning_rate": 8.877515308225015e-06, + "loss": 0.9716, + "step": 3342 + }, + { + "epoch": 0.6682491691861773, + "grad_norm": 2.03125, + "learning_rate": 8.876849911140846e-06, + "loss": 0.9588, + "step": 3343 + }, + { + "epoch": 0.6684490642412734, + "grad_norm": 1.9609375, + "learning_rate": 8.876184341847418e-06, + "loss": 1.0359, + "step": 3344 + }, + { + "epoch": 0.6686489592963694, + "grad_norm": 2.125, + "learning_rate": 8.875518600374296e-06, + "loss": 1.0397, + "step": 3345 + }, + { + "epoch": 0.6688488543514655, + "grad_norm": 2.046875, + "learning_rate": 8.874852686751051e-06, + "loss": 1.0957, + "step": 3346 + }, + { + "epoch": 0.6690487494065616, + "grad_norm": 2.078125, + "learning_rate": 8.874186601007263e-06, + "loss": 1.0045, + "step": 3347 + }, + { + "epoch": 0.6692486444616577, + "grad_norm": 2.015625, + "learning_rate": 8.87352034317252e-06, + "loss": 1.0593, + "step": 3348 + }, + { + "epoch": 0.6694485395167536, + "grad_norm": 2.21875, + "learning_rate": 8.872853913276418e-06, + "loss": 1.045, + "step": 3349 + }, + { + "epoch": 0.6696484345718497, + "grad_norm": 2.109375, + "learning_rate": 8.872187311348558e-06, + "loss": 1.1227, + "step": 3350 + }, + { + "epoch": 0.6698483296269458, + "grad_norm": 2.109375, + "learning_rate": 8.871520537418552e-06, + "loss": 0.9508, + "step": 3351 + }, + { + "epoch": 0.670048224682042, + "grad_norm": 2.140625, + "learning_rate": 8.870853591516016e-06, + "loss": 0.9973, + "step": 3352 + }, + { + "epoch": 0.670248119737138, + "grad_norm": 2.078125, + "learning_rate": 8.870186473670577e-06, + "loss": 1.0057, + "step": 3353 + }, + { + "epoch": 0.670448014792234, + "grad_norm": 2.0625, + "learning_rate": 8.86951918391187e-06, + "loss": 1.0588, + "step": 3354 + }, + { + "epoch": 0.6706479098473301, + "grad_norm": 2.03125, + "learning_rate": 8.868851722269531e-06, + "loss": 0.987, + "step": 3355 + }, + { + "epoch": 0.6708478049024262, + "grad_norm": 2.203125, + "learning_rate": 8.868184088773216e-06, + "loss": 1.1464, + "step": 3356 + }, + { + "epoch": 0.6710476999575223, + "grad_norm": 2.015625, + "learning_rate": 8.867516283452572e-06, + "loss": 1.0607, + "step": 3357 + }, + { + "epoch": 0.6712475950126183, + "grad_norm": 2.078125, + "learning_rate": 8.866848306337272e-06, + "loss": 0.99, + "step": 3358 + }, + { + "epoch": 0.6714474900677144, + "grad_norm": 2.078125, + "learning_rate": 8.866180157456981e-06, + "loss": 1.0594, + "step": 3359 + }, + { + "epoch": 0.6716473851228105, + "grad_norm": 2.0625, + "learning_rate": 8.865511836841381e-06, + "loss": 1.0414, + "step": 3360 + }, + { + "epoch": 0.6718472801779066, + "grad_norm": 2.140625, + "learning_rate": 8.864843344520158e-06, + "loss": 1.16, + "step": 3361 + }, + { + "epoch": 0.6720471752330027, + "grad_norm": 2.171875, + "learning_rate": 8.864174680523005e-06, + "loss": 1.0834, + "step": 3362 + }, + { + "epoch": 0.6722470702880987, + "grad_norm": 2.0625, + "learning_rate": 8.863505844879628e-06, + "loss": 1.1152, + "step": 3363 + }, + { + "epoch": 0.6724469653431948, + "grad_norm": 2.171875, + "learning_rate": 8.862836837619732e-06, + "loss": 0.9614, + "step": 3364 + }, + { + "epoch": 0.6726468603982909, + "grad_norm": 2.078125, + "learning_rate": 8.862167658773037e-06, + "loss": 0.9623, + "step": 3365 + }, + { + "epoch": 0.672846755453387, + "grad_norm": 2.359375, + "learning_rate": 8.861498308369267e-06, + "loss": 1.1579, + "step": 3366 + }, + { + "epoch": 0.673046650508483, + "grad_norm": 1.9765625, + "learning_rate": 8.860828786438155e-06, + "loss": 0.9972, + "step": 3367 + }, + { + "epoch": 0.6732465455635791, + "grad_norm": 2.140625, + "learning_rate": 8.86015909300944e-06, + "loss": 1.0825, + "step": 3368 + }, + { + "epoch": 0.6734464406186752, + "grad_norm": 2.0625, + "learning_rate": 8.85948922811287e-06, + "loss": 1.0788, + "step": 3369 + }, + { + "epoch": 0.6736463356737713, + "grad_norm": 2.0625, + "learning_rate": 8.858819191778201e-06, + "loss": 1.1012, + "step": 3370 + }, + { + "epoch": 0.6738462307288673, + "grad_norm": 2.046875, + "learning_rate": 8.858148984035196e-06, + "loss": 0.9939, + "step": 3371 + }, + { + "epoch": 0.6740461257839634, + "grad_norm": 2.171875, + "learning_rate": 8.857478604913625e-06, + "loss": 1.1226, + "step": 3372 + }, + { + "epoch": 0.6742460208390595, + "grad_norm": 2.03125, + "learning_rate": 8.856808054443266e-06, + "loss": 0.9636, + "step": 3373 + }, + { + "epoch": 0.6744459158941556, + "grad_norm": 2.140625, + "learning_rate": 8.856137332653907e-06, + "loss": 0.9492, + "step": 3374 + }, + { + "epoch": 0.6746458109492517, + "grad_norm": 1.9609375, + "learning_rate": 8.855466439575338e-06, + "loss": 1.0091, + "step": 3375 + }, + { + "epoch": 0.6748457060043477, + "grad_norm": 2.03125, + "learning_rate": 8.85479537523736e-06, + "loss": 1.0131, + "step": 3376 + }, + { + "epoch": 0.6750456010594438, + "grad_norm": 1.953125, + "learning_rate": 8.854124139669786e-06, + "loss": 1.013, + "step": 3377 + }, + { + "epoch": 0.6752454961145399, + "grad_norm": 2.296875, + "learning_rate": 8.853452732902428e-06, + "loss": 1.0919, + "step": 3378 + }, + { + "epoch": 0.675445391169636, + "grad_norm": 2.15625, + "learning_rate": 8.85278115496511e-06, + "loss": 1.0912, + "step": 3379 + }, + { + "epoch": 0.675645286224732, + "grad_norm": 2.09375, + "learning_rate": 8.852109405887667e-06, + "loss": 1.1148, + "step": 3380 + }, + { + "epoch": 0.6758451812798281, + "grad_norm": 1.984375, + "learning_rate": 8.851437485699935e-06, + "loss": 0.9866, + "step": 3381 + }, + { + "epoch": 0.6760450763349242, + "grad_norm": 2.046875, + "learning_rate": 8.85076539443176e-06, + "loss": 0.9758, + "step": 3382 + }, + { + "epoch": 0.6762449713900203, + "grad_norm": 2.015625, + "learning_rate": 8.850093132112999e-06, + "loss": 0.9487, + "step": 3383 + }, + { + "epoch": 0.6764448664451164, + "grad_norm": 2.09375, + "learning_rate": 8.849420698773513e-06, + "loss": 1.0743, + "step": 3384 + }, + { + "epoch": 0.6766447615002124, + "grad_norm": 2.109375, + "learning_rate": 8.848748094443167e-06, + "loss": 1.088, + "step": 3385 + }, + { + "epoch": 0.6768446565553085, + "grad_norm": 2.03125, + "learning_rate": 8.848075319151844e-06, + "loss": 1.0013, + "step": 3386 + }, + { + "epoch": 0.6770445516104046, + "grad_norm": 2.109375, + "learning_rate": 8.847402372929426e-06, + "loss": 1.1631, + "step": 3387 + }, + { + "epoch": 0.6772444466655007, + "grad_norm": 2.03125, + "learning_rate": 8.846729255805806e-06, + "loss": 1.0131, + "step": 3388 + }, + { + "epoch": 0.6774443417205966, + "grad_norm": 2.0625, + "learning_rate": 8.846055967810882e-06, + "loss": 1.0088, + "step": 3389 + }, + { + "epoch": 0.6776442367756927, + "grad_norm": 2.078125, + "learning_rate": 8.845382508974565e-06, + "loss": 1.0365, + "step": 3390 + }, + { + "epoch": 0.6778441318307888, + "grad_norm": 2.0, + "learning_rate": 8.844708879326767e-06, + "loss": 0.9699, + "step": 3391 + }, + { + "epoch": 0.6780440268858849, + "grad_norm": 2.0625, + "learning_rate": 8.84403507889741e-06, + "loss": 0.9918, + "step": 3392 + }, + { + "epoch": 0.6782439219409809, + "grad_norm": 2.140625, + "learning_rate": 8.843361107716427e-06, + "loss": 1.0832, + "step": 3393 + }, + { + "epoch": 0.678443816996077, + "grad_norm": 2.109375, + "learning_rate": 8.842686965813752e-06, + "loss": 0.992, + "step": 3394 + }, + { + "epoch": 0.6786437120511731, + "grad_norm": 2.078125, + "learning_rate": 8.842012653219333e-06, + "loss": 1.0853, + "step": 3395 + }, + { + "epoch": 0.6788436071062692, + "grad_norm": 1.9765625, + "learning_rate": 8.841338169963122e-06, + "loss": 1.0101, + "step": 3396 + }, + { + "epoch": 0.6790435021613653, + "grad_norm": 2.0625, + "learning_rate": 8.840663516075081e-06, + "loss": 0.9916, + "step": 3397 + }, + { + "epoch": 0.6792433972164613, + "grad_norm": 2.1875, + "learning_rate": 8.839988691585177e-06, + "loss": 1.0407, + "step": 3398 + }, + { + "epoch": 0.6794432922715574, + "grad_norm": 2.09375, + "learning_rate": 8.839313696523384e-06, + "loss": 1.0267, + "step": 3399 + }, + { + "epoch": 0.6796431873266535, + "grad_norm": 2.046875, + "learning_rate": 8.838638530919688e-06, + "loss": 1.0569, + "step": 3400 + }, + { + "epoch": 0.6798430823817496, + "grad_norm": 2.046875, + "learning_rate": 8.837963194804077e-06, + "loss": 1.0449, + "step": 3401 + }, + { + "epoch": 0.6800429774368456, + "grad_norm": 2.1875, + "learning_rate": 8.837287688206552e-06, + "loss": 1.0707, + "step": 3402 + }, + { + "epoch": 0.6802428724919417, + "grad_norm": 2.078125, + "learning_rate": 8.836612011157117e-06, + "loss": 1.0324, + "step": 3403 + }, + { + "epoch": 0.6804427675470378, + "grad_norm": 2.0625, + "learning_rate": 8.835936163685786e-06, + "loss": 1.0641, + "step": 3404 + }, + { + "epoch": 0.6806426626021339, + "grad_norm": 2.0625, + "learning_rate": 8.835260145822582e-06, + "loss": 1.0306, + "step": 3405 + }, + { + "epoch": 0.68084255765723, + "grad_norm": 2.109375, + "learning_rate": 8.83458395759753e-06, + "loss": 1.1774, + "step": 3406 + }, + { + "epoch": 0.681042452712326, + "grad_norm": 2.109375, + "learning_rate": 8.833907599040668e-06, + "loss": 1.0588, + "step": 3407 + }, + { + "epoch": 0.6812423477674221, + "grad_norm": 2.171875, + "learning_rate": 8.833231070182042e-06, + "loss": 1.1366, + "step": 3408 + }, + { + "epoch": 0.6814422428225182, + "grad_norm": 1.9765625, + "learning_rate": 8.8325543710517e-06, + "loss": 1.0346, + "step": 3409 + }, + { + "epoch": 0.6816421378776143, + "grad_norm": 2.15625, + "learning_rate": 8.831877501679701e-06, + "loss": 1.1177, + "step": 3410 + }, + { + "epoch": 0.6818420329327103, + "grad_norm": 2.125, + "learning_rate": 8.831200462096115e-06, + "loss": 1.0492, + "step": 3411 + }, + { + "epoch": 0.6820419279878064, + "grad_norm": 2.0, + "learning_rate": 8.83052325233101e-06, + "loss": 1.0085, + "step": 3412 + }, + { + "epoch": 0.6822418230429025, + "grad_norm": 2.03125, + "learning_rate": 8.829845872414477e-06, + "loss": 1.0229, + "step": 3413 + }, + { + "epoch": 0.6824417180979986, + "grad_norm": 2.03125, + "learning_rate": 8.829168322376595e-06, + "loss": 0.9829, + "step": 3414 + }, + { + "epoch": 0.6826416131530946, + "grad_norm": 2.03125, + "learning_rate": 8.828490602247466e-06, + "loss": 1.0217, + "step": 3415 + }, + { + "epoch": 0.6828415082081907, + "grad_norm": 2.109375, + "learning_rate": 8.827812712057195e-06, + "loss": 1.0444, + "step": 3416 + }, + { + "epoch": 0.6830414032632868, + "grad_norm": 2.109375, + "learning_rate": 8.827134651835889e-06, + "loss": 1.0498, + "step": 3417 + }, + { + "epoch": 0.6832412983183829, + "grad_norm": 2.109375, + "learning_rate": 8.826456421613674e-06, + "loss": 1.0302, + "step": 3418 + }, + { + "epoch": 0.683441193373479, + "grad_norm": 2.015625, + "learning_rate": 8.82577802142067e-06, + "loss": 1.0086, + "step": 3419 + }, + { + "epoch": 0.683641088428575, + "grad_norm": 2.125, + "learning_rate": 8.825099451287018e-06, + "loss": 1.0617, + "step": 3420 + }, + { + "epoch": 0.6838409834836711, + "grad_norm": 2.1875, + "learning_rate": 8.824420711242855e-06, + "loss": 1.027, + "step": 3421 + }, + { + "epoch": 0.6840408785387672, + "grad_norm": 2.015625, + "learning_rate": 8.823741801318332e-06, + "loss": 1.1092, + "step": 3422 + }, + { + "epoch": 0.6842407735938633, + "grad_norm": 2.03125, + "learning_rate": 8.82306272154361e-06, + "loss": 1.0552, + "step": 3423 + }, + { + "epoch": 0.6844406686489592, + "grad_norm": 2.0625, + "learning_rate": 8.822383471948846e-06, + "loss": 0.9296, + "step": 3424 + }, + { + "epoch": 0.6846405637040553, + "grad_norm": 2.0625, + "learning_rate": 8.821704052564218e-06, + "loss": 1.0257, + "step": 3425 + }, + { + "epoch": 0.6848404587591514, + "grad_norm": 1.9609375, + "learning_rate": 8.821024463419904e-06, + "loss": 0.9842, + "step": 3426 + }, + { + "epoch": 0.6850403538142475, + "grad_norm": 2.078125, + "learning_rate": 8.820344704546093e-06, + "loss": 1.0134, + "step": 3427 + }, + { + "epoch": 0.6852402488693436, + "grad_norm": 2.15625, + "learning_rate": 8.819664775972976e-06, + "loss": 1.1109, + "step": 3428 + }, + { + "epoch": 0.6854401439244396, + "grad_norm": 2.015625, + "learning_rate": 8.818984677730759e-06, + "loss": 1.0232, + "step": 3429 + }, + { + "epoch": 0.6856400389795357, + "grad_norm": 2.0, + "learning_rate": 8.81830440984965e-06, + "loss": 1.0117, + "step": 3430 + }, + { + "epoch": 0.6858399340346318, + "grad_norm": 2.0625, + "learning_rate": 8.817623972359867e-06, + "loss": 1.0454, + "step": 3431 + }, + { + "epoch": 0.6860398290897279, + "grad_norm": 2.09375, + "learning_rate": 8.816943365291635e-06, + "loss": 1.0472, + "step": 3432 + }, + { + "epoch": 0.6862397241448239, + "grad_norm": 2.234375, + "learning_rate": 8.816262588675186e-06, + "loss": 1.1059, + "step": 3433 + }, + { + "epoch": 0.68643961919992, + "grad_norm": 2.03125, + "learning_rate": 8.815581642540763e-06, + "loss": 0.9828, + "step": 3434 + }, + { + "epoch": 0.6866395142550161, + "grad_norm": 2.109375, + "learning_rate": 8.814900526918608e-06, + "loss": 1.0322, + "step": 3435 + }, + { + "epoch": 0.6868394093101122, + "grad_norm": 2.59375, + "learning_rate": 8.814219241838979e-06, + "loss": 1.0228, + "step": 3436 + }, + { + "epoch": 0.6870393043652082, + "grad_norm": 2.296875, + "learning_rate": 8.81353778733214e-06, + "loss": 1.0842, + "step": 3437 + }, + { + "epoch": 0.6872391994203043, + "grad_norm": 2.015625, + "learning_rate": 8.812856163428358e-06, + "loss": 1.0629, + "step": 3438 + }, + { + "epoch": 0.6874390944754004, + "grad_norm": 2.265625, + "learning_rate": 8.812174370157915e-06, + "loss": 1.0816, + "step": 3439 + }, + { + "epoch": 0.6876389895304965, + "grad_norm": 2.09375, + "learning_rate": 8.811492407551092e-06, + "loss": 1.0512, + "step": 3440 + }, + { + "epoch": 0.6878388845855926, + "grad_norm": 2.03125, + "learning_rate": 8.810810275638183e-06, + "loss": 1.0826, + "step": 3441 + }, + { + "epoch": 0.6880387796406886, + "grad_norm": 2.03125, + "learning_rate": 8.810127974449489e-06, + "loss": 1.0436, + "step": 3442 + }, + { + "epoch": 0.6882386746957847, + "grad_norm": 2.03125, + "learning_rate": 8.809445504015318e-06, + "loss": 0.9917, + "step": 3443 + }, + { + "epoch": 0.6884385697508808, + "grad_norm": 2.296875, + "learning_rate": 8.808762864365985e-06, + "loss": 1.106, + "step": 3444 + }, + { + "epoch": 0.6886384648059769, + "grad_norm": 2.015625, + "learning_rate": 8.80808005553181e-06, + "loss": 1.0241, + "step": 3445 + }, + { + "epoch": 0.6888383598610729, + "grad_norm": 2.171875, + "learning_rate": 8.807397077543127e-06, + "loss": 1.1441, + "step": 3446 + }, + { + "epoch": 0.689038254916169, + "grad_norm": 2.0625, + "learning_rate": 8.806713930430273e-06, + "loss": 1.0712, + "step": 3447 + }, + { + "epoch": 0.6892381499712651, + "grad_norm": 2.03125, + "learning_rate": 8.806030614223592e-06, + "loss": 1.082, + "step": 3448 + }, + { + "epoch": 0.6894380450263612, + "grad_norm": 2.03125, + "learning_rate": 8.805347128953438e-06, + "loss": 1.0949, + "step": 3449 + }, + { + "epoch": 0.6896379400814573, + "grad_norm": 2.03125, + "learning_rate": 8.80466347465017e-06, + "loss": 0.9658, + "step": 3450 + }, + { + "epoch": 0.6898378351365533, + "grad_norm": 2.0625, + "learning_rate": 8.803979651344159e-06, + "loss": 0.9569, + "step": 3451 + }, + { + "epoch": 0.6900377301916494, + "grad_norm": 2.140625, + "learning_rate": 8.803295659065776e-06, + "loss": 1.0401, + "step": 3452 + }, + { + "epoch": 0.6902376252467455, + "grad_norm": 2.046875, + "learning_rate": 8.802611497845407e-06, + "loss": 1.0324, + "step": 3453 + }, + { + "epoch": 0.6904375203018416, + "grad_norm": 2.078125, + "learning_rate": 8.801927167713442e-06, + "loss": 1.0611, + "step": 3454 + }, + { + "epoch": 0.6906374153569376, + "grad_norm": 2.296875, + "learning_rate": 8.801242668700277e-06, + "loss": 1.005, + "step": 3455 + }, + { + "epoch": 0.6908373104120337, + "grad_norm": 2.09375, + "learning_rate": 8.800558000836318e-06, + "loss": 1.0715, + "step": 3456 + }, + { + "epoch": 0.6910372054671298, + "grad_norm": 2.15625, + "learning_rate": 8.799873164151981e-06, + "loss": 1.0608, + "step": 3457 + }, + { + "epoch": 0.6912371005222259, + "grad_norm": 2.09375, + "learning_rate": 8.799188158677683e-06, + "loss": 1.1111, + "step": 3458 + }, + { + "epoch": 0.6914369955773219, + "grad_norm": 2.09375, + "learning_rate": 8.79850298444385e-06, + "loss": 1.0775, + "step": 3459 + }, + { + "epoch": 0.691636890632418, + "grad_norm": 2.21875, + "learning_rate": 8.797817641480923e-06, + "loss": 1.0977, + "step": 3460 + }, + { + "epoch": 0.691836785687514, + "grad_norm": 2.140625, + "learning_rate": 8.797132129819343e-06, + "loss": 1.0691, + "step": 3461 + }, + { + "epoch": 0.6920366807426102, + "grad_norm": 2.109375, + "learning_rate": 8.796446449489557e-06, + "loss": 1.0476, + "step": 3462 + }, + { + "epoch": 0.6922365757977063, + "grad_norm": 2.15625, + "learning_rate": 8.795760600522025e-06, + "loss": 1.0863, + "step": 3463 + }, + { + "epoch": 0.6924364708528022, + "grad_norm": 1.9765625, + "learning_rate": 8.795074582947214e-06, + "loss": 1.073, + "step": 3464 + }, + { + "epoch": 0.6926363659078983, + "grad_norm": 2.046875, + "learning_rate": 8.794388396795595e-06, + "loss": 1.0367, + "step": 3465 + }, + { + "epoch": 0.6928362609629944, + "grad_norm": 2.046875, + "learning_rate": 8.79370204209765e-06, + "loss": 0.9947, + "step": 3466 + }, + { + "epoch": 0.6930361560180905, + "grad_norm": 2.15625, + "learning_rate": 8.793015518883862e-06, + "loss": 1.1298, + "step": 3467 + }, + { + "epoch": 0.6932360510731865, + "grad_norm": 2.03125, + "learning_rate": 8.792328827184733e-06, + "loss": 1.0406, + "step": 3468 + }, + { + "epoch": 0.6934359461282826, + "grad_norm": 2.25, + "learning_rate": 8.791641967030761e-06, + "loss": 1.0598, + "step": 3469 + }, + { + "epoch": 0.6936358411833787, + "grad_norm": 2.140625, + "learning_rate": 8.790954938452458e-06, + "loss": 1.0779, + "step": 3470 + }, + { + "epoch": 0.6938357362384748, + "grad_norm": 2.1875, + "learning_rate": 8.790267741480342e-06, + "loss": 1.0949, + "step": 3471 + }, + { + "epoch": 0.6940356312935708, + "grad_norm": 2.09375, + "learning_rate": 8.789580376144938e-06, + "loss": 1.0611, + "step": 3472 + }, + { + "epoch": 0.6942355263486669, + "grad_norm": 2.125, + "learning_rate": 8.788892842476777e-06, + "loss": 1.1123, + "step": 3473 + }, + { + "epoch": 0.694435421403763, + "grad_norm": 2.046875, + "learning_rate": 8.7882051405064e-06, + "loss": 1.0101, + "step": 3474 + }, + { + "epoch": 0.6946353164588591, + "grad_norm": 1.9609375, + "learning_rate": 8.78751727026436e-06, + "loss": 1.0699, + "step": 3475 + }, + { + "epoch": 0.6948352115139552, + "grad_norm": 2.03125, + "learning_rate": 8.786829231781203e-06, + "loss": 0.986, + "step": 3476 + }, + { + "epoch": 0.6950351065690512, + "grad_norm": 2.15625, + "learning_rate": 8.786141025087496e-06, + "loss": 1.0022, + "step": 3477 + }, + { + "epoch": 0.6952350016241473, + "grad_norm": 2.15625, + "learning_rate": 8.78545265021381e-06, + "loss": 1.0105, + "step": 3478 + }, + { + "epoch": 0.6954348966792434, + "grad_norm": 2.203125, + "learning_rate": 8.784764107190723e-06, + "loss": 1.059, + "step": 3479 + }, + { + "epoch": 0.6956347917343395, + "grad_norm": 2.015625, + "learning_rate": 8.784075396048814e-06, + "loss": 0.9632, + "step": 3480 + }, + { + "epoch": 0.6958346867894355, + "grad_norm": 2.03125, + "learning_rate": 8.783386516818684e-06, + "loss": 1.001, + "step": 3481 + }, + { + "epoch": 0.6960345818445316, + "grad_norm": 1.9765625, + "learning_rate": 8.782697469530929e-06, + "loss": 1.0463, + "step": 3482 + }, + { + "epoch": 0.6962344768996277, + "grad_norm": 2.0625, + "learning_rate": 8.782008254216155e-06, + "loss": 1.0061, + "step": 3483 + }, + { + "epoch": 0.6964343719547238, + "grad_norm": 2.109375, + "learning_rate": 8.78131887090498e-06, + "loss": 1.1041, + "step": 3484 + }, + { + "epoch": 0.6966342670098199, + "grad_norm": 2.046875, + "learning_rate": 8.780629319628023e-06, + "loss": 0.9527, + "step": 3485 + }, + { + "epoch": 0.6968341620649159, + "grad_norm": 2.171875, + "learning_rate": 8.779939600415917e-06, + "loss": 1.0341, + "step": 3486 + }, + { + "epoch": 0.697034057120012, + "grad_norm": 2.046875, + "learning_rate": 8.779249713299296e-06, + "loss": 0.8984, + "step": 3487 + }, + { + "epoch": 0.6972339521751081, + "grad_norm": 2.140625, + "learning_rate": 8.778559658308806e-06, + "loss": 1.0076, + "step": 3488 + }, + { + "epoch": 0.6974338472302042, + "grad_norm": 2.125, + "learning_rate": 8.777869435475101e-06, + "loss": 1.0677, + "step": 3489 + }, + { + "epoch": 0.6976337422853002, + "grad_norm": 2.1875, + "learning_rate": 8.777179044828838e-06, + "loss": 1.0401, + "step": 3490 + }, + { + "epoch": 0.6978336373403963, + "grad_norm": 2.0, + "learning_rate": 8.776488486400688e-06, + "loss": 0.942, + "step": 3491 + }, + { + "epoch": 0.6980335323954924, + "grad_norm": 2.203125, + "learning_rate": 8.775797760221318e-06, + "loss": 1.1208, + "step": 3492 + }, + { + "epoch": 0.6982334274505885, + "grad_norm": 2.171875, + "learning_rate": 8.775106866321419e-06, + "loss": 1.0702, + "step": 3493 + }, + { + "epoch": 0.6984333225056845, + "grad_norm": 2.25, + "learning_rate": 8.774415804731674e-06, + "loss": 1.153, + "step": 3494 + }, + { + "epoch": 0.6986332175607806, + "grad_norm": 2.09375, + "learning_rate": 8.773724575482783e-06, + "loss": 1.0306, + "step": 3495 + }, + { + "epoch": 0.6988331126158767, + "grad_norm": 2.0625, + "learning_rate": 8.77303317860545e-06, + "loss": 0.9907, + "step": 3496 + }, + { + "epoch": 0.6990330076709728, + "grad_norm": 2.171875, + "learning_rate": 8.772341614130384e-06, + "loss": 1.0426, + "step": 3497 + }, + { + "epoch": 0.6992329027260689, + "grad_norm": 1.9921875, + "learning_rate": 8.771649882088309e-06, + "loss": 1.0738, + "step": 3498 + }, + { + "epoch": 0.6994327977811649, + "grad_norm": 2.046875, + "learning_rate": 8.770957982509947e-06, + "loss": 1.0714, + "step": 3499 + }, + { + "epoch": 0.699632692836261, + "grad_norm": 2.015625, + "learning_rate": 8.770265915426035e-06, + "loss": 1.0174, + "step": 3500 + }, + { + "epoch": 0.699832587891357, + "grad_norm": 2.125, + "learning_rate": 8.769573680867314e-06, + "loss": 0.9708, + "step": 3501 + }, + { + "epoch": 0.7000324829464531, + "grad_norm": 2.0, + "learning_rate": 8.768881278864532e-06, + "loss": 1.025, + "step": 3502 + }, + { + "epoch": 0.7002323780015491, + "grad_norm": 2.0625, + "learning_rate": 8.768188709448446e-06, + "loss": 1.0102, + "step": 3503 + }, + { + "epoch": 0.7004322730566452, + "grad_norm": 2.203125, + "learning_rate": 8.76749597264982e-06, + "loss": 1.0685, + "step": 3504 + }, + { + "epoch": 0.7006321681117413, + "grad_norm": 2.078125, + "learning_rate": 8.766803068499426e-06, + "loss": 1.0231, + "step": 3505 + }, + { + "epoch": 0.7008320631668374, + "grad_norm": 2.109375, + "learning_rate": 8.766109997028042e-06, + "loss": 1.0792, + "step": 3506 + }, + { + "epoch": 0.7010319582219335, + "grad_norm": 2.0625, + "learning_rate": 8.765416758266454e-06, + "loss": 1.0519, + "step": 3507 + }, + { + "epoch": 0.7012318532770295, + "grad_norm": 2.109375, + "learning_rate": 8.764723352245455e-06, + "loss": 1.057, + "step": 3508 + }, + { + "epoch": 0.7014317483321256, + "grad_norm": 2.0625, + "learning_rate": 8.764029778995848e-06, + "loss": 0.9859, + "step": 3509 + }, + { + "epoch": 0.7016316433872217, + "grad_norm": 1.984375, + "learning_rate": 8.76333603854844e-06, + "loss": 1.0624, + "step": 3510 + }, + { + "epoch": 0.7018315384423178, + "grad_norm": 2.0625, + "learning_rate": 8.762642130934048e-06, + "loss": 1.0676, + "step": 3511 + }, + { + "epoch": 0.7020314334974138, + "grad_norm": 1.9921875, + "learning_rate": 8.761948056183492e-06, + "loss": 1.0841, + "step": 3512 + }, + { + "epoch": 0.7022313285525099, + "grad_norm": 2.03125, + "learning_rate": 8.761253814327606e-06, + "loss": 1.0859, + "step": 3513 + }, + { + "epoch": 0.702431223607606, + "grad_norm": 2.03125, + "learning_rate": 8.760559405397228e-06, + "loss": 1.0258, + "step": 3514 + }, + { + "epoch": 0.7026311186627021, + "grad_norm": 1.90625, + "learning_rate": 8.759864829423202e-06, + "loss": 0.9817, + "step": 3515 + }, + { + "epoch": 0.7028310137177981, + "grad_norm": 1.9921875, + "learning_rate": 8.759170086436382e-06, + "loss": 1.0133, + "step": 3516 + }, + { + "epoch": 0.7030309087728942, + "grad_norm": 2.15625, + "learning_rate": 8.75847517646763e-06, + "loss": 1.1056, + "step": 3517 + }, + { + "epoch": 0.7032308038279903, + "grad_norm": 2.03125, + "learning_rate": 8.75778009954781e-06, + "loss": 1.0127, + "step": 3518 + }, + { + "epoch": 0.7034306988830864, + "grad_norm": 2.09375, + "learning_rate": 8.757084855707799e-06, + "loss": 1.1148, + "step": 3519 + }, + { + "epoch": 0.7036305939381825, + "grad_norm": 2.15625, + "learning_rate": 8.756389444978482e-06, + "loss": 1.0803, + "step": 3520 + }, + { + "epoch": 0.7038304889932785, + "grad_norm": 2.078125, + "learning_rate": 8.755693867390746e-06, + "loss": 1.0793, + "step": 3521 + }, + { + "epoch": 0.7040303840483746, + "grad_norm": 2.09375, + "learning_rate": 8.754998122975489e-06, + "loss": 1.1372, + "step": 3522 + }, + { + "epoch": 0.7042302791034707, + "grad_norm": 2.078125, + "learning_rate": 8.754302211763616e-06, + "loss": 1.0156, + "step": 3523 + }, + { + "epoch": 0.7044301741585668, + "grad_norm": 1.9765625, + "learning_rate": 8.753606133786042e-06, + "loss": 0.9808, + "step": 3524 + }, + { + "epoch": 0.7046300692136628, + "grad_norm": 2.09375, + "learning_rate": 8.752909889073681e-06, + "loss": 1.0435, + "step": 3525 + }, + { + "epoch": 0.7048299642687589, + "grad_norm": 2.046875, + "learning_rate": 8.752213477657467e-06, + "loss": 1.017, + "step": 3526 + }, + { + "epoch": 0.705029859323855, + "grad_norm": 2.109375, + "learning_rate": 8.751516899568329e-06, + "loss": 1.0034, + "step": 3527 + }, + { + "epoch": 0.7052297543789511, + "grad_norm": 2.09375, + "learning_rate": 8.750820154837213e-06, + "loss": 1.0923, + "step": 3528 + }, + { + "epoch": 0.7054296494340472, + "grad_norm": 2.234375, + "learning_rate": 8.750123243495066e-06, + "loss": 1.1833, + "step": 3529 + }, + { + "epoch": 0.7056295444891432, + "grad_norm": 2.046875, + "learning_rate": 8.749426165572843e-06, + "loss": 1.0742, + "step": 3530 + }, + { + "epoch": 0.7058294395442393, + "grad_norm": 2.0625, + "learning_rate": 8.748728921101511e-06, + "loss": 0.9526, + "step": 3531 + }, + { + "epoch": 0.7060293345993354, + "grad_norm": 2.09375, + "learning_rate": 8.748031510112041e-06, + "loss": 1.0627, + "step": 3532 + }, + { + "epoch": 0.7062292296544315, + "grad_norm": 2.15625, + "learning_rate": 8.747333932635412e-06, + "loss": 0.9998, + "step": 3533 + }, + { + "epoch": 0.7064291247095275, + "grad_norm": 1.921875, + "learning_rate": 8.746636188702609e-06, + "loss": 0.9563, + "step": 3534 + }, + { + "epoch": 0.7066290197646236, + "grad_norm": 2.125, + "learning_rate": 8.745938278344628e-06, + "loss": 1.1783, + "step": 3535 + }, + { + "epoch": 0.7068289148197197, + "grad_norm": 2.0, + "learning_rate": 8.745240201592466e-06, + "loss": 1.0853, + "step": 3536 + }, + { + "epoch": 0.7070288098748158, + "grad_norm": 2.109375, + "learning_rate": 8.744541958477138e-06, + "loss": 1.1799, + "step": 3537 + }, + { + "epoch": 0.7072287049299117, + "grad_norm": 2.03125, + "learning_rate": 8.743843549029653e-06, + "loss": 0.994, + "step": 3538 + }, + { + "epoch": 0.7074285999850078, + "grad_norm": 2.125, + "learning_rate": 8.74314497328104e-06, + "loss": 1.1054, + "step": 3539 + }, + { + "epoch": 0.7076284950401039, + "grad_norm": 2.140625, + "learning_rate": 8.742446231262324e-06, + "loss": 1.0607, + "step": 3540 + }, + { + "epoch": 0.7078283900952, + "grad_norm": 2.109375, + "learning_rate": 8.741747323004549e-06, + "loss": 1.0312, + "step": 3541 + }, + { + "epoch": 0.7080282851502961, + "grad_norm": 2.109375, + "learning_rate": 8.741048248538757e-06, + "loss": 1.0552, + "step": 3542 + }, + { + "epoch": 0.7082281802053921, + "grad_norm": 2.171875, + "learning_rate": 8.740349007896001e-06, + "loss": 1.0946, + "step": 3543 + }, + { + "epoch": 0.7084280752604882, + "grad_norm": 2.109375, + "learning_rate": 8.73964960110734e-06, + "loss": 1.029, + "step": 3544 + }, + { + "epoch": 0.7086279703155843, + "grad_norm": 2.15625, + "learning_rate": 8.738950028203845e-06, + "loss": 1.1373, + "step": 3545 + }, + { + "epoch": 0.7088278653706804, + "grad_norm": 2.234375, + "learning_rate": 8.738250289216588e-06, + "loss": 1.1068, + "step": 3546 + }, + { + "epoch": 0.7090277604257764, + "grad_norm": 2.046875, + "learning_rate": 8.737550384176654e-06, + "loss": 0.9384, + "step": 3547 + }, + { + "epoch": 0.7092276554808725, + "grad_norm": 2.125, + "learning_rate": 8.73685031311513e-06, + "loss": 1.1025, + "step": 3548 + }, + { + "epoch": 0.7094275505359686, + "grad_norm": 2.140625, + "learning_rate": 8.736150076063114e-06, + "loss": 1.1502, + "step": 3549 + }, + { + "epoch": 0.7096274455910647, + "grad_norm": 2.09375, + "learning_rate": 8.735449673051711e-06, + "loss": 1.0809, + "step": 3550 + }, + { + "epoch": 0.7098273406461608, + "grad_norm": 2.0625, + "learning_rate": 8.734749104112032e-06, + "loss": 1.0088, + "step": 3551 + }, + { + "epoch": 0.7100272357012568, + "grad_norm": 2.140625, + "learning_rate": 8.734048369275199e-06, + "loss": 1.1559, + "step": 3552 + }, + { + "epoch": 0.7102271307563529, + "grad_norm": 1.9765625, + "learning_rate": 8.733347468572333e-06, + "loss": 0.9297, + "step": 3553 + }, + { + "epoch": 0.710427025811449, + "grad_norm": 2.015625, + "learning_rate": 8.732646402034572e-06, + "loss": 0.9721, + "step": 3554 + }, + { + "epoch": 0.7106269208665451, + "grad_norm": 2.03125, + "learning_rate": 8.731945169693058e-06, + "loss": 1.0645, + "step": 3555 + }, + { + "epoch": 0.7108268159216411, + "grad_norm": 1.984375, + "learning_rate": 8.731243771578937e-06, + "loss": 1.0229, + "step": 3556 + }, + { + "epoch": 0.7110267109767372, + "grad_norm": 2.140625, + "learning_rate": 8.730542207723367e-06, + "loss": 1.0102, + "step": 3557 + }, + { + "epoch": 0.7112266060318333, + "grad_norm": 2.015625, + "learning_rate": 8.72984047815751e-06, + "loss": 1.0751, + "step": 3558 + }, + { + "epoch": 0.7114265010869294, + "grad_norm": 2.171875, + "learning_rate": 8.729138582912538e-06, + "loss": 1.0856, + "step": 3559 + }, + { + "epoch": 0.7116263961420254, + "grad_norm": 1.96875, + "learning_rate": 8.728436522019627e-06, + "loss": 1.0536, + "step": 3560 + }, + { + "epoch": 0.7118262911971215, + "grad_norm": 2.109375, + "learning_rate": 8.727734295509964e-06, + "loss": 1.0875, + "step": 3561 + }, + { + "epoch": 0.7120261862522176, + "grad_norm": 2.140625, + "learning_rate": 8.727031903414743e-06, + "loss": 1.0707, + "step": 3562 + }, + { + "epoch": 0.7122260813073137, + "grad_norm": 2.09375, + "learning_rate": 8.72632934576516e-06, + "loss": 1.0657, + "step": 3563 + }, + { + "epoch": 0.7124259763624098, + "grad_norm": 2.03125, + "learning_rate": 8.72562662259243e-06, + "loss": 1.0626, + "step": 3564 + }, + { + "epoch": 0.7126258714175058, + "grad_norm": 1.9765625, + "learning_rate": 8.72492373392776e-06, + "loss": 0.9411, + "step": 3565 + }, + { + "epoch": 0.7128257664726019, + "grad_norm": 2.0625, + "learning_rate": 8.724220679802377e-06, + "loss": 1.157, + "step": 3566 + }, + { + "epoch": 0.713025661527698, + "grad_norm": 1.9921875, + "learning_rate": 8.723517460247509e-06, + "loss": 1.0013, + "step": 3567 + }, + { + "epoch": 0.7132255565827941, + "grad_norm": 2.0, + "learning_rate": 8.722814075294392e-06, + "loss": 1.0098, + "step": 3568 + }, + { + "epoch": 0.7134254516378901, + "grad_norm": 2.03125, + "learning_rate": 8.722110524974273e-06, + "loss": 1.0333, + "step": 3569 + }, + { + "epoch": 0.7136253466929862, + "grad_norm": 2.09375, + "learning_rate": 8.7214068093184e-06, + "loss": 1.0358, + "step": 3570 + }, + { + "epoch": 0.7138252417480823, + "grad_norm": 2.15625, + "learning_rate": 8.720702928358036e-06, + "loss": 1.0748, + "step": 3571 + }, + { + "epoch": 0.7140251368031784, + "grad_norm": 2.03125, + "learning_rate": 8.719998882124446e-06, + "loss": 1.0685, + "step": 3572 + }, + { + "epoch": 0.7142250318582744, + "grad_norm": 1.9765625, + "learning_rate": 8.7192946706489e-06, + "loss": 1.0813, + "step": 3573 + }, + { + "epoch": 0.7144249269133705, + "grad_norm": 2.09375, + "learning_rate": 8.718590293962684e-06, + "loss": 1.0034, + "step": 3574 + }, + { + "epoch": 0.7146248219684666, + "grad_norm": 2.078125, + "learning_rate": 8.717885752097084e-06, + "loss": 1.0424, + "step": 3575 + }, + { + "epoch": 0.7148247170235627, + "grad_norm": 2.109375, + "learning_rate": 8.717181045083396e-06, + "loss": 1.0401, + "step": 3576 + }, + { + "epoch": 0.7150246120786588, + "grad_norm": 2.046875, + "learning_rate": 8.716476172952921e-06, + "loss": 1.0595, + "step": 3577 + }, + { + "epoch": 0.7152245071337547, + "grad_norm": 2.140625, + "learning_rate": 8.715771135736975e-06, + "loss": 1.0123, + "step": 3578 + }, + { + "epoch": 0.7154244021888508, + "grad_norm": 2.1875, + "learning_rate": 8.715065933466869e-06, + "loss": 1.1616, + "step": 3579 + }, + { + "epoch": 0.7156242972439469, + "grad_norm": 2.078125, + "learning_rate": 8.714360566173932e-06, + "loss": 1.057, + "step": 3580 + }, + { + "epoch": 0.715824192299043, + "grad_norm": 2.140625, + "learning_rate": 8.713655033889495e-06, + "loss": 1.0771, + "step": 3581 + }, + { + "epoch": 0.716024087354139, + "grad_norm": 2.140625, + "learning_rate": 8.712949336644898e-06, + "loss": 1.0603, + "step": 3582 + }, + { + "epoch": 0.7162239824092351, + "grad_norm": 2.09375, + "learning_rate": 8.71224347447149e-06, + "loss": 1.0811, + "step": 3583 + }, + { + "epoch": 0.7164238774643312, + "grad_norm": 1.9765625, + "learning_rate": 8.711537447400622e-06, + "loss": 1.0414, + "step": 3584 + }, + { + "epoch": 0.7166237725194273, + "grad_norm": 2.140625, + "learning_rate": 8.710831255463656e-06, + "loss": 1.0759, + "step": 3585 + }, + { + "epoch": 0.7168236675745234, + "grad_norm": 2.1875, + "learning_rate": 8.710124898691963e-06, + "loss": 1.0057, + "step": 3586 + }, + { + "epoch": 0.7170235626296194, + "grad_norm": 2.078125, + "learning_rate": 8.709418377116918e-06, + "loss": 1.0743, + "step": 3587 + }, + { + "epoch": 0.7172234576847155, + "grad_norm": 2.234375, + "learning_rate": 8.708711690769904e-06, + "loss": 1.1095, + "step": 3588 + }, + { + "epoch": 0.7174233527398116, + "grad_norm": 2.125, + "learning_rate": 8.708004839682315e-06, + "loss": 1.1079, + "step": 3589 + }, + { + "epoch": 0.7176232477949077, + "grad_norm": 2.03125, + "learning_rate": 8.707297823885545e-06, + "loss": 1.0356, + "step": 3590 + }, + { + "epoch": 0.7178231428500037, + "grad_norm": 2.03125, + "learning_rate": 8.706590643411002e-06, + "loss": 1.057, + "step": 3591 + }, + { + "epoch": 0.7180230379050998, + "grad_norm": 2.03125, + "learning_rate": 8.7058832982901e-06, + "loss": 1.0344, + "step": 3592 + }, + { + "epoch": 0.7182229329601959, + "grad_norm": 2.125, + "learning_rate": 8.705175788554256e-06, + "loss": 0.9713, + "step": 3593 + }, + { + "epoch": 0.718422828015292, + "grad_norm": 2.15625, + "learning_rate": 8.7044681142349e-06, + "loss": 1.0609, + "step": 3594 + }, + { + "epoch": 0.718622723070388, + "grad_norm": 2.1875, + "learning_rate": 8.703760275363466e-06, + "loss": 1.0574, + "step": 3595 + }, + { + "epoch": 0.7188226181254841, + "grad_norm": 2.0, + "learning_rate": 8.703052271971395e-06, + "loss": 1.0209, + "step": 3596 + }, + { + "epoch": 0.7190225131805802, + "grad_norm": 2.140625, + "learning_rate": 8.702344104090139e-06, + "loss": 0.9954, + "step": 3597 + }, + { + "epoch": 0.7192224082356763, + "grad_norm": 2.125, + "learning_rate": 8.701635771751153e-06, + "loss": 0.9903, + "step": 3598 + }, + { + "epoch": 0.7194223032907724, + "grad_norm": 2.109375, + "learning_rate": 8.700927274985903e-06, + "loss": 1.092, + "step": 3599 + }, + { + "epoch": 0.7196221983458684, + "grad_norm": 2.109375, + "learning_rate": 8.700218613825855e-06, + "loss": 1.0449, + "step": 3600 + }, + { + "epoch": 0.7198220934009645, + "grad_norm": 2.0, + "learning_rate": 8.699509788302493e-06, + "loss": 1.0659, + "step": 3601 + }, + { + "epoch": 0.7200219884560606, + "grad_norm": 2.046875, + "learning_rate": 8.698800798447302e-06, + "loss": 0.9256, + "step": 3602 + }, + { + "epoch": 0.7202218835111567, + "grad_norm": 1.9921875, + "learning_rate": 8.698091644291774e-06, + "loss": 1.0132, + "step": 3603 + }, + { + "epoch": 0.7204217785662527, + "grad_norm": 2.09375, + "learning_rate": 8.69738232586741e-06, + "loss": 1.0542, + "step": 3604 + }, + { + "epoch": 0.7206216736213488, + "grad_norm": 2.09375, + "learning_rate": 8.696672843205718e-06, + "loss": 1.046, + "step": 3605 + }, + { + "epoch": 0.7208215686764449, + "grad_norm": 2.109375, + "learning_rate": 8.695963196338214e-06, + "loss": 0.9889, + "step": 3606 + }, + { + "epoch": 0.721021463731541, + "grad_norm": 2.078125, + "learning_rate": 8.69525338529642e-06, + "loss": 1.0458, + "step": 3607 + }, + { + "epoch": 0.7212213587866371, + "grad_norm": 2.015625, + "learning_rate": 8.694543410111864e-06, + "loss": 1.0359, + "step": 3608 + }, + { + "epoch": 0.7214212538417331, + "grad_norm": 2.015625, + "learning_rate": 8.693833270816083e-06, + "loss": 1.0612, + "step": 3609 + }, + { + "epoch": 0.7216211488968292, + "grad_norm": 2.15625, + "learning_rate": 8.693122967440626e-06, + "loss": 1.0821, + "step": 3610 + }, + { + "epoch": 0.7218210439519253, + "grad_norm": 2.015625, + "learning_rate": 8.69241250001704e-06, + "loss": 1.0275, + "step": 3611 + }, + { + "epoch": 0.7220209390070214, + "grad_norm": 2.03125, + "learning_rate": 8.691701868576883e-06, + "loss": 1.0511, + "step": 3612 + }, + { + "epoch": 0.7222208340621173, + "grad_norm": 1.96875, + "learning_rate": 8.690991073151724e-06, + "loss": 0.9708, + "step": 3613 + }, + { + "epoch": 0.7224207291172134, + "grad_norm": 2.03125, + "learning_rate": 8.690280113773138e-06, + "loss": 0.9171, + "step": 3614 + }, + { + "epoch": 0.7226206241723095, + "grad_norm": 2.203125, + "learning_rate": 8.689568990472701e-06, + "loss": 1.1249, + "step": 3615 + }, + { + "epoch": 0.7228205192274056, + "grad_norm": 2.15625, + "learning_rate": 8.688857703282005e-06, + "loss": 1.0374, + "step": 3616 + }, + { + "epoch": 0.7230204142825016, + "grad_norm": 2.015625, + "learning_rate": 8.688146252232644e-06, + "loss": 0.9733, + "step": 3617 + }, + { + "epoch": 0.7232203093375977, + "grad_norm": 2.0625, + "learning_rate": 8.68743463735622e-06, + "loss": 1.0975, + "step": 3618 + }, + { + "epoch": 0.7234202043926938, + "grad_norm": 2.015625, + "learning_rate": 8.686722858684342e-06, + "loss": 1.0447, + "step": 3619 + }, + { + "epoch": 0.7236200994477899, + "grad_norm": 2.078125, + "learning_rate": 8.68601091624863e-06, + "loss": 1.1732, + "step": 3620 + }, + { + "epoch": 0.723819994502886, + "grad_norm": 2.03125, + "learning_rate": 8.685298810080706e-06, + "loss": 1.0777, + "step": 3621 + }, + { + "epoch": 0.724019889557982, + "grad_norm": 2.15625, + "learning_rate": 8.684586540212203e-06, + "loss": 1.1034, + "step": 3622 + }, + { + "epoch": 0.7242197846130781, + "grad_norm": 2.0, + "learning_rate": 8.683874106674759e-06, + "loss": 1.0943, + "step": 3623 + }, + { + "epoch": 0.7244196796681742, + "grad_norm": 1.9921875, + "learning_rate": 8.68316150950002e-06, + "loss": 1.0627, + "step": 3624 + }, + { + "epoch": 0.7246195747232703, + "grad_norm": 2.0, + "learning_rate": 8.68244874871964e-06, + "loss": 0.9761, + "step": 3625 + }, + { + "epoch": 0.7248194697783663, + "grad_norm": 2.0625, + "learning_rate": 8.681735824365281e-06, + "loss": 0.9976, + "step": 3626 + }, + { + "epoch": 0.7250193648334624, + "grad_norm": 2.015625, + "learning_rate": 8.681022736468609e-06, + "loss": 0.9816, + "step": 3627 + }, + { + "epoch": 0.7252192598885585, + "grad_norm": 2.046875, + "learning_rate": 8.680309485061302e-06, + "loss": 1.0761, + "step": 3628 + }, + { + "epoch": 0.7254191549436546, + "grad_norm": 2.09375, + "learning_rate": 8.679596070175038e-06, + "loss": 1.0507, + "step": 3629 + }, + { + "epoch": 0.7256190499987507, + "grad_norm": 2.078125, + "learning_rate": 8.678882491841512e-06, + "loss": 1.0754, + "step": 3630 + }, + { + "epoch": 0.7258189450538467, + "grad_norm": 2.015625, + "learning_rate": 8.678168750092419e-06, + "loss": 1.0151, + "step": 3631 + }, + { + "epoch": 0.7260188401089428, + "grad_norm": 2.140625, + "learning_rate": 8.67745484495946e-06, + "loss": 1.0596, + "step": 3632 + }, + { + "epoch": 0.7262187351640389, + "grad_norm": 2.03125, + "learning_rate": 8.676740776474351e-06, + "loss": 1.0023, + "step": 3633 + }, + { + "epoch": 0.726418630219135, + "grad_norm": 2.171875, + "learning_rate": 8.67602654466881e-06, + "loss": 1.0468, + "step": 3634 + }, + { + "epoch": 0.726618525274231, + "grad_norm": 2.046875, + "learning_rate": 8.675312149574562e-06, + "loss": 1.0281, + "step": 3635 + }, + { + "epoch": 0.7268184203293271, + "grad_norm": 2.09375, + "learning_rate": 8.67459759122334e-06, + "loss": 1.0334, + "step": 3636 + }, + { + "epoch": 0.7270183153844232, + "grad_norm": 2.109375, + "learning_rate": 8.673882869646888e-06, + "loss": 1.1007, + "step": 3637 + }, + { + "epoch": 0.7272182104395193, + "grad_norm": 2.15625, + "learning_rate": 8.67316798487695e-06, + "loss": 1.1333, + "step": 3638 + }, + { + "epoch": 0.7274181054946153, + "grad_norm": 2.09375, + "learning_rate": 8.672452936945282e-06, + "loss": 1.0483, + "step": 3639 + }, + { + "epoch": 0.7276180005497114, + "grad_norm": 2.125, + "learning_rate": 8.671737725883646e-06, + "loss": 1.0026, + "step": 3640 + }, + { + "epoch": 0.7278178956048075, + "grad_norm": 1.9921875, + "learning_rate": 8.671022351723813e-06, + "loss": 0.9676, + "step": 3641 + }, + { + "epoch": 0.7280177906599036, + "grad_norm": 2.171875, + "learning_rate": 8.67030681449756e-06, + "loss": 1.0058, + "step": 3642 + }, + { + "epoch": 0.7282176857149997, + "grad_norm": 2.25, + "learning_rate": 8.66959111423667e-06, + "loss": 1.0806, + "step": 3643 + }, + { + "epoch": 0.7284175807700957, + "grad_norm": 2.03125, + "learning_rate": 8.668875250972934e-06, + "loss": 1.0737, + "step": 3644 + }, + { + "epoch": 0.7286174758251918, + "grad_norm": 2.0625, + "learning_rate": 8.668159224738153e-06, + "loss": 1.0112, + "step": 3645 + }, + { + "epoch": 0.7288173708802879, + "grad_norm": 2.15625, + "learning_rate": 8.667443035564129e-06, + "loss": 1.073, + "step": 3646 + }, + { + "epoch": 0.729017265935384, + "grad_norm": 2.078125, + "learning_rate": 8.666726683482678e-06, + "loss": 1.1717, + "step": 3647 + }, + { + "epoch": 0.72921716099048, + "grad_norm": 2.15625, + "learning_rate": 8.666010168525618e-06, + "loss": 1.072, + "step": 3648 + }, + { + "epoch": 0.729417056045576, + "grad_norm": 2.09375, + "learning_rate": 8.66529349072478e-06, + "loss": 1.0895, + "step": 3649 + }, + { + "epoch": 0.7296169511006722, + "grad_norm": 2.234375, + "learning_rate": 8.664576650111995e-06, + "loss": 1.0967, + "step": 3650 + }, + { + "epoch": 0.7298168461557683, + "grad_norm": 2.171875, + "learning_rate": 8.663859646719106e-06, + "loss": 1.024, + "step": 3651 + }, + { + "epoch": 0.7300167412108644, + "grad_norm": 2.109375, + "learning_rate": 8.663142480577965e-06, + "loss": 0.999, + "step": 3652 + }, + { + "epoch": 0.7302166362659603, + "grad_norm": 2.140625, + "learning_rate": 8.662425151720425e-06, + "loss": 1.0337, + "step": 3653 + }, + { + "epoch": 0.7304165313210564, + "grad_norm": 2.015625, + "learning_rate": 8.661707660178351e-06, + "loss": 1.0501, + "step": 3654 + }, + { + "epoch": 0.7306164263761525, + "grad_norm": 2.046875, + "learning_rate": 8.660990005983613e-06, + "loss": 1.0929, + "step": 3655 + }, + { + "epoch": 0.7308163214312486, + "grad_norm": 2.03125, + "learning_rate": 8.660272189168093e-06, + "loss": 1.002, + "step": 3656 + }, + { + "epoch": 0.7310162164863446, + "grad_norm": 2.109375, + "learning_rate": 8.659554209763669e-06, + "loss": 1.0552, + "step": 3657 + }, + { + "epoch": 0.7312161115414407, + "grad_norm": 2.0625, + "learning_rate": 8.65883606780224e-06, + "loss": 1.031, + "step": 3658 + }, + { + "epoch": 0.7314160065965368, + "grad_norm": 2.015625, + "learning_rate": 8.658117763315705e-06, + "loss": 0.9722, + "step": 3659 + }, + { + "epoch": 0.7316159016516329, + "grad_norm": 2.21875, + "learning_rate": 8.657399296335967e-06, + "loss": 1.0115, + "step": 3660 + }, + { + "epoch": 0.7318157967067289, + "grad_norm": 2.125, + "learning_rate": 8.656680666894945e-06, + "loss": 0.9965, + "step": 3661 + }, + { + "epoch": 0.732015691761825, + "grad_norm": 2.15625, + "learning_rate": 8.655961875024557e-06, + "loss": 1.065, + "step": 3662 + }, + { + "epoch": 0.7322155868169211, + "grad_norm": 2.1875, + "learning_rate": 8.655242920756733e-06, + "loss": 1.1328, + "step": 3663 + }, + { + "epoch": 0.7324154818720172, + "grad_norm": 2.09375, + "learning_rate": 8.65452380412341e-06, + "loss": 1.085, + "step": 3664 + }, + { + "epoch": 0.7326153769271133, + "grad_norm": 2.09375, + "learning_rate": 8.653804525156529e-06, + "loss": 1.0886, + "step": 3665 + }, + { + "epoch": 0.7328152719822093, + "grad_norm": 2.015625, + "learning_rate": 8.653085083888042e-06, + "loss": 1.0906, + "step": 3666 + }, + { + "epoch": 0.7330151670373054, + "grad_norm": 2.046875, + "learning_rate": 8.652365480349904e-06, + "loss": 1.0874, + "step": 3667 + }, + { + "epoch": 0.7332150620924015, + "grad_norm": 1.9375, + "learning_rate": 8.651645714574082e-06, + "loss": 0.971, + "step": 3668 + }, + { + "epoch": 0.7334149571474976, + "grad_norm": 1.875, + "learning_rate": 8.65092578659255e-06, + "loss": 1.0021, + "step": 3669 + }, + { + "epoch": 0.7336148522025936, + "grad_norm": 2.140625, + "learning_rate": 8.650205696437282e-06, + "loss": 0.9673, + "step": 3670 + }, + { + "epoch": 0.7338147472576897, + "grad_norm": 2.0625, + "learning_rate": 8.649485444140267e-06, + "loss": 1.0164, + "step": 3671 + }, + { + "epoch": 0.7340146423127858, + "grad_norm": 2.046875, + "learning_rate": 8.6487650297335e-06, + "loss": 1.0038, + "step": 3672 + }, + { + "epoch": 0.7342145373678819, + "grad_norm": 2.078125, + "learning_rate": 8.648044453248978e-06, + "loss": 0.9634, + "step": 3673 + }, + { + "epoch": 0.734414432422978, + "grad_norm": 2.03125, + "learning_rate": 8.647323714718712e-06, + "loss": 1.1127, + "step": 3674 + }, + { + "epoch": 0.734614327478074, + "grad_norm": 2.015625, + "learning_rate": 8.646602814174715e-06, + "loss": 0.8807, + "step": 3675 + }, + { + "epoch": 0.7348142225331701, + "grad_norm": 1.96875, + "learning_rate": 8.645881751649012e-06, + "loss": 1.0312, + "step": 3676 + }, + { + "epoch": 0.7350141175882662, + "grad_norm": 1.8828125, + "learning_rate": 8.64516052717363e-06, + "loss": 1.0603, + "step": 3677 + }, + { + "epoch": 0.7352140126433623, + "grad_norm": 2.1875, + "learning_rate": 8.644439140780608e-06, + "loss": 1.1072, + "step": 3678 + }, + { + "epoch": 0.7354139076984583, + "grad_norm": 2.109375, + "learning_rate": 8.643717592501988e-06, + "loss": 1.0585, + "step": 3679 + }, + { + "epoch": 0.7356138027535544, + "grad_norm": 2.0625, + "learning_rate": 8.64299588236982e-06, + "loss": 1.0328, + "step": 3680 + }, + { + "epoch": 0.7358136978086505, + "grad_norm": 2.125, + "learning_rate": 8.642274010416165e-06, + "loss": 1.0931, + "step": 3681 + }, + { + "epoch": 0.7360135928637466, + "grad_norm": 2.0625, + "learning_rate": 8.641551976673088e-06, + "loss": 1.0921, + "step": 3682 + }, + { + "epoch": 0.7362134879188426, + "grad_norm": 2.046875, + "learning_rate": 8.64082978117266e-06, + "loss": 1.0605, + "step": 3683 + }, + { + "epoch": 0.7364133829739387, + "grad_norm": 1.9296875, + "learning_rate": 8.640107423946964e-06, + "loss": 0.9517, + "step": 3684 + }, + { + "epoch": 0.7366132780290348, + "grad_norm": 2.0625, + "learning_rate": 8.639384905028084e-06, + "loss": 1.0521, + "step": 3685 + }, + { + "epoch": 0.7368131730841309, + "grad_norm": 2.046875, + "learning_rate": 8.638662224448115e-06, + "loss": 0.9472, + "step": 3686 + }, + { + "epoch": 0.737013068139227, + "grad_norm": 2.0625, + "learning_rate": 8.63793938223916e-06, + "loss": 1.0221, + "step": 3687 + }, + { + "epoch": 0.737212963194323, + "grad_norm": 2.171875, + "learning_rate": 8.637216378433324e-06, + "loss": 1.121, + "step": 3688 + }, + { + "epoch": 0.737412858249419, + "grad_norm": 2.140625, + "learning_rate": 8.636493213062725e-06, + "loss": 1.0409, + "step": 3689 + }, + { + "epoch": 0.7376127533045151, + "grad_norm": 2.1875, + "learning_rate": 8.635769886159488e-06, + "loss": 1.1295, + "step": 3690 + }, + { + "epoch": 0.7378126483596112, + "grad_norm": 2.046875, + "learning_rate": 8.63504639775574e-06, + "loss": 1.0737, + "step": 3691 + }, + { + "epoch": 0.7380125434147072, + "grad_norm": 2.046875, + "learning_rate": 8.634322747883619e-06, + "loss": 1.025, + "step": 3692 + }, + { + "epoch": 0.7382124384698033, + "grad_norm": 2.03125, + "learning_rate": 8.63359893657527e-06, + "loss": 1.011, + "step": 3693 + }, + { + "epoch": 0.7384123335248994, + "grad_norm": 2.1875, + "learning_rate": 8.632874963862844e-06, + "loss": 1.0802, + "step": 3694 + }, + { + "epoch": 0.7386122285799955, + "grad_norm": 2.125, + "learning_rate": 8.632150829778498e-06, + "loss": 1.0525, + "step": 3695 + }, + { + "epoch": 0.7388121236350915, + "grad_norm": 1.9765625, + "learning_rate": 8.631426534354404e-06, + "loss": 0.9998, + "step": 3696 + }, + { + "epoch": 0.7390120186901876, + "grad_norm": 2.03125, + "learning_rate": 8.630702077622728e-06, + "loss": 1.0091, + "step": 3697 + }, + { + "epoch": 0.7392119137452837, + "grad_norm": 2.046875, + "learning_rate": 8.629977459615655e-06, + "loss": 1.0374, + "step": 3698 + }, + { + "epoch": 0.7394118088003798, + "grad_norm": 1.9765625, + "learning_rate": 8.62925268036537e-06, + "loss": 0.9763, + "step": 3699 + }, + { + "epoch": 0.7396117038554759, + "grad_norm": 2.09375, + "learning_rate": 8.62852773990407e-06, + "loss": 1.0981, + "step": 3700 + }, + { + "epoch": 0.7398115989105719, + "grad_norm": 1.96875, + "learning_rate": 8.627802638263955e-06, + "loss": 0.9995, + "step": 3701 + }, + { + "epoch": 0.740011493965668, + "grad_norm": 2.0, + "learning_rate": 8.627077375477233e-06, + "loss": 1.0346, + "step": 3702 + }, + { + "epoch": 0.7402113890207641, + "grad_norm": 2.171875, + "learning_rate": 8.626351951576122e-06, + "loss": 1.0345, + "step": 3703 + }, + { + "epoch": 0.7404112840758602, + "grad_norm": 2.125, + "learning_rate": 8.625626366592844e-06, + "loss": 0.99, + "step": 3704 + }, + { + "epoch": 0.7406111791309562, + "grad_norm": 2.03125, + "learning_rate": 8.624900620559633e-06, + "loss": 1.1051, + "step": 3705 + }, + { + "epoch": 0.7408110741860523, + "grad_norm": 2.265625, + "learning_rate": 8.624174713508722e-06, + "loss": 1.1394, + "step": 3706 + }, + { + "epoch": 0.7410109692411484, + "grad_norm": 2.078125, + "learning_rate": 8.623448645472356e-06, + "loss": 1.0432, + "step": 3707 + }, + { + "epoch": 0.7412108642962445, + "grad_norm": 1.8515625, + "learning_rate": 8.62272241648279e-06, + "loss": 0.9488, + "step": 3708 + }, + { + "epoch": 0.7414107593513406, + "grad_norm": 2.078125, + "learning_rate": 8.62199602657228e-06, + "loss": 1.0239, + "step": 3709 + }, + { + "epoch": 0.7416106544064366, + "grad_norm": 2.234375, + "learning_rate": 8.621269475773092e-06, + "loss": 1.1023, + "step": 3710 + }, + { + "epoch": 0.7418105494615327, + "grad_norm": 1.96875, + "learning_rate": 8.620542764117503e-06, + "loss": 0.9405, + "step": 3711 + }, + { + "epoch": 0.7420104445166288, + "grad_norm": 2.078125, + "learning_rate": 8.61981589163779e-06, + "loss": 1.129, + "step": 3712 + }, + { + "epoch": 0.7422103395717249, + "grad_norm": 2.140625, + "learning_rate": 8.619088858366242e-06, + "loss": 0.9819, + "step": 3713 + }, + { + "epoch": 0.7424102346268209, + "grad_norm": 2.078125, + "learning_rate": 8.618361664335153e-06, + "loss": 1.0968, + "step": 3714 + }, + { + "epoch": 0.742610129681917, + "grad_norm": 2.171875, + "learning_rate": 8.617634309576827e-06, + "loss": 1.0922, + "step": 3715 + }, + { + "epoch": 0.7428100247370131, + "grad_norm": 2.046875, + "learning_rate": 8.61690679412357e-06, + "loss": 1.0316, + "step": 3716 + }, + { + "epoch": 0.7430099197921092, + "grad_norm": 2.09375, + "learning_rate": 8.6161791180077e-06, + "loss": 1.0087, + "step": 3717 + }, + { + "epoch": 0.7432098148472052, + "grad_norm": 1.96875, + "learning_rate": 8.615451281261539e-06, + "loss": 0.9273, + "step": 3718 + }, + { + "epoch": 0.7434097099023013, + "grad_norm": 2.03125, + "learning_rate": 8.614723283917418e-06, + "loss": 0.9929, + "step": 3719 + }, + { + "epoch": 0.7436096049573974, + "grad_norm": 2.046875, + "learning_rate": 8.613995126007674e-06, + "loss": 0.9644, + "step": 3720 + }, + { + "epoch": 0.7438095000124935, + "grad_norm": 2.015625, + "learning_rate": 8.613266807564656e-06, + "loss": 1.0416, + "step": 3721 + }, + { + "epoch": 0.7440093950675896, + "grad_norm": 2.015625, + "learning_rate": 8.61253832862071e-06, + "loss": 1.0855, + "step": 3722 + }, + { + "epoch": 0.7442092901226856, + "grad_norm": 2.15625, + "learning_rate": 8.611809689208197e-06, + "loss": 1.073, + "step": 3723 + }, + { + "epoch": 0.7444091851777817, + "grad_norm": 2.0, + "learning_rate": 8.611080889359485e-06, + "loss": 1.0305, + "step": 3724 + }, + { + "epoch": 0.7446090802328778, + "grad_norm": 2.125, + "learning_rate": 8.610351929106944e-06, + "loss": 1.0055, + "step": 3725 + }, + { + "epoch": 0.7448089752879739, + "grad_norm": 2.203125, + "learning_rate": 8.609622808482956e-06, + "loss": 1.0343, + "step": 3726 + }, + { + "epoch": 0.7450088703430698, + "grad_norm": 2.140625, + "learning_rate": 8.608893527519908e-06, + "loss": 1.0539, + "step": 3727 + }, + { + "epoch": 0.7452087653981659, + "grad_norm": 2.125, + "learning_rate": 8.608164086250197e-06, + "loss": 1.0643, + "step": 3728 + }, + { + "epoch": 0.745408660453262, + "grad_norm": 1.953125, + "learning_rate": 8.607434484706221e-06, + "loss": 1.0237, + "step": 3729 + }, + { + "epoch": 0.7456085555083581, + "grad_norm": 2.1875, + "learning_rate": 8.60670472292039e-06, + "loss": 1.0754, + "step": 3730 + }, + { + "epoch": 0.7458084505634542, + "grad_norm": 2.140625, + "learning_rate": 8.605974800925121e-06, + "loss": 1.0677, + "step": 3731 + }, + { + "epoch": 0.7460083456185502, + "grad_norm": 1.9921875, + "learning_rate": 8.605244718752837e-06, + "loss": 0.9748, + "step": 3732 + }, + { + "epoch": 0.7462082406736463, + "grad_norm": 2.015625, + "learning_rate": 8.604514476435969e-06, + "loss": 0.9695, + "step": 3733 + }, + { + "epoch": 0.7464081357287424, + "grad_norm": 2.015625, + "learning_rate": 8.60378407400695e-06, + "loss": 0.9913, + "step": 3734 + }, + { + "epoch": 0.7466080307838385, + "grad_norm": 2.0625, + "learning_rate": 8.603053511498228e-06, + "loss": 1.0384, + "step": 3735 + }, + { + "epoch": 0.7468079258389345, + "grad_norm": 2.078125, + "learning_rate": 8.602322788942255e-06, + "loss": 0.9894, + "step": 3736 + }, + { + "epoch": 0.7470078208940306, + "grad_norm": 2.125, + "learning_rate": 8.601591906371487e-06, + "loss": 1.0653, + "step": 3737 + }, + { + "epoch": 0.7472077159491267, + "grad_norm": 2.09375, + "learning_rate": 8.600860863818392e-06, + "loss": 1.0099, + "step": 3738 + }, + { + "epoch": 0.7474076110042228, + "grad_norm": 2.109375, + "learning_rate": 8.600129661315443e-06, + "loss": 1.1134, + "step": 3739 + }, + { + "epoch": 0.7476075060593188, + "grad_norm": 2.046875, + "learning_rate": 8.599398298895117e-06, + "loss": 1.0463, + "step": 3740 + }, + { + "epoch": 0.7478074011144149, + "grad_norm": 2.0625, + "learning_rate": 8.598666776589904e-06, + "loss": 1.0142, + "step": 3741 + }, + { + "epoch": 0.748007296169511, + "grad_norm": 2.0, + "learning_rate": 8.597935094432298e-06, + "loss": 1.0762, + "step": 3742 + }, + { + "epoch": 0.7482071912246071, + "grad_norm": 2.09375, + "learning_rate": 8.597203252454798e-06, + "loss": 1.1143, + "step": 3743 + }, + { + "epoch": 0.7484070862797032, + "grad_norm": 2.125, + "learning_rate": 8.596471250689915e-06, + "loss": 1.078, + "step": 3744 + }, + { + "epoch": 0.7486069813347992, + "grad_norm": 2.25, + "learning_rate": 8.595739089170162e-06, + "loss": 1.0361, + "step": 3745 + }, + { + "epoch": 0.7488068763898953, + "grad_norm": 2.140625, + "learning_rate": 8.595006767928064e-06, + "loss": 1.0813, + "step": 3746 + }, + { + "epoch": 0.7490067714449914, + "grad_norm": 2.21875, + "learning_rate": 8.594274286996147e-06, + "loss": 1.1081, + "step": 3747 + }, + { + "epoch": 0.7492066665000875, + "grad_norm": 2.03125, + "learning_rate": 8.593541646406952e-06, + "loss": 1.0592, + "step": 3748 + }, + { + "epoch": 0.7494065615551835, + "grad_norm": 2.078125, + "learning_rate": 8.59280884619302e-06, + "loss": 0.9439, + "step": 3749 + }, + { + "epoch": 0.7496064566102796, + "grad_norm": 2.078125, + "learning_rate": 8.592075886386903e-06, + "loss": 1.0597, + "step": 3750 + }, + { + "epoch": 0.7498063516653757, + "grad_norm": 2.15625, + "learning_rate": 8.59134276702116e-06, + "loss": 0.9952, + "step": 3751 + }, + { + "epoch": 0.7500062467204718, + "grad_norm": 2.21875, + "learning_rate": 8.590609488128354e-06, + "loss": 0.9731, + "step": 3752 + }, + { + "epoch": 0.7502061417755679, + "grad_norm": 2.125, + "learning_rate": 8.589876049741058e-06, + "loss": 0.9752, + "step": 3753 + }, + { + "epoch": 0.7504060368306639, + "grad_norm": 2.078125, + "learning_rate": 8.589142451891849e-06, + "loss": 1.0571, + "step": 3754 + }, + { + "epoch": 0.75060593188576, + "grad_norm": 2.109375, + "learning_rate": 8.58840869461332e-06, + "loss": 1.0112, + "step": 3755 + }, + { + "epoch": 0.7508058269408561, + "grad_norm": 2.109375, + "learning_rate": 8.587674777938057e-06, + "loss": 0.9903, + "step": 3756 + }, + { + "epoch": 0.7510057219959522, + "grad_norm": 2.1875, + "learning_rate": 8.586940701898665e-06, + "loss": 1.1016, + "step": 3757 + }, + { + "epoch": 0.7512056170510482, + "grad_norm": 2.09375, + "learning_rate": 8.586206466527749e-06, + "loss": 0.9849, + "step": 3758 + }, + { + "epoch": 0.7514055121061443, + "grad_norm": 2.078125, + "learning_rate": 8.585472071857924e-06, + "loss": 1.0105, + "step": 3759 + }, + { + "epoch": 0.7516054071612404, + "grad_norm": 1.9765625, + "learning_rate": 8.584737517921815e-06, + "loss": 0.9755, + "step": 3760 + }, + { + "epoch": 0.7518053022163365, + "grad_norm": 2.109375, + "learning_rate": 8.584002804752046e-06, + "loss": 1.0461, + "step": 3761 + }, + { + "epoch": 0.7520051972714324, + "grad_norm": 2.171875, + "learning_rate": 8.583267932381257e-06, + "loss": 1.1102, + "step": 3762 + }, + { + "epoch": 0.7522050923265285, + "grad_norm": 2.109375, + "learning_rate": 8.582532900842088e-06, + "loss": 1.0866, + "step": 3763 + }, + { + "epoch": 0.7524049873816246, + "grad_norm": 2.03125, + "learning_rate": 8.58179771016719e-06, + "loss": 1.0478, + "step": 3764 + }, + { + "epoch": 0.7526048824367207, + "grad_norm": 1.9765625, + "learning_rate": 8.581062360389222e-06, + "loss": 0.9944, + "step": 3765 + }, + { + "epoch": 0.7528047774918168, + "grad_norm": 2.0, + "learning_rate": 8.580326851540844e-06, + "loss": 1.1228, + "step": 3766 + }, + { + "epoch": 0.7530046725469128, + "grad_norm": 2.203125, + "learning_rate": 8.579591183654731e-06, + "loss": 1.0448, + "step": 3767 + }, + { + "epoch": 0.7532045676020089, + "grad_norm": 2.28125, + "learning_rate": 8.578855356763559e-06, + "loss": 1.1004, + "step": 3768 + }, + { + "epoch": 0.753404462657105, + "grad_norm": 2.15625, + "learning_rate": 8.578119370900016e-06, + "loss": 1.0422, + "step": 3769 + }, + { + "epoch": 0.7536043577122011, + "grad_norm": 2.03125, + "learning_rate": 8.577383226096792e-06, + "loss": 0.9927, + "step": 3770 + }, + { + "epoch": 0.7538042527672971, + "grad_norm": 2.109375, + "learning_rate": 8.576646922386587e-06, + "loss": 1.0948, + "step": 3771 + }, + { + "epoch": 0.7540041478223932, + "grad_norm": 1.9765625, + "learning_rate": 8.575910459802107e-06, + "loss": 1.0204, + "step": 3772 + }, + { + "epoch": 0.7542040428774893, + "grad_norm": 2.109375, + "learning_rate": 8.575173838376067e-06, + "loss": 1.0898, + "step": 3773 + }, + { + "epoch": 0.7544039379325854, + "grad_norm": 1.9921875, + "learning_rate": 8.574437058141187e-06, + "loss": 0.9686, + "step": 3774 + }, + { + "epoch": 0.7546038329876815, + "grad_norm": 2.140625, + "learning_rate": 8.573700119130194e-06, + "loss": 1.0663, + "step": 3775 + }, + { + "epoch": 0.7548037280427775, + "grad_norm": 2.125, + "learning_rate": 8.572963021375825e-06, + "loss": 0.9369, + "step": 3776 + }, + { + "epoch": 0.7550036230978736, + "grad_norm": 2.046875, + "learning_rate": 8.572225764910818e-06, + "loss": 1.0403, + "step": 3777 + }, + { + "epoch": 0.7552035181529697, + "grad_norm": 2.15625, + "learning_rate": 8.571488349767925e-06, + "loss": 1.069, + "step": 3778 + }, + { + "epoch": 0.7554034132080658, + "grad_norm": 1.9765625, + "learning_rate": 8.570750775979901e-06, + "loss": 0.9747, + "step": 3779 + }, + { + "epoch": 0.7556033082631618, + "grad_norm": 2.09375, + "learning_rate": 8.57001304357951e-06, + "loss": 1.1118, + "step": 3780 + }, + { + "epoch": 0.7558032033182579, + "grad_norm": 2.109375, + "learning_rate": 8.56927515259952e-06, + "loss": 1.0418, + "step": 3781 + }, + { + "epoch": 0.756003098373354, + "grad_norm": 2.25, + "learning_rate": 8.568537103072707e-06, + "loss": 1.1046, + "step": 3782 + }, + { + "epoch": 0.7562029934284501, + "grad_norm": 2.078125, + "learning_rate": 8.56779889503186e-06, + "loss": 1.0775, + "step": 3783 + }, + { + "epoch": 0.7564028884835461, + "grad_norm": 2.140625, + "learning_rate": 8.567060528509767e-06, + "loss": 1.1559, + "step": 3784 + }, + { + "epoch": 0.7566027835386422, + "grad_norm": 2.125, + "learning_rate": 8.566322003539225e-06, + "loss": 1.0536, + "step": 3785 + }, + { + "epoch": 0.7568026785937383, + "grad_norm": 2.015625, + "learning_rate": 8.56558332015304e-06, + "loss": 1.0553, + "step": 3786 + }, + { + "epoch": 0.7570025736488344, + "grad_norm": 2.0, + "learning_rate": 8.564844478384027e-06, + "loss": 1.0333, + "step": 3787 + }, + { + "epoch": 0.7572024687039305, + "grad_norm": 2.015625, + "learning_rate": 8.564105478265e-06, + "loss": 0.9435, + "step": 3788 + }, + { + "epoch": 0.7574023637590265, + "grad_norm": 2.171875, + "learning_rate": 8.563366319828789e-06, + "loss": 1.1762, + "step": 3789 + }, + { + "epoch": 0.7576022588141226, + "grad_norm": 2.0625, + "learning_rate": 8.562627003108229e-06, + "loss": 1.0673, + "step": 3790 + }, + { + "epoch": 0.7578021538692187, + "grad_norm": 2.015625, + "learning_rate": 8.561887528136157e-06, + "loss": 0.942, + "step": 3791 + }, + { + "epoch": 0.7580020489243148, + "grad_norm": 2.125, + "learning_rate": 8.56114789494542e-06, + "loss": 1.0512, + "step": 3792 + }, + { + "epoch": 0.7582019439794108, + "grad_norm": 2.078125, + "learning_rate": 8.560408103568875e-06, + "loss": 1.0053, + "step": 3793 + }, + { + "epoch": 0.7584018390345069, + "grad_norm": 2.140625, + "learning_rate": 8.559668154039382e-06, + "loss": 1.0991, + "step": 3794 + }, + { + "epoch": 0.758601734089603, + "grad_norm": 2.0625, + "learning_rate": 8.55892804638981e-06, + "loss": 0.9974, + "step": 3795 + }, + { + "epoch": 0.7588016291446991, + "grad_norm": 1.9921875, + "learning_rate": 8.558187780653033e-06, + "loss": 0.9253, + "step": 3796 + }, + { + "epoch": 0.759001524199795, + "grad_norm": 2.078125, + "learning_rate": 8.557447356861937e-06, + "loss": 1.0197, + "step": 3797 + }, + { + "epoch": 0.7592014192548912, + "grad_norm": 2.03125, + "learning_rate": 8.556706775049408e-06, + "loss": 1.0748, + "step": 3798 + }, + { + "epoch": 0.7594013143099873, + "grad_norm": 2.09375, + "learning_rate": 8.555966035248344e-06, + "loss": 1.0473, + "step": 3799 + }, + { + "epoch": 0.7596012093650834, + "grad_norm": 2.203125, + "learning_rate": 8.555225137491649e-06, + "loss": 1.1592, + "step": 3800 + }, + { + "epoch": 0.7598011044201795, + "grad_norm": 2.28125, + "learning_rate": 8.554484081812233e-06, + "loss": 1.1061, + "step": 3801 + }, + { + "epoch": 0.7600009994752754, + "grad_norm": 2.171875, + "learning_rate": 8.553742868243014e-06, + "loss": 1.1416, + "step": 3802 + }, + { + "epoch": 0.7602008945303715, + "grad_norm": 2.0625, + "learning_rate": 8.553001496816915e-06, + "loss": 1.0807, + "step": 3803 + }, + { + "epoch": 0.7604007895854676, + "grad_norm": 2.03125, + "learning_rate": 8.552259967566871e-06, + "loss": 1.1007, + "step": 3804 + }, + { + "epoch": 0.7606006846405637, + "grad_norm": 2.078125, + "learning_rate": 8.551518280525816e-06, + "loss": 0.9757, + "step": 3805 + }, + { + "epoch": 0.7608005796956597, + "grad_norm": 2.046875, + "learning_rate": 8.550776435726701e-06, + "loss": 1.0683, + "step": 3806 + }, + { + "epoch": 0.7610004747507558, + "grad_norm": 2.125, + "learning_rate": 8.550034433202474e-06, + "loss": 1.1383, + "step": 3807 + }, + { + "epoch": 0.7612003698058519, + "grad_norm": 2.234375, + "learning_rate": 8.549292272986098e-06, + "loss": 1.1601, + "step": 3808 + }, + { + "epoch": 0.761400264860948, + "grad_norm": 1.9921875, + "learning_rate": 8.548549955110538e-06, + "loss": 0.9979, + "step": 3809 + }, + { + "epoch": 0.7616001599160441, + "grad_norm": 2.125, + "learning_rate": 8.547807479608768e-06, + "loss": 1.0703, + "step": 3810 + }, + { + "epoch": 0.7618000549711401, + "grad_norm": 1.96875, + "learning_rate": 8.547064846513768e-06, + "loss": 1.0299, + "step": 3811 + }, + { + "epoch": 0.7619999500262362, + "grad_norm": 2.0625, + "learning_rate": 8.546322055858526e-06, + "loss": 0.9472, + "step": 3812 + }, + { + "epoch": 0.7621998450813323, + "grad_norm": 2.125, + "learning_rate": 8.545579107676039e-06, + "loss": 1.095, + "step": 3813 + }, + { + "epoch": 0.7623997401364284, + "grad_norm": 2.078125, + "learning_rate": 8.544836001999306e-06, + "loss": 1.1145, + "step": 3814 + }, + { + "epoch": 0.7625996351915244, + "grad_norm": 2.125, + "learning_rate": 8.544092738861336e-06, + "loss": 1.0481, + "step": 3815 + }, + { + "epoch": 0.7627995302466205, + "grad_norm": 2.25, + "learning_rate": 8.543349318295145e-06, + "loss": 1.0895, + "step": 3816 + }, + { + "epoch": 0.7629994253017166, + "grad_norm": 2.078125, + "learning_rate": 8.542605740333754e-06, + "loss": 1.1261, + "step": 3817 + }, + { + "epoch": 0.7631993203568127, + "grad_norm": 2.140625, + "learning_rate": 8.541862005010198e-06, + "loss": 0.9861, + "step": 3818 + }, + { + "epoch": 0.7633992154119087, + "grad_norm": 2.03125, + "learning_rate": 8.541118112357507e-06, + "loss": 0.97, + "step": 3819 + }, + { + "epoch": 0.7635991104670048, + "grad_norm": 2.015625, + "learning_rate": 8.54037406240873e-06, + "loss": 1.0524, + "step": 3820 + }, + { + "epoch": 0.7637990055221009, + "grad_norm": 2.078125, + "learning_rate": 8.539629855196913e-06, + "loss": 1.0944, + "step": 3821 + }, + { + "epoch": 0.763998900577197, + "grad_norm": 2.03125, + "learning_rate": 8.538885490755117e-06, + "loss": 0.9729, + "step": 3822 + }, + { + "epoch": 0.7641987956322931, + "grad_norm": 2.0625, + "learning_rate": 8.538140969116406e-06, + "loss": 1.0453, + "step": 3823 + }, + { + "epoch": 0.7643986906873891, + "grad_norm": 1.9765625, + "learning_rate": 8.537396290313849e-06, + "loss": 1.0251, + "step": 3824 + }, + { + "epoch": 0.7645985857424852, + "grad_norm": 2.09375, + "learning_rate": 8.53665145438053e-06, + "loss": 1.1079, + "step": 3825 + }, + { + "epoch": 0.7647984807975813, + "grad_norm": 2.09375, + "learning_rate": 8.535906461349528e-06, + "loss": 1.0166, + "step": 3826 + }, + { + "epoch": 0.7649983758526774, + "grad_norm": 2.03125, + "learning_rate": 8.53516131125394e-06, + "loss": 1.0555, + "step": 3827 + }, + { + "epoch": 0.7651982709077734, + "grad_norm": 2.09375, + "learning_rate": 8.534416004126863e-06, + "loss": 1.1303, + "step": 3828 + }, + { + "epoch": 0.7653981659628695, + "grad_norm": 2.078125, + "learning_rate": 8.533670540001406e-06, + "loss": 1.0398, + "step": 3829 + }, + { + "epoch": 0.7655980610179656, + "grad_norm": 1.9765625, + "learning_rate": 8.53292491891068e-06, + "loss": 1.0753, + "step": 3830 + }, + { + "epoch": 0.7657979560730617, + "grad_norm": 2.03125, + "learning_rate": 8.532179140887807e-06, + "loss": 0.973, + "step": 3831 + }, + { + "epoch": 0.7659978511281578, + "grad_norm": 2.046875, + "learning_rate": 8.531433205965913e-06, + "loss": 0.9727, + "step": 3832 + }, + { + "epoch": 0.7661977461832538, + "grad_norm": 2.203125, + "learning_rate": 8.530687114178134e-06, + "loss": 1.0441, + "step": 3833 + }, + { + "epoch": 0.7663976412383499, + "grad_norm": 2.109375, + "learning_rate": 8.529940865557611e-06, + "loss": 0.9827, + "step": 3834 + }, + { + "epoch": 0.766597536293446, + "grad_norm": 1.984375, + "learning_rate": 8.52919446013749e-06, + "loss": 1.0333, + "step": 3835 + }, + { + "epoch": 0.7667974313485421, + "grad_norm": 2.28125, + "learning_rate": 8.528447897950932e-06, + "loss": 1.0204, + "step": 3836 + }, + { + "epoch": 0.766997326403638, + "grad_norm": 2.1875, + "learning_rate": 8.527701179031092e-06, + "loss": 1.1111, + "step": 3837 + }, + { + "epoch": 0.7671972214587341, + "grad_norm": 2.0, + "learning_rate": 8.526954303411141e-06, + "loss": 1.0995, + "step": 3838 + }, + { + "epoch": 0.7673971165138302, + "grad_norm": 1.9140625, + "learning_rate": 8.526207271124258e-06, + "loss": 0.9218, + "step": 3839 + }, + { + "epoch": 0.7675970115689263, + "grad_norm": 2.140625, + "learning_rate": 8.525460082203626e-06, + "loss": 1.113, + "step": 3840 + }, + { + "epoch": 0.7677969066240223, + "grad_norm": 1.9140625, + "learning_rate": 8.524712736682433e-06, + "loss": 0.9998, + "step": 3841 + }, + { + "epoch": 0.7679968016791184, + "grad_norm": 2.171875, + "learning_rate": 8.523965234593877e-06, + "loss": 1.0899, + "step": 3842 + }, + { + "epoch": 0.7681966967342145, + "grad_norm": 2.015625, + "learning_rate": 8.523217575971159e-06, + "loss": 1.0126, + "step": 3843 + }, + { + "epoch": 0.7683965917893106, + "grad_norm": 1.984375, + "learning_rate": 8.522469760847496e-06, + "loss": 0.9371, + "step": 3844 + }, + { + "epoch": 0.7685964868444067, + "grad_norm": 2.140625, + "learning_rate": 8.5217217892561e-06, + "loss": 1.0394, + "step": 3845 + }, + { + "epoch": 0.7687963818995027, + "grad_norm": 2.109375, + "learning_rate": 8.520973661230198e-06, + "loss": 1.0531, + "step": 3846 + }, + { + "epoch": 0.7689962769545988, + "grad_norm": 2.0625, + "learning_rate": 8.520225376803024e-06, + "loss": 1.0576, + "step": 3847 + }, + { + "epoch": 0.7691961720096949, + "grad_norm": 2.171875, + "learning_rate": 8.519476936007814e-06, + "loss": 1.0065, + "step": 3848 + }, + { + "epoch": 0.769396067064791, + "grad_norm": 2.296875, + "learning_rate": 8.518728338877814e-06, + "loss": 1.0778, + "step": 3849 + }, + { + "epoch": 0.769595962119887, + "grad_norm": 2.0, + "learning_rate": 8.51797958544628e-06, + "loss": 0.9625, + "step": 3850 + }, + { + "epoch": 0.7697958571749831, + "grad_norm": 1.984375, + "learning_rate": 8.517230675746464e-06, + "loss": 1.035, + "step": 3851 + }, + { + "epoch": 0.7699957522300792, + "grad_norm": 2.125, + "learning_rate": 8.51648160981164e-06, + "loss": 1.0866, + "step": 3852 + }, + { + "epoch": 0.7701956472851753, + "grad_norm": 2.0625, + "learning_rate": 8.515732387675077e-06, + "loss": 1.1333, + "step": 3853 + }, + { + "epoch": 0.7703955423402714, + "grad_norm": 2.125, + "learning_rate": 8.514983009370057e-06, + "loss": 1.0626, + "step": 3854 + }, + { + "epoch": 0.7705954373953674, + "grad_norm": 2.09375, + "learning_rate": 8.514233474929867e-06, + "loss": 1.0494, + "step": 3855 + }, + { + "epoch": 0.7707953324504635, + "grad_norm": 2.15625, + "learning_rate": 8.5134837843878e-06, + "loss": 1.1193, + "step": 3856 + }, + { + "epoch": 0.7709952275055596, + "grad_norm": 2.03125, + "learning_rate": 8.512733937777162e-06, + "loss": 0.9475, + "step": 3857 + }, + { + "epoch": 0.7711951225606557, + "grad_norm": 2.0, + "learning_rate": 8.511983935131256e-06, + "loss": 1.052, + "step": 3858 + }, + { + "epoch": 0.7713950176157517, + "grad_norm": 2.1875, + "learning_rate": 8.511233776483398e-06, + "loss": 1.0843, + "step": 3859 + }, + { + "epoch": 0.7715949126708478, + "grad_norm": 2.109375, + "learning_rate": 8.510483461866911e-06, + "loss": 1.0252, + "step": 3860 + }, + { + "epoch": 0.7717948077259439, + "grad_norm": 2.15625, + "learning_rate": 8.509732991315125e-06, + "loss": 1.0261, + "step": 3861 + }, + { + "epoch": 0.77199470278104, + "grad_norm": 2.1875, + "learning_rate": 8.508982364861373e-06, + "loss": 1.0702, + "step": 3862 + }, + { + "epoch": 0.772194597836136, + "grad_norm": 2.046875, + "learning_rate": 8.508231582538999e-06, + "loss": 1.0123, + "step": 3863 + }, + { + "epoch": 0.7723944928912321, + "grad_norm": 2.125, + "learning_rate": 8.507480644381355e-06, + "loss": 1.0134, + "step": 3864 + }, + { + "epoch": 0.7725943879463282, + "grad_norm": 2.125, + "learning_rate": 8.506729550421791e-06, + "loss": 1.1005, + "step": 3865 + }, + { + "epoch": 0.7727942830014243, + "grad_norm": 2.09375, + "learning_rate": 8.505978300693678e-06, + "loss": 1.0609, + "step": 3866 + }, + { + "epoch": 0.7729941780565204, + "grad_norm": 2.046875, + "learning_rate": 8.505226895230383e-06, + "loss": 1.0536, + "step": 3867 + }, + { + "epoch": 0.7731940731116164, + "grad_norm": 2.015625, + "learning_rate": 8.504475334065283e-06, + "loss": 1.0487, + "step": 3868 + }, + { + "epoch": 0.7733939681667125, + "grad_norm": 2.296875, + "learning_rate": 8.503723617231764e-06, + "loss": 1.0884, + "step": 3869 + }, + { + "epoch": 0.7735938632218086, + "grad_norm": 2.09375, + "learning_rate": 8.502971744763216e-06, + "loss": 1.0808, + "step": 3870 + }, + { + "epoch": 0.7737937582769047, + "grad_norm": 2.15625, + "learning_rate": 8.502219716693037e-06, + "loss": 1.0959, + "step": 3871 + }, + { + "epoch": 0.7739936533320007, + "grad_norm": 2.125, + "learning_rate": 8.501467533054632e-06, + "loss": 1.1245, + "step": 3872 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 1.8984375, + "learning_rate": 8.500715193881412e-06, + "loss": 0.8819, + "step": 3873 + }, + { + "epoch": 0.7743934434421929, + "grad_norm": 2.09375, + "learning_rate": 8.499962699206798e-06, + "loss": 0.9738, + "step": 3874 + }, + { + "epoch": 0.774593338497289, + "grad_norm": 2.109375, + "learning_rate": 8.499210049064216e-06, + "loss": 1.1011, + "step": 3875 + }, + { + "epoch": 0.774793233552385, + "grad_norm": 2.03125, + "learning_rate": 8.498457243487096e-06, + "loss": 1.0569, + "step": 3876 + }, + { + "epoch": 0.774993128607481, + "grad_norm": 2.15625, + "learning_rate": 8.497704282508879e-06, + "loss": 1.0651, + "step": 3877 + }, + { + "epoch": 0.7751930236625771, + "grad_norm": 2.015625, + "learning_rate": 8.496951166163012e-06, + "loss": 1.0435, + "step": 3878 + }, + { + "epoch": 0.7753929187176732, + "grad_norm": 2.109375, + "learning_rate": 8.496197894482947e-06, + "loss": 0.9878, + "step": 3879 + }, + { + "epoch": 0.7755928137727693, + "grad_norm": 2.109375, + "learning_rate": 8.495444467502144e-06, + "loss": 1.1107, + "step": 3880 + }, + { + "epoch": 0.7757927088278653, + "grad_norm": 2.0625, + "learning_rate": 8.494690885254073e-06, + "loss": 1.0232, + "step": 3881 + }, + { + "epoch": 0.7759926038829614, + "grad_norm": 2.109375, + "learning_rate": 8.493937147772204e-06, + "loss": 1.058, + "step": 3882 + }, + { + "epoch": 0.7761924989380575, + "grad_norm": 2.203125, + "learning_rate": 8.493183255090022e-06, + "loss": 1.043, + "step": 3883 + }, + { + "epoch": 0.7763923939931536, + "grad_norm": 1.9765625, + "learning_rate": 8.492429207241013e-06, + "loss": 0.9438, + "step": 3884 + }, + { + "epoch": 0.7765922890482496, + "grad_norm": 2.03125, + "learning_rate": 8.49167500425867e-06, + "loss": 1.0213, + "step": 3885 + }, + { + "epoch": 0.7767921841033457, + "grad_norm": 2.046875, + "learning_rate": 8.490920646176497e-06, + "loss": 1.0034, + "step": 3886 + }, + { + "epoch": 0.7769920791584418, + "grad_norm": 2.125, + "learning_rate": 8.490166133028002e-06, + "loss": 1.0945, + "step": 3887 + }, + { + "epoch": 0.7771919742135379, + "grad_norm": 2.203125, + "learning_rate": 8.4894114648467e-06, + "loss": 0.9811, + "step": 3888 + }, + { + "epoch": 0.777391869268634, + "grad_norm": 2.109375, + "learning_rate": 8.488656641666113e-06, + "loss": 1.1282, + "step": 3889 + }, + { + "epoch": 0.77759176432373, + "grad_norm": 2.0625, + "learning_rate": 8.487901663519771e-06, + "loss": 0.9224, + "step": 3890 + }, + { + "epoch": 0.7777916593788261, + "grad_norm": 2.078125, + "learning_rate": 8.48714653044121e-06, + "loss": 1.0431, + "step": 3891 + }, + { + "epoch": 0.7779915544339222, + "grad_norm": 2.015625, + "learning_rate": 8.486391242463972e-06, + "loss": 1.0122, + "step": 3892 + }, + { + "epoch": 0.7781914494890183, + "grad_norm": 2.21875, + "learning_rate": 8.485635799621607e-06, + "loss": 1.112, + "step": 3893 + }, + { + "epoch": 0.7783913445441143, + "grad_norm": 2.171875, + "learning_rate": 8.484880201947674e-06, + "loss": 0.95, + "step": 3894 + }, + { + "epoch": 0.7785912395992104, + "grad_norm": 2.21875, + "learning_rate": 8.484124449475733e-06, + "loss": 1.0991, + "step": 3895 + }, + { + "epoch": 0.7787911346543065, + "grad_norm": 2.078125, + "learning_rate": 8.483368542239356e-06, + "loss": 1.0603, + "step": 3896 + }, + { + "epoch": 0.7789910297094026, + "grad_norm": 2.09375, + "learning_rate": 8.482612480272121e-06, + "loss": 1.0628, + "step": 3897 + }, + { + "epoch": 0.7791909247644987, + "grad_norm": 2.03125, + "learning_rate": 8.481856263607611e-06, + "loss": 1.004, + "step": 3898 + }, + { + "epoch": 0.7793908198195947, + "grad_norm": 2.25, + "learning_rate": 8.481099892279418e-06, + "loss": 1.0244, + "step": 3899 + }, + { + "epoch": 0.7795907148746908, + "grad_norm": 2.109375, + "learning_rate": 8.480343366321138e-06, + "loss": 1.0226, + "step": 3900 + }, + { + "epoch": 0.7797906099297869, + "grad_norm": 2.140625, + "learning_rate": 8.47958668576638e-06, + "loss": 0.9943, + "step": 3901 + }, + { + "epoch": 0.779990504984883, + "grad_norm": 2.203125, + "learning_rate": 8.478829850648752e-06, + "loss": 1.0683, + "step": 3902 + }, + { + "epoch": 0.780190400039979, + "grad_norm": 2.125, + "learning_rate": 8.478072861001872e-06, + "loss": 1.0354, + "step": 3903 + }, + { + "epoch": 0.7803902950950751, + "grad_norm": 2.15625, + "learning_rate": 8.477315716859369e-06, + "loss": 1.1009, + "step": 3904 + }, + { + "epoch": 0.7805901901501712, + "grad_norm": 2.140625, + "learning_rate": 8.476558418254872e-06, + "loss": 1.1659, + "step": 3905 + }, + { + "epoch": 0.7807900852052673, + "grad_norm": 2.109375, + "learning_rate": 8.475800965222024e-06, + "loss": 1.0932, + "step": 3906 + }, + { + "epoch": 0.7809899802603633, + "grad_norm": 2.0, + "learning_rate": 8.475043357794466e-06, + "loss": 0.9968, + "step": 3907 + }, + { + "epoch": 0.7811898753154594, + "grad_norm": 2.015625, + "learning_rate": 8.474285596005853e-06, + "loss": 1.0099, + "step": 3908 + }, + { + "epoch": 0.7813897703705555, + "grad_norm": 2.25, + "learning_rate": 8.473527679889847e-06, + "loss": 1.0626, + "step": 3909 + }, + { + "epoch": 0.7815896654256516, + "grad_norm": 2.109375, + "learning_rate": 8.472769609480109e-06, + "loss": 1.1036, + "step": 3910 + }, + { + "epoch": 0.7817895604807477, + "grad_norm": 2.046875, + "learning_rate": 8.472011384810317e-06, + "loss": 1.0819, + "step": 3911 + }, + { + "epoch": 0.7819894555358436, + "grad_norm": 2.09375, + "learning_rate": 8.471253005914153e-06, + "loss": 1.0333, + "step": 3912 + }, + { + "epoch": 0.7821893505909397, + "grad_norm": 2.046875, + "learning_rate": 8.470494472825299e-06, + "loss": 1.0854, + "step": 3913 + }, + { + "epoch": 0.7823892456460358, + "grad_norm": 2.015625, + "learning_rate": 8.469735785577451e-06, + "loss": 0.9313, + "step": 3914 + }, + { + "epoch": 0.782589140701132, + "grad_norm": 2.09375, + "learning_rate": 8.46897694420431e-06, + "loss": 1.0547, + "step": 3915 + }, + { + "epoch": 0.7827890357562279, + "grad_norm": 2.046875, + "learning_rate": 8.468217948739583e-06, + "loss": 1.0774, + "step": 3916 + }, + { + "epoch": 0.782988930811324, + "grad_norm": 1.921875, + "learning_rate": 8.467458799216987e-06, + "loss": 1.0402, + "step": 3917 + }, + { + "epoch": 0.7831888258664201, + "grad_norm": 2.109375, + "learning_rate": 8.46669949567024e-06, + "loss": 0.9801, + "step": 3918 + }, + { + "epoch": 0.7833887209215162, + "grad_norm": 2.171875, + "learning_rate": 8.465940038133072e-06, + "loss": 1.0642, + "step": 3919 + }, + { + "epoch": 0.7835886159766122, + "grad_norm": 2.0625, + "learning_rate": 8.465180426639218e-06, + "loss": 1.0513, + "step": 3920 + }, + { + "epoch": 0.7837885110317083, + "grad_norm": 2.1875, + "learning_rate": 8.464420661222419e-06, + "loss": 1.0154, + "step": 3921 + }, + { + "epoch": 0.7839884060868044, + "grad_norm": 2.171875, + "learning_rate": 8.463660741916425e-06, + "loss": 1.0165, + "step": 3922 + }, + { + "epoch": 0.7841883011419005, + "grad_norm": 2.109375, + "learning_rate": 8.46290066875499e-06, + "loss": 1.0002, + "step": 3923 + }, + { + "epoch": 0.7843881961969966, + "grad_norm": 2.078125, + "learning_rate": 8.462140441771878e-06, + "loss": 1.0972, + "step": 3924 + }, + { + "epoch": 0.7845880912520926, + "grad_norm": 2.078125, + "learning_rate": 8.461380061000857e-06, + "loss": 1.0545, + "step": 3925 + }, + { + "epoch": 0.7847879863071887, + "grad_norm": 2.0625, + "learning_rate": 8.460619526475704e-06, + "loss": 1.0847, + "step": 3926 + }, + { + "epoch": 0.7849878813622848, + "grad_norm": 2.078125, + "learning_rate": 8.4598588382302e-06, + "loss": 0.9939, + "step": 3927 + }, + { + "epoch": 0.7851877764173809, + "grad_norm": 2.28125, + "learning_rate": 8.459097996298137e-06, + "loss": 1.1484, + "step": 3928 + }, + { + "epoch": 0.7853876714724769, + "grad_norm": 2.0625, + "learning_rate": 8.45833700071331e-06, + "loss": 1.029, + "step": 3929 + }, + { + "epoch": 0.785587566527573, + "grad_norm": 2.0625, + "learning_rate": 8.457575851509523e-06, + "loss": 0.9141, + "step": 3930 + }, + { + "epoch": 0.7857874615826691, + "grad_norm": 2.125, + "learning_rate": 8.456814548720584e-06, + "loss": 1.0615, + "step": 3931 + }, + { + "epoch": 0.7859873566377652, + "grad_norm": 1.9765625, + "learning_rate": 8.456053092380313e-06, + "loss": 0.8921, + "step": 3932 + }, + { + "epoch": 0.7861872516928613, + "grad_norm": 2.171875, + "learning_rate": 8.455291482522536e-06, + "loss": 1.1657, + "step": 3933 + }, + { + "epoch": 0.7863871467479573, + "grad_norm": 2.015625, + "learning_rate": 8.454529719181077e-06, + "loss": 1.0102, + "step": 3934 + }, + { + "epoch": 0.7865870418030534, + "grad_norm": 2.09375, + "learning_rate": 8.453767802389776e-06, + "loss": 1.0866, + "step": 3935 + }, + { + "epoch": 0.7867869368581495, + "grad_norm": 2.140625, + "learning_rate": 8.453005732182482e-06, + "loss": 1.0282, + "step": 3936 + }, + { + "epoch": 0.7869868319132456, + "grad_norm": 2.03125, + "learning_rate": 8.452243508593038e-06, + "loss": 1.1174, + "step": 3937 + }, + { + "epoch": 0.7871867269683416, + "grad_norm": 2.109375, + "learning_rate": 8.451481131655308e-06, + "loss": 0.9955, + "step": 3938 + }, + { + "epoch": 0.7873866220234377, + "grad_norm": 2.046875, + "learning_rate": 8.450718601403155e-06, + "loss": 0.9518, + "step": 3939 + }, + { + "epoch": 0.7875865170785338, + "grad_norm": 2.078125, + "learning_rate": 8.449955917870449e-06, + "loss": 1.1177, + "step": 3940 + }, + { + "epoch": 0.7877864121336299, + "grad_norm": 2.125, + "learning_rate": 8.449193081091071e-06, + "loss": 1.1051, + "step": 3941 + }, + { + "epoch": 0.7879863071887259, + "grad_norm": 2.015625, + "learning_rate": 8.448430091098904e-06, + "loss": 1.0499, + "step": 3942 + }, + { + "epoch": 0.788186202243822, + "grad_norm": 2.03125, + "learning_rate": 8.447666947927842e-06, + "loss": 1.0631, + "step": 3943 + }, + { + "epoch": 0.7883860972989181, + "grad_norm": 2.140625, + "learning_rate": 8.446903651611782e-06, + "loss": 1.0635, + "step": 3944 + }, + { + "epoch": 0.7885859923540142, + "grad_norm": 2.09375, + "learning_rate": 8.446140202184628e-06, + "loss": 0.8986, + "step": 3945 + }, + { + "epoch": 0.7887858874091103, + "grad_norm": 2.0625, + "learning_rate": 8.445376599680295e-06, + "loss": 1.0746, + "step": 3946 + }, + { + "epoch": 0.7889857824642063, + "grad_norm": 1.96875, + "learning_rate": 8.444612844132703e-06, + "loss": 0.9498, + "step": 3947 + }, + { + "epoch": 0.7891856775193024, + "grad_norm": 2.125, + "learning_rate": 8.443848935575776e-06, + "loss": 1.0482, + "step": 3948 + }, + { + "epoch": 0.7893855725743985, + "grad_norm": 2.109375, + "learning_rate": 8.443084874043446e-06, + "loss": 1.0781, + "step": 3949 + }, + { + "epoch": 0.7895854676294946, + "grad_norm": 2.234375, + "learning_rate": 8.442320659569654e-06, + "loss": 1.1076, + "step": 3950 + }, + { + "epoch": 0.7897853626845905, + "grad_norm": 2.0, + "learning_rate": 8.441556292188347e-06, + "loss": 0.984, + "step": 3951 + }, + { + "epoch": 0.7899852577396866, + "grad_norm": 2.125, + "learning_rate": 8.440791771933474e-06, + "loss": 1.0832, + "step": 3952 + }, + { + "epoch": 0.7901851527947827, + "grad_norm": 2.09375, + "learning_rate": 8.440027098839002e-06, + "loss": 0.9624, + "step": 3953 + }, + { + "epoch": 0.7903850478498788, + "grad_norm": 2.078125, + "learning_rate": 8.439262272938891e-06, + "loss": 1.1171, + "step": 3954 + }, + { + "epoch": 0.7905849429049749, + "grad_norm": 2.296875, + "learning_rate": 8.438497294267117e-06, + "loss": 1.1415, + "step": 3955 + }, + { + "epoch": 0.7907848379600709, + "grad_norm": 2.09375, + "learning_rate": 8.43773216285766e-06, + "loss": 1.0359, + "step": 3956 + }, + { + "epoch": 0.790984733015167, + "grad_norm": 2.453125, + "learning_rate": 8.43696687874451e-06, + "loss": 1.0812, + "step": 3957 + }, + { + "epoch": 0.7911846280702631, + "grad_norm": 2.015625, + "learning_rate": 8.436201441961658e-06, + "loss": 1.0503, + "step": 3958 + }, + { + "epoch": 0.7913845231253592, + "grad_norm": 2.21875, + "learning_rate": 8.435435852543103e-06, + "loss": 1.1193, + "step": 3959 + }, + { + "epoch": 0.7915844181804552, + "grad_norm": 1.9296875, + "learning_rate": 8.434670110522855e-06, + "loss": 0.9925, + "step": 3960 + }, + { + "epoch": 0.7917843132355513, + "grad_norm": 2.015625, + "learning_rate": 8.433904215934929e-06, + "loss": 1.1037, + "step": 3961 + }, + { + "epoch": 0.7919842082906474, + "grad_norm": 2.0625, + "learning_rate": 8.433138168813344e-06, + "loss": 0.9896, + "step": 3962 + }, + { + "epoch": 0.7921841033457435, + "grad_norm": 2.0625, + "learning_rate": 8.432371969192127e-06, + "loss": 1.1006, + "step": 3963 + }, + { + "epoch": 0.7923839984008395, + "grad_norm": 2.0, + "learning_rate": 8.431605617105314e-06, + "loss": 1.0182, + "step": 3964 + }, + { + "epoch": 0.7925838934559356, + "grad_norm": 2.125, + "learning_rate": 8.430839112586947e-06, + "loss": 1.0864, + "step": 3965 + }, + { + "epoch": 0.7927837885110317, + "grad_norm": 2.015625, + "learning_rate": 8.430072455671072e-06, + "loss": 1.0649, + "step": 3966 + }, + { + "epoch": 0.7929836835661278, + "grad_norm": 2.140625, + "learning_rate": 8.429305646391746e-06, + "loss": 1.0821, + "step": 3967 + }, + { + "epoch": 0.7931835786212239, + "grad_norm": 2.0625, + "learning_rate": 8.428538684783027e-06, + "loss": 1.0131, + "step": 3968 + }, + { + "epoch": 0.7933834736763199, + "grad_norm": 2.078125, + "learning_rate": 8.427771570878988e-06, + "loss": 1.1314, + "step": 3969 + }, + { + "epoch": 0.793583368731416, + "grad_norm": 2.25, + "learning_rate": 8.427004304713701e-06, + "loss": 1.1188, + "step": 3970 + }, + { + "epoch": 0.7937832637865121, + "grad_norm": 2.015625, + "learning_rate": 8.42623688632125e-06, + "loss": 0.9961, + "step": 3971 + }, + { + "epoch": 0.7939831588416082, + "grad_norm": 2.140625, + "learning_rate": 8.425469315735722e-06, + "loss": 1.0934, + "step": 3972 + }, + { + "epoch": 0.7941830538967042, + "grad_norm": 2.015625, + "learning_rate": 8.424701592991212e-06, + "loss": 0.9757, + "step": 3973 + }, + { + "epoch": 0.7943829489518003, + "grad_norm": 2.125, + "learning_rate": 8.423933718121823e-06, + "loss": 1.0581, + "step": 3974 + }, + { + "epoch": 0.7945828440068964, + "grad_norm": 2.015625, + "learning_rate": 8.423165691161665e-06, + "loss": 1.033, + "step": 3975 + }, + { + "epoch": 0.7947827390619925, + "grad_norm": 2.125, + "learning_rate": 8.422397512144854e-06, + "loss": 1.0961, + "step": 3976 + }, + { + "epoch": 0.7949826341170886, + "grad_norm": 2.25, + "learning_rate": 8.421629181105509e-06, + "loss": 1.08, + "step": 3977 + }, + { + "epoch": 0.7951825291721846, + "grad_norm": 2.109375, + "learning_rate": 8.42086069807776e-06, + "loss": 1.0658, + "step": 3978 + }, + { + "epoch": 0.7953824242272807, + "grad_norm": 1.9921875, + "learning_rate": 8.420092063095746e-06, + "loss": 0.9352, + "step": 3979 + }, + { + "epoch": 0.7955823192823768, + "grad_norm": 2.171875, + "learning_rate": 8.419323276193607e-06, + "loss": 1.0642, + "step": 3980 + }, + { + "epoch": 0.7957822143374729, + "grad_norm": 2.109375, + "learning_rate": 8.418554337405493e-06, + "loss": 1.0289, + "step": 3981 + }, + { + "epoch": 0.7959821093925689, + "grad_norm": 2.28125, + "learning_rate": 8.417785246765561e-06, + "loss": 1.1825, + "step": 3982 + }, + { + "epoch": 0.796182004447665, + "grad_norm": 1.9765625, + "learning_rate": 8.417016004307974e-06, + "loss": 0.9411, + "step": 3983 + }, + { + "epoch": 0.7963818995027611, + "grad_norm": 2.078125, + "learning_rate": 8.4162466100669e-06, + "loss": 1.015, + "step": 3984 + }, + { + "epoch": 0.7965817945578572, + "grad_norm": 1.9765625, + "learning_rate": 8.415477064076518e-06, + "loss": 1.0249, + "step": 3985 + }, + { + "epoch": 0.7967816896129531, + "grad_norm": 2.1875, + "learning_rate": 8.414707366371006e-06, + "loss": 1.1011, + "step": 3986 + }, + { + "epoch": 0.7969815846680492, + "grad_norm": 2.171875, + "learning_rate": 8.41393751698456e-06, + "loss": 1.15, + "step": 3987 + }, + { + "epoch": 0.7971814797231453, + "grad_norm": 2.390625, + "learning_rate": 8.413167515951374e-06, + "loss": 1.0764, + "step": 3988 + }, + { + "epoch": 0.7973813747782414, + "grad_norm": 2.21875, + "learning_rate": 8.412397363305653e-06, + "loss": 1.0558, + "step": 3989 + }, + { + "epoch": 0.7975812698333375, + "grad_norm": 2.140625, + "learning_rate": 8.411627059081603e-06, + "loss": 1.0267, + "step": 3990 + }, + { + "epoch": 0.7977811648884335, + "grad_norm": 2.09375, + "learning_rate": 8.410856603313446e-06, + "loss": 1.0978, + "step": 3991 + }, + { + "epoch": 0.7979810599435296, + "grad_norm": 2.03125, + "learning_rate": 8.410085996035402e-06, + "loss": 1.0463, + "step": 3992 + }, + { + "epoch": 0.7981809549986257, + "grad_norm": 2.015625, + "learning_rate": 8.409315237281701e-06, + "loss": 1.0431, + "step": 3993 + }, + { + "epoch": 0.7983808500537218, + "grad_norm": 2.265625, + "learning_rate": 8.408544327086583e-06, + "loss": 1.2017, + "step": 3994 + }, + { + "epoch": 0.7985807451088178, + "grad_norm": 2.125, + "learning_rate": 8.407773265484289e-06, + "loss": 1.1155, + "step": 3995 + }, + { + "epoch": 0.7987806401639139, + "grad_norm": 1.953125, + "learning_rate": 8.40700205250907e-06, + "loss": 1.0254, + "step": 3996 + }, + { + "epoch": 0.79898053521901, + "grad_norm": 2.046875, + "learning_rate": 8.406230688195184e-06, + "loss": 0.9988, + "step": 3997 + }, + { + "epoch": 0.7991804302741061, + "grad_norm": 1.9453125, + "learning_rate": 8.405459172576895e-06, + "loss": 0.9203, + "step": 3998 + }, + { + "epoch": 0.7993803253292022, + "grad_norm": 2.03125, + "learning_rate": 8.404687505688474e-06, + "loss": 0.9984, + "step": 3999 + }, + { + "epoch": 0.7995802203842982, + "grad_norm": 2.234375, + "learning_rate": 8.403915687564198e-06, + "loss": 0.9678, + "step": 4000 + }, + { + "epoch": 0.7997801154393943, + "grad_norm": 2.0, + "learning_rate": 8.403143718238347e-06, + "loss": 0.986, + "step": 4001 + }, + { + "epoch": 0.7999800104944904, + "grad_norm": 1.953125, + "learning_rate": 8.402371597745218e-06, + "loss": 1.0025, + "step": 4002 + }, + { + "epoch": 0.8001799055495865, + "grad_norm": 2.078125, + "learning_rate": 8.401599326119108e-06, + "loss": 1.0755, + "step": 4003 + }, + { + "epoch": 0.8003798006046825, + "grad_norm": 1.9140625, + "learning_rate": 8.400826903394317e-06, + "loss": 0.8787, + "step": 4004 + }, + { + "epoch": 0.8005796956597786, + "grad_norm": 2.1875, + "learning_rate": 8.400054329605159e-06, + "loss": 0.9739, + "step": 4005 + }, + { + "epoch": 0.8007795907148747, + "grad_norm": 2.078125, + "learning_rate": 8.39928160478595e-06, + "loss": 0.9824, + "step": 4006 + }, + { + "epoch": 0.8009794857699708, + "grad_norm": 2.1875, + "learning_rate": 8.398508728971016e-06, + "loss": 1.004, + "step": 4007 + }, + { + "epoch": 0.8011793808250668, + "grad_norm": 2.03125, + "learning_rate": 8.397735702194686e-06, + "loss": 0.9618, + "step": 4008 + }, + { + "epoch": 0.8013792758801629, + "grad_norm": 2.09375, + "learning_rate": 8.396962524491299e-06, + "loss": 1.1228, + "step": 4009 + }, + { + "epoch": 0.801579170935259, + "grad_norm": 2.171875, + "learning_rate": 8.3961891958952e-06, + "loss": 0.9377, + "step": 4010 + }, + { + "epoch": 0.8017790659903551, + "grad_norm": 2.1875, + "learning_rate": 8.39541571644074e-06, + "loss": 1.0748, + "step": 4011 + }, + { + "epoch": 0.8019789610454512, + "grad_norm": 2.09375, + "learning_rate": 8.394642086162278e-06, + "loss": 1.0429, + "step": 4012 + }, + { + "epoch": 0.8021788561005472, + "grad_norm": 2.046875, + "learning_rate": 8.393868305094173e-06, + "loss": 1.099, + "step": 4013 + }, + { + "epoch": 0.8023787511556433, + "grad_norm": 1.9765625, + "learning_rate": 8.393094373270804e-06, + "loss": 1.0019, + "step": 4014 + }, + { + "epoch": 0.8025786462107394, + "grad_norm": 2.0625, + "learning_rate": 8.392320290726543e-06, + "loss": 1.059, + "step": 4015 + }, + { + "epoch": 0.8027785412658355, + "grad_norm": 2.078125, + "learning_rate": 8.391546057495778e-06, + "loss": 1.0901, + "step": 4016 + }, + { + "epoch": 0.8029784363209315, + "grad_norm": 2.09375, + "learning_rate": 8.3907716736129e-06, + "loss": 1.0134, + "step": 4017 + }, + { + "epoch": 0.8031783313760276, + "grad_norm": 2.0625, + "learning_rate": 8.389997139112306e-06, + "loss": 0.9359, + "step": 4018 + }, + { + "epoch": 0.8033782264311237, + "grad_norm": 2.046875, + "learning_rate": 8.3892224540284e-06, + "loss": 1.0603, + "step": 4019 + }, + { + "epoch": 0.8035781214862198, + "grad_norm": 1.9453125, + "learning_rate": 8.388447618395598e-06, + "loss": 0.9355, + "step": 4020 + }, + { + "epoch": 0.8037780165413158, + "grad_norm": 2.140625, + "learning_rate": 8.387672632248312e-06, + "loss": 1.1244, + "step": 4021 + }, + { + "epoch": 0.8039779115964119, + "grad_norm": 1.9609375, + "learning_rate": 8.38689749562097e-06, + "loss": 1.0161, + "step": 4022 + }, + { + "epoch": 0.804177806651508, + "grad_norm": 2.078125, + "learning_rate": 8.386122208548002e-06, + "loss": 0.9934, + "step": 4023 + }, + { + "epoch": 0.804377701706604, + "grad_norm": 2.015625, + "learning_rate": 8.385346771063848e-06, + "loss": 1.0326, + "step": 4024 + }, + { + "epoch": 0.8045775967617002, + "grad_norm": 1.984375, + "learning_rate": 8.384571183202952e-06, + "loss": 1.059, + "step": 4025 + }, + { + "epoch": 0.8047774918167961, + "grad_norm": 2.078125, + "learning_rate": 8.383795444999766e-06, + "loss": 1.0174, + "step": 4026 + }, + { + "epoch": 0.8049773868718922, + "grad_norm": 2.0, + "learning_rate": 8.383019556488747e-06, + "loss": 1.0145, + "step": 4027 + }, + { + "epoch": 0.8051772819269883, + "grad_norm": 2.015625, + "learning_rate": 8.38224351770436e-06, + "loss": 1.0756, + "step": 4028 + }, + { + "epoch": 0.8053771769820844, + "grad_norm": 2.109375, + "learning_rate": 8.381467328681078e-06, + "loss": 1.0536, + "step": 4029 + }, + { + "epoch": 0.8055770720371804, + "grad_norm": 2.046875, + "learning_rate": 8.380690989453379e-06, + "loss": 1.0156, + "step": 4030 + }, + { + "epoch": 0.8057769670922765, + "grad_norm": 2.015625, + "learning_rate": 8.379914500055745e-06, + "loss": 1.0135, + "step": 4031 + }, + { + "epoch": 0.8059768621473726, + "grad_norm": 2.0625, + "learning_rate": 8.379137860522672e-06, + "loss": 1.0025, + "step": 4032 + }, + { + "epoch": 0.8061767572024687, + "grad_norm": 2.015625, + "learning_rate": 8.378361070888656e-06, + "loss": 1.1067, + "step": 4033 + }, + { + "epoch": 0.8063766522575648, + "grad_norm": 1.9375, + "learning_rate": 8.3775841311882e-06, + "loss": 1.0107, + "step": 4034 + }, + { + "epoch": 0.8065765473126608, + "grad_norm": 1.9921875, + "learning_rate": 8.376807041455822e-06, + "loss": 0.9656, + "step": 4035 + }, + { + "epoch": 0.8067764423677569, + "grad_norm": 2.046875, + "learning_rate": 8.376029801726033e-06, + "loss": 1.0475, + "step": 4036 + }, + { + "epoch": 0.806976337422853, + "grad_norm": 2.0625, + "learning_rate": 8.375252412033361e-06, + "loss": 1.0904, + "step": 4037 + }, + { + "epoch": 0.8071762324779491, + "grad_norm": 2.046875, + "learning_rate": 8.374474872412338e-06, + "loss": 1.0797, + "step": 4038 + }, + { + "epoch": 0.8073761275330451, + "grad_norm": 2.0, + "learning_rate": 8.3736971828975e-06, + "loss": 1.0428, + "step": 4039 + }, + { + "epoch": 0.8075760225881412, + "grad_norm": 2.171875, + "learning_rate": 8.372919343523395e-06, + "loss": 0.9768, + "step": 4040 + }, + { + "epoch": 0.8077759176432373, + "grad_norm": 2.015625, + "learning_rate": 8.372141354324573e-06, + "loss": 1.057, + "step": 4041 + }, + { + "epoch": 0.8079758126983334, + "grad_norm": 2.125, + "learning_rate": 8.37136321533559e-06, + "loss": 0.9486, + "step": 4042 + }, + { + "epoch": 0.8081757077534294, + "grad_norm": 2.015625, + "learning_rate": 8.370584926591015e-06, + "loss": 1.0627, + "step": 4043 + }, + { + "epoch": 0.8083756028085255, + "grad_norm": 2.265625, + "learning_rate": 8.369806488125418e-06, + "loss": 1.0046, + "step": 4044 + }, + { + "epoch": 0.8085754978636216, + "grad_norm": 2.046875, + "learning_rate": 8.369027899973377e-06, + "loss": 0.9561, + "step": 4045 + }, + { + "epoch": 0.8087753929187177, + "grad_norm": 2.125, + "learning_rate": 8.368249162169474e-06, + "loss": 1.1483, + "step": 4046 + }, + { + "epoch": 0.8089752879738138, + "grad_norm": 2.296875, + "learning_rate": 8.367470274748303e-06, + "loss": 1.0364, + "step": 4047 + }, + { + "epoch": 0.8091751830289098, + "grad_norm": 2.015625, + "learning_rate": 8.366691237744465e-06, + "loss": 1.0638, + "step": 4048 + }, + { + "epoch": 0.8093750780840059, + "grad_norm": 1.9921875, + "learning_rate": 8.365912051192559e-06, + "loss": 1.0725, + "step": 4049 + }, + { + "epoch": 0.809574973139102, + "grad_norm": 2.03125, + "learning_rate": 8.365132715127201e-06, + "loss": 1.0622, + "step": 4050 + }, + { + "epoch": 0.8097748681941981, + "grad_norm": 2.1875, + "learning_rate": 8.364353229583007e-06, + "loss": 1.115, + "step": 4051 + }, + { + "epoch": 0.8099747632492941, + "grad_norm": 1.953125, + "learning_rate": 8.363573594594603e-06, + "loss": 0.9759, + "step": 4052 + }, + { + "epoch": 0.8101746583043902, + "grad_norm": 2.015625, + "learning_rate": 8.362793810196616e-06, + "loss": 0.9862, + "step": 4053 + }, + { + "epoch": 0.8103745533594863, + "grad_norm": 2.03125, + "learning_rate": 8.362013876423689e-06, + "loss": 1.0371, + "step": 4054 + }, + { + "epoch": 0.8105744484145824, + "grad_norm": 2.140625, + "learning_rate": 8.361233793310466e-06, + "loss": 1.0631, + "step": 4055 + }, + { + "epoch": 0.8107743434696785, + "grad_norm": 2.125, + "learning_rate": 8.360453560891594e-06, + "loss": 1.0111, + "step": 4056 + }, + { + "epoch": 0.8109742385247745, + "grad_norm": 2.0625, + "learning_rate": 8.359673179201734e-06, + "loss": 1.0972, + "step": 4057 + }, + { + "epoch": 0.8111741335798706, + "grad_norm": 2.0625, + "learning_rate": 8.358892648275554e-06, + "loss": 0.9885, + "step": 4058 + }, + { + "epoch": 0.8113740286349667, + "grad_norm": 2.0625, + "learning_rate": 8.358111968147717e-06, + "loss": 0.994, + "step": 4059 + }, + { + "epoch": 0.8115739236900628, + "grad_norm": 1.984375, + "learning_rate": 8.357331138852907e-06, + "loss": 0.9565, + "step": 4060 + }, + { + "epoch": 0.8117738187451587, + "grad_norm": 2.078125, + "learning_rate": 8.356550160425806e-06, + "loss": 1.1143, + "step": 4061 + }, + { + "epoch": 0.8119737138002548, + "grad_norm": 2.1875, + "learning_rate": 8.355769032901105e-06, + "loss": 1.0377, + "step": 4062 + }, + { + "epoch": 0.812173608855351, + "grad_norm": 1.9453125, + "learning_rate": 8.354987756313501e-06, + "loss": 0.9248, + "step": 4063 + }, + { + "epoch": 0.812373503910447, + "grad_norm": 2.171875, + "learning_rate": 8.354206330697702e-06, + "loss": 0.9865, + "step": 4064 + }, + { + "epoch": 0.812573398965543, + "grad_norm": 1.9921875, + "learning_rate": 8.353424756088415e-06, + "loss": 0.9604, + "step": 4065 + }, + { + "epoch": 0.8127732940206391, + "grad_norm": 2.171875, + "learning_rate": 8.352643032520357e-06, + "loss": 1.1125, + "step": 4066 + }, + { + "epoch": 0.8129731890757352, + "grad_norm": 2.03125, + "learning_rate": 8.351861160028256e-06, + "loss": 1.1097, + "step": 4067 + }, + { + "epoch": 0.8131730841308313, + "grad_norm": 2.0625, + "learning_rate": 8.351079138646838e-06, + "loss": 1.0526, + "step": 4068 + }, + { + "epoch": 0.8133729791859274, + "grad_norm": 2.125, + "learning_rate": 8.350296968410845e-06, + "loss": 1.0931, + "step": 4069 + }, + { + "epoch": 0.8135728742410234, + "grad_norm": 2.09375, + "learning_rate": 8.349514649355016e-06, + "loss": 1.0257, + "step": 4070 + }, + { + "epoch": 0.8137727692961195, + "grad_norm": 2.09375, + "learning_rate": 8.348732181514105e-06, + "loss": 1.0811, + "step": 4071 + }, + { + "epoch": 0.8139726643512156, + "grad_norm": 2.09375, + "learning_rate": 8.347949564922869e-06, + "loss": 1.1355, + "step": 4072 + }, + { + "epoch": 0.8141725594063117, + "grad_norm": 2.015625, + "learning_rate": 8.347166799616069e-06, + "loss": 1.0506, + "step": 4073 + }, + { + "epoch": 0.8143724544614077, + "grad_norm": 2.015625, + "learning_rate": 8.346383885628478e-06, + "loss": 1.0503, + "step": 4074 + }, + { + "epoch": 0.8145723495165038, + "grad_norm": 2.078125, + "learning_rate": 8.345600822994872e-06, + "loss": 1.0772, + "step": 4075 + }, + { + "epoch": 0.8147722445715999, + "grad_norm": 2.03125, + "learning_rate": 8.344817611750036e-06, + "loss": 1.0449, + "step": 4076 + }, + { + "epoch": 0.814972139626696, + "grad_norm": 1.9921875, + "learning_rate": 8.344034251928759e-06, + "loss": 0.9987, + "step": 4077 + }, + { + "epoch": 0.8151720346817921, + "grad_norm": 2.203125, + "learning_rate": 8.343250743565837e-06, + "loss": 1.0547, + "step": 4078 + }, + { + "epoch": 0.8153719297368881, + "grad_norm": 2.125, + "learning_rate": 8.342467086696073e-06, + "loss": 1.1069, + "step": 4079 + }, + { + "epoch": 0.8155718247919842, + "grad_norm": 2.109375, + "learning_rate": 8.341683281354277e-06, + "loss": 1.0543, + "step": 4080 + }, + { + "epoch": 0.8157717198470803, + "grad_norm": 2.1875, + "learning_rate": 8.34089932757527e-06, + "loss": 1.0877, + "step": 4081 + }, + { + "epoch": 0.8159716149021764, + "grad_norm": 2.109375, + "learning_rate": 8.34011522539387e-06, + "loss": 1.1096, + "step": 4082 + }, + { + "epoch": 0.8161715099572724, + "grad_norm": 2.09375, + "learning_rate": 8.339330974844908e-06, + "loss": 1.0155, + "step": 4083 + }, + { + "epoch": 0.8163714050123685, + "grad_norm": 2.171875, + "learning_rate": 8.338546575963223e-06, + "loss": 1.0979, + "step": 4084 + }, + { + "epoch": 0.8165713000674646, + "grad_norm": 2.046875, + "learning_rate": 8.337762028783653e-06, + "loss": 1.0712, + "step": 4085 + }, + { + "epoch": 0.8167711951225607, + "grad_norm": 2.015625, + "learning_rate": 8.336977333341052e-06, + "loss": 1.034, + "step": 4086 + }, + { + "epoch": 0.8169710901776567, + "grad_norm": 1.984375, + "learning_rate": 8.336192489670273e-06, + "loss": 1.0542, + "step": 4087 + }, + { + "epoch": 0.8171709852327528, + "grad_norm": 2.046875, + "learning_rate": 8.33540749780618e-06, + "loss": 0.9744, + "step": 4088 + }, + { + "epoch": 0.8173708802878489, + "grad_norm": 2.0625, + "learning_rate": 8.334622357783642e-06, + "loss": 1.0116, + "step": 4089 + }, + { + "epoch": 0.817570775342945, + "grad_norm": 2.1875, + "learning_rate": 8.333837069637536e-06, + "loss": 1.1014, + "step": 4090 + }, + { + "epoch": 0.8177706703980411, + "grad_norm": 1.9453125, + "learning_rate": 8.333051633402743e-06, + "loss": 0.9645, + "step": 4091 + }, + { + "epoch": 0.8179705654531371, + "grad_norm": 1.984375, + "learning_rate": 8.332266049114152e-06, + "loss": 0.9681, + "step": 4092 + }, + { + "epoch": 0.8181704605082332, + "grad_norm": 2.078125, + "learning_rate": 8.33148031680666e-06, + "loss": 1.0724, + "step": 4093 + }, + { + "epoch": 0.8183703555633293, + "grad_norm": 2.078125, + "learning_rate": 8.330694436515168e-06, + "loss": 1.0307, + "step": 4094 + }, + { + "epoch": 0.8185702506184254, + "grad_norm": 2.265625, + "learning_rate": 8.329908408274583e-06, + "loss": 1.1436, + "step": 4095 + }, + { + "epoch": 0.8187701456735214, + "grad_norm": 2.046875, + "learning_rate": 8.329122232119824e-06, + "loss": 1.0197, + "step": 4096 + }, + { + "epoch": 0.8189700407286175, + "grad_norm": 1.921875, + "learning_rate": 8.32833590808581e-06, + "loss": 0.9844, + "step": 4097 + }, + { + "epoch": 0.8191699357837136, + "grad_norm": 2.0, + "learning_rate": 8.327549436207472e-06, + "loss": 0.9957, + "step": 4098 + }, + { + "epoch": 0.8193698308388097, + "grad_norm": 1.9921875, + "learning_rate": 8.326762816519743e-06, + "loss": 0.9574, + "step": 4099 + }, + { + "epoch": 0.8195697258939058, + "grad_norm": 2.0, + "learning_rate": 8.325976049057565e-06, + "loss": 0.9851, + "step": 4100 + }, + { + "epoch": 0.8197696209490017, + "grad_norm": 2.359375, + "learning_rate": 8.325189133855884e-06, + "loss": 1.1136, + "step": 4101 + }, + { + "epoch": 0.8199695160040978, + "grad_norm": 1.9921875, + "learning_rate": 8.324402070949658e-06, + "loss": 1.0151, + "step": 4102 + }, + { + "epoch": 0.8201694110591939, + "grad_norm": 2.03125, + "learning_rate": 8.323614860373848e-06, + "loss": 1.0295, + "step": 4103 + }, + { + "epoch": 0.82036930611429, + "grad_norm": 1.984375, + "learning_rate": 8.322827502163422e-06, + "loss": 1.0286, + "step": 4104 + }, + { + "epoch": 0.820569201169386, + "grad_norm": 2.1875, + "learning_rate": 8.32203999635335e-06, + "loss": 1.1238, + "step": 4105 + }, + { + "epoch": 0.8207690962244821, + "grad_norm": 2.078125, + "learning_rate": 8.321252342978617e-06, + "loss": 1.1182, + "step": 4106 + }, + { + "epoch": 0.8209689912795782, + "grad_norm": 2.109375, + "learning_rate": 8.32046454207421e-06, + "loss": 1.0793, + "step": 4107 + }, + { + "epoch": 0.8211688863346743, + "grad_norm": 2.0, + "learning_rate": 8.319676593675124e-06, + "loss": 0.9489, + "step": 4108 + }, + { + "epoch": 0.8213687813897703, + "grad_norm": 1.9609375, + "learning_rate": 8.318888497816357e-06, + "loss": 0.9854, + "step": 4109 + }, + { + "epoch": 0.8215686764448664, + "grad_norm": 2.0, + "learning_rate": 8.318100254532917e-06, + "loss": 0.8775, + "step": 4110 + }, + { + "epoch": 0.8217685714999625, + "grad_norm": 2.0625, + "learning_rate": 8.31731186385982e-06, + "loss": 1.0759, + "step": 4111 + }, + { + "epoch": 0.8219684665550586, + "grad_norm": 2.140625, + "learning_rate": 8.316523325832083e-06, + "loss": 0.9916, + "step": 4112 + }, + { + "epoch": 0.8221683616101547, + "grad_norm": 2.0625, + "learning_rate": 8.315734640484734e-06, + "loss": 1.062, + "step": 4113 + }, + { + "epoch": 0.8223682566652507, + "grad_norm": 2.109375, + "learning_rate": 8.314945807852808e-06, + "loss": 1.061, + "step": 4114 + }, + { + "epoch": 0.8225681517203468, + "grad_norm": 2.078125, + "learning_rate": 8.31415682797134e-06, + "loss": 1.0866, + "step": 4115 + }, + { + "epoch": 0.8227680467754429, + "grad_norm": 2.21875, + "learning_rate": 8.313367700875381e-06, + "loss": 1.17, + "step": 4116 + }, + { + "epoch": 0.822967941830539, + "grad_norm": 1.984375, + "learning_rate": 8.312578426599984e-06, + "loss": 0.9527, + "step": 4117 + }, + { + "epoch": 0.823167836885635, + "grad_norm": 2.109375, + "learning_rate": 8.311789005180207e-06, + "loss": 1.085, + "step": 4118 + }, + { + "epoch": 0.8233677319407311, + "grad_norm": 2.0625, + "learning_rate": 8.310999436651115e-06, + "loss": 1.0659, + "step": 4119 + }, + { + "epoch": 0.8235676269958272, + "grad_norm": 2.15625, + "learning_rate": 8.310209721047782e-06, + "loss": 1.0947, + "step": 4120 + }, + { + "epoch": 0.8237675220509233, + "grad_norm": 1.96875, + "learning_rate": 8.309419858405287e-06, + "loss": 0.9207, + "step": 4121 + }, + { + "epoch": 0.8239674171060194, + "grad_norm": 2.0, + "learning_rate": 8.308629848758714e-06, + "loss": 1.0384, + "step": 4122 + }, + { + "epoch": 0.8241673121611154, + "grad_norm": 2.25, + "learning_rate": 8.307839692143158e-06, + "loss": 1.0301, + "step": 4123 + }, + { + "epoch": 0.8243672072162115, + "grad_norm": 2.15625, + "learning_rate": 8.307049388593716e-06, + "loss": 1.0894, + "step": 4124 + }, + { + "epoch": 0.8245671022713076, + "grad_norm": 2.03125, + "learning_rate": 8.306258938145493e-06, + "loss": 0.9666, + "step": 4125 + }, + { + "epoch": 0.8247669973264037, + "grad_norm": 2.21875, + "learning_rate": 8.3054683408336e-06, + "loss": 1.0584, + "step": 4126 + }, + { + "epoch": 0.8249668923814997, + "grad_norm": 2.15625, + "learning_rate": 8.304677596693158e-06, + "loss": 1.0143, + "step": 4127 + }, + { + "epoch": 0.8251667874365958, + "grad_norm": 2.03125, + "learning_rate": 8.30388670575929e-06, + "loss": 0.9847, + "step": 4128 + }, + { + "epoch": 0.8253666824916919, + "grad_norm": 2.015625, + "learning_rate": 8.303095668067127e-06, + "loss": 0.9561, + "step": 4129 + }, + { + "epoch": 0.825566577546788, + "grad_norm": 2.046875, + "learning_rate": 8.302304483651806e-06, + "loss": 1.0133, + "step": 4130 + }, + { + "epoch": 0.825766472601884, + "grad_norm": 2.234375, + "learning_rate": 8.301513152548474e-06, + "loss": 1.0884, + "step": 4131 + }, + { + "epoch": 0.8259663676569801, + "grad_norm": 2.078125, + "learning_rate": 8.30072167479228e-06, + "loss": 1.0631, + "step": 4132 + }, + { + "epoch": 0.8261662627120762, + "grad_norm": 2.15625, + "learning_rate": 8.299930050418383e-06, + "loss": 1.0408, + "step": 4133 + }, + { + "epoch": 0.8263661577671723, + "grad_norm": 2.109375, + "learning_rate": 8.299138279461945e-06, + "loss": 1.0644, + "step": 4134 + }, + { + "epoch": 0.8265660528222684, + "grad_norm": 2.0625, + "learning_rate": 8.298346361958136e-06, + "loss": 0.9942, + "step": 4135 + }, + { + "epoch": 0.8267659478773643, + "grad_norm": 1.953125, + "learning_rate": 8.297554297942134e-06, + "loss": 1.0154, + "step": 4136 + }, + { + "epoch": 0.8269658429324604, + "grad_norm": 2.28125, + "learning_rate": 8.296762087449122e-06, + "loss": 1.07, + "step": 4137 + }, + { + "epoch": 0.8271657379875565, + "grad_norm": 1.96875, + "learning_rate": 8.29596973051429e-06, + "loss": 1.014, + "step": 4138 + }, + { + "epoch": 0.8273656330426526, + "grad_norm": 2.078125, + "learning_rate": 8.295177227172837e-06, + "loss": 0.9717, + "step": 4139 + }, + { + "epoch": 0.8275655280977486, + "grad_norm": 2.03125, + "learning_rate": 8.294384577459961e-06, + "loss": 1.0929, + "step": 4140 + }, + { + "epoch": 0.8277654231528447, + "grad_norm": 2.0625, + "learning_rate": 8.293591781410874e-06, + "loss": 0.992, + "step": 4141 + }, + { + "epoch": 0.8279653182079408, + "grad_norm": 2.3125, + "learning_rate": 8.292798839060794e-06, + "loss": 1.1135, + "step": 4142 + }, + { + "epoch": 0.8281652132630369, + "grad_norm": 1.9765625, + "learning_rate": 8.29200575044494e-06, + "loss": 1.0303, + "step": 4143 + }, + { + "epoch": 0.8283651083181329, + "grad_norm": 2.171875, + "learning_rate": 8.29121251559854e-06, + "loss": 0.9511, + "step": 4144 + }, + { + "epoch": 0.828565003373229, + "grad_norm": 2.046875, + "learning_rate": 8.290419134556835e-06, + "loss": 1.0883, + "step": 4145 + }, + { + "epoch": 0.8287648984283251, + "grad_norm": 2.21875, + "learning_rate": 8.289625607355062e-06, + "loss": 1.0834, + "step": 4146 + }, + { + "epoch": 0.8289647934834212, + "grad_norm": 2.078125, + "learning_rate": 8.288831934028471e-06, + "loss": 1.0117, + "step": 4147 + }, + { + "epoch": 0.8291646885385173, + "grad_norm": 2.046875, + "learning_rate": 8.288038114612316e-06, + "loss": 0.9673, + "step": 4148 + }, + { + "epoch": 0.8293645835936133, + "grad_norm": 2.21875, + "learning_rate": 8.287244149141861e-06, + "loss": 1.0559, + "step": 4149 + }, + { + "epoch": 0.8295644786487094, + "grad_norm": 2.0625, + "learning_rate": 8.286450037652369e-06, + "loss": 1.0355, + "step": 4150 + }, + { + "epoch": 0.8297643737038055, + "grad_norm": 2.09375, + "learning_rate": 8.28565578017912e-06, + "loss": 1.0386, + "step": 4151 + }, + { + "epoch": 0.8299642687589016, + "grad_norm": 2.078125, + "learning_rate": 8.284861376757391e-06, + "loss": 1.0309, + "step": 4152 + }, + { + "epoch": 0.8301641638139976, + "grad_norm": 1.9765625, + "learning_rate": 8.28406682742247e-06, + "loss": 0.9418, + "step": 4153 + }, + { + "epoch": 0.8303640588690937, + "grad_norm": 2.09375, + "learning_rate": 8.283272132209653e-06, + "loss": 1.0459, + "step": 4154 + }, + { + "epoch": 0.8305639539241898, + "grad_norm": 1.96875, + "learning_rate": 8.282477291154238e-06, + "loss": 1.0016, + "step": 4155 + }, + { + "epoch": 0.8307638489792859, + "grad_norm": 2.109375, + "learning_rate": 8.281682304291531e-06, + "loss": 1.0282, + "step": 4156 + }, + { + "epoch": 0.830963744034382, + "grad_norm": 2.109375, + "learning_rate": 8.280887171656848e-06, + "loss": 1.0862, + "step": 4157 + }, + { + "epoch": 0.831163639089478, + "grad_norm": 2.078125, + "learning_rate": 8.280091893285508e-06, + "loss": 1.0085, + "step": 4158 + }, + { + "epoch": 0.8313635341445741, + "grad_norm": 2.140625, + "learning_rate": 8.279296469212833e-06, + "loss": 1.0787, + "step": 4159 + }, + { + "epoch": 0.8315634291996702, + "grad_norm": 2.109375, + "learning_rate": 8.278500899474162e-06, + "loss": 1.0031, + "step": 4160 + }, + { + "epoch": 0.8317633242547663, + "grad_norm": 2.015625, + "learning_rate": 8.277705184104831e-06, + "loss": 0.9732, + "step": 4161 + }, + { + "epoch": 0.8319632193098623, + "grad_norm": 1.9765625, + "learning_rate": 8.276909323140186e-06, + "loss": 0.9604, + "step": 4162 + }, + { + "epoch": 0.8321631143649584, + "grad_norm": 2.203125, + "learning_rate": 8.276113316615577e-06, + "loss": 1.0415, + "step": 4163 + }, + { + "epoch": 0.8323630094200545, + "grad_norm": 2.234375, + "learning_rate": 8.275317164566365e-06, + "loss": 1.0282, + "step": 4164 + }, + { + "epoch": 0.8325629044751506, + "grad_norm": 2.140625, + "learning_rate": 8.274520867027915e-06, + "loss": 1.0074, + "step": 4165 + }, + { + "epoch": 0.8327627995302466, + "grad_norm": 2.046875, + "learning_rate": 8.273724424035599e-06, + "loss": 1.022, + "step": 4166 + }, + { + "epoch": 0.8329626945853427, + "grad_norm": 2.0625, + "learning_rate": 8.272927835624791e-06, + "loss": 1.0232, + "step": 4167 + }, + { + "epoch": 0.8331625896404388, + "grad_norm": 2.0625, + "learning_rate": 8.272131101830878e-06, + "loss": 1.0433, + "step": 4168 + }, + { + "epoch": 0.8333624846955349, + "grad_norm": 2.171875, + "learning_rate": 8.271334222689254e-06, + "loss": 1.0848, + "step": 4169 + }, + { + "epoch": 0.833562379750631, + "grad_norm": 2.0, + "learning_rate": 8.270537198235311e-06, + "loss": 1.024, + "step": 4170 + }, + { + "epoch": 0.833762274805727, + "grad_norm": 2.109375, + "learning_rate": 8.269740028504455e-06, + "loss": 1.0799, + "step": 4171 + }, + { + "epoch": 0.833962169860823, + "grad_norm": 2.140625, + "learning_rate": 8.268942713532098e-06, + "loss": 1.02, + "step": 4172 + }, + { + "epoch": 0.8341620649159192, + "grad_norm": 1.953125, + "learning_rate": 8.268145253353653e-06, + "loss": 0.9191, + "step": 4173 + }, + { + "epoch": 0.8343619599710153, + "grad_norm": 2.015625, + "learning_rate": 8.267347648004545e-06, + "loss": 1.1071, + "step": 4174 + }, + { + "epoch": 0.8345618550261112, + "grad_norm": 2.46875, + "learning_rate": 8.266549897520204e-06, + "loss": 1.0778, + "step": 4175 + }, + { + "epoch": 0.8347617500812073, + "grad_norm": 2.0, + "learning_rate": 8.265752001936067e-06, + "loss": 1.0233, + "step": 4176 + }, + { + "epoch": 0.8349616451363034, + "grad_norm": 2.0, + "learning_rate": 8.264953961287573e-06, + "loss": 1.0217, + "step": 4177 + }, + { + "epoch": 0.8351615401913995, + "grad_norm": 2.078125, + "learning_rate": 8.264155775610172e-06, + "loss": 1.0185, + "step": 4178 + }, + { + "epoch": 0.8353614352464956, + "grad_norm": 2.109375, + "learning_rate": 8.263357444939321e-06, + "loss": 1.0463, + "step": 4179 + }, + { + "epoch": 0.8355613303015916, + "grad_norm": 2.390625, + "learning_rate": 8.26255896931048e-06, + "loss": 1.0506, + "step": 4180 + }, + { + "epoch": 0.8357612253566877, + "grad_norm": 2.015625, + "learning_rate": 8.26176034875912e-06, + "loss": 1.0384, + "step": 4181 + }, + { + "epoch": 0.8359611204117838, + "grad_norm": 2.203125, + "learning_rate": 8.26096158332071e-06, + "loss": 1.03, + "step": 4182 + }, + { + "epoch": 0.8361610154668799, + "grad_norm": 1.984375, + "learning_rate": 8.260162673030739e-06, + "loss": 1.1151, + "step": 4183 + }, + { + "epoch": 0.8363609105219759, + "grad_norm": 2.03125, + "learning_rate": 8.259363617924689e-06, + "loss": 0.9808, + "step": 4184 + }, + { + "epoch": 0.836560805577072, + "grad_norm": 2.109375, + "learning_rate": 8.258564418038053e-06, + "loss": 1.0657, + "step": 4185 + }, + { + "epoch": 0.8367607006321681, + "grad_norm": 2.09375, + "learning_rate": 8.257765073406337e-06, + "loss": 1.0732, + "step": 4186 + }, + { + "epoch": 0.8369605956872642, + "grad_norm": 2.0625, + "learning_rate": 8.256965584065042e-06, + "loss": 0.9965, + "step": 4187 + }, + { + "epoch": 0.8371604907423602, + "grad_norm": 2.09375, + "learning_rate": 8.256165950049684e-06, + "loss": 1.0115, + "step": 4188 + }, + { + "epoch": 0.8373603857974563, + "grad_norm": 2.203125, + "learning_rate": 8.255366171395783e-06, + "loss": 1.1086, + "step": 4189 + }, + { + "epoch": 0.8375602808525524, + "grad_norm": 2.015625, + "learning_rate": 8.254566248138865e-06, + "loss": 0.9886, + "step": 4190 + }, + { + "epoch": 0.8377601759076485, + "grad_norm": 2.03125, + "learning_rate": 8.25376618031446e-06, + "loss": 1.0307, + "step": 4191 + }, + { + "epoch": 0.8379600709627446, + "grad_norm": 2.09375, + "learning_rate": 8.252965967958108e-06, + "loss": 1.1474, + "step": 4192 + }, + { + "epoch": 0.8381599660178406, + "grad_norm": 2.0625, + "learning_rate": 8.252165611105358e-06, + "loss": 1.0694, + "step": 4193 + }, + { + "epoch": 0.8383598610729367, + "grad_norm": 2.09375, + "learning_rate": 8.251365109791758e-06, + "loss": 0.9362, + "step": 4194 + }, + { + "epoch": 0.8385597561280328, + "grad_norm": 1.984375, + "learning_rate": 8.250564464052865e-06, + "loss": 0.9577, + "step": 4195 + }, + { + "epoch": 0.8387596511831289, + "grad_norm": 2.46875, + "learning_rate": 8.249763673924248e-06, + "loss": 0.9991, + "step": 4196 + }, + { + "epoch": 0.8389595462382249, + "grad_norm": 2.015625, + "learning_rate": 8.248962739441475e-06, + "loss": 0.9452, + "step": 4197 + }, + { + "epoch": 0.839159441293321, + "grad_norm": 2.078125, + "learning_rate": 8.248161660640123e-06, + "loss": 1.0779, + "step": 4198 + }, + { + "epoch": 0.8393593363484171, + "grad_norm": 2.109375, + "learning_rate": 8.24736043755578e-06, + "loss": 1.0068, + "step": 4199 + }, + { + "epoch": 0.8395592314035132, + "grad_norm": 2.0, + "learning_rate": 8.24655907022403e-06, + "loss": 0.9692, + "step": 4200 + }, + { + "epoch": 0.8397591264586093, + "grad_norm": 2.078125, + "learning_rate": 8.245757558680474e-06, + "loss": 0.9732, + "step": 4201 + }, + { + "epoch": 0.8399590215137053, + "grad_norm": 2.0, + "learning_rate": 8.244955902960713e-06, + "loss": 0.9686, + "step": 4202 + }, + { + "epoch": 0.8401589165688014, + "grad_norm": 2.34375, + "learning_rate": 8.24415410310036e-06, + "loss": 0.9976, + "step": 4203 + }, + { + "epoch": 0.8403588116238975, + "grad_norm": 2.1875, + "learning_rate": 8.243352159135026e-06, + "loss": 1.1032, + "step": 4204 + }, + { + "epoch": 0.8405587066789936, + "grad_norm": 1.984375, + "learning_rate": 8.242550071100336e-06, + "loss": 1.012, + "step": 4205 + }, + { + "epoch": 0.8407586017340896, + "grad_norm": 2.15625, + "learning_rate": 8.24174783903192e-06, + "loss": 1.1062, + "step": 4206 + }, + { + "epoch": 0.8409584967891857, + "grad_norm": 1.9921875, + "learning_rate": 8.240945462965408e-06, + "loss": 1.0424, + "step": 4207 + }, + { + "epoch": 0.8411583918442818, + "grad_norm": 2.078125, + "learning_rate": 8.240142942936446e-06, + "loss": 1.0927, + "step": 4208 + }, + { + "epoch": 0.8413582868993779, + "grad_norm": 2.125, + "learning_rate": 8.239340278980681e-06, + "loss": 1.0483, + "step": 4209 + }, + { + "epoch": 0.8415581819544738, + "grad_norm": 2.03125, + "learning_rate": 8.238537471133768e-06, + "loss": 1.0106, + "step": 4210 + }, + { + "epoch": 0.84175807700957, + "grad_norm": 2.1875, + "learning_rate": 8.237734519431365e-06, + "loss": 0.9823, + "step": 4211 + }, + { + "epoch": 0.841957972064666, + "grad_norm": 2.21875, + "learning_rate": 8.23693142390914e-06, + "loss": 1.0696, + "step": 4212 + }, + { + "epoch": 0.8421578671197621, + "grad_norm": 2.1875, + "learning_rate": 8.236128184602766e-06, + "loss": 0.9739, + "step": 4213 + }, + { + "epoch": 0.8423577621748582, + "grad_norm": 2.15625, + "learning_rate": 8.235324801547926e-06, + "loss": 1.0381, + "step": 4214 + }, + { + "epoch": 0.8425576572299542, + "grad_norm": 2.015625, + "learning_rate": 8.234521274780302e-06, + "loss": 0.9209, + "step": 4215 + }, + { + "epoch": 0.8427575522850503, + "grad_norm": 1.9375, + "learning_rate": 8.233717604335589e-06, + "loss": 1.057, + "step": 4216 + }, + { + "epoch": 0.8429574473401464, + "grad_norm": 2.03125, + "learning_rate": 8.232913790249486e-06, + "loss": 0.9963, + "step": 4217 + }, + { + "epoch": 0.8431573423952425, + "grad_norm": 2.171875, + "learning_rate": 8.232109832557696e-06, + "loss": 1.087, + "step": 4218 + }, + { + "epoch": 0.8433572374503385, + "grad_norm": 2.171875, + "learning_rate": 8.231305731295935e-06, + "loss": 1.128, + "step": 4219 + }, + { + "epoch": 0.8435571325054346, + "grad_norm": 2.203125, + "learning_rate": 8.230501486499915e-06, + "loss": 1.0614, + "step": 4220 + }, + { + "epoch": 0.8437570275605307, + "grad_norm": 2.109375, + "learning_rate": 8.22969709820537e-06, + "loss": 1.0407, + "step": 4221 + }, + { + "epoch": 0.8439569226156268, + "grad_norm": 2.15625, + "learning_rate": 8.228892566448018e-06, + "loss": 1.0015, + "step": 4222 + }, + { + "epoch": 0.8441568176707229, + "grad_norm": 2.171875, + "learning_rate": 8.228087891263608e-06, + "loss": 1.0536, + "step": 4223 + }, + { + "epoch": 0.8443567127258189, + "grad_norm": 2.03125, + "learning_rate": 8.227283072687877e-06, + "loss": 1.0252, + "step": 4224 + }, + { + "epoch": 0.844556607780915, + "grad_norm": 2.015625, + "learning_rate": 8.226478110756574e-06, + "loss": 1.0179, + "step": 4225 + }, + { + "epoch": 0.8447565028360111, + "grad_norm": 2.109375, + "learning_rate": 8.225673005505461e-06, + "loss": 1.0577, + "step": 4226 + }, + { + "epoch": 0.8449563978911072, + "grad_norm": 1.9765625, + "learning_rate": 8.224867756970298e-06, + "loss": 0.9411, + "step": 4227 + }, + { + "epoch": 0.8451562929462032, + "grad_norm": 2.125, + "learning_rate": 8.224062365186852e-06, + "loss": 1.1332, + "step": 4228 + }, + { + "epoch": 0.8453561880012993, + "grad_norm": 2.125, + "learning_rate": 8.223256830190901e-06, + "loss": 1.0812, + "step": 4229 + }, + { + "epoch": 0.8455560830563954, + "grad_norm": 2.046875, + "learning_rate": 8.222451152018225e-06, + "loss": 0.9662, + "step": 4230 + }, + { + "epoch": 0.8457559781114915, + "grad_norm": 2.0625, + "learning_rate": 8.221645330704615e-06, + "loss": 1.0622, + "step": 4231 + }, + { + "epoch": 0.8459558731665875, + "grad_norm": 1.90625, + "learning_rate": 8.220839366285862e-06, + "loss": 0.948, + "step": 4232 + }, + { + "epoch": 0.8461557682216836, + "grad_norm": 2.046875, + "learning_rate": 8.220033258797767e-06, + "loss": 1.0634, + "step": 4233 + }, + { + "epoch": 0.8463556632767797, + "grad_norm": 1.9375, + "learning_rate": 8.21922700827614e-06, + "loss": 1.0048, + "step": 4234 + }, + { + "epoch": 0.8465555583318758, + "grad_norm": 2.03125, + "learning_rate": 8.218420614756793e-06, + "loss": 1.0041, + "step": 4235 + }, + { + "epoch": 0.8467554533869719, + "grad_norm": 2.09375, + "learning_rate": 8.217614078275547e-06, + "loss": 1.008, + "step": 4236 + }, + { + "epoch": 0.8469553484420679, + "grad_norm": 1.9140625, + "learning_rate": 8.216807398868225e-06, + "loss": 0.9848, + "step": 4237 + }, + { + "epoch": 0.847155243497164, + "grad_norm": 2.046875, + "learning_rate": 8.216000576570664e-06, + "loss": 1.0451, + "step": 4238 + }, + { + "epoch": 0.8473551385522601, + "grad_norm": 2.0625, + "learning_rate": 8.2151936114187e-06, + "loss": 0.9849, + "step": 4239 + }, + { + "epoch": 0.8475550336073562, + "grad_norm": 2.15625, + "learning_rate": 8.21438650344818e-06, + "loss": 1.1296, + "step": 4240 + }, + { + "epoch": 0.8477549286624522, + "grad_norm": 2.078125, + "learning_rate": 8.213579252694954e-06, + "loss": 1.0473, + "step": 4241 + }, + { + "epoch": 0.8479548237175483, + "grad_norm": 2.046875, + "learning_rate": 8.212771859194881e-06, + "loss": 1.056, + "step": 4242 + }, + { + "epoch": 0.8481547187726444, + "grad_norm": 2.03125, + "learning_rate": 8.211964322983824e-06, + "loss": 1.0194, + "step": 4243 + }, + { + "epoch": 0.8483546138277405, + "grad_norm": 2.109375, + "learning_rate": 8.211156644097656e-06, + "loss": 1.0683, + "step": 4244 + }, + { + "epoch": 0.8485545088828365, + "grad_norm": 1.9375, + "learning_rate": 8.210348822572253e-06, + "loss": 1.0429, + "step": 4245 + }, + { + "epoch": 0.8487544039379326, + "grad_norm": 2.0625, + "learning_rate": 8.209540858443499e-06, + "loss": 1.0126, + "step": 4246 + }, + { + "epoch": 0.8489542989930287, + "grad_norm": 2.140625, + "learning_rate": 8.208732751747281e-06, + "loss": 1.0767, + "step": 4247 + }, + { + "epoch": 0.8491541940481248, + "grad_norm": 2.234375, + "learning_rate": 8.207924502519498e-06, + "loss": 1.0443, + "step": 4248 + }, + { + "epoch": 0.8493540891032209, + "grad_norm": 2.203125, + "learning_rate": 8.20711611079605e-06, + "loss": 1.0341, + "step": 4249 + }, + { + "epoch": 0.8495539841583168, + "grad_norm": 1.984375, + "learning_rate": 8.206307576612848e-06, + "loss": 0.9718, + "step": 4250 + }, + { + "epoch": 0.8497538792134129, + "grad_norm": 2.0625, + "learning_rate": 8.205498900005806e-06, + "loss": 1.1404, + "step": 4251 + }, + { + "epoch": 0.849953774268509, + "grad_norm": 2.046875, + "learning_rate": 8.204690081010845e-06, + "loss": 0.9937, + "step": 4252 + }, + { + "epoch": 0.8501536693236051, + "grad_norm": 2.078125, + "learning_rate": 8.203881119663893e-06, + "loss": 1.0616, + "step": 4253 + }, + { + "epoch": 0.8503535643787011, + "grad_norm": 2.0625, + "learning_rate": 8.203072016000884e-06, + "loss": 1.064, + "step": 4254 + }, + { + "epoch": 0.8505534594337972, + "grad_norm": 2.15625, + "learning_rate": 8.202262770057756e-06, + "loss": 1.0711, + "step": 4255 + }, + { + "epoch": 0.8507533544888933, + "grad_norm": 2.046875, + "learning_rate": 8.201453381870461e-06, + "loss": 0.9932, + "step": 4256 + }, + { + "epoch": 0.8509532495439894, + "grad_norm": 2.203125, + "learning_rate": 8.200643851474947e-06, + "loss": 1.0243, + "step": 4257 + }, + { + "epoch": 0.8511531445990855, + "grad_norm": 2.125, + "learning_rate": 8.199834178907174e-06, + "loss": 1.0337, + "step": 4258 + }, + { + "epoch": 0.8513530396541815, + "grad_norm": 1.9453125, + "learning_rate": 8.19902436420311e-06, + "loss": 1.0912, + "step": 4259 + }, + { + "epoch": 0.8515529347092776, + "grad_norm": 2.046875, + "learning_rate": 8.198214407398726e-06, + "loss": 0.9725, + "step": 4260 + }, + { + "epoch": 0.8517528297643737, + "grad_norm": 2.234375, + "learning_rate": 8.197404308529997e-06, + "loss": 1.0013, + "step": 4261 + }, + { + "epoch": 0.8519527248194698, + "grad_norm": 1.96875, + "learning_rate": 8.196594067632913e-06, + "loss": 1.0067, + "step": 4262 + }, + { + "epoch": 0.8521526198745658, + "grad_norm": 2.078125, + "learning_rate": 8.195783684743461e-06, + "loss": 0.9922, + "step": 4263 + }, + { + "epoch": 0.8523525149296619, + "grad_norm": 1.96875, + "learning_rate": 8.19497315989764e-06, + "loss": 0.9992, + "step": 4264 + }, + { + "epoch": 0.852552409984758, + "grad_norm": 2.109375, + "learning_rate": 8.19416249313145e-06, + "loss": 1.0969, + "step": 4265 + }, + { + "epoch": 0.8527523050398541, + "grad_norm": 1.953125, + "learning_rate": 8.193351684480904e-06, + "loss": 1.0002, + "step": 4266 + }, + { + "epoch": 0.8529522000949501, + "grad_norm": 1.953125, + "learning_rate": 8.192540733982017e-06, + "loss": 0.8837, + "step": 4267 + }, + { + "epoch": 0.8531520951500462, + "grad_norm": 2.171875, + "learning_rate": 8.191729641670813e-06, + "loss": 1.0419, + "step": 4268 + }, + { + "epoch": 0.8533519902051423, + "grad_norm": 2.0625, + "learning_rate": 8.190918407583319e-06, + "loss": 0.9997, + "step": 4269 + }, + { + "epoch": 0.8535518852602384, + "grad_norm": 2.109375, + "learning_rate": 8.190107031755569e-06, + "loss": 1.0607, + "step": 4270 + }, + { + "epoch": 0.8537517803153345, + "grad_norm": 1.9765625, + "learning_rate": 8.189295514223607e-06, + "loss": 1.0516, + "step": 4271 + }, + { + "epoch": 0.8539516753704305, + "grad_norm": 2.046875, + "learning_rate": 8.188483855023476e-06, + "loss": 0.9929, + "step": 4272 + }, + { + "epoch": 0.8541515704255266, + "grad_norm": 2.046875, + "learning_rate": 8.187672054191236e-06, + "loss": 1.0549, + "step": 4273 + }, + { + "epoch": 0.8543514654806227, + "grad_norm": 2.109375, + "learning_rate": 8.186860111762941e-06, + "loss": 1.1324, + "step": 4274 + }, + { + "epoch": 0.8545513605357188, + "grad_norm": 2.03125, + "learning_rate": 8.186048027774661e-06, + "loss": 1.1121, + "step": 4275 + }, + { + "epoch": 0.8547512555908148, + "grad_norm": 2.046875, + "learning_rate": 8.18523580226247e-06, + "loss": 0.9482, + "step": 4276 + }, + { + "epoch": 0.8549511506459109, + "grad_norm": 1.9609375, + "learning_rate": 8.184423435262442e-06, + "loss": 0.979, + "step": 4277 + }, + { + "epoch": 0.855151045701007, + "grad_norm": 2.078125, + "learning_rate": 8.183610926810667e-06, + "loss": 1.0637, + "step": 4278 + }, + { + "epoch": 0.8553509407561031, + "grad_norm": 2.140625, + "learning_rate": 8.182798276943236e-06, + "loss": 1.1295, + "step": 4279 + }, + { + "epoch": 0.8555508358111992, + "grad_norm": 2.03125, + "learning_rate": 8.181985485696242e-06, + "loss": 1.0336, + "step": 4280 + }, + { + "epoch": 0.8557507308662952, + "grad_norm": 2.140625, + "learning_rate": 8.181172553105793e-06, + "loss": 1.0707, + "step": 4281 + }, + { + "epoch": 0.8559506259213913, + "grad_norm": 2.078125, + "learning_rate": 8.180359479208002e-06, + "loss": 1.1241, + "step": 4282 + }, + { + "epoch": 0.8561505209764874, + "grad_norm": 2.046875, + "learning_rate": 8.179546264038982e-06, + "loss": 1.0029, + "step": 4283 + }, + { + "epoch": 0.8563504160315835, + "grad_norm": 2.078125, + "learning_rate": 8.178732907634854e-06, + "loss": 1.0113, + "step": 4284 + }, + { + "epoch": 0.8565503110866795, + "grad_norm": 2.296875, + "learning_rate": 8.177919410031752e-06, + "loss": 1.0303, + "step": 4285 + }, + { + "epoch": 0.8567502061417755, + "grad_norm": 2.0625, + "learning_rate": 8.177105771265808e-06, + "loss": 0.9785, + "step": 4286 + }, + { + "epoch": 0.8569501011968716, + "grad_norm": 2.078125, + "learning_rate": 8.176291991373164e-06, + "loss": 0.9925, + "step": 4287 + }, + { + "epoch": 0.8571499962519677, + "grad_norm": 2.125, + "learning_rate": 8.17547807038997e-06, + "loss": 1.0772, + "step": 4288 + }, + { + "epoch": 0.8573498913070637, + "grad_norm": 2.109375, + "learning_rate": 8.17466400835238e-06, + "loss": 1.0025, + "step": 4289 + }, + { + "epoch": 0.8575497863621598, + "grad_norm": 1.9921875, + "learning_rate": 8.173849805296553e-06, + "loss": 1.0628, + "step": 4290 + }, + { + "epoch": 0.8577496814172559, + "grad_norm": 2.1875, + "learning_rate": 8.173035461258658e-06, + "loss": 1.0494, + "step": 4291 + }, + { + "epoch": 0.857949576472352, + "grad_norm": 2.046875, + "learning_rate": 8.172220976274865e-06, + "loss": 1.0009, + "step": 4292 + }, + { + "epoch": 0.8581494715274481, + "grad_norm": 2.09375, + "learning_rate": 8.171406350381354e-06, + "loss": 1.0389, + "step": 4293 + }, + { + "epoch": 0.8583493665825441, + "grad_norm": 2.046875, + "learning_rate": 8.170591583614313e-06, + "loss": 0.9508, + "step": 4294 + }, + { + "epoch": 0.8585492616376402, + "grad_norm": 2.09375, + "learning_rate": 8.169776676009935e-06, + "loss": 1.0055, + "step": 4295 + }, + { + "epoch": 0.8587491566927363, + "grad_norm": 2.015625, + "learning_rate": 8.168961627604413e-06, + "loss": 0.9748, + "step": 4296 + }, + { + "epoch": 0.8589490517478324, + "grad_norm": 2.15625, + "learning_rate": 8.168146438433952e-06, + "loss": 1.0029, + "step": 4297 + }, + { + "epoch": 0.8591489468029284, + "grad_norm": 2.140625, + "learning_rate": 8.167331108534769e-06, + "loss": 1.1574, + "step": 4298 + }, + { + "epoch": 0.8593488418580245, + "grad_norm": 2.046875, + "learning_rate": 8.166515637943072e-06, + "loss": 0.9615, + "step": 4299 + }, + { + "epoch": 0.8595487369131206, + "grad_norm": 2.015625, + "learning_rate": 8.165700026695094e-06, + "loss": 1.0508, + "step": 4300 + }, + { + "epoch": 0.8597486319682167, + "grad_norm": 2.109375, + "learning_rate": 8.164884274827055e-06, + "loss": 1.0879, + "step": 4301 + }, + { + "epoch": 0.8599485270233128, + "grad_norm": 2.0625, + "learning_rate": 8.164068382375195e-06, + "loss": 0.9528, + "step": 4302 + }, + { + "epoch": 0.8601484220784088, + "grad_norm": 2.125, + "learning_rate": 8.163252349375755e-06, + "loss": 1.0222, + "step": 4303 + }, + { + "epoch": 0.8603483171335049, + "grad_norm": 2.03125, + "learning_rate": 8.162436175864985e-06, + "loss": 0.9972, + "step": 4304 + }, + { + "epoch": 0.860548212188601, + "grad_norm": 2.140625, + "learning_rate": 8.161619861879136e-06, + "loss": 1.0338, + "step": 4305 + }, + { + "epoch": 0.8607481072436971, + "grad_norm": 2.03125, + "learning_rate": 8.160803407454472e-06, + "loss": 1.0123, + "step": 4306 + }, + { + "epoch": 0.8609480022987931, + "grad_norm": 2.203125, + "learning_rate": 8.159986812627258e-06, + "loss": 1.0679, + "step": 4307 + }, + { + "epoch": 0.8611478973538892, + "grad_norm": 2.1875, + "learning_rate": 8.159170077433766e-06, + "loss": 1.1495, + "step": 4308 + }, + { + "epoch": 0.8613477924089853, + "grad_norm": 1.9453125, + "learning_rate": 8.158353201910279e-06, + "loss": 1.0023, + "step": 4309 + }, + { + "epoch": 0.8615476874640814, + "grad_norm": 2.15625, + "learning_rate": 8.157536186093079e-06, + "loss": 1.037, + "step": 4310 + }, + { + "epoch": 0.8617475825191774, + "grad_norm": 2.015625, + "learning_rate": 8.156719030018456e-06, + "loss": 1.0871, + "step": 4311 + }, + { + "epoch": 0.8619474775742735, + "grad_norm": 2.0, + "learning_rate": 8.155901733722714e-06, + "loss": 1.0027, + "step": 4312 + }, + { + "epoch": 0.8621473726293696, + "grad_norm": 2.078125, + "learning_rate": 8.155084297242152e-06, + "loss": 1.0906, + "step": 4313 + }, + { + "epoch": 0.8623472676844657, + "grad_norm": 2.125, + "learning_rate": 8.154266720613086e-06, + "loss": 1.0564, + "step": 4314 + }, + { + "epoch": 0.8625471627395618, + "grad_norm": 2.1875, + "learning_rate": 8.153449003871828e-06, + "loss": 1.1208, + "step": 4315 + }, + { + "epoch": 0.8627470577946578, + "grad_norm": 2.015625, + "learning_rate": 8.1526311470547e-06, + "loss": 0.9666, + "step": 4316 + }, + { + "epoch": 0.8629469528497539, + "grad_norm": 2.171875, + "learning_rate": 8.151813150198033e-06, + "loss": 1.0289, + "step": 4317 + }, + { + "epoch": 0.86314684790485, + "grad_norm": 2.15625, + "learning_rate": 8.150995013338165e-06, + "loss": 1.0856, + "step": 4318 + }, + { + "epoch": 0.8633467429599461, + "grad_norm": 2.1875, + "learning_rate": 8.150176736511432e-06, + "loss": 1.0077, + "step": 4319 + }, + { + "epoch": 0.8635466380150421, + "grad_norm": 2.015625, + "learning_rate": 8.149358319754188e-06, + "loss": 1.0339, + "step": 4320 + }, + { + "epoch": 0.8637465330701382, + "grad_norm": 2.09375, + "learning_rate": 8.148539763102782e-06, + "loss": 1.0569, + "step": 4321 + }, + { + "epoch": 0.8639464281252343, + "grad_norm": 2.125, + "learning_rate": 8.147721066593577e-06, + "loss": 1.0126, + "step": 4322 + }, + { + "epoch": 0.8641463231803304, + "grad_norm": 2.171875, + "learning_rate": 8.146902230262936e-06, + "loss": 1.0008, + "step": 4323 + }, + { + "epoch": 0.8643462182354265, + "grad_norm": 2.125, + "learning_rate": 8.146083254147237e-06, + "loss": 1.0532, + "step": 4324 + }, + { + "epoch": 0.8645461132905224, + "grad_norm": 2.15625, + "learning_rate": 8.145264138282853e-06, + "loss": 1.0592, + "step": 4325 + }, + { + "epoch": 0.8647460083456185, + "grad_norm": 1.9765625, + "learning_rate": 8.144444882706175e-06, + "loss": 0.9666, + "step": 4326 + }, + { + "epoch": 0.8649459034007146, + "grad_norm": 1.9921875, + "learning_rate": 8.14362548745359e-06, + "loss": 1.0502, + "step": 4327 + }, + { + "epoch": 0.8651457984558107, + "grad_norm": 2.203125, + "learning_rate": 8.142805952561495e-06, + "loss": 1.0719, + "step": 4328 + }, + { + "epoch": 0.8653456935109067, + "grad_norm": 2.140625, + "learning_rate": 8.141986278066296e-06, + "loss": 1.0825, + "step": 4329 + }, + { + "epoch": 0.8655455885660028, + "grad_norm": 2.171875, + "learning_rate": 8.141166464004404e-06, + "loss": 1.0434, + "step": 4330 + }, + { + "epoch": 0.8657454836210989, + "grad_norm": 2.140625, + "learning_rate": 8.140346510412232e-06, + "loss": 1.0814, + "step": 4331 + }, + { + "epoch": 0.865945378676195, + "grad_norm": 1.9921875, + "learning_rate": 8.139526417326203e-06, + "loss": 0.9666, + "step": 4332 + }, + { + "epoch": 0.866145273731291, + "grad_norm": 2.15625, + "learning_rate": 8.138706184782745e-06, + "loss": 1.0682, + "step": 4333 + }, + { + "epoch": 0.8663451687863871, + "grad_norm": 2.15625, + "learning_rate": 8.137885812818296e-06, + "loss": 1.0596, + "step": 4334 + }, + { + "epoch": 0.8665450638414832, + "grad_norm": 2.015625, + "learning_rate": 8.137065301469292e-06, + "loss": 0.9758, + "step": 4335 + }, + { + "epoch": 0.8667449588965793, + "grad_norm": 2.03125, + "learning_rate": 8.136244650772183e-06, + "loss": 1.0406, + "step": 4336 + }, + { + "epoch": 0.8669448539516754, + "grad_norm": 2.09375, + "learning_rate": 8.135423860763422e-06, + "loss": 1.0339, + "step": 4337 + }, + { + "epoch": 0.8671447490067714, + "grad_norm": 1.9609375, + "learning_rate": 8.134602931479468e-06, + "loss": 0.9688, + "step": 4338 + }, + { + "epoch": 0.8673446440618675, + "grad_norm": 1.90625, + "learning_rate": 8.133781862956787e-06, + "loss": 0.9505, + "step": 4339 + }, + { + "epoch": 0.8675445391169636, + "grad_norm": 2.09375, + "learning_rate": 8.132960655231849e-06, + "loss": 1.0011, + "step": 4340 + }, + { + "epoch": 0.8677444341720597, + "grad_norm": 2.140625, + "learning_rate": 8.132139308341134e-06, + "loss": 0.988, + "step": 4341 + }, + { + "epoch": 0.8679443292271557, + "grad_norm": 2.015625, + "learning_rate": 8.131317822321125e-06, + "loss": 1.0682, + "step": 4342 + }, + { + "epoch": 0.8681442242822518, + "grad_norm": 2.15625, + "learning_rate": 8.130496197208313e-06, + "loss": 1.0255, + "step": 4343 + }, + { + "epoch": 0.8683441193373479, + "grad_norm": 1.984375, + "learning_rate": 8.129674433039196e-06, + "loss": 1.0647, + "step": 4344 + }, + { + "epoch": 0.868544014392444, + "grad_norm": 1.9453125, + "learning_rate": 8.128852529850272e-06, + "loss": 0.9956, + "step": 4345 + }, + { + "epoch": 0.8687439094475401, + "grad_norm": 2.078125, + "learning_rate": 8.128030487678055e-06, + "loss": 1.0578, + "step": 4346 + }, + { + "epoch": 0.8689438045026361, + "grad_norm": 2.09375, + "learning_rate": 8.127208306559058e-06, + "loss": 1.0504, + "step": 4347 + }, + { + "epoch": 0.8691436995577322, + "grad_norm": 2.140625, + "learning_rate": 8.126385986529802e-06, + "loss": 1.038, + "step": 4348 + }, + { + "epoch": 0.8693435946128283, + "grad_norm": 2.125, + "learning_rate": 8.125563527626812e-06, + "loss": 0.996, + "step": 4349 + }, + { + "epoch": 0.8695434896679244, + "grad_norm": 2.1875, + "learning_rate": 8.124740929886625e-06, + "loss": 1.0464, + "step": 4350 + }, + { + "epoch": 0.8697433847230204, + "grad_norm": 2.078125, + "learning_rate": 8.12391819334578e-06, + "loss": 0.9245, + "step": 4351 + }, + { + "epoch": 0.8699432797781165, + "grad_norm": 2.125, + "learning_rate": 8.123095318040824e-06, + "loss": 1.0916, + "step": 4352 + }, + { + "epoch": 0.8701431748332126, + "grad_norm": 2.109375, + "learning_rate": 8.122272304008306e-06, + "loss": 1.0831, + "step": 4353 + }, + { + "epoch": 0.8703430698883087, + "grad_norm": 2.09375, + "learning_rate": 8.121449151284784e-06, + "loss": 0.9706, + "step": 4354 + }, + { + "epoch": 0.8705429649434047, + "grad_norm": 2.046875, + "learning_rate": 8.120625859906825e-06, + "loss": 1.0293, + "step": 4355 + }, + { + "epoch": 0.8707428599985008, + "grad_norm": 2.140625, + "learning_rate": 8.119802429911002e-06, + "loss": 0.9694, + "step": 4356 + }, + { + "epoch": 0.8709427550535969, + "grad_norm": 2.046875, + "learning_rate": 8.118978861333883e-06, + "loss": 0.9881, + "step": 4357 + }, + { + "epoch": 0.871142650108693, + "grad_norm": 2.03125, + "learning_rate": 8.11815515421206e-06, + "loss": 1.0694, + "step": 4358 + }, + { + "epoch": 0.8713425451637891, + "grad_norm": 2.046875, + "learning_rate": 8.117331308582116e-06, + "loss": 0.9553, + "step": 4359 + }, + { + "epoch": 0.871542440218885, + "grad_norm": 2.078125, + "learning_rate": 8.116507324480651e-06, + "loss": 1.0958, + "step": 4360 + }, + { + "epoch": 0.8717423352739812, + "grad_norm": 2.0625, + "learning_rate": 8.115683201944262e-06, + "loss": 1.1264, + "step": 4361 + }, + { + "epoch": 0.8719422303290773, + "grad_norm": 2.03125, + "learning_rate": 8.114858941009556e-06, + "loss": 0.9975, + "step": 4362 + }, + { + "epoch": 0.8721421253841734, + "grad_norm": 1.9453125, + "learning_rate": 8.114034541713152e-06, + "loss": 1.0681, + "step": 4363 + }, + { + "epoch": 0.8723420204392693, + "grad_norm": 2.25, + "learning_rate": 8.113210004091663e-06, + "loss": 1.083, + "step": 4364 + }, + { + "epoch": 0.8725419154943654, + "grad_norm": 2.046875, + "learning_rate": 8.11238532818172e-06, + "loss": 1.044, + "step": 4365 + }, + { + "epoch": 0.8727418105494615, + "grad_norm": 2.09375, + "learning_rate": 8.111560514019951e-06, + "loss": 0.9365, + "step": 4366 + }, + { + "epoch": 0.8729417056045576, + "grad_norm": 2.234375, + "learning_rate": 8.110735561643e-06, + "loss": 1.128, + "step": 4367 + }, + { + "epoch": 0.8731416006596536, + "grad_norm": 2.03125, + "learning_rate": 8.109910471087505e-06, + "loss": 1.0742, + "step": 4368 + }, + { + "epoch": 0.8733414957147497, + "grad_norm": 1.984375, + "learning_rate": 8.109085242390118e-06, + "loss": 1.0137, + "step": 4369 + }, + { + "epoch": 0.8735413907698458, + "grad_norm": 2.0625, + "learning_rate": 8.108259875587498e-06, + "loss": 1.023, + "step": 4370 + }, + { + "epoch": 0.8737412858249419, + "grad_norm": 2.03125, + "learning_rate": 8.107434370716307e-06, + "loss": 1.027, + "step": 4371 + }, + { + "epoch": 0.873941180880038, + "grad_norm": 2.0, + "learning_rate": 8.106608727813212e-06, + "loss": 0.9632, + "step": 4372 + }, + { + "epoch": 0.874141075935134, + "grad_norm": 2.09375, + "learning_rate": 8.105782946914891e-06, + "loss": 1.0354, + "step": 4373 + }, + { + "epoch": 0.8743409709902301, + "grad_norm": 2.109375, + "learning_rate": 8.10495702805802e-06, + "loss": 0.9849, + "step": 4374 + }, + { + "epoch": 0.8745408660453262, + "grad_norm": 2.4375, + "learning_rate": 8.104130971279292e-06, + "loss": 1.0414, + "step": 4375 + }, + { + "epoch": 0.8747407611004223, + "grad_norm": 2.15625, + "learning_rate": 8.103304776615399e-06, + "loss": 1.1185, + "step": 4376 + }, + { + "epoch": 0.8749406561555183, + "grad_norm": 2.171875, + "learning_rate": 8.102478444103037e-06, + "loss": 1.1865, + "step": 4377 + }, + { + "epoch": 0.8751405512106144, + "grad_norm": 2.1875, + "learning_rate": 8.101651973778914e-06, + "loss": 1.025, + "step": 4378 + }, + { + "epoch": 0.8753404462657105, + "grad_norm": 2.09375, + "learning_rate": 8.100825365679741e-06, + "loss": 0.9985, + "step": 4379 + }, + { + "epoch": 0.8755403413208066, + "grad_norm": 2.15625, + "learning_rate": 8.099998619842238e-06, + "loss": 1.0752, + "step": 4380 + }, + { + "epoch": 0.8757402363759027, + "grad_norm": 2.125, + "learning_rate": 8.099171736303126e-06, + "loss": 0.985, + "step": 4381 + }, + { + "epoch": 0.8759401314309987, + "grad_norm": 1.9453125, + "learning_rate": 8.098344715099136e-06, + "loss": 1.0516, + "step": 4382 + }, + { + "epoch": 0.8761400264860948, + "grad_norm": 2.109375, + "learning_rate": 8.097517556267007e-06, + "loss": 1.0916, + "step": 4383 + }, + { + "epoch": 0.8763399215411909, + "grad_norm": 2.15625, + "learning_rate": 8.096690259843478e-06, + "loss": 0.8999, + "step": 4384 + }, + { + "epoch": 0.876539816596287, + "grad_norm": 2.21875, + "learning_rate": 8.095862825865297e-06, + "loss": 1.0837, + "step": 4385 + }, + { + "epoch": 0.876739711651383, + "grad_norm": 2.03125, + "learning_rate": 8.09503525436922e-06, + "loss": 0.9613, + "step": 4386 + }, + { + "epoch": 0.8769396067064791, + "grad_norm": 2.078125, + "learning_rate": 8.09420754539201e-06, + "loss": 1.0288, + "step": 4387 + }, + { + "epoch": 0.8771395017615752, + "grad_norm": 2.0625, + "learning_rate": 8.09337969897043e-06, + "loss": 0.987, + "step": 4388 + }, + { + "epoch": 0.8773393968166713, + "grad_norm": 2.078125, + "learning_rate": 8.092551715141254e-06, + "loss": 1.0139, + "step": 4389 + }, + { + "epoch": 0.8775392918717673, + "grad_norm": 1.9609375, + "learning_rate": 8.091723593941261e-06, + "loss": 1.0515, + "step": 4390 + }, + { + "epoch": 0.8777391869268634, + "grad_norm": 2.015625, + "learning_rate": 8.090895335407238e-06, + "loss": 1.0595, + "step": 4391 + }, + { + "epoch": 0.8779390819819595, + "grad_norm": 2.03125, + "learning_rate": 8.090066939575972e-06, + "loss": 1.1062, + "step": 4392 + }, + { + "epoch": 0.8781389770370556, + "grad_norm": 2.15625, + "learning_rate": 8.089238406484263e-06, + "loss": 0.975, + "step": 4393 + }, + { + "epoch": 0.8783388720921517, + "grad_norm": 1.9296875, + "learning_rate": 8.088409736168915e-06, + "loss": 0.9842, + "step": 4394 + }, + { + "epoch": 0.8785387671472477, + "grad_norm": 2.078125, + "learning_rate": 8.087580928666736e-06, + "loss": 1.084, + "step": 4395 + }, + { + "epoch": 0.8787386622023438, + "grad_norm": 2.015625, + "learning_rate": 8.08675198401454e-06, + "loss": 0.9676, + "step": 4396 + }, + { + "epoch": 0.8789385572574399, + "grad_norm": 1.9296875, + "learning_rate": 8.085922902249153e-06, + "loss": 0.9899, + "step": 4397 + }, + { + "epoch": 0.879138452312536, + "grad_norm": 2.03125, + "learning_rate": 8.085093683407399e-06, + "loss": 1.0496, + "step": 4398 + }, + { + "epoch": 0.879338347367632, + "grad_norm": 2.03125, + "learning_rate": 8.084264327526112e-06, + "loss": 1.014, + "step": 4399 + }, + { + "epoch": 0.879538242422728, + "grad_norm": 2.234375, + "learning_rate": 8.083434834642133e-06, + "loss": 1.085, + "step": 4400 + }, + { + "epoch": 0.8797381374778241, + "grad_norm": 2.109375, + "learning_rate": 8.08260520479231e-06, + "loss": 1.0846, + "step": 4401 + }, + { + "epoch": 0.8799380325329202, + "grad_norm": 2.0625, + "learning_rate": 8.081775438013493e-06, + "loss": 1.0657, + "step": 4402 + }, + { + "epoch": 0.8801379275880163, + "grad_norm": 1.953125, + "learning_rate": 8.08094553434254e-06, + "loss": 0.9896, + "step": 4403 + }, + { + "epoch": 0.8803378226431123, + "grad_norm": 2.03125, + "learning_rate": 8.080115493816314e-06, + "loss": 1.0088, + "step": 4404 + }, + { + "epoch": 0.8805377176982084, + "grad_norm": 2.203125, + "learning_rate": 8.079285316471688e-06, + "loss": 1.0624, + "step": 4405 + }, + { + "epoch": 0.8807376127533045, + "grad_norm": 2.0625, + "learning_rate": 8.078455002345538e-06, + "loss": 0.9994, + "step": 4406 + }, + { + "epoch": 0.8809375078084006, + "grad_norm": 2.03125, + "learning_rate": 8.077624551474744e-06, + "loss": 1.0085, + "step": 4407 + }, + { + "epoch": 0.8811374028634966, + "grad_norm": 1.9921875, + "learning_rate": 8.076793963896197e-06, + "loss": 0.9928, + "step": 4408 + }, + { + "epoch": 0.8813372979185927, + "grad_norm": 2.046875, + "learning_rate": 8.07596323964679e-06, + "loss": 1.0614, + "step": 4409 + }, + { + "epoch": 0.8815371929736888, + "grad_norm": 2.109375, + "learning_rate": 8.075132378763424e-06, + "loss": 1.0672, + "step": 4410 + }, + { + "epoch": 0.8817370880287849, + "grad_norm": 2.0625, + "learning_rate": 8.074301381283007e-06, + "loss": 0.9155, + "step": 4411 + }, + { + "epoch": 0.8819369830838809, + "grad_norm": 2.046875, + "learning_rate": 8.073470247242452e-06, + "loss": 0.9595, + "step": 4412 + }, + { + "epoch": 0.882136878138977, + "grad_norm": 2.046875, + "learning_rate": 8.072638976678675e-06, + "loss": 0.9704, + "step": 4413 + }, + { + "epoch": 0.8823367731940731, + "grad_norm": 2.015625, + "learning_rate": 8.071807569628602e-06, + "loss": 1.0088, + "step": 4414 + }, + { + "epoch": 0.8825366682491692, + "grad_norm": 1.921875, + "learning_rate": 8.070976026129166e-06, + "loss": 0.9518, + "step": 4415 + }, + { + "epoch": 0.8827365633042653, + "grad_norm": 2.109375, + "learning_rate": 8.070144346217305e-06, + "loss": 1.0652, + "step": 4416 + }, + { + "epoch": 0.8829364583593613, + "grad_norm": 2.0625, + "learning_rate": 8.069312529929958e-06, + "loss": 0.9716, + "step": 4417 + }, + { + "epoch": 0.8831363534144574, + "grad_norm": 2.125, + "learning_rate": 8.068480577304076e-06, + "loss": 1.0798, + "step": 4418 + }, + { + "epoch": 0.8833362484695535, + "grad_norm": 2.265625, + "learning_rate": 8.067648488376616e-06, + "loss": 0.9723, + "step": 4419 + }, + { + "epoch": 0.8835361435246496, + "grad_norm": 1.9375, + "learning_rate": 8.066816263184535e-06, + "loss": 0.9742, + "step": 4420 + }, + { + "epoch": 0.8837360385797456, + "grad_norm": 2.09375, + "learning_rate": 8.065983901764807e-06, + "loss": 0.9657, + "step": 4421 + }, + { + "epoch": 0.8839359336348417, + "grad_norm": 2.03125, + "learning_rate": 8.0651514041544e-06, + "loss": 1.02, + "step": 4422 + }, + { + "epoch": 0.8841358286899378, + "grad_norm": 1.984375, + "learning_rate": 8.064318770390293e-06, + "loss": 1.0419, + "step": 4423 + }, + { + "epoch": 0.8843357237450339, + "grad_norm": 2.078125, + "learning_rate": 8.063486000509475e-06, + "loss": 1.085, + "step": 4424 + }, + { + "epoch": 0.88453561880013, + "grad_norm": 2.046875, + "learning_rate": 8.062653094548936e-06, + "loss": 1.083, + "step": 4425 + }, + { + "epoch": 0.884735513855226, + "grad_norm": 2.03125, + "learning_rate": 8.061820052545675e-06, + "loss": 1.1178, + "step": 4426 + }, + { + "epoch": 0.8849354089103221, + "grad_norm": 2.078125, + "learning_rate": 8.060986874536691e-06, + "loss": 1.0527, + "step": 4427 + }, + { + "epoch": 0.8851353039654182, + "grad_norm": 2.359375, + "learning_rate": 8.060153560559e-06, + "loss": 1.0529, + "step": 4428 + }, + { + "epoch": 0.8853351990205143, + "grad_norm": 2.09375, + "learning_rate": 8.059320110649614e-06, + "loss": 0.948, + "step": 4429 + }, + { + "epoch": 0.8855350940756103, + "grad_norm": 2.078125, + "learning_rate": 8.058486524845554e-06, + "loss": 1.0805, + "step": 4430 + }, + { + "epoch": 0.8857349891307064, + "grad_norm": 2.03125, + "learning_rate": 8.05765280318385e-06, + "loss": 1.0965, + "step": 4431 + }, + { + "epoch": 0.8859348841858025, + "grad_norm": 2.109375, + "learning_rate": 8.056818945701537e-06, + "loss": 1.0667, + "step": 4432 + }, + { + "epoch": 0.8861347792408986, + "grad_norm": 2.03125, + "learning_rate": 8.05598495243565e-06, + "loss": 1.0619, + "step": 4433 + }, + { + "epoch": 0.8863346742959946, + "grad_norm": 2.0, + "learning_rate": 8.055150823423239e-06, + "loss": 0.8879, + "step": 4434 + }, + { + "epoch": 0.8865345693510907, + "grad_norm": 2.140625, + "learning_rate": 8.054316558701355e-06, + "loss": 1.0322, + "step": 4435 + }, + { + "epoch": 0.8867344644061868, + "grad_norm": 2.1875, + "learning_rate": 8.053482158307055e-06, + "loss": 1.1139, + "step": 4436 + }, + { + "epoch": 0.8869343594612829, + "grad_norm": 1.9921875, + "learning_rate": 8.052647622277405e-06, + "loss": 1.0347, + "step": 4437 + }, + { + "epoch": 0.887134254516379, + "grad_norm": 2.125, + "learning_rate": 8.051812950649474e-06, + "loss": 1.0791, + "step": 4438 + }, + { + "epoch": 0.8873341495714749, + "grad_norm": 2.078125, + "learning_rate": 8.050978143460335e-06, + "loss": 0.9837, + "step": 4439 + }, + { + "epoch": 0.887534044626571, + "grad_norm": 2.015625, + "learning_rate": 8.050143200747073e-06, + "loss": 0.9943, + "step": 4440 + }, + { + "epoch": 0.8877339396816671, + "grad_norm": 2.28125, + "learning_rate": 8.049308122546776e-06, + "loss": 1.16, + "step": 4441 + }, + { + "epoch": 0.8879338347367632, + "grad_norm": 2.015625, + "learning_rate": 8.04847290889654e-06, + "loss": 1.0046, + "step": 4442 + }, + { + "epoch": 0.8881337297918592, + "grad_norm": 2.046875, + "learning_rate": 8.047637559833464e-06, + "loss": 1.0639, + "step": 4443 + }, + { + "epoch": 0.8883336248469553, + "grad_norm": 2.03125, + "learning_rate": 8.04680207539465e-06, + "loss": 1.0163, + "step": 4444 + }, + { + "epoch": 0.8885335199020514, + "grad_norm": 1.984375, + "learning_rate": 8.045966455617214e-06, + "loss": 1.0285, + "step": 4445 + }, + { + "epoch": 0.8887334149571475, + "grad_norm": 2.0625, + "learning_rate": 8.045130700538273e-06, + "loss": 1.0341, + "step": 4446 + }, + { + "epoch": 0.8889333100122436, + "grad_norm": 2.1875, + "learning_rate": 8.044294810194953e-06, + "loss": 0.9628, + "step": 4447 + }, + { + "epoch": 0.8891332050673396, + "grad_norm": 2.21875, + "learning_rate": 8.04345878462438e-06, + "loss": 1.0712, + "step": 4448 + }, + { + "epoch": 0.8893331001224357, + "grad_norm": 2.046875, + "learning_rate": 8.042622623863694e-06, + "loss": 1.027, + "step": 4449 + }, + { + "epoch": 0.8895329951775318, + "grad_norm": 2.0625, + "learning_rate": 8.041786327950037e-06, + "loss": 1.0139, + "step": 4450 + }, + { + "epoch": 0.8897328902326279, + "grad_norm": 2.125, + "learning_rate": 8.040949896920556e-06, + "loss": 1.0845, + "step": 4451 + }, + { + "epoch": 0.8899327852877239, + "grad_norm": 2.171875, + "learning_rate": 8.040113330812404e-06, + "loss": 1.0785, + "step": 4452 + }, + { + "epoch": 0.89013268034282, + "grad_norm": 2.09375, + "learning_rate": 8.039276629662745e-06, + "loss": 1.0703, + "step": 4453 + }, + { + "epoch": 0.8903325753979161, + "grad_norm": 2.078125, + "learning_rate": 8.038439793508741e-06, + "loss": 1.0732, + "step": 4454 + }, + { + "epoch": 0.8905324704530122, + "grad_norm": 2.1875, + "learning_rate": 8.037602822387566e-06, + "loss": 1.0146, + "step": 4455 + }, + { + "epoch": 0.8907323655081082, + "grad_norm": 2.0625, + "learning_rate": 8.036765716336399e-06, + "loss": 0.9116, + "step": 4456 + }, + { + "epoch": 0.8909322605632043, + "grad_norm": 2.03125, + "learning_rate": 8.035928475392422e-06, + "loss": 0.9984, + "step": 4457 + }, + { + "epoch": 0.8911321556183004, + "grad_norm": 2.03125, + "learning_rate": 8.035091099592827e-06, + "loss": 1.0102, + "step": 4458 + }, + { + "epoch": 0.8913320506733965, + "grad_norm": 2.03125, + "learning_rate": 8.034253588974809e-06, + "loss": 0.8845, + "step": 4459 + }, + { + "epoch": 0.8915319457284926, + "grad_norm": 2.390625, + "learning_rate": 8.033415943575572e-06, + "loss": 0.9525, + "step": 4460 + }, + { + "epoch": 0.8917318407835886, + "grad_norm": 2.078125, + "learning_rate": 8.03257816343232e-06, + "loss": 1.005, + "step": 4461 + }, + { + "epoch": 0.8919317358386847, + "grad_norm": 2.109375, + "learning_rate": 8.031740248582272e-06, + "loss": 1.0035, + "step": 4462 + }, + { + "epoch": 0.8921316308937808, + "grad_norm": 2.125, + "learning_rate": 8.030902199062646e-06, + "loss": 1.0726, + "step": 4463 + }, + { + "epoch": 0.8923315259488769, + "grad_norm": 2.0, + "learning_rate": 8.030064014910668e-06, + "loss": 1.0157, + "step": 4464 + }, + { + "epoch": 0.8925314210039729, + "grad_norm": 2.0625, + "learning_rate": 8.02922569616357e-06, + "loss": 1.0501, + "step": 4465 + }, + { + "epoch": 0.892731316059069, + "grad_norm": 2.1875, + "learning_rate": 8.028387242858588e-06, + "loss": 0.9781, + "step": 4466 + }, + { + "epoch": 0.8929312111141651, + "grad_norm": 2.09375, + "learning_rate": 8.02754865503297e-06, + "loss": 1.0365, + "step": 4467 + }, + { + "epoch": 0.8931311061692612, + "grad_norm": 2.0625, + "learning_rate": 8.026709932723964e-06, + "loss": 0.9437, + "step": 4468 + }, + { + "epoch": 0.8933310012243572, + "grad_norm": 1.9921875, + "learning_rate": 8.025871075968828e-06, + "loss": 0.9995, + "step": 4469 + }, + { + "epoch": 0.8935308962794533, + "grad_norm": 2.0625, + "learning_rate": 8.02503208480482e-06, + "loss": 1.0265, + "step": 4470 + }, + { + "epoch": 0.8937307913345494, + "grad_norm": 2.203125, + "learning_rate": 8.024192959269209e-06, + "loss": 1.1485, + "step": 4471 + }, + { + "epoch": 0.8939306863896455, + "grad_norm": 1.96875, + "learning_rate": 8.02335369939927e-06, + "loss": 1.0292, + "step": 4472 + }, + { + "epoch": 0.8941305814447416, + "grad_norm": 2.03125, + "learning_rate": 8.022514305232283e-06, + "loss": 0.9181, + "step": 4473 + }, + { + "epoch": 0.8943304764998375, + "grad_norm": 2.140625, + "learning_rate": 8.021674776805534e-06, + "loss": 1.0503, + "step": 4474 + }, + { + "epoch": 0.8945303715549336, + "grad_norm": 2.1875, + "learning_rate": 8.020835114156313e-06, + "loss": 1.0704, + "step": 4475 + }, + { + "epoch": 0.8947302666100297, + "grad_norm": 2.078125, + "learning_rate": 8.019995317321921e-06, + "loss": 1.0324, + "step": 4476 + }, + { + "epoch": 0.8949301616651258, + "grad_norm": 2.078125, + "learning_rate": 8.019155386339657e-06, + "loss": 0.9435, + "step": 4477 + }, + { + "epoch": 0.8951300567202218, + "grad_norm": 2.203125, + "learning_rate": 8.018315321246834e-06, + "loss": 1.0742, + "step": 4478 + }, + { + "epoch": 0.8953299517753179, + "grad_norm": 2.140625, + "learning_rate": 8.017475122080767e-06, + "loss": 1.0427, + "step": 4479 + }, + { + "epoch": 0.895529846830414, + "grad_norm": 2.03125, + "learning_rate": 8.016634788878779e-06, + "loss": 1.0984, + "step": 4480 + }, + { + "epoch": 0.8957297418855101, + "grad_norm": 2.046875, + "learning_rate": 8.015794321678194e-06, + "loss": 0.9826, + "step": 4481 + }, + { + "epoch": 0.8959296369406062, + "grad_norm": 2.046875, + "learning_rate": 8.014953720516347e-06, + "loss": 1.0419, + "step": 4482 + }, + { + "epoch": 0.8961295319957022, + "grad_norm": 2.0, + "learning_rate": 8.014112985430578e-06, + "loss": 0.9784, + "step": 4483 + }, + { + "epoch": 0.8963294270507983, + "grad_norm": 2.03125, + "learning_rate": 8.013272116458233e-06, + "loss": 1.0174, + "step": 4484 + }, + { + "epoch": 0.8965293221058944, + "grad_norm": 2.09375, + "learning_rate": 8.012431113636662e-06, + "loss": 1.0536, + "step": 4485 + }, + { + "epoch": 0.8967292171609905, + "grad_norm": 2.171875, + "learning_rate": 8.011589977003222e-06, + "loss": 1.0687, + "step": 4486 + }, + { + "epoch": 0.8969291122160865, + "grad_norm": 2.1875, + "learning_rate": 8.01074870659528e-06, + "loss": 1.0259, + "step": 4487 + }, + { + "epoch": 0.8971290072711826, + "grad_norm": 2.140625, + "learning_rate": 8.009907302450199e-06, + "loss": 1.0712, + "step": 4488 + }, + { + "epoch": 0.8973289023262787, + "grad_norm": 2.0, + "learning_rate": 8.009065764605358e-06, + "loss": 1.0412, + "step": 4489 + }, + { + "epoch": 0.8975287973813748, + "grad_norm": 2.0625, + "learning_rate": 8.008224093098136e-06, + "loss": 1.1454, + "step": 4490 + }, + { + "epoch": 0.8977286924364708, + "grad_norm": 2.3125, + "learning_rate": 8.007382287965921e-06, + "loss": 1.054, + "step": 4491 + }, + { + "epoch": 0.8979285874915669, + "grad_norm": 2.296875, + "learning_rate": 8.006540349246107e-06, + "loss": 0.8789, + "step": 4492 + }, + { + "epoch": 0.898128482546663, + "grad_norm": 2.109375, + "learning_rate": 8.005698276976092e-06, + "loss": 1.0072, + "step": 4493 + }, + { + "epoch": 0.8983283776017591, + "grad_norm": 2.125, + "learning_rate": 8.004856071193278e-06, + "loss": 1.0211, + "step": 4494 + }, + { + "epoch": 0.8985282726568552, + "grad_norm": 2.21875, + "learning_rate": 8.004013731935082e-06, + "loss": 1.0182, + "step": 4495 + }, + { + "epoch": 0.8987281677119512, + "grad_norm": 2.140625, + "learning_rate": 8.003171259238915e-06, + "loss": 1.0249, + "step": 4496 + }, + { + "epoch": 0.8989280627670473, + "grad_norm": 2.046875, + "learning_rate": 8.002328653142203e-06, + "loss": 1.0605, + "step": 4497 + }, + { + "epoch": 0.8991279578221434, + "grad_norm": 2.0625, + "learning_rate": 8.00148591368237e-06, + "loss": 0.9666, + "step": 4498 + }, + { + "epoch": 0.8993278528772395, + "grad_norm": 2.0625, + "learning_rate": 8.000643040896855e-06, + "loss": 1.0767, + "step": 4499 + }, + { + "epoch": 0.8995277479323355, + "grad_norm": 2.078125, + "learning_rate": 7.999800034823097e-06, + "loss": 1.0541, + "step": 4500 + }, + { + "epoch": 0.8995277479323355, + "eval_loss": 0.9116131067276001, + "eval_runtime": 594.896, + "eval_samples_per_second": 3.594, + "eval_steps_per_second": 3.594, + "step": 4500 + }, + { + "epoch": 0.8997276429874316, + "grad_norm": 2.125, + "learning_rate": 7.998956895498542e-06, + "loss": 1.1048, + "step": 4501 + }, + { + "epoch": 0.8999275380425277, + "grad_norm": 1.9765625, + "learning_rate": 7.99811362296064e-06, + "loss": 0.9938, + "step": 4502 + }, + { + "epoch": 0.9001274330976238, + "grad_norm": 2.0625, + "learning_rate": 7.997270217246853e-06, + "loss": 1.1029, + "step": 4503 + }, + { + "epoch": 0.9003273281527199, + "grad_norm": 2.171875, + "learning_rate": 7.996426678394642e-06, + "loss": 1.1396, + "step": 4504 + }, + { + "epoch": 0.9005272232078159, + "grad_norm": 1.9453125, + "learning_rate": 7.99558300644148e-06, + "loss": 0.9531, + "step": 4505 + }, + { + "epoch": 0.900727118262912, + "grad_norm": 2.03125, + "learning_rate": 7.994739201424836e-06, + "loss": 1.0054, + "step": 4506 + }, + { + "epoch": 0.9009270133180081, + "grad_norm": 2.203125, + "learning_rate": 7.993895263382201e-06, + "loss": 1.0793, + "step": 4507 + }, + { + "epoch": 0.9011269083731042, + "grad_norm": 2.09375, + "learning_rate": 7.993051192351056e-06, + "loss": 1.0196, + "step": 4508 + }, + { + "epoch": 0.9013268034282002, + "grad_norm": 1.984375, + "learning_rate": 7.992206988368898e-06, + "loss": 0.954, + "step": 4509 + }, + { + "epoch": 0.9015266984832963, + "grad_norm": 2.0, + "learning_rate": 7.991362651473225e-06, + "loss": 1.0146, + "step": 4510 + }, + { + "epoch": 0.9017265935383924, + "grad_norm": 2.109375, + "learning_rate": 7.990518181701542e-06, + "loss": 1.0859, + "step": 4511 + }, + { + "epoch": 0.9019264885934885, + "grad_norm": 2.109375, + "learning_rate": 7.989673579091361e-06, + "loss": 1.0375, + "step": 4512 + }, + { + "epoch": 0.9021263836485844, + "grad_norm": 2.171875, + "learning_rate": 7.988828843680198e-06, + "loss": 1.0987, + "step": 4513 + }, + { + "epoch": 0.9023262787036805, + "grad_norm": 2.03125, + "learning_rate": 7.987983975505579e-06, + "loss": 0.984, + "step": 4514 + }, + { + "epoch": 0.9025261737587766, + "grad_norm": 2.15625, + "learning_rate": 7.98713897460503e-06, + "loss": 1.049, + "step": 4515 + }, + { + "epoch": 0.9027260688138727, + "grad_norm": 2.0625, + "learning_rate": 7.986293841016087e-06, + "loss": 1.0108, + "step": 4516 + }, + { + "epoch": 0.9029259638689688, + "grad_norm": 2.046875, + "learning_rate": 7.98544857477629e-06, + "loss": 1.0748, + "step": 4517 + }, + { + "epoch": 0.9031258589240648, + "grad_norm": 2.078125, + "learning_rate": 7.984603175923186e-06, + "loss": 1.043, + "step": 4518 + }, + { + "epoch": 0.9033257539791609, + "grad_norm": 2.203125, + "learning_rate": 7.983757644494327e-06, + "loss": 1.0418, + "step": 4519 + }, + { + "epoch": 0.903525649034257, + "grad_norm": 2.09375, + "learning_rate": 7.982911980527276e-06, + "loss": 1.1628, + "step": 4520 + }, + { + "epoch": 0.9037255440893531, + "grad_norm": 1.9140625, + "learning_rate": 7.98206618405959e-06, + "loss": 0.9724, + "step": 4521 + }, + { + "epoch": 0.9039254391444491, + "grad_norm": 2.078125, + "learning_rate": 7.981220255128842e-06, + "loss": 1.0297, + "step": 4522 + }, + { + "epoch": 0.9041253341995452, + "grad_norm": 2.078125, + "learning_rate": 7.98037419377261e-06, + "loss": 0.9934, + "step": 4523 + }, + { + "epoch": 0.9043252292546413, + "grad_norm": 2.171875, + "learning_rate": 7.979528000028474e-06, + "loss": 1.0693, + "step": 4524 + }, + { + "epoch": 0.9045251243097374, + "grad_norm": 2.109375, + "learning_rate": 7.978681673934023e-06, + "loss": 1.015, + "step": 4525 + }, + { + "epoch": 0.9047250193648335, + "grad_norm": 1.96875, + "learning_rate": 7.97783521552685e-06, + "loss": 0.9598, + "step": 4526 + }, + { + "epoch": 0.9049249144199295, + "grad_norm": 2.015625, + "learning_rate": 7.976988624844556e-06, + "loss": 1.0851, + "step": 4527 + }, + { + "epoch": 0.9051248094750256, + "grad_norm": 1.9921875, + "learning_rate": 7.976141901924743e-06, + "loss": 0.9594, + "step": 4528 + }, + { + "epoch": 0.9053247045301217, + "grad_norm": 2.046875, + "learning_rate": 7.975295046805026e-06, + "loss": 0.9813, + "step": 4529 + }, + { + "epoch": 0.9055245995852178, + "grad_norm": 2.0625, + "learning_rate": 7.974448059523018e-06, + "loss": 0.9971, + "step": 4530 + }, + { + "epoch": 0.9057244946403138, + "grad_norm": 2.1875, + "learning_rate": 7.97360094011635e-06, + "loss": 1.1101, + "step": 4531 + }, + { + "epoch": 0.9059243896954099, + "grad_norm": 2.015625, + "learning_rate": 7.972753688622644e-06, + "loss": 1.0363, + "step": 4532 + }, + { + "epoch": 0.906124284750506, + "grad_norm": 2.109375, + "learning_rate": 7.971906305079535e-06, + "loss": 1.0802, + "step": 4533 + }, + { + "epoch": 0.9063241798056021, + "grad_norm": 2.015625, + "learning_rate": 7.971058789524666e-06, + "loss": 1.0146, + "step": 4534 + }, + { + "epoch": 0.9065240748606981, + "grad_norm": 2.0, + "learning_rate": 7.970211141995682e-06, + "loss": 0.9591, + "step": 4535 + }, + { + "epoch": 0.9067239699157942, + "grad_norm": 2.109375, + "learning_rate": 7.969363362530238e-06, + "loss": 1.0927, + "step": 4536 + }, + { + "epoch": 0.9069238649708903, + "grad_norm": 2.0, + "learning_rate": 7.96851545116599e-06, + "loss": 0.9781, + "step": 4537 + }, + { + "epoch": 0.9071237600259864, + "grad_norm": 2.0, + "learning_rate": 7.967667407940603e-06, + "loss": 0.9595, + "step": 4538 + }, + { + "epoch": 0.9073236550810825, + "grad_norm": 2.125, + "learning_rate": 7.966819232891744e-06, + "loss": 1.0559, + "step": 4539 + }, + { + "epoch": 0.9075235501361785, + "grad_norm": 2.125, + "learning_rate": 7.965970926057095e-06, + "loss": 1.0765, + "step": 4540 + }, + { + "epoch": 0.9077234451912746, + "grad_norm": 2.1875, + "learning_rate": 7.965122487474333e-06, + "loss": 1.1197, + "step": 4541 + }, + { + "epoch": 0.9079233402463707, + "grad_norm": 2.171875, + "learning_rate": 7.964273917181147e-06, + "loss": 1.0798, + "step": 4542 + }, + { + "epoch": 0.9081232353014668, + "grad_norm": 2.28125, + "learning_rate": 7.96342521521523e-06, + "loss": 0.9719, + "step": 4543 + }, + { + "epoch": 0.9083231303565628, + "grad_norm": 2.140625, + "learning_rate": 7.962576381614282e-06, + "loss": 1.0806, + "step": 4544 + }, + { + "epoch": 0.9085230254116589, + "grad_norm": 2.078125, + "learning_rate": 7.961727416416007e-06, + "loss": 1.099, + "step": 4545 + }, + { + "epoch": 0.908722920466755, + "grad_norm": 2.078125, + "learning_rate": 7.960878319658117e-06, + "loss": 1.1292, + "step": 4546 + }, + { + "epoch": 0.9089228155218511, + "grad_norm": 2.046875, + "learning_rate": 7.960029091378327e-06, + "loss": 0.9665, + "step": 4547 + }, + { + "epoch": 0.9091227105769472, + "grad_norm": 2.1875, + "learning_rate": 7.959179731614363e-06, + "loss": 1.1101, + "step": 4548 + }, + { + "epoch": 0.9093226056320431, + "grad_norm": 2.0625, + "learning_rate": 7.95833024040395e-06, + "loss": 1.0884, + "step": 4549 + }, + { + "epoch": 0.9095225006871392, + "grad_norm": 2.015625, + "learning_rate": 7.957480617784823e-06, + "loss": 0.9976, + "step": 4550 + }, + { + "epoch": 0.9097223957422353, + "grad_norm": 2.140625, + "learning_rate": 7.956630863794723e-06, + "loss": 1.0861, + "step": 4551 + }, + { + "epoch": 0.9099222907973314, + "grad_norm": 2.03125, + "learning_rate": 7.955780978471396e-06, + "loss": 1.0039, + "step": 4552 + }, + { + "epoch": 0.9101221858524274, + "grad_norm": 2.15625, + "learning_rate": 7.954930961852594e-06, + "loss": 1.0688, + "step": 4553 + }, + { + "epoch": 0.9103220809075235, + "grad_norm": 2.15625, + "learning_rate": 7.954080813976075e-06, + "loss": 1.0032, + "step": 4554 + }, + { + "epoch": 0.9105219759626196, + "grad_norm": 2.140625, + "learning_rate": 7.953230534879601e-06, + "loss": 1.0197, + "step": 4555 + }, + { + "epoch": 0.9107218710177157, + "grad_norm": 2.140625, + "learning_rate": 7.952380124600943e-06, + "loss": 0.9938, + "step": 4556 + }, + { + "epoch": 0.9109217660728117, + "grad_norm": 2.09375, + "learning_rate": 7.951529583177874e-06, + "loss": 1.0392, + "step": 4557 + }, + { + "epoch": 0.9111216611279078, + "grad_norm": 2.0, + "learning_rate": 7.950678910648176e-06, + "loss": 1.0021, + "step": 4558 + }, + { + "epoch": 0.9113215561830039, + "grad_norm": 2.046875, + "learning_rate": 7.949828107049638e-06, + "loss": 1.0592, + "step": 4559 + }, + { + "epoch": 0.9115214512381, + "grad_norm": 2.046875, + "learning_rate": 7.948977172420046e-06, + "loss": 0.975, + "step": 4560 + }, + { + "epoch": 0.9117213462931961, + "grad_norm": 2.0, + "learning_rate": 7.948126106797208e-06, + "loss": 1.0879, + "step": 4561 + }, + { + "epoch": 0.9119212413482921, + "grad_norm": 2.015625, + "learning_rate": 7.94727491021892e-06, + "loss": 0.9099, + "step": 4562 + }, + { + "epoch": 0.9121211364033882, + "grad_norm": 2.34375, + "learning_rate": 7.946423582722998e-06, + "loss": 1.1033, + "step": 4563 + }, + { + "epoch": 0.9123210314584843, + "grad_norm": 2.03125, + "learning_rate": 7.945572124347253e-06, + "loss": 0.9591, + "step": 4564 + }, + { + "epoch": 0.9125209265135804, + "grad_norm": 2.0, + "learning_rate": 7.944720535129509e-06, + "loss": 1.0181, + "step": 4565 + }, + { + "epoch": 0.9127208215686764, + "grad_norm": 2.140625, + "learning_rate": 7.943868815107594e-06, + "loss": 1.0219, + "step": 4566 + }, + { + "epoch": 0.9129207166237725, + "grad_norm": 2.015625, + "learning_rate": 7.94301696431934e-06, + "loss": 1.0529, + "step": 4567 + }, + { + "epoch": 0.9131206116788686, + "grad_norm": 2.28125, + "learning_rate": 7.942164982802588e-06, + "loss": 1.0201, + "step": 4568 + }, + { + "epoch": 0.9133205067339647, + "grad_norm": 2.140625, + "learning_rate": 7.941312870595179e-06, + "loss": 0.9623, + "step": 4569 + }, + { + "epoch": 0.9135204017890608, + "grad_norm": 2.078125, + "learning_rate": 7.940460627734969e-06, + "loss": 1.0634, + "step": 4570 + }, + { + "epoch": 0.9137202968441568, + "grad_norm": 2.140625, + "learning_rate": 7.939608254259812e-06, + "loss": 1.0753, + "step": 4571 + }, + { + "epoch": 0.9139201918992529, + "grad_norm": 2.09375, + "learning_rate": 7.938755750207569e-06, + "loss": 1.0382, + "step": 4572 + }, + { + "epoch": 0.914120086954349, + "grad_norm": 2.171875, + "learning_rate": 7.93790311561611e-06, + "loss": 1.0883, + "step": 4573 + }, + { + "epoch": 0.9143199820094451, + "grad_norm": 2.09375, + "learning_rate": 7.937050350523308e-06, + "loss": 1.0178, + "step": 4574 + }, + { + "epoch": 0.9145198770645411, + "grad_norm": 2.296875, + "learning_rate": 7.936197454967043e-06, + "loss": 0.9342, + "step": 4575 + }, + { + "epoch": 0.9147197721196372, + "grad_norm": 2.125, + "learning_rate": 7.935344428985202e-06, + "loss": 1.1515, + "step": 4576 + }, + { + "epoch": 0.9149196671747333, + "grad_norm": 2.046875, + "learning_rate": 7.934491272615674e-06, + "loss": 1.0595, + "step": 4577 + }, + { + "epoch": 0.9151195622298294, + "grad_norm": 2.046875, + "learning_rate": 7.933637985896356e-06, + "loss": 1.0235, + "step": 4578 + }, + { + "epoch": 0.9153194572849254, + "grad_norm": 2.0625, + "learning_rate": 7.932784568865155e-06, + "loss": 0.9725, + "step": 4579 + }, + { + "epoch": 0.9155193523400215, + "grad_norm": 2.09375, + "learning_rate": 7.931931021559973e-06, + "loss": 1.1227, + "step": 4580 + }, + { + "epoch": 0.9157192473951176, + "grad_norm": 2.0, + "learning_rate": 7.931077344018731e-06, + "loss": 1.0332, + "step": 4581 + }, + { + "epoch": 0.9159191424502137, + "grad_norm": 2.046875, + "learning_rate": 7.930223536279344e-06, + "loss": 1.025, + "step": 4582 + }, + { + "epoch": 0.9161190375053098, + "grad_norm": 2.09375, + "learning_rate": 7.929369598379743e-06, + "loss": 1.0924, + "step": 4583 + }, + { + "epoch": 0.9163189325604058, + "grad_norm": 2.234375, + "learning_rate": 7.928515530357857e-06, + "loss": 1.0261, + "step": 4584 + }, + { + "epoch": 0.9165188276155019, + "grad_norm": 2.09375, + "learning_rate": 7.927661332251622e-06, + "loss": 1.0325, + "step": 4585 + }, + { + "epoch": 0.916718722670598, + "grad_norm": 2.09375, + "learning_rate": 7.926807004098985e-06, + "loss": 0.9855, + "step": 4586 + }, + { + "epoch": 0.916918617725694, + "grad_norm": 1.9921875, + "learning_rate": 7.925952545937892e-06, + "loss": 1.0322, + "step": 4587 + }, + { + "epoch": 0.91711851278079, + "grad_norm": 2.015625, + "learning_rate": 7.9250979578063e-06, + "loss": 0.9784, + "step": 4588 + }, + { + "epoch": 0.9173184078358861, + "grad_norm": 2.15625, + "learning_rate": 7.924243239742171e-06, + "loss": 1.0395, + "step": 4589 + }, + { + "epoch": 0.9175183028909822, + "grad_norm": 2.125, + "learning_rate": 7.923388391783467e-06, + "loss": 1.1187, + "step": 4590 + }, + { + "epoch": 0.9177181979460783, + "grad_norm": 1.9609375, + "learning_rate": 7.922533413968164e-06, + "loss": 0.9561, + "step": 4591 + }, + { + "epoch": 0.9179180930011743, + "grad_norm": 2.09375, + "learning_rate": 7.92167830633424e-06, + "loss": 1.0588, + "step": 4592 + }, + { + "epoch": 0.9181179880562704, + "grad_norm": 1.9609375, + "learning_rate": 7.920823068919676e-06, + "loss": 0.9722, + "step": 4593 + }, + { + "epoch": 0.9183178831113665, + "grad_norm": 2.109375, + "learning_rate": 7.919967701762464e-06, + "loss": 1.0818, + "step": 4594 + }, + { + "epoch": 0.9185177781664626, + "grad_norm": 2.140625, + "learning_rate": 7.919112204900597e-06, + "loss": 0.9887, + "step": 4595 + }, + { + "epoch": 0.9187176732215587, + "grad_norm": 1.9296875, + "learning_rate": 7.918256578372079e-06, + "loss": 1.0106, + "step": 4596 + }, + { + "epoch": 0.9189175682766547, + "grad_norm": 2.046875, + "learning_rate": 7.917400822214916e-06, + "loss": 0.9737, + "step": 4597 + }, + { + "epoch": 0.9191174633317508, + "grad_norm": 2.03125, + "learning_rate": 7.916544936467119e-06, + "loss": 1.1294, + "step": 4598 + }, + { + "epoch": 0.9193173583868469, + "grad_norm": 2.15625, + "learning_rate": 7.915688921166709e-06, + "loss": 1.0798, + "step": 4599 + }, + { + "epoch": 0.919517253441943, + "grad_norm": 2.203125, + "learning_rate": 7.914832776351707e-06, + "loss": 1.0665, + "step": 4600 + }, + { + "epoch": 0.919717148497039, + "grad_norm": 2.03125, + "learning_rate": 7.913976502060143e-06, + "loss": 0.9593, + "step": 4601 + }, + { + "epoch": 0.9199170435521351, + "grad_norm": 2.09375, + "learning_rate": 7.913120098330056e-06, + "loss": 1.0382, + "step": 4602 + }, + { + "epoch": 0.9201169386072312, + "grad_norm": 1.953125, + "learning_rate": 7.912263565199486e-06, + "loss": 0.9877, + "step": 4603 + }, + { + "epoch": 0.9203168336623273, + "grad_norm": 2.0625, + "learning_rate": 7.911406902706478e-06, + "loss": 0.9745, + "step": 4604 + }, + { + "epoch": 0.9205167287174234, + "grad_norm": 2.0625, + "learning_rate": 7.910550110889086e-06, + "loss": 0.9713, + "step": 4605 + }, + { + "epoch": 0.9207166237725194, + "grad_norm": 2.046875, + "learning_rate": 7.909693189785371e-06, + "loss": 1.0932, + "step": 4606 + }, + { + "epoch": 0.9209165188276155, + "grad_norm": 1.9921875, + "learning_rate": 7.908836139433393e-06, + "loss": 1.0144, + "step": 4607 + }, + { + "epoch": 0.9211164138827116, + "grad_norm": 1.984375, + "learning_rate": 7.907978959871228e-06, + "loss": 0.9774, + "step": 4608 + }, + { + "epoch": 0.9213163089378077, + "grad_norm": 2.1875, + "learning_rate": 7.907121651136944e-06, + "loss": 1.0085, + "step": 4609 + }, + { + "epoch": 0.9215162039929037, + "grad_norm": 2.125, + "learning_rate": 7.90626421326863e-06, + "loss": 1.0132, + "step": 4610 + }, + { + "epoch": 0.9217160990479998, + "grad_norm": 2.09375, + "learning_rate": 7.905406646304367e-06, + "loss": 1.053, + "step": 4611 + }, + { + "epoch": 0.9219159941030959, + "grad_norm": 2.0625, + "learning_rate": 7.904548950282254e-06, + "loss": 1.0387, + "step": 4612 + }, + { + "epoch": 0.922115889158192, + "grad_norm": 2.09375, + "learning_rate": 7.903691125240385e-06, + "loss": 1.0253, + "step": 4613 + }, + { + "epoch": 0.922315784213288, + "grad_norm": 2.328125, + "learning_rate": 7.902833171216867e-06, + "loss": 1.0571, + "step": 4614 + }, + { + "epoch": 0.9225156792683841, + "grad_norm": 2.015625, + "learning_rate": 7.901975088249808e-06, + "loss": 1.0589, + "step": 4615 + }, + { + "epoch": 0.9227155743234802, + "grad_norm": 2.125, + "learning_rate": 7.901116876377326e-06, + "loss": 1.0856, + "step": 4616 + }, + { + "epoch": 0.9229154693785763, + "grad_norm": 2.0625, + "learning_rate": 7.900258535637544e-06, + "loss": 1.1088, + "step": 4617 + }, + { + "epoch": 0.9231153644336724, + "grad_norm": 2.046875, + "learning_rate": 7.899400066068588e-06, + "loss": 1.0745, + "step": 4618 + }, + { + "epoch": 0.9233152594887684, + "grad_norm": 2.09375, + "learning_rate": 7.898541467708588e-06, + "loss": 0.9503, + "step": 4619 + }, + { + "epoch": 0.9235151545438645, + "grad_norm": 2.046875, + "learning_rate": 7.897682740595686e-06, + "loss": 0.9778, + "step": 4620 + }, + { + "epoch": 0.9237150495989606, + "grad_norm": 2.25, + "learning_rate": 7.896823884768028e-06, + "loss": 0.9761, + "step": 4621 + }, + { + "epoch": 0.9239149446540567, + "grad_norm": 2.078125, + "learning_rate": 7.895964900263762e-06, + "loss": 1.0342, + "step": 4622 + }, + { + "epoch": 0.9241148397091526, + "grad_norm": 2.09375, + "learning_rate": 7.895105787121045e-06, + "loss": 1.0463, + "step": 4623 + }, + { + "epoch": 0.9243147347642487, + "grad_norm": 1.9609375, + "learning_rate": 7.894246545378037e-06, + "loss": 1.0387, + "step": 4624 + }, + { + "epoch": 0.9245146298193448, + "grad_norm": 2.015625, + "learning_rate": 7.893387175072907e-06, + "loss": 1.055, + "step": 4625 + }, + { + "epoch": 0.924714524874441, + "grad_norm": 2.015625, + "learning_rate": 7.892527676243825e-06, + "loss": 0.9812, + "step": 4626 + }, + { + "epoch": 0.924914419929537, + "grad_norm": 2.078125, + "learning_rate": 7.891668048928975e-06, + "loss": 1.0316, + "step": 4627 + }, + { + "epoch": 0.925114314984633, + "grad_norm": 2.0, + "learning_rate": 7.89080829316654e-06, + "loss": 0.9835, + "step": 4628 + }, + { + "epoch": 0.9253142100397291, + "grad_norm": 2.0625, + "learning_rate": 7.889948408994707e-06, + "loss": 1.0214, + "step": 4629 + }, + { + "epoch": 0.9255141050948252, + "grad_norm": 2.0, + "learning_rate": 7.889088396451676e-06, + "loss": 1.0421, + "step": 4630 + }, + { + "epoch": 0.9257140001499213, + "grad_norm": 2.078125, + "learning_rate": 7.888228255575648e-06, + "loss": 0.9934, + "step": 4631 + }, + { + "epoch": 0.9259138952050173, + "grad_norm": 2.015625, + "learning_rate": 7.887367986404827e-06, + "loss": 0.9687, + "step": 4632 + }, + { + "epoch": 0.9261137902601134, + "grad_norm": 2.0625, + "learning_rate": 7.88650758897743e-06, + "loss": 1.1221, + "step": 4633 + }, + { + "epoch": 0.9263136853152095, + "grad_norm": 2.09375, + "learning_rate": 7.885647063331674e-06, + "loss": 1.0123, + "step": 4634 + }, + { + "epoch": 0.9265135803703056, + "grad_norm": 2.03125, + "learning_rate": 7.884786409505782e-06, + "loss": 0.9704, + "step": 4635 + }, + { + "epoch": 0.9267134754254016, + "grad_norm": 2.078125, + "learning_rate": 7.883925627537987e-06, + "loss": 1.0636, + "step": 4636 + }, + { + "epoch": 0.9269133704804977, + "grad_norm": 2.015625, + "learning_rate": 7.883064717466524e-06, + "loss": 1.0541, + "step": 4637 + }, + { + "epoch": 0.9271132655355938, + "grad_norm": 2.015625, + "learning_rate": 7.882203679329635e-06, + "loss": 1.0097, + "step": 4638 + }, + { + "epoch": 0.9273131605906899, + "grad_norm": 2.046875, + "learning_rate": 7.881342513165567e-06, + "loss": 1.0571, + "step": 4639 + }, + { + "epoch": 0.927513055645786, + "grad_norm": 1.9765625, + "learning_rate": 7.88048121901257e-06, + "loss": 0.9967, + "step": 4640 + }, + { + "epoch": 0.927712950700882, + "grad_norm": 2.078125, + "learning_rate": 7.879619796908905e-06, + "loss": 0.9924, + "step": 4641 + }, + { + "epoch": 0.9279128457559781, + "grad_norm": 2.03125, + "learning_rate": 7.878758246892836e-06, + "loss": 0.9907, + "step": 4642 + }, + { + "epoch": 0.9281127408110742, + "grad_norm": 2.234375, + "learning_rate": 7.877896569002634e-06, + "loss": 1.0302, + "step": 4643 + }, + { + "epoch": 0.9283126358661703, + "grad_norm": 1.9765625, + "learning_rate": 7.877034763276575e-06, + "loss": 1.0495, + "step": 4644 + }, + { + "epoch": 0.9285125309212663, + "grad_norm": 2.359375, + "learning_rate": 7.876172829752937e-06, + "loss": 1.1101, + "step": 4645 + }, + { + "epoch": 0.9287124259763624, + "grad_norm": 2.140625, + "learning_rate": 7.87531076847001e-06, + "loss": 1.0882, + "step": 4646 + }, + { + "epoch": 0.9289123210314585, + "grad_norm": 1.9765625, + "learning_rate": 7.874448579466085e-06, + "loss": 1.0235, + "step": 4647 + }, + { + "epoch": 0.9291122160865546, + "grad_norm": 2.015625, + "learning_rate": 7.873586262779462e-06, + "loss": 1.0154, + "step": 4648 + }, + { + "epoch": 0.9293121111416507, + "grad_norm": 2.078125, + "learning_rate": 7.872723818448443e-06, + "loss": 1.083, + "step": 4649 + }, + { + "epoch": 0.9295120061967467, + "grad_norm": 2.109375, + "learning_rate": 7.87186124651134e-06, + "loss": 1.0428, + "step": 4650 + }, + { + "epoch": 0.9297119012518428, + "grad_norm": 2.203125, + "learning_rate": 7.870998547006467e-06, + "loss": 1.0099, + "step": 4651 + }, + { + "epoch": 0.9299117963069389, + "grad_norm": 2.109375, + "learning_rate": 7.870135719972146e-06, + "loss": 0.9716, + "step": 4652 + }, + { + "epoch": 0.930111691362035, + "grad_norm": 2.03125, + "learning_rate": 7.869272765446701e-06, + "loss": 1.0331, + "step": 4653 + }, + { + "epoch": 0.930311586417131, + "grad_norm": 1.9375, + "learning_rate": 7.868409683468466e-06, + "loss": 0.9574, + "step": 4654 + }, + { + "epoch": 0.9305114814722271, + "grad_norm": 2.15625, + "learning_rate": 7.867546474075782e-06, + "loss": 1.1178, + "step": 4655 + }, + { + "epoch": 0.9307113765273232, + "grad_norm": 2.046875, + "learning_rate": 7.866683137306987e-06, + "loss": 1.0142, + "step": 4656 + }, + { + "epoch": 0.9309112715824193, + "grad_norm": 2.09375, + "learning_rate": 7.865819673200435e-06, + "loss": 0.9947, + "step": 4657 + }, + { + "epoch": 0.9311111666375153, + "grad_norm": 2.046875, + "learning_rate": 7.864956081794477e-06, + "loss": 0.9642, + "step": 4658 + }, + { + "epoch": 0.9313110616926114, + "grad_norm": 2.140625, + "learning_rate": 7.864092363127478e-06, + "loss": 1.0742, + "step": 4659 + }, + { + "epoch": 0.9315109567477075, + "grad_norm": 1.921875, + "learning_rate": 7.8632285172378e-06, + "loss": 0.8985, + "step": 4660 + }, + { + "epoch": 0.9317108518028036, + "grad_norm": 2.0, + "learning_rate": 7.86236454416382e-06, + "loss": 0.9271, + "step": 4661 + }, + { + "epoch": 0.9319107468578997, + "grad_norm": 2.171875, + "learning_rate": 7.86150044394391e-06, + "loss": 1.0636, + "step": 4662 + }, + { + "epoch": 0.9321106419129956, + "grad_norm": 2.109375, + "learning_rate": 7.860636216616458e-06, + "loss": 1.073, + "step": 4663 + }, + { + "epoch": 0.9323105369680917, + "grad_norm": 2.109375, + "learning_rate": 7.85977186221985e-06, + "loss": 1.0081, + "step": 4664 + }, + { + "epoch": 0.9325104320231878, + "grad_norm": 2.03125, + "learning_rate": 7.85890738079248e-06, + "loss": 1.0321, + "step": 4665 + }, + { + "epoch": 0.9327103270782839, + "grad_norm": 2.015625, + "learning_rate": 7.858042772372751e-06, + "loss": 1.0386, + "step": 4666 + }, + { + "epoch": 0.9329102221333799, + "grad_norm": 2.171875, + "learning_rate": 7.857178036999066e-06, + "loss": 0.9717, + "step": 4667 + }, + { + "epoch": 0.933110117188476, + "grad_norm": 2.34375, + "learning_rate": 7.85631317470984e-06, + "loss": 1.0604, + "step": 4668 + }, + { + "epoch": 0.9333100122435721, + "grad_norm": 2.078125, + "learning_rate": 7.855448185543486e-06, + "loss": 0.9417, + "step": 4669 + }, + { + "epoch": 0.9335099072986682, + "grad_norm": 2.03125, + "learning_rate": 7.854583069538431e-06, + "loss": 1.0199, + "step": 4670 + }, + { + "epoch": 0.9337098023537643, + "grad_norm": 2.1875, + "learning_rate": 7.853717826733098e-06, + "loss": 1.1401, + "step": 4671 + }, + { + "epoch": 0.9339096974088603, + "grad_norm": 2.140625, + "learning_rate": 7.852852457165924e-06, + "loss": 1.0263, + "step": 4672 + }, + { + "epoch": 0.9341095924639564, + "grad_norm": 1.9765625, + "learning_rate": 7.851986960875351e-06, + "loss": 0.9771, + "step": 4673 + }, + { + "epoch": 0.9343094875190525, + "grad_norm": 2.15625, + "learning_rate": 7.851121337899819e-06, + "loss": 1.0479, + "step": 4674 + }, + { + "epoch": 0.9345093825741486, + "grad_norm": 2.046875, + "learning_rate": 7.850255588277784e-06, + "loss": 0.9217, + "step": 4675 + }, + { + "epoch": 0.9347092776292446, + "grad_norm": 2.03125, + "learning_rate": 7.8493897120477e-06, + "loss": 0.9912, + "step": 4676 + }, + { + "epoch": 0.9349091726843407, + "grad_norm": 2.1875, + "learning_rate": 7.848523709248026e-06, + "loss": 1.0977, + "step": 4677 + }, + { + "epoch": 0.9351090677394368, + "grad_norm": 1.9453125, + "learning_rate": 7.847657579917237e-06, + "loss": 0.9783, + "step": 4678 + }, + { + "epoch": 0.9353089627945329, + "grad_norm": 2.03125, + "learning_rate": 7.8467913240938e-06, + "loss": 1.0672, + "step": 4679 + }, + { + "epoch": 0.9355088578496289, + "grad_norm": 2.03125, + "learning_rate": 7.845924941816198e-06, + "loss": 0.968, + "step": 4680 + }, + { + "epoch": 0.935708752904725, + "grad_norm": 2.0625, + "learning_rate": 7.845058433122914e-06, + "loss": 1.0014, + "step": 4681 + }, + { + "epoch": 0.9359086479598211, + "grad_norm": 2.03125, + "learning_rate": 7.844191798052438e-06, + "loss": 1.1018, + "step": 4682 + }, + { + "epoch": 0.9361085430149172, + "grad_norm": 2.03125, + "learning_rate": 7.843325036643265e-06, + "loss": 1.003, + "step": 4683 + }, + { + "epoch": 0.9363084380700133, + "grad_norm": 2.140625, + "learning_rate": 7.842458148933898e-06, + "loss": 1.0554, + "step": 4684 + }, + { + "epoch": 0.9365083331251093, + "grad_norm": 1.9140625, + "learning_rate": 7.841591134962845e-06, + "loss": 0.9854, + "step": 4685 + }, + { + "epoch": 0.9367082281802054, + "grad_norm": 2.0625, + "learning_rate": 7.840723994768616e-06, + "loss": 0.9764, + "step": 4686 + }, + { + "epoch": 0.9369081232353015, + "grad_norm": 2.15625, + "learning_rate": 7.83985672838973e-06, + "loss": 1.0718, + "step": 4687 + }, + { + "epoch": 0.9371080182903976, + "grad_norm": 2.015625, + "learning_rate": 7.838989335864714e-06, + "loss": 0.9564, + "step": 4688 + }, + { + "epoch": 0.9373079133454936, + "grad_norm": 2.140625, + "learning_rate": 7.838121817232093e-06, + "loss": 1.0227, + "step": 4689 + }, + { + "epoch": 0.9375078084005897, + "grad_norm": 2.109375, + "learning_rate": 7.837254172530404e-06, + "loss": 1.0139, + "step": 4690 + }, + { + "epoch": 0.9377077034556858, + "grad_norm": 2.046875, + "learning_rate": 7.836386401798188e-06, + "loss": 1.0004, + "step": 4691 + }, + { + "epoch": 0.9379075985107819, + "grad_norm": 2.25, + "learning_rate": 7.83551850507399e-06, + "loss": 1.1292, + "step": 4692 + }, + { + "epoch": 0.9381074935658779, + "grad_norm": 2.15625, + "learning_rate": 7.834650482396364e-06, + "loss": 1.0614, + "step": 4693 + }, + { + "epoch": 0.938307388620974, + "grad_norm": 2.09375, + "learning_rate": 7.833782333803865e-06, + "loss": 1.076, + "step": 4694 + }, + { + "epoch": 0.9385072836760701, + "grad_norm": 2.078125, + "learning_rate": 7.83291405933506e-06, + "loss": 0.9859, + "step": 4695 + }, + { + "epoch": 0.9387071787311662, + "grad_norm": 2.046875, + "learning_rate": 7.832045659028513e-06, + "loss": 1.0348, + "step": 4696 + }, + { + "epoch": 0.9389070737862623, + "grad_norm": 2.0625, + "learning_rate": 7.831177132922801e-06, + "loss": 1.0461, + "step": 4697 + }, + { + "epoch": 0.9391069688413582, + "grad_norm": 2.109375, + "learning_rate": 7.830308481056503e-06, + "loss": 1.0873, + "step": 4698 + }, + { + "epoch": 0.9393068638964543, + "grad_norm": 2.3125, + "learning_rate": 7.829439703468203e-06, + "loss": 0.9891, + "step": 4699 + }, + { + "epoch": 0.9395067589515504, + "grad_norm": 2.015625, + "learning_rate": 7.828570800196495e-06, + "loss": 0.9237, + "step": 4700 + }, + { + "epoch": 0.9397066540066465, + "grad_norm": 2.1875, + "learning_rate": 7.827701771279976e-06, + "loss": 1.0243, + "step": 4701 + }, + { + "epoch": 0.9399065490617425, + "grad_norm": 2.359375, + "learning_rate": 7.826832616757244e-06, + "loss": 1.0948, + "step": 4702 + }, + { + "epoch": 0.9401064441168386, + "grad_norm": 2.125, + "learning_rate": 7.825963336666909e-06, + "loss": 1.0013, + "step": 4703 + }, + { + "epoch": 0.9403063391719347, + "grad_norm": 2.09375, + "learning_rate": 7.825093931047585e-06, + "loss": 1.0162, + "step": 4704 + }, + { + "epoch": 0.9405062342270308, + "grad_norm": 2.09375, + "learning_rate": 7.824224399937891e-06, + "loss": 0.9432, + "step": 4705 + }, + { + "epoch": 0.9407061292821269, + "grad_norm": 2.046875, + "learning_rate": 7.82335474337645e-06, + "loss": 1.0542, + "step": 4706 + }, + { + "epoch": 0.9409060243372229, + "grad_norm": 2.09375, + "learning_rate": 7.822484961401893e-06, + "loss": 1.0713, + "step": 4707 + }, + { + "epoch": 0.941105919392319, + "grad_norm": 2.125, + "learning_rate": 7.821615054052856e-06, + "loss": 0.9936, + "step": 4708 + }, + { + "epoch": 0.9413058144474151, + "grad_norm": 2.015625, + "learning_rate": 7.820745021367977e-06, + "loss": 0.9805, + "step": 4709 + }, + { + "epoch": 0.9415057095025112, + "grad_norm": 2.109375, + "learning_rate": 7.819874863385908e-06, + "loss": 1.0279, + "step": 4710 + }, + { + "epoch": 0.9417056045576072, + "grad_norm": 2.203125, + "learning_rate": 7.819004580145298e-06, + "loss": 1.1394, + "step": 4711 + }, + { + "epoch": 0.9419054996127033, + "grad_norm": 2.03125, + "learning_rate": 7.818134171684805e-06, + "loss": 0.972, + "step": 4712 + }, + { + "epoch": 0.9421053946677994, + "grad_norm": 2.25, + "learning_rate": 7.817263638043096e-06, + "loss": 1.0544, + "step": 4713 + }, + { + "epoch": 0.9423052897228955, + "grad_norm": 2.25, + "learning_rate": 7.816392979258834e-06, + "loss": 1.0741, + "step": 4714 + }, + { + "epoch": 0.9425051847779915, + "grad_norm": 1.96875, + "learning_rate": 7.815522195370697e-06, + "loss": 0.9426, + "step": 4715 + }, + { + "epoch": 0.9427050798330876, + "grad_norm": 1.953125, + "learning_rate": 7.814651286417367e-06, + "loss": 1.0312, + "step": 4716 + }, + { + "epoch": 0.9429049748881837, + "grad_norm": 2.046875, + "learning_rate": 7.813780252437526e-06, + "loss": 1.0476, + "step": 4717 + }, + { + "epoch": 0.9431048699432798, + "grad_norm": 2.125, + "learning_rate": 7.812909093469868e-06, + "loss": 1.0199, + "step": 4718 + }, + { + "epoch": 0.9433047649983759, + "grad_norm": 2.203125, + "learning_rate": 7.812037809553086e-06, + "loss": 1.0714, + "step": 4719 + }, + { + "epoch": 0.9435046600534719, + "grad_norm": 2.28125, + "learning_rate": 7.811166400725884e-06, + "loss": 1.1525, + "step": 4720 + }, + { + "epoch": 0.943704555108568, + "grad_norm": 2.125, + "learning_rate": 7.810294867026974e-06, + "loss": 1.0328, + "step": 4721 + }, + { + "epoch": 0.9439044501636641, + "grad_norm": 1.953125, + "learning_rate": 7.809423208495064e-06, + "loss": 0.9166, + "step": 4722 + }, + { + "epoch": 0.9441043452187602, + "grad_norm": 1.9453125, + "learning_rate": 7.808551425168878e-06, + "loss": 1.0082, + "step": 4723 + }, + { + "epoch": 0.9443042402738562, + "grad_norm": 2.03125, + "learning_rate": 7.807679517087135e-06, + "loss": 1.0474, + "step": 4724 + }, + { + "epoch": 0.9445041353289523, + "grad_norm": 1.9921875, + "learning_rate": 7.806807484288567e-06, + "loss": 0.9546, + "step": 4725 + }, + { + "epoch": 0.9447040303840484, + "grad_norm": 2.03125, + "learning_rate": 7.805935326811913e-06, + "loss": 1.0596, + "step": 4726 + }, + { + "epoch": 0.9449039254391445, + "grad_norm": 2.03125, + "learning_rate": 7.805063044695909e-06, + "loss": 1.0443, + "step": 4727 + }, + { + "epoch": 0.9451038204942406, + "grad_norm": 2.296875, + "learning_rate": 7.804190637979305e-06, + "loss": 0.9845, + "step": 4728 + }, + { + "epoch": 0.9453037155493366, + "grad_norm": 2.140625, + "learning_rate": 7.803318106700853e-06, + "loss": 1.0737, + "step": 4729 + }, + { + "epoch": 0.9455036106044327, + "grad_norm": 2.03125, + "learning_rate": 7.80244545089931e-06, + "loss": 1.0044, + "step": 4730 + }, + { + "epoch": 0.9457035056595288, + "grad_norm": 2.03125, + "learning_rate": 7.80157267061344e-06, + "loss": 1.0396, + "step": 4731 + }, + { + "epoch": 0.9459034007146249, + "grad_norm": 2.109375, + "learning_rate": 7.800699765882009e-06, + "loss": 1.052, + "step": 4732 + }, + { + "epoch": 0.9461032957697209, + "grad_norm": 2.15625, + "learning_rate": 7.799826736743796e-06, + "loss": 1.0078, + "step": 4733 + }, + { + "epoch": 0.946303190824817, + "grad_norm": 2.15625, + "learning_rate": 7.798953583237578e-06, + "loss": 1.017, + "step": 4734 + }, + { + "epoch": 0.946503085879913, + "grad_norm": 2.09375, + "learning_rate": 7.79808030540214e-06, + "loss": 1.0441, + "step": 4735 + }, + { + "epoch": 0.9467029809350092, + "grad_norm": 1.9921875, + "learning_rate": 7.797206903276274e-06, + "loss": 0.9931, + "step": 4736 + }, + { + "epoch": 0.9469028759901051, + "grad_norm": 2.109375, + "learning_rate": 7.796333376898774e-06, + "loss": 0.9818, + "step": 4737 + }, + { + "epoch": 0.9471027710452012, + "grad_norm": 2.03125, + "learning_rate": 7.795459726308446e-06, + "loss": 1.0046, + "step": 4738 + }, + { + "epoch": 0.9473026661002973, + "grad_norm": 2.0, + "learning_rate": 7.794585951544096e-06, + "loss": 1.0416, + "step": 4739 + }, + { + "epoch": 0.9475025611553934, + "grad_norm": 2.140625, + "learning_rate": 7.793712052644535e-06, + "loss": 1.0064, + "step": 4740 + }, + { + "epoch": 0.9477024562104895, + "grad_norm": 2.0625, + "learning_rate": 7.792838029648584e-06, + "loss": 0.9996, + "step": 4741 + }, + { + "epoch": 0.9479023512655855, + "grad_norm": 2.203125, + "learning_rate": 7.791963882595066e-06, + "loss": 1.101, + "step": 4742 + }, + { + "epoch": 0.9481022463206816, + "grad_norm": 2.1875, + "learning_rate": 7.791089611522811e-06, + "loss": 1.0511, + "step": 4743 + }, + { + "epoch": 0.9483021413757777, + "grad_norm": 2.15625, + "learning_rate": 7.790215216470654e-06, + "loss": 1.1489, + "step": 4744 + }, + { + "epoch": 0.9485020364308738, + "grad_norm": 2.0, + "learning_rate": 7.789340697477432e-06, + "loss": 1.0727, + "step": 4745 + }, + { + "epoch": 0.9487019314859698, + "grad_norm": 2.140625, + "learning_rate": 7.788466054581997e-06, + "loss": 1.0585, + "step": 4746 + }, + { + "epoch": 0.9489018265410659, + "grad_norm": 2.078125, + "learning_rate": 7.787591287823197e-06, + "loss": 0.9812, + "step": 4747 + }, + { + "epoch": 0.949101721596162, + "grad_norm": 2.078125, + "learning_rate": 7.78671639723989e-06, + "loss": 0.9217, + "step": 4748 + }, + { + "epoch": 0.9493016166512581, + "grad_norm": 2.109375, + "learning_rate": 7.785841382870938e-06, + "loss": 1.0322, + "step": 4749 + }, + { + "epoch": 0.9495015117063542, + "grad_norm": 2.046875, + "learning_rate": 7.78496624475521e-06, + "loss": 1.0459, + "step": 4750 + }, + { + "epoch": 0.9497014067614502, + "grad_norm": 2.0625, + "learning_rate": 7.784090982931577e-06, + "loss": 1.0993, + "step": 4751 + }, + { + "epoch": 0.9499013018165463, + "grad_norm": 2.0625, + "learning_rate": 7.78321559743892e-06, + "loss": 0.9609, + "step": 4752 + }, + { + "epoch": 0.9501011968716424, + "grad_norm": 2.140625, + "learning_rate": 7.782340088316125e-06, + "loss": 1.0295, + "step": 4753 + }, + { + "epoch": 0.9503010919267385, + "grad_norm": 2.046875, + "learning_rate": 7.78146445560208e-06, + "loss": 1.0306, + "step": 4754 + }, + { + "epoch": 0.9505009869818345, + "grad_norm": 2.078125, + "learning_rate": 7.78058869933568e-06, + "loss": 0.9982, + "step": 4755 + }, + { + "epoch": 0.9507008820369306, + "grad_norm": 2.171875, + "learning_rate": 7.77971281955583e-06, + "loss": 1.0818, + "step": 4756 + }, + { + "epoch": 0.9509007770920267, + "grad_norm": 1.9765625, + "learning_rate": 7.778836816301429e-06, + "loss": 0.9594, + "step": 4757 + }, + { + "epoch": 0.9511006721471228, + "grad_norm": 1.984375, + "learning_rate": 7.777960689611396e-06, + "loss": 1.085, + "step": 4758 + }, + { + "epoch": 0.9513005672022188, + "grad_norm": 2.0, + "learning_rate": 7.777084439524644e-06, + "loss": 1.0684, + "step": 4759 + }, + { + "epoch": 0.9515004622573149, + "grad_norm": 1.9609375, + "learning_rate": 7.7762080660801e-06, + "loss": 1.0481, + "step": 4760 + }, + { + "epoch": 0.951700357312411, + "grad_norm": 2.03125, + "learning_rate": 7.775331569316688e-06, + "loss": 0.9095, + "step": 4761 + }, + { + "epoch": 0.9519002523675071, + "grad_norm": 1.9609375, + "learning_rate": 7.774454949273348e-06, + "loss": 0.9383, + "step": 4762 + }, + { + "epoch": 0.9521001474226032, + "grad_norm": 2.3125, + "learning_rate": 7.773578205989013e-06, + "loss": 0.9806, + "step": 4763 + }, + { + "epoch": 0.9523000424776992, + "grad_norm": 2.15625, + "learning_rate": 7.77270133950263e-06, + "loss": 1.0464, + "step": 4764 + }, + { + "epoch": 0.9524999375327953, + "grad_norm": 2.28125, + "learning_rate": 7.77182434985315e-06, + "loss": 1.2045, + "step": 4765 + }, + { + "epoch": 0.9526998325878914, + "grad_norm": 2.15625, + "learning_rate": 7.770947237079528e-06, + "loss": 1.0449, + "step": 4766 + }, + { + "epoch": 0.9528997276429875, + "grad_norm": 2.140625, + "learning_rate": 7.770070001220727e-06, + "loss": 1.0402, + "step": 4767 + }, + { + "epoch": 0.9530996226980835, + "grad_norm": 2.234375, + "learning_rate": 7.76919264231571e-06, + "loss": 1.1098, + "step": 4768 + }, + { + "epoch": 0.9532995177531796, + "grad_norm": 2.046875, + "learning_rate": 7.768315160403453e-06, + "loss": 1.0016, + "step": 4769 + }, + { + "epoch": 0.9534994128082757, + "grad_norm": 2.109375, + "learning_rate": 7.767437555522934e-06, + "loss": 0.996, + "step": 4770 + }, + { + "epoch": 0.9536993078633718, + "grad_norm": 2.046875, + "learning_rate": 7.766559827713131e-06, + "loss": 1.0092, + "step": 4771 + }, + { + "epoch": 0.9538992029184679, + "grad_norm": 2.078125, + "learning_rate": 7.765681977013037e-06, + "loss": 1.0211, + "step": 4772 + }, + { + "epoch": 0.9540990979735638, + "grad_norm": 2.15625, + "learning_rate": 7.764804003461646e-06, + "loss": 0.9899, + "step": 4773 + }, + { + "epoch": 0.95429899302866, + "grad_norm": 2.140625, + "learning_rate": 7.763925907097956e-06, + "loss": 1.0812, + "step": 4774 + }, + { + "epoch": 0.954498888083756, + "grad_norm": 2.0625, + "learning_rate": 7.763047687960971e-06, + "loss": 0.984, + "step": 4775 + }, + { + "epoch": 0.9546987831388521, + "grad_norm": 2.125, + "learning_rate": 7.762169346089705e-06, + "loss": 1.0208, + "step": 4776 + }, + { + "epoch": 0.9548986781939481, + "grad_norm": 1.96875, + "learning_rate": 7.76129088152317e-06, + "loss": 0.9695, + "step": 4777 + }, + { + "epoch": 0.9550985732490442, + "grad_norm": 2.09375, + "learning_rate": 7.760412294300392e-06, + "loss": 1.0673, + "step": 4778 + }, + { + "epoch": 0.9552984683041403, + "grad_norm": 2.0625, + "learning_rate": 7.759533584460392e-06, + "loss": 0.9945, + "step": 4779 + }, + { + "epoch": 0.9554983633592364, + "grad_norm": 2.015625, + "learning_rate": 7.758654752042205e-06, + "loss": 1.0414, + "step": 4780 + }, + { + "epoch": 0.9556982584143324, + "grad_norm": 2.0625, + "learning_rate": 7.75777579708487e-06, + "loss": 1.0086, + "step": 4781 + }, + { + "epoch": 0.9558981534694285, + "grad_norm": 2.25, + "learning_rate": 7.756896719627428e-06, + "loss": 1.0577, + "step": 4782 + }, + { + "epoch": 0.9560980485245246, + "grad_norm": 2.0625, + "learning_rate": 7.756017519708926e-06, + "loss": 1.0469, + "step": 4783 + }, + { + "epoch": 0.9562979435796207, + "grad_norm": 2.03125, + "learning_rate": 7.755138197368423e-06, + "loss": 1.0342, + "step": 4784 + }, + { + "epoch": 0.9564978386347168, + "grad_norm": 2.03125, + "learning_rate": 7.754258752644974e-06, + "loss": 0.9765, + "step": 4785 + }, + { + "epoch": 0.9566977336898128, + "grad_norm": 2.0625, + "learning_rate": 7.753379185577645e-06, + "loss": 1.0252, + "step": 4786 + }, + { + "epoch": 0.9568976287449089, + "grad_norm": 2.09375, + "learning_rate": 7.752499496205509e-06, + "loss": 0.975, + "step": 4787 + }, + { + "epoch": 0.957097523800005, + "grad_norm": 2.03125, + "learning_rate": 7.751619684567638e-06, + "loss": 1.1005, + "step": 4788 + }, + { + "epoch": 0.9572974188551011, + "grad_norm": 2.046875, + "learning_rate": 7.750739750703114e-06, + "loss": 1.0138, + "step": 4789 + }, + { + "epoch": 0.9574973139101971, + "grad_norm": 2.171875, + "learning_rate": 7.749859694651023e-06, + "loss": 1.1132, + "step": 4790 + }, + { + "epoch": 0.9576972089652932, + "grad_norm": 2.09375, + "learning_rate": 7.74897951645046e-06, + "loss": 1.0403, + "step": 4791 + }, + { + "epoch": 0.9578971040203893, + "grad_norm": 1.9453125, + "learning_rate": 7.74809921614052e-06, + "loss": 0.9768, + "step": 4792 + }, + { + "epoch": 0.9580969990754854, + "grad_norm": 2.140625, + "learning_rate": 7.747218793760308e-06, + "loss": 1.0403, + "step": 4793 + }, + { + "epoch": 0.9582968941305814, + "grad_norm": 1.9296875, + "learning_rate": 7.746338249348928e-06, + "loss": 1.0055, + "step": 4794 + }, + { + "epoch": 0.9584967891856775, + "grad_norm": 2.09375, + "learning_rate": 7.745457582945497e-06, + "loss": 1.0358, + "step": 4795 + }, + { + "epoch": 0.9586966842407736, + "grad_norm": 2.0625, + "learning_rate": 7.744576794589132e-06, + "loss": 1.0853, + "step": 4796 + }, + { + "epoch": 0.9588965792958697, + "grad_norm": 2.140625, + "learning_rate": 7.743695884318961e-06, + "loss": 1.0774, + "step": 4797 + }, + { + "epoch": 0.9590964743509658, + "grad_norm": 2.140625, + "learning_rate": 7.742814852174112e-06, + "loss": 1.0557, + "step": 4798 + }, + { + "epoch": 0.9592963694060618, + "grad_norm": 1.9765625, + "learning_rate": 7.741933698193719e-06, + "loss": 0.9809, + "step": 4799 + }, + { + "epoch": 0.9594962644611579, + "grad_norm": 2.21875, + "learning_rate": 7.741052422416923e-06, + "loss": 0.9666, + "step": 4800 + }, + { + "epoch": 0.959696159516254, + "grad_norm": 1.9921875, + "learning_rate": 7.740171024882875e-06, + "loss": 1.0036, + "step": 4801 + }, + { + "epoch": 0.9598960545713501, + "grad_norm": 2.109375, + "learning_rate": 7.73928950563072e-06, + "loss": 1.0543, + "step": 4802 + }, + { + "epoch": 0.9600959496264461, + "grad_norm": 2.046875, + "learning_rate": 7.738407864699618e-06, + "loss": 1.0295, + "step": 4803 + }, + { + "epoch": 0.9602958446815422, + "grad_norm": 2.125, + "learning_rate": 7.737526102128729e-06, + "loss": 1.0324, + "step": 4804 + }, + { + "epoch": 0.9604957397366383, + "grad_norm": 2.03125, + "learning_rate": 7.736644217957226e-06, + "loss": 0.9947, + "step": 4805 + }, + { + "epoch": 0.9606956347917344, + "grad_norm": 2.359375, + "learning_rate": 7.735762212224278e-06, + "loss": 1.0961, + "step": 4806 + }, + { + "epoch": 0.9608955298468305, + "grad_norm": 2.125, + "learning_rate": 7.734880084969065e-06, + "loss": 0.999, + "step": 4807 + }, + { + "epoch": 0.9610954249019265, + "grad_norm": 2.234375, + "learning_rate": 7.733997836230771e-06, + "loss": 1.0628, + "step": 4808 + }, + { + "epoch": 0.9612953199570226, + "grad_norm": 2.0, + "learning_rate": 7.733115466048585e-06, + "loss": 1.0227, + "step": 4809 + }, + { + "epoch": 0.9614952150121187, + "grad_norm": 2.171875, + "learning_rate": 7.732232974461701e-06, + "loss": 1.0994, + "step": 4810 + }, + { + "epoch": 0.9616951100672148, + "grad_norm": 2.140625, + "learning_rate": 7.731350361509322e-06, + "loss": 1.0277, + "step": 4811 + }, + { + "epoch": 0.9618950051223107, + "grad_norm": 2.015625, + "learning_rate": 7.730467627230648e-06, + "loss": 1.0349, + "step": 4812 + }, + { + "epoch": 0.9620949001774068, + "grad_norm": 1.9609375, + "learning_rate": 7.729584771664897e-06, + "loss": 0.9012, + "step": 4813 + }, + { + "epoch": 0.9622947952325029, + "grad_norm": 2.21875, + "learning_rate": 7.728701794851281e-06, + "loss": 0.9579, + "step": 4814 + }, + { + "epoch": 0.962494690287599, + "grad_norm": 2.03125, + "learning_rate": 7.727818696829023e-06, + "loss": 1.1154, + "step": 4815 + }, + { + "epoch": 0.962694585342695, + "grad_norm": 2.046875, + "learning_rate": 7.72693547763735e-06, + "loss": 0.9688, + "step": 4816 + }, + { + "epoch": 0.9628944803977911, + "grad_norm": 2.15625, + "learning_rate": 7.726052137315493e-06, + "loss": 1.043, + "step": 4817 + }, + { + "epoch": 0.9630943754528872, + "grad_norm": 2.390625, + "learning_rate": 7.725168675902692e-06, + "loss": 1.0939, + "step": 4818 + }, + { + "epoch": 0.9632942705079833, + "grad_norm": 2.1875, + "learning_rate": 7.72428509343819e-06, + "loss": 1.0627, + "step": 4819 + }, + { + "epoch": 0.9634941655630794, + "grad_norm": 2.125, + "learning_rate": 7.723401389961235e-06, + "loss": 1.0583, + "step": 4820 + }, + { + "epoch": 0.9636940606181754, + "grad_norm": 2.140625, + "learning_rate": 7.72251756551108e-06, + "loss": 1.067, + "step": 4821 + }, + { + "epoch": 0.9638939556732715, + "grad_norm": 2.046875, + "learning_rate": 7.721633620126987e-06, + "loss": 1.0691, + "step": 4822 + }, + { + "epoch": 0.9640938507283676, + "grad_norm": 2.171875, + "learning_rate": 7.72074955384822e-06, + "loss": 1.027, + "step": 4823 + }, + { + "epoch": 0.9642937457834637, + "grad_norm": 2.171875, + "learning_rate": 7.719865366714046e-06, + "loss": 1.1455, + "step": 4824 + }, + { + "epoch": 0.9644936408385597, + "grad_norm": 1.9921875, + "learning_rate": 7.718981058763744e-06, + "loss": 0.9386, + "step": 4825 + }, + { + "epoch": 0.9646935358936558, + "grad_norm": 2.25, + "learning_rate": 7.718096630036593e-06, + "loss": 1.0428, + "step": 4826 + }, + { + "epoch": 0.9648934309487519, + "grad_norm": 1.9140625, + "learning_rate": 7.71721208057188e-06, + "loss": 0.941, + "step": 4827 + }, + { + "epoch": 0.965093326003848, + "grad_norm": 2.078125, + "learning_rate": 7.7163274104089e-06, + "loss": 0.9998, + "step": 4828 + }, + { + "epoch": 0.9652932210589441, + "grad_norm": 2.078125, + "learning_rate": 7.715442619586943e-06, + "loss": 1.0023, + "step": 4829 + }, + { + "epoch": 0.9654931161140401, + "grad_norm": 1.875, + "learning_rate": 7.714557708145315e-06, + "loss": 0.9066, + "step": 4830 + }, + { + "epoch": 0.9656930111691362, + "grad_norm": 2.125, + "learning_rate": 7.713672676123324e-06, + "loss": 1.0072, + "step": 4831 + }, + { + "epoch": 0.9658929062242323, + "grad_norm": 2.046875, + "learning_rate": 7.712787523560283e-06, + "loss": 1.0759, + "step": 4832 + }, + { + "epoch": 0.9660928012793284, + "grad_norm": 2.0, + "learning_rate": 7.711902250495508e-06, + "loss": 0.9749, + "step": 4833 + }, + { + "epoch": 0.9662926963344244, + "grad_norm": 2.109375, + "learning_rate": 7.711016856968327e-06, + "loss": 1.0362, + "step": 4834 + }, + { + "epoch": 0.9664925913895205, + "grad_norm": 2.234375, + "learning_rate": 7.710131343018066e-06, + "loss": 1.0392, + "step": 4835 + }, + { + "epoch": 0.9666924864446166, + "grad_norm": 2.0, + "learning_rate": 7.709245708684061e-06, + "loss": 0.9769, + "step": 4836 + }, + { + "epoch": 0.9668923814997127, + "grad_norm": 2.15625, + "learning_rate": 7.708359954005651e-06, + "loss": 1.0357, + "step": 4837 + }, + { + "epoch": 0.9670922765548087, + "grad_norm": 2.28125, + "learning_rate": 7.70747407902218e-06, + "loss": 1.1927, + "step": 4838 + }, + { + "epoch": 0.9672921716099048, + "grad_norm": 1.9765625, + "learning_rate": 7.706588083772999e-06, + "loss": 0.9573, + "step": 4839 + }, + { + "epoch": 0.9674920666650009, + "grad_norm": 1.984375, + "learning_rate": 7.705701968297466e-06, + "loss": 0.9817, + "step": 4840 + }, + { + "epoch": 0.967691961720097, + "grad_norm": 2.234375, + "learning_rate": 7.704815732634941e-06, + "loss": 1.0487, + "step": 4841 + }, + { + "epoch": 0.9678918567751931, + "grad_norm": 1.9921875, + "learning_rate": 7.703929376824787e-06, + "loss": 0.9766, + "step": 4842 + }, + { + "epoch": 0.9680917518302891, + "grad_norm": 2.0625, + "learning_rate": 7.703042900906383e-06, + "loss": 1.0385, + "step": 4843 + }, + { + "epoch": 0.9682916468853852, + "grad_norm": 2.015625, + "learning_rate": 7.702156304919098e-06, + "loss": 0.9849, + "step": 4844 + }, + { + "epoch": 0.9684915419404813, + "grad_norm": 2.125, + "learning_rate": 7.70126958890232e-06, + "loss": 1.0976, + "step": 4845 + }, + { + "epoch": 0.9686914369955774, + "grad_norm": 2.03125, + "learning_rate": 7.700382752895436e-06, + "loss": 0.9467, + "step": 4846 + }, + { + "epoch": 0.9688913320506733, + "grad_norm": 2.125, + "learning_rate": 7.69949579693784e-06, + "loss": 0.989, + "step": 4847 + }, + { + "epoch": 0.9690912271057694, + "grad_norm": 2.015625, + "learning_rate": 7.698608721068926e-06, + "loss": 1.0281, + "step": 4848 + }, + { + "epoch": 0.9692911221608655, + "grad_norm": 2.171875, + "learning_rate": 7.697721525328104e-06, + "loss": 1.0828, + "step": 4849 + }, + { + "epoch": 0.9694910172159616, + "grad_norm": 2.078125, + "learning_rate": 7.696834209754777e-06, + "loss": 0.9711, + "step": 4850 + }, + { + "epoch": 0.9696909122710577, + "grad_norm": 2.015625, + "learning_rate": 7.695946774388364e-06, + "loss": 1.026, + "step": 4851 + }, + { + "epoch": 0.9698908073261537, + "grad_norm": 2.015625, + "learning_rate": 7.695059219268281e-06, + "loss": 1.0395, + "step": 4852 + }, + { + "epoch": 0.9700907023812498, + "grad_norm": 2.09375, + "learning_rate": 7.694171544433958e-06, + "loss": 0.9939, + "step": 4853 + }, + { + "epoch": 0.9702905974363459, + "grad_norm": 2.046875, + "learning_rate": 7.693283749924821e-06, + "loss": 0.9857, + "step": 4854 + }, + { + "epoch": 0.970490492491442, + "grad_norm": 2.109375, + "learning_rate": 7.69239583578031e-06, + "loss": 1.0404, + "step": 4855 + }, + { + "epoch": 0.970690387546538, + "grad_norm": 2.078125, + "learning_rate": 7.691507802039861e-06, + "loss": 1.0722, + "step": 4856 + }, + { + "epoch": 0.9708902826016341, + "grad_norm": 2.046875, + "learning_rate": 7.690619648742923e-06, + "loss": 0.989, + "step": 4857 + }, + { + "epoch": 0.9710901776567302, + "grad_norm": 2.203125, + "learning_rate": 7.68973137592895e-06, + "loss": 1.0558, + "step": 4858 + }, + { + "epoch": 0.9712900727118263, + "grad_norm": 2.046875, + "learning_rate": 7.688842983637395e-06, + "loss": 1.0041, + "step": 4859 + }, + { + "epoch": 0.9714899677669223, + "grad_norm": 2.078125, + "learning_rate": 7.687954471907719e-06, + "loss": 1.0483, + "step": 4860 + }, + { + "epoch": 0.9716898628220184, + "grad_norm": 2.078125, + "learning_rate": 7.687065840779397e-06, + "loss": 1.0264, + "step": 4861 + }, + { + "epoch": 0.9718897578771145, + "grad_norm": 2.171875, + "learning_rate": 7.686177090291896e-06, + "loss": 1.079, + "step": 4862 + }, + { + "epoch": 0.9720896529322106, + "grad_norm": 2.09375, + "learning_rate": 7.685288220484693e-06, + "loss": 1.1362, + "step": 4863 + }, + { + "epoch": 0.9722895479873067, + "grad_norm": 2.109375, + "learning_rate": 7.684399231397278e-06, + "loss": 1.0843, + "step": 4864 + }, + { + "epoch": 0.9724894430424027, + "grad_norm": 2.15625, + "learning_rate": 7.683510123069133e-06, + "loss": 1.0176, + "step": 4865 + }, + { + "epoch": 0.9726893380974988, + "grad_norm": 1.9453125, + "learning_rate": 7.682620895539756e-06, + "loss": 1.0219, + "step": 4866 + }, + { + "epoch": 0.9728892331525949, + "grad_norm": 2.15625, + "learning_rate": 7.681731548848645e-06, + "loss": 1.1454, + "step": 4867 + }, + { + "epoch": 0.973089128207691, + "grad_norm": 2.109375, + "learning_rate": 7.680842083035305e-06, + "loss": 1.0198, + "step": 4868 + }, + { + "epoch": 0.973289023262787, + "grad_norm": 2.171875, + "learning_rate": 7.679952498139248e-06, + "loss": 0.9693, + "step": 4869 + }, + { + "epoch": 0.9734889183178831, + "grad_norm": 2.078125, + "learning_rate": 7.679062794199982e-06, + "loss": 1.0717, + "step": 4870 + }, + { + "epoch": 0.9736888133729792, + "grad_norm": 2.0625, + "learning_rate": 7.678172971257038e-06, + "loss": 1.0067, + "step": 4871 + }, + { + "epoch": 0.9738887084280753, + "grad_norm": 2.0, + "learning_rate": 7.677283029349936e-06, + "loss": 1.0556, + "step": 4872 + }, + { + "epoch": 0.9740886034831714, + "grad_norm": 1.9375, + "learning_rate": 7.676392968518205e-06, + "loss": 0.964, + "step": 4873 + }, + { + "epoch": 0.9742884985382674, + "grad_norm": 2.296875, + "learning_rate": 7.675502788801387e-06, + "loss": 1.1312, + "step": 4874 + }, + { + "epoch": 0.9744883935933635, + "grad_norm": 2.359375, + "learning_rate": 7.67461249023902e-06, + "loss": 0.971, + "step": 4875 + }, + { + "epoch": 0.9746882886484596, + "grad_norm": 2.0625, + "learning_rate": 7.67372207287065e-06, + "loss": 0.9664, + "step": 4876 + }, + { + "epoch": 0.9748881837035557, + "grad_norm": 1.984375, + "learning_rate": 7.672831536735832e-06, + "loss": 1.0971, + "step": 4877 + }, + { + "epoch": 0.9750880787586517, + "grad_norm": 2.125, + "learning_rate": 7.671940881874124e-06, + "loss": 1.087, + "step": 4878 + }, + { + "epoch": 0.9752879738137478, + "grad_norm": 2.125, + "learning_rate": 7.671050108325087e-06, + "loss": 0.9465, + "step": 4879 + }, + { + "epoch": 0.9754878688688439, + "grad_norm": 2.4375, + "learning_rate": 7.670159216128291e-06, + "loss": 0.9395, + "step": 4880 + }, + { + "epoch": 0.97568776392394, + "grad_norm": 2.078125, + "learning_rate": 7.669268205323304e-06, + "loss": 0.9663, + "step": 4881 + }, + { + "epoch": 0.975887658979036, + "grad_norm": 2.015625, + "learning_rate": 7.66837707594971e-06, + "loss": 1.026, + "step": 4882 + }, + { + "epoch": 0.976087554034132, + "grad_norm": 2.140625, + "learning_rate": 7.667485828047091e-06, + "loss": 1.1083, + "step": 4883 + }, + { + "epoch": 0.9762874490892282, + "grad_norm": 2.0, + "learning_rate": 7.666594461655039e-06, + "loss": 1.0118, + "step": 4884 + }, + { + "epoch": 0.9764873441443243, + "grad_norm": 2.078125, + "learning_rate": 7.665702976813142e-06, + "loss": 1.0811, + "step": 4885 + }, + { + "epoch": 0.9766872391994204, + "grad_norm": 2.21875, + "learning_rate": 7.664811373561008e-06, + "loss": 1.1536, + "step": 4886 + }, + { + "epoch": 0.9768871342545163, + "grad_norm": 2.15625, + "learning_rate": 7.663919651938234e-06, + "loss": 1.0617, + "step": 4887 + }, + { + "epoch": 0.9770870293096124, + "grad_norm": 2.015625, + "learning_rate": 7.663027811984433e-06, + "loss": 1.0567, + "step": 4888 + }, + { + "epoch": 0.9772869243647085, + "grad_norm": 2.171875, + "learning_rate": 7.662135853739224e-06, + "loss": 0.9078, + "step": 4889 + }, + { + "epoch": 0.9774868194198046, + "grad_norm": 2.09375, + "learning_rate": 7.661243777242223e-06, + "loss": 1.0036, + "step": 4890 + }, + { + "epoch": 0.9776867144749006, + "grad_norm": 1.9296875, + "learning_rate": 7.660351582533057e-06, + "loss": 0.9038, + "step": 4891 + }, + { + "epoch": 0.9778866095299967, + "grad_norm": 2.09375, + "learning_rate": 7.65945926965136e-06, + "loss": 1.1392, + "step": 4892 + }, + { + "epoch": 0.9780865045850928, + "grad_norm": 1.953125, + "learning_rate": 7.658566838636762e-06, + "loss": 0.93, + "step": 4893 + }, + { + "epoch": 0.9782863996401889, + "grad_norm": 2.046875, + "learning_rate": 7.657674289528914e-06, + "loss": 0.9989, + "step": 4894 + }, + { + "epoch": 0.978486294695285, + "grad_norm": 2.1875, + "learning_rate": 7.656781622367455e-06, + "loss": 1.0522, + "step": 4895 + }, + { + "epoch": 0.978686189750381, + "grad_norm": 2.1875, + "learning_rate": 7.65588883719204e-06, + "loss": 1.0799, + "step": 4896 + }, + { + "epoch": 0.9788860848054771, + "grad_norm": 1.953125, + "learning_rate": 7.654995934042328e-06, + "loss": 0.9497, + "step": 4897 + }, + { + "epoch": 0.9790859798605732, + "grad_norm": 2.234375, + "learning_rate": 7.65410291295798e-06, + "loss": 1.1484, + "step": 4898 + }, + { + "epoch": 0.9792858749156693, + "grad_norm": 2.21875, + "learning_rate": 7.653209773978662e-06, + "loss": 1.06, + "step": 4899 + }, + { + "epoch": 0.9794857699707653, + "grad_norm": 2.078125, + "learning_rate": 7.652316517144052e-06, + "loss": 1.0536, + "step": 4900 + }, + { + "epoch": 0.9796856650258614, + "grad_norm": 1.9921875, + "learning_rate": 7.651423142493824e-06, + "loss": 0.9357, + "step": 4901 + }, + { + "epoch": 0.9798855600809575, + "grad_norm": 2.140625, + "learning_rate": 7.650529650067665e-06, + "loss": 1.0424, + "step": 4902 + }, + { + "epoch": 0.9800854551360536, + "grad_norm": 2.09375, + "learning_rate": 7.64963603990526e-06, + "loss": 1.1032, + "step": 4903 + }, + { + "epoch": 0.9802853501911496, + "grad_norm": 2.09375, + "learning_rate": 7.648742312046306e-06, + "loss": 1.0987, + "step": 4904 + }, + { + "epoch": 0.9804852452462457, + "grad_norm": 2.109375, + "learning_rate": 7.6478484665305e-06, + "loss": 1.0874, + "step": 4905 + }, + { + "epoch": 0.9806851403013418, + "grad_norm": 2.03125, + "learning_rate": 7.64695450339755e-06, + "loss": 0.9831, + "step": 4906 + }, + { + "epoch": 0.9808850353564379, + "grad_norm": 2.015625, + "learning_rate": 7.64606042268716e-06, + "loss": 0.996, + "step": 4907 + }, + { + "epoch": 0.981084930411534, + "grad_norm": 2.0625, + "learning_rate": 7.645166224439053e-06, + "loss": 1.0016, + "step": 4908 + }, + { + "epoch": 0.98128482546663, + "grad_norm": 2.0, + "learning_rate": 7.644271908692944e-06, + "loss": 0.9861, + "step": 4909 + }, + { + "epoch": 0.9814847205217261, + "grad_norm": 2.046875, + "learning_rate": 7.643377475488558e-06, + "loss": 1.081, + "step": 4910 + }, + { + "epoch": 0.9816846155768222, + "grad_norm": 2.109375, + "learning_rate": 7.642482924865627e-06, + "loss": 1.0244, + "step": 4911 + }, + { + "epoch": 0.9818845106319183, + "grad_norm": 2.203125, + "learning_rate": 7.641588256863887e-06, + "loss": 1.0412, + "step": 4912 + }, + { + "epoch": 0.9820844056870143, + "grad_norm": 2.234375, + "learning_rate": 7.640693471523078e-06, + "loss": 1.1168, + "step": 4913 + }, + { + "epoch": 0.9822843007421104, + "grad_norm": 2.109375, + "learning_rate": 7.639798568882947e-06, + "loss": 1.0678, + "step": 4914 + }, + { + "epoch": 0.9824841957972065, + "grad_norm": 2.109375, + "learning_rate": 7.638903548983248e-06, + "loss": 1.0225, + "step": 4915 + }, + { + "epoch": 0.9826840908523026, + "grad_norm": 2.0, + "learning_rate": 7.63800841186373e-06, + "loss": 0.9436, + "step": 4916 + }, + { + "epoch": 0.9828839859073986, + "grad_norm": 2.09375, + "learning_rate": 7.637113157564167e-06, + "loss": 1.0828, + "step": 4917 + }, + { + "epoch": 0.9830838809624947, + "grad_norm": 1.9765625, + "learning_rate": 7.636217786124315e-06, + "loss": 0.9891, + "step": 4918 + }, + { + "epoch": 0.9832837760175908, + "grad_norm": 2.15625, + "learning_rate": 7.635322297583952e-06, + "loss": 1.1208, + "step": 4919 + }, + { + "epoch": 0.9834836710726869, + "grad_norm": 1.9140625, + "learning_rate": 7.634426691982852e-06, + "loss": 0.9974, + "step": 4920 + }, + { + "epoch": 0.983683566127783, + "grad_norm": 2.078125, + "learning_rate": 7.633530969360801e-06, + "loss": 1.0724, + "step": 4921 + }, + { + "epoch": 0.983883461182879, + "grad_norm": 2.015625, + "learning_rate": 7.632635129757585e-06, + "loss": 1.0418, + "step": 4922 + }, + { + "epoch": 0.984083356237975, + "grad_norm": 2.03125, + "learning_rate": 7.631739173212998e-06, + "loss": 0.9366, + "step": 4923 + }, + { + "epoch": 0.9842832512930711, + "grad_norm": 2.015625, + "learning_rate": 7.630843099766838e-06, + "loss": 1.0145, + "step": 4924 + }, + { + "epoch": 0.9844831463481672, + "grad_norm": 2.125, + "learning_rate": 7.62994690945891e-06, + "loss": 1.0334, + "step": 4925 + }, + { + "epoch": 0.9846830414032632, + "grad_norm": 2.140625, + "learning_rate": 7.62905060232902e-06, + "loss": 1.0941, + "step": 4926 + }, + { + "epoch": 0.9848829364583593, + "grad_norm": 2.09375, + "learning_rate": 7.628154178416982e-06, + "loss": 1.0462, + "step": 4927 + }, + { + "epoch": 0.9850828315134554, + "grad_norm": 1.9296875, + "learning_rate": 7.627257637762617e-06, + "loss": 0.9218, + "step": 4928 + }, + { + "epoch": 0.9852827265685515, + "grad_norm": 2.046875, + "learning_rate": 7.626360980405748e-06, + "loss": 0.9681, + "step": 4929 + }, + { + "epoch": 0.9854826216236476, + "grad_norm": 2.265625, + "learning_rate": 7.625464206386205e-06, + "loss": 1.0727, + "step": 4930 + }, + { + "epoch": 0.9856825166787436, + "grad_norm": 1.921875, + "learning_rate": 7.624567315743823e-06, + "loss": 0.9793, + "step": 4931 + }, + { + "epoch": 0.9858824117338397, + "grad_norm": 1.9921875, + "learning_rate": 7.623670308518441e-06, + "loss": 0.9675, + "step": 4932 + }, + { + "epoch": 0.9860823067889358, + "grad_norm": 2.0625, + "learning_rate": 7.622773184749903e-06, + "loss": 1.0283, + "step": 4933 + }, + { + "epoch": 0.9862822018440319, + "grad_norm": 2.0625, + "learning_rate": 7.621875944478062e-06, + "loss": 1.1335, + "step": 4934 + }, + { + "epoch": 0.9864820968991279, + "grad_norm": 2.1875, + "learning_rate": 7.62097858774277e-06, + "loss": 0.9563, + "step": 4935 + }, + { + "epoch": 0.986681991954224, + "grad_norm": 1.984375, + "learning_rate": 7.6200811145838895e-06, + "loss": 0.9792, + "step": 4936 + }, + { + "epoch": 0.9868818870093201, + "grad_norm": 2.171875, + "learning_rate": 7.619183525041286e-06, + "loss": 0.9916, + "step": 4937 + }, + { + "epoch": 0.9870817820644162, + "grad_norm": 2.125, + "learning_rate": 7.618285819154829e-06, + "loss": 1.1046, + "step": 4938 + }, + { + "epoch": 0.9872816771195122, + "grad_norm": 2.015625, + "learning_rate": 7.617387996964396e-06, + "loss": 1.0579, + "step": 4939 + }, + { + "epoch": 0.9874815721746083, + "grad_norm": 2.109375, + "learning_rate": 7.616490058509869e-06, + "loss": 1.0783, + "step": 4940 + }, + { + "epoch": 0.9876814672297044, + "grad_norm": 1.9609375, + "learning_rate": 7.615592003831132e-06, + "loss": 1.0153, + "step": 4941 + }, + { + "epoch": 0.9878813622848005, + "grad_norm": 2.015625, + "learning_rate": 7.614693832968079e-06, + "loss": 1.0465, + "step": 4942 + }, + { + "epoch": 0.9880812573398966, + "grad_norm": 2.0, + "learning_rate": 7.613795545960602e-06, + "loss": 1.0529, + "step": 4943 + }, + { + "epoch": 0.9882811523949926, + "grad_norm": 2.078125, + "learning_rate": 7.612897142848609e-06, + "loss": 1.1096, + "step": 4944 + }, + { + "epoch": 0.9884810474500887, + "grad_norm": 2.125, + "learning_rate": 7.611998623672004e-06, + "loss": 0.9959, + "step": 4945 + }, + { + "epoch": 0.9886809425051848, + "grad_norm": 2.09375, + "learning_rate": 7.611099988470697e-06, + "loss": 0.986, + "step": 4946 + }, + { + "epoch": 0.9888808375602809, + "grad_norm": 2.09375, + "learning_rate": 7.610201237284608e-06, + "loss": 1.0304, + "step": 4947 + }, + { + "epoch": 0.9890807326153769, + "grad_norm": 1.921875, + "learning_rate": 7.60930237015366e-06, + "loss": 0.9567, + "step": 4948 + }, + { + "epoch": 0.989280627670473, + "grad_norm": 2.078125, + "learning_rate": 7.608403387117779e-06, + "loss": 0.9946, + "step": 4949 + }, + { + "epoch": 0.9894805227255691, + "grad_norm": 2.0625, + "learning_rate": 7.607504288216898e-06, + "loss": 1.0602, + "step": 4950 + }, + { + "epoch": 0.9896804177806652, + "grad_norm": 2.109375, + "learning_rate": 7.606605073490955e-06, + "loss": 1.1024, + "step": 4951 + }, + { + "epoch": 0.9898803128357613, + "grad_norm": 2.09375, + "learning_rate": 7.605705742979892e-06, + "loss": 1.0524, + "step": 4952 + }, + { + "epoch": 0.9900802078908573, + "grad_norm": 2.015625, + "learning_rate": 7.604806296723659e-06, + "loss": 1.0077, + "step": 4953 + }, + { + "epoch": 0.9902801029459534, + "grad_norm": 2.1875, + "learning_rate": 7.603906734762209e-06, + "loss": 1.0811, + "step": 4954 + }, + { + "epoch": 0.9904799980010495, + "grad_norm": 2.203125, + "learning_rate": 7.6030070571354986e-06, + "loss": 1.0962, + "step": 4955 + }, + { + "epoch": 0.9906798930561456, + "grad_norm": 1.96875, + "learning_rate": 7.602107263883494e-06, + "loss": 0.9136, + "step": 4956 + }, + { + "epoch": 0.9908797881112416, + "grad_norm": 2.0625, + "learning_rate": 7.601207355046163e-06, + "loss": 1.1236, + "step": 4957 + }, + { + "epoch": 0.9910796831663377, + "grad_norm": 2.0625, + "learning_rate": 7.600307330663477e-06, + "loss": 0.9953, + "step": 4958 + }, + { + "epoch": 0.9912795782214338, + "grad_norm": 2.109375, + "learning_rate": 7.5994071907754185e-06, + "loss": 1.0416, + "step": 4959 + }, + { + "epoch": 0.9914794732765299, + "grad_norm": 2.203125, + "learning_rate": 7.598506935421969e-06, + "loss": 1.0197, + "step": 4960 + }, + { + "epoch": 0.9916793683316258, + "grad_norm": 2.0, + "learning_rate": 7.59760656464312e-06, + "loss": 0.9814, + "step": 4961 + }, + { + "epoch": 0.9918792633867219, + "grad_norm": 2.015625, + "learning_rate": 7.596706078478866e-06, + "loss": 0.9847, + "step": 4962 + }, + { + "epoch": 0.992079158441818, + "grad_norm": 2.109375, + "learning_rate": 7.595805476969205e-06, + "loss": 1.0569, + "step": 4963 + }, + { + "epoch": 0.9922790534969141, + "grad_norm": 2.09375, + "learning_rate": 7.594904760154142e-06, + "loss": 1.0601, + "step": 4964 + }, + { + "epoch": 0.9924789485520102, + "grad_norm": 2.1875, + "learning_rate": 7.594003928073685e-06, + "loss": 1.1193, + "step": 4965 + }, + { + "epoch": 0.9926788436071062, + "grad_norm": 2.09375, + "learning_rate": 7.593102980767853e-06, + "loss": 1.0392, + "step": 4966 + }, + { + "epoch": 0.9928787386622023, + "grad_norm": 2.109375, + "learning_rate": 7.592201918276661e-06, + "loss": 1.0762, + "step": 4967 + }, + { + "epoch": 0.9930786337172984, + "grad_norm": 2.375, + "learning_rate": 7.591300740640138e-06, + "loss": 1.0531, + "step": 4968 + }, + { + "epoch": 0.9932785287723945, + "grad_norm": 2.171875, + "learning_rate": 7.590399447898312e-06, + "loss": 0.9448, + "step": 4969 + }, + { + "epoch": 0.9934784238274905, + "grad_norm": 2.03125, + "learning_rate": 7.589498040091221e-06, + "loss": 0.9441, + "step": 4970 + }, + { + "epoch": 0.9936783188825866, + "grad_norm": 2.125, + "learning_rate": 7.588596517258901e-06, + "loss": 1.1056, + "step": 4971 + }, + { + "epoch": 0.9938782139376827, + "grad_norm": 2.109375, + "learning_rate": 7.5876948794414015e-06, + "loss": 1.0203, + "step": 4972 + }, + { + "epoch": 0.9940781089927788, + "grad_norm": 2.0625, + "learning_rate": 7.586793126678773e-06, + "loss": 0.9404, + "step": 4973 + }, + { + "epoch": 0.9942780040478749, + "grad_norm": 1.9765625, + "learning_rate": 7.585891259011069e-06, + "loss": 1.062, + "step": 4974 + }, + { + "epoch": 0.9944778991029709, + "grad_norm": 2.234375, + "learning_rate": 7.58498927647835e-06, + "loss": 1.132, + "step": 4975 + }, + { + "epoch": 0.994677794158067, + "grad_norm": 2.09375, + "learning_rate": 7.5840871791206845e-06, + "loss": 1.0546, + "step": 4976 + }, + { + "epoch": 0.9948776892131631, + "grad_norm": 2.046875, + "learning_rate": 7.583184966978143e-06, + "loss": 1.0652, + "step": 4977 + }, + { + "epoch": 0.9950775842682592, + "grad_norm": 2.015625, + "learning_rate": 7.582282640090801e-06, + "loss": 1.0238, + "step": 4978 + }, + { + "epoch": 0.9952774793233552, + "grad_norm": 2.09375, + "learning_rate": 7.581380198498743e-06, + "loss": 1.0439, + "step": 4979 + }, + { + "epoch": 0.9954773743784513, + "grad_norm": 2.09375, + "learning_rate": 7.580477642242048e-06, + "loss": 0.9548, + "step": 4980 + }, + { + "epoch": 0.9956772694335474, + "grad_norm": 1.953125, + "learning_rate": 7.5795749713608125e-06, + "loss": 0.9202, + "step": 4981 + }, + { + "epoch": 0.9958771644886435, + "grad_norm": 4.34375, + "learning_rate": 7.578672185895133e-06, + "loss": 1.0457, + "step": 4982 + }, + { + "epoch": 0.9960770595437395, + "grad_norm": 2.015625, + "learning_rate": 7.57776928588511e-06, + "loss": 0.9742, + "step": 4983 + }, + { + "epoch": 0.9962769545988356, + "grad_norm": 2.15625, + "learning_rate": 7.576866271370851e-06, + "loss": 1.0402, + "step": 4984 + }, + { + "epoch": 0.9964768496539317, + "grad_norm": 2.015625, + "learning_rate": 7.575963142392466e-06, + "loss": 0.9298, + "step": 4985 + }, + { + "epoch": 0.9966767447090278, + "grad_norm": 2.109375, + "learning_rate": 7.5750598989900745e-06, + "loss": 1.0118, + "step": 4986 + }, + { + "epoch": 0.9968766397641239, + "grad_norm": 2.03125, + "learning_rate": 7.574156541203799e-06, + "loss": 0.9927, + "step": 4987 + }, + { + "epoch": 0.9970765348192199, + "grad_norm": 2.109375, + "learning_rate": 7.573253069073763e-06, + "loss": 1.0184, + "step": 4988 + }, + { + "epoch": 0.997276429874316, + "grad_norm": 2.09375, + "learning_rate": 7.5723494826401e-06, + "loss": 0.9645, + "step": 4989 + }, + { + "epoch": 0.9974763249294121, + "grad_norm": 2.0625, + "learning_rate": 7.571445781942948e-06, + "loss": 0.9988, + "step": 4990 + }, + { + "epoch": 0.9976762199845082, + "grad_norm": 2.0625, + "learning_rate": 7.570541967022449e-06, + "loss": 1.0216, + "step": 4991 + }, + { + "epoch": 0.9978761150396042, + "grad_norm": 2.015625, + "learning_rate": 7.569638037918751e-06, + "loss": 1.0075, + "step": 4992 + }, + { + "epoch": 0.9980760100947003, + "grad_norm": 2.09375, + "learning_rate": 7.568733994672006e-06, + "loss": 1.1056, + "step": 4993 + }, + { + "epoch": 0.9982759051497964, + "grad_norm": 2.046875, + "learning_rate": 7.567829837322371e-06, + "loss": 0.9353, + "step": 4994 + }, + { + "epoch": 0.9984758002048925, + "grad_norm": 2.15625, + "learning_rate": 7.56692556591001e-06, + "loss": 1.153, + "step": 4995 + }, + { + "epoch": 0.9986756952599886, + "grad_norm": 2.078125, + "learning_rate": 7.566021180475088e-06, + "loss": 1.0002, + "step": 4996 + }, + { + "epoch": 0.9988755903150845, + "grad_norm": 2.015625, + "learning_rate": 7.565116681057779e-06, + "loss": 0.9701, + "step": 4997 + }, + { + "epoch": 0.9990754853701806, + "grad_norm": 2.203125, + "learning_rate": 7.564212067698262e-06, + "loss": 1.0824, + "step": 4998 + }, + { + "epoch": 0.9992753804252767, + "grad_norm": 2.0625, + "learning_rate": 7.563307340436718e-06, + "loss": 0.9802, + "step": 4999 + }, + { + "epoch": 0.9994752754803728, + "grad_norm": 2.03125, + "learning_rate": 7.562402499313336e-06, + "loss": 1.1098, + "step": 5000 + }, + { + "epoch": 0.9996751705354688, + "grad_norm": 2.0625, + "learning_rate": 7.561497544368309e-06, + "loss": 1.0144, + "step": 5001 + }, + { + "epoch": 0.9998750655905649, + "grad_norm": 2.1875, + "learning_rate": 7.560592475641835e-06, + "loss": 1.0494, + "step": 5002 + }, + { + "epoch": 1.000074960645661, + "grad_norm": 1.9765625, + "learning_rate": 7.559687293174115e-06, + "loss": 0.9346, + "step": 5003 + }, + { + "epoch": 1.000274855700757, + "grad_norm": 2.15625, + "learning_rate": 7.55878199700536e-06, + "loss": 1.0637, + "step": 5004 + }, + { + "epoch": 1.0004747507558531, + "grad_norm": 2.015625, + "learning_rate": 7.557876587175782e-06, + "loss": 0.9782, + "step": 5005 + }, + { + "epoch": 1.0006746458109492, + "grad_norm": 2.140625, + "learning_rate": 7.5569710637255985e-06, + "loss": 0.9836, + "step": 5006 + }, + { + "epoch": 1.0008745408660453, + "grad_norm": 2.015625, + "learning_rate": 7.556065426695035e-06, + "loss": 1.0265, + "step": 5007 + }, + { + "epoch": 1.0010744359211414, + "grad_norm": 2.09375, + "learning_rate": 7.555159676124317e-06, + "loss": 1.1274, + "step": 5008 + }, + { + "epoch": 1.0012743309762375, + "grad_norm": 2.171875, + "learning_rate": 7.55425381205368e-06, + "loss": 1.0692, + "step": 5009 + }, + { + "epoch": 1.0014742260313336, + "grad_norm": 2.078125, + "learning_rate": 7.553347834523361e-06, + "loss": 0.9171, + "step": 5010 + }, + { + "epoch": 1.0016741210864297, + "grad_norm": 2.109375, + "learning_rate": 7.552441743573604e-06, + "loss": 1.0443, + "step": 5011 + }, + { + "epoch": 1.0018740161415256, + "grad_norm": 2.1875, + "learning_rate": 7.551535539244657e-06, + "loss": 1.0757, + "step": 5012 + }, + { + "epoch": 1.0020739111966217, + "grad_norm": 1.8984375, + "learning_rate": 7.550629221576774e-06, + "loss": 0.9581, + "step": 5013 + }, + { + "epoch": 1.0022738062517178, + "grad_norm": 2.078125, + "learning_rate": 7.5497227906102125e-06, + "loss": 1.0106, + "step": 5014 + }, + { + "epoch": 1.002473701306814, + "grad_norm": 2.046875, + "learning_rate": 7.5488162463852385e-06, + "loss": 1.0183, + "step": 5015 + }, + { + "epoch": 1.00267359636191, + "grad_norm": 2.109375, + "learning_rate": 7.547909588942118e-06, + "loss": 0.9909, + "step": 5016 + }, + { + "epoch": 1.002873491417006, + "grad_norm": 2.03125, + "learning_rate": 7.547002818321125e-06, + "loss": 0.9133, + "step": 5017 + }, + { + "epoch": 1.0030733864721022, + "grad_norm": 2.078125, + "learning_rate": 7.54609593456254e-06, + "loss": 1.0139, + "step": 5018 + }, + { + "epoch": 1.0032732815271983, + "grad_norm": 2.109375, + "learning_rate": 7.545188937706647e-06, + "loss": 1.036, + "step": 5019 + }, + { + "epoch": 1.0034731765822944, + "grad_norm": 2.046875, + "learning_rate": 7.544281827793731e-06, + "loss": 1.0002, + "step": 5020 + }, + { + "epoch": 1.0036730716373903, + "grad_norm": 2.046875, + "learning_rate": 7.543374604864089e-06, + "loss": 1.0068, + "step": 5021 + }, + { + "epoch": 1.0038729666924864, + "grad_norm": 2.0625, + "learning_rate": 7.5424672689580185e-06, + "loss": 1.0954, + "step": 5022 + }, + { + "epoch": 1.0040728617475825, + "grad_norm": 2.21875, + "learning_rate": 7.541559820115823e-06, + "loss": 1.0482, + "step": 5023 + }, + { + "epoch": 1.0042727568026786, + "grad_norm": 1.96875, + "learning_rate": 7.540652258377812e-06, + "loss": 0.973, + "step": 5024 + }, + { + "epoch": 1.0044726518577747, + "grad_norm": 2.0, + "learning_rate": 7.539744583784299e-06, + "loss": 1.0635, + "step": 5025 + }, + { + "epoch": 1.0046725469128708, + "grad_norm": 2.0625, + "learning_rate": 7.538836796375603e-06, + "loss": 0.9313, + "step": 5026 + }, + { + "epoch": 1.0048724419679669, + "grad_norm": 3.09375, + "learning_rate": 7.537928896192048e-06, + "loss": 0.9956, + "step": 5027 + }, + { + "epoch": 1.005072337023063, + "grad_norm": 2.25, + "learning_rate": 7.5370208832739614e-06, + "loss": 1.0241, + "step": 5028 + }, + { + "epoch": 1.0052722320781589, + "grad_norm": 2.21875, + "learning_rate": 7.536112757661678e-06, + "loss": 1.0751, + "step": 5029 + }, + { + "epoch": 1.005472127133255, + "grad_norm": 2.03125, + "learning_rate": 7.535204519395538e-06, + "loss": 1.0667, + "step": 5030 + }, + { + "epoch": 1.005672022188351, + "grad_norm": 2.15625, + "learning_rate": 7.534296168515883e-06, + "loss": 1.0316, + "step": 5031 + }, + { + "epoch": 1.0058719172434472, + "grad_norm": 2.15625, + "learning_rate": 7.5333877050630645e-06, + "loss": 1.1334, + "step": 5032 + }, + { + "epoch": 1.0060718122985433, + "grad_norm": 2.046875, + "learning_rate": 7.532479129077433e-06, + "loss": 1.0552, + "step": 5033 + }, + { + "epoch": 1.0062717073536394, + "grad_norm": 2.125, + "learning_rate": 7.53157044059935e-06, + "loss": 1.0294, + "step": 5034 + }, + { + "epoch": 1.0064716024087355, + "grad_norm": 2.015625, + "learning_rate": 7.530661639669178e-06, + "loss": 0.9817, + "step": 5035 + }, + { + "epoch": 1.0066714974638316, + "grad_norm": 2.0, + "learning_rate": 7.529752726327286e-06, + "loss": 0.9589, + "step": 5036 + }, + { + "epoch": 1.0068713925189277, + "grad_norm": 2.234375, + "learning_rate": 7.5288437006140484e-06, + "loss": 1.0801, + "step": 5037 + }, + { + "epoch": 1.0070712875740235, + "grad_norm": 2.03125, + "learning_rate": 7.527934562569843e-06, + "loss": 0.9743, + "step": 5038 + }, + { + "epoch": 1.0072711826291196, + "grad_norm": 2.015625, + "learning_rate": 7.5270253122350555e-06, + "loss": 0.9501, + "step": 5039 + }, + { + "epoch": 1.0074710776842157, + "grad_norm": 2.21875, + "learning_rate": 7.526115949650073e-06, + "loss": 1.0456, + "step": 5040 + }, + { + "epoch": 1.0076709727393118, + "grad_norm": 2.15625, + "learning_rate": 7.5252064748552915e-06, + "loss": 0.9935, + "step": 5041 + }, + { + "epoch": 1.007870867794408, + "grad_norm": 2.046875, + "learning_rate": 7.524296887891105e-06, + "loss": 1.0365, + "step": 5042 + }, + { + "epoch": 1.008070762849504, + "grad_norm": 2.078125, + "learning_rate": 7.5233871887979215e-06, + "loss": 1.073, + "step": 5043 + }, + { + "epoch": 1.0082706579046001, + "grad_norm": 2.09375, + "learning_rate": 7.5224773776161495e-06, + "loss": 1.1247, + "step": 5044 + }, + { + "epoch": 1.0084705529596962, + "grad_norm": 2.140625, + "learning_rate": 7.5215674543862004e-06, + "loss": 1.1517, + "step": 5045 + }, + { + "epoch": 1.0086704480147923, + "grad_norm": 2.09375, + "learning_rate": 7.520657419148496e-06, + "loss": 1.0468, + "step": 5046 + }, + { + "epoch": 1.0088703430698882, + "grad_norm": 1.984375, + "learning_rate": 7.519747271943457e-06, + "loss": 0.9336, + "step": 5047 + }, + { + "epoch": 1.0090702381249843, + "grad_norm": 2.125, + "learning_rate": 7.5188370128115125e-06, + "loss": 1.0277, + "step": 5048 + }, + { + "epoch": 1.0092701331800804, + "grad_norm": 2.078125, + "learning_rate": 7.5179266417931e-06, + "loss": 0.9911, + "step": 5049 + }, + { + "epoch": 1.0094700282351765, + "grad_norm": 2.078125, + "learning_rate": 7.517016158928652e-06, + "loss": 0.9861, + "step": 5050 + }, + { + "epoch": 1.0096699232902726, + "grad_norm": 2.078125, + "learning_rate": 7.5161055642586154e-06, + "loss": 1.0165, + "step": 5051 + }, + { + "epoch": 1.0098698183453687, + "grad_norm": 2.140625, + "learning_rate": 7.5151948578234405e-06, + "loss": 1.0301, + "step": 5052 + }, + { + "epoch": 1.0100697134004648, + "grad_norm": 2.078125, + "learning_rate": 7.514284039663576e-06, + "loss": 0.9706, + "step": 5053 + }, + { + "epoch": 1.010269608455561, + "grad_norm": 2.1875, + "learning_rate": 7.5133731098194854e-06, + "loss": 1.0205, + "step": 5054 + }, + { + "epoch": 1.010469503510657, + "grad_norm": 2.109375, + "learning_rate": 7.5124620683316275e-06, + "loss": 1.0774, + "step": 5055 + }, + { + "epoch": 1.0106693985657529, + "grad_norm": 2.0625, + "learning_rate": 7.511550915240475e-06, + "loss": 1.0181, + "step": 5056 + }, + { + "epoch": 1.010869293620849, + "grad_norm": 2.140625, + "learning_rate": 7.510639650586498e-06, + "loss": 0.9683, + "step": 5057 + }, + { + "epoch": 1.011069188675945, + "grad_norm": 2.046875, + "learning_rate": 7.509728274410175e-06, + "loss": 1.035, + "step": 5058 + }, + { + "epoch": 1.0112690837310412, + "grad_norm": 2.078125, + "learning_rate": 7.508816786751991e-06, + "loss": 1.0364, + "step": 5059 + }, + { + "epoch": 1.0114689787861373, + "grad_norm": 2.125, + "learning_rate": 7.507905187652433e-06, + "loss": 1.0686, + "step": 5060 + }, + { + "epoch": 1.0116688738412334, + "grad_norm": 2.046875, + "learning_rate": 7.506993477151996e-06, + "loss": 1.0212, + "step": 5061 + }, + { + "epoch": 1.0118687688963295, + "grad_norm": 2.03125, + "learning_rate": 7.506081655291174e-06, + "loss": 1.054, + "step": 5062 + }, + { + "epoch": 1.0120686639514256, + "grad_norm": 1.9453125, + "learning_rate": 7.505169722110475e-06, + "loss": 1.0896, + "step": 5063 + }, + { + "epoch": 1.0122685590065217, + "grad_norm": 2.09375, + "learning_rate": 7.504257677650402e-06, + "loss": 1.1003, + "step": 5064 + }, + { + "epoch": 1.0124684540616176, + "grad_norm": 2.140625, + "learning_rate": 7.5033455219514705e-06, + "loss": 1.0835, + "step": 5065 + }, + { + "epoch": 1.0126683491167137, + "grad_norm": 2.046875, + "learning_rate": 7.5024332550542e-06, + "loss": 0.909, + "step": 5066 + }, + { + "epoch": 1.0128682441718098, + "grad_norm": 2.234375, + "learning_rate": 7.501520876999111e-06, + "loss": 1.0562, + "step": 5067 + }, + { + "epoch": 1.0130681392269059, + "grad_norm": 2.171875, + "learning_rate": 7.500608387826731e-06, + "loss": 1.0792, + "step": 5068 + }, + { + "epoch": 1.013268034282002, + "grad_norm": 1.9453125, + "learning_rate": 7.499695787577596e-06, + "loss": 1.0013, + "step": 5069 + }, + { + "epoch": 1.013467929337098, + "grad_norm": 1.984375, + "learning_rate": 7.498783076292238e-06, + "loss": 1.0855, + "step": 5070 + }, + { + "epoch": 1.0136678243921942, + "grad_norm": 2.0, + "learning_rate": 7.497870254011205e-06, + "loss": 1.0653, + "step": 5071 + }, + { + "epoch": 1.0138677194472903, + "grad_norm": 2.03125, + "learning_rate": 7.496957320775042e-06, + "loss": 0.9759, + "step": 5072 + }, + { + "epoch": 1.0140676145023861, + "grad_norm": 2.03125, + "learning_rate": 7.496044276624299e-06, + "loss": 1.0511, + "step": 5073 + }, + { + "epoch": 1.0142675095574822, + "grad_norm": 2.109375, + "learning_rate": 7.495131121599537e-06, + "loss": 0.9811, + "step": 5074 + }, + { + "epoch": 1.0144674046125783, + "grad_norm": 1.984375, + "learning_rate": 7.494217855741319e-06, + "loss": 0.9164, + "step": 5075 + }, + { + "epoch": 1.0146672996676744, + "grad_norm": 2.015625, + "learning_rate": 7.493304479090208e-06, + "loss": 0.971, + "step": 5076 + }, + { + "epoch": 1.0148671947227705, + "grad_norm": 2.125, + "learning_rate": 7.49239099168678e-06, + "loss": 1.0386, + "step": 5077 + }, + { + "epoch": 1.0150670897778666, + "grad_norm": 2.078125, + "learning_rate": 7.491477393571609e-06, + "loss": 0.9575, + "step": 5078 + }, + { + "epoch": 1.0152669848329627, + "grad_norm": 2.09375, + "learning_rate": 7.490563684785277e-06, + "loss": 1.02, + "step": 5079 + }, + { + "epoch": 1.0154668798880588, + "grad_norm": 2.03125, + "learning_rate": 7.489649865368375e-06, + "loss": 0.986, + "step": 5080 + }, + { + "epoch": 1.015666774943155, + "grad_norm": 2.203125, + "learning_rate": 7.488735935361491e-06, + "loss": 1.0406, + "step": 5081 + }, + { + "epoch": 1.000124934409435, + "grad_norm": 2.09375, + "learning_rate": 7.4878218948052206e-06, + "loss": 1.0103, + "step": 5082 + }, + { + "epoch": 1.000324829464531, + "grad_norm": 2.03125, + "learning_rate": 7.4869077437401705e-06, + "loss": 0.9574, + "step": 5083 + }, + { + "epoch": 1.0005247245196272, + "grad_norm": 1.9296875, + "learning_rate": 7.485993482206941e-06, + "loss": 0.916, + "step": 5084 + }, + { + "epoch": 1.0007246195747233, + "grad_norm": 2.015625, + "learning_rate": 7.48507911024615e-06, + "loss": 1.0239, + "step": 5085 + }, + { + "epoch": 1.0009245146298194, + "grad_norm": 2.015625, + "learning_rate": 7.484164627898407e-06, + "loss": 1.0095, + "step": 5086 + }, + { + "epoch": 1.0011244096849155, + "grad_norm": 2.015625, + "learning_rate": 7.483250035204338e-06, + "loss": 0.9091, + "step": 5087 + }, + { + "epoch": 1.0013243047400116, + "grad_norm": 2.171875, + "learning_rate": 7.482335332204568e-06, + "loss": 0.9781, + "step": 5088 + }, + { + "epoch": 1.0015241997951077, + "grad_norm": 2.0, + "learning_rate": 7.481420518939727e-06, + "loss": 0.897, + "step": 5089 + }, + { + "epoch": 1.0017240948502038, + "grad_norm": 2.0625, + "learning_rate": 7.480505595450451e-06, + "loss": 0.9291, + "step": 5090 + }, + { + "epoch": 1.0019239899052996, + "grad_norm": 2.078125, + "learning_rate": 7.4795905617773834e-06, + "loss": 0.9006, + "step": 5091 + }, + { + "epoch": 1.0021238849603957, + "grad_norm": 2.078125, + "learning_rate": 7.4786754179611666e-06, + "loss": 0.9878, + "step": 5092 + }, + { + "epoch": 1.0023237800154918, + "grad_norm": 2.03125, + "learning_rate": 7.477760164042451e-06, + "loss": 0.9614, + "step": 5093 + }, + { + "epoch": 1.002523675070588, + "grad_norm": 2.015625, + "learning_rate": 7.476844800061896e-06, + "loss": 0.9483, + "step": 5094 + }, + { + "epoch": 1.002723570125684, + "grad_norm": 2.078125, + "learning_rate": 7.4759293260601585e-06, + "loss": 0.9782, + "step": 5095 + }, + { + "epoch": 1.0029234651807801, + "grad_norm": 2.140625, + "learning_rate": 7.475013742077905e-06, + "loss": 0.9642, + "step": 5096 + }, + { + "epoch": 1.0031233602358762, + "grad_norm": 2.0625, + "learning_rate": 7.474098048155806e-06, + "loss": 0.8765, + "step": 5097 + }, + { + "epoch": 1.0033232552909723, + "grad_norm": 2.125, + "learning_rate": 7.473182244334533e-06, + "loss": 1.0415, + "step": 5098 + }, + { + "epoch": 1.0035231503460682, + "grad_norm": 2.078125, + "learning_rate": 7.472266330654773e-06, + "loss": 0.9978, + "step": 5099 + }, + { + "epoch": 1.0037230454011643, + "grad_norm": 2.390625, + "learning_rate": 7.471350307157204e-06, + "loss": 1.0603, + "step": 5100 + }, + { + "epoch": 1.0039229404562604, + "grad_norm": 2.0625, + "learning_rate": 7.470434173882519e-06, + "loss": 0.9902, + "step": 5101 + }, + { + "epoch": 1.0041228355113565, + "grad_norm": 1.9765625, + "learning_rate": 7.469517930871411e-06, + "loss": 0.8795, + "step": 5102 + }, + { + "epoch": 1.0043227305664526, + "grad_norm": 2.046875, + "learning_rate": 7.468601578164582e-06, + "loss": 1.0094, + "step": 5103 + }, + { + "epoch": 1.0045226256215487, + "grad_norm": 2.0625, + "learning_rate": 7.467685115802734e-06, + "loss": 0.9736, + "step": 5104 + }, + { + "epoch": 1.0047225206766448, + "grad_norm": 2.03125, + "learning_rate": 7.466768543826577e-06, + "loss": 1.0023, + "step": 5105 + }, + { + "epoch": 1.004922415731741, + "grad_norm": 2.0625, + "learning_rate": 7.465851862276824e-06, + "loss": 1.0279, + "step": 5106 + }, + { + "epoch": 1.005122310786837, + "grad_norm": 2.09375, + "learning_rate": 7.4649350711941935e-06, + "loss": 1.079, + "step": 5107 + }, + { + "epoch": 1.0053222058419329, + "grad_norm": 2.046875, + "learning_rate": 7.464018170619413e-06, + "loss": 0.9784, + "step": 5108 + }, + { + "epoch": 1.005522100897029, + "grad_norm": 2.1875, + "learning_rate": 7.4631011605932065e-06, + "loss": 1.0549, + "step": 5109 + }, + { + "epoch": 1.005721995952125, + "grad_norm": 2.0625, + "learning_rate": 7.462184041156309e-06, + "loss": 1.0397, + "step": 5110 + }, + { + "epoch": 1.0059218910072212, + "grad_norm": 2.078125, + "learning_rate": 7.461266812349462e-06, + "loss": 1.0394, + "step": 5111 + }, + { + "epoch": 1.0061217860623173, + "grad_norm": 2.171875, + "learning_rate": 7.460349474213404e-06, + "loss": 0.9565, + "step": 5112 + }, + { + "epoch": 1.0063216811174134, + "grad_norm": 2.125, + "learning_rate": 7.459432026788885e-06, + "loss": 0.9286, + "step": 5113 + }, + { + "epoch": 1.0065215761725095, + "grad_norm": 2.078125, + "learning_rate": 7.45851447011666e-06, + "loss": 0.9695, + "step": 5114 + }, + { + "epoch": 1.0067214712276056, + "grad_norm": 2.171875, + "learning_rate": 7.457596804237484e-06, + "loss": 0.9977, + "step": 5115 + }, + { + "epoch": 1.0069213662827017, + "grad_norm": 2.078125, + "learning_rate": 7.45667902919212e-06, + "loss": 0.9384, + "step": 5116 + }, + { + "epoch": 1.0071212613377976, + "grad_norm": 2.109375, + "learning_rate": 7.455761145021335e-06, + "loss": 0.9987, + "step": 5117 + }, + { + "epoch": 1.0073211563928937, + "grad_norm": 2.125, + "learning_rate": 7.454843151765904e-06, + "loss": 0.9157, + "step": 5118 + }, + { + "epoch": 1.0075210514479898, + "grad_norm": 2.125, + "learning_rate": 7.453925049466601e-06, + "loss": 0.9917, + "step": 5119 + }, + { + "epoch": 1.0077209465030859, + "grad_norm": 2.25, + "learning_rate": 7.453006838164211e-06, + "loss": 1.0897, + "step": 5120 + }, + { + "epoch": 1.007920841558182, + "grad_norm": 1.9765625, + "learning_rate": 7.452088517899518e-06, + "loss": 0.9977, + "step": 5121 + }, + { + "epoch": 1.008120736613278, + "grad_norm": 2.015625, + "learning_rate": 7.451170088713315e-06, + "loss": 0.938, + "step": 5122 + }, + { + "epoch": 1.0083206316683742, + "grad_norm": 2.171875, + "learning_rate": 7.450251550646398e-06, + "loss": 1.0497, + "step": 5123 + }, + { + "epoch": 1.0085205267234703, + "grad_norm": 2.15625, + "learning_rate": 7.449332903739569e-06, + "loss": 1.0163, + "step": 5124 + }, + { + "epoch": 1.0087204217785664, + "grad_norm": 2.109375, + "learning_rate": 7.4484141480336355e-06, + "loss": 1.0633, + "step": 5125 + }, + { + "epoch": 1.0089203168336622, + "grad_norm": 2.09375, + "learning_rate": 7.447495283569406e-06, + "loss": 0.9094, + "step": 5126 + }, + { + "epoch": 1.0091202118887583, + "grad_norm": 2.125, + "learning_rate": 7.446576310387696e-06, + "loss": 0.9195, + "step": 5127 + }, + { + "epoch": 1.0093201069438544, + "grad_norm": 2.21875, + "learning_rate": 7.44565722852933e-06, + "loss": 1.0892, + "step": 5128 + }, + { + "epoch": 1.0095200019989505, + "grad_norm": 2.21875, + "learning_rate": 7.44473803803513e-06, + "loss": 0.9135, + "step": 5129 + }, + { + "epoch": 1.0097198970540466, + "grad_norm": 2.078125, + "learning_rate": 7.443818738945927e-06, + "loss": 1.0406, + "step": 5130 + }, + { + "epoch": 1.0099197921091427, + "grad_norm": 2.078125, + "learning_rate": 7.442899331302557e-06, + "loss": 0.9805, + "step": 5131 + }, + { + "epoch": 1.0101196871642388, + "grad_norm": 2.1875, + "learning_rate": 7.44197981514586e-06, + "loss": 1.0349, + "step": 5132 + }, + { + "epoch": 1.010319582219335, + "grad_norm": 1.96875, + "learning_rate": 7.44106019051668e-06, + "loss": 0.9732, + "step": 5133 + }, + { + "epoch": 1.010519477274431, + "grad_norm": 2.0625, + "learning_rate": 7.440140457455869e-06, + "loss": 1.0351, + "step": 5134 + }, + { + "epoch": 1.010719372329527, + "grad_norm": 2.046875, + "learning_rate": 7.439220616004277e-06, + "loss": 1.0287, + "step": 5135 + }, + { + "epoch": 1.010919267384623, + "grad_norm": 2.171875, + "learning_rate": 7.438300666202767e-06, + "loss": 1.0394, + "step": 5136 + }, + { + "epoch": 1.011119162439719, + "grad_norm": 2.203125, + "learning_rate": 7.4373806080922005e-06, + "loss": 0.9785, + "step": 5137 + }, + { + "epoch": 1.0113190574948152, + "grad_norm": 2.109375, + "learning_rate": 7.436460441713448e-06, + "loss": 1.0159, + "step": 5138 + }, + { + "epoch": 1.0115189525499113, + "grad_norm": 2.078125, + "learning_rate": 7.435540167107384e-06, + "loss": 0.9642, + "step": 5139 + }, + { + "epoch": 1.0117188476050074, + "grad_norm": 2.21875, + "learning_rate": 7.434619784314885e-06, + "loss": 0.9716, + "step": 5140 + }, + { + "epoch": 1.0119187426601035, + "grad_norm": 2.4375, + "learning_rate": 7.433699293376835e-06, + "loss": 0.9343, + "step": 5141 + }, + { + "epoch": 1.0121186377151996, + "grad_norm": 2.171875, + "learning_rate": 7.432778694334124e-06, + "loss": 1.0314, + "step": 5142 + }, + { + "epoch": 1.0123185327702955, + "grad_norm": 2.234375, + "learning_rate": 7.431857987227642e-06, + "loss": 1.0005, + "step": 5143 + }, + { + "epoch": 1.0125184278253916, + "grad_norm": 2.046875, + "learning_rate": 7.430937172098288e-06, + "loss": 0.9713, + "step": 5144 + }, + { + "epoch": 1.0127183228804877, + "grad_norm": 2.09375, + "learning_rate": 7.430016248986964e-06, + "loss": 1.0475, + "step": 5145 + }, + { + "epoch": 1.0129182179355838, + "grad_norm": 2.203125, + "learning_rate": 7.429095217934578e-06, + "loss": 1.0999, + "step": 5146 + }, + { + "epoch": 1.0131181129906799, + "grad_norm": 2.234375, + "learning_rate": 7.428174078982042e-06, + "loss": 1.0007, + "step": 5147 + }, + { + "epoch": 1.013318008045776, + "grad_norm": 2.203125, + "learning_rate": 7.427252832170273e-06, + "loss": 0.9999, + "step": 5148 + }, + { + "epoch": 1.013517903100872, + "grad_norm": 2.109375, + "learning_rate": 7.426331477540193e-06, + "loss": 1.0016, + "step": 5149 + }, + { + "epoch": 1.0137177981559682, + "grad_norm": 1.9765625, + "learning_rate": 7.425410015132728e-06, + "loss": 0.9672, + "step": 5150 + }, + { + "epoch": 1.0139176932110643, + "grad_norm": 2.09375, + "learning_rate": 7.424488444988807e-06, + "loss": 0.9684, + "step": 5151 + }, + { + "epoch": 1.0141175882661602, + "grad_norm": 2.171875, + "learning_rate": 7.42356676714937e-06, + "loss": 0.8617, + "step": 5152 + }, + { + "epoch": 1.0143174833212563, + "grad_norm": 2.046875, + "learning_rate": 7.422644981655356e-06, + "loss": 0.9923, + "step": 5153 + }, + { + "epoch": 1.0145173783763524, + "grad_norm": 2.125, + "learning_rate": 7.42172308854771e-06, + "loss": 0.8722, + "step": 5154 + }, + { + "epoch": 1.0147172734314485, + "grad_norm": 2.03125, + "learning_rate": 7.420801087867382e-06, + "loss": 0.9362, + "step": 5155 + }, + { + "epoch": 1.0149171684865446, + "grad_norm": 2.109375, + "learning_rate": 7.419878979655331e-06, + "loss": 0.9697, + "step": 5156 + }, + { + "epoch": 1.0151170635416407, + "grad_norm": 2.109375, + "learning_rate": 7.418956763952512e-06, + "loss": 1.011, + "step": 5157 + }, + { + "epoch": 1.0153169585967368, + "grad_norm": 2.140625, + "learning_rate": 7.418034440799892e-06, + "loss": 1.0548, + "step": 5158 + }, + { + "epoch": 1.0155168536518329, + "grad_norm": 2.046875, + "learning_rate": 7.417112010238442e-06, + "loss": 0.9735, + "step": 5159 + }, + { + "epoch": 1.015716748706929, + "grad_norm": 2.109375, + "learning_rate": 7.416189472309133e-06, + "loss": 0.9858, + "step": 5160 + }, + { + "epoch": 1.0159166437620248, + "grad_norm": 2.03125, + "learning_rate": 7.415266827052947e-06, + "loss": 0.9463, + "step": 5161 + }, + { + "epoch": 1.016116538817121, + "grad_norm": 2.09375, + "learning_rate": 7.414344074510865e-06, + "loss": 0.9858, + "step": 5162 + }, + { + "epoch": 1.016316433872217, + "grad_norm": 2.1875, + "learning_rate": 7.413421214723878e-06, + "loss": 1.0522, + "step": 5163 + }, + { + "epoch": 1.0165163289273131, + "grad_norm": 2.09375, + "learning_rate": 7.412498247732979e-06, + "loss": 1.021, + "step": 5164 + }, + { + "epoch": 1.0167162239824092, + "grad_norm": 1.921875, + "learning_rate": 7.4115751735791655e-06, + "loss": 0.9006, + "step": 5165 + }, + { + "epoch": 1.0169161190375053, + "grad_norm": 2.1875, + "learning_rate": 7.410651992303439e-06, + "loss": 1.0421, + "step": 5166 + }, + { + "epoch": 1.0171160140926014, + "grad_norm": 2.078125, + "learning_rate": 7.40972870394681e-06, + "loss": 1.002, + "step": 5167 + }, + { + "epoch": 1.0173159091476975, + "grad_norm": 2.015625, + "learning_rate": 7.408805308550288e-06, + "loss": 0.9943, + "step": 5168 + }, + { + "epoch": 1.0175158042027936, + "grad_norm": 2.21875, + "learning_rate": 7.407881806154892e-06, + "loss": 1.0892, + "step": 5169 + }, + { + "epoch": 1.0177156992578895, + "grad_norm": 2.15625, + "learning_rate": 7.406958196801644e-06, + "loss": 0.9304, + "step": 5170 + }, + { + "epoch": 1.0179155943129856, + "grad_norm": 2.171875, + "learning_rate": 7.406034480531568e-06, + "loss": 0.9051, + "step": 5171 + }, + { + "epoch": 1.0181154893680817, + "grad_norm": 2.21875, + "learning_rate": 7.405110657385699e-06, + "loss": 0.9884, + "step": 5172 + }, + { + "epoch": 1.0183153844231778, + "grad_norm": 2.1875, + "learning_rate": 7.4041867274050715e-06, + "loss": 1.0153, + "step": 5173 + }, + { + "epoch": 1.018515279478274, + "grad_norm": 2.03125, + "learning_rate": 7.403262690630725e-06, + "loss": 0.8693, + "step": 5174 + }, + { + "epoch": 1.01871517453337, + "grad_norm": 2.140625, + "learning_rate": 7.402338547103708e-06, + "loss": 1.012, + "step": 5175 + }, + { + "epoch": 1.0189150695884661, + "grad_norm": 2.203125, + "learning_rate": 7.401414296865068e-06, + "loss": 1.0537, + "step": 5176 + }, + { + "epoch": 1.0191149646435622, + "grad_norm": 2.109375, + "learning_rate": 7.400489939955862e-06, + "loss": 0.9194, + "step": 5177 + }, + { + "epoch": 1.019314859698658, + "grad_norm": 2.171875, + "learning_rate": 7.39956547641715e-06, + "loss": 0.9411, + "step": 5178 + }, + { + "epoch": 1.0195147547537542, + "grad_norm": 2.125, + "learning_rate": 7.398640906289996e-06, + "loss": 0.9623, + "step": 5179 + }, + { + "epoch": 1.0197146498088503, + "grad_norm": 2.0, + "learning_rate": 7.397716229615468e-06, + "loss": 0.9034, + "step": 5180 + }, + { + "epoch": 1.0199145448639464, + "grad_norm": 2.125, + "learning_rate": 7.396791446434641e-06, + "loss": 0.9748, + "step": 5181 + }, + { + "epoch": 1.0201144399190425, + "grad_norm": 2.109375, + "learning_rate": 7.3958665567885945e-06, + "loss": 0.9026, + "step": 5182 + }, + { + "epoch": 1.0203143349741386, + "grad_norm": 2.140625, + "learning_rate": 7.394941560718412e-06, + "loss": 0.9915, + "step": 5183 + }, + { + "epoch": 1.0205142300292347, + "grad_norm": 2.140625, + "learning_rate": 7.394016458265181e-06, + "loss": 0.9947, + "step": 5184 + }, + { + "epoch": 1.0207141250843308, + "grad_norm": 2.203125, + "learning_rate": 7.3930912494699935e-06, + "loss": 1.004, + "step": 5185 + }, + { + "epoch": 1.020914020139427, + "grad_norm": 1.9765625, + "learning_rate": 7.3921659343739485e-06, + "loss": 0.9647, + "step": 5186 + }, + { + "epoch": 1.0211139151945228, + "grad_norm": 2.0, + "learning_rate": 7.391240513018149e-06, + "loss": 0.9818, + "step": 5187 + }, + { + "epoch": 1.0213138102496189, + "grad_norm": 2.109375, + "learning_rate": 7.3903149854437e-06, + "loss": 0.8793, + "step": 5188 + }, + { + "epoch": 1.021513705304715, + "grad_norm": 2.046875, + "learning_rate": 7.389389351691717e-06, + "loss": 1.012, + "step": 5189 + }, + { + "epoch": 1.021713600359811, + "grad_norm": 2.109375, + "learning_rate": 7.3884636118033115e-06, + "loss": 0.9668, + "step": 5190 + }, + { + "epoch": 1.0219134954149072, + "grad_norm": 2.34375, + "learning_rate": 7.387537765819609e-06, + "loss": 0.9573, + "step": 5191 + }, + { + "epoch": 1.0221133904700033, + "grad_norm": 2.046875, + "learning_rate": 7.3866118137817344e-06, + "loss": 0.9249, + "step": 5192 + }, + { + "epoch": 1.0223132855250994, + "grad_norm": 2.09375, + "learning_rate": 7.385685755730816e-06, + "loss": 1.0011, + "step": 5193 + }, + { + "epoch": 1.0225131805801955, + "grad_norm": 2.078125, + "learning_rate": 7.384759591707993e-06, + "loss": 1.0457, + "step": 5194 + }, + { + "epoch": 1.0227130756352916, + "grad_norm": 2.171875, + "learning_rate": 7.3838333217544035e-06, + "loss": 0.986, + "step": 5195 + }, + { + "epoch": 1.0229129706903874, + "grad_norm": 2.234375, + "learning_rate": 7.3829069459111925e-06, + "loss": 1.0031, + "step": 5196 + }, + { + "epoch": 1.0231128657454835, + "grad_norm": 2.046875, + "learning_rate": 7.381980464219508e-06, + "loss": 0.9511, + "step": 5197 + }, + { + "epoch": 1.0233127608005796, + "grad_norm": 2.15625, + "learning_rate": 7.381053876720508e-06, + "loss": 1.0288, + "step": 5198 + }, + { + "epoch": 1.0235126558556757, + "grad_norm": 2.0, + "learning_rate": 7.380127183455345e-06, + "loss": 0.9307, + "step": 5199 + }, + { + "epoch": 1.0237125509107718, + "grad_norm": 2.078125, + "learning_rate": 7.379200384465191e-06, + "loss": 0.9667, + "step": 5200 + }, + { + "epoch": 1.023912445965868, + "grad_norm": 2.109375, + "learning_rate": 7.378273479791208e-06, + "loss": 0.968, + "step": 5201 + }, + { + "epoch": 1.024112341020964, + "grad_norm": 2.109375, + "learning_rate": 7.377346469474571e-06, + "loss": 0.9898, + "step": 5202 + }, + { + "epoch": 1.0243122360760601, + "grad_norm": 2.203125, + "learning_rate": 7.376419353556458e-06, + "loss": 0.9714, + "step": 5203 + }, + { + "epoch": 1.0245121311311562, + "grad_norm": 2.078125, + "learning_rate": 7.375492132078051e-06, + "loss": 0.9354, + "step": 5204 + }, + { + "epoch": 1.0247120261862521, + "grad_norm": 2.15625, + "learning_rate": 7.374564805080537e-06, + "loss": 1.0316, + "step": 5205 + }, + { + "epoch": 1.0249119212413482, + "grad_norm": 2.09375, + "learning_rate": 7.373637372605111e-06, + "loss": 0.9557, + "step": 5206 + }, + { + "epoch": 1.0251118162964443, + "grad_norm": 2.03125, + "learning_rate": 7.372709834692962e-06, + "loss": 0.9577, + "step": 5207 + }, + { + "epoch": 1.0253117113515404, + "grad_norm": 2.0, + "learning_rate": 7.371782191385297e-06, + "loss": 0.8865, + "step": 5208 + }, + { + "epoch": 1.0255116064066365, + "grad_norm": 2.171875, + "learning_rate": 7.370854442723322e-06, + "loss": 0.9397, + "step": 5209 + }, + { + "epoch": 1.0257115014617326, + "grad_norm": 2.015625, + "learning_rate": 7.369926588748244e-06, + "loss": 0.9066, + "step": 5210 + }, + { + "epoch": 1.0259113965168287, + "grad_norm": 2.171875, + "learning_rate": 7.368998629501282e-06, + "loss": 1.0038, + "step": 5211 + }, + { + "epoch": 1.0261112915719248, + "grad_norm": 2.0625, + "learning_rate": 7.368070565023653e-06, + "loss": 0.9404, + "step": 5212 + }, + { + "epoch": 1.026311186627021, + "grad_norm": 2.0625, + "learning_rate": 7.367142395356581e-06, + "loss": 0.8999, + "step": 5213 + }, + { + "epoch": 1.0265110816821168, + "grad_norm": 2.125, + "learning_rate": 7.3662141205412975e-06, + "loss": 0.9461, + "step": 5214 + }, + { + "epoch": 1.026710976737213, + "grad_norm": 2.296875, + "learning_rate": 7.365285740619036e-06, + "loss": 0.9219, + "step": 5215 + }, + { + "epoch": 1.026910871792309, + "grad_norm": 2.15625, + "learning_rate": 7.364357255631034e-06, + "loss": 0.9997, + "step": 5216 + }, + { + "epoch": 1.027110766847405, + "grad_norm": 2.109375, + "learning_rate": 7.363428665618535e-06, + "loss": 0.9891, + "step": 5217 + }, + { + "epoch": 1.0273106619025012, + "grad_norm": 2.078125, + "learning_rate": 7.3624999706227885e-06, + "loss": 0.9606, + "step": 5218 + }, + { + "epoch": 1.0275105569575973, + "grad_norm": 2.296875, + "learning_rate": 7.361571170685043e-06, + "loss": 1.0675, + "step": 5219 + }, + { + "epoch": 1.0277104520126934, + "grad_norm": 2.046875, + "learning_rate": 7.360642265846562e-06, + "loss": 0.9213, + "step": 5220 + }, + { + "epoch": 1.0279103470677895, + "grad_norm": 1.921875, + "learning_rate": 7.359713256148601e-06, + "loss": 0.8831, + "step": 5221 + }, + { + "epoch": 1.0281102421228854, + "grad_norm": 2.0625, + "learning_rate": 7.358784141632429e-06, + "loss": 1.1255, + "step": 5222 + }, + { + "epoch": 1.0283101371779815, + "grad_norm": 2.03125, + "learning_rate": 7.357854922339318e-06, + "loss": 0.9533, + "step": 5223 + }, + { + "epoch": 1.0285100322330776, + "grad_norm": 2.125, + "learning_rate": 7.356925598310544e-06, + "loss": 0.9819, + "step": 5224 + }, + { + "epoch": 1.0287099272881737, + "grad_norm": 2.046875, + "learning_rate": 7.355996169587385e-06, + "loss": 0.8877, + "step": 5225 + }, + { + "epoch": 1.0289098223432698, + "grad_norm": 2.046875, + "learning_rate": 7.35506663621113e-06, + "loss": 0.9553, + "step": 5226 + }, + { + "epoch": 1.0291097173983659, + "grad_norm": 2.15625, + "learning_rate": 7.354136998223066e-06, + "loss": 1.0505, + "step": 5227 + }, + { + "epoch": 1.029309612453462, + "grad_norm": 2.15625, + "learning_rate": 7.353207255664486e-06, + "loss": 0.9877, + "step": 5228 + }, + { + "epoch": 1.029509507508558, + "grad_norm": 2.15625, + "learning_rate": 7.352277408576693e-06, + "loss": 1.1288, + "step": 5229 + }, + { + "epoch": 1.0297094025636542, + "grad_norm": 2.109375, + "learning_rate": 7.3513474570009876e-06, + "loss": 0.9763, + "step": 5230 + }, + { + "epoch": 1.02990929761875, + "grad_norm": 2.078125, + "learning_rate": 7.35041740097868e-06, + "loss": 0.975, + "step": 5231 + }, + { + "epoch": 1.0301091926738462, + "grad_norm": 2.234375, + "learning_rate": 7.349487240551083e-06, + "loss": 1.0409, + "step": 5232 + }, + { + "epoch": 1.0303090877289423, + "grad_norm": 2.234375, + "learning_rate": 7.348556975759512e-06, + "loss": 1.0961, + "step": 5233 + }, + { + "epoch": 1.0305089827840384, + "grad_norm": 2.015625, + "learning_rate": 7.3476266066452925e-06, + "loss": 1.0103, + "step": 5234 + }, + { + "epoch": 1.0307088778391345, + "grad_norm": 2.15625, + "learning_rate": 7.346696133249749e-06, + "loss": 0.9519, + "step": 5235 + }, + { + "epoch": 1.0309087728942306, + "grad_norm": 2.046875, + "learning_rate": 7.345765555614214e-06, + "loss": 0.9152, + "step": 5236 + }, + { + "epoch": 1.0311086679493267, + "grad_norm": 2.078125, + "learning_rate": 7.344834873780024e-06, + "loss": 1.0414, + "step": 5237 + }, + { + "epoch": 1.0313085630044228, + "grad_norm": 2.046875, + "learning_rate": 7.3439040877885184e-06, + "loss": 0.9877, + "step": 5238 + }, + { + "epoch": 1.0315084580595189, + "grad_norm": 2.0625, + "learning_rate": 7.342973197681045e-06, + "loss": 0.9529, + "step": 5239 + }, + { + "epoch": 1.0317083531146147, + "grad_norm": 2.046875, + "learning_rate": 7.342042203498952e-06, + "loss": 0.9597, + "step": 5240 + }, + { + "epoch": 1.0319082481697108, + "grad_norm": 2.140625, + "learning_rate": 7.341111105283594e-06, + "loss": 0.9566, + "step": 5241 + }, + { + "epoch": 1.032108143224807, + "grad_norm": 2.171875, + "learning_rate": 7.34017990307633e-06, + "loss": 1.0736, + "step": 5242 + }, + { + "epoch": 1.032308038279903, + "grad_norm": 2.140625, + "learning_rate": 7.339248596918526e-06, + "loss": 1.0523, + "step": 5243 + }, + { + "epoch": 1.0325079333349991, + "grad_norm": 2.0, + "learning_rate": 7.338317186851549e-06, + "loss": 1.0294, + "step": 5244 + }, + { + "epoch": 1.0327078283900952, + "grad_norm": 2.140625, + "learning_rate": 7.337385672916772e-06, + "loss": 0.9691, + "step": 5245 + }, + { + "epoch": 1.0329077234451913, + "grad_norm": 2.1875, + "learning_rate": 7.336454055155573e-06, + "loss": 1.0367, + "step": 5246 + }, + { + "epoch": 1.0331076185002874, + "grad_norm": 2.046875, + "learning_rate": 7.335522333609334e-06, + "loss": 0.9095, + "step": 5247 + }, + { + "epoch": 1.0333075135553835, + "grad_norm": 2.0625, + "learning_rate": 7.334590508319442e-06, + "loss": 0.9809, + "step": 5248 + }, + { + "epoch": 1.0335074086104794, + "grad_norm": 2.046875, + "learning_rate": 7.3336585793272905e-06, + "loss": 0.9904, + "step": 5249 + }, + { + "epoch": 1.0337073036655755, + "grad_norm": 2.140625, + "learning_rate": 7.3327265466742734e-06, + "loss": 0.941, + "step": 5250 + }, + { + "epoch": 1.0339071987206716, + "grad_norm": 2.296875, + "learning_rate": 7.331794410401792e-06, + "loss": 1.0044, + "step": 5251 + }, + { + "epoch": 1.0341070937757677, + "grad_norm": 2.109375, + "learning_rate": 7.330862170551253e-06, + "loss": 0.8835, + "step": 5252 + }, + { + "epoch": 1.0343069888308638, + "grad_norm": 2.125, + "learning_rate": 7.329929827164064e-06, + "loss": 0.9934, + "step": 5253 + }, + { + "epoch": 1.03450688388596, + "grad_norm": 2.15625, + "learning_rate": 7.328997380281642e-06, + "loss": 0.9716, + "step": 5254 + }, + { + "epoch": 1.034706778941056, + "grad_norm": 2.25, + "learning_rate": 7.3280648299454035e-06, + "loss": 1.0728, + "step": 5255 + }, + { + "epoch": 1.034906673996152, + "grad_norm": 2.171875, + "learning_rate": 7.3271321761967754e-06, + "loss": 0.9832, + "step": 5256 + }, + { + "epoch": 1.0351065690512482, + "grad_norm": 2.09375, + "learning_rate": 7.326199419077185e-06, + "loss": 1.0008, + "step": 5257 + }, + { + "epoch": 1.035306464106344, + "grad_norm": 2.015625, + "learning_rate": 7.325266558628064e-06, + "loss": 0.9786, + "step": 5258 + }, + { + "epoch": 1.0355063591614402, + "grad_norm": 2.0625, + "learning_rate": 7.32433359489085e-06, + "loss": 0.9522, + "step": 5259 + }, + { + "epoch": 1.0357062542165363, + "grad_norm": 2.0, + "learning_rate": 7.323400527906988e-06, + "loss": 0.9269, + "step": 5260 + }, + { + "epoch": 1.0359061492716324, + "grad_norm": 2.171875, + "learning_rate": 7.32246735771792e-06, + "loss": 0.8953, + "step": 5261 + }, + { + "epoch": 1.0361060443267285, + "grad_norm": 2.109375, + "learning_rate": 7.321534084365101e-06, + "loss": 1.0412, + "step": 5262 + }, + { + "epoch": 1.0363059393818246, + "grad_norm": 2.234375, + "learning_rate": 7.320600707889988e-06, + "loss": 1.0666, + "step": 5263 + }, + { + "epoch": 1.0365058344369207, + "grad_norm": 2.03125, + "learning_rate": 7.3196672283340364e-06, + "loss": 0.9433, + "step": 5264 + }, + { + "epoch": 1.0367057294920168, + "grad_norm": 2.078125, + "learning_rate": 7.318733645738716e-06, + "loss": 0.9937, + "step": 5265 + }, + { + "epoch": 1.0369056245471127, + "grad_norm": 2.109375, + "learning_rate": 7.317799960145495e-06, + "loss": 1.0192, + "step": 5266 + }, + { + "epoch": 1.0371055196022088, + "grad_norm": 2.03125, + "learning_rate": 7.316866171595846e-06, + "loss": 0.9529, + "step": 5267 + }, + { + "epoch": 1.0373054146573049, + "grad_norm": 2.140625, + "learning_rate": 7.31593228013125e-06, + "loss": 1.0775, + "step": 5268 + }, + { + "epoch": 1.037505309712401, + "grad_norm": 2.09375, + "learning_rate": 7.314998285793189e-06, + "loss": 1.0069, + "step": 5269 + }, + { + "epoch": 1.037705204767497, + "grad_norm": 2.265625, + "learning_rate": 7.314064188623151e-06, + "loss": 0.9793, + "step": 5270 + }, + { + "epoch": 1.0379050998225932, + "grad_norm": 2.140625, + "learning_rate": 7.313129988662631e-06, + "loss": 1.0244, + "step": 5271 + }, + { + "epoch": 1.0381049948776893, + "grad_norm": 2.140625, + "learning_rate": 7.312195685953122e-06, + "loss": 1.026, + "step": 5272 + }, + { + "epoch": 1.0383048899327854, + "grad_norm": 2.0625, + "learning_rate": 7.311261280536129e-06, + "loss": 0.9317, + "step": 5273 + }, + { + "epoch": 1.0385047849878815, + "grad_norm": 2.09375, + "learning_rate": 7.310326772453156e-06, + "loss": 1.0624, + "step": 5274 + }, + { + "epoch": 1.0387046800429773, + "grad_norm": 2.03125, + "learning_rate": 7.309392161745714e-06, + "loss": 0.9344, + "step": 5275 + }, + { + "epoch": 1.0389045750980734, + "grad_norm": 2.046875, + "learning_rate": 7.3084574484553185e-06, + "loss": 0.9381, + "step": 5276 + }, + { + "epoch": 1.0391044701531695, + "grad_norm": 2.046875, + "learning_rate": 7.307522632623491e-06, + "loss": 0.9804, + "step": 5277 + }, + { + "epoch": 1.0393043652082656, + "grad_norm": 2.234375, + "learning_rate": 7.306587714291753e-06, + "loss": 1.0398, + "step": 5278 + }, + { + "epoch": 1.0395042602633617, + "grad_norm": 2.125, + "learning_rate": 7.305652693501637e-06, + "loss": 1.012, + "step": 5279 + }, + { + "epoch": 1.0397041553184578, + "grad_norm": 2.09375, + "learning_rate": 7.304717570294674e-06, + "loss": 0.9732, + "step": 5280 + }, + { + "epoch": 1.039904050373554, + "grad_norm": 2.078125, + "learning_rate": 7.303782344712401e-06, + "loss": 1.025, + "step": 5281 + }, + { + "epoch": 1.04010394542865, + "grad_norm": 2.09375, + "learning_rate": 7.302847016796365e-06, + "loss": 0.9559, + "step": 5282 + }, + { + "epoch": 1.0403038404837461, + "grad_norm": 2.078125, + "learning_rate": 7.301911586588108e-06, + "loss": 0.9812, + "step": 5283 + }, + { + "epoch": 1.040503735538842, + "grad_norm": 2.15625, + "learning_rate": 7.300976054129185e-06, + "loss": 0.9592, + "step": 5284 + }, + { + "epoch": 1.0407036305939381, + "grad_norm": 2.125, + "learning_rate": 7.300040419461153e-06, + "loss": 0.9289, + "step": 5285 + }, + { + "epoch": 1.0409035256490342, + "grad_norm": 2.125, + "learning_rate": 7.2991046826255685e-06, + "loss": 0.9968, + "step": 5286 + }, + { + "epoch": 1.0411034207041303, + "grad_norm": 2.15625, + "learning_rate": 7.298168843664001e-06, + "loss": 0.9789, + "step": 5287 + }, + { + "epoch": 1.0413033157592264, + "grad_norm": 2.0625, + "learning_rate": 7.297232902618021e-06, + "loss": 0.8654, + "step": 5288 + }, + { + "epoch": 1.0415032108143225, + "grad_norm": 2.21875, + "learning_rate": 7.296296859529199e-06, + "loss": 0.9042, + "step": 5289 + }, + { + "epoch": 1.0417031058694186, + "grad_norm": 2.0625, + "learning_rate": 7.295360714439115e-06, + "loss": 0.9787, + "step": 5290 + }, + { + "epoch": 1.0419030009245147, + "grad_norm": 2.046875, + "learning_rate": 7.294424467389354e-06, + "loss": 0.9877, + "step": 5291 + }, + { + "epoch": 1.0421028959796108, + "grad_norm": 2.296875, + "learning_rate": 7.293488118421502e-06, + "loss": 1.0066, + "step": 5292 + }, + { + "epoch": 1.0423027910347067, + "grad_norm": 2.328125, + "learning_rate": 7.292551667577153e-06, + "loss": 0.9006, + "step": 5293 + }, + { + "epoch": 1.0425026860898028, + "grad_norm": 2.125, + "learning_rate": 7.291615114897905e-06, + "loss": 0.9904, + "step": 5294 + }, + { + "epoch": 1.0427025811448989, + "grad_norm": 2.140625, + "learning_rate": 7.290678460425358e-06, + "loss": 0.9638, + "step": 5295 + }, + { + "epoch": 1.042902476199995, + "grad_norm": 2.0625, + "learning_rate": 7.289741704201119e-06, + "loss": 0.9075, + "step": 5296 + }, + { + "epoch": 1.043102371255091, + "grad_norm": 2.1875, + "learning_rate": 7.288804846266796e-06, + "loss": 0.9738, + "step": 5297 + }, + { + "epoch": 1.0433022663101872, + "grad_norm": 2.078125, + "learning_rate": 7.287867886664008e-06, + "loss": 0.9857, + "step": 5298 + }, + { + "epoch": 1.0435021613652833, + "grad_norm": 2.140625, + "learning_rate": 7.286930825434372e-06, + "loss": 0.9809, + "step": 5299 + }, + { + "epoch": 1.0437020564203794, + "grad_norm": 3.0, + "learning_rate": 7.2859936626195126e-06, + "loss": 1.0613, + "step": 5300 + }, + { + "epoch": 1.0439019514754753, + "grad_norm": 2.15625, + "learning_rate": 7.285056398261059e-06, + "loss": 0.9796, + "step": 5301 + }, + { + "epoch": 1.0441018465305714, + "grad_norm": 2.03125, + "learning_rate": 7.2841190324006464e-06, + "loss": 0.8832, + "step": 5302 + }, + { + "epoch": 1.0443017415856675, + "grad_norm": 2.203125, + "learning_rate": 7.283181565079907e-06, + "loss": 1.0052, + "step": 5303 + }, + { + "epoch": 1.0445016366407636, + "grad_norm": 2.078125, + "learning_rate": 7.28224399634049e-06, + "loss": 0.9675, + "step": 5304 + }, + { + "epoch": 1.0447015316958597, + "grad_norm": 2.15625, + "learning_rate": 7.2813063262240355e-06, + "loss": 1.0163, + "step": 5305 + }, + { + "epoch": 1.0449014267509558, + "grad_norm": 2.265625, + "learning_rate": 7.280368554772199e-06, + "loss": 0.9621, + "step": 5306 + }, + { + "epoch": 1.0451013218060519, + "grad_norm": 2.140625, + "learning_rate": 7.2794306820266335e-06, + "loss": 1.0418, + "step": 5307 + }, + { + "epoch": 1.045301216861148, + "grad_norm": 2.21875, + "learning_rate": 7.2784927080290025e-06, + "loss": 1.0345, + "step": 5308 + }, + { + "epoch": 1.045501111916244, + "grad_norm": 2.171875, + "learning_rate": 7.277554632820968e-06, + "loss": 1.0125, + "step": 5309 + }, + { + "epoch": 1.04570100697134, + "grad_norm": 2.078125, + "learning_rate": 7.2766164564442e-06, + "loss": 0.9752, + "step": 5310 + }, + { + "epoch": 1.045900902026436, + "grad_norm": 2.15625, + "learning_rate": 7.275678178940372e-06, + "loss": 1.078, + "step": 5311 + }, + { + "epoch": 1.0461007970815321, + "grad_norm": 2.234375, + "learning_rate": 7.274739800351162e-06, + "loss": 0.9943, + "step": 5312 + }, + { + "epoch": 1.0463006921366282, + "grad_norm": 2.28125, + "learning_rate": 7.273801320718254e-06, + "loss": 1.0436, + "step": 5313 + }, + { + "epoch": 1.0465005871917243, + "grad_norm": 2.09375, + "learning_rate": 7.272862740083332e-06, + "loss": 1.027, + "step": 5314 + }, + { + "epoch": 1.0467004822468204, + "grad_norm": 2.125, + "learning_rate": 7.271924058488091e-06, + "loss": 1.0068, + "step": 5315 + }, + { + "epoch": 1.0469003773019165, + "grad_norm": 2.015625, + "learning_rate": 7.270985275974227e-06, + "loss": 0.947, + "step": 5316 + }, + { + "epoch": 1.0471002723570126, + "grad_norm": 2.265625, + "learning_rate": 7.270046392583438e-06, + "loss": 1.0863, + "step": 5317 + }, + { + "epoch": 1.0473001674121087, + "grad_norm": 2.328125, + "learning_rate": 7.269107408357432e-06, + "loss": 0.9548, + "step": 5318 + }, + { + "epoch": 1.0475000624672046, + "grad_norm": 2.421875, + "learning_rate": 7.2681683233379176e-06, + "loss": 0.9997, + "step": 5319 + }, + { + "epoch": 1.0476999575223007, + "grad_norm": 2.078125, + "learning_rate": 7.267229137566607e-06, + "loss": 0.9421, + "step": 5320 + }, + { + "epoch": 1.0478998525773968, + "grad_norm": 2.03125, + "learning_rate": 7.266289851085221e-06, + "loss": 0.9005, + "step": 5321 + }, + { + "epoch": 1.048099747632493, + "grad_norm": 2.140625, + "learning_rate": 7.265350463935482e-06, + "loss": 1.0251, + "step": 5322 + }, + { + "epoch": 1.048299642687589, + "grad_norm": 2.234375, + "learning_rate": 7.264410976159117e-06, + "loss": 1.008, + "step": 5323 + }, + { + "epoch": 1.0484995377426851, + "grad_norm": 2.078125, + "learning_rate": 7.263471387797859e-06, + "loss": 1.0063, + "step": 5324 + }, + { + "epoch": 1.0486994327977812, + "grad_norm": 2.15625, + "learning_rate": 7.262531698893443e-06, + "loss": 1.0123, + "step": 5325 + }, + { + "epoch": 1.0488993278528773, + "grad_norm": 2.140625, + "learning_rate": 7.261591909487611e-06, + "loss": 0.9359, + "step": 5326 + }, + { + "epoch": 1.0490992229079734, + "grad_norm": 2.109375, + "learning_rate": 7.260652019622108e-06, + "loss": 0.9853, + "step": 5327 + }, + { + "epoch": 1.0492991179630693, + "grad_norm": 2.296875, + "learning_rate": 7.259712029338682e-06, + "loss": 0.9753, + "step": 5328 + }, + { + "epoch": 1.0494990130181654, + "grad_norm": 2.15625, + "learning_rate": 7.25877193867909e-06, + "loss": 1.0207, + "step": 5329 + }, + { + "epoch": 1.0496989080732615, + "grad_norm": 2.109375, + "learning_rate": 7.2578317476850915e-06, + "loss": 1.0005, + "step": 5330 + }, + { + "epoch": 1.0498988031283576, + "grad_norm": 2.125, + "learning_rate": 7.256891456398446e-06, + "loss": 0.9452, + "step": 5331 + }, + { + "epoch": 1.0500986981834537, + "grad_norm": 2.1875, + "learning_rate": 7.2559510648609234e-06, + "loss": 1.0279, + "step": 5332 + }, + { + "epoch": 1.0502985932385498, + "grad_norm": 2.1875, + "learning_rate": 7.255010573114296e-06, + "loss": 1.002, + "step": 5333 + }, + { + "epoch": 1.050498488293646, + "grad_norm": 2.0, + "learning_rate": 7.254069981200339e-06, + "loss": 0.8251, + "step": 5334 + }, + { + "epoch": 1.050698383348742, + "grad_norm": 2.125, + "learning_rate": 7.253129289160835e-06, + "loss": 1.0114, + "step": 5335 + }, + { + "epoch": 1.0508982784038379, + "grad_norm": 2.171875, + "learning_rate": 7.252188497037569e-06, + "loss": 1.0005, + "step": 5336 + }, + { + "epoch": 1.051098173458934, + "grad_norm": 2.171875, + "learning_rate": 7.251247604872329e-06, + "loss": 0.9921, + "step": 5337 + }, + { + "epoch": 1.05129806851403, + "grad_norm": 2.234375, + "learning_rate": 7.250306612706912e-06, + "loss": 0.9755, + "step": 5338 + }, + { + "epoch": 1.0514979635691262, + "grad_norm": 2.0625, + "learning_rate": 7.249365520583116e-06, + "loss": 0.942, + "step": 5339 + }, + { + "epoch": 1.0516978586242223, + "grad_norm": 2.09375, + "learning_rate": 7.248424328542742e-06, + "loss": 1.055, + "step": 5340 + }, + { + "epoch": 1.0518977536793184, + "grad_norm": 2.140625, + "learning_rate": 7.247483036627601e-06, + "loss": 1.0509, + "step": 5341 + }, + { + "epoch": 1.0520976487344145, + "grad_norm": 2.046875, + "learning_rate": 7.246541644879502e-06, + "loss": 0.9881, + "step": 5342 + }, + { + "epoch": 1.0522975437895106, + "grad_norm": 2.125, + "learning_rate": 7.245600153340264e-06, + "loss": 0.9471, + "step": 5343 + }, + { + "epoch": 1.0524974388446067, + "grad_norm": 2.109375, + "learning_rate": 7.244658562051708e-06, + "loss": 0.9631, + "step": 5344 + }, + { + "epoch": 1.0526973338997028, + "grad_norm": 2.1875, + "learning_rate": 7.243716871055657e-06, + "loss": 0.9688, + "step": 5345 + }, + { + "epoch": 1.0528972289547986, + "grad_norm": 2.203125, + "learning_rate": 7.242775080393942e-06, + "loss": 0.9997, + "step": 5346 + }, + { + "epoch": 1.0530971240098947, + "grad_norm": 2.09375, + "learning_rate": 7.241833190108399e-06, + "loss": 0.947, + "step": 5347 + }, + { + "epoch": 1.0532970190649908, + "grad_norm": 2.125, + "learning_rate": 7.240891200240864e-06, + "loss": 1.0385, + "step": 5348 + }, + { + "epoch": 1.053496914120087, + "grad_norm": 2.234375, + "learning_rate": 7.239949110833182e-06, + "loss": 1.0076, + "step": 5349 + }, + { + "epoch": 1.053696809175183, + "grad_norm": 2.0625, + "learning_rate": 7.2390069219272e-06, + "loss": 1.033, + "step": 5350 + }, + { + "epoch": 1.0538967042302791, + "grad_norm": 2.046875, + "learning_rate": 7.238064633564769e-06, + "loss": 1.0587, + "step": 5351 + }, + { + "epoch": 1.0540965992853752, + "grad_norm": 2.171875, + "learning_rate": 7.2371222457877456e-06, + "loss": 1.0285, + "step": 5352 + }, + { + "epoch": 1.0542964943404713, + "grad_norm": 2.046875, + "learning_rate": 7.236179758637991e-06, + "loss": 0.9545, + "step": 5353 + }, + { + "epoch": 1.0544963893955672, + "grad_norm": 2.109375, + "learning_rate": 7.2352371721573715e-06, + "loss": 1.0356, + "step": 5354 + }, + { + "epoch": 1.0546962844506633, + "grad_norm": 2.109375, + "learning_rate": 7.234294486387754e-06, + "loss": 0.9266, + "step": 5355 + }, + { + "epoch": 1.0548961795057594, + "grad_norm": 2.15625, + "learning_rate": 7.233351701371015e-06, + "loss": 0.9389, + "step": 5356 + }, + { + "epoch": 1.0550960745608555, + "grad_norm": 2.09375, + "learning_rate": 7.232408817149032e-06, + "loss": 0.9307, + "step": 5357 + }, + { + "epoch": 1.0552959696159516, + "grad_norm": 2.171875, + "learning_rate": 7.231465833763687e-06, + "loss": 1.0046, + "step": 5358 + }, + { + "epoch": 1.0554958646710477, + "grad_norm": 2.1875, + "learning_rate": 7.230522751256868e-06, + "loss": 0.9269, + "step": 5359 + }, + { + "epoch": 1.0556957597261438, + "grad_norm": 2.0625, + "learning_rate": 7.229579569670467e-06, + "loss": 0.9892, + "step": 5360 + }, + { + "epoch": 1.05589565478124, + "grad_norm": 2.046875, + "learning_rate": 7.2286362890463805e-06, + "loss": 0.9841, + "step": 5361 + }, + { + "epoch": 1.056095549836336, + "grad_norm": 2.3125, + "learning_rate": 7.227692909426507e-06, + "loss": 1.094, + "step": 5362 + }, + { + "epoch": 1.056295444891432, + "grad_norm": 2.203125, + "learning_rate": 7.226749430852753e-06, + "loss": 0.9837, + "step": 5363 + }, + { + "epoch": 1.056495339946528, + "grad_norm": 2.015625, + "learning_rate": 7.225805853367027e-06, + "loss": 0.9187, + "step": 5364 + }, + { + "epoch": 1.056695235001624, + "grad_norm": 1.9765625, + "learning_rate": 7.224862177011241e-06, + "loss": 0.9311, + "step": 5365 + }, + { + "epoch": 1.0568951300567202, + "grad_norm": 2.125, + "learning_rate": 7.223918401827318e-06, + "loss": 0.9494, + "step": 5366 + }, + { + "epoch": 1.0570950251118163, + "grad_norm": 2.078125, + "learning_rate": 7.222974527857176e-06, + "loss": 0.9531, + "step": 5367 + }, + { + "epoch": 1.0572949201669124, + "grad_norm": 2.125, + "learning_rate": 7.222030555142742e-06, + "loss": 1.0476, + "step": 5368 + }, + { + "epoch": 1.0574948152220085, + "grad_norm": 2.15625, + "learning_rate": 7.2210864837259474e-06, + "loss": 0.9496, + "step": 5369 + }, + { + "epoch": 1.0576947102771046, + "grad_norm": 2.140625, + "learning_rate": 7.22014231364873e-06, + "loss": 1.0252, + "step": 5370 + }, + { + "epoch": 1.0578946053322007, + "grad_norm": 2.1875, + "learning_rate": 7.219198044953026e-06, + "loss": 0.9492, + "step": 5371 + }, + { + "epoch": 1.0580945003872966, + "grad_norm": 2.265625, + "learning_rate": 7.2182536776807845e-06, + "loss": 1.0375, + "step": 5372 + }, + { + "epoch": 1.0582943954423927, + "grad_norm": 2.125, + "learning_rate": 7.217309211873951e-06, + "loss": 0.9649, + "step": 5373 + }, + { + "epoch": 1.0584942904974888, + "grad_norm": 2.171875, + "learning_rate": 7.2163646475744775e-06, + "loss": 0.9941, + "step": 5374 + }, + { + "epoch": 1.0586941855525849, + "grad_norm": 2.171875, + "learning_rate": 7.215419984824325e-06, + "loss": 1.0115, + "step": 5375 + }, + { + "epoch": 1.058894080607681, + "grad_norm": 2.046875, + "learning_rate": 7.214475223665452e-06, + "loss": 1.0192, + "step": 5376 + }, + { + "epoch": 1.059093975662777, + "grad_norm": 2.171875, + "learning_rate": 7.213530364139826e-06, + "loss": 1.0203, + "step": 5377 + }, + { + "epoch": 1.0592938707178732, + "grad_norm": 2.140625, + "learning_rate": 7.2125854062894184e-06, + "loss": 0.9952, + "step": 5378 + }, + { + "epoch": 1.0594937657729693, + "grad_norm": 2.1875, + "learning_rate": 7.211640350156203e-06, + "loss": 1.0367, + "step": 5379 + }, + { + "epoch": 1.0596936608280654, + "grad_norm": 2.15625, + "learning_rate": 7.21069519578216e-06, + "loss": 0.9191, + "step": 5380 + }, + { + "epoch": 1.0598935558831613, + "grad_norm": 2.15625, + "learning_rate": 7.209749943209273e-06, + "loss": 0.983, + "step": 5381 + }, + { + "epoch": 1.0600934509382574, + "grad_norm": 2.03125, + "learning_rate": 7.208804592479528e-06, + "loss": 0.9328, + "step": 5382 + }, + { + "epoch": 1.0602933459933535, + "grad_norm": 2.109375, + "learning_rate": 7.207859143634919e-06, + "loss": 0.9217, + "step": 5383 + }, + { + "epoch": 1.0604932410484496, + "grad_norm": 2.15625, + "learning_rate": 7.206913596717444e-06, + "loss": 1.083, + "step": 5384 + }, + { + "epoch": 1.0606931361035457, + "grad_norm": 2.1875, + "learning_rate": 7.205967951769101e-06, + "loss": 1.018, + "step": 5385 + }, + { + "epoch": 1.0608930311586418, + "grad_norm": 2.046875, + "learning_rate": 7.2050222088318996e-06, + "loss": 0.9879, + "step": 5386 + }, + { + "epoch": 1.0610929262137379, + "grad_norm": 2.09375, + "learning_rate": 7.204076367947846e-06, + "loss": 1.0441, + "step": 5387 + }, + { + "epoch": 1.061292821268834, + "grad_norm": 2.203125, + "learning_rate": 7.203130429158954e-06, + "loss": 0.9985, + "step": 5388 + }, + { + "epoch": 1.0614927163239298, + "grad_norm": 2.03125, + "learning_rate": 7.202184392507245e-06, + "loss": 0.9014, + "step": 5389 + }, + { + "epoch": 1.061692611379026, + "grad_norm": 2.046875, + "learning_rate": 7.2012382580347405e-06, + "loss": 0.9595, + "step": 5390 + }, + { + "epoch": 1.061892506434122, + "grad_norm": 2.21875, + "learning_rate": 7.200292025783467e-06, + "loss": 1.0465, + "step": 5391 + }, + { + "epoch": 1.0620924014892181, + "grad_norm": 2.140625, + "learning_rate": 7.199345695795458e-06, + "loss": 0.9302, + "step": 5392 + }, + { + "epoch": 1.0622922965443142, + "grad_norm": 2.21875, + "learning_rate": 7.198399268112747e-06, + "loss": 0.9104, + "step": 5393 + }, + { + "epoch": 1.0624921915994103, + "grad_norm": 2.15625, + "learning_rate": 7.197452742777376e-06, + "loss": 0.9866, + "step": 5394 + }, + { + "epoch": 1.0626920866545064, + "grad_norm": 2.1875, + "learning_rate": 7.196506119831388e-06, + "loss": 0.8295, + "step": 5395 + }, + { + "epoch": 1.0628919817096025, + "grad_norm": 2.1875, + "learning_rate": 7.1955593993168335e-06, + "loss": 0.979, + "step": 5396 + }, + { + "epoch": 1.0630918767646986, + "grad_norm": 2.078125, + "learning_rate": 7.194612581275765e-06, + "loss": 0.9389, + "step": 5397 + }, + { + "epoch": 1.0632917718197945, + "grad_norm": 2.140625, + "learning_rate": 7.1936656657502405e-06, + "loss": 0.972, + "step": 5398 + }, + { + "epoch": 1.0634916668748906, + "grad_norm": 2.234375, + "learning_rate": 7.19271865278232e-06, + "loss": 0.9406, + "step": 5399 + }, + { + "epoch": 1.0636915619299867, + "grad_norm": 2.171875, + "learning_rate": 7.1917715424140736e-06, + "loss": 1.0155, + "step": 5400 + }, + { + "epoch": 1.0638914569850828, + "grad_norm": 2.15625, + "learning_rate": 7.190824334687567e-06, + "loss": 0.9612, + "step": 5401 + }, + { + "epoch": 1.064091352040179, + "grad_norm": 2.234375, + "learning_rate": 7.1898770296448775e-06, + "loss": 0.9834, + "step": 5402 + }, + { + "epoch": 1.064291247095275, + "grad_norm": 2.140625, + "learning_rate": 7.188929627328085e-06, + "loss": 0.9918, + "step": 5403 + }, + { + "epoch": 1.064491142150371, + "grad_norm": 2.171875, + "learning_rate": 7.187982127779272e-06, + "loss": 0.8852, + "step": 5404 + }, + { + "epoch": 1.0646910372054672, + "grad_norm": 2.140625, + "learning_rate": 7.187034531040526e-06, + "loss": 0.9535, + "step": 5405 + }, + { + "epoch": 1.0648909322605633, + "grad_norm": 2.140625, + "learning_rate": 7.186086837153941e-06, + "loss": 0.9757, + "step": 5406 + }, + { + "epoch": 1.0650908273156592, + "grad_norm": 2.15625, + "learning_rate": 7.185139046161611e-06, + "loss": 0.9962, + "step": 5407 + }, + { + "epoch": 1.0652907223707553, + "grad_norm": 2.1875, + "learning_rate": 7.184191158105639e-06, + "loss": 1.019, + "step": 5408 + }, + { + "epoch": 1.0654906174258514, + "grad_norm": 2.09375, + "learning_rate": 7.183243173028128e-06, + "loss": 0.9948, + "step": 5409 + }, + { + "epoch": 1.0656905124809475, + "grad_norm": 2.140625, + "learning_rate": 7.182295090971189e-06, + "loss": 0.947, + "step": 5410 + }, + { + "epoch": 1.0658904075360436, + "grad_norm": 2.234375, + "learning_rate": 7.181346911976935e-06, + "loss": 0.9602, + "step": 5411 + }, + { + "epoch": 1.0660903025911397, + "grad_norm": 2.078125, + "learning_rate": 7.180398636087485e-06, + "loss": 0.9464, + "step": 5412 + }, + { + "epoch": 1.0662901976462358, + "grad_norm": 2.21875, + "learning_rate": 7.179450263344959e-06, + "loss": 1.0616, + "step": 5413 + }, + { + "epoch": 1.0664900927013319, + "grad_norm": 2.140625, + "learning_rate": 7.178501793791487e-06, + "loss": 0.9518, + "step": 5414 + }, + { + "epoch": 1.066689987756428, + "grad_norm": 1.953125, + "learning_rate": 7.1775532274691965e-06, + "loss": 0.895, + "step": 5415 + }, + { + "epoch": 1.0668898828115239, + "grad_norm": 2.078125, + "learning_rate": 7.176604564420224e-06, + "loss": 0.9706, + "step": 5416 + }, + { + "epoch": 1.06708977786662, + "grad_norm": 2.140625, + "learning_rate": 7.17565580468671e-06, + "loss": 0.976, + "step": 5417 + }, + { + "epoch": 1.067289672921716, + "grad_norm": 2.140625, + "learning_rate": 7.174706948310797e-06, + "loss": 0.9806, + "step": 5418 + }, + { + "epoch": 1.0674895679768122, + "grad_norm": 2.15625, + "learning_rate": 7.173757995334634e-06, + "loss": 1.0118, + "step": 5419 + }, + { + "epoch": 1.0676894630319083, + "grad_norm": 2.03125, + "learning_rate": 7.172808945800372e-06, + "loss": 0.9701, + "step": 5420 + }, + { + "epoch": 1.0678893580870044, + "grad_norm": 2.015625, + "learning_rate": 7.171859799750169e-06, + "loss": 0.9854, + "step": 5421 + }, + { + "epoch": 1.0680892531421005, + "grad_norm": 2.109375, + "learning_rate": 7.170910557226186e-06, + "loss": 0.9865, + "step": 5422 + }, + { + "epoch": 1.0682891481971966, + "grad_norm": 2.15625, + "learning_rate": 7.1699612182705894e-06, + "loss": 1.0558, + "step": 5423 + }, + { + "epoch": 1.0684890432522924, + "grad_norm": 2.046875, + "learning_rate": 7.169011782925545e-06, + "loss": 0.9415, + "step": 5424 + }, + { + "epoch": 1.0686889383073885, + "grad_norm": 2.109375, + "learning_rate": 7.16806225123323e-06, + "loss": 1.0717, + "step": 5425 + }, + { + "epoch": 1.0688888333624846, + "grad_norm": 2.0625, + "learning_rate": 7.167112623235821e-06, + "loss": 0.8841, + "step": 5426 + }, + { + "epoch": 1.0690887284175807, + "grad_norm": 2.109375, + "learning_rate": 7.1661628989755016e-06, + "loss": 0.9353, + "step": 5427 + }, + { + "epoch": 1.0692886234726768, + "grad_norm": 2.25, + "learning_rate": 7.165213078494456e-06, + "loss": 1.0364, + "step": 5428 + }, + { + "epoch": 1.069488518527773, + "grad_norm": 2.28125, + "learning_rate": 7.164263161834879e-06, + "loss": 0.9237, + "step": 5429 + }, + { + "epoch": 1.069688413582869, + "grad_norm": 2.109375, + "learning_rate": 7.163313149038962e-06, + "loss": 0.9851, + "step": 5430 + }, + { + "epoch": 1.0698883086379651, + "grad_norm": 2.3125, + "learning_rate": 7.162363040148905e-06, + "loss": 1.0218, + "step": 5431 + }, + { + "epoch": 1.0700882036930612, + "grad_norm": 2.078125, + "learning_rate": 7.161412835206915e-06, + "loss": 0.9518, + "step": 5432 + }, + { + "epoch": 1.0702880987481573, + "grad_norm": 2.28125, + "learning_rate": 7.160462534255195e-06, + "loss": 1.0077, + "step": 5433 + }, + { + "epoch": 1.0704879938032532, + "grad_norm": 2.109375, + "learning_rate": 7.159512137335962e-06, + "loss": 0.9244, + "step": 5434 + }, + { + "epoch": 1.0706878888583493, + "grad_norm": 2.09375, + "learning_rate": 7.15856164449143e-06, + "loss": 1.0028, + "step": 5435 + }, + { + "epoch": 1.0708877839134454, + "grad_norm": 2.125, + "learning_rate": 7.15761105576382e-06, + "loss": 1.0343, + "step": 5436 + }, + { + "epoch": 1.0710876789685415, + "grad_norm": 2.0625, + "learning_rate": 7.156660371195357e-06, + "loss": 0.9361, + "step": 5437 + }, + { + "epoch": 1.0712875740236376, + "grad_norm": 2.21875, + "learning_rate": 7.155709590828271e-06, + "loss": 0.986, + "step": 5438 + }, + { + "epoch": 1.0714874690787337, + "grad_norm": 2.15625, + "learning_rate": 7.154758714704797e-06, + "loss": 1.0873, + "step": 5439 + }, + { + "epoch": 1.0716873641338298, + "grad_norm": 2.234375, + "learning_rate": 7.153807742867169e-06, + "loss": 0.9848, + "step": 5440 + }, + { + "epoch": 1.071887259188926, + "grad_norm": 2.21875, + "learning_rate": 7.152856675357631e-06, + "loss": 0.9704, + "step": 5441 + }, + { + "epoch": 1.0720871542440218, + "grad_norm": 2.15625, + "learning_rate": 7.15190551221843e-06, + "loss": 0.9277, + "step": 5442 + }, + { + "epoch": 1.072287049299118, + "grad_norm": 2.25, + "learning_rate": 7.150954253491818e-06, + "loss": 0.9952, + "step": 5443 + }, + { + "epoch": 1.072486944354214, + "grad_norm": 2.234375, + "learning_rate": 7.1500028992200445e-06, + "loss": 0.9912, + "step": 5444 + }, + { + "epoch": 1.07268683940931, + "grad_norm": 2.1875, + "learning_rate": 7.1490514494453736e-06, + "loss": 0.964, + "step": 5445 + }, + { + "epoch": 1.0728867344644062, + "grad_norm": 2.203125, + "learning_rate": 7.148099904210067e-06, + "loss": 1.1416, + "step": 5446 + }, + { + "epoch": 1.0730866295195023, + "grad_norm": 2.046875, + "learning_rate": 7.14714826355639e-06, + "loss": 0.927, + "step": 5447 + }, + { + "epoch": 1.0732865245745984, + "grad_norm": 2.140625, + "learning_rate": 7.146196527526617e-06, + "loss": 0.9822, + "step": 5448 + }, + { + "epoch": 1.0734864196296945, + "grad_norm": 2.21875, + "learning_rate": 7.145244696163025e-06, + "loss": 1.0945, + "step": 5449 + }, + { + "epoch": 1.0736863146847906, + "grad_norm": 2.078125, + "learning_rate": 7.144292769507891e-06, + "loss": 0.9773, + "step": 5450 + }, + { + "epoch": 1.0738862097398865, + "grad_norm": 2.046875, + "learning_rate": 7.143340747603503e-06, + "loss": 0.9519, + "step": 5451 + }, + { + "epoch": 1.0740861047949826, + "grad_norm": 2.171875, + "learning_rate": 7.142388630492147e-06, + "loss": 0.9781, + "step": 5452 + }, + { + "epoch": 1.0742859998500787, + "grad_norm": 2.09375, + "learning_rate": 7.141436418216118e-06, + "loss": 0.954, + "step": 5453 + }, + { + "epoch": 1.0744858949051748, + "grad_norm": 1.984375, + "learning_rate": 7.14048411081771e-06, + "loss": 0.8944, + "step": 5454 + }, + { + "epoch": 1.0746857899602709, + "grad_norm": 2.203125, + "learning_rate": 7.139531708339227e-06, + "loss": 0.9898, + "step": 5455 + }, + { + "epoch": 1.074885685015367, + "grad_norm": 2.046875, + "learning_rate": 7.138579210822976e-06, + "loss": 0.9674, + "step": 5456 + }, + { + "epoch": 1.075085580070463, + "grad_norm": 2.15625, + "learning_rate": 7.137626618311262e-06, + "loss": 0.9289, + "step": 5457 + }, + { + "epoch": 1.0752854751255592, + "grad_norm": 2.09375, + "learning_rate": 7.136673930846404e-06, + "loss": 0.9659, + "step": 5458 + }, + { + "epoch": 1.075485370180655, + "grad_norm": 2.3125, + "learning_rate": 7.135721148470718e-06, + "loss": 1.0642, + "step": 5459 + }, + { + "epoch": 1.0756852652357511, + "grad_norm": 2.1875, + "learning_rate": 7.134768271226525e-06, + "loss": 1.0505, + "step": 5460 + }, + { + "epoch": 1.0758851602908472, + "grad_norm": 2.140625, + "learning_rate": 7.133815299156155e-06, + "loss": 0.9973, + "step": 5461 + }, + { + "epoch": 1.0760850553459433, + "grad_norm": 2.0625, + "learning_rate": 7.132862232301937e-06, + "loss": 0.9298, + "step": 5462 + }, + { + "epoch": 1.0762849504010394, + "grad_norm": 2.078125, + "learning_rate": 7.1319090707062065e-06, + "loss": 0.9508, + "step": 5463 + }, + { + "epoch": 1.0764848454561355, + "grad_norm": 2.078125, + "learning_rate": 7.130955814411302e-06, + "loss": 1.013, + "step": 5464 + }, + { + "epoch": 1.0766847405112316, + "grad_norm": 2.1875, + "learning_rate": 7.130002463459569e-06, + "loss": 1.0727, + "step": 5465 + }, + { + "epoch": 1.0768846355663277, + "grad_norm": 2.125, + "learning_rate": 7.129049017893352e-06, + "loss": 0.9824, + "step": 5466 + }, + { + "epoch": 1.0770845306214238, + "grad_norm": 2.078125, + "learning_rate": 7.128095477755006e-06, + "loss": 0.8751, + "step": 5467 + }, + { + "epoch": 1.07728442567652, + "grad_norm": 2.0625, + "learning_rate": 7.127141843086888e-06, + "loss": 0.9525, + "step": 5468 + }, + { + "epoch": 1.0774843207316158, + "grad_norm": 2.0625, + "learning_rate": 7.126188113931353e-06, + "loss": 0.9577, + "step": 5469 + }, + { + "epoch": 1.077684215786712, + "grad_norm": 2.171875, + "learning_rate": 7.125234290330774e-06, + "loss": 1.0297, + "step": 5470 + }, + { + "epoch": 1.077884110841808, + "grad_norm": 2.09375, + "learning_rate": 7.124280372327511e-06, + "loss": 0.9516, + "step": 5471 + }, + { + "epoch": 1.0780840058969041, + "grad_norm": 2.078125, + "learning_rate": 7.123326359963941e-06, + "loss": 0.978, + "step": 5472 + }, + { + "epoch": 1.0782839009520002, + "grad_norm": 2.078125, + "learning_rate": 7.122372253282442e-06, + "loss": 0.924, + "step": 5473 + }, + { + "epoch": 1.0784837960070963, + "grad_norm": 2.234375, + "learning_rate": 7.121418052325395e-06, + "loss": 0.9747, + "step": 5474 + }, + { + "epoch": 1.0786836910621924, + "grad_norm": 2.125, + "learning_rate": 7.1204637571351835e-06, + "loss": 0.9425, + "step": 5475 + }, + { + "epoch": 1.0788835861172885, + "grad_norm": 2.125, + "learning_rate": 7.119509367754198e-06, + "loss": 0.8835, + "step": 5476 + }, + { + "epoch": 1.0790834811723844, + "grad_norm": 2.125, + "learning_rate": 7.118554884224833e-06, + "loss": 0.9955, + "step": 5477 + }, + { + "epoch": 1.0792833762274805, + "grad_norm": 2.15625, + "learning_rate": 7.117600306589486e-06, + "loss": 0.9089, + "step": 5478 + }, + { + "epoch": 1.0794832712825766, + "grad_norm": 2.09375, + "learning_rate": 7.11664563489056e-06, + "loss": 0.9591, + "step": 5479 + }, + { + "epoch": 1.0796831663376727, + "grad_norm": 2.25, + "learning_rate": 7.1156908691704604e-06, + "loss": 1.0151, + "step": 5480 + }, + { + "epoch": 1.0798830613927688, + "grad_norm": 2.265625, + "learning_rate": 7.114736009471599e-06, + "loss": 1.1079, + "step": 5481 + }, + { + "epoch": 1.080082956447865, + "grad_norm": 2.15625, + "learning_rate": 7.113781055836391e-06, + "loss": 0.9985, + "step": 5482 + }, + { + "epoch": 1.080282851502961, + "grad_norm": 2.109375, + "learning_rate": 7.112826008307252e-06, + "loss": 1.023, + "step": 5483 + }, + { + "epoch": 1.080482746558057, + "grad_norm": 2.171875, + "learning_rate": 7.111870866926609e-06, + "loss": 1.0379, + "step": 5484 + }, + { + "epoch": 1.0806826416131532, + "grad_norm": 2.09375, + "learning_rate": 7.110915631736887e-06, + "loss": 0.9583, + "step": 5485 + }, + { + "epoch": 1.080882536668249, + "grad_norm": 2.171875, + "learning_rate": 7.109960302780518e-06, + "loss": 1.0009, + "step": 5486 + }, + { + "epoch": 1.0810824317233452, + "grad_norm": 2.078125, + "learning_rate": 7.109004880099938e-06, + "loss": 0.9799, + "step": 5487 + }, + { + "epoch": 1.0812823267784413, + "grad_norm": 2.140625, + "learning_rate": 7.108049363737586e-06, + "loss": 0.9076, + "step": 5488 + }, + { + "epoch": 1.0814822218335374, + "grad_norm": 2.21875, + "learning_rate": 7.107093753735907e-06, + "loss": 0.9846, + "step": 5489 + }, + { + "epoch": 1.0816821168886335, + "grad_norm": 2.109375, + "learning_rate": 7.106138050137349e-06, + "loss": 1.0191, + "step": 5490 + }, + { + "epoch": 1.0818820119437296, + "grad_norm": 2.140625, + "learning_rate": 7.105182252984363e-06, + "loss": 0.9635, + "step": 5491 + }, + { + "epoch": 1.0820819069988257, + "grad_norm": 2.140625, + "learning_rate": 7.104226362319405e-06, + "loss": 0.8561, + "step": 5492 + }, + { + "epoch": 1.0822818020539218, + "grad_norm": 2.0625, + "learning_rate": 7.103270378184939e-06, + "loss": 0.9057, + "step": 5493 + }, + { + "epoch": 1.0824816971090176, + "grad_norm": 2.0625, + "learning_rate": 7.102314300623425e-06, + "loss": 0.9439, + "step": 5494 + }, + { + "epoch": 1.0826815921641137, + "grad_norm": 2.046875, + "learning_rate": 7.101358129677336e-06, + "loss": 0.9405, + "step": 5495 + }, + { + "epoch": 1.0828814872192098, + "grad_norm": 2.078125, + "learning_rate": 7.100401865389144e-06, + "loss": 0.9559, + "step": 5496 + }, + { + "epoch": 1.083081382274306, + "grad_norm": 2.046875, + "learning_rate": 7.099445507801324e-06, + "loss": 0.9675, + "step": 5497 + }, + { + "epoch": 1.083281277329402, + "grad_norm": 2.25, + "learning_rate": 7.0984890569563595e-06, + "loss": 0.9406, + "step": 5498 + }, + { + "epoch": 1.0834811723844981, + "grad_norm": 2.1875, + "learning_rate": 7.097532512896734e-06, + "loss": 1.0121, + "step": 5499 + }, + { + "epoch": 1.0836810674395942, + "grad_norm": 2.046875, + "learning_rate": 7.096575875664939e-06, + "loss": 0.9781, + "step": 5500 + }, + { + "epoch": 1.0838809624946903, + "grad_norm": 2.359375, + "learning_rate": 7.095619145303469e-06, + "loss": 1.0666, + "step": 5501 + }, + { + "epoch": 1.0840808575497864, + "grad_norm": 2.21875, + "learning_rate": 7.094662321854818e-06, + "loss": 0.9576, + "step": 5502 + }, + { + "epoch": 1.0842807526048825, + "grad_norm": 2.40625, + "learning_rate": 7.09370540536149e-06, + "loss": 0.9628, + "step": 5503 + }, + { + "epoch": 1.0844806476599784, + "grad_norm": 2.109375, + "learning_rate": 7.092748395865995e-06, + "loss": 0.8464, + "step": 5504 + }, + { + "epoch": 1.0846805427150745, + "grad_norm": 2.453125, + "learning_rate": 7.091791293410838e-06, + "loss": 1.0179, + "step": 5505 + }, + { + "epoch": 1.0848804377701706, + "grad_norm": 2.296875, + "learning_rate": 7.090834098038535e-06, + "loss": 1.0201, + "step": 5506 + }, + { + "epoch": 1.0850803328252667, + "grad_norm": 2.09375, + "learning_rate": 7.0898768097916045e-06, + "loss": 0.9504, + "step": 5507 + }, + { + "epoch": 1.0852802278803628, + "grad_norm": 2.015625, + "learning_rate": 7.08891942871257e-06, + "loss": 0.8916, + "step": 5508 + }, + { + "epoch": 1.085480122935459, + "grad_norm": 2.0625, + "learning_rate": 7.087961954843956e-06, + "loss": 0.9202, + "step": 5509 + }, + { + "epoch": 1.085680017990555, + "grad_norm": 2.078125, + "learning_rate": 7.087004388228297e-06, + "loss": 0.9722, + "step": 5510 + }, + { + "epoch": 1.0858799130456511, + "grad_norm": 2.09375, + "learning_rate": 7.086046728908125e-06, + "loss": 1.0817, + "step": 5511 + }, + { + "epoch": 1.086079808100747, + "grad_norm": 2.078125, + "learning_rate": 7.085088976925979e-06, + "loss": 0.8969, + "step": 5512 + }, + { + "epoch": 1.086279703155843, + "grad_norm": 2.109375, + "learning_rate": 7.084131132324405e-06, + "loss": 0.9292, + "step": 5513 + }, + { + "epoch": 1.0864795982109392, + "grad_norm": 2.09375, + "learning_rate": 7.083173195145947e-06, + "loss": 0.8921, + "step": 5514 + }, + { + "epoch": 1.0866794932660353, + "grad_norm": 2.125, + "learning_rate": 7.08221516543316e-06, + "loss": 0.9571, + "step": 5515 + }, + { + "epoch": 1.0868793883211314, + "grad_norm": 2.125, + "learning_rate": 7.081257043228597e-06, + "loss": 1.0, + "step": 5516 + }, + { + "epoch": 1.0870792833762275, + "grad_norm": 2.265625, + "learning_rate": 7.080298828574818e-06, + "loss": 0.9582, + "step": 5517 + }, + { + "epoch": 1.0872791784313236, + "grad_norm": 2.0625, + "learning_rate": 7.079340521514389e-06, + "loss": 0.9338, + "step": 5518 + }, + { + "epoch": 1.0874790734864197, + "grad_norm": 2.125, + "learning_rate": 7.078382122089873e-06, + "loss": 1.0542, + "step": 5519 + }, + { + "epoch": 1.0876789685415158, + "grad_norm": 2.09375, + "learning_rate": 7.0774236303438485e-06, + "loss": 0.9537, + "step": 5520 + }, + { + "epoch": 1.0878788635966117, + "grad_norm": 2.25, + "learning_rate": 7.076465046318886e-06, + "loss": 0.903, + "step": 5521 + }, + { + "epoch": 1.0880787586517078, + "grad_norm": 2.21875, + "learning_rate": 7.075506370057569e-06, + "loss": 0.9471, + "step": 5522 + }, + { + "epoch": 1.0882786537068039, + "grad_norm": 2.078125, + "learning_rate": 7.074547601602479e-06, + "loss": 0.96, + "step": 5523 + }, + { + "epoch": 1.0884785487619, + "grad_norm": 1.9765625, + "learning_rate": 7.073588740996208e-06, + "loss": 0.9324, + "step": 5524 + }, + { + "epoch": 1.088678443816996, + "grad_norm": 2.125, + "learning_rate": 7.072629788281345e-06, + "loss": 1.002, + "step": 5525 + }, + { + "epoch": 1.0888783388720922, + "grad_norm": 2.046875, + "learning_rate": 7.07167074350049e-06, + "loss": 0.9232, + "step": 5526 + }, + { + "epoch": 1.0890782339271883, + "grad_norm": 2.0625, + "learning_rate": 7.0707116066962415e-06, + "loss": 0.9671, + "step": 5527 + }, + { + "epoch": 1.0892781289822844, + "grad_norm": 2.21875, + "learning_rate": 7.069752377911203e-06, + "loss": 0.9804, + "step": 5528 + }, + { + "epoch": 1.0894780240373805, + "grad_norm": 2.296875, + "learning_rate": 7.068793057187986e-06, + "loss": 0.8772, + "step": 5529 + }, + { + "epoch": 1.0896779190924764, + "grad_norm": 2.203125, + "learning_rate": 7.067833644569202e-06, + "loss": 1.0187, + "step": 5530 + }, + { + "epoch": 1.0898778141475725, + "grad_norm": 2.15625, + "learning_rate": 7.066874140097468e-06, + "loss": 0.9763, + "step": 5531 + }, + { + "epoch": 1.0900777092026686, + "grad_norm": 2.125, + "learning_rate": 7.065914543815408e-06, + "loss": 0.9646, + "step": 5532 + }, + { + "epoch": 1.0902776042577647, + "grad_norm": 2.15625, + "learning_rate": 7.064954855765641e-06, + "loss": 0.9707, + "step": 5533 + }, + { + "epoch": 1.0904774993128608, + "grad_norm": 2.03125, + "learning_rate": 7.063995075990801e-06, + "loss": 1.0343, + "step": 5534 + }, + { + "epoch": 1.0906773943679569, + "grad_norm": 2.046875, + "learning_rate": 7.063035204533522e-06, + "loss": 0.9079, + "step": 5535 + }, + { + "epoch": 1.090877289423053, + "grad_norm": 2.0, + "learning_rate": 7.062075241436439e-06, + "loss": 0.9172, + "step": 5536 + }, + { + "epoch": 1.091077184478149, + "grad_norm": 2.296875, + "learning_rate": 7.061115186742192e-06, + "loss": 1.0221, + "step": 5537 + }, + { + "epoch": 1.0912770795332452, + "grad_norm": 2.03125, + "learning_rate": 7.060155040493431e-06, + "loss": 0.9661, + "step": 5538 + }, + { + "epoch": 1.091476974588341, + "grad_norm": 2.171875, + "learning_rate": 7.059194802732802e-06, + "loss": 1.0385, + "step": 5539 + }, + { + "epoch": 1.0916768696434371, + "grad_norm": 2.203125, + "learning_rate": 7.0582344735029585e-06, + "loss": 0.9009, + "step": 5540 + }, + { + "epoch": 1.0918767646985332, + "grad_norm": 2.15625, + "learning_rate": 7.0572740528465625e-06, + "loss": 0.9973, + "step": 5541 + }, + { + "epoch": 1.0920766597536293, + "grad_norm": 2.171875, + "learning_rate": 7.0563135408062696e-06, + "loss": 1.022, + "step": 5542 + }, + { + "epoch": 1.0922765548087254, + "grad_norm": 2.140625, + "learning_rate": 7.055352937424751e-06, + "loss": 0.947, + "step": 5543 + }, + { + "epoch": 1.0924764498638215, + "grad_norm": 2.046875, + "learning_rate": 7.0543922427446734e-06, + "loss": 0.9117, + "step": 5544 + }, + { + "epoch": 1.0926763449189176, + "grad_norm": 2.140625, + "learning_rate": 7.053431456808712e-06, + "loss": 0.9254, + "step": 5545 + }, + { + "epoch": 1.0928762399740137, + "grad_norm": 2.15625, + "learning_rate": 7.052470579659545e-06, + "loss": 1.0449, + "step": 5546 + }, + { + "epoch": 1.0930761350291096, + "grad_norm": 2.109375, + "learning_rate": 7.051509611339853e-06, + "loss": 0.9985, + "step": 5547 + }, + { + "epoch": 1.0932760300842057, + "grad_norm": 2.125, + "learning_rate": 7.050548551892325e-06, + "loss": 0.9993, + "step": 5548 + }, + { + "epoch": 1.0934759251393018, + "grad_norm": 2.65625, + "learning_rate": 7.049587401359647e-06, + "loss": 0.9773, + "step": 5549 + }, + { + "epoch": 1.093675820194398, + "grad_norm": 2.171875, + "learning_rate": 7.048626159784517e-06, + "loss": 1.0189, + "step": 5550 + }, + { + "epoch": 1.093875715249494, + "grad_norm": 2.109375, + "learning_rate": 7.047664827209633e-06, + "loss": 0.9613, + "step": 5551 + }, + { + "epoch": 1.09407561030459, + "grad_norm": 2.125, + "learning_rate": 7.0467034036776945e-06, + "loss": 0.938, + "step": 5552 + }, + { + "epoch": 1.0942755053596862, + "grad_norm": 2.09375, + "learning_rate": 7.04574188923141e-06, + "loss": 0.9479, + "step": 5553 + }, + { + "epoch": 1.0944754004147823, + "grad_norm": 2.203125, + "learning_rate": 7.044780283913488e-06, + "loss": 1.0025, + "step": 5554 + }, + { + "epoch": 1.0946752954698784, + "grad_norm": 2.203125, + "learning_rate": 7.043818587766645e-06, + "loss": 1.0156, + "step": 5555 + }, + { + "epoch": 1.0948751905249743, + "grad_norm": 2.140625, + "learning_rate": 7.0428568008336e-06, + "loss": 1.1055, + "step": 5556 + }, + { + "epoch": 1.0950750855800704, + "grad_norm": 2.375, + "learning_rate": 7.041894923157071e-06, + "loss": 1.0323, + "step": 5557 + }, + { + "epoch": 1.0952749806351665, + "grad_norm": 2.15625, + "learning_rate": 7.04093295477979e-06, + "loss": 1.035, + "step": 5558 + }, + { + "epoch": 1.0954748756902626, + "grad_norm": 2.09375, + "learning_rate": 7.039970895744485e-06, + "loss": 1.0006, + "step": 5559 + }, + { + "epoch": 1.0956747707453587, + "grad_norm": 2.140625, + "learning_rate": 7.039008746093889e-06, + "loss": 1.0166, + "step": 5560 + }, + { + "epoch": 1.0958746658004548, + "grad_norm": 2.140625, + "learning_rate": 7.038046505870744e-06, + "loss": 0.9481, + "step": 5561 + }, + { + "epoch": 1.0960745608555509, + "grad_norm": 2.140625, + "learning_rate": 7.03708417511779e-06, + "loss": 0.9159, + "step": 5562 + }, + { + "epoch": 1.096274455910647, + "grad_norm": 2.109375, + "learning_rate": 7.036121753877776e-06, + "loss": 1.0279, + "step": 5563 + }, + { + "epoch": 1.096474350965743, + "grad_norm": 2.15625, + "learning_rate": 7.035159242193449e-06, + "loss": 1.049, + "step": 5564 + }, + { + "epoch": 1.096674246020839, + "grad_norm": 2.15625, + "learning_rate": 7.0341966401075664e-06, + "loss": 0.9342, + "step": 5565 + }, + { + "epoch": 1.096874141075935, + "grad_norm": 2.0625, + "learning_rate": 7.033233947662887e-06, + "loss": 0.8556, + "step": 5566 + }, + { + "epoch": 1.0970740361310312, + "grad_norm": 2.09375, + "learning_rate": 7.0322711649021735e-06, + "loss": 0.9793, + "step": 5567 + }, + { + "epoch": 1.0972739311861273, + "grad_norm": 2.09375, + "learning_rate": 7.031308291868191e-06, + "loss": 1.0418, + "step": 5568 + }, + { + "epoch": 1.0974738262412234, + "grad_norm": 2.09375, + "learning_rate": 7.030345328603711e-06, + "loss": 0.9986, + "step": 5569 + }, + { + "epoch": 1.0976737212963195, + "grad_norm": 2.09375, + "learning_rate": 7.029382275151508e-06, + "loss": 0.9284, + "step": 5570 + }, + { + "epoch": 1.0978736163514156, + "grad_norm": 2.125, + "learning_rate": 7.0284191315543606e-06, + "loss": 1.0405, + "step": 5571 + }, + { + "epoch": 1.0980735114065117, + "grad_norm": 2.21875, + "learning_rate": 7.027455897855053e-06, + "loss": 1.0036, + "step": 5572 + }, + { + "epoch": 1.0982734064616078, + "grad_norm": 2.09375, + "learning_rate": 7.026492574096371e-06, + "loss": 0.999, + "step": 5573 + }, + { + "epoch": 1.0984733015167036, + "grad_norm": 2.171875, + "learning_rate": 7.025529160321107e-06, + "loss": 0.9254, + "step": 5574 + }, + { + "epoch": 1.0986731965717997, + "grad_norm": 2.09375, + "learning_rate": 7.024565656572051e-06, + "loss": 1.0104, + "step": 5575 + }, + { + "epoch": 1.0988730916268958, + "grad_norm": 2.203125, + "learning_rate": 7.023602062892005e-06, + "loss": 1.0095, + "step": 5576 + }, + { + "epoch": 1.099072986681992, + "grad_norm": 2.125, + "learning_rate": 7.022638379323774e-06, + "loss": 0.9809, + "step": 5577 + }, + { + "epoch": 1.099272881737088, + "grad_norm": 2.125, + "learning_rate": 7.021674605910161e-06, + "loss": 0.9851, + "step": 5578 + }, + { + "epoch": 1.0994727767921841, + "grad_norm": 2.234375, + "learning_rate": 7.0207107426939755e-06, + "loss": 0.9673, + "step": 5579 + }, + { + "epoch": 1.0996726718472802, + "grad_norm": 2.203125, + "learning_rate": 7.019746789718038e-06, + "loss": 0.9563, + "step": 5580 + }, + { + "epoch": 1.0998725669023763, + "grad_norm": 2.109375, + "learning_rate": 7.018782747025161e-06, + "loss": 0.9989, + "step": 5581 + }, + { + "epoch": 1.1000724619574722, + "grad_norm": 2.171875, + "learning_rate": 7.0178186146581725e-06, + "loss": 0.9575, + "step": 5582 + }, + { + "epoch": 1.1002723570125683, + "grad_norm": 1.9609375, + "learning_rate": 7.0168543926598965e-06, + "loss": 0.9197, + "step": 5583 + }, + { + "epoch": 1.1004722520676644, + "grad_norm": 2.21875, + "learning_rate": 7.015890081073162e-06, + "loss": 0.9936, + "step": 5584 + }, + { + "epoch": 1.1006721471227605, + "grad_norm": 2.0, + "learning_rate": 7.014925679940807e-06, + "loss": 0.8853, + "step": 5585 + }, + { + "epoch": 1.1008720421778566, + "grad_norm": 2.15625, + "learning_rate": 7.013961189305668e-06, + "loss": 1.0785, + "step": 5586 + }, + { + "epoch": 1.1010719372329527, + "grad_norm": 2.078125, + "learning_rate": 7.012996609210587e-06, + "loss": 0.9489, + "step": 5587 + }, + { + "epoch": 1.1012718322880488, + "grad_norm": 2.09375, + "learning_rate": 7.012031939698414e-06, + "loss": 0.9351, + "step": 5588 + }, + { + "epoch": 1.101471727343145, + "grad_norm": 2.234375, + "learning_rate": 7.011067180811994e-06, + "loss": 0.9849, + "step": 5589 + }, + { + "epoch": 1.101671622398241, + "grad_norm": 2.1875, + "learning_rate": 7.010102332594186e-06, + "loss": 1.059, + "step": 5590 + }, + { + "epoch": 1.1018715174533371, + "grad_norm": 2.234375, + "learning_rate": 7.009137395087848e-06, + "loss": 1.0314, + "step": 5591 + }, + { + "epoch": 1.102071412508433, + "grad_norm": 2.203125, + "learning_rate": 7.008172368335842e-06, + "loss": 0.8892, + "step": 5592 + }, + { + "epoch": 1.102271307563529, + "grad_norm": 2.203125, + "learning_rate": 7.007207252381032e-06, + "loss": 0.9998, + "step": 5593 + }, + { + "epoch": 1.1024712026186252, + "grad_norm": 2.25, + "learning_rate": 7.006242047266292e-06, + "loss": 0.9962, + "step": 5594 + }, + { + "epoch": 1.1026710976737213, + "grad_norm": 2.3125, + "learning_rate": 7.005276753034492e-06, + "loss": 0.9656, + "step": 5595 + }, + { + "epoch": 1.1028709927288174, + "grad_norm": 2.109375, + "learning_rate": 7.004311369728514e-06, + "loss": 1.0528, + "step": 5596 + }, + { + "epoch": 1.1030708877839135, + "grad_norm": 2.1875, + "learning_rate": 7.003345897391241e-06, + "loss": 0.94, + "step": 5597 + }, + { + "epoch": 1.1032707828390096, + "grad_norm": 2.25, + "learning_rate": 7.002380336065555e-06, + "loss": 1.0038, + "step": 5598 + }, + { + "epoch": 1.1034706778941057, + "grad_norm": 2.09375, + "learning_rate": 7.0014146857943486e-06, + "loss": 0.9046, + "step": 5599 + }, + { + "epoch": 1.1036705729492016, + "grad_norm": 2.15625, + "learning_rate": 7.000448946620517e-06, + "loss": 1.0184, + "step": 5600 + }, + { + "epoch": 1.1038704680042977, + "grad_norm": 2.0625, + "learning_rate": 6.999483118586955e-06, + "loss": 0.9515, + "step": 5601 + }, + { + "epoch": 1.1040703630593938, + "grad_norm": 2.046875, + "learning_rate": 6.998517201736566e-06, + "loss": 0.9786, + "step": 5602 + }, + { + "epoch": 1.1042702581144899, + "grad_norm": 2.28125, + "learning_rate": 6.997551196112259e-06, + "loss": 1.0022, + "step": 5603 + }, + { + "epoch": 1.104470153169586, + "grad_norm": 2.15625, + "learning_rate": 6.996585101756938e-06, + "loss": 0.9757, + "step": 5604 + }, + { + "epoch": 1.104670048224682, + "grad_norm": 2.015625, + "learning_rate": 6.9956189187135226e-06, + "loss": 0.8901, + "step": 5605 + }, + { + "epoch": 1.1048699432797782, + "grad_norm": 2.078125, + "learning_rate": 6.994652647024927e-06, + "loss": 0.9721, + "step": 5606 + }, + { + "epoch": 1.1050698383348743, + "grad_norm": 2.046875, + "learning_rate": 6.9936862867340736e-06, + "loss": 1.022, + "step": 5607 + }, + { + "epoch": 1.1052697333899704, + "grad_norm": 2.140625, + "learning_rate": 6.99271983788389e-06, + "loss": 1.0285, + "step": 5608 + }, + { + "epoch": 1.1054696284450662, + "grad_norm": 2.1875, + "learning_rate": 6.991753300517302e-06, + "loss": 1.0873, + "step": 5609 + }, + { + "epoch": 1.1056695235001623, + "grad_norm": 2.09375, + "learning_rate": 6.990786674677246e-06, + "loss": 0.9022, + "step": 5610 + }, + { + "epoch": 1.1058694185552584, + "grad_norm": 1.96875, + "learning_rate": 6.989819960406659e-06, + "loss": 0.8544, + "step": 5611 + }, + { + "epoch": 1.1060693136103545, + "grad_norm": 2.046875, + "learning_rate": 6.988853157748479e-06, + "loss": 0.9286, + "step": 5612 + }, + { + "epoch": 1.1062692086654506, + "grad_norm": 2.1875, + "learning_rate": 6.987886266745658e-06, + "loss": 0.9779, + "step": 5613 + }, + { + "epoch": 1.1064691037205467, + "grad_norm": 2.125, + "learning_rate": 6.986919287441141e-06, + "loss": 0.8984, + "step": 5614 + }, + { + "epoch": 1.1066689987756428, + "grad_norm": 2.09375, + "learning_rate": 6.985952219877879e-06, + "loss": 0.9477, + "step": 5615 + }, + { + "epoch": 1.106868893830739, + "grad_norm": 2.109375, + "learning_rate": 6.984985064098833e-06, + "loss": 1.0035, + "step": 5616 + }, + { + "epoch": 1.1070687888858348, + "grad_norm": 2.078125, + "learning_rate": 6.984017820146962e-06, + "loss": 0.8997, + "step": 5617 + }, + { + "epoch": 1.107268683940931, + "grad_norm": 2.140625, + "learning_rate": 6.983050488065231e-06, + "loss": 1.0193, + "step": 5618 + }, + { + "epoch": 1.107468578996027, + "grad_norm": 1.9921875, + "learning_rate": 6.98208306789661e-06, + "loss": 0.9445, + "step": 5619 + }, + { + "epoch": 1.1076684740511231, + "grad_norm": 2.1875, + "learning_rate": 6.98111555968407e-06, + "loss": 0.9414, + "step": 5620 + }, + { + "epoch": 1.1078683691062192, + "grad_norm": 2.296875, + "learning_rate": 6.9801479634705885e-06, + "loss": 1.0249, + "step": 5621 + }, + { + "epoch": 1.1080682641613153, + "grad_norm": 2.046875, + "learning_rate": 6.979180279299147e-06, + "loss": 0.9569, + "step": 5622 + }, + { + "epoch": 1.1082681592164114, + "grad_norm": 2.078125, + "learning_rate": 6.978212507212727e-06, + "loss": 1.0551, + "step": 5623 + }, + { + "epoch": 1.1084680542715075, + "grad_norm": 2.171875, + "learning_rate": 6.97724464725432e-06, + "loss": 0.875, + "step": 5624 + }, + { + "epoch": 1.1086679493266036, + "grad_norm": 2.078125, + "learning_rate": 6.9762766994669175e-06, + "loss": 0.9271, + "step": 5625 + }, + { + "epoch": 1.1088678443816997, + "grad_norm": 2.078125, + "learning_rate": 6.9753086638935144e-06, + "loss": 0.9936, + "step": 5626 + }, + { + "epoch": 1.1090677394367956, + "grad_norm": 2.0625, + "learning_rate": 6.974340540577111e-06, + "loss": 0.9048, + "step": 5627 + }, + { + "epoch": 1.1092676344918917, + "grad_norm": 2.234375, + "learning_rate": 6.973372329560713e-06, + "loss": 0.9632, + "step": 5628 + }, + { + "epoch": 1.1094675295469878, + "grad_norm": 2.015625, + "learning_rate": 6.972404030887325e-06, + "loss": 0.8766, + "step": 5629 + }, + { + "epoch": 1.109667424602084, + "grad_norm": 2.125, + "learning_rate": 6.971435644599961e-06, + "loss": 0.8904, + "step": 5630 + }, + { + "epoch": 1.10986731965718, + "grad_norm": 2.09375, + "learning_rate": 6.970467170741637e-06, + "loss": 0.9855, + "step": 5631 + }, + { + "epoch": 1.110067214712276, + "grad_norm": 2.203125, + "learning_rate": 6.969498609355372e-06, + "loss": 0.9869, + "step": 5632 + }, + { + "epoch": 1.1102671097673722, + "grad_norm": 2.1875, + "learning_rate": 6.968529960484189e-06, + "loss": 0.9859, + "step": 5633 + }, + { + "epoch": 1.1104670048224683, + "grad_norm": 2.15625, + "learning_rate": 6.967561224171114e-06, + "loss": 0.9741, + "step": 5634 + }, + { + "epoch": 1.1106668998775642, + "grad_norm": 2.203125, + "learning_rate": 6.96659240045918e-06, + "loss": 0.8432, + "step": 5635 + }, + { + "epoch": 1.1108667949326603, + "grad_norm": 2.171875, + "learning_rate": 6.965623489391423e-06, + "loss": 1.0741, + "step": 5636 + }, + { + "epoch": 1.1110666899877564, + "grad_norm": 2.09375, + "learning_rate": 6.964654491010879e-06, + "loss": 0.9718, + "step": 5637 + }, + { + "epoch": 1.1112665850428525, + "grad_norm": 2.109375, + "learning_rate": 6.963685405360594e-06, + "loss": 1.0086, + "step": 5638 + }, + { + "epoch": 1.1114664800979486, + "grad_norm": 2.15625, + "learning_rate": 6.962716232483612e-06, + "loss": 0.9935, + "step": 5639 + }, + { + "epoch": 1.1116663751530447, + "grad_norm": 2.046875, + "learning_rate": 6.961746972422985e-06, + "loss": 0.9189, + "step": 5640 + }, + { + "epoch": 1.1118662702081408, + "grad_norm": 2.234375, + "learning_rate": 6.960777625221765e-06, + "loss": 0.8785, + "step": 5641 + }, + { + "epoch": 1.1120661652632369, + "grad_norm": 2.0625, + "learning_rate": 6.959808190923015e-06, + "loss": 1.0427, + "step": 5642 + }, + { + "epoch": 1.112266060318333, + "grad_norm": 2.046875, + "learning_rate": 6.958838669569793e-06, + "loss": 0.944, + "step": 5643 + }, + { + "epoch": 1.1124659553734288, + "grad_norm": 2.09375, + "learning_rate": 6.957869061205168e-06, + "loss": 0.9901, + "step": 5644 + }, + { + "epoch": 1.112665850428525, + "grad_norm": 2.234375, + "learning_rate": 6.956899365872207e-06, + "loss": 0.9979, + "step": 5645 + }, + { + "epoch": 1.112865745483621, + "grad_norm": 2.265625, + "learning_rate": 6.955929583613985e-06, + "loss": 1.0601, + "step": 5646 + }, + { + "epoch": 1.1130656405387171, + "grad_norm": 2.1875, + "learning_rate": 6.95495971447358e-06, + "loss": 1.0979, + "step": 5647 + }, + { + "epoch": 1.1132655355938132, + "grad_norm": 2.15625, + "learning_rate": 6.953989758494075e-06, + "loss": 0.9665, + "step": 5648 + }, + { + "epoch": 1.1134654306489093, + "grad_norm": 2.203125, + "learning_rate": 6.953019715718552e-06, + "loss": 0.9663, + "step": 5649 + }, + { + "epoch": 1.1136653257040054, + "grad_norm": 2.1875, + "learning_rate": 6.952049586190102e-06, + "loss": 1.0084, + "step": 5650 + }, + { + "epoch": 1.1138652207591015, + "grad_norm": 2.140625, + "learning_rate": 6.951079369951817e-06, + "loss": 0.9027, + "step": 5651 + }, + { + "epoch": 1.1140651158141976, + "grad_norm": 2.15625, + "learning_rate": 6.950109067046797e-06, + "loss": 0.9388, + "step": 5652 + }, + { + "epoch": 1.1142650108692935, + "grad_norm": 2.203125, + "learning_rate": 6.949138677518139e-06, + "loss": 1.0136, + "step": 5653 + }, + { + "epoch": 1.1144649059243896, + "grad_norm": 2.125, + "learning_rate": 6.948168201408949e-06, + "loss": 0.9745, + "step": 5654 + }, + { + "epoch": 1.1146648009794857, + "grad_norm": 2.0625, + "learning_rate": 6.947197638762335e-06, + "loss": 0.9844, + "step": 5655 + }, + { + "epoch": 1.1148646960345818, + "grad_norm": 2.046875, + "learning_rate": 6.946226989621412e-06, + "loss": 0.9096, + "step": 5656 + }, + { + "epoch": 1.115064591089678, + "grad_norm": 2.046875, + "learning_rate": 6.945256254029292e-06, + "loss": 1.0328, + "step": 5657 + }, + { + "epoch": 1.115264486144774, + "grad_norm": 2.171875, + "learning_rate": 6.944285432029098e-06, + "loss": 0.9901, + "step": 5658 + }, + { + "epoch": 1.1154643811998701, + "grad_norm": 2.171875, + "learning_rate": 6.943314523663953e-06, + "loss": 0.9223, + "step": 5659 + }, + { + "epoch": 1.1156642762549662, + "grad_norm": 2.203125, + "learning_rate": 6.942343528976984e-06, + "loss": 0.9988, + "step": 5660 + }, + { + "epoch": 1.1158641713100623, + "grad_norm": 2.109375, + "learning_rate": 6.9413724480113224e-06, + "loss": 0.9521, + "step": 5661 + }, + { + "epoch": 1.1160640663651582, + "grad_norm": 2.15625, + "learning_rate": 6.940401280810105e-06, + "loss": 0.9889, + "step": 5662 + }, + { + "epoch": 1.1162639614202543, + "grad_norm": 2.109375, + "learning_rate": 6.939430027416468e-06, + "loss": 1.0112, + "step": 5663 + }, + { + "epoch": 1.1164638564753504, + "grad_norm": 2.265625, + "learning_rate": 6.938458687873558e-06, + "loss": 1.0181, + "step": 5664 + }, + { + "epoch": 1.1166637515304465, + "grad_norm": 2.1875, + "learning_rate": 6.93748726222452e-06, + "loss": 0.9091, + "step": 5665 + }, + { + "epoch": 1.1168636465855426, + "grad_norm": 2.046875, + "learning_rate": 6.936515750512505e-06, + "loss": 0.9331, + "step": 5666 + }, + { + "epoch": 1.1170635416406387, + "grad_norm": 2.25, + "learning_rate": 6.935544152780666e-06, + "loss": 1.0873, + "step": 5667 + }, + { + "epoch": 1.1172634366957348, + "grad_norm": 2.296875, + "learning_rate": 6.934572469072163e-06, + "loss": 0.8997, + "step": 5668 + }, + { + "epoch": 1.117463331750831, + "grad_norm": 2.140625, + "learning_rate": 6.933600699430157e-06, + "loss": 1.0442, + "step": 5669 + }, + { + "epoch": 1.1176632268059268, + "grad_norm": 2.203125, + "learning_rate": 6.932628843897816e-06, + "loss": 0.9619, + "step": 5670 + }, + { + "epoch": 1.1178631218610229, + "grad_norm": 2.125, + "learning_rate": 6.931656902518307e-06, + "loss": 0.9943, + "step": 5671 + }, + { + "epoch": 1.118063016916119, + "grad_norm": 2.109375, + "learning_rate": 6.930684875334806e-06, + "loss": 0.9998, + "step": 5672 + }, + { + "epoch": 1.118262911971215, + "grad_norm": 2.109375, + "learning_rate": 6.929712762390487e-06, + "loss": 1.0107, + "step": 5673 + }, + { + "epoch": 1.1184628070263112, + "grad_norm": 2.0625, + "learning_rate": 6.928740563728533e-06, + "loss": 1.0266, + "step": 5674 + }, + { + "epoch": 1.1186627020814073, + "grad_norm": 2.140625, + "learning_rate": 6.927768279392132e-06, + "loss": 0.9991, + "step": 5675 + }, + { + "epoch": 1.1188625971365034, + "grad_norm": 2.1875, + "learning_rate": 6.926795909424468e-06, + "loss": 1.0663, + "step": 5676 + }, + { + "epoch": 1.1190624921915995, + "grad_norm": 2.109375, + "learning_rate": 6.9258234538687366e-06, + "loss": 0.9595, + "step": 5677 + }, + { + "epoch": 1.1192623872466956, + "grad_norm": 2.078125, + "learning_rate": 6.924850912768133e-06, + "loss": 0.9147, + "step": 5678 + }, + { + "epoch": 1.1194622823017915, + "grad_norm": 2.15625, + "learning_rate": 6.923878286165856e-06, + "loss": 0.9804, + "step": 5679 + }, + { + "epoch": 1.1196621773568876, + "grad_norm": 2.0, + "learning_rate": 6.922905574105111e-06, + "loss": 0.8939, + "step": 5680 + }, + { + "epoch": 1.1198620724119837, + "grad_norm": 2.140625, + "learning_rate": 6.921932776629107e-06, + "loss": 0.9842, + "step": 5681 + }, + { + "epoch": 1.1200619674670798, + "grad_norm": 2.21875, + "learning_rate": 6.920959893781054e-06, + "loss": 1.0701, + "step": 5682 + }, + { + "epoch": 1.1202618625221759, + "grad_norm": 2.015625, + "learning_rate": 6.919986925604166e-06, + "loss": 0.9869, + "step": 5683 + }, + { + "epoch": 1.120461757577272, + "grad_norm": 2.390625, + "learning_rate": 6.919013872141667e-06, + "loss": 1.0938, + "step": 5684 + }, + { + "epoch": 1.120661652632368, + "grad_norm": 2.125, + "learning_rate": 6.918040733436774e-06, + "loss": 0.9659, + "step": 5685 + }, + { + "epoch": 1.1208615476874642, + "grad_norm": 2.09375, + "learning_rate": 6.917067509532718e-06, + "loss": 0.9582, + "step": 5686 + }, + { + "epoch": 1.1210614427425603, + "grad_norm": 2.0625, + "learning_rate": 6.916094200472727e-06, + "loss": 0.9951, + "step": 5687 + }, + { + "epoch": 1.1212613377976561, + "grad_norm": 2.1875, + "learning_rate": 6.915120806300036e-06, + "loss": 0.9728, + "step": 5688 + }, + { + "epoch": 1.1214612328527522, + "grad_norm": 2.15625, + "learning_rate": 6.914147327057885e-06, + "loss": 0.9721, + "step": 5689 + }, + { + "epoch": 1.1216611279078483, + "grad_norm": 2.15625, + "learning_rate": 6.913173762789515e-06, + "loss": 0.9446, + "step": 5690 + }, + { + "epoch": 1.1218610229629444, + "grad_norm": 2.203125, + "learning_rate": 6.912200113538168e-06, + "loss": 0.9531, + "step": 5691 + }, + { + "epoch": 1.1220609180180405, + "grad_norm": 2.09375, + "learning_rate": 6.911226379347097e-06, + "loss": 0.9618, + "step": 5692 + }, + { + "epoch": 1.1222608130731366, + "grad_norm": 2.171875, + "learning_rate": 6.910252560259555e-06, + "loss": 0.9827, + "step": 5693 + }, + { + "epoch": 1.1224607081282327, + "grad_norm": 2.15625, + "learning_rate": 6.9092786563187975e-06, + "loss": 1.0449, + "step": 5694 + }, + { + "epoch": 1.1226606031833288, + "grad_norm": 2.25, + "learning_rate": 6.908304667568087e-06, + "loss": 1.0932, + "step": 5695 + }, + { + "epoch": 1.122860498238425, + "grad_norm": 2.109375, + "learning_rate": 6.907330594050685e-06, + "loss": 0.9571, + "step": 5696 + }, + { + "epoch": 1.1230603932935208, + "grad_norm": 2.265625, + "learning_rate": 6.9063564358098636e-06, + "loss": 1.0944, + "step": 5697 + }, + { + "epoch": 1.123260288348617, + "grad_norm": 2.25, + "learning_rate": 6.905382192888893e-06, + "loss": 1.1063, + "step": 5698 + }, + { + "epoch": 1.123460183403713, + "grad_norm": 2.25, + "learning_rate": 6.904407865331048e-06, + "loss": 0.8986, + "step": 5699 + }, + { + "epoch": 1.123660078458809, + "grad_norm": 2.09375, + "learning_rate": 6.903433453179609e-06, + "loss": 1.0043, + "step": 5700 + }, + { + "epoch": 1.1238599735139052, + "grad_norm": 2.0625, + "learning_rate": 6.90245895647786e-06, + "loss": 0.9294, + "step": 5701 + }, + { + "epoch": 1.1240598685690013, + "grad_norm": 2.140625, + "learning_rate": 6.901484375269086e-06, + "loss": 0.9703, + "step": 5702 + }, + { + "epoch": 1.1242597636240974, + "grad_norm": 2.109375, + "learning_rate": 6.900509709596581e-06, + "loss": 0.9744, + "step": 5703 + }, + { + "epoch": 1.1244596586791935, + "grad_norm": 2.375, + "learning_rate": 6.8995349595036365e-06, + "loss": 0.9381, + "step": 5704 + }, + { + "epoch": 1.1246595537342894, + "grad_norm": 2.109375, + "learning_rate": 6.898560125033552e-06, + "loss": 0.9513, + "step": 5705 + }, + { + "epoch": 1.1248594487893855, + "grad_norm": 2.125, + "learning_rate": 6.897585206229631e-06, + "loss": 0.935, + "step": 5706 + }, + { + "epoch": 1.1250593438444816, + "grad_norm": 2.046875, + "learning_rate": 6.896610203135176e-06, + "loss": 0.9522, + "step": 5707 + }, + { + "epoch": 1.1252592388995777, + "grad_norm": 2.203125, + "learning_rate": 6.8956351157935e-06, + "loss": 1.0196, + "step": 5708 + }, + { + "epoch": 1.1254591339546738, + "grad_norm": 2.078125, + "learning_rate": 6.894659944247914e-06, + "loss": 0.9577, + "step": 5709 + }, + { + "epoch": 1.1256590290097699, + "grad_norm": 2.171875, + "learning_rate": 6.8936846885417344e-06, + "loss": 0.971, + "step": 5710 + }, + { + "epoch": 1.125858924064866, + "grad_norm": 2.203125, + "learning_rate": 6.892709348718283e-06, + "loss": 0.9899, + "step": 5711 + }, + { + "epoch": 1.126058819119962, + "grad_norm": 2.078125, + "learning_rate": 6.891733924820887e-06, + "loss": 0.9589, + "step": 5712 + }, + { + "epoch": 1.1262587141750582, + "grad_norm": 2.234375, + "learning_rate": 6.89075841689287e-06, + "loss": 0.9113, + "step": 5713 + }, + { + "epoch": 1.1264586092301543, + "grad_norm": 2.21875, + "learning_rate": 6.889782824977566e-06, + "loss": 0.9732, + "step": 5714 + }, + { + "epoch": 1.1266585042852502, + "grad_norm": 2.109375, + "learning_rate": 6.8888071491183114e-06, + "loss": 0.941, + "step": 5715 + }, + { + "epoch": 1.1268583993403463, + "grad_norm": 2.09375, + "learning_rate": 6.887831389358445e-06, + "loss": 0.946, + "step": 5716 + }, + { + "epoch": 1.1270582943954424, + "grad_norm": 2.140625, + "learning_rate": 6.8868555457413115e-06, + "loss": 1.0431, + "step": 5717 + }, + { + "epoch": 1.1272581894505385, + "grad_norm": 2.1875, + "learning_rate": 6.885879618310253e-06, + "loss": 0.9591, + "step": 5718 + }, + { + "epoch": 1.1274580845056346, + "grad_norm": 2.03125, + "learning_rate": 6.884903607108624e-06, + "loss": 0.9041, + "step": 5719 + }, + { + "epoch": 1.1276579795607307, + "grad_norm": 2.015625, + "learning_rate": 6.88392751217978e-06, + "loss": 1.0078, + "step": 5720 + }, + { + "epoch": 1.1278578746158268, + "grad_norm": 2.265625, + "learning_rate": 6.882951333567076e-06, + "loss": 1.0244, + "step": 5721 + }, + { + "epoch": 1.1280577696709229, + "grad_norm": 2.125, + "learning_rate": 6.881975071313876e-06, + "loss": 0.8964, + "step": 5722 + }, + { + "epoch": 1.1282576647260187, + "grad_norm": 2.15625, + "learning_rate": 6.880998725463543e-06, + "loss": 0.9442, + "step": 5723 + }, + { + "epoch": 1.1284575597811148, + "grad_norm": 2.21875, + "learning_rate": 6.880022296059448e-06, + "loss": 1.0363, + "step": 5724 + }, + { + "epoch": 1.128657454836211, + "grad_norm": 2.234375, + "learning_rate": 6.879045783144962e-06, + "loss": 0.9913, + "step": 5725 + }, + { + "epoch": 1.128857349891307, + "grad_norm": 2.15625, + "learning_rate": 6.878069186763466e-06, + "loss": 1.0463, + "step": 5726 + }, + { + "epoch": 1.1290572449464031, + "grad_norm": 2.0625, + "learning_rate": 6.877092506958334e-06, + "loss": 0.9575, + "step": 5727 + }, + { + "epoch": 1.1292571400014992, + "grad_norm": 2.0625, + "learning_rate": 6.876115743772954e-06, + "loss": 0.9106, + "step": 5728 + }, + { + "epoch": 1.1294570350565953, + "grad_norm": 2.09375, + "learning_rate": 6.8751388972507146e-06, + "loss": 0.936, + "step": 5729 + }, + { + "epoch": 1.1296569301116914, + "grad_norm": 2.140625, + "learning_rate": 6.874161967435005e-06, + "loss": 1.0641, + "step": 5730 + }, + { + "epoch": 1.1298568251667875, + "grad_norm": 2.1875, + "learning_rate": 6.87318495436922e-06, + "loss": 0.9322, + "step": 5731 + }, + { + "epoch": 1.1300567202218834, + "grad_norm": 2.078125, + "learning_rate": 6.87220785809676e-06, + "loss": 0.9547, + "step": 5732 + }, + { + "epoch": 1.1302566152769795, + "grad_norm": 2.203125, + "learning_rate": 6.871230678661027e-06, + "loss": 1.0251, + "step": 5733 + }, + { + "epoch": 1.1304565103320756, + "grad_norm": 2.125, + "learning_rate": 6.870253416105428e-06, + "loss": 0.9616, + "step": 5734 + }, + { + "epoch": 1.1306564053871717, + "grad_norm": 2.1875, + "learning_rate": 6.8692760704733705e-06, + "loss": 0.9055, + "step": 5735 + }, + { + "epoch": 1.1308563004422678, + "grad_norm": 2.21875, + "learning_rate": 6.868298641808271e-06, + "loss": 1.0096, + "step": 5736 + }, + { + "epoch": 1.131056195497364, + "grad_norm": 2.1875, + "learning_rate": 6.867321130153545e-06, + "loss": 0.9167, + "step": 5737 + }, + { + "epoch": 1.13125609055246, + "grad_norm": 2.0625, + "learning_rate": 6.866343535552614e-06, + "loss": 0.9472, + "step": 5738 + }, + { + "epoch": 1.1314559856075561, + "grad_norm": 2.0625, + "learning_rate": 6.865365858048902e-06, + "loss": 0.9423, + "step": 5739 + }, + { + "epoch": 1.131655880662652, + "grad_norm": 2.1875, + "learning_rate": 6.864388097685838e-06, + "loss": 1.0048, + "step": 5740 + }, + { + "epoch": 1.131855775717748, + "grad_norm": 2.125, + "learning_rate": 6.863410254506853e-06, + "loss": 0.8936, + "step": 5741 + }, + { + "epoch": 1.1320556707728442, + "grad_norm": 2.1875, + "learning_rate": 6.862432328555384e-06, + "loss": 0.9888, + "step": 5742 + }, + { + "epoch": 1.1322555658279403, + "grad_norm": 2.15625, + "learning_rate": 6.861454319874871e-06, + "loss": 1.0313, + "step": 5743 + }, + { + "epoch": 1.1324554608830364, + "grad_norm": 2.140625, + "learning_rate": 6.860476228508755e-06, + "loss": 1.0116, + "step": 5744 + }, + { + "epoch": 1.1326553559381325, + "grad_norm": 2.078125, + "learning_rate": 6.859498054500482e-06, + "loss": 1.0209, + "step": 5745 + }, + { + "epoch": 1.1328552509932286, + "grad_norm": 2.03125, + "learning_rate": 6.858519797893507e-06, + "loss": 1.0137, + "step": 5746 + }, + { + "epoch": 1.1330551460483247, + "grad_norm": 2.09375, + "learning_rate": 6.8575414587312785e-06, + "loss": 0.9231, + "step": 5747 + }, + { + "epoch": 1.1332550411034208, + "grad_norm": 2.15625, + "learning_rate": 6.856563037057259e-06, + "loss": 0.9772, + "step": 5748 + }, + { + "epoch": 1.133454936158517, + "grad_norm": 2.0625, + "learning_rate": 6.855584532914906e-06, + "loss": 0.9557, + "step": 5749 + }, + { + "epoch": 1.1336548312136128, + "grad_norm": 2.140625, + "learning_rate": 6.8546059463476864e-06, + "loss": 0.9826, + "step": 5750 + }, + { + "epoch": 1.1338547262687089, + "grad_norm": 2.171875, + "learning_rate": 6.853627277399071e-06, + "loss": 0.9502, + "step": 5751 + }, + { + "epoch": 1.134054621323805, + "grad_norm": 2.09375, + "learning_rate": 6.852648526112529e-06, + "loss": 1.0073, + "step": 5752 + }, + { + "epoch": 1.134254516378901, + "grad_norm": 2.3125, + "learning_rate": 6.851669692531535e-06, + "loss": 1.0135, + "step": 5753 + }, + { + "epoch": 1.1344544114339972, + "grad_norm": 2.078125, + "learning_rate": 6.850690776699574e-06, + "loss": 0.986, + "step": 5754 + }, + { + "epoch": 1.1346543064890933, + "grad_norm": 2.0625, + "learning_rate": 6.849711778660124e-06, + "loss": 1.0102, + "step": 5755 + }, + { + "epoch": 1.1348542015441894, + "grad_norm": 2.015625, + "learning_rate": 6.848732698456675e-06, + "loss": 1.0157, + "step": 5756 + }, + { + "epoch": 1.1350540965992855, + "grad_norm": 2.125, + "learning_rate": 6.847753536132717e-06, + "loss": 0.9198, + "step": 5757 + }, + { + "epoch": 1.1352539916543813, + "grad_norm": 2.390625, + "learning_rate": 6.846774291731744e-06, + "loss": 1.0121, + "step": 5758 + }, + { + "epoch": 1.1354538867094774, + "grad_norm": 1.953125, + "learning_rate": 6.845794965297254e-06, + "loss": 0.8884, + "step": 5759 + }, + { + "epoch": 1.1356537817645735, + "grad_norm": 2.171875, + "learning_rate": 6.844815556872751e-06, + "loss": 0.9572, + "step": 5760 + }, + { + "epoch": 1.1358536768196696, + "grad_norm": 1.984375, + "learning_rate": 6.8438360665017355e-06, + "loss": 0.8053, + "step": 5761 + }, + { + "epoch": 1.1360535718747657, + "grad_norm": 2.09375, + "learning_rate": 6.842856494227721e-06, + "loss": 1.0477, + "step": 5762 + }, + { + "epoch": 1.1362534669298618, + "grad_norm": 2.15625, + "learning_rate": 6.841876840094216e-06, + "loss": 0.9417, + "step": 5763 + }, + { + "epoch": 1.136453361984958, + "grad_norm": 2.140625, + "learning_rate": 6.840897104144739e-06, + "loss": 0.9415, + "step": 5764 + }, + { + "epoch": 1.136653257040054, + "grad_norm": 2.078125, + "learning_rate": 6.839917286422811e-06, + "loss": 0.9455, + "step": 5765 + }, + { + "epoch": 1.1368531520951501, + "grad_norm": 2.265625, + "learning_rate": 6.838937386971951e-06, + "loss": 1.1089, + "step": 5766 + }, + { + "epoch": 1.1370530471502462, + "grad_norm": 2.109375, + "learning_rate": 6.837957405835689e-06, + "loss": 0.9556, + "step": 5767 + }, + { + "epoch": 1.1372529422053421, + "grad_norm": 1.984375, + "learning_rate": 6.836977343057558e-06, + "loss": 0.9827, + "step": 5768 + }, + { + "epoch": 1.1374528372604382, + "grad_norm": 2.0625, + "learning_rate": 6.835997198681087e-06, + "loss": 0.9803, + "step": 5769 + }, + { + "epoch": 1.1376527323155343, + "grad_norm": 2.09375, + "learning_rate": 6.835016972749817e-06, + "loss": 1.0608, + "step": 5770 + }, + { + "epoch": 1.1378526273706304, + "grad_norm": 2.09375, + "learning_rate": 6.83403666530729e-06, + "loss": 0.9443, + "step": 5771 + }, + { + "epoch": 1.1380525224257265, + "grad_norm": 2.109375, + "learning_rate": 6.8330562763970484e-06, + "loss": 0.9665, + "step": 5772 + }, + { + "epoch": 1.1382524174808226, + "grad_norm": 2.09375, + "learning_rate": 6.832075806062644e-06, + "loss": 0.9547, + "step": 5773 + }, + { + "epoch": 1.1384523125359187, + "grad_norm": 2.203125, + "learning_rate": 6.831095254347629e-06, + "loss": 1.0589, + "step": 5774 + }, + { + "epoch": 1.1386522075910146, + "grad_norm": 2.078125, + "learning_rate": 6.830114621295556e-06, + "loss": 1.0209, + "step": 5775 + }, + { + "epoch": 1.1388521026461107, + "grad_norm": 2.1875, + "learning_rate": 6.829133906949988e-06, + "loss": 0.9995, + "step": 5776 + }, + { + "epoch": 1.1390519977012068, + "grad_norm": 2.21875, + "learning_rate": 6.8281531113544875e-06, + "loss": 0.9766, + "step": 5777 + }, + { + "epoch": 1.139251892756303, + "grad_norm": 2.140625, + "learning_rate": 6.827172234552621e-06, + "loss": 0.9472, + "step": 5778 + }, + { + "epoch": 1.139451787811399, + "grad_norm": 2.328125, + "learning_rate": 6.826191276587959e-06, + "loss": 1.0093, + "step": 5779 + }, + { + "epoch": 1.139651682866495, + "grad_norm": 2.15625, + "learning_rate": 6.825210237504075e-06, + "loss": 0.9545, + "step": 5780 + }, + { + "epoch": 1.1398515779215912, + "grad_norm": 2.046875, + "learning_rate": 6.824229117344547e-06, + "loss": 0.9503, + "step": 5781 + }, + { + "epoch": 1.1400514729766873, + "grad_norm": 2.21875, + "learning_rate": 6.823247916152957e-06, + "loss": 0.988, + "step": 5782 + }, + { + "epoch": 1.1402513680317834, + "grad_norm": 2.203125, + "learning_rate": 6.822266633972891e-06, + "loss": 0.9702, + "step": 5783 + }, + { + "epoch": 1.1404512630868795, + "grad_norm": 2.015625, + "learning_rate": 6.821285270847934e-06, + "loss": 0.8738, + "step": 5784 + }, + { + "epoch": 1.1406511581419754, + "grad_norm": 2.09375, + "learning_rate": 6.82030382682168e-06, + "loss": 0.8834, + "step": 5785 + }, + { + "epoch": 1.1408510531970715, + "grad_norm": 2.125, + "learning_rate": 6.819322301937724e-06, + "loss": 0.9622, + "step": 5786 + }, + { + "epoch": 1.1410509482521676, + "grad_norm": 2.1875, + "learning_rate": 6.818340696239666e-06, + "loss": 0.9004, + "step": 5787 + }, + { + "epoch": 1.1412508433072637, + "grad_norm": 2.1875, + "learning_rate": 6.817359009771109e-06, + "loss": 0.9325, + "step": 5788 + }, + { + "epoch": 1.1414507383623598, + "grad_norm": 2.0625, + "learning_rate": 6.816377242575658e-06, + "loss": 0.973, + "step": 5789 + }, + { + "epoch": 1.1416506334174559, + "grad_norm": 2.21875, + "learning_rate": 6.815395394696924e-06, + "loss": 0.934, + "step": 5790 + }, + { + "epoch": 1.141850528472552, + "grad_norm": 2.203125, + "learning_rate": 6.814413466178521e-06, + "loss": 1.0456, + "step": 5791 + }, + { + "epoch": 1.142050423527648, + "grad_norm": 2.171875, + "learning_rate": 6.813431457064064e-06, + "loss": 1.0741, + "step": 5792 + }, + { + "epoch": 1.142250318582744, + "grad_norm": 2.171875, + "learning_rate": 6.812449367397178e-06, + "loss": 0.9353, + "step": 5793 + }, + { + "epoch": 1.14245021363784, + "grad_norm": 2.09375, + "learning_rate": 6.811467197221483e-06, + "loss": 0.9993, + "step": 5794 + }, + { + "epoch": 1.1426501086929362, + "grad_norm": 2.0625, + "learning_rate": 6.8104849465806086e-06, + "loss": 0.8419, + "step": 5795 + }, + { + "epoch": 1.1428500037480323, + "grad_norm": 2.03125, + "learning_rate": 6.809502615518187e-06, + "loss": 0.8346, + "step": 5796 + }, + { + "epoch": 1.1430498988031284, + "grad_norm": 2.1875, + "learning_rate": 6.808520204077852e-06, + "loss": 1.0155, + "step": 5797 + }, + { + "epoch": 1.1432497938582245, + "grad_norm": 2.21875, + "learning_rate": 6.807537712303243e-06, + "loss": 1.03, + "step": 5798 + }, + { + "epoch": 1.1434496889133205, + "grad_norm": 2.1875, + "learning_rate": 6.806555140238002e-06, + "loss": 1.0012, + "step": 5799 + }, + { + "epoch": 1.1436495839684166, + "grad_norm": 2.125, + "learning_rate": 6.805572487925774e-06, + "loss": 0.9254, + "step": 5800 + }, + { + "epoch": 1.1438494790235127, + "grad_norm": 2.265625, + "learning_rate": 6.804589755410209e-06, + "loss": 1.0384, + "step": 5801 + }, + { + "epoch": 1.1440493740786088, + "grad_norm": 2.203125, + "learning_rate": 6.803606942734961e-06, + "loss": 0.954, + "step": 5802 + }, + { + "epoch": 1.1442492691337047, + "grad_norm": 2.25, + "learning_rate": 6.802624049943684e-06, + "loss": 1.0369, + "step": 5803 + }, + { + "epoch": 1.1444491641888008, + "grad_norm": 2.109375, + "learning_rate": 6.801641077080039e-06, + "loss": 0.9482, + "step": 5804 + }, + { + "epoch": 1.144649059243897, + "grad_norm": 2.203125, + "learning_rate": 6.80065802418769e-06, + "loss": 1.0058, + "step": 5805 + }, + { + "epoch": 1.144848954298993, + "grad_norm": 2.21875, + "learning_rate": 6.799674891310304e-06, + "loss": 0.9918, + "step": 5806 + }, + { + "epoch": 1.1450488493540891, + "grad_norm": 2.28125, + "learning_rate": 6.798691678491552e-06, + "loss": 0.948, + "step": 5807 + }, + { + "epoch": 1.1452487444091852, + "grad_norm": 2.03125, + "learning_rate": 6.797708385775107e-06, + "loss": 0.8365, + "step": 5808 + }, + { + "epoch": 1.1454486394642813, + "grad_norm": 2.15625, + "learning_rate": 6.796725013204648e-06, + "loss": 0.9136, + "step": 5809 + }, + { + "epoch": 1.1456485345193772, + "grad_norm": 2.171875, + "learning_rate": 6.795741560823856e-06, + "loss": 1.0892, + "step": 5810 + }, + { + "epoch": 1.1458484295744733, + "grad_norm": 2.125, + "learning_rate": 6.794758028676415e-06, + "loss": 0.9594, + "step": 5811 + }, + { + "epoch": 1.1460483246295694, + "grad_norm": 2.140625, + "learning_rate": 6.793774416806014e-06, + "loss": 1.0519, + "step": 5812 + }, + { + "epoch": 1.1462482196846655, + "grad_norm": 2.09375, + "learning_rate": 6.792790725256347e-06, + "loss": 0.9291, + "step": 5813 + }, + { + "epoch": 1.1464481147397616, + "grad_norm": 2.203125, + "learning_rate": 6.791806954071105e-06, + "loss": 0.9338, + "step": 5814 + }, + { + "epoch": 1.1466480097948577, + "grad_norm": 2.0625, + "learning_rate": 6.79082310329399e-06, + "loss": 0.9201, + "step": 5815 + }, + { + "epoch": 1.1468479048499538, + "grad_norm": 2.0625, + "learning_rate": 6.789839172968705e-06, + "loss": 0.9484, + "step": 5816 + }, + { + "epoch": 1.14704779990505, + "grad_norm": 2.0625, + "learning_rate": 6.7888551631389545e-06, + "loss": 0.9763, + "step": 5817 + }, + { + "epoch": 1.147247694960146, + "grad_norm": 2.109375, + "learning_rate": 6.787871073848448e-06, + "loss": 1.0782, + "step": 5818 + }, + { + "epoch": 1.147447590015242, + "grad_norm": 2.109375, + "learning_rate": 6.7868869051409e-06, + "loss": 0.9979, + "step": 5819 + }, + { + "epoch": 1.147647485070338, + "grad_norm": 2.09375, + "learning_rate": 6.785902657060026e-06, + "loss": 0.9832, + "step": 5820 + }, + { + "epoch": 1.147847380125434, + "grad_norm": 2.109375, + "learning_rate": 6.784918329649548e-06, + "loss": 0.965, + "step": 5821 + }, + { + "epoch": 1.1480472751805302, + "grad_norm": 2.09375, + "learning_rate": 6.783933922953188e-06, + "loss": 0.9862, + "step": 5822 + }, + { + "epoch": 1.1482471702356263, + "grad_norm": 2.078125, + "learning_rate": 6.782949437014672e-06, + "loss": 0.9793, + "step": 5823 + }, + { + "epoch": 1.1484470652907224, + "grad_norm": 2.0, + "learning_rate": 6.781964871877735e-06, + "loss": 0.9625, + "step": 5824 + }, + { + "epoch": 1.1486469603458185, + "grad_norm": 2.046875, + "learning_rate": 6.780980227586107e-06, + "loss": 0.9612, + "step": 5825 + }, + { + "epoch": 1.1488468554009146, + "grad_norm": 2.015625, + "learning_rate": 6.7799955041835276e-06, + "loss": 0.965, + "step": 5826 + }, + { + "epoch": 1.1490467504560107, + "grad_norm": 2.21875, + "learning_rate": 6.77901070171374e-06, + "loss": 1.0603, + "step": 5827 + }, + { + "epoch": 1.1492466455111066, + "grad_norm": 2.15625, + "learning_rate": 6.778025820220484e-06, + "loss": 1.0257, + "step": 5828 + }, + { + "epoch": 1.1494465405662027, + "grad_norm": 2.078125, + "learning_rate": 6.777040859747512e-06, + "loss": 0.9683, + "step": 5829 + }, + { + "epoch": 1.1496464356212988, + "grad_norm": 2.234375, + "learning_rate": 6.7760558203385765e-06, + "loss": 1.0206, + "step": 5830 + }, + { + "epoch": 1.1498463306763949, + "grad_norm": 2.28125, + "learning_rate": 6.77507070203743e-06, + "loss": 0.9786, + "step": 5831 + }, + { + "epoch": 1.150046225731491, + "grad_norm": 2.140625, + "learning_rate": 6.774085504887832e-06, + "loss": 1.0074, + "step": 5832 + }, + { + "epoch": 1.150246120786587, + "grad_norm": 2.140625, + "learning_rate": 6.7731002289335455e-06, + "loss": 1.0272, + "step": 5833 + }, + { + "epoch": 1.1504460158416832, + "grad_norm": 2.015625, + "learning_rate": 6.772114874218337e-06, + "loss": 0.959, + "step": 5834 + }, + { + "epoch": 1.1506459108967793, + "grad_norm": 2.3125, + "learning_rate": 6.771129440785973e-06, + "loss": 1.0305, + "step": 5835 + }, + { + "epoch": 1.1508458059518754, + "grad_norm": 2.25, + "learning_rate": 6.770143928680231e-06, + "loss": 1.1036, + "step": 5836 + }, + { + "epoch": 1.1510457010069715, + "grad_norm": 2.15625, + "learning_rate": 6.769158337944883e-06, + "loss": 0.983, + "step": 5837 + }, + { + "epoch": 1.1512455960620673, + "grad_norm": 2.1875, + "learning_rate": 6.768172668623711e-06, + "loss": 1.0334, + "step": 5838 + }, + { + "epoch": 1.1514454911171634, + "grad_norm": 2.09375, + "learning_rate": 6.767186920760499e-06, + "loss": 0.9598, + "step": 5839 + }, + { + "epoch": 1.1516453861722595, + "grad_norm": 2.28125, + "learning_rate": 6.766201094399031e-06, + "loss": 1.0598, + "step": 5840 + }, + { + "epoch": 1.1518452812273556, + "grad_norm": 2.125, + "learning_rate": 6.765215189583101e-06, + "loss": 0.887, + "step": 5841 + }, + { + "epoch": 1.1520451762824517, + "grad_norm": 2.109375, + "learning_rate": 6.764229206356498e-06, + "loss": 0.9367, + "step": 5842 + }, + { + "epoch": 1.1522450713375478, + "grad_norm": 2.296875, + "learning_rate": 6.763243144763024e-06, + "loss": 1.0542, + "step": 5843 + }, + { + "epoch": 1.152444966392644, + "grad_norm": 2.0625, + "learning_rate": 6.762257004846479e-06, + "loss": 1.0015, + "step": 5844 + }, + { + "epoch": 1.15264486144774, + "grad_norm": 2.15625, + "learning_rate": 6.761270786650664e-06, + "loss": 0.9359, + "step": 5845 + }, + { + "epoch": 1.152844756502836, + "grad_norm": 2.203125, + "learning_rate": 6.76028449021939e-06, + "loss": 1.0411, + "step": 5846 + }, + { + "epoch": 1.153044651557932, + "grad_norm": 2.109375, + "learning_rate": 6.759298115596467e-06, + "loss": 0.9873, + "step": 5847 + }, + { + "epoch": 1.153244546613028, + "grad_norm": 2.1875, + "learning_rate": 6.7583116628257075e-06, + "loss": 1.012, + "step": 5848 + }, + { + "epoch": 1.1534444416681242, + "grad_norm": 2.09375, + "learning_rate": 6.757325131950934e-06, + "loss": 0.9291, + "step": 5849 + }, + { + "epoch": 1.1536443367232203, + "grad_norm": 2.203125, + "learning_rate": 6.756338523015965e-06, + "loss": 1.0056, + "step": 5850 + }, + { + "epoch": 1.1538442317783164, + "grad_norm": 2.015625, + "learning_rate": 6.755351836064625e-06, + "loss": 0.8476, + "step": 5851 + }, + { + "epoch": 1.1540441268334125, + "grad_norm": 2.109375, + "learning_rate": 6.754365071140747e-06, + "loss": 0.9314, + "step": 5852 + }, + { + "epoch": 1.1542440218885086, + "grad_norm": 2.125, + "learning_rate": 6.753378228288158e-06, + "loss": 1.0368, + "step": 5853 + }, + { + "epoch": 1.1544439169436047, + "grad_norm": 2.21875, + "learning_rate": 6.752391307550694e-06, + "loss": 1.0438, + "step": 5854 + }, + { + "epoch": 1.1546438119987006, + "grad_norm": 2.1875, + "learning_rate": 6.751404308972198e-06, + "loss": 1.0567, + "step": 5855 + }, + { + "epoch": 1.1548437070537967, + "grad_norm": 2.109375, + "learning_rate": 6.750417232596509e-06, + "loss": 0.9272, + "step": 5856 + }, + { + "epoch": 1.1550436021088928, + "grad_norm": 2.15625, + "learning_rate": 6.749430078467472e-06, + "loss": 1.0246, + "step": 5857 + }, + { + "epoch": 1.1552434971639889, + "grad_norm": 2.171875, + "learning_rate": 6.74844284662894e-06, + "loss": 0.9252, + "step": 5858 + }, + { + "epoch": 1.155443392219085, + "grad_norm": 2.125, + "learning_rate": 6.7474555371247605e-06, + "loss": 0.9725, + "step": 5859 + }, + { + "epoch": 1.155643287274181, + "grad_norm": 2.015625, + "learning_rate": 6.746468149998796e-06, + "loss": 0.9515, + "step": 5860 + }, + { + "epoch": 1.1558431823292772, + "grad_norm": 2.078125, + "learning_rate": 6.7454806852949015e-06, + "loss": 0.9205, + "step": 5861 + }, + { + "epoch": 1.1560430773843733, + "grad_norm": 2.09375, + "learning_rate": 6.744493143056941e-06, + "loss": 0.9725, + "step": 5862 + }, + { + "epoch": 1.1562429724394692, + "grad_norm": 2.0625, + "learning_rate": 6.743505523328781e-06, + "loss": 0.8935, + "step": 5863 + }, + { + "epoch": 1.1564428674945653, + "grad_norm": 2.15625, + "learning_rate": 6.742517826154293e-06, + "loss": 1.0846, + "step": 5864 + }, + { + "epoch": 1.1566427625496614, + "grad_norm": 2.03125, + "learning_rate": 6.741530051577347e-06, + "loss": 0.9303, + "step": 5865 + }, + { + "epoch": 1.1568426576047575, + "grad_norm": 2.140625, + "learning_rate": 6.740542199641824e-06, + "loss": 0.9525, + "step": 5866 + }, + { + "epoch": 1.1570425526598536, + "grad_norm": 2.046875, + "learning_rate": 6.739554270391603e-06, + "loss": 0.964, + "step": 5867 + }, + { + "epoch": 1.1572424477149497, + "grad_norm": 1.984375, + "learning_rate": 6.738566263870566e-06, + "loss": 0.8848, + "step": 5868 + }, + { + "epoch": 1.1574423427700458, + "grad_norm": 2.0625, + "learning_rate": 6.737578180122603e-06, + "loss": 0.9419, + "step": 5869 + }, + { + "epoch": 1.1576422378251419, + "grad_norm": 2.125, + "learning_rate": 6.7365900191916e-06, + "loss": 0.968, + "step": 5870 + }, + { + "epoch": 1.157842132880238, + "grad_norm": 2.234375, + "learning_rate": 6.735601781121454e-06, + "loss": 0.9856, + "step": 5871 + }, + { + "epoch": 1.158042027935334, + "grad_norm": 2.046875, + "learning_rate": 6.734613465956065e-06, + "loss": 0.953, + "step": 5872 + }, + { + "epoch": 1.15824192299043, + "grad_norm": 2.234375, + "learning_rate": 6.733625073739329e-06, + "loss": 0.9394, + "step": 5873 + }, + { + "epoch": 1.158441818045526, + "grad_norm": 2.25, + "learning_rate": 6.732636604515153e-06, + "loss": 0.9459, + "step": 5874 + }, + { + "epoch": 1.1586417131006221, + "grad_norm": 1.9921875, + "learning_rate": 6.731648058327445e-06, + "loss": 0.8777, + "step": 5875 + }, + { + "epoch": 1.1588416081557182, + "grad_norm": 2.15625, + "learning_rate": 6.7306594352201135e-06, + "loss": 0.9294, + "step": 5876 + }, + { + "epoch": 1.1590415032108143, + "grad_norm": 2.125, + "learning_rate": 6.729670735237075e-06, + "loss": 1.0541, + "step": 5877 + }, + { + "epoch": 1.1592413982659104, + "grad_norm": 2.1875, + "learning_rate": 6.728681958422248e-06, + "loss": 0.9905, + "step": 5878 + }, + { + "epoch": 1.1594412933210065, + "grad_norm": 2.28125, + "learning_rate": 6.727693104819553e-06, + "loss": 0.9105, + "step": 5879 + }, + { + "epoch": 1.1596411883761026, + "grad_norm": 2.109375, + "learning_rate": 6.726704174472913e-06, + "loss": 1.0022, + "step": 5880 + }, + { + "epoch": 1.1598410834311985, + "grad_norm": 2.171875, + "learning_rate": 6.725715167426261e-06, + "loss": 1.0047, + "step": 5881 + }, + { + "epoch": 1.1600409784862946, + "grad_norm": 2.125, + "learning_rate": 6.724726083723523e-06, + "loss": 0.9738, + "step": 5882 + }, + { + "epoch": 1.1602408735413907, + "grad_norm": 2.125, + "learning_rate": 6.723736923408638e-06, + "loss": 0.9422, + "step": 5883 + }, + { + "epoch": 1.1604407685964868, + "grad_norm": 2.109375, + "learning_rate": 6.722747686525543e-06, + "loss": 0.9893, + "step": 5884 + }, + { + "epoch": 1.160640663651583, + "grad_norm": 2.0625, + "learning_rate": 6.721758373118178e-06, + "loss": 1.0724, + "step": 5885 + }, + { + "epoch": 1.160840558706679, + "grad_norm": 2.171875, + "learning_rate": 6.720768983230492e-06, + "loss": 1.0144, + "step": 5886 + }, + { + "epoch": 1.1610404537617751, + "grad_norm": 2.09375, + "learning_rate": 6.7197795169064305e-06, + "loss": 0.9422, + "step": 5887 + }, + { + "epoch": 1.1612403488168712, + "grad_norm": 2.140625, + "learning_rate": 6.7187899741899465e-06, + "loss": 0.9876, + "step": 5888 + }, + { + "epoch": 1.1614402438719673, + "grad_norm": 2.09375, + "learning_rate": 6.717800355124996e-06, + "loss": 0.8915, + "step": 5889 + }, + { + "epoch": 1.1616401389270634, + "grad_norm": 2.03125, + "learning_rate": 6.716810659755537e-06, + "loss": 0.8954, + "step": 5890 + }, + { + "epoch": 1.1618400339821593, + "grad_norm": 2.078125, + "learning_rate": 6.715820888125532e-06, + "loss": 0.9359, + "step": 5891 + }, + { + "epoch": 1.1620399290372554, + "grad_norm": 2.03125, + "learning_rate": 6.714831040278946e-06, + "loss": 0.9886, + "step": 5892 + }, + { + "epoch": 1.1622398240923515, + "grad_norm": 2.109375, + "learning_rate": 6.713841116259749e-06, + "loss": 0.9766, + "step": 5893 + }, + { + "epoch": 1.1624397191474476, + "grad_norm": 2.109375, + "learning_rate": 6.7128511161119115e-06, + "loss": 0.9345, + "step": 5894 + }, + { + "epoch": 1.1626396142025437, + "grad_norm": 2.078125, + "learning_rate": 6.7118610398794115e-06, + "loss": 0.9462, + "step": 5895 + }, + { + "epoch": 1.1628395092576398, + "grad_norm": 2.171875, + "learning_rate": 6.710870887606227e-06, + "loss": 0.9552, + "step": 5896 + }, + { + "epoch": 1.163039404312736, + "grad_norm": 2.109375, + "learning_rate": 6.70988065933634e-06, + "loss": 0.9285, + "step": 5897 + }, + { + "epoch": 1.1632392993678318, + "grad_norm": 2.203125, + "learning_rate": 6.708890355113736e-06, + "loss": 1.0474, + "step": 5898 + }, + { + "epoch": 1.1634391944229279, + "grad_norm": 1.96875, + "learning_rate": 6.707899974982405e-06, + "loss": 0.8558, + "step": 5899 + }, + { + "epoch": 1.163639089478024, + "grad_norm": 2.140625, + "learning_rate": 6.706909518986341e-06, + "loss": 0.9501, + "step": 5900 + }, + { + "epoch": 1.16383898453312, + "grad_norm": 2.21875, + "learning_rate": 6.705918987169537e-06, + "loss": 0.9321, + "step": 5901 + }, + { + "epoch": 1.1640388795882162, + "grad_norm": 2.109375, + "learning_rate": 6.704928379575993e-06, + "loss": 0.9636, + "step": 5902 + }, + { + "epoch": 1.1642387746433123, + "grad_norm": 2.0625, + "learning_rate": 6.703937696249715e-06, + "loss": 1.0126, + "step": 5903 + }, + { + "epoch": 1.1644386696984084, + "grad_norm": 2.1875, + "learning_rate": 6.7029469372347045e-06, + "loss": 1.084, + "step": 5904 + }, + { + "epoch": 1.1646385647535045, + "grad_norm": 2.0625, + "learning_rate": 6.701956102574973e-06, + "loss": 0.9991, + "step": 5905 + }, + { + "epoch": 1.1648384598086006, + "grad_norm": 2.1875, + "learning_rate": 6.700965192314536e-06, + "loss": 0.9644, + "step": 5906 + }, + { + "epoch": 1.1650383548636967, + "grad_norm": 2.078125, + "learning_rate": 6.699974206497405e-06, + "loss": 1.0096, + "step": 5907 + }, + { + "epoch": 1.1652382499187925, + "grad_norm": 2.125, + "learning_rate": 6.6989831451676015e-06, + "loss": 0.9761, + "step": 5908 + }, + { + "epoch": 1.1654381449738886, + "grad_norm": 2.125, + "learning_rate": 6.697992008369147e-06, + "loss": 1.0231, + "step": 5909 + }, + { + "epoch": 1.1656380400289847, + "grad_norm": 2.1875, + "learning_rate": 6.6970007961460695e-06, + "loss": 1.039, + "step": 5910 + }, + { + "epoch": 1.1658379350840808, + "grad_norm": 2.078125, + "learning_rate": 6.6960095085423985e-06, + "loss": 0.9953, + "step": 5911 + }, + { + "epoch": 1.166037830139177, + "grad_norm": 2.15625, + "learning_rate": 6.695018145602165e-06, + "loss": 1.0316, + "step": 5912 + }, + { + "epoch": 1.166237725194273, + "grad_norm": 2.0625, + "learning_rate": 6.694026707369407e-06, + "loss": 1.0413, + "step": 5913 + }, + { + "epoch": 1.1664376202493691, + "grad_norm": 2.1875, + "learning_rate": 6.693035193888164e-06, + "loss": 0.9899, + "step": 5914 + }, + { + "epoch": 1.1666375153044652, + "grad_norm": 2.1875, + "learning_rate": 6.692043605202478e-06, + "loss": 0.9738, + "step": 5915 + }, + { + "epoch": 1.1668374103595611, + "grad_norm": 2.140625, + "learning_rate": 6.691051941356397e-06, + "loss": 0.9412, + "step": 5916 + }, + { + "epoch": 1.1670373054146572, + "grad_norm": 2.140625, + "learning_rate": 6.6900602023939685e-06, + "loss": 1.0286, + "step": 5917 + }, + { + "epoch": 1.1672372004697533, + "grad_norm": 2.15625, + "learning_rate": 6.6890683883592455e-06, + "loss": 0.9842, + "step": 5918 + }, + { + "epoch": 1.1674370955248494, + "grad_norm": 2.203125, + "learning_rate": 6.688076499296285e-06, + "loss": 0.9791, + "step": 5919 + }, + { + "epoch": 1.1676369905799455, + "grad_norm": 2.1875, + "learning_rate": 6.687084535249149e-06, + "loss": 0.9836, + "step": 5920 + }, + { + "epoch": 1.1678368856350416, + "grad_norm": 2.171875, + "learning_rate": 6.686092496261896e-06, + "loss": 0.9675, + "step": 5921 + }, + { + "epoch": 1.1680367806901377, + "grad_norm": 2.09375, + "learning_rate": 6.685100382378595e-06, + "loss": 0.9873, + "step": 5922 + }, + { + "epoch": 1.1682366757452338, + "grad_norm": 2.125, + "learning_rate": 6.684108193643317e-06, + "loss": 1.0282, + "step": 5923 + }, + { + "epoch": 1.16843657080033, + "grad_norm": 2.234375, + "learning_rate": 6.68311593010013e-06, + "loss": 1.0129, + "step": 5924 + }, + { + "epoch": 1.168636465855426, + "grad_norm": 2.015625, + "learning_rate": 6.682123591793114e-06, + "loss": 0.9194, + "step": 5925 + }, + { + "epoch": 1.168836360910522, + "grad_norm": 2.203125, + "learning_rate": 6.681131178766349e-06, + "loss": 0.9182, + "step": 5926 + }, + { + "epoch": 1.169036255965618, + "grad_norm": 2.109375, + "learning_rate": 6.680138691063914e-06, + "loss": 0.9553, + "step": 5927 + }, + { + "epoch": 1.169236151020714, + "grad_norm": 2.125, + "learning_rate": 6.679146128729901e-06, + "loss": 0.9834, + "step": 5928 + }, + { + "epoch": 1.1694360460758102, + "grad_norm": 2.140625, + "learning_rate": 6.678153491808394e-06, + "loss": 1.0813, + "step": 5929 + }, + { + "epoch": 1.1696359411309063, + "grad_norm": 2.171875, + "learning_rate": 6.677160780343488e-06, + "loss": 1.0055, + "step": 5930 + }, + { + "epoch": 1.1698358361860024, + "grad_norm": 2.046875, + "learning_rate": 6.6761679943792805e-06, + "loss": 0.9817, + "step": 5931 + }, + { + "epoch": 1.1700357312410985, + "grad_norm": 2.0625, + "learning_rate": 6.675175133959868e-06, + "loss": 0.9821, + "step": 5932 + }, + { + "epoch": 1.1702356262961944, + "grad_norm": 2.140625, + "learning_rate": 6.674182199129356e-06, + "loss": 0.8733, + "step": 5933 + }, + { + "epoch": 1.1704355213512905, + "grad_norm": 2.25, + "learning_rate": 6.67318918993185e-06, + "loss": 0.9753, + "step": 5934 + }, + { + "epoch": 1.1706354164063866, + "grad_norm": 2.046875, + "learning_rate": 6.6721961064114584e-06, + "loss": 0.9783, + "step": 5935 + }, + { + "epoch": 1.1708353114614827, + "grad_norm": 2.078125, + "learning_rate": 6.6712029486122946e-06, + "loss": 0.9435, + "step": 5936 + }, + { + "epoch": 1.1710352065165788, + "grad_norm": 2.03125, + "learning_rate": 6.670209716578474e-06, + "loss": 0.9447, + "step": 5937 + }, + { + "epoch": 1.1712351015716749, + "grad_norm": 2.046875, + "learning_rate": 6.669216410354118e-06, + "loss": 0.9303, + "step": 5938 + }, + { + "epoch": 1.171434996626771, + "grad_norm": 2.1875, + "learning_rate": 6.668223029983345e-06, + "loss": 0.9607, + "step": 5939 + }, + { + "epoch": 1.171634891681867, + "grad_norm": 2.234375, + "learning_rate": 6.667229575510284e-06, + "loss": 0.9161, + "step": 5940 + }, + { + "epoch": 1.1718347867369632, + "grad_norm": 2.109375, + "learning_rate": 6.666236046979062e-06, + "loss": 0.9774, + "step": 5941 + }, + { + "epoch": 1.1720346817920593, + "grad_norm": 2.046875, + "learning_rate": 6.665242444433815e-06, + "loss": 0.9084, + "step": 5942 + }, + { + "epoch": 1.1722345768471552, + "grad_norm": 2.296875, + "learning_rate": 6.664248767918675e-06, + "loss": 0.9373, + "step": 5943 + }, + { + "epoch": 1.1724344719022513, + "grad_norm": 2.25, + "learning_rate": 6.663255017477783e-06, + "loss": 0.9009, + "step": 5944 + }, + { + "epoch": 1.1726343669573474, + "grad_norm": 2.203125, + "learning_rate": 6.662261193155281e-06, + "loss": 1.1289, + "step": 5945 + }, + { + "epoch": 1.1728342620124435, + "grad_norm": 2.15625, + "learning_rate": 6.661267294995314e-06, + "loss": 1.0036, + "step": 5946 + }, + { + "epoch": 1.1730341570675396, + "grad_norm": 2.328125, + "learning_rate": 6.66027332304203e-06, + "loss": 1.0468, + "step": 5947 + }, + { + "epoch": 1.1732340521226357, + "grad_norm": 2.09375, + "learning_rate": 6.659279277339584e-06, + "loss": 0.8799, + "step": 5948 + }, + { + "epoch": 1.1734339471777318, + "grad_norm": 2.1875, + "learning_rate": 6.65828515793213e-06, + "loss": 0.9727, + "step": 5949 + }, + { + "epoch": 1.1736338422328279, + "grad_norm": 2.15625, + "learning_rate": 6.657290964863825e-06, + "loss": 0.9576, + "step": 5950 + }, + { + "epoch": 1.1738337372879237, + "grad_norm": 2.09375, + "learning_rate": 6.656296698178832e-06, + "loss": 0.8868, + "step": 5951 + }, + { + "epoch": 1.1740336323430198, + "grad_norm": 2.0625, + "learning_rate": 6.655302357921318e-06, + "loss": 0.943, + "step": 5952 + }, + { + "epoch": 1.174233527398116, + "grad_norm": 2.09375, + "learning_rate": 6.65430794413545e-06, + "loss": 1.0092, + "step": 5953 + }, + { + "epoch": 1.174433422453212, + "grad_norm": 2.171875, + "learning_rate": 6.6533134568654e-06, + "loss": 0.9646, + "step": 5954 + }, + { + "epoch": 1.1746333175083081, + "grad_norm": 2.25, + "learning_rate": 6.652318896155342e-06, + "loss": 1.1088, + "step": 5955 + }, + { + "epoch": 1.1748332125634042, + "grad_norm": 2.125, + "learning_rate": 6.651324262049454e-06, + "loss": 1.0244, + "step": 5956 + }, + { + "epoch": 1.1750331076185003, + "grad_norm": 2.203125, + "learning_rate": 6.650329554591921e-06, + "loss": 1.0009, + "step": 5957 + }, + { + "epoch": 1.1752330026735964, + "grad_norm": 2.0, + "learning_rate": 6.649334773826924e-06, + "loss": 0.9352, + "step": 5958 + }, + { + "epoch": 1.1754328977286925, + "grad_norm": 2.09375, + "learning_rate": 6.648339919798654e-06, + "loss": 0.9745, + "step": 5959 + }, + { + "epoch": 1.1756327927837886, + "grad_norm": 2.171875, + "learning_rate": 6.647344992551299e-06, + "loss": 1.1091, + "step": 5960 + }, + { + "epoch": 1.1758326878388845, + "grad_norm": 2.171875, + "learning_rate": 6.646349992129055e-06, + "loss": 0.9725, + "step": 5961 + }, + { + "epoch": 1.1760325828939806, + "grad_norm": 2.375, + "learning_rate": 6.645354918576122e-06, + "loss": 0.9178, + "step": 5962 + }, + { + "epoch": 1.1762324779490767, + "grad_norm": 2.078125, + "learning_rate": 6.644359771936699e-06, + "loss": 0.9725, + "step": 5963 + }, + { + "epoch": 1.1764323730041728, + "grad_norm": 2.0625, + "learning_rate": 6.643364552254989e-06, + "loss": 0.8993, + "step": 5964 + }, + { + "epoch": 1.176632268059269, + "grad_norm": 2.203125, + "learning_rate": 6.642369259575203e-06, + "loss": 0.9373, + "step": 5965 + }, + { + "epoch": 1.176832163114365, + "grad_norm": 2.25, + "learning_rate": 6.64137389394155e-06, + "loss": 1.0464, + "step": 5966 + }, + { + "epoch": 1.177032058169461, + "grad_norm": 2.171875, + "learning_rate": 6.640378455398242e-06, + "loss": 0.9632, + "step": 5967 + }, + { + "epoch": 1.1772319532245572, + "grad_norm": 2.046875, + "learning_rate": 6.6393829439895e-06, + "loss": 0.9081, + "step": 5968 + }, + { + "epoch": 1.177431848279653, + "grad_norm": 2.140625, + "learning_rate": 6.6383873597595415e-06, + "loss": 1.0095, + "step": 5969 + }, + { + "epoch": 1.1776317433347492, + "grad_norm": 2.015625, + "learning_rate": 6.637391702752591e-06, + "loss": 0.9309, + "step": 5970 + }, + { + "epoch": 1.1778316383898453, + "grad_norm": 2.1875, + "learning_rate": 6.636395973012878e-06, + "loss": 1.0349, + "step": 5971 + }, + { + "epoch": 1.1780315334449414, + "grad_norm": 2.328125, + "learning_rate": 6.635400170584629e-06, + "loss": 1.0354, + "step": 5972 + }, + { + "epoch": 1.1782314285000375, + "grad_norm": 2.109375, + "learning_rate": 6.63440429551208e-06, + "loss": 0.9784, + "step": 5973 + }, + { + "epoch": 1.1784313235551336, + "grad_norm": 2.03125, + "learning_rate": 6.633408347839466e-06, + "loss": 0.9179, + "step": 5974 + }, + { + "epoch": 1.1786312186102297, + "grad_norm": 2.046875, + "learning_rate": 6.632412327611029e-06, + "loss": 0.9423, + "step": 5975 + }, + { + "epoch": 1.1788311136653258, + "grad_norm": 2.234375, + "learning_rate": 6.63141623487101e-06, + "loss": 0.9934, + "step": 5976 + }, + { + "epoch": 1.1790310087204219, + "grad_norm": 2.21875, + "learning_rate": 6.6304200696636545e-06, + "loss": 1.0466, + "step": 5977 + }, + { + "epoch": 1.1792309037755178, + "grad_norm": 2.203125, + "learning_rate": 6.629423832033215e-06, + "loss": 0.9837, + "step": 5978 + }, + { + "epoch": 1.1794307988306139, + "grad_norm": 2.0625, + "learning_rate": 6.6284275220239435e-06, + "loss": 0.9193, + "step": 5979 + }, + { + "epoch": 1.17963069388571, + "grad_norm": 2.21875, + "learning_rate": 6.627431139680094e-06, + "loss": 0.9951, + "step": 5980 + }, + { + "epoch": 1.179830588940806, + "grad_norm": 2.0625, + "learning_rate": 6.626434685045928e-06, + "loss": 0.9343, + "step": 5981 + }, + { + "epoch": 1.1800304839959022, + "grad_norm": 2.140625, + "learning_rate": 6.625438158165707e-06, + "loss": 1.1256, + "step": 5982 + }, + { + "epoch": 1.1802303790509983, + "grad_norm": 2.140625, + "learning_rate": 6.624441559083696e-06, + "loss": 0.9476, + "step": 5983 + }, + { + "epoch": 1.1804302741060944, + "grad_norm": 2.140625, + "learning_rate": 6.623444887844166e-06, + "loss": 1.0104, + "step": 5984 + }, + { + "epoch": 1.1806301691611905, + "grad_norm": 2.3125, + "learning_rate": 6.622448144491387e-06, + "loss": 0.926, + "step": 5985 + }, + { + "epoch": 1.1808300642162863, + "grad_norm": 2.15625, + "learning_rate": 6.621451329069634e-06, + "loss": 0.9933, + "step": 5986 + }, + { + "epoch": 1.1810299592713824, + "grad_norm": 2.265625, + "learning_rate": 6.6204544416231865e-06, + "loss": 0.988, + "step": 5987 + }, + { + "epoch": 1.1812298543264785, + "grad_norm": 2.046875, + "learning_rate": 6.619457482196326e-06, + "loss": 0.9556, + "step": 5988 + }, + { + "epoch": 1.1814297493815746, + "grad_norm": 2.1875, + "learning_rate": 6.618460450833335e-06, + "loss": 0.9795, + "step": 5989 + }, + { + "epoch": 1.1816296444366707, + "grad_norm": 2.140625, + "learning_rate": 6.617463347578506e-06, + "loss": 1.0121, + "step": 5990 + }, + { + "epoch": 1.1818295394917668, + "grad_norm": 2.234375, + "learning_rate": 6.6164661724761255e-06, + "loss": 1.0122, + "step": 5991 + }, + { + "epoch": 1.182029434546863, + "grad_norm": 2.1875, + "learning_rate": 6.615468925570492e-06, + "loss": 1.0103, + "step": 5992 + }, + { + "epoch": 1.182229329601959, + "grad_norm": 2.1875, + "learning_rate": 6.614471606905902e-06, + "loss": 0.9475, + "step": 5993 + }, + { + "epoch": 1.1824292246570551, + "grad_norm": 2.15625, + "learning_rate": 6.6134742165266545e-06, + "loss": 0.9314, + "step": 5994 + }, + { + "epoch": 1.1826291197121512, + "grad_norm": 2.25, + "learning_rate": 6.612476754477055e-06, + "loss": 0.98, + "step": 5995 + }, + { + "epoch": 1.1828290147672471, + "grad_norm": 2.125, + "learning_rate": 6.6114792208014115e-06, + "loss": 0.9514, + "step": 5996 + }, + { + "epoch": 1.1830289098223432, + "grad_norm": 2.09375, + "learning_rate": 6.610481615544031e-06, + "loss": 0.9479, + "step": 5997 + }, + { + "epoch": 1.1832288048774393, + "grad_norm": 2.0625, + "learning_rate": 6.60948393874923e-06, + "loss": 0.9903, + "step": 5998 + }, + { + "epoch": 1.1834286999325354, + "grad_norm": 2.03125, + "learning_rate": 6.608486190461324e-06, + "loss": 1.0254, + "step": 5999 + }, + { + "epoch": 1.1836285949876315, + "grad_norm": 2.15625, + "learning_rate": 6.607488370724635e-06, + "loss": 0.9799, + "step": 6000 + }, + { + "epoch": 1.1836285949876315, + "eval_loss": 0.9062411189079285, + "eval_runtime": 594.9136, + "eval_samples_per_second": 3.594, + "eval_steps_per_second": 3.594, + "step": 6000 + }, + { + "epoch": 1.1838284900427276, + "grad_norm": 2.109375, + "learning_rate": 6.606490479583481e-06, + "loss": 0.9528, + "step": 6001 + }, + { + "epoch": 1.1840283850978237, + "grad_norm": 2.390625, + "learning_rate": 6.605492517082195e-06, + "loss": 1.0422, + "step": 6002 + }, + { + "epoch": 1.1842282801529198, + "grad_norm": 2.234375, + "learning_rate": 6.604494483265101e-06, + "loss": 0.9536, + "step": 6003 + }, + { + "epoch": 1.1844281752080157, + "grad_norm": 2.015625, + "learning_rate": 6.603496378176534e-06, + "loss": 0.9032, + "step": 6004 + }, + { + "epoch": 1.1846280702631118, + "grad_norm": 2.109375, + "learning_rate": 6.602498201860828e-06, + "loss": 0.9891, + "step": 6005 + }, + { + "epoch": 1.1848279653182079, + "grad_norm": 2.1875, + "learning_rate": 6.601499954362324e-06, + "loss": 0.9402, + "step": 6006 + }, + { + "epoch": 1.185027860373304, + "grad_norm": 2.0625, + "learning_rate": 6.6005016357253624e-06, + "loss": 0.9325, + "step": 6007 + }, + { + "epoch": 1.1852277554284, + "grad_norm": 2.125, + "learning_rate": 6.5995032459942895e-06, + "loss": 0.923, + "step": 6008 + }, + { + "epoch": 1.1854276504834962, + "grad_norm": 2.3125, + "learning_rate": 6.5985047852134535e-06, + "loss": 1.0531, + "step": 6009 + }, + { + "epoch": 1.1856275455385923, + "grad_norm": 2.328125, + "learning_rate": 6.597506253427206e-06, + "loss": 0.9276, + "step": 6010 + }, + { + "epoch": 1.1858274405936884, + "grad_norm": 2.109375, + "learning_rate": 6.5965076506799e-06, + "loss": 1.0075, + "step": 6011 + }, + { + "epoch": 1.1860273356487845, + "grad_norm": 2.078125, + "learning_rate": 6.595508977015897e-06, + "loss": 0.9152, + "step": 6012 + }, + { + "epoch": 1.1862272307038806, + "grad_norm": 2.0625, + "learning_rate": 6.594510232479553e-06, + "loss": 0.9343, + "step": 6013 + }, + { + "epoch": 1.1864271257589765, + "grad_norm": 2.15625, + "learning_rate": 6.5935114171152345e-06, + "loss": 0.9068, + "step": 6014 + }, + { + "epoch": 1.1866270208140726, + "grad_norm": 2.25, + "learning_rate": 6.592512530967312e-06, + "loss": 0.9557, + "step": 6015 + }, + { + "epoch": 1.1868269158691687, + "grad_norm": 2.09375, + "learning_rate": 6.591513574080152e-06, + "loss": 0.9594, + "step": 6016 + }, + { + "epoch": 1.1870268109242648, + "grad_norm": 2.140625, + "learning_rate": 6.590514546498128e-06, + "loss": 0.9499, + "step": 6017 + }, + { + "epoch": 1.1872267059793609, + "grad_norm": 2.15625, + "learning_rate": 6.589515448265619e-06, + "loss": 0.9983, + "step": 6018 + }, + { + "epoch": 1.187426601034457, + "grad_norm": 2.125, + "learning_rate": 6.588516279427002e-06, + "loss": 0.9502, + "step": 6019 + }, + { + "epoch": 1.187626496089553, + "grad_norm": 2.09375, + "learning_rate": 6.587517040026662e-06, + "loss": 0.9559, + "step": 6020 + }, + { + "epoch": 1.187826391144649, + "grad_norm": 2.234375, + "learning_rate": 6.586517730108985e-06, + "loss": 1.1202, + "step": 6021 + }, + { + "epoch": 1.188026286199745, + "grad_norm": 2.15625, + "learning_rate": 6.58551834971836e-06, + "loss": 0.9888, + "step": 6022 + }, + { + "epoch": 1.1882261812548411, + "grad_norm": 2.21875, + "learning_rate": 6.584518898899178e-06, + "loss": 1.0361, + "step": 6023 + }, + { + "epoch": 1.1884260763099372, + "grad_norm": 2.21875, + "learning_rate": 6.583519377695838e-06, + "loss": 0.989, + "step": 6024 + }, + { + "epoch": 1.1886259713650333, + "grad_norm": 2.078125, + "learning_rate": 6.582519786152735e-06, + "loss": 0.9821, + "step": 6025 + }, + { + "epoch": 1.1888258664201294, + "grad_norm": 2.234375, + "learning_rate": 6.581520124314271e-06, + "loss": 0.9858, + "step": 6026 + }, + { + "epoch": 1.1890257614752255, + "grad_norm": 2.25, + "learning_rate": 6.580520392224854e-06, + "loss": 1.0612, + "step": 6027 + }, + { + "epoch": 1.1892256565303216, + "grad_norm": 2.03125, + "learning_rate": 6.579520589928888e-06, + "loss": 0.9674, + "step": 6028 + }, + { + "epoch": 1.1894255515854177, + "grad_norm": 2.109375, + "learning_rate": 6.578520717470789e-06, + "loss": 0.948, + "step": 6029 + }, + { + "epoch": 1.1896254466405138, + "grad_norm": 2.203125, + "learning_rate": 6.577520774894967e-06, + "loss": 1.0566, + "step": 6030 + }, + { + "epoch": 1.1898253416956097, + "grad_norm": 2.296875, + "learning_rate": 6.57652076224584e-06, + "loss": 1.1256, + "step": 6031 + }, + { + "epoch": 1.1900252367507058, + "grad_norm": 2.21875, + "learning_rate": 6.57552067956783e-06, + "loss": 1.0127, + "step": 6032 + }, + { + "epoch": 1.190225131805802, + "grad_norm": 2.375, + "learning_rate": 6.574520526905358e-06, + "loss": 0.9052, + "step": 6033 + }, + { + "epoch": 1.190425026860898, + "grad_norm": 2.125, + "learning_rate": 6.573520304302853e-06, + "loss": 0.9465, + "step": 6034 + }, + { + "epoch": 1.1906249219159941, + "grad_norm": 2.1875, + "learning_rate": 6.572520011804745e-06, + "loss": 0.9954, + "step": 6035 + }, + { + "epoch": 1.1908248169710902, + "grad_norm": 2.140625, + "learning_rate": 6.571519649455464e-06, + "loss": 0.9809, + "step": 6036 + }, + { + "epoch": 1.1910247120261863, + "grad_norm": 2.15625, + "learning_rate": 6.5705192172994505e-06, + "loss": 0.9703, + "step": 6037 + }, + { + "epoch": 1.1912246070812824, + "grad_norm": 2.125, + "learning_rate": 6.56951871538114e-06, + "loss": 0.9372, + "step": 6038 + }, + { + "epoch": 1.1914245021363783, + "grad_norm": 2.140625, + "learning_rate": 6.568518143744977e-06, + "loss": 1.0186, + "step": 6039 + }, + { + "epoch": 1.1916243971914744, + "grad_norm": 2.34375, + "learning_rate": 6.567517502435403e-06, + "loss": 0.9661, + "step": 6040 + }, + { + "epoch": 1.1918242922465705, + "grad_norm": 2.015625, + "learning_rate": 6.5665167914968706e-06, + "loss": 0.8746, + "step": 6041 + }, + { + "epoch": 1.1920241873016666, + "grad_norm": 2.140625, + "learning_rate": 6.56551601097383e-06, + "loss": 1.0034, + "step": 6042 + }, + { + "epoch": 1.1922240823567627, + "grad_norm": 2.078125, + "learning_rate": 6.564515160910736e-06, + "loss": 1.093, + "step": 6043 + }, + { + "epoch": 1.1924239774118588, + "grad_norm": 2.0625, + "learning_rate": 6.563514241352043e-06, + "loss": 1.0007, + "step": 6044 + }, + { + "epoch": 1.192623872466955, + "grad_norm": 2.15625, + "learning_rate": 6.562513252342216e-06, + "loss": 0.9565, + "step": 6045 + }, + { + "epoch": 1.192823767522051, + "grad_norm": 2.109375, + "learning_rate": 6.561512193925718e-06, + "loss": 0.9181, + "step": 6046 + }, + { + "epoch": 1.193023662577147, + "grad_norm": 2.15625, + "learning_rate": 6.560511066147015e-06, + "loss": 0.9166, + "step": 6047 + }, + { + "epoch": 1.1932235576322432, + "grad_norm": 2.15625, + "learning_rate": 6.559509869050575e-06, + "loss": 1.0238, + "step": 6048 + }, + { + "epoch": 1.193423452687339, + "grad_norm": 2.1875, + "learning_rate": 6.558508602680876e-06, + "loss": 0.9868, + "step": 6049 + }, + { + "epoch": 1.1936233477424352, + "grad_norm": 2.1875, + "learning_rate": 6.557507267082391e-06, + "loss": 0.9623, + "step": 6050 + }, + { + "epoch": 1.1938232427975313, + "grad_norm": 2.0, + "learning_rate": 6.556505862299597e-06, + "loss": 0.9718, + "step": 6051 + }, + { + "epoch": 1.1940231378526274, + "grad_norm": 2.140625, + "learning_rate": 6.555504388376981e-06, + "loss": 0.9164, + "step": 6052 + }, + { + "epoch": 1.1942230329077235, + "grad_norm": 2.15625, + "learning_rate": 6.554502845359026e-06, + "loss": 1.0381, + "step": 6053 + }, + { + "epoch": 1.1944229279628196, + "grad_norm": 2.15625, + "learning_rate": 6.553501233290218e-06, + "loss": 1.0206, + "step": 6054 + }, + { + "epoch": 1.1946228230179157, + "grad_norm": 2.203125, + "learning_rate": 6.5524995522150545e-06, + "loss": 1.0788, + "step": 6055 + }, + { + "epoch": 1.1948227180730115, + "grad_norm": 2.140625, + "learning_rate": 6.551497802178025e-06, + "loss": 1.0028, + "step": 6056 + }, + { + "epoch": 1.1950226131281076, + "grad_norm": 2.140625, + "learning_rate": 6.55049598322363e-06, + "loss": 1.0223, + "step": 6057 + }, + { + "epoch": 1.1952225081832037, + "grad_norm": 2.140625, + "learning_rate": 6.549494095396368e-06, + "loss": 0.975, + "step": 6058 + }, + { + "epoch": 1.1954224032382998, + "grad_norm": 2.28125, + "learning_rate": 6.548492138740743e-06, + "loss": 1.0228, + "step": 6059 + }, + { + "epoch": 1.195622298293396, + "grad_norm": 2.15625, + "learning_rate": 6.547490113301265e-06, + "loss": 1.0139, + "step": 6060 + }, + { + "epoch": 1.195822193348492, + "grad_norm": 2.09375, + "learning_rate": 6.546488019122441e-06, + "loss": 0.9619, + "step": 6061 + }, + { + "epoch": 1.1960220884035881, + "grad_norm": 2.3125, + "learning_rate": 6.5454858562487835e-06, + "loss": 0.9858, + "step": 6062 + }, + { + "epoch": 1.1962219834586842, + "grad_norm": 2.03125, + "learning_rate": 6.544483624724809e-06, + "loss": 0.9372, + "step": 6063 + }, + { + "epoch": 1.1964218785137803, + "grad_norm": 2.28125, + "learning_rate": 6.543481324595037e-06, + "loss": 0.9888, + "step": 6064 + }, + { + "epoch": 1.1966217735688764, + "grad_norm": 2.015625, + "learning_rate": 6.54247895590399e-06, + "loss": 0.9256, + "step": 6065 + }, + { + "epoch": 1.1968216686239723, + "grad_norm": 2.09375, + "learning_rate": 6.541476518696191e-06, + "loss": 0.9669, + "step": 6066 + }, + { + "epoch": 1.1970215636790684, + "grad_norm": 2.140625, + "learning_rate": 6.5404740130161715e-06, + "loss": 0.9895, + "step": 6067 + }, + { + "epoch": 1.1972214587341645, + "grad_norm": 2.109375, + "learning_rate": 6.539471438908459e-06, + "loss": 0.8756, + "step": 6068 + }, + { + "epoch": 1.1974213537892606, + "grad_norm": 2.0625, + "learning_rate": 6.5384687964175915e-06, + "loss": 1.0367, + "step": 6069 + }, + { + "epoch": 1.1976212488443567, + "grad_norm": 2.140625, + "learning_rate": 6.537466085588104e-06, + "loss": 0.9823, + "step": 6070 + }, + { + "epoch": 1.1978211438994528, + "grad_norm": 2.078125, + "learning_rate": 6.536463306464535e-06, + "loss": 0.9332, + "step": 6071 + }, + { + "epoch": 1.198021038954549, + "grad_norm": 2.1875, + "learning_rate": 6.535460459091434e-06, + "loss": 0.9466, + "step": 6072 + }, + { + "epoch": 1.198220934009645, + "grad_norm": 2.125, + "learning_rate": 6.534457543513341e-06, + "loss": 0.9703, + "step": 6073 + }, + { + "epoch": 1.198420829064741, + "grad_norm": 2.234375, + "learning_rate": 6.5334545597748075e-06, + "loss": 1.1124, + "step": 6074 + }, + { + "epoch": 1.198620724119837, + "grad_norm": 2.078125, + "learning_rate": 6.532451507920386e-06, + "loss": 0.9838, + "step": 6075 + }, + { + "epoch": 1.198820619174933, + "grad_norm": 2.3125, + "learning_rate": 6.531448387994634e-06, + "loss": 0.9515, + "step": 6076 + }, + { + "epoch": 1.1990205142300292, + "grad_norm": 2.21875, + "learning_rate": 6.530445200042107e-06, + "loss": 1.0353, + "step": 6077 + }, + { + "epoch": 1.1992204092851253, + "grad_norm": 2.234375, + "learning_rate": 6.5294419441073684e-06, + "loss": 1.0373, + "step": 6078 + }, + { + "epoch": 1.1994203043402214, + "grad_norm": 2.203125, + "learning_rate": 6.528438620234981e-06, + "loss": 1.0811, + "step": 6079 + }, + { + "epoch": 1.1996201993953175, + "grad_norm": 2.078125, + "learning_rate": 6.5274352284695144e-06, + "loss": 0.9719, + "step": 6080 + }, + { + "epoch": 1.1998200944504136, + "grad_norm": 2.03125, + "learning_rate": 6.526431768855537e-06, + "loss": 0.8906, + "step": 6081 + }, + { + "epoch": 1.2000199895055097, + "grad_norm": 2.078125, + "learning_rate": 6.5254282414376235e-06, + "loss": 0.9426, + "step": 6082 + }, + { + "epoch": 1.2002198845606058, + "grad_norm": 2.09375, + "learning_rate": 6.524424646260351e-06, + "loss": 0.9419, + "step": 6083 + }, + { + "epoch": 1.2004197796157017, + "grad_norm": 2.125, + "learning_rate": 6.523420983368298e-06, + "loss": 0.9869, + "step": 6084 + }, + { + "epoch": 1.2006196746707978, + "grad_norm": 2.28125, + "learning_rate": 6.522417252806048e-06, + "loss": 0.9829, + "step": 6085 + }, + { + "epoch": 1.2008195697258939, + "grad_norm": 2.15625, + "learning_rate": 6.5214134546181865e-06, + "loss": 1.0244, + "step": 6086 + }, + { + "epoch": 1.20101946478099, + "grad_norm": 2.125, + "learning_rate": 6.520409588849301e-06, + "loss": 0.8331, + "step": 6087 + }, + { + "epoch": 1.201219359836086, + "grad_norm": 2.265625, + "learning_rate": 6.519405655543985e-06, + "loss": 0.8882, + "step": 6088 + }, + { + "epoch": 1.2014192548911822, + "grad_norm": 2.234375, + "learning_rate": 6.518401654746831e-06, + "loss": 0.9884, + "step": 6089 + }, + { + "epoch": 1.2016191499462783, + "grad_norm": 2.15625, + "learning_rate": 6.517397586502439e-06, + "loss": 0.9536, + "step": 6090 + }, + { + "epoch": 1.2018190450013744, + "grad_norm": 2.203125, + "learning_rate": 6.516393450855407e-06, + "loss": 0.9221, + "step": 6091 + }, + { + "epoch": 1.2020189400564703, + "grad_norm": 2.140625, + "learning_rate": 6.5153892478503414e-06, + "loss": 0.9778, + "step": 6092 + }, + { + "epoch": 1.2022188351115664, + "grad_norm": 2.21875, + "learning_rate": 6.514384977531846e-06, + "loss": 0.9758, + "step": 6093 + }, + { + "epoch": 1.2024187301666625, + "grad_norm": 2.15625, + "learning_rate": 6.513380639944532e-06, + "loss": 0.9534, + "step": 6094 + }, + { + "epoch": 1.2026186252217586, + "grad_norm": 2.171875, + "learning_rate": 6.512376235133011e-06, + "loss": 1.0497, + "step": 6095 + }, + { + "epoch": 1.2028185202768547, + "grad_norm": 2.078125, + "learning_rate": 6.511371763141899e-06, + "loss": 1.0026, + "step": 6096 + }, + { + "epoch": 1.2030184153319508, + "grad_norm": 2.21875, + "learning_rate": 6.510367224015817e-06, + "loss": 0.9807, + "step": 6097 + }, + { + "epoch": 1.2032183103870469, + "grad_norm": 2.078125, + "learning_rate": 6.5093626177993815e-06, + "loss": 1.0168, + "step": 6098 + }, + { + "epoch": 1.203418205442143, + "grad_norm": 2.09375, + "learning_rate": 6.508357944537221e-06, + "loss": 0.9322, + "step": 6099 + }, + { + "epoch": 1.203618100497239, + "grad_norm": 2.171875, + "learning_rate": 6.507353204273962e-06, + "loss": 1.0694, + "step": 6100 + }, + { + "epoch": 1.203817995552335, + "grad_norm": 2.28125, + "learning_rate": 6.506348397054233e-06, + "loss": 0.9876, + "step": 6101 + }, + { + "epoch": 1.204017890607431, + "grad_norm": 2.109375, + "learning_rate": 6.505343522922672e-06, + "loss": 1.0069, + "step": 6102 + }, + { + "epoch": 1.2042177856625271, + "grad_norm": 2.125, + "learning_rate": 6.5043385819239095e-06, + "loss": 0.9277, + "step": 6103 + }, + { + "epoch": 1.2044176807176232, + "grad_norm": 2.078125, + "learning_rate": 6.5033335741025885e-06, + "loss": 1.0108, + "step": 6104 + }, + { + "epoch": 1.2046175757727193, + "grad_norm": 2.125, + "learning_rate": 6.502328499503352e-06, + "loss": 1.0157, + "step": 6105 + }, + { + "epoch": 1.2048174708278154, + "grad_norm": 2.296875, + "learning_rate": 6.5013233581708425e-06, + "loss": 1.0746, + "step": 6106 + }, + { + "epoch": 1.2050173658829115, + "grad_norm": 2.109375, + "learning_rate": 6.500318150149711e-06, + "loss": 0.9329, + "step": 6107 + }, + { + "epoch": 1.2052172609380076, + "grad_norm": 2.1875, + "learning_rate": 6.499312875484608e-06, + "loss": 1.0009, + "step": 6108 + }, + { + "epoch": 1.2054171559931035, + "grad_norm": 2.296875, + "learning_rate": 6.498307534220186e-06, + "loss": 0.9681, + "step": 6109 + }, + { + "epoch": 1.2056170510481996, + "grad_norm": 2.140625, + "learning_rate": 6.497302126401103e-06, + "loss": 1.043, + "step": 6110 + }, + { + "epoch": 1.2058169461032957, + "grad_norm": 2.109375, + "learning_rate": 6.496296652072021e-06, + "loss": 0.9317, + "step": 6111 + }, + { + "epoch": 1.2060168411583918, + "grad_norm": 2.1875, + "learning_rate": 6.4952911112776e-06, + "loss": 0.8968, + "step": 6112 + }, + { + "epoch": 1.206216736213488, + "grad_norm": 2.125, + "learning_rate": 6.494285504062507e-06, + "loss": 0.9643, + "step": 6113 + }, + { + "epoch": 1.206416631268584, + "grad_norm": 2.125, + "learning_rate": 6.493279830471414e-06, + "loss": 0.9282, + "step": 6114 + }, + { + "epoch": 1.20661652632368, + "grad_norm": 2.109375, + "learning_rate": 6.492274090548989e-06, + "loss": 0.922, + "step": 6115 + }, + { + "epoch": 1.2068164213787762, + "grad_norm": 2.21875, + "learning_rate": 6.491268284339908e-06, + "loss": 1.0417, + "step": 6116 + }, + { + "epoch": 1.2070163164338723, + "grad_norm": 2.328125, + "learning_rate": 6.490262411888851e-06, + "loss": 1.0131, + "step": 6117 + }, + { + "epoch": 1.2072162114889684, + "grad_norm": 2.09375, + "learning_rate": 6.489256473240493e-06, + "loss": 0.9763, + "step": 6118 + }, + { + "epoch": 1.2074161065440643, + "grad_norm": 2.28125, + "learning_rate": 6.488250468439525e-06, + "loss": 0.9694, + "step": 6119 + }, + { + "epoch": 1.2076160015991604, + "grad_norm": 2.28125, + "learning_rate": 6.4872443975306275e-06, + "loss": 1.0717, + "step": 6120 + }, + { + "epoch": 1.2078158966542565, + "grad_norm": 2.25, + "learning_rate": 6.486238260558495e-06, + "loss": 1.0287, + "step": 6121 + }, + { + "epoch": 1.2080157917093526, + "grad_norm": 2.03125, + "learning_rate": 6.485232057567816e-06, + "loss": 0.9361, + "step": 6122 + }, + { + "epoch": 1.2082156867644487, + "grad_norm": 2.09375, + "learning_rate": 6.48422578860329e-06, + "loss": 0.9639, + "step": 6123 + }, + { + "epoch": 1.2084155818195448, + "grad_norm": 2.0625, + "learning_rate": 6.4832194537096105e-06, + "loss": 0.9281, + "step": 6124 + }, + { + "epoch": 1.2086154768746409, + "grad_norm": 2.4375, + "learning_rate": 6.482213052931483e-06, + "loss": 1.0782, + "step": 6125 + }, + { + "epoch": 1.208815371929737, + "grad_norm": 2.171875, + "learning_rate": 6.481206586313609e-06, + "loss": 0.9929, + "step": 6126 + }, + { + "epoch": 1.2090152669848329, + "grad_norm": 2.328125, + "learning_rate": 6.480200053900696e-06, + "loss": 0.92, + "step": 6127 + }, + { + "epoch": 1.209215162039929, + "grad_norm": 2.171875, + "learning_rate": 6.479193455737457e-06, + "loss": 0.9682, + "step": 6128 + }, + { + "epoch": 1.209415057095025, + "grad_norm": 2.203125, + "learning_rate": 6.4781867918686e-06, + "loss": 0.9514, + "step": 6129 + }, + { + "epoch": 1.2096149521501212, + "grad_norm": 2.15625, + "learning_rate": 6.477180062338845e-06, + "loss": 0.9545, + "step": 6130 + }, + { + "epoch": 1.2098148472052173, + "grad_norm": 2.265625, + "learning_rate": 6.47617326719291e-06, + "loss": 1.0176, + "step": 6131 + }, + { + "epoch": 1.2100147422603134, + "grad_norm": 2.15625, + "learning_rate": 6.475166406475515e-06, + "loss": 0.9661, + "step": 6132 + }, + { + "epoch": 1.2102146373154095, + "grad_norm": 2.078125, + "learning_rate": 6.474159480231386e-06, + "loss": 0.9227, + "step": 6133 + }, + { + "epoch": 1.2104145323705056, + "grad_norm": 2.171875, + "learning_rate": 6.47315248850525e-06, + "loss": 1.0038, + "step": 6134 + }, + { + "epoch": 1.2106144274256017, + "grad_norm": 2.28125, + "learning_rate": 6.472145431341838e-06, + "loss": 0.9451, + "step": 6135 + }, + { + "epoch": 1.2108143224806978, + "grad_norm": 2.21875, + "learning_rate": 6.471138308785885e-06, + "loss": 1.0021, + "step": 6136 + }, + { + "epoch": 1.2110142175357936, + "grad_norm": 2.109375, + "learning_rate": 6.4701311208821225e-06, + "loss": 0.9096, + "step": 6137 + }, + { + "epoch": 1.2112141125908897, + "grad_norm": 2.171875, + "learning_rate": 6.4691238676752935e-06, + "loss": 0.934, + "step": 6138 + }, + { + "epoch": 1.2114140076459858, + "grad_norm": 2.234375, + "learning_rate": 6.468116549210142e-06, + "loss": 1.0087, + "step": 6139 + }, + { + "epoch": 1.211613902701082, + "grad_norm": 2.28125, + "learning_rate": 6.467109165531407e-06, + "loss": 0.986, + "step": 6140 + }, + { + "epoch": 1.211813797756178, + "grad_norm": 2.140625, + "learning_rate": 6.46610171668384e-06, + "loss": 1.0162, + "step": 6141 + }, + { + "epoch": 1.2120136928112741, + "grad_norm": 2.25, + "learning_rate": 6.465094202712192e-06, + "loss": 1.0288, + "step": 6142 + }, + { + "epoch": 1.2122135878663702, + "grad_norm": 2.1875, + "learning_rate": 6.464086623661215e-06, + "loss": 0.8564, + "step": 6143 + }, + { + "epoch": 1.2124134829214661, + "grad_norm": 2.046875, + "learning_rate": 6.463078979575667e-06, + "loss": 0.9441, + "step": 6144 + }, + { + "epoch": 1.2126133779765622, + "grad_norm": 2.140625, + "learning_rate": 6.462071270500308e-06, + "loss": 0.9968, + "step": 6145 + }, + { + "epoch": 1.2128132730316583, + "grad_norm": 2.1875, + "learning_rate": 6.461063496479899e-06, + "loss": 1.0823, + "step": 6146 + }, + { + "epoch": 1.2130131680867544, + "grad_norm": 2.09375, + "learning_rate": 6.460055657559206e-06, + "loss": 0.9145, + "step": 6147 + }, + { + "epoch": 1.2132130631418505, + "grad_norm": 2.125, + "learning_rate": 6.459047753782994e-06, + "loss": 0.9598, + "step": 6148 + }, + { + "epoch": 1.2134129581969466, + "grad_norm": 2.25, + "learning_rate": 6.458039785196039e-06, + "loss": 1.0672, + "step": 6149 + }, + { + "epoch": 1.2136128532520427, + "grad_norm": 2.234375, + "learning_rate": 6.457031751843113e-06, + "loss": 0.9857, + "step": 6150 + }, + { + "epoch": 1.2138127483071388, + "grad_norm": 2.125, + "learning_rate": 6.4560236537689905e-06, + "loss": 0.9374, + "step": 6151 + }, + { + "epoch": 1.214012643362235, + "grad_norm": 2.0625, + "learning_rate": 6.455015491018452e-06, + "loss": 1.0154, + "step": 6152 + }, + { + "epoch": 1.214212538417331, + "grad_norm": 2.09375, + "learning_rate": 6.454007263636283e-06, + "loss": 0.9403, + "step": 6153 + }, + { + "epoch": 1.2144124334724269, + "grad_norm": 2.046875, + "learning_rate": 6.452998971667266e-06, + "loss": 0.9519, + "step": 6154 + }, + { + "epoch": 1.214612328527523, + "grad_norm": 2.140625, + "learning_rate": 6.451990615156189e-06, + "loss": 0.9895, + "step": 6155 + }, + { + "epoch": 1.214812223582619, + "grad_norm": 2.140625, + "learning_rate": 6.4509821941478455e-06, + "loss": 0.9187, + "step": 6156 + }, + { + "epoch": 1.2150121186377152, + "grad_norm": 2.171875, + "learning_rate": 6.4499737086870276e-06, + "loss": 0.9367, + "step": 6157 + }, + { + "epoch": 1.2152120136928113, + "grad_norm": 2.09375, + "learning_rate": 6.448965158818531e-06, + "loss": 0.9306, + "step": 6158 + }, + { + "epoch": 1.2154119087479074, + "grad_norm": 2.3125, + "learning_rate": 6.447956544587158e-06, + "loss": 1.0404, + "step": 6159 + }, + { + "epoch": 1.2156118038030035, + "grad_norm": 2.140625, + "learning_rate": 6.44694786603771e-06, + "loss": 0.961, + "step": 6160 + }, + { + "epoch": 1.2158116988580996, + "grad_norm": 2.328125, + "learning_rate": 6.445939123214991e-06, + "loss": 1.0009, + "step": 6161 + }, + { + "epoch": 1.2160115939131955, + "grad_norm": 2.0, + "learning_rate": 6.444930316163812e-06, + "loss": 0.9166, + "step": 6162 + }, + { + "epoch": 1.2162114889682916, + "grad_norm": 2.125, + "learning_rate": 6.443921444928982e-06, + "loss": 0.9853, + "step": 6163 + }, + { + "epoch": 1.2164113840233877, + "grad_norm": 2.125, + "learning_rate": 6.442912509555316e-06, + "loss": 0.913, + "step": 6164 + }, + { + "epoch": 1.2166112790784838, + "grad_norm": 2.15625, + "learning_rate": 6.44190351008763e-06, + "loss": 0.9804, + "step": 6165 + }, + { + "epoch": 1.2168111741335799, + "grad_norm": 2.171875, + "learning_rate": 6.4408944465707435e-06, + "loss": 0.993, + "step": 6166 + }, + { + "epoch": 1.217011069188676, + "grad_norm": 2.171875, + "learning_rate": 6.439885319049482e-06, + "loss": 0.9392, + "step": 6167 + }, + { + "epoch": 1.217210964243772, + "grad_norm": 2.078125, + "learning_rate": 6.438876127568665e-06, + "loss": 0.9837, + "step": 6168 + }, + { + "epoch": 1.2174108592988682, + "grad_norm": 2.109375, + "learning_rate": 6.437866872173127e-06, + "loss": 0.9335, + "step": 6169 + }, + { + "epoch": 1.2176107543539643, + "grad_norm": 2.046875, + "learning_rate": 6.436857552907696e-06, + "loss": 0.9545, + "step": 6170 + }, + { + "epoch": 1.2178106494090604, + "grad_norm": 2.078125, + "learning_rate": 6.435848169817205e-06, + "loss": 1.0012, + "step": 6171 + }, + { + "epoch": 1.2180105444641562, + "grad_norm": 2.171875, + "learning_rate": 6.43483872294649e-06, + "loss": 1.0375, + "step": 6172 + }, + { + "epoch": 1.2182104395192523, + "grad_norm": 2.09375, + "learning_rate": 6.433829212340394e-06, + "loss": 1.0023, + "step": 6173 + }, + { + "epoch": 1.2184103345743484, + "grad_norm": 2.046875, + "learning_rate": 6.432819638043758e-06, + "loss": 0.9833, + "step": 6174 + }, + { + "epoch": 1.2186102296294445, + "grad_norm": 2.046875, + "learning_rate": 6.431810000101425e-06, + "loss": 0.9473, + "step": 6175 + }, + { + "epoch": 1.2188101246845406, + "grad_norm": 2.359375, + "learning_rate": 6.430800298558246e-06, + "loss": 0.9796, + "step": 6176 + }, + { + "epoch": 1.2190100197396367, + "grad_norm": 2.171875, + "learning_rate": 6.429790533459071e-06, + "loss": 1.0393, + "step": 6177 + }, + { + "epoch": 1.2192099147947328, + "grad_norm": 2.203125, + "learning_rate": 6.428780704848753e-06, + "loss": 1.0952, + "step": 6178 + }, + { + "epoch": 1.2194098098498287, + "grad_norm": 2.234375, + "learning_rate": 6.427770812772147e-06, + "loss": 0.967, + "step": 6179 + }, + { + "epoch": 1.2196097049049248, + "grad_norm": 2.296875, + "learning_rate": 6.426760857274115e-06, + "loss": 0.9787, + "step": 6180 + }, + { + "epoch": 1.219809599960021, + "grad_norm": 2.15625, + "learning_rate": 6.425750838399519e-06, + "loss": 0.9558, + "step": 6181 + }, + { + "epoch": 1.220009495015117, + "grad_norm": 2.109375, + "learning_rate": 6.4247407561932215e-06, + "loss": 0.9681, + "step": 6182 + }, + { + "epoch": 1.2202093900702131, + "grad_norm": 2.21875, + "learning_rate": 6.423730610700092e-06, + "loss": 1.1079, + "step": 6183 + }, + { + "epoch": 1.2204092851253092, + "grad_norm": 2.21875, + "learning_rate": 6.422720401965003e-06, + "loss": 0.9808, + "step": 6184 + }, + { + "epoch": 1.2206091801804053, + "grad_norm": 2.3125, + "learning_rate": 6.421710130032824e-06, + "loss": 1.0888, + "step": 6185 + }, + { + "epoch": 1.2208090752355014, + "grad_norm": 2.109375, + "learning_rate": 6.420699794948433e-06, + "loss": 0.9608, + "step": 6186 + }, + { + "epoch": 1.2210089702905975, + "grad_norm": 2.1875, + "learning_rate": 6.419689396756709e-06, + "loss": 1.0215, + "step": 6187 + }, + { + "epoch": 1.2212088653456936, + "grad_norm": 2.1875, + "learning_rate": 6.418678935502534e-06, + "loss": 0.9127, + "step": 6188 + }, + { + "epoch": 1.2214087604007895, + "grad_norm": 2.1875, + "learning_rate": 6.4176684112307905e-06, + "loss": 0.9422, + "step": 6189 + }, + { + "epoch": 1.2216086554558856, + "grad_norm": 2.046875, + "learning_rate": 6.41665782398637e-06, + "loss": 1.0003, + "step": 6190 + }, + { + "epoch": 1.2218085505109817, + "grad_norm": 2.1875, + "learning_rate": 6.415647173814158e-06, + "loss": 0.9936, + "step": 6191 + }, + { + "epoch": 1.2220084455660778, + "grad_norm": 2.234375, + "learning_rate": 6.414636460759052e-06, + "loss": 1.034, + "step": 6192 + }, + { + "epoch": 1.222208340621174, + "grad_norm": 2.0, + "learning_rate": 6.413625684865942e-06, + "loss": 0.8903, + "step": 6193 + }, + { + "epoch": 1.22240823567627, + "grad_norm": 2.140625, + "learning_rate": 6.412614846179734e-06, + "loss": 0.9948, + "step": 6194 + }, + { + "epoch": 1.222608130731366, + "grad_norm": 2.171875, + "learning_rate": 6.411603944745323e-06, + "loss": 1.0735, + "step": 6195 + }, + { + "epoch": 1.2228080257864622, + "grad_norm": 2.15625, + "learning_rate": 6.410592980607616e-06, + "loss": 0.8989, + "step": 6196 + }, + { + "epoch": 1.223007920841558, + "grad_norm": 1.984375, + "learning_rate": 6.409581953811519e-06, + "loss": 0.8938, + "step": 6197 + }, + { + "epoch": 1.2232078158966542, + "grad_norm": 2.140625, + "learning_rate": 6.408570864401944e-06, + "loss": 1.0571, + "step": 6198 + }, + { + "epoch": 1.2234077109517503, + "grad_norm": 2.15625, + "learning_rate": 6.4075597124238e-06, + "loss": 1.0363, + "step": 6199 + }, + { + "epoch": 1.2236076060068464, + "grad_norm": 2.03125, + "learning_rate": 6.4065484979220035e-06, + "loss": 0.938, + "step": 6200 + }, + { + "epoch": 1.2238075010619425, + "grad_norm": 2.09375, + "learning_rate": 6.405537220941475e-06, + "loss": 0.9658, + "step": 6201 + }, + { + "epoch": 1.2240073961170386, + "grad_norm": 2.078125, + "learning_rate": 6.404525881527133e-06, + "loss": 0.9345, + "step": 6202 + }, + { + "epoch": 1.2242072911721347, + "grad_norm": 2.109375, + "learning_rate": 6.4035144797239e-06, + "loss": 1.0204, + "step": 6203 + }, + { + "epoch": 1.2244071862272308, + "grad_norm": 2.125, + "learning_rate": 6.402503015576705e-06, + "loss": 0.9567, + "step": 6204 + }, + { + "epoch": 1.2246070812823269, + "grad_norm": 2.171875, + "learning_rate": 6.401491489130474e-06, + "loss": 0.9587, + "step": 6205 + }, + { + "epoch": 1.224806976337423, + "grad_norm": 2.3125, + "learning_rate": 6.400479900430141e-06, + "loss": 1.0711, + "step": 6206 + }, + { + "epoch": 1.2250068713925188, + "grad_norm": 2.234375, + "learning_rate": 6.399468249520641e-06, + "loss": 0.9917, + "step": 6207 + }, + { + "epoch": 1.225206766447615, + "grad_norm": 2.390625, + "learning_rate": 6.398456536446912e-06, + "loss": 1.0016, + "step": 6208 + }, + { + "epoch": 1.225406661502711, + "grad_norm": 2.140625, + "learning_rate": 6.397444761253892e-06, + "loss": 0.9884, + "step": 6209 + }, + { + "epoch": 1.2256065565578071, + "grad_norm": 2.21875, + "learning_rate": 6.396432923986525e-06, + "loss": 1.0465, + "step": 6210 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 2.046875, + "learning_rate": 6.3954210246897565e-06, + "loss": 0.8695, + "step": 6211 + }, + { + "epoch": 1.2260063466679993, + "grad_norm": 2.25, + "learning_rate": 6.3944090634085355e-06, + "loss": 1.0473, + "step": 6212 + }, + { + "epoch": 1.2262062417230954, + "grad_norm": 2.171875, + "learning_rate": 6.393397040187812e-06, + "loss": 0.928, + "step": 6213 + }, + { + "epoch": 1.2264061367781913, + "grad_norm": 2.0625, + "learning_rate": 6.392384955072541e-06, + "loss": 0.889, + "step": 6214 + }, + { + "epoch": 1.2266060318332874, + "grad_norm": 2.09375, + "learning_rate": 6.39137280810768e-06, + "loss": 0.9301, + "step": 6215 + }, + { + "epoch": 1.2268059268883835, + "grad_norm": 2.015625, + "learning_rate": 6.390360599338188e-06, + "loss": 0.947, + "step": 6216 + }, + { + "epoch": 1.2270058219434796, + "grad_norm": 2.203125, + "learning_rate": 6.389348328809024e-06, + "loss": 1.0382, + "step": 6217 + }, + { + "epoch": 1.2272057169985757, + "grad_norm": 2.203125, + "learning_rate": 6.388335996565158e-06, + "loss": 1.0804, + "step": 6218 + }, + { + "epoch": 1.2274056120536718, + "grad_norm": 2.046875, + "learning_rate": 6.387323602651554e-06, + "loss": 0.894, + "step": 6219 + }, + { + "epoch": 1.227605507108768, + "grad_norm": 2.171875, + "learning_rate": 6.386311147113185e-06, + "loss": 1.0323, + "step": 6220 + }, + { + "epoch": 1.227805402163864, + "grad_norm": 2.296875, + "learning_rate": 6.385298629995021e-06, + "loss": 1.0044, + "step": 6221 + }, + { + "epoch": 1.2280052972189601, + "grad_norm": 2.375, + "learning_rate": 6.3842860513420416e-06, + "loss": 0.9877, + "step": 6222 + }, + { + "epoch": 1.2282051922740562, + "grad_norm": 2.328125, + "learning_rate": 6.3832734111992236e-06, + "loss": 0.9811, + "step": 6223 + }, + { + "epoch": 1.228405087329152, + "grad_norm": 2.359375, + "learning_rate": 6.382260709611547e-06, + "loss": 1.006, + "step": 6224 + }, + { + "epoch": 1.2286049823842482, + "grad_norm": 2.21875, + "learning_rate": 6.381247946623997e-06, + "loss": 1.0315, + "step": 6225 + }, + { + "epoch": 1.2288048774393443, + "grad_norm": 2.15625, + "learning_rate": 6.380235122281563e-06, + "loss": 0.9488, + "step": 6226 + }, + { + "epoch": 1.2290047724944404, + "grad_norm": 2.140625, + "learning_rate": 6.379222236629231e-06, + "loss": 0.9502, + "step": 6227 + }, + { + "epoch": 1.2292046675495365, + "grad_norm": 2.046875, + "learning_rate": 6.378209289711994e-06, + "loss": 0.9568, + "step": 6228 + }, + { + "epoch": 1.2294045626046326, + "grad_norm": 2.21875, + "learning_rate": 6.377196281574849e-06, + "loss": 0.9754, + "step": 6229 + }, + { + "epoch": 1.2296044576597287, + "grad_norm": 2.109375, + "learning_rate": 6.376183212262791e-06, + "loss": 0.9237, + "step": 6230 + }, + { + "epoch": 1.2298043527148248, + "grad_norm": 2.203125, + "learning_rate": 6.375170081820823e-06, + "loss": 1.066, + "step": 6231 + }, + { + "epoch": 1.2300042477699207, + "grad_norm": 2.15625, + "learning_rate": 6.374156890293947e-06, + "loss": 1.0714, + "step": 6232 + }, + { + "epoch": 1.2302041428250168, + "grad_norm": 2.15625, + "learning_rate": 6.373143637727166e-06, + "loss": 0.9691, + "step": 6233 + }, + { + "epoch": 1.2304040378801129, + "grad_norm": 2.21875, + "learning_rate": 6.372130324165493e-06, + "loss": 0.8767, + "step": 6234 + }, + { + "epoch": 1.230603932935209, + "grad_norm": 2.1875, + "learning_rate": 6.3711169496539385e-06, + "loss": 0.9945, + "step": 6235 + }, + { + "epoch": 1.230803827990305, + "grad_norm": 2.125, + "learning_rate": 6.370103514237513e-06, + "loss": 0.9998, + "step": 6236 + }, + { + "epoch": 1.2310037230454012, + "grad_norm": 2.140625, + "learning_rate": 6.3690900179612395e-06, + "loss": 0.9744, + "step": 6237 + }, + { + "epoch": 1.2312036181004973, + "grad_norm": 2.09375, + "learning_rate": 6.36807646087013e-06, + "loss": 1.0184, + "step": 6238 + }, + { + "epoch": 1.2314035131555934, + "grad_norm": 2.140625, + "learning_rate": 6.367062843009211e-06, + "loss": 0.9687, + "step": 6239 + }, + { + "epoch": 1.2316034082106895, + "grad_norm": 1.9921875, + "learning_rate": 6.366049164423508e-06, + "loss": 0.9406, + "step": 6240 + }, + { + "epoch": 1.2318033032657856, + "grad_norm": 2.0625, + "learning_rate": 6.365035425158046e-06, + "loss": 0.9448, + "step": 6241 + }, + { + "epoch": 1.2320031983208815, + "grad_norm": 2.46875, + "learning_rate": 6.364021625257856e-06, + "loss": 1.001, + "step": 6242 + }, + { + "epoch": 1.2322030933759776, + "grad_norm": 2.1875, + "learning_rate": 6.363007764767972e-06, + "loss": 0.9919, + "step": 6243 + }, + { + "epoch": 1.2324029884310737, + "grad_norm": 2.078125, + "learning_rate": 6.361993843733427e-06, + "loss": 1.0646, + "step": 6244 + }, + { + "epoch": 1.2326028834861698, + "grad_norm": 2.125, + "learning_rate": 6.360979862199262e-06, + "loss": 0.9798, + "step": 6245 + }, + { + "epoch": 1.2328027785412659, + "grad_norm": 2.046875, + "learning_rate": 6.3599658202105175e-06, + "loss": 0.9565, + "step": 6246 + }, + { + "epoch": 1.233002673596362, + "grad_norm": 2.109375, + "learning_rate": 6.3589517178122365e-06, + "loss": 0.8722, + "step": 6247 + }, + { + "epoch": 1.233202568651458, + "grad_norm": 2.09375, + "learning_rate": 6.357937555049465e-06, + "loss": 0.9004, + "step": 6248 + }, + { + "epoch": 1.2334024637065542, + "grad_norm": 2.25, + "learning_rate": 6.356923331967252e-06, + "loss": 1.0188, + "step": 6249 + }, + { + "epoch": 1.23360235876165, + "grad_norm": 2.484375, + "learning_rate": 6.355909048610649e-06, + "loss": 0.9297, + "step": 6250 + }, + { + "epoch": 1.2338022538167461, + "grad_norm": 2.078125, + "learning_rate": 6.354894705024711e-06, + "loss": 0.9353, + "step": 6251 + }, + { + "epoch": 1.2340021488718422, + "grad_norm": 2.546875, + "learning_rate": 6.353880301254496e-06, + "loss": 1.0263, + "step": 6252 + }, + { + "epoch": 1.2342020439269383, + "grad_norm": 2.09375, + "learning_rate": 6.352865837345061e-06, + "loss": 1.0051, + "step": 6253 + }, + { + "epoch": 1.2344019389820344, + "grad_norm": 2.171875, + "learning_rate": 6.351851313341473e-06, + "loss": 0.9922, + "step": 6254 + }, + { + "epoch": 1.2346018340371305, + "grad_norm": 2.28125, + "learning_rate": 6.350836729288792e-06, + "loss": 0.9041, + "step": 6255 + }, + { + "epoch": 1.2348017290922266, + "grad_norm": 2.234375, + "learning_rate": 6.349822085232087e-06, + "loss": 0.9869, + "step": 6256 + }, + { + "epoch": 1.2350016241473227, + "grad_norm": 2.234375, + "learning_rate": 6.348807381216431e-06, + "loss": 0.9799, + "step": 6257 + }, + { + "epoch": 1.2352015192024188, + "grad_norm": 2.21875, + "learning_rate": 6.347792617286893e-06, + "loss": 1.0542, + "step": 6258 + }, + { + "epoch": 1.235401414257515, + "grad_norm": 2.15625, + "learning_rate": 6.346777793488552e-06, + "loss": 1.0436, + "step": 6259 + }, + { + "epoch": 1.2356013093126108, + "grad_norm": 2.140625, + "learning_rate": 6.345762909866485e-06, + "loss": 1.0035, + "step": 6260 + }, + { + "epoch": 1.235801204367707, + "grad_norm": 2.125, + "learning_rate": 6.344747966465774e-06, + "loss": 0.9534, + "step": 6261 + }, + { + "epoch": 1.236001099422803, + "grad_norm": 2.15625, + "learning_rate": 6.343732963331502e-06, + "loss": 0.9065, + "step": 6262 + }, + { + "epoch": 1.236200994477899, + "grad_norm": 2.078125, + "learning_rate": 6.342717900508755e-06, + "loss": 0.9186, + "step": 6263 + }, + { + "epoch": 1.2364008895329952, + "grad_norm": 2.265625, + "learning_rate": 6.341702778042622e-06, + "loss": 1.046, + "step": 6264 + }, + { + "epoch": 1.2366007845880913, + "grad_norm": 2.203125, + "learning_rate": 6.3406875959781944e-06, + "loss": 0.9073, + "step": 6265 + }, + { + "epoch": 1.2368006796431874, + "grad_norm": 2.28125, + "learning_rate": 6.339672354360568e-06, + "loss": 0.9849, + "step": 6266 + }, + { + "epoch": 1.2370005746982833, + "grad_norm": 2.078125, + "learning_rate": 6.338657053234838e-06, + "loss": 0.9251, + "step": 6267 + }, + { + "epoch": 1.2372004697533794, + "grad_norm": 2.359375, + "learning_rate": 6.337641692646106e-06, + "loss": 0.9606, + "step": 6268 + }, + { + "epoch": 1.2374003648084755, + "grad_norm": 2.140625, + "learning_rate": 6.336626272639471e-06, + "loss": 0.9163, + "step": 6269 + }, + { + "epoch": 1.2376002598635716, + "grad_norm": 2.21875, + "learning_rate": 6.3356107932600396e-06, + "loss": 0.9725, + "step": 6270 + }, + { + "epoch": 1.2378001549186677, + "grad_norm": 2.03125, + "learning_rate": 6.334595254552921e-06, + "loss": 0.9324, + "step": 6271 + }, + { + "epoch": 1.2380000499737638, + "grad_norm": 2.21875, + "learning_rate": 6.333579656563222e-06, + "loss": 0.9135, + "step": 6272 + }, + { + "epoch": 1.2381999450288599, + "grad_norm": 2.40625, + "learning_rate": 6.332563999336059e-06, + "loss": 1.0486, + "step": 6273 + }, + { + "epoch": 1.238399840083956, + "grad_norm": 2.03125, + "learning_rate": 6.331548282916545e-06, + "loss": 0.9279, + "step": 6274 + }, + { + "epoch": 1.238599735139052, + "grad_norm": 2.09375, + "learning_rate": 6.330532507349798e-06, + "loss": 0.9609, + "step": 6275 + }, + { + "epoch": 1.2387996301941482, + "grad_norm": 2.0625, + "learning_rate": 6.3295166726809396e-06, + "loss": 0.8932, + "step": 6276 + }, + { + "epoch": 1.238999525249244, + "grad_norm": 2.140625, + "learning_rate": 6.328500778955091e-06, + "loss": 0.9976, + "step": 6277 + }, + { + "epoch": 1.2391994203043402, + "grad_norm": 2.140625, + "learning_rate": 6.327484826217382e-06, + "loss": 0.9441, + "step": 6278 + }, + { + "epoch": 1.2393993153594363, + "grad_norm": 2.1875, + "learning_rate": 6.326468814512937e-06, + "loss": 0.9518, + "step": 6279 + }, + { + "epoch": 1.2395992104145324, + "grad_norm": 2.078125, + "learning_rate": 6.325452743886891e-06, + "loss": 0.9586, + "step": 6280 + }, + { + "epoch": 1.2397991054696285, + "grad_norm": 2.203125, + "learning_rate": 6.324436614384374e-06, + "loss": 1.0174, + "step": 6281 + }, + { + "epoch": 1.2399990005247246, + "grad_norm": 2.078125, + "learning_rate": 6.3234204260505235e-06, + "loss": 0.9922, + "step": 6282 + }, + { + "epoch": 1.2401988955798207, + "grad_norm": 2.0625, + "learning_rate": 6.3224041789304804e-06, + "loss": 1.0269, + "step": 6283 + }, + { + "epoch": 1.2403987906349168, + "grad_norm": 2.109375, + "learning_rate": 6.321387873069384e-06, + "loss": 0.9655, + "step": 6284 + }, + { + "epoch": 1.2405986856900126, + "grad_norm": 2.15625, + "learning_rate": 6.3203715085123805e-06, + "loss": 0.9162, + "step": 6285 + }, + { + "epoch": 1.2407985807451087, + "grad_norm": 2.0625, + "learning_rate": 6.319355085304615e-06, + "loss": 1.0348, + "step": 6286 + }, + { + "epoch": 1.2409984758002048, + "grad_norm": 3.765625, + "learning_rate": 6.3183386034912364e-06, + "loss": 1.0138, + "step": 6287 + }, + { + "epoch": 1.241198370855301, + "grad_norm": 2.171875, + "learning_rate": 6.317322063117399e-06, + "loss": 1.0809, + "step": 6288 + }, + { + "epoch": 1.241398265910397, + "grad_norm": 2.15625, + "learning_rate": 6.316305464228256e-06, + "loss": 0.9187, + "step": 6289 + }, + { + "epoch": 1.2415981609654931, + "grad_norm": 2.15625, + "learning_rate": 6.315288806868964e-06, + "loss": 0.99, + "step": 6290 + }, + { + "epoch": 1.2417980560205892, + "grad_norm": 2.203125, + "learning_rate": 6.314272091084686e-06, + "loss": 1.0059, + "step": 6291 + }, + { + "epoch": 1.2419979510756853, + "grad_norm": 2.046875, + "learning_rate": 6.31325531692058e-06, + "loss": 0.9138, + "step": 6292 + }, + { + "epoch": 1.2421978461307814, + "grad_norm": 2.109375, + "learning_rate": 6.312238484421815e-06, + "loss": 0.948, + "step": 6293 + }, + { + "epoch": 1.2423977411858775, + "grad_norm": 2.21875, + "learning_rate": 6.311221593633557e-06, + "loss": 1.025, + "step": 6294 + }, + { + "epoch": 1.2425976362409734, + "grad_norm": 2.171875, + "learning_rate": 6.310204644600974e-06, + "loss": 1.0143, + "step": 6295 + }, + { + "epoch": 1.2427975312960695, + "grad_norm": 2.203125, + "learning_rate": 6.30918763736924e-06, + "loss": 0.926, + "step": 6296 + }, + { + "epoch": 1.2429974263511656, + "grad_norm": 2.25, + "learning_rate": 6.3081705719835316e-06, + "loss": 1.0279, + "step": 6297 + }, + { + "epoch": 1.2431973214062617, + "grad_norm": 2.03125, + "learning_rate": 6.307153448489028e-06, + "loss": 0.9368, + "step": 6298 + }, + { + "epoch": 1.2433972164613578, + "grad_norm": 2.21875, + "learning_rate": 6.306136266930906e-06, + "loss": 0.9261, + "step": 6299 + }, + { + "epoch": 1.243597111516454, + "grad_norm": 2.171875, + "learning_rate": 6.305119027354349e-06, + "loss": 0.9971, + "step": 6300 + }, + { + "epoch": 1.24379700657155, + "grad_norm": 2.0625, + "learning_rate": 6.304101729804546e-06, + "loss": 0.9341, + "step": 6301 + }, + { + "epoch": 1.243996901626646, + "grad_norm": 2.078125, + "learning_rate": 6.303084374326685e-06, + "loss": 1.0005, + "step": 6302 + }, + { + "epoch": 1.244196796681742, + "grad_norm": 2.09375, + "learning_rate": 6.302066960965954e-06, + "loss": 0.8845, + "step": 6303 + }, + { + "epoch": 1.244396691736838, + "grad_norm": 3.03125, + "learning_rate": 6.301049489767545e-06, + "loss": 1.0183, + "step": 6304 + }, + { + "epoch": 1.2445965867919342, + "grad_norm": 2.234375, + "learning_rate": 6.30003196077666e-06, + "loss": 0.9922, + "step": 6305 + }, + { + "epoch": 1.2447964818470303, + "grad_norm": 2.078125, + "learning_rate": 6.299014374038493e-06, + "loss": 0.9394, + "step": 6306 + }, + { + "epoch": 1.2449963769021264, + "grad_norm": 2.109375, + "learning_rate": 6.297996729598247e-06, + "loss": 0.9718, + "step": 6307 + }, + { + "epoch": 1.2451962719572225, + "grad_norm": 2.203125, + "learning_rate": 6.2969790275011245e-06, + "loss": 1.0739, + "step": 6308 + }, + { + "epoch": 1.2453961670123186, + "grad_norm": 2.109375, + "learning_rate": 6.295961267792332e-06, + "loss": 0.9442, + "step": 6309 + }, + { + "epoch": 1.2455960620674147, + "grad_norm": 2.0625, + "learning_rate": 6.294943450517078e-06, + "loss": 0.926, + "step": 6310 + }, + { + "epoch": 1.2457959571225108, + "grad_norm": 2.078125, + "learning_rate": 6.293925575720575e-06, + "loss": 0.991, + "step": 6311 + }, + { + "epoch": 1.2459958521776067, + "grad_norm": 2.078125, + "learning_rate": 6.292907643448035e-06, + "loss": 0.9396, + "step": 6312 + }, + { + "epoch": 1.2461957472327028, + "grad_norm": 2.171875, + "learning_rate": 6.291889653744677e-06, + "loss": 1.037, + "step": 6313 + }, + { + "epoch": 1.2463956422877989, + "grad_norm": 2.1875, + "learning_rate": 6.290871606655718e-06, + "loss": 0.9848, + "step": 6314 + }, + { + "epoch": 1.246595537342895, + "grad_norm": 2.09375, + "learning_rate": 6.2898535022263795e-06, + "loss": 0.9971, + "step": 6315 + }, + { + "epoch": 1.246795432397991, + "grad_norm": 2.09375, + "learning_rate": 6.288835340501886e-06, + "loss": 1.0538, + "step": 6316 + }, + { + "epoch": 1.2469953274530872, + "grad_norm": 2.125, + "learning_rate": 6.287817121527465e-06, + "loss": 1.141, + "step": 6317 + }, + { + "epoch": 1.2471952225081833, + "grad_norm": 2.234375, + "learning_rate": 6.286798845348345e-06, + "loss": 1.0421, + "step": 6318 + }, + { + "epoch": 1.2473951175632794, + "grad_norm": 2.109375, + "learning_rate": 6.285780512009758e-06, + "loss": 0.9491, + "step": 6319 + }, + { + "epoch": 1.2475950126183752, + "grad_norm": 2.140625, + "learning_rate": 6.284762121556937e-06, + "loss": 0.9259, + "step": 6320 + }, + { + "epoch": 1.2477949076734713, + "grad_norm": 2.125, + "learning_rate": 6.283743674035121e-06, + "loss": 0.9349, + "step": 6321 + }, + { + "epoch": 1.2479948027285674, + "grad_norm": 2.09375, + "learning_rate": 6.282725169489547e-06, + "loss": 0.8958, + "step": 6322 + }, + { + "epoch": 1.2481946977836635, + "grad_norm": 2.0625, + "learning_rate": 6.281706607965459e-06, + "loss": 0.8789, + "step": 6323 + }, + { + "epoch": 1.2483945928387596, + "grad_norm": 2.234375, + "learning_rate": 6.2806879895081006e-06, + "loss": 1.0401, + "step": 6324 + }, + { + "epoch": 1.2485944878938557, + "grad_norm": 2.203125, + "learning_rate": 6.279669314162717e-06, + "loss": 0.9825, + "step": 6325 + }, + { + "epoch": 1.2487943829489518, + "grad_norm": 2.03125, + "learning_rate": 6.2786505819745604e-06, + "loss": 0.8916, + "step": 6326 + }, + { + "epoch": 1.248994278004048, + "grad_norm": 2.28125, + "learning_rate": 6.277631792988882e-06, + "loss": 0.985, + "step": 6327 + }, + { + "epoch": 1.249194173059144, + "grad_norm": 2.28125, + "learning_rate": 6.276612947250934e-06, + "loss": 0.9907, + "step": 6328 + }, + { + "epoch": 1.2493940681142401, + "grad_norm": 2.09375, + "learning_rate": 6.275594044805976e-06, + "loss": 1.0056, + "step": 6329 + }, + { + "epoch": 1.249593963169336, + "grad_norm": 2.171875, + "learning_rate": 6.274575085699267e-06, + "loss": 0.9899, + "step": 6330 + }, + { + "epoch": 1.2497938582244321, + "grad_norm": 2.0625, + "learning_rate": 6.2735560699760676e-06, + "loss": 0.9518, + "step": 6331 + }, + { + "epoch": 1.2499937532795282, + "grad_norm": 2.15625, + "learning_rate": 6.2725369976816455e-06, + "loss": 1.0028, + "step": 6332 + }, + { + "epoch": 1.2501936483346243, + "grad_norm": 2.125, + "learning_rate": 6.271517868861266e-06, + "loss": 0.9737, + "step": 6333 + }, + { + "epoch": 1.2503935433897204, + "grad_norm": 2.140625, + "learning_rate": 6.270498683560195e-06, + "loss": 0.944, + "step": 6334 + }, + { + "epoch": 1.2505934384448165, + "grad_norm": 2.25, + "learning_rate": 6.269479441823712e-06, + "loss": 0.9353, + "step": 6335 + }, + { + "epoch": 1.2507933334999126, + "grad_norm": 2.171875, + "learning_rate": 6.268460143697086e-06, + "loss": 1.0613, + "step": 6336 + }, + { + "epoch": 1.2509932285550085, + "grad_norm": 2.21875, + "learning_rate": 6.267440789225596e-06, + "loss": 0.9998, + "step": 6337 + }, + { + "epoch": 1.2511931236101046, + "grad_norm": 2.046875, + "learning_rate": 6.266421378454524e-06, + "loss": 0.9825, + "step": 6338 + }, + { + "epoch": 1.2513930186652007, + "grad_norm": 2.203125, + "learning_rate": 6.265401911429147e-06, + "loss": 0.8972, + "step": 6339 + }, + { + "epoch": 1.2515929137202968, + "grad_norm": 2.296875, + "learning_rate": 6.264382388194753e-06, + "loss": 1.0687, + "step": 6340 + }, + { + "epoch": 1.251792808775393, + "grad_norm": 2.28125, + "learning_rate": 6.263362808796627e-06, + "loss": 1.0331, + "step": 6341 + }, + { + "epoch": 1.251992703830489, + "grad_norm": 2.1875, + "learning_rate": 6.262343173280062e-06, + "loss": 0.9854, + "step": 6342 + }, + { + "epoch": 1.252192598885585, + "grad_norm": 2.1875, + "learning_rate": 6.261323481690347e-06, + "loss": 1.0489, + "step": 6343 + }, + { + "epoch": 1.2523924939406812, + "grad_norm": 2.09375, + "learning_rate": 6.260303734072778e-06, + "loss": 0.9546, + "step": 6344 + }, + { + "epoch": 1.2525923889957773, + "grad_norm": 2.109375, + "learning_rate": 6.2592839304726515e-06, + "loss": 1.035, + "step": 6345 + }, + { + "epoch": 1.2527922840508734, + "grad_norm": 1.9921875, + "learning_rate": 6.258264070935267e-06, + "loss": 0.933, + "step": 6346 + }, + { + "epoch": 1.2529921791059695, + "grad_norm": 2.140625, + "learning_rate": 6.257244155505928e-06, + "loss": 0.9164, + "step": 6347 + }, + { + "epoch": 1.2531920741610654, + "grad_norm": 2.1875, + "learning_rate": 6.256224184229936e-06, + "loss": 0.984, + "step": 6348 + }, + { + "epoch": 1.2533919692161615, + "grad_norm": 2.125, + "learning_rate": 6.255204157152601e-06, + "loss": 0.9565, + "step": 6349 + }, + { + "epoch": 1.2535918642712576, + "grad_norm": 2.140625, + "learning_rate": 6.254184074319231e-06, + "loss": 1.0095, + "step": 6350 + }, + { + "epoch": 1.2537917593263537, + "grad_norm": 2.09375, + "learning_rate": 6.253163935775139e-06, + "loss": 0.994, + "step": 6351 + }, + { + "epoch": 1.2539916543814498, + "grad_norm": 2.140625, + "learning_rate": 6.252143741565639e-06, + "loss": 0.9789, + "step": 6352 + }, + { + "epoch": 1.2541915494365459, + "grad_norm": 2.125, + "learning_rate": 6.251123491736048e-06, + "loss": 0.9684, + "step": 6353 + }, + { + "epoch": 1.2543914444916417, + "grad_norm": 2.140625, + "learning_rate": 6.250103186331684e-06, + "loss": 1.0011, + "step": 6354 + }, + { + "epoch": 1.2545913395467378, + "grad_norm": 2.09375, + "learning_rate": 6.249082825397871e-06, + "loss": 0.9764, + "step": 6355 + }, + { + "epoch": 1.254791234601834, + "grad_norm": 2.109375, + "learning_rate": 6.248062408979933e-06, + "loss": 0.988, + "step": 6356 + }, + { + "epoch": 1.25499112965693, + "grad_norm": 2.171875, + "learning_rate": 6.247041937123194e-06, + "loss": 0.992, + "step": 6357 + }, + { + "epoch": 1.2551910247120261, + "grad_norm": 2.125, + "learning_rate": 6.246021409872987e-06, + "loss": 0.9474, + "step": 6358 + }, + { + "epoch": 1.2553909197671222, + "grad_norm": 2.171875, + "learning_rate": 6.2450008272746395e-06, + "loss": 1.0386, + "step": 6359 + }, + { + "epoch": 1.2555908148222183, + "grad_norm": 2.09375, + "learning_rate": 6.24398018937349e-06, + "loss": 0.9926, + "step": 6360 + }, + { + "epoch": 1.2557907098773144, + "grad_norm": 2.140625, + "learning_rate": 6.242959496214874e-06, + "loss": 1.0452, + "step": 6361 + }, + { + "epoch": 1.2559906049324105, + "grad_norm": 2.015625, + "learning_rate": 6.241938747844129e-06, + "loss": 0.9355, + "step": 6362 + }, + { + "epoch": 1.2561904999875066, + "grad_norm": 2.125, + "learning_rate": 6.240917944306597e-06, + "loss": 0.9983, + "step": 6363 + }, + { + "epoch": 1.2563903950426027, + "grad_norm": 2.15625, + "learning_rate": 6.239897085647624e-06, + "loss": 1.0291, + "step": 6364 + }, + { + "epoch": 1.2565902900976986, + "grad_norm": 2.25, + "learning_rate": 6.238876171912553e-06, + "loss": 0.9773, + "step": 6365 + }, + { + "epoch": 1.2567901851527947, + "grad_norm": 2.03125, + "learning_rate": 6.237855203146737e-06, + "loss": 0.9554, + "step": 6366 + }, + { + "epoch": 1.2569900802078908, + "grad_norm": 2.34375, + "learning_rate": 6.2368341793955235e-06, + "loss": 0.9327, + "step": 6367 + }, + { + "epoch": 1.257189975262987, + "grad_norm": 2.03125, + "learning_rate": 6.235813100704267e-06, + "loss": 0.9893, + "step": 6368 + }, + { + "epoch": 1.257389870318083, + "grad_norm": 2.125, + "learning_rate": 6.2347919671183265e-06, + "loss": 1.056, + "step": 6369 + }, + { + "epoch": 1.2575897653731791, + "grad_norm": 2.28125, + "learning_rate": 6.233770778683058e-06, + "loss": 0.9212, + "step": 6370 + }, + { + "epoch": 1.2577896604282752, + "grad_norm": 1.9765625, + "learning_rate": 6.2327495354438235e-06, + "loss": 0.9166, + "step": 6371 + }, + { + "epoch": 1.257989555483371, + "grad_norm": 2.21875, + "learning_rate": 6.231728237445987e-06, + "loss": 0.9113, + "step": 6372 + }, + { + "epoch": 1.2581894505384672, + "grad_norm": 2.125, + "learning_rate": 6.230706884734913e-06, + "loss": 0.9421, + "step": 6373 + }, + { + "epoch": 1.2583893455935633, + "grad_norm": 2.1875, + "learning_rate": 6.2296854773559685e-06, + "loss": 0.9322, + "step": 6374 + }, + { + "epoch": 1.2585892406486594, + "grad_norm": 2.234375, + "learning_rate": 6.22866401535453e-06, + "loss": 1.1273, + "step": 6375 + }, + { + "epoch": 1.2587891357037555, + "grad_norm": 2.359375, + "learning_rate": 6.227642498775965e-06, + "loss": 1.0321, + "step": 6376 + }, + { + "epoch": 1.2589890307588516, + "grad_norm": 2.109375, + "learning_rate": 6.226620927665652e-06, + "loss": 0.9332, + "step": 6377 + }, + { + "epoch": 1.2591889258139477, + "grad_norm": 2.140625, + "learning_rate": 6.225599302068968e-06, + "loss": 0.9504, + "step": 6378 + }, + { + "epoch": 1.2593888208690438, + "grad_norm": 2.203125, + "learning_rate": 6.224577622031294e-06, + "loss": 0.9156, + "step": 6379 + }, + { + "epoch": 1.25958871592414, + "grad_norm": 2.1875, + "learning_rate": 6.223555887598013e-06, + "loss": 0.9659, + "step": 6380 + }, + { + "epoch": 1.259788610979236, + "grad_norm": 2.1875, + "learning_rate": 6.22253409881451e-06, + "loss": 0.9739, + "step": 6381 + }, + { + "epoch": 1.259988506034332, + "grad_norm": 1.9609375, + "learning_rate": 6.221512255726173e-06, + "loss": 0.8863, + "step": 6382 + }, + { + "epoch": 1.260188401089428, + "grad_norm": 2.25, + "learning_rate": 6.2204903583783924e-06, + "loss": 0.9899, + "step": 6383 + }, + { + "epoch": 1.260388296144524, + "grad_norm": 2.140625, + "learning_rate": 6.21946840681656e-06, + "loss": 1.0136, + "step": 6384 + }, + { + "epoch": 1.2605881911996202, + "grad_norm": 2.109375, + "learning_rate": 6.218446401086071e-06, + "loss": 1.0317, + "step": 6385 + }, + { + "epoch": 1.2607880862547163, + "grad_norm": 2.25, + "learning_rate": 6.217424341232324e-06, + "loss": 1.0341, + "step": 6386 + }, + { + "epoch": 1.2609879813098124, + "grad_norm": 2.03125, + "learning_rate": 6.216402227300717e-06, + "loss": 0.9551, + "step": 6387 + }, + { + "epoch": 1.2611878763649085, + "grad_norm": 2.125, + "learning_rate": 6.215380059336653e-06, + "loss": 0.9913, + "step": 6388 + }, + { + "epoch": 1.2613877714200046, + "grad_norm": 2.109375, + "learning_rate": 6.214357837385538e-06, + "loss": 0.8355, + "step": 6389 + }, + { + "epoch": 1.2615876664751005, + "grad_norm": 2.0625, + "learning_rate": 6.213335561492775e-06, + "loss": 0.9165, + "step": 6390 + }, + { + "epoch": 1.2617875615301966, + "grad_norm": 2.09375, + "learning_rate": 6.212313231703777e-06, + "loss": 0.9236, + "step": 6391 + }, + { + "epoch": 1.2619874565852927, + "grad_norm": 2.15625, + "learning_rate": 6.211290848063955e-06, + "loss": 1.0185, + "step": 6392 + }, + { + "epoch": 1.2621873516403888, + "grad_norm": 2.234375, + "learning_rate": 6.210268410618723e-06, + "loss": 1.0065, + "step": 6393 + }, + { + "epoch": 1.2623872466954849, + "grad_norm": 2.078125, + "learning_rate": 6.209245919413497e-06, + "loss": 0.9274, + "step": 6394 + }, + { + "epoch": 1.262587141750581, + "grad_norm": 2.0625, + "learning_rate": 6.208223374493697e-06, + "loss": 0.9656, + "step": 6395 + }, + { + "epoch": 1.262787036805677, + "grad_norm": 2.0625, + "learning_rate": 6.2072007759047426e-06, + "loss": 0.9707, + "step": 6396 + }, + { + "epoch": 1.2629869318607732, + "grad_norm": 2.140625, + "learning_rate": 6.20617812369206e-06, + "loss": 0.9844, + "step": 6397 + }, + { + "epoch": 1.2631868269158693, + "grad_norm": 2.15625, + "learning_rate": 6.205155417901074e-06, + "loss": 0.9175, + "step": 6398 + }, + { + "epoch": 1.2633867219709654, + "grad_norm": 2.515625, + "learning_rate": 6.204132658577212e-06, + "loss": 0.9726, + "step": 6399 + }, + { + "epoch": 1.2635866170260615, + "grad_norm": 2.1875, + "learning_rate": 6.203109845765907e-06, + "loss": 0.9825, + "step": 6400 + }, + { + "epoch": 1.2637865120811573, + "grad_norm": 2.28125, + "learning_rate": 6.202086979512589e-06, + "loss": 0.976, + "step": 6401 + }, + { + "epoch": 1.2639864071362534, + "grad_norm": 2.09375, + "learning_rate": 6.201064059862699e-06, + "loss": 1.0102, + "step": 6402 + }, + { + "epoch": 1.2641863021913495, + "grad_norm": 2.078125, + "learning_rate": 6.20004108686167e-06, + "loss": 0.9396, + "step": 6403 + }, + { + "epoch": 1.2643861972464456, + "grad_norm": 2.234375, + "learning_rate": 6.199018060554945e-06, + "loss": 1.0043, + "step": 6404 + }, + { + "epoch": 1.2645860923015417, + "grad_norm": 2.203125, + "learning_rate": 6.197994980987964e-06, + "loss": 0.9013, + "step": 6405 + }, + { + "epoch": 1.2647859873566378, + "grad_norm": 2.203125, + "learning_rate": 6.196971848206175e-06, + "loss": 1.0221, + "step": 6406 + }, + { + "epoch": 1.2649858824117337, + "grad_norm": 2.140625, + "learning_rate": 6.195948662255024e-06, + "loss": 0.8652, + "step": 6407 + }, + { + "epoch": 1.2651857774668298, + "grad_norm": 2.078125, + "learning_rate": 6.194925423179961e-06, + "loss": 0.9215, + "step": 6408 + }, + { + "epoch": 1.265385672521926, + "grad_norm": 2.078125, + "learning_rate": 6.193902131026439e-06, + "loss": 1.0518, + "step": 6409 + }, + { + "epoch": 1.265585567577022, + "grad_norm": 2.125, + "learning_rate": 6.192878785839911e-06, + "loss": 0.9335, + "step": 6410 + }, + { + "epoch": 1.265785462632118, + "grad_norm": 2.171875, + "learning_rate": 6.191855387665836e-06, + "loss": 1.0277, + "step": 6411 + }, + { + "epoch": 1.2659853576872142, + "grad_norm": 2.234375, + "learning_rate": 6.190831936549671e-06, + "loss": 0.9709, + "step": 6412 + }, + { + "epoch": 1.2661852527423103, + "grad_norm": 2.3125, + "learning_rate": 6.189808432536879e-06, + "loss": 1.0246, + "step": 6413 + }, + { + "epoch": 1.2663851477974064, + "grad_norm": 2.109375, + "learning_rate": 6.188784875672923e-06, + "loss": 0.9244, + "step": 6414 + }, + { + "epoch": 1.2665850428525025, + "grad_norm": 2.078125, + "learning_rate": 6.1877612660032706e-06, + "loss": 0.9935, + "step": 6415 + }, + { + "epoch": 1.2667849379075986, + "grad_norm": 2.21875, + "learning_rate": 6.1867376035733885e-06, + "loss": 0.975, + "step": 6416 + }, + { + "epoch": 1.2669848329626947, + "grad_norm": 2.078125, + "learning_rate": 6.185713888428751e-06, + "loss": 0.9996, + "step": 6417 + }, + { + "epoch": 1.2671847280177906, + "grad_norm": 2.09375, + "learning_rate": 6.1846901206148265e-06, + "loss": 0.9165, + "step": 6418 + }, + { + "epoch": 1.2673846230728867, + "grad_norm": 2.046875, + "learning_rate": 6.183666300177095e-06, + "loss": 0.92, + "step": 6419 + }, + { + "epoch": 1.2675845181279828, + "grad_norm": 2.1875, + "learning_rate": 6.182642427161033e-06, + "loss": 0.9924, + "step": 6420 + }, + { + "epoch": 1.2677844131830789, + "grad_norm": 2.375, + "learning_rate": 6.181618501612119e-06, + "loss": 1.0589, + "step": 6421 + }, + { + "epoch": 1.267984308238175, + "grad_norm": 2.21875, + "learning_rate": 6.180594523575838e-06, + "loss": 0.9984, + "step": 6422 + }, + { + "epoch": 1.268184203293271, + "grad_norm": 2.09375, + "learning_rate": 6.179570493097675e-06, + "loss": 0.9574, + "step": 6423 + }, + { + "epoch": 1.2683840983483672, + "grad_norm": 2.28125, + "learning_rate": 6.178546410223116e-06, + "loss": 0.9633, + "step": 6424 + }, + { + "epoch": 1.268583993403463, + "grad_norm": 2.0625, + "learning_rate": 6.177522274997651e-06, + "loss": 0.9466, + "step": 6425 + }, + { + "epoch": 1.2687838884585592, + "grad_norm": 2.140625, + "learning_rate": 6.176498087466775e-06, + "loss": 0.938, + "step": 6426 + }, + { + "epoch": 1.2689837835136553, + "grad_norm": 2.28125, + "learning_rate": 6.1754738476759764e-06, + "loss": 1.0154, + "step": 6427 + }, + { + "epoch": 1.2691836785687514, + "grad_norm": 2.109375, + "learning_rate": 6.174449555670757e-06, + "loss": 0.9987, + "step": 6428 + }, + { + "epoch": 1.2693835736238475, + "grad_norm": 2.171875, + "learning_rate": 6.173425211496613e-06, + "loss": 0.9349, + "step": 6429 + }, + { + "epoch": 1.2695834686789436, + "grad_norm": 2.140625, + "learning_rate": 6.1724008151990465e-06, + "loss": 1.0085, + "step": 6430 + }, + { + "epoch": 1.2697833637340397, + "grad_norm": 2.0, + "learning_rate": 6.171376366823562e-06, + "loss": 0.9886, + "step": 6431 + }, + { + "epoch": 1.2699832587891358, + "grad_norm": 2.28125, + "learning_rate": 6.1703518664156636e-06, + "loss": 1.0036, + "step": 6432 + }, + { + "epoch": 1.2701831538442319, + "grad_norm": 2.25, + "learning_rate": 6.16932731402086e-06, + "loss": 0.9389, + "step": 6433 + }, + { + "epoch": 1.270383048899328, + "grad_norm": 2.3125, + "learning_rate": 6.168302709684663e-06, + "loss": 0.9765, + "step": 6434 + }, + { + "epoch": 1.270582943954424, + "grad_norm": 2.234375, + "learning_rate": 6.1672780534525825e-06, + "loss": 1.0638, + "step": 6435 + }, + { + "epoch": 1.27078283900952, + "grad_norm": 2.0625, + "learning_rate": 6.166253345370137e-06, + "loss": 0.9807, + "step": 6436 + }, + { + "epoch": 1.270982734064616, + "grad_norm": 2.109375, + "learning_rate": 6.165228585482842e-06, + "loss": 0.9761, + "step": 6437 + }, + { + "epoch": 1.2711826291197121, + "grad_norm": 2.0625, + "learning_rate": 6.164203773836217e-06, + "loss": 0.8968, + "step": 6438 + }, + { + "epoch": 1.2713825241748082, + "grad_norm": 2.140625, + "learning_rate": 6.163178910475786e-06, + "loss": 0.9883, + "step": 6439 + }, + { + "epoch": 1.2715824192299043, + "grad_norm": 2.09375, + "learning_rate": 6.162153995447071e-06, + "loss": 0.9872, + "step": 6440 + }, + { + "epoch": 1.2717823142850004, + "grad_norm": 2.359375, + "learning_rate": 6.1611290287956e-06, + "loss": 1.0828, + "step": 6441 + }, + { + "epoch": 1.2719822093400963, + "grad_norm": 2.171875, + "learning_rate": 6.160104010566902e-06, + "loss": 1.0257, + "step": 6442 + }, + { + "epoch": 1.2721821043951924, + "grad_norm": 2.171875, + "learning_rate": 6.159078940806507e-06, + "loss": 0.9679, + "step": 6443 + }, + { + "epoch": 1.2723819994502885, + "grad_norm": 2.140625, + "learning_rate": 6.15805381955995e-06, + "loss": 0.9412, + "step": 6444 + }, + { + "epoch": 1.2725818945053846, + "grad_norm": 2.296875, + "learning_rate": 6.157028646872765e-06, + "loss": 1.0561, + "step": 6445 + }, + { + "epoch": 1.2727817895604807, + "grad_norm": 2.140625, + "learning_rate": 6.156003422790492e-06, + "loss": 0.9577, + "step": 6446 + }, + { + "epoch": 1.2729816846155768, + "grad_norm": 2.125, + "learning_rate": 6.15497814735867e-06, + "loss": 0.9266, + "step": 6447 + }, + { + "epoch": 1.273181579670673, + "grad_norm": 2.09375, + "learning_rate": 6.153952820622844e-06, + "loss": 0.9729, + "step": 6448 + }, + { + "epoch": 1.273381474725769, + "grad_norm": 2.125, + "learning_rate": 6.1529274426285546e-06, + "loss": 0.9299, + "step": 6449 + }, + { + "epoch": 1.2735813697808651, + "grad_norm": 2.046875, + "learning_rate": 6.151902013421351e-06, + "loss": 1.0202, + "step": 6450 + }, + { + "epoch": 1.2737812648359612, + "grad_norm": 2.109375, + "learning_rate": 6.150876533046784e-06, + "loss": 0.9122, + "step": 6451 + }, + { + "epoch": 1.2739811598910573, + "grad_norm": 2.140625, + "learning_rate": 6.149851001550404e-06, + "loss": 0.8825, + "step": 6452 + }, + { + "epoch": 1.2741810549461532, + "grad_norm": 2.171875, + "learning_rate": 6.1488254189777665e-06, + "loss": 0.9751, + "step": 6453 + }, + { + "epoch": 1.2743809500012493, + "grad_norm": 2.203125, + "learning_rate": 6.147799785374425e-06, + "loss": 1.0231, + "step": 6454 + }, + { + "epoch": 1.2745808450563454, + "grad_norm": 2.0625, + "learning_rate": 6.1467741007859405e-06, + "loss": 1.0558, + "step": 6455 + }, + { + "epoch": 1.2747807401114415, + "grad_norm": 2.046875, + "learning_rate": 6.145748365257873e-06, + "loss": 0.9363, + "step": 6456 + }, + { + "epoch": 1.2749806351665376, + "grad_norm": 2.15625, + "learning_rate": 6.1447225788357845e-06, + "loss": 0.9968, + "step": 6457 + }, + { + "epoch": 1.2751805302216337, + "grad_norm": 2.09375, + "learning_rate": 6.143696741565241e-06, + "loss": 0.9579, + "step": 6458 + }, + { + "epoch": 1.2753804252767298, + "grad_norm": 2.078125, + "learning_rate": 6.142670853491813e-06, + "loss": 0.9892, + "step": 6459 + }, + { + "epoch": 1.2755803203318257, + "grad_norm": 2.203125, + "learning_rate": 6.141644914661066e-06, + "loss": 0.9936, + "step": 6460 + }, + { + "epoch": 1.2757802153869218, + "grad_norm": 2.21875, + "learning_rate": 6.140618925118574e-06, + "loss": 1.0118, + "step": 6461 + }, + { + "epoch": 1.2759801104420179, + "grad_norm": 2.140625, + "learning_rate": 6.13959288490991e-06, + "loss": 0.9798, + "step": 6462 + }, + { + "epoch": 1.276180005497114, + "grad_norm": 2.046875, + "learning_rate": 6.138566794080655e-06, + "loss": 0.8971, + "step": 6463 + }, + { + "epoch": 1.27637990055221, + "grad_norm": 2.15625, + "learning_rate": 6.137540652676382e-06, + "loss": 0.9741, + "step": 6464 + }, + { + "epoch": 1.2765797956073062, + "grad_norm": 2.109375, + "learning_rate": 6.136514460742675e-06, + "loss": 0.9904, + "step": 6465 + }, + { + "epoch": 1.2767796906624023, + "grad_norm": 2.078125, + "learning_rate": 6.135488218325116e-06, + "loss": 1.0251, + "step": 6466 + }, + { + "epoch": 1.2769795857174984, + "grad_norm": 2.21875, + "learning_rate": 6.134461925469293e-06, + "loss": 1.0236, + "step": 6467 + }, + { + "epoch": 1.2771794807725945, + "grad_norm": 2.015625, + "learning_rate": 6.133435582220794e-06, + "loss": 0.9359, + "step": 6468 + }, + { + "epoch": 1.2773793758276906, + "grad_norm": 2.203125, + "learning_rate": 6.132409188625205e-06, + "loss": 1.0094, + "step": 6469 + }, + { + "epoch": 1.2775792708827867, + "grad_norm": 2.140625, + "learning_rate": 6.13138274472812e-06, + "loss": 1.0281, + "step": 6470 + }, + { + "epoch": 1.2777791659378825, + "grad_norm": 2.1875, + "learning_rate": 6.130356250575137e-06, + "loss": 1.0421, + "step": 6471 + }, + { + "epoch": 1.2779790609929786, + "grad_norm": 2.171875, + "learning_rate": 6.129329706211849e-06, + "loss": 1.0368, + "step": 6472 + }, + { + "epoch": 1.2781789560480747, + "grad_norm": 2.265625, + "learning_rate": 6.1283031116838565e-06, + "loss": 0.9902, + "step": 6473 + }, + { + "epoch": 1.2783788511031708, + "grad_norm": 2.0625, + "learning_rate": 6.127276467036759e-06, + "loss": 0.9434, + "step": 6474 + }, + { + "epoch": 1.278578746158267, + "grad_norm": 2.203125, + "learning_rate": 6.126249772316161e-06, + "loss": 0.9404, + "step": 6475 + }, + { + "epoch": 1.278778641213363, + "grad_norm": 2.171875, + "learning_rate": 6.1252230275676705e-06, + "loss": 0.9842, + "step": 6476 + }, + { + "epoch": 1.278978536268459, + "grad_norm": 2.15625, + "learning_rate": 6.124196232836892e-06, + "loss": 1.0444, + "step": 6477 + }, + { + "epoch": 1.279178431323555, + "grad_norm": 2.140625, + "learning_rate": 6.123169388169437e-06, + "loss": 0.9739, + "step": 6478 + }, + { + "epoch": 1.2793783263786511, + "grad_norm": 2.3125, + "learning_rate": 6.122142493610918e-06, + "loss": 1.0631, + "step": 6479 + }, + { + "epoch": 1.2795782214337472, + "grad_norm": 2.0625, + "learning_rate": 6.12111554920695e-06, + "loss": 0.9143, + "step": 6480 + }, + { + "epoch": 1.2797781164888433, + "grad_norm": 2.25, + "learning_rate": 6.120088555003148e-06, + "loss": 1.0243, + "step": 6481 + }, + { + "epoch": 1.2799780115439394, + "grad_norm": 2.25, + "learning_rate": 6.119061511045133e-06, + "loss": 1.0501, + "step": 6482 + }, + { + "epoch": 1.2801779065990355, + "grad_norm": 2.125, + "learning_rate": 6.118034417378523e-06, + "loss": 1.0083, + "step": 6483 + }, + { + "epoch": 1.2803778016541316, + "grad_norm": 2.125, + "learning_rate": 6.117007274048945e-06, + "loss": 0.9384, + "step": 6484 + }, + { + "epoch": 1.2805776967092277, + "grad_norm": 2.0, + "learning_rate": 6.115980081102024e-06, + "loss": 0.9705, + "step": 6485 + }, + { + "epoch": 1.2807775917643238, + "grad_norm": 2.203125, + "learning_rate": 6.114952838583386e-06, + "loss": 1.0992, + "step": 6486 + }, + { + "epoch": 1.28097748681942, + "grad_norm": 2.046875, + "learning_rate": 6.1139255465386614e-06, + "loss": 0.9487, + "step": 6487 + }, + { + "epoch": 1.2811773818745158, + "grad_norm": 2.203125, + "learning_rate": 6.1128982050134845e-06, + "loss": 0.9714, + "step": 6488 + }, + { + "epoch": 1.281377276929612, + "grad_norm": 2.171875, + "learning_rate": 6.111870814053487e-06, + "loss": 0.9044, + "step": 6489 + }, + { + "epoch": 1.281577171984708, + "grad_norm": 2.125, + "learning_rate": 6.110843373704307e-06, + "loss": 0.9707, + "step": 6490 + }, + { + "epoch": 1.281777067039804, + "grad_norm": 2.28125, + "learning_rate": 6.109815884011583e-06, + "loss": 0.9192, + "step": 6491 + }, + { + "epoch": 1.2819769620949002, + "grad_norm": 2.03125, + "learning_rate": 6.108788345020955e-06, + "loss": 0.8727, + "step": 6492 + }, + { + "epoch": 1.2821768571499963, + "grad_norm": 2.390625, + "learning_rate": 6.10776075677807e-06, + "loss": 1.0429, + "step": 6493 + }, + { + "epoch": 1.2823767522050924, + "grad_norm": 2.25, + "learning_rate": 6.106733119328568e-06, + "loss": 1.0389, + "step": 6494 + }, + { + "epoch": 1.2825766472601883, + "grad_norm": 2.140625, + "learning_rate": 6.1057054327181e-06, + "loss": 1.047, + "step": 6495 + }, + { + "epoch": 1.2827765423152844, + "grad_norm": 2.03125, + "learning_rate": 6.104677696992315e-06, + "loss": 0.8999, + "step": 6496 + }, + { + "epoch": 1.2829764373703805, + "grad_norm": 2.078125, + "learning_rate": 6.103649912196864e-06, + "loss": 0.9554, + "step": 6497 + }, + { + "epoch": 1.2831763324254766, + "grad_norm": 2.234375, + "learning_rate": 6.102622078377401e-06, + "loss": 1.0413, + "step": 6498 + }, + { + "epoch": 1.2833762274805727, + "grad_norm": 2.15625, + "learning_rate": 6.101594195579585e-06, + "loss": 1.0102, + "step": 6499 + }, + { + "epoch": 1.2835761225356688, + "grad_norm": 2.046875, + "learning_rate": 6.1005662638490715e-06, + "loss": 0.9513, + "step": 6500 + }, + { + "epoch": 1.2837760175907649, + "grad_norm": 2.1875, + "learning_rate": 6.099538283231523e-06, + "loss": 1.0219, + "step": 6501 + }, + { + "epoch": 1.283975912645861, + "grad_norm": 2.203125, + "learning_rate": 6.098510253772599e-06, + "loss": 0.9539, + "step": 6502 + }, + { + "epoch": 1.284175807700957, + "grad_norm": 2.15625, + "learning_rate": 6.097482175517968e-06, + "loss": 0.9664, + "step": 6503 + }, + { + "epoch": 1.2843757027560532, + "grad_norm": 2.265625, + "learning_rate": 6.096454048513297e-06, + "loss": 0.9948, + "step": 6504 + }, + { + "epoch": 1.2845755978111493, + "grad_norm": 2.046875, + "learning_rate": 6.0954258728042535e-06, + "loss": 0.9397, + "step": 6505 + }, + { + "epoch": 1.2847754928662452, + "grad_norm": 2.1875, + "learning_rate": 6.0943976484365095e-06, + "loss": 1.0426, + "step": 6506 + }, + { + "epoch": 1.2849753879213412, + "grad_norm": 2.140625, + "learning_rate": 6.09336937545574e-06, + "loss": 0.9307, + "step": 6507 + }, + { + "epoch": 1.2851752829764373, + "grad_norm": 2.109375, + "learning_rate": 6.092341053907618e-06, + "loss": 0.9363, + "step": 6508 + }, + { + "epoch": 1.2853751780315334, + "grad_norm": 2.078125, + "learning_rate": 6.091312683837823e-06, + "loss": 0.9501, + "step": 6509 + }, + { + "epoch": 1.2855750730866295, + "grad_norm": 2.1875, + "learning_rate": 6.090284265292038e-06, + "loss": 0.9455, + "step": 6510 + }, + { + "epoch": 1.2857749681417256, + "grad_norm": 2.234375, + "learning_rate": 6.089255798315941e-06, + "loss": 1.0456, + "step": 6511 + }, + { + "epoch": 1.2859748631968217, + "grad_norm": 2.125, + "learning_rate": 6.088227282955216e-06, + "loss": 0.9585, + "step": 6512 + }, + { + "epoch": 1.2861747582519176, + "grad_norm": 2.34375, + "learning_rate": 6.0871987192555524e-06, + "loss": 1.0309, + "step": 6513 + }, + { + "epoch": 1.2863746533070137, + "grad_norm": 2.15625, + "learning_rate": 6.086170107262638e-06, + "loss": 0.9564, + "step": 6514 + }, + { + "epoch": 1.2865745483621098, + "grad_norm": 2.203125, + "learning_rate": 6.085141447022162e-06, + "loss": 1.031, + "step": 6515 + }, + { + "epoch": 1.286774443417206, + "grad_norm": 2.09375, + "learning_rate": 6.084112738579821e-06, + "loss": 1.0041, + "step": 6516 + }, + { + "epoch": 1.286974338472302, + "grad_norm": 2.15625, + "learning_rate": 6.083083981981307e-06, + "loss": 1.0031, + "step": 6517 + }, + { + "epoch": 1.2871742335273981, + "grad_norm": 2.125, + "learning_rate": 6.0820551772723195e-06, + "loss": 1.0031, + "step": 6518 + }, + { + "epoch": 1.2873741285824942, + "grad_norm": 2.234375, + "learning_rate": 6.081026324498554e-06, + "loss": 0.9412, + "step": 6519 + }, + { + "epoch": 1.2875740236375903, + "grad_norm": 2.171875, + "learning_rate": 6.0799974237057155e-06, + "loss": 1.1451, + "step": 6520 + }, + { + "epoch": 1.2877739186926864, + "grad_norm": 2.1875, + "learning_rate": 6.078968474939508e-06, + "loss": 1.0042, + "step": 6521 + }, + { + "epoch": 1.2879738137477825, + "grad_norm": 2.140625, + "learning_rate": 6.0779394782456355e-06, + "loss": 0.9027, + "step": 6522 + }, + { + "epoch": 1.2881737088028786, + "grad_norm": 2.078125, + "learning_rate": 6.076910433669806e-06, + "loss": 0.9414, + "step": 6523 + }, + { + "epoch": 1.2883736038579745, + "grad_norm": 2.15625, + "learning_rate": 6.0758813412577315e-06, + "loss": 1.0103, + "step": 6524 + }, + { + "epoch": 1.2885734989130706, + "grad_norm": 2.265625, + "learning_rate": 6.074852201055121e-06, + "loss": 1.0374, + "step": 6525 + }, + { + "epoch": 1.2887733939681667, + "grad_norm": 2.109375, + "learning_rate": 6.0738230131076915e-06, + "loss": 1.066, + "step": 6526 + }, + { + "epoch": 1.2889732890232628, + "grad_norm": 2.3125, + "learning_rate": 6.072793777461159e-06, + "loss": 1.0472, + "step": 6527 + }, + { + "epoch": 1.289173184078359, + "grad_norm": 2.203125, + "learning_rate": 6.07176449416124e-06, + "loss": 0.9168, + "step": 6528 + }, + { + "epoch": 1.289373079133455, + "grad_norm": 2.125, + "learning_rate": 6.070735163253656e-06, + "loss": 0.9151, + "step": 6529 + }, + { + "epoch": 1.2895729741885509, + "grad_norm": 2.140625, + "learning_rate": 6.069705784784133e-06, + "loss": 1.0055, + "step": 6530 + }, + { + "epoch": 1.289772869243647, + "grad_norm": 2.03125, + "learning_rate": 6.068676358798391e-06, + "loss": 0.9039, + "step": 6531 + }, + { + "epoch": 1.289972764298743, + "grad_norm": 2.203125, + "learning_rate": 6.067646885342161e-06, + "loss": 0.9601, + "step": 6532 + }, + { + "epoch": 1.2901726593538392, + "grad_norm": 2.203125, + "learning_rate": 6.066617364461169e-06, + "loss": 0.8968, + "step": 6533 + }, + { + "epoch": 1.2903725544089353, + "grad_norm": 2.015625, + "learning_rate": 6.065587796201148e-06, + "loss": 0.9923, + "step": 6534 + }, + { + "epoch": 1.2905724494640314, + "grad_norm": 2.234375, + "learning_rate": 6.0645581806078315e-06, + "loss": 1.0246, + "step": 6535 + }, + { + "epoch": 1.2907723445191275, + "grad_norm": 2.1875, + "learning_rate": 6.063528517726953e-06, + "loss": 0.9303, + "step": 6536 + }, + { + "epoch": 1.2909722395742236, + "grad_norm": 2.234375, + "learning_rate": 6.0624988076042525e-06, + "loss": 0.9379, + "step": 6537 + }, + { + "epoch": 1.2911721346293197, + "grad_norm": 2.046875, + "learning_rate": 6.061469050285469e-06, + "loss": 0.9697, + "step": 6538 + }, + { + "epoch": 1.2913720296844158, + "grad_norm": 2.0625, + "learning_rate": 6.060439245816342e-06, + "loss": 0.9549, + "step": 6539 + }, + { + "epoch": 1.2915719247395119, + "grad_norm": 2.0, + "learning_rate": 6.059409394242618e-06, + "loss": 0.9163, + "step": 6540 + }, + { + "epoch": 1.2917718197946078, + "grad_norm": 2.171875, + "learning_rate": 6.058379495610043e-06, + "loss": 1.0255, + "step": 6541 + }, + { + "epoch": 1.2919717148497039, + "grad_norm": 2.109375, + "learning_rate": 6.057349549964362e-06, + "loss": 0.9965, + "step": 6542 + }, + { + "epoch": 1.2921716099048, + "grad_norm": 2.25, + "learning_rate": 6.056319557351327e-06, + "loss": 0.9442, + "step": 6543 + }, + { + "epoch": 1.292371504959896, + "grad_norm": 2.15625, + "learning_rate": 6.055289517816691e-06, + "loss": 1.004, + "step": 6544 + }, + { + "epoch": 1.2925714000149922, + "grad_norm": 2.265625, + "learning_rate": 6.054259431406206e-06, + "loss": 1.0521, + "step": 6545 + }, + { + "epoch": 1.2927712950700883, + "grad_norm": 2.1875, + "learning_rate": 6.0532292981656315e-06, + "loss": 0.9739, + "step": 6546 + }, + { + "epoch": 1.2929711901251844, + "grad_norm": 2.140625, + "learning_rate": 6.0521991181407225e-06, + "loss": 0.9814, + "step": 6547 + }, + { + "epoch": 1.2931710851802802, + "grad_norm": 2.171875, + "learning_rate": 6.051168891377242e-06, + "loss": 0.9492, + "step": 6548 + }, + { + "epoch": 1.2933709802353763, + "grad_norm": 3.0625, + "learning_rate": 6.050138617920951e-06, + "loss": 1.0261, + "step": 6549 + }, + { + "epoch": 1.2935708752904724, + "grad_norm": 2.234375, + "learning_rate": 6.049108297817614e-06, + "loss": 1.0818, + "step": 6550 + }, + { + "epoch": 1.2937707703455685, + "grad_norm": 2.171875, + "learning_rate": 6.048077931112999e-06, + "loss": 0.9663, + "step": 6551 + }, + { + "epoch": 1.2939706654006646, + "grad_norm": 2.3125, + "learning_rate": 6.047047517852875e-06, + "loss": 1.0825, + "step": 6552 + }, + { + "epoch": 1.2941705604557607, + "grad_norm": 2.109375, + "learning_rate": 6.046017058083011e-06, + "loss": 0.9238, + "step": 6553 + }, + { + "epoch": 1.2943704555108568, + "grad_norm": 2.234375, + "learning_rate": 6.0449865518491825e-06, + "loss": 1.0634, + "step": 6554 + }, + { + "epoch": 1.294570350565953, + "grad_norm": 2.09375, + "learning_rate": 6.0439559991971615e-06, + "loss": 0.9655, + "step": 6555 + }, + { + "epoch": 1.294770245621049, + "grad_norm": 2.0625, + "learning_rate": 6.042925400172729e-06, + "loss": 0.971, + "step": 6556 + }, + { + "epoch": 1.2949701406761451, + "grad_norm": 2.015625, + "learning_rate": 6.041894754821659e-06, + "loss": 1.0065, + "step": 6557 + }, + { + "epoch": 1.2951700357312412, + "grad_norm": 2.328125, + "learning_rate": 6.040864063189738e-06, + "loss": 1.0393, + "step": 6558 + }, + { + "epoch": 1.295369930786337, + "grad_norm": 2.140625, + "learning_rate": 6.039833325322744e-06, + "loss": 1.006, + "step": 6559 + }, + { + "epoch": 1.2955698258414332, + "grad_norm": 2.1875, + "learning_rate": 6.038802541266464e-06, + "loss": 0.9824, + "step": 6560 + }, + { + "epoch": 1.2957697208965293, + "grad_norm": 2.078125, + "learning_rate": 6.037771711066688e-06, + "loss": 0.8906, + "step": 6561 + }, + { + "epoch": 1.2959696159516254, + "grad_norm": 2.203125, + "learning_rate": 6.036740834769202e-06, + "loss": 1.0133, + "step": 6562 + }, + { + "epoch": 1.2961695110067215, + "grad_norm": 2.09375, + "learning_rate": 6.035709912419801e-06, + "loss": 0.9655, + "step": 6563 + }, + { + "epoch": 1.2963694060618176, + "grad_norm": 2.15625, + "learning_rate": 6.0346789440642736e-06, + "loss": 0.9906, + "step": 6564 + }, + { + "epoch": 1.2965693011169135, + "grad_norm": 2.203125, + "learning_rate": 6.0336479297484195e-06, + "loss": 1.0288, + "step": 6565 + }, + { + "epoch": 1.2967691961720096, + "grad_norm": 2.3125, + "learning_rate": 6.032616869518036e-06, + "loss": 1.0503, + "step": 6566 + }, + { + "epoch": 1.2969690912271057, + "grad_norm": 2.15625, + "learning_rate": 6.031585763418919e-06, + "loss": 0.9395, + "step": 6567 + }, + { + "epoch": 1.2971689862822018, + "grad_norm": 2.171875, + "learning_rate": 6.030554611496873e-06, + "loss": 1.0274, + "step": 6568 + }, + { + "epoch": 1.2973688813372979, + "grad_norm": 2.125, + "learning_rate": 6.029523413797703e-06, + "loss": 0.9283, + "step": 6569 + }, + { + "epoch": 1.297568776392394, + "grad_norm": 2.046875, + "learning_rate": 6.028492170367211e-06, + "loss": 0.9313, + "step": 6570 + }, + { + "epoch": 1.29776867144749, + "grad_norm": 2.25, + "learning_rate": 6.027460881251208e-06, + "loss": 1.0137, + "step": 6571 + }, + { + "epoch": 1.2979685665025862, + "grad_norm": 2.203125, + "learning_rate": 6.026429546495502e-06, + "loss": 1.0137, + "step": 6572 + }, + { + "epoch": 1.2981684615576823, + "grad_norm": 2.21875, + "learning_rate": 6.025398166145905e-06, + "loss": 0.9563, + "step": 6573 + }, + { + "epoch": 1.2983683566127784, + "grad_norm": 2.1875, + "learning_rate": 6.024366740248231e-06, + "loss": 0.9904, + "step": 6574 + }, + { + "epoch": 1.2985682516678745, + "grad_norm": 2.140625, + "learning_rate": 6.023335268848296e-06, + "loss": 0.8159, + "step": 6575 + }, + { + "epoch": 1.2987681467229704, + "grad_norm": 2.390625, + "learning_rate": 6.022303751991917e-06, + "loss": 0.957, + "step": 6576 + }, + { + "epoch": 1.2989680417780665, + "grad_norm": 2.046875, + "learning_rate": 6.021272189724916e-06, + "loss": 0.8763, + "step": 6577 + }, + { + "epoch": 1.2991679368331626, + "grad_norm": 2.0, + "learning_rate": 6.020240582093112e-06, + "loss": 0.8495, + "step": 6578 + }, + { + "epoch": 1.2993678318882587, + "grad_norm": 2.28125, + "learning_rate": 6.019208929142329e-06, + "loss": 0.9718, + "step": 6579 + }, + { + "epoch": 1.2995677269433548, + "grad_norm": 2.15625, + "learning_rate": 6.018177230918397e-06, + "loss": 0.9856, + "step": 6580 + }, + { + "epoch": 1.2997676219984509, + "grad_norm": 2.265625, + "learning_rate": 6.017145487467139e-06, + "loss": 1.1287, + "step": 6581 + }, + { + "epoch": 1.299967517053547, + "grad_norm": 2.28125, + "learning_rate": 6.016113698834388e-06, + "loss": 0.9454, + "step": 6582 + }, + { + "epoch": 1.3001674121086428, + "grad_norm": 2.296875, + "learning_rate": 6.015081865065977e-06, + "loss": 1.0867, + "step": 6583 + }, + { + "epoch": 1.300367307163739, + "grad_norm": 2.15625, + "learning_rate": 6.014049986207735e-06, + "loss": 0.9523, + "step": 6584 + }, + { + "epoch": 1.300567202218835, + "grad_norm": 2.171875, + "learning_rate": 6.013018062305502e-06, + "loss": 1.0047, + "step": 6585 + }, + { + "epoch": 1.3007670972739311, + "grad_norm": 2.28125, + "learning_rate": 6.011986093405115e-06, + "loss": 0.9859, + "step": 6586 + }, + { + "epoch": 1.3009669923290272, + "grad_norm": 2.09375, + "learning_rate": 6.010954079552414e-06, + "loss": 0.9844, + "step": 6587 + }, + { + "epoch": 1.3011668873841233, + "grad_norm": 2.09375, + "learning_rate": 6.009922020793241e-06, + "loss": 0.9852, + "step": 6588 + }, + { + "epoch": 1.3013667824392194, + "grad_norm": 2.171875, + "learning_rate": 6.008889917173439e-06, + "loss": 1.069, + "step": 6589 + }, + { + "epoch": 1.3015666774943155, + "grad_norm": 2.203125, + "learning_rate": 6.007857768738855e-06, + "loss": 1.019, + "step": 6590 + }, + { + "epoch": 1.3017665725494116, + "grad_norm": 2.0625, + "learning_rate": 6.006825575535337e-06, + "loss": 0.9025, + "step": 6591 + }, + { + "epoch": 1.3019664676045077, + "grad_norm": 2.109375, + "learning_rate": 6.005793337608733e-06, + "loss": 0.9806, + "step": 6592 + }, + { + "epoch": 1.3021663626596038, + "grad_norm": 2.15625, + "learning_rate": 6.004761055004897e-06, + "loss": 0.999, + "step": 6593 + }, + { + "epoch": 1.3023662577146997, + "grad_norm": 2.0625, + "learning_rate": 6.003728727769683e-06, + "loss": 0.9815, + "step": 6594 + }, + { + "epoch": 1.3025661527697958, + "grad_norm": 2.125, + "learning_rate": 6.002696355948945e-06, + "loss": 0.9643, + "step": 6595 + }, + { + "epoch": 1.302766047824892, + "grad_norm": 2.078125, + "learning_rate": 6.0016639395885424e-06, + "loss": 0.9539, + "step": 6596 + }, + { + "epoch": 1.302965942879988, + "grad_norm": 2.265625, + "learning_rate": 6.000631478734336e-06, + "loss": 1.0288, + "step": 6597 + }, + { + "epoch": 1.3031658379350841, + "grad_norm": 2.171875, + "learning_rate": 5.999598973432184e-06, + "loss": 1.0162, + "step": 6598 + }, + { + "epoch": 1.3033657329901802, + "grad_norm": 2.15625, + "learning_rate": 5.9985664237279515e-06, + "loss": 0.9597, + "step": 6599 + }, + { + "epoch": 1.303565628045276, + "grad_norm": 2.203125, + "learning_rate": 5.997533829667508e-06, + "loss": 1.013, + "step": 6600 + }, + { + "epoch": 1.3037655231003722, + "grad_norm": 2.140625, + "learning_rate": 5.996501191296717e-06, + "loss": 0.9786, + "step": 6601 + }, + { + "epoch": 1.3039654181554683, + "grad_norm": 2.109375, + "learning_rate": 5.995468508661451e-06, + "loss": 0.9443, + "step": 6602 + }, + { + "epoch": 1.3041653132105644, + "grad_norm": 2.140625, + "learning_rate": 5.99443578180758e-06, + "loss": 0.9783, + "step": 6603 + }, + { + "epoch": 1.3043652082656605, + "grad_norm": 2.21875, + "learning_rate": 5.993403010780975e-06, + "loss": 0.9471, + "step": 6604 + }, + { + "epoch": 1.3045651033207566, + "grad_norm": 2.3125, + "learning_rate": 5.992370195627516e-06, + "loss": 1.0114, + "step": 6605 + }, + { + "epoch": 1.3047649983758527, + "grad_norm": 2.1875, + "learning_rate": 5.99133733639308e-06, + "loss": 1.0314, + "step": 6606 + }, + { + "epoch": 1.3049648934309488, + "grad_norm": 2.140625, + "learning_rate": 5.990304433123543e-06, + "loss": 1.0279, + "step": 6607 + }, + { + "epoch": 1.305164788486045, + "grad_norm": 2.140625, + "learning_rate": 5.98927148586479e-06, + "loss": 0.8717, + "step": 6608 + }, + { + "epoch": 1.305364683541141, + "grad_norm": 2.109375, + "learning_rate": 5.988238494662704e-06, + "loss": 0.9683, + "step": 6609 + }, + { + "epoch": 1.305564578596237, + "grad_norm": 2.25, + "learning_rate": 5.987205459563168e-06, + "loss": 1.0831, + "step": 6610 + }, + { + "epoch": 1.305764473651333, + "grad_norm": 2.109375, + "learning_rate": 5.986172380612073e-06, + "loss": 0.979, + "step": 6611 + }, + { + "epoch": 1.305964368706429, + "grad_norm": 2.078125, + "learning_rate": 5.985139257855304e-06, + "loss": 0.9041, + "step": 6612 + }, + { + "epoch": 1.3061642637615252, + "grad_norm": 2.203125, + "learning_rate": 5.984106091338756e-06, + "loss": 0.9586, + "step": 6613 + }, + { + "epoch": 1.3063641588166213, + "grad_norm": 2.09375, + "learning_rate": 5.983072881108321e-06, + "loss": 0.9586, + "step": 6614 + }, + { + "epoch": 1.3065640538717174, + "grad_norm": 2.40625, + "learning_rate": 5.982039627209891e-06, + "loss": 0.9355, + "step": 6615 + }, + { + "epoch": 1.3067639489268135, + "grad_norm": 2.40625, + "learning_rate": 5.981006329689368e-06, + "loss": 1.0287, + "step": 6616 + }, + { + "epoch": 1.3069638439819096, + "grad_norm": 2.203125, + "learning_rate": 5.979972988592648e-06, + "loss": 1.0191, + "step": 6617 + }, + { + "epoch": 1.3071637390370054, + "grad_norm": 2.171875, + "learning_rate": 5.978939603965633e-06, + "loss": 1.0091, + "step": 6618 + }, + { + "epoch": 1.3073636340921015, + "grad_norm": 2.125, + "learning_rate": 5.977906175854224e-06, + "loss": 0.9668, + "step": 6619 + }, + { + "epoch": 1.3075635291471976, + "grad_norm": 2.28125, + "learning_rate": 5.976872704304328e-06, + "loss": 0.9464, + "step": 6620 + }, + { + "epoch": 1.3077634242022937, + "grad_norm": 2.09375, + "learning_rate": 5.975839189361851e-06, + "loss": 1.0497, + "step": 6621 + }, + { + "epoch": 1.3079633192573898, + "grad_norm": 2.234375, + "learning_rate": 5.974805631072702e-06, + "loss": 1.0606, + "step": 6622 + }, + { + "epoch": 1.308163214312486, + "grad_norm": 2.234375, + "learning_rate": 5.97377202948279e-06, + "loss": 1.0158, + "step": 6623 + }, + { + "epoch": 1.308363109367582, + "grad_norm": 2.171875, + "learning_rate": 5.9727383846380275e-06, + "loss": 0.996, + "step": 6624 + }, + { + "epoch": 1.3085630044226781, + "grad_norm": 2.203125, + "learning_rate": 5.971704696584332e-06, + "loss": 1.0327, + "step": 6625 + }, + { + "epoch": 1.3087628994777742, + "grad_norm": 2.0625, + "learning_rate": 5.9706709653676156e-06, + "loss": 0.9993, + "step": 6626 + }, + { + "epoch": 1.3089627945328703, + "grad_norm": 2.09375, + "learning_rate": 5.9696371910337995e-06, + "loss": 1.0207, + "step": 6627 + }, + { + "epoch": 1.3091626895879664, + "grad_norm": 2.21875, + "learning_rate": 5.968603373628803e-06, + "loss": 0.9862, + "step": 6628 + }, + { + "epoch": 1.3093625846430623, + "grad_norm": 2.046875, + "learning_rate": 5.967569513198548e-06, + "loss": 0.9262, + "step": 6629 + }, + { + "epoch": 1.3095624796981584, + "grad_norm": 2.125, + "learning_rate": 5.966535609788959e-06, + "loss": 1.0114, + "step": 6630 + }, + { + "epoch": 1.3097623747532545, + "grad_norm": 2.140625, + "learning_rate": 5.965501663445961e-06, + "loss": 0.9882, + "step": 6631 + }, + { + "epoch": 1.3099622698083506, + "grad_norm": 2.109375, + "learning_rate": 5.9644676742154814e-06, + "loss": 0.9676, + "step": 6632 + }, + { + "epoch": 1.3101621648634467, + "grad_norm": 2.15625, + "learning_rate": 5.963433642143452e-06, + "loss": 0.954, + "step": 6633 + }, + { + "epoch": 1.3103620599185428, + "grad_norm": 2.09375, + "learning_rate": 5.962399567275804e-06, + "loss": 0.9889, + "step": 6634 + }, + { + "epoch": 1.310561954973639, + "grad_norm": 2.09375, + "learning_rate": 5.961365449658469e-06, + "loss": 0.9578, + "step": 6635 + }, + { + "epoch": 1.3107618500287348, + "grad_norm": 2.046875, + "learning_rate": 5.960331289337383e-06, + "loss": 0.9138, + "step": 6636 + }, + { + "epoch": 1.310961745083831, + "grad_norm": 2.3125, + "learning_rate": 5.9592970863584835e-06, + "loss": 1.0682, + "step": 6637 + }, + { + "epoch": 1.311161640138927, + "grad_norm": 2.140625, + "learning_rate": 5.95826284076771e-06, + "loss": 0.9996, + "step": 6638 + }, + { + "epoch": 1.311361535194023, + "grad_norm": 2.1875, + "learning_rate": 5.957228552611004e-06, + "loss": 1.0233, + "step": 6639 + }, + { + "epoch": 1.3115614302491192, + "grad_norm": 2.140625, + "learning_rate": 5.956194221934307e-06, + "loss": 0.9941, + "step": 6640 + }, + { + "epoch": 1.3117613253042153, + "grad_norm": 2.171875, + "learning_rate": 5.955159848783565e-06, + "loss": 0.9846, + "step": 6641 + }, + { + "epoch": 1.3119612203593114, + "grad_norm": 2.203125, + "learning_rate": 5.954125433204726e-06, + "loss": 0.9759, + "step": 6642 + }, + { + "epoch": 1.3121611154144075, + "grad_norm": 2.140625, + "learning_rate": 5.953090975243735e-06, + "loss": 0.989, + "step": 6643 + }, + { + "epoch": 1.3123610104695036, + "grad_norm": 2.296875, + "learning_rate": 5.952056474946546e-06, + "loss": 0.9784, + "step": 6644 + }, + { + "epoch": 1.3125609055245997, + "grad_norm": 2.140625, + "learning_rate": 5.9510219323591104e-06, + "loss": 0.9401, + "step": 6645 + }, + { + "epoch": 1.3127608005796958, + "grad_norm": 2.15625, + "learning_rate": 5.94998734752738e-06, + "loss": 0.9958, + "step": 6646 + }, + { + "epoch": 1.3129606956347917, + "grad_norm": 2.125, + "learning_rate": 5.9489527204973165e-06, + "loss": 0.9949, + "step": 6647 + }, + { + "epoch": 1.3131605906898878, + "grad_norm": 2.1875, + "learning_rate": 5.9479180513148716e-06, + "loss": 1.0382, + "step": 6648 + }, + { + "epoch": 1.3133604857449839, + "grad_norm": 2.078125, + "learning_rate": 5.94688334002601e-06, + "loss": 1.0021, + "step": 6649 + }, + { + "epoch": 1.31356038080008, + "grad_norm": 2.109375, + "learning_rate": 5.94584858667669e-06, + "loss": 0.8882, + "step": 6650 + }, + { + "epoch": 1.313760275855176, + "grad_norm": 2.21875, + "learning_rate": 5.944813791312878e-06, + "loss": 0.9116, + "step": 6651 + }, + { + "epoch": 1.3139601709102722, + "grad_norm": 2.125, + "learning_rate": 5.943778953980538e-06, + "loss": 1.0571, + "step": 6652 + }, + { + "epoch": 1.314160065965368, + "grad_norm": 2.078125, + "learning_rate": 5.942744074725638e-06, + "loss": 0.9611, + "step": 6653 + }, + { + "epoch": 1.3143599610204642, + "grad_norm": 2.125, + "learning_rate": 5.941709153594146e-06, + "loss": 0.9936, + "step": 6654 + }, + { + "epoch": 1.3145598560755603, + "grad_norm": 2.203125, + "learning_rate": 5.940674190632034e-06, + "loss": 0.967, + "step": 6655 + }, + { + "epoch": 1.3147597511306564, + "grad_norm": 2.125, + "learning_rate": 5.939639185885276e-06, + "loss": 0.9805, + "step": 6656 + }, + { + "epoch": 1.3149596461857525, + "grad_norm": 2.140625, + "learning_rate": 5.938604139399844e-06, + "loss": 0.9485, + "step": 6657 + }, + { + "epoch": 1.3151595412408486, + "grad_norm": 2.21875, + "learning_rate": 5.937569051221716e-06, + "loss": 0.9486, + "step": 6658 + }, + { + "epoch": 1.3153594362959447, + "grad_norm": 2.15625, + "learning_rate": 5.936533921396874e-06, + "loss": 1.0129, + "step": 6659 + }, + { + "epoch": 1.3155593313510408, + "grad_norm": 2.09375, + "learning_rate": 5.935498749971293e-06, + "loss": 1.0116, + "step": 6660 + }, + { + "epoch": 1.3157592264061369, + "grad_norm": 2.15625, + "learning_rate": 5.934463536990959e-06, + "loss": 0.9824, + "step": 6661 + }, + { + "epoch": 1.315959121461233, + "grad_norm": 2.15625, + "learning_rate": 5.933428282501853e-06, + "loss": 0.9549, + "step": 6662 + }, + { + "epoch": 1.316159016516329, + "grad_norm": 2.0, + "learning_rate": 5.9323929865499615e-06, + "loss": 0.9275, + "step": 6663 + }, + { + "epoch": 1.316358911571425, + "grad_norm": 2.125, + "learning_rate": 5.931357649181275e-06, + "loss": 0.9875, + "step": 6664 + }, + { + "epoch": 1.316558806626521, + "grad_norm": 2.25, + "learning_rate": 5.930322270441781e-06, + "loss": 0.9468, + "step": 6665 + }, + { + "epoch": 1.3167587016816171, + "grad_norm": 2.265625, + "learning_rate": 5.929286850377471e-06, + "loss": 0.9012, + "step": 6666 + }, + { + "epoch": 1.3169585967367132, + "grad_norm": 2.375, + "learning_rate": 5.928251389034338e-06, + "loss": 0.958, + "step": 6667 + }, + { + "epoch": 1.3171584917918093, + "grad_norm": 2.109375, + "learning_rate": 5.927215886458377e-06, + "loss": 0.9074, + "step": 6668 + }, + { + "epoch": 1.3173583868469054, + "grad_norm": 2.171875, + "learning_rate": 5.926180342695587e-06, + "loss": 0.9661, + "step": 6669 + }, + { + "epoch": 1.3175582819020015, + "grad_norm": 2.1875, + "learning_rate": 5.925144757791964e-06, + "loss": 0.9343, + "step": 6670 + }, + { + "epoch": 1.3177581769570974, + "grad_norm": 2.09375, + "learning_rate": 5.92410913179351e-06, + "loss": 0.9184, + "step": 6671 + }, + { + "epoch": 1.3179580720121935, + "grad_norm": 2.046875, + "learning_rate": 5.923073464746225e-06, + "loss": 0.9682, + "step": 6672 + }, + { + "epoch": 1.3181579670672896, + "grad_norm": 2.15625, + "learning_rate": 5.9220377566961175e-06, + "loss": 1.0067, + "step": 6673 + }, + { + "epoch": 1.3183578621223857, + "grad_norm": 2.109375, + "learning_rate": 5.921002007689191e-06, + "loss": 0.9897, + "step": 6674 + }, + { + "epoch": 1.3185577571774818, + "grad_norm": 2.265625, + "learning_rate": 5.919966217771454e-06, + "loss": 0.8978, + "step": 6675 + }, + { + "epoch": 1.318757652232578, + "grad_norm": 2.265625, + "learning_rate": 5.9189303869889145e-06, + "loss": 1.0046, + "step": 6676 + }, + { + "epoch": 1.318957547287674, + "grad_norm": 2.140625, + "learning_rate": 5.9178945153875856e-06, + "loss": 0.981, + "step": 6677 + }, + { + "epoch": 1.31915744234277, + "grad_norm": 2.140625, + "learning_rate": 5.916858603013482e-06, + "loss": 0.9779, + "step": 6678 + }, + { + "epoch": 1.3193573373978662, + "grad_norm": 2.21875, + "learning_rate": 5.915822649912616e-06, + "loss": 1.0136, + "step": 6679 + }, + { + "epoch": 1.3195572324529623, + "grad_norm": 2.171875, + "learning_rate": 5.914786656131007e-06, + "loss": 0.9407, + "step": 6680 + }, + { + "epoch": 1.3197571275080584, + "grad_norm": 2.125, + "learning_rate": 5.913750621714671e-06, + "loss": 1.0132, + "step": 6681 + }, + { + "epoch": 1.3199570225631543, + "grad_norm": 2.359375, + "learning_rate": 5.91271454670963e-06, + "loss": 0.975, + "step": 6682 + }, + { + "epoch": 1.3201569176182504, + "grad_norm": 2.15625, + "learning_rate": 5.911678431161907e-06, + "loss": 0.9969, + "step": 6683 + }, + { + "epoch": 1.3203568126733465, + "grad_norm": 2.109375, + "learning_rate": 5.910642275117525e-06, + "loss": 0.9714, + "step": 6684 + }, + { + "epoch": 1.3205567077284426, + "grad_norm": 2.234375, + "learning_rate": 5.9096060786225095e-06, + "loss": 0.9532, + "step": 6685 + }, + { + "epoch": 1.3207566027835387, + "grad_norm": 2.15625, + "learning_rate": 5.90856984172289e-06, + "loss": 0.9656, + "step": 6686 + }, + { + "epoch": 1.3209564978386348, + "grad_norm": 2.078125, + "learning_rate": 5.907533564464696e-06, + "loss": 0.9187, + "step": 6687 + }, + { + "epoch": 1.3211563928937307, + "grad_norm": 2.078125, + "learning_rate": 5.9064972468939565e-06, + "loss": 0.9709, + "step": 6688 + }, + { + "epoch": 1.3213562879488268, + "grad_norm": 2.125, + "learning_rate": 5.905460889056707e-06, + "loss": 0.9907, + "step": 6689 + }, + { + "epoch": 1.3215561830039229, + "grad_norm": 2.1875, + "learning_rate": 5.904424490998981e-06, + "loss": 0.9996, + "step": 6690 + }, + { + "epoch": 1.321756078059019, + "grad_norm": 2.15625, + "learning_rate": 5.903388052766817e-06, + "loss": 0.9861, + "step": 6691 + }, + { + "epoch": 1.321955973114115, + "grad_norm": 2.296875, + "learning_rate": 5.902351574406251e-06, + "loss": 1.0058, + "step": 6692 + }, + { + "epoch": 1.3221558681692112, + "grad_norm": 2.234375, + "learning_rate": 5.9013150559633245e-06, + "loss": 1.1108, + "step": 6693 + }, + { + "epoch": 1.3223557632243073, + "grad_norm": 2.078125, + "learning_rate": 5.900278497484079e-06, + "loss": 0.9455, + "step": 6694 + }, + { + "epoch": 1.3225556582794034, + "grad_norm": 2.125, + "learning_rate": 5.89924189901456e-06, + "loss": 0.9911, + "step": 6695 + }, + { + "epoch": 1.3227555533344995, + "grad_norm": 2.265625, + "learning_rate": 5.898205260600812e-06, + "loss": 0.8872, + "step": 6696 + }, + { + "epoch": 1.3229554483895956, + "grad_norm": 2.171875, + "learning_rate": 5.897168582288881e-06, + "loss": 0.8304, + "step": 6697 + }, + { + "epoch": 1.3231553434446917, + "grad_norm": 2.09375, + "learning_rate": 5.8961318641248185e-06, + "loss": 0.9498, + "step": 6698 + }, + { + "epoch": 1.3233552384997875, + "grad_norm": 2.25, + "learning_rate": 5.895095106154674e-06, + "loss": 0.9766, + "step": 6699 + }, + { + "epoch": 1.3235551335548836, + "grad_norm": 2.125, + "learning_rate": 5.8940583084245e-06, + "loss": 0.9235, + "step": 6700 + }, + { + "epoch": 1.3237550286099797, + "grad_norm": 2.078125, + "learning_rate": 5.893021470980352e-06, + "loss": 0.9945, + "step": 6701 + }, + { + "epoch": 1.3239549236650758, + "grad_norm": 2.203125, + "learning_rate": 5.891984593868285e-06, + "loss": 1.0195, + "step": 6702 + }, + { + "epoch": 1.324154818720172, + "grad_norm": 2.3125, + "learning_rate": 5.890947677134358e-06, + "loss": 1.0657, + "step": 6703 + }, + { + "epoch": 1.324354713775268, + "grad_norm": 2.3125, + "learning_rate": 5.889910720824631e-06, + "loss": 1.0126, + "step": 6704 + }, + { + "epoch": 1.3245546088303641, + "grad_norm": 2.171875, + "learning_rate": 5.888873724985163e-06, + "loss": 0.9802, + "step": 6705 + }, + { + "epoch": 1.32475450388546, + "grad_norm": 2.171875, + "learning_rate": 5.887836689662021e-06, + "loss": 1.0151, + "step": 6706 + }, + { + "epoch": 1.324954398940556, + "grad_norm": 2.203125, + "learning_rate": 5.886799614901267e-06, + "loss": 1.0367, + "step": 6707 + }, + { + "epoch": 1.3251542939956522, + "grad_norm": 2.15625, + "learning_rate": 5.885762500748968e-06, + "loss": 0.99, + "step": 6708 + }, + { + "epoch": 1.3253541890507483, + "grad_norm": 2.1875, + "learning_rate": 5.8847253472511956e-06, + "loss": 1.0279, + "step": 6709 + }, + { + "epoch": 1.3255540841058444, + "grad_norm": 2.265625, + "learning_rate": 5.883688154454016e-06, + "loss": 0.9555, + "step": 6710 + }, + { + "epoch": 1.3257539791609405, + "grad_norm": 2.1875, + "learning_rate": 5.8826509224035046e-06, + "loss": 0.9638, + "step": 6711 + }, + { + "epoch": 1.3259538742160366, + "grad_norm": 2.078125, + "learning_rate": 5.881613651145732e-06, + "loss": 0.9608, + "step": 6712 + }, + { + "epoch": 1.3261537692711327, + "grad_norm": 2.1875, + "learning_rate": 5.880576340726776e-06, + "loss": 1.0941, + "step": 6713 + }, + { + "epoch": 1.3263536643262288, + "grad_norm": 2.15625, + "learning_rate": 5.8795389911927126e-06, + "loss": 1.0361, + "step": 6714 + }, + { + "epoch": 1.326553559381325, + "grad_norm": 2.0625, + "learning_rate": 5.8785016025896216e-06, + "loss": 0.9403, + "step": 6715 + }, + { + "epoch": 1.326753454436421, + "grad_norm": 2.140625, + "learning_rate": 5.877464174963582e-06, + "loss": 0.9459, + "step": 6716 + }, + { + "epoch": 1.3269533494915169, + "grad_norm": 2.1875, + "learning_rate": 5.876426708360678e-06, + "loss": 1.0179, + "step": 6717 + }, + { + "epoch": 1.327153244546613, + "grad_norm": 2.359375, + "learning_rate": 5.875389202826994e-06, + "loss": 1.0085, + "step": 6718 + }, + { + "epoch": 1.327353139601709, + "grad_norm": 2.15625, + "learning_rate": 5.874351658408614e-06, + "loss": 0.9004, + "step": 6719 + }, + { + "epoch": 1.3275530346568052, + "grad_norm": 2.125, + "learning_rate": 5.873314075151628e-06, + "loss": 1.0149, + "step": 6720 + }, + { + "epoch": 1.3277529297119013, + "grad_norm": 2.203125, + "learning_rate": 5.872276453102122e-06, + "loss": 1.0591, + "step": 6721 + }, + { + "epoch": 1.3279528247669974, + "grad_norm": 2.203125, + "learning_rate": 5.8712387923061905e-06, + "loss": 1.0282, + "step": 6722 + }, + { + "epoch": 1.3281527198220933, + "grad_norm": 2.15625, + "learning_rate": 5.870201092809926e-06, + "loss": 0.917, + "step": 6723 + }, + { + "epoch": 1.3283526148771894, + "grad_norm": 2.109375, + "learning_rate": 5.8691633546594206e-06, + "loss": 0.9753, + "step": 6724 + }, + { + "epoch": 1.3285525099322855, + "grad_norm": 2.109375, + "learning_rate": 5.868125577900772e-06, + "loss": 0.9051, + "step": 6725 + }, + { + "epoch": 1.3287524049873816, + "grad_norm": 2.0625, + "learning_rate": 5.867087762580079e-06, + "loss": 0.913, + "step": 6726 + }, + { + "epoch": 1.3289523000424777, + "grad_norm": 2.1875, + "learning_rate": 5.866049908743439e-06, + "loss": 1.0433, + "step": 6727 + }, + { + "epoch": 1.3291521950975738, + "grad_norm": 2.09375, + "learning_rate": 5.865012016436954e-06, + "loss": 1.0055, + "step": 6728 + }, + { + "epoch": 1.3293520901526699, + "grad_norm": 2.109375, + "learning_rate": 5.863974085706729e-06, + "loss": 0.9556, + "step": 6729 + }, + { + "epoch": 1.329551985207766, + "grad_norm": 2.34375, + "learning_rate": 5.862936116598866e-06, + "loss": 0.9934, + "step": 6730 + }, + { + "epoch": 1.329751880262862, + "grad_norm": 2.015625, + "learning_rate": 5.861898109159473e-06, + "loss": 0.8601, + "step": 6731 + }, + { + "epoch": 1.3299517753179582, + "grad_norm": 2.265625, + "learning_rate": 5.860860063434659e-06, + "loss": 1.0253, + "step": 6732 + }, + { + "epoch": 1.3301516703730543, + "grad_norm": 2.015625, + "learning_rate": 5.859821979470532e-06, + "loss": 0.8942, + "step": 6733 + }, + { + "epoch": 1.3303515654281501, + "grad_norm": 2.15625, + "learning_rate": 5.858783857313205e-06, + "loss": 1.0331, + "step": 6734 + }, + { + "epoch": 1.3305514604832462, + "grad_norm": 2.171875, + "learning_rate": 5.857745697008792e-06, + "loss": 0.9352, + "step": 6735 + }, + { + "epoch": 1.3307513555383423, + "grad_norm": 2.21875, + "learning_rate": 5.856707498603404e-06, + "loss": 1.0373, + "step": 6736 + }, + { + "epoch": 1.3309512505934384, + "grad_norm": 2.203125, + "learning_rate": 5.855669262143163e-06, + "loss": 0.9652, + "step": 6737 + }, + { + "epoch": 1.3311511456485345, + "grad_norm": 2.125, + "learning_rate": 5.854630987674184e-06, + "loss": 0.8334, + "step": 6738 + }, + { + "epoch": 1.3313510407036306, + "grad_norm": 2.15625, + "learning_rate": 5.853592675242587e-06, + "loss": 0.8961, + "step": 6739 + }, + { + "epoch": 1.3315509357587267, + "grad_norm": 2.203125, + "learning_rate": 5.852554324894495e-06, + "loss": 0.9398, + "step": 6740 + }, + { + "epoch": 1.3317508308138226, + "grad_norm": 2.1875, + "learning_rate": 5.851515936676031e-06, + "loss": 1.0227, + "step": 6741 + }, + { + "epoch": 1.3319507258689187, + "grad_norm": 2.078125, + "learning_rate": 5.85047751063332e-06, + "loss": 0.9559, + "step": 6742 + }, + { + "epoch": 1.3321506209240148, + "grad_norm": 2.34375, + "learning_rate": 5.84943904681249e-06, + "loss": 1.0473, + "step": 6743 + }, + { + "epoch": 1.332350515979111, + "grad_norm": 2.234375, + "learning_rate": 5.848400545259667e-06, + "loss": 0.9805, + "step": 6744 + }, + { + "epoch": 1.332550411034207, + "grad_norm": 2.15625, + "learning_rate": 5.847362006020982e-06, + "loss": 1.0385, + "step": 6745 + }, + { + "epoch": 1.3327503060893031, + "grad_norm": 2.015625, + "learning_rate": 5.846323429142569e-06, + "loss": 0.879, + "step": 6746 + }, + { + "epoch": 1.3329502011443992, + "grad_norm": 2.171875, + "learning_rate": 5.845284814670556e-06, + "loss": 0.9384, + "step": 6747 + }, + { + "epoch": 1.3331500961994953, + "grad_norm": 2.171875, + "learning_rate": 5.844246162651085e-06, + "loss": 1.0013, + "step": 6748 + }, + { + "epoch": 1.3333499912545914, + "grad_norm": 2.21875, + "learning_rate": 5.843207473130289e-06, + "loss": 1.0776, + "step": 6749 + }, + { + "epoch": 1.3335498863096875, + "grad_norm": 2.09375, + "learning_rate": 5.842168746154306e-06, + "loss": 1.0119, + "step": 6750 + }, + { + "epoch": 1.3337497813647836, + "grad_norm": 2.125, + "learning_rate": 5.841129981769279e-06, + "loss": 0.9853, + "step": 6751 + }, + { + "epoch": 1.3339496764198795, + "grad_norm": 2.03125, + "learning_rate": 5.840091180021348e-06, + "loss": 0.8704, + "step": 6752 + }, + { + "epoch": 1.3341495714749756, + "grad_norm": 2.171875, + "learning_rate": 5.839052340956654e-06, + "loss": 0.9847, + "step": 6753 + }, + { + "epoch": 1.3343494665300717, + "grad_norm": 2.171875, + "learning_rate": 5.838013464621347e-06, + "loss": 0.998, + "step": 6754 + }, + { + "epoch": 1.3345493615851678, + "grad_norm": 2.21875, + "learning_rate": 5.836974551061571e-06, + "loss": 1.0238, + "step": 6755 + }, + { + "epoch": 1.334749256640264, + "grad_norm": 2.125, + "learning_rate": 5.835935600323473e-06, + "loss": 1.0121, + "step": 6756 + }, + { + "epoch": 1.33494915169536, + "grad_norm": 2.109375, + "learning_rate": 5.834896612453209e-06, + "loss": 0.9981, + "step": 6757 + }, + { + "epoch": 1.335149046750456, + "grad_norm": 2.28125, + "learning_rate": 5.8338575874969235e-06, + "loss": 0.937, + "step": 6758 + }, + { + "epoch": 1.335348941805552, + "grad_norm": 2.140625, + "learning_rate": 5.832818525500773e-06, + "loss": 0.9186, + "step": 6759 + }, + { + "epoch": 1.335548836860648, + "grad_norm": 2.25, + "learning_rate": 5.831779426510914e-06, + "loss": 1.0156, + "step": 6760 + }, + { + "epoch": 1.3357487319157442, + "grad_norm": 2.171875, + "learning_rate": 5.830740290573501e-06, + "loss": 0.9893, + "step": 6761 + }, + { + "epoch": 1.3359486269708403, + "grad_norm": 2.25, + "learning_rate": 5.829701117734692e-06, + "loss": 1.0315, + "step": 6762 + }, + { + "epoch": 1.3361485220259364, + "grad_norm": 2.109375, + "learning_rate": 5.828661908040649e-06, + "loss": 0.9556, + "step": 6763 + }, + { + "epoch": 1.3363484170810325, + "grad_norm": 2.15625, + "learning_rate": 5.827622661537533e-06, + "loss": 0.928, + "step": 6764 + }, + { + "epoch": 1.3365483121361286, + "grad_norm": 2.21875, + "learning_rate": 5.826583378271506e-06, + "loss": 1.0471, + "step": 6765 + }, + { + "epoch": 1.3367482071912247, + "grad_norm": 2.171875, + "learning_rate": 5.825544058288735e-06, + "loss": 1.0082, + "step": 6766 + }, + { + "epoch": 1.3369481022463208, + "grad_norm": 2.125, + "learning_rate": 5.824504701635383e-06, + "loss": 1.0759, + "step": 6767 + }, + { + "epoch": 1.3371479973014169, + "grad_norm": 2.046875, + "learning_rate": 5.823465308357621e-06, + "loss": 0.99, + "step": 6768 + }, + { + "epoch": 1.337347892356513, + "grad_norm": 2.0625, + "learning_rate": 5.822425878501618e-06, + "loss": 1.0142, + "step": 6769 + }, + { + "epoch": 1.3375477874116088, + "grad_norm": 2.1875, + "learning_rate": 5.821386412113546e-06, + "loss": 0.9711, + "step": 6770 + }, + { + "epoch": 1.337747682466705, + "grad_norm": 2.234375, + "learning_rate": 5.820346909239576e-06, + "loss": 1.06, + "step": 6771 + }, + { + "epoch": 1.337947577521801, + "grad_norm": 2.1875, + "learning_rate": 5.819307369925884e-06, + "loss": 0.9441, + "step": 6772 + }, + { + "epoch": 1.3381474725768971, + "grad_norm": 2.15625, + "learning_rate": 5.8182677942186485e-06, + "loss": 0.9636, + "step": 6773 + }, + { + "epoch": 1.3383473676319932, + "grad_norm": 2.125, + "learning_rate": 5.817228182164043e-06, + "loss": 0.8985, + "step": 6774 + }, + { + "epoch": 1.3385472626870893, + "grad_norm": 2.09375, + "learning_rate": 5.816188533808249e-06, + "loss": 1.0072, + "step": 6775 + }, + { + "epoch": 1.3387471577421852, + "grad_norm": 2.328125, + "learning_rate": 5.815148849197447e-06, + "loss": 0.993, + "step": 6776 + }, + { + "epoch": 1.3389470527972813, + "grad_norm": 2.171875, + "learning_rate": 5.814109128377822e-06, + "loss": 0.9225, + "step": 6777 + }, + { + "epoch": 1.3391469478523774, + "grad_norm": 2.078125, + "learning_rate": 5.8130693713955545e-06, + "loss": 0.8882, + "step": 6778 + }, + { + "epoch": 1.3393468429074735, + "grad_norm": 2.109375, + "learning_rate": 5.8120295782968325e-06, + "loss": 0.8944, + "step": 6779 + }, + { + "epoch": 1.3395467379625696, + "grad_norm": 2.140625, + "learning_rate": 5.810989749127844e-06, + "loss": 0.921, + "step": 6780 + }, + { + "epoch": 1.3397466330176657, + "grad_norm": 2.09375, + "learning_rate": 5.809949883934778e-06, + "loss": 0.9186, + "step": 6781 + }, + { + "epoch": 1.3399465280727618, + "grad_norm": 2.1875, + "learning_rate": 5.808909982763825e-06, + "loss": 0.9882, + "step": 6782 + }, + { + "epoch": 1.340146423127858, + "grad_norm": 2.265625, + "learning_rate": 5.807870045661176e-06, + "loss": 0.9886, + "step": 6783 + }, + { + "epoch": 1.340346318182954, + "grad_norm": 2.125, + "learning_rate": 5.806830072673025e-06, + "loss": 0.9413, + "step": 6784 + }, + { + "epoch": 1.3405462132380501, + "grad_norm": 2.15625, + "learning_rate": 5.80579006384557e-06, + "loss": 0.9557, + "step": 6785 + }, + { + "epoch": 1.3407461082931462, + "grad_norm": 2.21875, + "learning_rate": 5.804750019225005e-06, + "loss": 1.0898, + "step": 6786 + }, + { + "epoch": 1.340946003348242, + "grad_norm": 2.140625, + "learning_rate": 5.8037099388575305e-06, + "loss": 0.9668, + "step": 6787 + }, + { + "epoch": 1.3411458984033382, + "grad_norm": 2.234375, + "learning_rate": 5.802669822789349e-06, + "loss": 0.9791, + "step": 6788 + }, + { + "epoch": 1.3413457934584343, + "grad_norm": 2.28125, + "learning_rate": 5.801629671066657e-06, + "loss": 1.0095, + "step": 6789 + }, + { + "epoch": 1.3415456885135304, + "grad_norm": 2.171875, + "learning_rate": 5.80058948373566e-06, + "loss": 1.0419, + "step": 6790 + }, + { + "epoch": 1.3417455835686265, + "grad_norm": 2.1875, + "learning_rate": 5.799549260842565e-06, + "loss": 1.065, + "step": 6791 + }, + { + "epoch": 1.3419454786237226, + "grad_norm": 2.125, + "learning_rate": 5.7985090024335766e-06, + "loss": 0.9716, + "step": 6792 + }, + { + "epoch": 1.3421453736788187, + "grad_norm": 2.1875, + "learning_rate": 5.797468708554903e-06, + "loss": 0.963, + "step": 6793 + }, + { + "epoch": 1.3423452687339146, + "grad_norm": 2.296875, + "learning_rate": 5.796428379252756e-06, + "loss": 0.9844, + "step": 6794 + }, + { + "epoch": 1.3425451637890107, + "grad_norm": 2.0625, + "learning_rate": 5.795388014573345e-06, + "loss": 0.9513, + "step": 6795 + }, + { + "epoch": 1.3427450588441068, + "grad_norm": 2.21875, + "learning_rate": 5.794347614562884e-06, + "loss": 1.0168, + "step": 6796 + }, + { + "epoch": 1.3429449538992029, + "grad_norm": 2.109375, + "learning_rate": 5.793307179267585e-06, + "loss": 1.0011, + "step": 6797 + }, + { + "epoch": 1.343144848954299, + "grad_norm": 2.0625, + "learning_rate": 5.792266708733667e-06, + "loss": 0.9518, + "step": 6798 + }, + { + "epoch": 1.343344744009395, + "grad_norm": 2.109375, + "learning_rate": 5.791226203007346e-06, + "loss": 1.0294, + "step": 6799 + }, + { + "epoch": 1.3435446390644912, + "grad_norm": 2.0625, + "learning_rate": 5.790185662134842e-06, + "loss": 0.9154, + "step": 6800 + }, + { + "epoch": 1.3437445341195873, + "grad_norm": 2.125, + "learning_rate": 5.789145086162374e-06, + "loss": 0.9629, + "step": 6801 + }, + { + "epoch": 1.3439444291746834, + "grad_norm": 2.203125, + "learning_rate": 5.788104475136168e-06, + "loss": 0.9986, + "step": 6802 + }, + { + "epoch": 1.3441443242297795, + "grad_norm": 2.078125, + "learning_rate": 5.7870638291024436e-06, + "loss": 1.0083, + "step": 6803 + }, + { + "epoch": 1.3443442192848756, + "grad_norm": 2.3125, + "learning_rate": 5.786023148107429e-06, + "loss": 1.0264, + "step": 6804 + }, + { + "epoch": 1.3445441143399715, + "grad_norm": 2.078125, + "learning_rate": 5.7849824321973505e-06, + "loss": 0.8659, + "step": 6805 + }, + { + "epoch": 1.3447440093950676, + "grad_norm": 2.234375, + "learning_rate": 5.783941681418435e-06, + "loss": 0.9921, + "step": 6806 + }, + { + "epoch": 1.3449439044501637, + "grad_norm": 2.203125, + "learning_rate": 5.782900895816913e-06, + "loss": 0.9895, + "step": 6807 + }, + { + "epoch": 1.3451437995052598, + "grad_norm": 2.078125, + "learning_rate": 5.781860075439019e-06, + "loss": 1.0123, + "step": 6808 + }, + { + "epoch": 1.3453436945603559, + "grad_norm": 2.140625, + "learning_rate": 5.780819220330982e-06, + "loss": 0.9928, + "step": 6809 + }, + { + "epoch": 1.345543589615452, + "grad_norm": 2.15625, + "learning_rate": 5.77977833053904e-06, + "loss": 0.9065, + "step": 6810 + }, + { + "epoch": 1.3457434846705478, + "grad_norm": 2.265625, + "learning_rate": 5.778737406109427e-06, + "loss": 0.9993, + "step": 6811 + }, + { + "epoch": 1.345943379725644, + "grad_norm": 2.0625, + "learning_rate": 5.777696447088381e-06, + "loss": 0.9916, + "step": 6812 + }, + { + "epoch": 1.34614327478074, + "grad_norm": 2.125, + "learning_rate": 5.7766554535221435e-06, + "loss": 0.9238, + "step": 6813 + }, + { + "epoch": 1.3463431698358361, + "grad_norm": 2.234375, + "learning_rate": 5.775614425456953e-06, + "loss": 0.9169, + "step": 6814 + }, + { + "epoch": 1.3465430648909322, + "grad_norm": 2.125, + "learning_rate": 5.774573362939051e-06, + "loss": 0.8883, + "step": 6815 + }, + { + "epoch": 1.3467429599460283, + "grad_norm": 2.0625, + "learning_rate": 5.773532266014686e-06, + "loss": 0.8809, + "step": 6816 + }, + { + "epoch": 1.3469428550011244, + "grad_norm": 2.203125, + "learning_rate": 5.772491134730097e-06, + "loss": 0.9436, + "step": 6817 + }, + { + "epoch": 1.3471427500562205, + "grad_norm": 2.140625, + "learning_rate": 5.771449969131536e-06, + "loss": 0.8461, + "step": 6818 + }, + { + "epoch": 1.3473426451113166, + "grad_norm": 2.203125, + "learning_rate": 5.770408769265252e-06, + "loss": 1.0222, + "step": 6819 + }, + { + "epoch": 1.3475425401664127, + "grad_norm": 2.09375, + "learning_rate": 5.7693675351774905e-06, + "loss": 0.9295, + "step": 6820 + }, + { + "epoch": 1.3477424352215088, + "grad_norm": 2.125, + "learning_rate": 5.768326266914505e-06, + "loss": 0.9204, + "step": 6821 + }, + { + "epoch": 1.3479423302766047, + "grad_norm": 2.15625, + "learning_rate": 5.767284964522549e-06, + "loss": 1.0075, + "step": 6822 + }, + { + "epoch": 1.3481422253317008, + "grad_norm": 2.03125, + "learning_rate": 5.766243628047876e-06, + "loss": 0.8984, + "step": 6823 + }, + { + "epoch": 1.348342120386797, + "grad_norm": 2.15625, + "learning_rate": 5.765202257536744e-06, + "loss": 0.8898, + "step": 6824 + }, + { + "epoch": 1.348542015441893, + "grad_norm": 2.296875, + "learning_rate": 5.76416085303541e-06, + "loss": 1.1051, + "step": 6825 + }, + { + "epoch": 1.348741910496989, + "grad_norm": 2.09375, + "learning_rate": 5.7631194145901315e-06, + "loss": 0.9619, + "step": 6826 + }, + { + "epoch": 1.3489418055520852, + "grad_norm": 2.109375, + "learning_rate": 5.76207794224717e-06, + "loss": 0.9741, + "step": 6827 + }, + { + "epoch": 1.3491417006071813, + "grad_norm": 2.15625, + "learning_rate": 5.761036436052788e-06, + "loss": 1.0536, + "step": 6828 + }, + { + "epoch": 1.3493415956622772, + "grad_norm": 2.0625, + "learning_rate": 5.75999489605325e-06, + "loss": 0.9193, + "step": 6829 + }, + { + "epoch": 1.3495414907173733, + "grad_norm": 2.171875, + "learning_rate": 5.7589533222948176e-06, + "loss": 0.9696, + "step": 6830 + }, + { + "epoch": 1.3497413857724694, + "grad_norm": 2.078125, + "learning_rate": 5.757911714823761e-06, + "loss": 0.9368, + "step": 6831 + }, + { + "epoch": 1.3499412808275655, + "grad_norm": 2.1875, + "learning_rate": 5.756870073686347e-06, + "loss": 1.0816, + "step": 6832 + }, + { + "epoch": 1.3501411758826616, + "grad_norm": 2.140625, + "learning_rate": 5.755828398928845e-06, + "loss": 1.013, + "step": 6833 + }, + { + "epoch": 1.3503410709377577, + "grad_norm": 2.0625, + "learning_rate": 5.754786690597527e-06, + "loss": 0.931, + "step": 6834 + }, + { + "epoch": 1.3505409659928538, + "grad_norm": 2.125, + "learning_rate": 5.753744948738663e-06, + "loss": 1.0375, + "step": 6835 + }, + { + "epoch": 1.3507408610479499, + "grad_norm": 2.359375, + "learning_rate": 5.75270317339853e-06, + "loss": 0.9461, + "step": 6836 + }, + { + "epoch": 1.350940756103046, + "grad_norm": 2.125, + "learning_rate": 5.751661364623403e-06, + "loss": 0.9873, + "step": 6837 + }, + { + "epoch": 1.351140651158142, + "grad_norm": 2.34375, + "learning_rate": 5.750619522459558e-06, + "loss": 1.1163, + "step": 6838 + }, + { + "epoch": 1.3513405462132382, + "grad_norm": 2.140625, + "learning_rate": 5.749577646953274e-06, + "loss": 1.0067, + "step": 6839 + }, + { + "epoch": 1.351540441268334, + "grad_norm": 1.9921875, + "learning_rate": 5.74853573815083e-06, + "loss": 0.8746, + "step": 6840 + }, + { + "epoch": 1.3517403363234302, + "grad_norm": 2.15625, + "learning_rate": 5.74749379609851e-06, + "loss": 0.9809, + "step": 6841 + }, + { + "epoch": 1.3519402313785263, + "grad_norm": 2.15625, + "learning_rate": 5.746451820842595e-06, + "loss": 0.9222, + "step": 6842 + }, + { + "epoch": 1.3521401264336224, + "grad_norm": 2.140625, + "learning_rate": 5.7454098124293675e-06, + "loss": 0.9275, + "step": 6843 + }, + { + "epoch": 1.3523400214887185, + "grad_norm": 2.203125, + "learning_rate": 5.744367770905119e-06, + "loss": 1.0583, + "step": 6844 + }, + { + "epoch": 1.3525399165438146, + "grad_norm": 2.140625, + "learning_rate": 5.743325696316132e-06, + "loss": 0.9374, + "step": 6845 + }, + { + "epoch": 1.3527398115989104, + "grad_norm": 2.109375, + "learning_rate": 5.742283588708697e-06, + "loss": 1.0193, + "step": 6846 + }, + { + "epoch": 1.3529397066540065, + "grad_norm": 2.21875, + "learning_rate": 5.741241448129105e-06, + "loss": 1.0164, + "step": 6847 + }, + { + "epoch": 1.3531396017091026, + "grad_norm": 2.125, + "learning_rate": 5.740199274623647e-06, + "loss": 0.954, + "step": 6848 + }, + { + "epoch": 1.3533394967641987, + "grad_norm": 2.109375, + "learning_rate": 5.7391570682386165e-06, + "loss": 0.9195, + "step": 6849 + }, + { + "epoch": 1.3535393918192948, + "grad_norm": 2.203125, + "learning_rate": 5.738114829020307e-06, + "loss": 1.0024, + "step": 6850 + }, + { + "epoch": 1.353739286874391, + "grad_norm": 2.234375, + "learning_rate": 5.737072557015016e-06, + "loss": 1.0244, + "step": 6851 + }, + { + "epoch": 1.353939181929487, + "grad_norm": 2.171875, + "learning_rate": 5.73603025226904e-06, + "loss": 0.9202, + "step": 6852 + }, + { + "epoch": 1.3541390769845831, + "grad_norm": 2.125, + "learning_rate": 5.7349879148286804e-06, + "loss": 1.0214, + "step": 6853 + }, + { + "epoch": 1.3543389720396792, + "grad_norm": 2.109375, + "learning_rate": 5.733945544740235e-06, + "loss": 0.8318, + "step": 6854 + }, + { + "epoch": 1.3545388670947753, + "grad_norm": 2.140625, + "learning_rate": 5.732903142050008e-06, + "loss": 0.9818, + "step": 6855 + }, + { + "epoch": 1.3547387621498714, + "grad_norm": 2.21875, + "learning_rate": 5.731860706804301e-06, + "loss": 0.9657, + "step": 6856 + }, + { + "epoch": 1.3549386572049673, + "grad_norm": 2.171875, + "learning_rate": 5.7308182390494185e-06, + "loss": 1.0145, + "step": 6857 + }, + { + "epoch": 1.3551385522600634, + "grad_norm": 2.078125, + "learning_rate": 5.729775738831669e-06, + "loss": 0.9326, + "step": 6858 + }, + { + "epoch": 1.3553384473151595, + "grad_norm": 2.21875, + "learning_rate": 5.728733206197359e-06, + "loss": 1.0115, + "step": 6859 + }, + { + "epoch": 1.3555383423702556, + "grad_norm": 1.96875, + "learning_rate": 5.727690641192797e-06, + "loss": 0.9357, + "step": 6860 + }, + { + "epoch": 1.3557382374253517, + "grad_norm": 2.140625, + "learning_rate": 5.726648043864296e-06, + "loss": 1.0141, + "step": 6861 + }, + { + "epoch": 1.3559381324804478, + "grad_norm": 2.1875, + "learning_rate": 5.725605414258165e-06, + "loss": 0.988, + "step": 6862 + }, + { + "epoch": 1.356138027535544, + "grad_norm": 2.328125, + "learning_rate": 5.724562752420719e-06, + "loss": 0.9958, + "step": 6863 + }, + { + "epoch": 1.3563379225906398, + "grad_norm": 2.15625, + "learning_rate": 5.723520058398275e-06, + "loss": 1.0179, + "step": 6864 + }, + { + "epoch": 1.3565378176457359, + "grad_norm": 2.09375, + "learning_rate": 5.7224773322371466e-06, + "loss": 0.935, + "step": 6865 + }, + { + "epoch": 1.356737712700832, + "grad_norm": 1.9453125, + "learning_rate": 5.721434573983651e-06, + "loss": 0.8396, + "step": 6866 + }, + { + "epoch": 1.356937607755928, + "grad_norm": 2.171875, + "learning_rate": 5.720391783684109e-06, + "loss": 0.9771, + "step": 6867 + }, + { + "epoch": 1.3571375028110242, + "grad_norm": 2.0, + "learning_rate": 5.7193489613848414e-06, + "loss": 0.8618, + "step": 6868 + }, + { + "epoch": 1.3573373978661203, + "grad_norm": 2.15625, + "learning_rate": 5.718306107132169e-06, + "loss": 0.9778, + "step": 6869 + }, + { + "epoch": 1.3575372929212164, + "grad_norm": 2.140625, + "learning_rate": 5.717263220972418e-06, + "loss": 0.9241, + "step": 6870 + }, + { + "epoch": 1.3577371879763125, + "grad_norm": 2.109375, + "learning_rate": 5.716220302951909e-06, + "loss": 1.0834, + "step": 6871 + }, + { + "epoch": 1.3579370830314086, + "grad_norm": 2.15625, + "learning_rate": 5.715177353116972e-06, + "loss": 0.9576, + "step": 6872 + }, + { + "epoch": 1.3581369780865047, + "grad_norm": 2.140625, + "learning_rate": 5.714134371513932e-06, + "loss": 0.9635, + "step": 6873 + }, + { + "epoch": 1.3583368731416008, + "grad_norm": 2.34375, + "learning_rate": 5.71309135818912e-06, + "loss": 0.9623, + "step": 6874 + }, + { + "epoch": 1.3585367681966967, + "grad_norm": 2.140625, + "learning_rate": 5.712048313188867e-06, + "loss": 0.9739, + "step": 6875 + }, + { + "epoch": 1.3587366632517928, + "grad_norm": 2.265625, + "learning_rate": 5.711005236559503e-06, + "loss": 1.0418, + "step": 6876 + }, + { + "epoch": 1.3589365583068889, + "grad_norm": 2.203125, + "learning_rate": 5.7099621283473625e-06, + "loss": 0.9763, + "step": 6877 + }, + { + "epoch": 1.359136453361985, + "grad_norm": 2.125, + "learning_rate": 5.708918988598781e-06, + "loss": 1.0662, + "step": 6878 + }, + { + "epoch": 1.359336348417081, + "grad_norm": 2.140625, + "learning_rate": 5.707875817360092e-06, + "loss": 0.848, + "step": 6879 + }, + { + "epoch": 1.3595362434721772, + "grad_norm": 2.078125, + "learning_rate": 5.706832614677637e-06, + "loss": 1.0198, + "step": 6880 + }, + { + "epoch": 1.3597361385272733, + "grad_norm": 2.21875, + "learning_rate": 5.7057893805977525e-06, + "loss": 1.109, + "step": 6881 + }, + { + "epoch": 1.3599360335823691, + "grad_norm": 2.15625, + "learning_rate": 5.704746115166777e-06, + "loss": 0.9673, + "step": 6882 + }, + { + "epoch": 1.3601359286374652, + "grad_norm": 2.34375, + "learning_rate": 5.703702818431055e-06, + "loss": 1.0013, + "step": 6883 + }, + { + "epoch": 1.3603358236925613, + "grad_norm": 2.234375, + "learning_rate": 5.70265949043693e-06, + "loss": 1.0541, + "step": 6884 + }, + { + "epoch": 1.3605357187476574, + "grad_norm": 2.171875, + "learning_rate": 5.701616131230745e-06, + "loss": 0.9834, + "step": 6885 + }, + { + "epoch": 1.3607356138027535, + "grad_norm": 2.125, + "learning_rate": 5.700572740858847e-06, + "loss": 0.9302, + "step": 6886 + }, + { + "epoch": 1.3609355088578496, + "grad_norm": 2.09375, + "learning_rate": 5.699529319367581e-06, + "loss": 1.0564, + "step": 6887 + }, + { + "epoch": 1.3611354039129457, + "grad_norm": 2.234375, + "learning_rate": 5.698485866803298e-06, + "loss": 1.0996, + "step": 6888 + }, + { + "epoch": 1.3613352989680418, + "grad_norm": 2.21875, + "learning_rate": 5.6974423832123494e-06, + "loss": 0.9551, + "step": 6889 + }, + { + "epoch": 1.361535194023138, + "grad_norm": 2.171875, + "learning_rate": 5.696398868641082e-06, + "loss": 0.9805, + "step": 6890 + }, + { + "epoch": 1.361735089078234, + "grad_norm": 2.140625, + "learning_rate": 5.695355323135852e-06, + "loss": 1.0134, + "step": 6891 + }, + { + "epoch": 1.3619349841333301, + "grad_norm": 2.140625, + "learning_rate": 5.694311746743013e-06, + "loss": 0.9429, + "step": 6892 + }, + { + "epoch": 1.362134879188426, + "grad_norm": 2.046875, + "learning_rate": 5.693268139508921e-06, + "loss": 0.8925, + "step": 6893 + }, + { + "epoch": 1.3623347742435221, + "grad_norm": 2.421875, + "learning_rate": 5.6922245014799316e-06, + "loss": 1.0224, + "step": 6894 + }, + { + "epoch": 1.3625346692986182, + "grad_norm": 2.3125, + "learning_rate": 5.6911808327024035e-06, + "loss": 1.0771, + "step": 6895 + }, + { + "epoch": 1.3627345643537143, + "grad_norm": 2.125, + "learning_rate": 5.6901371332226975e-06, + "loss": 0.9291, + "step": 6896 + }, + { + "epoch": 1.3629344594088104, + "grad_norm": 2.125, + "learning_rate": 5.689093403087173e-06, + "loss": 1.0084, + "step": 6897 + }, + { + "epoch": 1.3631343544639065, + "grad_norm": 2.28125, + "learning_rate": 5.6880496423421936e-06, + "loss": 1.0037, + "step": 6898 + }, + { + "epoch": 1.3633342495190024, + "grad_norm": 2.265625, + "learning_rate": 5.687005851034122e-06, + "loss": 1.0394, + "step": 6899 + }, + { + "epoch": 1.3635341445740985, + "grad_norm": 2.109375, + "learning_rate": 5.685962029209325e-06, + "loss": 1.0239, + "step": 6900 + }, + { + "epoch": 1.3637340396291946, + "grad_norm": 2.34375, + "learning_rate": 5.684918176914167e-06, + "loss": 1.044, + "step": 6901 + }, + { + "epoch": 1.3639339346842907, + "grad_norm": 2.09375, + "learning_rate": 5.683874294195017e-06, + "loss": 0.9391, + "step": 6902 + }, + { + "epoch": 1.3641338297393868, + "grad_norm": 2.015625, + "learning_rate": 5.682830381098243e-06, + "loss": 0.9159, + "step": 6903 + }, + { + "epoch": 1.364333724794483, + "grad_norm": 2.046875, + "learning_rate": 5.681786437670217e-06, + "loss": 0.9494, + "step": 6904 + }, + { + "epoch": 1.364533619849579, + "grad_norm": 2.15625, + "learning_rate": 5.680742463957311e-06, + "loss": 0.9288, + "step": 6905 + }, + { + "epoch": 1.364733514904675, + "grad_norm": 2.15625, + "learning_rate": 5.679698460005897e-06, + "loss": 0.982, + "step": 6906 + }, + { + "epoch": 1.3649334099597712, + "grad_norm": 2.09375, + "learning_rate": 5.678654425862349e-06, + "loss": 0.9884, + "step": 6907 + }, + { + "epoch": 1.3651333050148673, + "grad_norm": 2.1875, + "learning_rate": 5.677610361573045e-06, + "loss": 0.978, + "step": 6908 + }, + { + "epoch": 1.3653332000699634, + "grad_norm": 2.171875, + "learning_rate": 5.6765662671843615e-06, + "loss": 1.0478, + "step": 6909 + }, + { + "epoch": 1.3655330951250593, + "grad_norm": 2.078125, + "learning_rate": 5.675522142742675e-06, + "loss": 0.971, + "step": 6910 + }, + { + "epoch": 1.3657329901801554, + "grad_norm": 2.28125, + "learning_rate": 5.674477988294369e-06, + "loss": 1.0029, + "step": 6911 + }, + { + "epoch": 1.3659328852352515, + "grad_norm": 2.234375, + "learning_rate": 5.6734338038858225e-06, + "loss": 0.9451, + "step": 6912 + }, + { + "epoch": 1.3661327802903476, + "grad_norm": 2.1875, + "learning_rate": 5.672389589563417e-06, + "loss": 0.9869, + "step": 6913 + }, + { + "epoch": 1.3663326753454437, + "grad_norm": 2.171875, + "learning_rate": 5.671345345373538e-06, + "loss": 1.0203, + "step": 6914 + }, + { + "epoch": 1.3665325704005398, + "grad_norm": 2.171875, + "learning_rate": 5.6703010713625715e-06, + "loss": 1.0001, + "step": 6915 + }, + { + "epoch": 1.3667324654556359, + "grad_norm": 2.078125, + "learning_rate": 5.669256767576903e-06, + "loss": 0.9559, + "step": 6916 + }, + { + "epoch": 1.3669323605107317, + "grad_norm": 2.25, + "learning_rate": 5.6682124340629195e-06, + "loss": 0.9698, + "step": 6917 + }, + { + "epoch": 1.3671322555658278, + "grad_norm": 2.25, + "learning_rate": 5.667168070867012e-06, + "loss": 1.0643, + "step": 6918 + }, + { + "epoch": 1.367332150620924, + "grad_norm": 2.15625, + "learning_rate": 5.666123678035569e-06, + "loss": 1.0569, + "step": 6919 + }, + { + "epoch": 1.36753204567602, + "grad_norm": 2.21875, + "learning_rate": 5.665079255614984e-06, + "loss": 0.9833, + "step": 6920 + }, + { + "epoch": 1.3677319407311161, + "grad_norm": 2.078125, + "learning_rate": 5.664034803651649e-06, + "loss": 0.9298, + "step": 6921 + }, + { + "epoch": 1.3679318357862122, + "grad_norm": 2.21875, + "learning_rate": 5.662990322191959e-06, + "loss": 1.0319, + "step": 6922 + }, + { + "epoch": 1.3681317308413083, + "grad_norm": 2.171875, + "learning_rate": 5.66194581128231e-06, + "loss": 1.0111, + "step": 6923 + }, + { + "epoch": 1.3683316258964044, + "grad_norm": 2.453125, + "learning_rate": 5.660901270969098e-06, + "loss": 1.0736, + "step": 6924 + }, + { + "epoch": 1.3685315209515005, + "grad_norm": 2.171875, + "learning_rate": 5.659856701298724e-06, + "loss": 1.0559, + "step": 6925 + }, + { + "epoch": 1.3687314160065966, + "grad_norm": 2.09375, + "learning_rate": 5.658812102317583e-06, + "loss": 0.9378, + "step": 6926 + }, + { + "epoch": 1.3689313110616927, + "grad_norm": 2.21875, + "learning_rate": 5.657767474072082e-06, + "loss": 0.8943, + "step": 6927 + }, + { + "epoch": 1.3691312061167886, + "grad_norm": 2.203125, + "learning_rate": 5.656722816608619e-06, + "loss": 1.0167, + "step": 6928 + }, + { + "epoch": 1.3693311011718847, + "grad_norm": 2.125, + "learning_rate": 5.655678129973597e-06, + "loss": 1.0068, + "step": 6929 + }, + { + "epoch": 1.3695309962269808, + "grad_norm": 2.15625, + "learning_rate": 5.654633414213424e-06, + "loss": 0.9642, + "step": 6930 + }, + { + "epoch": 1.369730891282077, + "grad_norm": 2.09375, + "learning_rate": 5.6535886693745044e-06, + "loss": 0.8641, + "step": 6931 + }, + { + "epoch": 1.369930786337173, + "grad_norm": 2.1875, + "learning_rate": 5.652543895503246e-06, + "loss": 1.0691, + "step": 6932 + }, + { + "epoch": 1.3701306813922691, + "grad_norm": 2.09375, + "learning_rate": 5.651499092646056e-06, + "loss": 0.9777, + "step": 6933 + }, + { + "epoch": 1.370330576447365, + "grad_norm": 2.203125, + "learning_rate": 5.650454260849349e-06, + "loss": 0.9519, + "step": 6934 + }, + { + "epoch": 1.370530471502461, + "grad_norm": 2.0625, + "learning_rate": 5.64940940015953e-06, + "loss": 0.8727, + "step": 6935 + }, + { + "epoch": 1.3707303665575572, + "grad_norm": 2.109375, + "learning_rate": 5.648364510623016e-06, + "loss": 0.9795, + "step": 6936 + }, + { + "epoch": 1.3709302616126533, + "grad_norm": 2.078125, + "learning_rate": 5.64731959228622e-06, + "loss": 0.8466, + "step": 6937 + }, + { + "epoch": 1.3711301566677494, + "grad_norm": 2.140625, + "learning_rate": 5.646274645195556e-06, + "loss": 1.0228, + "step": 6938 + }, + { + "epoch": 1.3713300517228455, + "grad_norm": 2.234375, + "learning_rate": 5.645229669397443e-06, + "loss": 0.9517, + "step": 6939 + }, + { + "epoch": 1.3715299467779416, + "grad_norm": 2.390625, + "learning_rate": 5.644184664938296e-06, + "loss": 0.995, + "step": 6940 + }, + { + "epoch": 1.3717298418330377, + "grad_norm": 2.109375, + "learning_rate": 5.643139631864534e-06, + "loss": 0.9714, + "step": 6941 + }, + { + "epoch": 1.3719297368881338, + "grad_norm": 2.078125, + "learning_rate": 5.642094570222579e-06, + "loss": 1.0516, + "step": 6942 + }, + { + "epoch": 1.37212963194323, + "grad_norm": 2.1875, + "learning_rate": 5.641049480058853e-06, + "loss": 1.0417, + "step": 6943 + }, + { + "epoch": 1.372329526998326, + "grad_norm": 2.3125, + "learning_rate": 5.640004361419776e-06, + "loss": 1.0357, + "step": 6944 + }, + { + "epoch": 1.3725294220534219, + "grad_norm": 2.09375, + "learning_rate": 5.638959214351775e-06, + "loss": 0.9773, + "step": 6945 + }, + { + "epoch": 1.372729317108518, + "grad_norm": 2.234375, + "learning_rate": 5.637914038901273e-06, + "loss": 1.0186, + "step": 6946 + }, + { + "epoch": 1.372929212163614, + "grad_norm": 2.203125, + "learning_rate": 5.6368688351146975e-06, + "loss": 1.0735, + "step": 6947 + }, + { + "epoch": 1.3731291072187102, + "grad_norm": 2.328125, + "learning_rate": 5.635823603038476e-06, + "loss": 1.0468, + "step": 6948 + }, + { + "epoch": 1.3733290022738063, + "grad_norm": 2.09375, + "learning_rate": 5.634778342719038e-06, + "loss": 1.0165, + "step": 6949 + }, + { + "epoch": 1.3735288973289024, + "grad_norm": 2.015625, + "learning_rate": 5.633733054202814e-06, + "loss": 0.9065, + "step": 6950 + }, + { + "epoch": 1.3737287923839985, + "grad_norm": 2.046875, + "learning_rate": 5.632687737536236e-06, + "loss": 0.8383, + "step": 6951 + }, + { + "epoch": 1.3739286874390944, + "grad_norm": 2.1875, + "learning_rate": 5.631642392765736e-06, + "loss": 0.9439, + "step": 6952 + }, + { + "epoch": 1.3741285824941905, + "grad_norm": 2.234375, + "learning_rate": 5.630597019937748e-06, + "loss": 1.0172, + "step": 6953 + }, + { + "epoch": 1.3743284775492866, + "grad_norm": 2.21875, + "learning_rate": 5.6295516190987075e-06, + "loss": 0.996, + "step": 6954 + }, + { + "epoch": 1.3745283726043827, + "grad_norm": 2.21875, + "learning_rate": 5.628506190295052e-06, + "loss": 1.0551, + "step": 6955 + }, + { + "epoch": 1.3747282676594788, + "grad_norm": 2.21875, + "learning_rate": 5.627460733573219e-06, + "loss": 0.9798, + "step": 6956 + }, + { + "epoch": 1.3749281627145749, + "grad_norm": 2.140625, + "learning_rate": 5.626415248979646e-06, + "loss": 1.0186, + "step": 6957 + }, + { + "epoch": 1.375128057769671, + "grad_norm": 2.1875, + "learning_rate": 5.625369736560776e-06, + "loss": 0.9548, + "step": 6958 + }, + { + "epoch": 1.375327952824767, + "grad_norm": 2.203125, + "learning_rate": 5.624324196363048e-06, + "loss": 0.9933, + "step": 6959 + }, + { + "epoch": 1.3755278478798632, + "grad_norm": 2.171875, + "learning_rate": 5.623278628432907e-06, + "loss": 1.0, + "step": 6960 + }, + { + "epoch": 1.3757277429349593, + "grad_norm": 2.28125, + "learning_rate": 5.622233032816795e-06, + "loss": 1.0621, + "step": 6961 + }, + { + "epoch": 1.3759276379900554, + "grad_norm": 2.28125, + "learning_rate": 5.6211874095611585e-06, + "loss": 0.9978, + "step": 6962 + }, + { + "epoch": 1.3761275330451512, + "grad_norm": 1.984375, + "learning_rate": 5.620141758712443e-06, + "loss": 0.9002, + "step": 6963 + }, + { + "epoch": 1.3763274281002473, + "grad_norm": 2.125, + "learning_rate": 5.619096080317097e-06, + "loss": 0.9789, + "step": 6964 + }, + { + "epoch": 1.3765273231553434, + "grad_norm": 2.328125, + "learning_rate": 5.618050374421569e-06, + "loss": 1.0389, + "step": 6965 + }, + { + "epoch": 1.3767272182104395, + "grad_norm": 2.125, + "learning_rate": 5.61700464107231e-06, + "loss": 0.9557, + "step": 6966 + }, + { + "epoch": 1.3769271132655356, + "grad_norm": 2.0625, + "learning_rate": 5.6159588803157705e-06, + "loss": 0.9106, + "step": 6967 + }, + { + "epoch": 1.3771270083206317, + "grad_norm": 2.09375, + "learning_rate": 5.614913092198404e-06, + "loss": 0.9686, + "step": 6968 + }, + { + "epoch": 1.3773269033757276, + "grad_norm": 2.125, + "learning_rate": 5.613867276766662e-06, + "loss": 0.9622, + "step": 6969 + }, + { + "epoch": 1.3775267984308237, + "grad_norm": 2.53125, + "learning_rate": 5.612821434067003e-06, + "loss": 0.9964, + "step": 6970 + }, + { + "epoch": 1.3777266934859198, + "grad_norm": 2.234375, + "learning_rate": 5.61177556414588e-06, + "loss": 0.9981, + "step": 6971 + }, + { + "epoch": 1.377926588541016, + "grad_norm": 2.125, + "learning_rate": 5.610729667049751e-06, + "loss": 0.9791, + "step": 6972 + }, + { + "epoch": 1.378126483596112, + "grad_norm": 1.9296875, + "learning_rate": 5.609683742825078e-06, + "loss": 0.9303, + "step": 6973 + }, + { + "epoch": 1.378326378651208, + "grad_norm": 2.25, + "learning_rate": 5.608637791518318e-06, + "loss": 1.0528, + "step": 6974 + }, + { + "epoch": 1.3785262737063042, + "grad_norm": 2.21875, + "learning_rate": 5.60759181317593e-06, + "loss": 0.934, + "step": 6975 + }, + { + "epoch": 1.3787261687614003, + "grad_norm": 2.203125, + "learning_rate": 5.60654580784438e-06, + "loss": 0.9334, + "step": 6976 + }, + { + "epoch": 1.3789260638164964, + "grad_norm": 2.421875, + "learning_rate": 5.60549977557013e-06, + "loss": 0.946, + "step": 6977 + }, + { + "epoch": 1.3791259588715925, + "grad_norm": 2.21875, + "learning_rate": 5.604453716399643e-06, + "loss": 0.9288, + "step": 6978 + }, + { + "epoch": 1.3793258539266886, + "grad_norm": 2.171875, + "learning_rate": 5.60340763037939e-06, + "loss": 0.9337, + "step": 6979 + }, + { + "epoch": 1.3795257489817845, + "grad_norm": 2.09375, + "learning_rate": 5.602361517555831e-06, + "loss": 0.8842, + "step": 6980 + }, + { + "epoch": 1.3797256440368806, + "grad_norm": 2.28125, + "learning_rate": 5.601315377975439e-06, + "loss": 1.0394, + "step": 6981 + }, + { + "epoch": 1.3799255390919767, + "grad_norm": 2.15625, + "learning_rate": 5.600269211684683e-06, + "loss": 1.0313, + "step": 6982 + }, + { + "epoch": 1.3801254341470728, + "grad_norm": 2.25, + "learning_rate": 5.599223018730031e-06, + "loss": 1.0387, + "step": 6983 + }, + { + "epoch": 1.3803253292021689, + "grad_norm": 2.125, + "learning_rate": 5.5981767991579575e-06, + "loss": 1.0096, + "step": 6984 + }, + { + "epoch": 1.380525224257265, + "grad_norm": 2.234375, + "learning_rate": 5.597130553014934e-06, + "loss": 1.0312, + "step": 6985 + }, + { + "epoch": 1.380725119312361, + "grad_norm": 2.1875, + "learning_rate": 5.596084280347435e-06, + "loss": 1.0451, + "step": 6986 + }, + { + "epoch": 1.380925014367457, + "grad_norm": 2.15625, + "learning_rate": 5.595037981201939e-06, + "loss": 0.9005, + "step": 6987 + }, + { + "epoch": 1.381124909422553, + "grad_norm": 2.125, + "learning_rate": 5.5939916556249155e-06, + "loss": 1.0645, + "step": 6988 + }, + { + "epoch": 1.3813248044776492, + "grad_norm": 2.171875, + "learning_rate": 5.592945303662848e-06, + "loss": 0.9322, + "step": 6989 + }, + { + "epoch": 1.3815246995327453, + "grad_norm": 2.140625, + "learning_rate": 5.591898925362213e-06, + "loss": 0.9669, + "step": 6990 + }, + { + "epoch": 1.3817245945878414, + "grad_norm": 2.234375, + "learning_rate": 5.590852520769491e-06, + "loss": 0.993, + "step": 6991 + }, + { + "epoch": 1.3819244896429375, + "grad_norm": 2.140625, + "learning_rate": 5.589806089931163e-06, + "loss": 1.0358, + "step": 6992 + }, + { + "epoch": 1.3821243846980336, + "grad_norm": 2.09375, + "learning_rate": 5.588759632893712e-06, + "loss": 0.9827, + "step": 6993 + }, + { + "epoch": 1.3823242797531297, + "grad_norm": 2.140625, + "learning_rate": 5.587713149703619e-06, + "loss": 0.9969, + "step": 6994 + }, + { + "epoch": 1.3825241748082258, + "grad_norm": 2.15625, + "learning_rate": 5.586666640407373e-06, + "loss": 0.9349, + "step": 6995 + }, + { + "epoch": 1.3827240698633219, + "grad_norm": 2.21875, + "learning_rate": 5.585620105051457e-06, + "loss": 1.069, + "step": 6996 + }, + { + "epoch": 1.382923964918418, + "grad_norm": 2.09375, + "learning_rate": 5.584573543682358e-06, + "loss": 0.945, + "step": 6997 + }, + { + "epoch": 1.3831238599735138, + "grad_norm": 2.140625, + "learning_rate": 5.583526956346564e-06, + "loss": 0.9837, + "step": 6998 + }, + { + "epoch": 1.38332375502861, + "grad_norm": 2.265625, + "learning_rate": 5.582480343090566e-06, + "loss": 0.8857, + "step": 6999 + }, + { + "epoch": 1.383523650083706, + "grad_norm": 2.046875, + "learning_rate": 5.581433703960853e-06, + "loss": 0.887, + "step": 7000 + }, + { + "epoch": 1.3837235451388021, + "grad_norm": 2.09375, + "learning_rate": 5.580387039003918e-06, + "loss": 0.9843, + "step": 7001 + }, + { + "epoch": 1.3839234401938982, + "grad_norm": 2.09375, + "learning_rate": 5.579340348266251e-06, + "loss": 0.9901, + "step": 7002 + }, + { + "epoch": 1.3841233352489943, + "grad_norm": 2.15625, + "learning_rate": 5.578293631794348e-06, + "loss": 0.9512, + "step": 7003 + }, + { + "epoch": 1.3843232303040904, + "grad_norm": 2.171875, + "learning_rate": 5.5772468896347045e-06, + "loss": 0.9057, + "step": 7004 + }, + { + "epoch": 1.3845231253591863, + "grad_norm": 2.15625, + "learning_rate": 5.576200121833816e-06, + "loss": 0.9077, + "step": 7005 + }, + { + "epoch": 1.3847230204142824, + "grad_norm": 2.125, + "learning_rate": 5.575153328438178e-06, + "loss": 1.0334, + "step": 7006 + }, + { + "epoch": 1.3849229154693785, + "grad_norm": 2.09375, + "learning_rate": 5.574106509494292e-06, + "loss": 0.9105, + "step": 7007 + }, + { + "epoch": 1.3851228105244746, + "grad_norm": 2.265625, + "learning_rate": 5.573059665048656e-06, + "loss": 1.0605, + "step": 7008 + }, + { + "epoch": 1.3853227055795707, + "grad_norm": 2.171875, + "learning_rate": 5.57201279514777e-06, + "loss": 1.0571, + "step": 7009 + }, + { + "epoch": 1.3855226006346668, + "grad_norm": 2.09375, + "learning_rate": 5.570965899838138e-06, + "loss": 0.9358, + "step": 7010 + }, + { + "epoch": 1.385722495689763, + "grad_norm": 2.21875, + "learning_rate": 5.56991897916626e-06, + "loss": 0.9903, + "step": 7011 + }, + { + "epoch": 1.385922390744859, + "grad_norm": 2.21875, + "learning_rate": 5.5688720331786425e-06, + "loss": 1.0151, + "step": 7012 + }, + { + "epoch": 1.3861222857999551, + "grad_norm": 2.125, + "learning_rate": 5.567825061921791e-06, + "loss": 0.9319, + "step": 7013 + }, + { + "epoch": 1.3863221808550512, + "grad_norm": 2.3125, + "learning_rate": 5.56677806544221e-06, + "loss": 0.9802, + "step": 7014 + }, + { + "epoch": 1.3865220759101473, + "grad_norm": 2.1875, + "learning_rate": 5.565731043786409e-06, + "loss": 0.9866, + "step": 7015 + }, + { + "epoch": 1.3867219709652432, + "grad_norm": 2.25, + "learning_rate": 5.564683997000893e-06, + "loss": 1.0267, + "step": 7016 + }, + { + "epoch": 1.3869218660203393, + "grad_norm": 2.25, + "learning_rate": 5.5636369251321765e-06, + "loss": 1.038, + "step": 7017 + }, + { + "epoch": 1.3871217610754354, + "grad_norm": 2.34375, + "learning_rate": 5.5625898282267675e-06, + "loss": 0.9183, + "step": 7018 + }, + { + "epoch": 1.3873216561305315, + "grad_norm": 2.171875, + "learning_rate": 5.561542706331178e-06, + "loss": 1.0668, + "step": 7019 + }, + { + "epoch": 1.3875215511856276, + "grad_norm": 2.28125, + "learning_rate": 5.560495559491922e-06, + "loss": 1.0296, + "step": 7020 + }, + { + "epoch": 1.3877214462407237, + "grad_norm": 2.078125, + "learning_rate": 5.559448387755513e-06, + "loss": 0.9755, + "step": 7021 + }, + { + "epoch": 1.3879213412958196, + "grad_norm": 2.15625, + "learning_rate": 5.558401191168465e-06, + "loss": 1.003, + "step": 7022 + }, + { + "epoch": 1.3881212363509157, + "grad_norm": 2.1875, + "learning_rate": 5.557353969777297e-06, + "loss": 0.9762, + "step": 7023 + }, + { + "epoch": 1.3883211314060118, + "grad_norm": 2.265625, + "learning_rate": 5.556306723628526e-06, + "loss": 0.9916, + "step": 7024 + }, + { + "epoch": 1.3885210264611079, + "grad_norm": 2.140625, + "learning_rate": 5.555259452768668e-06, + "loss": 0.9925, + "step": 7025 + }, + { + "epoch": 1.388720921516204, + "grad_norm": 2.125, + "learning_rate": 5.5542121572442446e-06, + "loss": 0.9753, + "step": 7026 + }, + { + "epoch": 1.3889208165713, + "grad_norm": 2.09375, + "learning_rate": 5.553164837101778e-06, + "loss": 0.971, + "step": 7027 + }, + { + "epoch": 1.3891207116263962, + "grad_norm": 2.046875, + "learning_rate": 5.552117492387786e-06, + "loss": 0.8416, + "step": 7028 + }, + { + "epoch": 1.3893206066814923, + "grad_norm": 2.125, + "learning_rate": 5.551070123148795e-06, + "loss": 0.9155, + "step": 7029 + }, + { + "epoch": 1.3895205017365884, + "grad_norm": 2.15625, + "learning_rate": 5.5500227294313295e-06, + "loss": 1.0296, + "step": 7030 + }, + { + "epoch": 1.3897203967916845, + "grad_norm": 2.203125, + "learning_rate": 5.548975311281911e-06, + "loss": 1.0358, + "step": 7031 + }, + { + "epoch": 1.3899202918467806, + "grad_norm": 2.21875, + "learning_rate": 5.547927868747069e-06, + "loss": 1.0207, + "step": 7032 + }, + { + "epoch": 1.3901201869018764, + "grad_norm": 2.046875, + "learning_rate": 5.546880401873329e-06, + "loss": 0.8628, + "step": 7033 + }, + { + "epoch": 1.3903200819569725, + "grad_norm": 2.125, + "learning_rate": 5.54583291070722e-06, + "loss": 0.9888, + "step": 7034 + }, + { + "epoch": 1.3905199770120686, + "grad_norm": 2.0, + "learning_rate": 5.544785395295273e-06, + "loss": 0.9659, + "step": 7035 + }, + { + "epoch": 1.3907198720671647, + "grad_norm": 2.078125, + "learning_rate": 5.543737855684015e-06, + "loss": 0.9611, + "step": 7036 + }, + { + "epoch": 1.3909197671222608, + "grad_norm": 2.25, + "learning_rate": 5.54269029191998e-06, + "loss": 0.8994, + "step": 7037 + }, + { + "epoch": 1.391119662177357, + "grad_norm": 2.109375, + "learning_rate": 5.541642704049701e-06, + "loss": 0.9457, + "step": 7038 + }, + { + "epoch": 1.391319557232453, + "grad_norm": 2.078125, + "learning_rate": 5.540595092119709e-06, + "loss": 0.9801, + "step": 7039 + }, + { + "epoch": 1.391519452287549, + "grad_norm": 2.125, + "learning_rate": 5.539547456176543e-06, + "loss": 0.965, + "step": 7040 + }, + { + "epoch": 1.391719347342645, + "grad_norm": 2.171875, + "learning_rate": 5.538499796266735e-06, + "loss": 0.9907, + "step": 7041 + }, + { + "epoch": 1.3919192423977411, + "grad_norm": 2.078125, + "learning_rate": 5.537452112436824e-06, + "loss": 0.9189, + "step": 7042 + }, + { + "epoch": 1.3921191374528372, + "grad_norm": 2.078125, + "learning_rate": 5.536404404733348e-06, + "loss": 0.9072, + "step": 7043 + }, + { + "epoch": 1.3923190325079333, + "grad_norm": 2.140625, + "learning_rate": 5.535356673202845e-06, + "loss": 0.9507, + "step": 7044 + }, + { + "epoch": 1.3925189275630294, + "grad_norm": 2.171875, + "learning_rate": 5.534308917891856e-06, + "loss": 0.9909, + "step": 7045 + }, + { + "epoch": 1.3927188226181255, + "grad_norm": 2.1875, + "learning_rate": 5.533261138846922e-06, + "loss": 0.9652, + "step": 7046 + }, + { + "epoch": 1.3929187176732216, + "grad_norm": 2.21875, + "learning_rate": 5.532213336114586e-06, + "loss": 0.9624, + "step": 7047 + }, + { + "epoch": 1.3931186127283177, + "grad_norm": 2.171875, + "learning_rate": 5.531165509741388e-06, + "loss": 0.9854, + "step": 7048 + }, + { + "epoch": 1.3933185077834138, + "grad_norm": 2.234375, + "learning_rate": 5.5301176597738785e-06, + "loss": 1.0033, + "step": 7049 + }, + { + "epoch": 1.39351840283851, + "grad_norm": 2.0625, + "learning_rate": 5.529069786258596e-06, + "loss": 0.9078, + "step": 7050 + }, + { + "epoch": 1.3937182978936058, + "grad_norm": 2.125, + "learning_rate": 5.5280218892420925e-06, + "loss": 0.9249, + "step": 7051 + }, + { + "epoch": 1.393918192948702, + "grad_norm": 2.15625, + "learning_rate": 5.526973968770911e-06, + "loss": 1.0552, + "step": 7052 + }, + { + "epoch": 1.394118088003798, + "grad_norm": 2.078125, + "learning_rate": 5.525926024891603e-06, + "loss": 1.0302, + "step": 7053 + }, + { + "epoch": 1.394317983058894, + "grad_norm": 2.25, + "learning_rate": 5.524878057650717e-06, + "loss": 1.002, + "step": 7054 + }, + { + "epoch": 1.3945178781139902, + "grad_norm": 2.03125, + "learning_rate": 5.5238300670948044e-06, + "loss": 0.9046, + "step": 7055 + }, + { + "epoch": 1.3947177731690863, + "grad_norm": 2.125, + "learning_rate": 5.522782053270414e-06, + "loss": 1.0101, + "step": 7056 + }, + { + "epoch": 1.3949176682241822, + "grad_norm": 2.046875, + "learning_rate": 5.521734016224103e-06, + "loss": 1.0308, + "step": 7057 + }, + { + "epoch": 1.3951175632792783, + "grad_norm": 2.3125, + "learning_rate": 5.5206859560024215e-06, + "loss": 1.0799, + "step": 7058 + }, + { + "epoch": 1.3953174583343744, + "grad_norm": 2.09375, + "learning_rate": 5.519637872651926e-06, + "loss": 0.932, + "step": 7059 + }, + { + "epoch": 1.3955173533894705, + "grad_norm": 2.28125, + "learning_rate": 5.518589766219173e-06, + "loss": 1.0846, + "step": 7060 + }, + { + "epoch": 1.3957172484445666, + "grad_norm": 2.109375, + "learning_rate": 5.517541636750715e-06, + "loss": 0.9484, + "step": 7061 + }, + { + "epoch": 1.3959171434996627, + "grad_norm": 2.140625, + "learning_rate": 5.516493484293114e-06, + "loss": 1.0155, + "step": 7062 + }, + { + "epoch": 1.3961170385547588, + "grad_norm": 2.140625, + "learning_rate": 5.515445308892928e-06, + "loss": 0.9161, + "step": 7063 + }, + { + "epoch": 1.3963169336098549, + "grad_norm": 2.203125, + "learning_rate": 5.514397110596716e-06, + "loss": 0.9939, + "step": 7064 + }, + { + "epoch": 1.396516828664951, + "grad_norm": 2.125, + "learning_rate": 5.513348889451039e-06, + "loss": 0.903, + "step": 7065 + }, + { + "epoch": 1.396716723720047, + "grad_norm": 2.203125, + "learning_rate": 5.512300645502462e-06, + "loss": 0.9597, + "step": 7066 + }, + { + "epoch": 1.3969166187751432, + "grad_norm": 2.125, + "learning_rate": 5.511252378797542e-06, + "loss": 0.9626, + "step": 7067 + }, + { + "epoch": 1.397116513830239, + "grad_norm": 2.1875, + "learning_rate": 5.510204089382847e-06, + "loss": 1.0461, + "step": 7068 + }, + { + "epoch": 1.3973164088853351, + "grad_norm": 2.078125, + "learning_rate": 5.509155777304941e-06, + "loss": 0.9781, + "step": 7069 + }, + { + "epoch": 1.3975163039404312, + "grad_norm": 2.40625, + "learning_rate": 5.508107442610388e-06, + "loss": 1.0899, + "step": 7070 + }, + { + "epoch": 1.3977161989955273, + "grad_norm": 2.09375, + "learning_rate": 5.507059085345759e-06, + "loss": 0.9567, + "step": 7071 + }, + { + "epoch": 1.3979160940506234, + "grad_norm": 2.109375, + "learning_rate": 5.506010705557618e-06, + "loss": 0.9734, + "step": 7072 + }, + { + "epoch": 1.3981159891057195, + "grad_norm": 2.125, + "learning_rate": 5.5049623032925355e-06, + "loss": 0.8962, + "step": 7073 + }, + { + "epoch": 1.3983158841608156, + "grad_norm": 2.21875, + "learning_rate": 5.503913878597082e-06, + "loss": 0.9915, + "step": 7074 + }, + { + "epoch": 1.3985157792159115, + "grad_norm": 2.203125, + "learning_rate": 5.502865431517828e-06, + "loss": 0.966, + "step": 7075 + }, + { + "epoch": 1.3987156742710076, + "grad_norm": 2.265625, + "learning_rate": 5.501816962101345e-06, + "loss": 0.973, + "step": 7076 + }, + { + "epoch": 1.3989155693261037, + "grad_norm": 2.25, + "learning_rate": 5.500768470394207e-06, + "loss": 1.0737, + "step": 7077 + }, + { + "epoch": 1.3991154643811998, + "grad_norm": 2.078125, + "learning_rate": 5.499719956442985e-06, + "loss": 0.867, + "step": 7078 + }, + { + "epoch": 1.399315359436296, + "grad_norm": 2.125, + "learning_rate": 5.498671420294257e-06, + "loss": 0.9767, + "step": 7079 + }, + { + "epoch": 1.399515254491392, + "grad_norm": 2.21875, + "learning_rate": 5.497622861994598e-06, + "loss": 1.0266, + "step": 7080 + }, + { + "epoch": 1.3997151495464881, + "grad_norm": 2.0625, + "learning_rate": 5.496574281590585e-06, + "loss": 0.9242, + "step": 7081 + }, + { + "epoch": 1.3999150446015842, + "grad_norm": 2.015625, + "learning_rate": 5.4955256791287946e-06, + "loss": 0.9276, + "step": 7082 + }, + { + "epoch": 1.4001149396566803, + "grad_norm": 2.296875, + "learning_rate": 5.494477054655808e-06, + "loss": 0.9677, + "step": 7083 + }, + { + "epoch": 1.4003148347117764, + "grad_norm": 2.1875, + "learning_rate": 5.493428408218202e-06, + "loss": 0.9379, + "step": 7084 + }, + { + "epoch": 1.4005147297668725, + "grad_norm": 2.203125, + "learning_rate": 5.492379739862559e-06, + "loss": 0.9599, + "step": 7085 + }, + { + "epoch": 1.4007146248219684, + "grad_norm": 2.296875, + "learning_rate": 5.491331049635461e-06, + "loss": 1.0505, + "step": 7086 + }, + { + "epoch": 1.4009145198770645, + "grad_norm": 2.109375, + "learning_rate": 5.490282337583489e-06, + "loss": 0.9282, + "step": 7087 + }, + { + "epoch": 1.4011144149321606, + "grad_norm": 2.109375, + "learning_rate": 5.489233603753228e-06, + "loss": 0.8507, + "step": 7088 + }, + { + "epoch": 1.4013143099872567, + "grad_norm": 2.15625, + "learning_rate": 5.488184848191265e-06, + "loss": 0.9696, + "step": 7089 + }, + { + "epoch": 1.4015142050423528, + "grad_norm": 2.296875, + "learning_rate": 5.48713607094418e-06, + "loss": 0.9663, + "step": 7090 + }, + { + "epoch": 1.401714100097449, + "grad_norm": 2.078125, + "learning_rate": 5.486087272058566e-06, + "loss": 0.987, + "step": 7091 + }, + { + "epoch": 1.4019139951525448, + "grad_norm": 2.171875, + "learning_rate": 5.485038451581004e-06, + "loss": 0.9364, + "step": 7092 + }, + { + "epoch": 1.4021138902076409, + "grad_norm": 2.03125, + "learning_rate": 5.4839896095580865e-06, + "loss": 0.9397, + "step": 7093 + }, + { + "epoch": 1.402313785262737, + "grad_norm": 2.171875, + "learning_rate": 5.4829407460364045e-06, + "loss": 0.9994, + "step": 7094 + }, + { + "epoch": 1.402513680317833, + "grad_norm": 2.046875, + "learning_rate": 5.481891861062545e-06, + "loss": 0.9481, + "step": 7095 + }, + { + "epoch": 1.4027135753729292, + "grad_norm": 2.25, + "learning_rate": 5.480842954683099e-06, + "loss": 0.9541, + "step": 7096 + }, + { + "epoch": 1.4029134704280253, + "grad_norm": 2.171875, + "learning_rate": 5.479794026944663e-06, + "loss": 1.041, + "step": 7097 + }, + { + "epoch": 1.4031133654831214, + "grad_norm": 2.25, + "learning_rate": 5.478745077893827e-06, + "loss": 1.0423, + "step": 7098 + }, + { + "epoch": 1.4033132605382175, + "grad_norm": 2.15625, + "learning_rate": 5.477696107577184e-06, + "loss": 1.0669, + "step": 7099 + }, + { + "epoch": 1.4035131555933136, + "grad_norm": 2.09375, + "learning_rate": 5.476647116041332e-06, + "loss": 0.9499, + "step": 7100 + }, + { + "epoch": 1.4037130506484097, + "grad_norm": 2.171875, + "learning_rate": 5.4755981033328655e-06, + "loss": 1.0964, + "step": 7101 + }, + { + "epoch": 1.4039129457035058, + "grad_norm": 2.109375, + "learning_rate": 5.474549069498381e-06, + "loss": 0.8984, + "step": 7102 + }, + { + "epoch": 1.4041128407586017, + "grad_norm": 2.15625, + "learning_rate": 5.473500014584481e-06, + "loss": 0.9057, + "step": 7103 + }, + { + "epoch": 1.4043127358136978, + "grad_norm": 2.140625, + "learning_rate": 5.472450938637758e-06, + "loss": 0.9872, + "step": 7104 + }, + { + "epoch": 1.4045126308687939, + "grad_norm": 2.125, + "learning_rate": 5.471401841704816e-06, + "loss": 0.9796, + "step": 7105 + }, + { + "epoch": 1.40471252592389, + "grad_norm": 2.171875, + "learning_rate": 5.470352723832254e-06, + "loss": 1.0426, + "step": 7106 + }, + { + "epoch": 1.404912420978986, + "grad_norm": 2.265625, + "learning_rate": 5.4693035850666734e-06, + "loss": 1.0271, + "step": 7107 + }, + { + "epoch": 1.4051123160340822, + "grad_norm": 2.171875, + "learning_rate": 5.46825442545468e-06, + "loss": 0.9901, + "step": 7108 + }, + { + "epoch": 1.4053122110891783, + "grad_norm": 2.234375, + "learning_rate": 5.467205245042873e-06, + "loss": 0.9917, + "step": 7109 + }, + { + "epoch": 1.4055121061442741, + "grad_norm": 2.09375, + "learning_rate": 5.466156043877859e-06, + "loss": 1.0446, + "step": 7110 + }, + { + "epoch": 1.4057120011993702, + "grad_norm": 2.171875, + "learning_rate": 5.465106822006244e-06, + "loss": 0.9722, + "step": 7111 + }, + { + "epoch": 1.4059118962544663, + "grad_norm": 2.125, + "learning_rate": 5.4640575794746335e-06, + "loss": 1.0074, + "step": 7112 + }, + { + "epoch": 1.4061117913095624, + "grad_norm": 2.296875, + "learning_rate": 5.463008316329636e-06, + "loss": 1.0497, + "step": 7113 + }, + { + "epoch": 1.4063116863646585, + "grad_norm": 2.140625, + "learning_rate": 5.461959032617857e-06, + "loss": 1.0138, + "step": 7114 + }, + { + "epoch": 1.4065115814197546, + "grad_norm": 2.40625, + "learning_rate": 5.4609097283859084e-06, + "loss": 0.9636, + "step": 7115 + }, + { + "epoch": 1.4067114764748507, + "grad_norm": 2.15625, + "learning_rate": 5.459860403680397e-06, + "loss": 0.9554, + "step": 7116 + }, + { + "epoch": 1.4069113715299468, + "grad_norm": 2.171875, + "learning_rate": 5.458811058547937e-06, + "loss": 1.0052, + "step": 7117 + }, + { + "epoch": 1.407111266585043, + "grad_norm": 2.1875, + "learning_rate": 5.457761693035139e-06, + "loss": 1.023, + "step": 7118 + }, + { + "epoch": 1.407311161640139, + "grad_norm": 2.15625, + "learning_rate": 5.456712307188614e-06, + "loss": 1.0921, + "step": 7119 + }, + { + "epoch": 1.4075110566952351, + "grad_norm": 2.203125, + "learning_rate": 5.4556629010549785e-06, + "loss": 0.9256, + "step": 7120 + }, + { + "epoch": 1.407710951750331, + "grad_norm": 2.046875, + "learning_rate": 5.454613474680844e-06, + "loss": 0.854, + "step": 7121 + }, + { + "epoch": 1.407910846805427, + "grad_norm": 2.125, + "learning_rate": 5.45356402811283e-06, + "loss": 0.9923, + "step": 7122 + }, + { + "epoch": 1.4081107418605232, + "grad_norm": 2.171875, + "learning_rate": 5.452514561397549e-06, + "loss": 0.9985, + "step": 7123 + }, + { + "epoch": 1.4083106369156193, + "grad_norm": 2.15625, + "learning_rate": 5.451465074581619e-06, + "loss": 1.0087, + "step": 7124 + }, + { + "epoch": 1.4085105319707154, + "grad_norm": 2.125, + "learning_rate": 5.450415567711659e-06, + "loss": 0.9521, + "step": 7125 + }, + { + "epoch": 1.4087104270258115, + "grad_norm": 2.15625, + "learning_rate": 5.449366040834287e-06, + "loss": 0.9508, + "step": 7126 + }, + { + "epoch": 1.4089103220809076, + "grad_norm": 2.15625, + "learning_rate": 5.448316493996124e-06, + "loss": 1.0183, + "step": 7127 + }, + { + "epoch": 1.4091102171360035, + "grad_norm": 2.234375, + "learning_rate": 5.447266927243792e-06, + "loss": 1.042, + "step": 7128 + }, + { + "epoch": 1.4093101121910996, + "grad_norm": 2.15625, + "learning_rate": 5.446217340623909e-06, + "loss": 1.0149, + "step": 7129 + }, + { + "epoch": 1.4095100072461957, + "grad_norm": 2.203125, + "learning_rate": 5.445167734183099e-06, + "loss": 0.9286, + "step": 7130 + }, + { + "epoch": 1.4097099023012918, + "grad_norm": 2.40625, + "learning_rate": 5.444118107967987e-06, + "loss": 1.0246, + "step": 7131 + }, + { + "epoch": 1.4099097973563879, + "grad_norm": 2.609375, + "learning_rate": 5.443068462025195e-06, + "loss": 1.0767, + "step": 7132 + }, + { + "epoch": 1.410109692411484, + "grad_norm": 2.1875, + "learning_rate": 5.442018796401349e-06, + "loss": 0.9486, + "step": 7133 + }, + { + "epoch": 1.41030958746658, + "grad_norm": 2.1875, + "learning_rate": 5.440969111143076e-06, + "loss": 0.9949, + "step": 7134 + }, + { + "epoch": 1.4105094825216762, + "grad_norm": 2.109375, + "learning_rate": 5.439919406297002e-06, + "loss": 0.8934, + "step": 7135 + }, + { + "epoch": 1.4107093775767723, + "grad_norm": 2.140625, + "learning_rate": 5.438869681909757e-06, + "loss": 0.9253, + "step": 7136 + }, + { + "epoch": 1.4109092726318684, + "grad_norm": 2.109375, + "learning_rate": 5.437819938027964e-06, + "loss": 0.8921, + "step": 7137 + }, + { + "epoch": 1.4111091676869643, + "grad_norm": 2.125, + "learning_rate": 5.436770174698257e-06, + "loss": 1.0573, + "step": 7138 + }, + { + "epoch": 1.4113090627420604, + "grad_norm": 1.984375, + "learning_rate": 5.435720391967267e-06, + "loss": 0.9288, + "step": 7139 + }, + { + "epoch": 1.4115089577971565, + "grad_norm": 2.21875, + "learning_rate": 5.434670589881623e-06, + "loss": 0.9548, + "step": 7140 + }, + { + "epoch": 1.4117088528522526, + "grad_norm": 2.203125, + "learning_rate": 5.433620768487957e-06, + "loss": 0.94, + "step": 7141 + }, + { + "epoch": 1.4119087479073487, + "grad_norm": 2.109375, + "learning_rate": 5.432570927832906e-06, + "loss": 0.9368, + "step": 7142 + }, + { + "epoch": 1.4121086429624448, + "grad_norm": 2.1875, + "learning_rate": 5.431521067963097e-06, + "loss": 0.9632, + "step": 7143 + }, + { + "epoch": 1.4123085380175409, + "grad_norm": 2.125, + "learning_rate": 5.430471188925169e-06, + "loss": 0.9206, + "step": 7144 + }, + { + "epoch": 1.4125084330726367, + "grad_norm": 2.25, + "learning_rate": 5.4294212907657584e-06, + "loss": 0.9516, + "step": 7145 + }, + { + "epoch": 1.4127083281277328, + "grad_norm": 2.1875, + "learning_rate": 5.428371373531498e-06, + "loss": 0.9555, + "step": 7146 + }, + { + "epoch": 1.412908223182829, + "grad_norm": 2.15625, + "learning_rate": 5.427321437269027e-06, + "loss": 0.9503, + "step": 7147 + }, + { + "epoch": 1.413108118237925, + "grad_norm": 2.15625, + "learning_rate": 5.4262714820249855e-06, + "loss": 0.9878, + "step": 7148 + }, + { + "epoch": 1.4133080132930211, + "grad_norm": 2.28125, + "learning_rate": 5.425221507846008e-06, + "loss": 1.0085, + "step": 7149 + }, + { + "epoch": 1.4135079083481172, + "grad_norm": 2.34375, + "learning_rate": 5.424171514778738e-06, + "loss": 1.0855, + "step": 7150 + }, + { + "epoch": 1.4137078034032133, + "grad_norm": 2.1875, + "learning_rate": 5.423121502869814e-06, + "loss": 1.0379, + "step": 7151 + }, + { + "epoch": 1.4139076984583094, + "grad_norm": 2.359375, + "learning_rate": 5.422071472165877e-06, + "loss": 0.9778, + "step": 7152 + }, + { + "epoch": 1.4141075935134055, + "grad_norm": 2.171875, + "learning_rate": 5.421021422713573e-06, + "loss": 0.9182, + "step": 7153 + }, + { + "epoch": 1.4143074885685016, + "grad_norm": 2.15625, + "learning_rate": 5.41997135455954e-06, + "loss": 1.0276, + "step": 7154 + }, + { + "epoch": 1.4145073836235977, + "grad_norm": 2.15625, + "learning_rate": 5.418921267750425e-06, + "loss": 0.9951, + "step": 7155 + }, + { + "epoch": 1.4147072786786936, + "grad_norm": 2.234375, + "learning_rate": 5.417871162332872e-06, + "loss": 0.9143, + "step": 7156 + }, + { + "epoch": 1.4149071737337897, + "grad_norm": 2.125, + "learning_rate": 5.416821038353526e-06, + "loss": 0.9185, + "step": 7157 + }, + { + "epoch": 1.4151070687888858, + "grad_norm": 2.203125, + "learning_rate": 5.415770895859034e-06, + "loss": 0.9611, + "step": 7158 + }, + { + "epoch": 1.415306963843982, + "grad_norm": 2.328125, + "learning_rate": 5.4147207348960466e-06, + "loss": 0.9433, + "step": 7159 + }, + { + "epoch": 1.415506858899078, + "grad_norm": 2.296875, + "learning_rate": 5.413670555511204e-06, + "loss": 0.8962, + "step": 7160 + }, + { + "epoch": 1.4157067539541741, + "grad_norm": 2.15625, + "learning_rate": 5.412620357751161e-06, + "loss": 0.9701, + "step": 7161 + }, + { + "epoch": 1.4159066490092702, + "grad_norm": 2.15625, + "learning_rate": 5.411570141662567e-06, + "loss": 0.9616, + "step": 7162 + }, + { + "epoch": 1.416106544064366, + "grad_norm": 2.34375, + "learning_rate": 5.41051990729207e-06, + "loss": 0.9964, + "step": 7163 + }, + { + "epoch": 1.4163064391194622, + "grad_norm": 2.15625, + "learning_rate": 5.409469654686323e-06, + "loss": 0.9548, + "step": 7164 + }, + { + "epoch": 1.4165063341745583, + "grad_norm": 2.265625, + "learning_rate": 5.408419383891978e-06, + "loss": 1.064, + "step": 7165 + }, + { + "epoch": 1.4167062292296544, + "grad_norm": 2.109375, + "learning_rate": 5.407369094955685e-06, + "loss": 0.9294, + "step": 7166 + }, + { + "epoch": 1.4169061242847505, + "grad_norm": 2.125, + "learning_rate": 5.406318787924104e-06, + "loss": 0.9913, + "step": 7167 + }, + { + "epoch": 1.4171060193398466, + "grad_norm": 2.1875, + "learning_rate": 5.4052684628438836e-06, + "loss": 0.9951, + "step": 7168 + }, + { + "epoch": 1.4173059143949427, + "grad_norm": 2.1875, + "learning_rate": 5.404218119761682e-06, + "loss": 1.0213, + "step": 7169 + }, + { + "epoch": 1.4175058094500388, + "grad_norm": 2.15625, + "learning_rate": 5.403167758724155e-06, + "loss": 0.9688, + "step": 7170 + }, + { + "epoch": 1.417705704505135, + "grad_norm": 2.296875, + "learning_rate": 5.402117379777958e-06, + "loss": 0.982, + "step": 7171 + }, + { + "epoch": 1.417905599560231, + "grad_norm": 2.3125, + "learning_rate": 5.401066982969751e-06, + "loss": 0.9972, + "step": 7172 + }, + { + "epoch": 1.418105494615327, + "grad_norm": 2.390625, + "learning_rate": 5.400016568346192e-06, + "loss": 0.9878, + "step": 7173 + }, + { + "epoch": 1.418305389670423, + "grad_norm": 2.21875, + "learning_rate": 5.398966135953938e-06, + "loss": 0.9699, + "step": 7174 + }, + { + "epoch": 1.418505284725519, + "grad_norm": 2.109375, + "learning_rate": 5.397915685839652e-06, + "loss": 0.9125, + "step": 7175 + }, + { + "epoch": 1.4187051797806152, + "grad_norm": 2.140625, + "learning_rate": 5.396865218049995e-06, + "loss": 0.9142, + "step": 7176 + }, + { + "epoch": 1.4189050748357113, + "grad_norm": 2.296875, + "learning_rate": 5.395814732631625e-06, + "loss": 0.9923, + "step": 7177 + }, + { + "epoch": 1.4191049698908074, + "grad_norm": 2.265625, + "learning_rate": 5.394764229631207e-06, + "loss": 0.976, + "step": 7178 + }, + { + "epoch": 1.4193048649459035, + "grad_norm": 2.21875, + "learning_rate": 5.393713709095406e-06, + "loss": 0.894, + "step": 7179 + }, + { + "epoch": 1.4195047600009993, + "grad_norm": 2.140625, + "learning_rate": 5.3926631710708835e-06, + "loss": 0.956, + "step": 7180 + }, + { + "epoch": 1.4197046550560954, + "grad_norm": 2.015625, + "learning_rate": 5.3916126156043045e-06, + "loss": 0.8711, + "step": 7181 + }, + { + "epoch": 1.4199045501111915, + "grad_norm": 2.15625, + "learning_rate": 5.3905620427423344e-06, + "loss": 0.9661, + "step": 7182 + }, + { + "epoch": 1.4201044451662876, + "grad_norm": 2.1875, + "learning_rate": 5.38951145253164e-06, + "loss": 0.963, + "step": 7183 + }, + { + "epoch": 1.4203043402213837, + "grad_norm": 2.1875, + "learning_rate": 5.388460845018889e-06, + "loss": 1.0755, + "step": 7184 + }, + { + "epoch": 1.4205042352764798, + "grad_norm": 2.21875, + "learning_rate": 5.3874102202507485e-06, + "loss": 0.934, + "step": 7185 + }, + { + "epoch": 1.420704130331576, + "grad_norm": 2.046875, + "learning_rate": 5.386359578273888e-06, + "loss": 0.9272, + "step": 7186 + }, + { + "epoch": 1.420904025386672, + "grad_norm": 2.09375, + "learning_rate": 5.385308919134976e-06, + "loss": 0.9468, + "step": 7187 + }, + { + "epoch": 1.4211039204417681, + "grad_norm": 2.203125, + "learning_rate": 5.384258242880682e-06, + "loss": 1.0303, + "step": 7188 + }, + { + "epoch": 1.4213038154968642, + "grad_norm": 2.359375, + "learning_rate": 5.3832075495576794e-06, + "loss": 0.9291, + "step": 7189 + }, + { + "epoch": 1.4215037105519603, + "grad_norm": 2.25, + "learning_rate": 5.382156839212639e-06, + "loss": 0.9736, + "step": 7190 + }, + { + "epoch": 1.4217036056070562, + "grad_norm": 2.234375, + "learning_rate": 5.381106111892231e-06, + "loss": 0.9767, + "step": 7191 + }, + { + "epoch": 1.4219035006621523, + "grad_norm": 2.25, + "learning_rate": 5.38005536764313e-06, + "loss": 1.0116, + "step": 7192 + }, + { + "epoch": 1.4221033957172484, + "grad_norm": 2.21875, + "learning_rate": 5.3790046065120116e-06, + "loss": 0.9492, + "step": 7193 + }, + { + "epoch": 1.4223032907723445, + "grad_norm": 2.109375, + "learning_rate": 5.377953828545548e-06, + "loss": 0.9302, + "step": 7194 + }, + { + "epoch": 1.4225031858274406, + "grad_norm": 2.140625, + "learning_rate": 5.376903033790416e-06, + "loss": 1.0104, + "step": 7195 + }, + { + "epoch": 1.4227030808825367, + "grad_norm": 2.15625, + "learning_rate": 5.375852222293292e-06, + "loss": 0.9255, + "step": 7196 + }, + { + "epoch": 1.4229029759376328, + "grad_norm": 2.125, + "learning_rate": 5.374801394100851e-06, + "loss": 0.9987, + "step": 7197 + }, + { + "epoch": 1.4231028709927287, + "grad_norm": 2.140625, + "learning_rate": 5.373750549259773e-06, + "loss": 0.9395, + "step": 7198 + }, + { + "epoch": 1.4233027660478248, + "grad_norm": 2.359375, + "learning_rate": 5.372699687816736e-06, + "loss": 0.8662, + "step": 7199 + }, + { + "epoch": 1.423502661102921, + "grad_norm": 2.359375, + "learning_rate": 5.3716488098184175e-06, + "loss": 0.9063, + "step": 7200 + }, + { + "epoch": 1.423702556158017, + "grad_norm": 2.203125, + "learning_rate": 5.3705979153115e-06, + "loss": 0.9422, + "step": 7201 + }, + { + "epoch": 1.423902451213113, + "grad_norm": 2.25, + "learning_rate": 5.369547004342661e-06, + "loss": 0.9789, + "step": 7202 + }, + { + "epoch": 1.4241023462682092, + "grad_norm": 2.078125, + "learning_rate": 5.368496076958584e-06, + "loss": 0.9639, + "step": 7203 + }, + { + "epoch": 1.4243022413233053, + "grad_norm": 2.0625, + "learning_rate": 5.367445133205952e-06, + "loss": 0.9358, + "step": 7204 + }, + { + "epoch": 1.4245021363784014, + "grad_norm": 2.203125, + "learning_rate": 5.366394173131445e-06, + "loss": 0.9495, + "step": 7205 + }, + { + "epoch": 1.4247020314334975, + "grad_norm": 2.046875, + "learning_rate": 5.365343196781749e-06, + "loss": 0.813, + "step": 7206 + }, + { + "epoch": 1.4249019264885936, + "grad_norm": 2.09375, + "learning_rate": 5.364292204203548e-06, + "loss": 1.0009, + "step": 7207 + }, + { + "epoch": 1.4251018215436897, + "grad_norm": 2.046875, + "learning_rate": 5.363241195443524e-06, + "loss": 1.0062, + "step": 7208 + }, + { + "epoch": 1.4253017165987856, + "grad_norm": 2.046875, + "learning_rate": 5.362190170548365e-06, + "loss": 0.9212, + "step": 7209 + }, + { + "epoch": 1.4255016116538817, + "grad_norm": 2.0625, + "learning_rate": 5.3611391295647585e-06, + "loss": 0.8932, + "step": 7210 + }, + { + "epoch": 1.4257015067089778, + "grad_norm": 2.234375, + "learning_rate": 5.36008807253939e-06, + "loss": 0.9705, + "step": 7211 + }, + { + "epoch": 1.4259014017640739, + "grad_norm": 2.09375, + "learning_rate": 5.359036999518948e-06, + "loss": 0.9085, + "step": 7212 + }, + { + "epoch": 1.42610129681917, + "grad_norm": 2.28125, + "learning_rate": 5.35798591055012e-06, + "loss": 0.9985, + "step": 7213 + }, + { + "epoch": 1.426301191874266, + "grad_norm": 2.28125, + "learning_rate": 5.356934805679597e-06, + "loss": 0.8962, + "step": 7214 + }, + { + "epoch": 1.426501086929362, + "grad_norm": 2.28125, + "learning_rate": 5.355883684954068e-06, + "loss": 0.9946, + "step": 7215 + }, + { + "epoch": 1.426700981984458, + "grad_norm": 2.234375, + "learning_rate": 5.354832548420222e-06, + "loss": 1.0097, + "step": 7216 + }, + { + "epoch": 1.4269008770395541, + "grad_norm": 2.15625, + "learning_rate": 5.3537813961247546e-06, + "loss": 0.9663, + "step": 7217 + }, + { + "epoch": 1.4271007720946502, + "grad_norm": 2.140625, + "learning_rate": 5.352730228114354e-06, + "loss": 0.9505, + "step": 7218 + }, + { + "epoch": 1.4273006671497463, + "grad_norm": 2.25, + "learning_rate": 5.351679044435714e-06, + "loss": 1.0214, + "step": 7219 + }, + { + "epoch": 1.4275005622048424, + "grad_norm": 2.25, + "learning_rate": 5.35062784513553e-06, + "loss": 1.0575, + "step": 7220 + }, + { + "epoch": 1.4277004572599385, + "grad_norm": 2.25, + "learning_rate": 5.3495766302604945e-06, + "loss": 0.9601, + "step": 7221 + }, + { + "epoch": 1.4279003523150346, + "grad_norm": 2.015625, + "learning_rate": 5.348525399857301e-06, + "loss": 0.9333, + "step": 7222 + }, + { + "epoch": 1.4281002473701307, + "grad_norm": 2.109375, + "learning_rate": 5.347474153972647e-06, + "loss": 1.025, + "step": 7223 + }, + { + "epoch": 1.4283001424252268, + "grad_norm": 2.15625, + "learning_rate": 5.346422892653229e-06, + "loss": 0.9268, + "step": 7224 + }, + { + "epoch": 1.428500037480323, + "grad_norm": 2.25, + "learning_rate": 5.345371615945742e-06, + "loss": 0.971, + "step": 7225 + }, + { + "epoch": 1.4286999325354188, + "grad_norm": 2.265625, + "learning_rate": 5.344320323896886e-06, + "loss": 0.9422, + "step": 7226 + }, + { + "epoch": 1.428899827590515, + "grad_norm": 2.203125, + "learning_rate": 5.343269016553355e-06, + "loss": 0.9879, + "step": 7227 + }, + { + "epoch": 1.429099722645611, + "grad_norm": 2.140625, + "learning_rate": 5.342217693961853e-06, + "loss": 0.9466, + "step": 7228 + }, + { + "epoch": 1.4292996177007071, + "grad_norm": 2.125, + "learning_rate": 5.341166356169079e-06, + "loss": 0.9764, + "step": 7229 + }, + { + "epoch": 1.4294995127558032, + "grad_norm": 2.15625, + "learning_rate": 5.34011500322173e-06, + "loss": 0.9754, + "step": 7230 + }, + { + "epoch": 1.4296994078108993, + "grad_norm": 2.1875, + "learning_rate": 5.339063635166508e-06, + "loss": 1.0708, + "step": 7231 + }, + { + "epoch": 1.4298993028659954, + "grad_norm": 2.140625, + "learning_rate": 5.338012252050118e-06, + "loss": 1.0137, + "step": 7232 + }, + { + "epoch": 1.4300991979210913, + "grad_norm": 2.03125, + "learning_rate": 5.336960853919259e-06, + "loss": 0.9512, + "step": 7233 + }, + { + "epoch": 1.4302990929761874, + "grad_norm": 1.9921875, + "learning_rate": 5.335909440820635e-06, + "loss": 0.861, + "step": 7234 + }, + { + "epoch": 1.4304989880312835, + "grad_norm": 2.296875, + "learning_rate": 5.334858012800948e-06, + "loss": 1.0068, + "step": 7235 + }, + { + "epoch": 1.4306988830863796, + "grad_norm": 2.15625, + "learning_rate": 5.333806569906904e-06, + "loss": 0.959, + "step": 7236 + }, + { + "epoch": 1.4308987781414757, + "grad_norm": 2.046875, + "learning_rate": 5.3327551121852095e-06, + "loss": 0.8968, + "step": 7237 + }, + { + "epoch": 1.4310986731965718, + "grad_norm": 2.03125, + "learning_rate": 5.331703639682568e-06, + "loss": 0.949, + "step": 7238 + }, + { + "epoch": 1.431298568251668, + "grad_norm": 2.125, + "learning_rate": 5.330652152445686e-06, + "loss": 0.9953, + "step": 7239 + }, + { + "epoch": 1.431498463306764, + "grad_norm": 2.09375, + "learning_rate": 5.329600650521272e-06, + "loss": 0.9063, + "step": 7240 + }, + { + "epoch": 1.43169835836186, + "grad_norm": 2.21875, + "learning_rate": 5.32854913395603e-06, + "loss": 0.9747, + "step": 7241 + }, + { + "epoch": 1.4318982534169562, + "grad_norm": 2.171875, + "learning_rate": 5.327497602796671e-06, + "loss": 1.0319, + "step": 7242 + }, + { + "epoch": 1.4320981484720523, + "grad_norm": 2.21875, + "learning_rate": 5.326446057089905e-06, + "loss": 0.9916, + "step": 7243 + }, + { + "epoch": 1.4322980435271482, + "grad_norm": 2.234375, + "learning_rate": 5.325394496882439e-06, + "loss": 0.9394, + "step": 7244 + }, + { + "epoch": 1.4324979385822443, + "grad_norm": 2.203125, + "learning_rate": 5.324342922220983e-06, + "loss": 0.9727, + "step": 7245 + }, + { + "epoch": 1.4326978336373404, + "grad_norm": 2.09375, + "learning_rate": 5.323291333152251e-06, + "loss": 1.0183, + "step": 7246 + }, + { + "epoch": 1.4328977286924365, + "grad_norm": 2.140625, + "learning_rate": 5.322239729722951e-06, + "loss": 0.9756, + "step": 7247 + }, + { + "epoch": 1.4330976237475326, + "grad_norm": 2.046875, + "learning_rate": 5.321188111979797e-06, + "loss": 0.9, + "step": 7248 + }, + { + "epoch": 1.4332975188026287, + "grad_norm": 2.046875, + "learning_rate": 5.320136479969502e-06, + "loss": 0.892, + "step": 7249 + }, + { + "epoch": 1.4334974138577246, + "grad_norm": 2.09375, + "learning_rate": 5.319084833738779e-06, + "loss": 0.9366, + "step": 7250 + }, + { + "epoch": 1.4336973089128207, + "grad_norm": 2.1875, + "learning_rate": 5.318033173334341e-06, + "loss": 0.9634, + "step": 7251 + }, + { + "epoch": 1.4338972039679168, + "grad_norm": 2.1875, + "learning_rate": 5.316981498802905e-06, + "loss": 0.8643, + "step": 7252 + }, + { + "epoch": 1.4340970990230129, + "grad_norm": 2.078125, + "learning_rate": 5.315929810191183e-06, + "loss": 0.977, + "step": 7253 + }, + { + "epoch": 1.434296994078109, + "grad_norm": 2.125, + "learning_rate": 5.3148781075458924e-06, + "loss": 0.9984, + "step": 7254 + }, + { + "epoch": 1.434496889133205, + "grad_norm": 2.203125, + "learning_rate": 5.313826390913751e-06, + "loss": 0.9491, + "step": 7255 + }, + { + "epoch": 1.4346967841883012, + "grad_norm": 2.21875, + "learning_rate": 5.312774660341473e-06, + "loss": 1.0773, + "step": 7256 + }, + { + "epoch": 1.4348966792433973, + "grad_norm": 2.1875, + "learning_rate": 5.311722915875781e-06, + "loss": 1.0079, + "step": 7257 + }, + { + "epoch": 1.4350965742984934, + "grad_norm": 2.265625, + "learning_rate": 5.310671157563387e-06, + "loss": 1.0116, + "step": 7258 + }, + { + "epoch": 1.4352964693535895, + "grad_norm": 2.15625, + "learning_rate": 5.309619385451016e-06, + "loss": 1.018, + "step": 7259 + }, + { + "epoch": 1.4354963644086856, + "grad_norm": 2.203125, + "learning_rate": 5.308567599585384e-06, + "loss": 0.965, + "step": 7260 + }, + { + "epoch": 1.4356962594637814, + "grad_norm": 2.234375, + "learning_rate": 5.307515800013212e-06, + "loss": 1.0097, + "step": 7261 + }, + { + "epoch": 1.4358961545188775, + "grad_norm": 2.125, + "learning_rate": 5.3064639867812205e-06, + "loss": 1.0474, + "step": 7262 + }, + { + "epoch": 1.4360960495739736, + "grad_norm": 2.046875, + "learning_rate": 5.305412159936133e-06, + "loss": 0.9374, + "step": 7263 + }, + { + "epoch": 1.4362959446290697, + "grad_norm": 2.15625, + "learning_rate": 5.3043603195246684e-06, + "loss": 0.9425, + "step": 7264 + }, + { + "epoch": 1.4364958396841658, + "grad_norm": 2.125, + "learning_rate": 5.303308465593552e-06, + "loss": 0.9616, + "step": 7265 + }, + { + "epoch": 1.436695734739262, + "grad_norm": 2.125, + "learning_rate": 5.3022565981895045e-06, + "loss": 0.994, + "step": 7266 + }, + { + "epoch": 1.436895629794358, + "grad_norm": 2.234375, + "learning_rate": 5.301204717359253e-06, + "loss": 1.0014, + "step": 7267 + }, + { + "epoch": 1.437095524849454, + "grad_norm": 2.203125, + "learning_rate": 5.300152823149519e-06, + "loss": 1.0367, + "step": 7268 + }, + { + "epoch": 1.43729541990455, + "grad_norm": 2.15625, + "learning_rate": 5.299100915607029e-06, + "loss": 0.8804, + "step": 7269 + }, + { + "epoch": 1.437495314959646, + "grad_norm": 2.09375, + "learning_rate": 5.298048994778508e-06, + "loss": 0.8828, + "step": 7270 + }, + { + "epoch": 1.4376952100147422, + "grad_norm": 2.125, + "learning_rate": 5.296997060710684e-06, + "loss": 0.8776, + "step": 7271 + }, + { + "epoch": 1.4378951050698383, + "grad_norm": 2.203125, + "learning_rate": 5.29594511345028e-06, + "loss": 0.98, + "step": 7272 + }, + { + "epoch": 1.4380950001249344, + "grad_norm": 2.265625, + "learning_rate": 5.294893153044027e-06, + "loss": 1.0607, + "step": 7273 + }, + { + "epoch": 1.4382948951800305, + "grad_norm": 2.34375, + "learning_rate": 5.2938411795386516e-06, + "loss": 0.9884, + "step": 7274 + }, + { + "epoch": 1.4384947902351266, + "grad_norm": 2.265625, + "learning_rate": 5.292789192980882e-06, + "loss": 1.0028, + "step": 7275 + }, + { + "epoch": 1.4386946852902227, + "grad_norm": 2.234375, + "learning_rate": 5.291737193417448e-06, + "loss": 0.9676, + "step": 7276 + }, + { + "epoch": 1.4388945803453188, + "grad_norm": 2.234375, + "learning_rate": 5.290685180895078e-06, + "loss": 0.9076, + "step": 7277 + }, + { + "epoch": 1.439094475400415, + "grad_norm": 2.15625, + "learning_rate": 5.289633155460504e-06, + "loss": 0.9082, + "step": 7278 + }, + { + "epoch": 1.4392943704555108, + "grad_norm": 2.109375, + "learning_rate": 5.288581117160457e-06, + "loss": 0.9861, + "step": 7279 + }, + { + "epoch": 1.4394942655106069, + "grad_norm": 2.296875, + "learning_rate": 5.287529066041666e-06, + "loss": 0.9693, + "step": 7280 + }, + { + "epoch": 1.439694160565703, + "grad_norm": 2.1875, + "learning_rate": 5.286477002150866e-06, + "loss": 0.9659, + "step": 7281 + }, + { + "epoch": 1.439894055620799, + "grad_norm": 2.03125, + "learning_rate": 5.285424925534788e-06, + "loss": 0.8919, + "step": 7282 + }, + { + "epoch": 1.4400939506758952, + "grad_norm": 2.1875, + "learning_rate": 5.284372836240166e-06, + "loss": 0.9426, + "step": 7283 + }, + { + "epoch": 1.4402938457309913, + "grad_norm": 2.15625, + "learning_rate": 5.283320734313732e-06, + "loss": 0.9294, + "step": 7284 + }, + { + "epoch": 1.4404937407860874, + "grad_norm": 2.1875, + "learning_rate": 5.282268619802221e-06, + "loss": 1.0199, + "step": 7285 + }, + { + "epoch": 1.4406936358411833, + "grad_norm": 2.109375, + "learning_rate": 5.281216492752368e-06, + "loss": 1.0224, + "step": 7286 + }, + { + "epoch": 1.4408935308962794, + "grad_norm": 2.1875, + "learning_rate": 5.280164353210908e-06, + "loss": 0.851, + "step": 7287 + }, + { + "epoch": 1.4410934259513755, + "grad_norm": 2.1875, + "learning_rate": 5.279112201224579e-06, + "loss": 1.0081, + "step": 7288 + }, + { + "epoch": 1.4412933210064716, + "grad_norm": 2.0625, + "learning_rate": 5.278060036840115e-06, + "loss": 0.9771, + "step": 7289 + }, + { + "epoch": 1.4414932160615677, + "grad_norm": 2.15625, + "learning_rate": 5.2770078601042526e-06, + "loss": 0.9269, + "step": 7290 + }, + { + "epoch": 1.4416931111166638, + "grad_norm": 2.0625, + "learning_rate": 5.275955671063733e-06, + "loss": 0.9333, + "step": 7291 + }, + { + "epoch": 1.4418930061717599, + "grad_norm": 2.15625, + "learning_rate": 5.27490346976529e-06, + "loss": 0.9411, + "step": 7292 + }, + { + "epoch": 1.442092901226856, + "grad_norm": 2.28125, + "learning_rate": 5.273851256255664e-06, + "loss": 1.0253, + "step": 7293 + }, + { + "epoch": 1.442292796281952, + "grad_norm": 2.28125, + "learning_rate": 5.272799030581597e-06, + "loss": 0.947, + "step": 7294 + }, + { + "epoch": 1.4424926913370482, + "grad_norm": 2.078125, + "learning_rate": 5.271746792789824e-06, + "loss": 0.9798, + "step": 7295 + }, + { + "epoch": 1.4426925863921443, + "grad_norm": 2.125, + "learning_rate": 5.270694542927089e-06, + "loss": 0.947, + "step": 7296 + }, + { + "epoch": 1.4428924814472401, + "grad_norm": 2.171875, + "learning_rate": 5.269642281040129e-06, + "loss": 0.8852, + "step": 7297 + }, + { + "epoch": 1.4430923765023362, + "grad_norm": 2.15625, + "learning_rate": 5.26859000717569e-06, + "loss": 0.9008, + "step": 7298 + }, + { + "epoch": 1.4432922715574323, + "grad_norm": 2.125, + "learning_rate": 5.26753772138051e-06, + "loss": 1.0346, + "step": 7299 + }, + { + "epoch": 1.4434921666125284, + "grad_norm": 2.265625, + "learning_rate": 5.266485423701335e-06, + "loss": 1.0262, + "step": 7300 + }, + { + "epoch": 1.4436920616676245, + "grad_norm": 2.171875, + "learning_rate": 5.265433114184903e-06, + "loss": 1.0945, + "step": 7301 + }, + { + "epoch": 1.4438919567227206, + "grad_norm": 2.109375, + "learning_rate": 5.264380792877963e-06, + "loss": 0.9568, + "step": 7302 + }, + { + "epoch": 1.4440918517778165, + "grad_norm": 2.15625, + "learning_rate": 5.263328459827256e-06, + "loss": 0.9913, + "step": 7303 + }, + { + "epoch": 1.4442917468329126, + "grad_norm": 2.28125, + "learning_rate": 5.262276115079526e-06, + "loss": 1.01, + "step": 7304 + }, + { + "epoch": 1.4444916418880087, + "grad_norm": 2.0625, + "learning_rate": 5.26122375868152e-06, + "loss": 0.9188, + "step": 7305 + }, + { + "epoch": 1.4446915369431048, + "grad_norm": 2.15625, + "learning_rate": 5.260171390679981e-06, + "loss": 1.0598, + "step": 7306 + }, + { + "epoch": 1.444891431998201, + "grad_norm": 2.25, + "learning_rate": 5.2591190111216574e-06, + "loss": 0.9469, + "step": 7307 + }, + { + "epoch": 1.445091327053297, + "grad_norm": 2.125, + "learning_rate": 5.258066620053295e-06, + "loss": 0.918, + "step": 7308 + }, + { + "epoch": 1.4452912221083931, + "grad_norm": 2.34375, + "learning_rate": 5.257014217521641e-06, + "loss": 1.0795, + "step": 7309 + }, + { + "epoch": 1.4454911171634892, + "grad_norm": 2.234375, + "learning_rate": 5.255961803573444e-06, + "loss": 1.0236, + "step": 7310 + }, + { + "epoch": 1.4456910122185853, + "grad_norm": 2.25, + "learning_rate": 5.254909378255448e-06, + "loss": 1.0371, + "step": 7311 + }, + { + "epoch": 1.4458909072736814, + "grad_norm": 2.0625, + "learning_rate": 5.2538569416144065e-06, + "loss": 0.8564, + "step": 7312 + }, + { + "epoch": 1.4460908023287775, + "grad_norm": 2.3125, + "learning_rate": 5.252804493697068e-06, + "loss": 1.0691, + "step": 7313 + }, + { + "epoch": 1.4462906973838734, + "grad_norm": 2.25, + "learning_rate": 5.251752034550181e-06, + "loss": 1.0646, + "step": 7314 + }, + { + "epoch": 1.4464905924389695, + "grad_norm": 2.21875, + "learning_rate": 5.250699564220493e-06, + "loss": 0.9689, + "step": 7315 + }, + { + "epoch": 1.4466904874940656, + "grad_norm": 2.21875, + "learning_rate": 5.24964708275476e-06, + "loss": 1.0675, + "step": 7316 + }, + { + "epoch": 1.4468903825491617, + "grad_norm": 2.09375, + "learning_rate": 5.248594590199727e-06, + "loss": 0.9403, + "step": 7317 + }, + { + "epoch": 1.4470902776042578, + "grad_norm": 2.03125, + "learning_rate": 5.247542086602151e-06, + "loss": 0.8519, + "step": 7318 + }, + { + "epoch": 1.447290172659354, + "grad_norm": 2.171875, + "learning_rate": 5.2464895720087816e-06, + "loss": 0.908, + "step": 7319 + }, + { + "epoch": 1.44749006771445, + "grad_norm": 2.140625, + "learning_rate": 5.245437046466371e-06, + "loss": 0.9291, + "step": 7320 + }, + { + "epoch": 1.4476899627695459, + "grad_norm": 2.234375, + "learning_rate": 5.244384510021673e-06, + "loss": 0.9848, + "step": 7321 + }, + { + "epoch": 1.447889857824642, + "grad_norm": 2.140625, + "learning_rate": 5.243331962721443e-06, + "loss": 0.9379, + "step": 7322 + }, + { + "epoch": 1.448089752879738, + "grad_norm": 1.90625, + "learning_rate": 5.242279404612431e-06, + "loss": 0.8042, + "step": 7323 + }, + { + "epoch": 1.4482896479348342, + "grad_norm": 2.125, + "learning_rate": 5.241226835741395e-06, + "loss": 0.943, + "step": 7324 + }, + { + "epoch": 1.4484895429899303, + "grad_norm": 2.015625, + "learning_rate": 5.240174256155088e-06, + "loss": 0.8862, + "step": 7325 + }, + { + "epoch": 1.4486894380450264, + "grad_norm": 2.1875, + "learning_rate": 5.239121665900266e-06, + "loss": 0.9228, + "step": 7326 + }, + { + "epoch": 1.4488893331001225, + "grad_norm": 2.140625, + "learning_rate": 5.238069065023686e-06, + "loss": 1.0576, + "step": 7327 + }, + { + "epoch": 1.4490892281552186, + "grad_norm": 2.1875, + "learning_rate": 5.237016453572103e-06, + "loss": 0.8776, + "step": 7328 + }, + { + "epoch": 1.4492891232103147, + "grad_norm": 2.078125, + "learning_rate": 5.235963831592275e-06, + "loss": 1.0057, + "step": 7329 + }, + { + "epoch": 1.4494890182654108, + "grad_norm": 2.3125, + "learning_rate": 5.23491119913096e-06, + "loss": 1.0676, + "step": 7330 + }, + { + "epoch": 1.4496889133205069, + "grad_norm": 2.140625, + "learning_rate": 5.233858556234913e-06, + "loss": 0.9775, + "step": 7331 + }, + { + "epoch": 1.4498888083756027, + "grad_norm": 2.28125, + "learning_rate": 5.232805902950894e-06, + "loss": 1.0474, + "step": 7332 + }, + { + "epoch": 1.4500887034306988, + "grad_norm": 2.109375, + "learning_rate": 5.231753239325662e-06, + "loss": 0.9413, + "step": 7333 + }, + { + "epoch": 1.450288598485795, + "grad_norm": 2.203125, + "learning_rate": 5.230700565405975e-06, + "loss": 0.9317, + "step": 7334 + }, + { + "epoch": 1.450488493540891, + "grad_norm": 2.25, + "learning_rate": 5.229647881238594e-06, + "loss": 1.0024, + "step": 7335 + }, + { + "epoch": 1.4506883885959871, + "grad_norm": 2.203125, + "learning_rate": 5.228595186870279e-06, + "loss": 0.9927, + "step": 7336 + }, + { + "epoch": 1.4508882836510832, + "grad_norm": 2.1875, + "learning_rate": 5.227542482347789e-06, + "loss": 0.9571, + "step": 7337 + }, + { + "epoch": 1.4510881787061791, + "grad_norm": 2.125, + "learning_rate": 5.226489767717887e-06, + "loss": 0.9475, + "step": 7338 + }, + { + "epoch": 1.4512880737612752, + "grad_norm": 2.140625, + "learning_rate": 5.225437043027335e-06, + "loss": 1.0594, + "step": 7339 + }, + { + "epoch": 1.4514879688163713, + "grad_norm": 2.046875, + "learning_rate": 5.224384308322892e-06, + "loss": 0.9649, + "step": 7340 + }, + { + "epoch": 1.4516878638714674, + "grad_norm": 2.046875, + "learning_rate": 5.2233315636513225e-06, + "loss": 1.0089, + "step": 7341 + }, + { + "epoch": 1.4518877589265635, + "grad_norm": 2.125, + "learning_rate": 5.222278809059389e-06, + "loss": 1.0211, + "step": 7342 + }, + { + "epoch": 1.4520876539816596, + "grad_norm": 2.046875, + "learning_rate": 5.221226044593855e-06, + "loss": 0.8659, + "step": 7343 + }, + { + "epoch": 1.4522875490367557, + "grad_norm": 2.15625, + "learning_rate": 5.220173270301483e-06, + "loss": 0.9456, + "step": 7344 + }, + { + "epoch": 1.4524874440918518, + "grad_norm": 2.140625, + "learning_rate": 5.219120486229038e-06, + "loss": 0.9313, + "step": 7345 + }, + { + "epoch": 1.452687339146948, + "grad_norm": 2.125, + "learning_rate": 5.2180676924232846e-06, + "loss": 0.9382, + "step": 7346 + }, + { + "epoch": 1.452887234202044, + "grad_norm": 2.140625, + "learning_rate": 5.217014888930988e-06, + "loss": 0.9936, + "step": 7347 + }, + { + "epoch": 1.4530871292571401, + "grad_norm": 2.125, + "learning_rate": 5.215962075798912e-06, + "loss": 1.0392, + "step": 7348 + }, + { + "epoch": 1.453287024312236, + "grad_norm": 2.203125, + "learning_rate": 5.214909253073823e-06, + "loss": 1.0348, + "step": 7349 + }, + { + "epoch": 1.453486919367332, + "grad_norm": 2.140625, + "learning_rate": 5.21385642080249e-06, + "loss": 0.8826, + "step": 7350 + }, + { + "epoch": 1.4536868144224282, + "grad_norm": 2.234375, + "learning_rate": 5.212803579031676e-06, + "loss": 0.9775, + "step": 7351 + }, + { + "epoch": 1.4538867094775243, + "grad_norm": 1.96875, + "learning_rate": 5.21175072780815e-06, + "loss": 0.8595, + "step": 7352 + }, + { + "epoch": 1.4540866045326204, + "grad_norm": 2.359375, + "learning_rate": 5.2106978671786815e-06, + "loss": 0.9054, + "step": 7353 + }, + { + "epoch": 1.4542864995877165, + "grad_norm": 2.171875, + "learning_rate": 5.209644997190033e-06, + "loss": 0.9559, + "step": 7354 + }, + { + "epoch": 1.4544863946428126, + "grad_norm": 2.09375, + "learning_rate": 5.2085921178889785e-06, + "loss": 0.9998, + "step": 7355 + }, + { + "epoch": 1.4546862896979085, + "grad_norm": 2.203125, + "learning_rate": 5.207539229322284e-06, + "loss": 1.0, + "step": 7356 + }, + { + "epoch": 1.4548861847530046, + "grad_norm": 2.125, + "learning_rate": 5.206486331536718e-06, + "loss": 1.0222, + "step": 7357 + }, + { + "epoch": 1.4550860798081007, + "grad_norm": 2.203125, + "learning_rate": 5.205433424579052e-06, + "loss": 0.9159, + "step": 7358 + }, + { + "epoch": 1.4552859748631968, + "grad_norm": 2.140625, + "learning_rate": 5.204380508496055e-06, + "loss": 0.9389, + "step": 7359 + }, + { + "epoch": 1.4554858699182929, + "grad_norm": 2.203125, + "learning_rate": 5.203327583334499e-06, + "loss": 1.0703, + "step": 7360 + }, + { + "epoch": 1.455685764973389, + "grad_norm": 2.140625, + "learning_rate": 5.202274649141152e-06, + "loss": 0.9504, + "step": 7361 + }, + { + "epoch": 1.455885660028485, + "grad_norm": 2.28125, + "learning_rate": 5.2012217059627865e-06, + "loss": 1.0219, + "step": 7362 + }, + { + "epoch": 1.4560855550835812, + "grad_norm": 2.09375, + "learning_rate": 5.200168753846174e-06, + "loss": 0.9764, + "step": 7363 + }, + { + "epoch": 1.4562854501386773, + "grad_norm": 2.125, + "learning_rate": 5.199115792838089e-06, + "loss": 1.0029, + "step": 7364 + }, + { + "epoch": 1.4564853451937734, + "grad_norm": 2.265625, + "learning_rate": 5.1980628229852995e-06, + "loss": 0.9554, + "step": 7365 + }, + { + "epoch": 1.4566852402488695, + "grad_norm": 2.3125, + "learning_rate": 5.197009844334581e-06, + "loss": 0.9349, + "step": 7366 + }, + { + "epoch": 1.4568851353039654, + "grad_norm": 2.109375, + "learning_rate": 5.195956856932707e-06, + "loss": 0.9816, + "step": 7367 + }, + { + "epoch": 1.4570850303590615, + "grad_norm": 2.1875, + "learning_rate": 5.194903860826449e-06, + "loss": 1.0072, + "step": 7368 + }, + { + "epoch": 1.4572849254141576, + "grad_norm": 2.109375, + "learning_rate": 5.193850856062585e-06, + "loss": 0.9783, + "step": 7369 + }, + { + "epoch": 1.4574848204692537, + "grad_norm": 2.078125, + "learning_rate": 5.192797842687883e-06, + "loss": 0.8432, + "step": 7370 + }, + { + "epoch": 1.4576847155243498, + "grad_norm": 2.125, + "learning_rate": 5.191744820749124e-06, + "loss": 0.8715, + "step": 7371 + }, + { + "epoch": 1.4578846105794458, + "grad_norm": 2.234375, + "learning_rate": 5.1906917902930795e-06, + "loss": 0.9895, + "step": 7372 + }, + { + "epoch": 1.4580845056345417, + "grad_norm": 2.21875, + "learning_rate": 5.189638751366527e-06, + "loss": 0.9933, + "step": 7373 + }, + { + "epoch": 1.4582844006896378, + "grad_norm": 2.15625, + "learning_rate": 5.1885857040162406e-06, + "loss": 1.0211, + "step": 7374 + }, + { + "epoch": 1.458484295744734, + "grad_norm": 2.296875, + "learning_rate": 5.187532648288997e-06, + "loss": 0.9895, + "step": 7375 + }, + { + "epoch": 1.45868419079983, + "grad_norm": 2.140625, + "learning_rate": 5.186479584231575e-06, + "loss": 1.005, + "step": 7376 + }, + { + "epoch": 1.4588840858549261, + "grad_norm": 2.140625, + "learning_rate": 5.185426511890749e-06, + "loss": 1.0098, + "step": 7377 + }, + { + "epoch": 1.4590839809100222, + "grad_norm": 2.296875, + "learning_rate": 5.184373431313297e-06, + "loss": 0.9266, + "step": 7378 + }, + { + "epoch": 1.4592838759651183, + "grad_norm": 1.96875, + "learning_rate": 5.183320342545995e-06, + "loss": 0.9569, + "step": 7379 + }, + { + "epoch": 1.4594837710202144, + "grad_norm": 2.046875, + "learning_rate": 5.1822672456356245e-06, + "loss": 0.9165, + "step": 7380 + }, + { + "epoch": 1.4596836660753105, + "grad_norm": 2.109375, + "learning_rate": 5.181214140628964e-06, + "loss": 0.9835, + "step": 7381 + }, + { + "epoch": 1.4598835611304066, + "grad_norm": 2.15625, + "learning_rate": 5.180161027572788e-06, + "loss": 0.9744, + "step": 7382 + }, + { + "epoch": 1.4600834561855027, + "grad_norm": 2.140625, + "learning_rate": 5.179107906513879e-06, + "loss": 0.974, + "step": 7383 + }, + { + "epoch": 1.4602833512405986, + "grad_norm": 2.109375, + "learning_rate": 5.178054777499018e-06, + "loss": 0.9033, + "step": 7384 + }, + { + "epoch": 1.4604832462956947, + "grad_norm": 2.0625, + "learning_rate": 5.177001640574981e-06, + "loss": 0.9548, + "step": 7385 + }, + { + "epoch": 1.4606831413507908, + "grad_norm": 2.078125, + "learning_rate": 5.1759484957885505e-06, + "loss": 0.9575, + "step": 7386 + }, + { + "epoch": 1.460883036405887, + "grad_norm": 2.265625, + "learning_rate": 5.1748953431865064e-06, + "loss": 1.0753, + "step": 7387 + }, + { + "epoch": 1.461082931460983, + "grad_norm": 2.15625, + "learning_rate": 5.1738421828156295e-06, + "loss": 0.988, + "step": 7388 + }, + { + "epoch": 1.461282826516079, + "grad_norm": 2.125, + "learning_rate": 5.172789014722702e-06, + "loss": 0.9311, + "step": 7389 + }, + { + "epoch": 1.4614827215711752, + "grad_norm": 2.140625, + "learning_rate": 5.171735838954505e-06, + "loss": 0.9836, + "step": 7390 + }, + { + "epoch": 1.461682616626271, + "grad_norm": 2.09375, + "learning_rate": 5.17068265555782e-06, + "loss": 0.9558, + "step": 7391 + }, + { + "epoch": 1.4618825116813672, + "grad_norm": 2.203125, + "learning_rate": 5.169629464579431e-06, + "loss": 0.9883, + "step": 7392 + }, + { + "epoch": 1.4620824067364633, + "grad_norm": 2.078125, + "learning_rate": 5.168576266066118e-06, + "loss": 0.9813, + "step": 7393 + }, + { + "epoch": 1.4622823017915594, + "grad_norm": 2.109375, + "learning_rate": 5.1675230600646645e-06, + "loss": 0.898, + "step": 7394 + }, + { + "epoch": 1.4624821968466555, + "grad_norm": 2.171875, + "learning_rate": 5.166469846621856e-06, + "loss": 0.9725, + "step": 7395 + }, + { + "epoch": 1.4626820919017516, + "grad_norm": 2.265625, + "learning_rate": 5.165416625784474e-06, + "loss": 0.9968, + "step": 7396 + }, + { + "epoch": 1.4628819869568477, + "grad_norm": 2.328125, + "learning_rate": 5.164363397599302e-06, + "loss": 0.9762, + "step": 7397 + }, + { + "epoch": 1.4630818820119438, + "grad_norm": 2.109375, + "learning_rate": 5.163310162113128e-06, + "loss": 1.0231, + "step": 7398 + }, + { + "epoch": 1.4632817770670399, + "grad_norm": 2.234375, + "learning_rate": 5.162256919372732e-06, + "loss": 1.0243, + "step": 7399 + }, + { + "epoch": 1.463481672122136, + "grad_norm": 2.078125, + "learning_rate": 5.1612036694249025e-06, + "loss": 0.9186, + "step": 7400 + }, + { + "epoch": 1.463681567177232, + "grad_norm": 2.171875, + "learning_rate": 5.160150412316421e-06, + "loss": 0.9296, + "step": 7401 + }, + { + "epoch": 1.463881462232328, + "grad_norm": 2.15625, + "learning_rate": 5.159097148094076e-06, + "loss": 0.8964, + "step": 7402 + }, + { + "epoch": 1.464081357287424, + "grad_norm": 2.171875, + "learning_rate": 5.158043876804655e-06, + "loss": 1.0496, + "step": 7403 + }, + { + "epoch": 1.4642812523425202, + "grad_norm": 2.1875, + "learning_rate": 5.15699059849494e-06, + "loss": 0.9805, + "step": 7404 + }, + { + "epoch": 1.4644811473976163, + "grad_norm": 2.109375, + "learning_rate": 5.155937313211719e-06, + "loss": 0.8961, + "step": 7405 + }, + { + "epoch": 1.4646810424527124, + "grad_norm": 2.109375, + "learning_rate": 5.15488402100178e-06, + "loss": 0.9394, + "step": 7406 + }, + { + "epoch": 1.4648809375078085, + "grad_norm": 2.296875, + "learning_rate": 5.15383072191191e-06, + "loss": 0.9571, + "step": 7407 + }, + { + "epoch": 1.4650808325629046, + "grad_norm": 2.078125, + "learning_rate": 5.152777415988894e-06, + "loss": 0.9525, + "step": 7408 + }, + { + "epoch": 1.4652807276180004, + "grad_norm": 2.109375, + "learning_rate": 5.151724103279524e-06, + "loss": 0.9849, + "step": 7409 + }, + { + "epoch": 1.4654806226730965, + "grad_norm": 2.140625, + "learning_rate": 5.150670783830584e-06, + "loss": 0.9647, + "step": 7410 + }, + { + "epoch": 1.4656805177281926, + "grad_norm": 2.375, + "learning_rate": 5.149617457688862e-06, + "loss": 0.9923, + "step": 7411 + }, + { + "epoch": 1.4658804127832887, + "grad_norm": 2.359375, + "learning_rate": 5.148564124901152e-06, + "loss": 0.9533, + "step": 7412 + }, + { + "epoch": 1.4660803078383848, + "grad_norm": 2.0625, + "learning_rate": 5.147510785514237e-06, + "loss": 0.8886, + "step": 7413 + }, + { + "epoch": 1.466280202893481, + "grad_norm": 2.25, + "learning_rate": 5.14645743957491e-06, + "loss": 1.0243, + "step": 7414 + }, + { + "epoch": 1.466480097948577, + "grad_norm": 2.265625, + "learning_rate": 5.145404087129959e-06, + "loss": 1.0198, + "step": 7415 + }, + { + "epoch": 1.4666799930036731, + "grad_norm": 2.046875, + "learning_rate": 5.144350728226173e-06, + "loss": 0.9834, + "step": 7416 + }, + { + "epoch": 1.4668798880587692, + "grad_norm": 2.171875, + "learning_rate": 5.143297362910344e-06, + "loss": 1.0275, + "step": 7417 + }, + { + "epoch": 1.4670797831138653, + "grad_norm": 5.59375, + "learning_rate": 5.142243991229261e-06, + "loss": 0.9838, + "step": 7418 + }, + { + "epoch": 1.4672796781689614, + "grad_norm": 2.359375, + "learning_rate": 5.141190613229714e-06, + "loss": 0.9877, + "step": 7419 + }, + { + "epoch": 1.4674795732240573, + "grad_norm": 2.15625, + "learning_rate": 5.1401372289584975e-06, + "loss": 0.9014, + "step": 7420 + }, + { + "epoch": 1.4676794682791534, + "grad_norm": 2.109375, + "learning_rate": 5.139083838462398e-06, + "loss": 0.9056, + "step": 7421 + }, + { + "epoch": 1.4678793633342495, + "grad_norm": 2.25, + "learning_rate": 5.138030441788211e-06, + "loss": 0.9881, + "step": 7422 + }, + { + "epoch": 1.4680792583893456, + "grad_norm": 2.125, + "learning_rate": 5.136977038982727e-06, + "loss": 0.8494, + "step": 7423 + }, + { + "epoch": 1.4682791534444417, + "grad_norm": 2.171875, + "learning_rate": 5.135923630092736e-06, + "loss": 1.0048, + "step": 7424 + }, + { + "epoch": 1.4684790484995378, + "grad_norm": 2.234375, + "learning_rate": 5.134870215165032e-06, + "loss": 1.0214, + "step": 7425 + }, + { + "epoch": 1.4686789435546337, + "grad_norm": 2.109375, + "learning_rate": 5.133816794246408e-06, + "loss": 0.9724, + "step": 7426 + }, + { + "epoch": 1.4688788386097298, + "grad_norm": 2.21875, + "learning_rate": 5.132763367383655e-06, + "loss": 1.0002, + "step": 7427 + }, + { + "epoch": 1.4690787336648259, + "grad_norm": 2.125, + "learning_rate": 5.131709934623568e-06, + "loss": 0.9988, + "step": 7428 + }, + { + "epoch": 1.469278628719922, + "grad_norm": 2.28125, + "learning_rate": 5.130656496012941e-06, + "loss": 0.8107, + "step": 7429 + }, + { + "epoch": 1.469478523775018, + "grad_norm": 2.125, + "learning_rate": 5.129603051598565e-06, + "loss": 1.0389, + "step": 7430 + }, + { + "epoch": 1.4696784188301142, + "grad_norm": 2.265625, + "learning_rate": 5.128549601427237e-06, + "loss": 1.0429, + "step": 7431 + }, + { + "epoch": 1.4698783138852103, + "grad_norm": 2.3125, + "learning_rate": 5.127496145545747e-06, + "loss": 1.1101, + "step": 7432 + }, + { + "epoch": 1.4700782089403064, + "grad_norm": 2.203125, + "learning_rate": 5.126442684000893e-06, + "loss": 0.9468, + "step": 7433 + }, + { + "epoch": 1.4702781039954025, + "grad_norm": 2.125, + "learning_rate": 5.125389216839469e-06, + "loss": 0.9814, + "step": 7434 + }, + { + "epoch": 1.4704779990504986, + "grad_norm": 2.203125, + "learning_rate": 5.124335744108269e-06, + "loss": 0.9914, + "step": 7435 + }, + { + "epoch": 1.4706778941055947, + "grad_norm": 2.25, + "learning_rate": 5.123282265854088e-06, + "loss": 0.9354, + "step": 7436 + }, + { + "epoch": 1.4708777891606906, + "grad_norm": 2.078125, + "learning_rate": 5.122228782123723e-06, + "loss": 1.0049, + "step": 7437 + }, + { + "epoch": 1.4710776842157867, + "grad_norm": 2.140625, + "learning_rate": 5.121175292963968e-06, + "loss": 0.9465, + "step": 7438 + }, + { + "epoch": 1.4712775792708828, + "grad_norm": 2.28125, + "learning_rate": 5.120121798421619e-06, + "loss": 0.9718, + "step": 7439 + }, + { + "epoch": 1.4714774743259789, + "grad_norm": 2.0625, + "learning_rate": 5.119068298543474e-06, + "loss": 0.9102, + "step": 7440 + }, + { + "epoch": 1.471677369381075, + "grad_norm": 2.046875, + "learning_rate": 5.1180147933763255e-06, + "loss": 0.9933, + "step": 7441 + }, + { + "epoch": 1.471877264436171, + "grad_norm": 2.109375, + "learning_rate": 5.116961282966975e-06, + "loss": 0.9802, + "step": 7442 + }, + { + "epoch": 1.4720771594912672, + "grad_norm": 2.0625, + "learning_rate": 5.1159077673622145e-06, + "loss": 0.9478, + "step": 7443 + }, + { + "epoch": 1.472277054546363, + "grad_norm": 2.34375, + "learning_rate": 5.114854246608845e-06, + "loss": 1.0779, + "step": 7444 + }, + { + "epoch": 1.4724769496014591, + "grad_norm": 2.234375, + "learning_rate": 5.113800720753662e-06, + "loss": 1.0184, + "step": 7445 + }, + { + "epoch": 1.4726768446565552, + "grad_norm": 2.1875, + "learning_rate": 5.112747189843462e-06, + "loss": 1.0157, + "step": 7446 + }, + { + "epoch": 1.4728767397116513, + "grad_norm": 2.1875, + "learning_rate": 5.111693653925045e-06, + "loss": 1.0297, + "step": 7447 + }, + { + "epoch": 1.4730766347667474, + "grad_norm": 2.171875, + "learning_rate": 5.110640113045208e-06, + "loss": 0.9191, + "step": 7448 + }, + { + "epoch": 1.4732765298218435, + "grad_norm": 2.15625, + "learning_rate": 5.109586567250747e-06, + "loss": 0.9916, + "step": 7449 + }, + { + "epoch": 1.4734764248769396, + "grad_norm": 2.203125, + "learning_rate": 5.108533016588464e-06, + "loss": 0.9522, + "step": 7450 + }, + { + "epoch": 1.4736763199320357, + "grad_norm": 2.21875, + "learning_rate": 5.107479461105156e-06, + "loss": 0.9472, + "step": 7451 + }, + { + "epoch": 1.4738762149871318, + "grad_norm": 2.171875, + "learning_rate": 5.106425900847621e-06, + "loss": 0.9726, + "step": 7452 + }, + { + "epoch": 1.474076110042228, + "grad_norm": 2.125, + "learning_rate": 5.10537233586266e-06, + "loss": 0.9501, + "step": 7453 + }, + { + "epoch": 1.474276005097324, + "grad_norm": 2.15625, + "learning_rate": 5.104318766197072e-06, + "loss": 0.9535, + "step": 7454 + }, + { + "epoch": 1.47447590015242, + "grad_norm": 2.1875, + "learning_rate": 5.103265191897654e-06, + "loss": 0.9887, + "step": 7455 + }, + { + "epoch": 1.474675795207516, + "grad_norm": 2.09375, + "learning_rate": 5.102211613011207e-06, + "loss": 0.964, + "step": 7456 + }, + { + "epoch": 1.4748756902626121, + "grad_norm": 2.0625, + "learning_rate": 5.101158029584531e-06, + "loss": 0.9412, + "step": 7457 + }, + { + "epoch": 1.4750755853177082, + "grad_norm": 2.1875, + "learning_rate": 5.100104441664426e-06, + "loss": 1.0356, + "step": 7458 + }, + { + "epoch": 1.4752754803728043, + "grad_norm": 2.453125, + "learning_rate": 5.099050849297694e-06, + "loss": 0.9289, + "step": 7459 + }, + { + "epoch": 1.4754753754279004, + "grad_norm": 2.15625, + "learning_rate": 5.097997252531133e-06, + "loss": 0.9428, + "step": 7460 + }, + { + "epoch": 1.4756752704829963, + "grad_norm": 2.078125, + "learning_rate": 5.096943651411543e-06, + "loss": 0.98, + "step": 7461 + }, + { + "epoch": 1.4758751655380924, + "grad_norm": 2.203125, + "learning_rate": 5.095890045985729e-06, + "loss": 0.9712, + "step": 7462 + }, + { + "epoch": 1.4760750605931885, + "grad_norm": 2.09375, + "learning_rate": 5.094836436300486e-06, + "loss": 0.9804, + "step": 7463 + }, + { + "epoch": 1.4762749556482846, + "grad_norm": 2.1875, + "learning_rate": 5.093782822402621e-06, + "loss": 0.9727, + "step": 7464 + }, + { + "epoch": 1.4764748507033807, + "grad_norm": 2.265625, + "learning_rate": 5.0927292043389335e-06, + "loss": 1.0436, + "step": 7465 + }, + { + "epoch": 1.4766747457584768, + "grad_norm": 2.125, + "learning_rate": 5.091675582156224e-06, + "loss": 1.0495, + "step": 7466 + }, + { + "epoch": 1.476874640813573, + "grad_norm": 2.203125, + "learning_rate": 5.090621955901294e-06, + "loss": 0.9945, + "step": 7467 + }, + { + "epoch": 1.477074535868669, + "grad_norm": 2.09375, + "learning_rate": 5.0895683256209485e-06, + "loss": 0.9012, + "step": 7468 + }, + { + "epoch": 1.477274430923765, + "grad_norm": 2.109375, + "learning_rate": 5.088514691361985e-06, + "loss": 0.8838, + "step": 7469 + }, + { + "epoch": 1.4774743259788612, + "grad_norm": 2.140625, + "learning_rate": 5.08746105317121e-06, + "loss": 0.9647, + "step": 7470 + }, + { + "epoch": 1.4776742210339573, + "grad_norm": 2.21875, + "learning_rate": 5.086407411095424e-06, + "loss": 1.0347, + "step": 7471 + }, + { + "epoch": 1.4778741160890532, + "grad_norm": 2.078125, + "learning_rate": 5.08535376518143e-06, + "loss": 0.9503, + "step": 7472 + }, + { + "epoch": 1.4780740111441493, + "grad_norm": 2.171875, + "learning_rate": 5.084300115476031e-06, + "loss": 1.0924, + "step": 7473 + }, + { + "epoch": 1.4782739061992454, + "grad_norm": 2.171875, + "learning_rate": 5.083246462026029e-06, + "loss": 0.9997, + "step": 7474 + }, + { + "epoch": 1.4784738012543415, + "grad_norm": 2.15625, + "learning_rate": 5.082192804878229e-06, + "loss": 0.9579, + "step": 7475 + }, + { + "epoch": 1.4786736963094376, + "grad_norm": 2.15625, + "learning_rate": 5.081139144079433e-06, + "loss": 1.0236, + "step": 7476 + }, + { + "epoch": 1.4788735913645337, + "grad_norm": 2.046875, + "learning_rate": 5.080085479676445e-06, + "loss": 0.9185, + "step": 7477 + }, + { + "epoch": 1.4790734864196298, + "grad_norm": 2.109375, + "learning_rate": 5.079031811716068e-06, + "loss": 0.9896, + "step": 7478 + }, + { + "epoch": 1.4792733814747256, + "grad_norm": 2.265625, + "learning_rate": 5.077978140245108e-06, + "loss": 1.0319, + "step": 7479 + }, + { + "epoch": 1.4794732765298217, + "grad_norm": 2.140625, + "learning_rate": 5.076924465310365e-06, + "loss": 0.9257, + "step": 7480 + }, + { + "epoch": 1.4796731715849178, + "grad_norm": 2.03125, + "learning_rate": 5.075870786958646e-06, + "loss": 0.8942, + "step": 7481 + }, + { + "epoch": 1.479873066640014, + "grad_norm": 2.125, + "learning_rate": 5.0748171052367565e-06, + "loss": 0.9094, + "step": 7482 + }, + { + "epoch": 1.48007296169511, + "grad_norm": 2.046875, + "learning_rate": 5.073763420191498e-06, + "loss": 0.9146, + "step": 7483 + }, + { + "epoch": 1.4802728567502061, + "grad_norm": 2.078125, + "learning_rate": 5.072709731869677e-06, + "loss": 0.9358, + "step": 7484 + }, + { + "epoch": 1.4804727518053022, + "grad_norm": 2.296875, + "learning_rate": 5.071656040318098e-06, + "loss": 0.9649, + "step": 7485 + }, + { + "epoch": 1.4806726468603983, + "grad_norm": 2.109375, + "learning_rate": 5.070602345583564e-06, + "loss": 0.9764, + "step": 7486 + }, + { + "epoch": 1.4808725419154944, + "grad_norm": 2.203125, + "learning_rate": 5.069548647712881e-06, + "loss": 0.9947, + "step": 7487 + }, + { + "epoch": 1.4810724369705905, + "grad_norm": 2.140625, + "learning_rate": 5.0684949467528555e-06, + "loss": 1.0016, + "step": 7488 + }, + { + "epoch": 1.4812723320256866, + "grad_norm": 2.21875, + "learning_rate": 5.06744124275029e-06, + "loss": 1.041, + "step": 7489 + }, + { + "epoch": 1.4814722270807825, + "grad_norm": 2.125, + "learning_rate": 5.066387535751995e-06, + "loss": 0.9154, + "step": 7490 + }, + { + "epoch": 1.4816721221358786, + "grad_norm": 2.1875, + "learning_rate": 5.06533382580477e-06, + "loss": 1.0142, + "step": 7491 + }, + { + "epoch": 1.4818720171909747, + "grad_norm": 2.234375, + "learning_rate": 5.064280112955424e-06, + "loss": 1.034, + "step": 7492 + }, + { + "epoch": 1.4820719122460708, + "grad_norm": 2.015625, + "learning_rate": 5.063226397250763e-06, + "loss": 0.9688, + "step": 7493 + }, + { + "epoch": 1.482271807301167, + "grad_norm": 2.265625, + "learning_rate": 5.062172678737591e-06, + "loss": 1.0058, + "step": 7494 + }, + { + "epoch": 1.482471702356263, + "grad_norm": 2.296875, + "learning_rate": 5.061118957462716e-06, + "loss": 0.992, + "step": 7495 + }, + { + "epoch": 1.482671597411359, + "grad_norm": 2.109375, + "learning_rate": 5.060065233472944e-06, + "loss": 1.0407, + "step": 7496 + }, + { + "epoch": 1.482871492466455, + "grad_norm": 2.21875, + "learning_rate": 5.059011506815079e-06, + "loss": 1.036, + "step": 7497 + }, + { + "epoch": 1.483071387521551, + "grad_norm": 2.09375, + "learning_rate": 5.0579577775359314e-06, + "loss": 0.8635, + "step": 7498 + }, + { + "epoch": 1.4832712825766472, + "grad_norm": 2.1875, + "learning_rate": 5.0569040456823045e-06, + "loss": 0.9879, + "step": 7499 + }, + { + "epoch": 1.4834711776317433, + "grad_norm": 2.1875, + "learning_rate": 5.055850311301006e-06, + "loss": 0.9769, + "step": 7500 + }, + { + "epoch": 1.4834711776317433, + "eval_loss": 0.9009870290756226, + "eval_runtime": 593.6995, + "eval_samples_per_second": 3.601, + "eval_steps_per_second": 3.601, + "step": 7500 + }, + { + "epoch": 1.4836710726868394, + "grad_norm": 2.21875, + "learning_rate": 5.054796574438842e-06, + "loss": 1.067, + "step": 7501 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 2.15625, + "learning_rate": 5.053742835142621e-06, + "loss": 0.8663, + "step": 7502 + }, + { + "epoch": 1.4840708627970316, + "grad_norm": 2.109375, + "learning_rate": 5.052689093459148e-06, + "loss": 0.9994, + "step": 7503 + }, + { + "epoch": 1.4842707578521277, + "grad_norm": 2.1875, + "learning_rate": 5.051635349435232e-06, + "loss": 1.0215, + "step": 7504 + }, + { + "epoch": 1.4844706529072238, + "grad_norm": 2.109375, + "learning_rate": 5.050581603117679e-06, + "loss": 0.9514, + "step": 7505 + }, + { + "epoch": 1.48467054796232, + "grad_norm": 2.15625, + "learning_rate": 5.049527854553296e-06, + "loss": 0.9356, + "step": 7506 + }, + { + "epoch": 1.4848704430174158, + "grad_norm": 2.1875, + "learning_rate": 5.0484741037888915e-06, + "loss": 0.996, + "step": 7507 + }, + { + "epoch": 1.4850703380725119, + "grad_norm": 2.171875, + "learning_rate": 5.047420350871273e-06, + "loss": 1.0457, + "step": 7508 + }, + { + "epoch": 1.485270233127608, + "grad_norm": 2.203125, + "learning_rate": 5.046366595847246e-06, + "loss": 0.9795, + "step": 7509 + }, + { + "epoch": 1.485470128182704, + "grad_norm": 2.140625, + "learning_rate": 5.045312838763622e-06, + "loss": 0.9604, + "step": 7510 + }, + { + "epoch": 1.4856700232378002, + "grad_norm": 2.1875, + "learning_rate": 5.044259079667204e-06, + "loss": 0.9675, + "step": 7511 + }, + { + "epoch": 1.4858699182928963, + "grad_norm": 2.171875, + "learning_rate": 5.043205318604804e-06, + "loss": 0.9365, + "step": 7512 + }, + { + "epoch": 1.4860698133479924, + "grad_norm": 2.046875, + "learning_rate": 5.0421515556232285e-06, + "loss": 0.9545, + "step": 7513 + }, + { + "epoch": 1.4862697084030883, + "grad_norm": 2.1875, + "learning_rate": 5.041097790769285e-06, + "loss": 1.0955, + "step": 7514 + }, + { + "epoch": 1.4864696034581844, + "grad_norm": 2.265625, + "learning_rate": 5.040044024089782e-06, + "loss": 0.9641, + "step": 7515 + }, + { + "epoch": 1.4866694985132805, + "grad_norm": 2.171875, + "learning_rate": 5.0389902556315286e-06, + "loss": 1.0313, + "step": 7516 + }, + { + "epoch": 1.4868693935683766, + "grad_norm": 2.140625, + "learning_rate": 5.037936485441332e-06, + "loss": 0.9403, + "step": 7517 + }, + { + "epoch": 1.4870692886234727, + "grad_norm": 2.125, + "learning_rate": 5.0368827135659996e-06, + "loss": 1.0066, + "step": 7518 + }, + { + "epoch": 1.4872691836785688, + "grad_norm": 2.21875, + "learning_rate": 5.035828940052342e-06, + "loss": 1.1098, + "step": 7519 + }, + { + "epoch": 1.4874690787336649, + "grad_norm": 2.1875, + "learning_rate": 5.0347751649471655e-06, + "loss": 1.0444, + "step": 7520 + }, + { + "epoch": 1.487668973788761, + "grad_norm": 2.0625, + "learning_rate": 5.033721388297281e-06, + "loss": 0.91, + "step": 7521 + }, + { + "epoch": 1.487868868843857, + "grad_norm": 2.046875, + "learning_rate": 5.0326676101494955e-06, + "loss": 0.917, + "step": 7522 + }, + { + "epoch": 1.4880687638989532, + "grad_norm": 2.171875, + "learning_rate": 5.031613830550618e-06, + "loss": 0.9712, + "step": 7523 + }, + { + "epoch": 1.4882686589540493, + "grad_norm": 2.109375, + "learning_rate": 5.0305600495474586e-06, + "loss": 0.9923, + "step": 7524 + }, + { + "epoch": 1.4884685540091451, + "grad_norm": 2.125, + "learning_rate": 5.029506267186823e-06, + "loss": 0.9984, + "step": 7525 + }, + { + "epoch": 1.4886684490642412, + "grad_norm": 2.171875, + "learning_rate": 5.028452483515524e-06, + "loss": 1.0691, + "step": 7526 + }, + { + "epoch": 1.4888683441193373, + "grad_norm": 2.125, + "learning_rate": 5.027398698580369e-06, + "loss": 0.9605, + "step": 7527 + }, + { + "epoch": 1.4890682391744334, + "grad_norm": 2.171875, + "learning_rate": 5.026344912428166e-06, + "loss": 0.9946, + "step": 7528 + }, + { + "epoch": 1.4892681342295295, + "grad_norm": 2.296875, + "learning_rate": 5.025291125105725e-06, + "loss": 0.9633, + "step": 7529 + }, + { + "epoch": 1.4894680292846256, + "grad_norm": 2.078125, + "learning_rate": 5.024237336659855e-06, + "loss": 0.9967, + "step": 7530 + }, + { + "epoch": 1.4896679243397217, + "grad_norm": 2.078125, + "learning_rate": 5.023183547137364e-06, + "loss": 1.0632, + "step": 7531 + }, + { + "epoch": 1.4898678193948176, + "grad_norm": 2.078125, + "learning_rate": 5.022129756585063e-06, + "loss": 0.9735, + "step": 7532 + }, + { + "epoch": 1.4900677144499137, + "grad_norm": 2.25, + "learning_rate": 5.021075965049762e-06, + "loss": 1.0718, + "step": 7533 + }, + { + "epoch": 1.4902676095050098, + "grad_norm": 2.21875, + "learning_rate": 5.0200221725782675e-06, + "loss": 1.0503, + "step": 7534 + }, + { + "epoch": 1.490467504560106, + "grad_norm": 1.9609375, + "learning_rate": 5.0189683792173905e-06, + "loss": 0.9196, + "step": 7535 + }, + { + "epoch": 1.490667399615202, + "grad_norm": 2.25, + "learning_rate": 5.01791458501394e-06, + "loss": 0.9358, + "step": 7536 + }, + { + "epoch": 1.490867294670298, + "grad_norm": 2.09375, + "learning_rate": 5.0168607900147244e-06, + "loss": 0.9498, + "step": 7537 + }, + { + "epoch": 1.4910671897253942, + "grad_norm": 2.125, + "learning_rate": 5.015806994266556e-06, + "loss": 0.9967, + "step": 7538 + }, + { + "epoch": 1.4912670847804903, + "grad_norm": 2.203125, + "learning_rate": 5.014753197816243e-06, + "loss": 0.9562, + "step": 7539 + }, + { + "epoch": 1.4914669798355864, + "grad_norm": 2.21875, + "learning_rate": 5.013699400710592e-06, + "loss": 1.0169, + "step": 7540 + }, + { + "epoch": 1.4916668748906825, + "grad_norm": 2.09375, + "learning_rate": 5.0126456029964175e-06, + "loss": 1.0031, + "step": 7541 + }, + { + "epoch": 1.4918667699457786, + "grad_norm": 2.1875, + "learning_rate": 5.011591804720526e-06, + "loss": 0.9976, + "step": 7542 + }, + { + "epoch": 1.4920666650008745, + "grad_norm": 2.234375, + "learning_rate": 5.010538005929728e-06, + "loss": 0.9954, + "step": 7543 + }, + { + "epoch": 1.4922665600559706, + "grad_norm": 2.40625, + "learning_rate": 5.009484206670831e-06, + "loss": 1.028, + "step": 7544 + }, + { + "epoch": 1.4924664551110667, + "grad_norm": 2.140625, + "learning_rate": 5.008430406990649e-06, + "loss": 0.9847, + "step": 7545 + }, + { + "epoch": 1.4926663501661628, + "grad_norm": 2.40625, + "learning_rate": 5.007376606935989e-06, + "loss": 1.0274, + "step": 7546 + }, + { + "epoch": 1.4928662452212589, + "grad_norm": 2.296875, + "learning_rate": 5.0063228065536595e-06, + "loss": 0.9803, + "step": 7547 + }, + { + "epoch": 1.493066140276355, + "grad_norm": 2.109375, + "learning_rate": 5.005269005890473e-06, + "loss": 0.9394, + "step": 7548 + }, + { + "epoch": 1.4932660353314509, + "grad_norm": 2.203125, + "learning_rate": 5.004215204993237e-06, + "loss": 0.9289, + "step": 7549 + }, + { + "epoch": 1.493465930386547, + "grad_norm": 2.109375, + "learning_rate": 5.003161403908762e-06, + "loss": 0.9714, + "step": 7550 + }, + { + "epoch": 1.493665825441643, + "grad_norm": 2.140625, + "learning_rate": 5.002107602683858e-06, + "loss": 0.8793, + "step": 7551 + }, + { + "epoch": 1.4938657204967392, + "grad_norm": 2.234375, + "learning_rate": 5.0010538013653345e-06, + "loss": 1.0672, + "step": 7552 + }, + { + "epoch": 1.4940656155518353, + "grad_norm": 2.28125, + "learning_rate": 5e-06, + "loss": 1.0445, + "step": 7553 + }, + { + "epoch": 1.4942655106069314, + "grad_norm": 2.265625, + "learning_rate": 4.998946198634667e-06, + "loss": 0.9237, + "step": 7554 + }, + { + "epoch": 1.4944654056620275, + "grad_norm": 2.171875, + "learning_rate": 4.997892397316144e-06, + "loss": 0.9672, + "step": 7555 + }, + { + "epoch": 1.4946653007171236, + "grad_norm": 2.21875, + "learning_rate": 4.99683859609124e-06, + "loss": 1.0201, + "step": 7556 + }, + { + "epoch": 1.4948651957722197, + "grad_norm": 2.140625, + "learning_rate": 4.995784795006764e-06, + "loss": 0.9943, + "step": 7557 + }, + { + "epoch": 1.4950650908273158, + "grad_norm": 2.109375, + "learning_rate": 4.994730994109529e-06, + "loss": 0.9561, + "step": 7558 + }, + { + "epoch": 1.4952649858824119, + "grad_norm": 2.15625, + "learning_rate": 4.993677193446341e-06, + "loss": 0.99, + "step": 7559 + }, + { + "epoch": 1.4954648809375077, + "grad_norm": 2.1875, + "learning_rate": 4.9926233930640136e-06, + "loss": 0.9834, + "step": 7560 + }, + { + "epoch": 1.4956647759926038, + "grad_norm": 2.125, + "learning_rate": 4.991569593009352e-06, + "loss": 0.9143, + "step": 7561 + }, + { + "epoch": 1.4958646710477, + "grad_norm": 2.203125, + "learning_rate": 4.990515793329169e-06, + "loss": 1.0046, + "step": 7562 + }, + { + "epoch": 1.496064566102796, + "grad_norm": 2.140625, + "learning_rate": 4.9894619940702735e-06, + "loss": 0.946, + "step": 7563 + }, + { + "epoch": 1.4962644611578921, + "grad_norm": 2.3125, + "learning_rate": 4.988408195279475e-06, + "loss": 1.0447, + "step": 7564 + }, + { + "epoch": 1.4964643562129882, + "grad_norm": 2.421875, + "learning_rate": 4.9873543970035825e-06, + "loss": 1.0406, + "step": 7565 + }, + { + "epoch": 1.4966642512680843, + "grad_norm": 2.21875, + "learning_rate": 4.9863005992894085e-06, + "loss": 1.0905, + "step": 7566 + }, + { + "epoch": 1.4968641463231802, + "grad_norm": 2.15625, + "learning_rate": 4.985246802183758e-06, + "loss": 1.0414, + "step": 7567 + }, + { + "epoch": 1.4970640413782763, + "grad_norm": 2.09375, + "learning_rate": 4.9841930057334446e-06, + "loss": 0.9065, + "step": 7568 + }, + { + "epoch": 1.4972639364333724, + "grad_norm": 2.078125, + "learning_rate": 4.983139209985276e-06, + "loss": 0.9596, + "step": 7569 + }, + { + "epoch": 1.4974638314884685, + "grad_norm": 2.296875, + "learning_rate": 4.982085414986062e-06, + "loss": 1.0192, + "step": 7570 + }, + { + "epoch": 1.4976637265435646, + "grad_norm": 2.1875, + "learning_rate": 4.981031620782612e-06, + "loss": 0.9386, + "step": 7571 + }, + { + "epoch": 1.4978636215986607, + "grad_norm": 2.234375, + "learning_rate": 4.979977827421735e-06, + "loss": 0.899, + "step": 7572 + }, + { + "epoch": 1.4980635166537568, + "grad_norm": 2.25, + "learning_rate": 4.97892403495024e-06, + "loss": 1.084, + "step": 7573 + }, + { + "epoch": 1.498263411708853, + "grad_norm": 2.109375, + "learning_rate": 4.977870243414938e-06, + "loss": 0.9711, + "step": 7574 + }, + { + "epoch": 1.498463306763949, + "grad_norm": 2.171875, + "learning_rate": 4.976816452862638e-06, + "loss": 0.978, + "step": 7575 + }, + { + "epoch": 1.498663201819045, + "grad_norm": 2.21875, + "learning_rate": 4.975762663340148e-06, + "loss": 1.0126, + "step": 7576 + }, + { + "epoch": 1.4988630968741412, + "grad_norm": 2.203125, + "learning_rate": 4.974708874894276e-06, + "loss": 1.0988, + "step": 7577 + }, + { + "epoch": 1.499062991929237, + "grad_norm": 2.25, + "learning_rate": 4.973655087571836e-06, + "loss": 0.9493, + "step": 7578 + }, + { + "epoch": 1.4992628869843332, + "grad_norm": 2.1875, + "learning_rate": 4.972601301419632e-06, + "loss": 0.9735, + "step": 7579 + }, + { + "epoch": 1.4994627820394293, + "grad_norm": 2.265625, + "learning_rate": 4.971547516484477e-06, + "loss": 0.9846, + "step": 7580 + }, + { + "epoch": 1.4996626770945254, + "grad_norm": 2.125, + "learning_rate": 4.970493732813178e-06, + "loss": 0.9615, + "step": 7581 + }, + { + "epoch": 1.4998625721496215, + "grad_norm": 2.265625, + "learning_rate": 4.969439950452543e-06, + "loss": 1.1397, + "step": 7582 + }, + { + "epoch": 1.5000624672047174, + "grad_norm": 2.234375, + "learning_rate": 4.968386169449384e-06, + "loss": 0.8919, + "step": 7583 + }, + { + "epoch": 1.5002623622598135, + "grad_norm": 2.109375, + "learning_rate": 4.967332389850506e-06, + "loss": 0.9603, + "step": 7584 + }, + { + "epoch": 1.5004622573149096, + "grad_norm": 2.03125, + "learning_rate": 4.96627861170272e-06, + "loss": 0.8555, + "step": 7585 + }, + { + "epoch": 1.5006621523700057, + "grad_norm": 2.328125, + "learning_rate": 4.965224835052836e-06, + "loss": 0.9833, + "step": 7586 + }, + { + "epoch": 1.5008620474251018, + "grad_norm": 2.234375, + "learning_rate": 4.96417105994766e-06, + "loss": 0.9636, + "step": 7587 + }, + { + "epoch": 1.5010619424801979, + "grad_norm": 2.125, + "learning_rate": 4.963117286434003e-06, + "loss": 0.9915, + "step": 7588 + }, + { + "epoch": 1.501261837535294, + "grad_norm": 2.28125, + "learning_rate": 4.962063514558671e-06, + "loss": 0.9548, + "step": 7589 + }, + { + "epoch": 1.50146173259039, + "grad_norm": 1.984375, + "learning_rate": 4.961009744368472e-06, + "loss": 0.8947, + "step": 7590 + }, + { + "epoch": 1.5016616276454862, + "grad_norm": 2.09375, + "learning_rate": 4.959955975910219e-06, + "loss": 0.9332, + "step": 7591 + }, + { + "epoch": 1.5018615227005823, + "grad_norm": 2.1875, + "learning_rate": 4.958902209230716e-06, + "loss": 1.0048, + "step": 7592 + }, + { + "epoch": 1.5020614177556784, + "grad_norm": 2.25, + "learning_rate": 4.9578484443767714e-06, + "loss": 0.9865, + "step": 7593 + }, + { + "epoch": 1.5022613128107745, + "grad_norm": 2.09375, + "learning_rate": 4.956794681395197e-06, + "loss": 0.954, + "step": 7594 + }, + { + "epoch": 1.5024612078658706, + "grad_norm": 2.203125, + "learning_rate": 4.955740920332796e-06, + "loss": 0.9588, + "step": 7595 + }, + { + "epoch": 1.5026611029209664, + "grad_norm": 2.03125, + "learning_rate": 4.954687161236379e-06, + "loss": 0.9834, + "step": 7596 + }, + { + "epoch": 1.5028609979760625, + "grad_norm": 2.09375, + "learning_rate": 4.953633404152754e-06, + "loss": 0.9794, + "step": 7597 + }, + { + "epoch": 1.5030608930311586, + "grad_norm": 2.09375, + "learning_rate": 4.952579649128728e-06, + "loss": 0.9282, + "step": 7598 + }, + { + "epoch": 1.5032607880862547, + "grad_norm": 2.234375, + "learning_rate": 4.951525896211109e-06, + "loss": 0.9748, + "step": 7599 + }, + { + "epoch": 1.5034606831413508, + "grad_norm": 2.109375, + "learning_rate": 4.950472145446706e-06, + "loss": 0.992, + "step": 7600 + }, + { + "epoch": 1.5036605781964467, + "grad_norm": 2.296875, + "learning_rate": 4.9494183968823235e-06, + "loss": 0.9436, + "step": 7601 + }, + { + "epoch": 1.5038604732515428, + "grad_norm": 2.296875, + "learning_rate": 4.9483646505647705e-06, + "loss": 0.9967, + "step": 7602 + }, + { + "epoch": 1.504060368306639, + "grad_norm": 2.203125, + "learning_rate": 4.947310906540854e-06, + "loss": 0.9897, + "step": 7603 + }, + { + "epoch": 1.504260263361735, + "grad_norm": 2.1875, + "learning_rate": 4.946257164857382e-06, + "loss": 1.0052, + "step": 7604 + }, + { + "epoch": 1.5044601584168311, + "grad_norm": 2.09375, + "learning_rate": 4.94520342556116e-06, + "loss": 0.9114, + "step": 7605 + }, + { + "epoch": 1.5046600534719272, + "grad_norm": 2.1875, + "learning_rate": 4.944149688698997e-06, + "loss": 0.9928, + "step": 7606 + }, + { + "epoch": 1.5048599485270233, + "grad_norm": 2.25, + "learning_rate": 4.943095954317698e-06, + "loss": 0.91, + "step": 7607 + }, + { + "epoch": 1.5050598435821194, + "grad_norm": 2.1875, + "learning_rate": 4.94204222246407e-06, + "loss": 0.9799, + "step": 7608 + }, + { + "epoch": 1.5052597386372155, + "grad_norm": 2.125, + "learning_rate": 4.940988493184922e-06, + "loss": 0.8856, + "step": 7609 + }, + { + "epoch": 1.5054596336923116, + "grad_norm": 2.296875, + "learning_rate": 4.939934766527058e-06, + "loss": 1.062, + "step": 7610 + }, + { + "epoch": 1.5056595287474077, + "grad_norm": 2.390625, + "learning_rate": 4.938881042537286e-06, + "loss": 0.9589, + "step": 7611 + }, + { + "epoch": 1.5058594238025038, + "grad_norm": 2.125, + "learning_rate": 4.93782732126241e-06, + "loss": 0.982, + "step": 7612 + }, + { + "epoch": 1.5060593188576, + "grad_norm": 2.15625, + "learning_rate": 4.936773602749238e-06, + "loss": 1.015, + "step": 7613 + }, + { + "epoch": 1.5062592139126958, + "grad_norm": 2.140625, + "learning_rate": 4.9357198870445775e-06, + "loss": 0.98, + "step": 7614 + }, + { + "epoch": 1.506459108967792, + "grad_norm": 2.125, + "learning_rate": 4.934666174195231e-06, + "loss": 0.9862, + "step": 7615 + }, + { + "epoch": 1.506659004022888, + "grad_norm": 2.15625, + "learning_rate": 4.933612464248006e-06, + "loss": 1.0111, + "step": 7616 + }, + { + "epoch": 1.506858899077984, + "grad_norm": 2.15625, + "learning_rate": 4.932558757249711e-06, + "loss": 0.9228, + "step": 7617 + }, + { + "epoch": 1.50705879413308, + "grad_norm": 2.421875, + "learning_rate": 4.931505053247146e-06, + "loss": 1.0451, + "step": 7618 + }, + { + "epoch": 1.507258689188176, + "grad_norm": 2.3125, + "learning_rate": 4.930451352287121e-06, + "loss": 1.0573, + "step": 7619 + }, + { + "epoch": 1.5074585842432722, + "grad_norm": 2.140625, + "learning_rate": 4.929397654416438e-06, + "loss": 0.9944, + "step": 7620 + }, + { + "epoch": 1.5076584792983683, + "grad_norm": 2.21875, + "learning_rate": 4.928343959681904e-06, + "loss": 0.9962, + "step": 7621 + }, + { + "epoch": 1.5078583743534644, + "grad_norm": 2.046875, + "learning_rate": 4.927290268130325e-06, + "loss": 0.8596, + "step": 7622 + }, + { + "epoch": 1.5080582694085605, + "grad_norm": 2.1875, + "learning_rate": 4.9262365798085024e-06, + "loss": 1.0626, + "step": 7623 + }, + { + "epoch": 1.5082581644636566, + "grad_norm": 2.0625, + "learning_rate": 4.925182894763243e-06, + "loss": 0.9334, + "step": 7624 + }, + { + "epoch": 1.5084580595187527, + "grad_norm": 2.109375, + "learning_rate": 4.924129213041353e-06, + "loss": 1.0244, + "step": 7625 + }, + { + "epoch": 1.5086579545738488, + "grad_norm": 2.171875, + "learning_rate": 4.923075534689635e-06, + "loss": 1.0285, + "step": 7626 + }, + { + "epoch": 1.5088578496289449, + "grad_norm": 2.0, + "learning_rate": 4.922021859754894e-06, + "loss": 0.9209, + "step": 7627 + }, + { + "epoch": 1.509057744684041, + "grad_norm": 2.15625, + "learning_rate": 4.920968188283932e-06, + "loss": 1.0255, + "step": 7628 + }, + { + "epoch": 1.509257639739137, + "grad_norm": 2.125, + "learning_rate": 4.919914520323555e-06, + "loss": 0.8351, + "step": 7629 + }, + { + "epoch": 1.5094575347942332, + "grad_norm": 2.140625, + "learning_rate": 4.918860855920568e-06, + "loss": 0.9118, + "step": 7630 + }, + { + "epoch": 1.5096574298493293, + "grad_norm": 2.125, + "learning_rate": 4.917807195121773e-06, + "loss": 0.983, + "step": 7631 + }, + { + "epoch": 1.5098573249044251, + "grad_norm": 2.1875, + "learning_rate": 4.916753537973973e-06, + "loss": 0.991, + "step": 7632 + }, + { + "epoch": 1.5100572199595212, + "grad_norm": 2.140625, + "learning_rate": 4.915699884523971e-06, + "loss": 1.0175, + "step": 7633 + }, + { + "epoch": 1.5102571150146173, + "grad_norm": 2.265625, + "learning_rate": 4.914646234818573e-06, + "loss": 1.0203, + "step": 7634 + }, + { + "epoch": 1.5104570100697134, + "grad_norm": 2.234375, + "learning_rate": 4.913592588904578e-06, + "loss": 0.8762, + "step": 7635 + }, + { + "epoch": 1.5106569051248093, + "grad_norm": 2.109375, + "learning_rate": 4.912538946828792e-06, + "loss": 0.9023, + "step": 7636 + }, + { + "epoch": 1.5108568001799054, + "grad_norm": 2.171875, + "learning_rate": 4.9114853086380165e-06, + "loss": 0.9672, + "step": 7637 + }, + { + "epoch": 1.5110566952350015, + "grad_norm": 2.078125, + "learning_rate": 4.910431674379054e-06, + "loss": 0.9385, + "step": 7638 + }, + { + "epoch": 1.5112565902900976, + "grad_norm": 2.140625, + "learning_rate": 4.909378044098708e-06, + "loss": 0.8887, + "step": 7639 + }, + { + "epoch": 1.5114564853451937, + "grad_norm": 2.234375, + "learning_rate": 4.908324417843779e-06, + "loss": 0.9445, + "step": 7640 + }, + { + "epoch": 1.5116563804002898, + "grad_norm": 2.078125, + "learning_rate": 4.907270795661068e-06, + "loss": 1.0104, + "step": 7641 + }, + { + "epoch": 1.511856275455386, + "grad_norm": 2.203125, + "learning_rate": 4.906217177597381e-06, + "loss": 1.0569, + "step": 7642 + }, + { + "epoch": 1.512056170510482, + "grad_norm": 2.1875, + "learning_rate": 4.905163563699515e-06, + "loss": 1.0554, + "step": 7643 + }, + { + "epoch": 1.5122560655655781, + "grad_norm": 2.28125, + "learning_rate": 4.904109954014273e-06, + "loss": 1.0148, + "step": 7644 + }, + { + "epoch": 1.5124559606206742, + "grad_norm": 2.046875, + "learning_rate": 4.9030563485884585e-06, + "loss": 0.8805, + "step": 7645 + }, + { + "epoch": 1.5126558556757703, + "grad_norm": 2.21875, + "learning_rate": 4.902002747468869e-06, + "loss": 1.0549, + "step": 7646 + }, + { + "epoch": 1.5128557507308664, + "grad_norm": 2.234375, + "learning_rate": 4.900949150702308e-06, + "loss": 1.0335, + "step": 7647 + }, + { + "epoch": 1.5130556457859625, + "grad_norm": 2.15625, + "learning_rate": 4.8998955583355755e-06, + "loss": 0.935, + "step": 7648 + }, + { + "epoch": 1.5132555408410584, + "grad_norm": 2.15625, + "learning_rate": 4.89884197041547e-06, + "loss": 1.0581, + "step": 7649 + }, + { + "epoch": 1.5134554358961545, + "grad_norm": 1.8984375, + "learning_rate": 4.897788386988796e-06, + "loss": 0.8837, + "step": 7650 + }, + { + "epoch": 1.5136553309512506, + "grad_norm": 2.390625, + "learning_rate": 4.896734808102348e-06, + "loss": 0.9951, + "step": 7651 + }, + { + "epoch": 1.5138552260063467, + "grad_norm": 2.265625, + "learning_rate": 4.89568123380293e-06, + "loss": 0.9621, + "step": 7652 + }, + { + "epoch": 1.5140551210614426, + "grad_norm": 2.109375, + "learning_rate": 4.8946276641373406e-06, + "loss": 0.9108, + "step": 7653 + }, + { + "epoch": 1.5142550161165387, + "grad_norm": 2.171875, + "learning_rate": 4.893574099152379e-06, + "loss": 1.0131, + "step": 7654 + }, + { + "epoch": 1.5144549111716348, + "grad_norm": 2.15625, + "learning_rate": 4.892520538894844e-06, + "loss": 0.9307, + "step": 7655 + }, + { + "epoch": 1.5146548062267309, + "grad_norm": 1.96875, + "learning_rate": 4.891466983411537e-06, + "loss": 0.8952, + "step": 7656 + }, + { + "epoch": 1.514854701281827, + "grad_norm": 2.5, + "learning_rate": 4.890413432749253e-06, + "loss": 1.0152, + "step": 7657 + }, + { + "epoch": 1.515054596336923, + "grad_norm": 2.09375, + "learning_rate": 4.889359886954794e-06, + "loss": 0.9754, + "step": 7658 + }, + { + "epoch": 1.5152544913920192, + "grad_norm": 2.0625, + "learning_rate": 4.888306346074956e-06, + "loss": 0.8848, + "step": 7659 + }, + { + "epoch": 1.5154543864471153, + "grad_norm": 2.28125, + "learning_rate": 4.887252810156538e-06, + "loss": 1.0255, + "step": 7660 + }, + { + "epoch": 1.5156542815022114, + "grad_norm": 2.109375, + "learning_rate": 4.8861992792463396e-06, + "loss": 0.927, + "step": 7661 + }, + { + "epoch": 1.5158541765573075, + "grad_norm": 2.125, + "learning_rate": 4.885145753391158e-06, + "loss": 1.0345, + "step": 7662 + }, + { + "epoch": 1.5160540716124036, + "grad_norm": 2.0625, + "learning_rate": 4.884092232637787e-06, + "loss": 0.9341, + "step": 7663 + }, + { + "epoch": 1.5162539666674997, + "grad_norm": 2.234375, + "learning_rate": 4.883038717033028e-06, + "loss": 1.0379, + "step": 7664 + }, + { + "epoch": 1.5164538617225958, + "grad_norm": 2.078125, + "learning_rate": 4.881985206623676e-06, + "loss": 0.8957, + "step": 7665 + }, + { + "epoch": 1.5166537567776919, + "grad_norm": 2.09375, + "learning_rate": 4.8809317014565286e-06, + "loss": 1.0101, + "step": 7666 + }, + { + "epoch": 1.5168536518327878, + "grad_norm": 2.046875, + "learning_rate": 4.879878201578382e-06, + "loss": 0.9537, + "step": 7667 + }, + { + "epoch": 1.5170535468878839, + "grad_norm": 2.46875, + "learning_rate": 4.878824707036034e-06, + "loss": 0.9919, + "step": 7668 + }, + { + "epoch": 1.51725344194298, + "grad_norm": 2.171875, + "learning_rate": 4.877771217876279e-06, + "loss": 0.9453, + "step": 7669 + }, + { + "epoch": 1.517453336998076, + "grad_norm": 2.234375, + "learning_rate": 4.876717734145914e-06, + "loss": 1.0619, + "step": 7670 + }, + { + "epoch": 1.517653232053172, + "grad_norm": 2.203125, + "learning_rate": 4.875664255891733e-06, + "loss": 0.915, + "step": 7671 + }, + { + "epoch": 1.517853127108268, + "grad_norm": 2.21875, + "learning_rate": 4.874610783160532e-06, + "loss": 0.9857, + "step": 7672 + }, + { + "epoch": 1.5180530221633641, + "grad_norm": 2.21875, + "learning_rate": 4.873557315999108e-06, + "loss": 1.0185, + "step": 7673 + }, + { + "epoch": 1.5182529172184602, + "grad_norm": 2.09375, + "learning_rate": 4.872503854454254e-06, + "loss": 0.9332, + "step": 7674 + }, + { + "epoch": 1.5184528122735563, + "grad_norm": 2.203125, + "learning_rate": 4.871450398572764e-06, + "loss": 0.9758, + "step": 7675 + }, + { + "epoch": 1.5186527073286524, + "grad_norm": 2.25, + "learning_rate": 4.870396948401436e-06, + "loss": 0.994, + "step": 7676 + }, + { + "epoch": 1.5188526023837485, + "grad_norm": 2.265625, + "learning_rate": 4.86934350398706e-06, + "loss": 1.0261, + "step": 7677 + }, + { + "epoch": 1.5190524974388446, + "grad_norm": 2.09375, + "learning_rate": 4.868290065376433e-06, + "loss": 0.909, + "step": 7678 + }, + { + "epoch": 1.5192523924939407, + "grad_norm": 2.1875, + "learning_rate": 4.8672366326163466e-06, + "loss": 0.9689, + "step": 7679 + }, + { + "epoch": 1.5194522875490368, + "grad_norm": 2.046875, + "learning_rate": 4.866183205753593e-06, + "loss": 0.9509, + "step": 7680 + }, + { + "epoch": 1.519652182604133, + "grad_norm": 2.0, + "learning_rate": 4.86512978483497e-06, + "loss": 0.8929, + "step": 7681 + }, + { + "epoch": 1.519852077659229, + "grad_norm": 2.1875, + "learning_rate": 4.864076369907265e-06, + "loss": 0.9983, + "step": 7682 + }, + { + "epoch": 1.5200519727143251, + "grad_norm": 2.1875, + "learning_rate": 4.863022961017275e-06, + "loss": 1.0158, + "step": 7683 + }, + { + "epoch": 1.520251867769421, + "grad_norm": 2.234375, + "learning_rate": 4.86196955821179e-06, + "loss": 1.0297, + "step": 7684 + }, + { + "epoch": 1.520451762824517, + "grad_norm": 2.34375, + "learning_rate": 4.860916161537602e-06, + "loss": 1.0098, + "step": 7685 + }, + { + "epoch": 1.5206516578796132, + "grad_norm": 2.140625, + "learning_rate": 4.859862771041503e-06, + "loss": 0.9938, + "step": 7686 + }, + { + "epoch": 1.5208515529347093, + "grad_norm": 2.0625, + "learning_rate": 4.858809386770286e-06, + "loss": 0.9878, + "step": 7687 + }, + { + "epoch": 1.5210514479898054, + "grad_norm": 2.078125, + "learning_rate": 4.85775600877074e-06, + "loss": 0.9156, + "step": 7688 + }, + { + "epoch": 1.5212513430449013, + "grad_norm": 2.125, + "learning_rate": 4.856702637089657e-06, + "loss": 0.9137, + "step": 7689 + }, + { + "epoch": 1.5214512380999974, + "grad_norm": 2.421875, + "learning_rate": 4.8556492717738275e-06, + "loss": 1.0033, + "step": 7690 + }, + { + "epoch": 1.5216511331550935, + "grad_norm": 2.1875, + "learning_rate": 4.854595912870041e-06, + "loss": 0.9716, + "step": 7691 + }, + { + "epoch": 1.5218510282101896, + "grad_norm": 2.09375, + "learning_rate": 4.8535425604250915e-06, + "loss": 0.9526, + "step": 7692 + }, + { + "epoch": 1.5220509232652857, + "grad_norm": 2.125, + "learning_rate": 4.8524892144857655e-06, + "loss": 0.9911, + "step": 7693 + }, + { + "epoch": 1.5222508183203818, + "grad_norm": 2.25, + "learning_rate": 4.851435875098851e-06, + "loss": 0.9205, + "step": 7694 + }, + { + "epoch": 1.5224507133754779, + "grad_norm": 2.046875, + "learning_rate": 4.850382542311139e-06, + "loss": 0.918, + "step": 7695 + }, + { + "epoch": 1.522650608430574, + "grad_norm": 2.15625, + "learning_rate": 4.849329216169419e-06, + "loss": 1.0557, + "step": 7696 + }, + { + "epoch": 1.52285050348567, + "grad_norm": 2.21875, + "learning_rate": 4.848275896720478e-06, + "loss": 1.0096, + "step": 7697 + }, + { + "epoch": 1.5230503985407662, + "grad_norm": 2.234375, + "learning_rate": 4.847222584011107e-06, + "loss": 1.0347, + "step": 7698 + }, + { + "epoch": 1.5232502935958623, + "grad_norm": 2.1875, + "learning_rate": 4.846169278088092e-06, + "loss": 0.9796, + "step": 7699 + }, + { + "epoch": 1.5234501886509584, + "grad_norm": 2.265625, + "learning_rate": 4.845115978998221e-06, + "loss": 0.9395, + "step": 7700 + }, + { + "epoch": 1.5236500837060545, + "grad_norm": 2.21875, + "learning_rate": 4.844062686788282e-06, + "loss": 1.0408, + "step": 7701 + }, + { + "epoch": 1.5238499787611504, + "grad_norm": 2.078125, + "learning_rate": 4.843009401505062e-06, + "loss": 1.001, + "step": 7702 + }, + { + "epoch": 1.5240498738162465, + "grad_norm": 2.140625, + "learning_rate": 4.841956123195347e-06, + "loss": 0.9442, + "step": 7703 + }, + { + "epoch": 1.5242497688713426, + "grad_norm": 2.21875, + "learning_rate": 4.8409028519059246e-06, + "loss": 0.9601, + "step": 7704 + }, + { + "epoch": 1.5244496639264387, + "grad_norm": 2.1875, + "learning_rate": 4.8398495876835795e-06, + "loss": 0.933, + "step": 7705 + }, + { + "epoch": 1.5246495589815345, + "grad_norm": 2.15625, + "learning_rate": 4.838796330575099e-06, + "loss": 1.0609, + "step": 7706 + }, + { + "epoch": 1.5248494540366306, + "grad_norm": 2.1875, + "learning_rate": 4.837743080627269e-06, + "loss": 0.8928, + "step": 7707 + }, + { + "epoch": 1.5250493490917267, + "grad_norm": 2.15625, + "learning_rate": 4.836689837886874e-06, + "loss": 0.917, + "step": 7708 + }, + { + "epoch": 1.5252492441468228, + "grad_norm": 2.171875, + "learning_rate": 4.835636602400699e-06, + "loss": 0.9897, + "step": 7709 + }, + { + "epoch": 1.525449139201919, + "grad_norm": 2.078125, + "learning_rate": 4.834583374215528e-06, + "loss": 0.9326, + "step": 7710 + }, + { + "epoch": 1.525649034257015, + "grad_norm": 2.125, + "learning_rate": 4.833530153378145e-06, + "loss": 0.9718, + "step": 7711 + }, + { + "epoch": 1.5258489293121111, + "grad_norm": 2.1875, + "learning_rate": 4.832476939935337e-06, + "loss": 1.0237, + "step": 7712 + }, + { + "epoch": 1.5260488243672072, + "grad_norm": 2.171875, + "learning_rate": 4.831423733933884e-06, + "loss": 0.9722, + "step": 7713 + }, + { + "epoch": 1.5262487194223033, + "grad_norm": 2.09375, + "learning_rate": 4.83037053542057e-06, + "loss": 0.9436, + "step": 7714 + }, + { + "epoch": 1.5264486144773994, + "grad_norm": 2.078125, + "learning_rate": 4.829317344442181e-06, + "loss": 0.9367, + "step": 7715 + }, + { + "epoch": 1.5266485095324955, + "grad_norm": 2.15625, + "learning_rate": 4.828264161045496e-06, + "loss": 0.9521, + "step": 7716 + }, + { + "epoch": 1.5268484045875916, + "grad_norm": 2.15625, + "learning_rate": 4.827210985277299e-06, + "loss": 1.0105, + "step": 7717 + }, + { + "epoch": 1.5270482996426877, + "grad_norm": 2.140625, + "learning_rate": 4.8261578171843705e-06, + "loss": 0.9382, + "step": 7718 + }, + { + "epoch": 1.5272481946977836, + "grad_norm": 2.203125, + "learning_rate": 4.825104656813494e-06, + "loss": 0.9781, + "step": 7719 + }, + { + "epoch": 1.5274480897528797, + "grad_norm": 2.140625, + "learning_rate": 4.82405150421145e-06, + "loss": 1.0679, + "step": 7720 + }, + { + "epoch": 1.5276479848079758, + "grad_norm": 2.203125, + "learning_rate": 4.82299835942502e-06, + "loss": 0.9701, + "step": 7721 + }, + { + "epoch": 1.527847879863072, + "grad_norm": 2.125, + "learning_rate": 4.821945222500983e-06, + "loss": 0.9713, + "step": 7722 + }, + { + "epoch": 1.528047774918168, + "grad_norm": 2.0625, + "learning_rate": 4.8208920934861206e-06, + "loss": 0.9887, + "step": 7723 + }, + { + "epoch": 1.528247669973264, + "grad_norm": 2.09375, + "learning_rate": 4.819838972427214e-06, + "loss": 0.9757, + "step": 7724 + }, + { + "epoch": 1.52844756502836, + "grad_norm": 2.15625, + "learning_rate": 4.818785859371039e-06, + "loss": 0.9716, + "step": 7725 + }, + { + "epoch": 1.528647460083456, + "grad_norm": 2.03125, + "learning_rate": 4.817732754364376e-06, + "loss": 0.8541, + "step": 7726 + }, + { + "epoch": 1.5288473551385522, + "grad_norm": 2.171875, + "learning_rate": 4.8166796574540065e-06, + "loss": 0.9456, + "step": 7727 + }, + { + "epoch": 1.5290472501936483, + "grad_norm": 2.171875, + "learning_rate": 4.815626568686706e-06, + "loss": 0.9526, + "step": 7728 + }, + { + "epoch": 1.5292471452487444, + "grad_norm": 2.078125, + "learning_rate": 4.8145734881092545e-06, + "loss": 1.0064, + "step": 7729 + }, + { + "epoch": 1.5294470403038405, + "grad_norm": 2.03125, + "learning_rate": 4.813520415768427e-06, + "loss": 0.9533, + "step": 7730 + }, + { + "epoch": 1.5296469353589366, + "grad_norm": 2.1875, + "learning_rate": 4.812467351711004e-06, + "loss": 1.0011, + "step": 7731 + }, + { + "epoch": 1.5298468304140327, + "grad_norm": 2.328125, + "learning_rate": 4.811414295983761e-06, + "loss": 0.9818, + "step": 7732 + }, + { + "epoch": 1.5300467254691288, + "grad_norm": 2.1875, + "learning_rate": 4.810361248633475e-06, + "loss": 1.009, + "step": 7733 + }, + { + "epoch": 1.5302466205242249, + "grad_norm": 2.15625, + "learning_rate": 4.809308209706921e-06, + "loss": 0.9351, + "step": 7734 + }, + { + "epoch": 1.530446515579321, + "grad_norm": 2.28125, + "learning_rate": 4.808255179250878e-06, + "loss": 0.9453, + "step": 7735 + }, + { + "epoch": 1.530646410634417, + "grad_norm": 2.109375, + "learning_rate": 4.807202157312118e-06, + "loss": 0.8974, + "step": 7736 + }, + { + "epoch": 1.530846305689513, + "grad_norm": 2.09375, + "learning_rate": 4.806149143937417e-06, + "loss": 0.9417, + "step": 7737 + }, + { + "epoch": 1.531046200744609, + "grad_norm": 2.09375, + "learning_rate": 4.8050961391735516e-06, + "loss": 0.8945, + "step": 7738 + }, + { + "epoch": 1.5312460957997052, + "grad_norm": 2.203125, + "learning_rate": 4.804043143067295e-06, + "loss": 1.0163, + "step": 7739 + }, + { + "epoch": 1.5314459908548013, + "grad_norm": 2.375, + "learning_rate": 4.802990155665421e-06, + "loss": 0.964, + "step": 7740 + }, + { + "epoch": 1.5316458859098971, + "grad_norm": 2.078125, + "learning_rate": 4.801937177014702e-06, + "loss": 0.9406, + "step": 7741 + }, + { + "epoch": 1.5318457809649932, + "grad_norm": 2.109375, + "learning_rate": 4.800884207161913e-06, + "loss": 0.9723, + "step": 7742 + }, + { + "epoch": 1.5320456760200893, + "grad_norm": 2.265625, + "learning_rate": 4.799831246153827e-06, + "loss": 0.9697, + "step": 7743 + }, + { + "epoch": 1.5322455710751854, + "grad_norm": 2.03125, + "learning_rate": 4.798778294037214e-06, + "loss": 0.919, + "step": 7744 + }, + { + "epoch": 1.5324454661302815, + "grad_norm": 2.109375, + "learning_rate": 4.7977253508588496e-06, + "loss": 0.9584, + "step": 7745 + }, + { + "epoch": 1.5326453611853776, + "grad_norm": 2.1875, + "learning_rate": 4.796672416665503e-06, + "loss": 1.0064, + "step": 7746 + }, + { + "epoch": 1.5328452562404737, + "grad_norm": 2.0625, + "learning_rate": 4.795619491503946e-06, + "loss": 0.9836, + "step": 7747 + }, + { + "epoch": 1.5330451512955698, + "grad_norm": 2.25, + "learning_rate": 4.794566575420949e-06, + "loss": 1.0584, + "step": 7748 + }, + { + "epoch": 1.533245046350666, + "grad_norm": 2.171875, + "learning_rate": 4.793513668463283e-06, + "loss": 0.9774, + "step": 7749 + }, + { + "epoch": 1.533444941405762, + "grad_norm": 3.0, + "learning_rate": 4.792460770677716e-06, + "loss": 0.956, + "step": 7750 + }, + { + "epoch": 1.5336448364608581, + "grad_norm": 2.171875, + "learning_rate": 4.791407882111022e-06, + "loss": 1.0692, + "step": 7751 + }, + { + "epoch": 1.5338447315159542, + "grad_norm": 2.078125, + "learning_rate": 4.790355002809967e-06, + "loss": 0.9051, + "step": 7752 + }, + { + "epoch": 1.5340446265710503, + "grad_norm": 2.109375, + "learning_rate": 4.789302132821319e-06, + "loss": 1.0195, + "step": 7753 + }, + { + "epoch": 1.5342445216261462, + "grad_norm": 2.171875, + "learning_rate": 4.78824927219185e-06, + "loss": 0.966, + "step": 7754 + }, + { + "epoch": 1.5344444166812423, + "grad_norm": 2.140625, + "learning_rate": 4.787196420968326e-06, + "loss": 1.0108, + "step": 7755 + }, + { + "epoch": 1.5346443117363384, + "grad_norm": 2.109375, + "learning_rate": 4.7861435791975124e-06, + "loss": 0.9161, + "step": 7756 + }, + { + "epoch": 1.5348442067914345, + "grad_norm": 2.171875, + "learning_rate": 4.785090746926178e-06, + "loss": 0.9542, + "step": 7757 + }, + { + "epoch": 1.5350441018465306, + "grad_norm": 2.28125, + "learning_rate": 4.784037924201091e-06, + "loss": 1.0465, + "step": 7758 + }, + { + "epoch": 1.5352439969016265, + "grad_norm": 2.1875, + "learning_rate": 4.782985111069015e-06, + "loss": 1.057, + "step": 7759 + }, + { + "epoch": 1.5354438919567226, + "grad_norm": 2.109375, + "learning_rate": 4.781932307576718e-06, + "loss": 0.9282, + "step": 7760 + }, + { + "epoch": 1.5356437870118187, + "grad_norm": 2.40625, + "learning_rate": 4.780879513770964e-06, + "loss": 1.0559, + "step": 7761 + }, + { + "epoch": 1.5358436820669148, + "grad_norm": 2.078125, + "learning_rate": 4.779826729698519e-06, + "loss": 1.0127, + "step": 7762 + }, + { + "epoch": 1.536043577122011, + "grad_norm": 2.140625, + "learning_rate": 4.778773955406148e-06, + "loss": 1.0193, + "step": 7763 + }, + { + "epoch": 1.536243472177107, + "grad_norm": 2.125, + "learning_rate": 4.777721190940612e-06, + "loss": 0.9641, + "step": 7764 + }, + { + "epoch": 1.536443367232203, + "grad_norm": 2.296875, + "learning_rate": 4.776668436348678e-06, + "loss": 0.9669, + "step": 7765 + }, + { + "epoch": 1.5366432622872992, + "grad_norm": 2.15625, + "learning_rate": 4.77561569167711e-06, + "loss": 0.9652, + "step": 7766 + }, + { + "epoch": 1.5368431573423953, + "grad_norm": 2.15625, + "learning_rate": 4.7745629569726664e-06, + "loss": 0.9537, + "step": 7767 + }, + { + "epoch": 1.5370430523974914, + "grad_norm": 2.265625, + "learning_rate": 4.7735102322821145e-06, + "loss": 1.0082, + "step": 7768 + }, + { + "epoch": 1.5372429474525875, + "grad_norm": 2.03125, + "learning_rate": 4.772457517652212e-06, + "loss": 0.8862, + "step": 7769 + }, + { + "epoch": 1.5374428425076836, + "grad_norm": 2.1875, + "learning_rate": 4.771404813129722e-06, + "loss": 0.9674, + "step": 7770 + }, + { + "epoch": 1.5376427375627797, + "grad_norm": 2.25, + "learning_rate": 4.7703521187614075e-06, + "loss": 0.9746, + "step": 7771 + }, + { + "epoch": 1.5378426326178756, + "grad_norm": 2.40625, + "learning_rate": 4.769299434594026e-06, + "loss": 0.964, + "step": 7772 + }, + { + "epoch": 1.5380425276729717, + "grad_norm": 2.046875, + "learning_rate": 4.76824676067434e-06, + "loss": 0.9537, + "step": 7773 + }, + { + "epoch": 1.5382424227280678, + "grad_norm": 2.171875, + "learning_rate": 4.767194097049108e-06, + "loss": 0.9013, + "step": 7774 + }, + { + "epoch": 1.5384423177831639, + "grad_norm": 2.171875, + "learning_rate": 4.766141443765088e-06, + "loss": 0.9636, + "step": 7775 + }, + { + "epoch": 1.5386422128382597, + "grad_norm": 2.15625, + "learning_rate": 4.765088800869041e-06, + "loss": 0.9825, + "step": 7776 + }, + { + "epoch": 1.5388421078933558, + "grad_norm": 2.265625, + "learning_rate": 4.764036168407726e-06, + "loss": 0.9128, + "step": 7777 + }, + { + "epoch": 1.539042002948452, + "grad_norm": 2.140625, + "learning_rate": 4.762983546427897e-06, + "loss": 0.9874, + "step": 7778 + }, + { + "epoch": 1.539241898003548, + "grad_norm": 2.0625, + "learning_rate": 4.761930934976315e-06, + "loss": 0.9349, + "step": 7779 + }, + { + "epoch": 1.5394417930586441, + "grad_norm": 2.140625, + "learning_rate": 4.760878334099734e-06, + "loss": 0.9253, + "step": 7780 + }, + { + "epoch": 1.5396416881137402, + "grad_norm": 2.078125, + "learning_rate": 4.759825743844912e-06, + "loss": 0.9425, + "step": 7781 + }, + { + "epoch": 1.5398415831688363, + "grad_norm": 2.09375, + "learning_rate": 4.758773164258606e-06, + "loss": 0.98, + "step": 7782 + }, + { + "epoch": 1.5400414782239324, + "grad_norm": 2.0625, + "learning_rate": 4.75772059538757e-06, + "loss": 1.0028, + "step": 7783 + }, + { + "epoch": 1.5402413732790285, + "grad_norm": 2.171875, + "learning_rate": 4.7566680372785575e-06, + "loss": 0.9386, + "step": 7784 + }, + { + "epoch": 1.5404412683341246, + "grad_norm": 2.390625, + "learning_rate": 4.755615489978328e-06, + "loss": 1.011, + "step": 7785 + }, + { + "epoch": 1.5406411633892207, + "grad_norm": 2.078125, + "learning_rate": 4.7545629535336315e-06, + "loss": 1.0177, + "step": 7786 + }, + { + "epoch": 1.5408410584443168, + "grad_norm": 2.15625, + "learning_rate": 4.753510427991221e-06, + "loss": 0.9757, + "step": 7787 + }, + { + "epoch": 1.541040953499413, + "grad_norm": 2.078125, + "learning_rate": 4.752457913397851e-06, + "loss": 0.967, + "step": 7788 + }, + { + "epoch": 1.541240848554509, + "grad_norm": 2.09375, + "learning_rate": 4.751405409800275e-06, + "loss": 0.8561, + "step": 7789 + }, + { + "epoch": 1.541440743609605, + "grad_norm": 2.28125, + "learning_rate": 4.750352917245244e-06, + "loss": 0.9998, + "step": 7790 + }, + { + "epoch": 1.541640638664701, + "grad_norm": 2.15625, + "learning_rate": 4.74930043577951e-06, + "loss": 0.9589, + "step": 7791 + }, + { + "epoch": 1.5418405337197971, + "grad_norm": 2.1875, + "learning_rate": 4.748247965449822e-06, + "loss": 0.9633, + "step": 7792 + }, + { + "epoch": 1.5420404287748932, + "grad_norm": 2.203125, + "learning_rate": 4.747195506302933e-06, + "loss": 0.9434, + "step": 7793 + }, + { + "epoch": 1.542240323829989, + "grad_norm": 2.125, + "learning_rate": 4.746143058385594e-06, + "loss": 0.9524, + "step": 7794 + }, + { + "epoch": 1.5424402188850852, + "grad_norm": 2.171875, + "learning_rate": 4.745090621744552e-06, + "loss": 0.9472, + "step": 7795 + }, + { + "epoch": 1.5426401139401813, + "grad_norm": 2.125, + "learning_rate": 4.744038196426558e-06, + "loss": 1.007, + "step": 7796 + }, + { + "epoch": 1.5428400089952774, + "grad_norm": 2.078125, + "learning_rate": 4.742985782478361e-06, + "loss": 0.9061, + "step": 7797 + }, + { + "epoch": 1.5430399040503735, + "grad_norm": 2.15625, + "learning_rate": 4.741933379946706e-06, + "loss": 1.0119, + "step": 7798 + }, + { + "epoch": 1.5432397991054696, + "grad_norm": 2.046875, + "learning_rate": 4.740880988878345e-06, + "loss": 0.9332, + "step": 7799 + }, + { + "epoch": 1.5434396941605657, + "grad_norm": 2.140625, + "learning_rate": 4.739828609320021e-06, + "loss": 0.8967, + "step": 7800 + }, + { + "epoch": 1.5436395892156618, + "grad_norm": 2.1875, + "learning_rate": 4.738776241318482e-06, + "loss": 0.8662, + "step": 7801 + }, + { + "epoch": 1.543839484270758, + "grad_norm": 2.125, + "learning_rate": 4.737723884920476e-06, + "loss": 0.9848, + "step": 7802 + }, + { + "epoch": 1.544039379325854, + "grad_norm": 2.1875, + "learning_rate": 4.736671540172746e-06, + "loss": 1.0055, + "step": 7803 + }, + { + "epoch": 1.54423927438095, + "grad_norm": 2.015625, + "learning_rate": 4.735619207122038e-06, + "loss": 0.9009, + "step": 7804 + }, + { + "epoch": 1.5444391694360462, + "grad_norm": 2.1875, + "learning_rate": 4.734566885815098e-06, + "loss": 1.0093, + "step": 7805 + }, + { + "epoch": 1.5446390644911423, + "grad_norm": 2.15625, + "learning_rate": 4.733514576298666e-06, + "loss": 0.9394, + "step": 7806 + }, + { + "epoch": 1.5448389595462382, + "grad_norm": 2.296875, + "learning_rate": 4.732462278619491e-06, + "loss": 1.0647, + "step": 7807 + }, + { + "epoch": 1.5450388546013343, + "grad_norm": 2.140625, + "learning_rate": 4.731409992824311e-06, + "loss": 0.9244, + "step": 7808 + }, + { + "epoch": 1.5452387496564304, + "grad_norm": 2.078125, + "learning_rate": 4.730357718959871e-06, + "loss": 0.9751, + "step": 7809 + }, + { + "epoch": 1.5454386447115265, + "grad_norm": 2.125, + "learning_rate": 4.729305457072913e-06, + "loss": 0.9699, + "step": 7810 + }, + { + "epoch": 1.5456385397666226, + "grad_norm": 2.09375, + "learning_rate": 4.728253207210176e-06, + "loss": 0.9998, + "step": 7811 + }, + { + "epoch": 1.5458384348217185, + "grad_norm": 2.09375, + "learning_rate": 4.727200969418404e-06, + "loss": 0.9604, + "step": 7812 + }, + { + "epoch": 1.5460383298768146, + "grad_norm": 2.21875, + "learning_rate": 4.726148743744336e-06, + "loss": 1.0473, + "step": 7813 + }, + { + "epoch": 1.5462382249319107, + "grad_norm": 2.4375, + "learning_rate": 4.72509653023471e-06, + "loss": 0.9781, + "step": 7814 + }, + { + "epoch": 1.5464381199870068, + "grad_norm": 2.046875, + "learning_rate": 4.724044328936268e-06, + "loss": 0.9372, + "step": 7815 + }, + { + "epoch": 1.5466380150421029, + "grad_norm": 2.109375, + "learning_rate": 4.722992139895748e-06, + "loss": 0.9463, + "step": 7816 + }, + { + "epoch": 1.546837910097199, + "grad_norm": 1.9921875, + "learning_rate": 4.7219399631598875e-06, + "loss": 0.888, + "step": 7817 + }, + { + "epoch": 1.547037805152295, + "grad_norm": 2.125, + "learning_rate": 4.720887798775424e-06, + "loss": 0.978, + "step": 7818 + }, + { + "epoch": 1.5472377002073912, + "grad_norm": 2.09375, + "learning_rate": 4.719835646789094e-06, + "loss": 0.9491, + "step": 7819 + }, + { + "epoch": 1.5474375952624873, + "grad_norm": 2.140625, + "learning_rate": 4.718783507247634e-06, + "loss": 0.9715, + "step": 7820 + }, + { + "epoch": 1.5476374903175834, + "grad_norm": 2.203125, + "learning_rate": 4.717731380197782e-06, + "loss": 0.9565, + "step": 7821 + }, + { + "epoch": 1.5478373853726795, + "grad_norm": 2.21875, + "learning_rate": 4.716679265686271e-06, + "loss": 0.9697, + "step": 7822 + }, + { + "epoch": 1.5480372804277756, + "grad_norm": 2.15625, + "learning_rate": 4.715627163759837e-06, + "loss": 0.9604, + "step": 7823 + }, + { + "epoch": 1.5482371754828717, + "grad_norm": 2.109375, + "learning_rate": 4.714575074465213e-06, + "loss": 1.0961, + "step": 7824 + }, + { + "epoch": 1.5484370705379675, + "grad_norm": 2.125, + "learning_rate": 4.7135229978491354e-06, + "loss": 1.0226, + "step": 7825 + }, + { + "epoch": 1.5486369655930636, + "grad_norm": 2.171875, + "learning_rate": 4.7124709339583345e-06, + "loss": 0.9901, + "step": 7826 + }, + { + "epoch": 1.5488368606481597, + "grad_norm": 2.046875, + "learning_rate": 4.711418882839544e-06, + "loss": 0.9758, + "step": 7827 + }, + { + "epoch": 1.5490367557032558, + "grad_norm": 2.1875, + "learning_rate": 4.710366844539497e-06, + "loss": 1.0121, + "step": 7828 + }, + { + "epoch": 1.5492366507583517, + "grad_norm": 2.15625, + "learning_rate": 4.7093148191049224e-06, + "loss": 0.8907, + "step": 7829 + }, + { + "epoch": 1.5494365458134478, + "grad_norm": 2.234375, + "learning_rate": 4.708262806582554e-06, + "loss": 0.9393, + "step": 7830 + }, + { + "epoch": 1.549636440868544, + "grad_norm": 2.25, + "learning_rate": 4.70721080701912e-06, + "loss": 1.0503, + "step": 7831 + }, + { + "epoch": 1.54983633592364, + "grad_norm": 2.203125, + "learning_rate": 4.706158820461349e-06, + "loss": 0.9926, + "step": 7832 + }, + { + "epoch": 1.550036230978736, + "grad_norm": 2.21875, + "learning_rate": 4.705106846955975e-06, + "loss": 1.0444, + "step": 7833 + }, + { + "epoch": 1.5502361260338322, + "grad_norm": 2.09375, + "learning_rate": 4.704054886549721e-06, + "loss": 0.8473, + "step": 7834 + }, + { + "epoch": 1.5504360210889283, + "grad_norm": 2.109375, + "learning_rate": 4.703002939289317e-06, + "loss": 1.0376, + "step": 7835 + }, + { + "epoch": 1.5506359161440244, + "grad_norm": 2.125, + "learning_rate": 4.7019510052214924e-06, + "loss": 1.0081, + "step": 7836 + }, + { + "epoch": 1.5508358111991205, + "grad_norm": 2.125, + "learning_rate": 4.7008990843929714e-06, + "loss": 0.9817, + "step": 7837 + }, + { + "epoch": 1.5510357062542166, + "grad_norm": 2.28125, + "learning_rate": 4.699847176850482e-06, + "loss": 1.0341, + "step": 7838 + }, + { + "epoch": 1.5512356013093127, + "grad_norm": 2.140625, + "learning_rate": 4.698795282640748e-06, + "loss": 1.0323, + "step": 7839 + }, + { + "epoch": 1.5514354963644088, + "grad_norm": 2.15625, + "learning_rate": 4.6977434018104955e-06, + "loss": 1.0385, + "step": 7840 + }, + { + "epoch": 1.551635391419505, + "grad_norm": 2.1875, + "learning_rate": 4.69669153440645e-06, + "loss": 1.0385, + "step": 7841 + }, + { + "epoch": 1.5518352864746008, + "grad_norm": 2.203125, + "learning_rate": 4.695639680475332e-06, + "loss": 0.8897, + "step": 7842 + }, + { + "epoch": 1.5520351815296969, + "grad_norm": 2.15625, + "learning_rate": 4.694587840063868e-06, + "loss": 0.9869, + "step": 7843 + }, + { + "epoch": 1.552235076584793, + "grad_norm": 2.140625, + "learning_rate": 4.69353601321878e-06, + "loss": 1.0172, + "step": 7844 + }, + { + "epoch": 1.552434971639889, + "grad_norm": 2.0625, + "learning_rate": 4.692484199986789e-06, + "loss": 0.9036, + "step": 7845 + }, + { + "epoch": 1.5526348666949852, + "grad_norm": 2.171875, + "learning_rate": 4.691432400414617e-06, + "loss": 0.9853, + "step": 7846 + }, + { + "epoch": 1.552834761750081, + "grad_norm": 2.171875, + "learning_rate": 4.690380614548986e-06, + "loss": 1.0161, + "step": 7847 + }, + { + "epoch": 1.5530346568051772, + "grad_norm": 2.171875, + "learning_rate": 4.6893288424366145e-06, + "loss": 0.9438, + "step": 7848 + }, + { + "epoch": 1.5532345518602733, + "grad_norm": 2.109375, + "learning_rate": 4.688277084124222e-06, + "loss": 0.9814, + "step": 7849 + }, + { + "epoch": 1.5534344469153694, + "grad_norm": 2.0625, + "learning_rate": 4.687225339658529e-06, + "loss": 0.954, + "step": 7850 + }, + { + "epoch": 1.5536343419704655, + "grad_norm": 2.25, + "learning_rate": 4.686173609086251e-06, + "loss": 1.0238, + "step": 7851 + }, + { + "epoch": 1.5538342370255616, + "grad_norm": 2.0625, + "learning_rate": 4.685121892454109e-06, + "loss": 0.9797, + "step": 7852 + }, + { + "epoch": 1.5540341320806577, + "grad_norm": 2.078125, + "learning_rate": 4.68407018980882e-06, + "loss": 0.9091, + "step": 7853 + }, + { + "epoch": 1.5542340271357538, + "grad_norm": 2.15625, + "learning_rate": 4.683018501197098e-06, + "loss": 0.9587, + "step": 7854 + }, + { + "epoch": 1.5544339221908499, + "grad_norm": 2.078125, + "learning_rate": 4.6819668266656594e-06, + "loss": 0.8859, + "step": 7855 + }, + { + "epoch": 1.554633817245946, + "grad_norm": 2.21875, + "learning_rate": 4.680915166261223e-06, + "loss": 1.0847, + "step": 7856 + }, + { + "epoch": 1.554833712301042, + "grad_norm": 2.28125, + "learning_rate": 4.679863520030499e-06, + "loss": 1.0824, + "step": 7857 + }, + { + "epoch": 1.5550336073561382, + "grad_norm": 2.25, + "learning_rate": 4.678811888020204e-06, + "loss": 0.9992, + "step": 7858 + }, + { + "epoch": 1.5552335024112343, + "grad_norm": 2.171875, + "learning_rate": 4.6777602702770505e-06, + "loss": 0.9977, + "step": 7859 + }, + { + "epoch": 1.5554333974663301, + "grad_norm": 2.203125, + "learning_rate": 4.67670866684775e-06, + "loss": 1.0028, + "step": 7860 + }, + { + "epoch": 1.5556332925214262, + "grad_norm": 2.046875, + "learning_rate": 4.675657077779019e-06, + "loss": 0.894, + "step": 7861 + }, + { + "epoch": 1.5558331875765223, + "grad_norm": 2.125, + "learning_rate": 4.674605503117563e-06, + "loss": 1.0168, + "step": 7862 + }, + { + "epoch": 1.5560330826316184, + "grad_norm": 2.046875, + "learning_rate": 4.673553942910097e-06, + "loss": 0.9088, + "step": 7863 + }, + { + "epoch": 1.5562329776867143, + "grad_norm": 2.0625, + "learning_rate": 4.67250239720333e-06, + "loss": 0.8533, + "step": 7864 + }, + { + "epoch": 1.5564328727418104, + "grad_norm": 2.203125, + "learning_rate": 4.6714508660439715e-06, + "loss": 1.0101, + "step": 7865 + }, + { + "epoch": 1.5566327677969065, + "grad_norm": 2.140625, + "learning_rate": 4.67039934947873e-06, + "loss": 0.9974, + "step": 7866 + }, + { + "epoch": 1.5568326628520026, + "grad_norm": 2.015625, + "learning_rate": 4.669347847554315e-06, + "loss": 0.896, + "step": 7867 + }, + { + "epoch": 1.5570325579070987, + "grad_norm": 2.109375, + "learning_rate": 4.6682963603174326e-06, + "loss": 0.9481, + "step": 7868 + }, + { + "epoch": 1.5572324529621948, + "grad_norm": 2.15625, + "learning_rate": 4.667244887814791e-06, + "loss": 1.0765, + "step": 7869 + }, + { + "epoch": 1.557432348017291, + "grad_norm": 2.140625, + "learning_rate": 4.666193430093096e-06, + "loss": 1.0258, + "step": 7870 + }, + { + "epoch": 1.557632243072387, + "grad_norm": 2.125, + "learning_rate": 4.665141987199052e-06, + "loss": 0.938, + "step": 7871 + }, + { + "epoch": 1.5578321381274831, + "grad_norm": 2.203125, + "learning_rate": 4.664090559179367e-06, + "loss": 1.0506, + "step": 7872 + }, + { + "epoch": 1.5580320331825792, + "grad_norm": 2.234375, + "learning_rate": 4.663039146080742e-06, + "loss": 0.999, + "step": 7873 + }, + { + "epoch": 1.5582319282376753, + "grad_norm": 2.234375, + "learning_rate": 4.661987747949882e-06, + "loss": 0.9825, + "step": 7874 + }, + { + "epoch": 1.5584318232927714, + "grad_norm": 2.265625, + "learning_rate": 4.660936364833492e-06, + "loss": 0.8881, + "step": 7875 + }, + { + "epoch": 1.5586317183478675, + "grad_norm": 2.296875, + "learning_rate": 4.659884996778271e-06, + "loss": 1.021, + "step": 7876 + }, + { + "epoch": 1.5588316134029634, + "grad_norm": 2.21875, + "learning_rate": 4.658833643830923e-06, + "loss": 0.9397, + "step": 7877 + }, + { + "epoch": 1.5590315084580595, + "grad_norm": 2.125, + "learning_rate": 4.657782306038148e-06, + "loss": 0.9747, + "step": 7878 + }, + { + "epoch": 1.5592314035131556, + "grad_norm": 2.140625, + "learning_rate": 4.6567309834466465e-06, + "loss": 1.0115, + "step": 7879 + }, + { + "epoch": 1.5594312985682517, + "grad_norm": 2.171875, + "learning_rate": 4.6556796761031175e-06, + "loss": 0.9302, + "step": 7880 + }, + { + "epoch": 1.5596311936233478, + "grad_norm": 2.203125, + "learning_rate": 4.654628384054261e-06, + "loss": 1.0368, + "step": 7881 + }, + { + "epoch": 1.5598310886784437, + "grad_norm": 2.15625, + "learning_rate": 4.653577107346774e-06, + "loss": 0.9975, + "step": 7882 + }, + { + "epoch": 1.5600309837335398, + "grad_norm": 2.125, + "learning_rate": 4.652525846027355e-06, + "loss": 0.9942, + "step": 7883 + }, + { + "epoch": 1.5602308787886359, + "grad_norm": 2.140625, + "learning_rate": 4.651474600142701e-06, + "loss": 0.8976, + "step": 7884 + }, + { + "epoch": 1.560430773843732, + "grad_norm": 2.3125, + "learning_rate": 4.650423369739508e-06, + "loss": 0.9892, + "step": 7885 + }, + { + "epoch": 1.560630668898828, + "grad_norm": 2.09375, + "learning_rate": 4.649372154864471e-06, + "loss": 0.9206, + "step": 7886 + }, + { + "epoch": 1.5608305639539242, + "grad_norm": 2.140625, + "learning_rate": 4.648320955564287e-06, + "loss": 0.9585, + "step": 7887 + }, + { + "epoch": 1.5610304590090203, + "grad_norm": 2.203125, + "learning_rate": 4.647269771885648e-06, + "loss": 0.9131, + "step": 7888 + }, + { + "epoch": 1.5612303540641164, + "grad_norm": 2.1875, + "learning_rate": 4.646218603875248e-06, + "loss": 1.0652, + "step": 7889 + }, + { + "epoch": 1.5614302491192125, + "grad_norm": 2.140625, + "learning_rate": 4.645167451579779e-06, + "loss": 1.0153, + "step": 7890 + }, + { + "epoch": 1.5616301441743086, + "grad_norm": 2.265625, + "learning_rate": 4.644116315045933e-06, + "loss": 0.965, + "step": 7891 + }, + { + "epoch": 1.5618300392294047, + "grad_norm": 2.140625, + "learning_rate": 4.643065194320405e-06, + "loss": 0.9428, + "step": 7892 + }, + { + "epoch": 1.5620299342845008, + "grad_norm": 2.0625, + "learning_rate": 4.642014089449881e-06, + "loss": 0.9131, + "step": 7893 + }, + { + "epoch": 1.5622298293395969, + "grad_norm": 2.203125, + "learning_rate": 4.6409630004810535e-06, + "loss": 0.9232, + "step": 7894 + }, + { + "epoch": 1.5624297243946927, + "grad_norm": 2.34375, + "learning_rate": 4.639911927460612e-06, + "loss": 1.0809, + "step": 7895 + }, + { + "epoch": 1.5626296194497888, + "grad_norm": 2.125, + "learning_rate": 4.638860870435243e-06, + "loss": 0.9853, + "step": 7896 + }, + { + "epoch": 1.562829514504885, + "grad_norm": 2.140625, + "learning_rate": 4.637809829451637e-06, + "loss": 0.9465, + "step": 7897 + }, + { + "epoch": 1.563029409559981, + "grad_norm": 2.09375, + "learning_rate": 4.636758804556478e-06, + "loss": 0.8824, + "step": 7898 + }, + { + "epoch": 1.563229304615077, + "grad_norm": 2.203125, + "learning_rate": 4.635707795796454e-06, + "loss": 1.1537, + "step": 7899 + }, + { + "epoch": 1.563429199670173, + "grad_norm": 2.25, + "learning_rate": 4.634656803218252e-06, + "loss": 0.9672, + "step": 7900 + }, + { + "epoch": 1.5636290947252691, + "grad_norm": 2.0625, + "learning_rate": 4.633605826868556e-06, + "loss": 0.9607, + "step": 7901 + }, + { + "epoch": 1.5638289897803652, + "grad_norm": 2.140625, + "learning_rate": 4.632554866794048e-06, + "loss": 0.951, + "step": 7902 + }, + { + "epoch": 1.5640288848354613, + "grad_norm": 2.3125, + "learning_rate": 4.631503923041416e-06, + "loss": 0.9414, + "step": 7903 + }, + { + "epoch": 1.5642287798905574, + "grad_norm": 2.046875, + "learning_rate": 4.63045299565734e-06, + "loss": 0.9421, + "step": 7904 + }, + { + "epoch": 1.5644286749456535, + "grad_norm": 2.125, + "learning_rate": 4.629402084688501e-06, + "loss": 0.9926, + "step": 7905 + }, + { + "epoch": 1.5646285700007496, + "grad_norm": 2.109375, + "learning_rate": 4.628351190181583e-06, + "loss": 0.9994, + "step": 7906 + }, + { + "epoch": 1.5648284650558457, + "grad_norm": 2.328125, + "learning_rate": 4.627300312183265e-06, + "loss": 1.1636, + "step": 7907 + }, + { + "epoch": 1.5650283601109418, + "grad_norm": 2.09375, + "learning_rate": 4.6262494507402275e-06, + "loss": 0.9685, + "step": 7908 + }, + { + "epoch": 1.565228255166038, + "grad_norm": 2.0, + "learning_rate": 4.625198605899152e-06, + "loss": 0.9123, + "step": 7909 + }, + { + "epoch": 1.565428150221134, + "grad_norm": 2.046875, + "learning_rate": 4.624147777706711e-06, + "loss": 0.9561, + "step": 7910 + }, + { + "epoch": 1.5656280452762301, + "grad_norm": 2.171875, + "learning_rate": 4.623096966209586e-06, + "loss": 0.9164, + "step": 7911 + }, + { + "epoch": 1.5658279403313262, + "grad_norm": 2.296875, + "learning_rate": 4.622046171454454e-06, + "loss": 0.9847, + "step": 7912 + }, + { + "epoch": 1.566027835386422, + "grad_norm": 2.140625, + "learning_rate": 4.620995393487991e-06, + "loss": 0.9813, + "step": 7913 + }, + { + "epoch": 1.5662277304415182, + "grad_norm": 2.046875, + "learning_rate": 4.619944632356871e-06, + "loss": 0.9009, + "step": 7914 + }, + { + "epoch": 1.5664276254966143, + "grad_norm": 2.109375, + "learning_rate": 4.618893888107772e-06, + "loss": 0.9058, + "step": 7915 + }, + { + "epoch": 1.5666275205517104, + "grad_norm": 2.140625, + "learning_rate": 4.617843160787364e-06, + "loss": 0.9579, + "step": 7916 + }, + { + "epoch": 1.5668274156068063, + "grad_norm": 2.203125, + "learning_rate": 4.616792450442321e-06, + "loss": 0.9935, + "step": 7917 + }, + { + "epoch": 1.5670273106619024, + "grad_norm": 2.140625, + "learning_rate": 4.6157417571193184e-06, + "loss": 0.8968, + "step": 7918 + }, + { + "epoch": 1.5672272057169985, + "grad_norm": 2.15625, + "learning_rate": 4.6146910808650255e-06, + "loss": 0.9871, + "step": 7919 + }, + { + "epoch": 1.5674271007720946, + "grad_norm": 2.171875, + "learning_rate": 4.613640421726114e-06, + "loss": 0.938, + "step": 7920 + }, + { + "epoch": 1.5676269958271907, + "grad_norm": 2.1875, + "learning_rate": 4.612589779749252e-06, + "loss": 0.9615, + "step": 7921 + }, + { + "epoch": 1.5678268908822868, + "grad_norm": 2.21875, + "learning_rate": 4.611539154981112e-06, + "loss": 0.948, + "step": 7922 + }, + { + "epoch": 1.5680267859373829, + "grad_norm": 3.046875, + "learning_rate": 4.610488547468361e-06, + "loss": 0.984, + "step": 7923 + }, + { + "epoch": 1.568226680992479, + "grad_norm": 2.171875, + "learning_rate": 4.609437957257667e-06, + "loss": 1.0223, + "step": 7924 + }, + { + "epoch": 1.568426576047575, + "grad_norm": 2.125, + "learning_rate": 4.608387384395697e-06, + "loss": 0.9318, + "step": 7925 + }, + { + "epoch": 1.5686264711026712, + "grad_norm": 2.203125, + "learning_rate": 4.607336828929119e-06, + "loss": 0.9919, + "step": 7926 + }, + { + "epoch": 1.5688263661577673, + "grad_norm": 2.328125, + "learning_rate": 4.606286290904595e-06, + "loss": 1.0524, + "step": 7927 + }, + { + "epoch": 1.5690262612128634, + "grad_norm": 2.25, + "learning_rate": 4.605235770368794e-06, + "loss": 0.9626, + "step": 7928 + }, + { + "epoch": 1.5692261562679595, + "grad_norm": 2.5, + "learning_rate": 4.604185267368376e-06, + "loss": 0.9993, + "step": 7929 + }, + { + "epoch": 1.5694260513230553, + "grad_norm": 2.0625, + "learning_rate": 4.603134781950007e-06, + "loss": 0.9542, + "step": 7930 + }, + { + "epoch": 1.5696259463781514, + "grad_norm": 2.171875, + "learning_rate": 4.6020843141603484e-06, + "loss": 0.952, + "step": 7931 + }, + { + "epoch": 1.5698258414332475, + "grad_norm": 2.3125, + "learning_rate": 4.601033864046062e-06, + "loss": 0.9748, + "step": 7932 + }, + { + "epoch": 1.5700257364883436, + "grad_norm": 2.171875, + "learning_rate": 4.599983431653809e-06, + "loss": 1.0249, + "step": 7933 + }, + { + "epoch": 1.5702256315434397, + "grad_norm": 2.171875, + "learning_rate": 4.59893301703025e-06, + "loss": 0.9893, + "step": 7934 + }, + { + "epoch": 1.5704255265985356, + "grad_norm": 2.203125, + "learning_rate": 4.597882620222043e-06, + "loss": 0.9827, + "step": 7935 + }, + { + "epoch": 1.5706254216536317, + "grad_norm": 2.21875, + "learning_rate": 4.596832241275846e-06, + "loss": 1.1081, + "step": 7936 + }, + { + "epoch": 1.5708253167087278, + "grad_norm": 2.25, + "learning_rate": 4.595781880238319e-06, + "loss": 0.9389, + "step": 7937 + }, + { + "epoch": 1.571025211763824, + "grad_norm": 2.234375, + "learning_rate": 4.594731537156117e-06, + "loss": 0.9455, + "step": 7938 + }, + { + "epoch": 1.57122510681892, + "grad_norm": 2.234375, + "learning_rate": 4.593681212075898e-06, + "loss": 0.9311, + "step": 7939 + }, + { + "epoch": 1.5714250018740161, + "grad_norm": 2.15625, + "learning_rate": 4.592630905044317e-06, + "loss": 0.9819, + "step": 7940 + }, + { + "epoch": 1.5716248969291122, + "grad_norm": 2.25, + "learning_rate": 4.591580616108026e-06, + "loss": 1.0248, + "step": 7941 + }, + { + "epoch": 1.5718247919842083, + "grad_norm": 2.046875, + "learning_rate": 4.59053034531368e-06, + "loss": 0.9329, + "step": 7942 + }, + { + "epoch": 1.5720246870393044, + "grad_norm": 2.15625, + "learning_rate": 4.589480092707933e-06, + "loss": 1.0111, + "step": 7943 + }, + { + "epoch": 1.5722245820944005, + "grad_norm": 2.203125, + "learning_rate": 4.5884298583374356e-06, + "loss": 0.9432, + "step": 7944 + }, + { + "epoch": 1.5724244771494966, + "grad_norm": 2.15625, + "learning_rate": 4.5873796422488405e-06, + "loss": 1.0184, + "step": 7945 + }, + { + "epoch": 1.5726243722045927, + "grad_norm": 2.046875, + "learning_rate": 4.586329444488798e-06, + "loss": 0.9221, + "step": 7946 + }, + { + "epoch": 1.5728242672596888, + "grad_norm": 2.328125, + "learning_rate": 4.585279265103957e-06, + "loss": 1.0654, + "step": 7947 + }, + { + "epoch": 1.5730241623147847, + "grad_norm": 2.171875, + "learning_rate": 4.584229104140967e-06, + "loss": 1.043, + "step": 7948 + }, + { + "epoch": 1.5732240573698808, + "grad_norm": 2.15625, + "learning_rate": 4.583178961646475e-06, + "loss": 0.8936, + "step": 7949 + }, + { + "epoch": 1.573423952424977, + "grad_norm": 2.234375, + "learning_rate": 4.58212883766713e-06, + "loss": 0.9949, + "step": 7950 + }, + { + "epoch": 1.573623847480073, + "grad_norm": 2.109375, + "learning_rate": 4.581078732249577e-06, + "loss": 0.9255, + "step": 7951 + }, + { + "epoch": 1.5738237425351689, + "grad_norm": 2.25, + "learning_rate": 4.580028645440462e-06, + "loss": 0.9435, + "step": 7952 + }, + { + "epoch": 1.574023637590265, + "grad_norm": 2.453125, + "learning_rate": 4.5789785772864295e-06, + "loss": 1.0914, + "step": 7953 + }, + { + "epoch": 1.574223532645361, + "grad_norm": 2.21875, + "learning_rate": 4.577928527834124e-06, + "loss": 1.0328, + "step": 7954 + }, + { + "epoch": 1.5744234277004572, + "grad_norm": 2.109375, + "learning_rate": 4.576878497130188e-06, + "loss": 0.9528, + "step": 7955 + }, + { + "epoch": 1.5746233227555533, + "grad_norm": 2.09375, + "learning_rate": 4.575828485221263e-06, + "loss": 0.9652, + "step": 7956 + }, + { + "epoch": 1.5748232178106494, + "grad_norm": 2.03125, + "learning_rate": 4.574778492153993e-06, + "loss": 0.8972, + "step": 7957 + }, + { + "epoch": 1.5750231128657455, + "grad_norm": 2.25, + "learning_rate": 4.573728517975016e-06, + "loss": 0.9544, + "step": 7958 + }, + { + "epoch": 1.5752230079208416, + "grad_norm": 2.171875, + "learning_rate": 4.5726785627309736e-06, + "loss": 0.9876, + "step": 7959 + }, + { + "epoch": 1.5754229029759377, + "grad_norm": 2.03125, + "learning_rate": 4.571628626468503e-06, + "loss": 0.9727, + "step": 7960 + }, + { + "epoch": 1.5756227980310338, + "grad_norm": 2.171875, + "learning_rate": 4.570578709234243e-06, + "loss": 1.0302, + "step": 7961 + }, + { + "epoch": 1.5758226930861299, + "grad_norm": 2.078125, + "learning_rate": 4.5695288110748324e-06, + "loss": 0.9296, + "step": 7962 + }, + { + "epoch": 1.576022588141226, + "grad_norm": 2.046875, + "learning_rate": 4.568478932036904e-06, + "loss": 0.9386, + "step": 7963 + }, + { + "epoch": 1.576222483196322, + "grad_norm": 2.109375, + "learning_rate": 4.567429072167095e-06, + "loss": 0.9132, + "step": 7964 + }, + { + "epoch": 1.576422378251418, + "grad_norm": 2.09375, + "learning_rate": 4.566379231512042e-06, + "loss": 0.9991, + "step": 7965 + }, + { + "epoch": 1.576622273306514, + "grad_norm": 2.171875, + "learning_rate": 4.565329410118377e-06, + "loss": 0.927, + "step": 7966 + }, + { + "epoch": 1.5768221683616102, + "grad_norm": 2.234375, + "learning_rate": 4.564279608032734e-06, + "loss": 0.9367, + "step": 7967 + }, + { + "epoch": 1.5770220634167063, + "grad_norm": 2.28125, + "learning_rate": 4.563229825301743e-06, + "loss": 1.0996, + "step": 7968 + }, + { + "epoch": 1.5772219584718024, + "grad_norm": 2.15625, + "learning_rate": 4.562180061972036e-06, + "loss": 0.9513, + "step": 7969 + }, + { + "epoch": 1.5774218535268982, + "grad_norm": 2.21875, + "learning_rate": 4.561130318090246e-06, + "loss": 1.0707, + "step": 7970 + }, + { + "epoch": 1.5776217485819943, + "grad_norm": 2.296875, + "learning_rate": 4.560080593703e-06, + "loss": 0.9936, + "step": 7971 + }, + { + "epoch": 1.5778216436370904, + "grad_norm": 2.125, + "learning_rate": 4.5590308888569256e-06, + "loss": 0.952, + "step": 7972 + }, + { + "epoch": 1.5780215386921865, + "grad_norm": 2.265625, + "learning_rate": 4.557981203598652e-06, + "loss": 0.9375, + "step": 7973 + }, + { + "epoch": 1.5782214337472826, + "grad_norm": 2.125, + "learning_rate": 4.556931537974808e-06, + "loss": 0.9408, + "step": 7974 + }, + { + "epoch": 1.5784213288023787, + "grad_norm": 2.25, + "learning_rate": 4.555881892032016e-06, + "loss": 1.037, + "step": 7975 + }, + { + "epoch": 1.5786212238574748, + "grad_norm": 2.15625, + "learning_rate": 4.554832265816903e-06, + "loss": 0.9225, + "step": 7976 + }, + { + "epoch": 1.578821118912571, + "grad_norm": 2.109375, + "learning_rate": 4.553782659376094e-06, + "loss": 0.9012, + "step": 7977 + }, + { + "epoch": 1.579021013967667, + "grad_norm": 2.203125, + "learning_rate": 4.552733072756211e-06, + "loss": 0.994, + "step": 7978 + }, + { + "epoch": 1.5792209090227631, + "grad_norm": 2.265625, + "learning_rate": 4.551683506003878e-06, + "loss": 0.9952, + "step": 7979 + }, + { + "epoch": 1.5794208040778592, + "grad_norm": 2.25, + "learning_rate": 4.5506339591657145e-06, + "loss": 0.9756, + "step": 7980 + }, + { + "epoch": 1.5796206991329553, + "grad_norm": 2.046875, + "learning_rate": 4.549584432288343e-06, + "loss": 0.9691, + "step": 7981 + }, + { + "epoch": 1.5798205941880514, + "grad_norm": 2.21875, + "learning_rate": 4.548534925418383e-06, + "loss": 1.0193, + "step": 7982 + }, + { + "epoch": 1.5800204892431473, + "grad_norm": 2.296875, + "learning_rate": 4.547485438602453e-06, + "loss": 1.0499, + "step": 7983 + }, + { + "epoch": 1.5802203842982434, + "grad_norm": 2.265625, + "learning_rate": 4.546435971887172e-06, + "loss": 0.9644, + "step": 7984 + }, + { + "epoch": 1.5804202793533395, + "grad_norm": 2.21875, + "learning_rate": 4.545386525319157e-06, + "loss": 0.9267, + "step": 7985 + }, + { + "epoch": 1.5806201744084356, + "grad_norm": 2.140625, + "learning_rate": 4.544337098945023e-06, + "loss": 1.0208, + "step": 7986 + }, + { + "epoch": 1.5808200694635315, + "grad_norm": 2.15625, + "learning_rate": 4.543287692811388e-06, + "loss": 0.903, + "step": 7987 + }, + { + "epoch": 1.5810199645186276, + "grad_norm": 2.125, + "learning_rate": 4.542238306964863e-06, + "loss": 0.9197, + "step": 7988 + }, + { + "epoch": 1.5812198595737237, + "grad_norm": 2.28125, + "learning_rate": 4.5411889414520634e-06, + "loss": 1.0371, + "step": 7989 + }, + { + "epoch": 1.5814197546288198, + "grad_norm": 2.140625, + "learning_rate": 4.540139596319604e-06, + "loss": 0.985, + "step": 7990 + }, + { + "epoch": 1.5816196496839159, + "grad_norm": 2.109375, + "learning_rate": 4.539090271614094e-06, + "loss": 0.8718, + "step": 7991 + }, + { + "epoch": 1.581819544739012, + "grad_norm": 1.96875, + "learning_rate": 4.538040967382144e-06, + "loss": 0.9993, + "step": 7992 + }, + { + "epoch": 1.582019439794108, + "grad_norm": 2.203125, + "learning_rate": 4.536991683670366e-06, + "loss": 1.0313, + "step": 7993 + }, + { + "epoch": 1.5822193348492042, + "grad_norm": 2.109375, + "learning_rate": 4.5359424205253665e-06, + "loss": 1.0063, + "step": 7994 + }, + { + "epoch": 1.5824192299043003, + "grad_norm": 2.03125, + "learning_rate": 4.534893177993756e-06, + "loss": 1.0112, + "step": 7995 + }, + { + "epoch": 1.5826191249593964, + "grad_norm": 2.125, + "learning_rate": 4.533843956122142e-06, + "loss": 0.9603, + "step": 7996 + }, + { + "epoch": 1.5828190200144925, + "grad_norm": 2.046875, + "learning_rate": 4.532794754957128e-06, + "loss": 0.9179, + "step": 7997 + }, + { + "epoch": 1.5830189150695886, + "grad_norm": 2.078125, + "learning_rate": 4.531745574545322e-06, + "loss": 0.9546, + "step": 7998 + }, + { + "epoch": 1.5832188101246847, + "grad_norm": 2.171875, + "learning_rate": 4.5306964149333265e-06, + "loss": 0.9534, + "step": 7999 + }, + { + "epoch": 1.5834187051797806, + "grad_norm": 2.328125, + "learning_rate": 4.529647276167747e-06, + "loss": 0.9989, + "step": 8000 + }, + { + "epoch": 1.5836186002348767, + "grad_norm": 2.15625, + "learning_rate": 4.5285981582951855e-06, + "loss": 0.9472, + "step": 8001 + }, + { + "epoch": 1.5838184952899728, + "grad_norm": 2.171875, + "learning_rate": 4.527549061362245e-06, + "loss": 0.98, + "step": 8002 + }, + { + "epoch": 1.5840183903450689, + "grad_norm": 2.28125, + "learning_rate": 4.526499985415522e-06, + "loss": 1.0411, + "step": 8003 + }, + { + "epoch": 1.584218285400165, + "grad_norm": 2.109375, + "learning_rate": 4.5254509305016195e-06, + "loss": 0.9677, + "step": 8004 + }, + { + "epoch": 1.5844181804552608, + "grad_norm": 2.140625, + "learning_rate": 4.524401896667137e-06, + "loss": 0.952, + "step": 8005 + }, + { + "epoch": 1.584618075510357, + "grad_norm": 2.03125, + "learning_rate": 4.52335288395867e-06, + "loss": 0.8664, + "step": 8006 + }, + { + "epoch": 1.584817970565453, + "grad_norm": 2.046875, + "learning_rate": 4.522303892422818e-06, + "loss": 0.9897, + "step": 8007 + }, + { + "epoch": 1.5850178656205491, + "grad_norm": 2.296875, + "learning_rate": 4.521254922106176e-06, + "loss": 0.9461, + "step": 8008 + }, + { + "epoch": 1.5852177606756452, + "grad_norm": 2.125, + "learning_rate": 4.520205973055339e-06, + "loss": 0.9887, + "step": 8009 + }, + { + "epoch": 1.5854176557307413, + "grad_norm": 2.21875, + "learning_rate": 4.519157045316902e-06, + "loss": 0.9829, + "step": 8010 + }, + { + "epoch": 1.5856175507858374, + "grad_norm": 2.078125, + "learning_rate": 4.518108138937457e-06, + "loss": 0.9488, + "step": 8011 + }, + { + "epoch": 1.5858174458409335, + "grad_norm": 2.234375, + "learning_rate": 4.517059253963596e-06, + "loss": 0.8656, + "step": 8012 + }, + { + "epoch": 1.5860173408960296, + "grad_norm": 2.03125, + "learning_rate": 4.516010390441914e-06, + "loss": 0.8928, + "step": 8013 + }, + { + "epoch": 1.5862172359511257, + "grad_norm": 2.109375, + "learning_rate": 4.5149615484189965e-06, + "loss": 0.8735, + "step": 8014 + }, + { + "epoch": 1.5864171310062218, + "grad_norm": 2.203125, + "learning_rate": 4.513912727941437e-06, + "loss": 0.9873, + "step": 8015 + }, + { + "epoch": 1.586617026061318, + "grad_norm": 2.046875, + "learning_rate": 4.512863929055821e-06, + "loss": 0.9418, + "step": 8016 + }, + { + "epoch": 1.586816921116414, + "grad_norm": 2.109375, + "learning_rate": 4.511815151808737e-06, + "loss": 0.9767, + "step": 8017 + }, + { + "epoch": 1.58701681617151, + "grad_norm": 2.125, + "learning_rate": 4.510766396246773e-06, + "loss": 1.0002, + "step": 8018 + }, + { + "epoch": 1.587216711226606, + "grad_norm": 2.15625, + "learning_rate": 4.5097176624165125e-06, + "loss": 0.9911, + "step": 8019 + }, + { + "epoch": 1.5874166062817021, + "grad_norm": 2.09375, + "learning_rate": 4.508668950364541e-06, + "loss": 0.8995, + "step": 8020 + }, + { + "epoch": 1.5876165013367982, + "grad_norm": 2.25, + "learning_rate": 4.507620260137443e-06, + "loss": 1.0113, + "step": 8021 + }, + { + "epoch": 1.587816396391894, + "grad_norm": 2.171875, + "learning_rate": 4.5065715917818e-06, + "loss": 0.9905, + "step": 8022 + }, + { + "epoch": 1.5880162914469902, + "grad_norm": 2.078125, + "learning_rate": 4.5055229453441935e-06, + "loss": 0.9579, + "step": 8023 + }, + { + "epoch": 1.5882161865020863, + "grad_norm": 2.1875, + "learning_rate": 4.504474320871206e-06, + "loss": 1.0448, + "step": 8024 + }, + { + "epoch": 1.5884160815571824, + "grad_norm": 2.0625, + "learning_rate": 4.503425718409416e-06, + "loss": 0.9604, + "step": 8025 + }, + { + "epoch": 1.5886159766122785, + "grad_norm": 2.125, + "learning_rate": 4.502377138005402e-06, + "loss": 1.0344, + "step": 8026 + }, + { + "epoch": 1.5888158716673746, + "grad_norm": 2.171875, + "learning_rate": 4.501328579705744e-06, + "loss": 1.0418, + "step": 8027 + }, + { + "epoch": 1.5890157667224707, + "grad_norm": 2.078125, + "learning_rate": 4.500280043557015e-06, + "loss": 0.9852, + "step": 8028 + }, + { + "epoch": 1.5892156617775668, + "grad_norm": 2.0625, + "learning_rate": 4.499231529605795e-06, + "loss": 0.8487, + "step": 8029 + }, + { + "epoch": 1.589415556832663, + "grad_norm": 2.140625, + "learning_rate": 4.498183037898656e-06, + "loss": 1.0181, + "step": 8030 + }, + { + "epoch": 1.589615451887759, + "grad_norm": 2.109375, + "learning_rate": 4.497134568482172e-06, + "loss": 0.9443, + "step": 8031 + }, + { + "epoch": 1.589815346942855, + "grad_norm": 2.28125, + "learning_rate": 4.496086121402919e-06, + "loss": 1.0473, + "step": 8032 + }, + { + "epoch": 1.5900152419979512, + "grad_norm": 2.1875, + "learning_rate": 4.495037696707467e-06, + "loss": 0.9742, + "step": 8033 + }, + { + "epoch": 1.5902151370530473, + "grad_norm": 2.265625, + "learning_rate": 4.493989294442385e-06, + "loss": 1.0032, + "step": 8034 + }, + { + "epoch": 1.5904150321081434, + "grad_norm": 2.03125, + "learning_rate": 4.492940914654244e-06, + "loss": 0.9712, + "step": 8035 + }, + { + "epoch": 1.5906149271632393, + "grad_norm": 2.125, + "learning_rate": 4.491892557389614e-06, + "loss": 1.0108, + "step": 8036 + }, + { + "epoch": 1.5908148222183354, + "grad_norm": 2.21875, + "learning_rate": 4.4908442226950625e-06, + "loss": 1.0197, + "step": 8037 + }, + { + "epoch": 1.5910147172734315, + "grad_norm": 2.046875, + "learning_rate": 4.489795910617156e-06, + "loss": 0.998, + "step": 8038 + }, + { + "epoch": 1.5912146123285276, + "grad_norm": 2.203125, + "learning_rate": 4.488747621202461e-06, + "loss": 0.8976, + "step": 8039 + }, + { + "epoch": 1.5914145073836234, + "grad_norm": 2.171875, + "learning_rate": 4.487699354497541e-06, + "loss": 0.9769, + "step": 8040 + }, + { + "epoch": 1.5916144024387195, + "grad_norm": 2.265625, + "learning_rate": 4.4866511105489625e-06, + "loss": 0.9799, + "step": 8041 + }, + { + "epoch": 1.5918142974938156, + "grad_norm": 1.9296875, + "learning_rate": 4.485602889403286e-06, + "loss": 0.7528, + "step": 8042 + }, + { + "epoch": 1.5920141925489117, + "grad_norm": 2.25, + "learning_rate": 4.484554691107073e-06, + "loss": 0.9563, + "step": 8043 + }, + { + "epoch": 1.5922140876040078, + "grad_norm": 2.125, + "learning_rate": 4.4835065157068875e-06, + "loss": 0.9822, + "step": 8044 + }, + { + "epoch": 1.592413982659104, + "grad_norm": 2.15625, + "learning_rate": 4.4824583632492866e-06, + "loss": 1.0397, + "step": 8045 + }, + { + "epoch": 1.5926138777142, + "grad_norm": 2.296875, + "learning_rate": 4.48141023378083e-06, + "loss": 1.0909, + "step": 8046 + }, + { + "epoch": 1.5928137727692961, + "grad_norm": 2.125, + "learning_rate": 4.480362127348075e-06, + "loss": 0.9063, + "step": 8047 + }, + { + "epoch": 1.5930136678243922, + "grad_norm": 2.0, + "learning_rate": 4.479314043997579e-06, + "loss": 0.9052, + "step": 8048 + }, + { + "epoch": 1.5932135628794883, + "grad_norm": 2.125, + "learning_rate": 4.478265983775899e-06, + "loss": 0.9554, + "step": 8049 + }, + { + "epoch": 1.5934134579345844, + "grad_norm": 2.15625, + "learning_rate": 4.4772179467295865e-06, + "loss": 0.919, + "step": 8050 + }, + { + "epoch": 1.5936133529896805, + "grad_norm": 2.15625, + "learning_rate": 4.476169932905197e-06, + "loss": 0.9487, + "step": 8051 + }, + { + "epoch": 1.5938132480447766, + "grad_norm": 2.15625, + "learning_rate": 4.475121942349285e-06, + "loss": 1.0724, + "step": 8052 + }, + { + "epoch": 1.5940131430998725, + "grad_norm": 2.078125, + "learning_rate": 4.474073975108398e-06, + "loss": 0.9217, + "step": 8053 + }, + { + "epoch": 1.5942130381549686, + "grad_norm": 2.359375, + "learning_rate": 4.47302603122909e-06, + "loss": 1.0427, + "step": 8054 + }, + { + "epoch": 1.5944129332100647, + "grad_norm": 2.09375, + "learning_rate": 4.47197811075791e-06, + "loss": 0.8782, + "step": 8055 + }, + { + "epoch": 1.5946128282651608, + "grad_norm": 2.09375, + "learning_rate": 4.470930213741405e-06, + "loss": 0.9156, + "step": 8056 + }, + { + "epoch": 1.594812723320257, + "grad_norm": 2.15625, + "learning_rate": 4.469882340226124e-06, + "loss": 1.0549, + "step": 8057 + }, + { + "epoch": 1.5950126183753528, + "grad_norm": 2.1875, + "learning_rate": 4.468834490258612e-06, + "loss": 0.9348, + "step": 8058 + }, + { + "epoch": 1.595212513430449, + "grad_norm": 2.109375, + "learning_rate": 4.467786663885415e-06, + "loss": 0.959, + "step": 8059 + }, + { + "epoch": 1.595412408485545, + "grad_norm": 2.15625, + "learning_rate": 4.4667388611530785e-06, + "loss": 1.0165, + "step": 8060 + }, + { + "epoch": 1.595612303540641, + "grad_norm": 2.15625, + "learning_rate": 4.465691082108145e-06, + "loss": 0.9735, + "step": 8061 + }, + { + "epoch": 1.5958121985957372, + "grad_norm": 2.3125, + "learning_rate": 4.464643326797155e-06, + "loss": 1.0059, + "step": 8062 + }, + { + "epoch": 1.5960120936508333, + "grad_norm": 2.171875, + "learning_rate": 4.463595595266653e-06, + "loss": 1.0127, + "step": 8063 + }, + { + "epoch": 1.5962119887059294, + "grad_norm": 2.3125, + "learning_rate": 4.4625478875631785e-06, + "loss": 0.9852, + "step": 8064 + }, + { + "epoch": 1.5964118837610255, + "grad_norm": 2.25, + "learning_rate": 4.4615002037332675e-06, + "loss": 1.0747, + "step": 8065 + }, + { + "epoch": 1.5966117788161216, + "grad_norm": 2.1875, + "learning_rate": 4.460452543823459e-06, + "loss": 1.091, + "step": 8066 + }, + { + "epoch": 1.5968116738712177, + "grad_norm": 2.15625, + "learning_rate": 4.459404907880293e-06, + "loss": 0.9338, + "step": 8067 + }, + { + "epoch": 1.5970115689263138, + "grad_norm": 2.125, + "learning_rate": 4.458357295950302e-06, + "loss": 0.9632, + "step": 8068 + }, + { + "epoch": 1.59721146398141, + "grad_norm": 2.375, + "learning_rate": 4.457309708080022e-06, + "loss": 0.9838, + "step": 8069 + }, + { + "epoch": 1.597411359036506, + "grad_norm": 2.125, + "learning_rate": 4.456262144315987e-06, + "loss": 0.9952, + "step": 8070 + }, + { + "epoch": 1.5976112540916019, + "grad_norm": 2.09375, + "learning_rate": 4.455214604704729e-06, + "loss": 0.9566, + "step": 8071 + }, + { + "epoch": 1.597811149146698, + "grad_norm": 2.109375, + "learning_rate": 4.454167089292781e-06, + "loss": 0.9163, + "step": 8072 + }, + { + "epoch": 1.598011044201794, + "grad_norm": 2.109375, + "learning_rate": 4.453119598126672e-06, + "loss": 0.8876, + "step": 8073 + }, + { + "epoch": 1.5982109392568902, + "grad_norm": 2.203125, + "learning_rate": 4.452072131252932e-06, + "loss": 1.0028, + "step": 8074 + }, + { + "epoch": 1.598410834311986, + "grad_norm": 2.1875, + "learning_rate": 4.45102468871809e-06, + "loss": 0.947, + "step": 8075 + }, + { + "epoch": 1.5986107293670822, + "grad_norm": 2.109375, + "learning_rate": 4.449977270568672e-06, + "loss": 0.9892, + "step": 8076 + }, + { + "epoch": 1.5988106244221783, + "grad_norm": 1.984375, + "learning_rate": 4.4489298768512065e-06, + "loss": 0.9009, + "step": 8077 + }, + { + "epoch": 1.5990105194772744, + "grad_norm": 2.078125, + "learning_rate": 4.4478825076122155e-06, + "loss": 0.9217, + "step": 8078 + }, + { + "epoch": 1.5992104145323705, + "grad_norm": 2.0625, + "learning_rate": 4.446835162898224e-06, + "loss": 0.9853, + "step": 8079 + }, + { + "epoch": 1.5994103095874665, + "grad_norm": 2.3125, + "learning_rate": 4.445787842755756e-06, + "loss": 1.0672, + "step": 8080 + }, + { + "epoch": 1.5996102046425626, + "grad_norm": 2.125, + "learning_rate": 4.444740547231334e-06, + "loss": 0.9832, + "step": 8081 + }, + { + "epoch": 1.5998100996976587, + "grad_norm": 2.1875, + "learning_rate": 4.443693276371476e-06, + "loss": 0.9773, + "step": 8082 + }, + { + "epoch": 1.6000099947527548, + "grad_norm": 2.109375, + "learning_rate": 4.442646030222704e-06, + "loss": 0.9381, + "step": 8083 + }, + { + "epoch": 1.600209889807851, + "grad_norm": 2.171875, + "learning_rate": 4.441598808831536e-06, + "loss": 1.1051, + "step": 8084 + }, + { + "epoch": 1.600409784862947, + "grad_norm": 2.390625, + "learning_rate": 4.440551612244489e-06, + "loss": 0.9364, + "step": 8085 + }, + { + "epoch": 1.6006096799180431, + "grad_norm": 2.296875, + "learning_rate": 4.4395044405080805e-06, + "loss": 1.0837, + "step": 8086 + }, + { + "epoch": 1.6008095749731392, + "grad_norm": 2.046875, + "learning_rate": 4.438457293668823e-06, + "loss": 0.9065, + "step": 8087 + }, + { + "epoch": 1.6010094700282351, + "grad_norm": 2.1875, + "learning_rate": 4.437410171773234e-06, + "loss": 0.9178, + "step": 8088 + }, + { + "epoch": 1.6012093650833312, + "grad_norm": 2.171875, + "learning_rate": 4.436363074867824e-06, + "loss": 0.9614, + "step": 8089 + }, + { + "epoch": 1.6014092601384273, + "grad_norm": 2.125, + "learning_rate": 4.435316002999107e-06, + "loss": 0.9914, + "step": 8090 + }, + { + "epoch": 1.6016091551935234, + "grad_norm": 2.140625, + "learning_rate": 4.434268956213593e-06, + "loss": 0.9617, + "step": 8091 + }, + { + "epoch": 1.6018090502486195, + "grad_norm": 2.15625, + "learning_rate": 4.43322193455779e-06, + "loss": 1.0329, + "step": 8092 + }, + { + "epoch": 1.6020089453037154, + "grad_norm": 2.140625, + "learning_rate": 4.432174938078209e-06, + "loss": 0.9898, + "step": 8093 + }, + { + "epoch": 1.6022088403588115, + "grad_norm": 2.203125, + "learning_rate": 4.431127966821358e-06, + "loss": 1.0177, + "step": 8094 + }, + { + "epoch": 1.6024087354139076, + "grad_norm": 2.078125, + "learning_rate": 4.4300810208337425e-06, + "loss": 0.9509, + "step": 8095 + }, + { + "epoch": 1.6026086304690037, + "grad_norm": 2.265625, + "learning_rate": 4.4290341001618645e-06, + "loss": 0.9996, + "step": 8096 + }, + { + "epoch": 1.6028085255240998, + "grad_norm": 2.296875, + "learning_rate": 4.4279872048522316e-06, + "loss": 1.0083, + "step": 8097 + }, + { + "epoch": 1.603008420579196, + "grad_norm": 2.1875, + "learning_rate": 4.426940334951347e-06, + "loss": 0.8973, + "step": 8098 + }, + { + "epoch": 1.603208315634292, + "grad_norm": 2.125, + "learning_rate": 4.42589349050571e-06, + "loss": 0.9218, + "step": 8099 + }, + { + "epoch": 1.603408210689388, + "grad_norm": 2.328125, + "learning_rate": 4.424846671561824e-06, + "loss": 0.9154, + "step": 8100 + }, + { + "epoch": 1.6036081057444842, + "grad_norm": 2.03125, + "learning_rate": 4.423799878166187e-06, + "loss": 1.0048, + "step": 8101 + }, + { + "epoch": 1.6038080007995803, + "grad_norm": 2.0625, + "learning_rate": 4.422753110365297e-06, + "loss": 1.0094, + "step": 8102 + }, + { + "epoch": 1.6040078958546764, + "grad_norm": 2.1875, + "learning_rate": 4.4217063682056534e-06, + "loss": 1.0047, + "step": 8103 + }, + { + "epoch": 1.6042077909097725, + "grad_norm": 2.203125, + "learning_rate": 4.420659651733751e-06, + "loss": 0.9212, + "step": 8104 + }, + { + "epoch": 1.6044076859648686, + "grad_norm": 2.03125, + "learning_rate": 4.419612960996083e-06, + "loss": 0.935, + "step": 8105 + }, + { + "epoch": 1.6046075810199645, + "grad_norm": 2.140625, + "learning_rate": 4.418566296039148e-06, + "loss": 0.9701, + "step": 8106 + }, + { + "epoch": 1.6048074760750606, + "grad_norm": 2.265625, + "learning_rate": 4.417519656909435e-06, + "loss": 0.9098, + "step": 8107 + }, + { + "epoch": 1.6050073711301567, + "grad_norm": 2.125, + "learning_rate": 4.416473043653437e-06, + "loss": 0.9055, + "step": 8108 + }, + { + "epoch": 1.6052072661852528, + "grad_norm": 2.171875, + "learning_rate": 4.415426456317644e-06, + "loss": 0.9648, + "step": 8109 + }, + { + "epoch": 1.6054071612403487, + "grad_norm": 2.0, + "learning_rate": 4.4143798949485445e-06, + "loss": 0.929, + "step": 8110 + }, + { + "epoch": 1.6056070562954448, + "grad_norm": 2.125, + "learning_rate": 4.4133333595926285e-06, + "loss": 0.9921, + "step": 8111 + }, + { + "epoch": 1.6058069513505409, + "grad_norm": 2.296875, + "learning_rate": 4.4122868502963815e-06, + "loss": 0.8996, + "step": 8112 + }, + { + "epoch": 1.606006846405637, + "grad_norm": 2.140625, + "learning_rate": 4.411240367106289e-06, + "loss": 1.0227, + "step": 8113 + }, + { + "epoch": 1.606206741460733, + "grad_norm": 2.296875, + "learning_rate": 4.410193910068838e-06, + "loss": 1.0014, + "step": 8114 + }, + { + "epoch": 1.6064066365158292, + "grad_norm": 2.21875, + "learning_rate": 4.40914747923051e-06, + "loss": 0.9274, + "step": 8115 + }, + { + "epoch": 1.6066065315709253, + "grad_norm": 2.15625, + "learning_rate": 4.4081010746377875e-06, + "loss": 0.9208, + "step": 8116 + }, + { + "epoch": 1.6068064266260214, + "grad_norm": 2.25, + "learning_rate": 4.407054696337153e-06, + "loss": 1.0311, + "step": 8117 + }, + { + "epoch": 1.6070063216811175, + "grad_norm": 2.1875, + "learning_rate": 4.4060083443750844e-06, + "loss": 0.9027, + "step": 8118 + }, + { + "epoch": 1.6072062167362136, + "grad_norm": 2.171875, + "learning_rate": 4.404962018798064e-06, + "loss": 0.9151, + "step": 8119 + }, + { + "epoch": 1.6074061117913097, + "grad_norm": 2.078125, + "learning_rate": 4.403915719652565e-06, + "loss": 0.9553, + "step": 8120 + }, + { + "epoch": 1.6076060068464058, + "grad_norm": 2.15625, + "learning_rate": 4.402869446985066e-06, + "loss": 0.9868, + "step": 8121 + }, + { + "epoch": 1.6078059019015019, + "grad_norm": 2.1875, + "learning_rate": 4.401823200842043e-06, + "loss": 1.0076, + "step": 8122 + }, + { + "epoch": 1.6080057969565977, + "grad_norm": 2.15625, + "learning_rate": 4.400776981269969e-06, + "loss": 1.0536, + "step": 8123 + }, + { + "epoch": 1.6082056920116938, + "grad_norm": 2.21875, + "learning_rate": 4.399730788315317e-06, + "loss": 0.9728, + "step": 8124 + }, + { + "epoch": 1.60840558706679, + "grad_norm": 2.0625, + "learning_rate": 4.398684622024562e-06, + "loss": 0.9305, + "step": 8125 + }, + { + "epoch": 1.608605482121886, + "grad_norm": 2.21875, + "learning_rate": 4.397638482444171e-06, + "loss": 1.0283, + "step": 8126 + }, + { + "epoch": 1.6088053771769821, + "grad_norm": 2.0625, + "learning_rate": 4.396592369620613e-06, + "loss": 0.9209, + "step": 8127 + }, + { + "epoch": 1.609005272232078, + "grad_norm": 2.203125, + "learning_rate": 4.395546283600359e-06, + "loss": 1.0647, + "step": 8128 + }, + { + "epoch": 1.609205167287174, + "grad_norm": 2.34375, + "learning_rate": 4.394500224429873e-06, + "loss": 0.9947, + "step": 8129 + }, + { + "epoch": 1.6094050623422702, + "grad_norm": 2.203125, + "learning_rate": 4.393454192155621e-06, + "loss": 0.9625, + "step": 8130 + }, + { + "epoch": 1.6096049573973663, + "grad_norm": 2.140625, + "learning_rate": 4.392408186824072e-06, + "loss": 0.9379, + "step": 8131 + }, + { + "epoch": 1.6098048524524624, + "grad_norm": 2.171875, + "learning_rate": 4.391362208481685e-06, + "loss": 1.0019, + "step": 8132 + }, + { + "epoch": 1.6100047475075585, + "grad_norm": 2.046875, + "learning_rate": 4.3903162571749234e-06, + "loss": 0.8508, + "step": 8133 + }, + { + "epoch": 1.6102046425626546, + "grad_norm": 2.09375, + "learning_rate": 4.38927033295025e-06, + "loss": 0.9127, + "step": 8134 + }, + { + "epoch": 1.6104045376177507, + "grad_norm": 2.140625, + "learning_rate": 4.388224435854121e-06, + "loss": 0.9434, + "step": 8135 + }, + { + "epoch": 1.6106044326728468, + "grad_norm": 2.046875, + "learning_rate": 4.3871785659329985e-06, + "loss": 0.9151, + "step": 8136 + }, + { + "epoch": 1.610804327727943, + "grad_norm": 2.125, + "learning_rate": 4.386132723233339e-06, + "loss": 0.994, + "step": 8137 + }, + { + "epoch": 1.611004222783039, + "grad_norm": 2.03125, + "learning_rate": 4.385086907801598e-06, + "loss": 0.932, + "step": 8138 + }, + { + "epoch": 1.611204117838135, + "grad_norm": 2.171875, + "learning_rate": 4.384041119684231e-06, + "loss": 1.012, + "step": 8139 + }, + { + "epoch": 1.6114040128932312, + "grad_norm": 2.15625, + "learning_rate": 4.382995358927691e-06, + "loss": 0.9277, + "step": 8140 + }, + { + "epoch": 1.611603907948327, + "grad_norm": 2.0625, + "learning_rate": 4.3819496255784314e-06, + "loss": 0.9755, + "step": 8141 + }, + { + "epoch": 1.6118038030034232, + "grad_norm": 2.328125, + "learning_rate": 4.380903919682904e-06, + "loss": 1.0356, + "step": 8142 + }, + { + "epoch": 1.6120036980585193, + "grad_norm": 2.15625, + "learning_rate": 4.379858241287558e-06, + "loss": 0.979, + "step": 8143 + }, + { + "epoch": 1.6122035931136154, + "grad_norm": 2.171875, + "learning_rate": 4.378812590438843e-06, + "loss": 0.9252, + "step": 8144 + }, + { + "epoch": 1.6124034881687113, + "grad_norm": 2.171875, + "learning_rate": 4.377766967183206e-06, + "loss": 0.9043, + "step": 8145 + }, + { + "epoch": 1.6126033832238074, + "grad_norm": 2.234375, + "learning_rate": 4.376721371567094e-06, + "loss": 0.9587, + "step": 8146 + }, + { + "epoch": 1.6128032782789035, + "grad_norm": 2.15625, + "learning_rate": 4.375675803636953e-06, + "loss": 1.0145, + "step": 8147 + }, + { + "epoch": 1.6130031733339996, + "grad_norm": 2.078125, + "learning_rate": 4.374630263439225e-06, + "loss": 0.9653, + "step": 8148 + }, + { + "epoch": 1.6132030683890957, + "grad_norm": 2.25, + "learning_rate": 4.3735847510203536e-06, + "loss": 0.9276, + "step": 8149 + }, + { + "epoch": 1.6134029634441918, + "grad_norm": 2.125, + "learning_rate": 4.372539266426783e-06, + "loss": 0.9764, + "step": 8150 + }, + { + "epoch": 1.6136028584992879, + "grad_norm": 2.265625, + "learning_rate": 4.371493809704948e-06, + "loss": 1.016, + "step": 8151 + }, + { + "epoch": 1.613802753554384, + "grad_norm": 2.125, + "learning_rate": 4.3704483809012925e-06, + "loss": 1.0168, + "step": 8152 + }, + { + "epoch": 1.61400264860948, + "grad_norm": 2.203125, + "learning_rate": 4.369402980062253e-06, + "loss": 0.987, + "step": 8153 + }, + { + "epoch": 1.6142025436645762, + "grad_norm": 2.140625, + "learning_rate": 4.368357607234265e-06, + "loss": 0.9511, + "step": 8154 + }, + { + "epoch": 1.6144024387196723, + "grad_norm": 2.25, + "learning_rate": 4.367312262463764e-06, + "loss": 0.9389, + "step": 8155 + }, + { + "epoch": 1.6146023337747684, + "grad_norm": 2.125, + "learning_rate": 4.366266945797187e-06, + "loss": 1.0277, + "step": 8156 + }, + { + "epoch": 1.6148022288298645, + "grad_norm": 2.28125, + "learning_rate": 4.3652216572809645e-06, + "loss": 1.0851, + "step": 8157 + }, + { + "epoch": 1.6150021238849606, + "grad_norm": 2.078125, + "learning_rate": 4.3641763969615255e-06, + "loss": 0.9183, + "step": 8158 + }, + { + "epoch": 1.6152020189400564, + "grad_norm": 2.15625, + "learning_rate": 4.363131164885306e-06, + "loss": 0.9552, + "step": 8159 + }, + { + "epoch": 1.6154019139951525, + "grad_norm": 2.171875, + "learning_rate": 4.36208596109873e-06, + "loss": 1.0006, + "step": 8160 + }, + { + "epoch": 1.6156018090502486, + "grad_norm": 2.3125, + "learning_rate": 4.361040785648227e-06, + "loss": 1.0061, + "step": 8161 + }, + { + "epoch": 1.6158017041053447, + "grad_norm": 2.015625, + "learning_rate": 4.359995638580226e-06, + "loss": 0.962, + "step": 8162 + }, + { + "epoch": 1.6160015991604406, + "grad_norm": 2.25, + "learning_rate": 4.358950519941149e-06, + "loss": 1.0112, + "step": 8163 + }, + { + "epoch": 1.6162014942155367, + "grad_norm": 2.171875, + "learning_rate": 4.357905429777422e-06, + "loss": 0.9687, + "step": 8164 + }, + { + "epoch": 1.6164013892706328, + "grad_norm": 2.265625, + "learning_rate": 4.356860368135468e-06, + "loss": 1.0494, + "step": 8165 + }, + { + "epoch": 1.616601284325729, + "grad_norm": 2.28125, + "learning_rate": 4.3558153350617065e-06, + "loss": 0.9395, + "step": 8166 + }, + { + "epoch": 1.616801179380825, + "grad_norm": 2.0, + "learning_rate": 4.354770330602559e-06, + "loss": 0.9022, + "step": 8167 + }, + { + "epoch": 1.6170010744359211, + "grad_norm": 2.28125, + "learning_rate": 4.353725354804445e-06, + "loss": 0.9848, + "step": 8168 + }, + { + "epoch": 1.6172009694910172, + "grad_norm": 2.203125, + "learning_rate": 4.3526804077137816e-06, + "loss": 1.0458, + "step": 8169 + }, + { + "epoch": 1.6174008645461133, + "grad_norm": 2.140625, + "learning_rate": 4.351635489376986e-06, + "loss": 0.9763, + "step": 8170 + }, + { + "epoch": 1.6176007596012094, + "grad_norm": 2.125, + "learning_rate": 4.350590599840472e-06, + "loss": 0.9329, + "step": 8171 + }, + { + "epoch": 1.6178006546563055, + "grad_norm": 2.15625, + "learning_rate": 4.349545739150654e-06, + "loss": 0.8845, + "step": 8172 + }, + { + "epoch": 1.6180005497114016, + "grad_norm": 2.125, + "learning_rate": 4.3485009073539445e-06, + "loss": 0.9565, + "step": 8173 + }, + { + "epoch": 1.6182004447664977, + "grad_norm": 2.125, + "learning_rate": 4.3474561044967555e-06, + "loss": 1.0195, + "step": 8174 + }, + { + "epoch": 1.6184003398215938, + "grad_norm": 2.234375, + "learning_rate": 4.346411330625496e-06, + "loss": 0.9583, + "step": 8175 + }, + { + "epoch": 1.6186002348766897, + "grad_norm": 2.296875, + "learning_rate": 4.345366585786577e-06, + "loss": 1.0182, + "step": 8176 + }, + { + "epoch": 1.6188001299317858, + "grad_norm": 2.1875, + "learning_rate": 4.344321870026404e-06, + "loss": 1.0077, + "step": 8177 + }, + { + "epoch": 1.619000024986882, + "grad_norm": 2.125, + "learning_rate": 4.343277183391384e-06, + "loss": 0.9333, + "step": 8178 + }, + { + "epoch": 1.619199920041978, + "grad_norm": 2.0625, + "learning_rate": 4.342232525927919e-06, + "loss": 0.8814, + "step": 8179 + }, + { + "epoch": 1.619399815097074, + "grad_norm": 2.328125, + "learning_rate": 4.341187897682416e-06, + "loss": 0.9667, + "step": 8180 + }, + { + "epoch": 1.61959971015217, + "grad_norm": 2.203125, + "learning_rate": 4.3401432987012775e-06, + "loss": 1.0133, + "step": 8181 + }, + { + "epoch": 1.619799605207266, + "grad_norm": 2.296875, + "learning_rate": 4.339098729030902e-06, + "loss": 0.913, + "step": 8182 + }, + { + "epoch": 1.6199995002623622, + "grad_norm": 2.078125, + "learning_rate": 4.3380541887176904e-06, + "loss": 1.0063, + "step": 8183 + }, + { + "epoch": 1.6201993953174583, + "grad_norm": 2.171875, + "learning_rate": 4.337009677808042e-06, + "loss": 0.9996, + "step": 8184 + }, + { + "epoch": 1.6203992903725544, + "grad_norm": 2.25, + "learning_rate": 4.335965196348352e-06, + "loss": 0.9258, + "step": 8185 + }, + { + "epoch": 1.6205991854276505, + "grad_norm": 2.03125, + "learning_rate": 4.334920744385017e-06, + "loss": 0.9185, + "step": 8186 + }, + { + "epoch": 1.6207990804827466, + "grad_norm": 2.234375, + "learning_rate": 4.3338763219644335e-06, + "loss": 1.129, + "step": 8187 + }, + { + "epoch": 1.6209989755378427, + "grad_norm": 2.078125, + "learning_rate": 4.332831929132991e-06, + "loss": 0.9931, + "step": 8188 + }, + { + "epoch": 1.6211988705929388, + "grad_norm": 2.015625, + "learning_rate": 4.331787565937082e-06, + "loss": 0.8996, + "step": 8189 + }, + { + "epoch": 1.6213987656480349, + "grad_norm": 2.078125, + "learning_rate": 4.330743232423101e-06, + "loss": 0.956, + "step": 8190 + }, + { + "epoch": 1.621598660703131, + "grad_norm": 2.125, + "learning_rate": 4.32969892863743e-06, + "loss": 0.9862, + "step": 8191 + }, + { + "epoch": 1.621798555758227, + "grad_norm": 2.21875, + "learning_rate": 4.328654654626463e-06, + "loss": 1.0203, + "step": 8192 + }, + { + "epoch": 1.6219984508133232, + "grad_norm": 2.078125, + "learning_rate": 4.3276104104365855e-06, + "loss": 0.9044, + "step": 8193 + }, + { + "epoch": 1.622198345868419, + "grad_norm": 2.03125, + "learning_rate": 4.32656619611418e-06, + "loss": 0.9614, + "step": 8194 + }, + { + "epoch": 1.6223982409235151, + "grad_norm": 2.15625, + "learning_rate": 4.3255220117056325e-06, + "loss": 0.9691, + "step": 8195 + }, + { + "epoch": 1.6225981359786112, + "grad_norm": 2.125, + "learning_rate": 4.324477857257326e-06, + "loss": 0.9605, + "step": 8196 + }, + { + "epoch": 1.6227980310337073, + "grad_norm": 2.21875, + "learning_rate": 4.323433732815641e-06, + "loss": 1.0356, + "step": 8197 + }, + { + "epoch": 1.6229979260888032, + "grad_norm": 2.109375, + "learning_rate": 4.322389638426957e-06, + "loss": 0.8856, + "step": 8198 + }, + { + "epoch": 1.6231978211438993, + "grad_norm": 2.15625, + "learning_rate": 4.321345574137652e-06, + "loss": 0.9635, + "step": 8199 + }, + { + "epoch": 1.6233977161989954, + "grad_norm": 2.15625, + "learning_rate": 4.320301539994105e-06, + "loss": 0.9722, + "step": 8200 + }, + { + "epoch": 1.6235976112540915, + "grad_norm": 2.15625, + "learning_rate": 4.319257536042692e-06, + "loss": 0.9877, + "step": 8201 + }, + { + "epoch": 1.6237975063091876, + "grad_norm": 2.21875, + "learning_rate": 4.318213562329784e-06, + "loss": 0.8835, + "step": 8202 + }, + { + "epoch": 1.6239974013642837, + "grad_norm": 2.296875, + "learning_rate": 4.317169618901758e-06, + "loss": 0.9898, + "step": 8203 + }, + { + "epoch": 1.6241972964193798, + "grad_norm": 2.125, + "learning_rate": 4.316125705804986e-06, + "loss": 0.8772, + "step": 8204 + }, + { + "epoch": 1.624397191474476, + "grad_norm": 2.21875, + "learning_rate": 4.315081823085835e-06, + "loss": 1.0199, + "step": 8205 + }, + { + "epoch": 1.624597086529572, + "grad_norm": 2.1875, + "learning_rate": 4.3140379707906765e-06, + "loss": 1.0246, + "step": 8206 + }, + { + "epoch": 1.6247969815846681, + "grad_norm": 2.1875, + "learning_rate": 4.312994148965879e-06, + "loss": 0.912, + "step": 8207 + }, + { + "epoch": 1.6249968766397642, + "grad_norm": 2.203125, + "learning_rate": 4.311950357657807e-06, + "loss": 0.9617, + "step": 8208 + }, + { + "epoch": 1.6251967716948603, + "grad_norm": 2.1875, + "learning_rate": 4.310906596912828e-06, + "loss": 0.9523, + "step": 8209 + }, + { + "epoch": 1.6253966667499564, + "grad_norm": 2.21875, + "learning_rate": 4.309862866777303e-06, + "loss": 0.9298, + "step": 8210 + }, + { + "epoch": 1.6255965618050523, + "grad_norm": 2.109375, + "learning_rate": 4.3088191672975965e-06, + "loss": 0.9174, + "step": 8211 + }, + { + "epoch": 1.6257964568601484, + "grad_norm": 2.046875, + "learning_rate": 4.30777549852007e-06, + "loss": 1.0472, + "step": 8212 + }, + { + "epoch": 1.6259963519152445, + "grad_norm": 2.234375, + "learning_rate": 4.30673186049108e-06, + "loss": 1.0767, + "step": 8213 + }, + { + "epoch": 1.6261962469703406, + "grad_norm": 2.09375, + "learning_rate": 4.305688253256986e-06, + "loss": 0.9619, + "step": 8214 + }, + { + "epoch": 1.6263961420254367, + "grad_norm": 2.1875, + "learning_rate": 4.304644676864149e-06, + "loss": 1.0446, + "step": 8215 + }, + { + "epoch": 1.6265960370805326, + "grad_norm": 2.15625, + "learning_rate": 4.303601131358918e-06, + "loss": 1.0276, + "step": 8216 + }, + { + "epoch": 1.6267959321356287, + "grad_norm": 2.21875, + "learning_rate": 4.302557616787652e-06, + "loss": 0.953, + "step": 8217 + }, + { + "epoch": 1.6269958271907248, + "grad_norm": 2.1875, + "learning_rate": 4.3015141331967045e-06, + "loss": 0.9412, + "step": 8218 + }, + { + "epoch": 1.6271957222458209, + "grad_norm": 2.046875, + "learning_rate": 4.300470680632421e-06, + "loss": 0.9793, + "step": 8219 + }, + { + "epoch": 1.627395617300917, + "grad_norm": 2.0, + "learning_rate": 4.299427259141155e-06, + "loss": 0.9653, + "step": 8220 + }, + { + "epoch": 1.627595512356013, + "grad_norm": 2.1875, + "learning_rate": 4.298383868769257e-06, + "loss": 0.9435, + "step": 8221 + }, + { + "epoch": 1.6277954074111092, + "grad_norm": 2.140625, + "learning_rate": 4.297340509563072e-06, + "loss": 0.9783, + "step": 8222 + }, + { + "epoch": 1.6279953024662053, + "grad_norm": 2.1875, + "learning_rate": 4.296297181568946e-06, + "loss": 1.0285, + "step": 8223 + }, + { + "epoch": 1.6281951975213014, + "grad_norm": 2.1875, + "learning_rate": 4.295253884833225e-06, + "loss": 0.9742, + "step": 8224 + }, + { + "epoch": 1.6283950925763975, + "grad_norm": 2.203125, + "learning_rate": 4.29421061940225e-06, + "loss": 0.9601, + "step": 8225 + }, + { + "epoch": 1.6285949876314936, + "grad_norm": 2.140625, + "learning_rate": 4.293167385322364e-06, + "loss": 0.9869, + "step": 8226 + }, + { + "epoch": 1.6287948826865897, + "grad_norm": 2.125, + "learning_rate": 4.292124182639909e-06, + "loss": 0.9445, + "step": 8227 + }, + { + "epoch": 1.6289947777416858, + "grad_norm": 2.21875, + "learning_rate": 4.29108101140122e-06, + "loss": 0.9883, + "step": 8228 + }, + { + "epoch": 1.6291946727967817, + "grad_norm": 2.03125, + "learning_rate": 4.290037871652639e-06, + "loss": 0.9173, + "step": 8229 + }, + { + "epoch": 1.6293945678518778, + "grad_norm": 2.203125, + "learning_rate": 4.288994763440498e-06, + "loss": 1.0868, + "step": 8230 + }, + { + "epoch": 1.6295944629069739, + "grad_norm": 2.140625, + "learning_rate": 4.2879516868111346e-06, + "loss": 1.0022, + "step": 8231 + }, + { + "epoch": 1.62979435796207, + "grad_norm": 2.0625, + "learning_rate": 4.2869086418108815e-06, + "loss": 0.9782, + "step": 8232 + }, + { + "epoch": 1.6299942530171658, + "grad_norm": 2.15625, + "learning_rate": 4.285865628486069e-06, + "loss": 0.9706, + "step": 8233 + }, + { + "epoch": 1.630194148072262, + "grad_norm": 2.140625, + "learning_rate": 4.2848226468830295e-06, + "loss": 0.9746, + "step": 8234 + }, + { + "epoch": 1.630394043127358, + "grad_norm": 2.0625, + "learning_rate": 4.283779697048093e-06, + "loss": 0.9263, + "step": 8235 + }, + { + "epoch": 1.6305939381824541, + "grad_norm": 2.328125, + "learning_rate": 4.282736779027584e-06, + "loss": 1.057, + "step": 8236 + }, + { + "epoch": 1.6307938332375502, + "grad_norm": 2.078125, + "learning_rate": 4.281693892867832e-06, + "loss": 0.8657, + "step": 8237 + }, + { + "epoch": 1.6309937282926463, + "grad_norm": 2.09375, + "learning_rate": 4.28065103861516e-06, + "loss": 0.9345, + "step": 8238 + }, + { + "epoch": 1.6311936233477424, + "grad_norm": 2.234375, + "learning_rate": 4.2796082163158914e-06, + "loss": 0.9327, + "step": 8239 + }, + { + "epoch": 1.6313935184028385, + "grad_norm": 2.1875, + "learning_rate": 4.278565426016351e-06, + "loss": 0.8978, + "step": 8240 + }, + { + "epoch": 1.6315934134579346, + "grad_norm": 2.28125, + "learning_rate": 4.277522667762855e-06, + "loss": 0.9943, + "step": 8241 + }, + { + "epoch": 1.6317933085130307, + "grad_norm": 2.09375, + "learning_rate": 4.276479941601726e-06, + "loss": 0.8993, + "step": 8242 + }, + { + "epoch": 1.6319932035681268, + "grad_norm": 2.109375, + "learning_rate": 4.275437247579281e-06, + "loss": 1.0119, + "step": 8243 + }, + { + "epoch": 1.632193098623223, + "grad_norm": 2.15625, + "learning_rate": 4.274394585741835e-06, + "loss": 1.0501, + "step": 8244 + }, + { + "epoch": 1.632392993678319, + "grad_norm": 2.25, + "learning_rate": 4.273351956135704e-06, + "loss": 1.005, + "step": 8245 + }, + { + "epoch": 1.632592888733415, + "grad_norm": 2.234375, + "learning_rate": 4.272309358807203e-06, + "loss": 1.0611, + "step": 8246 + }, + { + "epoch": 1.632792783788511, + "grad_norm": 2.328125, + "learning_rate": 4.271266793802641e-06, + "loss": 1.0112, + "step": 8247 + }, + { + "epoch": 1.632992678843607, + "grad_norm": 2.140625, + "learning_rate": 4.270224261168332e-06, + "loss": 1.0161, + "step": 8248 + }, + { + "epoch": 1.6331925738987032, + "grad_norm": 2.1875, + "learning_rate": 4.269181760950584e-06, + "loss": 0.9708, + "step": 8249 + }, + { + "epoch": 1.6333924689537993, + "grad_norm": 2.171875, + "learning_rate": 4.268139293195702e-06, + "loss": 0.9985, + "step": 8250 + }, + { + "epoch": 1.6335923640088952, + "grad_norm": 2.171875, + "learning_rate": 4.267096857949994e-06, + "loss": 1.0267, + "step": 8251 + }, + { + "epoch": 1.6337922590639913, + "grad_norm": 2.140625, + "learning_rate": 4.266054455259767e-06, + "loss": 0.9011, + "step": 8252 + }, + { + "epoch": 1.6339921541190874, + "grad_norm": 2.171875, + "learning_rate": 4.265012085171322e-06, + "loss": 0.9273, + "step": 8253 + }, + { + "epoch": 1.6341920491741835, + "grad_norm": 2.046875, + "learning_rate": 4.263969747730961e-06, + "loss": 0.8469, + "step": 8254 + }, + { + "epoch": 1.6343919442292796, + "grad_norm": 2.234375, + "learning_rate": 4.262927442984986e-06, + "loss": 0.91, + "step": 8255 + }, + { + "epoch": 1.6345918392843757, + "grad_norm": 2.15625, + "learning_rate": 4.261885170979695e-06, + "loss": 1.0107, + "step": 8256 + }, + { + "epoch": 1.6347917343394718, + "grad_norm": 2.15625, + "learning_rate": 4.260842931761385e-06, + "loss": 0.9458, + "step": 8257 + }, + { + "epoch": 1.6349916293945679, + "grad_norm": 2.109375, + "learning_rate": 4.259800725376355e-06, + "loss": 0.9503, + "step": 8258 + }, + { + "epoch": 1.635191524449664, + "grad_norm": 2.09375, + "learning_rate": 4.258758551870896e-06, + "loss": 0.9429, + "step": 8259 + }, + { + "epoch": 1.63539141950476, + "grad_norm": 2.21875, + "learning_rate": 4.257716411291304e-06, + "loss": 0.9112, + "step": 8260 + }, + { + "epoch": 1.6355913145598562, + "grad_norm": 2.328125, + "learning_rate": 4.256674303683869e-06, + "loss": 1.0088, + "step": 8261 + }, + { + "epoch": 1.6357912096149523, + "grad_norm": 2.109375, + "learning_rate": 4.255632229094882e-06, + "loss": 0.9294, + "step": 8262 + }, + { + "epoch": 1.6359911046700484, + "grad_norm": 2.203125, + "learning_rate": 4.254590187570633e-06, + "loss": 0.9696, + "step": 8263 + }, + { + "epoch": 1.6361909997251443, + "grad_norm": 2.296875, + "learning_rate": 4.253548179157407e-06, + "loss": 0.995, + "step": 8264 + }, + { + "epoch": 1.6363908947802404, + "grad_norm": 2.171875, + "learning_rate": 4.252506203901491e-06, + "loss": 0.9733, + "step": 8265 + }, + { + "epoch": 1.6365907898353365, + "grad_norm": 2.21875, + "learning_rate": 4.251464261849171e-06, + "loss": 0.9563, + "step": 8266 + }, + { + "epoch": 1.6367906848904326, + "grad_norm": 2.234375, + "learning_rate": 4.2504223530467275e-06, + "loss": 1.0219, + "step": 8267 + }, + { + "epoch": 1.6369905799455284, + "grad_norm": 2.234375, + "learning_rate": 4.249380477540444e-06, + "loss": 0.9858, + "step": 8268 + }, + { + "epoch": 1.6371904750006245, + "grad_norm": 2.109375, + "learning_rate": 4.248338635376599e-06, + "loss": 0.9386, + "step": 8269 + }, + { + "epoch": 1.6373903700557206, + "grad_norm": 2.25, + "learning_rate": 4.247296826601471e-06, + "loss": 1.0425, + "step": 8270 + }, + { + "epoch": 1.6375902651108167, + "grad_norm": 2.34375, + "learning_rate": 4.246255051261338e-06, + "loss": 0.9948, + "step": 8271 + }, + { + "epoch": 1.6377901601659128, + "grad_norm": 2.234375, + "learning_rate": 4.245213309402475e-06, + "loss": 0.9659, + "step": 8272 + }, + { + "epoch": 1.637990055221009, + "grad_norm": 2.46875, + "learning_rate": 4.244171601071156e-06, + "loss": 1.1023, + "step": 8273 + }, + { + "epoch": 1.638189950276105, + "grad_norm": 2.15625, + "learning_rate": 4.243129926313654e-06, + "loss": 0.9661, + "step": 8274 + }, + { + "epoch": 1.6383898453312011, + "grad_norm": 2.125, + "learning_rate": 4.242088285176239e-06, + "loss": 0.9441, + "step": 8275 + }, + { + "epoch": 1.6385897403862972, + "grad_norm": 2.171875, + "learning_rate": 4.241046677705183e-06, + "loss": 1.0038, + "step": 8276 + }, + { + "epoch": 1.6387896354413933, + "grad_norm": 2.21875, + "learning_rate": 4.240005103946751e-06, + "loss": 0.9978, + "step": 8277 + }, + { + "epoch": 1.6389895304964894, + "grad_norm": 2.109375, + "learning_rate": 4.238963563947212e-06, + "loss": 0.9425, + "step": 8278 + }, + { + "epoch": 1.6391894255515855, + "grad_norm": 2.25, + "learning_rate": 4.237922057752831e-06, + "loss": 1.0408, + "step": 8279 + }, + { + "epoch": 1.6393893206066816, + "grad_norm": 2.125, + "learning_rate": 4.236880585409872e-06, + "loss": 0.9364, + "step": 8280 + }, + { + "epoch": 1.6395892156617777, + "grad_norm": 2.296875, + "learning_rate": 4.235839146964592e-06, + "loss": 0.9781, + "step": 8281 + }, + { + "epoch": 1.6397891107168736, + "grad_norm": 2.25, + "learning_rate": 4.234797742463258e-06, + "loss": 1.0187, + "step": 8282 + }, + { + "epoch": 1.6399890057719697, + "grad_norm": 2.15625, + "learning_rate": 4.2337563719521254e-06, + "loss": 0.9255, + "step": 8283 + }, + { + "epoch": 1.6401889008270658, + "grad_norm": 2.1875, + "learning_rate": 4.2327150354774536e-06, + "loss": 0.9822, + "step": 8284 + }, + { + "epoch": 1.640388795882162, + "grad_norm": 2.078125, + "learning_rate": 4.231673733085497e-06, + "loss": 0.9543, + "step": 8285 + }, + { + "epoch": 1.6405886909372578, + "grad_norm": 2.09375, + "learning_rate": 4.230632464822513e-06, + "loss": 1.0042, + "step": 8286 + }, + { + "epoch": 1.6407885859923539, + "grad_norm": 2.09375, + "learning_rate": 4.229591230734751e-06, + "loss": 0.8565, + "step": 8287 + }, + { + "epoch": 1.64098848104745, + "grad_norm": 2.171875, + "learning_rate": 4.228550030868465e-06, + "loss": 0.9459, + "step": 8288 + }, + { + "epoch": 1.641188376102546, + "grad_norm": 2.1875, + "learning_rate": 4.227508865269904e-06, + "loss": 1.0064, + "step": 8289 + }, + { + "epoch": 1.6413882711576422, + "grad_norm": 2.25, + "learning_rate": 4.226467733985316e-06, + "loss": 0.999, + "step": 8290 + }, + { + "epoch": 1.6415881662127383, + "grad_norm": 2.0625, + "learning_rate": 4.22542663706095e-06, + "loss": 0.8225, + "step": 8291 + }, + { + "epoch": 1.6417880612678344, + "grad_norm": 2.109375, + "learning_rate": 4.22438557454305e-06, + "loss": 0.9228, + "step": 8292 + }, + { + "epoch": 1.6419879563229305, + "grad_norm": 2.078125, + "learning_rate": 4.223344546477858e-06, + "loss": 0.9381, + "step": 8293 + }, + { + "epoch": 1.6421878513780266, + "grad_norm": 2.109375, + "learning_rate": 4.2223035529116205e-06, + "loss": 1.005, + "step": 8294 + }, + { + "epoch": 1.6423877464331227, + "grad_norm": 2.078125, + "learning_rate": 4.2212625938905745e-06, + "loss": 0.8917, + "step": 8295 + }, + { + "epoch": 1.6425876414882188, + "grad_norm": 2.21875, + "learning_rate": 4.220221669460962e-06, + "loss": 1.0419, + "step": 8296 + }, + { + "epoch": 1.6427875365433149, + "grad_norm": 2.078125, + "learning_rate": 4.21918077966902e-06, + "loss": 0.9797, + "step": 8297 + }, + { + "epoch": 1.642987431598411, + "grad_norm": 2.109375, + "learning_rate": 4.218139924560983e-06, + "loss": 0.9871, + "step": 8298 + }, + { + "epoch": 1.6431873266535069, + "grad_norm": 2.1875, + "learning_rate": 4.217099104183089e-06, + "loss": 0.9627, + "step": 8299 + }, + { + "epoch": 1.643387221708603, + "grad_norm": 2.15625, + "learning_rate": 4.216058318581567e-06, + "loss": 0.9994, + "step": 8300 + }, + { + "epoch": 1.643587116763699, + "grad_norm": 2.203125, + "learning_rate": 4.215017567802651e-06, + "loss": 0.9186, + "step": 8301 + }, + { + "epoch": 1.6437870118187952, + "grad_norm": 2.25, + "learning_rate": 4.213976851892573e-06, + "loss": 0.9584, + "step": 8302 + }, + { + "epoch": 1.643986906873891, + "grad_norm": 2.234375, + "learning_rate": 4.212936170897557e-06, + "loss": 1.0087, + "step": 8303 + }, + { + "epoch": 1.6441868019289871, + "grad_norm": 2.21875, + "learning_rate": 4.211895524863832e-06, + "loss": 1.0012, + "step": 8304 + }, + { + "epoch": 1.6443866969840832, + "grad_norm": 2.171875, + "learning_rate": 4.210854913837625e-06, + "loss": 0.9215, + "step": 8305 + }, + { + "epoch": 1.6445865920391793, + "grad_norm": 2.140625, + "learning_rate": 4.209814337865158e-06, + "loss": 0.9373, + "step": 8306 + }, + { + "epoch": 1.6447864870942754, + "grad_norm": 2.125, + "learning_rate": 4.2087737969926545e-06, + "loss": 0.8611, + "step": 8307 + }, + { + "epoch": 1.6449863821493715, + "grad_norm": 2.03125, + "learning_rate": 4.2077332912663335e-06, + "loss": 0.8602, + "step": 8308 + }, + { + "epoch": 1.6451862772044676, + "grad_norm": 2.1875, + "learning_rate": 4.206692820732415e-06, + "loss": 1.0522, + "step": 8309 + }, + { + "epoch": 1.6453861722595637, + "grad_norm": 2.1875, + "learning_rate": 4.205652385437118e-06, + "loss": 0.9538, + "step": 8310 + }, + { + "epoch": 1.6455860673146598, + "grad_norm": 2.234375, + "learning_rate": 4.204611985426657e-06, + "loss": 1.025, + "step": 8311 + }, + { + "epoch": 1.645785962369756, + "grad_norm": 2.265625, + "learning_rate": 4.203571620747246e-06, + "loss": 1.0084, + "step": 8312 + }, + { + "epoch": 1.645985857424852, + "grad_norm": 2.09375, + "learning_rate": 4.202531291445098e-06, + "loss": 0.9268, + "step": 8313 + }, + { + "epoch": 1.6461857524799481, + "grad_norm": 2.15625, + "learning_rate": 4.201490997566426e-06, + "loss": 0.9529, + "step": 8314 + }, + { + "epoch": 1.6463856475350442, + "grad_norm": 2.0625, + "learning_rate": 4.200450739157437e-06, + "loss": 1.0054, + "step": 8315 + }, + { + "epoch": 1.6465855425901403, + "grad_norm": 2.0625, + "learning_rate": 4.199410516264342e-06, + "loss": 0.9759, + "step": 8316 + }, + { + "epoch": 1.6467854376452362, + "grad_norm": 2.359375, + "learning_rate": 4.198370328933346e-06, + "loss": 1.0142, + "step": 8317 + }, + { + "epoch": 1.6469853327003323, + "grad_norm": 2.109375, + "learning_rate": 4.197330177210654e-06, + "loss": 0.9481, + "step": 8318 + }, + { + "epoch": 1.6471852277554284, + "grad_norm": 2.140625, + "learning_rate": 4.19629006114247e-06, + "loss": 0.9881, + "step": 8319 + }, + { + "epoch": 1.6473851228105245, + "grad_norm": 2.234375, + "learning_rate": 4.195249980774997e-06, + "loss": 0.906, + "step": 8320 + }, + { + "epoch": 1.6475850178656204, + "grad_norm": 2.21875, + "learning_rate": 4.194209936154431e-06, + "loss": 1.0061, + "step": 8321 + }, + { + "epoch": 1.6477849129207165, + "grad_norm": 2.25, + "learning_rate": 4.1931699273269765e-06, + "loss": 0.9856, + "step": 8322 + }, + { + "epoch": 1.6479848079758126, + "grad_norm": 2.296875, + "learning_rate": 4.192129954338826e-06, + "loss": 1.0957, + "step": 8323 + }, + { + "epoch": 1.6481847030309087, + "grad_norm": 2.0625, + "learning_rate": 4.191090017236177e-06, + "loss": 0.9819, + "step": 8324 + }, + { + "epoch": 1.6483845980860048, + "grad_norm": 2.375, + "learning_rate": 4.190050116065224e-06, + "loss": 1.0189, + "step": 8325 + }, + { + "epoch": 1.648584493141101, + "grad_norm": 2.140625, + "learning_rate": 4.1890102508721565e-06, + "loss": 0.9359, + "step": 8326 + }, + { + "epoch": 1.648784388196197, + "grad_norm": 2.234375, + "learning_rate": 4.187970421703168e-06, + "loss": 0.9377, + "step": 8327 + }, + { + "epoch": 1.648984283251293, + "grad_norm": 2.171875, + "learning_rate": 4.186930628604447e-06, + "loss": 1.0538, + "step": 8328 + }, + { + "epoch": 1.6491841783063892, + "grad_norm": 2.125, + "learning_rate": 4.18589087162218e-06, + "loss": 0.9485, + "step": 8329 + }, + { + "epoch": 1.6493840733614853, + "grad_norm": 2.046875, + "learning_rate": 4.184851150802554e-06, + "loss": 0.9153, + "step": 8330 + }, + { + "epoch": 1.6495839684165814, + "grad_norm": 2.28125, + "learning_rate": 4.1838114661917525e-06, + "loss": 1.0567, + "step": 8331 + }, + { + "epoch": 1.6497838634716775, + "grad_norm": 2.046875, + "learning_rate": 4.1827718178359586e-06, + "loss": 0.9752, + "step": 8332 + }, + { + "epoch": 1.6499837585267736, + "grad_norm": 2.078125, + "learning_rate": 4.181732205781354e-06, + "loss": 0.9755, + "step": 8333 + }, + { + "epoch": 1.6501836535818695, + "grad_norm": 2.1875, + "learning_rate": 4.180692630074116e-06, + "loss": 1.0196, + "step": 8334 + }, + { + "epoch": 1.6503835486369656, + "grad_norm": 2.125, + "learning_rate": 4.179653090760424e-06, + "loss": 0.8684, + "step": 8335 + }, + { + "epoch": 1.6505834436920617, + "grad_norm": 2.171875, + "learning_rate": 4.178613587886455e-06, + "loss": 1.0431, + "step": 8336 + }, + { + "epoch": 1.6507833387471578, + "grad_norm": 2.203125, + "learning_rate": 4.177574121498382e-06, + "loss": 0.9899, + "step": 8337 + }, + { + "epoch": 1.6509832338022539, + "grad_norm": 2.15625, + "learning_rate": 4.17653469164238e-06, + "loss": 0.9906, + "step": 8338 + }, + { + "epoch": 1.6511831288573497, + "grad_norm": 2.15625, + "learning_rate": 4.175495298364618e-06, + "loss": 0.9755, + "step": 8339 + }, + { + "epoch": 1.6513830239124458, + "grad_norm": 2.125, + "learning_rate": 4.174455941711266e-06, + "loss": 0.99, + "step": 8340 + }, + { + "epoch": 1.651582918967542, + "grad_norm": 2.1875, + "learning_rate": 4.173416621728495e-06, + "loss": 0.9646, + "step": 8341 + }, + { + "epoch": 1.651782814022638, + "grad_norm": 2.359375, + "learning_rate": 4.17237733846247e-06, + "loss": 0.9579, + "step": 8342 + }, + { + "epoch": 1.6519827090777341, + "grad_norm": 2.171875, + "learning_rate": 4.1713380919593525e-06, + "loss": 0.926, + "step": 8343 + }, + { + "epoch": 1.6521826041328302, + "grad_norm": 2.1875, + "learning_rate": 4.170298882265309e-06, + "loss": 1.0695, + "step": 8344 + }, + { + "epoch": 1.6523824991879263, + "grad_norm": 2.21875, + "learning_rate": 4.169259709426502e-06, + "loss": 0.988, + "step": 8345 + }, + { + "epoch": 1.6525823942430224, + "grad_norm": 2.203125, + "learning_rate": 4.168220573489088e-06, + "loss": 0.9672, + "step": 8346 + }, + { + "epoch": 1.6527822892981185, + "grad_norm": 2.25, + "learning_rate": 4.167181474499228e-06, + "loss": 0.9888, + "step": 8347 + }, + { + "epoch": 1.6529821843532146, + "grad_norm": 2.125, + "learning_rate": 4.166142412503078e-06, + "loss": 1.0004, + "step": 8348 + }, + { + "epoch": 1.6531820794083107, + "grad_norm": 2.09375, + "learning_rate": 4.1651033875467935e-06, + "loss": 0.8566, + "step": 8349 + }, + { + "epoch": 1.6533819744634068, + "grad_norm": 2.28125, + "learning_rate": 4.1640643996765275e-06, + "loss": 1.0011, + "step": 8350 + }, + { + "epoch": 1.653581869518503, + "grad_norm": 2.078125, + "learning_rate": 4.163025448938431e-06, + "loss": 0.9704, + "step": 8351 + }, + { + "epoch": 1.6537817645735988, + "grad_norm": 2.109375, + "learning_rate": 4.1619865353786535e-06, + "loss": 0.9291, + "step": 8352 + }, + { + "epoch": 1.653981659628695, + "grad_norm": 2.171875, + "learning_rate": 4.160947659043347e-06, + "loss": 0.9484, + "step": 8353 + }, + { + "epoch": 1.654181554683791, + "grad_norm": 2.171875, + "learning_rate": 4.159908819978654e-06, + "loss": 0.9807, + "step": 8354 + }, + { + "epoch": 1.6543814497388871, + "grad_norm": 2.046875, + "learning_rate": 4.158870018230722e-06, + "loss": 0.8579, + "step": 8355 + }, + { + "epoch": 1.654581344793983, + "grad_norm": 2.078125, + "learning_rate": 4.157831253845695e-06, + "loss": 0.9327, + "step": 8356 + }, + { + "epoch": 1.654781239849079, + "grad_norm": 2.25, + "learning_rate": 4.156792526869712e-06, + "loss": 0.9865, + "step": 8357 + }, + { + "epoch": 1.6549811349041752, + "grad_norm": 2.0625, + "learning_rate": 4.155753837348917e-06, + "loss": 0.9142, + "step": 8358 + }, + { + "epoch": 1.6551810299592713, + "grad_norm": 2.28125, + "learning_rate": 4.154715185329445e-06, + "loss": 0.9701, + "step": 8359 + }, + { + "epoch": 1.6553809250143674, + "grad_norm": 2.109375, + "learning_rate": 4.153676570857434e-06, + "loss": 0.962, + "step": 8360 + }, + { + "epoch": 1.6555808200694635, + "grad_norm": 2.71875, + "learning_rate": 4.152637993979019e-06, + "loss": 1.0536, + "step": 8361 + }, + { + "epoch": 1.6557807151245596, + "grad_norm": 2.203125, + "learning_rate": 4.151599454740335e-06, + "loss": 1.0314, + "step": 8362 + }, + { + "epoch": 1.6559806101796557, + "grad_norm": 2.078125, + "learning_rate": 4.150560953187511e-06, + "loss": 0.8813, + "step": 8363 + }, + { + "epoch": 1.6561805052347518, + "grad_norm": 2.25, + "learning_rate": 4.149522489366681e-06, + "loss": 1.0078, + "step": 8364 + }, + { + "epoch": 1.656380400289848, + "grad_norm": 2.234375, + "learning_rate": 4.148484063323969e-06, + "loss": 1.0463, + "step": 8365 + }, + { + "epoch": 1.656580295344944, + "grad_norm": 1.984375, + "learning_rate": 4.147445675105506e-06, + "loss": 0.8978, + "step": 8366 + }, + { + "epoch": 1.65678019040004, + "grad_norm": 2.09375, + "learning_rate": 4.146407324757414e-06, + "loss": 0.8991, + "step": 8367 + }, + { + "epoch": 1.6569800854551362, + "grad_norm": 2.09375, + "learning_rate": 4.145369012325816e-06, + "loss": 0.9344, + "step": 8368 + }, + { + "epoch": 1.657179980510232, + "grad_norm": 2.0625, + "learning_rate": 4.1443307378568385e-06, + "loss": 0.9609, + "step": 8369 + }, + { + "epoch": 1.6573798755653282, + "grad_norm": 2.078125, + "learning_rate": 4.143292501396596e-06, + "loss": 1.0138, + "step": 8370 + }, + { + "epoch": 1.6575797706204243, + "grad_norm": 2.3125, + "learning_rate": 4.142254302991209e-06, + "loss": 1.0597, + "step": 8371 + }, + { + "epoch": 1.6577796656755204, + "grad_norm": 2.3125, + "learning_rate": 4.141216142686795e-06, + "loss": 0.9977, + "step": 8372 + }, + { + "epoch": 1.6579795607306165, + "grad_norm": 2.140625, + "learning_rate": 4.14017802052947e-06, + "loss": 0.9488, + "step": 8373 + }, + { + "epoch": 1.6581794557857124, + "grad_norm": 2.09375, + "learning_rate": 4.139139936565343e-06, + "loss": 0.976, + "step": 8374 + }, + { + "epoch": 1.6583793508408085, + "grad_norm": 2.109375, + "learning_rate": 4.138101890840528e-06, + "loss": 0.915, + "step": 8375 + }, + { + "epoch": 1.6585792458959046, + "grad_norm": 2.390625, + "learning_rate": 4.137063883401137e-06, + "loss": 0.9585, + "step": 8376 + }, + { + "epoch": 1.6587791409510007, + "grad_norm": 2.15625, + "learning_rate": 4.136025914293274e-06, + "loss": 0.9513, + "step": 8377 + }, + { + "epoch": 1.6589790360060968, + "grad_norm": 2.203125, + "learning_rate": 4.1349879835630486e-06, + "loss": 0.955, + "step": 8378 + }, + { + "epoch": 1.6591789310611929, + "grad_norm": 2.265625, + "learning_rate": 4.133950091256564e-06, + "loss": 1.0102, + "step": 8379 + }, + { + "epoch": 1.659378826116289, + "grad_norm": 2.171875, + "learning_rate": 4.1329122374199234e-06, + "loss": 0.9872, + "step": 8380 + }, + { + "epoch": 1.659578721171385, + "grad_norm": 2.171875, + "learning_rate": 4.13187442209923e-06, + "loss": 0.9144, + "step": 8381 + }, + { + "epoch": 1.6597786162264812, + "grad_norm": 2.234375, + "learning_rate": 4.130836645340581e-06, + "loss": 0.9445, + "step": 8382 + }, + { + "epoch": 1.6599785112815773, + "grad_norm": 2.15625, + "learning_rate": 4.129798907190076e-06, + "loss": 0.9533, + "step": 8383 + }, + { + "epoch": 1.6601784063366734, + "grad_norm": 2.203125, + "learning_rate": 4.12876120769381e-06, + "loss": 0.8938, + "step": 8384 + }, + { + "epoch": 1.6603783013917695, + "grad_norm": 2.171875, + "learning_rate": 4.127723546897879e-06, + "loss": 0.9748, + "step": 8385 + }, + { + "epoch": 1.6605781964468656, + "grad_norm": 2.125, + "learning_rate": 4.126685924848373e-06, + "loss": 0.9792, + "step": 8386 + }, + { + "epoch": 1.6607780915019614, + "grad_norm": 2.109375, + "learning_rate": 4.125648341591387e-06, + "loss": 0.9547, + "step": 8387 + }, + { + "epoch": 1.6609779865570575, + "grad_norm": 2.15625, + "learning_rate": 4.124610797173008e-06, + "loss": 0.9843, + "step": 8388 + }, + { + "epoch": 1.6611778816121536, + "grad_norm": 2.25, + "learning_rate": 4.123573291639323e-06, + "loss": 0.9881, + "step": 8389 + }, + { + "epoch": 1.6613777766672497, + "grad_norm": 2.234375, + "learning_rate": 4.12253582503642e-06, + "loss": 1.0074, + "step": 8390 + }, + { + "epoch": 1.6615776717223456, + "grad_norm": 2.203125, + "learning_rate": 4.12149839741038e-06, + "loss": 0.9708, + "step": 8391 + }, + { + "epoch": 1.6617775667774417, + "grad_norm": 2.203125, + "learning_rate": 4.120461008807289e-06, + "loss": 0.9391, + "step": 8392 + }, + { + "epoch": 1.6619774618325378, + "grad_norm": 2.125, + "learning_rate": 4.119423659273226e-06, + "loss": 0.975, + "step": 8393 + }, + { + "epoch": 1.662177356887634, + "grad_norm": 2.078125, + "learning_rate": 4.1183863488542686e-06, + "loss": 0.8772, + "step": 8394 + }, + { + "epoch": 1.66237725194273, + "grad_norm": 2.1875, + "learning_rate": 4.117349077596497e-06, + "loss": 1.0748, + "step": 8395 + }, + { + "epoch": 1.662577146997826, + "grad_norm": 2.15625, + "learning_rate": 4.116311845545983e-06, + "loss": 1.0191, + "step": 8396 + }, + { + "epoch": 1.6627770420529222, + "grad_norm": 2.171875, + "learning_rate": 4.115274652748806e-06, + "loss": 0.9597, + "step": 8397 + }, + { + "epoch": 1.6629769371080183, + "grad_norm": 2.21875, + "learning_rate": 4.114237499251031e-06, + "loss": 0.8679, + "step": 8398 + }, + { + "epoch": 1.6631768321631144, + "grad_norm": 2.09375, + "learning_rate": 4.113200385098733e-06, + "loss": 0.9605, + "step": 8399 + }, + { + "epoch": 1.6633767272182105, + "grad_norm": 2.078125, + "learning_rate": 4.112163310337981e-06, + "loss": 0.8945, + "step": 8400 + }, + { + "epoch": 1.6635766222733066, + "grad_norm": 2.125, + "learning_rate": 4.1111262750148375e-06, + "loss": 0.8471, + "step": 8401 + }, + { + "epoch": 1.6637765173284027, + "grad_norm": 2.171875, + "learning_rate": 4.11008927917537e-06, + "loss": 0.9585, + "step": 8402 + }, + { + "epoch": 1.6639764123834988, + "grad_norm": 2.1875, + "learning_rate": 4.109052322865643e-06, + "loss": 1.0087, + "step": 8403 + }, + { + "epoch": 1.664176307438595, + "grad_norm": 2.09375, + "learning_rate": 4.1080154061317175e-06, + "loss": 0.8538, + "step": 8404 + }, + { + "epoch": 1.6643762024936908, + "grad_norm": 2.140625, + "learning_rate": 4.1069785290196505e-06, + "loss": 0.9302, + "step": 8405 + }, + { + "epoch": 1.6645760975487869, + "grad_norm": 2.171875, + "learning_rate": 4.105941691575502e-06, + "loss": 1.0174, + "step": 8406 + }, + { + "epoch": 1.664775992603883, + "grad_norm": 2.09375, + "learning_rate": 4.104904893845328e-06, + "loss": 1.0642, + "step": 8407 + }, + { + "epoch": 1.664975887658979, + "grad_norm": 2.03125, + "learning_rate": 4.103868135875184e-06, + "loss": 0.8731, + "step": 8408 + }, + { + "epoch": 1.665175782714075, + "grad_norm": 2.234375, + "learning_rate": 4.102831417711121e-06, + "loss": 1.0226, + "step": 8409 + }, + { + "epoch": 1.665375677769171, + "grad_norm": 2.375, + "learning_rate": 4.1017947393991905e-06, + "loss": 1.0029, + "step": 8410 + }, + { + "epoch": 1.6655755728242672, + "grad_norm": 2.203125, + "learning_rate": 4.100758100985441e-06, + "loss": 1.0093, + "step": 8411 + }, + { + "epoch": 1.6657754678793633, + "grad_norm": 2.171875, + "learning_rate": 4.099721502515922e-06, + "loss": 1.0087, + "step": 8412 + }, + { + "epoch": 1.6659753629344594, + "grad_norm": 2.109375, + "learning_rate": 4.098684944036677e-06, + "loss": 1.006, + "step": 8413 + }, + { + "epoch": 1.6661752579895555, + "grad_norm": 2.21875, + "learning_rate": 4.09764842559375e-06, + "loss": 1.0076, + "step": 8414 + }, + { + "epoch": 1.6663751530446516, + "grad_norm": 2.09375, + "learning_rate": 4.096611947233185e-06, + "loss": 0.9425, + "step": 8415 + }, + { + "epoch": 1.6665750480997477, + "grad_norm": 2.046875, + "learning_rate": 4.09557550900102e-06, + "loss": 0.9662, + "step": 8416 + }, + { + "epoch": 1.6667749431548438, + "grad_norm": 2.265625, + "learning_rate": 4.094539110943295e-06, + "loss": 1.0501, + "step": 8417 + }, + { + "epoch": 1.6669748382099399, + "grad_norm": 2.203125, + "learning_rate": 4.093502753106044e-06, + "loss": 0.8633, + "step": 8418 + }, + { + "epoch": 1.667174733265036, + "grad_norm": 2.203125, + "learning_rate": 4.092466435535306e-06, + "loss": 0.939, + "step": 8419 + }, + { + "epoch": 1.667374628320132, + "grad_norm": 2.109375, + "learning_rate": 4.091430158277112e-06, + "loss": 1.0204, + "step": 8420 + }, + { + "epoch": 1.6675745233752282, + "grad_norm": 2.3125, + "learning_rate": 4.090393921377491e-06, + "loss": 0.9776, + "step": 8421 + }, + { + "epoch": 1.667774418430324, + "grad_norm": 2.171875, + "learning_rate": 4.089357724882477e-06, + "loss": 1.0552, + "step": 8422 + }, + { + "epoch": 1.6679743134854201, + "grad_norm": 2.21875, + "learning_rate": 4.088321568838095e-06, + "loss": 0.9877, + "step": 8423 + }, + { + "epoch": 1.6681742085405162, + "grad_norm": 2.15625, + "learning_rate": 4.087285453290372e-06, + "loss": 0.9206, + "step": 8424 + }, + { + "epoch": 1.6683741035956123, + "grad_norm": 2.328125, + "learning_rate": 4.08624937828533e-06, + "loss": 0.9945, + "step": 8425 + }, + { + "epoch": 1.6685739986507082, + "grad_norm": 2.125, + "learning_rate": 4.085213343868995e-06, + "loss": 0.9323, + "step": 8426 + }, + { + "epoch": 1.6687738937058043, + "grad_norm": 2.109375, + "learning_rate": 4.084177350087384e-06, + "loss": 0.8495, + "step": 8427 + }, + { + "epoch": 1.6689737887609004, + "grad_norm": 2.125, + "learning_rate": 4.0831413969865195e-06, + "loss": 0.9174, + "step": 8428 + }, + { + "epoch": 1.6691736838159965, + "grad_norm": 2.046875, + "learning_rate": 4.082105484612414e-06, + "loss": 0.9647, + "step": 8429 + }, + { + "epoch": 1.6693735788710926, + "grad_norm": 2.109375, + "learning_rate": 4.0810696130110855e-06, + "loss": 0.9339, + "step": 8430 + }, + { + "epoch": 1.6695734739261887, + "grad_norm": 2.15625, + "learning_rate": 4.080033782228547e-06, + "loss": 0.9517, + "step": 8431 + }, + { + "epoch": 1.6697733689812848, + "grad_norm": 2.09375, + "learning_rate": 4.078997992310809e-06, + "loss": 0.9333, + "step": 8432 + }, + { + "epoch": 1.669973264036381, + "grad_norm": 2.4375, + "learning_rate": 4.0779622433038825e-06, + "loss": 1.0035, + "step": 8433 + }, + { + "epoch": 1.670173159091477, + "grad_norm": 2.25, + "learning_rate": 4.076926535253775e-06, + "loss": 1.0483, + "step": 8434 + }, + { + "epoch": 1.670373054146573, + "grad_norm": 2.203125, + "learning_rate": 4.075890868206494e-06, + "loss": 1.0404, + "step": 8435 + }, + { + "epoch": 1.6705729492016692, + "grad_norm": 2.046875, + "learning_rate": 4.074855242208039e-06, + "loss": 0.94, + "step": 8436 + }, + { + "epoch": 1.6707728442567653, + "grad_norm": 2.046875, + "learning_rate": 4.0738196573044155e-06, + "loss": 0.9711, + "step": 8437 + }, + { + "epoch": 1.6709727393118614, + "grad_norm": 2.21875, + "learning_rate": 4.072784113541625e-06, + "loss": 0.953, + "step": 8438 + }, + { + "epoch": 1.6711726343669575, + "grad_norm": 2.125, + "learning_rate": 4.0717486109656636e-06, + "loss": 0.9248, + "step": 8439 + }, + { + "epoch": 1.6713725294220534, + "grad_norm": 2.21875, + "learning_rate": 4.070713149622532e-06, + "loss": 0.9457, + "step": 8440 + }, + { + "epoch": 1.6715724244771495, + "grad_norm": 2.125, + "learning_rate": 4.06967772955822e-06, + "loss": 0.9245, + "step": 8441 + }, + { + "epoch": 1.6717723195322456, + "grad_norm": 2.078125, + "learning_rate": 4.068642350818726e-06, + "loss": 0.9103, + "step": 8442 + }, + { + "epoch": 1.6719722145873417, + "grad_norm": 2.1875, + "learning_rate": 4.067607013450039e-06, + "loss": 1.0159, + "step": 8443 + }, + { + "epoch": 1.6721721096424376, + "grad_norm": 2.15625, + "learning_rate": 4.066571717498149e-06, + "loss": 0.9965, + "step": 8444 + }, + { + "epoch": 1.6723720046975337, + "grad_norm": 2.21875, + "learning_rate": 4.065536463009043e-06, + "loss": 1.0675, + "step": 8445 + }, + { + "epoch": 1.6725718997526298, + "grad_norm": 2.046875, + "learning_rate": 4.064501250028708e-06, + "loss": 0.9118, + "step": 8446 + }, + { + "epoch": 1.6727717948077259, + "grad_norm": 2.203125, + "learning_rate": 4.063466078603127e-06, + "loss": 0.9403, + "step": 8447 + }, + { + "epoch": 1.672971689862822, + "grad_norm": 2.125, + "learning_rate": 4.062430948778285e-06, + "loss": 0.9674, + "step": 8448 + }, + { + "epoch": 1.673171584917918, + "grad_norm": 2.203125, + "learning_rate": 4.061395860600157e-06, + "loss": 0.933, + "step": 8449 + }, + { + "epoch": 1.6733714799730142, + "grad_norm": 2.21875, + "learning_rate": 4.060360814114725e-06, + "loss": 1.0244, + "step": 8450 + }, + { + "epoch": 1.6735713750281103, + "grad_norm": 2.21875, + "learning_rate": 4.059325809367967e-06, + "loss": 1.1114, + "step": 8451 + }, + { + "epoch": 1.6737712700832064, + "grad_norm": 2.28125, + "learning_rate": 4.058290846405856e-06, + "loss": 1.0581, + "step": 8452 + }, + { + "epoch": 1.6739711651383025, + "grad_norm": 2.09375, + "learning_rate": 4.0572559252743635e-06, + "loss": 0.8494, + "step": 8453 + }, + { + "epoch": 1.6741710601933986, + "grad_norm": 2.140625, + "learning_rate": 4.056221046019464e-06, + "loss": 0.967, + "step": 8454 + }, + { + "epoch": 1.6743709552484947, + "grad_norm": 2.34375, + "learning_rate": 4.055186208687123e-06, + "loss": 0.9342, + "step": 8455 + }, + { + "epoch": 1.6745708503035908, + "grad_norm": 2.125, + "learning_rate": 4.0541514133233115e-06, + "loss": 0.8761, + "step": 8456 + }, + { + "epoch": 1.6747707453586866, + "grad_norm": 2.234375, + "learning_rate": 4.053116659973991e-06, + "loss": 0.9926, + "step": 8457 + }, + { + "epoch": 1.6749706404137827, + "grad_norm": 2.15625, + "learning_rate": 4.052081948685128e-06, + "loss": 0.9398, + "step": 8458 + }, + { + "epoch": 1.6751705354688788, + "grad_norm": 2.140625, + "learning_rate": 4.051047279502686e-06, + "loss": 1.0457, + "step": 8459 + }, + { + "epoch": 1.675370430523975, + "grad_norm": 2.1875, + "learning_rate": 4.05001265247262e-06, + "loss": 0.9247, + "step": 8460 + }, + { + "epoch": 1.675570325579071, + "grad_norm": 2.21875, + "learning_rate": 4.04897806764089e-06, + "loss": 0.9881, + "step": 8461 + }, + { + "epoch": 1.675770220634167, + "grad_norm": 2.140625, + "learning_rate": 4.047943525053455e-06, + "loss": 0.9475, + "step": 8462 + }, + { + "epoch": 1.675970115689263, + "grad_norm": 2.140625, + "learning_rate": 4.046909024756265e-06, + "loss": 0.9269, + "step": 8463 + }, + { + "epoch": 1.6761700107443591, + "grad_norm": 2.578125, + "learning_rate": 4.045874566795275e-06, + "loss": 1.0492, + "step": 8464 + }, + { + "epoch": 1.6763699057994552, + "grad_norm": 2.15625, + "learning_rate": 4.044840151216435e-06, + "loss": 0.9486, + "step": 8465 + }, + { + "epoch": 1.6765698008545513, + "grad_norm": 2.125, + "learning_rate": 4.043805778065696e-06, + "loss": 0.9758, + "step": 8466 + }, + { + "epoch": 1.6767696959096474, + "grad_norm": 2.171875, + "learning_rate": 4.042771447388998e-06, + "loss": 0.8804, + "step": 8467 + }, + { + "epoch": 1.6769695909647435, + "grad_norm": 2.171875, + "learning_rate": 4.041737159232293e-06, + "loss": 0.9541, + "step": 8468 + }, + { + "epoch": 1.6771694860198396, + "grad_norm": 2.125, + "learning_rate": 4.040702913641519e-06, + "loss": 0.9378, + "step": 8469 + }, + { + "epoch": 1.6773693810749357, + "grad_norm": 2.125, + "learning_rate": 4.03966871066262e-06, + "loss": 0.949, + "step": 8470 + }, + { + "epoch": 1.6775692761300318, + "grad_norm": 2.09375, + "learning_rate": 4.038634550341534e-06, + "loss": 0.984, + "step": 8471 + }, + { + "epoch": 1.677769171185128, + "grad_norm": 2.125, + "learning_rate": 4.037600432724199e-06, + "loss": 0.9495, + "step": 8472 + }, + { + "epoch": 1.677969066240224, + "grad_norm": 2.296875, + "learning_rate": 4.036566357856549e-06, + "loss": 1.0034, + "step": 8473 + }, + { + "epoch": 1.6781689612953201, + "grad_norm": 2.09375, + "learning_rate": 4.035532325784519e-06, + "loss": 0.912, + "step": 8474 + }, + { + "epoch": 1.678368856350416, + "grad_norm": 2.140625, + "learning_rate": 4.034498336554041e-06, + "loss": 0.9738, + "step": 8475 + }, + { + "epoch": 1.678568751405512, + "grad_norm": 2.09375, + "learning_rate": 4.033464390211043e-06, + "loss": 0.9138, + "step": 8476 + }, + { + "epoch": 1.6787686464606082, + "grad_norm": 2.046875, + "learning_rate": 4.032430486801454e-06, + "loss": 1.0189, + "step": 8477 + }, + { + "epoch": 1.6789685415157043, + "grad_norm": 2.3125, + "learning_rate": 4.031396626371198e-06, + "loss": 0.9922, + "step": 8478 + }, + { + "epoch": 1.6791684365708002, + "grad_norm": 2.203125, + "learning_rate": 4.030362808966202e-06, + "loss": 1.0465, + "step": 8479 + }, + { + "epoch": 1.6793683316258963, + "grad_norm": 2.140625, + "learning_rate": 4.029329034632386e-06, + "loss": 0.9951, + "step": 8480 + }, + { + "epoch": 1.6795682266809924, + "grad_norm": 2.25, + "learning_rate": 4.02829530341567e-06, + "loss": 1.0458, + "step": 8481 + }, + { + "epoch": 1.6797681217360885, + "grad_norm": 2.09375, + "learning_rate": 4.027261615361973e-06, + "loss": 0.9252, + "step": 8482 + }, + { + "epoch": 1.6799680167911846, + "grad_norm": 2.21875, + "learning_rate": 4.026227970517212e-06, + "loss": 0.982, + "step": 8483 + }, + { + "epoch": 1.6801679118462807, + "grad_norm": 2.046875, + "learning_rate": 4.0251943689273e-06, + "loss": 0.9143, + "step": 8484 + }, + { + "epoch": 1.6803678069013768, + "grad_norm": 2.09375, + "learning_rate": 4.02416081063815e-06, + "loss": 0.9427, + "step": 8485 + }, + { + "epoch": 1.6805677019564729, + "grad_norm": 2.328125, + "learning_rate": 4.0231272956956725e-06, + "loss": 1.0324, + "step": 8486 + }, + { + "epoch": 1.680767597011569, + "grad_norm": 2.15625, + "learning_rate": 4.022093824145777e-06, + "loss": 0.9414, + "step": 8487 + }, + { + "epoch": 1.680967492066665, + "grad_norm": 2.1875, + "learning_rate": 4.021060396034369e-06, + "loss": 0.9265, + "step": 8488 + }, + { + "epoch": 1.6811673871217612, + "grad_norm": 2.140625, + "learning_rate": 4.020027011407353e-06, + "loss": 0.9412, + "step": 8489 + }, + { + "epoch": 1.6813672821768573, + "grad_norm": 2.109375, + "learning_rate": 4.018993670310633e-06, + "loss": 0.9536, + "step": 8490 + }, + { + "epoch": 1.6815671772319534, + "grad_norm": 2.203125, + "learning_rate": 4.017960372790109e-06, + "loss": 0.9672, + "step": 8491 + }, + { + "epoch": 1.6817670722870492, + "grad_norm": 2.03125, + "learning_rate": 4.01692711889168e-06, + "loss": 0.8579, + "step": 8492 + }, + { + "epoch": 1.6819669673421453, + "grad_norm": 2.171875, + "learning_rate": 4.015893908661245e-06, + "loss": 0.9421, + "step": 8493 + }, + { + "epoch": 1.6821668623972414, + "grad_norm": 2.09375, + "learning_rate": 4.014860742144696e-06, + "loss": 0.9703, + "step": 8494 + }, + { + "epoch": 1.6823667574523375, + "grad_norm": 2.046875, + "learning_rate": 4.013827619387928e-06, + "loss": 0.9083, + "step": 8495 + }, + { + "epoch": 1.6825666525074336, + "grad_norm": 2.15625, + "learning_rate": 4.012794540436832e-06, + "loss": 0.9675, + "step": 8496 + }, + { + "epoch": 1.6827665475625295, + "grad_norm": 2.1875, + "learning_rate": 4.0117615053372986e-06, + "loss": 1.0955, + "step": 8497 + }, + { + "epoch": 1.6829664426176256, + "grad_norm": 2.140625, + "learning_rate": 4.010728514135211e-06, + "loss": 0.9257, + "step": 8498 + }, + { + "epoch": 1.6831663376727217, + "grad_norm": 2.125, + "learning_rate": 4.009695566876459e-06, + "loss": 0.9684, + "step": 8499 + }, + { + "epoch": 1.6833662327278178, + "grad_norm": 2.1875, + "learning_rate": 4.008662663606923e-06, + "loss": 0.9572, + "step": 8500 + }, + { + "epoch": 1.683566127782914, + "grad_norm": 2.046875, + "learning_rate": 4.007629804372486e-06, + "loss": 0.9897, + "step": 8501 + }, + { + "epoch": 1.68376602283801, + "grad_norm": 2.296875, + "learning_rate": 4.006596989219027e-06, + "loss": 0.9713, + "step": 8502 + }, + { + "epoch": 1.6839659178931061, + "grad_norm": 2.03125, + "learning_rate": 4.005564218192424e-06, + "loss": 0.8463, + "step": 8503 + }, + { + "epoch": 1.6841658129482022, + "grad_norm": 2.1875, + "learning_rate": 4.004531491338551e-06, + "loss": 0.9783, + "step": 8504 + }, + { + "epoch": 1.6843657080032983, + "grad_norm": 2.15625, + "learning_rate": 4.0034988087032845e-06, + "loss": 0.993, + "step": 8505 + }, + { + "epoch": 1.6845656030583944, + "grad_norm": 2.21875, + "learning_rate": 4.002466170332493e-06, + "loss": 0.8878, + "step": 8506 + }, + { + "epoch": 1.6847654981134905, + "grad_norm": 2.140625, + "learning_rate": 4.001433576272049e-06, + "loss": 1.0262, + "step": 8507 + }, + { + "epoch": 1.6849653931685866, + "grad_norm": 2.140625, + "learning_rate": 4.000401026567818e-06, + "loss": 0.9601, + "step": 8508 + }, + { + "epoch": 1.6851652882236827, + "grad_norm": 2.109375, + "learning_rate": 3.999368521265667e-06, + "loss": 1.006, + "step": 8509 + }, + { + "epoch": 1.6853651832787786, + "grad_norm": 2.1875, + "learning_rate": 3.998336060411459e-06, + "loss": 0.8779, + "step": 8510 + }, + { + "epoch": 1.6855650783338747, + "grad_norm": 2.25, + "learning_rate": 3.997303644051056e-06, + "loss": 1.0226, + "step": 8511 + }, + { + "epoch": 1.6857649733889708, + "grad_norm": 2.203125, + "learning_rate": 3.9962712722303186e-06, + "loss": 0.9109, + "step": 8512 + }, + { + "epoch": 1.685964868444067, + "grad_norm": 2.203125, + "learning_rate": 3.995238944995105e-06, + "loss": 0.9653, + "step": 8513 + }, + { + "epoch": 1.6861647634991628, + "grad_norm": 2.15625, + "learning_rate": 3.994206662391269e-06, + "loss": 0.9835, + "step": 8514 + }, + { + "epoch": 1.6863646585542589, + "grad_norm": 2.1875, + "learning_rate": 3.993174424464665e-06, + "loss": 0.9628, + "step": 8515 + }, + { + "epoch": 1.686564553609355, + "grad_norm": 2.078125, + "learning_rate": 3.992142231261147e-06, + "loss": 0.9007, + "step": 8516 + }, + { + "epoch": 1.686764448664451, + "grad_norm": 2.359375, + "learning_rate": 3.991110082826562e-06, + "loss": 0.9254, + "step": 8517 + }, + { + "epoch": 1.6869643437195472, + "grad_norm": 2.234375, + "learning_rate": 3.990077979206761e-06, + "loss": 1.0035, + "step": 8518 + }, + { + "epoch": 1.6871642387746433, + "grad_norm": 2.109375, + "learning_rate": 3.989045920447587e-06, + "loss": 0.945, + "step": 8519 + }, + { + "epoch": 1.6873641338297394, + "grad_norm": 2.171875, + "learning_rate": 3.988013906594886e-06, + "loss": 0.8934, + "step": 8520 + }, + { + "epoch": 1.6875640288848355, + "grad_norm": 2.15625, + "learning_rate": 3.9869819376944985e-06, + "loss": 0.9832, + "step": 8521 + }, + { + "epoch": 1.6877639239399316, + "grad_norm": 2.171875, + "learning_rate": 3.985950013792265e-06, + "loss": 0.931, + "step": 8522 + }, + { + "epoch": 1.6879638189950277, + "grad_norm": 2.171875, + "learning_rate": 3.984918134934024e-06, + "loss": 0.9826, + "step": 8523 + }, + { + "epoch": 1.6881637140501238, + "grad_norm": 2.171875, + "learning_rate": 3.983886301165611e-06, + "loss": 0.9445, + "step": 8524 + }, + { + "epoch": 1.6883636091052199, + "grad_norm": 2.109375, + "learning_rate": 3.9828545125328606e-06, + "loss": 0.9148, + "step": 8525 + }, + { + "epoch": 1.688563504160316, + "grad_norm": 2.25, + "learning_rate": 3.9818227690816045e-06, + "loss": 1.0161, + "step": 8526 + }, + { + "epoch": 1.6887633992154119, + "grad_norm": 2.15625, + "learning_rate": 3.98079107085767e-06, + "loss": 0.9282, + "step": 8527 + }, + { + "epoch": 1.688963294270508, + "grad_norm": 2.015625, + "learning_rate": 3.979759417906891e-06, + "loss": 0.9579, + "step": 8528 + }, + { + "epoch": 1.689163189325604, + "grad_norm": 2.3125, + "learning_rate": 3.978727810275087e-06, + "loss": 0.9937, + "step": 8529 + }, + { + "epoch": 1.6893630843807002, + "grad_norm": 2.15625, + "learning_rate": 3.977696248008086e-06, + "loss": 1.0588, + "step": 8530 + }, + { + "epoch": 1.6895629794357963, + "grad_norm": 2.21875, + "learning_rate": 3.976664731151707e-06, + "loss": 1.0032, + "step": 8531 + }, + { + "epoch": 1.6897628744908921, + "grad_norm": 2.25, + "learning_rate": 3.975633259751771e-06, + "loss": 1.0594, + "step": 8532 + }, + { + "epoch": 1.6899627695459882, + "grad_norm": 2.1875, + "learning_rate": 3.974601833854097e-06, + "loss": 0.9511, + "step": 8533 + }, + { + "epoch": 1.6901626646010843, + "grad_norm": 2.125, + "learning_rate": 3.9735704535045e-06, + "loss": 0.9048, + "step": 8534 + }, + { + "epoch": 1.6903625596561804, + "grad_norm": 2.15625, + "learning_rate": 3.9725391187487935e-06, + "loss": 0.9463, + "step": 8535 + }, + { + "epoch": 1.6905624547112765, + "grad_norm": 2.078125, + "learning_rate": 3.97150782963279e-06, + "loss": 0.9615, + "step": 8536 + }, + { + "epoch": 1.6907623497663726, + "grad_norm": 2.203125, + "learning_rate": 3.9704765862022985e-06, + "loss": 0.9832, + "step": 8537 + }, + { + "epoch": 1.6909622448214687, + "grad_norm": 2.203125, + "learning_rate": 3.969445388503128e-06, + "loss": 0.8743, + "step": 8538 + }, + { + "epoch": 1.6911621398765648, + "grad_norm": 2.078125, + "learning_rate": 3.968414236581083e-06, + "loss": 0.8841, + "step": 8539 + }, + { + "epoch": 1.691362034931661, + "grad_norm": 2.046875, + "learning_rate": 3.967383130481966e-06, + "loss": 0.9036, + "step": 8540 + }, + { + "epoch": 1.691561929986757, + "grad_norm": 2.203125, + "learning_rate": 3.966352070251582e-06, + "loss": 0.8706, + "step": 8541 + }, + { + "epoch": 1.6917618250418531, + "grad_norm": 2.140625, + "learning_rate": 3.965321055935727e-06, + "loss": 0.996, + "step": 8542 + }, + { + "epoch": 1.6919617200969492, + "grad_norm": 2.109375, + "learning_rate": 3.9642900875802e-06, + "loss": 0.9702, + "step": 8543 + }, + { + "epoch": 1.6921616151520453, + "grad_norm": 2.109375, + "learning_rate": 3.9632591652307985e-06, + "loss": 0.9192, + "step": 8544 + }, + { + "epoch": 1.6923615102071412, + "grad_norm": 1.9921875, + "learning_rate": 3.9622282889333135e-06, + "loss": 0.9064, + "step": 8545 + }, + { + "epoch": 1.6925614052622373, + "grad_norm": 2.015625, + "learning_rate": 3.9611974587335375e-06, + "loss": 0.9418, + "step": 8546 + }, + { + "epoch": 1.6927613003173334, + "grad_norm": 2.234375, + "learning_rate": 3.9601666746772586e-06, + "loss": 0.973, + "step": 8547 + }, + { + "epoch": 1.6929611953724295, + "grad_norm": 2.21875, + "learning_rate": 3.959135936810265e-06, + "loss": 1.0562, + "step": 8548 + }, + { + "epoch": 1.6931610904275254, + "grad_norm": 2.25, + "learning_rate": 3.958105245178342e-06, + "loss": 0.9999, + "step": 8549 + }, + { + "epoch": 1.6933609854826215, + "grad_norm": 2.15625, + "learning_rate": 3.957074599827272e-06, + "loss": 0.9098, + "step": 8550 + }, + { + "epoch": 1.6935608805377176, + "grad_norm": 2.09375, + "learning_rate": 3.956044000802838e-06, + "loss": 0.9153, + "step": 8551 + }, + { + "epoch": 1.6937607755928137, + "grad_norm": 2.25, + "learning_rate": 3.955013448150818e-06, + "loss": 1.0571, + "step": 8552 + }, + { + "epoch": 1.6939606706479098, + "grad_norm": 2.03125, + "learning_rate": 3.953982941916988e-06, + "loss": 0.9183, + "step": 8553 + }, + { + "epoch": 1.6941605657030059, + "grad_norm": 2.125, + "learning_rate": 3.952952482147125e-06, + "loss": 0.9679, + "step": 8554 + }, + { + "epoch": 1.694360460758102, + "grad_norm": 2.109375, + "learning_rate": 3.9519220688870004e-06, + "loss": 0.9365, + "step": 8555 + }, + { + "epoch": 1.694560355813198, + "grad_norm": 2.046875, + "learning_rate": 3.950891702182386e-06, + "loss": 0.8167, + "step": 8556 + }, + { + "epoch": 1.6947602508682942, + "grad_norm": 2.171875, + "learning_rate": 3.94986138207905e-06, + "loss": 0.9672, + "step": 8557 + }, + { + "epoch": 1.6949601459233903, + "grad_norm": 2.15625, + "learning_rate": 3.948831108622759e-06, + "loss": 1.0135, + "step": 8558 + }, + { + "epoch": 1.6951600409784864, + "grad_norm": 2.09375, + "learning_rate": 3.94780088185928e-06, + "loss": 0.941, + "step": 8559 + }, + { + "epoch": 1.6953599360335825, + "grad_norm": 2.171875, + "learning_rate": 3.946770701834372e-06, + "loss": 1.034, + "step": 8560 + }, + { + "epoch": 1.6955598310886786, + "grad_norm": 2.359375, + "learning_rate": 3.945740568593796e-06, + "loss": 1.0687, + "step": 8561 + }, + { + "epoch": 1.6957597261437747, + "grad_norm": 2.109375, + "learning_rate": 3.944710482183312e-06, + "loss": 0.9726, + "step": 8562 + }, + { + "epoch": 1.6959596211988706, + "grad_norm": 2.25, + "learning_rate": 3.943680442648675e-06, + "loss": 1.0225, + "step": 8563 + }, + { + "epoch": 1.6961595162539667, + "grad_norm": 2.109375, + "learning_rate": 3.9426504500356415e-06, + "loss": 0.9074, + "step": 8564 + }, + { + "epoch": 1.6963594113090628, + "grad_norm": 2.078125, + "learning_rate": 3.94162050438996e-06, + "loss": 0.9779, + "step": 8565 + }, + { + "epoch": 1.6965593063641589, + "grad_norm": 2.046875, + "learning_rate": 3.940590605757383e-06, + "loss": 1.0321, + "step": 8566 + }, + { + "epoch": 1.6967592014192547, + "grad_norm": 2.15625, + "learning_rate": 3.93956075418366e-06, + "loss": 1.026, + "step": 8567 + }, + { + "epoch": 1.6969590964743508, + "grad_norm": 2.1875, + "learning_rate": 3.938530949714533e-06, + "loss": 1.0925, + "step": 8568 + }, + { + "epoch": 1.697158991529447, + "grad_norm": 2.15625, + "learning_rate": 3.937501192395749e-06, + "loss": 0.975, + "step": 8569 + }, + { + "epoch": 1.697358886584543, + "grad_norm": 2.09375, + "learning_rate": 3.936471482273048e-06, + "loss": 0.8929, + "step": 8570 + }, + { + "epoch": 1.6975587816396391, + "grad_norm": 2.046875, + "learning_rate": 3.935441819392169e-06, + "loss": 0.9929, + "step": 8571 + }, + { + "epoch": 1.6977586766947352, + "grad_norm": 2.03125, + "learning_rate": 3.934412203798853e-06, + "loss": 0.8692, + "step": 8572 + }, + { + "epoch": 1.6979585717498313, + "grad_norm": 2.125, + "learning_rate": 3.9333826355388325e-06, + "loss": 1.0455, + "step": 8573 + }, + { + "epoch": 1.6981584668049274, + "grad_norm": 2.203125, + "learning_rate": 3.93235311465784e-06, + "loss": 1.0355, + "step": 8574 + }, + { + "epoch": 1.6983583618600235, + "grad_norm": 2.109375, + "learning_rate": 3.93132364120161e-06, + "loss": 0.8788, + "step": 8575 + }, + { + "epoch": 1.6985582569151196, + "grad_norm": 2.171875, + "learning_rate": 3.930294215215868e-06, + "loss": 1.0396, + "step": 8576 + }, + { + "epoch": 1.6987581519702157, + "grad_norm": 2.140625, + "learning_rate": 3.929264836746345e-06, + "loss": 1.0266, + "step": 8577 + }, + { + "epoch": 1.6989580470253118, + "grad_norm": 2.15625, + "learning_rate": 3.928235505838762e-06, + "loss": 1.0414, + "step": 8578 + }, + { + "epoch": 1.699157942080408, + "grad_norm": 2.0625, + "learning_rate": 3.927206222538843e-06, + "loss": 0.8899, + "step": 8579 + }, + { + "epoch": 1.6993578371355038, + "grad_norm": 2.046875, + "learning_rate": 3.92617698689231e-06, + "loss": 0.9934, + "step": 8580 + }, + { + "epoch": 1.6995577321906, + "grad_norm": 2.0625, + "learning_rate": 3.92514779894488e-06, + "loss": 0.9783, + "step": 8581 + }, + { + "epoch": 1.699757627245696, + "grad_norm": 2.1875, + "learning_rate": 3.924118658742269e-06, + "loss": 1.0363, + "step": 8582 + }, + { + "epoch": 1.6999575223007921, + "grad_norm": 2.09375, + "learning_rate": 3.923089566330195e-06, + "loss": 1.0069, + "step": 8583 + }, + { + "epoch": 1.7001574173558882, + "grad_norm": 2.171875, + "learning_rate": 3.922060521754365e-06, + "loss": 0.9617, + "step": 8584 + }, + { + "epoch": 1.700357312410984, + "grad_norm": 2.0, + "learning_rate": 3.921031525060493e-06, + "loss": 0.8853, + "step": 8585 + }, + { + "epoch": 1.7005572074660802, + "grad_norm": 2.359375, + "learning_rate": 3.920002576294284e-06, + "loss": 0.9252, + "step": 8586 + }, + { + "epoch": 1.7007571025211763, + "grad_norm": 2.109375, + "learning_rate": 3.918973675501446e-06, + "loss": 0.9564, + "step": 8587 + }, + { + "epoch": 1.7009569975762724, + "grad_norm": 2.234375, + "learning_rate": 3.917944822727682e-06, + "loss": 0.9961, + "step": 8588 + }, + { + "epoch": 1.7011568926313685, + "grad_norm": 2.25, + "learning_rate": 3.916916018018696e-06, + "loss": 1.062, + "step": 8589 + }, + { + "epoch": 1.7013567876864646, + "grad_norm": 2.0625, + "learning_rate": 3.915887261420181e-06, + "loss": 0.9978, + "step": 8590 + }, + { + "epoch": 1.7015566827415607, + "grad_norm": 2.265625, + "learning_rate": 3.9148585529778385e-06, + "loss": 1.0687, + "step": 8591 + }, + { + "epoch": 1.7017565777966568, + "grad_norm": 2.015625, + "learning_rate": 3.913829892737364e-06, + "loss": 0.8257, + "step": 8592 + }, + { + "epoch": 1.7019564728517529, + "grad_norm": 2.09375, + "learning_rate": 3.912801280744449e-06, + "loss": 0.9383, + "step": 8593 + }, + { + "epoch": 1.702156367906849, + "grad_norm": 2.09375, + "learning_rate": 3.911772717044786e-06, + "loss": 1.0214, + "step": 8594 + }, + { + "epoch": 1.702356262961945, + "grad_norm": 2.0625, + "learning_rate": 3.910744201684062e-06, + "loss": 0.956, + "step": 8595 + }, + { + "epoch": 1.7025561580170412, + "grad_norm": 2.125, + "learning_rate": 3.909715734707964e-06, + "loss": 0.9843, + "step": 8596 + }, + { + "epoch": 1.7027560530721373, + "grad_norm": 2.140625, + "learning_rate": 3.908687316162178e-06, + "loss": 0.9228, + "step": 8597 + }, + { + "epoch": 1.7029559481272332, + "grad_norm": 2.15625, + "learning_rate": 3.907658946092383e-06, + "loss": 0.963, + "step": 8598 + }, + { + "epoch": 1.7031558431823293, + "grad_norm": 2.1875, + "learning_rate": 3.906630624544261e-06, + "loss": 0.9671, + "step": 8599 + }, + { + "epoch": 1.7033557382374254, + "grad_norm": 2.0625, + "learning_rate": 3.905602351563492e-06, + "loss": 1.0168, + "step": 8600 + }, + { + "epoch": 1.7035556332925215, + "grad_norm": 2.21875, + "learning_rate": 3.904574127195747e-06, + "loss": 0.9687, + "step": 8601 + }, + { + "epoch": 1.7037555283476173, + "grad_norm": 2.15625, + "learning_rate": 3.903545951486704e-06, + "loss": 1.0447, + "step": 8602 + }, + { + "epoch": 1.7039554234027134, + "grad_norm": 2.140625, + "learning_rate": 3.902517824482033e-06, + "loss": 0.9336, + "step": 8603 + }, + { + "epoch": 1.7041553184578095, + "grad_norm": 2.046875, + "learning_rate": 3.901489746227402e-06, + "loss": 0.898, + "step": 8604 + }, + { + "epoch": 1.7043552135129056, + "grad_norm": 2.109375, + "learning_rate": 3.900461716768479e-06, + "loss": 0.9529, + "step": 8605 + }, + { + "epoch": 1.7045551085680017, + "grad_norm": 2.109375, + "learning_rate": 3.89943373615093e-06, + "loss": 0.8709, + "step": 8606 + }, + { + "epoch": 1.7047550036230978, + "grad_norm": 2.078125, + "learning_rate": 3.8984058044204164e-06, + "loss": 1.0679, + "step": 8607 + }, + { + "epoch": 1.704954898678194, + "grad_norm": 2.15625, + "learning_rate": 3.8973779216226e-06, + "loss": 0.9793, + "step": 8608 + }, + { + "epoch": 1.70515479373329, + "grad_norm": 2.203125, + "learning_rate": 3.8963500878031376e-06, + "loss": 0.9541, + "step": 8609 + }, + { + "epoch": 1.7053546887883861, + "grad_norm": 2.125, + "learning_rate": 3.895322303007686e-06, + "loss": 0.9899, + "step": 8610 + }, + { + "epoch": 1.7055545838434822, + "grad_norm": 2.359375, + "learning_rate": 3.894294567281901e-06, + "loss": 0.979, + "step": 8611 + }, + { + "epoch": 1.7057544788985783, + "grad_norm": 2.140625, + "learning_rate": 3.893266880671433e-06, + "loss": 0.9598, + "step": 8612 + }, + { + "epoch": 1.7059543739536744, + "grad_norm": 2.328125, + "learning_rate": 3.892239243221931e-06, + "loss": 1.0535, + "step": 8613 + }, + { + "epoch": 1.7061542690087705, + "grad_norm": 2.25, + "learning_rate": 3.891211654979045e-06, + "loss": 1.0797, + "step": 8614 + }, + { + "epoch": 1.7063541640638664, + "grad_norm": 2.1875, + "learning_rate": 3.890184115988418e-06, + "loss": 1.0081, + "step": 8615 + }, + { + "epoch": 1.7065540591189625, + "grad_norm": 2.140625, + "learning_rate": 3.889156626295694e-06, + "loss": 1.0177, + "step": 8616 + }, + { + "epoch": 1.7067539541740586, + "grad_norm": 2.09375, + "learning_rate": 3.888129185946514e-06, + "loss": 1.0065, + "step": 8617 + }, + { + "epoch": 1.7069538492291547, + "grad_norm": 1.9921875, + "learning_rate": 3.887101794986516e-06, + "loss": 0.9566, + "step": 8618 + }, + { + "epoch": 1.7071537442842508, + "grad_norm": 2.25, + "learning_rate": 3.886074453461339e-06, + "loss": 0.9088, + "step": 8619 + }, + { + "epoch": 1.7073536393393467, + "grad_norm": 2.21875, + "learning_rate": 3.885047161416618e-06, + "loss": 1.0895, + "step": 8620 + }, + { + "epoch": 1.7075535343944428, + "grad_norm": 2.0625, + "learning_rate": 3.884019918897979e-06, + "loss": 0.9131, + "step": 8621 + }, + { + "epoch": 1.707753429449539, + "grad_norm": 2.34375, + "learning_rate": 3.882992725951057e-06, + "loss": 1.0781, + "step": 8622 + }, + { + "epoch": 1.707953324504635, + "grad_norm": 2.171875, + "learning_rate": 3.881965582621479e-06, + "loss": 0.9742, + "step": 8623 + }, + { + "epoch": 1.708153219559731, + "grad_norm": 2.078125, + "learning_rate": 3.88093848895487e-06, + "loss": 0.9311, + "step": 8624 + }, + { + "epoch": 1.7083531146148272, + "grad_norm": 2.484375, + "learning_rate": 3.879911444996854e-06, + "loss": 1.1179, + "step": 8625 + }, + { + "epoch": 1.7085530096699233, + "grad_norm": 2.15625, + "learning_rate": 3.878884450793053e-06, + "loss": 0.9329, + "step": 8626 + }, + { + "epoch": 1.7087529047250194, + "grad_norm": 2.328125, + "learning_rate": 3.877857506389083e-06, + "loss": 1.0512, + "step": 8627 + }, + { + "epoch": 1.7089527997801155, + "grad_norm": 2.1875, + "learning_rate": 3.876830611830565e-06, + "loss": 1.0058, + "step": 8628 + }, + { + "epoch": 1.7091526948352116, + "grad_norm": 2.28125, + "learning_rate": 3.87580376716311e-06, + "loss": 0.9564, + "step": 8629 + }, + { + "epoch": 1.7093525898903077, + "grad_norm": 2.109375, + "learning_rate": 3.874776972432331e-06, + "loss": 0.9368, + "step": 8630 + }, + { + "epoch": 1.7095524849454038, + "grad_norm": 2.0625, + "learning_rate": 3.87375022768384e-06, + "loss": 0.8779, + "step": 8631 + }, + { + "epoch": 1.7097523800005, + "grad_norm": 2.15625, + "learning_rate": 3.872723532963242e-06, + "loss": 0.976, + "step": 8632 + }, + { + "epoch": 1.7099522750555958, + "grad_norm": 2.140625, + "learning_rate": 3.871696888316145e-06, + "loss": 1.0064, + "step": 8633 + }, + { + "epoch": 1.7101521701106919, + "grad_norm": 2.296875, + "learning_rate": 3.870670293788153e-06, + "loss": 0.9534, + "step": 8634 + }, + { + "epoch": 1.710352065165788, + "grad_norm": 2.328125, + "learning_rate": 3.8696437494248645e-06, + "loss": 1.0084, + "step": 8635 + }, + { + "epoch": 1.710551960220884, + "grad_norm": 2.140625, + "learning_rate": 3.868617255271881e-06, + "loss": 0.9585, + "step": 8636 + }, + { + "epoch": 1.71075185527598, + "grad_norm": 2.1875, + "learning_rate": 3.867590811374797e-06, + "loss": 1.01, + "step": 8637 + }, + { + "epoch": 1.710951750331076, + "grad_norm": 2.1875, + "learning_rate": 3.866564417779208e-06, + "loss": 0.9748, + "step": 8638 + }, + { + "epoch": 1.7111516453861721, + "grad_norm": 2.0625, + "learning_rate": 3.865538074530708e-06, + "loss": 0.937, + "step": 8639 + }, + { + "epoch": 1.7113515404412682, + "grad_norm": 2.21875, + "learning_rate": 3.864511781674885e-06, + "loss": 0.9954, + "step": 8640 + }, + { + "epoch": 1.7115514354963643, + "grad_norm": 2.234375, + "learning_rate": 3.863485539257326e-06, + "loss": 0.9562, + "step": 8641 + }, + { + "epoch": 1.7117513305514604, + "grad_norm": 2.265625, + "learning_rate": 3.86245934732362e-06, + "loss": 1.0433, + "step": 8642 + }, + { + "epoch": 1.7119512256065565, + "grad_norm": 1.9765625, + "learning_rate": 3.861433205919347e-06, + "loss": 0.8788, + "step": 8643 + }, + { + "epoch": 1.7121511206616526, + "grad_norm": 2.171875, + "learning_rate": 3.86040711509009e-06, + "loss": 0.9968, + "step": 8644 + }, + { + "epoch": 1.7123510157167487, + "grad_norm": 2.21875, + "learning_rate": 3.859381074881427e-06, + "loss": 1.0523, + "step": 8645 + }, + { + "epoch": 1.7125509107718448, + "grad_norm": 2.15625, + "learning_rate": 3.858355085338935e-06, + "loss": 0.9658, + "step": 8646 + }, + { + "epoch": 1.712750805826941, + "grad_norm": 2.234375, + "learning_rate": 3.857329146508188e-06, + "loss": 0.907, + "step": 8647 + }, + { + "epoch": 1.712950700882037, + "grad_norm": 2.15625, + "learning_rate": 3.856303258434758e-06, + "loss": 0.8811, + "step": 8648 + }, + { + "epoch": 1.7131505959371331, + "grad_norm": 2.203125, + "learning_rate": 3.8552774211642154e-06, + "loss": 0.9952, + "step": 8649 + }, + { + "epoch": 1.713350490992229, + "grad_norm": 2.203125, + "learning_rate": 3.854251634742128e-06, + "loss": 0.9703, + "step": 8650 + }, + { + "epoch": 1.7135503860473251, + "grad_norm": 2.171875, + "learning_rate": 3.853225899214062e-06, + "loss": 0.9576, + "step": 8651 + }, + { + "epoch": 1.7137502811024212, + "grad_norm": 2.328125, + "learning_rate": 3.8522002146255765e-06, + "loss": 0.9298, + "step": 8652 + }, + { + "epoch": 1.7139501761575173, + "grad_norm": 2.171875, + "learning_rate": 3.851174581022236e-06, + "loss": 1.0168, + "step": 8653 + }, + { + "epoch": 1.7141500712126134, + "grad_norm": 2.25, + "learning_rate": 3.850148998449597e-06, + "loss": 0.9716, + "step": 8654 + }, + { + "epoch": 1.7143499662677093, + "grad_norm": 2.15625, + "learning_rate": 3.849123466953217e-06, + "loss": 1.003, + "step": 8655 + }, + { + "epoch": 1.7145498613228054, + "grad_norm": 2.15625, + "learning_rate": 3.84809798657865e-06, + "loss": 0.9513, + "step": 8656 + }, + { + "epoch": 1.7147497563779015, + "grad_norm": 2.0625, + "learning_rate": 3.847072557371448e-06, + "loss": 0.9694, + "step": 8657 + }, + { + "epoch": 1.7149496514329976, + "grad_norm": 2.078125, + "learning_rate": 3.846047179377159e-06, + "loss": 0.9758, + "step": 8658 + }, + { + "epoch": 1.7151495464880937, + "grad_norm": 2.078125, + "learning_rate": 3.8450218526413315e-06, + "loss": 0.9551, + "step": 8659 + }, + { + "epoch": 1.7153494415431898, + "grad_norm": 2.046875, + "learning_rate": 3.843996577209509e-06, + "loss": 0.979, + "step": 8660 + }, + { + "epoch": 1.715549336598286, + "grad_norm": 2.140625, + "learning_rate": 3.842971353127235e-06, + "loss": 1.0461, + "step": 8661 + }, + { + "epoch": 1.715749231653382, + "grad_norm": 2.15625, + "learning_rate": 3.841946180440052e-06, + "loss": 1.0534, + "step": 8662 + }, + { + "epoch": 1.715949126708478, + "grad_norm": 2.109375, + "learning_rate": 3.840921059193494e-06, + "loss": 0.9713, + "step": 8663 + }, + { + "epoch": 1.7161490217635742, + "grad_norm": 2.171875, + "learning_rate": 3.839895989433099e-06, + "loss": 0.9854, + "step": 8664 + }, + { + "epoch": 1.7163489168186703, + "grad_norm": 2.109375, + "learning_rate": 3.838870971204401e-06, + "loss": 1.0073, + "step": 8665 + }, + { + "epoch": 1.7165488118737664, + "grad_norm": 2.203125, + "learning_rate": 3.83784600455293e-06, + "loss": 0.9942, + "step": 8666 + }, + { + "epoch": 1.7167487069288625, + "grad_norm": 2.09375, + "learning_rate": 3.836821089524216e-06, + "loss": 0.9617, + "step": 8667 + }, + { + "epoch": 1.7169486019839584, + "grad_norm": 2.15625, + "learning_rate": 3.835796226163784e-06, + "loss": 0.9522, + "step": 8668 + }, + { + "epoch": 1.7171484970390545, + "grad_norm": 2.15625, + "learning_rate": 3.834771414517159e-06, + "loss": 0.9609, + "step": 8669 + }, + { + "epoch": 1.7173483920941506, + "grad_norm": 2.453125, + "learning_rate": 3.833746654629865e-06, + "loss": 0.9881, + "step": 8670 + }, + { + "epoch": 1.7175482871492467, + "grad_norm": 2.3125, + "learning_rate": 3.832721946547418e-06, + "loss": 0.917, + "step": 8671 + }, + { + "epoch": 1.7177481822043426, + "grad_norm": 2.34375, + "learning_rate": 3.831697290315339e-06, + "loss": 1.0694, + "step": 8672 + }, + { + "epoch": 1.7179480772594387, + "grad_norm": 2.1875, + "learning_rate": 3.830672685979142e-06, + "loss": 1.0193, + "step": 8673 + }, + { + "epoch": 1.7181479723145348, + "grad_norm": 2.078125, + "learning_rate": 3.829648133584338e-06, + "loss": 0.895, + "step": 8674 + }, + { + "epoch": 1.7183478673696309, + "grad_norm": 2.21875, + "learning_rate": 3.82862363317644e-06, + "loss": 0.9861, + "step": 8675 + }, + { + "epoch": 1.718547762424727, + "grad_norm": 2.140625, + "learning_rate": 3.827599184800954e-06, + "loss": 0.9176, + "step": 8676 + }, + { + "epoch": 1.718747657479823, + "grad_norm": 2.1875, + "learning_rate": 3.826574788503387e-06, + "loss": 0.9677, + "step": 8677 + }, + { + "epoch": 1.7189475525349192, + "grad_norm": 2.265625, + "learning_rate": 3.825550444329244e-06, + "loss": 1.0463, + "step": 8678 + }, + { + "epoch": 1.7191474475900153, + "grad_norm": 2.25, + "learning_rate": 3.8245261523240235e-06, + "loss": 1.0227, + "step": 8679 + }, + { + "epoch": 1.7193473426451114, + "grad_norm": 2.109375, + "learning_rate": 3.823501912533226e-06, + "loss": 0.9353, + "step": 8680 + }, + { + "epoch": 1.7195472377002075, + "grad_norm": 2.125, + "learning_rate": 3.822477725002348e-06, + "loss": 0.9909, + "step": 8681 + }, + { + "epoch": 1.7197471327553036, + "grad_norm": 2.109375, + "learning_rate": 3.821453589776886e-06, + "loss": 0.9857, + "step": 8682 + }, + { + "epoch": 1.7199470278103997, + "grad_norm": 2.234375, + "learning_rate": 3.820429506902326e-06, + "loss": 1.0048, + "step": 8683 + }, + { + "epoch": 1.7201469228654958, + "grad_norm": 2.203125, + "learning_rate": 3.819405476424164e-06, + "loss": 0.9792, + "step": 8684 + }, + { + "epoch": 1.7203468179205919, + "grad_norm": 2.3125, + "learning_rate": 3.818381498387883e-06, + "loss": 0.9946, + "step": 8685 + }, + { + "epoch": 1.7205467129756877, + "grad_norm": 2.078125, + "learning_rate": 3.81735757283897e-06, + "loss": 0.9009, + "step": 8686 + }, + { + "epoch": 1.7207466080307838, + "grad_norm": 2.15625, + "learning_rate": 3.8163336998229075e-06, + "loss": 0.9599, + "step": 8687 + }, + { + "epoch": 1.72094650308588, + "grad_norm": 2.203125, + "learning_rate": 3.815309879385176e-06, + "loss": 0.9272, + "step": 8688 + }, + { + "epoch": 1.721146398140976, + "grad_norm": 2.328125, + "learning_rate": 3.8142861115712515e-06, + "loss": 0.9367, + "step": 8689 + }, + { + "epoch": 1.721346293196072, + "grad_norm": 2.171875, + "learning_rate": 3.8132623964266136e-06, + "loss": 1.0028, + "step": 8690 + }, + { + "epoch": 1.721546188251168, + "grad_norm": 2.1875, + "learning_rate": 3.8122387339967315e-06, + "loss": 1.0069, + "step": 8691 + }, + { + "epoch": 1.721746083306264, + "grad_norm": 2.1875, + "learning_rate": 3.811215124327078e-06, + "loss": 1.0578, + "step": 8692 + }, + { + "epoch": 1.7219459783613602, + "grad_norm": 2.15625, + "learning_rate": 3.810191567463123e-06, + "loss": 0.9108, + "step": 8693 + }, + { + "epoch": 1.7221458734164563, + "grad_norm": 2.15625, + "learning_rate": 3.8091680634503303e-06, + "loss": 0.9725, + "step": 8694 + }, + { + "epoch": 1.7223457684715524, + "grad_norm": 2.078125, + "learning_rate": 3.808144612334165e-06, + "loss": 0.9575, + "step": 8695 + }, + { + "epoch": 1.7225456635266485, + "grad_norm": 2.1875, + "learning_rate": 3.80712121416009e-06, + "loss": 1.0384, + "step": 8696 + }, + { + "epoch": 1.7227455585817446, + "grad_norm": 2.359375, + "learning_rate": 3.806097868973562e-06, + "loss": 0.977, + "step": 8697 + }, + { + "epoch": 1.7229454536368407, + "grad_norm": 2.125, + "learning_rate": 3.8050745768200404e-06, + "loss": 0.9893, + "step": 8698 + }, + { + "epoch": 1.7231453486919368, + "grad_norm": 2.25, + "learning_rate": 3.8040513377449774e-06, + "loss": 0.9146, + "step": 8699 + }, + { + "epoch": 1.723345243747033, + "grad_norm": 2.234375, + "learning_rate": 3.803028151793826e-06, + "loss": 1.0259, + "step": 8700 + }, + { + "epoch": 1.723545138802129, + "grad_norm": 2.078125, + "learning_rate": 3.802005019012038e-06, + "loss": 0.9262, + "step": 8701 + }, + { + "epoch": 1.723745033857225, + "grad_norm": 2.078125, + "learning_rate": 3.800981939445057e-06, + "loss": 0.9522, + "step": 8702 + }, + { + "epoch": 1.723944928912321, + "grad_norm": 2.203125, + "learning_rate": 3.7999589131383306e-06, + "loss": 0.9703, + "step": 8703 + }, + { + "epoch": 1.724144823967417, + "grad_norm": 2.25, + "learning_rate": 3.798935940137303e-06, + "loss": 1.141, + "step": 8704 + }, + { + "epoch": 1.7243447190225132, + "grad_norm": 2.1875, + "learning_rate": 3.7979130204874106e-06, + "loss": 0.9866, + "step": 8705 + }, + { + "epoch": 1.7245446140776093, + "grad_norm": 2.3125, + "learning_rate": 3.7968901542340946e-06, + "loss": 0.9721, + "step": 8706 + }, + { + "epoch": 1.7247445091327054, + "grad_norm": 2.1875, + "learning_rate": 3.795867341422789e-06, + "loss": 1.0209, + "step": 8707 + }, + { + "epoch": 1.7249444041878013, + "grad_norm": 2.125, + "learning_rate": 3.7948445820989265e-06, + "loss": 0.8676, + "step": 8708 + }, + { + "epoch": 1.7251442992428974, + "grad_norm": 2.203125, + "learning_rate": 3.793821876307941e-06, + "loss": 0.9601, + "step": 8709 + }, + { + "epoch": 1.7253441942979935, + "grad_norm": 2.125, + "learning_rate": 3.792799224095257e-06, + "loss": 0.991, + "step": 8710 + }, + { + "epoch": 1.7255440893530896, + "grad_norm": 2.15625, + "learning_rate": 3.791776625506303e-06, + "loss": 0.983, + "step": 8711 + }, + { + "epoch": 1.7257439844081857, + "grad_norm": 2.140625, + "learning_rate": 3.7907540805865035e-06, + "loss": 0.8535, + "step": 8712 + }, + { + "epoch": 1.7259438794632818, + "grad_norm": 2.296875, + "learning_rate": 3.7897315893812796e-06, + "loss": 0.9902, + "step": 8713 + }, + { + "epoch": 1.7261437745183779, + "grad_norm": 2.125, + "learning_rate": 3.788709151936047e-06, + "loss": 0.9822, + "step": 8714 + }, + { + "epoch": 1.726343669573474, + "grad_norm": 2.046875, + "learning_rate": 3.7876867682962244e-06, + "loss": 0.9171, + "step": 8715 + }, + { + "epoch": 1.72654356462857, + "grad_norm": 2.21875, + "learning_rate": 3.786664438507227e-06, + "loss": 0.9963, + "step": 8716 + }, + { + "epoch": 1.7267434596836662, + "grad_norm": 2.046875, + "learning_rate": 3.7856421626144645e-06, + "loss": 0.866, + "step": 8717 + }, + { + "epoch": 1.7269433547387623, + "grad_norm": 2.125, + "learning_rate": 3.7846199406633493e-06, + "loss": 0.9655, + "step": 8718 + }, + { + "epoch": 1.7271432497938584, + "grad_norm": 2.109375, + "learning_rate": 3.783597772699285e-06, + "loss": 0.9487, + "step": 8719 + }, + { + "epoch": 1.7273431448489545, + "grad_norm": 2.265625, + "learning_rate": 3.782575658767678e-06, + "loss": 0.9771, + "step": 8720 + }, + { + "epoch": 1.7275430399040503, + "grad_norm": 2.234375, + "learning_rate": 3.7815535989139307e-06, + "loss": 0.953, + "step": 8721 + }, + { + "epoch": 1.7277429349591464, + "grad_norm": 2.203125, + "learning_rate": 3.7805315931834413e-06, + "loss": 0.9501, + "step": 8722 + }, + { + "epoch": 1.7279428300142425, + "grad_norm": 2.1875, + "learning_rate": 3.779509641621609e-06, + "loss": 0.9922, + "step": 8723 + }, + { + "epoch": 1.7281427250693386, + "grad_norm": 2.203125, + "learning_rate": 3.7784877442738283e-06, + "loss": 1.0136, + "step": 8724 + }, + { + "epoch": 1.7283426201244345, + "grad_norm": 2.234375, + "learning_rate": 3.777465901185491e-06, + "loss": 0.9256, + "step": 8725 + }, + { + "epoch": 1.7285425151795306, + "grad_norm": 2.140625, + "learning_rate": 3.776444112401989e-06, + "loss": 0.9601, + "step": 8726 + }, + { + "epoch": 1.7287424102346267, + "grad_norm": 2.140625, + "learning_rate": 3.775422377968707e-06, + "loss": 0.9693, + "step": 8727 + }, + { + "epoch": 1.7289423052897228, + "grad_norm": 2.125, + "learning_rate": 3.7744006979310333e-06, + "loss": 0.9509, + "step": 8728 + }, + { + "epoch": 1.729142200344819, + "grad_norm": 2.140625, + "learning_rate": 3.77337907233435e-06, + "loss": 1.0105, + "step": 8729 + }, + { + "epoch": 1.729342095399915, + "grad_norm": 2.078125, + "learning_rate": 3.7723575012240364e-06, + "loss": 1.0143, + "step": 8730 + }, + { + "epoch": 1.7295419904550111, + "grad_norm": 2.109375, + "learning_rate": 3.771335984645471e-06, + "loss": 0.9855, + "step": 8731 + }, + { + "epoch": 1.7297418855101072, + "grad_norm": 2.15625, + "learning_rate": 3.770314522644032e-06, + "loss": 0.9882, + "step": 8732 + }, + { + "epoch": 1.7299417805652033, + "grad_norm": 2.1875, + "learning_rate": 3.7692931152650893e-06, + "loss": 1.0563, + "step": 8733 + }, + { + "epoch": 1.7301416756202994, + "grad_norm": 2.1875, + "learning_rate": 3.7682717625540144e-06, + "loss": 1.0002, + "step": 8734 + }, + { + "epoch": 1.7303415706753955, + "grad_norm": 2.109375, + "learning_rate": 3.7672504645561773e-06, + "loss": 0.9918, + "step": 8735 + }, + { + "epoch": 1.7305414657304916, + "grad_norm": 2.125, + "learning_rate": 3.766229221316942e-06, + "loss": 0.9311, + "step": 8736 + }, + { + "epoch": 1.7307413607855877, + "grad_norm": 2.125, + "learning_rate": 3.7652080328816744e-06, + "loss": 0.968, + "step": 8737 + }, + { + "epoch": 1.7309412558406836, + "grad_norm": 2.0625, + "learning_rate": 3.7641868992957326e-06, + "loss": 0.9163, + "step": 8738 + }, + { + "epoch": 1.7311411508957797, + "grad_norm": 2.15625, + "learning_rate": 3.763165820604477e-06, + "loss": 1.006, + "step": 8739 + }, + { + "epoch": 1.7313410459508758, + "grad_norm": 2.234375, + "learning_rate": 3.7621447968532644e-06, + "loss": 0.9999, + "step": 8740 + }, + { + "epoch": 1.731540941005972, + "grad_norm": 2.15625, + "learning_rate": 3.7611238280874467e-06, + "loss": 0.9689, + "step": 8741 + }, + { + "epoch": 1.731740836061068, + "grad_norm": 2.125, + "learning_rate": 3.7601029143523767e-06, + "loss": 0.9182, + "step": 8742 + }, + { + "epoch": 1.7319407311161639, + "grad_norm": 2.21875, + "learning_rate": 3.759082055693403e-06, + "loss": 0.9651, + "step": 8743 + }, + { + "epoch": 1.73214062617126, + "grad_norm": 2.15625, + "learning_rate": 3.7580612521558737e-06, + "loss": 0.9859, + "step": 8744 + }, + { + "epoch": 1.732340521226356, + "grad_norm": 2.109375, + "learning_rate": 3.7570405037851287e-06, + "loss": 0.9697, + "step": 8745 + }, + { + "epoch": 1.7325404162814522, + "grad_norm": 2.078125, + "learning_rate": 3.756019810626511e-06, + "loss": 0.9336, + "step": 8746 + }, + { + "epoch": 1.7327403113365483, + "grad_norm": 2.25, + "learning_rate": 3.754999172725362e-06, + "loss": 1.0636, + "step": 8747 + }, + { + "epoch": 1.7329402063916444, + "grad_norm": 2.046875, + "learning_rate": 3.7539785901270164e-06, + "loss": 0.9048, + "step": 8748 + }, + { + "epoch": 1.7331401014467405, + "grad_norm": 2.09375, + "learning_rate": 3.752958062876809e-06, + "loss": 0.9167, + "step": 8749 + }, + { + "epoch": 1.7333399965018366, + "grad_norm": 2.109375, + "learning_rate": 3.7519375910200706e-06, + "loss": 0.9092, + "step": 8750 + }, + { + "epoch": 1.7335398915569327, + "grad_norm": 2.171875, + "learning_rate": 3.7509171746021305e-06, + "loss": 0.9155, + "step": 8751 + }, + { + "epoch": 1.7337397866120288, + "grad_norm": 2.171875, + "learning_rate": 3.749896813668318e-06, + "loss": 1.0269, + "step": 8752 + }, + { + "epoch": 1.7339396816671249, + "grad_norm": 2.234375, + "learning_rate": 3.748876508263954e-06, + "loss": 1.039, + "step": 8753 + }, + { + "epoch": 1.734139576722221, + "grad_norm": 2.125, + "learning_rate": 3.7478562584343615e-06, + "loss": 0.9549, + "step": 8754 + }, + { + "epoch": 1.734339471777317, + "grad_norm": 2.25, + "learning_rate": 3.746836064224862e-06, + "loss": 1.0041, + "step": 8755 + }, + { + "epoch": 1.734539366832413, + "grad_norm": 2.28125, + "learning_rate": 3.745815925680769e-06, + "loss": 1.0523, + "step": 8756 + }, + { + "epoch": 1.734739261887509, + "grad_norm": 2.21875, + "learning_rate": 3.7447958428474006e-06, + "loss": 1.0006, + "step": 8757 + }, + { + "epoch": 1.7349391569426051, + "grad_norm": 2.4375, + "learning_rate": 3.743775815770065e-06, + "loss": 1.0765, + "step": 8758 + }, + { + "epoch": 1.7351390519977012, + "grad_norm": 2.15625, + "learning_rate": 3.7427558444940738e-06, + "loss": 0.9442, + "step": 8759 + }, + { + "epoch": 1.7353389470527971, + "grad_norm": 2.21875, + "learning_rate": 3.741735929064735e-06, + "loss": 0.9692, + "step": 8760 + }, + { + "epoch": 1.7355388421078932, + "grad_norm": 2.09375, + "learning_rate": 3.74071606952735e-06, + "loss": 0.9094, + "step": 8761 + }, + { + "epoch": 1.7357387371629893, + "grad_norm": 2.09375, + "learning_rate": 3.739696265927223e-06, + "loss": 0.9134, + "step": 8762 + }, + { + "epoch": 1.7359386322180854, + "grad_norm": 2.125, + "learning_rate": 3.7386765183096545e-06, + "loss": 1.0662, + "step": 8763 + }, + { + "epoch": 1.7361385272731815, + "grad_norm": 2.359375, + "learning_rate": 3.737656826719939e-06, + "loss": 0.9534, + "step": 8764 + }, + { + "epoch": 1.7363384223282776, + "grad_norm": 2.1875, + "learning_rate": 3.7366371912033737e-06, + "loss": 0.9429, + "step": 8765 + }, + { + "epoch": 1.7365383173833737, + "grad_norm": 2.125, + "learning_rate": 3.735617611805249e-06, + "loss": 0.9426, + "step": 8766 + }, + { + "epoch": 1.7367382124384698, + "grad_norm": 2.15625, + "learning_rate": 3.7345980885708545e-06, + "loss": 0.9913, + "step": 8767 + }, + { + "epoch": 1.736938107493566, + "grad_norm": 2.015625, + "learning_rate": 3.733578621545478e-06, + "loss": 0.8853, + "step": 8768 + }, + { + "epoch": 1.737138002548662, + "grad_norm": 2.125, + "learning_rate": 3.7325592107744034e-06, + "loss": 0.9621, + "step": 8769 + }, + { + "epoch": 1.7373378976037581, + "grad_norm": 2.0625, + "learning_rate": 3.7315398563029137e-06, + "loss": 0.9646, + "step": 8770 + }, + { + "epoch": 1.7375377926588542, + "grad_norm": 2.109375, + "learning_rate": 3.7305205581762895e-06, + "loss": 0.9537, + "step": 8771 + }, + { + "epoch": 1.7377376877139503, + "grad_norm": 2.203125, + "learning_rate": 3.7295013164398042e-06, + "loss": 0.9745, + "step": 8772 + }, + { + "epoch": 1.7379375827690462, + "grad_norm": 2.171875, + "learning_rate": 3.7284821311387356e-06, + "loss": 0.901, + "step": 8773 + }, + { + "epoch": 1.7381374778241423, + "grad_norm": 2.28125, + "learning_rate": 3.7274630023183554e-06, + "loss": 1.0101, + "step": 8774 + }, + { + "epoch": 1.7383373728792384, + "grad_norm": 2.171875, + "learning_rate": 3.726443930023934e-06, + "loss": 1.0043, + "step": 8775 + }, + { + "epoch": 1.7385372679343345, + "grad_norm": 2.21875, + "learning_rate": 3.7254249143007356e-06, + "loss": 1.0406, + "step": 8776 + }, + { + "epoch": 1.7387371629894306, + "grad_norm": 2.171875, + "learning_rate": 3.724405955194027e-06, + "loss": 0.9588, + "step": 8777 + }, + { + "epoch": 1.7389370580445265, + "grad_norm": 2.109375, + "learning_rate": 3.7233870527490683e-06, + "loss": 0.8205, + "step": 8778 + }, + { + "epoch": 1.7391369530996226, + "grad_norm": 2.078125, + "learning_rate": 3.7223682070111212e-06, + "loss": 0.9412, + "step": 8779 + }, + { + "epoch": 1.7393368481547187, + "grad_norm": 2.140625, + "learning_rate": 3.7213494180254417e-06, + "loss": 1.0123, + "step": 8780 + }, + { + "epoch": 1.7395367432098148, + "grad_norm": 2.265625, + "learning_rate": 3.7203306858372845e-06, + "loss": 0.9726, + "step": 8781 + }, + { + "epoch": 1.7397366382649109, + "grad_norm": 2.265625, + "learning_rate": 3.719312010491901e-06, + "loss": 0.9801, + "step": 8782 + }, + { + "epoch": 1.739936533320007, + "grad_norm": 2.03125, + "learning_rate": 3.7182933920345426e-06, + "loss": 0.9375, + "step": 8783 + }, + { + "epoch": 1.740136428375103, + "grad_norm": 2.125, + "learning_rate": 3.7172748305104537e-06, + "loss": 0.9314, + "step": 8784 + }, + { + "epoch": 1.7403363234301992, + "grad_norm": 2.140625, + "learning_rate": 3.71625632596488e-06, + "loss": 0.9438, + "step": 8785 + }, + { + "epoch": 1.7405362184852953, + "grad_norm": 2.171875, + "learning_rate": 3.7152378784430643e-06, + "loss": 0.9976, + "step": 8786 + }, + { + "epoch": 1.7407361135403914, + "grad_norm": 2.21875, + "learning_rate": 3.7142194879902434e-06, + "loss": 0.9725, + "step": 8787 + }, + { + "epoch": 1.7409360085954875, + "grad_norm": 2.0, + "learning_rate": 3.7132011546516568e-06, + "loss": 0.8509, + "step": 8788 + }, + { + "epoch": 1.7411359036505836, + "grad_norm": 2.3125, + "learning_rate": 3.7121828784725357e-06, + "loss": 0.9757, + "step": 8789 + }, + { + "epoch": 1.7413357987056797, + "grad_norm": 2.296875, + "learning_rate": 3.711164659498114e-06, + "loss": 0.9933, + "step": 8790 + }, + { + "epoch": 1.7415356937607755, + "grad_norm": 2.03125, + "learning_rate": 3.710146497773622e-06, + "loss": 0.9986, + "step": 8791 + }, + { + "epoch": 1.7417355888158716, + "grad_norm": 2.109375, + "learning_rate": 3.7091283933442835e-06, + "loss": 0.9548, + "step": 8792 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 2.109375, + "learning_rate": 3.7081103462553234e-06, + "loss": 0.9078, + "step": 8793 + }, + { + "epoch": 1.7421353789260638, + "grad_norm": 2.078125, + "learning_rate": 3.7070923565519655e-06, + "loss": 0.8792, + "step": 8794 + }, + { + "epoch": 1.7423352739811597, + "grad_norm": 2.21875, + "learning_rate": 3.706074424279426e-06, + "loss": 1.042, + "step": 8795 + }, + { + "epoch": 1.7425351690362558, + "grad_norm": 2.109375, + "learning_rate": 3.7050565494829233e-06, + "loss": 0.9285, + "step": 8796 + }, + { + "epoch": 1.742735064091352, + "grad_norm": 2.265625, + "learning_rate": 3.704038732207669e-06, + "loss": 0.9947, + "step": 8797 + }, + { + "epoch": 1.742934959146448, + "grad_norm": 2.171875, + "learning_rate": 3.7030209724988764e-06, + "loss": 0.9517, + "step": 8798 + }, + { + "epoch": 1.7431348542015441, + "grad_norm": 2.15625, + "learning_rate": 3.7020032704017537e-06, + "loss": 1.017, + "step": 8799 + }, + { + "epoch": 1.7433347492566402, + "grad_norm": 2.15625, + "learning_rate": 3.7009856259615074e-06, + "loss": 0.9797, + "step": 8800 + }, + { + "epoch": 1.7435346443117363, + "grad_norm": 2.140625, + "learning_rate": 3.6999680392233397e-06, + "loss": 0.9665, + "step": 8801 + }, + { + "epoch": 1.7437345393668324, + "grad_norm": 2.171875, + "learning_rate": 3.698950510232454e-06, + "loss": 0.9752, + "step": 8802 + }, + { + "epoch": 1.7439344344219285, + "grad_norm": 2.109375, + "learning_rate": 3.6979330390340472e-06, + "loss": 0.8478, + "step": 8803 + }, + { + "epoch": 1.7441343294770246, + "grad_norm": 2.125, + "learning_rate": 3.696915625673317e-06, + "loss": 0.899, + "step": 8804 + }, + { + "epoch": 1.7443342245321207, + "grad_norm": 2.078125, + "learning_rate": 3.6958982701954536e-06, + "loss": 0.8745, + "step": 8805 + }, + { + "epoch": 1.7445341195872168, + "grad_norm": 2.125, + "learning_rate": 3.6948809726456525e-06, + "loss": 0.9022, + "step": 8806 + }, + { + "epoch": 1.744734014642313, + "grad_norm": 2.140625, + "learning_rate": 3.6938637330690967e-06, + "loss": 0.9966, + "step": 8807 + }, + { + "epoch": 1.744933909697409, + "grad_norm": 2.09375, + "learning_rate": 3.6928465515109756e-06, + "loss": 0.9684, + "step": 8808 + }, + { + "epoch": 1.745133804752505, + "grad_norm": 2.21875, + "learning_rate": 3.69182942801647e-06, + "loss": 0.9814, + "step": 8809 + }, + { + "epoch": 1.745333699807601, + "grad_norm": 2.1875, + "learning_rate": 3.6908123626307618e-06, + "loss": 1.0867, + "step": 8810 + }, + { + "epoch": 1.745533594862697, + "grad_norm": 2.125, + "learning_rate": 3.6897953553990296e-06, + "loss": 0.9418, + "step": 8811 + }, + { + "epoch": 1.7457334899177932, + "grad_norm": 2.1875, + "learning_rate": 3.6887784063664467e-06, + "loss": 0.9365, + "step": 8812 + }, + { + "epoch": 1.745933384972889, + "grad_norm": 2.0625, + "learning_rate": 3.6877615155781863e-06, + "loss": 0.9061, + "step": 8813 + }, + { + "epoch": 1.7461332800279852, + "grad_norm": 2.28125, + "learning_rate": 3.6867446830794205e-06, + "loss": 0.9976, + "step": 8814 + }, + { + "epoch": 1.7463331750830813, + "grad_norm": 2.21875, + "learning_rate": 3.6857279089153154e-06, + "loss": 1.0071, + "step": 8815 + }, + { + "epoch": 1.7465330701381774, + "grad_norm": 2.15625, + "learning_rate": 3.6847111931310363e-06, + "loss": 1.0236, + "step": 8816 + }, + { + "epoch": 1.7467329651932735, + "grad_norm": 2.109375, + "learning_rate": 3.683694535771745e-06, + "loss": 0.9984, + "step": 8817 + }, + { + "epoch": 1.7469328602483696, + "grad_norm": 2.03125, + "learning_rate": 3.6826779368826015e-06, + "loss": 0.8575, + "step": 8818 + }, + { + "epoch": 1.7471327553034657, + "grad_norm": 2.09375, + "learning_rate": 3.681661396508765e-06, + "loss": 0.9144, + "step": 8819 + }, + { + "epoch": 1.7473326503585618, + "grad_norm": 2.203125, + "learning_rate": 3.680644914695387e-06, + "loss": 0.9785, + "step": 8820 + }, + { + "epoch": 1.7475325454136579, + "grad_norm": 2.171875, + "learning_rate": 3.679628491487621e-06, + "loss": 0.9525, + "step": 8821 + }, + { + "epoch": 1.747732440468754, + "grad_norm": 2.125, + "learning_rate": 3.6786121269306173e-06, + "loss": 1.0481, + "step": 8822 + }, + { + "epoch": 1.74793233552385, + "grad_norm": 2.15625, + "learning_rate": 3.677595821069521e-06, + "loss": 0.9572, + "step": 8823 + }, + { + "epoch": 1.7481322305789462, + "grad_norm": 2.09375, + "learning_rate": 3.676579573949477e-06, + "loss": 0.9119, + "step": 8824 + }, + { + "epoch": 1.7483321256340423, + "grad_norm": 2.203125, + "learning_rate": 3.6755633856156283e-06, + "loss": 1.0921, + "step": 8825 + }, + { + "epoch": 1.7485320206891382, + "grad_norm": 2.21875, + "learning_rate": 3.6745472561131107e-06, + "loss": 0.9407, + "step": 8826 + }, + { + "epoch": 1.7487319157442343, + "grad_norm": 2.09375, + "learning_rate": 3.673531185487064e-06, + "loss": 1.0492, + "step": 8827 + }, + { + "epoch": 1.7489318107993304, + "grad_norm": 2.09375, + "learning_rate": 3.6725151737826193e-06, + "loss": 0.914, + "step": 8828 + }, + { + "epoch": 1.7491317058544265, + "grad_norm": 2.21875, + "learning_rate": 3.6714992210449084e-06, + "loss": 1.0461, + "step": 8829 + }, + { + "epoch": 1.7493316009095226, + "grad_norm": 2.171875, + "learning_rate": 3.670483327319062e-06, + "loss": 0.965, + "step": 8830 + }, + { + "epoch": 1.7495314959646184, + "grad_norm": 2.328125, + "learning_rate": 3.669467492650203e-06, + "loss": 1.0768, + "step": 8831 + }, + { + "epoch": 1.7497313910197145, + "grad_norm": 2.1875, + "learning_rate": 3.6684517170834554e-06, + "loss": 0.9636, + "step": 8832 + }, + { + "epoch": 1.7499312860748106, + "grad_norm": 2.203125, + "learning_rate": 3.667436000663942e-06, + "loss": 0.9212, + "step": 8833 + }, + { + "epoch": 1.7501311811299067, + "grad_norm": 2.1875, + "learning_rate": 3.666420343436777e-06, + "loss": 0.9206, + "step": 8834 + }, + { + "epoch": 1.7503310761850028, + "grad_norm": 2.234375, + "learning_rate": 3.66540474544708e-06, + "loss": 0.8463, + "step": 8835 + }, + { + "epoch": 1.750530971240099, + "grad_norm": 2.15625, + "learning_rate": 3.6643892067399604e-06, + "loss": 0.9841, + "step": 8836 + }, + { + "epoch": 1.750730866295195, + "grad_norm": 2.171875, + "learning_rate": 3.663373727360532e-06, + "loss": 0.9349, + "step": 8837 + }, + { + "epoch": 1.7509307613502911, + "grad_norm": 2.15625, + "learning_rate": 3.662358307353897e-06, + "loss": 0.9983, + "step": 8838 + }, + { + "epoch": 1.7511306564053872, + "grad_norm": 2.125, + "learning_rate": 3.6613429467651644e-06, + "loss": 0.8888, + "step": 8839 + }, + { + "epoch": 1.7513305514604833, + "grad_norm": 2.03125, + "learning_rate": 3.6603276456394342e-06, + "loss": 0.8516, + "step": 8840 + }, + { + "epoch": 1.7515304465155794, + "grad_norm": 2.109375, + "learning_rate": 3.659312404021807e-06, + "loss": 0.9168, + "step": 8841 + }, + { + "epoch": 1.7517303415706755, + "grad_norm": 2.21875, + "learning_rate": 3.6582972219573808e-06, + "loss": 1.1102, + "step": 8842 + }, + { + "epoch": 1.7519302366257716, + "grad_norm": 2.296875, + "learning_rate": 3.6572820994912474e-06, + "loss": 0.922, + "step": 8843 + }, + { + "epoch": 1.7521301316808675, + "grad_norm": 2.046875, + "learning_rate": 3.6562670366684995e-06, + "loss": 0.9361, + "step": 8844 + }, + { + "epoch": 1.7523300267359636, + "grad_norm": 2.171875, + "learning_rate": 3.655252033534228e-06, + "loss": 1.0295, + "step": 8845 + }, + { + "epoch": 1.7525299217910597, + "grad_norm": 2.203125, + "learning_rate": 3.6542370901335163e-06, + "loss": 0.9455, + "step": 8846 + }, + { + "epoch": 1.7527298168461558, + "grad_norm": 2.15625, + "learning_rate": 3.65322220651145e-06, + "loss": 0.9995, + "step": 8847 + }, + { + "epoch": 1.7529297119012517, + "grad_norm": 2.28125, + "learning_rate": 3.652207382713109e-06, + "loss": 1.013, + "step": 8848 + }, + { + "epoch": 1.7531296069563478, + "grad_norm": 2.140625, + "learning_rate": 3.651192618783571e-06, + "loss": 0.9757, + "step": 8849 + }, + { + "epoch": 1.7533295020114439, + "grad_norm": 2.15625, + "learning_rate": 3.650177914767915e-06, + "loss": 0.9842, + "step": 8850 + }, + { + "epoch": 1.75352939706654, + "grad_norm": 2.234375, + "learning_rate": 3.64916327071121e-06, + "loss": 0.9903, + "step": 8851 + }, + { + "epoch": 1.753729292121636, + "grad_norm": 2.234375, + "learning_rate": 3.6481486866585292e-06, + "loss": 1.0488, + "step": 8852 + }, + { + "epoch": 1.7539291871767322, + "grad_norm": 2.21875, + "learning_rate": 3.6471341626549397e-06, + "loss": 0.9775, + "step": 8853 + }, + { + "epoch": 1.7541290822318283, + "grad_norm": 2.078125, + "learning_rate": 3.646119698745505e-06, + "loss": 0.943, + "step": 8854 + }, + { + "epoch": 1.7543289772869244, + "grad_norm": 2.28125, + "learning_rate": 3.6451052949752902e-06, + "loss": 1.0549, + "step": 8855 + }, + { + "epoch": 1.7545288723420205, + "grad_norm": 2.265625, + "learning_rate": 3.6440909513893517e-06, + "loss": 0.9492, + "step": 8856 + }, + { + "epoch": 1.7547287673971166, + "grad_norm": 2.09375, + "learning_rate": 3.643076668032749e-06, + "loss": 0.96, + "step": 8857 + }, + { + "epoch": 1.7549286624522127, + "grad_norm": 2.125, + "learning_rate": 3.642062444950537e-06, + "loss": 0.9833, + "step": 8858 + }, + { + "epoch": 1.7551285575073088, + "grad_norm": 2.171875, + "learning_rate": 3.6410482821877647e-06, + "loss": 0.9437, + "step": 8859 + }, + { + "epoch": 1.7553284525624049, + "grad_norm": 2.09375, + "learning_rate": 3.640034179789483e-06, + "loss": 0.9675, + "step": 8860 + }, + { + "epoch": 1.7555283476175008, + "grad_norm": 2.15625, + "learning_rate": 3.6390201378007383e-06, + "loss": 0.9612, + "step": 8861 + }, + { + "epoch": 1.7557282426725969, + "grad_norm": 2.328125, + "learning_rate": 3.638006156266573e-06, + "loss": 0.94, + "step": 8862 + }, + { + "epoch": 1.755928137727693, + "grad_norm": 2.078125, + "learning_rate": 3.6369922352320285e-06, + "loss": 0.969, + "step": 8863 + }, + { + "epoch": 1.756128032782789, + "grad_norm": 2.0625, + "learning_rate": 3.635978374742145e-06, + "loss": 0.8889, + "step": 8864 + }, + { + "epoch": 1.7563279278378852, + "grad_norm": 2.0625, + "learning_rate": 3.634964574841955e-06, + "loss": 0.8754, + "step": 8865 + }, + { + "epoch": 1.756527822892981, + "grad_norm": 2.15625, + "learning_rate": 3.6339508355764935e-06, + "loss": 0.9049, + "step": 8866 + }, + { + "epoch": 1.7567277179480771, + "grad_norm": 2.15625, + "learning_rate": 3.632937156990789e-06, + "loss": 0.8882, + "step": 8867 + }, + { + "epoch": 1.7569276130031732, + "grad_norm": 2.15625, + "learning_rate": 3.6319235391298724e-06, + "loss": 0.9551, + "step": 8868 + }, + { + "epoch": 1.7571275080582693, + "grad_norm": 2.171875, + "learning_rate": 3.630909982038764e-06, + "loss": 1.0042, + "step": 8869 + }, + { + "epoch": 1.7573274031133654, + "grad_norm": 2.34375, + "learning_rate": 3.6298964857624885e-06, + "loss": 1.1014, + "step": 8870 + }, + { + "epoch": 1.7575272981684615, + "grad_norm": 2.171875, + "learning_rate": 3.628883050346065e-06, + "loss": 0.8785, + "step": 8871 + }, + { + "epoch": 1.7577271932235576, + "grad_norm": 2.1875, + "learning_rate": 3.627869675834509e-06, + "loss": 0.992, + "step": 8872 + }, + { + "epoch": 1.7579270882786537, + "grad_norm": 2.140625, + "learning_rate": 3.626856362272836e-06, + "loss": 0.9159, + "step": 8873 + }, + { + "epoch": 1.7581269833337498, + "grad_norm": 2.171875, + "learning_rate": 3.6258431097060567e-06, + "loss": 1.1005, + "step": 8874 + }, + { + "epoch": 1.758326878388846, + "grad_norm": 2.1875, + "learning_rate": 3.624829918179179e-06, + "loss": 0.9464, + "step": 8875 + }, + { + "epoch": 1.758526773443942, + "grad_norm": 2.171875, + "learning_rate": 3.623816787737211e-06, + "loss": 0.9043, + "step": 8876 + }, + { + "epoch": 1.7587266684990381, + "grad_norm": 2.09375, + "learning_rate": 3.622803718425153e-06, + "loss": 0.9667, + "step": 8877 + }, + { + "epoch": 1.7589265635541342, + "grad_norm": 2.203125, + "learning_rate": 3.6217907102880075e-06, + "loss": 0.9796, + "step": 8878 + }, + { + "epoch": 1.7591264586092301, + "grad_norm": 2.25, + "learning_rate": 3.620777763370771e-06, + "loss": 0.9393, + "step": 8879 + }, + { + "epoch": 1.7593263536643262, + "grad_norm": 2.296875, + "learning_rate": 3.619764877718438e-06, + "loss": 1.1024, + "step": 8880 + }, + { + "epoch": 1.7595262487194223, + "grad_norm": 2.28125, + "learning_rate": 3.6187520533760034e-06, + "loss": 0.9499, + "step": 8881 + }, + { + "epoch": 1.7597261437745184, + "grad_norm": 2.203125, + "learning_rate": 3.6177392903884545e-06, + "loss": 1.0295, + "step": 8882 + }, + { + "epoch": 1.7599260388296143, + "grad_norm": 2.109375, + "learning_rate": 3.616726588800778e-06, + "loss": 0.9496, + "step": 8883 + }, + { + "epoch": 1.7601259338847104, + "grad_norm": 2.015625, + "learning_rate": 3.61571394865796e-06, + "loss": 0.9193, + "step": 8884 + }, + { + "epoch": 1.7603258289398065, + "grad_norm": 2.171875, + "learning_rate": 3.6147013700049793e-06, + "loss": 0.974, + "step": 8885 + }, + { + "epoch": 1.7605257239949026, + "grad_norm": 2.015625, + "learning_rate": 3.6136888528868177e-06, + "loss": 0.9011, + "step": 8886 + }, + { + "epoch": 1.7607256190499987, + "grad_norm": 2.09375, + "learning_rate": 3.612676397348447e-06, + "loss": 0.9091, + "step": 8887 + }, + { + "epoch": 1.7609255141050948, + "grad_norm": 2.03125, + "learning_rate": 3.6116640034348426e-06, + "loss": 0.9124, + "step": 8888 + }, + { + "epoch": 1.761125409160191, + "grad_norm": 2.046875, + "learning_rate": 3.610651671190977e-06, + "loss": 0.9292, + "step": 8889 + }, + { + "epoch": 1.761325304215287, + "grad_norm": 2.0625, + "learning_rate": 3.609639400661814e-06, + "loss": 0.9587, + "step": 8890 + }, + { + "epoch": 1.761525199270383, + "grad_norm": 2.203125, + "learning_rate": 3.60862719189232e-06, + "loss": 0.9473, + "step": 8891 + }, + { + "epoch": 1.7617250943254792, + "grad_norm": 2.1875, + "learning_rate": 3.6076150449274595e-06, + "loss": 0.9856, + "step": 8892 + }, + { + "epoch": 1.7619249893805753, + "grad_norm": 2.265625, + "learning_rate": 3.6066029598121883e-06, + "loss": 0.9389, + "step": 8893 + }, + { + "epoch": 1.7621248844356714, + "grad_norm": 2.21875, + "learning_rate": 3.605590936591466e-06, + "loss": 0.9567, + "step": 8894 + }, + { + "epoch": 1.7623247794907675, + "grad_norm": 2.140625, + "learning_rate": 3.6045789753102443e-06, + "loss": 0.9552, + "step": 8895 + }, + { + "epoch": 1.7625246745458634, + "grad_norm": 2.25, + "learning_rate": 3.6035670760134756e-06, + "loss": 1.0126, + "step": 8896 + }, + { + "epoch": 1.7627245696009595, + "grad_norm": 2.203125, + "learning_rate": 3.6025552387461086e-06, + "loss": 0.9932, + "step": 8897 + }, + { + "epoch": 1.7629244646560556, + "grad_norm": 2.09375, + "learning_rate": 3.6015434635530888e-06, + "loss": 1.0051, + "step": 8898 + }, + { + "epoch": 1.7631243597111517, + "grad_norm": 2.15625, + "learning_rate": 3.6005317504793604e-06, + "loss": 0.9816, + "step": 8899 + }, + { + "epoch": 1.7633242547662478, + "grad_norm": 2.125, + "learning_rate": 3.59952009956986e-06, + "loss": 0.9812, + "step": 8900 + }, + { + "epoch": 1.7635241498213436, + "grad_norm": 2.34375, + "learning_rate": 3.5985085108695285e-06, + "loss": 0.975, + "step": 8901 + }, + { + "epoch": 1.7637240448764397, + "grad_norm": 2.234375, + "learning_rate": 3.5974969844232974e-06, + "loss": 0.9464, + "step": 8902 + }, + { + "epoch": 1.7639239399315358, + "grad_norm": 2.1875, + "learning_rate": 3.5964855202761016e-06, + "loss": 0.939, + "step": 8903 + }, + { + "epoch": 1.764123834986632, + "grad_norm": 2.078125, + "learning_rate": 3.5954741184728698e-06, + "loss": 0.8953, + "step": 8904 + }, + { + "epoch": 1.764323730041728, + "grad_norm": 2.3125, + "learning_rate": 3.5944627790585264e-06, + "loss": 1.086, + "step": 8905 + }, + { + "epoch": 1.7645236250968241, + "grad_norm": 2.046875, + "learning_rate": 3.5934515020779974e-06, + "loss": 0.931, + "step": 8906 + }, + { + "epoch": 1.7647235201519202, + "grad_norm": 2.21875, + "learning_rate": 3.592440287576201e-06, + "loss": 0.9439, + "step": 8907 + }, + { + "epoch": 1.7649234152070163, + "grad_norm": 2.140625, + "learning_rate": 3.5914291355980573e-06, + "loss": 0.9707, + "step": 8908 + }, + { + "epoch": 1.7651233102621124, + "grad_norm": 2.203125, + "learning_rate": 3.5904180461884818e-06, + "loss": 0.9281, + "step": 8909 + }, + { + "epoch": 1.7653232053172085, + "grad_norm": 2.109375, + "learning_rate": 3.589407019392385e-06, + "loss": 0.9643, + "step": 8910 + }, + { + "epoch": 1.7655231003723046, + "grad_norm": 2.125, + "learning_rate": 3.5883960552546783e-06, + "loss": 0.9312, + "step": 8911 + }, + { + "epoch": 1.7657229954274007, + "grad_norm": 2.078125, + "learning_rate": 3.587385153820269e-06, + "loss": 0.9281, + "step": 8912 + }, + { + "epoch": 1.7659228904824968, + "grad_norm": 2.09375, + "learning_rate": 3.586374315134058e-06, + "loss": 1.0413, + "step": 8913 + }, + { + "epoch": 1.7661227855375927, + "grad_norm": 2.203125, + "learning_rate": 3.5853635392409504e-06, + "loss": 0.9381, + "step": 8914 + }, + { + "epoch": 1.7663226805926888, + "grad_norm": 2.078125, + "learning_rate": 3.5843528261858434e-06, + "loss": 0.9366, + "step": 8915 + }, + { + "epoch": 1.766522575647785, + "grad_norm": 2.171875, + "learning_rate": 3.5833421760136323e-06, + "loss": 0.9485, + "step": 8916 + }, + { + "epoch": 1.766722470702881, + "grad_norm": 2.21875, + "learning_rate": 3.582331588769211e-06, + "loss": 0.9681, + "step": 8917 + }, + { + "epoch": 1.766922365757977, + "grad_norm": 2.109375, + "learning_rate": 3.5813210644974685e-06, + "loss": 0.8463, + "step": 8918 + }, + { + "epoch": 1.767122260813073, + "grad_norm": 2.046875, + "learning_rate": 3.5803106032432923e-06, + "loss": 0.9746, + "step": 8919 + }, + { + "epoch": 1.767322155868169, + "grad_norm": 2.125, + "learning_rate": 3.5793002050515686e-06, + "loss": 0.8872, + "step": 8920 + }, + { + "epoch": 1.7675220509232652, + "grad_norm": 2.125, + "learning_rate": 3.578289869967177e-06, + "loss": 0.8397, + "step": 8921 + }, + { + "epoch": 1.7677219459783613, + "grad_norm": 2.0625, + "learning_rate": 3.5772795980349976e-06, + "loss": 0.993, + "step": 8922 + }, + { + "epoch": 1.7679218410334574, + "grad_norm": 2.0625, + "learning_rate": 3.576269389299908e-06, + "loss": 0.9071, + "step": 8923 + }, + { + "epoch": 1.7681217360885535, + "grad_norm": 2.28125, + "learning_rate": 3.5752592438067785e-06, + "loss": 1.0297, + "step": 8924 + }, + { + "epoch": 1.7683216311436496, + "grad_norm": 2.140625, + "learning_rate": 3.5742491616004828e-06, + "loss": 0.9557, + "step": 8925 + }, + { + "epoch": 1.7685215261987457, + "grad_norm": 2.125, + "learning_rate": 3.573239142725885e-06, + "loss": 0.9459, + "step": 8926 + }, + { + "epoch": 1.7687214212538418, + "grad_norm": 2.078125, + "learning_rate": 3.572229187227853e-06, + "loss": 1.0032, + "step": 8927 + }, + { + "epoch": 1.768921316308938, + "grad_norm": 2.125, + "learning_rate": 3.571219295151249e-06, + "loss": 1.0218, + "step": 8928 + }, + { + "epoch": 1.769121211364034, + "grad_norm": 2.203125, + "learning_rate": 3.57020946654093e-06, + "loss": 1.0085, + "step": 8929 + }, + { + "epoch": 1.76932110641913, + "grad_norm": 2.296875, + "learning_rate": 3.5691997014417558e-06, + "loss": 1.0393, + "step": 8930 + }, + { + "epoch": 1.7695210014742262, + "grad_norm": 2.171875, + "learning_rate": 3.568189999898576e-06, + "loss": 0.9354, + "step": 8931 + }, + { + "epoch": 1.769720896529322, + "grad_norm": 2.1875, + "learning_rate": 3.567180361956245e-06, + "loss": 0.9892, + "step": 8932 + }, + { + "epoch": 1.7699207915844182, + "grad_norm": 2.203125, + "learning_rate": 3.5661707876596073e-06, + "loss": 0.9455, + "step": 8933 + }, + { + "epoch": 1.7701206866395143, + "grad_norm": 2.1875, + "learning_rate": 3.565161277053511e-06, + "loss": 0.9616, + "step": 8934 + }, + { + "epoch": 1.7703205816946104, + "grad_norm": 2.078125, + "learning_rate": 3.5641518301827983e-06, + "loss": 0.9205, + "step": 8935 + }, + { + "epoch": 1.7705204767497063, + "grad_norm": 2.1875, + "learning_rate": 3.563142447092307e-06, + "loss": 1.0233, + "step": 8936 + }, + { + "epoch": 1.7707203718048024, + "grad_norm": 2.3125, + "learning_rate": 3.5621331278268754e-06, + "loss": 1.0387, + "step": 8937 + }, + { + "epoch": 1.7709202668598985, + "grad_norm": 2.09375, + "learning_rate": 3.5611238724313357e-06, + "loss": 0.9357, + "step": 8938 + }, + { + "epoch": 1.7711201619149946, + "grad_norm": 2.078125, + "learning_rate": 3.5601146809505204e-06, + "loss": 0.8826, + "step": 8939 + }, + { + "epoch": 1.7713200569700907, + "grad_norm": 2.25, + "learning_rate": 3.5591055534292573e-06, + "loss": 1.0714, + "step": 8940 + }, + { + "epoch": 1.7715199520251868, + "grad_norm": 2.140625, + "learning_rate": 3.5580964899123715e-06, + "loss": 0.9932, + "step": 8941 + }, + { + "epoch": 1.7717198470802829, + "grad_norm": 2.078125, + "learning_rate": 3.557087490444685e-06, + "loss": 0.8739, + "step": 8942 + }, + { + "epoch": 1.771919742135379, + "grad_norm": 2.140625, + "learning_rate": 3.55607855507102e-06, + "loss": 1.0024, + "step": 8943 + }, + { + "epoch": 1.772119637190475, + "grad_norm": 2.109375, + "learning_rate": 3.55506968383619e-06, + "loss": 0.9236, + "step": 8944 + }, + { + "epoch": 1.7723195322455711, + "grad_norm": 2.09375, + "learning_rate": 3.5540608767850106e-06, + "loss": 0.985, + "step": 8945 + }, + { + "epoch": 1.7725194273006672, + "grad_norm": 2.28125, + "learning_rate": 3.5530521339622923e-06, + "loss": 0.985, + "step": 8946 + }, + { + "epoch": 1.7727193223557633, + "grad_norm": 2.328125, + "learning_rate": 3.5520434554128437e-06, + "loss": 1.0439, + "step": 8947 + }, + { + "epoch": 1.7729192174108594, + "grad_norm": 2.03125, + "learning_rate": 3.5510348411814705e-06, + "loss": 1.0106, + "step": 8948 + }, + { + "epoch": 1.7731191124659553, + "grad_norm": 2.0625, + "learning_rate": 3.5500262913129745e-06, + "loss": 0.9443, + "step": 8949 + }, + { + "epoch": 1.7733190075210514, + "grad_norm": 2.09375, + "learning_rate": 3.5490178058521553e-06, + "loss": 0.9458, + "step": 8950 + }, + { + "epoch": 1.7735189025761475, + "grad_norm": 2.140625, + "learning_rate": 3.548009384843811e-06, + "loss": 0.9217, + "step": 8951 + }, + { + "epoch": 1.7737187976312436, + "grad_norm": 2.21875, + "learning_rate": 3.547001028332735e-06, + "loss": 0.9978, + "step": 8952 + }, + { + "epoch": 1.7739186926863397, + "grad_norm": 2.28125, + "learning_rate": 3.545992736363717e-06, + "loss": 1.0665, + "step": 8953 + }, + { + "epoch": 1.7741185877414356, + "grad_norm": 2.265625, + "learning_rate": 3.544984508981548e-06, + "loss": 0.9647, + "step": 8954 + }, + { + "epoch": 1.7743184827965317, + "grad_norm": 2.0625, + "learning_rate": 3.5439763462310107e-06, + "loss": 0.8975, + "step": 8955 + }, + { + "epoch": 1.7745183778516278, + "grad_norm": 2.171875, + "learning_rate": 3.5429682481568894e-06, + "loss": 1.015, + "step": 8956 + }, + { + "epoch": 1.774718272906724, + "grad_norm": 2.15625, + "learning_rate": 3.5419602148039618e-06, + "loss": 0.8829, + "step": 8957 + }, + { + "epoch": 1.77491816796182, + "grad_norm": 2.21875, + "learning_rate": 3.5409522462170054e-06, + "loss": 0.9894, + "step": 8958 + }, + { + "epoch": 1.775118063016916, + "grad_norm": 2.34375, + "learning_rate": 3.539944342440796e-06, + "loss": 1.0176, + "step": 8959 + }, + { + "epoch": 1.7753179580720122, + "grad_norm": 2.40625, + "learning_rate": 3.5389365035201016e-06, + "loss": 0.9431, + "step": 8960 + }, + { + "epoch": 1.7755178531271083, + "grad_norm": 2.25, + "learning_rate": 3.537928729499694e-06, + "loss": 0.9652, + "step": 8961 + }, + { + "epoch": 1.7757177481822044, + "grad_norm": 2.140625, + "learning_rate": 3.536921020424334e-06, + "loss": 0.8999, + "step": 8962 + }, + { + "epoch": 1.7759176432373005, + "grad_norm": 2.21875, + "learning_rate": 3.5359133763387866e-06, + "loss": 1.0306, + "step": 8963 + }, + { + "epoch": 1.7761175382923966, + "grad_norm": 2.34375, + "learning_rate": 3.53490579728781e-06, + "loss": 1.0175, + "step": 8964 + }, + { + "epoch": 1.7763174333474927, + "grad_norm": 2.171875, + "learning_rate": 3.533898283316162e-06, + "loss": 0.9064, + "step": 8965 + }, + { + "epoch": 1.7765173284025888, + "grad_norm": 2.109375, + "learning_rate": 3.5328908344685952e-06, + "loss": 0.9434, + "step": 8966 + }, + { + "epoch": 1.7767172234576847, + "grad_norm": 2.234375, + "learning_rate": 3.5318834507898607e-06, + "loss": 0.9599, + "step": 8967 + }, + { + "epoch": 1.7769171185127808, + "grad_norm": 2.109375, + "learning_rate": 3.5308761323247077e-06, + "loss": 0.9761, + "step": 8968 + }, + { + "epoch": 1.7771170135678769, + "grad_norm": 2.109375, + "learning_rate": 3.5298688791178788e-06, + "loss": 0.9672, + "step": 8969 + }, + { + "epoch": 1.777316908622973, + "grad_norm": 2.171875, + "learning_rate": 3.528861691214117e-06, + "loss": 0.9342, + "step": 8970 + }, + { + "epoch": 1.7775168036780689, + "grad_norm": 2.171875, + "learning_rate": 3.5278545686581633e-06, + "loss": 1.0538, + "step": 8971 + }, + { + "epoch": 1.777716698733165, + "grad_norm": 2.078125, + "learning_rate": 3.526847511494751e-06, + "loss": 0.9025, + "step": 8972 + }, + { + "epoch": 1.777916593788261, + "grad_norm": 2.0625, + "learning_rate": 3.5258405197686154e-06, + "loss": 0.9172, + "step": 8973 + }, + { + "epoch": 1.7781164888433572, + "grad_norm": 2.109375, + "learning_rate": 3.524833593524487e-06, + "loss": 0.8952, + "step": 8974 + }, + { + "epoch": 1.7783163838984533, + "grad_norm": 2.21875, + "learning_rate": 3.523826732807092e-06, + "loss": 0.9515, + "step": 8975 + }, + { + "epoch": 1.7785162789535494, + "grad_norm": 2.109375, + "learning_rate": 3.5228199376611564e-06, + "loss": 0.923, + "step": 8976 + }, + { + "epoch": 1.7787161740086455, + "grad_norm": 2.15625, + "learning_rate": 3.521813208131401e-06, + "loss": 0.9622, + "step": 8977 + }, + { + "epoch": 1.7789160690637416, + "grad_norm": 2.359375, + "learning_rate": 3.520806544262545e-06, + "loss": 1.0126, + "step": 8978 + }, + { + "epoch": 1.7791159641188377, + "grad_norm": 2.34375, + "learning_rate": 3.519799946099305e-06, + "loss": 0.8961, + "step": 8979 + }, + { + "epoch": 1.7793158591739338, + "grad_norm": 2.125, + "learning_rate": 3.518793413686392e-06, + "loss": 1.0055, + "step": 8980 + }, + { + "epoch": 1.7795157542290299, + "grad_norm": 2.34375, + "learning_rate": 3.5177869470685175e-06, + "loss": 0.9555, + "step": 8981 + }, + { + "epoch": 1.779715649284126, + "grad_norm": 2.25, + "learning_rate": 3.5167805462903903e-06, + "loss": 0.9801, + "step": 8982 + }, + { + "epoch": 1.779915544339222, + "grad_norm": 2.15625, + "learning_rate": 3.5157742113967113e-06, + "loss": 0.9423, + "step": 8983 + }, + { + "epoch": 1.780115439394318, + "grad_norm": 2.109375, + "learning_rate": 3.5147679424321848e-06, + "loss": 0.9504, + "step": 8984 + }, + { + "epoch": 1.780315334449414, + "grad_norm": 2.1875, + "learning_rate": 3.5137617394415057e-06, + "loss": 1.0039, + "step": 8985 + }, + { + "epoch": 1.7805152295045101, + "grad_norm": 2.09375, + "learning_rate": 3.512755602469372e-06, + "loss": 1.0008, + "step": 8986 + }, + { + "epoch": 1.7807151245596062, + "grad_norm": 2.109375, + "learning_rate": 3.5117495315604766e-06, + "loss": 0.9502, + "step": 8987 + }, + { + "epoch": 1.7809150196147023, + "grad_norm": 2.125, + "learning_rate": 3.5107435267595067e-06, + "loss": 0.9923, + "step": 8988 + }, + { + "epoch": 1.7811149146697982, + "grad_norm": 2.296875, + "learning_rate": 3.5097375881111506e-06, + "loss": 1.0216, + "step": 8989 + }, + { + "epoch": 1.7813148097248943, + "grad_norm": 2.15625, + "learning_rate": 3.508731715660093e-06, + "loss": 0.9487, + "step": 8990 + }, + { + "epoch": 1.7815147047799904, + "grad_norm": 2.21875, + "learning_rate": 3.5077259094510118e-06, + "loss": 1.0448, + "step": 8991 + }, + { + "epoch": 1.7817145998350865, + "grad_norm": 2.1875, + "learning_rate": 3.5067201695285884e-06, + "loss": 0.9856, + "step": 8992 + }, + { + "epoch": 1.7819144948901826, + "grad_norm": 2.296875, + "learning_rate": 3.5057144959374934e-06, + "loss": 1.033, + "step": 8993 + }, + { + "epoch": 1.7821143899452787, + "grad_norm": 2.15625, + "learning_rate": 3.5047088887224024e-06, + "loss": 0.9622, + "step": 8994 + }, + { + "epoch": 1.7823142850003748, + "grad_norm": 1.9765625, + "learning_rate": 3.5037033479279813e-06, + "loss": 0.8222, + "step": 8995 + }, + { + "epoch": 1.782514180055471, + "grad_norm": 2.140625, + "learning_rate": 3.502697873598899e-06, + "loss": 0.9553, + "step": 8996 + }, + { + "epoch": 1.782714075110567, + "grad_norm": 2.109375, + "learning_rate": 3.5016924657798166e-06, + "loss": 0.9852, + "step": 8997 + }, + { + "epoch": 1.782913970165663, + "grad_norm": 2.078125, + "learning_rate": 3.5006871245153947e-06, + "loss": 0.9359, + "step": 8998 + }, + { + "epoch": 1.7831138652207592, + "grad_norm": 2.171875, + "learning_rate": 3.499681849850291e-06, + "loss": 0.9947, + "step": 8999 + }, + { + "epoch": 1.7833137602758553, + "grad_norm": 2.140625, + "learning_rate": 3.4986766418291584e-06, + "loss": 1.0271, + "step": 9000 + }, + { + "epoch": 1.7833137602758553, + "eval_loss": 0.897596001625061, + "eval_runtime": 593.4535, + "eval_samples_per_second": 3.603, + "eval_steps_per_second": 3.603, + "step": 9000 + }, + { + "epoch": 1.7835136553309514, + "grad_norm": 2.25, + "learning_rate": 3.4976715004966492e-06, + "loss": 1.1224, + "step": 9001 + }, + { + "epoch": 1.7837135503860473, + "grad_norm": 2.140625, + "learning_rate": 3.4966664258974128e-06, + "loss": 0.8661, + "step": 9002 + }, + { + "epoch": 1.7839134454411434, + "grad_norm": 2.0625, + "learning_rate": 3.4956614180760918e-06, + "loss": 0.8782, + "step": 9003 + }, + { + "epoch": 1.7841133404962395, + "grad_norm": 2.1875, + "learning_rate": 3.4946564770773305e-06, + "loss": 0.9052, + "step": 9004 + }, + { + "epoch": 1.7843132355513356, + "grad_norm": 2.296875, + "learning_rate": 3.493651602945768e-06, + "loss": 0.9965, + "step": 9005 + }, + { + "epoch": 1.7845131306064315, + "grad_norm": 2.125, + "learning_rate": 3.49264679572604e-06, + "loss": 0.9936, + "step": 9006 + }, + { + "epoch": 1.7847130256615276, + "grad_norm": 2.15625, + "learning_rate": 3.4916420554627806e-06, + "loss": 0.9557, + "step": 9007 + }, + { + "epoch": 1.7849129207166237, + "grad_norm": 2.21875, + "learning_rate": 3.49063738220062e-06, + "loss": 0.8943, + "step": 9008 + }, + { + "epoch": 1.7851128157717198, + "grad_norm": 2.15625, + "learning_rate": 3.4896327759841854e-06, + "loss": 0.9469, + "step": 9009 + }, + { + "epoch": 1.7853127108268159, + "grad_norm": 2.234375, + "learning_rate": 3.4886282368581014e-06, + "loss": 1.0564, + "step": 9010 + }, + { + "epoch": 1.785512605881912, + "grad_norm": 2.1875, + "learning_rate": 3.4876237648669896e-06, + "loss": 0.9653, + "step": 9011 + }, + { + "epoch": 1.785712500937008, + "grad_norm": 2.375, + "learning_rate": 3.486619360055469e-06, + "loss": 1.0446, + "step": 9012 + }, + { + "epoch": 1.7859123959921042, + "grad_norm": 2.265625, + "learning_rate": 3.4856150224681557e-06, + "loss": 0.9555, + "step": 9013 + }, + { + "epoch": 1.7861122910472003, + "grad_norm": 2.265625, + "learning_rate": 3.48461075214966e-06, + "loss": 0.9931, + "step": 9014 + }, + { + "epoch": 1.7863121861022964, + "grad_norm": 2.296875, + "learning_rate": 3.4836065491445935e-06, + "loss": 1.013, + "step": 9015 + }, + { + "epoch": 1.7865120811573925, + "grad_norm": 2.15625, + "learning_rate": 3.482602413497562e-06, + "loss": 0.9331, + "step": 9016 + }, + { + "epoch": 1.7867119762124886, + "grad_norm": 2.171875, + "learning_rate": 3.481598345253169e-06, + "loss": 1.0055, + "step": 9017 + }, + { + "epoch": 1.7869118712675847, + "grad_norm": 2.28125, + "learning_rate": 3.480594344456016e-06, + "loss": 1.0414, + "step": 9018 + }, + { + "epoch": 1.7871117663226805, + "grad_norm": 2.34375, + "learning_rate": 3.4795904111506988e-06, + "loss": 1.0796, + "step": 9019 + }, + { + "epoch": 1.7873116613777766, + "grad_norm": 2.1875, + "learning_rate": 3.4785865453818135e-06, + "loss": 0.9932, + "step": 9020 + }, + { + "epoch": 1.7875115564328727, + "grad_norm": 2.125, + "learning_rate": 3.477582747193953e-06, + "loss": 1.0241, + "step": 9021 + }, + { + "epoch": 1.7877114514879688, + "grad_norm": 2.3125, + "learning_rate": 3.4765790166317036e-06, + "loss": 0.9822, + "step": 9022 + }, + { + "epoch": 1.787911346543065, + "grad_norm": 2.203125, + "learning_rate": 3.475575353739651e-06, + "loss": 0.9923, + "step": 9023 + }, + { + "epoch": 1.7881112415981608, + "grad_norm": 2.125, + "learning_rate": 3.4745717585623778e-06, + "loss": 0.9738, + "step": 9024 + }, + { + "epoch": 1.788311136653257, + "grad_norm": 2.1875, + "learning_rate": 3.473568231144465e-06, + "loss": 1.0057, + "step": 9025 + }, + { + "epoch": 1.788511031708353, + "grad_norm": 2.0625, + "learning_rate": 3.4725647715304876e-06, + "loss": 0.9126, + "step": 9026 + }, + { + "epoch": 1.7887109267634491, + "grad_norm": 2.125, + "learning_rate": 3.4715613797650216e-06, + "loss": 0.9076, + "step": 9027 + }, + { + "epoch": 1.7889108218185452, + "grad_norm": 2.078125, + "learning_rate": 3.4705580558926337e-06, + "loss": 0.9161, + "step": 9028 + }, + { + "epoch": 1.7891107168736413, + "grad_norm": 2.21875, + "learning_rate": 3.4695547999578946e-06, + "loss": 0.9221, + "step": 9029 + }, + { + "epoch": 1.7893106119287374, + "grad_norm": 2.15625, + "learning_rate": 3.468551612005368e-06, + "loss": 0.9742, + "step": 9030 + }, + { + "epoch": 1.7895105069838335, + "grad_norm": 2.265625, + "learning_rate": 3.4675484920796143e-06, + "loss": 1.0193, + "step": 9031 + }, + { + "epoch": 1.7897104020389296, + "grad_norm": 2.09375, + "learning_rate": 3.4665454402251937e-06, + "loss": 0.9693, + "step": 9032 + }, + { + "epoch": 1.7899102970940257, + "grad_norm": 2.1875, + "learning_rate": 3.4655424564866615e-06, + "loss": 0.9386, + "step": 9033 + }, + { + "epoch": 1.7901101921491218, + "grad_norm": 2.1875, + "learning_rate": 3.464539540908568e-06, + "loss": 1.0275, + "step": 9034 + }, + { + "epoch": 1.790310087204218, + "grad_norm": 2.09375, + "learning_rate": 3.4635366935354654e-06, + "loss": 0.9418, + "step": 9035 + }, + { + "epoch": 1.790509982259314, + "grad_norm": 2.21875, + "learning_rate": 3.462533914411898e-06, + "loss": 0.9292, + "step": 9036 + }, + { + "epoch": 1.79070987731441, + "grad_norm": 2.0625, + "learning_rate": 3.4615312035824093e-06, + "loss": 0.928, + "step": 9037 + }, + { + "epoch": 1.790909772369506, + "grad_norm": 2.140625, + "learning_rate": 3.4605285610915417e-06, + "loss": 0.9658, + "step": 9038 + }, + { + "epoch": 1.791109667424602, + "grad_norm": 2.0625, + "learning_rate": 3.4595259869838306e-06, + "loss": 0.9019, + "step": 9039 + }, + { + "epoch": 1.7913095624796982, + "grad_norm": 2.109375, + "learning_rate": 3.4585234813038095e-06, + "loss": 1.0131, + "step": 9040 + }, + { + "epoch": 1.791509457534794, + "grad_norm": 2.15625, + "learning_rate": 3.457521044096012e-06, + "loss": 0.9699, + "step": 9041 + }, + { + "epoch": 1.7917093525898902, + "grad_norm": 2.125, + "learning_rate": 3.4565186754049647e-06, + "loss": 1.0297, + "step": 9042 + }, + { + "epoch": 1.7919092476449863, + "grad_norm": 2.171875, + "learning_rate": 3.455516375275192e-06, + "loss": 0.9788, + "step": 9043 + }, + { + "epoch": 1.7921091427000824, + "grad_norm": 2.234375, + "learning_rate": 3.4545141437512186e-06, + "loss": 0.9358, + "step": 9044 + }, + { + "epoch": 1.7923090377551785, + "grad_norm": 2.328125, + "learning_rate": 3.4535119808775608e-06, + "loss": 1.0411, + "step": 9045 + }, + { + "epoch": 1.7925089328102746, + "grad_norm": 2.25, + "learning_rate": 3.4525098866987366e-06, + "loss": 1.0401, + "step": 9046 + }, + { + "epoch": 1.7927088278653707, + "grad_norm": 2.203125, + "learning_rate": 3.451507861259257e-06, + "loss": 1.0765, + "step": 9047 + }, + { + "epoch": 1.7929087229204668, + "grad_norm": 2.0, + "learning_rate": 3.4505059046036326e-06, + "loss": 0.9126, + "step": 9048 + }, + { + "epoch": 1.7931086179755629, + "grad_norm": 2.3125, + "learning_rate": 3.4495040167763717e-06, + "loss": 0.8841, + "step": 9049 + }, + { + "epoch": 1.793308513030659, + "grad_norm": 2.203125, + "learning_rate": 3.4485021978219755e-06, + "loss": 0.9835, + "step": 9050 + }, + { + "epoch": 1.793508408085755, + "grad_norm": 2.078125, + "learning_rate": 3.447500447784946e-06, + "loss": 1.0108, + "step": 9051 + }, + { + "epoch": 1.7937083031408512, + "grad_norm": 2.234375, + "learning_rate": 3.4464987667097818e-06, + "loss": 1.069, + "step": 9052 + }, + { + "epoch": 1.7939081981959473, + "grad_norm": 2.125, + "learning_rate": 3.4454971546409777e-06, + "loss": 0.9084, + "step": 9053 + }, + { + "epoch": 1.7941080932510434, + "grad_norm": 2.140625, + "learning_rate": 3.4444956116230216e-06, + "loss": 0.9625, + "step": 9054 + }, + { + "epoch": 1.7943079883061392, + "grad_norm": 2.1875, + "learning_rate": 3.443494137700405e-06, + "loss": 0.9626, + "step": 9055 + }, + { + "epoch": 1.7945078833612353, + "grad_norm": 2.28125, + "learning_rate": 3.4424927329176128e-06, + "loss": 1.0615, + "step": 9056 + }, + { + "epoch": 1.7947077784163314, + "grad_norm": 2.09375, + "learning_rate": 3.441491397319126e-06, + "loss": 0.9225, + "step": 9057 + }, + { + "epoch": 1.7949076734714275, + "grad_norm": 2.03125, + "learning_rate": 3.4404901309494264e-06, + "loss": 0.9529, + "step": 9058 + }, + { + "epoch": 1.7951075685265234, + "grad_norm": 2.25, + "learning_rate": 3.4394889338529874e-06, + "loss": 1.0643, + "step": 9059 + }, + { + "epoch": 1.7953074635816195, + "grad_norm": 2.125, + "learning_rate": 3.4384878060742833e-06, + "loss": 0.9179, + "step": 9060 + }, + { + "epoch": 1.7955073586367156, + "grad_norm": 2.171875, + "learning_rate": 3.437486747657785e-06, + "loss": 0.9156, + "step": 9061 + }, + { + "epoch": 1.7957072536918117, + "grad_norm": 2.125, + "learning_rate": 3.4364857586479576e-06, + "loss": 0.9051, + "step": 9062 + }, + { + "epoch": 1.7959071487469078, + "grad_norm": 2.109375, + "learning_rate": 3.435484839089266e-06, + "loss": 0.99, + "step": 9063 + }, + { + "epoch": 1.796107043802004, + "grad_norm": 2.3125, + "learning_rate": 3.4344839890261718e-06, + "loss": 0.9345, + "step": 9064 + }, + { + "epoch": 1.7963069388571, + "grad_norm": 2.0625, + "learning_rate": 3.43348320850313e-06, + "loss": 0.8536, + "step": 9065 + }, + { + "epoch": 1.7965068339121961, + "grad_norm": 2.1875, + "learning_rate": 3.4324824975645984e-06, + "loss": 0.8766, + "step": 9066 + }, + { + "epoch": 1.7967067289672922, + "grad_norm": 2.109375, + "learning_rate": 3.4314818562550254e-06, + "loss": 1.0165, + "step": 9067 + }, + { + "epoch": 1.7969066240223883, + "grad_norm": 2.171875, + "learning_rate": 3.430481284618861e-06, + "loss": 0.8893, + "step": 9068 + }, + { + "epoch": 1.7971065190774844, + "grad_norm": 2.171875, + "learning_rate": 3.429480782700551e-06, + "loss": 1.0252, + "step": 9069 + }, + { + "epoch": 1.7973064141325805, + "grad_norm": 2.25, + "learning_rate": 3.4284803505445363e-06, + "loss": 1.0497, + "step": 9070 + }, + { + "epoch": 1.7975063091876766, + "grad_norm": 2.140625, + "learning_rate": 3.4274799881952564e-06, + "loss": 0.9632, + "step": 9071 + }, + { + "epoch": 1.7977062042427725, + "grad_norm": 2.140625, + "learning_rate": 3.4264796956971486e-06, + "loss": 0.979, + "step": 9072 + }, + { + "epoch": 1.7979060992978686, + "grad_norm": 2.140625, + "learning_rate": 3.425479473094644e-06, + "loss": 1.0513, + "step": 9073 + }, + { + "epoch": 1.7981059943529647, + "grad_norm": 2.03125, + "learning_rate": 3.4244793204321727e-06, + "loss": 0.9315, + "step": 9074 + }, + { + "epoch": 1.7983058894080608, + "grad_norm": 2.140625, + "learning_rate": 3.423479237754162e-06, + "loss": 0.9543, + "step": 9075 + }, + { + "epoch": 1.798505784463157, + "grad_norm": 2.234375, + "learning_rate": 3.422479225105035e-06, + "loss": 1.0306, + "step": 9076 + }, + { + "epoch": 1.7987056795182528, + "grad_norm": 2.125, + "learning_rate": 3.4214792825292133e-06, + "loss": 0.9779, + "step": 9077 + }, + { + "epoch": 1.7989055745733489, + "grad_norm": 2.171875, + "learning_rate": 3.420479410071112e-06, + "loss": 1.0356, + "step": 9078 + }, + { + "epoch": 1.799105469628445, + "grad_norm": 2.09375, + "learning_rate": 3.419479607775147e-06, + "loss": 0.9675, + "step": 9079 + }, + { + "epoch": 1.799305364683541, + "grad_norm": 2.109375, + "learning_rate": 3.4184798756857294e-06, + "loss": 0.9125, + "step": 9080 + }, + { + "epoch": 1.7995052597386372, + "grad_norm": 2.203125, + "learning_rate": 3.4174802138472663e-06, + "loss": 1.0142, + "step": 9081 + }, + { + "epoch": 1.7997051547937333, + "grad_norm": 2.125, + "learning_rate": 3.416480622304163e-06, + "loss": 0.9753, + "step": 9082 + }, + { + "epoch": 1.7999050498488294, + "grad_norm": 2.296875, + "learning_rate": 3.4154811011008224e-06, + "loss": 1.0182, + "step": 9083 + }, + { + "epoch": 1.8001049449039255, + "grad_norm": 2.171875, + "learning_rate": 3.414481650281643e-06, + "loss": 0.9982, + "step": 9084 + }, + { + "epoch": 1.8003048399590216, + "grad_norm": 2.515625, + "learning_rate": 3.4134822698910176e-06, + "loss": 0.9966, + "step": 9085 + }, + { + "epoch": 1.8005047350141177, + "grad_norm": 2.21875, + "learning_rate": 3.4124829599733404e-06, + "loss": 0.9943, + "step": 9086 + }, + { + "epoch": 1.8007046300692138, + "grad_norm": 2.078125, + "learning_rate": 3.411483720573e-06, + "loss": 0.9798, + "step": 9087 + }, + { + "epoch": 1.8009045251243099, + "grad_norm": 2.078125, + "learning_rate": 3.4104845517343837e-06, + "loss": 0.8583, + "step": 9088 + }, + { + "epoch": 1.801104420179406, + "grad_norm": 2.171875, + "learning_rate": 3.4094854535018744e-06, + "loss": 1.0153, + "step": 9089 + }, + { + "epoch": 1.8013043152345019, + "grad_norm": 2.109375, + "learning_rate": 3.40848642591985e-06, + "loss": 0.9333, + "step": 9090 + }, + { + "epoch": 1.801504210289598, + "grad_norm": 2.1875, + "learning_rate": 3.4074874690326897e-06, + "loss": 1.0667, + "step": 9091 + }, + { + "epoch": 1.801704105344694, + "grad_norm": 2.078125, + "learning_rate": 3.406488582884766e-06, + "loss": 0.87, + "step": 9092 + }, + { + "epoch": 1.8019040003997902, + "grad_norm": 2.25, + "learning_rate": 3.4054897675204485e-06, + "loss": 1.0195, + "step": 9093 + }, + { + "epoch": 1.802103895454886, + "grad_norm": 2.109375, + "learning_rate": 3.404491022984105e-06, + "loss": 0.9611, + "step": 9094 + }, + { + "epoch": 1.8023037905099821, + "grad_norm": 2.09375, + "learning_rate": 3.403492349320101e-06, + "loss": 1.0533, + "step": 9095 + }, + { + "epoch": 1.8025036855650782, + "grad_norm": 2.25, + "learning_rate": 3.402493746572796e-06, + "loss": 1.0336, + "step": 9096 + }, + { + "epoch": 1.8027035806201743, + "grad_norm": 2.171875, + "learning_rate": 3.401495214786548e-06, + "loss": 0.9208, + "step": 9097 + }, + { + "epoch": 1.8029034756752704, + "grad_norm": 2.1875, + "learning_rate": 3.4004967540057117e-06, + "loss": 0.965, + "step": 9098 + }, + { + "epoch": 1.8031033707303665, + "grad_norm": 2.0625, + "learning_rate": 3.3994983642746384e-06, + "loss": 0.9449, + "step": 9099 + }, + { + "epoch": 1.8033032657854626, + "grad_norm": 2.21875, + "learning_rate": 3.3985000456376775e-06, + "loss": 0.9817, + "step": 9100 + }, + { + "epoch": 1.8035031608405587, + "grad_norm": 2.15625, + "learning_rate": 3.3975017981391735e-06, + "loss": 0.9605, + "step": 9101 + }, + { + "epoch": 1.8037030558956548, + "grad_norm": 2.078125, + "learning_rate": 3.396503621823467e-06, + "loss": 0.9228, + "step": 9102 + }, + { + "epoch": 1.803902950950751, + "grad_norm": 2.25, + "learning_rate": 3.3955055167349006e-06, + "loss": 1.0007, + "step": 9103 + }, + { + "epoch": 1.804102846005847, + "grad_norm": 2.25, + "learning_rate": 3.394507482917806e-06, + "loss": 0.9854, + "step": 9104 + }, + { + "epoch": 1.8043027410609431, + "grad_norm": 2.125, + "learning_rate": 3.393509520416519e-06, + "loss": 1.02, + "step": 9105 + }, + { + "epoch": 1.8045026361160392, + "grad_norm": 2.21875, + "learning_rate": 3.392511629275367e-06, + "loss": 0.8621, + "step": 9106 + }, + { + "epoch": 1.804702531171135, + "grad_norm": 2.234375, + "learning_rate": 3.3915138095386758e-06, + "loss": 0.9195, + "step": 9107 + }, + { + "epoch": 1.8049024262262312, + "grad_norm": 2.234375, + "learning_rate": 3.390516061250771e-06, + "loss": 0.9501, + "step": 9108 + }, + { + "epoch": 1.8051023212813273, + "grad_norm": 2.25, + "learning_rate": 3.38951838445597e-06, + "loss": 0.9412, + "step": 9109 + }, + { + "epoch": 1.8053022163364234, + "grad_norm": 2.109375, + "learning_rate": 3.3885207791985898e-06, + "loss": 1.0443, + "step": 9110 + }, + { + "epoch": 1.8055021113915195, + "grad_norm": 2.15625, + "learning_rate": 3.3875232455229455e-06, + "loss": 0.9549, + "step": 9111 + }, + { + "epoch": 1.8057020064466154, + "grad_norm": 2.328125, + "learning_rate": 3.3865257834733454e-06, + "loss": 0.9883, + "step": 9112 + }, + { + "epoch": 1.8059019015017115, + "grad_norm": 2.0625, + "learning_rate": 3.385528393094098e-06, + "loss": 0.9083, + "step": 9113 + }, + { + "epoch": 1.8061017965568076, + "grad_norm": 2.078125, + "learning_rate": 3.3845310744295078e-06, + "loss": 0.9786, + "step": 9114 + }, + { + "epoch": 1.8063016916119037, + "grad_norm": 2.15625, + "learning_rate": 3.383533827523876e-06, + "loss": 0.9428, + "step": 9115 + }, + { + "epoch": 1.8065015866669998, + "grad_norm": 2.171875, + "learning_rate": 3.3825366524214965e-06, + "loss": 0.9264, + "step": 9116 + }, + { + "epoch": 1.8067014817220959, + "grad_norm": 2.125, + "learning_rate": 3.3815395491666676e-06, + "loss": 0.9085, + "step": 9117 + }, + { + "epoch": 1.806901376777192, + "grad_norm": 2.203125, + "learning_rate": 3.3805425178036776e-06, + "loss": 1.0277, + "step": 9118 + }, + { + "epoch": 1.807101271832288, + "grad_norm": 2.078125, + "learning_rate": 3.379545558376816e-06, + "loss": 0.9798, + "step": 9119 + }, + { + "epoch": 1.8073011668873842, + "grad_norm": 2.1875, + "learning_rate": 3.378548670930369e-06, + "loss": 0.9551, + "step": 9120 + }, + { + "epoch": 1.8075010619424803, + "grad_norm": 2.453125, + "learning_rate": 3.3775518555086158e-06, + "loss": 1.0912, + "step": 9121 + }, + { + "epoch": 1.8077009569975764, + "grad_norm": 2.046875, + "learning_rate": 3.376555112155836e-06, + "loss": 0.9048, + "step": 9122 + }, + { + "epoch": 1.8079008520526725, + "grad_norm": 2.125, + "learning_rate": 3.3755584409163058e-06, + "loss": 0.937, + "step": 9123 + }, + { + "epoch": 1.8081007471077686, + "grad_norm": 2.21875, + "learning_rate": 3.3745618418342942e-06, + "loss": 0.9743, + "step": 9124 + }, + { + "epoch": 1.8083006421628645, + "grad_norm": 2.03125, + "learning_rate": 3.3735653149540737e-06, + "loss": 0.9448, + "step": 9125 + }, + { + "epoch": 1.8085005372179606, + "grad_norm": 2.109375, + "learning_rate": 3.372568860319907e-06, + "loss": 1.0084, + "step": 9126 + }, + { + "epoch": 1.8087004322730567, + "grad_norm": 2.125, + "learning_rate": 3.3715724779760586e-06, + "loss": 0.9859, + "step": 9127 + }, + { + "epoch": 1.8089003273281528, + "grad_norm": 2.203125, + "learning_rate": 3.3705761679667865e-06, + "loss": 0.9299, + "step": 9128 + }, + { + "epoch": 1.8091002223832486, + "grad_norm": 2.109375, + "learning_rate": 3.3695799303363463e-06, + "loss": 0.9784, + "step": 9129 + }, + { + "epoch": 1.8093001174383447, + "grad_norm": 2.3125, + "learning_rate": 3.3685837651289922e-06, + "loss": 1.0141, + "step": 9130 + }, + { + "epoch": 1.8095000124934408, + "grad_norm": 2.234375, + "learning_rate": 3.3675876723889735e-06, + "loss": 1.0483, + "step": 9131 + }, + { + "epoch": 1.809699907548537, + "grad_norm": 2.171875, + "learning_rate": 3.3665916521605346e-06, + "loss": 0.8994, + "step": 9132 + }, + { + "epoch": 1.809899802603633, + "grad_norm": 2.09375, + "learning_rate": 3.3655957044879207e-06, + "loss": 0.9049, + "step": 9133 + }, + { + "epoch": 1.8100996976587291, + "grad_norm": 2.21875, + "learning_rate": 3.364599829415372e-06, + "loss": 0.9387, + "step": 9134 + }, + { + "epoch": 1.8102995927138252, + "grad_norm": 2.1875, + "learning_rate": 3.3636040269871227e-06, + "loss": 0.9692, + "step": 9135 + }, + { + "epoch": 1.8104994877689213, + "grad_norm": 2.125, + "learning_rate": 3.3626082972474096e-06, + "loss": 0.9207, + "step": 9136 + }, + { + "epoch": 1.8106993828240174, + "grad_norm": 2.078125, + "learning_rate": 3.3616126402404594e-06, + "loss": 0.9487, + "step": 9137 + }, + { + "epoch": 1.8108992778791135, + "grad_norm": 2.140625, + "learning_rate": 3.360617056010501e-06, + "loss": 0.9734, + "step": 9138 + }, + { + "epoch": 1.8110991729342096, + "grad_norm": 2.125, + "learning_rate": 3.3596215446017587e-06, + "loss": 1.07, + "step": 9139 + }, + { + "epoch": 1.8112990679893057, + "grad_norm": 2.1875, + "learning_rate": 3.358626106058451e-06, + "loss": 1.0255, + "step": 9140 + }, + { + "epoch": 1.8114989630444018, + "grad_norm": 2.125, + "learning_rate": 3.357630740424797e-06, + "loss": 0.9221, + "step": 9141 + }, + { + "epoch": 1.8116988580994977, + "grad_norm": 2.140625, + "learning_rate": 3.356635447745011e-06, + "loss": 0.9733, + "step": 9142 + }, + { + "epoch": 1.8118987531545938, + "grad_norm": 2.421875, + "learning_rate": 3.3556402280633017e-06, + "loss": 1.0517, + "step": 9143 + }, + { + "epoch": 1.81209864820969, + "grad_norm": 2.1875, + "learning_rate": 3.3546450814238786e-06, + "loss": 1.0527, + "step": 9144 + }, + { + "epoch": 1.812298543264786, + "grad_norm": 2.1875, + "learning_rate": 3.3536500078709445e-06, + "loss": 0.9925, + "step": 9145 + }, + { + "epoch": 1.812498438319882, + "grad_norm": 2.125, + "learning_rate": 3.352655007448703e-06, + "loss": 0.9813, + "step": 9146 + }, + { + "epoch": 1.812698333374978, + "grad_norm": 2.171875, + "learning_rate": 3.3516600802013487e-06, + "loss": 1.0226, + "step": 9147 + }, + { + "epoch": 1.812898228430074, + "grad_norm": 2.03125, + "learning_rate": 3.350665226173078e-06, + "loss": 0.9334, + "step": 9148 + }, + { + "epoch": 1.8130981234851702, + "grad_norm": 2.203125, + "learning_rate": 3.3496704454080807e-06, + "loss": 0.9609, + "step": 9149 + }, + { + "epoch": 1.8132980185402663, + "grad_norm": 2.078125, + "learning_rate": 3.3486757379505465e-06, + "loss": 0.8925, + "step": 9150 + }, + { + "epoch": 1.8134979135953624, + "grad_norm": 2.1875, + "learning_rate": 3.3476811038446603e-06, + "loss": 1.0665, + "step": 9151 + }, + { + "epoch": 1.8136978086504585, + "grad_norm": 2.375, + "learning_rate": 3.3466865431346017e-06, + "loss": 0.9094, + "step": 9152 + }, + { + "epoch": 1.8138977037055546, + "grad_norm": 2.21875, + "learning_rate": 3.345692055864551e-06, + "loss": 0.9556, + "step": 9153 + }, + { + "epoch": 1.8140975987606507, + "grad_norm": 2.15625, + "learning_rate": 3.3446976420786835e-06, + "loss": 0.9073, + "step": 9154 + }, + { + "epoch": 1.8142974938157468, + "grad_norm": 2.265625, + "learning_rate": 3.3437033018211682e-06, + "loss": 0.9369, + "step": 9155 + }, + { + "epoch": 1.8144973888708429, + "grad_norm": 2.1875, + "learning_rate": 3.3427090351361767e-06, + "loss": 0.9804, + "step": 9156 + }, + { + "epoch": 1.814697283925939, + "grad_norm": 2.3125, + "learning_rate": 3.3417148420678723e-06, + "loss": 1.0827, + "step": 9157 + }, + { + "epoch": 1.814897178981035, + "grad_norm": 2.15625, + "learning_rate": 3.3407207226604164e-06, + "loss": 0.8971, + "step": 9158 + }, + { + "epoch": 1.8150970740361312, + "grad_norm": 2.140625, + "learning_rate": 3.339726676957971e-06, + "loss": 0.9962, + "step": 9159 + }, + { + "epoch": 1.815296969091227, + "grad_norm": 2.1875, + "learning_rate": 3.338732705004688e-06, + "loss": 0.9177, + "step": 9160 + }, + { + "epoch": 1.8154968641463232, + "grad_norm": 2.0625, + "learning_rate": 3.3377388068447203e-06, + "loss": 1.0041, + "step": 9161 + }, + { + "epoch": 1.8156967592014193, + "grad_norm": 2.171875, + "learning_rate": 3.3367449825222188e-06, + "loss": 1.0346, + "step": 9162 + }, + { + "epoch": 1.8158966542565154, + "grad_norm": 2.28125, + "learning_rate": 3.3357512320813258e-06, + "loss": 0.9842, + "step": 9163 + }, + { + "epoch": 1.8160965493116112, + "grad_norm": 2.125, + "learning_rate": 3.334757555566186e-06, + "loss": 0.9713, + "step": 9164 + }, + { + "epoch": 1.8162964443667073, + "grad_norm": 2.078125, + "learning_rate": 3.333763953020939e-06, + "loss": 0.9551, + "step": 9165 + }, + { + "epoch": 1.8164963394218034, + "grad_norm": 2.15625, + "learning_rate": 3.3327704244897176e-06, + "loss": 0.9508, + "step": 9166 + }, + { + "epoch": 1.8166962344768995, + "grad_norm": 2.1875, + "learning_rate": 3.331776970016657e-06, + "loss": 1.0225, + "step": 9167 + }, + { + "epoch": 1.8168961295319956, + "grad_norm": 2.203125, + "learning_rate": 3.330783589645884e-06, + "loss": 0.9797, + "step": 9168 + }, + { + "epoch": 1.8170960245870917, + "grad_norm": 2.125, + "learning_rate": 3.329790283421526e-06, + "loss": 0.9614, + "step": 9169 + }, + { + "epoch": 1.8172959196421878, + "grad_norm": 2.203125, + "learning_rate": 3.3287970513877067e-06, + "loss": 1.0246, + "step": 9170 + }, + { + "epoch": 1.817495814697284, + "grad_norm": 2.109375, + "learning_rate": 3.3278038935885415e-06, + "loss": 0.9702, + "step": 9171 + }, + { + "epoch": 1.81769570975238, + "grad_norm": 2.1875, + "learning_rate": 3.3268108100681494e-06, + "loss": 0.9394, + "step": 9172 + }, + { + "epoch": 1.8178956048074761, + "grad_norm": 2.171875, + "learning_rate": 3.325817800870644e-06, + "loss": 0.906, + "step": 9173 + }, + { + "epoch": 1.8180954998625722, + "grad_norm": 2.25, + "learning_rate": 3.3248248660401317e-06, + "loss": 1.0423, + "step": 9174 + }, + { + "epoch": 1.8182953949176683, + "grad_norm": 2.0625, + "learning_rate": 3.3238320056207208e-06, + "loss": 0.8253, + "step": 9175 + }, + { + "epoch": 1.8184952899727644, + "grad_norm": 2.234375, + "learning_rate": 3.322839219656513e-06, + "loss": 1.006, + "step": 9176 + }, + { + "epoch": 1.8186951850278605, + "grad_norm": 2.328125, + "learning_rate": 3.321846508191609e-06, + "loss": 1.0534, + "step": 9177 + }, + { + "epoch": 1.8188950800829564, + "grad_norm": 2.09375, + "learning_rate": 3.320853871270102e-06, + "loss": 0.9099, + "step": 9178 + }, + { + "epoch": 1.8190949751380525, + "grad_norm": 2.0625, + "learning_rate": 3.3198613089360875e-06, + "loss": 0.9262, + "step": 9179 + }, + { + "epoch": 1.8192948701931486, + "grad_norm": 2.25, + "learning_rate": 3.318868821233654e-06, + "loss": 0.9757, + "step": 9180 + }, + { + "epoch": 1.8194947652482447, + "grad_norm": 2.203125, + "learning_rate": 3.317876408206887e-06, + "loss": 1.0132, + "step": 9181 + }, + { + "epoch": 1.8196946603033406, + "grad_norm": 2.125, + "learning_rate": 3.3168840698998722e-06, + "loss": 0.905, + "step": 9182 + }, + { + "epoch": 1.8198945553584367, + "grad_norm": 2.203125, + "learning_rate": 3.315891806356686e-06, + "loss": 1.0229, + "step": 9183 + }, + { + "epoch": 1.8200944504135328, + "grad_norm": 2.09375, + "learning_rate": 3.3148996176214054e-06, + "loss": 0.9095, + "step": 9184 + }, + { + "epoch": 1.820294345468629, + "grad_norm": 2.109375, + "learning_rate": 3.3139075037381053e-06, + "loss": 0.9197, + "step": 9185 + }, + { + "epoch": 1.820494240523725, + "grad_norm": 2.171875, + "learning_rate": 3.312915464750852e-06, + "loss": 0.9501, + "step": 9186 + }, + { + "epoch": 1.820694135578821, + "grad_norm": 2.03125, + "learning_rate": 3.3119235007037155e-06, + "loss": 0.959, + "step": 9187 + }, + { + "epoch": 1.8208940306339172, + "grad_norm": 2.15625, + "learning_rate": 3.3109316116407554e-06, + "loss": 0.963, + "step": 9188 + }, + { + "epoch": 1.8210939256890133, + "grad_norm": 2.125, + "learning_rate": 3.309939797606033e-06, + "loss": 0.9995, + "step": 9189 + }, + { + "epoch": 1.8212938207441094, + "grad_norm": 2.125, + "learning_rate": 3.308948058643605e-06, + "loss": 0.9072, + "step": 9190 + }, + { + "epoch": 1.8214937157992055, + "grad_norm": 2.25, + "learning_rate": 3.3079563947975225e-06, + "loss": 0.98, + "step": 9191 + }, + { + "epoch": 1.8216936108543016, + "grad_norm": 2.171875, + "learning_rate": 3.3069648061118366e-06, + "loss": 1.0131, + "step": 9192 + }, + { + "epoch": 1.8218935059093977, + "grad_norm": 2.03125, + "learning_rate": 3.3059732926305943e-06, + "loss": 0.889, + "step": 9193 + }, + { + "epoch": 1.8220934009644938, + "grad_norm": 2.1875, + "learning_rate": 3.3049818543978363e-06, + "loss": 0.9544, + "step": 9194 + }, + { + "epoch": 1.8222932960195897, + "grad_norm": 2.109375, + "learning_rate": 3.3039904914576036e-06, + "loss": 0.8702, + "step": 9195 + }, + { + "epoch": 1.8224931910746858, + "grad_norm": 2.21875, + "learning_rate": 3.3029992038539318e-06, + "loss": 0.9753, + "step": 9196 + }, + { + "epoch": 1.8226930861297819, + "grad_norm": 2.125, + "learning_rate": 3.302007991630854e-06, + "loss": 0.9726, + "step": 9197 + }, + { + "epoch": 1.822892981184878, + "grad_norm": 2.203125, + "learning_rate": 3.3010168548324006e-06, + "loss": 1.0036, + "step": 9198 + }, + { + "epoch": 1.8230928762399738, + "grad_norm": 2.171875, + "learning_rate": 3.3000257935025963e-06, + "loss": 0.9817, + "step": 9199 + }, + { + "epoch": 1.82329277129507, + "grad_norm": 2.109375, + "learning_rate": 3.299034807685465e-06, + "loss": 0.9199, + "step": 9200 + }, + { + "epoch": 1.823492666350166, + "grad_norm": 2.28125, + "learning_rate": 3.2980438974250262e-06, + "loss": 1.0242, + "step": 9201 + }, + { + "epoch": 1.8236925614052621, + "grad_norm": 2.171875, + "learning_rate": 3.297053062765295e-06, + "loss": 0.9565, + "step": 9202 + }, + { + "epoch": 1.8238924564603582, + "grad_norm": 2.140625, + "learning_rate": 3.2960623037502847e-06, + "loss": 0.9995, + "step": 9203 + }, + { + "epoch": 1.8240923515154543, + "grad_norm": 2.28125, + "learning_rate": 3.2950716204240065e-06, + "loss": 1.005, + "step": 9204 + }, + { + "epoch": 1.8242922465705504, + "grad_norm": 2.234375, + "learning_rate": 3.2940810128304634e-06, + "loss": 0.9792, + "step": 9205 + }, + { + "epoch": 1.8244921416256465, + "grad_norm": 2.171875, + "learning_rate": 3.293090481013661e-06, + "loss": 0.9746, + "step": 9206 + }, + { + "epoch": 1.8246920366807426, + "grad_norm": 2.140625, + "learning_rate": 3.2921000250175948e-06, + "loss": 1.0629, + "step": 9207 + }, + { + "epoch": 1.8248919317358387, + "grad_norm": 2.171875, + "learning_rate": 3.2911096448862666e-06, + "loss": 0.9234, + "step": 9208 + }, + { + "epoch": 1.8250918267909348, + "grad_norm": 2.296875, + "learning_rate": 3.290119340663663e-06, + "loss": 1.0217, + "step": 9209 + }, + { + "epoch": 1.825291721846031, + "grad_norm": 2.15625, + "learning_rate": 3.2891291123937764e-06, + "loss": 0.9754, + "step": 9210 + }, + { + "epoch": 1.825491616901127, + "grad_norm": 2.234375, + "learning_rate": 3.2881389601205906e-06, + "loss": 1.0121, + "step": 9211 + }, + { + "epoch": 1.8256915119562231, + "grad_norm": 2.0625, + "learning_rate": 3.28714888388809e-06, + "loss": 0.9866, + "step": 9212 + }, + { + "epoch": 1.825891407011319, + "grad_norm": 2.09375, + "learning_rate": 3.2861588837402534e-06, + "loss": 0.97, + "step": 9213 + }, + { + "epoch": 1.8260913020664151, + "grad_norm": 2.171875, + "learning_rate": 3.2851689597210555e-06, + "loss": 0.9061, + "step": 9214 + }, + { + "epoch": 1.8262911971215112, + "grad_norm": 2.21875, + "learning_rate": 3.2841791118744704e-06, + "loss": 1.002, + "step": 9215 + }, + { + "epoch": 1.8264910921766073, + "grad_norm": 2.15625, + "learning_rate": 3.2831893402444648e-06, + "loss": 0.9203, + "step": 9216 + }, + { + "epoch": 1.8266909872317032, + "grad_norm": 2.3125, + "learning_rate": 3.2821996448750054e-06, + "loss": 1.0905, + "step": 9217 + }, + { + "epoch": 1.8268908822867993, + "grad_norm": 2.203125, + "learning_rate": 3.2812100258100556e-06, + "loss": 1.0289, + "step": 9218 + }, + { + "epoch": 1.8270907773418954, + "grad_norm": 2.328125, + "learning_rate": 3.280220483093571e-06, + "loss": 0.9328, + "step": 9219 + }, + { + "epoch": 1.8272906723969915, + "grad_norm": 2.15625, + "learning_rate": 3.2792310167695097e-06, + "loss": 0.9384, + "step": 9220 + }, + { + "epoch": 1.8274905674520876, + "grad_norm": 2.359375, + "learning_rate": 3.278241626881823e-06, + "loss": 0.9116, + "step": 9221 + }, + { + "epoch": 1.8276904625071837, + "grad_norm": 2.25, + "learning_rate": 3.2772523134744592e-06, + "loss": 0.918, + "step": 9222 + }, + { + "epoch": 1.8278903575622798, + "grad_norm": 2.1875, + "learning_rate": 3.2762630765913626e-06, + "loss": 0.9216, + "step": 9223 + }, + { + "epoch": 1.828090252617376, + "grad_norm": 2.171875, + "learning_rate": 3.275273916276478e-06, + "loss": 1.0595, + "step": 9224 + }, + { + "epoch": 1.828290147672472, + "grad_norm": 2.234375, + "learning_rate": 3.274284832573741e-06, + "loss": 1.0279, + "step": 9225 + }, + { + "epoch": 1.828490042727568, + "grad_norm": 2.140625, + "learning_rate": 3.273295825527088e-06, + "loss": 0.9786, + "step": 9226 + }, + { + "epoch": 1.8286899377826642, + "grad_norm": 2.15625, + "learning_rate": 3.2723068951804486e-06, + "loss": 1.0637, + "step": 9227 + }, + { + "epoch": 1.8288898328377603, + "grad_norm": 2.15625, + "learning_rate": 3.2713180415777536e-06, + "loss": 1.0626, + "step": 9228 + }, + { + "epoch": 1.8290897278928564, + "grad_norm": 2.125, + "learning_rate": 3.270329264762926e-06, + "loss": 0.9125, + "step": 9229 + }, + { + "epoch": 1.8292896229479523, + "grad_norm": 2.203125, + "learning_rate": 3.2693405647798878e-06, + "loss": 1.0265, + "step": 9230 + }, + { + "epoch": 1.8294895180030484, + "grad_norm": 2.140625, + "learning_rate": 3.2683519416725564e-06, + "loss": 0.9802, + "step": 9231 + }, + { + "epoch": 1.8296894130581445, + "grad_norm": 2.1875, + "learning_rate": 3.267363395484848e-06, + "loss": 0.9942, + "step": 9232 + }, + { + "epoch": 1.8298893081132406, + "grad_norm": 2.03125, + "learning_rate": 3.2663749262606715e-06, + "loss": 0.965, + "step": 9233 + }, + { + "epoch": 1.8300892031683367, + "grad_norm": 2.296875, + "learning_rate": 3.2653865340439366e-06, + "loss": 0.9769, + "step": 9234 + }, + { + "epoch": 1.8302890982234326, + "grad_norm": 2.0625, + "learning_rate": 3.2643982188785457e-06, + "loss": 0.9969, + "step": 9235 + }, + { + "epoch": 1.8304889932785287, + "grad_norm": 2.171875, + "learning_rate": 3.2634099808084004e-06, + "loss": 1.0316, + "step": 9236 + }, + { + "epoch": 1.8306888883336248, + "grad_norm": 2.125, + "learning_rate": 3.2624218198773994e-06, + "loss": 0.9977, + "step": 9237 + }, + { + "epoch": 1.8308887833887209, + "grad_norm": 2.40625, + "learning_rate": 3.2614337361294345e-06, + "loss": 0.9797, + "step": 9238 + }, + { + "epoch": 1.831088678443817, + "grad_norm": 2.125, + "learning_rate": 3.260445729608399e-06, + "loss": 0.95, + "step": 9239 + }, + { + "epoch": 1.831288573498913, + "grad_norm": 2.28125, + "learning_rate": 3.259457800358177e-06, + "loss": 1.0814, + "step": 9240 + }, + { + "epoch": 1.8314884685540092, + "grad_norm": 2.25, + "learning_rate": 3.2584699484226547e-06, + "loss": 0.9625, + "step": 9241 + }, + { + "epoch": 1.8316883636091053, + "grad_norm": 2.21875, + "learning_rate": 3.2574821738457096e-06, + "loss": 0.94, + "step": 9242 + }, + { + "epoch": 1.8318882586642014, + "grad_norm": 2.015625, + "learning_rate": 3.25649447667122e-06, + "loss": 0.911, + "step": 9243 + }, + { + "epoch": 1.8320881537192975, + "grad_norm": 2.15625, + "learning_rate": 3.2555068569430614e-06, + "loss": 0.9125, + "step": 9244 + }, + { + "epoch": 1.8322880487743936, + "grad_norm": 2.375, + "learning_rate": 3.2545193147051015e-06, + "loss": 1.056, + "step": 9245 + }, + { + "epoch": 1.8324879438294897, + "grad_norm": 2.28125, + "learning_rate": 3.253531850001207e-06, + "loss": 1.0154, + "step": 9246 + }, + { + "epoch": 1.8326878388845858, + "grad_norm": 2.15625, + "learning_rate": 3.25254446287524e-06, + "loss": 1.0136, + "step": 9247 + }, + { + "epoch": 1.8328877339396816, + "grad_norm": 2.234375, + "learning_rate": 3.251557153371062e-06, + "loss": 0.9848, + "step": 9248 + }, + { + "epoch": 1.8330876289947777, + "grad_norm": 2.125, + "learning_rate": 3.250569921532529e-06, + "loss": 0.9669, + "step": 9249 + }, + { + "epoch": 1.8332875240498738, + "grad_norm": 2.15625, + "learning_rate": 3.249582767403493e-06, + "loss": 1.0233, + "step": 9250 + }, + { + "epoch": 1.83348741910497, + "grad_norm": 2.265625, + "learning_rate": 3.2485956910278033e-06, + "loss": 1.081, + "step": 9251 + }, + { + "epoch": 1.8336873141600658, + "grad_norm": 2.265625, + "learning_rate": 3.2476086924493067e-06, + "loss": 1.0206, + "step": 9252 + }, + { + "epoch": 1.833887209215162, + "grad_norm": 2.078125, + "learning_rate": 3.2466217717118442e-06, + "loss": 0.9491, + "step": 9253 + }, + { + "epoch": 1.834087104270258, + "grad_norm": 2.15625, + "learning_rate": 3.2456349288592547e-06, + "loss": 0.947, + "step": 9254 + }, + { + "epoch": 1.834286999325354, + "grad_norm": 2.171875, + "learning_rate": 3.2446481639353757e-06, + "loss": 0.9895, + "step": 9255 + }, + { + "epoch": 1.8344868943804502, + "grad_norm": 2.125, + "learning_rate": 3.2436614769840367e-06, + "loss": 0.9144, + "step": 9256 + }, + { + "epoch": 1.8346867894355463, + "grad_norm": 2.21875, + "learning_rate": 3.2426748680490684e-06, + "loss": 1.0338, + "step": 9257 + }, + { + "epoch": 1.8348866844906424, + "grad_norm": 2.203125, + "learning_rate": 3.2416883371742937e-06, + "loss": 0.9259, + "step": 9258 + }, + { + "epoch": 1.8350865795457385, + "grad_norm": 2.0625, + "learning_rate": 3.2407018844035348e-06, + "loss": 0.8412, + "step": 9259 + }, + { + "epoch": 1.8352864746008346, + "grad_norm": 2.21875, + "learning_rate": 3.239715509780612e-06, + "loss": 0.964, + "step": 9260 + }, + { + "epoch": 1.8354863696559307, + "grad_norm": 2.328125, + "learning_rate": 3.238729213349337e-06, + "loss": 1.0716, + "step": 9261 + }, + { + "epoch": 1.8356862647110268, + "grad_norm": 2.1875, + "learning_rate": 3.2377429951535223e-06, + "loss": 1.0144, + "step": 9262 + }, + { + "epoch": 1.835886159766123, + "grad_norm": 2.109375, + "learning_rate": 3.2367568552369765e-06, + "loss": 0.9675, + "step": 9263 + }, + { + "epoch": 1.836086054821219, + "grad_norm": 2.109375, + "learning_rate": 3.2357707936435013e-06, + "loss": 0.9698, + "step": 9264 + }, + { + "epoch": 1.8362859498763149, + "grad_norm": 2.125, + "learning_rate": 3.2347848104169012e-06, + "loss": 0.9298, + "step": 9265 + }, + { + "epoch": 1.836485844931411, + "grad_norm": 2.140625, + "learning_rate": 3.233798905600969e-06, + "loss": 0.8792, + "step": 9266 + }, + { + "epoch": 1.836685739986507, + "grad_norm": 2.15625, + "learning_rate": 3.232813079239502e-06, + "loss": 0.9292, + "step": 9267 + }, + { + "epoch": 1.8368856350416032, + "grad_norm": 2.171875, + "learning_rate": 3.231827331376289e-06, + "loss": 0.8959, + "step": 9268 + }, + { + "epoch": 1.8370855300966993, + "grad_norm": 2.15625, + "learning_rate": 3.2308416620551175e-06, + "loss": 1.0066, + "step": 9269 + }, + { + "epoch": 1.8372854251517952, + "grad_norm": 2.125, + "learning_rate": 3.2298560713197712e-06, + "loss": 1.0907, + "step": 9270 + }, + { + "epoch": 1.8374853202068913, + "grad_norm": 2.171875, + "learning_rate": 3.228870559214028e-06, + "loss": 1.0026, + "step": 9271 + }, + { + "epoch": 1.8376852152619874, + "grad_norm": 2.125, + "learning_rate": 3.2278851257816655e-06, + "loss": 0.9451, + "step": 9272 + }, + { + "epoch": 1.8378851103170835, + "grad_norm": 2.15625, + "learning_rate": 3.226899771066456e-06, + "loss": 0.9575, + "step": 9273 + }, + { + "epoch": 1.8380850053721796, + "grad_norm": 2.203125, + "learning_rate": 3.2259144951121697e-06, + "loss": 0.9289, + "step": 9274 + }, + { + "epoch": 1.8382849004272757, + "grad_norm": 2.1875, + "learning_rate": 3.224929297962572e-06, + "loss": 0.9889, + "step": 9275 + }, + { + "epoch": 1.8384847954823718, + "grad_norm": 2.1875, + "learning_rate": 3.2239441796614256e-06, + "loss": 1.069, + "step": 9276 + }, + { + "epoch": 1.8386846905374679, + "grad_norm": 2.296875, + "learning_rate": 3.2229591402524894e-06, + "loss": 1.095, + "step": 9277 + }, + { + "epoch": 1.838884585592564, + "grad_norm": 2.21875, + "learning_rate": 3.2219741797795175e-06, + "loss": 1.061, + "step": 9278 + }, + { + "epoch": 1.83908448064766, + "grad_norm": 2.203125, + "learning_rate": 3.220989298286262e-06, + "loss": 0.9144, + "step": 9279 + }, + { + "epoch": 1.8392843757027562, + "grad_norm": 2.203125, + "learning_rate": 3.220004495816474e-06, + "loss": 0.9368, + "step": 9280 + }, + { + "epoch": 1.8394842707578523, + "grad_norm": 2.1875, + "learning_rate": 3.2190197724138943e-06, + "loss": 0.9001, + "step": 9281 + }, + { + "epoch": 1.8396841658129484, + "grad_norm": 2.265625, + "learning_rate": 3.2180351281222665e-06, + "loss": 1.0191, + "step": 9282 + }, + { + "epoch": 1.8398840608680442, + "grad_norm": 2.109375, + "learning_rate": 3.217050562985329e-06, + "loss": 0.9633, + "step": 9283 + }, + { + "epoch": 1.8400839559231403, + "grad_norm": 2.140625, + "learning_rate": 3.216066077046814e-06, + "loss": 1.0508, + "step": 9284 + }, + { + "epoch": 1.8402838509782364, + "grad_norm": 2.1875, + "learning_rate": 3.215081670350454e-06, + "loss": 1.0296, + "step": 9285 + }, + { + "epoch": 1.8404837460333325, + "grad_norm": 2.25, + "learning_rate": 3.2140973429399747e-06, + "loss": 1.0299, + "step": 9286 + }, + { + "epoch": 1.8406836410884284, + "grad_norm": 2.1875, + "learning_rate": 3.213113094859101e-06, + "loss": 0.9849, + "step": 9287 + }, + { + "epoch": 1.8408835361435245, + "grad_norm": 2.171875, + "learning_rate": 3.2121289261515535e-06, + "loss": 0.9708, + "step": 9288 + }, + { + "epoch": 1.8410834311986206, + "grad_norm": 2.140625, + "learning_rate": 3.2111448368610476e-06, + "loss": 0.9623, + "step": 9289 + }, + { + "epoch": 1.8412833262537167, + "grad_norm": 2.34375, + "learning_rate": 3.2101608270312963e-06, + "loss": 1.0313, + "step": 9290 + }, + { + "epoch": 1.8414832213088128, + "grad_norm": 2.1875, + "learning_rate": 3.209176896706011e-06, + "loss": 0.9675, + "step": 9291 + }, + { + "epoch": 1.841683116363909, + "grad_norm": 2.125, + "learning_rate": 3.2081930459288963e-06, + "loss": 1.0305, + "step": 9292 + }, + { + "epoch": 1.841883011419005, + "grad_norm": 2.0625, + "learning_rate": 3.2072092747436546e-06, + "loss": 0.9266, + "step": 9293 + }, + { + "epoch": 1.8420829064741011, + "grad_norm": 2.15625, + "learning_rate": 3.2062255831939863e-06, + "loss": 0.9272, + "step": 9294 + }, + { + "epoch": 1.8422828015291972, + "grad_norm": 2.3125, + "learning_rate": 3.2052419713235854e-06, + "loss": 0.9379, + "step": 9295 + }, + { + "epoch": 1.8424826965842933, + "grad_norm": 2.21875, + "learning_rate": 3.2042584391761457e-06, + "loss": 1.0046, + "step": 9296 + }, + { + "epoch": 1.8426825916393894, + "grad_norm": 2.25, + "learning_rate": 3.203274986795353e-06, + "loss": 0.9611, + "step": 9297 + }, + { + "epoch": 1.8428824866944855, + "grad_norm": 2.1875, + "learning_rate": 3.2022916142248933e-06, + "loss": 1.0321, + "step": 9298 + }, + { + "epoch": 1.8430823817495816, + "grad_norm": 2.15625, + "learning_rate": 3.2013083215084495e-06, + "loss": 0.9563, + "step": 9299 + }, + { + "epoch": 1.8432822768046777, + "grad_norm": 2.25, + "learning_rate": 3.2003251086896963e-06, + "loss": 1.0356, + "step": 9300 + }, + { + "epoch": 1.8434821718597736, + "grad_norm": 2.21875, + "learning_rate": 3.199341975812312e-06, + "loss": 1.0392, + "step": 9301 + }, + { + "epoch": 1.8436820669148697, + "grad_norm": 2.203125, + "learning_rate": 3.198358922919963e-06, + "loss": 1.0086, + "step": 9302 + }, + { + "epoch": 1.8438819619699658, + "grad_norm": 2.125, + "learning_rate": 3.197375950056319e-06, + "loss": 0.9148, + "step": 9303 + }, + { + "epoch": 1.8440818570250619, + "grad_norm": 2.0625, + "learning_rate": 3.1963930572650414e-06, + "loss": 0.8526, + "step": 9304 + }, + { + "epoch": 1.8442817520801578, + "grad_norm": 2.171875, + "learning_rate": 3.1954102445897923e-06, + "loss": 0.9934, + "step": 9305 + }, + { + "epoch": 1.8444816471352539, + "grad_norm": 2.1875, + "learning_rate": 3.194427512074228e-06, + "loss": 0.9847, + "step": 9306 + }, + { + "epoch": 1.84468154219035, + "grad_norm": 2.109375, + "learning_rate": 3.1934448597619997e-06, + "loss": 0.9596, + "step": 9307 + }, + { + "epoch": 1.844881437245446, + "grad_norm": 2.3125, + "learning_rate": 3.192462287696759e-06, + "loss": 1.0517, + "step": 9308 + }, + { + "epoch": 1.8450813323005422, + "grad_norm": 2.203125, + "learning_rate": 3.1914797959221495e-06, + "loss": 0.9667, + "step": 9309 + }, + { + "epoch": 1.8452812273556383, + "grad_norm": 2.15625, + "learning_rate": 3.1904973844818144e-06, + "loss": 0.9874, + "step": 9310 + }, + { + "epoch": 1.8454811224107344, + "grad_norm": 2.171875, + "learning_rate": 3.189515053419393e-06, + "loss": 0.9769, + "step": 9311 + }, + { + "epoch": 1.8456810174658305, + "grad_norm": 2.25, + "learning_rate": 3.188532802778518e-06, + "loss": 1.0035, + "step": 9312 + }, + { + "epoch": 1.8458809125209266, + "grad_norm": 2.046875, + "learning_rate": 3.1875506326028226e-06, + "loss": 0.9881, + "step": 9313 + }, + { + "epoch": 1.8460808075760227, + "grad_norm": 2.296875, + "learning_rate": 3.186568542935936e-06, + "loss": 1.0568, + "step": 9314 + }, + { + "epoch": 1.8462807026311188, + "grad_norm": 2.15625, + "learning_rate": 3.1855865338214803e-06, + "loss": 0.914, + "step": 9315 + }, + { + "epoch": 1.8464805976862149, + "grad_norm": 2.125, + "learning_rate": 3.1846046053030778e-06, + "loss": 0.9923, + "step": 9316 + }, + { + "epoch": 1.846680492741311, + "grad_norm": 2.203125, + "learning_rate": 3.1836227574243434e-06, + "loss": 0.9697, + "step": 9317 + }, + { + "epoch": 1.8468803877964068, + "grad_norm": 2.203125, + "learning_rate": 3.1826409902288925e-06, + "loss": 1.0062, + "step": 9318 + }, + { + "epoch": 1.847080282851503, + "grad_norm": 2.15625, + "learning_rate": 3.1816593037603356e-06, + "loss": 0.9864, + "step": 9319 + }, + { + "epoch": 1.847280177906599, + "grad_norm": 2.171875, + "learning_rate": 3.1806776980622773e-06, + "loss": 0.9804, + "step": 9320 + }, + { + "epoch": 1.8474800729616951, + "grad_norm": 2.140625, + "learning_rate": 3.179696173178321e-06, + "loss": 0.985, + "step": 9321 + }, + { + "epoch": 1.847679968016791, + "grad_norm": 2.109375, + "learning_rate": 3.1787147291520675e-06, + "loss": 0.9131, + "step": 9322 + }, + { + "epoch": 1.8478798630718871, + "grad_norm": 2.1875, + "learning_rate": 3.1777333660271103e-06, + "loss": 0.8775, + "step": 9323 + }, + { + "epoch": 1.8480797581269832, + "grad_norm": 2.125, + "learning_rate": 3.1767520838470433e-06, + "loss": 0.9141, + "step": 9324 + }, + { + "epoch": 1.8482796531820793, + "grad_norm": 2.078125, + "learning_rate": 3.175770882655453e-06, + "loss": 0.9623, + "step": 9325 + }, + { + "epoch": 1.8484795482371754, + "grad_norm": 2.171875, + "learning_rate": 3.174789762495925e-06, + "loss": 0.9542, + "step": 9326 + }, + { + "epoch": 1.8486794432922715, + "grad_norm": 2.25, + "learning_rate": 3.1738087234120426e-06, + "loss": 0.9142, + "step": 9327 + }, + { + "epoch": 1.8488793383473676, + "grad_norm": 2.21875, + "learning_rate": 3.1728277654473793e-06, + "loss": 0.9389, + "step": 9328 + }, + { + "epoch": 1.8490792334024637, + "grad_norm": 2.234375, + "learning_rate": 3.171846888645513e-06, + "loss": 0.9108, + "step": 9329 + }, + { + "epoch": 1.8492791284575598, + "grad_norm": 2.140625, + "learning_rate": 3.1708660930500124e-06, + "loss": 0.942, + "step": 9330 + }, + { + "epoch": 1.849479023512656, + "grad_norm": 2.09375, + "learning_rate": 3.1698853787044447e-06, + "loss": 0.9554, + "step": 9331 + }, + { + "epoch": 1.849678918567752, + "grad_norm": 2.21875, + "learning_rate": 3.1689047456523746e-06, + "loss": 0.9705, + "step": 9332 + }, + { + "epoch": 1.8498788136228481, + "grad_norm": 2.1875, + "learning_rate": 3.1679241939373578e-06, + "loss": 1.073, + "step": 9333 + }, + { + "epoch": 1.8500787086779442, + "grad_norm": 2.140625, + "learning_rate": 3.1669437236029536e-06, + "loss": 0.9328, + "step": 9334 + }, + { + "epoch": 1.8502786037330403, + "grad_norm": 2.09375, + "learning_rate": 3.1659633346927133e-06, + "loss": 0.983, + "step": 9335 + }, + { + "epoch": 1.8504784987881362, + "grad_norm": 2.140625, + "learning_rate": 3.1649830272501857e-06, + "loss": 0.9917, + "step": 9336 + }, + { + "epoch": 1.8506783938432323, + "grad_norm": 2.21875, + "learning_rate": 3.1640028013189155e-06, + "loss": 1.005, + "step": 9337 + }, + { + "epoch": 1.8508782888983284, + "grad_norm": 2.328125, + "learning_rate": 3.163022656942445e-06, + "loss": 0.9806, + "step": 9338 + }, + { + "epoch": 1.8510781839534245, + "grad_norm": 2.234375, + "learning_rate": 3.162042594164313e-06, + "loss": 0.9685, + "step": 9339 + }, + { + "epoch": 1.8512780790085204, + "grad_norm": 2.21875, + "learning_rate": 3.1610626130280507e-06, + "loss": 0.9463, + "step": 9340 + }, + { + "epoch": 1.8514779740636165, + "grad_norm": 2.0625, + "learning_rate": 3.1600827135771914e-06, + "loss": 0.9251, + "step": 9341 + }, + { + "epoch": 1.8516778691187126, + "grad_norm": 2.25, + "learning_rate": 3.1591028958552627e-06, + "loss": 1.0171, + "step": 9342 + }, + { + "epoch": 1.8518777641738087, + "grad_norm": 2.140625, + "learning_rate": 3.158123159905785e-06, + "loss": 0.8933, + "step": 9343 + }, + { + "epoch": 1.8520776592289048, + "grad_norm": 2.34375, + "learning_rate": 3.1571435057722806e-06, + "loss": 0.8957, + "step": 9344 + }, + { + "epoch": 1.8522775542840009, + "grad_norm": 2.234375, + "learning_rate": 3.1561639334982653e-06, + "loss": 0.9443, + "step": 9345 + }, + { + "epoch": 1.852477449339097, + "grad_norm": 2.078125, + "learning_rate": 3.1551844431272505e-06, + "loss": 0.9494, + "step": 9346 + }, + { + "epoch": 1.852677344394193, + "grad_norm": 2.078125, + "learning_rate": 3.1542050347027465e-06, + "loss": 0.922, + "step": 9347 + }, + { + "epoch": 1.8528772394492892, + "grad_norm": 2.015625, + "learning_rate": 3.1532257082682573e-06, + "loss": 0.8955, + "step": 9348 + }, + { + "epoch": 1.8530771345043853, + "grad_norm": 2.203125, + "learning_rate": 3.152246463867284e-06, + "loss": 1.1093, + "step": 9349 + }, + { + "epoch": 1.8532770295594814, + "grad_norm": 2.078125, + "learning_rate": 3.1512673015433272e-06, + "loss": 0.9333, + "step": 9350 + }, + { + "epoch": 1.8534769246145775, + "grad_norm": 2.171875, + "learning_rate": 3.1502882213398776e-06, + "loss": 0.9682, + "step": 9351 + }, + { + "epoch": 1.8536768196696736, + "grad_norm": 2.140625, + "learning_rate": 3.149309223300428e-06, + "loss": 0.9936, + "step": 9352 + }, + { + "epoch": 1.8538767147247694, + "grad_norm": 2.140625, + "learning_rate": 3.1483303074684663e-06, + "loss": 0.9197, + "step": 9353 + }, + { + "epoch": 1.8540766097798655, + "grad_norm": 2.1875, + "learning_rate": 3.1473514738874734e-06, + "loss": 1.0184, + "step": 9354 + }, + { + "epoch": 1.8542765048349616, + "grad_norm": 2.28125, + "learning_rate": 3.146372722600931e-06, + "loss": 0.9506, + "step": 9355 + }, + { + "epoch": 1.8544763998900577, + "grad_norm": 2.203125, + "learning_rate": 3.1453940536523135e-06, + "loss": 0.9858, + "step": 9356 + }, + { + "epoch": 1.8546762949451538, + "grad_norm": 2.0625, + "learning_rate": 3.144415467085094e-06, + "loss": 0.998, + "step": 9357 + }, + { + "epoch": 1.8548761900002497, + "grad_norm": 2.109375, + "learning_rate": 3.1434369629427425e-06, + "loss": 0.898, + "step": 9358 + }, + { + "epoch": 1.8550760850553458, + "grad_norm": 2.171875, + "learning_rate": 3.1424585412687215e-06, + "loss": 1.0378, + "step": 9359 + }, + { + "epoch": 1.855275980110442, + "grad_norm": 2.1875, + "learning_rate": 3.141480202106494e-06, + "loss": 0.9777, + "step": 9360 + }, + { + "epoch": 1.855475875165538, + "grad_norm": 2.109375, + "learning_rate": 3.1405019454995178e-06, + "loss": 0.9472, + "step": 9361 + }, + { + "epoch": 1.8556757702206341, + "grad_norm": 2.203125, + "learning_rate": 3.139523771491246e-06, + "loss": 0.9036, + "step": 9362 + }, + { + "epoch": 1.8558756652757302, + "grad_norm": 2.28125, + "learning_rate": 3.138545680125132e-06, + "loss": 1.049, + "step": 9363 + }, + { + "epoch": 1.8560755603308263, + "grad_norm": 2.09375, + "learning_rate": 3.1375676714446173e-06, + "loss": 0.9575, + "step": 9364 + }, + { + "epoch": 1.8562754553859224, + "grad_norm": 2.78125, + "learning_rate": 3.1365897454931495e-06, + "loss": 0.9965, + "step": 9365 + }, + { + "epoch": 1.8564753504410185, + "grad_norm": 2.171875, + "learning_rate": 3.1356119023141644e-06, + "loss": 0.9977, + "step": 9366 + }, + { + "epoch": 1.8566752454961146, + "grad_norm": 2.0625, + "learning_rate": 3.1346341419511008e-06, + "loss": 0.9958, + "step": 9367 + }, + { + "epoch": 1.8568751405512107, + "grad_norm": 2.15625, + "learning_rate": 3.1336564644473886e-06, + "loss": 1.0637, + "step": 9368 + }, + { + "epoch": 1.8570750356063068, + "grad_norm": 2.0625, + "learning_rate": 3.1326788698464565e-06, + "loss": 0.9981, + "step": 9369 + }, + { + "epoch": 1.857274930661403, + "grad_norm": 2.21875, + "learning_rate": 3.131701358191731e-06, + "loss": 0.9778, + "step": 9370 + }, + { + "epoch": 1.8574748257164988, + "grad_norm": 2.1875, + "learning_rate": 3.1307239295266303e-06, + "loss": 0.9657, + "step": 9371 + }, + { + "epoch": 1.857674720771595, + "grad_norm": 2.078125, + "learning_rate": 3.129746583894573e-06, + "loss": 1.0535, + "step": 9372 + }, + { + "epoch": 1.857874615826691, + "grad_norm": 2.203125, + "learning_rate": 3.128769321338974e-06, + "loss": 0.9256, + "step": 9373 + }, + { + "epoch": 1.858074510881787, + "grad_norm": 2.140625, + "learning_rate": 3.127792141903241e-06, + "loss": 0.9787, + "step": 9374 + }, + { + "epoch": 1.858274405936883, + "grad_norm": 2.0625, + "learning_rate": 3.1268150456307817e-06, + "loss": 0.9957, + "step": 9375 + }, + { + "epoch": 1.858474300991979, + "grad_norm": 2.234375, + "learning_rate": 3.1258380325649973e-06, + "loss": 1.0009, + "step": 9376 + }, + { + "epoch": 1.8586741960470752, + "grad_norm": 2.125, + "learning_rate": 3.124861102749287e-06, + "loss": 0.9454, + "step": 9377 + }, + { + "epoch": 1.8588740911021713, + "grad_norm": 2.1875, + "learning_rate": 3.123884256227047e-06, + "loss": 0.8927, + "step": 9378 + }, + { + "epoch": 1.8590739861572674, + "grad_norm": 2.125, + "learning_rate": 3.1229074930416674e-06, + "loss": 0.963, + "step": 9379 + }, + { + "epoch": 1.8592738812123635, + "grad_norm": 2.109375, + "learning_rate": 3.1219308132365365e-06, + "loss": 0.969, + "step": 9380 + }, + { + "epoch": 1.8594737762674596, + "grad_norm": 2.21875, + "learning_rate": 3.1209542168550393e-06, + "loss": 0.9602, + "step": 9381 + }, + { + "epoch": 1.8596736713225557, + "grad_norm": 2.109375, + "learning_rate": 3.119977703940554e-06, + "loss": 1.013, + "step": 9382 + }, + { + "epoch": 1.8598735663776518, + "grad_norm": 2.1875, + "learning_rate": 3.1190012745364584e-06, + "loss": 1.064, + "step": 9383 + }, + { + "epoch": 1.8600734614327479, + "grad_norm": 2.09375, + "learning_rate": 3.118024928686126e-06, + "loss": 0.9555, + "step": 9384 + }, + { + "epoch": 1.860273356487844, + "grad_norm": 2.109375, + "learning_rate": 3.1170486664329246e-06, + "loss": 1.0132, + "step": 9385 + }, + { + "epoch": 1.86047325154294, + "grad_norm": 2.0625, + "learning_rate": 3.1160724878202213e-06, + "loss": 1.0332, + "step": 9386 + }, + { + "epoch": 1.8606731465980362, + "grad_norm": 2.125, + "learning_rate": 3.1150963928913756e-06, + "loss": 1.0393, + "step": 9387 + }, + { + "epoch": 1.860873041653132, + "grad_norm": 2.125, + "learning_rate": 3.114120381689747e-06, + "loss": 0.9029, + "step": 9388 + }, + { + "epoch": 1.8610729367082282, + "grad_norm": 2.0625, + "learning_rate": 3.1131444542586906e-06, + "loss": 0.9409, + "step": 9389 + }, + { + "epoch": 1.8612728317633243, + "grad_norm": 2.25, + "learning_rate": 3.1121686106415553e-06, + "loss": 0.9847, + "step": 9390 + }, + { + "epoch": 1.8614727268184204, + "grad_norm": 2.125, + "learning_rate": 3.1111928508816885e-06, + "loss": 0.9834, + "step": 9391 + }, + { + "epoch": 1.8616726218735165, + "grad_norm": 2.1875, + "learning_rate": 3.110217175022434e-06, + "loss": 0.9899, + "step": 9392 + }, + { + "epoch": 1.8618725169286123, + "grad_norm": 2.140625, + "learning_rate": 3.109241583107131e-06, + "loss": 0.9833, + "step": 9393 + }, + { + "epoch": 1.8620724119837084, + "grad_norm": 2.078125, + "learning_rate": 3.108266075179116e-06, + "loss": 0.9947, + "step": 9394 + }, + { + "epoch": 1.8622723070388045, + "grad_norm": 2.15625, + "learning_rate": 3.107290651281718e-06, + "loss": 0.8924, + "step": 9395 + }, + { + "epoch": 1.8624722020939006, + "grad_norm": 2.21875, + "learning_rate": 3.1063153114582677e-06, + "loss": 0.9985, + "step": 9396 + }, + { + "epoch": 1.8626720971489967, + "grad_norm": 2.109375, + "learning_rate": 3.105340055752089e-06, + "loss": 0.9268, + "step": 9397 + }, + { + "epoch": 1.8628719922040928, + "grad_norm": 2.234375, + "learning_rate": 3.1043648842065032e-06, + "loss": 1.05, + "step": 9398 + }, + { + "epoch": 1.863071887259189, + "grad_norm": 2.203125, + "learning_rate": 3.1033897968648258e-06, + "loss": 0.9761, + "step": 9399 + }, + { + "epoch": 1.863271782314285, + "grad_norm": 2.171875, + "learning_rate": 3.102414793770371e-06, + "loss": 0.9489, + "step": 9400 + }, + { + "epoch": 1.8634716773693811, + "grad_norm": 2.140625, + "learning_rate": 3.1014398749664494e-06, + "loss": 0.9223, + "step": 9401 + }, + { + "epoch": 1.8636715724244772, + "grad_norm": 2.078125, + "learning_rate": 3.1004650404963643e-06, + "loss": 0.9064, + "step": 9402 + }, + { + "epoch": 1.8638714674795733, + "grad_norm": 2.140625, + "learning_rate": 3.0994902904034196e-06, + "loss": 0.9444, + "step": 9403 + }, + { + "epoch": 1.8640713625346694, + "grad_norm": 2.046875, + "learning_rate": 3.0985156247309145e-06, + "loss": 0.8982, + "step": 9404 + }, + { + "epoch": 1.8642712575897655, + "grad_norm": 2.125, + "learning_rate": 3.0975410435221415e-06, + "loss": 0.9376, + "step": 9405 + }, + { + "epoch": 1.8644711526448614, + "grad_norm": 2.1875, + "learning_rate": 3.0965665468203926e-06, + "loss": 1.0606, + "step": 9406 + }, + { + "epoch": 1.8646710476999575, + "grad_norm": 2.21875, + "learning_rate": 3.0955921346689533e-06, + "loss": 0.9864, + "step": 9407 + }, + { + "epoch": 1.8648709427550536, + "grad_norm": 2.234375, + "learning_rate": 3.094617807111109e-06, + "loss": 0.9607, + "step": 9408 + }, + { + "epoch": 1.8650708378101497, + "grad_norm": 2.109375, + "learning_rate": 3.093643564190138e-06, + "loss": 0.9449, + "step": 9409 + }, + { + "epoch": 1.8652707328652456, + "grad_norm": 2.171875, + "learning_rate": 3.0926694059493155e-06, + "loss": 0.9976, + "step": 9410 + }, + { + "epoch": 1.8654706279203417, + "grad_norm": 2.140625, + "learning_rate": 3.0916953324319144e-06, + "loss": 0.9687, + "step": 9411 + }, + { + "epoch": 1.8656705229754378, + "grad_norm": 2.0625, + "learning_rate": 3.0907213436812037e-06, + "loss": 0.9989, + "step": 9412 + }, + { + "epoch": 1.8658704180305339, + "grad_norm": 2.265625, + "learning_rate": 3.0897474397404466e-06, + "loss": 0.9618, + "step": 9413 + }, + { + "epoch": 1.86607031308563, + "grad_norm": 2.171875, + "learning_rate": 3.0887736206529053e-06, + "loss": 1.0123, + "step": 9414 + }, + { + "epoch": 1.866270208140726, + "grad_norm": 2.171875, + "learning_rate": 3.0877998864618334e-06, + "loss": 1.0452, + "step": 9415 + }, + { + "epoch": 1.8664701031958222, + "grad_norm": 2.09375, + "learning_rate": 3.0868262372104873e-06, + "loss": 0.9167, + "step": 9416 + }, + { + "epoch": 1.8666699982509183, + "grad_norm": 2.140625, + "learning_rate": 3.085852672942116e-06, + "loss": 1.0233, + "step": 9417 + }, + { + "epoch": 1.8668698933060144, + "grad_norm": 2.28125, + "learning_rate": 3.084879193699963e-06, + "loss": 1.0442, + "step": 9418 + }, + { + "epoch": 1.8670697883611105, + "grad_norm": 2.15625, + "learning_rate": 3.083905799527273e-06, + "loss": 1.0295, + "step": 9419 + }, + { + "epoch": 1.8672696834162066, + "grad_norm": 2.25, + "learning_rate": 3.082932490467283e-06, + "loss": 0.9886, + "step": 9420 + }, + { + "epoch": 1.8674695784713027, + "grad_norm": 2.125, + "learning_rate": 3.0819592665632262e-06, + "loss": 0.9874, + "step": 9421 + }, + { + "epoch": 1.8676694735263988, + "grad_norm": 2.171875, + "learning_rate": 3.080986127858334e-06, + "loss": 0.991, + "step": 9422 + }, + { + "epoch": 1.8678693685814947, + "grad_norm": 2.125, + "learning_rate": 3.080013074395834e-06, + "loss": 0.9268, + "step": 9423 + }, + { + "epoch": 1.8680692636365908, + "grad_norm": 2.140625, + "learning_rate": 3.0790401062189488e-06, + "loss": 0.8571, + "step": 9424 + }, + { + "epoch": 1.8682691586916869, + "grad_norm": 2.125, + "learning_rate": 3.0780672233708954e-06, + "loss": 1.0306, + "step": 9425 + }, + { + "epoch": 1.868469053746783, + "grad_norm": 2.140625, + "learning_rate": 3.0770944258948913e-06, + "loss": 0.9527, + "step": 9426 + }, + { + "epoch": 1.868668948801879, + "grad_norm": 2.046875, + "learning_rate": 3.076121713834147e-06, + "loss": 0.8627, + "step": 9427 + }, + { + "epoch": 1.868868843856975, + "grad_norm": 2.109375, + "learning_rate": 3.07514908723187e-06, + "loss": 0.918, + "step": 9428 + }, + { + "epoch": 1.869068738912071, + "grad_norm": 2.0625, + "learning_rate": 3.074176546131266e-06, + "loss": 0.9442, + "step": 9429 + }, + { + "epoch": 1.8692686339671671, + "grad_norm": 2.109375, + "learning_rate": 3.0732040905755333e-06, + "loss": 1.0042, + "step": 9430 + }, + { + "epoch": 1.8694685290222632, + "grad_norm": 2.15625, + "learning_rate": 3.0722317206078692e-06, + "loss": 1.0081, + "step": 9431 + }, + { + "epoch": 1.8696684240773593, + "grad_norm": 2.21875, + "learning_rate": 3.0712594362714677e-06, + "loss": 0.9252, + "step": 9432 + }, + { + "epoch": 1.8698683191324554, + "grad_norm": 2.25, + "learning_rate": 3.0702872376095146e-06, + "loss": 0.8877, + "step": 9433 + }, + { + "epoch": 1.8700682141875515, + "grad_norm": 2.125, + "learning_rate": 3.069315124665196e-06, + "loss": 0.9872, + "step": 9434 + }, + { + "epoch": 1.8702681092426476, + "grad_norm": 2.140625, + "learning_rate": 3.068343097481694e-06, + "loss": 0.9615, + "step": 9435 + }, + { + "epoch": 1.8704680042977437, + "grad_norm": 2.203125, + "learning_rate": 3.0673711561021857e-06, + "loss": 1.0107, + "step": 9436 + }, + { + "epoch": 1.8706678993528398, + "grad_norm": 2.0, + "learning_rate": 3.0663993005698444e-06, + "loss": 0.9041, + "step": 9437 + }, + { + "epoch": 1.870867794407936, + "grad_norm": 2.25, + "learning_rate": 3.0654275309278382e-06, + "loss": 1.033, + "step": 9438 + }, + { + "epoch": 1.871067689463032, + "grad_norm": 2.171875, + "learning_rate": 3.0644558472193355e-06, + "loss": 0.9913, + "step": 9439 + }, + { + "epoch": 1.8712675845181281, + "grad_norm": 2.0625, + "learning_rate": 3.0634842494874974e-06, + "loss": 0.9967, + "step": 9440 + }, + { + "epoch": 1.871467479573224, + "grad_norm": 2.140625, + "learning_rate": 3.0625127377754814e-06, + "loss": 0.9785, + "step": 9441 + }, + { + "epoch": 1.8716673746283201, + "grad_norm": 2.125, + "learning_rate": 3.0615413121264427e-06, + "loss": 0.9842, + "step": 9442 + }, + { + "epoch": 1.8718672696834162, + "grad_norm": 2.171875, + "learning_rate": 3.060569972583533e-06, + "loss": 0.9799, + "step": 9443 + }, + { + "epoch": 1.8720671647385123, + "grad_norm": 2.234375, + "learning_rate": 3.0595987191898968e-06, + "loss": 0.9764, + "step": 9444 + }, + { + "epoch": 1.8722670597936082, + "grad_norm": 2.3125, + "learning_rate": 3.0586275519886792e-06, + "loss": 1.0285, + "step": 9445 + }, + { + "epoch": 1.8724669548487043, + "grad_norm": 2.0, + "learning_rate": 3.057656471023018e-06, + "loss": 0.9092, + "step": 9446 + }, + { + "epoch": 1.8726668499038004, + "grad_norm": 2.15625, + "learning_rate": 3.056685476336048e-06, + "loss": 0.9515, + "step": 9447 + }, + { + "epoch": 1.8728667449588965, + "grad_norm": 2.09375, + "learning_rate": 3.0557145679709033e-06, + "loss": 0.9823, + "step": 9448 + }, + { + "epoch": 1.8730666400139926, + "grad_norm": 2.28125, + "learning_rate": 3.0547437459707086e-06, + "loss": 0.9461, + "step": 9449 + }, + { + "epoch": 1.8732665350690887, + "grad_norm": 2.203125, + "learning_rate": 3.053773010378589e-06, + "loss": 0.8853, + "step": 9450 + }, + { + "epoch": 1.8734664301241848, + "grad_norm": 2.265625, + "learning_rate": 3.052802361237665e-06, + "loss": 1.0347, + "step": 9451 + }, + { + "epoch": 1.873666325179281, + "grad_norm": 2.09375, + "learning_rate": 3.0518317985910516e-06, + "loss": 0.9607, + "step": 9452 + }, + { + "epoch": 1.873866220234377, + "grad_norm": 2.203125, + "learning_rate": 3.050861322481863e-06, + "loss": 0.9928, + "step": 9453 + }, + { + "epoch": 1.874066115289473, + "grad_norm": 2.234375, + "learning_rate": 3.0498909329532047e-06, + "loss": 1.031, + "step": 9454 + }, + { + "epoch": 1.8742660103445692, + "grad_norm": 2.1875, + "learning_rate": 3.0489206300481846e-06, + "loss": 0.9818, + "step": 9455 + }, + { + "epoch": 1.8744659053996653, + "grad_norm": 2.203125, + "learning_rate": 3.0479504138099e-06, + "loss": 1.0457, + "step": 9456 + }, + { + "epoch": 1.8746658004547614, + "grad_norm": 2.046875, + "learning_rate": 3.046980284281451e-06, + "loss": 0.8744, + "step": 9457 + }, + { + "epoch": 1.8748656955098575, + "grad_norm": 2.265625, + "learning_rate": 3.046010241505928e-06, + "loss": 0.9854, + "step": 9458 + }, + { + "epoch": 1.8750655905649534, + "grad_norm": 2.21875, + "learning_rate": 3.0450402855264204e-06, + "loss": 0.9403, + "step": 9459 + }, + { + "epoch": 1.8752654856200495, + "grad_norm": 2.109375, + "learning_rate": 3.0440704163860167e-06, + "loss": 1.02, + "step": 9460 + }, + { + "epoch": 1.8754653806751456, + "grad_norm": 2.0625, + "learning_rate": 3.0431006341277945e-06, + "loss": 0.9089, + "step": 9461 + }, + { + "epoch": 1.8756652757302417, + "grad_norm": 2.109375, + "learning_rate": 3.042130938794834e-06, + "loss": 0.8881, + "step": 9462 + }, + { + "epoch": 1.8758651707853375, + "grad_norm": 2.171875, + "learning_rate": 3.041161330430208e-06, + "loss": 0.8822, + "step": 9463 + }, + { + "epoch": 1.8760650658404336, + "grad_norm": 2.09375, + "learning_rate": 3.0401918090769868e-06, + "loss": 0.9422, + "step": 9464 + }, + { + "epoch": 1.8762649608955297, + "grad_norm": 2.140625, + "learning_rate": 3.0392223747782358e-06, + "loss": 0.9783, + "step": 9465 + }, + { + "epoch": 1.8764648559506258, + "grad_norm": 2.140625, + "learning_rate": 3.0382530275770172e-06, + "loss": 0.9372, + "step": 9466 + }, + { + "epoch": 1.876664751005722, + "grad_norm": 2.15625, + "learning_rate": 3.037283767516389e-06, + "loss": 0.9253, + "step": 9467 + }, + { + "epoch": 1.876864646060818, + "grad_norm": 2.140625, + "learning_rate": 3.036314594639408e-06, + "loss": 1.0243, + "step": 9468 + }, + { + "epoch": 1.8770645411159141, + "grad_norm": 2.125, + "learning_rate": 3.0353455089891213e-06, + "loss": 0.9791, + "step": 9469 + }, + { + "epoch": 1.8772644361710102, + "grad_norm": 2.203125, + "learning_rate": 3.0343765106085778e-06, + "loss": 0.973, + "step": 9470 + }, + { + "epoch": 1.8774643312261063, + "grad_norm": 2.21875, + "learning_rate": 3.0334075995408206e-06, + "loss": 1.0732, + "step": 9471 + }, + { + "epoch": 1.8776642262812024, + "grad_norm": 2.09375, + "learning_rate": 3.032438775828887e-06, + "loss": 0.931, + "step": 9472 + }, + { + "epoch": 1.8778641213362985, + "grad_norm": 2.09375, + "learning_rate": 3.0314700395158125e-06, + "loss": 1.0091, + "step": 9473 + }, + { + "epoch": 1.8780640163913946, + "grad_norm": 2.125, + "learning_rate": 3.0305013906446296e-06, + "loss": 0.9834, + "step": 9474 + }, + { + "epoch": 1.8782639114464907, + "grad_norm": 2.078125, + "learning_rate": 3.0295328292583636e-06, + "loss": 0.9206, + "step": 9475 + }, + { + "epoch": 1.8784638065015866, + "grad_norm": 2.171875, + "learning_rate": 3.0285643554000398e-06, + "loss": 1.083, + "step": 9476 + }, + { + "epoch": 1.8786637015566827, + "grad_norm": 2.046875, + "learning_rate": 3.0275959691126763e-06, + "loss": 0.9827, + "step": 9477 + }, + { + "epoch": 1.8788635966117788, + "grad_norm": 2.140625, + "learning_rate": 3.0266276704392884e-06, + "loss": 0.97, + "step": 9478 + }, + { + "epoch": 1.879063491666875, + "grad_norm": 2.328125, + "learning_rate": 3.0256594594228906e-06, + "loss": 1.0129, + "step": 9479 + }, + { + "epoch": 1.879263386721971, + "grad_norm": 2.09375, + "learning_rate": 3.024691336106487e-06, + "loss": 0.9386, + "step": 9480 + }, + { + "epoch": 1.879463281777067, + "grad_norm": 2.203125, + "learning_rate": 3.0237233005330833e-06, + "loss": 0.9983, + "step": 9481 + }, + { + "epoch": 1.879663176832163, + "grad_norm": 2.15625, + "learning_rate": 3.022755352745681e-06, + "loss": 0.9233, + "step": 9482 + }, + { + "epoch": 1.879863071887259, + "grad_norm": 2.125, + "learning_rate": 3.021787492787273e-06, + "loss": 1.0003, + "step": 9483 + }, + { + "epoch": 1.8800629669423552, + "grad_norm": 2.265625, + "learning_rate": 3.020819720700855e-06, + "loss": 1.0818, + "step": 9484 + }, + { + "epoch": 1.8802628619974513, + "grad_norm": 2.09375, + "learning_rate": 3.019852036529412e-06, + "loss": 0.9582, + "step": 9485 + }, + { + "epoch": 1.8804627570525474, + "grad_norm": 2.109375, + "learning_rate": 3.018884440315932e-06, + "loss": 0.9328, + "step": 9486 + }, + { + "epoch": 1.8806626521076435, + "grad_norm": 2.1875, + "learning_rate": 3.0179169321033926e-06, + "loss": 0.9897, + "step": 9487 + }, + { + "epoch": 1.8808625471627396, + "grad_norm": 2.0625, + "learning_rate": 3.016949511934771e-06, + "loss": 0.9947, + "step": 9488 + }, + { + "epoch": 1.8810624422178357, + "grad_norm": 2.125, + "learning_rate": 3.015982179853041e-06, + "loss": 0.9682, + "step": 9489 + }, + { + "epoch": 1.8812623372729318, + "grad_norm": 2.203125, + "learning_rate": 3.0150149359011694e-06, + "loss": 0.962, + "step": 9490 + }, + { + "epoch": 1.881462232328028, + "grad_norm": 2.234375, + "learning_rate": 3.0140477801221235e-06, + "loss": 1.0181, + "step": 9491 + }, + { + "epoch": 1.881662127383124, + "grad_norm": 2.28125, + "learning_rate": 3.0130807125588625e-06, + "loss": 1.0336, + "step": 9492 + }, + { + "epoch": 1.88186202243822, + "grad_norm": 2.125, + "learning_rate": 3.0121137332543438e-06, + "loss": 0.9699, + "step": 9493 + }, + { + "epoch": 1.882061917493316, + "grad_norm": 2.328125, + "learning_rate": 3.0111468422515215e-06, + "loss": 1.0088, + "step": 9494 + }, + { + "epoch": 1.882261812548412, + "grad_norm": 2.0625, + "learning_rate": 3.0101800395933433e-06, + "loss": 0.9987, + "step": 9495 + }, + { + "epoch": 1.8824617076035082, + "grad_norm": 2.125, + "learning_rate": 3.0092133253227563e-06, + "loss": 0.9607, + "step": 9496 + }, + { + "epoch": 1.8826616026586043, + "grad_norm": 2.21875, + "learning_rate": 3.0082466994827e-06, + "loss": 0.9607, + "step": 9497 + }, + { + "epoch": 1.8828614977137001, + "grad_norm": 2.171875, + "learning_rate": 3.007280162116112e-06, + "loss": 0.9824, + "step": 9498 + }, + { + "epoch": 1.8830613927687962, + "grad_norm": 2.109375, + "learning_rate": 3.0063137132659277e-06, + "loss": 1.0015, + "step": 9499 + }, + { + "epoch": 1.8832612878238923, + "grad_norm": 2.140625, + "learning_rate": 3.0053473529750743e-06, + "loss": 0.907, + "step": 9500 + }, + { + "epoch": 1.8834611828789884, + "grad_norm": 2.046875, + "learning_rate": 3.0043810812864783e-06, + "loss": 0.9253, + "step": 9501 + }, + { + "epoch": 1.8836610779340845, + "grad_norm": 2.21875, + "learning_rate": 3.0034148982430624e-06, + "loss": 0.951, + "step": 9502 + }, + { + "epoch": 1.8838609729891806, + "grad_norm": 2.1875, + "learning_rate": 3.002448803887743e-06, + "loss": 0.9961, + "step": 9503 + }, + { + "epoch": 1.8840608680442767, + "grad_norm": 2.203125, + "learning_rate": 3.001482798263435e-06, + "loss": 1.0291, + "step": 9504 + }, + { + "epoch": 1.8842607630993728, + "grad_norm": 2.09375, + "learning_rate": 3.0005168814130463e-06, + "loss": 0.9918, + "step": 9505 + }, + { + "epoch": 1.884460658154469, + "grad_norm": 2.78125, + "learning_rate": 2.9995510533794846e-06, + "loss": 0.971, + "step": 9506 + }, + { + "epoch": 1.884660553209565, + "grad_norm": 2.40625, + "learning_rate": 2.9985853142056527e-06, + "loss": 0.9807, + "step": 9507 + }, + { + "epoch": 1.8848604482646611, + "grad_norm": 2.1875, + "learning_rate": 2.997619663934446e-06, + "loss": 0.9397, + "step": 9508 + }, + { + "epoch": 1.8850603433197572, + "grad_norm": 2.3125, + "learning_rate": 2.9966541026087602e-06, + "loss": 0.964, + "step": 9509 + }, + { + "epoch": 1.8852602383748533, + "grad_norm": 2.0625, + "learning_rate": 2.995688630271486e-06, + "loss": 1.0068, + "step": 9510 + }, + { + "epoch": 1.8854601334299492, + "grad_norm": 2.171875, + "learning_rate": 2.994723246965508e-06, + "loss": 0.9264, + "step": 9511 + }, + { + "epoch": 1.8856600284850453, + "grad_norm": 2.09375, + "learning_rate": 2.9937579527337092e-06, + "loss": 0.9491, + "step": 9512 + }, + { + "epoch": 1.8858599235401414, + "grad_norm": 2.125, + "learning_rate": 2.992792747618969e-06, + "loss": 1.0221, + "step": 9513 + }, + { + "epoch": 1.8860598185952375, + "grad_norm": 2.171875, + "learning_rate": 2.9918276316641592e-06, + "loss": 0.957, + "step": 9514 + }, + { + "epoch": 1.8862597136503336, + "grad_norm": 2.171875, + "learning_rate": 2.9908626049121523e-06, + "loss": 0.936, + "step": 9515 + }, + { + "epoch": 1.8864596087054295, + "grad_norm": 2.1875, + "learning_rate": 2.989897667405813e-06, + "loss": 0.9702, + "step": 9516 + }, + { + "epoch": 1.8866595037605256, + "grad_norm": 2.109375, + "learning_rate": 2.9889328191880075e-06, + "loss": 0.9699, + "step": 9517 + }, + { + "epoch": 1.8868593988156217, + "grad_norm": 2.09375, + "learning_rate": 2.987968060301588e-06, + "loss": 1.0664, + "step": 9518 + }, + { + "epoch": 1.8870592938707178, + "grad_norm": 2.25, + "learning_rate": 2.9870033907894146e-06, + "loss": 1.07, + "step": 9519 + }, + { + "epoch": 1.887259188925814, + "grad_norm": 2.125, + "learning_rate": 2.986038810694334e-06, + "loss": 0.9554, + "step": 9520 + }, + { + "epoch": 1.88745908398091, + "grad_norm": 2.078125, + "learning_rate": 2.985074320059195e-06, + "loss": 0.9381, + "step": 9521 + }, + { + "epoch": 1.887658979036006, + "grad_norm": 2.015625, + "learning_rate": 2.9841099189268397e-06, + "loss": 0.8263, + "step": 9522 + }, + { + "epoch": 1.8878588740911022, + "grad_norm": 2.0, + "learning_rate": 2.9831456073401056e-06, + "loss": 0.8908, + "step": 9523 + }, + { + "epoch": 1.8880587691461983, + "grad_norm": 2.15625, + "learning_rate": 2.982181385341828e-06, + "loss": 1.0653, + "step": 9524 + }, + { + "epoch": 1.8882586642012944, + "grad_norm": 2.125, + "learning_rate": 2.9812172529748395e-06, + "loss": 0.9158, + "step": 9525 + }, + { + "epoch": 1.8884585592563905, + "grad_norm": 2.203125, + "learning_rate": 2.9802532102819637e-06, + "loss": 1.1446, + "step": 9526 + }, + { + "epoch": 1.8886584543114866, + "grad_norm": 2.15625, + "learning_rate": 2.9792892573060257e-06, + "loss": 0.9912, + "step": 9527 + }, + { + "epoch": 1.8888583493665827, + "grad_norm": 2.125, + "learning_rate": 2.9783253940898417e-06, + "loss": 0.9292, + "step": 9528 + }, + { + "epoch": 1.8890582444216786, + "grad_norm": 2.1875, + "learning_rate": 2.977361620676228e-06, + "loss": 0.9676, + "step": 9529 + }, + { + "epoch": 1.8892581394767747, + "grad_norm": 2.25, + "learning_rate": 2.976397937107996e-06, + "loss": 1.0119, + "step": 9530 + }, + { + "epoch": 1.8894580345318708, + "grad_norm": 2.140625, + "learning_rate": 2.9754343434279504e-06, + "loss": 1.0663, + "step": 9531 + }, + { + "epoch": 1.8896579295869669, + "grad_norm": 2.09375, + "learning_rate": 2.974470839678895e-06, + "loss": 0.939, + "step": 9532 + }, + { + "epoch": 1.8898578246420628, + "grad_norm": 2.5, + "learning_rate": 2.97350742590363e-06, + "loss": 0.9304, + "step": 9533 + }, + { + "epoch": 1.8900577196971589, + "grad_norm": 2.140625, + "learning_rate": 2.9725441021449477e-06, + "loss": 0.9278, + "step": 9534 + }, + { + "epoch": 1.890257614752255, + "grad_norm": 2.21875, + "learning_rate": 2.9715808684456402e-06, + "loss": 0.952, + "step": 9535 + }, + { + "epoch": 1.890457509807351, + "grad_norm": 2.15625, + "learning_rate": 2.9706177248484936e-06, + "loss": 1.0302, + "step": 9536 + }, + { + "epoch": 1.8906574048624472, + "grad_norm": 2.234375, + "learning_rate": 2.9696546713962904e-06, + "loss": 0.9503, + "step": 9537 + }, + { + "epoch": 1.8908572999175433, + "grad_norm": 2.125, + "learning_rate": 2.968691708131811e-06, + "loss": 0.9209, + "step": 9538 + }, + { + "epoch": 1.8910571949726394, + "grad_norm": 2.34375, + "learning_rate": 2.9677288350978286e-06, + "loss": 0.9386, + "step": 9539 + }, + { + "epoch": 1.8912570900277355, + "grad_norm": 2.1875, + "learning_rate": 2.9667660523371134e-06, + "loss": 0.9958, + "step": 9540 + }, + { + "epoch": 1.8914569850828316, + "grad_norm": 2.25, + "learning_rate": 2.965803359892434e-06, + "loss": 0.9637, + "step": 9541 + }, + { + "epoch": 1.8916568801379277, + "grad_norm": 2.078125, + "learning_rate": 2.9648407578065515e-06, + "loss": 1.0209, + "step": 9542 + }, + { + "epoch": 1.8918567751930238, + "grad_norm": 2.234375, + "learning_rate": 2.9638782461222258e-06, + "loss": 1.0401, + "step": 9543 + }, + { + "epoch": 1.8920566702481199, + "grad_norm": 2.1875, + "learning_rate": 2.9629158248822105e-06, + "loss": 0.9794, + "step": 9544 + }, + { + "epoch": 1.892256565303216, + "grad_norm": 2.09375, + "learning_rate": 2.9619534941292562e-06, + "loss": 0.9372, + "step": 9545 + }, + { + "epoch": 1.8924564603583118, + "grad_norm": 2.1875, + "learning_rate": 2.9609912539061114e-06, + "loss": 0.9428, + "step": 9546 + }, + { + "epoch": 1.892656355413408, + "grad_norm": 2.171875, + "learning_rate": 2.960029104255516e-06, + "loss": 0.9534, + "step": 9547 + }, + { + "epoch": 1.892856250468504, + "grad_norm": 2.234375, + "learning_rate": 2.959067045220212e-06, + "loss": 1.0459, + "step": 9548 + }, + { + "epoch": 1.8930561455236001, + "grad_norm": 2.140625, + "learning_rate": 2.95810507684293e-06, + "loss": 1.0112, + "step": 9549 + }, + { + "epoch": 1.8932560405786962, + "grad_norm": 2.109375, + "learning_rate": 2.9571431991664036e-06, + "loss": 0.9578, + "step": 9550 + }, + { + "epoch": 1.893455935633792, + "grad_norm": 2.140625, + "learning_rate": 2.9561814122333564e-06, + "loss": 1.0109, + "step": 9551 + }, + { + "epoch": 1.8936558306888882, + "grad_norm": 2.109375, + "learning_rate": 2.955219716086514e-06, + "loss": 0.8693, + "step": 9552 + }, + { + "epoch": 1.8938557257439843, + "grad_norm": 2.078125, + "learning_rate": 2.954258110768593e-06, + "loss": 0.9462, + "step": 9553 + }, + { + "epoch": 1.8940556207990804, + "grad_norm": 2.109375, + "learning_rate": 2.9532965963223076e-06, + "loss": 1.0066, + "step": 9554 + }, + { + "epoch": 1.8942555158541765, + "grad_norm": 2.21875, + "learning_rate": 2.95233517279037e-06, + "loss": 0.9776, + "step": 9555 + }, + { + "epoch": 1.8944554109092726, + "grad_norm": 2.15625, + "learning_rate": 2.951373840215484e-06, + "loss": 1.0024, + "step": 9556 + }, + { + "epoch": 1.8946553059643687, + "grad_norm": 2.078125, + "learning_rate": 2.950412598640353e-06, + "loss": 0.9587, + "step": 9557 + }, + { + "epoch": 1.8948552010194648, + "grad_norm": 2.21875, + "learning_rate": 2.9494514481076773e-06, + "loss": 1.0363, + "step": 9558 + }, + { + "epoch": 1.895055096074561, + "grad_norm": 2.25, + "learning_rate": 2.948490388660148e-06, + "loss": 1.0407, + "step": 9559 + }, + { + "epoch": 1.895254991129657, + "grad_norm": 2.109375, + "learning_rate": 2.9475294203404557e-06, + "loss": 0.9162, + "step": 9560 + }, + { + "epoch": 1.895454886184753, + "grad_norm": 2.046875, + "learning_rate": 2.9465685431912895e-06, + "loss": 0.9416, + "step": 9561 + }, + { + "epoch": 1.8956547812398492, + "grad_norm": 2.25, + "learning_rate": 2.945607757255328e-06, + "loss": 1.0465, + "step": 9562 + }, + { + "epoch": 1.8958546762949453, + "grad_norm": 2.09375, + "learning_rate": 2.9446470625752497e-06, + "loss": 0.964, + "step": 9563 + }, + { + "epoch": 1.8960545713500412, + "grad_norm": 2.109375, + "learning_rate": 2.9436864591937312e-06, + "loss": 0.9395, + "step": 9564 + }, + { + "epoch": 1.8962544664051373, + "grad_norm": 2.15625, + "learning_rate": 2.9427259471534396e-06, + "loss": 0.9681, + "step": 9565 + }, + { + "epoch": 1.8964543614602334, + "grad_norm": 2.1875, + "learning_rate": 2.9417655264970424e-06, + "loss": 1.02, + "step": 9566 + }, + { + "epoch": 1.8966542565153295, + "grad_norm": 2.09375, + "learning_rate": 2.9408051972672e-06, + "loss": 0.9638, + "step": 9567 + }, + { + "epoch": 1.8968541515704254, + "grad_norm": 2.140625, + "learning_rate": 2.9398449595065705e-06, + "loss": 1.0314, + "step": 9568 + }, + { + "epoch": 1.8970540466255215, + "grad_norm": 2.15625, + "learning_rate": 2.9388848132578087e-06, + "loss": 0.8992, + "step": 9569 + }, + { + "epoch": 1.8972539416806176, + "grad_norm": 2.15625, + "learning_rate": 2.937924758563563e-06, + "loss": 0.9696, + "step": 9570 + }, + { + "epoch": 1.8974538367357137, + "grad_norm": 2.171875, + "learning_rate": 2.9369647954664783e-06, + "loss": 0.9212, + "step": 9571 + }, + { + "epoch": 1.8976537317908098, + "grad_norm": 2.265625, + "learning_rate": 2.9360049240091988e-06, + "loss": 0.9186, + "step": 9572 + }, + { + "epoch": 1.8978536268459059, + "grad_norm": 2.421875, + "learning_rate": 2.9350451442343585e-06, + "loss": 0.9811, + "step": 9573 + }, + { + "epoch": 1.898053521901002, + "grad_norm": 2.140625, + "learning_rate": 2.9340854561845945e-06, + "loss": 1.0047, + "step": 9574 + }, + { + "epoch": 1.898253416956098, + "grad_norm": 2.21875, + "learning_rate": 2.933125859902532e-06, + "loss": 1.0303, + "step": 9575 + }, + { + "epoch": 1.8984533120111942, + "grad_norm": 2.109375, + "learning_rate": 2.9321663554307977e-06, + "loss": 0.9834, + "step": 9576 + }, + { + "epoch": 1.8986532070662903, + "grad_norm": 2.109375, + "learning_rate": 2.931206942812015e-06, + "loss": 0.9697, + "step": 9577 + }, + { + "epoch": 1.8988531021213864, + "grad_norm": 2.25, + "learning_rate": 2.9302476220887975e-06, + "loss": 1.0374, + "step": 9578 + }, + { + "epoch": 1.8990529971764825, + "grad_norm": 2.21875, + "learning_rate": 2.929288393303762e-06, + "loss": 0.9615, + "step": 9579 + }, + { + "epoch": 1.8992528922315786, + "grad_norm": 2.078125, + "learning_rate": 2.928329256499512e-06, + "loss": 0.9689, + "step": 9580 + }, + { + "epoch": 1.8994527872866747, + "grad_norm": 2.171875, + "learning_rate": 2.9273702117186564e-06, + "loss": 0.9642, + "step": 9581 + }, + { + "epoch": 1.8996526823417705, + "grad_norm": 2.140625, + "learning_rate": 2.926411259003794e-06, + "loss": 1.0004, + "step": 9582 + }, + { + "epoch": 1.8998525773968666, + "grad_norm": 2.234375, + "learning_rate": 2.9254523983975224e-06, + "loss": 0.9807, + "step": 9583 + }, + { + "epoch": 1.9000524724519627, + "grad_norm": 2.171875, + "learning_rate": 2.924493629942434e-06, + "loss": 0.9657, + "step": 9584 + }, + { + "epoch": 1.9002523675070588, + "grad_norm": 2.09375, + "learning_rate": 2.923534953681116e-06, + "loss": 1.0369, + "step": 9585 + }, + { + "epoch": 1.9004522625621547, + "grad_norm": 2.09375, + "learning_rate": 2.922576369656155e-06, + "loss": 0.8696, + "step": 9586 + }, + { + "epoch": 1.9006521576172508, + "grad_norm": 2.0625, + "learning_rate": 2.9216178779101276e-06, + "loss": 0.9599, + "step": 9587 + }, + { + "epoch": 1.900852052672347, + "grad_norm": 2.09375, + "learning_rate": 2.9206594784856133e-06, + "loss": 0.9909, + "step": 9588 + }, + { + "epoch": 1.901051947727443, + "grad_norm": 2.125, + "learning_rate": 2.9197011714251833e-06, + "loss": 1.0464, + "step": 9589 + }, + { + "epoch": 1.9012518427825391, + "grad_norm": 2.0625, + "learning_rate": 2.9187429567714044e-06, + "loss": 0.9008, + "step": 9590 + }, + { + "epoch": 1.9014517378376352, + "grad_norm": 2.125, + "learning_rate": 2.9177848345668426e-06, + "loss": 1.0138, + "step": 9591 + }, + { + "epoch": 1.9016516328927313, + "grad_norm": 2.109375, + "learning_rate": 2.9168268048540527e-06, + "loss": 1.0418, + "step": 9592 + }, + { + "epoch": 1.9018515279478274, + "grad_norm": 2.109375, + "learning_rate": 2.9158688676755966e-06, + "loss": 0.9981, + "step": 9593 + }, + { + "epoch": 1.9020514230029235, + "grad_norm": 2.015625, + "learning_rate": 2.914911023074023e-06, + "loss": 0.8566, + "step": 9594 + }, + { + "epoch": 1.9022513180580196, + "grad_norm": 2.171875, + "learning_rate": 2.913953271091876e-06, + "loss": 0.9912, + "step": 9595 + }, + { + "epoch": 1.9024512131131157, + "grad_norm": 2.125, + "learning_rate": 2.912995611771705e-06, + "loss": 0.9226, + "step": 9596 + }, + { + "epoch": 1.9026511081682118, + "grad_norm": 2.203125, + "learning_rate": 2.9120380451560456e-06, + "loss": 0.948, + "step": 9597 + }, + { + "epoch": 1.902851003223308, + "grad_norm": 2.296875, + "learning_rate": 2.911080571287433e-06, + "loss": 0.9533, + "step": 9598 + }, + { + "epoch": 1.9030508982784038, + "grad_norm": 2.296875, + "learning_rate": 2.9101231902083963e-06, + "loss": 1.0581, + "step": 9599 + }, + { + "epoch": 1.9032507933335, + "grad_norm": 2.1875, + "learning_rate": 2.909165901961467e-06, + "loss": 1.0024, + "step": 9600 + }, + { + "epoch": 1.903450688388596, + "grad_norm": 2.15625, + "learning_rate": 2.908208706589164e-06, + "loss": 0.9489, + "step": 9601 + }, + { + "epoch": 1.903650583443692, + "grad_norm": 2.125, + "learning_rate": 2.907251604134006e-06, + "loss": 0.9358, + "step": 9602 + }, + { + "epoch": 1.9038504784987882, + "grad_norm": 2.25, + "learning_rate": 2.9062945946385092e-06, + "loss": 1.04, + "step": 9603 + }, + { + "epoch": 1.904050373553884, + "grad_norm": 2.0625, + "learning_rate": 2.9053376781451836e-06, + "loss": 0.9114, + "step": 9604 + }, + { + "epoch": 1.9042502686089802, + "grad_norm": 2.109375, + "learning_rate": 2.904380854696532e-06, + "loss": 0.9902, + "step": 9605 + }, + { + "epoch": 1.9044501636640763, + "grad_norm": 2.15625, + "learning_rate": 2.9034241243350615e-06, + "loss": 0.9319, + "step": 9606 + }, + { + "epoch": 1.9046500587191724, + "grad_norm": 2.125, + "learning_rate": 2.902467487103267e-06, + "loss": 0.96, + "step": 9607 + }, + { + "epoch": 1.9048499537742685, + "grad_norm": 2.171875, + "learning_rate": 2.901510943043641e-06, + "loss": 0.9625, + "step": 9608 + }, + { + "epoch": 1.9050498488293646, + "grad_norm": 2.046875, + "learning_rate": 2.9005544921986774e-06, + "loss": 0.8741, + "step": 9609 + }, + { + "epoch": 1.9052497438844607, + "grad_norm": 2.203125, + "learning_rate": 2.8995981346108598e-06, + "loss": 1.0453, + "step": 9610 + }, + { + "epoch": 1.9054496389395568, + "grad_norm": 2.046875, + "learning_rate": 2.8986418703226655e-06, + "loss": 0.9462, + "step": 9611 + }, + { + "epoch": 1.9056495339946529, + "grad_norm": 2.09375, + "learning_rate": 2.8976856993765766e-06, + "loss": 0.9167, + "step": 9612 + }, + { + "epoch": 1.905849429049749, + "grad_norm": 2.265625, + "learning_rate": 2.896729621815064e-06, + "loss": 1.0164, + "step": 9613 + }, + { + "epoch": 1.906049324104845, + "grad_norm": 2.15625, + "learning_rate": 2.8957736376805963e-06, + "loss": 0.9024, + "step": 9614 + }, + { + "epoch": 1.9062492191599412, + "grad_norm": 2.046875, + "learning_rate": 2.8948177470156404e-06, + "loss": 0.9204, + "step": 9615 + }, + { + "epoch": 1.9064491142150373, + "grad_norm": 2.046875, + "learning_rate": 2.8938619498626542e-06, + "loss": 0.9531, + "step": 9616 + }, + { + "epoch": 1.9066490092701331, + "grad_norm": 2.296875, + "learning_rate": 2.8929062462640946e-06, + "loss": 1.0757, + "step": 9617 + }, + { + "epoch": 1.9068489043252292, + "grad_norm": 2.140625, + "learning_rate": 2.8919506362624156e-06, + "loss": 0.9839, + "step": 9618 + }, + { + "epoch": 1.9070487993803253, + "grad_norm": 2.046875, + "learning_rate": 2.8909951199000645e-06, + "loss": 0.8796, + "step": 9619 + }, + { + "epoch": 1.9072486944354214, + "grad_norm": 2.203125, + "learning_rate": 2.8900396972194834e-06, + "loss": 1.0045, + "step": 9620 + }, + { + "epoch": 1.9074485894905173, + "grad_norm": 2.234375, + "learning_rate": 2.8890843682631147e-06, + "loss": 0.9391, + "step": 9621 + }, + { + "epoch": 1.9076484845456134, + "grad_norm": 2.21875, + "learning_rate": 2.8881291330733933e-06, + "loss": 0.9263, + "step": 9622 + }, + { + "epoch": 1.9078483796007095, + "grad_norm": 2.15625, + "learning_rate": 2.8871739916927487e-06, + "loss": 1.0736, + "step": 9623 + }, + { + "epoch": 1.9080482746558056, + "grad_norm": 2.0625, + "learning_rate": 2.8862189441636113e-06, + "loss": 0.8903, + "step": 9624 + }, + { + "epoch": 1.9082481697109017, + "grad_norm": 2.109375, + "learning_rate": 2.8852639905284026e-06, + "loss": 0.9666, + "step": 9625 + }, + { + "epoch": 1.9084480647659978, + "grad_norm": 2.203125, + "learning_rate": 2.8843091308295395e-06, + "loss": 0.9188, + "step": 9626 + }, + { + "epoch": 1.908647959821094, + "grad_norm": 2.03125, + "learning_rate": 2.883354365109441e-06, + "loss": 0.917, + "step": 9627 + }, + { + "epoch": 1.90884785487619, + "grad_norm": 2.0625, + "learning_rate": 2.882399693410516e-06, + "loss": 0.9333, + "step": 9628 + }, + { + "epoch": 1.9090477499312861, + "grad_norm": 2.046875, + "learning_rate": 2.8814451157751697e-06, + "loss": 0.9324, + "step": 9629 + }, + { + "epoch": 1.9092476449863822, + "grad_norm": 2.0625, + "learning_rate": 2.880490632245803e-06, + "loss": 0.9385, + "step": 9630 + }, + { + "epoch": 1.9094475400414783, + "grad_norm": 2.21875, + "learning_rate": 2.8795362428648186e-06, + "loss": 0.8715, + "step": 9631 + }, + { + "epoch": 1.9096474350965744, + "grad_norm": 2.21875, + "learning_rate": 2.878581947674608e-06, + "loss": 0.8694, + "step": 9632 + }, + { + "epoch": 1.9098473301516705, + "grad_norm": 2.0625, + "learning_rate": 2.8776277467175583e-06, + "loss": 0.8945, + "step": 9633 + }, + { + "epoch": 1.9100472252067664, + "grad_norm": 2.21875, + "learning_rate": 2.8766736400360595e-06, + "loss": 0.9825, + "step": 9634 + }, + { + "epoch": 1.9102471202618625, + "grad_norm": 2.203125, + "learning_rate": 2.875719627672491e-06, + "loss": 1.0558, + "step": 9635 + }, + { + "epoch": 1.9104470153169586, + "grad_norm": 2.15625, + "learning_rate": 2.874765709669227e-06, + "loss": 0.9971, + "step": 9636 + }, + { + "epoch": 1.9106469103720547, + "grad_norm": 2.21875, + "learning_rate": 2.8738118860686457e-06, + "loss": 0.9931, + "step": 9637 + }, + { + "epoch": 1.9108468054271508, + "grad_norm": 2.203125, + "learning_rate": 2.8728581569131137e-06, + "loss": 1.0429, + "step": 9638 + }, + { + "epoch": 1.9110467004822467, + "grad_norm": 2.15625, + "learning_rate": 2.8719045222449925e-06, + "loss": 0.9666, + "step": 9639 + }, + { + "epoch": 1.9112465955373428, + "grad_norm": 2.046875, + "learning_rate": 2.8709509821066478e-06, + "loss": 0.9532, + "step": 9640 + }, + { + "epoch": 1.9114464905924389, + "grad_norm": 2.171875, + "learning_rate": 2.869997536540435e-06, + "loss": 0.9724, + "step": 9641 + }, + { + "epoch": 1.911646385647535, + "grad_norm": 2.265625, + "learning_rate": 2.8690441855887e-06, + "loss": 0.9422, + "step": 9642 + }, + { + "epoch": 1.911846280702631, + "grad_norm": 2.09375, + "learning_rate": 2.8680909292937965e-06, + "loss": 0.963, + "step": 9643 + }, + { + "epoch": 1.9120461757577272, + "grad_norm": 2.28125, + "learning_rate": 2.867137767698066e-06, + "loss": 0.9424, + "step": 9644 + }, + { + "epoch": 1.9122460708128233, + "grad_norm": 2.203125, + "learning_rate": 2.8661847008438466e-06, + "loss": 0.9263, + "step": 9645 + }, + { + "epoch": 1.9124459658679194, + "grad_norm": 2.171875, + "learning_rate": 2.8652317287734766e-06, + "loss": 0.9459, + "step": 9646 + }, + { + "epoch": 1.9126458609230155, + "grad_norm": 2.15625, + "learning_rate": 2.8642788515292854e-06, + "loss": 0.9803, + "step": 9647 + }, + { + "epoch": 1.9128457559781116, + "grad_norm": 2.21875, + "learning_rate": 2.8633260691535973e-06, + "loss": 1.0479, + "step": 9648 + }, + { + "epoch": 1.9130456510332077, + "grad_norm": 2.265625, + "learning_rate": 2.862373381688739e-06, + "loss": 0.8759, + "step": 9649 + }, + { + "epoch": 1.9132455460883038, + "grad_norm": 2.140625, + "learning_rate": 2.8614207891770275e-06, + "loss": 0.9741, + "step": 9650 + }, + { + "epoch": 1.9134454411433999, + "grad_norm": 2.359375, + "learning_rate": 2.8604682916607728e-06, + "loss": 0.9743, + "step": 9651 + }, + { + "epoch": 1.9136453361984957, + "grad_norm": 2.09375, + "learning_rate": 2.859515889182291e-06, + "loss": 1.0026, + "step": 9652 + }, + { + "epoch": 1.9138452312535918, + "grad_norm": 2.125, + "learning_rate": 2.858563581783885e-06, + "loss": 0.9185, + "step": 9653 + }, + { + "epoch": 1.914045126308688, + "grad_norm": 2.125, + "learning_rate": 2.8576113695078534e-06, + "loss": 0.9673, + "step": 9654 + }, + { + "epoch": 1.914245021363784, + "grad_norm": 2.15625, + "learning_rate": 2.856659252396498e-06, + "loss": 0.9226, + "step": 9655 + }, + { + "epoch": 1.91444491641888, + "grad_norm": 2.15625, + "learning_rate": 2.8557072304921094e-06, + "loss": 1.0275, + "step": 9656 + }, + { + "epoch": 1.914644811473976, + "grad_norm": 2.40625, + "learning_rate": 2.8547553038369756e-06, + "loss": 0.9081, + "step": 9657 + }, + { + "epoch": 1.9148447065290721, + "grad_norm": 2.265625, + "learning_rate": 2.853803472473383e-06, + "loss": 0.988, + "step": 9658 + }, + { + "epoch": 1.9150446015841682, + "grad_norm": 2.125, + "learning_rate": 2.8528517364436116e-06, + "loss": 0.9797, + "step": 9659 + }, + { + "epoch": 1.9152444966392643, + "grad_norm": 2.1875, + "learning_rate": 2.8519000957899368e-06, + "loss": 0.9943, + "step": 9660 + }, + { + "epoch": 1.9154443916943604, + "grad_norm": 2.21875, + "learning_rate": 2.850948550554628e-06, + "loss": 1.0108, + "step": 9661 + }, + { + "epoch": 1.9156442867494565, + "grad_norm": 2.25, + "learning_rate": 2.8499971007799576e-06, + "loss": 0.9656, + "step": 9662 + }, + { + "epoch": 1.9158441818045526, + "grad_norm": 1.984375, + "learning_rate": 2.8490457465081853e-06, + "loss": 0.8991, + "step": 9663 + }, + { + "epoch": 1.9160440768596487, + "grad_norm": 2.328125, + "learning_rate": 2.84809448778157e-06, + "loss": 1.0282, + "step": 9664 + }, + { + "epoch": 1.9162439719147448, + "grad_norm": 2.078125, + "learning_rate": 2.8471433246423697e-06, + "loss": 0.9533, + "step": 9665 + }, + { + "epoch": 1.916443866969841, + "grad_norm": 2.203125, + "learning_rate": 2.8461922571328327e-06, + "loss": 0.9021, + "step": 9666 + }, + { + "epoch": 1.916643762024937, + "grad_norm": 2.171875, + "learning_rate": 2.8452412852952037e-06, + "loss": 0.8995, + "step": 9667 + }, + { + "epoch": 1.9168436570800331, + "grad_norm": 2.21875, + "learning_rate": 2.844290409171729e-06, + "loss": 0.9884, + "step": 9668 + }, + { + "epoch": 1.917043552135129, + "grad_norm": 2.171875, + "learning_rate": 2.8433396288046433e-06, + "loss": 0.977, + "step": 9669 + }, + { + "epoch": 1.917243447190225, + "grad_norm": 2.171875, + "learning_rate": 2.8423889442361797e-06, + "loss": 1.0235, + "step": 9670 + }, + { + "epoch": 1.9174433422453212, + "grad_norm": 2.1875, + "learning_rate": 2.8414383555085708e-06, + "loss": 0.9775, + "step": 9671 + }, + { + "epoch": 1.9176432373004173, + "grad_norm": 2.140625, + "learning_rate": 2.8404878626640408e-06, + "loss": 0.9528, + "step": 9672 + }, + { + "epoch": 1.9178431323555134, + "grad_norm": 2.328125, + "learning_rate": 2.839537465744806e-06, + "loss": 1.037, + "step": 9673 + }, + { + "epoch": 1.9180430274106093, + "grad_norm": 2.234375, + "learning_rate": 2.8385871647930886e-06, + "loss": 1.0233, + "step": 9674 + }, + { + "epoch": 1.9182429224657054, + "grad_norm": 2.140625, + "learning_rate": 2.837636959851098e-06, + "loss": 1.0528, + "step": 9675 + }, + { + "epoch": 1.9184428175208015, + "grad_norm": 2.1875, + "learning_rate": 2.83668685096104e-06, + "loss": 0.9207, + "step": 9676 + }, + { + "epoch": 1.9186427125758976, + "grad_norm": 2.171875, + "learning_rate": 2.8357368381651242e-06, + "loss": 1.0023, + "step": 9677 + }, + { + "epoch": 1.9188426076309937, + "grad_norm": 2.265625, + "learning_rate": 2.8347869215055455e-06, + "loss": 1.0113, + "step": 9678 + }, + { + "epoch": 1.9190425026860898, + "grad_norm": 2.140625, + "learning_rate": 2.8338371010244997e-06, + "loss": 1.0195, + "step": 9679 + }, + { + "epoch": 1.9192423977411859, + "grad_norm": 2.078125, + "learning_rate": 2.83288737676418e-06, + "loss": 0.9366, + "step": 9680 + }, + { + "epoch": 1.919442292796282, + "grad_norm": 2.1875, + "learning_rate": 2.831937748766772e-06, + "loss": 0.9912, + "step": 9681 + }, + { + "epoch": 1.919642187851378, + "grad_norm": 2.171875, + "learning_rate": 2.830988217074455e-06, + "loss": 0.9882, + "step": 9682 + }, + { + "epoch": 1.9198420829064742, + "grad_norm": 2.21875, + "learning_rate": 2.8300387817294122e-06, + "loss": 1.0041, + "step": 9683 + }, + { + "epoch": 1.9200419779615703, + "grad_norm": 2.0625, + "learning_rate": 2.8290894427738148e-06, + "loss": 0.9378, + "step": 9684 + }, + { + "epoch": 1.9202418730166664, + "grad_norm": 2.15625, + "learning_rate": 2.828140200249831e-06, + "loss": 0.9609, + "step": 9685 + }, + { + "epoch": 1.9204417680717625, + "grad_norm": 2.171875, + "learning_rate": 2.827191054199629e-06, + "loss": 0.9683, + "step": 9686 + }, + { + "epoch": 1.9206416631268584, + "grad_norm": 2.15625, + "learning_rate": 2.826242004665368e-06, + "loss": 1.0936, + "step": 9687 + }, + { + "epoch": 1.9208415581819545, + "grad_norm": 2.109375, + "learning_rate": 2.825293051689204e-06, + "loss": 0.9332, + "step": 9688 + }, + { + "epoch": 1.9210414532370506, + "grad_norm": 2.140625, + "learning_rate": 2.8243441953132918e-06, + "loss": 1.0441, + "step": 9689 + }, + { + "epoch": 1.9212413482921467, + "grad_norm": 2.0625, + "learning_rate": 2.8233954355797775e-06, + "loss": 0.9193, + "step": 9690 + }, + { + "epoch": 1.9214412433472425, + "grad_norm": 2.109375, + "learning_rate": 2.8224467725308064e-06, + "loss": 0.9336, + "step": 9691 + }, + { + "epoch": 1.9216411384023386, + "grad_norm": 2.109375, + "learning_rate": 2.821498206208515e-06, + "loss": 0.9434, + "step": 9692 + }, + { + "epoch": 1.9218410334574347, + "grad_norm": 2.203125, + "learning_rate": 2.8205497366550414e-06, + "loss": 1.0168, + "step": 9693 + }, + { + "epoch": 1.9220409285125308, + "grad_norm": 2.09375, + "learning_rate": 2.8196013639125175e-06, + "loss": 0.9444, + "step": 9694 + }, + { + "epoch": 1.922240823567627, + "grad_norm": 2.296875, + "learning_rate": 2.818653088023065e-06, + "loss": 0.9618, + "step": 9695 + }, + { + "epoch": 1.922440718622723, + "grad_norm": 2.09375, + "learning_rate": 2.8177049090288115e-06, + "loss": 0.9761, + "step": 9696 + }, + { + "epoch": 1.9226406136778191, + "grad_norm": 2.046875, + "learning_rate": 2.816756826971873e-06, + "loss": 0.9491, + "step": 9697 + }, + { + "epoch": 1.9228405087329152, + "grad_norm": 2.1875, + "learning_rate": 2.8158088418943613e-06, + "loss": 0.9522, + "step": 9698 + }, + { + "epoch": 1.9230404037880113, + "grad_norm": 2.21875, + "learning_rate": 2.814860953838389e-06, + "loss": 0.9115, + "step": 9699 + }, + { + "epoch": 1.9232402988431074, + "grad_norm": 2.15625, + "learning_rate": 2.8139131628460605e-06, + "loss": 0.883, + "step": 9700 + }, + { + "epoch": 1.9234401938982035, + "grad_norm": 2.171875, + "learning_rate": 2.8129654689594733e-06, + "loss": 1.0504, + "step": 9701 + }, + { + "epoch": 1.9236400889532996, + "grad_norm": 2.15625, + "learning_rate": 2.8120178722207287e-06, + "loss": 0.9033, + "step": 9702 + }, + { + "epoch": 1.9238399840083957, + "grad_norm": 2.328125, + "learning_rate": 2.811070372671918e-06, + "loss": 1.075, + "step": 9703 + }, + { + "epoch": 1.9240398790634918, + "grad_norm": 2.25, + "learning_rate": 2.810122970355124e-06, + "loss": 1.0194, + "step": 9704 + }, + { + "epoch": 1.9242397741185877, + "grad_norm": 2.109375, + "learning_rate": 2.809175665312436e-06, + "loss": 0.9821, + "step": 9705 + }, + { + "epoch": 1.9244396691736838, + "grad_norm": 2.203125, + "learning_rate": 2.8082284575859302e-06, + "loss": 0.986, + "step": 9706 + }, + { + "epoch": 1.92463956422878, + "grad_norm": 2.140625, + "learning_rate": 2.8072813472176807e-06, + "loss": 1.0015, + "step": 9707 + }, + { + "epoch": 1.924839459283876, + "grad_norm": 2.171875, + "learning_rate": 2.8063343342497616e-06, + "loss": 0.8788, + "step": 9708 + }, + { + "epoch": 1.9250393543389719, + "grad_norm": 2.171875, + "learning_rate": 2.805387418724237e-06, + "loss": 0.9795, + "step": 9709 + }, + { + "epoch": 1.925239249394068, + "grad_norm": 2.140625, + "learning_rate": 2.804440600683167e-06, + "loss": 0.9401, + "step": 9710 + }, + { + "epoch": 1.925439144449164, + "grad_norm": 2.203125, + "learning_rate": 2.803493880168613e-06, + "loss": 0.9852, + "step": 9711 + }, + { + "epoch": 1.9256390395042602, + "grad_norm": 2.15625, + "learning_rate": 2.8025472572226266e-06, + "loss": 0.9925, + "step": 9712 + }, + { + "epoch": 1.9258389345593563, + "grad_norm": 2.03125, + "learning_rate": 2.8016007318872532e-06, + "loss": 0.946, + "step": 9713 + }, + { + "epoch": 1.9260388296144524, + "grad_norm": 2.09375, + "learning_rate": 2.800654304204543e-06, + "loss": 0.9504, + "step": 9714 + }, + { + "epoch": 1.9262387246695485, + "grad_norm": 2.171875, + "learning_rate": 2.7997079742165346e-06, + "loss": 0.9226, + "step": 9715 + }, + { + "epoch": 1.9264386197246446, + "grad_norm": 2.171875, + "learning_rate": 2.7987617419652603e-06, + "loss": 0.9339, + "step": 9716 + }, + { + "epoch": 1.9266385147797407, + "grad_norm": 2.171875, + "learning_rate": 2.797815607492756e-06, + "loss": 1.0389, + "step": 9717 + }, + { + "epoch": 1.9268384098348368, + "grad_norm": 2.28125, + "learning_rate": 2.7968695708410476e-06, + "loss": 0.9964, + "step": 9718 + }, + { + "epoch": 1.9270383048899329, + "grad_norm": 2.1875, + "learning_rate": 2.7959236320521573e-06, + "loss": 1.0257, + "step": 9719 + }, + { + "epoch": 1.927238199945029, + "grad_norm": 2.203125, + "learning_rate": 2.794977791168102e-06, + "loss": 0.9098, + "step": 9720 + }, + { + "epoch": 1.927438095000125, + "grad_norm": 2.046875, + "learning_rate": 2.7940320482308995e-06, + "loss": 0.857, + "step": 9721 + }, + { + "epoch": 1.927637990055221, + "grad_norm": 2.109375, + "learning_rate": 2.7930864032825582e-06, + "loss": 1.0006, + "step": 9722 + }, + { + "epoch": 1.927837885110317, + "grad_norm": 2.09375, + "learning_rate": 2.792140856365081e-06, + "loss": 0.9269, + "step": 9723 + }, + { + "epoch": 1.9280377801654132, + "grad_norm": 2.1875, + "learning_rate": 2.7911954075204734e-06, + "loss": 0.9125, + "step": 9724 + }, + { + "epoch": 1.9282376752205093, + "grad_norm": 2.234375, + "learning_rate": 2.7902500567907297e-06, + "loss": 0.8546, + "step": 9725 + }, + { + "epoch": 1.9284375702756054, + "grad_norm": 2.140625, + "learning_rate": 2.7893048042178405e-06, + "loss": 1.0623, + "step": 9726 + }, + { + "epoch": 1.9286374653307012, + "grad_norm": 2.234375, + "learning_rate": 2.788359649843797e-06, + "loss": 0.9814, + "step": 9727 + }, + { + "epoch": 1.9288373603857973, + "grad_norm": 2.15625, + "learning_rate": 2.787414593710583e-06, + "loss": 1.0193, + "step": 9728 + }, + { + "epoch": 1.9290372554408934, + "grad_norm": 2.234375, + "learning_rate": 2.786469635860174e-06, + "loss": 0.8954, + "step": 9729 + }, + { + "epoch": 1.9292371504959895, + "grad_norm": 2.1875, + "learning_rate": 2.7855247763345483e-06, + "loss": 1.1665, + "step": 9730 + }, + { + "epoch": 1.9294370455510856, + "grad_norm": 2.234375, + "learning_rate": 2.7845800151756768e-06, + "loss": 0.9786, + "step": 9731 + }, + { + "epoch": 1.9296369406061817, + "grad_norm": 2.125, + "learning_rate": 2.783635352425522e-06, + "loss": 0.9566, + "step": 9732 + }, + { + "epoch": 1.9298368356612778, + "grad_norm": 2.109375, + "learning_rate": 2.78269078812605e-06, + "loss": 0.8177, + "step": 9733 + }, + { + "epoch": 1.930036730716374, + "grad_norm": 2.078125, + "learning_rate": 2.781746322319219e-06, + "loss": 1.009, + "step": 9734 + }, + { + "epoch": 1.93023662577147, + "grad_norm": 2.046875, + "learning_rate": 2.7808019550469745e-06, + "loss": 0.9294, + "step": 9735 + }, + { + "epoch": 1.9304365208265661, + "grad_norm": 2.234375, + "learning_rate": 2.779857686351273e-06, + "loss": 0.9833, + "step": 9736 + }, + { + "epoch": 1.9306364158816622, + "grad_norm": 2.1875, + "learning_rate": 2.7789135162740555e-06, + "loss": 1.0077, + "step": 9737 + }, + { + "epoch": 1.9308363109367583, + "grad_norm": 2.203125, + "learning_rate": 2.7779694448572605e-06, + "loss": 1.0207, + "step": 9738 + }, + { + "epoch": 1.9310362059918544, + "grad_norm": 2.140625, + "learning_rate": 2.777025472142827e-06, + "loss": 0.9238, + "step": 9739 + }, + { + "epoch": 1.9312361010469503, + "grad_norm": 2.125, + "learning_rate": 2.7760815981726854e-06, + "loss": 0.9788, + "step": 9740 + }, + { + "epoch": 1.9314359961020464, + "grad_norm": 2.171875, + "learning_rate": 2.7751378229887586e-06, + "loss": 0.9759, + "step": 9741 + }, + { + "epoch": 1.9316358911571425, + "grad_norm": 2.234375, + "learning_rate": 2.774194146632975e-06, + "loss": 0.9478, + "step": 9742 + }, + { + "epoch": 1.9318357862122386, + "grad_norm": 2.125, + "learning_rate": 2.773250569147249e-06, + "loss": 1.08, + "step": 9743 + }, + { + "epoch": 1.9320356812673345, + "grad_norm": 2.109375, + "learning_rate": 2.772307090573494e-06, + "loss": 0.9411, + "step": 9744 + }, + { + "epoch": 1.9322355763224306, + "grad_norm": 2.25, + "learning_rate": 2.7713637109536207e-06, + "loss": 1.0289, + "step": 9745 + }, + { + "epoch": 1.9324354713775267, + "grad_norm": 2.28125, + "learning_rate": 2.7704204303295348e-06, + "loss": 0.9375, + "step": 9746 + }, + { + "epoch": 1.9326353664326228, + "grad_norm": 2.234375, + "learning_rate": 2.769477248743132e-06, + "loss": 1.0683, + "step": 9747 + }, + { + "epoch": 1.932835261487719, + "grad_norm": 1.9453125, + "learning_rate": 2.768534166236314e-06, + "loss": 0.8, + "step": 9748 + }, + { + "epoch": 1.933035156542815, + "grad_norm": 2.140625, + "learning_rate": 2.7675911828509703e-06, + "loss": 0.9387, + "step": 9749 + }, + { + "epoch": 1.933235051597911, + "grad_norm": 2.078125, + "learning_rate": 2.7666482986289876e-06, + "loss": 0.9022, + "step": 9750 + }, + { + "epoch": 1.9334349466530072, + "grad_norm": 2.21875, + "learning_rate": 2.765705513612247e-06, + "loss": 0.9885, + "step": 9751 + }, + { + "epoch": 1.9336348417081033, + "grad_norm": 2.1875, + "learning_rate": 2.7647628278426306e-06, + "loss": 0.9144, + "step": 9752 + }, + { + "epoch": 1.9338347367631994, + "grad_norm": 2.15625, + "learning_rate": 2.7638202413620106e-06, + "loss": 1.0567, + "step": 9753 + }, + { + "epoch": 1.9340346318182955, + "grad_norm": 2.125, + "learning_rate": 2.7628777542122553e-06, + "loss": 0.941, + "step": 9754 + }, + { + "epoch": 1.9342345268733916, + "grad_norm": 2.046875, + "learning_rate": 2.7619353664352326e-06, + "loss": 0.9615, + "step": 9755 + }, + { + "epoch": 1.9344344219284877, + "grad_norm": 2.15625, + "learning_rate": 2.760993078072802e-06, + "loss": 0.9721, + "step": 9756 + }, + { + "epoch": 1.9346343169835836, + "grad_norm": 2.234375, + "learning_rate": 2.760050889166818e-06, + "loss": 0.9166, + "step": 9757 + }, + { + "epoch": 1.9348342120386797, + "grad_norm": 2.171875, + "learning_rate": 2.7591087997591366e-06, + "loss": 0.9897, + "step": 9758 + }, + { + "epoch": 1.9350341070937758, + "grad_norm": 2.25, + "learning_rate": 2.7581668098916024e-06, + "loss": 1.0961, + "step": 9759 + }, + { + "epoch": 1.9352340021488719, + "grad_norm": 2.4375, + "learning_rate": 2.7572249196060575e-06, + "loss": 0.9858, + "step": 9760 + }, + { + "epoch": 1.935433897203968, + "grad_norm": 2.234375, + "learning_rate": 2.756283128944344e-06, + "loss": 1.0277, + "step": 9761 + }, + { + "epoch": 1.9356337922590638, + "grad_norm": 2.203125, + "learning_rate": 2.7553414379482936e-06, + "loss": 1.0197, + "step": 9762 + }, + { + "epoch": 1.93583368731416, + "grad_norm": 2.140625, + "learning_rate": 2.7543998466597357e-06, + "loss": 0.8993, + "step": 9763 + }, + { + "epoch": 1.936033582369256, + "grad_norm": 2.34375, + "learning_rate": 2.753458355120498e-06, + "loss": 0.9828, + "step": 9764 + }, + { + "epoch": 1.9362334774243521, + "grad_norm": 2.234375, + "learning_rate": 2.7525169633724024e-06, + "loss": 0.9341, + "step": 9765 + }, + { + "epoch": 1.9364333724794482, + "grad_norm": 2.125, + "learning_rate": 2.7515756714572593e-06, + "loss": 1.0403, + "step": 9766 + }, + { + "epoch": 1.9366332675345443, + "grad_norm": 2.125, + "learning_rate": 2.750634479416887e-06, + "loss": 0.9437, + "step": 9767 + }, + { + "epoch": 1.9368331625896404, + "grad_norm": 2.203125, + "learning_rate": 2.7496933872930907e-06, + "loss": 0.9842, + "step": 9768 + }, + { + "epoch": 1.9370330576447365, + "grad_norm": 2.25, + "learning_rate": 2.7487523951276716e-06, + "loss": 0.9598, + "step": 9769 + }, + { + "epoch": 1.9372329526998326, + "grad_norm": 2.1875, + "learning_rate": 2.747811502962433e-06, + "loss": 1.0267, + "step": 9770 + }, + { + "epoch": 1.9374328477549287, + "grad_norm": 2.109375, + "learning_rate": 2.7468707108391667e-06, + "loss": 0.9902, + "step": 9771 + }, + { + "epoch": 1.9376327428100248, + "grad_norm": 2.234375, + "learning_rate": 2.7459300187996614e-06, + "loss": 1.0448, + "step": 9772 + }, + { + "epoch": 1.937832637865121, + "grad_norm": 2.21875, + "learning_rate": 2.7449894268857055e-06, + "loss": 0.9244, + "step": 9773 + }, + { + "epoch": 1.938032532920217, + "grad_norm": 2.234375, + "learning_rate": 2.7440489351390782e-06, + "loss": 1.0769, + "step": 9774 + }, + { + "epoch": 1.938232427975313, + "grad_norm": 2.046875, + "learning_rate": 2.743108543601554e-06, + "loss": 0.9104, + "step": 9775 + }, + { + "epoch": 1.938432323030409, + "grad_norm": 2.15625, + "learning_rate": 2.7421682523149097e-06, + "loss": 0.9787, + "step": 9776 + }, + { + "epoch": 1.9386322180855051, + "grad_norm": 2.390625, + "learning_rate": 2.741228061320911e-06, + "loss": 0.9726, + "step": 9777 + }, + { + "epoch": 1.9388321131406012, + "grad_norm": 2.234375, + "learning_rate": 2.7402879706613176e-06, + "loss": 0.9924, + "step": 9778 + }, + { + "epoch": 1.939032008195697, + "grad_norm": 2.28125, + "learning_rate": 2.7393479803778933e-06, + "loss": 1.009, + "step": 9779 + }, + { + "epoch": 1.9392319032507932, + "grad_norm": 2.203125, + "learning_rate": 2.7384080905123912e-06, + "loss": 0.9448, + "step": 9780 + }, + { + "epoch": 1.9394317983058893, + "grad_norm": 2.125, + "learning_rate": 2.7374683011065594e-06, + "loss": 0.9685, + "step": 9781 + }, + { + "epoch": 1.9396316933609854, + "grad_norm": 2.265625, + "learning_rate": 2.736528612202142e-06, + "loss": 1.0365, + "step": 9782 + }, + { + "epoch": 1.9398315884160815, + "grad_norm": 2.3125, + "learning_rate": 2.7355890238408845e-06, + "loss": 1.0848, + "step": 9783 + }, + { + "epoch": 1.9400314834711776, + "grad_norm": 2.140625, + "learning_rate": 2.73464953606452e-06, + "loss": 0.9731, + "step": 9784 + }, + { + "epoch": 1.9402313785262737, + "grad_norm": 2.171875, + "learning_rate": 2.7337101489147792e-06, + "loss": 1.0464, + "step": 9785 + }, + { + "epoch": 1.9404312735813698, + "grad_norm": 2.34375, + "learning_rate": 2.7327708624333936e-06, + "loss": 1.0343, + "step": 9786 + }, + { + "epoch": 1.940631168636466, + "grad_norm": 2.046875, + "learning_rate": 2.7318316766620845e-06, + "loss": 0.9942, + "step": 9787 + }, + { + "epoch": 1.940831063691562, + "grad_norm": 2.203125, + "learning_rate": 2.7308925916425676e-06, + "loss": 0.9997, + "step": 9788 + }, + { + "epoch": 1.941030958746658, + "grad_norm": 2.171875, + "learning_rate": 2.7299536074165624e-06, + "loss": 0.9747, + "step": 9789 + }, + { + "epoch": 1.9412308538017542, + "grad_norm": 2.109375, + "learning_rate": 2.729014724025775e-06, + "loss": 0.9891, + "step": 9790 + }, + { + "epoch": 1.9414307488568503, + "grad_norm": 2.234375, + "learning_rate": 2.7280759415119087e-06, + "loss": 0.961, + "step": 9791 + }, + { + "epoch": 1.9416306439119462, + "grad_norm": 2.234375, + "learning_rate": 2.727137259916668e-06, + "loss": 1.0758, + "step": 9792 + }, + { + "epoch": 1.9418305389670423, + "grad_norm": 2.15625, + "learning_rate": 2.7261986792817484e-06, + "loss": 1.0119, + "step": 9793 + }, + { + "epoch": 1.9420304340221384, + "grad_norm": 2.203125, + "learning_rate": 2.725260199648838e-06, + "loss": 0.9422, + "step": 9794 + }, + { + "epoch": 1.9422303290772345, + "grad_norm": 2.234375, + "learning_rate": 2.7243218210596288e-06, + "loss": 0.9585, + "step": 9795 + }, + { + "epoch": 1.9424302241323306, + "grad_norm": 2.140625, + "learning_rate": 2.7233835435558033e-06, + "loss": 0.9701, + "step": 9796 + }, + { + "epoch": 1.9426301191874265, + "grad_norm": 2.265625, + "learning_rate": 2.722445367179034e-06, + "loss": 0.9562, + "step": 9797 + }, + { + "epoch": 1.9428300142425226, + "grad_norm": 2.1875, + "learning_rate": 2.7215072919709996e-06, + "loss": 0.963, + "step": 9798 + }, + { + "epoch": 1.9430299092976187, + "grad_norm": 2.1875, + "learning_rate": 2.720569317973368e-06, + "loss": 0.9333, + "step": 9799 + }, + { + "epoch": 1.9432298043527148, + "grad_norm": 2.171875, + "learning_rate": 2.719631445227802e-06, + "loss": 0.9703, + "step": 9800 + }, + { + "epoch": 1.9434296994078109, + "grad_norm": 2.25, + "learning_rate": 2.718693673775966e-06, + "loss": 1.0323, + "step": 9801 + }, + { + "epoch": 1.943629594462907, + "grad_norm": 2.15625, + "learning_rate": 2.7177560036595128e-06, + "loss": 0.9448, + "step": 9802 + }, + { + "epoch": 1.943829489518003, + "grad_norm": 2.171875, + "learning_rate": 2.7168184349200926e-06, + "loss": 0.9027, + "step": 9803 + }, + { + "epoch": 1.9440293845730992, + "grad_norm": 2.375, + "learning_rate": 2.7158809675993556e-06, + "loss": 1.0262, + "step": 9804 + }, + { + "epoch": 1.9442292796281953, + "grad_norm": 2.1875, + "learning_rate": 2.714943601738942e-06, + "loss": 0.9659, + "step": 9805 + }, + { + "epoch": 1.9444291746832914, + "grad_norm": 2.1875, + "learning_rate": 2.714006337380487e-06, + "loss": 1.0365, + "step": 9806 + }, + { + "epoch": 1.9446290697383875, + "grad_norm": 2.15625, + "learning_rate": 2.713069174565629e-06, + "loss": 1.0349, + "step": 9807 + }, + { + "epoch": 1.9448289647934836, + "grad_norm": 2.25, + "learning_rate": 2.712132113335994e-06, + "loss": 1.0042, + "step": 9808 + }, + { + "epoch": 1.9450288598485796, + "grad_norm": 2.15625, + "learning_rate": 2.7111951537332058e-06, + "loss": 0.8987, + "step": 9809 + }, + { + "epoch": 1.9452287549036755, + "grad_norm": 2.078125, + "learning_rate": 2.710258295798883e-06, + "loss": 0.9283, + "step": 9810 + }, + { + "epoch": 1.9454286499587716, + "grad_norm": 2.046875, + "learning_rate": 2.709321539574644e-06, + "loss": 0.8604, + "step": 9811 + }, + { + "epoch": 1.9456285450138677, + "grad_norm": 2.1875, + "learning_rate": 2.708384885102097e-06, + "loss": 1.0184, + "step": 9812 + }, + { + "epoch": 1.9458284400689638, + "grad_norm": 2.1875, + "learning_rate": 2.7074483324228474e-06, + "loss": 0.8505, + "step": 9813 + }, + { + "epoch": 1.9460283351240597, + "grad_norm": 2.078125, + "learning_rate": 2.7065118815785e-06, + "loss": 1.018, + "step": 9814 + }, + { + "epoch": 1.9462282301791558, + "grad_norm": 2.1875, + "learning_rate": 2.705575532610649e-06, + "loss": 1.0354, + "step": 9815 + }, + { + "epoch": 1.946428125234252, + "grad_norm": 2.046875, + "learning_rate": 2.704639285560886e-06, + "loss": 0.9605, + "step": 9816 + }, + { + "epoch": 1.946628020289348, + "grad_norm": 2.234375, + "learning_rate": 2.7037031404708038e-06, + "loss": 1.0619, + "step": 9817 + }, + { + "epoch": 1.946827915344444, + "grad_norm": 2.15625, + "learning_rate": 2.702767097381982e-06, + "loss": 1.083, + "step": 9818 + }, + { + "epoch": 1.9470278103995402, + "grad_norm": 2.125, + "learning_rate": 2.7018311563359977e-06, + "loss": 1.0053, + "step": 9819 + }, + { + "epoch": 1.9472277054546363, + "grad_norm": 2.03125, + "learning_rate": 2.700895317374431e-06, + "loss": 0.9911, + "step": 9820 + }, + { + "epoch": 1.9474276005097324, + "grad_norm": 2.25, + "learning_rate": 2.699959580538849e-06, + "loss": 0.9673, + "step": 9821 + }, + { + "epoch": 1.9476274955648285, + "grad_norm": 2.171875, + "learning_rate": 2.6990239458708145e-06, + "loss": 1.0125, + "step": 9822 + }, + { + "epoch": 1.9478273906199246, + "grad_norm": 2.15625, + "learning_rate": 2.6980884134118925e-06, + "loss": 1.0734, + "step": 9823 + }, + { + "epoch": 1.9480272856750207, + "grad_norm": 2.125, + "learning_rate": 2.697152983203637e-06, + "loss": 0.89, + "step": 9824 + }, + { + "epoch": 1.9482271807301168, + "grad_norm": 2.0625, + "learning_rate": 2.696217655287598e-06, + "loss": 0.8876, + "step": 9825 + }, + { + "epoch": 1.948427075785213, + "grad_norm": 2.0, + "learning_rate": 2.6952824297053272e-06, + "loss": 0.911, + "step": 9826 + }, + { + "epoch": 1.948626970840309, + "grad_norm": 2.046875, + "learning_rate": 2.694347306498366e-06, + "loss": 0.8906, + "step": 9827 + }, + { + "epoch": 1.9488268658954049, + "grad_norm": 2.0625, + "learning_rate": 2.6934122857082478e-06, + "loss": 0.9755, + "step": 9828 + }, + { + "epoch": 1.949026760950501, + "grad_norm": 2.28125, + "learning_rate": 2.6924773673765114e-06, + "loss": 0.9884, + "step": 9829 + }, + { + "epoch": 1.949226656005597, + "grad_norm": 2.21875, + "learning_rate": 2.6915425515446835e-06, + "loss": 1.0249, + "step": 9830 + }, + { + "epoch": 1.9494265510606932, + "grad_norm": 2.15625, + "learning_rate": 2.6906078382542877e-06, + "loss": 1.0337, + "step": 9831 + }, + { + "epoch": 1.949626446115789, + "grad_norm": 2.296875, + "learning_rate": 2.689673227546847e-06, + "loss": 1.0812, + "step": 9832 + }, + { + "epoch": 1.9498263411708852, + "grad_norm": 2.28125, + "learning_rate": 2.6887387194638744e-06, + "loss": 1.0124, + "step": 9833 + }, + { + "epoch": 1.9500262362259813, + "grad_norm": 2.234375, + "learning_rate": 2.687804314046879e-06, + "loss": 0.9958, + "step": 9834 + }, + { + "epoch": 1.9502261312810774, + "grad_norm": 2.140625, + "learning_rate": 2.686870011337371e-06, + "loss": 1.0071, + "step": 9835 + }, + { + "epoch": 1.9504260263361735, + "grad_norm": 2.171875, + "learning_rate": 2.6859358113768496e-06, + "loss": 0.8691, + "step": 9836 + }, + { + "epoch": 1.9506259213912696, + "grad_norm": 2.171875, + "learning_rate": 2.6850017142068113e-06, + "loss": 1.082, + "step": 9837 + }, + { + "epoch": 1.9508258164463657, + "grad_norm": 2.15625, + "learning_rate": 2.6840677198687515e-06, + "loss": 1.072, + "step": 9838 + }, + { + "epoch": 1.9510257115014618, + "grad_norm": 2.09375, + "learning_rate": 2.683133828404155e-06, + "loss": 0.8452, + "step": 9839 + }, + { + "epoch": 1.9512256065565579, + "grad_norm": 2.1875, + "learning_rate": 2.6822000398545078e-06, + "loss": 1.0922, + "step": 9840 + }, + { + "epoch": 1.951425501611654, + "grad_norm": 2.25, + "learning_rate": 2.681266354261285e-06, + "loss": 1.0021, + "step": 9841 + }, + { + "epoch": 1.95162539666675, + "grad_norm": 2.140625, + "learning_rate": 2.6803327716659644e-06, + "loss": 0.9239, + "step": 9842 + }, + { + "epoch": 1.9518252917218462, + "grad_norm": 2.265625, + "learning_rate": 2.6793992921100153e-06, + "loss": 0.9683, + "step": 9843 + }, + { + "epoch": 1.9520251867769423, + "grad_norm": 2.078125, + "learning_rate": 2.678465915634899e-06, + "loss": 0.9021, + "step": 9844 + }, + { + "epoch": 1.9522250818320381, + "grad_norm": 2.03125, + "learning_rate": 2.6775326422820813e-06, + "loss": 0.8585, + "step": 9845 + }, + { + "epoch": 1.9524249768871342, + "grad_norm": 2.203125, + "learning_rate": 2.676599472093015e-06, + "loss": 0.924, + "step": 9846 + }, + { + "epoch": 1.9526248719422303, + "grad_norm": 2.078125, + "learning_rate": 2.675666405109151e-06, + "loss": 0.8473, + "step": 9847 + }, + { + "epoch": 1.9528247669973264, + "grad_norm": 2.25, + "learning_rate": 2.6747334413719377e-06, + "loss": 1.0141, + "step": 9848 + }, + { + "epoch": 1.9530246620524225, + "grad_norm": 2.296875, + "learning_rate": 2.6738005809228175e-06, + "loss": 0.95, + "step": 9849 + }, + { + "epoch": 1.9532245571075184, + "grad_norm": 2.109375, + "learning_rate": 2.6728678238032245e-06, + "loss": 0.9748, + "step": 9850 + }, + { + "epoch": 1.9534244521626145, + "grad_norm": 2.1875, + "learning_rate": 2.671935170054597e-06, + "loss": 1.0148, + "step": 9851 + }, + { + "epoch": 1.9536243472177106, + "grad_norm": 2.171875, + "learning_rate": 2.6710026197183595e-06, + "loss": 1.0376, + "step": 9852 + }, + { + "epoch": 1.9538242422728067, + "grad_norm": 2.15625, + "learning_rate": 2.670070172835936e-06, + "loss": 1.0051, + "step": 9853 + }, + { + "epoch": 1.9540241373279028, + "grad_norm": 2.125, + "learning_rate": 2.669137829448748e-06, + "loss": 0.9586, + "step": 9854 + }, + { + "epoch": 1.954224032382999, + "grad_norm": 2.15625, + "learning_rate": 2.6682055895982085e-06, + "loss": 0.9388, + "step": 9855 + }, + { + "epoch": 1.954423927438095, + "grad_norm": 2.21875, + "learning_rate": 2.667273453325726e-06, + "loss": 1.0055, + "step": 9856 + }, + { + "epoch": 1.954623822493191, + "grad_norm": 2.109375, + "learning_rate": 2.6663414206727116e-06, + "loss": 0.9695, + "step": 9857 + }, + { + "epoch": 1.9548237175482872, + "grad_norm": 2.203125, + "learning_rate": 2.66540949168056e-06, + "loss": 1.0922, + "step": 9858 + }, + { + "epoch": 1.9550236126033833, + "grad_norm": 2.1875, + "learning_rate": 2.6644776663906674e-06, + "loss": 0.9678, + "step": 9859 + }, + { + "epoch": 1.9552235076584794, + "grad_norm": 2.328125, + "learning_rate": 2.663545944844429e-06, + "loss": 1.1365, + "step": 9860 + }, + { + "epoch": 1.9554234027135755, + "grad_norm": 2.21875, + "learning_rate": 2.6626143270832313e-06, + "loss": 0.9636, + "step": 9861 + }, + { + "epoch": 1.9556232977686716, + "grad_norm": 2.140625, + "learning_rate": 2.6616828131484528e-06, + "loss": 0.9908, + "step": 9862 + }, + { + "epoch": 1.9558231928237675, + "grad_norm": 2.125, + "learning_rate": 2.6607514030814757e-06, + "loss": 0.9, + "step": 9863 + }, + { + "epoch": 1.9560230878788636, + "grad_norm": 2.171875, + "learning_rate": 2.659820096923672e-06, + "loss": 0.9088, + "step": 9864 + }, + { + "epoch": 1.9562229829339597, + "grad_norm": 2.09375, + "learning_rate": 2.658888894716407e-06, + "loss": 0.9174, + "step": 9865 + }, + { + "epoch": 1.9564228779890558, + "grad_norm": 2.203125, + "learning_rate": 2.65795779650105e-06, + "loss": 1.1063, + "step": 9866 + }, + { + "epoch": 1.9566227730441517, + "grad_norm": 2.15625, + "learning_rate": 2.657026802318957e-06, + "loss": 0.9581, + "step": 9867 + }, + { + "epoch": 1.9568226680992478, + "grad_norm": 2.171875, + "learning_rate": 2.6560959122114815e-06, + "loss": 1.0477, + "step": 9868 + }, + { + "epoch": 1.9570225631543439, + "grad_norm": 2.0625, + "learning_rate": 2.6551651262199773e-06, + "loss": 0.9407, + "step": 9869 + }, + { + "epoch": 1.95722245820944, + "grad_norm": 2.1875, + "learning_rate": 2.6542344443857874e-06, + "loss": 0.9956, + "step": 9870 + }, + { + "epoch": 1.957422353264536, + "grad_norm": 2.09375, + "learning_rate": 2.653303866750253e-06, + "loss": 0.9238, + "step": 9871 + }, + { + "epoch": 1.9576222483196322, + "grad_norm": 2.046875, + "learning_rate": 2.652373393354709e-06, + "loss": 0.932, + "step": 9872 + }, + { + "epoch": 1.9578221433747283, + "grad_norm": 2.1875, + "learning_rate": 2.651443024240489e-06, + "loss": 1.0184, + "step": 9873 + }, + { + "epoch": 1.9580220384298244, + "grad_norm": 2.1875, + "learning_rate": 2.65051275944892e-06, + "loss": 0.9246, + "step": 9874 + }, + { + "epoch": 1.9582219334849205, + "grad_norm": 2.25, + "learning_rate": 2.6495825990213208e-06, + "loss": 0.9082, + "step": 9875 + }, + { + "epoch": 1.9584218285400166, + "grad_norm": 2.125, + "learning_rate": 2.6486525429990133e-06, + "loss": 0.9802, + "step": 9876 + }, + { + "epoch": 1.9586217235951127, + "grad_norm": 2.1875, + "learning_rate": 2.647722591423309e-06, + "loss": 0.9473, + "step": 9877 + }, + { + "epoch": 1.9588216186502088, + "grad_norm": 2.171875, + "learning_rate": 2.646792744335514e-06, + "loss": 0.9576, + "step": 9878 + }, + { + "epoch": 1.9590215137053049, + "grad_norm": 2.21875, + "learning_rate": 2.645863001776936e-06, + "loss": 0.9337, + "step": 9879 + }, + { + "epoch": 1.9592214087604007, + "grad_norm": 2.109375, + "learning_rate": 2.6449333637888717e-06, + "loss": 0.9523, + "step": 9880 + }, + { + "epoch": 1.9594213038154968, + "grad_norm": 2.125, + "learning_rate": 2.644003830412614e-06, + "loss": 0.95, + "step": 9881 + }, + { + "epoch": 1.959621198870593, + "grad_norm": 2.078125, + "learning_rate": 2.643074401689457e-06, + "loss": 0.9245, + "step": 9882 + }, + { + "epoch": 1.959821093925689, + "grad_norm": 2.15625, + "learning_rate": 2.6421450776606827e-06, + "loss": 0.9675, + "step": 9883 + }, + { + "epoch": 1.9600209889807851, + "grad_norm": 2.203125, + "learning_rate": 2.6412158583675707e-06, + "loss": 1.0271, + "step": 9884 + }, + { + "epoch": 1.960220884035881, + "grad_norm": 2.140625, + "learning_rate": 2.6402867438514e-06, + "loss": 0.9307, + "step": 9885 + }, + { + "epoch": 1.9604207790909771, + "grad_norm": 2.171875, + "learning_rate": 2.63935773415344e-06, + "loss": 1.0636, + "step": 9886 + }, + { + "epoch": 1.9606206741460732, + "grad_norm": 2.03125, + "learning_rate": 2.6384288293149572e-06, + "loss": 0.9815, + "step": 9887 + }, + { + "epoch": 1.9608205692011693, + "grad_norm": 2.265625, + "learning_rate": 2.6375000293772144e-06, + "loss": 1.0368, + "step": 9888 + }, + { + "epoch": 1.9610204642562654, + "grad_norm": 2.046875, + "learning_rate": 2.636571334381467e-06, + "loss": 0.9559, + "step": 9889 + }, + { + "epoch": 1.9612203593113615, + "grad_norm": 2.0625, + "learning_rate": 2.635642744368967e-06, + "loss": 0.9255, + "step": 9890 + }, + { + "epoch": 1.9614202543664576, + "grad_norm": 2.421875, + "learning_rate": 2.634714259380966e-06, + "loss": 0.9569, + "step": 9891 + }, + { + "epoch": 1.9616201494215537, + "grad_norm": 2.15625, + "learning_rate": 2.6337858794587046e-06, + "loss": 0.8442, + "step": 9892 + }, + { + "epoch": 1.9618200444766498, + "grad_norm": 2.21875, + "learning_rate": 2.63285760464342e-06, + "loss": 0.9336, + "step": 9893 + }, + { + "epoch": 1.962019939531746, + "grad_norm": 2.203125, + "learning_rate": 2.6319294349763495e-06, + "loss": 0.9557, + "step": 9894 + }, + { + "epoch": 1.962219834586842, + "grad_norm": 2.203125, + "learning_rate": 2.6310013704987207e-06, + "loss": 0.956, + "step": 9895 + }, + { + "epoch": 1.9624197296419381, + "grad_norm": 2.078125, + "learning_rate": 2.6300734112517562e-06, + "loss": 0.9001, + "step": 9896 + }, + { + "epoch": 1.9626196246970342, + "grad_norm": 2.1875, + "learning_rate": 2.6291455572766794e-06, + "loss": 0.8764, + "step": 9897 + }, + { + "epoch": 1.96281951975213, + "grad_norm": 2.125, + "learning_rate": 2.6282178086147036e-06, + "loss": 0.9871, + "step": 9898 + }, + { + "epoch": 1.9630194148072262, + "grad_norm": 2.25, + "learning_rate": 2.6272901653070397e-06, + "loss": 1.0325, + "step": 9899 + }, + { + "epoch": 1.9632193098623223, + "grad_norm": 2.1875, + "learning_rate": 2.626362627394892e-06, + "loss": 0.9382, + "step": 9900 + }, + { + "epoch": 1.9634192049174184, + "grad_norm": 2.21875, + "learning_rate": 2.6254351949194634e-06, + "loss": 1.0445, + "step": 9901 + }, + { + "epoch": 1.9636190999725143, + "grad_norm": 2.28125, + "learning_rate": 2.6245078679219503e-06, + "loss": 0.9785, + "step": 9902 + }, + { + "epoch": 1.9638189950276104, + "grad_norm": 2.09375, + "learning_rate": 2.6235806464435425e-06, + "loss": 1.0077, + "step": 9903 + }, + { + "epoch": 1.9640188900827065, + "grad_norm": 2.171875, + "learning_rate": 2.6226535305254303e-06, + "loss": 0.9668, + "step": 9904 + }, + { + "epoch": 1.9642187851378026, + "grad_norm": 2.171875, + "learning_rate": 2.6217265202087944e-06, + "loss": 0.9459, + "step": 9905 + }, + { + "epoch": 1.9644186801928987, + "grad_norm": 2.65625, + "learning_rate": 2.62079961553481e-06, + "loss": 0.9683, + "step": 9906 + }, + { + "epoch": 1.9646185752479948, + "grad_norm": 2.234375, + "learning_rate": 2.619872816544655e-06, + "loss": 0.909, + "step": 9907 + }, + { + "epoch": 1.9648184703030909, + "grad_norm": 2.09375, + "learning_rate": 2.6189461232794956e-06, + "loss": 0.882, + "step": 9908 + }, + { + "epoch": 1.965018365358187, + "grad_norm": 2.203125, + "learning_rate": 2.6180195357804926e-06, + "loss": 1.0062, + "step": 9909 + }, + { + "epoch": 1.965218260413283, + "grad_norm": 2.125, + "learning_rate": 2.6170930540888096e-06, + "loss": 0.9332, + "step": 9910 + }, + { + "epoch": 1.9654181554683792, + "grad_norm": 2.125, + "learning_rate": 2.6161666782455986e-06, + "loss": 0.9635, + "step": 9911 + }, + { + "epoch": 1.9656180505234753, + "grad_norm": 2.046875, + "learning_rate": 2.615240408292007e-06, + "loss": 0.9407, + "step": 9912 + }, + { + "epoch": 1.9658179455785714, + "grad_norm": 2.171875, + "learning_rate": 2.614314244269184e-06, + "loss": 1.0038, + "step": 9913 + }, + { + "epoch": 1.9660178406336675, + "grad_norm": 2.046875, + "learning_rate": 2.6133881862182676e-06, + "loss": 0.9732, + "step": 9914 + }, + { + "epoch": 1.9662177356887633, + "grad_norm": 2.25, + "learning_rate": 2.612462234180391e-06, + "loss": 1.0319, + "step": 9915 + }, + { + "epoch": 1.9664176307438594, + "grad_norm": 2.09375, + "learning_rate": 2.611536388196688e-06, + "loss": 0.9009, + "step": 9916 + }, + { + "epoch": 1.9666175257989555, + "grad_norm": 2.125, + "learning_rate": 2.610610648308285e-06, + "loss": 1.0454, + "step": 9917 + }, + { + "epoch": 1.9668174208540516, + "grad_norm": 2.109375, + "learning_rate": 2.6096850145563014e-06, + "loss": 0.9378, + "step": 9918 + }, + { + "epoch": 1.9670173159091477, + "grad_norm": 2.109375, + "learning_rate": 2.608759486981853e-06, + "loss": 0.9732, + "step": 9919 + }, + { + "epoch": 1.9672172109642436, + "grad_norm": 2.25, + "learning_rate": 2.6078340656260535e-06, + "loss": 1.0111, + "step": 9920 + }, + { + "epoch": 1.9674171060193397, + "grad_norm": 2.203125, + "learning_rate": 2.606908750530008e-06, + "loss": 1.0066, + "step": 9921 + }, + { + "epoch": 1.9676170010744358, + "grad_norm": 2.390625, + "learning_rate": 2.605983541734822e-06, + "loss": 0.9409, + "step": 9922 + }, + { + "epoch": 1.967816896129532, + "grad_norm": 2.3125, + "learning_rate": 2.605058439281591e-06, + "loss": 0.9418, + "step": 9923 + }, + { + "epoch": 1.968016791184628, + "grad_norm": 2.0625, + "learning_rate": 2.6041334432114064e-06, + "loss": 0.9253, + "step": 9924 + }, + { + "epoch": 1.9682166862397241, + "grad_norm": 2.0625, + "learning_rate": 2.6032085535653605e-06, + "loss": 0.9287, + "step": 9925 + }, + { + "epoch": 1.9684165812948202, + "grad_norm": 2.15625, + "learning_rate": 2.6022837703845346e-06, + "loss": 0.9976, + "step": 9926 + }, + { + "epoch": 1.9686164763499163, + "grad_norm": 2.1875, + "learning_rate": 2.6013590937100054e-06, + "loss": 0.9212, + "step": 9927 + }, + { + "epoch": 1.9688163714050124, + "grad_norm": 2.0625, + "learning_rate": 2.600434523582851e-06, + "loss": 0.9482, + "step": 9928 + }, + { + "epoch": 1.9690162664601085, + "grad_norm": 2.15625, + "learning_rate": 2.5995100600441392e-06, + "loss": 0.9942, + "step": 9929 + }, + { + "epoch": 1.9692161615152046, + "grad_norm": 2.25, + "learning_rate": 2.598585703134934e-06, + "loss": 0.9624, + "step": 9930 + }, + { + "epoch": 1.9694160565703007, + "grad_norm": 2.171875, + "learning_rate": 2.597661452896293e-06, + "loss": 0.9544, + "step": 9931 + }, + { + "epoch": 1.9696159516253968, + "grad_norm": 2.203125, + "learning_rate": 2.596737309369276e-06, + "loss": 1.0619, + "step": 9932 + }, + { + "epoch": 1.9698158466804927, + "grad_norm": 2.265625, + "learning_rate": 2.5958132725949314e-06, + "loss": 1.0108, + "step": 9933 + }, + { + "epoch": 1.9700157417355888, + "grad_norm": 2.25, + "learning_rate": 2.5948893426143018e-06, + "loss": 1.0122, + "step": 9934 + }, + { + "epoch": 1.970215636790685, + "grad_norm": 2.046875, + "learning_rate": 2.5939655194684334e-06, + "loss": 0.9055, + "step": 9935 + }, + { + "epoch": 1.970415531845781, + "grad_norm": 2.125, + "learning_rate": 2.593041803198359e-06, + "loss": 1.0203, + "step": 9936 + }, + { + "epoch": 1.9706154269008769, + "grad_norm": 2.203125, + "learning_rate": 2.592118193845109e-06, + "loss": 1.0048, + "step": 9937 + }, + { + "epoch": 1.970815321955973, + "grad_norm": 2.234375, + "learning_rate": 2.5911946914497133e-06, + "loss": 1.0231, + "step": 9938 + }, + { + "epoch": 1.971015217011069, + "grad_norm": 2.125, + "learning_rate": 2.590271296053193e-06, + "loss": 0.9282, + "step": 9939 + }, + { + "epoch": 1.9712151120661652, + "grad_norm": 2.078125, + "learning_rate": 2.5893480076965615e-06, + "loss": 0.9355, + "step": 9940 + }, + { + "epoch": 1.9714150071212613, + "grad_norm": 2.21875, + "learning_rate": 2.588424826420836e-06, + "loss": 1.033, + "step": 9941 + }, + { + "epoch": 1.9716149021763574, + "grad_norm": 2.25, + "learning_rate": 2.5875017522670227e-06, + "loss": 0.9113, + "step": 9942 + }, + { + "epoch": 1.9718147972314535, + "grad_norm": 2.171875, + "learning_rate": 2.5865787852761217e-06, + "loss": 1.0371, + "step": 9943 + }, + { + "epoch": 1.9720146922865496, + "grad_norm": 2.125, + "learning_rate": 2.585655925489135e-06, + "loss": 0.9781, + "step": 9944 + }, + { + "epoch": 1.9722145873416457, + "grad_norm": 2.1875, + "learning_rate": 2.584733172947055e-06, + "loss": 0.9384, + "step": 9945 + }, + { + "epoch": 1.9724144823967418, + "grad_norm": 2.1875, + "learning_rate": 2.5838105276908667e-06, + "loss": 1.029, + "step": 9946 + }, + { + "epoch": 1.9726143774518379, + "grad_norm": 2.234375, + "learning_rate": 2.5828879897615587e-06, + "loss": 1.0109, + "step": 9947 + }, + { + "epoch": 1.972814272506934, + "grad_norm": 2.234375, + "learning_rate": 2.581965559200108e-06, + "loss": 0.9886, + "step": 9948 + }, + { + "epoch": 1.97301416756203, + "grad_norm": 2.1875, + "learning_rate": 2.58104323604749e-06, + "loss": 0.9924, + "step": 9949 + }, + { + "epoch": 1.9732140626171262, + "grad_norm": 2.09375, + "learning_rate": 2.5801210203446718e-06, + "loss": 0.9618, + "step": 9950 + }, + { + "epoch": 1.973413957672222, + "grad_norm": 2.09375, + "learning_rate": 2.57919891213262e-06, + "loss": 0.9703, + "step": 9951 + }, + { + "epoch": 1.9736138527273182, + "grad_norm": 2.1875, + "learning_rate": 2.578276911452292e-06, + "loss": 0.9618, + "step": 9952 + }, + { + "epoch": 1.9738137477824143, + "grad_norm": 2.171875, + "learning_rate": 2.5773550183446465e-06, + "loss": 0.9932, + "step": 9953 + }, + { + "epoch": 1.9740136428375104, + "grad_norm": 2.171875, + "learning_rate": 2.5764332328506327e-06, + "loss": 0.9966, + "step": 9954 + }, + { + "epoch": 1.9742135378926062, + "grad_norm": 2.0625, + "learning_rate": 2.5755115550111942e-06, + "loss": 0.986, + "step": 9955 + }, + { + "epoch": 1.9744134329477023, + "grad_norm": 2.125, + "learning_rate": 2.574589984867275e-06, + "loss": 0.9848, + "step": 9956 + }, + { + "epoch": 1.9746133280027984, + "grad_norm": 2.15625, + "learning_rate": 2.5736685224598097e-06, + "loss": 1.038, + "step": 9957 + }, + { + "epoch": 1.9748132230578945, + "grad_norm": 2.203125, + "learning_rate": 2.5727471678297277e-06, + "loss": 0.9692, + "step": 9958 + }, + { + "epoch": 1.9750131181129906, + "grad_norm": 2.125, + "learning_rate": 2.5718259210179588e-06, + "loss": 0.9844, + "step": 9959 + }, + { + "epoch": 1.9752130131680867, + "grad_norm": 2.1875, + "learning_rate": 2.5709047820654236e-06, + "loss": 0.9823, + "step": 9960 + }, + { + "epoch": 1.9754129082231828, + "grad_norm": 2.078125, + "learning_rate": 2.569983751013039e-06, + "loss": 0.9745, + "step": 9961 + }, + { + "epoch": 1.975612803278279, + "grad_norm": 2.46875, + "learning_rate": 2.5690628279017136e-06, + "loss": 0.921, + "step": 9962 + }, + { + "epoch": 1.975812698333375, + "grad_norm": 2.09375, + "learning_rate": 2.56814201277236e-06, + "loss": 0.9791, + "step": 9963 + }, + { + "epoch": 1.9760125933884711, + "grad_norm": 1.9921875, + "learning_rate": 2.567221305665879e-06, + "loss": 0.8148, + "step": 9964 + }, + { + "epoch": 1.9762124884435672, + "grad_norm": 2.28125, + "learning_rate": 2.566300706623165e-06, + "loss": 0.9585, + "step": 9965 + }, + { + "epoch": 1.9764123834986633, + "grad_norm": 2.265625, + "learning_rate": 2.5653802156851158e-06, + "loss": 0.9921, + "step": 9966 + }, + { + "epoch": 1.9766122785537594, + "grad_norm": 2.109375, + "learning_rate": 2.5644598328926183e-06, + "loss": 0.8986, + "step": 9967 + }, + { + "epoch": 1.9768121736088553, + "grad_norm": 2.1875, + "learning_rate": 2.563539558286552e-06, + "loss": 0.9104, + "step": 9968 + }, + { + "epoch": 1.9770120686639514, + "grad_norm": 2.15625, + "learning_rate": 2.5626193919078008e-06, + "loss": 0.9292, + "step": 9969 + }, + { + "epoch": 1.9772119637190475, + "grad_norm": 2.234375, + "learning_rate": 2.561699333797236e-06, + "loss": 0.9468, + "step": 9970 + }, + { + "epoch": 1.9774118587741436, + "grad_norm": 2.15625, + "learning_rate": 2.560779383995724e-06, + "loss": 1.0181, + "step": 9971 + }, + { + "epoch": 1.9776117538292397, + "grad_norm": 2.0625, + "learning_rate": 2.559859542544133e-06, + "loss": 0.9923, + "step": 9972 + }, + { + "epoch": 1.9778116488843356, + "grad_norm": 2.171875, + "learning_rate": 2.5589398094833205e-06, + "loss": 0.9749, + "step": 9973 + }, + { + "epoch": 1.9780115439394317, + "grad_norm": 2.1875, + "learning_rate": 2.55802018485414e-06, + "loss": 0.9581, + "step": 9974 + }, + { + "epoch": 1.9782114389945278, + "grad_norm": 2.203125, + "learning_rate": 2.557100668697443e-06, + "loss": 0.9778, + "step": 9975 + }, + { + "epoch": 1.9784113340496239, + "grad_norm": 2.09375, + "learning_rate": 2.5561812610540736e-06, + "loss": 0.9616, + "step": 9976 + }, + { + "epoch": 1.97861122910472, + "grad_norm": 2.140625, + "learning_rate": 2.555261961964872e-06, + "loss": 0.9669, + "step": 9977 + }, + { + "epoch": 1.978811124159816, + "grad_norm": 2.0, + "learning_rate": 2.5543427714706705e-06, + "loss": 0.8204, + "step": 9978 + }, + { + "epoch": 1.9790110192149122, + "grad_norm": 2.21875, + "learning_rate": 2.5534236896123043e-06, + "loss": 1.0263, + "step": 9979 + }, + { + "epoch": 1.9792109142700083, + "grad_norm": 2.171875, + "learning_rate": 2.552504716430596e-06, + "loss": 0.9726, + "step": 9980 + }, + { + "epoch": 1.9794108093251044, + "grad_norm": 2.21875, + "learning_rate": 2.551585851966367e-06, + "loss": 0.9656, + "step": 9981 + }, + { + "epoch": 1.9796107043802005, + "grad_norm": 2.171875, + "learning_rate": 2.550667096260433e-06, + "loss": 0.9558, + "step": 9982 + }, + { + "epoch": 1.9798105994352966, + "grad_norm": 2.15625, + "learning_rate": 2.549748449353603e-06, + "loss": 0.9942, + "step": 9983 + }, + { + "epoch": 1.9800104944903927, + "grad_norm": 2.203125, + "learning_rate": 2.548829911286687e-06, + "loss": 0.9188, + "step": 9984 + }, + { + "epoch": 1.9802103895454888, + "grad_norm": 2.234375, + "learning_rate": 2.5479114821004845e-06, + "loss": 1.0964, + "step": 9985 + }, + { + "epoch": 1.9804102846005847, + "grad_norm": 2.296875, + "learning_rate": 2.5469931618357907e-06, + "loss": 1.0178, + "step": 9986 + }, + { + "epoch": 1.9806101796556808, + "grad_norm": 2.140625, + "learning_rate": 2.5460749505334004e-06, + "loss": 0.9592, + "step": 9987 + }, + { + "epoch": 1.9808100747107769, + "grad_norm": 2.046875, + "learning_rate": 2.5451568482340983e-06, + "loss": 0.9825, + "step": 9988 + }, + { + "epoch": 1.981009969765873, + "grad_norm": 2.25, + "learning_rate": 2.5442388549786668e-06, + "loss": 0.9168, + "step": 9989 + }, + { + "epoch": 1.9812098648209688, + "grad_norm": 1.9765625, + "learning_rate": 2.543320970807882e-06, + "loss": 0.8362, + "step": 9990 + }, + { + "epoch": 1.981409759876065, + "grad_norm": 2.265625, + "learning_rate": 2.5424031957625184e-06, + "loss": 1.0724, + "step": 9991 + }, + { + "epoch": 1.981609654931161, + "grad_norm": 2.1875, + "learning_rate": 2.5414855298833423e-06, + "loss": 0.9505, + "step": 9992 + }, + { + "epoch": 1.9818095499862571, + "grad_norm": 2.140625, + "learning_rate": 2.540567973211115e-06, + "loss": 1.0578, + "step": 9993 + }, + { + "epoch": 1.9820094450413532, + "grad_norm": 2.0625, + "learning_rate": 2.539650525786597e-06, + "loss": 0.9175, + "step": 9994 + }, + { + "epoch": 1.9822093400964493, + "grad_norm": 2.078125, + "learning_rate": 2.5387331876505405e-06, + "loss": 0.952, + "step": 9995 + }, + { + "epoch": 1.9824092351515454, + "grad_norm": 2.078125, + "learning_rate": 2.5378159588436907e-06, + "loss": 1.0021, + "step": 9996 + }, + { + "epoch": 1.9826091302066415, + "grad_norm": 2.21875, + "learning_rate": 2.536898839406795e-06, + "loss": 0.9979, + "step": 9997 + }, + { + "epoch": 1.9828090252617376, + "grad_norm": 2.15625, + "learning_rate": 2.5359818293805893e-06, + "loss": 0.9755, + "step": 9998 + }, + { + "epoch": 1.9830089203168337, + "grad_norm": 2.15625, + "learning_rate": 2.5350649288058065e-06, + "loss": 0.9423, + "step": 9999 + }, + { + "epoch": 1.9832088153719298, + "grad_norm": 2.203125, + "learning_rate": 2.534148137723178e-06, + "loss": 0.9666, + "step": 10000 + }, + { + "epoch": 1.983408710427026, + "grad_norm": 2.046875, + "learning_rate": 2.5332314561734257e-06, + "loss": 0.823, + "step": 10001 + }, + { + "epoch": 1.983608605482122, + "grad_norm": 2.125, + "learning_rate": 2.532314884197267e-06, + "loss": 1.0064, + "step": 10002 + }, + { + "epoch": 1.983808500537218, + "grad_norm": 2.171875, + "learning_rate": 2.5313984218354185e-06, + "loss": 1.0481, + "step": 10003 + }, + { + "epoch": 1.984008395592314, + "grad_norm": 2.078125, + "learning_rate": 2.530482069128589e-06, + "loss": 0.9209, + "step": 10004 + } + ], + "logging_steps": 1, + "max_steps": 15006, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5002, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.952236882409082e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}