diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7041 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08776958290248525, + "eval_steps": 1000, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.776958290248525e-05, + "grad_norm": 121.5, + "learning_rate": 3e-06, + "loss": 7.9287, + "step": 1 + }, + { + "epoch": 0.0001755391658049705, + "grad_norm": 112.0, + "learning_rate": 6e-06, + "loss": 7.9365, + "step": 2 + }, + { + "epoch": 0.00026330874870745575, + "grad_norm": 73.0, + "learning_rate": 9e-06, + "loss": 7.9922, + "step": 3 + }, + { + "epoch": 0.000351078331609941, + "grad_norm": 67.0, + "learning_rate": 1.2e-05, + "loss": 8.0557, + "step": 4 + }, + { + "epoch": 0.00043884791451242626, + "grad_norm": 82.0, + "learning_rate": 1.5e-05, + "loss": 7.8389, + "step": 5 + }, + { + "epoch": 0.0005266174974149115, + "grad_norm": 66.5, + "learning_rate": 1.8e-05, + "loss": 7.8574, + "step": 6 + }, + { + "epoch": 0.0006143870803173967, + "grad_norm": 158.0, + "learning_rate": 2.1000000000000002e-05, + "loss": 7.708, + "step": 7 + }, + { + "epoch": 0.000702156663219882, + "grad_norm": 76.5, + "learning_rate": 2.4e-05, + "loss": 7.4805, + "step": 8 + }, + { + "epoch": 0.0007899262461223672, + "grad_norm": 76.5, + "learning_rate": 2.7e-05, + "loss": 7.667, + "step": 9 + }, + { + "epoch": 0.0008776958290248525, + "grad_norm": 59.25, + "learning_rate": 3e-05, + "loss": 7.4551, + "step": 10 + }, + { + "epoch": 0.0009654654119273377, + "grad_norm": 60.0, + "learning_rate": 3.2999999999999996e-05, + "loss": 7.4072, + "step": 11 + }, + { + "epoch": 0.001053234994829823, + "grad_norm": 169.0, + "learning_rate": 3.6e-05, + "loss": 7.2051, + "step": 12 + }, + { + "epoch": 0.0011410045777323083, + "grad_norm": 198.0, + "learning_rate": 3.9e-05, + "loss": 6.9873, + "step": 13 + }, + { + "epoch": 0.0012287741606347934, + "grad_norm": 52.75, + "learning_rate": 4.2000000000000004e-05, + "loss": 6.8613, + "step": 14 + }, + { + "epoch": 0.0013165437435372787, + "grad_norm": 84.5, + "learning_rate": 4.4999999999999996e-05, + "loss": 6.665, + "step": 15 + }, + { + "epoch": 0.001404313326439764, + "grad_norm": 42.75, + "learning_rate": 4.8e-05, + "loss": 6.6465, + "step": 16 + }, + { + "epoch": 0.0014920829093422493, + "grad_norm": 139.0, + "learning_rate": 5.1000000000000006e-05, + "loss": 6.5166, + "step": 17 + }, + { + "epoch": 0.0015798524922447344, + "grad_norm": 41.75, + "learning_rate": 5.4e-05, + "loss": 6.4814, + "step": 18 + }, + { + "epoch": 0.0016676220751472197, + "grad_norm": 39.5, + "learning_rate": 5.7e-05, + "loss": 6.2256, + "step": 19 + }, + { + "epoch": 0.001755391658049705, + "grad_norm": 17.125, + "learning_rate": 6e-05, + "loss": 6.0605, + "step": 20 + }, + { + "epoch": 0.0018431612409521903, + "grad_norm": 16.75, + "learning_rate": 6.3e-05, + "loss": 6.0781, + "step": 21 + }, + { + "epoch": 0.0019309308238546754, + "grad_norm": 83.5, + "learning_rate": 6.599999999999999e-05, + "loss": 6.0078, + "step": 22 + }, + { + "epoch": 0.002018700406757161, + "grad_norm": 44.75, + "learning_rate": 6.9e-05, + "loss": 5.9941, + "step": 23 + }, + { + "epoch": 0.002106469989659646, + "grad_norm": 25.625, + "learning_rate": 7.2e-05, + "loss": 5.917, + "step": 24 + }, + { + "epoch": 0.002194239572562131, + "grad_norm": 65.5, + "learning_rate": 7.500000000000001e-05, + "loss": 5.7607, + "step": 25 + }, + { + "epoch": 0.0022820091554646166, + "grad_norm": 16.875, + "learning_rate": 7.8e-05, + "loss": 5.7617, + "step": 26 + }, + { + "epoch": 0.0023697787383671017, + "grad_norm": 22.625, + "learning_rate": 8.1e-05, + "loss": 5.8018, + "step": 27 + }, + { + "epoch": 0.002457548321269587, + "grad_norm": 27.375, + "learning_rate": 8.400000000000001e-05, + "loss": 5.6592, + "step": 28 + }, + { + "epoch": 0.0025453179041720723, + "grad_norm": 28.875, + "learning_rate": 8.7e-05, + "loss": 5.5781, + "step": 29 + }, + { + "epoch": 0.0026330874870745574, + "grad_norm": 25.875, + "learning_rate": 8.999999999999999e-05, + "loss": 5.4229, + "step": 30 + }, + { + "epoch": 0.002720857069977043, + "grad_norm": 256.0, + "learning_rate": 9.3e-05, + "loss": 5.373, + "step": 31 + }, + { + "epoch": 0.002808626652879528, + "grad_norm": 21.5, + "learning_rate": 9.6e-05, + "loss": 5.374, + "step": 32 + }, + { + "epoch": 0.002896396235782013, + "grad_norm": 15.625, + "learning_rate": 9.900000000000001e-05, + "loss": 5.3916, + "step": 33 + }, + { + "epoch": 0.0029841658186844987, + "grad_norm": 17.625, + "learning_rate": 0.00010200000000000001, + "loss": 5.2695, + "step": 34 + }, + { + "epoch": 0.0030719354015869837, + "grad_norm": 26.75, + "learning_rate": 0.00010500000000000002, + "loss": 5.1943, + "step": 35 + }, + { + "epoch": 0.003159704984489469, + "grad_norm": 18.375, + "learning_rate": 0.000108, + "loss": 5.2988, + "step": 36 + }, + { + "epoch": 0.0032474745673919544, + "grad_norm": 15.375, + "learning_rate": 0.000111, + "loss": 5.1738, + "step": 37 + }, + { + "epoch": 0.0033352441502944394, + "grad_norm": 3.1875, + "learning_rate": 0.000114, + "loss": 5.0938, + "step": 38 + }, + { + "epoch": 0.003423013733196925, + "grad_norm": 13.875, + "learning_rate": 0.000117, + "loss": 5.0488, + "step": 39 + }, + { + "epoch": 0.00351078331609941, + "grad_norm": 18.375, + "learning_rate": 0.00012, + "loss": 5.0342, + "step": 40 + }, + { + "epoch": 0.003598552899001895, + "grad_norm": 13.25, + "learning_rate": 0.000123, + "loss": 4.9072, + "step": 41 + }, + { + "epoch": 0.0036863224819043807, + "grad_norm": 14.375, + "learning_rate": 0.000126, + "loss": 5.0068, + "step": 42 + }, + { + "epoch": 0.0037740920648068658, + "grad_norm": 8.5625, + "learning_rate": 0.000129, + "loss": 4.9346, + "step": 43 + }, + { + "epoch": 0.003861861647709351, + "grad_norm": 4.53125, + "learning_rate": 0.00013199999999999998, + "loss": 4.9004, + "step": 44 + }, + { + "epoch": 0.003949631230611836, + "grad_norm": 44.5, + "learning_rate": 0.000135, + "loss": 4.96, + "step": 45 + }, + { + "epoch": 0.004037400813514322, + "grad_norm": 2.09375, + "learning_rate": 0.000138, + "loss": 4.8945, + "step": 46 + }, + { + "epoch": 0.0041251703964168066, + "grad_norm": 5.3125, + "learning_rate": 0.000141, + "loss": 4.9229, + "step": 47 + }, + { + "epoch": 0.004212939979319292, + "grad_norm": 4.8125, + "learning_rate": 0.000144, + "loss": 4.8545, + "step": 48 + }, + { + "epoch": 0.004300709562221778, + "grad_norm": 1.6484375, + "learning_rate": 0.000147, + "loss": 4.8369, + "step": 49 + }, + { + "epoch": 0.004388479145124262, + "grad_norm": 6.8125, + "learning_rate": 0.00015000000000000001, + "loss": 4.9775, + "step": 50 + }, + { + "epoch": 0.004476248728026748, + "grad_norm": 5.0, + "learning_rate": 0.000153, + "loss": 4.9795, + "step": 51 + }, + { + "epoch": 0.004564018310929233, + "grad_norm": 1.0859375, + "learning_rate": 0.000156, + "loss": 4.7822, + "step": 52 + }, + { + "epoch": 0.004651787893831718, + "grad_norm": 7.40625, + "learning_rate": 0.000159, + "loss": 4.8193, + "step": 53 + }, + { + "epoch": 0.0047395574767342035, + "grad_norm": 5.65625, + "learning_rate": 0.000162, + "loss": 4.8438, + "step": 54 + }, + { + "epoch": 0.004827327059636689, + "grad_norm": 6.0, + "learning_rate": 0.000165, + "loss": 4.8115, + "step": 55 + }, + { + "epoch": 0.004915096642539174, + "grad_norm": 6.75, + "learning_rate": 0.00016800000000000002, + "loss": 4.7568, + "step": 56 + }, + { + "epoch": 0.005002866225441659, + "grad_norm": 8.125, + "learning_rate": 0.000171, + "loss": 4.8145, + "step": 57 + }, + { + "epoch": 0.005090635808344145, + "grad_norm": 12.125, + "learning_rate": 0.000174, + "loss": 4.8076, + "step": 58 + }, + { + "epoch": 0.005178405391246629, + "grad_norm": 11.875, + "learning_rate": 0.000177, + "loss": 4.7861, + "step": 59 + }, + { + "epoch": 0.005266174974149115, + "grad_norm": 1.7109375, + "learning_rate": 0.00017999999999999998, + "loss": 4.7412, + "step": 60 + }, + { + "epoch": 0.0053539445570516, + "grad_norm": 5.875, + "learning_rate": 0.000183, + "loss": 4.7051, + "step": 61 + }, + { + "epoch": 0.005441714139954086, + "grad_norm": 1.71875, + "learning_rate": 0.000186, + "loss": 4.7061, + "step": 62 + }, + { + "epoch": 0.005529483722856571, + "grad_norm": 5.90625, + "learning_rate": 0.000189, + "loss": 4.7012, + "step": 63 + }, + { + "epoch": 0.005617253305759056, + "grad_norm": 4.375, + "learning_rate": 0.000192, + "loss": 4.7412, + "step": 64 + }, + { + "epoch": 0.005705022888661542, + "grad_norm": 2.28125, + "learning_rate": 0.00019500000000000002, + "loss": 4.708, + "step": 65 + }, + { + "epoch": 0.005792792471564026, + "grad_norm": 5.5, + "learning_rate": 0.00019800000000000002, + "loss": 4.7158, + "step": 66 + }, + { + "epoch": 0.005880562054466512, + "grad_norm": 3.375, + "learning_rate": 0.000201, + "loss": 4.6943, + "step": 67 + }, + { + "epoch": 0.005968331637368997, + "grad_norm": 2.1875, + "learning_rate": 0.00020400000000000003, + "loss": 4.6689, + "step": 68 + }, + { + "epoch": 0.006056101220271482, + "grad_norm": 5.625, + "learning_rate": 0.00020700000000000002, + "loss": 4.7139, + "step": 69 + }, + { + "epoch": 0.0061438708031739675, + "grad_norm": 1.453125, + "learning_rate": 0.00021000000000000004, + "loss": 4.748, + "step": 70 + }, + { + "epoch": 0.006231640386076453, + "grad_norm": 6.40625, + "learning_rate": 0.00021299999999999997, + "loss": 4.7041, + "step": 71 + }, + { + "epoch": 0.006319409968978938, + "grad_norm": 2.25, + "learning_rate": 0.000216, + "loss": 4.709, + "step": 72 + }, + { + "epoch": 0.006407179551881423, + "grad_norm": 13.25, + "learning_rate": 0.00021899999999999998, + "loss": 4.8428, + "step": 73 + }, + { + "epoch": 0.006494949134783909, + "grad_norm": 14.5, + "learning_rate": 0.000222, + "loss": 4.7979, + "step": 74 + }, + { + "epoch": 0.006582718717686393, + "grad_norm": 7.4375, + "learning_rate": 0.000225, + "loss": 4.6299, + "step": 75 + }, + { + "epoch": 0.006670488300588879, + "grad_norm": 13.75, + "learning_rate": 0.000228, + "loss": 4.7627, + "step": 76 + }, + { + "epoch": 0.006758257883491364, + "grad_norm": 15.1875, + "learning_rate": 0.000231, + "loss": 4.8945, + "step": 77 + }, + { + "epoch": 0.00684602746639385, + "grad_norm": 12.5, + "learning_rate": 0.000234, + "loss": 4.7734, + "step": 78 + }, + { + "epoch": 0.006933797049296335, + "grad_norm": 1.1484375, + "learning_rate": 0.00023700000000000001, + "loss": 4.6152, + "step": 79 + }, + { + "epoch": 0.00702156663219882, + "grad_norm": 11.125, + "learning_rate": 0.00024, + "loss": 4.7637, + "step": 80 + }, + { + "epoch": 0.007109336215101306, + "grad_norm": 11.5, + "learning_rate": 0.00024300000000000002, + "loss": 4.7783, + "step": 81 + }, + { + "epoch": 0.00719710579800379, + "grad_norm": 4.4375, + "learning_rate": 0.000246, + "loss": 4.6436, + "step": 82 + }, + { + "epoch": 0.007284875380906276, + "grad_norm": 12.6875, + "learning_rate": 0.00024900000000000004, + "loss": 4.8057, + "step": 83 + }, + { + "epoch": 0.007372644963808761, + "grad_norm": 14.0625, + "learning_rate": 0.000252, + "loss": 4.8232, + "step": 84 + }, + { + "epoch": 0.007460414546711246, + "grad_norm": 10.875, + "learning_rate": 0.000255, + "loss": 4.6621, + "step": 85 + }, + { + "epoch": 0.0075481841296137315, + "grad_norm": 1.5625, + "learning_rate": 0.000258, + "loss": 4.6309, + "step": 86 + }, + { + "epoch": 0.007635953712516217, + "grad_norm": 6.5, + "learning_rate": 0.000261, + "loss": 4.6016, + "step": 87 + }, + { + "epoch": 0.007723723295418702, + "grad_norm": 5.5625, + "learning_rate": 0.00026399999999999997, + "loss": 4.6582, + "step": 88 + }, + { + "epoch": 0.007811492878321187, + "grad_norm": 3.453125, + "learning_rate": 0.000267, + "loss": 4.6748, + "step": 89 + }, + { + "epoch": 0.007899262461223673, + "grad_norm": 4.03125, + "learning_rate": 0.00027, + "loss": 4.6484, + "step": 90 + }, + { + "epoch": 0.007987032044126157, + "grad_norm": 1.8828125, + "learning_rate": 0.000273, + "loss": 4.6318, + "step": 91 + }, + { + "epoch": 0.008074801627028644, + "grad_norm": 6.25, + "learning_rate": 0.000276, + "loss": 4.6748, + "step": 92 + }, + { + "epoch": 0.008162571209931128, + "grad_norm": 6.03125, + "learning_rate": 0.000279, + "loss": 4.6309, + "step": 93 + }, + { + "epoch": 0.008250340792833613, + "grad_norm": 2.09375, + "learning_rate": 0.000282, + "loss": 4.6621, + "step": 94 + }, + { + "epoch": 0.0083381103757361, + "grad_norm": 5.03125, + "learning_rate": 0.000285, + "loss": 4.6953, + "step": 95 + }, + { + "epoch": 0.008425879958638584, + "grad_norm": 3.984375, + "learning_rate": 0.000288, + "loss": 4.6104, + "step": 96 + }, + { + "epoch": 0.008513649541541069, + "grad_norm": 2.546875, + "learning_rate": 0.000291, + "loss": 4.6055, + "step": 97 + }, + { + "epoch": 0.008601419124443555, + "grad_norm": 2.484375, + "learning_rate": 0.000294, + "loss": 4.6123, + "step": 98 + }, + { + "epoch": 0.00868918870734604, + "grad_norm": 2.328125, + "learning_rate": 0.000297, + "loss": 4.5986, + "step": 99 + }, + { + "epoch": 0.008776958290248525, + "grad_norm": 1.7578125, + "learning_rate": 0.00030000000000000003, + "loss": 4.5312, + "step": 100 + }, + { + "epoch": 0.008864727873151011, + "grad_norm": 3.296875, + "learning_rate": 0.00030300000000000005, + "loss": 4.6201, + "step": 101 + }, + { + "epoch": 0.008952497456053496, + "grad_norm": 2.515625, + "learning_rate": 0.000306, + "loss": 4.626, + "step": 102 + }, + { + "epoch": 0.00904026703895598, + "grad_norm": 3.140625, + "learning_rate": 0.000309, + "loss": 4.624, + "step": 103 + }, + { + "epoch": 0.009128036621858467, + "grad_norm": 2.734375, + "learning_rate": 0.000312, + "loss": 4.5771, + "step": 104 + }, + { + "epoch": 0.009215806204760951, + "grad_norm": 2.9375, + "learning_rate": 0.000315, + "loss": 4.6221, + "step": 105 + }, + { + "epoch": 0.009303575787663436, + "grad_norm": 2.84375, + "learning_rate": 0.000318, + "loss": 4.5703, + "step": 106 + }, + { + "epoch": 0.009391345370565922, + "grad_norm": 2.09375, + "learning_rate": 0.000321, + "loss": 4.5283, + "step": 107 + }, + { + "epoch": 0.009479114953468407, + "grad_norm": 1.9765625, + "learning_rate": 0.000324, + "loss": 4.6152, + "step": 108 + }, + { + "epoch": 0.009566884536370892, + "grad_norm": 2.765625, + "learning_rate": 0.000327, + "loss": 4.5557, + "step": 109 + }, + { + "epoch": 0.009654654119273378, + "grad_norm": 2.09375, + "learning_rate": 0.00033, + "loss": 4.6279, + "step": 110 + }, + { + "epoch": 0.009742423702175863, + "grad_norm": 3.046875, + "learning_rate": 0.000333, + "loss": 4.542, + "step": 111 + }, + { + "epoch": 0.009830193285078347, + "grad_norm": 2.71875, + "learning_rate": 0.00033600000000000004, + "loss": 4.5586, + "step": 112 + }, + { + "epoch": 0.009917962867980834, + "grad_norm": 2.171875, + "learning_rate": 0.000339, + "loss": 4.5127, + "step": 113 + }, + { + "epoch": 0.010005732450883318, + "grad_norm": 1.9609375, + "learning_rate": 0.000342, + "loss": 4.5742, + "step": 114 + }, + { + "epoch": 0.010093502033785803, + "grad_norm": 2.296875, + "learning_rate": 0.00034500000000000004, + "loss": 4.5088, + "step": 115 + }, + { + "epoch": 0.01018127161668829, + "grad_norm": 1.8046875, + "learning_rate": 0.000348, + "loss": 4.543, + "step": 116 + }, + { + "epoch": 0.010269041199590774, + "grad_norm": 3.140625, + "learning_rate": 0.000351, + "loss": 4.4609, + "step": 117 + }, + { + "epoch": 0.010356810782493259, + "grad_norm": 2.609375, + "learning_rate": 0.000354, + "loss": 4.5303, + "step": 118 + }, + { + "epoch": 0.010444580365395745, + "grad_norm": 2.765625, + "learning_rate": 0.000357, + "loss": 4.4971, + "step": 119 + }, + { + "epoch": 0.01053234994829823, + "grad_norm": 2.3125, + "learning_rate": 0.00035999999999999997, + "loss": 4.5342, + "step": 120 + }, + { + "epoch": 0.010620119531200716, + "grad_norm": 2.640625, + "learning_rate": 0.000363, + "loss": 4.4941, + "step": 121 + }, + { + "epoch": 0.0107078891141032, + "grad_norm": 1.9453125, + "learning_rate": 0.000366, + "loss": 4.4658, + "step": 122 + }, + { + "epoch": 0.010795658697005685, + "grad_norm": 2.921875, + "learning_rate": 0.000369, + "loss": 4.4941, + "step": 123 + }, + { + "epoch": 0.010883428279908172, + "grad_norm": 2.59375, + "learning_rate": 0.000372, + "loss": 4.4248, + "step": 124 + }, + { + "epoch": 0.010971197862810656, + "grad_norm": 1.84375, + "learning_rate": 0.000375, + "loss": 4.4961, + "step": 125 + }, + { + "epoch": 0.011058967445713141, + "grad_norm": 1.609375, + "learning_rate": 0.000378, + "loss": 4.4292, + "step": 126 + }, + { + "epoch": 0.011146737028615628, + "grad_norm": 2.96875, + "learning_rate": 0.000381, + "loss": 4.5234, + "step": 127 + }, + { + "epoch": 0.011234506611518112, + "grad_norm": 2.640625, + "learning_rate": 0.000384, + "loss": 4.5049, + "step": 128 + }, + { + "epoch": 0.011322276194420597, + "grad_norm": 2.0625, + "learning_rate": 0.00038700000000000003, + "loss": 4.4512, + "step": 129 + }, + { + "epoch": 0.011410045777323083, + "grad_norm": 1.7734375, + "learning_rate": 0.00039000000000000005, + "loss": 4.4355, + "step": 130 + }, + { + "epoch": 0.011497815360225568, + "grad_norm": 1.4765625, + "learning_rate": 0.000393, + "loss": 4.4238, + "step": 131 + }, + { + "epoch": 0.011585584943128053, + "grad_norm": 2.046875, + "learning_rate": 0.00039600000000000003, + "loss": 4.5137, + "step": 132 + }, + { + "epoch": 0.011673354526030539, + "grad_norm": 1.734375, + "learning_rate": 0.00039900000000000005, + "loss": 4.4004, + "step": 133 + }, + { + "epoch": 0.011761124108933024, + "grad_norm": 2.21875, + "learning_rate": 0.000402, + "loss": 4.4141, + "step": 134 + }, + { + "epoch": 0.011848893691835508, + "grad_norm": 1.75, + "learning_rate": 0.00040500000000000003, + "loss": 4.4688, + "step": 135 + }, + { + "epoch": 0.011936663274737995, + "grad_norm": 1.1328125, + "learning_rate": 0.00040800000000000005, + "loss": 4.4326, + "step": 136 + }, + { + "epoch": 0.01202443285764048, + "grad_norm": 2.90625, + "learning_rate": 0.000411, + "loss": 4.5049, + "step": 137 + }, + { + "epoch": 0.012112202440542964, + "grad_norm": 2.203125, + "learning_rate": 0.00041400000000000003, + "loss": 4.5088, + "step": 138 + }, + { + "epoch": 0.01219997202344545, + "grad_norm": 2.421875, + "learning_rate": 0.00041700000000000005, + "loss": 4.4717, + "step": 139 + }, + { + "epoch": 0.012287741606347935, + "grad_norm": 1.2421875, + "learning_rate": 0.00042000000000000007, + "loss": 4.4717, + "step": 140 + }, + { + "epoch": 0.01237551118925042, + "grad_norm": 1.8359375, + "learning_rate": 0.000423, + "loss": 4.3867, + "step": 141 + }, + { + "epoch": 0.012463280772152906, + "grad_norm": 1.390625, + "learning_rate": 0.00042599999999999995, + "loss": 4.3896, + "step": 142 + }, + { + "epoch": 0.01255105035505539, + "grad_norm": 1.9609375, + "learning_rate": 0.00042899999999999997, + "loss": 4.3428, + "step": 143 + }, + { + "epoch": 0.012638819937957875, + "grad_norm": 2.1875, + "learning_rate": 0.000432, + "loss": 4.4297, + "step": 144 + }, + { + "epoch": 0.012726589520860362, + "grad_norm": 1.2109375, + "learning_rate": 0.000435, + "loss": 4.418, + "step": 145 + }, + { + "epoch": 0.012814359103762846, + "grad_norm": 1.890625, + "learning_rate": 0.00043799999999999997, + "loss": 4.3877, + "step": 146 + }, + { + "epoch": 0.012902128686665331, + "grad_norm": 1.7734375, + "learning_rate": 0.000441, + "loss": 4.415, + "step": 147 + }, + { + "epoch": 0.012989898269567817, + "grad_norm": 1.1328125, + "learning_rate": 0.000444, + "loss": 4.4492, + "step": 148 + }, + { + "epoch": 0.013077667852470302, + "grad_norm": 1.3515625, + "learning_rate": 0.00044699999999999997, + "loss": 4.4111, + "step": 149 + }, + { + "epoch": 0.013165437435372787, + "grad_norm": 2.21875, + "learning_rate": 0.00045, + "loss": 4.3945, + "step": 150 + }, + { + "epoch": 0.013253207018275273, + "grad_norm": 2.234375, + "learning_rate": 0.000453, + "loss": 4.3818, + "step": 151 + }, + { + "epoch": 0.013340976601177758, + "grad_norm": 1.4453125, + "learning_rate": 0.000456, + "loss": 4.3496, + "step": 152 + }, + { + "epoch": 0.013428746184080244, + "grad_norm": 3.734375, + "learning_rate": 0.000459, + "loss": 4.418, + "step": 153 + }, + { + "epoch": 0.013516515766982729, + "grad_norm": 2.078125, + "learning_rate": 0.000462, + "loss": 4.4609, + "step": 154 + }, + { + "epoch": 0.013604285349885213, + "grad_norm": 6.5, + "learning_rate": 0.000465, + "loss": 4.4658, + "step": 155 + }, + { + "epoch": 0.0136920549327877, + "grad_norm": 5.8125, + "learning_rate": 0.000468, + "loss": 4.4727, + "step": 156 + }, + { + "epoch": 0.013779824515690185, + "grad_norm": 1.6015625, + "learning_rate": 0.000471, + "loss": 4.3848, + "step": 157 + }, + { + "epoch": 0.01386759409859267, + "grad_norm": 3.390625, + "learning_rate": 0.00047400000000000003, + "loss": 4.3906, + "step": 158 + }, + { + "epoch": 0.013955363681495156, + "grad_norm": 1.515625, + "learning_rate": 0.000477, + "loss": 4.373, + "step": 159 + }, + { + "epoch": 0.01404313326439764, + "grad_norm": 3.671875, + "learning_rate": 0.00048, + "loss": 4.3984, + "step": 160 + }, + { + "epoch": 0.014130902847300125, + "grad_norm": 2.078125, + "learning_rate": 0.00048300000000000003, + "loss": 4.4072, + "step": 161 + }, + { + "epoch": 0.014218672430202611, + "grad_norm": 3.703125, + "learning_rate": 0.00048600000000000005, + "loss": 4.418, + "step": 162 + }, + { + "epoch": 0.014306442013105096, + "grad_norm": 2.671875, + "learning_rate": 0.0004890000000000001, + "loss": 4.4209, + "step": 163 + }, + { + "epoch": 0.01439421159600758, + "grad_norm": 4.53125, + "learning_rate": 0.000492, + "loss": 4.374, + "step": 164 + }, + { + "epoch": 0.014481981178910067, + "grad_norm": 3.890625, + "learning_rate": 0.000495, + "loss": 4.4346, + "step": 165 + }, + { + "epoch": 0.014569750761812552, + "grad_norm": 1.828125, + "learning_rate": 0.0004980000000000001, + "loss": 4.3232, + "step": 166 + }, + { + "epoch": 0.014657520344715036, + "grad_norm": 1.9140625, + "learning_rate": 0.000501, + "loss": 4.332, + "step": 167 + }, + { + "epoch": 0.014745289927617523, + "grad_norm": 2.3125, + "learning_rate": 0.000504, + "loss": 4.4102, + "step": 168 + }, + { + "epoch": 0.014833059510520007, + "grad_norm": 1.40625, + "learning_rate": 0.0005070000000000001, + "loss": 4.3809, + "step": 169 + }, + { + "epoch": 0.014920829093422492, + "grad_norm": 2.828125, + "learning_rate": 0.00051, + "loss": 4.3984, + "step": 170 + }, + { + "epoch": 0.015008598676324978, + "grad_norm": 1.8671875, + "learning_rate": 0.000513, + "loss": 4.3887, + "step": 171 + }, + { + "epoch": 0.015096368259227463, + "grad_norm": 3.09375, + "learning_rate": 0.000516, + "loss": 4.3398, + "step": 172 + }, + { + "epoch": 0.015184137842129948, + "grad_norm": 2.625, + "learning_rate": 0.0005189999999999999, + "loss": 4.418, + "step": 173 + }, + { + "epoch": 0.015271907425032434, + "grad_norm": 2.359375, + "learning_rate": 0.000522, + "loss": 4.3408, + "step": 174 + }, + { + "epoch": 0.015359677007934919, + "grad_norm": 2.3125, + "learning_rate": 0.000525, + "loss": 4.4092, + "step": 175 + }, + { + "epoch": 0.015447446590837403, + "grad_norm": 1.5234375, + "learning_rate": 0.0005279999999999999, + "loss": 4.3594, + "step": 176 + }, + { + "epoch": 0.01553521617373989, + "grad_norm": 1.8828125, + "learning_rate": 0.000531, + "loss": 4.3291, + "step": 177 + }, + { + "epoch": 0.015622985756642374, + "grad_norm": 1.171875, + "learning_rate": 0.000534, + "loss": 4.3228, + "step": 178 + }, + { + "epoch": 0.01571075533954486, + "grad_norm": 1.3125, + "learning_rate": 0.000537, + "loss": 4.3418, + "step": 179 + }, + { + "epoch": 0.015798524922447345, + "grad_norm": 1.015625, + "learning_rate": 0.00054, + "loss": 4.3486, + "step": 180 + }, + { + "epoch": 0.01588629450534983, + "grad_norm": 0.7109375, + "learning_rate": 0.000543, + "loss": 4.3457, + "step": 181 + }, + { + "epoch": 0.015974064088252315, + "grad_norm": 0.87109375, + "learning_rate": 0.000546, + "loss": 4.3867, + "step": 182 + }, + { + "epoch": 0.0160618336711548, + "grad_norm": 1.1953125, + "learning_rate": 0.000549, + "loss": 4.3555, + "step": 183 + }, + { + "epoch": 0.016149603254057288, + "grad_norm": 1.203125, + "learning_rate": 0.000552, + "loss": 4.334, + "step": 184 + }, + { + "epoch": 0.016237372836959772, + "grad_norm": 0.9453125, + "learning_rate": 0.000555, + "loss": 4.292, + "step": 185 + }, + { + "epoch": 0.016325142419862257, + "grad_norm": 1.3359375, + "learning_rate": 0.000558, + "loss": 4.3467, + "step": 186 + }, + { + "epoch": 0.01641291200276474, + "grad_norm": 1.015625, + "learning_rate": 0.000561, + "loss": 4.3145, + "step": 187 + }, + { + "epoch": 0.016500681585667226, + "grad_norm": 1.2890625, + "learning_rate": 0.000564, + "loss": 4.2754, + "step": 188 + }, + { + "epoch": 0.01658845116856971, + "grad_norm": 1.015625, + "learning_rate": 0.000567, + "loss": 4.2783, + "step": 189 + }, + { + "epoch": 0.0166762207514722, + "grad_norm": 1.1171875, + "learning_rate": 0.00057, + "loss": 4.269, + "step": 190 + }, + { + "epoch": 0.016763990334374684, + "grad_norm": 1.453125, + "learning_rate": 0.000573, + "loss": 4.332, + "step": 191 + }, + { + "epoch": 0.01685175991727717, + "grad_norm": 1.515625, + "learning_rate": 0.000576, + "loss": 4.3018, + "step": 192 + }, + { + "epoch": 0.016939529500179653, + "grad_norm": 0.625, + "learning_rate": 0.000579, + "loss": 4.2607, + "step": 193 + }, + { + "epoch": 0.017027299083082138, + "grad_norm": 0.82421875, + "learning_rate": 0.000582, + "loss": 4.3398, + "step": 194 + }, + { + "epoch": 0.017115068665984622, + "grad_norm": 1.1328125, + "learning_rate": 0.000585, + "loss": 4.2788, + "step": 195 + }, + { + "epoch": 0.01720283824888711, + "grad_norm": 2.140625, + "learning_rate": 0.000588, + "loss": 4.2266, + "step": 196 + }, + { + "epoch": 0.017290607831789595, + "grad_norm": 0.83203125, + "learning_rate": 0.000591, + "loss": 4.3252, + "step": 197 + }, + { + "epoch": 0.01737837741469208, + "grad_norm": 2.375, + "learning_rate": 0.000594, + "loss": 4.2686, + "step": 198 + }, + { + "epoch": 0.017466146997594564, + "grad_norm": 1.015625, + "learning_rate": 0.0005970000000000001, + "loss": 4.2939, + "step": 199 + }, + { + "epoch": 0.01755391658049705, + "grad_norm": 1.84375, + "learning_rate": 0.0006000000000000001, + "loss": 4.2803, + "step": 200 + }, + { + "epoch": 0.017641686163399534, + "grad_norm": 1.4921875, + "learning_rate": 0.000603, + "loss": 4.2744, + "step": 201 + }, + { + "epoch": 0.017729455746302022, + "grad_norm": 1.2578125, + "learning_rate": 0.0006060000000000001, + "loss": 4.292, + "step": 202 + }, + { + "epoch": 0.017817225329204506, + "grad_norm": 0.81640625, + "learning_rate": 0.0006090000000000001, + "loss": 4.2046, + "step": 203 + }, + { + "epoch": 0.01790499491210699, + "grad_norm": 1.8203125, + "learning_rate": 0.000612, + "loss": 4.2178, + "step": 204 + }, + { + "epoch": 0.017992764495009476, + "grad_norm": 1.109375, + "learning_rate": 0.000615, + "loss": 4.2183, + "step": 205 + }, + { + "epoch": 0.01808053407791196, + "grad_norm": 0.7890625, + "learning_rate": 0.000618, + "loss": 4.2139, + "step": 206 + }, + { + "epoch": 0.01816830366081445, + "grad_norm": 1.078125, + "learning_rate": 0.000621, + "loss": 4.2363, + "step": 207 + }, + { + "epoch": 0.018256073243716933, + "grad_norm": 2.15625, + "learning_rate": 0.000624, + "loss": 4.2891, + "step": 208 + }, + { + "epoch": 0.018343842826619418, + "grad_norm": 0.9765625, + "learning_rate": 0.000627, + "loss": 4.2212, + "step": 209 + }, + { + "epoch": 0.018431612409521902, + "grad_norm": 3.390625, + "learning_rate": 0.00063, + "loss": 4.3203, + "step": 210 + }, + { + "epoch": 0.018519381992424387, + "grad_norm": 2.109375, + "learning_rate": 0.000633, + "loss": 4.3159, + "step": 211 + }, + { + "epoch": 0.018607151575326872, + "grad_norm": 3.734375, + "learning_rate": 0.000636, + "loss": 4.272, + "step": 212 + }, + { + "epoch": 0.01869492115822936, + "grad_norm": 3.0625, + "learning_rate": 0.000639, + "loss": 4.2368, + "step": 213 + }, + { + "epoch": 0.018782690741131845, + "grad_norm": 2.875, + "learning_rate": 0.000642, + "loss": 4.2749, + "step": 214 + }, + { + "epoch": 0.01887046032403433, + "grad_norm": 1.8515625, + "learning_rate": 0.000645, + "loss": 4.1772, + "step": 215 + }, + { + "epoch": 0.018958229906936814, + "grad_norm": 2.125, + "learning_rate": 0.000648, + "loss": 4.3271, + "step": 216 + }, + { + "epoch": 0.0190459994898393, + "grad_norm": 1.546875, + "learning_rate": 0.000651, + "loss": 4.2402, + "step": 217 + }, + { + "epoch": 0.019133769072741783, + "grad_norm": 1.140625, + "learning_rate": 0.000654, + "loss": 4.2407, + "step": 218 + }, + { + "epoch": 0.01922153865564427, + "grad_norm": 1.84375, + "learning_rate": 0.000657, + "loss": 4.2031, + "step": 219 + }, + { + "epoch": 0.019309308238546756, + "grad_norm": 1.0703125, + "learning_rate": 0.00066, + "loss": 4.2534, + "step": 220 + }, + { + "epoch": 0.01939707782144924, + "grad_norm": 1.4921875, + "learning_rate": 0.0006630000000000001, + "loss": 4.2451, + "step": 221 + }, + { + "epoch": 0.019484847404351725, + "grad_norm": 1.1015625, + "learning_rate": 0.000666, + "loss": 4.3438, + "step": 222 + }, + { + "epoch": 0.01957261698725421, + "grad_norm": 1.5625, + "learning_rate": 0.000669, + "loss": 4.2666, + "step": 223 + }, + { + "epoch": 0.019660386570156695, + "grad_norm": 1.3203125, + "learning_rate": 0.0006720000000000001, + "loss": 4.2119, + "step": 224 + }, + { + "epoch": 0.019748156153059183, + "grad_norm": 1.2578125, + "learning_rate": 0.000675, + "loss": 4.1357, + "step": 225 + }, + { + "epoch": 0.019835925735961667, + "grad_norm": 1.3125, + "learning_rate": 0.000678, + "loss": 4.2373, + "step": 226 + }, + { + "epoch": 0.019923695318864152, + "grad_norm": 0.9921875, + "learning_rate": 0.0006810000000000001, + "loss": 4.2134, + "step": 227 + }, + { + "epoch": 0.020011464901766637, + "grad_norm": 0.75390625, + "learning_rate": 0.000684, + "loss": 4.1768, + "step": 228 + }, + { + "epoch": 0.02009923448466912, + "grad_norm": 1.15625, + "learning_rate": 0.000687, + "loss": 4.165, + "step": 229 + }, + { + "epoch": 0.020187004067571606, + "grad_norm": 1.9609375, + "learning_rate": 0.0006900000000000001, + "loss": 4.1826, + "step": 230 + }, + { + "epoch": 0.020274773650474094, + "grad_norm": 0.609375, + "learning_rate": 0.000693, + "loss": 4.1729, + "step": 231 + }, + { + "epoch": 0.02036254323337658, + "grad_norm": 2.265625, + "learning_rate": 0.000696, + "loss": 4.1919, + "step": 232 + }, + { + "epoch": 0.020450312816279063, + "grad_norm": 0.703125, + "learning_rate": 0.0006990000000000001, + "loss": 4.2305, + "step": 233 + }, + { + "epoch": 0.020538082399181548, + "grad_norm": 2.71875, + "learning_rate": 0.000702, + "loss": 4.2744, + "step": 234 + }, + { + "epoch": 0.020625851982084033, + "grad_norm": 1.421875, + "learning_rate": 0.000705, + "loss": 4.1924, + "step": 235 + }, + { + "epoch": 0.020713621564986517, + "grad_norm": 5.4375, + "learning_rate": 0.000708, + "loss": 4.2466, + "step": 236 + }, + { + "epoch": 0.020801391147889006, + "grad_norm": 4.6875, + "learning_rate": 0.0007109999999999999, + "loss": 4.312, + "step": 237 + }, + { + "epoch": 0.02088916073079149, + "grad_norm": 1.8203125, + "learning_rate": 0.000714, + "loss": 4.1802, + "step": 238 + }, + { + "epoch": 0.020976930313693975, + "grad_norm": 2.75, + "learning_rate": 0.000717, + "loss": 4.3037, + "step": 239 + }, + { + "epoch": 0.02106469989659646, + "grad_norm": 1.9140625, + "learning_rate": 0.0007199999999999999, + "loss": 4.2271, + "step": 240 + }, + { + "epoch": 0.021152469479498944, + "grad_norm": 2.890625, + "learning_rate": 0.000723, + "loss": 4.2114, + "step": 241 + }, + { + "epoch": 0.021240239062401432, + "grad_norm": 1.65625, + "learning_rate": 0.000726, + "loss": 4.2104, + "step": 242 + }, + { + "epoch": 0.021328008645303917, + "grad_norm": 2.53125, + "learning_rate": 0.000729, + "loss": 4.1665, + "step": 243 + }, + { + "epoch": 0.0214157782282064, + "grad_norm": 1.53125, + "learning_rate": 0.000732, + "loss": 4.1475, + "step": 244 + }, + { + "epoch": 0.021503547811108886, + "grad_norm": 2.015625, + "learning_rate": 0.000735, + "loss": 4.2339, + "step": 245 + }, + { + "epoch": 0.02159131739401137, + "grad_norm": 1.125, + "learning_rate": 0.000738, + "loss": 4.2329, + "step": 246 + }, + { + "epoch": 0.021679086976913856, + "grad_norm": 2.609375, + "learning_rate": 0.000741, + "loss": 4.21, + "step": 247 + }, + { + "epoch": 0.021766856559816344, + "grad_norm": 2.171875, + "learning_rate": 0.000744, + "loss": 4.2168, + "step": 248 + }, + { + "epoch": 0.02185462614271883, + "grad_norm": 1.484375, + "learning_rate": 0.000747, + "loss": 4.1685, + "step": 249 + }, + { + "epoch": 0.021942395725621313, + "grad_norm": 1.4609375, + "learning_rate": 0.00075, + "loss": 4.1772, + "step": 250 + }, + { + "epoch": 0.022030165308523798, + "grad_norm": 1.234375, + "learning_rate": 0.000753, + "loss": 4.1802, + "step": 251 + }, + { + "epoch": 0.022117934891426282, + "grad_norm": 1.3671875, + "learning_rate": 0.000756, + "loss": 4.1489, + "step": 252 + }, + { + "epoch": 0.022205704474328767, + "grad_norm": 0.88671875, + "learning_rate": 0.000759, + "loss": 4.1973, + "step": 253 + }, + { + "epoch": 0.022293474057231255, + "grad_norm": 1.1484375, + "learning_rate": 0.000762, + "loss": 4.1558, + "step": 254 + }, + { + "epoch": 0.02238124364013374, + "grad_norm": 1.40625, + "learning_rate": 0.0007650000000000001, + "loss": 4.2026, + "step": 255 + }, + { + "epoch": 0.022469013223036224, + "grad_norm": 1.109375, + "learning_rate": 0.000768, + "loss": 4.123, + "step": 256 + }, + { + "epoch": 0.02255678280593871, + "grad_norm": 1.2109375, + "learning_rate": 0.000771, + "loss": 4.1147, + "step": 257 + }, + { + "epoch": 0.022644552388841194, + "grad_norm": 0.96875, + "learning_rate": 0.0007740000000000001, + "loss": 4.1553, + "step": 258 + }, + { + "epoch": 0.02273232197174368, + "grad_norm": 0.65234375, + "learning_rate": 0.000777, + "loss": 4.21, + "step": 259 + }, + { + "epoch": 0.022820091554646166, + "grad_norm": 0.5859375, + "learning_rate": 0.0007800000000000001, + "loss": 4.1685, + "step": 260 + }, + { + "epoch": 0.02290786113754865, + "grad_norm": 0.80078125, + "learning_rate": 0.0007830000000000001, + "loss": 4.0781, + "step": 261 + }, + { + "epoch": 0.022995630720451136, + "grad_norm": 0.58984375, + "learning_rate": 0.000786, + "loss": 4.1138, + "step": 262 + }, + { + "epoch": 0.02308340030335362, + "grad_norm": 0.546875, + "learning_rate": 0.0007890000000000001, + "loss": 4.1313, + "step": 263 + }, + { + "epoch": 0.023171169886256105, + "grad_norm": 0.71875, + "learning_rate": 0.0007920000000000001, + "loss": 4.1675, + "step": 264 + }, + { + "epoch": 0.02325893946915859, + "grad_norm": 0.7265625, + "learning_rate": 0.000795, + "loss": 4.228, + "step": 265 + }, + { + "epoch": 0.023346709052061078, + "grad_norm": 0.796875, + "learning_rate": 0.0007980000000000001, + "loss": 4.1475, + "step": 266 + }, + { + "epoch": 0.023434478634963563, + "grad_norm": 1.078125, + "learning_rate": 0.0008010000000000001, + "loss": 4.1196, + "step": 267 + }, + { + "epoch": 0.023522248217866047, + "grad_norm": 1.734375, + "learning_rate": 0.000804, + "loss": 4.1523, + "step": 268 + }, + { + "epoch": 0.023610017800768532, + "grad_norm": 1.2109375, + "learning_rate": 0.0008070000000000001, + "loss": 4.1338, + "step": 269 + }, + { + "epoch": 0.023697787383671017, + "grad_norm": 1.078125, + "learning_rate": 0.0008100000000000001, + "loss": 4.0962, + "step": 270 + }, + { + "epoch": 0.023785556966573505, + "grad_norm": 0.65625, + "learning_rate": 0.000813, + "loss": 4.1567, + "step": 271 + }, + { + "epoch": 0.02387332654947599, + "grad_norm": 0.8671875, + "learning_rate": 0.0008160000000000001, + "loss": 4.1865, + "step": 272 + }, + { + "epoch": 0.023961096132378474, + "grad_norm": 1.15625, + "learning_rate": 0.0008190000000000001, + "loss": 4.061, + "step": 273 + }, + { + "epoch": 0.02404886571528096, + "grad_norm": 1.7734375, + "learning_rate": 0.000822, + "loss": 4.0957, + "step": 274 + }, + { + "epoch": 0.024136635298183443, + "grad_norm": 0.80859375, + "learning_rate": 0.0008250000000000001, + "loss": 4.1304, + "step": 275 + }, + { + "epoch": 0.024224404881085928, + "grad_norm": 1.0859375, + "learning_rate": 0.0008280000000000001, + "loss": 4.1309, + "step": 276 + }, + { + "epoch": 0.024312174463988416, + "grad_norm": 1.03125, + "learning_rate": 0.0008310000000000001, + "loss": 4.1338, + "step": 277 + }, + { + "epoch": 0.0243999440468909, + "grad_norm": 1.03125, + "learning_rate": 0.0008340000000000001, + "loss": 4.0967, + "step": 278 + }, + { + "epoch": 0.024487713629793385, + "grad_norm": 1.1328125, + "learning_rate": 0.0008370000000000001, + "loss": 4.0981, + "step": 279 + }, + { + "epoch": 0.02457548321269587, + "grad_norm": 0.71875, + "learning_rate": 0.0008400000000000001, + "loss": 4.0859, + "step": 280 + }, + { + "epoch": 0.024663252795598355, + "grad_norm": 1.2421875, + "learning_rate": 0.0008430000000000001, + "loss": 4.126, + "step": 281 + }, + { + "epoch": 0.02475102237850084, + "grad_norm": 1.328125, + "learning_rate": 0.000846, + "loss": 4.2065, + "step": 282 + }, + { + "epoch": 0.024838791961403327, + "grad_norm": 1.0234375, + "learning_rate": 0.0008489999999999999, + "loss": 4.0684, + "step": 283 + }, + { + "epoch": 0.024926561544305812, + "grad_norm": 1.3515625, + "learning_rate": 0.0008519999999999999, + "loss": 4.0898, + "step": 284 + }, + { + "epoch": 0.025014331127208297, + "grad_norm": 2.828125, + "learning_rate": 0.000855, + "loss": 4.1406, + "step": 285 + }, + { + "epoch": 0.02510210071011078, + "grad_norm": 1.484375, + "learning_rate": 0.0008579999999999999, + "loss": 4.1655, + "step": 286 + }, + { + "epoch": 0.025189870293013266, + "grad_norm": 5.3125, + "learning_rate": 0.000861, + "loss": 4.1816, + "step": 287 + }, + { + "epoch": 0.02527763987591575, + "grad_norm": 4.75, + "learning_rate": 0.000864, + "loss": 4.252, + "step": 288 + }, + { + "epoch": 0.02536540945881824, + "grad_norm": 1.5078125, + "learning_rate": 0.0008669999999999999, + "loss": 4.146, + "step": 289 + }, + { + "epoch": 0.025453179041720723, + "grad_norm": 2.84375, + "learning_rate": 0.00087, + "loss": 4.1196, + "step": 290 + }, + { + "epoch": 0.025540948624623208, + "grad_norm": 1.4375, + "learning_rate": 0.000873, + "loss": 4.123, + "step": 291 + }, + { + "epoch": 0.025628718207525693, + "grad_norm": 1.4765625, + "learning_rate": 0.0008759999999999999, + "loss": 4.1416, + "step": 292 + }, + { + "epoch": 0.025716487790428177, + "grad_norm": 1.8125, + "learning_rate": 0.000879, + "loss": 4.1191, + "step": 293 + }, + { + "epoch": 0.025804257373330662, + "grad_norm": 0.9609375, + "learning_rate": 0.000882, + "loss": 4.1421, + "step": 294 + }, + { + "epoch": 0.02589202695623315, + "grad_norm": 1.265625, + "learning_rate": 0.0008849999999999999, + "loss": 4.123, + "step": 295 + }, + { + "epoch": 0.025979796539135635, + "grad_norm": 1.296875, + "learning_rate": 0.000888, + "loss": 4.1802, + "step": 296 + }, + { + "epoch": 0.02606756612203812, + "grad_norm": 0.890625, + "learning_rate": 0.000891, + "loss": 4.0566, + "step": 297 + }, + { + "epoch": 0.026155335704940604, + "grad_norm": 0.703125, + "learning_rate": 0.0008939999999999999, + "loss": 4.1284, + "step": 298 + }, + { + "epoch": 0.02624310528784309, + "grad_norm": 0.88671875, + "learning_rate": 0.000897, + "loss": 4.0513, + "step": 299 + }, + { + "epoch": 0.026330874870745574, + "grad_norm": 1.0234375, + "learning_rate": 0.0009, + "loss": 4.1318, + "step": 300 + }, + { + "epoch": 0.02641864445364806, + "grad_norm": 1.2578125, + "learning_rate": 0.0009029999999999999, + "loss": 4.0923, + "step": 301 + }, + { + "epoch": 0.026506414036550546, + "grad_norm": 1.390625, + "learning_rate": 0.000906, + "loss": 4.063, + "step": 302 + }, + { + "epoch": 0.02659418361945303, + "grad_norm": 0.55859375, + "learning_rate": 0.000909, + "loss": 4.0669, + "step": 303 + }, + { + "epoch": 0.026681953202355516, + "grad_norm": 1.1171875, + "learning_rate": 0.000912, + "loss": 4.1074, + "step": 304 + }, + { + "epoch": 0.026769722785258, + "grad_norm": 1.46875, + "learning_rate": 0.000915, + "loss": 4.1069, + "step": 305 + }, + { + "epoch": 0.02685749236816049, + "grad_norm": 0.74609375, + "learning_rate": 0.000918, + "loss": 4.0347, + "step": 306 + }, + { + "epoch": 0.026945261951062973, + "grad_norm": 1.3515625, + "learning_rate": 0.000921, + "loss": 4.0503, + "step": 307 + }, + { + "epoch": 0.027033031533965458, + "grad_norm": 1.2578125, + "learning_rate": 0.000924, + "loss": 4.0518, + "step": 308 + }, + { + "epoch": 0.027120801116867942, + "grad_norm": 0.7109375, + "learning_rate": 0.000927, + "loss": 4.0605, + "step": 309 + }, + { + "epoch": 0.027208570699770427, + "grad_norm": 0.84765625, + "learning_rate": 0.00093, + "loss": 4.1006, + "step": 310 + }, + { + "epoch": 0.02729634028267291, + "grad_norm": 0.953125, + "learning_rate": 0.000933, + "loss": 4.0596, + "step": 311 + }, + { + "epoch": 0.0273841098655754, + "grad_norm": 1.328125, + "learning_rate": 0.000936, + "loss": 4.1206, + "step": 312 + }, + { + "epoch": 0.027471879448477884, + "grad_norm": 0.8359375, + "learning_rate": 0.0009390000000000001, + "loss": 4.0425, + "step": 313 + }, + { + "epoch": 0.02755964903138037, + "grad_norm": 0.84375, + "learning_rate": 0.000942, + "loss": 4.1162, + "step": 314 + }, + { + "epoch": 0.027647418614282854, + "grad_norm": 3.515625, + "learning_rate": 0.000945, + "loss": 4.0737, + "step": 315 + }, + { + "epoch": 0.02773518819718534, + "grad_norm": 1.2421875, + "learning_rate": 0.0009480000000000001, + "loss": 4.0366, + "step": 316 + }, + { + "epoch": 0.027822957780087823, + "grad_norm": 1.0078125, + "learning_rate": 0.000951, + "loss": 4.021, + "step": 317 + }, + { + "epoch": 0.02791072736299031, + "grad_norm": 1.625, + "learning_rate": 0.000954, + "loss": 4.0576, + "step": 318 + }, + { + "epoch": 0.027998496945892796, + "grad_norm": 0.9921875, + "learning_rate": 0.0009570000000000001, + "loss": 4.0444, + "step": 319 + }, + { + "epoch": 0.02808626652879528, + "grad_norm": 1.5859375, + "learning_rate": 0.00096, + "loss": 4.0332, + "step": 320 + }, + { + "epoch": 0.028174036111697765, + "grad_norm": 1.671875, + "learning_rate": 0.000963, + "loss": 4.1543, + "step": 321 + }, + { + "epoch": 0.02826180569460025, + "grad_norm": 0.94140625, + "learning_rate": 0.0009660000000000001, + "loss": 4.0342, + "step": 322 + }, + { + "epoch": 0.028349575277502734, + "grad_norm": 3.53125, + "learning_rate": 0.000969, + "loss": 4.064, + "step": 323 + }, + { + "epoch": 0.028437344860405223, + "grad_norm": 1.9609375, + "learning_rate": 0.0009720000000000001, + "loss": 4.0474, + "step": 324 + }, + { + "epoch": 0.028525114443307707, + "grad_norm": 2.03125, + "learning_rate": 0.0009750000000000001, + "loss": 4.1118, + "step": 325 + }, + { + "epoch": 0.028612884026210192, + "grad_norm": 1.328125, + "learning_rate": 0.0009780000000000001, + "loss": 4.0039, + "step": 326 + }, + { + "epoch": 0.028700653609112677, + "grad_norm": 1.6953125, + "learning_rate": 0.000981, + "loss": 4.061, + "step": 327 + }, + { + "epoch": 0.02878842319201516, + "grad_norm": 1.59375, + "learning_rate": 0.000984, + "loss": 4.0532, + "step": 328 + }, + { + "epoch": 0.028876192774917646, + "grad_norm": 1.15625, + "learning_rate": 0.000987, + "loss": 4.0806, + "step": 329 + }, + { + "epoch": 0.028963962357820134, + "grad_norm": 1.1328125, + "learning_rate": 0.00099, + "loss": 4.022, + "step": 330 + }, + { + "epoch": 0.02905173194072262, + "grad_norm": 1.6015625, + "learning_rate": 0.0009930000000000002, + "loss": 4.0552, + "step": 331 + }, + { + "epoch": 0.029139501523625103, + "grad_norm": 0.9765625, + "learning_rate": 0.0009960000000000001, + "loss": 3.9961, + "step": 332 + }, + { + "epoch": 0.029227271106527588, + "grad_norm": 1.3515625, + "learning_rate": 0.000999, + "loss": 4.0605, + "step": 333 + }, + { + "epoch": 0.029315040689430073, + "grad_norm": 1.296875, + "learning_rate": 0.001002, + "loss": 3.9985, + "step": 334 + }, + { + "epoch": 0.029402810272332557, + "grad_norm": 2.0625, + "learning_rate": 0.001005, + "loss": 4.0396, + "step": 335 + }, + { + "epoch": 0.029490579855235045, + "grad_norm": 1.0546875, + "learning_rate": 0.001008, + "loss": 4.0532, + "step": 336 + }, + { + "epoch": 0.02957834943813753, + "grad_norm": 1.09375, + "learning_rate": 0.0010110000000000002, + "loss": 4.0146, + "step": 337 + }, + { + "epoch": 0.029666119021040015, + "grad_norm": 2.0, + "learning_rate": 0.0010140000000000001, + "loss": 4.0566, + "step": 338 + }, + { + "epoch": 0.0297538886039425, + "grad_norm": 0.86328125, + "learning_rate": 0.0010170000000000001, + "loss": 4.0024, + "step": 339 + }, + { + "epoch": 0.029841658186844984, + "grad_norm": 3.4375, + "learning_rate": 0.00102, + "loss": 4.0923, + "step": 340 + }, + { + "epoch": 0.029929427769747472, + "grad_norm": 2.109375, + "learning_rate": 0.001023, + "loss": 4.0166, + "step": 341 + }, + { + "epoch": 0.030017197352649957, + "grad_norm": 3.578125, + "learning_rate": 0.001026, + "loss": 4.1646, + "step": 342 + }, + { + "epoch": 0.03010496693555244, + "grad_norm": 2.296875, + "learning_rate": 0.0010290000000000002, + "loss": 4.0503, + "step": 343 + }, + { + "epoch": 0.030192736518454926, + "grad_norm": 4.125, + "learning_rate": 0.001032, + "loss": 4.1504, + "step": 344 + }, + { + "epoch": 0.03028050610135741, + "grad_norm": 2.734375, + "learning_rate": 0.001035, + "loss": 4.0747, + "step": 345 + }, + { + "epoch": 0.030368275684259895, + "grad_norm": 4.71875, + "learning_rate": 0.0010379999999999999, + "loss": 4.1255, + "step": 346 + }, + { + "epoch": 0.030456045267162384, + "grad_norm": 7.4375, + "learning_rate": 0.001041, + "loss": 4.1182, + "step": 347 + }, + { + "epoch": 0.030543814850064868, + "grad_norm": 1.09375, + "learning_rate": 0.001044, + "loss": 4.0532, + "step": 348 + }, + { + "epoch": 0.030631584432967353, + "grad_norm": 2.21875, + "learning_rate": 0.001047, + "loss": 4.0298, + "step": 349 + }, + { + "epoch": 0.030719354015869837, + "grad_norm": 1.21875, + "learning_rate": 0.00105, + "loss": 4.0298, + "step": 350 + }, + { + "epoch": 0.030807123598772322, + "grad_norm": 2.203125, + "learning_rate": 0.001053, + "loss": 4.0601, + "step": 351 + }, + { + "epoch": 0.030894893181674807, + "grad_norm": 1.265625, + "learning_rate": 0.0010559999999999999, + "loss": 3.9873, + "step": 352 + }, + { + "epoch": 0.030982662764577295, + "grad_norm": 2.375, + "learning_rate": 0.001059, + "loss": 4.0327, + "step": 353 + }, + { + "epoch": 0.03107043234747978, + "grad_norm": 1.390625, + "learning_rate": 0.001062, + "loss": 4.0474, + "step": 354 + }, + { + "epoch": 0.031158201930382264, + "grad_norm": 1.71875, + "learning_rate": 0.001065, + "loss": 4.085, + "step": 355 + }, + { + "epoch": 0.03124597151328475, + "grad_norm": 1.234375, + "learning_rate": 0.001068, + "loss": 4.0176, + "step": 356 + }, + { + "epoch": 0.03133374109618724, + "grad_norm": 1.3125, + "learning_rate": 0.001071, + "loss": 3.9917, + "step": 357 + }, + { + "epoch": 0.03142151067908972, + "grad_norm": 0.796875, + "learning_rate": 0.001074, + "loss": 4.0342, + "step": 358 + }, + { + "epoch": 0.031509280261992206, + "grad_norm": 1.765625, + "learning_rate": 0.001077, + "loss": 4.0479, + "step": 359 + }, + { + "epoch": 0.03159704984489469, + "grad_norm": 0.8515625, + "learning_rate": 0.00108, + "loss": 4.1104, + "step": 360 + }, + { + "epoch": 0.031684819427797176, + "grad_norm": 1.0625, + "learning_rate": 0.001083, + "loss": 4.0, + "step": 361 + }, + { + "epoch": 0.03177258901069966, + "grad_norm": 0.91796875, + "learning_rate": 0.001086, + "loss": 4.1157, + "step": 362 + }, + { + "epoch": 0.031860358593602145, + "grad_norm": 0.91796875, + "learning_rate": 0.001089, + "loss": 4.0947, + "step": 363 + }, + { + "epoch": 0.03194812817650463, + "grad_norm": 1.1875, + "learning_rate": 0.001092, + "loss": 3.9888, + "step": 364 + }, + { + "epoch": 0.032035897759407114, + "grad_norm": 1.1640625, + "learning_rate": 0.001095, + "loss": 4.0068, + "step": 365 + }, + { + "epoch": 0.0321236673423096, + "grad_norm": 0.80859375, + "learning_rate": 0.001098, + "loss": 4.0063, + "step": 366 + }, + { + "epoch": 0.032211436925212084, + "grad_norm": 0.71484375, + "learning_rate": 0.001101, + "loss": 4.0361, + "step": 367 + }, + { + "epoch": 0.032299206508114575, + "grad_norm": 0.77734375, + "learning_rate": 0.001104, + "loss": 4.042, + "step": 368 + }, + { + "epoch": 0.03238697609101706, + "grad_norm": 1.1484375, + "learning_rate": 0.001107, + "loss": 3.9937, + "step": 369 + }, + { + "epoch": 0.032474745673919544, + "grad_norm": 1.0703125, + "learning_rate": 0.00111, + "loss": 4.0044, + "step": 370 + }, + { + "epoch": 0.03256251525682203, + "grad_norm": 0.82421875, + "learning_rate": 0.001113, + "loss": 3.9927, + "step": 371 + }, + { + "epoch": 0.032650284839724514, + "grad_norm": 0.796875, + "learning_rate": 0.001116, + "loss": 3.9639, + "step": 372 + }, + { + "epoch": 0.032738054422627, + "grad_norm": 1.3671875, + "learning_rate": 0.001119, + "loss": 3.9951, + "step": 373 + }, + { + "epoch": 0.03282582400552948, + "grad_norm": 0.51171875, + "learning_rate": 0.001122, + "loss": 4.0352, + "step": 374 + }, + { + "epoch": 0.03291359358843197, + "grad_norm": 0.953125, + "learning_rate": 0.0011250000000000001, + "loss": 3.9482, + "step": 375 + }, + { + "epoch": 0.03300136317133445, + "grad_norm": 0.96875, + "learning_rate": 0.001128, + "loss": 3.9829, + "step": 376 + }, + { + "epoch": 0.03308913275423694, + "grad_norm": 0.74609375, + "learning_rate": 0.001131, + "loss": 4.0322, + "step": 377 + }, + { + "epoch": 0.03317690233713942, + "grad_norm": 0.7578125, + "learning_rate": 0.001134, + "loss": 3.9609, + "step": 378 + }, + { + "epoch": 0.03326467192004191, + "grad_norm": 0.640625, + "learning_rate": 0.001137, + "loss": 4.0239, + "step": 379 + }, + { + "epoch": 0.0333524415029444, + "grad_norm": 0.6484375, + "learning_rate": 0.00114, + "loss": 3.9888, + "step": 380 + }, + { + "epoch": 0.03344021108584688, + "grad_norm": 0.49609375, + "learning_rate": 0.0011430000000000001, + "loss": 3.9292, + "step": 381 + }, + { + "epoch": 0.03352798066874937, + "grad_norm": 0.5234375, + "learning_rate": 0.001146, + "loss": 3.958, + "step": 382 + }, + { + "epoch": 0.03361575025165185, + "grad_norm": 0.671875, + "learning_rate": 0.001149, + "loss": 3.9438, + "step": 383 + }, + { + "epoch": 0.03370351983455434, + "grad_norm": 0.57421875, + "learning_rate": 0.001152, + "loss": 3.9805, + "step": 384 + }, + { + "epoch": 0.03379128941745682, + "grad_norm": 0.56640625, + "learning_rate": 0.001155, + "loss": 4.0659, + "step": 385 + }, + { + "epoch": 0.033879059000359306, + "grad_norm": 0.86328125, + "learning_rate": 0.001158, + "loss": 3.9604, + "step": 386 + }, + { + "epoch": 0.03396682858326179, + "grad_norm": 1.5703125, + "learning_rate": 0.0011610000000000001, + "loss": 3.9927, + "step": 387 + }, + { + "epoch": 0.034054598166164275, + "grad_norm": 1.0, + "learning_rate": 0.001164, + "loss": 4.0088, + "step": 388 + }, + { + "epoch": 0.03414236774906676, + "grad_norm": 1.3671875, + "learning_rate": 0.001167, + "loss": 3.9712, + "step": 389 + }, + { + "epoch": 0.034230137331969245, + "grad_norm": 1.015625, + "learning_rate": 0.00117, + "loss": 3.9395, + "step": 390 + }, + { + "epoch": 0.034317906914871736, + "grad_norm": 0.97265625, + "learning_rate": 0.001173, + "loss": 3.9497, + "step": 391 + }, + { + "epoch": 0.03440567649777422, + "grad_norm": 1.1953125, + "learning_rate": 0.001176, + "loss": 3.998, + "step": 392 + }, + { + "epoch": 0.034493446080676705, + "grad_norm": 1.3671875, + "learning_rate": 0.0011790000000000001, + "loss": 3.9795, + "step": 393 + }, + { + "epoch": 0.03458121566357919, + "grad_norm": 0.875, + "learning_rate": 0.001182, + "loss": 3.9468, + "step": 394 + }, + { + "epoch": 0.034668985246481675, + "grad_norm": 0.66015625, + "learning_rate": 0.001185, + "loss": 3.9731, + "step": 395 + }, + { + "epoch": 0.03475675482938416, + "grad_norm": 0.80859375, + "learning_rate": 0.001188, + "loss": 3.9263, + "step": 396 + }, + { + "epoch": 0.034844524412286644, + "grad_norm": 0.89453125, + "learning_rate": 0.001191, + "loss": 3.9502, + "step": 397 + }, + { + "epoch": 0.03493229399518913, + "grad_norm": 1.1796875, + "learning_rate": 0.0011940000000000002, + "loss": 3.9434, + "step": 398 + }, + { + "epoch": 0.03502006357809161, + "grad_norm": 2.4375, + "learning_rate": 0.0011970000000000001, + "loss": 4.0049, + "step": 399 + }, + { + "epoch": 0.0351078331609941, + "grad_norm": 1.1953125, + "learning_rate": 0.0012000000000000001, + "loss": 3.9722, + "step": 400 + }, + { + "epoch": 0.03519560274389658, + "grad_norm": 2.59375, + "learning_rate": 0.001203, + "loss": 3.9683, + "step": 401 + }, + { + "epoch": 0.03528337232679907, + "grad_norm": 1.8125, + "learning_rate": 0.001206, + "loss": 3.9512, + "step": 402 + }, + { + "epoch": 0.03537114190970156, + "grad_norm": 3.140625, + "learning_rate": 0.001209, + "loss": 4.0049, + "step": 403 + }, + { + "epoch": 0.035458911492604044, + "grad_norm": 2.03125, + "learning_rate": 0.0012120000000000002, + "loss": 4.0151, + "step": 404 + }, + { + "epoch": 0.03554668107550653, + "grad_norm": 3.0, + "learning_rate": 0.0012150000000000002, + "loss": 4.0083, + "step": 405 + }, + { + "epoch": 0.03563445065840901, + "grad_norm": 1.859375, + "learning_rate": 0.0012180000000000001, + "loss": 4.0024, + "step": 406 + }, + { + "epoch": 0.0357222202413115, + "grad_norm": 3.171875, + "learning_rate": 0.0012209999999999999, + "loss": 4.0562, + "step": 407 + }, + { + "epoch": 0.03580998982421398, + "grad_norm": 2.046875, + "learning_rate": 0.001224, + "loss": 3.9976, + "step": 408 + }, + { + "epoch": 0.03589775940711647, + "grad_norm": 2.234375, + "learning_rate": 0.001227, + "loss": 4.021, + "step": 409 + }, + { + "epoch": 0.03598552899001895, + "grad_norm": 1.6171875, + "learning_rate": 0.00123, + "loss": 4.002, + "step": 410 + }, + { + "epoch": 0.036073298572921436, + "grad_norm": 2.265625, + "learning_rate": 0.001233, + "loss": 4.0156, + "step": 411 + }, + { + "epoch": 0.03616106815582392, + "grad_norm": 1.3984375, + "learning_rate": 0.001236, + "loss": 4.0059, + "step": 412 + }, + { + "epoch": 0.036248837738726405, + "grad_norm": 2.65625, + "learning_rate": 0.0012389999999999999, + "loss": 3.959, + "step": 413 + }, + { + "epoch": 0.0363366073216289, + "grad_norm": 1.78125, + "learning_rate": 0.001242, + "loss": 3.9736, + "step": 414 + }, + { + "epoch": 0.03642437690453138, + "grad_norm": 1.2109375, + "learning_rate": 0.001245, + "loss": 4.0049, + "step": 415 + }, + { + "epoch": 0.036512146487433866, + "grad_norm": 1.1875, + "learning_rate": 0.001248, + "loss": 4.002, + "step": 416 + }, + { + "epoch": 0.03659991607033635, + "grad_norm": 0.77734375, + "learning_rate": 0.001251, + "loss": 3.9365, + "step": 417 + }, + { + "epoch": 0.036687685653238836, + "grad_norm": 1.0703125, + "learning_rate": 0.001254, + "loss": 3.9277, + "step": 418 + }, + { + "epoch": 0.03677545523614132, + "grad_norm": 1.1484375, + "learning_rate": 0.0012569999999999999, + "loss": 3.9312, + "step": 419 + }, + { + "epoch": 0.036863224819043805, + "grad_norm": 1.171875, + "learning_rate": 0.00126, + "loss": 3.98, + "step": 420 + }, + { + "epoch": 0.03695099440194629, + "grad_norm": 0.83984375, + "learning_rate": 0.001263, + "loss": 3.9614, + "step": 421 + }, + { + "epoch": 0.037038763984848774, + "grad_norm": 0.5546875, + "learning_rate": 0.001266, + "loss": 3.9854, + "step": 422 + }, + { + "epoch": 0.03712653356775126, + "grad_norm": 0.70703125, + "learning_rate": 0.001269, + "loss": 3.9487, + "step": 423 + }, + { + "epoch": 0.037214303150653744, + "grad_norm": 0.69921875, + "learning_rate": 0.001272, + "loss": 3.9883, + "step": 424 + }, + { + "epoch": 0.03730207273355623, + "grad_norm": 0.56640625, + "learning_rate": 0.001275, + "loss": 3.9438, + "step": 425 + }, + { + "epoch": 0.03738984231645872, + "grad_norm": 0.5, + "learning_rate": 0.001278, + "loss": 3.9678, + "step": 426 + }, + { + "epoch": 0.037477611899361205, + "grad_norm": 0.455078125, + "learning_rate": 0.001281, + "loss": 3.9448, + "step": 427 + }, + { + "epoch": 0.03756538148226369, + "grad_norm": 0.4921875, + "learning_rate": 0.001284, + "loss": 3.9224, + "step": 428 + }, + { + "epoch": 0.037653151065166174, + "grad_norm": 0.52734375, + "learning_rate": 0.001287, + "loss": 3.874, + "step": 429 + }, + { + "epoch": 0.03774092064806866, + "grad_norm": 0.5703125, + "learning_rate": 0.00129, + "loss": 3.9141, + "step": 430 + }, + { + "epoch": 0.03782869023097114, + "grad_norm": 0.51171875, + "learning_rate": 0.001293, + "loss": 3.9487, + "step": 431 + }, + { + "epoch": 0.03791645981387363, + "grad_norm": 0.6015625, + "learning_rate": 0.001296, + "loss": 3.9058, + "step": 432 + }, + { + "epoch": 0.03800422939677611, + "grad_norm": 0.703125, + "learning_rate": 0.001299, + "loss": 4.0063, + "step": 433 + }, + { + "epoch": 0.0380919989796786, + "grad_norm": 0.67578125, + "learning_rate": 0.001302, + "loss": 3.9141, + "step": 434 + }, + { + "epoch": 0.03817976856258108, + "grad_norm": 0.63671875, + "learning_rate": 0.001305, + "loss": 3.8501, + "step": 435 + }, + { + "epoch": 0.038267538145483566, + "grad_norm": 0.61328125, + "learning_rate": 0.001308, + "loss": 3.8936, + "step": 436 + }, + { + "epoch": 0.03835530772838605, + "grad_norm": 0.5390625, + "learning_rate": 0.001311, + "loss": 3.9009, + "step": 437 + }, + { + "epoch": 0.03844307731128854, + "grad_norm": 0.349609375, + "learning_rate": 0.001314, + "loss": 3.8857, + "step": 438 + }, + { + "epoch": 0.03853084689419103, + "grad_norm": 0.58984375, + "learning_rate": 0.001317, + "loss": 3.8975, + "step": 439 + }, + { + "epoch": 0.03861861647709351, + "grad_norm": 0.67578125, + "learning_rate": 0.00132, + "loss": 3.9072, + "step": 440 + }, + { + "epoch": 0.038706386059996, + "grad_norm": 0.80078125, + "learning_rate": 0.001323, + "loss": 3.9507, + "step": 441 + }, + { + "epoch": 0.03879415564289848, + "grad_norm": 0.859375, + "learning_rate": 0.0013260000000000001, + "loss": 3.9082, + "step": 442 + }, + { + "epoch": 0.038881925225800966, + "grad_norm": 0.82421875, + "learning_rate": 0.001329, + "loss": 3.8965, + "step": 443 + }, + { + "epoch": 0.03896969480870345, + "grad_norm": 1.1875, + "learning_rate": 0.001332, + "loss": 3.9629, + "step": 444 + }, + { + "epoch": 0.039057464391605935, + "grad_norm": 0.6953125, + "learning_rate": 0.001335, + "loss": 4.0127, + "step": 445 + }, + { + "epoch": 0.03914523397450842, + "grad_norm": 1.078125, + "learning_rate": 0.001338, + "loss": 3.9023, + "step": 446 + }, + { + "epoch": 0.039233003557410905, + "grad_norm": 2.140625, + "learning_rate": 0.001341, + "loss": 3.9448, + "step": 447 + }, + { + "epoch": 0.03932077314031339, + "grad_norm": 0.9375, + "learning_rate": 0.0013440000000000001, + "loss": 3.9077, + "step": 448 + }, + { + "epoch": 0.03940854272321588, + "grad_norm": 2.890625, + "learning_rate": 0.001347, + "loss": 4.0171, + "step": 449 + }, + { + "epoch": 0.039496312306118365, + "grad_norm": 1.9765625, + "learning_rate": 0.00135, + "loss": 3.9956, + "step": 450 + }, + { + "epoch": 0.03958408188902085, + "grad_norm": 2.625, + "learning_rate": 0.001353, + "loss": 3.8921, + "step": 451 + }, + { + "epoch": 0.039671851471923335, + "grad_norm": 1.2734375, + "learning_rate": 0.001356, + "loss": 3.9917, + "step": 452 + }, + { + "epoch": 0.03975962105482582, + "grad_norm": 1.875, + "learning_rate": 0.001359, + "loss": 3.9468, + "step": 453 + }, + { + "epoch": 0.039847390637728304, + "grad_norm": 1.203125, + "learning_rate": 0.0013620000000000001, + "loss": 3.8755, + "step": 454 + }, + { + "epoch": 0.03993516022063079, + "grad_norm": 2.796875, + "learning_rate": 0.0013650000000000001, + "loss": 3.9326, + "step": 455 + }, + { + "epoch": 0.04002292980353327, + "grad_norm": 1.984375, + "learning_rate": 0.001368, + "loss": 3.918, + "step": 456 + }, + { + "epoch": 0.04011069938643576, + "grad_norm": 1.3828125, + "learning_rate": 0.001371, + "loss": 3.8735, + "step": 457 + }, + { + "epoch": 0.04019846896933824, + "grad_norm": 0.83203125, + "learning_rate": 0.001374, + "loss": 3.9316, + "step": 458 + }, + { + "epoch": 0.04028623855224073, + "grad_norm": 1.2578125, + "learning_rate": 0.0013770000000000002, + "loss": 3.9751, + "step": 459 + }, + { + "epoch": 0.04037400813514321, + "grad_norm": 0.90625, + "learning_rate": 0.0013800000000000002, + "loss": 3.9214, + "step": 460 + }, + { + "epoch": 0.040461777718045704, + "grad_norm": 0.8984375, + "learning_rate": 0.0013830000000000001, + "loss": 3.9629, + "step": 461 + }, + { + "epoch": 0.04054954730094819, + "grad_norm": 0.98828125, + "learning_rate": 0.001386, + "loss": 3.9033, + "step": 462 + }, + { + "epoch": 0.04063731688385067, + "grad_norm": 1.6328125, + "learning_rate": 0.001389, + "loss": 3.9839, + "step": 463 + }, + { + "epoch": 0.04072508646675316, + "grad_norm": 0.83984375, + "learning_rate": 0.001392, + "loss": 3.937, + "step": 464 + }, + { + "epoch": 0.04081285604965564, + "grad_norm": 1.265625, + "learning_rate": 0.0013950000000000002, + "loss": 3.9165, + "step": 465 + }, + { + "epoch": 0.04090062563255813, + "grad_norm": 1.296875, + "learning_rate": 0.0013980000000000002, + "loss": 3.9062, + "step": 466 + }, + { + "epoch": 0.04098839521546061, + "grad_norm": 0.91796875, + "learning_rate": 0.0014010000000000001, + "loss": 3.9053, + "step": 467 + }, + { + "epoch": 0.041076164798363096, + "grad_norm": 0.7109375, + "learning_rate": 0.001404, + "loss": 3.9014, + "step": 468 + }, + { + "epoch": 0.04116393438126558, + "grad_norm": 0.72265625, + "learning_rate": 0.001407, + "loss": 3.9634, + "step": 469 + }, + { + "epoch": 0.041251703964168066, + "grad_norm": 0.83203125, + "learning_rate": 0.00141, + "loss": 3.9297, + "step": 470 + }, + { + "epoch": 0.04133947354707055, + "grad_norm": 0.53515625, + "learning_rate": 0.001413, + "loss": 3.9546, + "step": 471 + }, + { + "epoch": 0.041427243129973035, + "grad_norm": 0.466796875, + "learning_rate": 0.001416, + "loss": 3.9058, + "step": 472 + }, + { + "epoch": 0.041515012712875526, + "grad_norm": 0.58203125, + "learning_rate": 0.001419, + "loss": 3.8677, + "step": 473 + }, + { + "epoch": 0.04160278229577801, + "grad_norm": 0.59765625, + "learning_rate": 0.0014219999999999999, + "loss": 3.9424, + "step": 474 + }, + { + "epoch": 0.041690551878680496, + "grad_norm": 0.59375, + "learning_rate": 0.001425, + "loss": 3.9219, + "step": 475 + }, + { + "epoch": 0.04177832146158298, + "grad_norm": 1.125, + "learning_rate": 0.001428, + "loss": 3.8311, + "step": 476 + }, + { + "epoch": 0.041866091044485465, + "grad_norm": 0.52734375, + "learning_rate": 0.001431, + "loss": 3.8345, + "step": 477 + }, + { + "epoch": 0.04195386062738795, + "grad_norm": 0.5625, + "learning_rate": 0.001434, + "loss": 3.8394, + "step": 478 + }, + { + "epoch": 0.042041630210290434, + "grad_norm": 0.58984375, + "learning_rate": 0.001437, + "loss": 3.9087, + "step": 479 + }, + { + "epoch": 0.04212939979319292, + "grad_norm": 0.62109375, + "learning_rate": 0.0014399999999999999, + "loss": 3.8911, + "step": 480 + }, + { + "epoch": 0.042217169376095404, + "grad_norm": 0.80078125, + "learning_rate": 0.001443, + "loss": 3.9209, + "step": 481 + }, + { + "epoch": 0.04230493895899789, + "grad_norm": 1.140625, + "learning_rate": 0.001446, + "loss": 3.9258, + "step": 482 + }, + { + "epoch": 0.04239270854190037, + "grad_norm": 0.7265625, + "learning_rate": 0.001449, + "loss": 3.9434, + "step": 483 + }, + { + "epoch": 0.042480478124802865, + "grad_norm": 0.75390625, + "learning_rate": 0.001452, + "loss": 3.9185, + "step": 484 + }, + { + "epoch": 0.04256824770770535, + "grad_norm": 0.9765625, + "learning_rate": 0.001455, + "loss": 3.957, + "step": 485 + }, + { + "epoch": 0.042656017290607834, + "grad_norm": 2.453125, + "learning_rate": 0.001458, + "loss": 3.9399, + "step": 486 + }, + { + "epoch": 0.04274378687351032, + "grad_norm": 1.765625, + "learning_rate": 0.001461, + "loss": 3.9497, + "step": 487 + }, + { + "epoch": 0.0428315564564128, + "grad_norm": 2.5625, + "learning_rate": 0.001464, + "loss": 3.8267, + "step": 488 + }, + { + "epoch": 0.04291932603931529, + "grad_norm": 2.25, + "learning_rate": 0.001467, + "loss": 3.918, + "step": 489 + }, + { + "epoch": 0.04300709562221777, + "grad_norm": 0.9609375, + "learning_rate": 0.00147, + "loss": 3.8647, + "step": 490 + }, + { + "epoch": 0.04309486520512026, + "grad_norm": 1.546875, + "learning_rate": 0.001473, + "loss": 3.9663, + "step": 491 + }, + { + "epoch": 0.04318263478802274, + "grad_norm": 1.1484375, + "learning_rate": 0.001476, + "loss": 3.9067, + "step": 492 + }, + { + "epoch": 0.043270404370925226, + "grad_norm": 1.40625, + "learning_rate": 0.001479, + "loss": 3.8442, + "step": 493 + }, + { + "epoch": 0.04335817395382771, + "grad_norm": 1.0546875, + "learning_rate": 0.001482, + "loss": 3.9346, + "step": 494 + }, + { + "epoch": 0.043445943536730196, + "grad_norm": 0.796875, + "learning_rate": 0.001485, + "loss": 3.9224, + "step": 495 + }, + { + "epoch": 0.04353371311963269, + "grad_norm": 0.625, + "learning_rate": 0.001488, + "loss": 3.9116, + "step": 496 + }, + { + "epoch": 0.04362148270253517, + "grad_norm": 0.828125, + "learning_rate": 0.001491, + "loss": 3.8931, + "step": 497 + }, + { + "epoch": 0.04370925228543766, + "grad_norm": 1.0703125, + "learning_rate": 0.001494, + "loss": 3.9458, + "step": 498 + }, + { + "epoch": 0.04379702186834014, + "grad_norm": 1.1484375, + "learning_rate": 0.001497, + "loss": 3.8657, + "step": 499 + }, + { + "epoch": 0.043884791451242626, + "grad_norm": 1.25, + "learning_rate": 0.0015, + "loss": 3.937, + "step": 500 + }, + { + "epoch": 0.04397256103414511, + "grad_norm": 1.421875, + "learning_rate": 0.001503, + "loss": 3.8296, + "step": 501 + }, + { + "epoch": 0.044060330617047595, + "grad_norm": 0.5234375, + "learning_rate": 0.001506, + "loss": 3.8706, + "step": 502 + }, + { + "epoch": 0.04414810019995008, + "grad_norm": 1.7734375, + "learning_rate": 0.0015090000000000001, + "loss": 3.8838, + "step": 503 + }, + { + "epoch": 0.044235869782852565, + "grad_norm": 0.79296875, + "learning_rate": 0.001512, + "loss": 3.8638, + "step": 504 + }, + { + "epoch": 0.04432363936575505, + "grad_norm": 1.234375, + "learning_rate": 0.001515, + "loss": 3.936, + "step": 505 + }, + { + "epoch": 0.044411408948657534, + "grad_norm": 1.1953125, + "learning_rate": 0.001518, + "loss": 3.8638, + "step": 506 + }, + { + "epoch": 0.044499178531560026, + "grad_norm": 1.15625, + "learning_rate": 0.001521, + "loss": 3.9365, + "step": 507 + }, + { + "epoch": 0.04458694811446251, + "grad_norm": 0.7734375, + "learning_rate": 0.001524, + "loss": 3.8721, + "step": 508 + }, + { + "epoch": 0.044674717697364995, + "grad_norm": 0.9609375, + "learning_rate": 0.0015270000000000001, + "loss": 3.8794, + "step": 509 + }, + { + "epoch": 0.04476248728026748, + "grad_norm": 1.0703125, + "learning_rate": 0.0015300000000000001, + "loss": 3.8472, + "step": 510 + }, + { + "epoch": 0.044850256863169964, + "grad_norm": 0.5703125, + "learning_rate": 0.001533, + "loss": 3.8096, + "step": 511 + }, + { + "epoch": 0.04493802644607245, + "grad_norm": 0.921875, + "learning_rate": 0.001536, + "loss": 3.877, + "step": 512 + }, + { + "epoch": 0.04502579602897493, + "grad_norm": 0.75390625, + "learning_rate": 0.001539, + "loss": 3.8677, + "step": 513 + }, + { + "epoch": 0.04511356561187742, + "grad_norm": 0.7890625, + "learning_rate": 0.001542, + "loss": 3.8823, + "step": 514 + }, + { + "epoch": 0.0452013351947799, + "grad_norm": 0.74609375, + "learning_rate": 0.0015450000000000001, + "loss": 3.8525, + "step": 515 + }, + { + "epoch": 0.04528910477768239, + "grad_norm": 0.94140625, + "learning_rate": 0.0015480000000000001, + "loss": 3.957, + "step": 516 + }, + { + "epoch": 0.04537687436058487, + "grad_norm": 1.0078125, + "learning_rate": 0.001551, + "loss": 3.8887, + "step": 517 + }, + { + "epoch": 0.04546464394348736, + "grad_norm": 4.5625, + "learning_rate": 0.001554, + "loss": 3.8979, + "step": 518 + }, + { + "epoch": 0.04555241352638985, + "grad_norm": 1.8125, + "learning_rate": 0.001557, + "loss": 3.8882, + "step": 519 + }, + { + "epoch": 0.04564018310929233, + "grad_norm": 0.8828125, + "learning_rate": 0.0015600000000000002, + "loss": 3.8604, + "step": 520 + }, + { + "epoch": 0.04572795269219482, + "grad_norm": 0.921875, + "learning_rate": 0.0015630000000000002, + "loss": 3.8828, + "step": 521 + }, + { + "epoch": 0.0458157222750973, + "grad_norm": 1.1015625, + "learning_rate": 0.0015660000000000001, + "loss": 3.9438, + "step": 522 + }, + { + "epoch": 0.04590349185799979, + "grad_norm": 1.0546875, + "learning_rate": 0.001569, + "loss": 3.873, + "step": 523 + }, + { + "epoch": 0.04599126144090227, + "grad_norm": 0.73828125, + "learning_rate": 0.001572, + "loss": 3.834, + "step": 524 + }, + { + "epoch": 0.046079031023804756, + "grad_norm": 0.66796875, + "learning_rate": 0.001575, + "loss": 3.9067, + "step": 525 + }, + { + "epoch": 0.04616680060670724, + "grad_norm": 0.80078125, + "learning_rate": 0.0015780000000000002, + "loss": 3.8608, + "step": 526 + }, + { + "epoch": 0.046254570189609726, + "grad_norm": 0.44140625, + "learning_rate": 0.0015810000000000002, + "loss": 3.8252, + "step": 527 + }, + { + "epoch": 0.04634233977251221, + "grad_norm": 0.4765625, + "learning_rate": 0.0015840000000000001, + "loss": 3.8516, + "step": 528 + }, + { + "epoch": 0.046430109355414695, + "grad_norm": 0.41796875, + "learning_rate": 0.001587, + "loss": 3.8423, + "step": 529 + }, + { + "epoch": 0.04651787893831718, + "grad_norm": 0.3671875, + "learning_rate": 0.00159, + "loss": 3.9004, + "step": 530 + }, + { + "epoch": 0.04660564852121967, + "grad_norm": 0.41015625, + "learning_rate": 0.001593, + "loss": 3.9033, + "step": 531 + }, + { + "epoch": 0.046693418104122156, + "grad_norm": 0.44140625, + "learning_rate": 0.0015960000000000002, + "loss": 3.8877, + "step": 532 + }, + { + "epoch": 0.04678118768702464, + "grad_norm": 0.4609375, + "learning_rate": 0.0015990000000000002, + "loss": 3.833, + "step": 533 + }, + { + "epoch": 0.046868957269927125, + "grad_norm": 0.462890625, + "learning_rate": 0.0016020000000000001, + "loss": 3.8491, + "step": 534 + }, + { + "epoch": 0.04695672685282961, + "grad_norm": 0.41796875, + "learning_rate": 0.001605, + "loss": 3.7886, + "step": 535 + }, + { + "epoch": 0.047044496435732094, + "grad_norm": 0.498046875, + "learning_rate": 0.001608, + "loss": 3.8159, + "step": 536 + }, + { + "epoch": 0.04713226601863458, + "grad_norm": 0.55078125, + "learning_rate": 0.0016110000000000002, + "loss": 3.8486, + "step": 537 + }, + { + "epoch": 0.047220035601537064, + "grad_norm": 0.89453125, + "learning_rate": 0.0016140000000000002, + "loss": 3.8882, + "step": 538 + }, + { + "epoch": 0.04730780518443955, + "grad_norm": 0.5390625, + "learning_rate": 0.0016170000000000002, + "loss": 3.8584, + "step": 539 + }, + { + "epoch": 0.04739557476734203, + "grad_norm": 0.89453125, + "learning_rate": 0.0016200000000000001, + "loss": 3.8384, + "step": 540 + }, + { + "epoch": 0.04748334435024452, + "grad_norm": 1.7578125, + "learning_rate": 0.001623, + "loss": 3.9512, + "step": 541 + }, + { + "epoch": 0.04757111393314701, + "grad_norm": 1.6015625, + "learning_rate": 0.001626, + "loss": 3.9414, + "step": 542 + }, + { + "epoch": 0.047658883516049494, + "grad_norm": 0.83203125, + "learning_rate": 0.0016290000000000002, + "loss": 3.8887, + "step": 543 + }, + { + "epoch": 0.04774665309895198, + "grad_norm": 1.171875, + "learning_rate": 0.0016320000000000002, + "loss": 3.9502, + "step": 544 + }, + { + "epoch": 0.04783442268185446, + "grad_norm": 1.6015625, + "learning_rate": 0.0016350000000000002, + "loss": 3.8711, + "step": 545 + }, + { + "epoch": 0.04792219226475695, + "grad_norm": 1.2734375, + "learning_rate": 0.0016380000000000001, + "loss": 3.9072, + "step": 546 + }, + { + "epoch": 0.04800996184765943, + "grad_norm": 0.98046875, + "learning_rate": 0.001641, + "loss": 3.9033, + "step": 547 + }, + { + "epoch": 0.04809773143056192, + "grad_norm": 1.2734375, + "learning_rate": 0.001644, + "loss": 3.8638, + "step": 548 + }, + { + "epoch": 0.0481855010134644, + "grad_norm": 0.7265625, + "learning_rate": 0.0016470000000000002, + "loss": 3.8281, + "step": 549 + }, + { + "epoch": 0.048273270596366886, + "grad_norm": 1.265625, + "learning_rate": 0.0016500000000000002, + "loss": 3.8472, + "step": 550 + }, + { + "epoch": 0.04836104017926937, + "grad_norm": 0.83984375, + "learning_rate": 0.0016530000000000002, + "loss": 3.8496, + "step": 551 + }, + { + "epoch": 0.048448809762171856, + "grad_norm": 0.6484375, + "learning_rate": 0.0016560000000000001, + "loss": 3.9116, + "step": 552 + }, + { + "epoch": 0.04853657934507434, + "grad_norm": 0.41796875, + "learning_rate": 0.001659, + "loss": 3.8252, + "step": 553 + }, + { + "epoch": 0.04862434892797683, + "grad_norm": 0.578125, + "learning_rate": 0.0016620000000000003, + "loss": 3.79, + "step": 554 + }, + { + "epoch": 0.04871211851087932, + "grad_norm": 0.5390625, + "learning_rate": 0.0016650000000000002, + "loss": 3.8506, + "step": 555 + }, + { + "epoch": 0.0487998880937818, + "grad_norm": 0.515625, + "learning_rate": 0.0016680000000000002, + "loss": 3.8159, + "step": 556 + }, + { + "epoch": 0.048887657676684286, + "grad_norm": 0.5234375, + "learning_rate": 0.0016710000000000002, + "loss": 3.811, + "step": 557 + }, + { + "epoch": 0.04897542725958677, + "grad_norm": 0.44140625, + "learning_rate": 0.0016740000000000001, + "loss": 3.877, + "step": 558 + }, + { + "epoch": 0.049063196842489255, + "grad_norm": 0.6171875, + "learning_rate": 0.001677, + "loss": 3.873, + "step": 559 + }, + { + "epoch": 0.04915096642539174, + "grad_norm": 0.8046875, + "learning_rate": 0.0016800000000000003, + "loss": 3.8652, + "step": 560 + }, + { + "epoch": 0.049238736008294225, + "grad_norm": 0.482421875, + "learning_rate": 0.0016830000000000003, + "loss": 3.8296, + "step": 561 + }, + { + "epoch": 0.04932650559119671, + "grad_norm": 0.46484375, + "learning_rate": 0.0016860000000000002, + "loss": 3.9292, + "step": 562 + }, + { + "epoch": 0.049414275174099194, + "grad_norm": 0.46484375, + "learning_rate": 0.001689, + "loss": 3.7979, + "step": 563 + }, + { + "epoch": 0.04950204475700168, + "grad_norm": 0.84375, + "learning_rate": 0.001692, + "loss": 3.8418, + "step": 564 + }, + { + "epoch": 0.04958981433990416, + "grad_norm": 2.84375, + "learning_rate": 0.001695, + "loss": 3.8462, + "step": 565 + }, + { + "epoch": 0.049677583922806655, + "grad_norm": 0.80859375, + "learning_rate": 0.0016979999999999999, + "loss": 3.8521, + "step": 566 + }, + { + "epoch": 0.04976535350570914, + "grad_norm": 1.7265625, + "learning_rate": 0.0017009999999999998, + "loss": 3.8589, + "step": 567 + }, + { + "epoch": 0.049853123088611624, + "grad_norm": 1.0234375, + "learning_rate": 0.0017039999999999998, + "loss": 3.7671, + "step": 568 + }, + { + "epoch": 0.04994089267151411, + "grad_norm": 1.2890625, + "learning_rate": 0.001707, + "loss": 3.8135, + "step": 569 + }, + { + "epoch": 0.050028662254416593, + "grad_norm": 1.40625, + "learning_rate": 0.00171, + "loss": 3.8589, + "step": 570 + }, + { + "epoch": 0.05011643183731908, + "grad_norm": 0.71484375, + "learning_rate": 0.001713, + "loss": 3.8003, + "step": 571 + }, + { + "epoch": 0.05020420142022156, + "grad_norm": 1.09375, + "learning_rate": 0.0017159999999999999, + "loss": 3.8389, + "step": 572 + }, + { + "epoch": 0.05029197100312405, + "grad_norm": 1.2421875, + "learning_rate": 0.0017189999999999998, + "loss": 3.7534, + "step": 573 + }, + { + "epoch": 0.05037974058602653, + "grad_norm": 0.5234375, + "learning_rate": 0.001722, + "loss": 3.7896, + "step": 574 + }, + { + "epoch": 0.05046751016892902, + "grad_norm": 0.8828125, + "learning_rate": 0.001725, + "loss": 3.8091, + "step": 575 + }, + { + "epoch": 0.0505552797518315, + "grad_norm": 1.015625, + "learning_rate": 0.001728, + "loss": 3.8789, + "step": 576 + }, + { + "epoch": 0.05064304933473399, + "grad_norm": 0.55859375, + "learning_rate": 0.001731, + "loss": 3.7671, + "step": 577 + }, + { + "epoch": 0.05073081891763648, + "grad_norm": 0.58203125, + "learning_rate": 0.0017339999999999999, + "loss": 3.8643, + "step": 578 + }, + { + "epoch": 0.05081858850053896, + "grad_norm": 2.609375, + "learning_rate": 0.0017369999999999998, + "loss": 3.8315, + "step": 579 + }, + { + "epoch": 0.05090635808344145, + "grad_norm": 0.9296875, + "learning_rate": 0.00174, + "loss": 3.7935, + "step": 580 + }, + { + "epoch": 0.05099412766634393, + "grad_norm": 0.57421875, + "learning_rate": 0.001743, + "loss": 3.7778, + "step": 581 + }, + { + "epoch": 0.051081897249246416, + "grad_norm": 0.62890625, + "learning_rate": 0.001746, + "loss": 3.8335, + "step": 582 + }, + { + "epoch": 0.0511696668321489, + "grad_norm": 0.58203125, + "learning_rate": 0.001749, + "loss": 3.7993, + "step": 583 + }, + { + "epoch": 0.051257436415051386, + "grad_norm": 0.65234375, + "learning_rate": 0.0017519999999999999, + "loss": 3.8672, + "step": 584 + }, + { + "epoch": 0.05134520599795387, + "grad_norm": 1.984375, + "learning_rate": 0.0017549999999999998, + "loss": 3.8003, + "step": 585 + }, + { + "epoch": 0.051432975580856355, + "grad_norm": 0.6796875, + "learning_rate": 0.001758, + "loss": 3.7607, + "step": 586 + }, + { + "epoch": 0.05152074516375884, + "grad_norm": 0.91015625, + "learning_rate": 0.001761, + "loss": 3.8931, + "step": 587 + }, + { + "epoch": 0.051608514746661324, + "grad_norm": 2.65625, + "learning_rate": 0.001764, + "loss": 3.7676, + "step": 588 + }, + { + "epoch": 0.051696284329563816, + "grad_norm": 2.5625, + "learning_rate": 0.001767, + "loss": 3.8403, + "step": 589 + }, + { + "epoch": 0.0517840539124663, + "grad_norm": 1.390625, + "learning_rate": 0.0017699999999999999, + "loss": 3.8208, + "step": 590 + }, + { + "epoch": 0.051871823495368785, + "grad_norm": 0.87109375, + "learning_rate": 0.001773, + "loss": 3.8799, + "step": 591 + }, + { + "epoch": 0.05195959307827127, + "grad_norm": 1.15625, + "learning_rate": 0.001776, + "loss": 3.8438, + "step": 592 + }, + { + "epoch": 0.052047362661173754, + "grad_norm": 1.2578125, + "learning_rate": 0.001779, + "loss": 3.8594, + "step": 593 + }, + { + "epoch": 0.05213513224407624, + "grad_norm": 1.0625, + "learning_rate": 0.001782, + "loss": 3.8071, + "step": 594 + }, + { + "epoch": 0.052222901826978724, + "grad_norm": 0.8671875, + "learning_rate": 0.001785, + "loss": 3.8213, + "step": 595 + }, + { + "epoch": 0.05231067140988121, + "grad_norm": 0.66796875, + "learning_rate": 0.0017879999999999999, + "loss": 3.812, + "step": 596 + }, + { + "epoch": 0.05239844099278369, + "grad_norm": 0.57421875, + "learning_rate": 0.001791, + "loss": 3.8872, + "step": 597 + }, + { + "epoch": 0.05248621057568618, + "grad_norm": 0.73828125, + "learning_rate": 0.001794, + "loss": 3.894, + "step": 598 + }, + { + "epoch": 0.05257398015858866, + "grad_norm": 0.58984375, + "learning_rate": 0.001797, + "loss": 3.8413, + "step": 599 + }, + { + "epoch": 0.05266174974149115, + "grad_norm": 0.60546875, + "learning_rate": 0.0018, + "loss": 3.9023, + "step": 600 + }, + { + "epoch": 0.05274951932439364, + "grad_norm": 6.1875, + "learning_rate": 0.001803, + "loss": 3.8579, + "step": 601 + }, + { + "epoch": 0.05283728890729612, + "grad_norm": 1.03125, + "learning_rate": 0.0018059999999999999, + "loss": 3.7905, + "step": 602 + }, + { + "epoch": 0.05292505849019861, + "grad_norm": 0.78515625, + "learning_rate": 0.001809, + "loss": 3.8398, + "step": 603 + }, + { + "epoch": 0.05301282807310109, + "grad_norm": 0.6875, + "learning_rate": 0.001812, + "loss": 3.7539, + "step": 604 + }, + { + "epoch": 0.05310059765600358, + "grad_norm": 0.56640625, + "learning_rate": 0.001815, + "loss": 3.8086, + "step": 605 + }, + { + "epoch": 0.05318836723890606, + "grad_norm": 0.55078125, + "learning_rate": 0.001818, + "loss": 3.7642, + "step": 606 + }, + { + "epoch": 0.05327613682180855, + "grad_norm": 0.703125, + "learning_rate": 0.001821, + "loss": 3.7954, + "step": 607 + }, + { + "epoch": 0.05336390640471103, + "grad_norm": 0.4765625, + "learning_rate": 0.001824, + "loss": 3.7739, + "step": 608 + }, + { + "epoch": 0.053451675987613516, + "grad_norm": 0.578125, + "learning_rate": 0.001827, + "loss": 3.8013, + "step": 609 + }, + { + "epoch": 0.053539445570516, + "grad_norm": 0.53515625, + "learning_rate": 0.00183, + "loss": 3.7886, + "step": 610 + }, + { + "epoch": 0.053627215153418485, + "grad_norm": 0.79296875, + "learning_rate": 0.001833, + "loss": 3.7124, + "step": 611 + }, + { + "epoch": 0.05371498473632098, + "grad_norm": 0.75390625, + "learning_rate": 0.001836, + "loss": 3.8032, + "step": 612 + }, + { + "epoch": 0.05380275431922346, + "grad_norm": 0.80859375, + "learning_rate": 0.001839, + "loss": 3.7935, + "step": 613 + }, + { + "epoch": 0.053890523902125946, + "grad_norm": 1.2421875, + "learning_rate": 0.001842, + "loss": 3.73, + "step": 614 + }, + { + "epoch": 0.05397829348502843, + "grad_norm": 1.4453125, + "learning_rate": 0.001845, + "loss": 3.8667, + "step": 615 + }, + { + "epoch": 0.054066063067930915, + "grad_norm": 0.7890625, + "learning_rate": 0.001848, + "loss": 3.8242, + "step": 616 + }, + { + "epoch": 0.0541538326508334, + "grad_norm": 0.5625, + "learning_rate": 0.001851, + "loss": 3.8115, + "step": 617 + }, + { + "epoch": 0.054241602233735885, + "grad_norm": 0.65625, + "learning_rate": 0.001854, + "loss": 3.7793, + "step": 618 + }, + { + "epoch": 0.05432937181663837, + "grad_norm": 0.765625, + "learning_rate": 0.001857, + "loss": 3.7954, + "step": 619 + }, + { + "epoch": 0.054417141399540854, + "grad_norm": 0.65234375, + "learning_rate": 0.00186, + "loss": 3.8774, + "step": 620 + }, + { + "epoch": 0.05450491098244334, + "grad_norm": 0.439453125, + "learning_rate": 0.001863, + "loss": 3.8413, + "step": 621 + }, + { + "epoch": 0.05459268056534582, + "grad_norm": 0.7109375, + "learning_rate": 0.001866, + "loss": 3.8218, + "step": 622 + }, + { + "epoch": 0.05468045014824831, + "grad_norm": 0.74609375, + "learning_rate": 0.001869, + "loss": 3.7842, + "step": 623 + }, + { + "epoch": 0.0547682197311508, + "grad_norm": 0.703125, + "learning_rate": 0.001872, + "loss": 3.8037, + "step": 624 + }, + { + "epoch": 0.054855989314053284, + "grad_norm": 0.365234375, + "learning_rate": 0.001875, + "loss": 3.7056, + "step": 625 + }, + { + "epoch": 0.05494375889695577, + "grad_norm": 0.478515625, + "learning_rate": 0.0018780000000000001, + "loss": 3.8032, + "step": 626 + }, + { + "epoch": 0.055031528479858254, + "grad_norm": 0.484375, + "learning_rate": 0.001881, + "loss": 3.8687, + "step": 627 + }, + { + "epoch": 0.05511929806276074, + "grad_norm": 0.345703125, + "learning_rate": 0.001884, + "loss": 3.7104, + "step": 628 + }, + { + "epoch": 0.05520706764566322, + "grad_norm": 0.36328125, + "learning_rate": 0.001887, + "loss": 3.7793, + "step": 629 + }, + { + "epoch": 0.05529483722856571, + "grad_norm": 0.31640625, + "learning_rate": 0.00189, + "loss": 3.8047, + "step": 630 + }, + { + "epoch": 0.05538260681146819, + "grad_norm": 0.291015625, + "learning_rate": 0.0018930000000000002, + "loss": 3.791, + "step": 631 + }, + { + "epoch": 0.05547037639437068, + "grad_norm": 0.373046875, + "learning_rate": 0.0018960000000000001, + "loss": 3.7681, + "step": 632 + }, + { + "epoch": 0.05555814597727316, + "grad_norm": 0.341796875, + "learning_rate": 0.001899, + "loss": 3.7451, + "step": 633 + }, + { + "epoch": 0.055645915560175646, + "grad_norm": 0.330078125, + "learning_rate": 0.001902, + "loss": 3.772, + "step": 634 + }, + { + "epoch": 0.05573368514307813, + "grad_norm": 0.3203125, + "learning_rate": 0.001905, + "loss": 3.8345, + "step": 635 + }, + { + "epoch": 0.05582145472598062, + "grad_norm": 0.51171875, + "learning_rate": 0.001908, + "loss": 3.77, + "step": 636 + }, + { + "epoch": 0.05590922430888311, + "grad_norm": 0.6640625, + "learning_rate": 0.0019110000000000002, + "loss": 3.8013, + "step": 637 + }, + { + "epoch": 0.05599699389178559, + "grad_norm": 0.65234375, + "learning_rate": 0.0019140000000000001, + "loss": 3.7749, + "step": 638 + }, + { + "epoch": 0.056084763474688076, + "grad_norm": 0.671875, + "learning_rate": 0.001917, + "loss": 3.8413, + "step": 639 + }, + { + "epoch": 0.05617253305759056, + "grad_norm": 0.703125, + "learning_rate": 0.00192, + "loss": 3.7783, + "step": 640 + }, + { + "epoch": 0.056260302640493046, + "grad_norm": 0.6171875, + "learning_rate": 0.001923, + "loss": 3.7515, + "step": 641 + }, + { + "epoch": 0.05634807222339553, + "grad_norm": 0.6953125, + "learning_rate": 0.001926, + "loss": 3.8101, + "step": 642 + }, + { + "epoch": 0.056435841806298015, + "grad_norm": 1.5390625, + "learning_rate": 0.0019290000000000002, + "loss": 3.8691, + "step": 643 + }, + { + "epoch": 0.0565236113892005, + "grad_norm": 1.40625, + "learning_rate": 0.0019320000000000001, + "loss": 3.8071, + "step": 644 + }, + { + "epoch": 0.056611380972102984, + "grad_norm": 0.7421875, + "learning_rate": 0.001935, + "loss": 3.8032, + "step": 645 + }, + { + "epoch": 0.05669915055500547, + "grad_norm": 0.796875, + "learning_rate": 0.001938, + "loss": 3.7886, + "step": 646 + }, + { + "epoch": 0.05678692013790796, + "grad_norm": 1.53125, + "learning_rate": 0.001941, + "loss": 3.7485, + "step": 647 + }, + { + "epoch": 0.056874689720810445, + "grad_norm": 0.75, + "learning_rate": 0.0019440000000000002, + "loss": 3.7437, + "step": 648 + }, + { + "epoch": 0.05696245930371293, + "grad_norm": 0.671875, + "learning_rate": 0.0019470000000000002, + "loss": 3.8135, + "step": 649 + }, + { + "epoch": 0.057050228886615414, + "grad_norm": 0.62109375, + "learning_rate": 0.0019500000000000001, + "loss": 3.8784, + "step": 650 + }, + { + "epoch": 0.0571379984695179, + "grad_norm": 0.6015625, + "learning_rate": 0.001953, + "loss": 3.7544, + "step": 651 + }, + { + "epoch": 0.057225768052420384, + "grad_norm": 0.53125, + "learning_rate": 0.0019560000000000003, + "loss": 3.7856, + "step": 652 + }, + { + "epoch": 0.05731353763532287, + "grad_norm": 0.376953125, + "learning_rate": 0.0019590000000000002, + "loss": 3.7559, + "step": 653 + }, + { + "epoch": 0.05740130721822535, + "grad_norm": 0.5390625, + "learning_rate": 0.001962, + "loss": 3.6909, + "step": 654 + }, + { + "epoch": 0.05748907680112784, + "grad_norm": 0.4453125, + "learning_rate": 0.001965, + "loss": 3.8457, + "step": 655 + }, + { + "epoch": 0.05757684638403032, + "grad_norm": 0.32421875, + "learning_rate": 0.001968, + "loss": 3.7324, + "step": 656 + }, + { + "epoch": 0.05766461596693281, + "grad_norm": 0.4140625, + "learning_rate": 0.001971, + "loss": 3.7222, + "step": 657 + }, + { + "epoch": 0.05775238554983529, + "grad_norm": 0.337890625, + "learning_rate": 0.001974, + "loss": 3.7612, + "step": 658 + }, + { + "epoch": 0.05784015513273778, + "grad_norm": 0.310546875, + "learning_rate": 0.001977, + "loss": 3.6855, + "step": 659 + }, + { + "epoch": 0.05792792471564027, + "grad_norm": 0.44140625, + "learning_rate": 0.00198, + "loss": 3.7402, + "step": 660 + }, + { + "epoch": 0.05801569429854275, + "grad_norm": 0.388671875, + "learning_rate": 0.001983, + "loss": 3.7339, + "step": 661 + }, + { + "epoch": 0.05810346388144524, + "grad_norm": 0.31640625, + "learning_rate": 0.0019860000000000004, + "loss": 3.7827, + "step": 662 + }, + { + "epoch": 0.05819123346434772, + "grad_norm": 0.3046875, + "learning_rate": 0.0019890000000000003, + "loss": 3.7568, + "step": 663 + }, + { + "epoch": 0.05827900304725021, + "grad_norm": 0.302734375, + "learning_rate": 0.0019920000000000003, + "loss": 3.8013, + "step": 664 + }, + { + "epoch": 0.05836677263015269, + "grad_norm": 0.421875, + "learning_rate": 0.0019950000000000002, + "loss": 3.7363, + "step": 665 + }, + { + "epoch": 0.058454542213055176, + "grad_norm": 0.51171875, + "learning_rate": 0.001998, + "loss": 3.7886, + "step": 666 + }, + { + "epoch": 0.05854231179595766, + "grad_norm": 0.498046875, + "learning_rate": 0.002001, + "loss": 3.7139, + "step": 667 + }, + { + "epoch": 0.058630081378860145, + "grad_norm": 0.52734375, + "learning_rate": 0.002004, + "loss": 3.689, + "step": 668 + }, + { + "epoch": 0.05871785096176263, + "grad_norm": 0.51171875, + "learning_rate": 0.002007, + "loss": 3.6919, + "step": 669 + }, + { + "epoch": 0.058805620544665115, + "grad_norm": 0.58203125, + "learning_rate": 0.00201, + "loss": 3.7578, + "step": 670 + }, + { + "epoch": 0.058893390127567606, + "grad_norm": 0.69140625, + "learning_rate": 0.002013, + "loss": 3.813, + "step": 671 + }, + { + "epoch": 0.05898115971047009, + "grad_norm": 0.9375, + "learning_rate": 0.002016, + "loss": 3.814, + "step": 672 + }, + { + "epoch": 0.059068929293372575, + "grad_norm": 0.7734375, + "learning_rate": 0.002019, + "loss": 3.8003, + "step": 673 + }, + { + "epoch": 0.05915669887627506, + "grad_norm": 0.484375, + "learning_rate": 0.0020220000000000004, + "loss": 3.73, + "step": 674 + }, + { + "epoch": 0.059244468459177545, + "grad_norm": 0.93359375, + "learning_rate": 0.0020250000000000003, + "loss": 3.7837, + "step": 675 + }, + { + "epoch": 0.05933223804208003, + "grad_norm": 0.88671875, + "learning_rate": 0.0020280000000000003, + "loss": 3.7627, + "step": 676 + }, + { + "epoch": 0.059420007624982514, + "grad_norm": 1.1015625, + "learning_rate": 0.0020310000000000003, + "loss": 3.7266, + "step": 677 + }, + { + "epoch": 0.059507777207885, + "grad_norm": 1.3046875, + "learning_rate": 0.0020340000000000002, + "loss": 3.7349, + "step": 678 + }, + { + "epoch": 0.05959554679078748, + "grad_norm": 0.9453125, + "learning_rate": 0.002037, + "loss": 3.832, + "step": 679 + }, + { + "epoch": 0.05968331637368997, + "grad_norm": 1.1171875, + "learning_rate": 0.00204, + "loss": 3.8428, + "step": 680 + }, + { + "epoch": 0.05977108595659245, + "grad_norm": 0.400390625, + "learning_rate": 0.002043, + "loss": 3.8057, + "step": 681 + }, + { + "epoch": 0.059858855539494944, + "grad_norm": 0.90625, + "learning_rate": 0.002046, + "loss": 3.7397, + "step": 682 + }, + { + "epoch": 0.05994662512239743, + "grad_norm": 0.462890625, + "learning_rate": 0.002049, + "loss": 3.769, + "step": 683 + }, + { + "epoch": 0.060034394705299914, + "grad_norm": 0.52734375, + "learning_rate": 0.002052, + "loss": 3.7515, + "step": 684 + }, + { + "epoch": 0.0601221642882024, + "grad_norm": 0.416015625, + "learning_rate": 0.0020550000000000004, + "loss": 3.7881, + "step": 685 + }, + { + "epoch": 0.06020993387110488, + "grad_norm": 0.326171875, + "learning_rate": 0.0020580000000000004, + "loss": 3.7822, + "step": 686 + }, + { + "epoch": 0.06029770345400737, + "grad_norm": 0.375, + "learning_rate": 0.0020610000000000003, + "loss": 3.7397, + "step": 687 + }, + { + "epoch": 0.06038547303690985, + "grad_norm": 0.310546875, + "learning_rate": 0.002064, + "loss": 3.7529, + "step": 688 + }, + { + "epoch": 0.06047324261981234, + "grad_norm": 0.3046875, + "learning_rate": 0.002067, + "loss": 3.6914, + "step": 689 + }, + { + "epoch": 0.06056101220271482, + "grad_norm": 0.39453125, + "learning_rate": 0.00207, + "loss": 3.7534, + "step": 690 + }, + { + "epoch": 0.060648781785617306, + "grad_norm": 0.404296875, + "learning_rate": 0.0020729999999999998, + "loss": 3.731, + "step": 691 + }, + { + "epoch": 0.06073655136851979, + "grad_norm": 0.400390625, + "learning_rate": 0.0020759999999999997, + "loss": 3.7261, + "step": 692 + }, + { + "epoch": 0.060824320951422275, + "grad_norm": 0.423828125, + "learning_rate": 0.0020789999999999997, + "loss": 3.7568, + "step": 693 + }, + { + "epoch": 0.06091209053432477, + "grad_norm": 0.341796875, + "learning_rate": 0.002082, + "loss": 3.7568, + "step": 694 + }, + { + "epoch": 0.06099986011722725, + "grad_norm": 0.396484375, + "learning_rate": 0.002085, + "loss": 3.8052, + "step": 695 + }, + { + "epoch": 0.061087629700129736, + "grad_norm": 0.46875, + "learning_rate": 0.002088, + "loss": 3.7393, + "step": 696 + }, + { + "epoch": 0.06117539928303222, + "grad_norm": 0.75390625, + "learning_rate": 0.002091, + "loss": 3.7939, + "step": 697 + }, + { + "epoch": 0.061263168865934706, + "grad_norm": 0.9296875, + "learning_rate": 0.002094, + "loss": 3.7563, + "step": 698 + }, + { + "epoch": 0.06135093844883719, + "grad_norm": 0.8125, + "learning_rate": 0.002097, + "loss": 3.7168, + "step": 699 + }, + { + "epoch": 0.061438708031739675, + "grad_norm": 1.2734375, + "learning_rate": 0.0021, + "loss": 3.7871, + "step": 700 + }, + { + "epoch": 0.06152647761464216, + "grad_norm": 0.84765625, + "learning_rate": 0.002103, + "loss": 3.7847, + "step": 701 + }, + { + "epoch": 0.061614247197544644, + "grad_norm": 0.82421875, + "learning_rate": 0.002106, + "loss": 3.7852, + "step": 702 + }, + { + "epoch": 0.06170201678044713, + "grad_norm": 0.78125, + "learning_rate": 0.0021089999999999998, + "loss": 3.7314, + "step": 703 + }, + { + "epoch": 0.061789786363349614, + "grad_norm": 0.365234375, + "learning_rate": 0.0021119999999999997, + "loss": 3.7412, + "step": 704 + }, + { + "epoch": 0.0618775559462521, + "grad_norm": 0.58203125, + "learning_rate": 0.002115, + "loss": 3.7026, + "step": 705 + }, + { + "epoch": 0.06196532552915459, + "grad_norm": 0.44140625, + "learning_rate": 0.002118, + "loss": 3.7529, + "step": 706 + }, + { + "epoch": 0.062053095112057075, + "grad_norm": 0.3671875, + "learning_rate": 0.002121, + "loss": 3.7627, + "step": 707 + }, + { + "epoch": 0.06214086469495956, + "grad_norm": 0.365234375, + "learning_rate": 0.002124, + "loss": 3.7388, + "step": 708 + }, + { + "epoch": 0.062228634277862044, + "grad_norm": 0.8203125, + "learning_rate": 0.002127, + "loss": 3.7339, + "step": 709 + }, + { + "epoch": 0.06231640386076453, + "grad_norm": 0.42578125, + "learning_rate": 0.00213, + "loss": 3.7705, + "step": 710 + }, + { + "epoch": 0.06240417344366701, + "grad_norm": 0.2451171875, + "learning_rate": 0.002133, + "loss": 3.7134, + "step": 711 + }, + { + "epoch": 0.0624919430265695, + "grad_norm": 0.390625, + "learning_rate": 0.002136, + "loss": 3.7803, + "step": 712 + }, + { + "epoch": 0.06257971260947198, + "grad_norm": 0.34765625, + "learning_rate": 0.002139, + "loss": 3.7539, + "step": 713 + }, + { + "epoch": 0.06266748219237447, + "grad_norm": 0.255859375, + "learning_rate": 0.002142, + "loss": 3.8052, + "step": 714 + }, + { + "epoch": 0.06275525177527695, + "grad_norm": 0.275390625, + "learning_rate": 0.0021449999999999998, + "loss": 3.6963, + "step": 715 + }, + { + "epoch": 0.06284302135817944, + "grad_norm": 0.41015625, + "learning_rate": 0.002148, + "loss": 3.7109, + "step": 716 + }, + { + "epoch": 0.06293079094108192, + "grad_norm": 0.61328125, + "learning_rate": 0.002151, + "loss": 3.665, + "step": 717 + }, + { + "epoch": 0.06301856052398441, + "grad_norm": 0.8203125, + "learning_rate": 0.002154, + "loss": 3.71, + "step": 718 + }, + { + "epoch": 0.06310633010688689, + "grad_norm": 0.63671875, + "learning_rate": 0.002157, + "loss": 3.7148, + "step": 719 + }, + { + "epoch": 0.06319409968978938, + "grad_norm": 0.61328125, + "learning_rate": 0.00216, + "loss": 3.6938, + "step": 720 + }, + { + "epoch": 0.06328186927269186, + "grad_norm": 6.9375, + "learning_rate": 0.002163, + "loss": 3.8232, + "step": 721 + }, + { + "epoch": 0.06336963885559435, + "grad_norm": 1.28125, + "learning_rate": 0.002166, + "loss": 3.8701, + "step": 722 + }, + { + "epoch": 0.06345740843849684, + "grad_norm": 1.6484375, + "learning_rate": 0.002169, + "loss": 3.7744, + "step": 723 + }, + { + "epoch": 0.06354517802139932, + "grad_norm": 0.77734375, + "learning_rate": 0.002172, + "loss": 3.7949, + "step": 724 + }, + { + "epoch": 0.06363294760430181, + "grad_norm": 1.1796875, + "learning_rate": 0.002175, + "loss": 3.7656, + "step": 725 + }, + { + "epoch": 0.06372071718720429, + "grad_norm": 0.5, + "learning_rate": 0.002178, + "loss": 3.7471, + "step": 726 + }, + { + "epoch": 0.06380848677010678, + "grad_norm": 0.52734375, + "learning_rate": 0.0021809999999999998, + "loss": 3.686, + "step": 727 + }, + { + "epoch": 0.06389625635300926, + "grad_norm": 0.494140625, + "learning_rate": 0.002184, + "loss": 3.7002, + "step": 728 + }, + { + "epoch": 0.06398402593591175, + "grad_norm": 0.423828125, + "learning_rate": 0.002187, + "loss": 3.6968, + "step": 729 + }, + { + "epoch": 0.06407179551881423, + "grad_norm": 0.43359375, + "learning_rate": 0.00219, + "loss": 3.7095, + "step": 730 + }, + { + "epoch": 0.06415956510171672, + "grad_norm": 0.384765625, + "learning_rate": 0.002193, + "loss": 3.7622, + "step": 731 + }, + { + "epoch": 0.0642473346846192, + "grad_norm": 0.333984375, + "learning_rate": 0.002196, + "loss": 3.8252, + "step": 732 + }, + { + "epoch": 0.06433510426752169, + "grad_norm": 0.314453125, + "learning_rate": 0.002199, + "loss": 3.7144, + "step": 733 + }, + { + "epoch": 0.06442287385042417, + "grad_norm": 0.26953125, + "learning_rate": 0.002202, + "loss": 3.7085, + "step": 734 + }, + { + "epoch": 0.06451064343332666, + "grad_norm": 0.314453125, + "learning_rate": 0.002205, + "loss": 3.7637, + "step": 735 + }, + { + "epoch": 0.06459841301622915, + "grad_norm": 0.3515625, + "learning_rate": 0.002208, + "loss": 3.7861, + "step": 736 + }, + { + "epoch": 0.06468618259913163, + "grad_norm": 0.30859375, + "learning_rate": 0.002211, + "loss": 3.7471, + "step": 737 + }, + { + "epoch": 0.06477395218203412, + "grad_norm": 0.310546875, + "learning_rate": 0.002214, + "loss": 3.748, + "step": 738 + }, + { + "epoch": 0.0648617217649366, + "grad_norm": 0.3828125, + "learning_rate": 0.0022170000000000002, + "loss": 3.7354, + "step": 739 + }, + { + "epoch": 0.06494949134783909, + "grad_norm": 0.5234375, + "learning_rate": 0.00222, + "loss": 3.6177, + "step": 740 + }, + { + "epoch": 0.06503726093074157, + "grad_norm": 0.7265625, + "learning_rate": 0.002223, + "loss": 3.7783, + "step": 741 + }, + { + "epoch": 0.06512503051364406, + "grad_norm": 1.3359375, + "learning_rate": 0.002226, + "loss": 3.7266, + "step": 742 + }, + { + "epoch": 0.06521280009654654, + "grad_norm": 1.1171875, + "learning_rate": 0.002229, + "loss": 3.7812, + "step": 743 + }, + { + "epoch": 0.06530056967944903, + "grad_norm": 0.75390625, + "learning_rate": 0.002232, + "loss": 3.6904, + "step": 744 + }, + { + "epoch": 0.0653883392623515, + "grad_norm": 0.32421875, + "learning_rate": 0.002235, + "loss": 3.7261, + "step": 745 + }, + { + "epoch": 0.065476108845254, + "grad_norm": 0.5859375, + "learning_rate": 0.002238, + "loss": 3.7495, + "step": 746 + }, + { + "epoch": 0.06556387842815649, + "grad_norm": 0.53515625, + "learning_rate": 0.002241, + "loss": 3.7397, + "step": 747 + }, + { + "epoch": 0.06565164801105897, + "grad_norm": 0.265625, + "learning_rate": 0.002244, + "loss": 3.7744, + "step": 748 + }, + { + "epoch": 0.06573941759396146, + "grad_norm": 0.392578125, + "learning_rate": 0.002247, + "loss": 3.7305, + "step": 749 + }, + { + "epoch": 0.06582718717686394, + "grad_norm": 0.388671875, + "learning_rate": 0.0022500000000000003, + "loss": 3.6714, + "step": 750 + }, + { + "epoch": 0.06591495675976643, + "grad_norm": 0.26953125, + "learning_rate": 0.0022530000000000002, + "loss": 3.7026, + "step": 751 + }, + { + "epoch": 0.0660027263426689, + "grad_norm": 0.439453125, + "learning_rate": 0.002256, + "loss": 3.7227, + "step": 752 + }, + { + "epoch": 0.0660904959255714, + "grad_norm": 0.392578125, + "learning_rate": 0.002259, + "loss": 3.6636, + "step": 753 + }, + { + "epoch": 0.06617826550847387, + "grad_norm": 0.341796875, + "learning_rate": 0.002262, + "loss": 3.7505, + "step": 754 + }, + { + "epoch": 0.06626603509137637, + "grad_norm": 0.51953125, + "learning_rate": 0.002265, + "loss": 3.6899, + "step": 755 + }, + { + "epoch": 0.06635380467427884, + "grad_norm": 0.455078125, + "learning_rate": 0.002268, + "loss": 3.7573, + "step": 756 + }, + { + "epoch": 0.06644157425718134, + "grad_norm": 0.369140625, + "learning_rate": 0.002271, + "loss": 3.7549, + "step": 757 + }, + { + "epoch": 0.06652934384008383, + "grad_norm": 0.279296875, + "learning_rate": 0.002274, + "loss": 3.7002, + "step": 758 + }, + { + "epoch": 0.0666171134229863, + "grad_norm": 0.40625, + "learning_rate": 0.002277, + "loss": 3.7773, + "step": 759 + }, + { + "epoch": 0.0667048830058888, + "grad_norm": 0.326171875, + "learning_rate": 0.00228, + "loss": 3.769, + "step": 760 + }, + { + "epoch": 0.06679265258879127, + "grad_norm": 0.310546875, + "learning_rate": 0.002283, + "loss": 3.6855, + "step": 761 + }, + { + "epoch": 0.06688042217169377, + "grad_norm": 0.451171875, + "learning_rate": 0.0022860000000000003, + "loss": 3.7285, + "step": 762 + }, + { + "epoch": 0.06696819175459624, + "grad_norm": 0.427734375, + "learning_rate": 0.0022890000000000002, + "loss": 3.7085, + "step": 763 + }, + { + "epoch": 0.06705596133749873, + "grad_norm": 0.30859375, + "learning_rate": 0.002292, + "loss": 3.7393, + "step": 764 + }, + { + "epoch": 0.06714373092040121, + "grad_norm": 0.439453125, + "learning_rate": 0.002295, + "loss": 3.7393, + "step": 765 + }, + { + "epoch": 0.0672315005033037, + "grad_norm": 0.48046875, + "learning_rate": 0.002298, + "loss": 3.7456, + "step": 766 + }, + { + "epoch": 0.06731927008620618, + "grad_norm": 0.64453125, + "learning_rate": 0.002301, + "loss": 3.686, + "step": 767 + }, + { + "epoch": 0.06740703966910867, + "grad_norm": 0.76171875, + "learning_rate": 0.002304, + "loss": 3.7363, + "step": 768 + }, + { + "epoch": 0.06749480925201115, + "grad_norm": 0.80078125, + "learning_rate": 0.002307, + "loss": 3.686, + "step": 769 + }, + { + "epoch": 0.06758257883491364, + "grad_norm": 1.0390625, + "learning_rate": 0.00231, + "loss": 3.7188, + "step": 770 + }, + { + "epoch": 0.06767034841781613, + "grad_norm": 1.2265625, + "learning_rate": 0.002313, + "loss": 3.6997, + "step": 771 + }, + { + "epoch": 0.06775811800071861, + "grad_norm": 0.83203125, + "learning_rate": 0.002316, + "loss": 3.7358, + "step": 772 + }, + { + "epoch": 0.0678458875836211, + "grad_norm": 0.63671875, + "learning_rate": 0.0023190000000000003, + "loss": 3.7314, + "step": 773 + }, + { + "epoch": 0.06793365716652358, + "grad_norm": 0.69140625, + "learning_rate": 0.0023220000000000003, + "loss": 3.7417, + "step": 774 + }, + { + "epoch": 0.06802142674942607, + "grad_norm": 0.625, + "learning_rate": 0.0023250000000000002, + "loss": 3.7671, + "step": 775 + }, + { + "epoch": 0.06810919633232855, + "grad_norm": 0.53515625, + "learning_rate": 0.002328, + "loss": 3.7241, + "step": 776 + }, + { + "epoch": 0.06819696591523104, + "grad_norm": 0.43359375, + "learning_rate": 0.002331, + "loss": 3.748, + "step": 777 + }, + { + "epoch": 0.06828473549813352, + "grad_norm": 0.357421875, + "learning_rate": 0.002334, + "loss": 3.7295, + "step": 778 + }, + { + "epoch": 0.06837250508103601, + "grad_norm": 0.37890625, + "learning_rate": 0.002337, + "loss": 3.7632, + "step": 779 + }, + { + "epoch": 0.06846027466393849, + "grad_norm": 0.326171875, + "learning_rate": 0.00234, + "loss": 3.708, + "step": 780 + }, + { + "epoch": 0.06854804424684098, + "grad_norm": 0.431640625, + "learning_rate": 0.002343, + "loss": 3.7207, + "step": 781 + }, + { + "epoch": 0.06863581382974347, + "grad_norm": 0.267578125, + "learning_rate": 0.002346, + "loss": 3.6948, + "step": 782 + }, + { + "epoch": 0.06872358341264595, + "grad_norm": 0.32421875, + "learning_rate": 0.002349, + "loss": 3.7104, + "step": 783 + }, + { + "epoch": 0.06881135299554844, + "grad_norm": 0.2392578125, + "learning_rate": 0.002352, + "loss": 3.6846, + "step": 784 + }, + { + "epoch": 0.06889912257845092, + "grad_norm": 0.294921875, + "learning_rate": 0.0023550000000000003, + "loss": 3.7046, + "step": 785 + }, + { + "epoch": 0.06898689216135341, + "grad_norm": 0.357421875, + "learning_rate": 0.0023580000000000003, + "loss": 3.6807, + "step": 786 + }, + { + "epoch": 0.06907466174425589, + "grad_norm": 0.28125, + "learning_rate": 0.0023610000000000003, + "loss": 3.7573, + "step": 787 + }, + { + "epoch": 0.06916243132715838, + "grad_norm": 0.32421875, + "learning_rate": 0.002364, + "loss": 3.6899, + "step": 788 + }, + { + "epoch": 0.06925020091006086, + "grad_norm": 0.33203125, + "learning_rate": 0.002367, + "loss": 3.7749, + "step": 789 + }, + { + "epoch": 0.06933797049296335, + "grad_norm": 0.216796875, + "learning_rate": 0.00237, + "loss": 3.6626, + "step": 790 + }, + { + "epoch": 0.06942574007586583, + "grad_norm": 0.30078125, + "learning_rate": 0.002373, + "loss": 3.7949, + "step": 791 + }, + { + "epoch": 0.06951350965876832, + "grad_norm": 0.462890625, + "learning_rate": 0.002376, + "loss": 3.6807, + "step": 792 + }, + { + "epoch": 0.06960127924167081, + "grad_norm": 0.376953125, + "learning_rate": 0.002379, + "loss": 3.6987, + "step": 793 + }, + { + "epoch": 0.06968904882457329, + "grad_norm": 0.37890625, + "learning_rate": 0.002382, + "loss": 3.7559, + "step": 794 + }, + { + "epoch": 0.06977681840747578, + "grad_norm": 0.375, + "learning_rate": 0.002385, + "loss": 3.6919, + "step": 795 + }, + { + "epoch": 0.06986458799037826, + "grad_norm": 0.306640625, + "learning_rate": 0.0023880000000000004, + "loss": 3.6982, + "step": 796 + }, + { + "epoch": 0.06995235757328075, + "grad_norm": 0.306640625, + "learning_rate": 0.0023910000000000003, + "loss": 3.7075, + "step": 797 + }, + { + "epoch": 0.07004012715618323, + "grad_norm": 0.3125, + "learning_rate": 0.0023940000000000003, + "loss": 3.6851, + "step": 798 + }, + { + "epoch": 0.07012789673908572, + "grad_norm": 0.302734375, + "learning_rate": 0.0023970000000000003, + "loss": 3.6602, + "step": 799 + }, + { + "epoch": 0.0702156663219882, + "grad_norm": 0.32421875, + "learning_rate": 0.0024000000000000002, + "loss": 3.7002, + "step": 800 + }, + { + "epoch": 0.07030343590489069, + "grad_norm": 0.28125, + "learning_rate": 0.002403, + "loss": 3.626, + "step": 801 + }, + { + "epoch": 0.07039120548779317, + "grad_norm": 0.369140625, + "learning_rate": 0.002406, + "loss": 3.6548, + "step": 802 + }, + { + "epoch": 0.07047897507069566, + "grad_norm": 0.63671875, + "learning_rate": 0.002409, + "loss": 3.73, + "step": 803 + }, + { + "epoch": 0.07056674465359813, + "grad_norm": 1.09375, + "learning_rate": 0.002412, + "loss": 3.6968, + "step": 804 + }, + { + "epoch": 0.07065451423650063, + "grad_norm": 1.421875, + "learning_rate": 0.002415, + "loss": 3.7441, + "step": 805 + }, + { + "epoch": 0.07074228381940312, + "grad_norm": 1.171875, + "learning_rate": 0.002418, + "loss": 3.7603, + "step": 806 + }, + { + "epoch": 0.0708300534023056, + "grad_norm": 1.3671875, + "learning_rate": 0.0024210000000000004, + "loss": 3.791, + "step": 807 + }, + { + "epoch": 0.07091782298520809, + "grad_norm": 0.578125, + "learning_rate": 0.0024240000000000004, + "loss": 3.7222, + "step": 808 + }, + { + "epoch": 0.07100559256811056, + "grad_norm": 0.86328125, + "learning_rate": 0.0024270000000000003, + "loss": 3.6899, + "step": 809 + }, + { + "epoch": 0.07109336215101306, + "grad_norm": 0.96484375, + "learning_rate": 0.0024300000000000003, + "loss": 3.7476, + "step": 810 + }, + { + "epoch": 0.07118113173391553, + "grad_norm": 0.8125, + "learning_rate": 0.0024330000000000003, + "loss": 3.7065, + "step": 811 + }, + { + "epoch": 0.07126890131681803, + "grad_norm": 0.498046875, + "learning_rate": 0.0024360000000000002, + "loss": 3.7705, + "step": 812 + }, + { + "epoch": 0.0713566708997205, + "grad_norm": 3.5, + "learning_rate": 0.0024389999999999998, + "loss": 3.7329, + "step": 813 + }, + { + "epoch": 0.071444440482623, + "grad_norm": 0.96875, + "learning_rate": 0.0024419999999999997, + "loss": 3.7852, + "step": 814 + }, + { + "epoch": 0.07153221006552547, + "grad_norm": 0.98828125, + "learning_rate": 0.0024449999999999997, + "loss": 3.7646, + "step": 815 + }, + { + "epoch": 0.07161997964842796, + "grad_norm": 3.765625, + "learning_rate": 0.002448, + "loss": 3.7314, + "step": 816 + }, + { + "epoch": 0.07170774923133046, + "grad_norm": 0.90625, + "learning_rate": 0.002451, + "loss": 3.6938, + "step": 817 + }, + { + "epoch": 0.07179551881423293, + "grad_norm": 0.83984375, + "learning_rate": 0.002454, + "loss": 3.7383, + "step": 818 + }, + { + "epoch": 0.07188328839713543, + "grad_norm": 0.94140625, + "learning_rate": 0.002457, + "loss": 3.7593, + "step": 819 + }, + { + "epoch": 0.0719710579800379, + "grad_norm": 0.73046875, + "learning_rate": 0.00246, + "loss": 3.6699, + "step": 820 + }, + { + "epoch": 0.0720588275629404, + "grad_norm": 0.52734375, + "learning_rate": 0.002463, + "loss": 3.752, + "step": 821 + }, + { + "epoch": 0.07214659714584287, + "grad_norm": 0.62109375, + "learning_rate": 0.002466, + "loss": 3.7798, + "step": 822 + }, + { + "epoch": 0.07223436672874536, + "grad_norm": 0.859375, + "learning_rate": 0.002469, + "loss": 3.7441, + "step": 823 + }, + { + "epoch": 0.07232213631164784, + "grad_norm": 0.53125, + "learning_rate": 0.002472, + "loss": 3.7021, + "step": 824 + }, + { + "epoch": 0.07240990589455033, + "grad_norm": 0.443359375, + "learning_rate": 0.0024749999999999998, + "loss": 3.7612, + "step": 825 + }, + { + "epoch": 0.07249767547745281, + "grad_norm": 0.3046875, + "learning_rate": 0.0024779999999999997, + "loss": 3.6611, + "step": 826 + }, + { + "epoch": 0.0725854450603553, + "grad_norm": 0.259765625, + "learning_rate": 0.002481, + "loss": 3.6909, + "step": 827 + }, + { + "epoch": 0.0726732146432578, + "grad_norm": 0.32421875, + "learning_rate": 0.002484, + "loss": 3.7026, + "step": 828 + }, + { + "epoch": 0.07276098422616027, + "grad_norm": 0.26953125, + "learning_rate": 0.002487, + "loss": 3.6343, + "step": 829 + }, + { + "epoch": 0.07284875380906276, + "grad_norm": 0.400390625, + "learning_rate": 0.00249, + "loss": 3.7246, + "step": 830 + }, + { + "epoch": 0.07293652339196524, + "grad_norm": 0.96875, + "learning_rate": 0.002493, + "loss": 3.6538, + "step": 831 + }, + { + "epoch": 0.07302429297486773, + "grad_norm": 0.94140625, + "learning_rate": 0.002496, + "loss": 3.6812, + "step": 832 + }, + { + "epoch": 0.07311206255777021, + "grad_norm": 0.546875, + "learning_rate": 0.002499, + "loss": 3.73, + "step": 833 + }, + { + "epoch": 0.0731998321406727, + "grad_norm": 0.515625, + "learning_rate": 0.002502, + "loss": 3.7378, + "step": 834 + }, + { + "epoch": 0.07328760172357518, + "grad_norm": 0.48828125, + "learning_rate": 0.002505, + "loss": 3.7598, + "step": 835 + }, + { + "epoch": 0.07337537130647767, + "grad_norm": 0.5703125, + "learning_rate": 0.002508, + "loss": 3.7412, + "step": 836 + }, + { + "epoch": 0.07346314088938015, + "grad_norm": 0.42578125, + "learning_rate": 0.0025109999999999998, + "loss": 3.7393, + "step": 837 + }, + { + "epoch": 0.07355091047228264, + "grad_norm": 0.248046875, + "learning_rate": 0.0025139999999999997, + "loss": 3.7256, + "step": 838 + }, + { + "epoch": 0.07363868005518512, + "grad_norm": 0.43359375, + "learning_rate": 0.002517, + "loss": 3.6777, + "step": 839 + }, + { + "epoch": 0.07372644963808761, + "grad_norm": 0.33203125, + "learning_rate": 0.00252, + "loss": 3.6694, + "step": 840 + }, + { + "epoch": 0.0738142192209901, + "grad_norm": 0.31640625, + "learning_rate": 0.002523, + "loss": 3.6685, + "step": 841 + }, + { + "epoch": 0.07390198880389258, + "grad_norm": 0.251953125, + "learning_rate": 0.002526, + "loss": 3.7007, + "step": 842 + }, + { + "epoch": 0.07398975838679507, + "grad_norm": 0.2470703125, + "learning_rate": 0.002529, + "loss": 3.7432, + "step": 843 + }, + { + "epoch": 0.07407752796969755, + "grad_norm": 0.69140625, + "learning_rate": 0.002532, + "loss": 3.6738, + "step": 844 + }, + { + "epoch": 0.07416529755260004, + "grad_norm": 0.6875, + "learning_rate": 0.002535, + "loss": 3.646, + "step": 845 + }, + { + "epoch": 0.07425306713550252, + "grad_norm": 0.35546875, + "learning_rate": 0.002538, + "loss": 3.6494, + "step": 846 + }, + { + "epoch": 0.07434083671840501, + "grad_norm": 0.361328125, + "learning_rate": 0.002541, + "loss": 3.6348, + "step": 847 + }, + { + "epoch": 0.07442860630130749, + "grad_norm": 0.3359375, + "learning_rate": 0.002544, + "loss": 3.7183, + "step": 848 + }, + { + "epoch": 0.07451637588420998, + "grad_norm": 0.322265625, + "learning_rate": 0.002547, + "loss": 3.728, + "step": 849 + }, + { + "epoch": 0.07460414546711246, + "grad_norm": 0.271484375, + "learning_rate": 0.00255, + "loss": 3.7329, + "step": 850 + }, + { + "epoch": 0.07469191505001495, + "grad_norm": 0.251953125, + "learning_rate": 0.002553, + "loss": 3.6992, + "step": 851 + }, + { + "epoch": 0.07477968463291744, + "grad_norm": 0.310546875, + "learning_rate": 0.002556, + "loss": 3.626, + "step": 852 + }, + { + "epoch": 0.07486745421581992, + "grad_norm": 0.369140625, + "learning_rate": 0.002559, + "loss": 3.6987, + "step": 853 + }, + { + "epoch": 0.07495522379872241, + "grad_norm": 0.6640625, + "learning_rate": 0.002562, + "loss": 3.6509, + "step": 854 + }, + { + "epoch": 0.07504299338162489, + "grad_norm": 1.234375, + "learning_rate": 0.002565, + "loss": 3.7505, + "step": 855 + }, + { + "epoch": 0.07513076296452738, + "grad_norm": 1.4375, + "learning_rate": 0.002568, + "loss": 3.7656, + "step": 856 + }, + { + "epoch": 0.07521853254742986, + "grad_norm": 0.83984375, + "learning_rate": 0.002571, + "loss": 3.687, + "step": 857 + }, + { + "epoch": 0.07530630213033235, + "grad_norm": 0.6015625, + "learning_rate": 0.002574, + "loss": 3.7065, + "step": 858 + }, + { + "epoch": 0.07539407171323483, + "grad_norm": 0.53125, + "learning_rate": 0.002577, + "loss": 3.71, + "step": 859 + }, + { + "epoch": 0.07548184129613732, + "grad_norm": 0.51171875, + "learning_rate": 0.00258, + "loss": 3.7144, + "step": 860 + }, + { + "epoch": 0.0755696108790398, + "grad_norm": 0.361328125, + "learning_rate": 0.0025830000000000002, + "loss": 3.6265, + "step": 861 + }, + { + "epoch": 0.07565738046194229, + "grad_norm": 0.337890625, + "learning_rate": 0.002586, + "loss": 3.6665, + "step": 862 + }, + { + "epoch": 0.07574515004484478, + "grad_norm": 0.302734375, + "learning_rate": 0.002589, + "loss": 3.6675, + "step": 863 + }, + { + "epoch": 0.07583291962774726, + "grad_norm": 0.265625, + "learning_rate": 0.002592, + "loss": 3.6807, + "step": 864 + }, + { + "epoch": 0.07592068921064975, + "grad_norm": 0.244140625, + "learning_rate": 0.002595, + "loss": 3.728, + "step": 865 + }, + { + "epoch": 0.07600845879355222, + "grad_norm": 0.298828125, + "learning_rate": 0.002598, + "loss": 3.6689, + "step": 866 + }, + { + "epoch": 0.07609622837645472, + "grad_norm": 0.30078125, + "learning_rate": 0.002601, + "loss": 3.6865, + "step": 867 + }, + { + "epoch": 0.0761839979593572, + "grad_norm": 0.28125, + "learning_rate": 0.002604, + "loss": 3.6787, + "step": 868 + }, + { + "epoch": 0.07627176754225969, + "grad_norm": 0.404296875, + "learning_rate": 0.002607, + "loss": 3.6592, + "step": 869 + }, + { + "epoch": 0.07635953712516216, + "grad_norm": 0.21875, + "learning_rate": 0.00261, + "loss": 3.7295, + "step": 870 + }, + { + "epoch": 0.07644730670806466, + "grad_norm": 0.34765625, + "learning_rate": 0.002613, + "loss": 3.7124, + "step": 871 + }, + { + "epoch": 0.07653507629096713, + "grad_norm": 0.5, + "learning_rate": 0.002616, + "loss": 3.6597, + "step": 872 + }, + { + "epoch": 0.07662284587386962, + "grad_norm": 0.23828125, + "learning_rate": 0.0026190000000000002, + "loss": 3.7861, + "step": 873 + }, + { + "epoch": 0.0767106154567721, + "grad_norm": 0.31640625, + "learning_rate": 0.002622, + "loss": 3.6782, + "step": 874 + }, + { + "epoch": 0.0767983850396746, + "grad_norm": 0.45703125, + "learning_rate": 0.002625, + "loss": 3.6807, + "step": 875 + }, + { + "epoch": 0.07688615462257709, + "grad_norm": 0.390625, + "learning_rate": 0.002628, + "loss": 3.6914, + "step": 876 + }, + { + "epoch": 0.07697392420547956, + "grad_norm": 0.30859375, + "learning_rate": 0.002631, + "loss": 3.6938, + "step": 877 + }, + { + "epoch": 0.07706169378838205, + "grad_norm": 0.427734375, + "learning_rate": 0.002634, + "loss": 3.6724, + "step": 878 + }, + { + "epoch": 0.07714946337128453, + "grad_norm": 0.49609375, + "learning_rate": 0.002637, + "loss": 3.6084, + "step": 879 + }, + { + "epoch": 0.07723723295418702, + "grad_norm": 0.365234375, + "learning_rate": 0.00264, + "loss": 3.6646, + "step": 880 + }, + { + "epoch": 0.0773250025370895, + "grad_norm": 0.56640625, + "learning_rate": 0.002643, + "loss": 3.6909, + "step": 881 + }, + { + "epoch": 0.077412772119992, + "grad_norm": 0.8046875, + "learning_rate": 0.002646, + "loss": 3.7378, + "step": 882 + }, + { + "epoch": 0.07750054170289447, + "grad_norm": 0.79296875, + "learning_rate": 0.002649, + "loss": 3.7207, + "step": 883 + }, + { + "epoch": 0.07758831128579696, + "grad_norm": 0.40234375, + "learning_rate": 0.0026520000000000003, + "loss": 3.7061, + "step": 884 + }, + { + "epoch": 0.07767608086869944, + "grad_norm": 0.32421875, + "learning_rate": 0.0026550000000000002, + "loss": 3.731, + "step": 885 + }, + { + "epoch": 0.07776385045160193, + "grad_norm": 0.326171875, + "learning_rate": 0.002658, + "loss": 3.6167, + "step": 886 + }, + { + "epoch": 0.07785162003450442, + "grad_norm": 0.337890625, + "learning_rate": 0.002661, + "loss": 3.6455, + "step": 887 + }, + { + "epoch": 0.0779393896174069, + "grad_norm": 0.3984375, + "learning_rate": 0.002664, + "loss": 3.6997, + "step": 888 + }, + { + "epoch": 0.07802715920030939, + "grad_norm": 0.64453125, + "learning_rate": 0.002667, + "loss": 3.6294, + "step": 889 + }, + { + "epoch": 0.07811492878321187, + "grad_norm": 0.953125, + "learning_rate": 0.00267, + "loss": 3.7241, + "step": 890 + }, + { + "epoch": 0.07820269836611436, + "grad_norm": 1.328125, + "learning_rate": 0.002673, + "loss": 3.7808, + "step": 891 + }, + { + "epoch": 0.07829046794901684, + "grad_norm": 0.796875, + "learning_rate": 0.002676, + "loss": 3.5918, + "step": 892 + }, + { + "epoch": 0.07837823753191933, + "grad_norm": 0.8046875, + "learning_rate": 0.002679, + "loss": 3.6763, + "step": 893 + }, + { + "epoch": 0.07846600711482181, + "grad_norm": 0.39453125, + "learning_rate": 0.002682, + "loss": 3.7134, + "step": 894 + }, + { + "epoch": 0.0785537766977243, + "grad_norm": 0.81640625, + "learning_rate": 0.0026850000000000003, + "loss": 3.7212, + "step": 895 + }, + { + "epoch": 0.07864154628062678, + "grad_norm": 0.427734375, + "learning_rate": 0.0026880000000000003, + "loss": 3.5791, + "step": 896 + }, + { + "epoch": 0.07872931586352927, + "grad_norm": 0.47265625, + "learning_rate": 0.0026910000000000002, + "loss": 3.7041, + "step": 897 + }, + { + "epoch": 0.07881708544643176, + "grad_norm": 0.3125, + "learning_rate": 0.002694, + "loss": 3.7158, + "step": 898 + }, + { + "epoch": 0.07890485502933424, + "grad_norm": 0.275390625, + "learning_rate": 0.002697, + "loss": 3.7212, + "step": 899 + }, + { + "epoch": 0.07899262461223673, + "grad_norm": 0.28125, + "learning_rate": 0.0027, + "loss": 3.604, + "step": 900 + }, + { + "epoch": 0.07908039419513921, + "grad_norm": 0.291015625, + "learning_rate": 0.002703, + "loss": 3.7148, + "step": 901 + }, + { + "epoch": 0.0791681637780417, + "grad_norm": 0.265625, + "learning_rate": 0.002706, + "loss": 3.646, + "step": 902 + }, + { + "epoch": 0.07925593336094418, + "grad_norm": 0.294921875, + "learning_rate": 0.002709, + "loss": 3.6235, + "step": 903 + }, + { + "epoch": 0.07934370294384667, + "grad_norm": 0.318359375, + "learning_rate": 0.002712, + "loss": 3.668, + "step": 904 + }, + { + "epoch": 0.07943147252674915, + "grad_norm": 0.26953125, + "learning_rate": 0.002715, + "loss": 3.7256, + "step": 905 + }, + { + "epoch": 0.07951924210965164, + "grad_norm": 0.251953125, + "learning_rate": 0.002718, + "loss": 3.6484, + "step": 906 + }, + { + "epoch": 0.07960701169255412, + "grad_norm": 0.2333984375, + "learning_rate": 0.0027210000000000003, + "loss": 3.6211, + "step": 907 + }, + { + "epoch": 0.07969478127545661, + "grad_norm": 0.2255859375, + "learning_rate": 0.0027240000000000003, + "loss": 3.7407, + "step": 908 + }, + { + "epoch": 0.07978255085835909, + "grad_norm": 0.328125, + "learning_rate": 0.0027270000000000003, + "loss": 3.6343, + "step": 909 + }, + { + "epoch": 0.07987032044126158, + "grad_norm": 0.34765625, + "learning_rate": 0.0027300000000000002, + "loss": 3.6108, + "step": 910 + }, + { + "epoch": 0.07995809002416407, + "grad_norm": 0.361328125, + "learning_rate": 0.002733, + "loss": 3.6875, + "step": 911 + }, + { + "epoch": 0.08004585960706655, + "grad_norm": 0.431640625, + "learning_rate": 0.002736, + "loss": 3.585, + "step": 912 + }, + { + "epoch": 0.08013362918996904, + "grad_norm": 0.796875, + "learning_rate": 0.002739, + "loss": 3.6724, + "step": 913 + }, + { + "epoch": 0.08022139877287152, + "grad_norm": 0.33984375, + "learning_rate": 0.002742, + "loss": 3.6455, + "step": 914 + }, + { + "epoch": 0.08030916835577401, + "grad_norm": 0.3515625, + "learning_rate": 0.002745, + "loss": 3.6523, + "step": 915 + }, + { + "epoch": 0.08039693793867649, + "grad_norm": 0.30859375, + "learning_rate": 0.002748, + "loss": 3.6792, + "step": 916 + }, + { + "epoch": 0.08048470752157898, + "grad_norm": 0.294921875, + "learning_rate": 0.002751, + "loss": 3.7769, + "step": 917 + }, + { + "epoch": 0.08057247710448145, + "grad_norm": 0.404296875, + "learning_rate": 0.0027540000000000004, + "loss": 3.665, + "step": 918 + }, + { + "epoch": 0.08066024668738395, + "grad_norm": 1.515625, + "learning_rate": 0.0027570000000000003, + "loss": 3.6743, + "step": 919 + }, + { + "epoch": 0.08074801627028642, + "grad_norm": 0.57421875, + "learning_rate": 0.0027600000000000003, + "loss": 3.6426, + "step": 920 + }, + { + "epoch": 0.08083578585318892, + "grad_norm": 0.498046875, + "learning_rate": 0.0027630000000000003, + "loss": 3.6484, + "step": 921 + }, + { + "epoch": 0.08092355543609141, + "grad_norm": 0.578125, + "learning_rate": 0.0027660000000000002, + "loss": 3.6919, + "step": 922 + }, + { + "epoch": 0.08101132501899388, + "grad_norm": 0.62890625, + "learning_rate": 0.002769, + "loss": 3.6587, + "step": 923 + }, + { + "epoch": 0.08109909460189638, + "grad_norm": 1.0703125, + "learning_rate": 0.002772, + "loss": 3.6606, + "step": 924 + }, + { + "epoch": 0.08118686418479885, + "grad_norm": 1.3359375, + "learning_rate": 0.002775, + "loss": 3.7056, + "step": 925 + }, + { + "epoch": 0.08127463376770135, + "grad_norm": 0.69140625, + "learning_rate": 0.002778, + "loss": 3.6943, + "step": 926 + }, + { + "epoch": 0.08136240335060382, + "grad_norm": 1.828125, + "learning_rate": 0.002781, + "loss": 3.6484, + "step": 927 + }, + { + "epoch": 0.08145017293350632, + "grad_norm": 0.5390625, + "learning_rate": 0.002784, + "loss": 3.6758, + "step": 928 + }, + { + "epoch": 0.08153794251640879, + "grad_norm": 0.91796875, + "learning_rate": 0.0027870000000000004, + "loss": 3.7661, + "step": 929 + }, + { + "epoch": 0.08162571209931128, + "grad_norm": 0.54296875, + "learning_rate": 0.0027900000000000004, + "loss": 3.7485, + "step": 930 + }, + { + "epoch": 0.08171348168221376, + "grad_norm": 0.458984375, + "learning_rate": 0.0027930000000000003, + "loss": 3.7007, + "step": 931 + }, + { + "epoch": 0.08180125126511625, + "grad_norm": 0.490234375, + "learning_rate": 0.0027960000000000003, + "loss": 3.7114, + "step": 932 + }, + { + "epoch": 0.08188902084801875, + "grad_norm": 0.390625, + "learning_rate": 0.0027990000000000003, + "loss": 3.7549, + "step": 933 + }, + { + "epoch": 0.08197679043092122, + "grad_norm": 0.458984375, + "learning_rate": 0.0028020000000000002, + "loss": 3.6338, + "step": 934 + }, + { + "epoch": 0.08206456001382371, + "grad_norm": 0.41796875, + "learning_rate": 0.002805, + "loss": 3.6816, + "step": 935 + }, + { + "epoch": 0.08215232959672619, + "grad_norm": 0.26171875, + "learning_rate": 0.002808, + "loss": 3.6445, + "step": 936 + }, + { + "epoch": 0.08224009917962868, + "grad_norm": 0.59765625, + "learning_rate": 0.002811, + "loss": 3.6426, + "step": 937 + }, + { + "epoch": 0.08232786876253116, + "grad_norm": 0.34375, + "learning_rate": 0.002814, + "loss": 3.6396, + "step": 938 + }, + { + "epoch": 0.08241563834543365, + "grad_norm": 0.28125, + "learning_rate": 0.002817, + "loss": 3.6426, + "step": 939 + }, + { + "epoch": 0.08250340792833613, + "grad_norm": 0.287109375, + "learning_rate": 0.00282, + "loss": 3.6167, + "step": 940 + }, + { + "epoch": 0.08259117751123862, + "grad_norm": 0.18359375, + "learning_rate": 0.002823, + "loss": 3.6108, + "step": 941 + }, + { + "epoch": 0.0826789470941411, + "grad_norm": 0.2353515625, + "learning_rate": 0.002826, + "loss": 3.689, + "step": 942 + }, + { + "epoch": 0.08276671667704359, + "grad_norm": 0.34765625, + "learning_rate": 0.002829, + "loss": 3.6821, + "step": 943 + }, + { + "epoch": 0.08285448625994607, + "grad_norm": 0.45703125, + "learning_rate": 0.002832, + "loss": 3.6182, + "step": 944 + }, + { + "epoch": 0.08294225584284856, + "grad_norm": 4.0625, + "learning_rate": 0.002835, + "loss": 3.5884, + "step": 945 + }, + { + "epoch": 0.08303002542575105, + "grad_norm": 1.96875, + "learning_rate": 0.002838, + "loss": 3.7207, + "step": 946 + }, + { + "epoch": 0.08311779500865353, + "grad_norm": 17.5, + "learning_rate": 0.0028409999999999998, + "loss": 4.4404, + "step": 947 + }, + { + "epoch": 0.08320556459155602, + "grad_norm": 6.59375, + "learning_rate": 0.0028439999999999997, + "loss": 3.8545, + "step": 948 + }, + { + "epoch": 0.0832933341744585, + "grad_norm": 1.296875, + "learning_rate": 0.002847, + "loss": 3.7534, + "step": 949 + }, + { + "epoch": 0.08338110375736099, + "grad_norm": 2.3125, + "learning_rate": 0.00285, + "loss": 3.8262, + "step": 950 + }, + { + "epoch": 0.08346887334026347, + "grad_norm": 1.9921875, + "learning_rate": 0.002853, + "loss": 3.8135, + "step": 951 + }, + { + "epoch": 0.08355664292316596, + "grad_norm": 0.859375, + "learning_rate": 0.002856, + "loss": 3.7271, + "step": 952 + }, + { + "epoch": 0.08364441250606844, + "grad_norm": 1.2421875, + "learning_rate": 0.002859, + "loss": 3.7886, + "step": 953 + }, + { + "epoch": 0.08373218208897093, + "grad_norm": 1.109375, + "learning_rate": 0.002862, + "loss": 3.7295, + "step": 954 + }, + { + "epoch": 0.08381995167187341, + "grad_norm": 0.8359375, + "learning_rate": 0.002865, + "loss": 3.6191, + "step": 955 + }, + { + "epoch": 0.0839077212547759, + "grad_norm": 0.7890625, + "learning_rate": 0.002868, + "loss": 3.7207, + "step": 956 + }, + { + "epoch": 0.08399549083767839, + "grad_norm": 0.765625, + "learning_rate": 0.002871, + "loss": 3.6973, + "step": 957 + }, + { + "epoch": 0.08408326042058087, + "grad_norm": 0.43359375, + "learning_rate": 0.002874, + "loss": 3.7539, + "step": 958 + }, + { + "epoch": 0.08417103000348336, + "grad_norm": 1.1328125, + "learning_rate": 0.002877, + "loss": 3.7188, + "step": 959 + }, + { + "epoch": 0.08425879958638584, + "grad_norm": 0.43359375, + "learning_rate": 0.0028799999999999997, + "loss": 3.73, + "step": 960 + }, + { + "epoch": 0.08434656916928833, + "grad_norm": 0.71484375, + "learning_rate": 0.002883, + "loss": 3.6167, + "step": 961 + }, + { + "epoch": 0.08443433875219081, + "grad_norm": 0.419921875, + "learning_rate": 0.002886, + "loss": 3.6514, + "step": 962 + }, + { + "epoch": 0.0845221083350933, + "grad_norm": 0.3515625, + "learning_rate": 0.002889, + "loss": 3.7183, + "step": 963 + }, + { + "epoch": 0.08460987791799578, + "grad_norm": 0.263671875, + "learning_rate": 0.002892, + "loss": 3.7275, + "step": 964 + }, + { + "epoch": 0.08469764750089827, + "grad_norm": 0.322265625, + "learning_rate": 0.002895, + "loss": 3.6123, + "step": 965 + }, + { + "epoch": 0.08478541708380075, + "grad_norm": 0.3515625, + "learning_rate": 0.002898, + "loss": 3.687, + "step": 966 + }, + { + "epoch": 0.08487318666670324, + "grad_norm": 0.2431640625, + "learning_rate": 0.002901, + "loss": 3.6724, + "step": 967 + }, + { + "epoch": 0.08496095624960573, + "grad_norm": 0.435546875, + "learning_rate": 0.002904, + "loss": 3.7109, + "step": 968 + }, + { + "epoch": 0.0850487258325082, + "grad_norm": 0.373046875, + "learning_rate": 0.002907, + "loss": 3.5679, + "step": 969 + }, + { + "epoch": 0.0851364954154107, + "grad_norm": 0.33203125, + "learning_rate": 0.00291, + "loss": 3.6958, + "step": 970 + }, + { + "epoch": 0.08522426499831318, + "grad_norm": 0.8203125, + "learning_rate": 0.002913, + "loss": 3.6597, + "step": 971 + }, + { + "epoch": 0.08531203458121567, + "grad_norm": 0.39453125, + "learning_rate": 0.002916, + "loss": 3.7212, + "step": 972 + }, + { + "epoch": 0.08539980416411815, + "grad_norm": 0.73828125, + "learning_rate": 0.002919, + "loss": 3.6523, + "step": 973 + }, + { + "epoch": 0.08548757374702064, + "grad_norm": 0.314453125, + "learning_rate": 0.002922, + "loss": 3.647, + "step": 974 + }, + { + "epoch": 0.08557534332992311, + "grad_norm": 0.65625, + "learning_rate": 0.002925, + "loss": 3.6421, + "step": 975 + }, + { + "epoch": 0.0856631129128256, + "grad_norm": 17.0, + "learning_rate": 0.002928, + "loss": 3.7246, + "step": 976 + }, + { + "epoch": 0.08575088249572808, + "grad_norm": 1.125, + "learning_rate": 0.002931, + "loss": 3.6187, + "step": 977 + }, + { + "epoch": 0.08583865207863058, + "grad_norm": 0.74609375, + "learning_rate": 0.002934, + "loss": 3.7295, + "step": 978 + }, + { + "epoch": 0.08592642166153307, + "grad_norm": 0.431640625, + "learning_rate": 0.002937, + "loss": 3.7075, + "step": 979 + }, + { + "epoch": 0.08601419124443554, + "grad_norm": 0.61328125, + "learning_rate": 0.00294, + "loss": 3.7109, + "step": 980 + }, + { + "epoch": 0.08610196082733804, + "grad_norm": 0.359375, + "learning_rate": 0.002943, + "loss": 3.5815, + "step": 981 + }, + { + "epoch": 0.08618973041024051, + "grad_norm": 0.39453125, + "learning_rate": 0.002946, + "loss": 3.6768, + "step": 982 + }, + { + "epoch": 0.086277499993143, + "grad_norm": 0.3984375, + "learning_rate": 0.0029490000000000002, + "loss": 3.6567, + "step": 983 + }, + { + "epoch": 0.08636526957604548, + "grad_norm": 0.271484375, + "learning_rate": 0.002952, + "loss": 3.6489, + "step": 984 + }, + { + "epoch": 0.08645303915894798, + "grad_norm": 0.279296875, + "learning_rate": 0.002955, + "loss": 3.709, + "step": 985 + }, + { + "epoch": 0.08654080874185045, + "grad_norm": 0.25, + "learning_rate": 0.002958, + "loss": 3.6841, + "step": 986 + }, + { + "epoch": 0.08662857832475294, + "grad_norm": 0.333984375, + "learning_rate": 0.002961, + "loss": 3.6665, + "step": 987 + }, + { + "epoch": 0.08671634790765542, + "grad_norm": 0.2470703125, + "learning_rate": 0.002964, + "loss": 3.6299, + "step": 988 + }, + { + "epoch": 0.08680411749055791, + "grad_norm": 0.296875, + "learning_rate": 0.002967, + "loss": 3.5996, + "step": 989 + }, + { + "epoch": 0.08689188707346039, + "grad_norm": 24.75, + "learning_rate": 0.00297, + "loss": 3.7173, + "step": 990 + }, + { + "epoch": 0.08697965665636288, + "grad_norm": 0.91796875, + "learning_rate": 0.002973, + "loss": 3.6875, + "step": 991 + }, + { + "epoch": 0.08706742623926537, + "grad_norm": 0.890625, + "learning_rate": 0.002976, + "loss": 3.7153, + "step": 992 + }, + { + "epoch": 0.08715519582216785, + "grad_norm": 1.0234375, + "learning_rate": 0.002979, + "loss": 3.6968, + "step": 993 + }, + { + "epoch": 0.08724296540507034, + "grad_norm": 1.15625, + "learning_rate": 0.002982, + "loss": 3.7632, + "step": 994 + }, + { + "epoch": 0.08733073498797282, + "grad_norm": 0.80078125, + "learning_rate": 0.0029850000000000002, + "loss": 3.6479, + "step": 995 + }, + { + "epoch": 0.08741850457087531, + "grad_norm": 0.6015625, + "learning_rate": 0.002988, + "loss": 3.6816, + "step": 996 + }, + { + "epoch": 0.08750627415377779, + "grad_norm": 0.796875, + "learning_rate": 0.002991, + "loss": 3.6348, + "step": 997 + }, + { + "epoch": 0.08759404373668028, + "grad_norm": 0.53125, + "learning_rate": 0.002994, + "loss": 3.6948, + "step": 998 + }, + { + "epoch": 0.08768181331958276, + "grad_norm": 0.39453125, + "learning_rate": 0.002997, + "loss": 3.6421, + "step": 999 + }, + { + "epoch": 0.08776958290248525, + "grad_norm": 0.421875, + "learning_rate": 0.003, + "loss": 3.584, + "step": 1000 + }, + { + "epoch": 0.08776958290248525, + "eval_loss": 0.11389437317848206, + "eval_runtime": 106.9972, + "eval_samples_per_second": 137.845, + "eval_steps_per_second": 17.234, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 11393, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.948783792128e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}