{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.12479653867516, "learning_rate": 2.734375e-08, "loss": 0.5861, "step": 1 }, { "epoch": 0.0, "grad_norm": 3.4327081284010026, "learning_rate": 5.46875e-08, "loss": 0.5638, "step": 2 }, { "epoch": 0.01, "grad_norm": 2.5568031920380836, "learning_rate": 8.203125e-08, "loss": 0.5699, "step": 3 }, { "epoch": 0.01, "grad_norm": 2.763847840088451, "learning_rate": 1.09375e-07, "loss": 0.5205, "step": 4 }, { "epoch": 0.01, "grad_norm": 3.1534479919852747, "learning_rate": 1.3671875e-07, "loss": 0.5981, "step": 5 }, { "epoch": 0.01, "grad_norm": 3.6519630701310004, "learning_rate": 1.640625e-07, "loss": 0.6026, "step": 6 }, { "epoch": 0.01, "grad_norm": 3.0551873568820564, "learning_rate": 1.9140625e-07, "loss": 0.6514, "step": 7 }, { "epoch": 0.02, "grad_norm": 2.7213862411162, "learning_rate": 2.1875e-07, "loss": 0.6034, "step": 8 }, { "epoch": 0.02, "grad_norm": 3.173439796110619, "learning_rate": 2.4609375e-07, "loss": 0.6255, "step": 9 }, { "epoch": 0.02, "grad_norm": 2.871323983043392, "learning_rate": 2.734375e-07, "loss": 0.5582, "step": 10 }, { "epoch": 0.02, "grad_norm": 2.666355918213876, "learning_rate": 3.0078125e-07, "loss": 0.5025, "step": 11 }, { "epoch": 0.02, "grad_norm": 3.069246299609569, "learning_rate": 3.28125e-07, "loss": 0.6004, "step": 12 }, { "epoch": 0.03, "grad_norm": 2.828421478897632, "learning_rate": 3.5546875e-07, "loss": 0.5952, "step": 13 }, { "epoch": 0.03, "grad_norm": 2.9667749656892064, "learning_rate": 3.828125e-07, "loss": 0.6452, "step": 14 }, { "epoch": 0.03, "grad_norm": 2.990112008784599, "learning_rate": 4.1015625e-07, "loss": 0.5755, "step": 15 }, { "epoch": 0.03, "grad_norm": 3.0226128328924022, "learning_rate": 4.375e-07, "loss": 0.5446, "step": 16 }, { "epoch": 0.03, "grad_norm": 2.7732778056589686, "learning_rate": 4.6484374999999997e-07, "loss": 0.537, "step": 17 }, { "epoch": 0.04, "grad_norm": 2.502326973248151, "learning_rate": 4.921875e-07, "loss": 0.4911, "step": 18 }, { "epoch": 0.04, "grad_norm": 2.894819761270773, "learning_rate": 5.1953125e-07, "loss": 0.631, "step": 19 }, { "epoch": 0.04, "grad_norm": 2.5302166081949378, "learning_rate": 5.46875e-07, "loss": 0.5397, "step": 20 }, { "epoch": 0.04, "grad_norm": 2.4584277694878636, "learning_rate": 5.7421875e-07, "loss": 0.4821, "step": 21 }, { "epoch": 0.04, "grad_norm": 2.76392532963757, "learning_rate": 6.015625e-07, "loss": 0.5298, "step": 22 }, { "epoch": 0.04, "grad_norm": 2.747970763515135, "learning_rate": 6.2890625e-07, "loss": 0.5478, "step": 23 }, { "epoch": 0.05, "grad_norm": 2.343975627385572, "learning_rate": 6.5625e-07, "loss": 0.5093, "step": 24 }, { "epoch": 0.05, "grad_norm": 2.3534865691445224, "learning_rate": 6.8359375e-07, "loss": 0.5687, "step": 25 }, { "epoch": 0.05, "grad_norm": 2.4741285918443734, "learning_rate": 7.109375e-07, "loss": 0.5154, "step": 26 }, { "epoch": 0.05, "grad_norm": 2.104008330099634, "learning_rate": 7.382812499999999e-07, "loss": 0.4679, "step": 27 }, { "epoch": 0.05, "grad_norm": 2.094677745221641, "learning_rate": 7.65625e-07, "loss": 0.556, "step": 28 }, { "epoch": 0.06, "grad_norm": 2.0972108622606407, "learning_rate": 7.9296875e-07, "loss": 0.5225, "step": 29 }, { "epoch": 0.06, "grad_norm": 1.9019460552243679, "learning_rate": 8.203125e-07, "loss": 0.5475, "step": 30 }, { "epoch": 0.06, "grad_norm": 1.9563383746932521, "learning_rate": 8.4765625e-07, "loss": 0.5377, "step": 31 }, { "epoch": 0.06, "grad_norm": 1.959431236402202, "learning_rate": 8.75e-07, "loss": 0.5423, "step": 32 }, { "epoch": 0.06, "grad_norm": 2.1323399078310974, "learning_rate": 9.0234375e-07, "loss": 0.4937, "step": 33 }, { "epoch": 0.07, "grad_norm": 1.7714320455619716, "learning_rate": 9.296874999999999e-07, "loss": 0.4173, "step": 34 }, { "epoch": 0.07, "grad_norm": 1.9309742159990382, "learning_rate": 9.5703125e-07, "loss": 0.4842, "step": 35 }, { "epoch": 0.07, "grad_norm": 2.0130615419630487, "learning_rate": 9.84375e-07, "loss": 0.4888, "step": 36 }, { "epoch": 0.07, "grad_norm": 2.1297312482886146, "learning_rate": 1.01171875e-06, "loss": 0.5822, "step": 37 }, { "epoch": 0.07, "grad_norm": 1.8584485359413976, "learning_rate": 1.0390625e-06, "loss": 0.4496, "step": 38 }, { "epoch": 0.08, "grad_norm": 2.151288492331948, "learning_rate": 1.06640625e-06, "loss": 0.5686, "step": 39 }, { "epoch": 0.08, "grad_norm": 2.3081499385256077, "learning_rate": 1.09375e-06, "loss": 0.4895, "step": 40 }, { "epoch": 0.08, "grad_norm": 1.814269031221346, "learning_rate": 1.12109375e-06, "loss": 0.4594, "step": 41 }, { "epoch": 0.08, "grad_norm": 1.8048595929506783, "learning_rate": 1.1484375e-06, "loss": 0.4529, "step": 42 }, { "epoch": 0.08, "grad_norm": 1.7914946241126668, "learning_rate": 1.17578125e-06, "loss": 0.454, "step": 43 }, { "epoch": 0.09, "grad_norm": 1.7235953203060048, "learning_rate": 1.203125e-06, "loss": 0.4018, "step": 44 }, { "epoch": 0.09, "grad_norm": 1.7826130500110466, "learning_rate": 1.23046875e-06, "loss": 0.4373, "step": 45 }, { "epoch": 0.09, "grad_norm": 2.0108636192241365, "learning_rate": 1.2578125e-06, "loss": 0.4903, "step": 46 }, { "epoch": 0.09, "grad_norm": 2.0145729261689325, "learning_rate": 1.28515625e-06, "loss": 0.4803, "step": 47 }, { "epoch": 0.09, "grad_norm": 2.065690930495505, "learning_rate": 1.3125e-06, "loss": 0.5295, "step": 48 }, { "epoch": 0.1, "grad_norm": 1.5881761502663563, "learning_rate": 1.33984375e-06, "loss": 0.4314, "step": 49 }, { "epoch": 0.1, "grad_norm": 1.6174445159844109, "learning_rate": 1.3671875e-06, "loss": 0.4784, "step": 50 }, { "epoch": 0.1, "grad_norm": 1.911416540546183, "learning_rate": 1.39453125e-06, "loss": 0.4977, "step": 51 }, { "epoch": 0.1, "grad_norm": 1.7539927778683315, "learning_rate": 1.421875e-06, "loss": 0.4705, "step": 52 }, { "epoch": 0.1, "grad_norm": 1.5780434754792398, "learning_rate": 1.44921875e-06, "loss": 0.4442, "step": 53 }, { "epoch": 0.11, "grad_norm": 1.9533819265513517, "learning_rate": 1.4765624999999999e-06, "loss": 0.4722, "step": 54 }, { "epoch": 0.11, "grad_norm": 1.9070263390537898, "learning_rate": 1.50390625e-06, "loss": 0.4974, "step": 55 }, { "epoch": 0.11, "grad_norm": 1.8006671245648067, "learning_rate": 1.53125e-06, "loss": 0.4697, "step": 56 }, { "epoch": 0.11, "grad_norm": 1.854976044747699, "learning_rate": 1.55859375e-06, "loss": 0.4388, "step": 57 }, { "epoch": 0.11, "grad_norm": 1.9082980409242953, "learning_rate": 1.5859375e-06, "loss": 0.4641, "step": 58 }, { "epoch": 0.12, "grad_norm": 1.6883982780024662, "learning_rate": 1.61328125e-06, "loss": 0.4187, "step": 59 }, { "epoch": 0.12, "grad_norm": 1.8094706167839905, "learning_rate": 1.640625e-06, "loss": 0.4916, "step": 60 }, { "epoch": 0.12, "grad_norm": 1.8328705090506234, "learning_rate": 1.6679687499999999e-06, "loss": 0.5259, "step": 61 }, { "epoch": 0.12, "grad_norm": 2.013141202057912, "learning_rate": 1.6953125e-06, "loss": 0.4772, "step": 62 }, { "epoch": 0.12, "grad_norm": 1.4706555641644756, "learning_rate": 1.72265625e-06, "loss": 0.3947, "step": 63 }, { "epoch": 0.12, "grad_norm": 1.9544483635873917, "learning_rate": 1.75e-06, "loss": 0.5514, "step": 64 }, { "epoch": 0.13, "grad_norm": 1.9081962353339974, "learning_rate": 1.77734375e-06, "loss": 0.4999, "step": 65 }, { "epoch": 0.13, "grad_norm": 1.7629639097208152, "learning_rate": 1.8046875e-06, "loss": 0.4432, "step": 66 }, { "epoch": 0.13, "grad_norm": 1.9204652834062548, "learning_rate": 1.83203125e-06, "loss": 0.5264, "step": 67 }, { "epoch": 0.13, "grad_norm": 1.4696709267567245, "learning_rate": 1.8593749999999999e-06, "loss": 0.3969, "step": 68 }, { "epoch": 0.13, "grad_norm": 1.8726152071120379, "learning_rate": 1.88671875e-06, "loss": 0.5675, "step": 69 }, { "epoch": 0.14, "grad_norm": 1.8771054048125606, "learning_rate": 1.9140625e-06, "loss": 0.4755, "step": 70 }, { "epoch": 0.14, "grad_norm": 1.6556425067938199, "learning_rate": 1.94140625e-06, "loss": 0.4566, "step": 71 }, { "epoch": 0.14, "grad_norm": 1.8466175935628664, "learning_rate": 1.96875e-06, "loss": 0.4427, "step": 72 }, { "epoch": 0.14, "grad_norm": 1.686459695223073, "learning_rate": 1.99609375e-06, "loss": 0.4499, "step": 73 }, { "epoch": 0.14, "grad_norm": 1.7468421881609446, "learning_rate": 2.0234375e-06, "loss": 0.487, "step": 74 }, { "epoch": 0.15, "grad_norm": 2.0709518255548134, "learning_rate": 2.05078125e-06, "loss": 0.5887, "step": 75 }, { "epoch": 0.15, "grad_norm": 1.870827323216515, "learning_rate": 2.078125e-06, "loss": 0.4571, "step": 76 }, { "epoch": 0.15, "grad_norm": 1.550405595027127, "learning_rate": 2.10546875e-06, "loss": 0.3801, "step": 77 }, { "epoch": 0.15, "grad_norm": 1.6908130153708054, "learning_rate": 2.1328125e-06, "loss": 0.4198, "step": 78 }, { "epoch": 0.15, "grad_norm": 1.6644690547119, "learning_rate": 2.16015625e-06, "loss": 0.3685, "step": 79 }, { "epoch": 0.16, "grad_norm": 1.6505264811660256, "learning_rate": 2.1875e-06, "loss": 0.4611, "step": 80 }, { "epoch": 0.16, "grad_norm": 1.7943777269503862, "learning_rate": 2.21484375e-06, "loss": 0.483, "step": 81 }, { "epoch": 0.16, "grad_norm": 1.7248746586664772, "learning_rate": 2.2421875e-06, "loss": 0.4196, "step": 82 }, { "epoch": 0.16, "grad_norm": 1.6599902854892024, "learning_rate": 2.26953125e-06, "loss": 0.4512, "step": 83 }, { "epoch": 0.16, "grad_norm": 1.595109699486187, "learning_rate": 2.296875e-06, "loss": 0.4208, "step": 84 }, { "epoch": 0.17, "grad_norm": 2.024525650847996, "learning_rate": 2.32421875e-06, "loss": 0.4615, "step": 85 }, { "epoch": 0.17, "grad_norm": 1.9175537728598773, "learning_rate": 2.3515625e-06, "loss": 0.5082, "step": 86 }, { "epoch": 0.17, "grad_norm": 1.7244046892816707, "learning_rate": 2.37890625e-06, "loss": 0.4014, "step": 87 }, { "epoch": 0.17, "grad_norm": 2.102644148583947, "learning_rate": 2.40625e-06, "loss": 0.4506, "step": 88 }, { "epoch": 0.17, "grad_norm": 1.6872823542477997, "learning_rate": 2.43359375e-06, "loss": 0.429, "step": 89 }, { "epoch": 0.18, "grad_norm": 1.7650870483851036, "learning_rate": 2.4609375e-06, "loss": 0.4964, "step": 90 }, { "epoch": 0.18, "grad_norm": 1.7477469395085181, "learning_rate": 2.48828125e-06, "loss": 0.3185, "step": 91 }, { "epoch": 0.18, "grad_norm": 1.7189761643109716, "learning_rate": 2.515625e-06, "loss": 0.4627, "step": 92 }, { "epoch": 0.18, "grad_norm": 1.71704519376906, "learning_rate": 2.54296875e-06, "loss": 0.476, "step": 93 }, { "epoch": 0.18, "grad_norm": 1.497017365831549, "learning_rate": 2.5703125e-06, "loss": 0.4144, "step": 94 }, { "epoch": 0.19, "grad_norm": 1.7450525339657175, "learning_rate": 2.59765625e-06, "loss": 0.5124, "step": 95 }, { "epoch": 0.19, "grad_norm": 1.819097613560267, "learning_rate": 2.625e-06, "loss": 0.4888, "step": 96 }, { "epoch": 0.19, "grad_norm": 2.001958147535639, "learning_rate": 2.65234375e-06, "loss": 0.4484, "step": 97 }, { "epoch": 0.19, "grad_norm": 2.1012339012906733, "learning_rate": 2.6796875e-06, "loss": 0.4416, "step": 98 }, { "epoch": 0.19, "grad_norm": 1.7009285137046402, "learning_rate": 2.70703125e-06, "loss": 0.4204, "step": 99 }, { "epoch": 0.2, "grad_norm": 1.6321513012202273, "learning_rate": 2.734375e-06, "loss": 0.3799, "step": 100 }, { "epoch": 0.2, "grad_norm": 1.4461088915192952, "learning_rate": 2.76171875e-06, "loss": 0.3614, "step": 101 }, { "epoch": 0.2, "grad_norm": 1.984531883085068, "learning_rate": 2.7890625e-06, "loss": 0.4088, "step": 102 }, { "epoch": 0.2, "grad_norm": 1.7698381615645395, "learning_rate": 2.81640625e-06, "loss": 0.437, "step": 103 }, { "epoch": 0.2, "grad_norm": 1.8195216294513308, "learning_rate": 2.84375e-06, "loss": 0.461, "step": 104 }, { "epoch": 0.21, "grad_norm": 1.6755177167537636, "learning_rate": 2.87109375e-06, "loss": 0.4087, "step": 105 }, { "epoch": 0.21, "grad_norm": 1.7630699304396844, "learning_rate": 2.8984375e-06, "loss": 0.4924, "step": 106 }, { "epoch": 0.21, "grad_norm": 1.6836260576636768, "learning_rate": 2.92578125e-06, "loss": 0.4397, "step": 107 }, { "epoch": 0.21, "grad_norm": 1.7731819557785997, "learning_rate": 2.9531249999999998e-06, "loss": 0.4014, "step": 108 }, { "epoch": 0.21, "grad_norm": 1.4510457365600038, "learning_rate": 2.98046875e-06, "loss": 0.379, "step": 109 }, { "epoch": 0.21, "grad_norm": 1.6781077550601724, "learning_rate": 3.0078125e-06, "loss": 0.4043, "step": 110 }, { "epoch": 0.22, "grad_norm": 1.6482638605717044, "learning_rate": 3.03515625e-06, "loss": 0.4384, "step": 111 }, { "epoch": 0.22, "grad_norm": 1.4884580935712903, "learning_rate": 3.0625e-06, "loss": 0.4293, "step": 112 }, { "epoch": 0.22, "grad_norm": 1.7711449638950658, "learning_rate": 3.08984375e-06, "loss": 0.4079, "step": 113 }, { "epoch": 0.22, "grad_norm": 1.6112110440434502, "learning_rate": 3.1171875e-06, "loss": 0.3694, "step": 114 }, { "epoch": 0.22, "grad_norm": 1.7881718918617886, "learning_rate": 3.1445312499999998e-06, "loss": 0.3924, "step": 115 }, { "epoch": 0.23, "grad_norm": 1.532453009735892, "learning_rate": 3.171875e-06, "loss": 0.4012, "step": 116 }, { "epoch": 0.23, "grad_norm": 1.9514585945194156, "learning_rate": 3.19921875e-06, "loss": 0.4186, "step": 117 }, { "epoch": 0.23, "grad_norm": 1.441317225391445, "learning_rate": 3.2265625e-06, "loss": 0.3296, "step": 118 }, { "epoch": 0.23, "grad_norm": 1.9292724929102105, "learning_rate": 3.25390625e-06, "loss": 0.5275, "step": 119 }, { "epoch": 0.23, "grad_norm": 1.5025585373215582, "learning_rate": 3.28125e-06, "loss": 0.3646, "step": 120 }, { "epoch": 0.24, "grad_norm": 1.7379270730861727, "learning_rate": 3.30859375e-06, "loss": 0.4164, "step": 121 }, { "epoch": 0.24, "grad_norm": 1.568096337815179, "learning_rate": 3.3359374999999998e-06, "loss": 0.4188, "step": 122 }, { "epoch": 0.24, "grad_norm": 1.6013867697231303, "learning_rate": 3.36328125e-06, "loss": 0.3913, "step": 123 }, { "epoch": 0.24, "grad_norm": 1.6797286223745098, "learning_rate": 3.390625e-06, "loss": 0.3573, "step": 124 }, { "epoch": 0.24, "grad_norm": 1.6282377081510715, "learning_rate": 3.41796875e-06, "loss": 0.4142, "step": 125 }, { "epoch": 0.25, "grad_norm": 1.6843749596830924, "learning_rate": 3.4453125e-06, "loss": 0.4532, "step": 126 }, { "epoch": 0.25, "grad_norm": 1.5689953854434129, "learning_rate": 3.47265625e-06, "loss": 0.3824, "step": 127 }, { "epoch": 0.25, "grad_norm": 1.7087616059981858, "learning_rate": 3.5e-06, "loss": 0.4042, "step": 128 }, { "epoch": 0.25, "grad_norm": 1.8327826872067765, "learning_rate": 3.5273437499999998e-06, "loss": 0.513, "step": 129 }, { "epoch": 0.25, "grad_norm": 1.397404740283114, "learning_rate": 3.5546875e-06, "loss": 0.3597, "step": 130 }, { "epoch": 0.26, "grad_norm": 1.8545103721188279, "learning_rate": 3.58203125e-06, "loss": 0.4128, "step": 131 }, { "epoch": 0.26, "grad_norm": 1.7237575823682803, "learning_rate": 3.609375e-06, "loss": 0.3629, "step": 132 }, { "epoch": 0.26, "grad_norm": 1.591661001527436, "learning_rate": 3.63671875e-06, "loss": 0.3645, "step": 133 }, { "epoch": 0.26, "grad_norm": 1.797306959277975, "learning_rate": 3.6640625e-06, "loss": 0.4391, "step": 134 }, { "epoch": 0.26, "grad_norm": 1.8361743513439444, "learning_rate": 3.69140625e-06, "loss": 0.4416, "step": 135 }, { "epoch": 0.27, "grad_norm": 2.0365410993409645, "learning_rate": 3.7187499999999998e-06, "loss": 0.5031, "step": 136 }, { "epoch": 0.27, "grad_norm": 1.8367234716585397, "learning_rate": 3.74609375e-06, "loss": 0.4779, "step": 137 }, { "epoch": 0.27, "grad_norm": 1.5724371543362001, "learning_rate": 3.7734375e-06, "loss": 0.4078, "step": 138 }, { "epoch": 0.27, "grad_norm": 1.7078402703673015, "learning_rate": 3.80078125e-06, "loss": 0.4266, "step": 139 }, { "epoch": 0.27, "grad_norm": 1.568779979108734, "learning_rate": 3.828125e-06, "loss": 0.4111, "step": 140 }, { "epoch": 0.28, "grad_norm": 1.9936739941200035, "learning_rate": 3.85546875e-06, "loss": 0.4988, "step": 141 }, { "epoch": 0.28, "grad_norm": 2.033967834690862, "learning_rate": 3.8828125e-06, "loss": 0.4425, "step": 142 }, { "epoch": 0.28, "grad_norm": 1.7965156531031705, "learning_rate": 3.91015625e-06, "loss": 0.4275, "step": 143 }, { "epoch": 0.28, "grad_norm": 1.6422101420596205, "learning_rate": 3.9375e-06, "loss": 0.4353, "step": 144 }, { "epoch": 0.28, "grad_norm": 1.912904322477388, "learning_rate": 3.96484375e-06, "loss": 0.5171, "step": 145 }, { "epoch": 0.29, "grad_norm": 1.623122089854728, "learning_rate": 3.9921875e-06, "loss": 0.3784, "step": 146 }, { "epoch": 0.29, "grad_norm": 2.1726292455724012, "learning_rate": 4.01953125e-06, "loss": 0.4885, "step": 147 }, { "epoch": 0.29, "grad_norm": 1.7160902473040487, "learning_rate": 4.046875e-06, "loss": 0.4483, "step": 148 }, { "epoch": 0.29, "grad_norm": 1.6611698446143048, "learning_rate": 4.07421875e-06, "loss": 0.4517, "step": 149 }, { "epoch": 0.29, "grad_norm": 1.6660810433457787, "learning_rate": 4.1015625e-06, "loss": 0.4259, "step": 150 }, { "epoch": 0.29, "grad_norm": 1.8465264569653483, "learning_rate": 4.12890625e-06, "loss": 0.4503, "step": 151 }, { "epoch": 0.3, "grad_norm": 1.7629550903668525, "learning_rate": 4.15625e-06, "loss": 0.4145, "step": 152 }, { "epoch": 0.3, "grad_norm": 2.1202130366413474, "learning_rate": 4.18359375e-06, "loss": 0.4428, "step": 153 }, { "epoch": 0.3, "grad_norm": 1.701465436450958, "learning_rate": 4.2109375e-06, "loss": 0.4375, "step": 154 }, { "epoch": 0.3, "grad_norm": 1.6407280792466103, "learning_rate": 4.23828125e-06, "loss": 0.4255, "step": 155 }, { "epoch": 0.3, "grad_norm": 1.821352668745983, "learning_rate": 4.265625e-06, "loss": 0.362, "step": 156 }, { "epoch": 0.31, "grad_norm": 2.394805597013304, "learning_rate": 4.29296875e-06, "loss": 0.5299, "step": 157 }, { "epoch": 0.31, "grad_norm": 1.7124078622660246, "learning_rate": 4.3203125e-06, "loss": 0.4561, "step": 158 }, { "epoch": 0.31, "grad_norm": 1.7069519837651514, "learning_rate": 4.34765625e-06, "loss": 0.4266, "step": 159 }, { "epoch": 0.31, "grad_norm": 1.7313931655926338, "learning_rate": 4.375e-06, "loss": 0.4293, "step": 160 }, { "epoch": 0.31, "grad_norm": 1.8844043048464512, "learning_rate": 4.40234375e-06, "loss": 0.4173, "step": 161 }, { "epoch": 0.32, "grad_norm": 1.7092015421110573, "learning_rate": 4.4296875e-06, "loss": 0.4249, "step": 162 }, { "epoch": 0.32, "grad_norm": 1.5668713237163892, "learning_rate": 4.45703125e-06, "loss": 0.3622, "step": 163 }, { "epoch": 0.32, "grad_norm": 1.6849391505335922, "learning_rate": 4.484375e-06, "loss": 0.4183, "step": 164 }, { "epoch": 0.32, "grad_norm": 1.801054184825556, "learning_rate": 4.51171875e-06, "loss": 0.4316, "step": 165 }, { "epoch": 0.32, "grad_norm": 1.696185769119538, "learning_rate": 4.5390625e-06, "loss": 0.4416, "step": 166 }, { "epoch": 0.33, "grad_norm": 1.7240282411365215, "learning_rate": 4.56640625e-06, "loss": 0.4453, "step": 167 }, { "epoch": 0.33, "grad_norm": 2.1193931519394087, "learning_rate": 4.59375e-06, "loss": 0.481, "step": 168 }, { "epoch": 0.33, "grad_norm": 1.5575590327208104, "learning_rate": 4.62109375e-06, "loss": 0.4287, "step": 169 }, { "epoch": 0.33, "grad_norm": 1.6355855342748369, "learning_rate": 4.6484375e-06, "loss": 0.4335, "step": 170 }, { "epoch": 0.33, "grad_norm": 1.9378817843761114, "learning_rate": 4.67578125e-06, "loss": 0.4564, "step": 171 }, { "epoch": 0.34, "grad_norm": 1.951031815100943, "learning_rate": 4.703125e-06, "loss": 0.5223, "step": 172 }, { "epoch": 0.34, "grad_norm": 1.8902350569204727, "learning_rate": 4.73046875e-06, "loss": 0.4532, "step": 173 }, { "epoch": 0.34, "grad_norm": 1.803078400680445, "learning_rate": 4.7578125e-06, "loss": 0.435, "step": 174 }, { "epoch": 0.34, "grad_norm": 1.7770622451308296, "learning_rate": 4.78515625e-06, "loss": 0.4004, "step": 175 }, { "epoch": 0.34, "grad_norm": 1.768301829806762, "learning_rate": 4.8125e-06, "loss": 0.4309, "step": 176 }, { "epoch": 0.35, "grad_norm": 1.7693972402629146, "learning_rate": 4.83984375e-06, "loss": 0.3853, "step": 177 }, { "epoch": 0.35, "grad_norm": 1.8036812256391843, "learning_rate": 4.8671875e-06, "loss": 0.4832, "step": 178 }, { "epoch": 0.35, "grad_norm": 1.8682534133832587, "learning_rate": 4.89453125e-06, "loss": 0.4022, "step": 179 }, { "epoch": 0.35, "grad_norm": 1.8971277250017673, "learning_rate": 4.921875e-06, "loss": 0.4281, "step": 180 }, { "epoch": 0.35, "grad_norm": 1.5971046535673123, "learning_rate": 4.94921875e-06, "loss": 0.373, "step": 181 }, { "epoch": 0.36, "grad_norm": 1.9804483462519393, "learning_rate": 4.9765625e-06, "loss": 0.4493, "step": 182 }, { "epoch": 0.36, "grad_norm": 1.5281917284645978, "learning_rate": 5.00390625e-06, "loss": 0.4387, "step": 183 }, { "epoch": 0.36, "grad_norm": 1.592523743219491, "learning_rate": 5.03125e-06, "loss": 0.4117, "step": 184 }, { "epoch": 0.36, "grad_norm": 1.7760285967910396, "learning_rate": 5.05859375e-06, "loss": 0.4623, "step": 185 }, { "epoch": 0.36, "grad_norm": 2.0959205245208863, "learning_rate": 5.0859375e-06, "loss": 0.4623, "step": 186 }, { "epoch": 0.37, "grad_norm": 1.7550282587310155, "learning_rate": 5.11328125e-06, "loss": 0.3707, "step": 187 }, { "epoch": 0.37, "grad_norm": 1.5621360116899734, "learning_rate": 5.140625e-06, "loss": 0.3746, "step": 188 }, { "epoch": 0.37, "grad_norm": 2.0790330911598702, "learning_rate": 5.16796875e-06, "loss": 0.445, "step": 189 }, { "epoch": 0.37, "grad_norm": 1.9413324002508687, "learning_rate": 5.1953125e-06, "loss": 0.4513, "step": 190 }, { "epoch": 0.37, "grad_norm": 1.8149678890264702, "learning_rate": 5.22265625e-06, "loss": 0.4247, "step": 191 }, { "epoch": 0.38, "grad_norm": 1.9996002234613923, "learning_rate": 5.25e-06, "loss": 0.483, "step": 192 }, { "epoch": 0.38, "grad_norm": 1.7930966343215924, "learning_rate": 5.27734375e-06, "loss": 0.4251, "step": 193 }, { "epoch": 0.38, "grad_norm": 1.658210172615541, "learning_rate": 5.3046875e-06, "loss": 0.3867, "step": 194 }, { "epoch": 0.38, "grad_norm": 1.877592894136001, "learning_rate": 5.33203125e-06, "loss": 0.3832, "step": 195 }, { "epoch": 0.38, "grad_norm": 1.8328266694263893, "learning_rate": 5.359375e-06, "loss": 0.4324, "step": 196 }, { "epoch": 0.38, "grad_norm": 1.7385711118597726, "learning_rate": 5.38671875e-06, "loss": 0.4195, "step": 197 }, { "epoch": 0.39, "grad_norm": 1.5967196701317956, "learning_rate": 5.4140625e-06, "loss": 0.4165, "step": 198 }, { "epoch": 0.39, "grad_norm": 1.6283665320681795, "learning_rate": 5.44140625e-06, "loss": 0.3906, "step": 199 }, { "epoch": 0.39, "grad_norm": 1.5159797221628395, "learning_rate": 5.46875e-06, "loss": 0.4237, "step": 200 }, { "epoch": 0.39, "grad_norm": 1.803886745826042, "learning_rate": 5.49609375e-06, "loss": 0.3558, "step": 201 }, { "epoch": 0.39, "grad_norm": 1.876429799515415, "learning_rate": 5.5234375e-06, "loss": 0.3974, "step": 202 }, { "epoch": 0.4, "grad_norm": 1.5425670146491983, "learning_rate": 5.55078125e-06, "loss": 0.3348, "step": 203 }, { "epoch": 0.4, "grad_norm": 1.6459168004273308, "learning_rate": 5.578125e-06, "loss": 0.4219, "step": 204 }, { "epoch": 0.4, "grad_norm": 1.915568110920411, "learning_rate": 5.60546875e-06, "loss": 0.4291, "step": 205 }, { "epoch": 0.4, "grad_norm": 1.7806053811825309, "learning_rate": 5.6328125e-06, "loss": 0.4655, "step": 206 }, { "epoch": 0.4, "grad_norm": 1.6418504022502025, "learning_rate": 5.66015625e-06, "loss": 0.409, "step": 207 }, { "epoch": 0.41, "grad_norm": 1.8500313301769666, "learning_rate": 5.6875e-06, "loss": 0.4519, "step": 208 }, { "epoch": 0.41, "grad_norm": 1.5434160976939948, "learning_rate": 5.7148437499999996e-06, "loss": 0.3783, "step": 209 }, { "epoch": 0.41, "grad_norm": 1.5911856225169334, "learning_rate": 5.7421875e-06, "loss": 0.3827, "step": 210 }, { "epoch": 0.41, "grad_norm": 1.8008531140016548, "learning_rate": 5.76953125e-06, "loss": 0.4285, "step": 211 }, { "epoch": 0.41, "grad_norm": 1.7182258459224742, "learning_rate": 5.796875e-06, "loss": 0.4174, "step": 212 }, { "epoch": 0.42, "grad_norm": 2.0480616280504615, "learning_rate": 5.82421875e-06, "loss": 0.4272, "step": 213 }, { "epoch": 0.42, "grad_norm": 1.6471482622583642, "learning_rate": 5.8515625e-06, "loss": 0.3477, "step": 214 }, { "epoch": 0.42, "grad_norm": 1.6111730599484522, "learning_rate": 5.87890625e-06, "loss": 0.3707, "step": 215 }, { "epoch": 0.42, "grad_norm": 2.0138263618933783, "learning_rate": 5.9062499999999996e-06, "loss": 0.394, "step": 216 }, { "epoch": 0.42, "grad_norm": 1.5753463628441053, "learning_rate": 5.93359375e-06, "loss": 0.3833, "step": 217 }, { "epoch": 0.43, "grad_norm": 1.767102718540825, "learning_rate": 5.9609375e-06, "loss": 0.4506, "step": 218 }, { "epoch": 0.43, "grad_norm": 1.8257088094442036, "learning_rate": 5.98828125e-06, "loss": 0.4622, "step": 219 }, { "epoch": 0.43, "grad_norm": 1.9833804053555721, "learning_rate": 6.015625e-06, "loss": 0.4697, "step": 220 }, { "epoch": 0.43, "grad_norm": 1.846406147814906, "learning_rate": 6.04296875e-06, "loss": 0.4011, "step": 221 }, { "epoch": 0.43, "grad_norm": 1.9183577730861174, "learning_rate": 6.0703125e-06, "loss": 0.4713, "step": 222 }, { "epoch": 0.44, "grad_norm": 1.7362083162488955, "learning_rate": 6.0976562499999996e-06, "loss": 0.3846, "step": 223 }, { "epoch": 0.44, "grad_norm": 1.8193332882472566, "learning_rate": 6.125e-06, "loss": 0.3929, "step": 224 }, { "epoch": 0.44, "grad_norm": 1.9325778614179148, "learning_rate": 6.15234375e-06, "loss": 0.5014, "step": 225 }, { "epoch": 0.44, "grad_norm": 1.616759750331512, "learning_rate": 6.1796875e-06, "loss": 0.3841, "step": 226 }, { "epoch": 0.44, "grad_norm": 2.0221131856658285, "learning_rate": 6.20703125e-06, "loss": 0.4692, "step": 227 }, { "epoch": 0.45, "grad_norm": 1.6983120929103905, "learning_rate": 6.234375e-06, "loss": 0.412, "step": 228 }, { "epoch": 0.45, "grad_norm": 1.6228158943041258, "learning_rate": 6.26171875e-06, "loss": 0.3527, "step": 229 }, { "epoch": 0.45, "grad_norm": 1.8094886707624351, "learning_rate": 6.2890624999999996e-06, "loss": 0.4393, "step": 230 }, { "epoch": 0.45, "grad_norm": 2.13302596832327, "learning_rate": 6.31640625e-06, "loss": 0.4933, "step": 231 }, { "epoch": 0.45, "grad_norm": 1.670652264440813, "learning_rate": 6.34375e-06, "loss": 0.4102, "step": 232 }, { "epoch": 0.46, "grad_norm": 1.5620467239945488, "learning_rate": 6.37109375e-06, "loss": 0.4271, "step": 233 }, { "epoch": 0.46, "grad_norm": 1.7953067660463502, "learning_rate": 6.3984375e-06, "loss": 0.4577, "step": 234 }, { "epoch": 0.46, "grad_norm": 1.6420061549742626, "learning_rate": 6.42578125e-06, "loss": 0.3694, "step": 235 }, { "epoch": 0.46, "grad_norm": 1.5903224612784936, "learning_rate": 6.453125e-06, "loss": 0.4247, "step": 236 }, { "epoch": 0.46, "grad_norm": 1.7682300681683656, "learning_rate": 6.4804687499999995e-06, "loss": 0.4037, "step": 237 }, { "epoch": 0.46, "grad_norm": 1.6459772408768538, "learning_rate": 6.5078125e-06, "loss": 0.4275, "step": 238 }, { "epoch": 0.47, "grad_norm": 1.624638337065052, "learning_rate": 6.53515625e-06, "loss": 0.3937, "step": 239 }, { "epoch": 0.47, "grad_norm": 1.7827197326034063, "learning_rate": 6.5625e-06, "loss": 0.4188, "step": 240 }, { "epoch": 0.47, "grad_norm": 1.5191240429659272, "learning_rate": 6.58984375e-06, "loss": 0.3876, "step": 241 }, { "epoch": 0.47, "grad_norm": 1.6314704317180526, "learning_rate": 6.6171875e-06, "loss": 0.4315, "step": 242 }, { "epoch": 0.47, "grad_norm": 1.6584817931539928, "learning_rate": 6.64453125e-06, "loss": 0.3955, "step": 243 }, { "epoch": 0.48, "grad_norm": 1.4810404791693625, "learning_rate": 6.6718749999999995e-06, "loss": 0.3432, "step": 244 }, { "epoch": 0.48, "grad_norm": 1.540917732358922, "learning_rate": 6.69921875e-06, "loss": 0.4112, "step": 245 }, { "epoch": 0.48, "grad_norm": 1.8194404261529604, "learning_rate": 6.7265625e-06, "loss": 0.4812, "step": 246 }, { "epoch": 0.48, "grad_norm": 1.5845515731634556, "learning_rate": 6.75390625e-06, "loss": 0.4092, "step": 247 }, { "epoch": 0.48, "grad_norm": 1.82771937654066, "learning_rate": 6.78125e-06, "loss": 0.4134, "step": 248 }, { "epoch": 0.49, "grad_norm": 1.4702155965217318, "learning_rate": 6.80859375e-06, "loss": 0.3741, "step": 249 }, { "epoch": 0.49, "grad_norm": 1.5473416881484212, "learning_rate": 6.8359375e-06, "loss": 0.4081, "step": 250 }, { "epoch": 0.49, "grad_norm": 1.7644979128479044, "learning_rate": 6.8632812499999995e-06, "loss": 0.4252, "step": 251 }, { "epoch": 0.49, "grad_norm": 1.7298970928095976, "learning_rate": 6.890625e-06, "loss": 0.3672, "step": 252 }, { "epoch": 0.49, "grad_norm": 1.4941448201429415, "learning_rate": 6.91796875e-06, "loss": 0.3216, "step": 253 }, { "epoch": 0.5, "grad_norm": 1.8025332973409465, "learning_rate": 6.9453125e-06, "loss": 0.4435, "step": 254 }, { "epoch": 0.5, "grad_norm": 1.5659309208044063, "learning_rate": 6.97265625e-06, "loss": 0.4513, "step": 255 }, { "epoch": 0.5, "grad_norm": 1.6108925499738436, "learning_rate": 7e-06, "loss": 0.4075, "step": 256 }, { "epoch": 0.5, "grad_norm": 1.77966493308843, "learning_rate": 6.999996746335437e-06, "loss": 0.4145, "step": 257 }, { "epoch": 0.5, "grad_norm": 1.8142486979804886, "learning_rate": 6.9999869853477956e-06, "loss": 0.3775, "step": 258 }, { "epoch": 0.51, "grad_norm": 2.272958151403246, "learning_rate": 6.999970717055227e-06, "loss": 0.4787, "step": 259 }, { "epoch": 0.51, "grad_norm": 1.5975068204571796, "learning_rate": 6.9999479414879755e-06, "loss": 0.4188, "step": 260 }, { "epoch": 0.51, "grad_norm": 1.6728586747493654, "learning_rate": 6.999918658688386e-06, "loss": 0.4457, "step": 261 }, { "epoch": 0.51, "grad_norm": 1.6348269617320335, "learning_rate": 6.9998828687109035e-06, "loss": 0.4531, "step": 262 }, { "epoch": 0.51, "grad_norm": 1.8843662526503226, "learning_rate": 6.99984057162207e-06, "loss": 0.4611, "step": 263 }, { "epoch": 0.52, "grad_norm": 1.5603473859248387, "learning_rate": 6.999791767500524e-06, "loss": 0.4059, "step": 264 }, { "epoch": 0.52, "grad_norm": 1.4976200907504218, "learning_rate": 6.999736456437006e-06, "loss": 0.4013, "step": 265 }, { "epoch": 0.52, "grad_norm": 1.439549792077437, "learning_rate": 6.9996746385343505e-06, "loss": 0.3685, "step": 266 }, { "epoch": 0.52, "grad_norm": 1.4059876237382736, "learning_rate": 6.999606313907494e-06, "loss": 0.4143, "step": 267 }, { "epoch": 0.52, "grad_norm": 1.8648547436486331, "learning_rate": 6.999531482683467e-06, "loss": 0.4408, "step": 268 }, { "epoch": 0.53, "grad_norm": 1.5506316605732253, "learning_rate": 6.999450145001397e-06, "loss": 0.3603, "step": 269 }, { "epoch": 0.53, "grad_norm": 1.675360390781692, "learning_rate": 6.999362301012511e-06, "loss": 0.4172, "step": 270 }, { "epoch": 0.53, "grad_norm": 1.7414161251496474, "learning_rate": 6.999267950880133e-06, "loss": 0.4247, "step": 271 }, { "epoch": 0.53, "grad_norm": 1.6272872690938545, "learning_rate": 6.999167094779681e-06, "loss": 0.4757, "step": 272 }, { "epoch": 0.53, "grad_norm": 1.5072493607292983, "learning_rate": 6.999059732898672e-06, "loss": 0.4261, "step": 273 }, { "epoch": 0.54, "grad_norm": 1.618729357902901, "learning_rate": 6.998945865436715e-06, "loss": 0.371, "step": 274 }, { "epoch": 0.54, "grad_norm": 1.8371028764124393, "learning_rate": 6.998825492605517e-06, "loss": 0.4292, "step": 275 }, { "epoch": 0.54, "grad_norm": 1.7815557389517658, "learning_rate": 6.998698614628881e-06, "loss": 0.458, "step": 276 }, { "epoch": 0.54, "grad_norm": 1.3487967207631975, "learning_rate": 6.998565231742702e-06, "loss": 0.3742, "step": 277 }, { "epoch": 0.54, "grad_norm": 1.7595339290603802, "learning_rate": 6.99842534419497e-06, "loss": 0.4128, "step": 278 }, { "epoch": 0.54, "grad_norm": 1.4296098808923818, "learning_rate": 6.99827895224577e-06, "loss": 0.3506, "step": 279 }, { "epoch": 0.55, "grad_norm": 1.5617017085687892, "learning_rate": 6.99812605616728e-06, "loss": 0.3398, "step": 280 }, { "epoch": 0.55, "grad_norm": 1.6448744422173096, "learning_rate": 6.997966656243768e-06, "loss": 0.4404, "step": 281 }, { "epoch": 0.55, "grad_norm": 1.7710041949610176, "learning_rate": 6.997800752771598e-06, "loss": 0.4472, "step": 282 }, { "epoch": 0.55, "grad_norm": 1.7938493012423797, "learning_rate": 6.997628346059223e-06, "loss": 0.5032, "step": 283 }, { "epoch": 0.55, "grad_norm": 1.6991271104380439, "learning_rate": 6.997449436427188e-06, "loss": 0.3976, "step": 284 }, { "epoch": 0.56, "grad_norm": 1.814785109040746, "learning_rate": 6.997264024208129e-06, "loss": 0.4199, "step": 285 }, { "epoch": 0.56, "grad_norm": 1.790270879255454, "learning_rate": 6.99707210974677e-06, "loss": 0.4248, "step": 286 }, { "epoch": 0.56, "grad_norm": 1.5854467335575229, "learning_rate": 6.9968736933999275e-06, "loss": 0.424, "step": 287 }, { "epoch": 0.56, "grad_norm": 1.7738595428942683, "learning_rate": 6.996668775536502e-06, "loss": 0.4171, "step": 288 }, { "epoch": 0.56, "grad_norm": 1.4548281088689705, "learning_rate": 6.996457356537486e-06, "loss": 0.3852, "step": 289 }, { "epoch": 0.57, "grad_norm": 1.5598518160481014, "learning_rate": 6.996239436795957e-06, "loss": 0.407, "step": 290 }, { "epoch": 0.57, "grad_norm": 1.7457374698389236, "learning_rate": 6.996015016717079e-06, "loss": 0.4751, "step": 291 }, { "epoch": 0.57, "grad_norm": 1.7892545587912443, "learning_rate": 6.9957840967181034e-06, "loss": 0.4649, "step": 292 }, { "epoch": 0.57, "grad_norm": 1.711583196888272, "learning_rate": 6.9955466772283635e-06, "loss": 0.4476, "step": 293 }, { "epoch": 0.57, "grad_norm": 1.569578930585416, "learning_rate": 6.99530275868928e-06, "loss": 0.3878, "step": 294 }, { "epoch": 0.58, "grad_norm": 1.6210245712859779, "learning_rate": 6.995052341554354e-06, "loss": 0.3853, "step": 295 }, { "epoch": 0.58, "grad_norm": 1.5298641740153949, "learning_rate": 6.994795426289171e-06, "loss": 0.394, "step": 296 }, { "epoch": 0.58, "grad_norm": 1.830746670645888, "learning_rate": 6.9945320133713965e-06, "loss": 0.4243, "step": 297 }, { "epoch": 0.58, "grad_norm": 1.7764045845400278, "learning_rate": 6.994262103290778e-06, "loss": 0.4219, "step": 298 }, { "epoch": 0.58, "grad_norm": 1.495613922657766, "learning_rate": 6.993985696549143e-06, "loss": 0.4044, "step": 299 }, { "epoch": 0.59, "grad_norm": 1.8562530988174428, "learning_rate": 6.993702793660396e-06, "loss": 0.4095, "step": 300 }, { "epoch": 0.59, "grad_norm": 1.577549340169475, "learning_rate": 6.993413395150521e-06, "loss": 0.4212, "step": 301 }, { "epoch": 0.59, "grad_norm": 1.6340408304152676, "learning_rate": 6.99311750155758e-06, "loss": 0.3566, "step": 302 }, { "epoch": 0.59, "grad_norm": 1.6739183573971523, "learning_rate": 6.992815113431707e-06, "loss": 0.4461, "step": 303 }, { "epoch": 0.59, "grad_norm": 1.462385931910692, "learning_rate": 6.992506231335112e-06, "loss": 0.332, "step": 304 }, { "epoch": 0.6, "grad_norm": 1.5431194128001153, "learning_rate": 6.992190855842082e-06, "loss": 0.4381, "step": 305 }, { "epoch": 0.6, "grad_norm": 1.7307635989309542, "learning_rate": 6.9918689875389766e-06, "loss": 0.3948, "step": 306 }, { "epoch": 0.6, "grad_norm": 1.6578996527982661, "learning_rate": 6.991540627024222e-06, "loss": 0.3607, "step": 307 }, { "epoch": 0.6, "grad_norm": 1.6522456527203302, "learning_rate": 6.991205774908319e-06, "loss": 0.3908, "step": 308 }, { "epoch": 0.6, "grad_norm": 1.5423980636914572, "learning_rate": 6.990864431813838e-06, "loss": 0.4198, "step": 309 }, { "epoch": 0.61, "grad_norm": 1.4239210983712924, "learning_rate": 6.990516598375416e-06, "loss": 0.336, "step": 310 }, { "epoch": 0.61, "grad_norm": 1.7826550137500996, "learning_rate": 6.990162275239758e-06, "loss": 0.3912, "step": 311 }, { "epoch": 0.61, "grad_norm": 1.5932384719200907, "learning_rate": 6.9898014630656335e-06, "loss": 0.3614, "step": 312 }, { "epoch": 0.61, "grad_norm": 1.4464847436342674, "learning_rate": 6.989434162523879e-06, "loss": 0.4149, "step": 313 }, { "epoch": 0.61, "grad_norm": 1.8407620839879029, "learning_rate": 6.9890603742973934e-06, "loss": 0.3662, "step": 314 }, { "epoch": 0.62, "grad_norm": 1.6313470630471536, "learning_rate": 6.988680099081137e-06, "loss": 0.4567, "step": 315 }, { "epoch": 0.62, "grad_norm": 1.5597838145742624, "learning_rate": 6.988293337582131e-06, "loss": 0.3646, "step": 316 }, { "epoch": 0.62, "grad_norm": 1.6996876587818344, "learning_rate": 6.987900090519458e-06, "loss": 0.4091, "step": 317 }, { "epoch": 0.62, "grad_norm": 1.671377835993801, "learning_rate": 6.987500358624256e-06, "loss": 0.3909, "step": 318 }, { "epoch": 0.62, "grad_norm": 1.584455027676944, "learning_rate": 6.987094142639722e-06, "loss": 0.4256, "step": 319 }, { "epoch": 0.62, "grad_norm": 1.6233110426094417, "learning_rate": 6.9866814433211094e-06, "loss": 0.3491, "step": 320 }, { "epoch": 0.63, "grad_norm": 1.5949271935990748, "learning_rate": 6.986262261435721e-06, "loss": 0.4298, "step": 321 }, { "epoch": 0.63, "grad_norm": 1.6835999347155548, "learning_rate": 6.985836597762917e-06, "loss": 0.3586, "step": 322 }, { "epoch": 0.63, "grad_norm": 1.7817003734481303, "learning_rate": 6.985404453094107e-06, "loss": 0.4084, "step": 323 }, { "epoch": 0.63, "grad_norm": 1.7379187834920424, "learning_rate": 6.984965828232749e-06, "loss": 0.4166, "step": 324 }, { "epoch": 0.63, "grad_norm": 1.754191755440118, "learning_rate": 6.984520723994351e-06, "loss": 0.4714, "step": 325 }, { "epoch": 0.64, "grad_norm": 1.7124412500079302, "learning_rate": 6.984069141206469e-06, "loss": 0.3732, "step": 326 }, { "epoch": 0.64, "grad_norm": 1.6585965943080438, "learning_rate": 6.983611080708701e-06, "loss": 0.409, "step": 327 }, { "epoch": 0.64, "grad_norm": 1.8475880006787537, "learning_rate": 6.983146543352689e-06, "loss": 0.4432, "step": 328 }, { "epoch": 0.64, "grad_norm": 1.4808005984802626, "learning_rate": 6.982675530002119e-06, "loss": 0.3787, "step": 329 }, { "epoch": 0.64, "grad_norm": 1.5609198179240116, "learning_rate": 6.9821980415327175e-06, "loss": 0.4176, "step": 330 }, { "epoch": 0.65, "grad_norm": 1.5552952593528626, "learning_rate": 6.981714078832247e-06, "loss": 0.3688, "step": 331 }, { "epoch": 0.65, "grad_norm": 2.0850811811060863, "learning_rate": 6.981223642800509e-06, "loss": 0.49, "step": 332 }, { "epoch": 0.65, "grad_norm": 1.5953674519870735, "learning_rate": 6.980726734349341e-06, "loss": 0.4053, "step": 333 }, { "epoch": 0.65, "grad_norm": 1.5591062111725176, "learning_rate": 6.980223354402614e-06, "loss": 0.3887, "step": 334 }, { "epoch": 0.65, "grad_norm": 1.6175165617739473, "learning_rate": 6.979713503896228e-06, "loss": 0.3874, "step": 335 }, { "epoch": 0.66, "grad_norm": 1.464925732886883, "learning_rate": 6.979197183778118e-06, "loss": 0.3434, "step": 336 }, { "epoch": 0.66, "grad_norm": 1.6147782164668136, "learning_rate": 6.978674395008247e-06, "loss": 0.3978, "step": 337 }, { "epoch": 0.66, "grad_norm": 1.6105099225677524, "learning_rate": 6.978145138558598e-06, "loss": 0.3667, "step": 338 }, { "epoch": 0.66, "grad_norm": 1.4343309029868834, "learning_rate": 6.97760941541319e-06, "loss": 0.3428, "step": 339 }, { "epoch": 0.66, "grad_norm": 1.4926215457885592, "learning_rate": 6.977067226568055e-06, "loss": 0.4336, "step": 340 }, { "epoch": 0.67, "grad_norm": 1.9587756923518749, "learning_rate": 6.9765185730312525e-06, "loss": 0.4411, "step": 341 }, { "epoch": 0.67, "grad_norm": 1.5831316437878091, "learning_rate": 6.975963455822859e-06, "loss": 0.3742, "step": 342 }, { "epoch": 0.67, "grad_norm": 1.3904678462242204, "learning_rate": 6.975401875974969e-06, "loss": 0.3816, "step": 343 }, { "epoch": 0.67, "grad_norm": 1.8450465013762163, "learning_rate": 6.974833834531692e-06, "loss": 0.443, "step": 344 }, { "epoch": 0.67, "grad_norm": 1.4025660610923578, "learning_rate": 6.974259332549153e-06, "loss": 0.3887, "step": 345 }, { "epoch": 0.68, "grad_norm": 1.747553311044943, "learning_rate": 6.973678371095485e-06, "loss": 0.4579, "step": 346 }, { "epoch": 0.68, "grad_norm": 1.5514347569637017, "learning_rate": 6.9730909512508345e-06, "loss": 0.404, "step": 347 }, { "epoch": 0.68, "grad_norm": 1.469137452994614, "learning_rate": 6.972497074107354e-06, "loss": 0.4202, "step": 348 }, { "epoch": 0.68, "grad_norm": 1.587244059409342, "learning_rate": 6.971896740769201e-06, "loss": 0.412, "step": 349 }, { "epoch": 0.68, "grad_norm": 1.546804646814895, "learning_rate": 6.971289952352539e-06, "loss": 0.3978, "step": 350 }, { "epoch": 0.69, "grad_norm": 1.7275777403364916, "learning_rate": 6.970676709985529e-06, "loss": 0.4671, "step": 351 }, { "epoch": 0.69, "grad_norm": 1.4475153535492766, "learning_rate": 6.970057014808337e-06, "loss": 0.3931, "step": 352 }, { "epoch": 0.69, "grad_norm": 1.4038363504969085, "learning_rate": 6.96943086797312e-06, "loss": 0.3732, "step": 353 }, { "epoch": 0.69, "grad_norm": 1.8143347932657685, "learning_rate": 6.9687982706440355e-06, "loss": 0.429, "step": 354 }, { "epoch": 0.69, "grad_norm": 1.7155309052358394, "learning_rate": 6.968159223997229e-06, "loss": 0.4214, "step": 355 }, { "epoch": 0.7, "grad_norm": 1.4010011419167332, "learning_rate": 6.967513729220844e-06, "loss": 0.3522, "step": 356 }, { "epoch": 0.7, "grad_norm": 1.488092808879101, "learning_rate": 6.966861787515006e-06, "loss": 0.3733, "step": 357 }, { "epoch": 0.7, "grad_norm": 1.7063341786321782, "learning_rate": 6.966203400091827e-06, "loss": 0.4259, "step": 358 }, { "epoch": 0.7, "grad_norm": 1.528182447140462, "learning_rate": 6.965538568175408e-06, "loss": 0.4388, "step": 359 }, { "epoch": 0.7, "grad_norm": 1.7379367259248975, "learning_rate": 6.964867293001827e-06, "loss": 0.4247, "step": 360 }, { "epoch": 0.71, "grad_norm": 1.7638496207431, "learning_rate": 6.964189575819146e-06, "loss": 0.412, "step": 361 }, { "epoch": 0.71, "grad_norm": 1.3737417230231808, "learning_rate": 6.9635054178874e-06, "loss": 0.3595, "step": 362 }, { "epoch": 0.71, "grad_norm": 1.7460269296453368, "learning_rate": 6.9628148204786e-06, "loss": 0.4214, "step": 363 }, { "epoch": 0.71, "grad_norm": 1.7756434203830385, "learning_rate": 6.962117784876734e-06, "loss": 0.4263, "step": 364 }, { "epoch": 0.71, "grad_norm": 1.411900128529412, "learning_rate": 6.961414312377751e-06, "loss": 0.3805, "step": 365 }, { "epoch": 0.71, "grad_norm": 1.637825005533128, "learning_rate": 6.960704404289577e-06, "loss": 0.4542, "step": 366 }, { "epoch": 0.72, "grad_norm": 1.65547546157827, "learning_rate": 6.959988061932097e-06, "loss": 0.4396, "step": 367 }, { "epoch": 0.72, "grad_norm": 1.578911023734177, "learning_rate": 6.959265286637163e-06, "loss": 0.4077, "step": 368 }, { "epoch": 0.72, "grad_norm": 1.477259737485903, "learning_rate": 6.958536079748583e-06, "loss": 0.3818, "step": 369 }, { "epoch": 0.72, "grad_norm": 1.542943713029193, "learning_rate": 6.957800442622129e-06, "loss": 0.3817, "step": 370 }, { "epoch": 0.72, "grad_norm": 1.7088848436119706, "learning_rate": 6.95705837662552e-06, "loss": 0.4277, "step": 371 }, { "epoch": 0.73, "grad_norm": 1.594744438125576, "learning_rate": 6.956309883138437e-06, "loss": 0.4209, "step": 372 }, { "epoch": 0.73, "grad_norm": 1.750756477208487, "learning_rate": 6.9555549635525045e-06, "loss": 0.4307, "step": 373 }, { "epoch": 0.73, "grad_norm": 1.4222332863108347, "learning_rate": 6.954793619271297e-06, "loss": 0.3786, "step": 374 }, { "epoch": 0.73, "grad_norm": 1.703653349923959, "learning_rate": 6.954025851710333e-06, "loss": 0.4549, "step": 375 }, { "epoch": 0.73, "grad_norm": 1.448077978304154, "learning_rate": 6.953251662297077e-06, "loss": 0.3848, "step": 376 }, { "epoch": 0.74, "grad_norm": 1.4736599600809437, "learning_rate": 6.952471052470927e-06, "loss": 0.3737, "step": 377 }, { "epoch": 0.74, "grad_norm": 1.6233917090580536, "learning_rate": 6.9516840236832244e-06, "loss": 0.3839, "step": 378 }, { "epoch": 0.74, "grad_norm": 1.5182803184121305, "learning_rate": 6.9508905773972405e-06, "loss": 0.3964, "step": 379 }, { "epoch": 0.74, "grad_norm": 1.575758940023686, "learning_rate": 6.950090715088181e-06, "loss": 0.4002, "step": 380 }, { "epoch": 0.74, "grad_norm": 1.6690277269976965, "learning_rate": 6.949284438243179e-06, "loss": 0.4251, "step": 381 }, { "epoch": 0.75, "grad_norm": 1.7927706168550368, "learning_rate": 6.9484717483612935e-06, "loss": 0.3813, "step": 382 }, { "epoch": 0.75, "grad_norm": 1.5231852548815457, "learning_rate": 6.947652646953509e-06, "loss": 0.377, "step": 383 }, { "epoch": 0.75, "grad_norm": 1.6190206471371613, "learning_rate": 6.946827135542729e-06, "loss": 0.3952, "step": 384 }, { "epoch": 0.75, "grad_norm": 1.7235521194452834, "learning_rate": 6.945995215663772e-06, "loss": 0.4246, "step": 385 }, { "epoch": 0.75, "grad_norm": 1.4413443696969053, "learning_rate": 6.945156888863377e-06, "loss": 0.3215, "step": 386 }, { "epoch": 0.76, "grad_norm": 1.6801149180853931, "learning_rate": 6.944312156700191e-06, "loss": 0.4338, "step": 387 }, { "epoch": 0.76, "grad_norm": 1.6346618496809786, "learning_rate": 6.94346102074477e-06, "loss": 0.4068, "step": 388 }, { "epoch": 0.76, "grad_norm": 1.95536040070131, "learning_rate": 6.942603482579581e-06, "loss": 0.3845, "step": 389 }, { "epoch": 0.76, "grad_norm": 1.638582102804758, "learning_rate": 6.9417395437989875e-06, "loss": 0.3335, "step": 390 }, { "epoch": 0.76, "grad_norm": 1.7255078365743213, "learning_rate": 6.9408692060092574e-06, "loss": 0.4317, "step": 391 }, { "epoch": 0.77, "grad_norm": 1.7563477059333192, "learning_rate": 6.939992470828554e-06, "loss": 0.406, "step": 392 }, { "epoch": 0.77, "grad_norm": 1.7159009438447363, "learning_rate": 6.939109339886937e-06, "loss": 0.3935, "step": 393 }, { "epoch": 0.77, "grad_norm": 1.5045863141262417, "learning_rate": 6.938219814826355e-06, "loss": 0.3936, "step": 394 }, { "epoch": 0.77, "grad_norm": 1.6612557919287263, "learning_rate": 6.937323897300646e-06, "loss": 0.4289, "step": 395 }, { "epoch": 0.77, "grad_norm": 1.4726931989162806, "learning_rate": 6.936421588975533e-06, "loss": 0.3645, "step": 396 }, { "epoch": 0.78, "grad_norm": 1.4331570440953625, "learning_rate": 6.935512891528622e-06, "loss": 0.3441, "step": 397 }, { "epoch": 0.78, "grad_norm": 1.6117712817484529, "learning_rate": 6.934597806649395e-06, "loss": 0.3915, "step": 398 }, { "epoch": 0.78, "grad_norm": 1.867798016246586, "learning_rate": 6.9336763360392125e-06, "loss": 0.4515, "step": 399 }, { "epoch": 0.78, "grad_norm": 1.6257274256792371, "learning_rate": 6.932748481411306e-06, "loss": 0.394, "step": 400 }, { "epoch": 0.78, "grad_norm": 1.8739225952317118, "learning_rate": 6.931814244490778e-06, "loss": 0.4416, "step": 401 }, { "epoch": 0.79, "grad_norm": 1.260935959336582, "learning_rate": 6.930873627014596e-06, "loss": 0.3331, "step": 402 }, { "epoch": 0.79, "grad_norm": 1.306378785567923, "learning_rate": 6.929926630731591e-06, "loss": 0.3495, "step": 403 }, { "epoch": 0.79, "grad_norm": 1.6956467660265793, "learning_rate": 6.928973257402453e-06, "loss": 0.443, "step": 404 }, { "epoch": 0.79, "grad_norm": 1.634613319223637, "learning_rate": 6.928013508799728e-06, "loss": 0.3903, "step": 405 }, { "epoch": 0.79, "grad_norm": 1.8549535817691105, "learning_rate": 6.9270473867078185e-06, "loss": 0.4018, "step": 406 }, { "epoch": 0.79, "grad_norm": 1.5251986695471493, "learning_rate": 6.926074892922971e-06, "loss": 0.3611, "step": 407 }, { "epoch": 0.8, "grad_norm": 1.7325265391081326, "learning_rate": 6.925096029253284e-06, "loss": 0.3777, "step": 408 }, { "epoch": 0.8, "grad_norm": 1.4556726153659625, "learning_rate": 6.924110797518696e-06, "loss": 0.3952, "step": 409 }, { "epoch": 0.8, "grad_norm": 1.6463628802326389, "learning_rate": 6.923119199550988e-06, "loss": 0.4288, "step": 410 }, { "epoch": 0.8, "grad_norm": 1.7694427122732754, "learning_rate": 6.922121237193773e-06, "loss": 0.4354, "step": 411 }, { "epoch": 0.8, "grad_norm": 1.5130698215361666, "learning_rate": 6.921116912302502e-06, "loss": 0.4194, "step": 412 }, { "epoch": 0.81, "grad_norm": 1.6548223032184335, "learning_rate": 6.92010622674445e-06, "loss": 0.3652, "step": 413 }, { "epoch": 0.81, "grad_norm": 1.4453953465720348, "learning_rate": 6.919089182398723e-06, "loss": 0.367, "step": 414 }, { "epoch": 0.81, "grad_norm": 1.642643072613519, "learning_rate": 6.918065781156246e-06, "loss": 0.3644, "step": 415 }, { "epoch": 0.81, "grad_norm": 1.387462565513097, "learning_rate": 6.917036024919767e-06, "loss": 0.3548, "step": 416 }, { "epoch": 0.81, "grad_norm": 1.7434517401904146, "learning_rate": 6.915999915603844e-06, "loss": 0.4132, "step": 417 }, { "epoch": 0.82, "grad_norm": 1.347085307300691, "learning_rate": 6.9149574551348496e-06, "loss": 0.3992, "step": 418 }, { "epoch": 0.82, "grad_norm": 1.6770686226102254, "learning_rate": 6.913908645450967e-06, "loss": 0.4391, "step": 419 }, { "epoch": 0.82, "grad_norm": 1.5019992634460038, "learning_rate": 6.912853488502181e-06, "loss": 0.4043, "step": 420 }, { "epoch": 0.82, "grad_norm": 1.4768304665707173, "learning_rate": 6.911791986250275e-06, "loss": 0.3752, "step": 421 }, { "epoch": 0.82, "grad_norm": 1.4354880820842866, "learning_rate": 6.910724140668839e-06, "loss": 0.3899, "step": 422 }, { "epoch": 0.83, "grad_norm": 1.4002882723492887, "learning_rate": 6.909649953743247e-06, "loss": 0.3128, "step": 423 }, { "epoch": 0.83, "grad_norm": 1.8336861360507277, "learning_rate": 6.908569427470668e-06, "loss": 0.3911, "step": 424 }, { "epoch": 0.83, "grad_norm": 1.641829170404737, "learning_rate": 6.907482563860056e-06, "loss": 0.3849, "step": 425 }, { "epoch": 0.83, "grad_norm": 1.5063621623440684, "learning_rate": 6.906389364932148e-06, "loss": 0.4526, "step": 426 }, { "epoch": 0.83, "grad_norm": 1.7175740305936653, "learning_rate": 6.905289832719461e-06, "loss": 0.4161, "step": 427 }, { "epoch": 0.84, "grad_norm": 1.6039585346625382, "learning_rate": 6.904183969266283e-06, "loss": 0.3987, "step": 428 }, { "epoch": 0.84, "grad_norm": 1.3903773041163252, "learning_rate": 6.90307177662868e-06, "loss": 0.366, "step": 429 }, { "epoch": 0.84, "grad_norm": 1.8650076554150041, "learning_rate": 6.901953256874478e-06, "loss": 0.4803, "step": 430 }, { "epoch": 0.84, "grad_norm": 1.390526146310638, "learning_rate": 6.900828412083273e-06, "loss": 0.347, "step": 431 }, { "epoch": 0.84, "grad_norm": 1.9346042215054817, "learning_rate": 6.899697244346414e-06, "loss": 0.4391, "step": 432 }, { "epoch": 0.85, "grad_norm": 1.5497027712387426, "learning_rate": 6.8985597557670156e-06, "loss": 0.3333, "step": 433 }, { "epoch": 0.85, "grad_norm": 1.83034042346164, "learning_rate": 6.897415948459933e-06, "loss": 0.3666, "step": 434 }, { "epoch": 0.85, "grad_norm": 1.6125355743571397, "learning_rate": 6.8962658245517785e-06, "loss": 0.3877, "step": 435 }, { "epoch": 0.85, "grad_norm": 1.5138273556693025, "learning_rate": 6.8951093861809044e-06, "loss": 0.3631, "step": 436 }, { "epoch": 0.85, "grad_norm": 1.4617888576842644, "learning_rate": 6.8939466354974015e-06, "loss": 0.3638, "step": 437 }, { "epoch": 0.86, "grad_norm": 1.8129161730682712, "learning_rate": 6.8927775746631e-06, "loss": 0.3948, "step": 438 }, { "epoch": 0.86, "grad_norm": 2.1860972865871764, "learning_rate": 6.8916022058515625e-06, "loss": 0.5044, "step": 439 }, { "epoch": 0.86, "grad_norm": 1.7157094669289017, "learning_rate": 6.890420531248076e-06, "loss": 0.4062, "step": 440 }, { "epoch": 0.86, "grad_norm": 1.7400662849448478, "learning_rate": 6.889232553049655e-06, "loss": 0.5122, "step": 441 }, { "epoch": 0.86, "grad_norm": 1.5481763912260595, "learning_rate": 6.888038273465029e-06, "loss": 0.3665, "step": 442 }, { "epoch": 0.87, "grad_norm": 1.4186507662486547, "learning_rate": 6.8868376947146514e-06, "loss": 0.342, "step": 443 }, { "epoch": 0.87, "grad_norm": 1.5507720213363796, "learning_rate": 6.885630819030679e-06, "loss": 0.4365, "step": 444 }, { "epoch": 0.87, "grad_norm": 1.6665380849371114, "learning_rate": 6.884417648656982e-06, "loss": 0.3551, "step": 445 }, { "epoch": 0.87, "grad_norm": 1.4771278837932451, "learning_rate": 6.883198185849131e-06, "loss": 0.3836, "step": 446 }, { "epoch": 0.87, "grad_norm": 1.4481939870986762, "learning_rate": 6.881972432874394e-06, "loss": 0.3272, "step": 447 }, { "epoch": 0.88, "grad_norm": 1.5516257223887142, "learning_rate": 6.880740392011738e-06, "loss": 0.3995, "step": 448 }, { "epoch": 0.88, "grad_norm": 1.6352472100838398, "learning_rate": 6.87950206555182e-06, "loss": 0.3285, "step": 449 }, { "epoch": 0.88, "grad_norm": 1.5204393129602674, "learning_rate": 6.87825745579698e-06, "loss": 0.3914, "step": 450 }, { "epoch": 0.88, "grad_norm": 1.486775387240555, "learning_rate": 6.877006565061244e-06, "loss": 0.4094, "step": 451 }, { "epoch": 0.88, "grad_norm": 1.6757833593140687, "learning_rate": 6.875749395670313e-06, "loss": 0.4033, "step": 452 }, { "epoch": 0.88, "grad_norm": 1.5775749628997324, "learning_rate": 6.874485949961563e-06, "loss": 0.3564, "step": 453 }, { "epoch": 0.89, "grad_norm": 1.525143907475638, "learning_rate": 6.87321623028404e-06, "loss": 0.3307, "step": 454 }, { "epoch": 0.89, "grad_norm": 1.5040781469005653, "learning_rate": 6.871940238998452e-06, "loss": 0.3882, "step": 455 }, { "epoch": 0.89, "grad_norm": 1.6099764272747048, "learning_rate": 6.870657978477169e-06, "loss": 0.3563, "step": 456 }, { "epoch": 0.89, "grad_norm": 1.6027362956600524, "learning_rate": 6.86936945110422e-06, "loss": 0.4318, "step": 457 }, { "epoch": 0.89, "grad_norm": 1.750493384329669, "learning_rate": 6.868074659275278e-06, "loss": 0.4042, "step": 458 }, { "epoch": 0.9, "grad_norm": 1.5160110333713162, "learning_rate": 6.866773605397672e-06, "loss": 0.4292, "step": 459 }, { "epoch": 0.9, "grad_norm": 1.4756149801832856, "learning_rate": 6.865466291890367e-06, "loss": 0.38, "step": 460 }, { "epoch": 0.9, "grad_norm": 1.7420236753462615, "learning_rate": 6.864152721183969e-06, "loss": 0.3816, "step": 461 }, { "epoch": 0.9, "grad_norm": 1.5800544419003433, "learning_rate": 6.862832895720718e-06, "loss": 0.3784, "step": 462 }, { "epoch": 0.9, "grad_norm": 1.5277796277141729, "learning_rate": 6.86150681795448e-06, "loss": 0.4352, "step": 463 }, { "epoch": 0.91, "grad_norm": 1.6648813427130578, "learning_rate": 6.860174490350751e-06, "loss": 0.3918, "step": 464 }, { "epoch": 0.91, "grad_norm": 1.4194767224927884, "learning_rate": 6.8588359153866394e-06, "loss": 0.3683, "step": 465 }, { "epoch": 0.91, "grad_norm": 1.690401595635384, "learning_rate": 6.857491095550878e-06, "loss": 0.4606, "step": 466 }, { "epoch": 0.91, "grad_norm": 1.4884409084785801, "learning_rate": 6.8561400333438015e-06, "loss": 0.4092, "step": 467 }, { "epoch": 0.91, "grad_norm": 1.815806971485844, "learning_rate": 6.854782731277357e-06, "loss": 0.4231, "step": 468 }, { "epoch": 0.92, "grad_norm": 1.5478274452173697, "learning_rate": 6.8534191918750885e-06, "loss": 0.4122, "step": 469 }, { "epoch": 0.92, "grad_norm": 1.6604334882248586, "learning_rate": 6.852049417672141e-06, "loss": 0.3782, "step": 470 }, { "epoch": 0.92, "grad_norm": 1.8148218171867665, "learning_rate": 6.850673411215248e-06, "loss": 0.4275, "step": 471 }, { "epoch": 0.92, "grad_norm": 1.663830480101605, "learning_rate": 6.849291175062731e-06, "loss": 0.383, "step": 472 }, { "epoch": 0.92, "grad_norm": 1.679011564161711, "learning_rate": 6.847902711784495e-06, "loss": 0.3606, "step": 473 }, { "epoch": 0.93, "grad_norm": 1.5104479844896042, "learning_rate": 6.846508023962023e-06, "loss": 0.3879, "step": 474 }, { "epoch": 0.93, "grad_norm": 1.7606812509393381, "learning_rate": 6.845107114188369e-06, "loss": 0.4245, "step": 475 }, { "epoch": 0.93, "grad_norm": 1.4174043265741767, "learning_rate": 6.843699985068156e-06, "loss": 0.3386, "step": 476 }, { "epoch": 0.93, "grad_norm": 1.647379606889349, "learning_rate": 6.842286639217572e-06, "loss": 0.4246, "step": 477 }, { "epoch": 0.93, "grad_norm": 1.682689795975946, "learning_rate": 6.8408670792643595e-06, "loss": 0.444, "step": 478 }, { "epoch": 0.94, "grad_norm": 1.5512620708295217, "learning_rate": 6.839441307847818e-06, "loss": 0.3948, "step": 479 }, { "epoch": 0.94, "grad_norm": 1.2828243170194242, "learning_rate": 6.838009327618794e-06, "loss": 0.3365, "step": 480 }, { "epoch": 0.94, "grad_norm": 1.621479074600859, "learning_rate": 6.836571141239678e-06, "loss": 0.4244, "step": 481 }, { "epoch": 0.94, "grad_norm": 1.3283898421782954, "learning_rate": 6.8351267513844e-06, "loss": 0.3673, "step": 482 }, { "epoch": 0.94, "grad_norm": 1.528894641679477, "learning_rate": 6.8336761607384215e-06, "loss": 0.3842, "step": 483 }, { "epoch": 0.95, "grad_norm": 1.4934720957876582, "learning_rate": 6.8322193719987345e-06, "loss": 0.3812, "step": 484 }, { "epoch": 0.95, "grad_norm": 1.8438975139194977, "learning_rate": 6.830756387873856e-06, "loss": 0.4071, "step": 485 }, { "epoch": 0.95, "grad_norm": 1.7426105515856984, "learning_rate": 6.829287211083817e-06, "loss": 0.4284, "step": 486 }, { "epoch": 0.95, "grad_norm": 1.4255855821956083, "learning_rate": 6.827811844360168e-06, "loss": 0.3899, "step": 487 }, { "epoch": 0.95, "grad_norm": 1.4401645684101536, "learning_rate": 6.8263302904459634e-06, "loss": 0.3665, "step": 488 }, { "epoch": 0.96, "grad_norm": 1.9399174005640054, "learning_rate": 6.824842552095764e-06, "loss": 0.4115, "step": 489 }, { "epoch": 0.96, "grad_norm": 1.3808231009410363, "learning_rate": 6.823348632075628e-06, "loss": 0.328, "step": 490 }, { "epoch": 0.96, "grad_norm": 1.3937106650978832, "learning_rate": 6.821848533163106e-06, "loss": 0.3618, "step": 491 }, { "epoch": 0.96, "grad_norm": 1.5368142231486275, "learning_rate": 6.820342258147237e-06, "loss": 0.3349, "step": 492 }, { "epoch": 0.96, "grad_norm": 1.752408016955722, "learning_rate": 6.818829809828544e-06, "loss": 0.365, "step": 493 }, { "epoch": 0.96, "grad_norm": 1.3954724910366554, "learning_rate": 6.817311191019026e-06, "loss": 0.3519, "step": 494 }, { "epoch": 0.97, "grad_norm": 1.3490089296722834, "learning_rate": 6.815786404542154e-06, "loss": 0.3717, "step": 495 }, { "epoch": 0.97, "grad_norm": 1.6705210112411417, "learning_rate": 6.81425545323287e-06, "loss": 0.4334, "step": 496 }, { "epoch": 0.97, "grad_norm": 1.766079244546002, "learning_rate": 6.812718339937573e-06, "loss": 0.4257, "step": 497 }, { "epoch": 0.97, "grad_norm": 1.8332667577570418, "learning_rate": 6.8111750675141215e-06, "loss": 0.4486, "step": 498 }, { "epoch": 0.97, "grad_norm": 1.5015491746655512, "learning_rate": 6.8096256388318245e-06, "loss": 0.3933, "step": 499 }, { "epoch": 0.98, "grad_norm": 1.5807507888737526, "learning_rate": 6.808070056771437e-06, "loss": 0.4177, "step": 500 }, { "epoch": 0.98, "grad_norm": 1.797889829242378, "learning_rate": 6.806508324225154e-06, "loss": 0.4877, "step": 501 }, { "epoch": 0.98, "grad_norm": 1.6272056942386348, "learning_rate": 6.804940444096608e-06, "loss": 0.3744, "step": 502 }, { "epoch": 0.98, "grad_norm": 1.444866310535126, "learning_rate": 6.803366419300858e-06, "loss": 0.3684, "step": 503 }, { "epoch": 0.98, "grad_norm": 1.463885170631707, "learning_rate": 6.801786252764388e-06, "loss": 0.4322, "step": 504 }, { "epoch": 0.99, "grad_norm": 1.742212864676786, "learning_rate": 6.8001999474251034e-06, "loss": 0.4398, "step": 505 }, { "epoch": 0.99, "grad_norm": 1.345794459183557, "learning_rate": 6.798607506232324e-06, "loss": 0.3987, "step": 506 }, { "epoch": 0.99, "grad_norm": 1.7029490253653985, "learning_rate": 6.797008932146771e-06, "loss": 0.3978, "step": 507 }, { "epoch": 0.99, "grad_norm": 1.6183974806707546, "learning_rate": 6.795404228140573e-06, "loss": 0.4079, "step": 508 }, { "epoch": 0.99, "grad_norm": 1.3090301926085908, "learning_rate": 6.793793397197257e-06, "loss": 0.3349, "step": 509 }, { "epoch": 1.0, "grad_norm": 1.484710294055213, "learning_rate": 6.792176442311738e-06, "loss": 0.3788, "step": 510 }, { "epoch": 1.0, "grad_norm": 1.882648521289744, "learning_rate": 6.790553366490317e-06, "loss": 0.4796, "step": 511 }, { "epoch": 1.0, "grad_norm": 1.4059515612271645, "learning_rate": 6.788924172750679e-06, "loss": 0.3518, "step": 512 }, { "epoch": 1.0, "grad_norm": 1.596847686697904, "learning_rate": 6.78728886412188e-06, "loss": 0.4009, "step": 513 }, { "epoch": 1.0, "grad_norm": 1.7328140010738338, "learning_rate": 6.785647443644346e-06, "loss": 0.3237, "step": 514 }, { "epoch": 1.01, "grad_norm": 1.49821514595923, "learning_rate": 6.783999914369867e-06, "loss": 0.3914, "step": 515 }, { "epoch": 1.01, "grad_norm": 1.378052236596182, "learning_rate": 6.782346279361589e-06, "loss": 0.3551, "step": 516 }, { "epoch": 1.01, "grad_norm": 1.7985823740676978, "learning_rate": 6.7806865416940126e-06, "loss": 0.4018, "step": 517 }, { "epoch": 1.01, "grad_norm": 1.7174601884545342, "learning_rate": 6.779020704452983e-06, "loss": 0.3768, "step": 518 }, { "epoch": 1.01, "grad_norm": 1.672618734354619, "learning_rate": 6.7773487707356845e-06, "loss": 0.4364, "step": 519 }, { "epoch": 1.02, "grad_norm": 1.63942268514506, "learning_rate": 6.77567074365064e-06, "loss": 0.4137, "step": 520 }, { "epoch": 1.02, "grad_norm": 1.5713598558411601, "learning_rate": 6.773986626317699e-06, "loss": 0.3569, "step": 521 }, { "epoch": 1.02, "grad_norm": 1.528706214266134, "learning_rate": 6.772296421868033e-06, "loss": 0.3914, "step": 522 }, { "epoch": 1.02, "grad_norm": 1.3568389232208868, "learning_rate": 6.770600133444136e-06, "loss": 0.3444, "step": 523 }, { "epoch": 1.02, "grad_norm": 1.4531924046430054, "learning_rate": 6.768897764199808e-06, "loss": 0.3611, "step": 524 }, { "epoch": 1.03, "grad_norm": 1.360951038718442, "learning_rate": 6.767189317300154e-06, "loss": 0.3631, "step": 525 }, { "epoch": 1.03, "grad_norm": 1.7388883597244693, "learning_rate": 6.765474795921586e-06, "loss": 0.4296, "step": 526 }, { "epoch": 1.03, "grad_norm": 1.4413126482654373, "learning_rate": 6.763754203251803e-06, "loss": 0.3742, "step": 527 }, { "epoch": 1.03, "grad_norm": 1.5422232280449133, "learning_rate": 6.762027542489795e-06, "loss": 0.3457, "step": 528 }, { "epoch": 1.03, "grad_norm": 1.5085338037744558, "learning_rate": 6.760294816845832e-06, "loss": 0.3569, "step": 529 }, { "epoch": 1.04, "grad_norm": 1.480497071984457, "learning_rate": 6.7585560295414646e-06, "loss": 0.3081, "step": 530 }, { "epoch": 1.04, "grad_norm": 1.8097874374642493, "learning_rate": 6.756811183809507e-06, "loss": 0.3973, "step": 531 }, { "epoch": 1.04, "grad_norm": 1.5260453665929192, "learning_rate": 6.755060282894042e-06, "loss": 0.3768, "step": 532 }, { "epoch": 1.04, "grad_norm": 1.3832800273475556, "learning_rate": 6.75330333005041e-06, "loss": 0.3403, "step": 533 }, { "epoch": 1.04, "grad_norm": 1.6314857188404743, "learning_rate": 6.7515403285452015e-06, "loss": 0.3047, "step": 534 }, { "epoch": 1.04, "grad_norm": 1.707692109840766, "learning_rate": 6.7497712816562545e-06, "loss": 0.3239, "step": 535 }, { "epoch": 1.05, "grad_norm": 1.4328670173823705, "learning_rate": 6.747996192672646e-06, "loss": 0.3361, "step": 536 }, { "epoch": 1.05, "grad_norm": 1.5054148304239428, "learning_rate": 6.7462150648946865e-06, "loss": 0.4148, "step": 537 }, { "epoch": 1.05, "grad_norm": 1.584276827034149, "learning_rate": 6.744427901633915e-06, "loss": 0.3491, "step": 538 }, { "epoch": 1.05, "grad_norm": 1.404072978015599, "learning_rate": 6.742634706213091e-06, "loss": 0.3137, "step": 539 }, { "epoch": 1.05, "grad_norm": 1.6030336019222375, "learning_rate": 6.740835481966191e-06, "loss": 0.391, "step": 540 }, { "epoch": 1.06, "grad_norm": 1.5634020440151852, "learning_rate": 6.739030232238398e-06, "loss": 0.3638, "step": 541 }, { "epoch": 1.06, "grad_norm": 1.4900242742133327, "learning_rate": 6.737218960386098e-06, "loss": 0.4003, "step": 542 }, { "epoch": 1.06, "grad_norm": 1.4060765194698752, "learning_rate": 6.735401669776875e-06, "loss": 0.3413, "step": 543 }, { "epoch": 1.06, "grad_norm": 1.4347137326251413, "learning_rate": 6.733578363789503e-06, "loss": 0.3699, "step": 544 }, { "epoch": 1.06, "grad_norm": 1.5483656965291546, "learning_rate": 6.73174904581394e-06, "loss": 0.312, "step": 545 }, { "epoch": 1.07, "grad_norm": 1.3316083106758043, "learning_rate": 6.729913719251323e-06, "loss": 0.2939, "step": 546 }, { "epoch": 1.07, "grad_norm": 1.5539008705099495, "learning_rate": 6.728072387513955e-06, "loss": 0.3529, "step": 547 }, { "epoch": 1.07, "grad_norm": 1.3864373410590376, "learning_rate": 6.726225054025311e-06, "loss": 0.3396, "step": 548 }, { "epoch": 1.07, "grad_norm": 1.701827177069072, "learning_rate": 6.724371722220021e-06, "loss": 0.409, "step": 549 }, { "epoch": 1.07, "grad_norm": 1.4726955948356502, "learning_rate": 6.722512395543867e-06, "loss": 0.3409, "step": 550 }, { "epoch": 1.08, "grad_norm": 1.6267260156524637, "learning_rate": 6.720647077453778e-06, "loss": 0.402, "step": 551 }, { "epoch": 1.08, "grad_norm": 1.6078555708532256, "learning_rate": 6.718775771417823e-06, "loss": 0.308, "step": 552 }, { "epoch": 1.08, "grad_norm": 1.5242327715151553, "learning_rate": 6.716898480915203e-06, "loss": 0.3562, "step": 553 }, { "epoch": 1.08, "grad_norm": 1.4259514231088786, "learning_rate": 6.715015209436244e-06, "loss": 0.3322, "step": 554 }, { "epoch": 1.08, "grad_norm": 1.4784343334289873, "learning_rate": 6.713125960482396e-06, "loss": 0.3446, "step": 555 }, { "epoch": 1.09, "grad_norm": 1.4074491400571254, "learning_rate": 6.711230737566219e-06, "loss": 0.3115, "step": 556 }, { "epoch": 1.09, "grad_norm": 1.3707828403408049, "learning_rate": 6.709329544211383e-06, "loss": 0.328, "step": 557 }, { "epoch": 1.09, "grad_norm": 1.6985496244804195, "learning_rate": 6.707422383952656e-06, "loss": 0.3909, "step": 558 }, { "epoch": 1.09, "grad_norm": 1.6496673847156236, "learning_rate": 6.7055092603359e-06, "loss": 0.3201, "step": 559 }, { "epoch": 1.09, "grad_norm": 1.7668419365295218, "learning_rate": 6.7035901769180656e-06, "loss": 0.3796, "step": 560 }, { "epoch": 1.1, "grad_norm": 1.381661930297508, "learning_rate": 6.701665137267182e-06, "loss": 0.3108, "step": 561 }, { "epoch": 1.1, "grad_norm": 1.432723654605713, "learning_rate": 6.699734144962357e-06, "loss": 0.3651, "step": 562 }, { "epoch": 1.1, "grad_norm": 1.594002963127838, "learning_rate": 6.6977972035937605e-06, "loss": 0.3621, "step": 563 }, { "epoch": 1.1, "grad_norm": 1.5211814427857404, "learning_rate": 6.6958543167626265e-06, "loss": 0.3408, "step": 564 }, { "epoch": 1.1, "grad_norm": 1.4327188727465234, "learning_rate": 6.6939054880812415e-06, "loss": 0.3336, "step": 565 }, { "epoch": 1.11, "grad_norm": 1.5664113637617196, "learning_rate": 6.6919507211729395e-06, "loss": 0.3334, "step": 566 }, { "epoch": 1.11, "grad_norm": 1.5923031254263549, "learning_rate": 6.689990019672093e-06, "loss": 0.3398, "step": 567 }, { "epoch": 1.11, "grad_norm": 1.6033202323573337, "learning_rate": 6.688023387224115e-06, "loss": 0.3407, "step": 568 }, { "epoch": 1.11, "grad_norm": 1.6181275192833218, "learning_rate": 6.686050827485439e-06, "loss": 0.3263, "step": 569 }, { "epoch": 1.11, "grad_norm": 1.5939241868695309, "learning_rate": 6.68407234412352e-06, "loss": 0.3512, "step": 570 }, { "epoch": 1.12, "grad_norm": 1.3669404813877861, "learning_rate": 6.682087940816828e-06, "loss": 0.2914, "step": 571 }, { "epoch": 1.12, "grad_norm": 1.6022977575232307, "learning_rate": 6.6800976212548396e-06, "loss": 0.3386, "step": 572 }, { "epoch": 1.12, "grad_norm": 1.6495609957743245, "learning_rate": 6.678101389138029e-06, "loss": 0.3625, "step": 573 }, { "epoch": 1.12, "grad_norm": 1.6594548115380698, "learning_rate": 6.676099248177865e-06, "loss": 0.3575, "step": 574 }, { "epoch": 1.12, "grad_norm": 1.308209793025058, "learning_rate": 6.6740912020968026e-06, "loss": 0.2836, "step": 575 }, { "epoch": 1.12, "grad_norm": 1.6639836317139411, "learning_rate": 6.672077254628275e-06, "loss": 0.3446, "step": 576 }, { "epoch": 1.13, "grad_norm": 1.6286948310425742, "learning_rate": 6.6700574095166866e-06, "loss": 0.3428, "step": 577 }, { "epoch": 1.13, "grad_norm": 1.5426464672724065, "learning_rate": 6.6680316705174095e-06, "loss": 0.3134, "step": 578 }, { "epoch": 1.13, "grad_norm": 1.637235909303668, "learning_rate": 6.666000041396771e-06, "loss": 0.3648, "step": 579 }, { "epoch": 1.13, "grad_norm": 1.3864368083120655, "learning_rate": 6.663962525932052e-06, "loss": 0.3041, "step": 580 }, { "epoch": 1.13, "grad_norm": 1.717483776073637, "learning_rate": 6.6619191279114745e-06, "loss": 0.3645, "step": 581 }, { "epoch": 1.14, "grad_norm": 1.618652156874343, "learning_rate": 6.659869851134203e-06, "loss": 0.3376, "step": 582 }, { "epoch": 1.14, "grad_norm": 1.5057970254007205, "learning_rate": 6.657814699410325e-06, "loss": 0.3173, "step": 583 }, { "epoch": 1.14, "grad_norm": 1.6274525530285684, "learning_rate": 6.655753676560856e-06, "loss": 0.306, "step": 584 }, { "epoch": 1.14, "grad_norm": 1.5113395928647968, "learning_rate": 6.653686786417726e-06, "loss": 0.3317, "step": 585 }, { "epoch": 1.14, "grad_norm": 1.5661237737040183, "learning_rate": 6.651614032823773e-06, "loss": 0.3426, "step": 586 }, { "epoch": 1.15, "grad_norm": 1.836695534280435, "learning_rate": 6.649535419632736e-06, "loss": 0.3434, "step": 587 }, { "epoch": 1.15, "grad_norm": 1.6259727781174806, "learning_rate": 6.647450950709251e-06, "loss": 0.3042, "step": 588 }, { "epoch": 1.15, "grad_norm": 1.3178477685849916, "learning_rate": 6.645360629928838e-06, "loss": 0.2923, "step": 589 }, { "epoch": 1.15, "grad_norm": 1.452203108959364, "learning_rate": 6.6432644611779e-06, "loss": 0.329, "step": 590 }, { "epoch": 1.15, "grad_norm": 1.3601381782561088, "learning_rate": 6.641162448353711e-06, "loss": 0.2556, "step": 591 }, { "epoch": 1.16, "grad_norm": 1.4621674914205687, "learning_rate": 6.639054595364409e-06, "loss": 0.3549, "step": 592 }, { "epoch": 1.16, "grad_norm": 1.5962610875733478, "learning_rate": 6.6369409061289945e-06, "loss": 0.3215, "step": 593 }, { "epoch": 1.16, "grad_norm": 1.444544072688777, "learning_rate": 6.634821384577314e-06, "loss": 0.3189, "step": 594 }, { "epoch": 1.16, "grad_norm": 1.4552356519224634, "learning_rate": 6.632696034650063e-06, "loss": 0.3337, "step": 595 }, { "epoch": 1.16, "grad_norm": 1.395724498277965, "learning_rate": 6.630564860298768e-06, "loss": 0.3085, "step": 596 }, { "epoch": 1.17, "grad_norm": 1.7529372917232449, "learning_rate": 6.628427865485789e-06, "loss": 0.3109, "step": 597 }, { "epoch": 1.17, "grad_norm": 1.6991722191193963, "learning_rate": 6.626285054184303e-06, "loss": 0.3683, "step": 598 }, { "epoch": 1.17, "grad_norm": 1.5270014141769412, "learning_rate": 6.624136430378307e-06, "loss": 0.3127, "step": 599 }, { "epoch": 1.17, "grad_norm": 1.65412875713091, "learning_rate": 6.6219819980625995e-06, "loss": 0.3196, "step": 600 }, { "epoch": 1.17, "grad_norm": 1.5006342001271835, "learning_rate": 6.619821761242781e-06, "loss": 0.3142, "step": 601 }, { "epoch": 1.18, "grad_norm": 1.5725573832121778, "learning_rate": 6.617655723935244e-06, "loss": 0.3467, "step": 602 }, { "epoch": 1.18, "grad_norm": 1.4453316880592126, "learning_rate": 6.615483890167164e-06, "loss": 0.2207, "step": 603 }, { "epoch": 1.18, "grad_norm": 1.4751981897961317, "learning_rate": 6.613306263976496e-06, "loss": 0.308, "step": 604 }, { "epoch": 1.18, "grad_norm": 1.5363064886830635, "learning_rate": 6.6111228494119616e-06, "loss": 0.3116, "step": 605 }, { "epoch": 1.18, "grad_norm": 1.38339586398149, "learning_rate": 6.6089336505330466e-06, "loss": 0.3063, "step": 606 }, { "epoch": 1.19, "grad_norm": 1.564918434622021, "learning_rate": 6.606738671409989e-06, "loss": 0.3423, "step": 607 }, { "epoch": 1.19, "grad_norm": 1.604080350591797, "learning_rate": 6.604537916123775e-06, "loss": 0.3504, "step": 608 }, { "epoch": 1.19, "grad_norm": 1.7130448170843353, "learning_rate": 6.602331388766133e-06, "loss": 0.2876, "step": 609 }, { "epoch": 1.19, "grad_norm": 1.6532701102754153, "learning_rate": 6.600119093439517e-06, "loss": 0.2854, "step": 610 }, { "epoch": 1.19, "grad_norm": 1.5260035998024783, "learning_rate": 6.5979010342571085e-06, "loss": 0.3069, "step": 611 }, { "epoch": 1.2, "grad_norm": 1.4639553371440497, "learning_rate": 6.595677215342806e-06, "loss": 0.2731, "step": 612 }, { "epoch": 1.2, "grad_norm": 1.3381626729995708, "learning_rate": 6.593447640831215e-06, "loss": 0.2783, "step": 613 }, { "epoch": 1.2, "grad_norm": 1.6167768895928034, "learning_rate": 6.591212314867643e-06, "loss": 0.2608, "step": 614 }, { "epoch": 1.2, "grad_norm": 1.5521703196320586, "learning_rate": 6.58897124160809e-06, "loss": 0.3282, "step": 615 }, { "epoch": 1.2, "grad_norm": 1.6198540990344164, "learning_rate": 6.5867244252192426e-06, "loss": 0.339, "step": 616 }, { "epoch": 1.21, "grad_norm": 1.44605580941395, "learning_rate": 6.584471869878464e-06, "loss": 0.2884, "step": 617 }, { "epoch": 1.21, "grad_norm": 1.6496069636580764, "learning_rate": 6.58221357977379e-06, "loss": 0.3643, "step": 618 }, { "epoch": 1.21, "grad_norm": 1.5830592917381443, "learning_rate": 6.579949559103914e-06, "loss": 0.2985, "step": 619 }, { "epoch": 1.21, "grad_norm": 1.5957531787456913, "learning_rate": 6.577679812078189e-06, "loss": 0.263, "step": 620 }, { "epoch": 1.21, "grad_norm": 1.3712035168694994, "learning_rate": 6.575404342916612e-06, "loss": 0.2687, "step": 621 }, { "epoch": 1.21, "grad_norm": 1.535652780607988, "learning_rate": 6.573123155849819e-06, "loss": 0.2801, "step": 622 }, { "epoch": 1.22, "grad_norm": 1.5424644753412209, "learning_rate": 6.570836255119078e-06, "loss": 0.3096, "step": 623 }, { "epoch": 1.22, "grad_norm": 1.4318858516855941, "learning_rate": 6.568543644976277e-06, "loss": 0.3217, "step": 624 }, { "epoch": 1.22, "grad_norm": 1.5579496441712164, "learning_rate": 6.566245329683923e-06, "loss": 0.2727, "step": 625 }, { "epoch": 1.22, "grad_norm": 1.369793456852105, "learning_rate": 6.563941313515128e-06, "loss": 0.2162, "step": 626 }, { "epoch": 1.22, "grad_norm": 1.4751691318680786, "learning_rate": 6.5616316007536055e-06, "loss": 0.2686, "step": 627 }, { "epoch": 1.23, "grad_norm": 1.4024895138088038, "learning_rate": 6.559316195693656e-06, "loss": 0.2774, "step": 628 }, { "epoch": 1.23, "grad_norm": 1.6085960250192906, "learning_rate": 6.556995102640168e-06, "loss": 0.2375, "step": 629 }, { "epoch": 1.23, "grad_norm": 1.3074395021200447, "learning_rate": 6.5546683259086015e-06, "loss": 0.2504, "step": 630 }, { "epoch": 1.23, "grad_norm": 1.6778357785029727, "learning_rate": 6.552335869824988e-06, "loss": 0.3797, "step": 631 }, { "epoch": 1.23, "grad_norm": 1.3408838760884152, "learning_rate": 6.549997738725915e-06, "loss": 0.2715, "step": 632 }, { "epoch": 1.24, "grad_norm": 1.4586599206231676, "learning_rate": 6.547653936958522e-06, "loss": 0.2566, "step": 633 }, { "epoch": 1.24, "grad_norm": 1.4065283904881503, "learning_rate": 6.54530446888049e-06, "loss": 0.2884, "step": 634 }, { "epoch": 1.24, "grad_norm": 1.4444187529856813, "learning_rate": 6.542949338860039e-06, "loss": 0.2515, "step": 635 }, { "epoch": 1.24, "grad_norm": 1.4522579552334676, "learning_rate": 6.540588551275913e-06, "loss": 0.2386, "step": 636 }, { "epoch": 1.24, "grad_norm": 1.523144908608106, "learning_rate": 6.538222110517375e-06, "loss": 0.2931, "step": 637 }, { "epoch": 1.25, "grad_norm": 1.4863004458662323, "learning_rate": 6.5358500209842005e-06, "loss": 0.2789, "step": 638 }, { "epoch": 1.25, "grad_norm": 1.3828377714306712, "learning_rate": 6.533472287086663e-06, "loss": 0.2532, "step": 639 }, { "epoch": 1.25, "grad_norm": 1.5524809047985966, "learning_rate": 6.531088913245536e-06, "loss": 0.2781, "step": 640 }, { "epoch": 1.25, "grad_norm": 1.7184543746534546, "learning_rate": 6.528699903892073e-06, "loss": 0.3558, "step": 641 }, { "epoch": 1.25, "grad_norm": 1.3627015199988983, "learning_rate": 6.526305263468012e-06, "loss": 0.266, "step": 642 }, { "epoch": 1.26, "grad_norm": 1.6039389811647342, "learning_rate": 6.523904996425554e-06, "loss": 0.2442, "step": 643 }, { "epoch": 1.26, "grad_norm": 1.5629681026835025, "learning_rate": 6.5214991072273635e-06, "loss": 0.2438, "step": 644 }, { "epoch": 1.26, "grad_norm": 1.4635124660594963, "learning_rate": 6.5190876003465626e-06, "loss": 0.2403, "step": 645 }, { "epoch": 1.26, "grad_norm": 1.6302836580522437, "learning_rate": 6.516670480266711e-06, "loss": 0.257, "step": 646 }, { "epoch": 1.26, "grad_norm": 1.6952462361249983, "learning_rate": 6.514247751481805e-06, "loss": 0.2804, "step": 647 }, { "epoch": 1.27, "grad_norm": 1.7281136666012025, "learning_rate": 6.511819418496276e-06, "loss": 0.2745, "step": 648 }, { "epoch": 1.27, "grad_norm": 1.704926687993403, "learning_rate": 6.509385485824968e-06, "loss": 0.2823, "step": 649 }, { "epoch": 1.27, "grad_norm": 1.4575652185207486, "learning_rate": 6.506945957993139e-06, "loss": 0.2676, "step": 650 }, { "epoch": 1.27, "grad_norm": 1.6305006456791387, "learning_rate": 6.504500839536449e-06, "loss": 0.298, "step": 651 }, { "epoch": 1.27, "grad_norm": 1.4901799332298455, "learning_rate": 6.502050135000952e-06, "loss": 0.2794, "step": 652 }, { "epoch": 1.28, "grad_norm": 1.7987016912271911, "learning_rate": 6.499593848943089e-06, "loss": 0.3, "step": 653 }, { "epoch": 1.28, "grad_norm": 1.77531469277802, "learning_rate": 6.4971319859296766e-06, "loss": 0.2692, "step": 654 }, { "epoch": 1.28, "grad_norm": 1.5772927923618247, "learning_rate": 6.494664550537902e-06, "loss": 0.2582, "step": 655 }, { "epoch": 1.28, "grad_norm": 1.569475531115483, "learning_rate": 6.492191547355313e-06, "loss": 0.2952, "step": 656 }, { "epoch": 1.28, "grad_norm": 1.7240539456935176, "learning_rate": 6.489712980979807e-06, "loss": 0.325, "step": 657 }, { "epoch": 1.29, "grad_norm": 1.6016863045490726, "learning_rate": 6.4872288560196266e-06, "loss": 0.2353, "step": 658 }, { "epoch": 1.29, "grad_norm": 1.7721175642237323, "learning_rate": 6.484739177093348e-06, "loss": 0.2355, "step": 659 }, { "epoch": 1.29, "grad_norm": 1.6692087857617322, "learning_rate": 6.482243948829876e-06, "loss": 0.2916, "step": 660 }, { "epoch": 1.29, "grad_norm": 1.5905845078823142, "learning_rate": 6.479743175868428e-06, "loss": 0.3013, "step": 661 }, { "epoch": 1.29, "grad_norm": 1.586766632109702, "learning_rate": 6.477236862858536e-06, "loss": 0.2816, "step": 662 }, { "epoch": 1.29, "grad_norm": 1.6445651356221325, "learning_rate": 6.474725014460028e-06, "loss": 0.2889, "step": 663 }, { "epoch": 1.3, "grad_norm": 1.5991291802175065, "learning_rate": 6.472207635343026e-06, "loss": 0.2212, "step": 664 }, { "epoch": 1.3, "grad_norm": 1.8917724662787687, "learning_rate": 6.469684730187934e-06, "loss": 0.2493, "step": 665 }, { "epoch": 1.3, "grad_norm": 1.6006514737080917, "learning_rate": 6.467156303685431e-06, "loss": 0.2922, "step": 666 }, { "epoch": 1.3, "grad_norm": 1.4847893881194167, "learning_rate": 6.4646223605364595e-06, "loss": 0.2747, "step": 667 }, { "epoch": 1.3, "grad_norm": 1.5625219412180447, "learning_rate": 6.46208290545222e-06, "loss": 0.2243, "step": 668 }, { "epoch": 1.31, "grad_norm": 1.9385116622641985, "learning_rate": 6.459537943154163e-06, "loss": 0.2609, "step": 669 }, { "epoch": 1.31, "grad_norm": 1.5777393822298202, "learning_rate": 6.456987478373975e-06, "loss": 0.3074, "step": 670 }, { "epoch": 1.31, "grad_norm": 1.6430563974928607, "learning_rate": 6.454431515853573e-06, "loss": 0.2886, "step": 671 }, { "epoch": 1.31, "grad_norm": 1.523774306076172, "learning_rate": 6.4518700603451e-06, "loss": 0.2527, "step": 672 }, { "epoch": 1.31, "grad_norm": 1.6081984141571752, "learning_rate": 6.449303116610906e-06, "loss": 0.2632, "step": 673 }, { "epoch": 1.32, "grad_norm": 1.5474124685873085, "learning_rate": 6.446730689423548e-06, "loss": 0.266, "step": 674 }, { "epoch": 1.32, "grad_norm": 1.4764470722431944, "learning_rate": 6.444152783565778e-06, "loss": 0.2222, "step": 675 }, { "epoch": 1.32, "grad_norm": 1.6123598952825695, "learning_rate": 6.441569403830533e-06, "loss": 0.2621, "step": 676 }, { "epoch": 1.32, "grad_norm": 1.627023528388627, "learning_rate": 6.438980555020928e-06, "loss": 0.2602, "step": 677 }, { "epoch": 1.32, "grad_norm": 1.650144182810641, "learning_rate": 6.436386241950248e-06, "loss": 0.306, "step": 678 }, { "epoch": 1.33, "grad_norm": 1.782240549047988, "learning_rate": 6.433786469441933e-06, "loss": 0.2528, "step": 679 }, { "epoch": 1.33, "grad_norm": 1.8117222670762192, "learning_rate": 6.431181242329578e-06, "loss": 0.2308, "step": 680 }, { "epoch": 1.33, "grad_norm": 1.4815742533573486, "learning_rate": 6.428570565456915e-06, "loss": 0.2606, "step": 681 }, { "epoch": 1.33, "grad_norm": 1.49523546013908, "learning_rate": 6.4259544436778135e-06, "loss": 0.2959, "step": 682 }, { "epoch": 1.33, "grad_norm": 1.5772670010903462, "learning_rate": 6.423332881856262e-06, "loss": 0.2263, "step": 683 }, { "epoch": 1.34, "grad_norm": 1.6440080491743119, "learning_rate": 6.420705884866365e-06, "loss": 0.2488, "step": 684 }, { "epoch": 1.34, "grad_norm": 1.6490212292294542, "learning_rate": 6.418073457592333e-06, "loss": 0.2571, "step": 685 }, { "epoch": 1.34, "grad_norm": 1.6199639288679781, "learning_rate": 6.415435604928471e-06, "loss": 0.2449, "step": 686 }, { "epoch": 1.34, "grad_norm": 1.5912574040265144, "learning_rate": 6.412792331779172e-06, "loss": 0.238, "step": 687 }, { "epoch": 1.34, "grad_norm": 1.6761807366552248, "learning_rate": 6.4101436430589085e-06, "loss": 0.28, "step": 688 }, { "epoch": 1.35, "grad_norm": 1.5803376499183392, "learning_rate": 6.407489543692218e-06, "loss": 0.2092, "step": 689 }, { "epoch": 1.35, "grad_norm": 1.7114133332347166, "learning_rate": 6.4048300386137025e-06, "loss": 0.2771, "step": 690 }, { "epoch": 1.35, "grad_norm": 1.5603992464111838, "learning_rate": 6.4021651327680095e-06, "loss": 0.221, "step": 691 }, { "epoch": 1.35, "grad_norm": 1.6703241290610231, "learning_rate": 6.399494831109832e-06, "loss": 0.2442, "step": 692 }, { "epoch": 1.35, "grad_norm": 1.5752776142514031, "learning_rate": 6.396819138603892e-06, "loss": 0.2572, "step": 693 }, { "epoch": 1.36, "grad_norm": 1.7797840596297922, "learning_rate": 6.394138060224937e-06, "loss": 0.2781, "step": 694 }, { "epoch": 1.36, "grad_norm": 1.446581585481034, "learning_rate": 6.391451600957725e-06, "loss": 0.2888, "step": 695 }, { "epoch": 1.36, "grad_norm": 1.4894533816580797, "learning_rate": 6.3887597657970235e-06, "loss": 0.2457, "step": 696 }, { "epoch": 1.36, "grad_norm": 1.573447662075768, "learning_rate": 6.386062559747589e-06, "loss": 0.2602, "step": 697 }, { "epoch": 1.36, "grad_norm": 1.6883817632166387, "learning_rate": 6.383359987824167e-06, "loss": 0.2362, "step": 698 }, { "epoch": 1.37, "grad_norm": 1.5113470148941421, "learning_rate": 6.380652055051478e-06, "loss": 0.2361, "step": 699 }, { "epoch": 1.37, "grad_norm": 1.4451985850032678, "learning_rate": 6.377938766464212e-06, "loss": 0.2338, "step": 700 }, { "epoch": 1.37, "grad_norm": 1.7114444747636215, "learning_rate": 6.375220127107016e-06, "loss": 0.2378, "step": 701 }, { "epoch": 1.37, "grad_norm": 1.6631250128655835, "learning_rate": 6.372496142034483e-06, "loss": 0.238, "step": 702 }, { "epoch": 1.37, "grad_norm": 1.6423572601728769, "learning_rate": 6.369766816311148e-06, "loss": 0.2506, "step": 703 }, { "epoch": 1.38, "grad_norm": 1.6864921124410874, "learning_rate": 6.367032155011471e-06, "loss": 0.2352, "step": 704 }, { "epoch": 1.38, "grad_norm": 1.590983494485953, "learning_rate": 6.364292163219839e-06, "loss": 0.2558, "step": 705 }, { "epoch": 1.38, "grad_norm": 1.604378972269676, "learning_rate": 6.361546846030543e-06, "loss": 0.2251, "step": 706 }, { "epoch": 1.38, "grad_norm": 1.5441312368761344, "learning_rate": 6.358796208547779e-06, "loss": 0.2394, "step": 707 }, { "epoch": 1.38, "grad_norm": 1.7180148294442463, "learning_rate": 6.3560402558856354e-06, "loss": 0.2612, "step": 708 }, { "epoch": 1.38, "grad_norm": 1.5664441865119705, "learning_rate": 6.353278993168078e-06, "loss": 0.2223, "step": 709 }, { "epoch": 1.39, "grad_norm": 1.6085534245023454, "learning_rate": 6.350512425528949e-06, "loss": 0.2814, "step": 710 }, { "epoch": 1.39, "grad_norm": 1.4491187436118929, "learning_rate": 6.347740558111955e-06, "loss": 0.2014, "step": 711 }, { "epoch": 1.39, "grad_norm": 1.4930036689867838, "learning_rate": 6.3449633960706536e-06, "loss": 0.2506, "step": 712 }, { "epoch": 1.39, "grad_norm": 1.5382383066615783, "learning_rate": 6.342180944568445e-06, "loss": 0.1809, "step": 713 }, { "epoch": 1.39, "grad_norm": 1.6471137335815311, "learning_rate": 6.339393208778568e-06, "loss": 0.252, "step": 714 }, { "epoch": 1.4, "grad_norm": 1.369494845016825, "learning_rate": 6.336600193884082e-06, "loss": 0.2141, "step": 715 }, { "epoch": 1.4, "grad_norm": 1.4596516987357409, "learning_rate": 6.333801905077864e-06, "loss": 0.2602, "step": 716 }, { "epoch": 1.4, "grad_norm": 1.5817242376804705, "learning_rate": 6.330998347562596e-06, "loss": 0.2176, "step": 717 }, { "epoch": 1.4, "grad_norm": 1.6078447402915181, "learning_rate": 6.328189526550756e-06, "loss": 0.2791, "step": 718 }, { "epoch": 1.4, "grad_norm": 1.506888145662639, "learning_rate": 6.325375447264607e-06, "loss": 0.2437, "step": 719 }, { "epoch": 1.41, "grad_norm": 1.6211742646909224, "learning_rate": 6.322556114936189e-06, "loss": 0.2463, "step": 720 }, { "epoch": 1.41, "grad_norm": 1.4290821919705445, "learning_rate": 6.319731534807309e-06, "loss": 0.2262, "step": 721 }, { "epoch": 1.41, "grad_norm": 1.5262709374679297, "learning_rate": 6.31690171212953e-06, "loss": 0.2094, "step": 722 }, { "epoch": 1.41, "grad_norm": 1.5871935954240537, "learning_rate": 6.314066652164164e-06, "loss": 0.2284, "step": 723 }, { "epoch": 1.41, "grad_norm": 1.528703360240016, "learning_rate": 6.311226360182257e-06, "loss": 0.2156, "step": 724 }, { "epoch": 1.42, "grad_norm": 1.585630125360189, "learning_rate": 6.308380841464587e-06, "loss": 0.225, "step": 725 }, { "epoch": 1.42, "grad_norm": 1.4722254097301521, "learning_rate": 6.305530101301645e-06, "loss": 0.1938, "step": 726 }, { "epoch": 1.42, "grad_norm": 1.4965368304787072, "learning_rate": 6.302674144993634e-06, "loss": 0.1983, "step": 727 }, { "epoch": 1.42, "grad_norm": 1.6916953823543392, "learning_rate": 6.2998129778504535e-06, "loss": 0.15, "step": 728 }, { "epoch": 1.42, "grad_norm": 1.5807874736674112, "learning_rate": 6.2969466051916905e-06, "loss": 0.2225, "step": 729 }, { "epoch": 1.43, "grad_norm": 1.6024934490560239, "learning_rate": 6.29407503234661e-06, "loss": 0.2051, "step": 730 }, { "epoch": 1.43, "grad_norm": 1.6963311480252483, "learning_rate": 6.291198264654147e-06, "loss": 0.236, "step": 731 }, { "epoch": 1.43, "grad_norm": 1.747708263980439, "learning_rate": 6.288316307462895e-06, "loss": 0.2256, "step": 732 }, { "epoch": 1.43, "grad_norm": 1.5842632051569467, "learning_rate": 6.285429166131092e-06, "loss": 0.1766, "step": 733 }, { "epoch": 1.43, "grad_norm": 1.6823206219851494, "learning_rate": 6.282536846026621e-06, "loss": 0.232, "step": 734 }, { "epoch": 1.44, "grad_norm": 1.6176040395973508, "learning_rate": 6.279639352526989e-06, "loss": 0.2195, "step": 735 }, { "epoch": 1.44, "grad_norm": 1.6402459042437478, "learning_rate": 6.276736691019323e-06, "loss": 0.2073, "step": 736 }, { "epoch": 1.44, "grad_norm": 1.7166508906521776, "learning_rate": 6.273828866900358e-06, "loss": 0.2605, "step": 737 }, { "epoch": 1.44, "grad_norm": 1.4205001784798386, "learning_rate": 6.270915885576429e-06, "loss": 0.1933, "step": 738 }, { "epoch": 1.44, "grad_norm": 1.7362158431947659, "learning_rate": 6.267997752463455e-06, "loss": 0.2242, "step": 739 }, { "epoch": 1.45, "grad_norm": 1.5234829049565342, "learning_rate": 6.265074472986942e-06, "loss": 0.2105, "step": 740 }, { "epoch": 1.45, "grad_norm": 1.498375875164791, "learning_rate": 6.262146052581954e-06, "loss": 0.1928, "step": 741 }, { "epoch": 1.45, "grad_norm": 1.6695301809388576, "learning_rate": 6.259212496693122e-06, "loss": 0.2485, "step": 742 }, { "epoch": 1.45, "grad_norm": 2.01790915275668, "learning_rate": 6.2562738107746195e-06, "loss": 0.2289, "step": 743 }, { "epoch": 1.45, "grad_norm": 1.549092925771211, "learning_rate": 6.253330000290159e-06, "loss": 0.2277, "step": 744 }, { "epoch": 1.46, "grad_norm": 1.5726041250261302, "learning_rate": 6.250381070712984e-06, "loss": 0.2796, "step": 745 }, { "epoch": 1.46, "grad_norm": 1.670709849936573, "learning_rate": 6.247427027525851e-06, "loss": 0.2313, "step": 746 }, { "epoch": 1.46, "grad_norm": 1.655556394460029, "learning_rate": 6.244467876221027e-06, "loss": 0.2011, "step": 747 }, { "epoch": 1.46, "grad_norm": 1.4562848458638482, "learning_rate": 6.241503622300277e-06, "loss": 0.2322, "step": 748 }, { "epoch": 1.46, "grad_norm": 1.5309070528223143, "learning_rate": 6.238534271274847e-06, "loss": 0.1892, "step": 749 }, { "epoch": 1.46, "grad_norm": 1.5545358420805504, "learning_rate": 6.235559828665468e-06, "loss": 0.2401, "step": 750 }, { "epoch": 1.47, "grad_norm": 1.4735862860454505, "learning_rate": 6.2325803000023306e-06, "loss": 0.2082, "step": 751 }, { "epoch": 1.47, "grad_norm": 1.4955055399057995, "learning_rate": 6.229595690825086e-06, "loss": 0.242, "step": 752 }, { "epoch": 1.47, "grad_norm": 1.4533649149795171, "learning_rate": 6.2266060066828295e-06, "loss": 0.2352, "step": 753 }, { "epoch": 1.47, "grad_norm": 1.5504058765097382, "learning_rate": 6.223611253134092e-06, "loss": 0.2554, "step": 754 }, { "epoch": 1.47, "grad_norm": 1.4771784518254232, "learning_rate": 6.22061143574683e-06, "loss": 0.2131, "step": 755 }, { "epoch": 1.48, "grad_norm": 1.3517199690819452, "learning_rate": 6.217606560098415e-06, "loss": 0.1921, "step": 756 }, { "epoch": 1.48, "grad_norm": 1.4700348058892618, "learning_rate": 6.214596631775621e-06, "loss": 0.2232, "step": 757 }, { "epoch": 1.48, "grad_norm": 1.6844235205995572, "learning_rate": 6.21158165637462e-06, "loss": 0.239, "step": 758 }, { "epoch": 1.48, "grad_norm": 1.5348599752773822, "learning_rate": 6.208561639500964e-06, "loss": 0.2249, "step": 759 }, { "epoch": 1.48, "grad_norm": 1.6407422584235305, "learning_rate": 6.205536586769579e-06, "loss": 0.2085, "step": 760 }, { "epoch": 1.49, "grad_norm": 1.4611237591123658, "learning_rate": 6.2025065038047566e-06, "loss": 0.217, "step": 761 }, { "epoch": 1.49, "grad_norm": 1.5121157506637923, "learning_rate": 6.199471396240139e-06, "loss": 0.2193, "step": 762 }, { "epoch": 1.49, "grad_norm": 1.5563509473366244, "learning_rate": 6.196431269718709e-06, "loss": 0.2446, "step": 763 }, { "epoch": 1.49, "grad_norm": 1.529722981597024, "learning_rate": 6.193386129892782e-06, "loss": 0.1754, "step": 764 }, { "epoch": 1.49, "grad_norm": 1.3183937021198622, "learning_rate": 6.1903359824239935e-06, "loss": 0.1635, "step": 765 }, { "epoch": 1.5, "grad_norm": 1.765905668356703, "learning_rate": 6.1872808329832926e-06, "loss": 0.2154, "step": 766 }, { "epoch": 1.5, "grad_norm": 1.5752158316495857, "learning_rate": 6.184220687250923e-06, "loss": 0.2443, "step": 767 }, { "epoch": 1.5, "grad_norm": 1.5915045582338319, "learning_rate": 6.181155550916423e-06, "loss": 0.2218, "step": 768 }, { "epoch": 1.5, "grad_norm": 1.532512213948594, "learning_rate": 6.178085429678607e-06, "loss": 0.1789, "step": 769 }, { "epoch": 1.5, "grad_norm": 1.675348488012146, "learning_rate": 6.175010329245555e-06, "loss": 0.155, "step": 770 }, { "epoch": 1.51, "grad_norm": 1.6693298203439413, "learning_rate": 6.1719302553346105e-06, "loss": 0.1671, "step": 771 }, { "epoch": 1.51, "grad_norm": 1.5278229242750316, "learning_rate": 6.168845213672358e-06, "loss": 0.2286, "step": 772 }, { "epoch": 1.51, "grad_norm": 1.6044757963682923, "learning_rate": 6.165755209994623e-06, "loss": 0.2528, "step": 773 }, { "epoch": 1.51, "grad_norm": 1.536173350290537, "learning_rate": 6.162660250046452e-06, "loss": 0.2383, "step": 774 }, { "epoch": 1.51, "grad_norm": 1.5710445868162255, "learning_rate": 6.15956033958211e-06, "loss": 0.2054, "step": 775 }, { "epoch": 1.52, "grad_norm": 1.552157970684704, "learning_rate": 6.156455484365066e-06, "loss": 0.2048, "step": 776 }, { "epoch": 1.52, "grad_norm": 1.4921743227990962, "learning_rate": 6.1533456901679806e-06, "loss": 0.2177, "step": 777 }, { "epoch": 1.52, "grad_norm": 1.4021878835817736, "learning_rate": 6.150230962772696e-06, "loss": 0.2268, "step": 778 }, { "epoch": 1.52, "grad_norm": 1.4512898649007486, "learning_rate": 6.147111307970229e-06, "loss": 0.2493, "step": 779 }, { "epoch": 1.52, "grad_norm": 1.5585930283071743, "learning_rate": 6.143986731560761e-06, "loss": 0.1912, "step": 780 }, { "epoch": 1.53, "grad_norm": 1.440538901395143, "learning_rate": 6.140857239353613e-06, "loss": 0.1808, "step": 781 }, { "epoch": 1.53, "grad_norm": 1.5252483227880906, "learning_rate": 6.137722837167257e-06, "loss": 0.2234, "step": 782 }, { "epoch": 1.53, "grad_norm": 1.6849525715002978, "learning_rate": 6.134583530829289e-06, "loss": 0.2082, "step": 783 }, { "epoch": 1.53, "grad_norm": 1.6650042816889952, "learning_rate": 6.131439326176421e-06, "loss": 0.252, "step": 784 }, { "epoch": 1.53, "grad_norm": 1.5338253924338963, "learning_rate": 6.1282902290544755e-06, "loss": 0.2357, "step": 785 }, { "epoch": 1.54, "grad_norm": 1.4427145667038659, "learning_rate": 6.125136245318369e-06, "loss": 0.2144, "step": 786 }, { "epoch": 1.54, "grad_norm": 1.5539105296047335, "learning_rate": 6.121977380832107e-06, "loss": 0.1846, "step": 787 }, { "epoch": 1.54, "grad_norm": 1.6968290813637037, "learning_rate": 6.118813641468765e-06, "loss": 0.2087, "step": 788 }, { "epoch": 1.54, "grad_norm": 1.4520442066236454, "learning_rate": 6.115645033110484e-06, "loss": 0.2221, "step": 789 }, { "epoch": 1.54, "grad_norm": 1.5904721831987416, "learning_rate": 6.112471561648458e-06, "loss": 0.2059, "step": 790 }, { "epoch": 1.54, "grad_norm": 1.3563914933070467, "learning_rate": 6.109293232982922e-06, "loss": 0.1916, "step": 791 }, { "epoch": 1.55, "grad_norm": 1.4326005538417677, "learning_rate": 6.1061100530231424e-06, "loss": 0.1648, "step": 792 }, { "epoch": 1.55, "grad_norm": 1.5596339993706587, "learning_rate": 6.102922027687403e-06, "loss": 0.2471, "step": 793 }, { "epoch": 1.55, "grad_norm": 1.5510593566375217, "learning_rate": 6.0997291629030006e-06, "loss": 0.2298, "step": 794 }, { "epoch": 1.55, "grad_norm": 1.5542671308448452, "learning_rate": 6.0965314646062255e-06, "loss": 0.2175, "step": 795 }, { "epoch": 1.55, "grad_norm": 1.5680151001564862, "learning_rate": 6.093328938742357e-06, "loss": 0.214, "step": 796 }, { "epoch": 1.56, "grad_norm": 1.512568667217441, "learning_rate": 6.090121591265649e-06, "loss": 0.1956, "step": 797 }, { "epoch": 1.56, "grad_norm": 1.5191795007826676, "learning_rate": 6.086909428139321e-06, "loss": 0.1991, "step": 798 }, { "epoch": 1.56, "grad_norm": 1.548332631029857, "learning_rate": 6.083692455335545e-06, "loss": 0.2283, "step": 799 }, { "epoch": 1.56, "grad_norm": 1.5728370524711224, "learning_rate": 6.080470678835434e-06, "loss": 0.1767, "step": 800 }, { "epoch": 1.56, "grad_norm": 1.4672258136624463, "learning_rate": 6.077244104629035e-06, "loss": 0.2451, "step": 801 }, { "epoch": 1.57, "grad_norm": 1.5325016172857406, "learning_rate": 6.074012738715316e-06, "loss": 0.2319, "step": 802 }, { "epoch": 1.57, "grad_norm": 1.7612636121903307, "learning_rate": 6.070776587102147e-06, "loss": 0.2362, "step": 803 }, { "epoch": 1.57, "grad_norm": 1.7153441327075105, "learning_rate": 6.067535655806304e-06, "loss": 0.2303, "step": 804 }, { "epoch": 1.57, "grad_norm": 1.5993286439885155, "learning_rate": 6.064289950853444e-06, "loss": 0.2388, "step": 805 }, { "epoch": 1.57, "grad_norm": 1.4716292744206727, "learning_rate": 6.061039478278104e-06, "loss": 0.2212, "step": 806 }, { "epoch": 1.58, "grad_norm": 1.5057706605609193, "learning_rate": 6.05778424412368e-06, "loss": 0.2023, "step": 807 }, { "epoch": 1.58, "grad_norm": 1.583690374805723, "learning_rate": 6.054524254442424e-06, "loss": 0.2262, "step": 808 }, { "epoch": 1.58, "grad_norm": 1.580877596266967, "learning_rate": 6.0512595152954305e-06, "loss": 0.1781, "step": 809 }, { "epoch": 1.58, "grad_norm": 1.6445308695618015, "learning_rate": 6.047990032752622e-06, "loss": 0.2172, "step": 810 }, { "epoch": 1.58, "grad_norm": 1.4868883022550186, "learning_rate": 6.04471581289274e-06, "loss": 0.2143, "step": 811 }, { "epoch": 1.59, "grad_norm": 1.6321870050354803, "learning_rate": 6.0414368618033354e-06, "loss": 0.1715, "step": 812 }, { "epoch": 1.59, "grad_norm": 1.4376514380011434, "learning_rate": 6.038153185580757e-06, "loss": 0.2095, "step": 813 }, { "epoch": 1.59, "grad_norm": 1.5197089681844589, "learning_rate": 6.0348647903301345e-06, "loss": 0.1844, "step": 814 }, { "epoch": 1.59, "grad_norm": 1.6244694603090353, "learning_rate": 6.031571682165374e-06, "loss": 0.2269, "step": 815 }, { "epoch": 1.59, "grad_norm": 1.5424084121255826, "learning_rate": 6.028273867209144e-06, "loss": 0.1848, "step": 816 }, { "epoch": 1.6, "grad_norm": 1.5581845430543129, "learning_rate": 6.0249713515928645e-06, "loss": 0.2075, "step": 817 }, { "epoch": 1.6, "grad_norm": 1.4404386701020757, "learning_rate": 6.0216641414566945e-06, "loss": 0.168, "step": 818 }, { "epoch": 1.6, "grad_norm": 1.506996468729856, "learning_rate": 6.018352242949519e-06, "loss": 0.1678, "step": 819 }, { "epoch": 1.6, "grad_norm": 1.4908140318574843, "learning_rate": 6.015035662228943e-06, "loss": 0.1928, "step": 820 }, { "epoch": 1.6, "grad_norm": 1.5449352761193949, "learning_rate": 6.011714405461277e-06, "loss": 0.2274, "step": 821 }, { "epoch": 1.61, "grad_norm": 1.404495359118684, "learning_rate": 6.008388478821523e-06, "loss": 0.1872, "step": 822 }, { "epoch": 1.61, "grad_norm": 1.611360304691286, "learning_rate": 6.005057888493365e-06, "loss": 0.1778, "step": 823 }, { "epoch": 1.61, "grad_norm": 1.4729198789068578, "learning_rate": 6.001722640669162e-06, "loss": 0.1899, "step": 824 }, { "epoch": 1.61, "grad_norm": 1.454275902790667, "learning_rate": 5.998382741549929e-06, "loss": 0.2478, "step": 825 }, { "epoch": 1.61, "grad_norm": 1.4942496995662455, "learning_rate": 5.995038197345329e-06, "loss": 0.162, "step": 826 }, { "epoch": 1.62, "grad_norm": 1.5079819254858808, "learning_rate": 5.991689014273663e-06, "loss": 0.2407, "step": 827 }, { "epoch": 1.62, "grad_norm": 1.49550239826416, "learning_rate": 5.988335198561855e-06, "loss": 0.1895, "step": 828 }, { "epoch": 1.62, "grad_norm": 1.6176117689291414, "learning_rate": 5.984976756445443e-06, "loss": 0.2164, "step": 829 }, { "epoch": 1.62, "grad_norm": 1.5195824851605562, "learning_rate": 5.981613694168567e-06, "loss": 0.1939, "step": 830 }, { "epoch": 1.62, "grad_norm": 1.6004753789306458, "learning_rate": 5.978246017983955e-06, "loss": 0.2292, "step": 831 }, { "epoch": 1.62, "grad_norm": 1.5466380195876284, "learning_rate": 5.974873734152916e-06, "loss": 0.1816, "step": 832 }, { "epoch": 1.63, "grad_norm": 1.562357188335768, "learning_rate": 5.971496848945324e-06, "loss": 0.2437, "step": 833 }, { "epoch": 1.63, "grad_norm": 1.5401361313032471, "learning_rate": 5.96811536863961e-06, "loss": 0.1989, "step": 834 }, { "epoch": 1.63, "grad_norm": 1.6091873459494557, "learning_rate": 5.964729299522746e-06, "loss": 0.1972, "step": 835 }, { "epoch": 1.63, "grad_norm": 1.7190106310795428, "learning_rate": 5.961338647890235e-06, "loss": 0.2397, "step": 836 }, { "epoch": 1.63, "grad_norm": 1.6242691662311548, "learning_rate": 5.9579434200461045e-06, "loss": 0.2044, "step": 837 }, { "epoch": 1.64, "grad_norm": 1.5531805257059634, "learning_rate": 5.954543622302885e-06, "loss": 0.1758, "step": 838 }, { "epoch": 1.64, "grad_norm": 1.5823453158045508, "learning_rate": 5.951139260981607e-06, "loss": 0.2219, "step": 839 }, { "epoch": 1.64, "grad_norm": 1.5742409418665089, "learning_rate": 5.947730342411785e-06, "loss": 0.2261, "step": 840 }, { "epoch": 1.64, "grad_norm": 1.488115505881121, "learning_rate": 5.944316872931405e-06, "loss": 0.191, "step": 841 }, { "epoch": 1.64, "grad_norm": 1.42868097665317, "learning_rate": 5.940898858886916e-06, "loss": 0.1882, "step": 842 }, { "epoch": 1.65, "grad_norm": 1.4818435411202657, "learning_rate": 5.937476306633216e-06, "loss": 0.2019, "step": 843 }, { "epoch": 1.65, "grad_norm": 2.1398913291835933, "learning_rate": 5.93404922253364e-06, "loss": 0.1795, "step": 844 }, { "epoch": 1.65, "grad_norm": 1.5073074853636819, "learning_rate": 5.9306176129599504e-06, "loss": 0.2092, "step": 845 }, { "epoch": 1.65, "grad_norm": 1.4493109015036638, "learning_rate": 5.927181484292321e-06, "loss": 0.1993, "step": 846 }, { "epoch": 1.65, "grad_norm": 1.4843098333119364, "learning_rate": 5.923740842919329e-06, "loss": 0.1839, "step": 847 }, { "epoch": 1.66, "grad_norm": 1.5190311168376478, "learning_rate": 5.9202956952379435e-06, "loss": 0.201, "step": 848 }, { "epoch": 1.66, "grad_norm": 1.5624115941271182, "learning_rate": 5.916846047653508e-06, "loss": 0.2248, "step": 849 }, { "epoch": 1.66, "grad_norm": 1.5662558070124424, "learning_rate": 5.913391906579735e-06, "loss": 0.2043, "step": 850 }, { "epoch": 1.66, "grad_norm": 1.3697307001277905, "learning_rate": 5.909933278438691e-06, "loss": 0.1858, "step": 851 }, { "epoch": 1.66, "grad_norm": 1.495079746445899, "learning_rate": 5.9064701696607854e-06, "loss": 0.2364, "step": 852 }, { "epoch": 1.67, "grad_norm": 1.725736018548126, "learning_rate": 5.903002586684759e-06, "loss": 0.2206, "step": 853 }, { "epoch": 1.67, "grad_norm": 1.4718747984335425, "learning_rate": 5.8995305359576685e-06, "loss": 0.1945, "step": 854 }, { "epoch": 1.67, "grad_norm": 2.2725863455938873, "learning_rate": 5.896054023934879e-06, "loss": 0.2344, "step": 855 }, { "epoch": 1.67, "grad_norm": 1.593554810370741, "learning_rate": 5.892573057080049e-06, "loss": 0.1882, "step": 856 }, { "epoch": 1.67, "grad_norm": 1.4037769003841023, "learning_rate": 5.8890876418651235e-06, "loss": 0.2261, "step": 857 }, { "epoch": 1.68, "grad_norm": 1.6945828304482113, "learning_rate": 5.885597784770311e-06, "loss": 0.2146, "step": 858 }, { "epoch": 1.68, "grad_norm": 1.553515054574054, "learning_rate": 5.882103492284086e-06, "loss": 0.2225, "step": 859 }, { "epoch": 1.68, "grad_norm": 1.476832246767893, "learning_rate": 5.878604770903163e-06, "loss": 0.2395, "step": 860 }, { "epoch": 1.68, "grad_norm": 1.5066320624622234, "learning_rate": 5.875101627132497e-06, "loss": 0.2265, "step": 861 }, { "epoch": 1.68, "grad_norm": 1.4206439378506275, "learning_rate": 5.8715940674852605e-06, "loss": 0.1875, "step": 862 }, { "epoch": 1.69, "grad_norm": 1.5871482892618716, "learning_rate": 5.868082098482837e-06, "loss": 0.2167, "step": 863 }, { "epoch": 1.69, "grad_norm": 1.4603409981675464, "learning_rate": 5.864565726654811e-06, "loss": 0.2338, "step": 864 }, { "epoch": 1.69, "grad_norm": 1.4732174771589595, "learning_rate": 5.8610449585389485e-06, "loss": 0.2192, "step": 865 }, { "epoch": 1.69, "grad_norm": 1.5787237480814973, "learning_rate": 5.857519800681193e-06, "loss": 0.1833, "step": 866 }, { "epoch": 1.69, "grad_norm": 1.515217845643917, "learning_rate": 5.853990259635647e-06, "loss": 0.1783, "step": 867 }, { "epoch": 1.7, "grad_norm": 1.4374853445358118, "learning_rate": 5.850456341964565e-06, "loss": 0.1982, "step": 868 }, { "epoch": 1.7, "grad_norm": 1.467721049574277, "learning_rate": 5.846918054238335e-06, "loss": 0.2037, "step": 869 }, { "epoch": 1.7, "grad_norm": 1.4929433420786429, "learning_rate": 5.8433754030354725e-06, "loss": 0.1935, "step": 870 }, { "epoch": 1.7, "grad_norm": 1.5195864940370416, "learning_rate": 5.839828394942607e-06, "loss": 0.2526, "step": 871 }, { "epoch": 1.7, "grad_norm": 1.4624089383968868, "learning_rate": 5.836277036554466e-06, "loss": 0.1837, "step": 872 }, { "epoch": 1.71, "grad_norm": 1.5167931060197903, "learning_rate": 5.8327213344738656e-06, "loss": 0.1753, "step": 873 }, { "epoch": 1.71, "grad_norm": 1.3956798637005763, "learning_rate": 5.829161295311698e-06, "loss": 0.1974, "step": 874 }, { "epoch": 1.71, "grad_norm": 1.5234030746494518, "learning_rate": 5.8255969256869195e-06, "loss": 0.2069, "step": 875 }, { "epoch": 1.71, "grad_norm": 1.6919481721100054, "learning_rate": 5.822028232226539e-06, "loss": 0.1814, "step": 876 }, { "epoch": 1.71, "grad_norm": 1.4096913795257404, "learning_rate": 5.8184552215656015e-06, "loss": 0.2111, "step": 877 }, { "epoch": 1.71, "grad_norm": 1.8126342470335342, "learning_rate": 5.81487790034718e-06, "loss": 0.2748, "step": 878 }, { "epoch": 1.72, "grad_norm": 1.5453247415436804, "learning_rate": 5.811296275222363e-06, "loss": 0.2411, "step": 879 }, { "epoch": 1.72, "grad_norm": 1.5085961028830772, "learning_rate": 5.807710352850241e-06, "loss": 0.2181, "step": 880 }, { "epoch": 1.72, "grad_norm": 1.4520310083769652, "learning_rate": 5.804120139897891e-06, "loss": 0.1958, "step": 881 }, { "epoch": 1.72, "grad_norm": 1.4845449262700783, "learning_rate": 5.800525643040371e-06, "loss": 0.1877, "step": 882 }, { "epoch": 1.72, "grad_norm": 1.5298335228495592, "learning_rate": 5.796926868960701e-06, "loss": 0.2231, "step": 883 }, { "epoch": 1.73, "grad_norm": 1.4733608914574314, "learning_rate": 5.793323824349856e-06, "loss": 0.1943, "step": 884 }, { "epoch": 1.73, "grad_norm": 1.5421270431763836, "learning_rate": 5.7897165159067485e-06, "loss": 0.1901, "step": 885 }, { "epoch": 1.73, "grad_norm": 1.549422322748746, "learning_rate": 5.7861049503382185e-06, "loss": 0.2333, "step": 886 }, { "epoch": 1.73, "grad_norm": 1.554389774878112, "learning_rate": 5.782489134359023e-06, "loss": 0.2222, "step": 887 }, { "epoch": 1.73, "grad_norm": 1.5289791882651025, "learning_rate": 5.778869074691822e-06, "loss": 0.2217, "step": 888 }, { "epoch": 1.74, "grad_norm": 1.490365207133443, "learning_rate": 5.775244778067161e-06, "loss": 0.2113, "step": 889 }, { "epoch": 1.74, "grad_norm": 1.5232422582924288, "learning_rate": 5.771616251223469e-06, "loss": 0.199, "step": 890 }, { "epoch": 1.74, "grad_norm": 1.5126305593777163, "learning_rate": 5.767983500907034e-06, "loss": 0.203, "step": 891 }, { "epoch": 1.74, "grad_norm": 1.532123179909865, "learning_rate": 5.764346533872001e-06, "loss": 0.2182, "step": 892 }, { "epoch": 1.74, "grad_norm": 1.4881275534966363, "learning_rate": 5.760705356880353e-06, "loss": 0.2053, "step": 893 }, { "epoch": 1.75, "grad_norm": 1.562348316162778, "learning_rate": 5.757059976701901e-06, "loss": 0.1629, "step": 894 }, { "epoch": 1.75, "grad_norm": 1.4896423452739929, "learning_rate": 5.75341040011427e-06, "loss": 0.2186, "step": 895 }, { "epoch": 1.75, "grad_norm": 1.5822820305950516, "learning_rate": 5.749756633902887e-06, "loss": 0.2141, "step": 896 }, { "epoch": 1.75, "grad_norm": 1.5930001564280412, "learning_rate": 5.74609868486097e-06, "loss": 0.2054, "step": 897 }, { "epoch": 1.75, "grad_norm": 1.3938590988546011, "learning_rate": 5.742436559789513e-06, "loss": 0.1641, "step": 898 }, { "epoch": 1.76, "grad_norm": 1.6400236764265115, "learning_rate": 5.738770265497272e-06, "loss": 0.2246, "step": 899 }, { "epoch": 1.76, "grad_norm": 1.571528049424452, "learning_rate": 5.735099808800758e-06, "loss": 0.211, "step": 900 }, { "epoch": 1.76, "grad_norm": 1.587285343826691, "learning_rate": 5.73142519652422e-06, "loss": 0.1466, "step": 901 }, { "epoch": 1.76, "grad_norm": 1.4322804677820546, "learning_rate": 5.727746435499632e-06, "loss": 0.1493, "step": 902 }, { "epoch": 1.76, "grad_norm": 1.5814341567998893, "learning_rate": 5.724063532566682e-06, "loss": 0.2037, "step": 903 }, { "epoch": 1.77, "grad_norm": 1.7819406609408943, "learning_rate": 5.720376494572759e-06, "loss": 0.2159, "step": 904 }, { "epoch": 1.77, "grad_norm": 1.6552966266351143, "learning_rate": 5.716685328372941e-06, "loss": 0.1995, "step": 905 }, { "epoch": 1.77, "grad_norm": 1.5513275356511933, "learning_rate": 5.712990040829979e-06, "loss": 0.2179, "step": 906 }, { "epoch": 1.77, "grad_norm": 1.542957513583932, "learning_rate": 5.70929063881429e-06, "loss": 0.2446, "step": 907 }, { "epoch": 1.77, "grad_norm": 1.556426541332733, "learning_rate": 5.705587129203936e-06, "loss": 0.2069, "step": 908 }, { "epoch": 1.78, "grad_norm": 1.3973770828864236, "learning_rate": 5.701879518884622e-06, "loss": 0.1997, "step": 909 }, { "epoch": 1.78, "grad_norm": 1.490570060222638, "learning_rate": 5.69816781474967e-06, "loss": 0.1817, "step": 910 }, { "epoch": 1.78, "grad_norm": 1.47662548654607, "learning_rate": 5.694452023700021e-06, "loss": 0.1841, "step": 911 }, { "epoch": 1.78, "grad_norm": 1.4424071020043063, "learning_rate": 5.690732152644207e-06, "loss": 0.1977, "step": 912 }, { "epoch": 1.78, "grad_norm": 1.6489940789607038, "learning_rate": 5.687008208498352e-06, "loss": 0.2045, "step": 913 }, { "epoch": 1.79, "grad_norm": 1.3098160110263568, "learning_rate": 5.6832801981861506e-06, "loss": 0.1909, "step": 914 }, { "epoch": 1.79, "grad_norm": 1.391153280981679, "learning_rate": 5.6795481286388565e-06, "loss": 0.2196, "step": 915 }, { "epoch": 1.79, "grad_norm": 1.5868146308132831, "learning_rate": 5.675812006795271e-06, "loss": 0.2359, "step": 916 }, { "epoch": 1.79, "grad_norm": 1.395613556214658, "learning_rate": 5.67207183960173e-06, "loss": 0.1906, "step": 917 }, { "epoch": 1.79, "grad_norm": 1.4869027359130254, "learning_rate": 5.668327634012089e-06, "loss": 0.1619, "step": 918 }, { "epoch": 1.79, "grad_norm": 1.4098987253749389, "learning_rate": 5.664579396987714e-06, "loss": 0.1819, "step": 919 }, { "epoch": 1.8, "grad_norm": 1.4645764743040035, "learning_rate": 5.6608271354974675e-06, "loss": 0.1692, "step": 920 }, { "epoch": 1.8, "grad_norm": 1.4326780800568164, "learning_rate": 5.657070856517689e-06, "loss": 0.2305, "step": 921 }, { "epoch": 1.8, "grad_norm": 1.615199298727607, "learning_rate": 5.653310567032194e-06, "loss": 0.2303, "step": 922 }, { "epoch": 1.8, "grad_norm": 1.6392604222387446, "learning_rate": 5.64954627403225e-06, "loss": 0.2521, "step": 923 }, { "epoch": 1.8, "grad_norm": 1.4743308217691315, "learning_rate": 5.645777984516568e-06, "loss": 0.2269, "step": 924 }, { "epoch": 1.81, "grad_norm": 1.557739372973194, "learning_rate": 5.6420057054912946e-06, "loss": 0.1774, "step": 925 }, { "epoch": 1.81, "grad_norm": 1.4407574867350597, "learning_rate": 5.638229443969987e-06, "loss": 0.2151, "step": 926 }, { "epoch": 1.81, "grad_norm": 1.5931950702871178, "learning_rate": 5.63444920697361e-06, "loss": 0.1853, "step": 927 }, { "epoch": 1.81, "grad_norm": 1.4064468960543883, "learning_rate": 5.630665001530522e-06, "loss": 0.2086, "step": 928 }, { "epoch": 1.81, "grad_norm": 1.6076928819761562, "learning_rate": 5.6268768346764565e-06, "loss": 0.2024, "step": 929 }, { "epoch": 1.82, "grad_norm": 1.4590205205855162, "learning_rate": 5.623084713454511e-06, "loss": 0.2604, "step": 930 }, { "epoch": 1.82, "grad_norm": 1.724305705321706, "learning_rate": 5.61928864491514e-06, "loss": 0.2413, "step": 931 }, { "epoch": 1.82, "grad_norm": 1.4930303956141382, "learning_rate": 5.615488636116131e-06, "loss": 0.2202, "step": 932 }, { "epoch": 1.82, "grad_norm": 1.4371642628335974, "learning_rate": 5.611684694122604e-06, "loss": 0.2108, "step": 933 }, { "epoch": 1.82, "grad_norm": 1.4975661411229741, "learning_rate": 5.607876826006988e-06, "loss": 0.2256, "step": 934 }, { "epoch": 1.83, "grad_norm": 1.383346380303676, "learning_rate": 5.604065038849008e-06, "loss": 0.1774, "step": 935 }, { "epoch": 1.83, "grad_norm": 1.6045799334961655, "learning_rate": 5.600249339735683e-06, "loss": 0.1708, "step": 936 }, { "epoch": 1.83, "grad_norm": 1.5175089362390413, "learning_rate": 5.596429735761302e-06, "loss": 0.1977, "step": 937 }, { "epoch": 1.83, "grad_norm": 1.5264938726333115, "learning_rate": 5.592606234027411e-06, "loss": 0.2629, "step": 938 }, { "epoch": 1.83, "grad_norm": 1.565128868244776, "learning_rate": 5.588778841642805e-06, "loss": 0.206, "step": 939 }, { "epoch": 1.84, "grad_norm": 1.5307085262772036, "learning_rate": 5.584947565723517e-06, "loss": 0.1956, "step": 940 }, { "epoch": 1.84, "grad_norm": 1.4013536765645869, "learning_rate": 5.581112413392794e-06, "loss": 0.2122, "step": 941 }, { "epoch": 1.84, "grad_norm": 1.8494241753486695, "learning_rate": 5.577273391781091e-06, "loss": 0.2235, "step": 942 }, { "epoch": 1.84, "grad_norm": 1.456530194361418, "learning_rate": 5.573430508026063e-06, "loss": 0.1976, "step": 943 }, { "epoch": 1.84, "grad_norm": 1.5239059830330353, "learning_rate": 5.569583769272539e-06, "loss": 0.1695, "step": 944 }, { "epoch": 1.85, "grad_norm": 1.375494741905397, "learning_rate": 5.5657331826725164e-06, "loss": 0.1627, "step": 945 }, { "epoch": 1.85, "grad_norm": 1.6720598408556684, "learning_rate": 5.561878755385149e-06, "loss": 0.1714, "step": 946 }, { "epoch": 1.85, "grad_norm": 1.6907360479983842, "learning_rate": 5.55802049457673e-06, "loss": 0.2071, "step": 947 }, { "epoch": 1.85, "grad_norm": 1.697427544165424, "learning_rate": 5.554158407420681e-06, "loss": 0.2103, "step": 948 }, { "epoch": 1.85, "grad_norm": 1.5867390371361578, "learning_rate": 5.550292501097536e-06, "loss": 0.2073, "step": 949 }, { "epoch": 1.86, "grad_norm": 1.7587328564505205, "learning_rate": 5.546422782794931e-06, "loss": 0.1705, "step": 950 }, { "epoch": 1.86, "grad_norm": 1.8453638219509245, "learning_rate": 5.542549259707588e-06, "loss": 0.223, "step": 951 }, { "epoch": 1.86, "grad_norm": 1.5480370480703756, "learning_rate": 5.5386719390373075e-06, "loss": 0.2079, "step": 952 }, { "epoch": 1.86, "grad_norm": 1.7251570179442886, "learning_rate": 5.5347908279929435e-06, "loss": 0.2703, "step": 953 }, { "epoch": 1.86, "grad_norm": 1.6020609558050354, "learning_rate": 5.530905933790402e-06, "loss": 0.199, "step": 954 }, { "epoch": 1.87, "grad_norm": 1.4860180111061638, "learning_rate": 5.527017263652621e-06, "loss": 0.188, "step": 955 }, { "epoch": 1.87, "grad_norm": 1.6919994512696839, "learning_rate": 5.523124824809562e-06, "loss": 0.2098, "step": 956 }, { "epoch": 1.87, "grad_norm": 1.5231203625636283, "learning_rate": 5.519228624498188e-06, "loss": 0.1826, "step": 957 }, { "epoch": 1.87, "grad_norm": 1.4385966897540254, "learning_rate": 5.515328669962459e-06, "loss": 0.2267, "step": 958 }, { "epoch": 1.87, "grad_norm": 1.5082670550394852, "learning_rate": 5.5114249684533145e-06, "loss": 0.1919, "step": 959 }, { "epoch": 1.88, "grad_norm": 1.502205619565587, "learning_rate": 5.507517527228661e-06, "loss": 0.2103, "step": 960 }, { "epoch": 1.88, "grad_norm": 1.4278627961744124, "learning_rate": 5.503606353553358e-06, "loss": 0.1422, "step": 961 }, { "epoch": 1.88, "grad_norm": 1.5652072984145602, "learning_rate": 5.499691454699202e-06, "loss": 0.2169, "step": 962 }, { "epoch": 1.88, "grad_norm": 1.6809986578331388, "learning_rate": 5.495772837944917e-06, "loss": 0.253, "step": 963 }, { "epoch": 1.88, "grad_norm": 1.7059773241697571, "learning_rate": 5.4918505105761435e-06, "loss": 0.193, "step": 964 }, { "epoch": 1.88, "grad_norm": 1.4934362286868652, "learning_rate": 5.4879244798854145e-06, "loss": 0.2035, "step": 965 }, { "epoch": 1.89, "grad_norm": 1.5558325673618607, "learning_rate": 5.483994753172151e-06, "loss": 0.179, "step": 966 }, { "epoch": 1.89, "grad_norm": 1.5237010038316467, "learning_rate": 5.4800613377426455e-06, "loss": 0.2212, "step": 967 }, { "epoch": 1.89, "grad_norm": 1.5126074285798818, "learning_rate": 5.476124240910052e-06, "loss": 0.1758, "step": 968 }, { "epoch": 1.89, "grad_norm": 1.6762678628504186, "learning_rate": 5.472183469994362e-06, "loss": 0.2504, "step": 969 }, { "epoch": 1.89, "grad_norm": 1.6026881787392375, "learning_rate": 5.468239032322407e-06, "loss": 0.1744, "step": 970 }, { "epoch": 1.9, "grad_norm": 1.7658039196870783, "learning_rate": 5.464290935227826e-06, "loss": 0.2652, "step": 971 }, { "epoch": 1.9, "grad_norm": 1.4775799561094631, "learning_rate": 5.460339186051069e-06, "loss": 0.2163, "step": 972 }, { "epoch": 1.9, "grad_norm": 1.5658642560610148, "learning_rate": 5.456383792139375e-06, "loss": 0.1961, "step": 973 }, { "epoch": 1.9, "grad_norm": 1.4583166227016233, "learning_rate": 5.452424760846757e-06, "loss": 0.1742, "step": 974 }, { "epoch": 1.9, "grad_norm": 1.6160322961200122, "learning_rate": 5.4484620995339936e-06, "loss": 0.2557, "step": 975 }, { "epoch": 1.91, "grad_norm": 1.6440592564114886, "learning_rate": 5.444495815568607e-06, "loss": 0.2078, "step": 976 }, { "epoch": 1.91, "grad_norm": 1.5144202657590966, "learning_rate": 5.440525916324864e-06, "loss": 0.22, "step": 977 }, { "epoch": 1.91, "grad_norm": 1.5497424149959034, "learning_rate": 5.436552409183743e-06, "loss": 0.2159, "step": 978 }, { "epoch": 1.91, "grad_norm": 1.520533855170838, "learning_rate": 5.432575301532938e-06, "loss": 0.218, "step": 979 }, { "epoch": 1.91, "grad_norm": 1.5695170177993987, "learning_rate": 5.428594600766834e-06, "loss": 0.1935, "step": 980 }, { "epoch": 1.92, "grad_norm": 1.4747771426659602, "learning_rate": 5.424610314286495e-06, "loss": 0.2392, "step": 981 }, { "epoch": 1.92, "grad_norm": 1.4851590157528525, "learning_rate": 5.420622449499655e-06, "loss": 0.1815, "step": 982 }, { "epoch": 1.92, "grad_norm": 1.623012602867834, "learning_rate": 5.4166310138207e-06, "loss": 0.1994, "step": 983 }, { "epoch": 1.92, "grad_norm": 1.596564284924492, "learning_rate": 5.412636014670652e-06, "loss": 0.1957, "step": 984 }, { "epoch": 1.92, "grad_norm": 1.5020002773023513, "learning_rate": 5.408637459477162e-06, "loss": 0.18, "step": 985 }, { "epoch": 1.93, "grad_norm": 1.5354667262386859, "learning_rate": 5.404635355674492e-06, "loss": 0.2204, "step": 986 }, { "epoch": 1.93, "grad_norm": 1.6158746184855797, "learning_rate": 5.400629710703501e-06, "loss": 0.2204, "step": 987 }, { "epoch": 1.93, "grad_norm": 1.4295037035300706, "learning_rate": 5.396620532011631e-06, "loss": 0.185, "step": 988 }, { "epoch": 1.93, "grad_norm": 1.6459481657853157, "learning_rate": 5.392607827052896e-06, "loss": 0.2388, "step": 989 }, { "epoch": 1.93, "grad_norm": 1.5468243535855548, "learning_rate": 5.388591603287863e-06, "loss": 0.2338, "step": 990 }, { "epoch": 1.94, "grad_norm": 1.484344092051154, "learning_rate": 5.384571868183646e-06, "loss": 0.2087, "step": 991 }, { "epoch": 1.94, "grad_norm": 1.4403149758844793, "learning_rate": 5.380548629213884e-06, "loss": 0.2042, "step": 992 }, { "epoch": 1.94, "grad_norm": 1.6348083611298123, "learning_rate": 5.37652189385873e-06, "loss": 0.2235, "step": 993 }, { "epoch": 1.94, "grad_norm": 1.4000133500833691, "learning_rate": 5.372491669604841e-06, "loss": 0.2344, "step": 994 }, { "epoch": 1.94, "grad_norm": 1.4119217472154753, "learning_rate": 5.368457963945356e-06, "loss": 0.1959, "step": 995 }, { "epoch": 1.95, "grad_norm": 1.5304438206865616, "learning_rate": 5.364420784379892e-06, "loss": 0.2168, "step": 996 }, { "epoch": 1.95, "grad_norm": 1.5704029582487604, "learning_rate": 5.360380138414521e-06, "loss": 0.1768, "step": 997 }, { "epoch": 1.95, "grad_norm": 1.5778856752021329, "learning_rate": 5.356336033561761e-06, "loss": 0.2351, "step": 998 }, { "epoch": 1.95, "grad_norm": 1.427044103849254, "learning_rate": 5.352288477340562e-06, "loss": 0.2348, "step": 999 }, { "epoch": 1.95, "grad_norm": 1.4728273159728156, "learning_rate": 5.348237477276288e-06, "loss": 0.2231, "step": 1000 }, { "epoch": 1.96, "grad_norm": 1.6678090465389848, "learning_rate": 5.344183040900709e-06, "loss": 0.1758, "step": 1001 }, { "epoch": 1.96, "grad_norm": 1.4991466684606622, "learning_rate": 5.340125175751983e-06, "loss": 0.1881, "step": 1002 }, { "epoch": 1.96, "grad_norm": 1.4757941843058378, "learning_rate": 5.336063889374641e-06, "loss": 0.2133, "step": 1003 }, { "epoch": 1.96, "grad_norm": 1.4497302145220115, "learning_rate": 5.331999189319578e-06, "loss": 0.1554, "step": 1004 }, { "epoch": 1.96, "grad_norm": 1.6390344915631472, "learning_rate": 5.327931083144033e-06, "loss": 0.164, "step": 1005 }, { "epoch": 1.96, "grad_norm": 1.5504285829742515, "learning_rate": 5.323859578411582e-06, "loss": 0.2021, "step": 1006 }, { "epoch": 1.97, "grad_norm": 1.4439159988681582, "learning_rate": 5.319784682692114e-06, "loss": 0.2141, "step": 1007 }, { "epoch": 1.97, "grad_norm": 1.7838389728034263, "learning_rate": 5.315706403561825e-06, "loss": 0.2551, "step": 1008 }, { "epoch": 1.97, "grad_norm": 1.680348282149841, "learning_rate": 5.311624748603203e-06, "loss": 0.204, "step": 1009 }, { "epoch": 1.97, "grad_norm": 1.7092828642871154, "learning_rate": 5.3075397254050135e-06, "loss": 0.2026, "step": 1010 }, { "epoch": 1.97, "grad_norm": 1.6053107314068327, "learning_rate": 5.30345134156228e-06, "loss": 0.223, "step": 1011 }, { "epoch": 1.98, "grad_norm": 1.510822783048769, "learning_rate": 5.299359604676275e-06, "loss": 0.2222, "step": 1012 }, { "epoch": 1.98, "grad_norm": 1.7660359991272145, "learning_rate": 5.295264522354512e-06, "loss": 0.2476, "step": 1013 }, { "epoch": 1.98, "grad_norm": 1.579209925300224, "learning_rate": 5.291166102210713e-06, "loss": 0.177, "step": 1014 }, { "epoch": 1.98, "grad_norm": 1.4986042483615938, "learning_rate": 5.287064351864818e-06, "loss": 0.2127, "step": 1015 }, { "epoch": 1.98, "grad_norm": 1.5940693602766265, "learning_rate": 5.282959278942947e-06, "loss": 0.2595, "step": 1016 }, { "epoch": 1.99, "grad_norm": 1.6359927701016461, "learning_rate": 5.2788508910774055e-06, "loss": 0.2115, "step": 1017 }, { "epoch": 1.99, "grad_norm": 1.4114738566677265, "learning_rate": 5.27473919590666e-06, "loss": 0.2348, "step": 1018 }, { "epoch": 1.99, "grad_norm": 1.5981486059791814, "learning_rate": 5.270624201075326e-06, "loss": 0.1901, "step": 1019 }, { "epoch": 1.99, "grad_norm": 1.5847060429204445, "learning_rate": 5.266505914234152e-06, "loss": 0.1986, "step": 1020 }, { "epoch": 1.99, "grad_norm": 1.3674547706455131, "learning_rate": 5.2623843430400116e-06, "loss": 0.205, "step": 1021 }, { "epoch": 2.0, "grad_norm": 1.437984337304766, "learning_rate": 5.25825949515588e-06, "loss": 0.2113, "step": 1022 }, { "epoch": 2.0, "grad_norm": 1.839473777991614, "learning_rate": 5.254131378250826e-06, "loss": 0.2307, "step": 1023 }, { "epoch": 2.0, "grad_norm": 1.3977186307466876, "learning_rate": 5.25e-06, "loss": 0.2026, "step": 1024 }, { "epoch": 2.0, "grad_norm": 1.507992347128485, "learning_rate": 5.24586536808461e-06, "loss": 0.2047, "step": 1025 }, { "epoch": 2.0, "grad_norm": 1.4180699776832637, "learning_rate": 5.241727490191916e-06, "loss": 0.13, "step": 1026 }, { "epoch": 2.01, "grad_norm": 1.4237332016720932, "learning_rate": 5.237586374015216e-06, "loss": 0.2074, "step": 1027 }, { "epoch": 2.01, "grad_norm": 1.4456363897541993, "learning_rate": 5.233442027253823e-06, "loss": 0.2092, "step": 1028 }, { "epoch": 2.01, "grad_norm": 1.565422856985889, "learning_rate": 5.229294457613061e-06, "loss": 0.1854, "step": 1029 }, { "epoch": 2.01, "grad_norm": 1.6207731659463627, "learning_rate": 5.2251436728042444e-06, "loss": 0.1797, "step": 1030 }, { "epoch": 2.01, "grad_norm": 1.678026845084981, "learning_rate": 5.2209896805446645e-06, "loss": 0.2272, "step": 1031 }, { "epoch": 2.02, "grad_norm": 1.62958534297537, "learning_rate": 5.216832488557577e-06, "loss": 0.2161, "step": 1032 }, { "epoch": 2.02, "grad_norm": 1.5173980707118515, "learning_rate": 5.212672104572189e-06, "loss": 0.1673, "step": 1033 }, { "epoch": 2.02, "grad_norm": 1.5878450782827946, "learning_rate": 5.208508536323637e-06, "loss": 0.2184, "step": 1034 }, { "epoch": 2.02, "grad_norm": 1.5232990135289146, "learning_rate": 5.204341791552983e-06, "loss": 0.2003, "step": 1035 }, { "epoch": 2.02, "grad_norm": 1.4961377322269938, "learning_rate": 5.20017187800719e-06, "loss": 0.2024, "step": 1036 }, { "epoch": 2.03, "grad_norm": 1.5311196680493049, "learning_rate": 5.195998803439117e-06, "loss": 0.2058, "step": 1037 }, { "epoch": 2.03, "grad_norm": 1.6195983516937287, "learning_rate": 5.191822575607498e-06, "loss": 0.2147, "step": 1038 }, { "epoch": 2.03, "grad_norm": 1.4285180703489377, "learning_rate": 5.18764320227693e-06, "loss": 0.2086, "step": 1039 }, { "epoch": 2.03, "grad_norm": 1.4556717090914295, "learning_rate": 5.183460691217857e-06, "loss": 0.1878, "step": 1040 }, { "epoch": 2.03, "grad_norm": 1.4217659701248018, "learning_rate": 5.179275050206558e-06, "loss": 0.1945, "step": 1041 }, { "epoch": 2.04, "grad_norm": 1.3023334414741499, "learning_rate": 5.175086287025134e-06, "loss": 0.1499, "step": 1042 }, { "epoch": 2.04, "grad_norm": 1.4413226727978252, "learning_rate": 5.170894409461483e-06, "loss": 0.177, "step": 1043 }, { "epoch": 2.04, "grad_norm": 1.5041734207604809, "learning_rate": 5.166699425309303e-06, "loss": 0.2142, "step": 1044 }, { "epoch": 2.04, "grad_norm": 1.3859055876779218, "learning_rate": 5.1625013423680605e-06, "loss": 0.2045, "step": 1045 }, { "epoch": 2.04, "grad_norm": 1.263929794326746, "learning_rate": 5.158300168442987e-06, "loss": 0.1158, "step": 1046 }, { "epoch": 2.04, "grad_norm": 1.5835340393826198, "learning_rate": 5.154095911345061e-06, "loss": 0.127, "step": 1047 }, { "epoch": 2.05, "grad_norm": 1.404697012904978, "learning_rate": 5.1498885788909926e-06, "loss": 0.1694, "step": 1048 }, { "epoch": 2.05, "grad_norm": 1.6348854524427128, "learning_rate": 5.1456781789032064e-06, "loss": 0.2479, "step": 1049 }, { "epoch": 2.05, "grad_norm": 1.5330896931122913, "learning_rate": 5.141464719209837e-06, "loss": 0.1676, "step": 1050 }, { "epoch": 2.05, "grad_norm": 1.4434226142931537, "learning_rate": 5.137248207644702e-06, "loss": 0.1693, "step": 1051 }, { "epoch": 2.05, "grad_norm": 1.5129239942913304, "learning_rate": 5.133028652047296e-06, "loss": 0.2059, "step": 1052 }, { "epoch": 2.06, "grad_norm": 1.5756340808908464, "learning_rate": 5.128806060262774e-06, "loss": 0.1958, "step": 1053 }, { "epoch": 2.06, "grad_norm": 1.5053947421634706, "learning_rate": 5.12458044014193e-06, "loss": 0.2092, "step": 1054 }, { "epoch": 2.06, "grad_norm": 1.5576080736407454, "learning_rate": 5.120351799541198e-06, "loss": 0.1878, "step": 1055 }, { "epoch": 2.06, "grad_norm": 1.6106968079532002, "learning_rate": 5.11612014632262e-06, "loss": 0.2113, "step": 1056 }, { "epoch": 2.06, "grad_norm": 1.3767809178691706, "learning_rate": 5.1118854883538396e-06, "loss": 0.1398, "step": 1057 }, { "epoch": 2.07, "grad_norm": 1.4421699528486192, "learning_rate": 5.107647833508094e-06, "loss": 0.1694, "step": 1058 }, { "epoch": 2.07, "grad_norm": 1.575032839140396, "learning_rate": 5.103407189664184e-06, "loss": 0.1887, "step": 1059 }, { "epoch": 2.07, "grad_norm": 1.4705643213790875, "learning_rate": 5.099163564706473e-06, "loss": 0.1899, "step": 1060 }, { "epoch": 2.07, "grad_norm": 1.5202092876993683, "learning_rate": 5.094916966524863e-06, "loss": 0.1887, "step": 1061 }, { "epoch": 2.07, "grad_norm": 1.5486612082642675, "learning_rate": 5.090667403014788e-06, "loss": 0.1917, "step": 1062 }, { "epoch": 2.08, "grad_norm": 1.5563273452052548, "learning_rate": 5.0864148820771915e-06, "loss": 0.1946, "step": 1063 }, { "epoch": 2.08, "grad_norm": 1.4985057614636275, "learning_rate": 5.082159411618519e-06, "loss": 0.1418, "step": 1064 }, { "epoch": 2.08, "grad_norm": 1.474077241545786, "learning_rate": 5.077900999550697e-06, "loss": 0.1867, "step": 1065 }, { "epoch": 2.08, "grad_norm": 1.416004864358388, "learning_rate": 5.0736396537911234e-06, "loss": 0.1875, "step": 1066 }, { "epoch": 2.08, "grad_norm": 1.421353822763772, "learning_rate": 5.069375382262648e-06, "loss": 0.1799, "step": 1067 }, { "epoch": 2.09, "grad_norm": 1.5321273345368276, "learning_rate": 5.065108192893563e-06, "loss": 0.1711, "step": 1068 }, { "epoch": 2.09, "grad_norm": 1.4892021043302694, "learning_rate": 5.0608380936175835e-06, "loss": 0.166, "step": 1069 }, { "epoch": 2.09, "grad_norm": 1.6745381249330173, "learning_rate": 5.056565092373836e-06, "loss": 0.1861, "step": 1070 }, { "epoch": 2.09, "grad_norm": 1.8788150792734495, "learning_rate": 5.052289197106843e-06, "loss": 0.137, "step": 1071 }, { "epoch": 2.09, "grad_norm": 1.691606093565409, "learning_rate": 5.048010415766505e-06, "loss": 0.1701, "step": 1072 }, { "epoch": 2.1, "grad_norm": 1.4143201451828973, "learning_rate": 5.043728756308091e-06, "loss": 0.1624, "step": 1073 }, { "epoch": 2.1, "grad_norm": 1.5918485430667713, "learning_rate": 5.0394442266922196e-06, "loss": 0.2099, "step": 1074 }, { "epoch": 2.1, "grad_norm": 1.7645425614400474, "learning_rate": 5.035156834884847e-06, "loss": 0.1721, "step": 1075 }, { "epoch": 2.1, "grad_norm": 1.5751513129083368, "learning_rate": 5.030866588857251e-06, "loss": 0.1752, "step": 1076 }, { "epoch": 2.1, "grad_norm": 1.6770722860308254, "learning_rate": 5.026573496586013e-06, "loss": 0.1726, "step": 1077 }, { "epoch": 2.11, "grad_norm": 1.4982593616596884, "learning_rate": 5.02227756605301e-06, "loss": 0.1616, "step": 1078 }, { "epoch": 2.11, "grad_norm": 1.688908451728461, "learning_rate": 5.017978805245394e-06, "loss": 0.1535, "step": 1079 }, { "epoch": 2.11, "grad_norm": 1.5758582721486598, "learning_rate": 5.013677222155581e-06, "loss": 0.1735, "step": 1080 }, { "epoch": 2.11, "grad_norm": 1.5046121844410383, "learning_rate": 5.009372824781232e-06, "loss": 0.14, "step": 1081 }, { "epoch": 2.11, "grad_norm": 1.5507596761614495, "learning_rate": 5.0050656211252426e-06, "loss": 0.1694, "step": 1082 }, { "epoch": 2.12, "grad_norm": 1.5059939501754849, "learning_rate": 5.000755619195723e-06, "loss": 0.1501, "step": 1083 }, { "epoch": 2.12, "grad_norm": 1.7020649060912085, "learning_rate": 4.996442827005987e-06, "loss": 0.1671, "step": 1084 }, { "epoch": 2.12, "grad_norm": 1.5419850378685844, "learning_rate": 4.992127252574539e-06, "loss": 0.1618, "step": 1085 }, { "epoch": 2.12, "grad_norm": 1.7185920033649857, "learning_rate": 4.987808903925054e-06, "loss": 0.1704, "step": 1086 }, { "epoch": 2.12, "grad_norm": 1.3909925795911562, "learning_rate": 4.983487789086366e-06, "loss": 0.1477, "step": 1087 }, { "epoch": 2.12, "grad_norm": 1.7149797589858238, "learning_rate": 4.979163916092448e-06, "loss": 0.1452, "step": 1088 }, { "epoch": 2.13, "grad_norm": 1.5349811978915828, "learning_rate": 4.974837292982406e-06, "loss": 0.1574, "step": 1089 }, { "epoch": 2.13, "grad_norm": 1.642148428513541, "learning_rate": 4.970507927800459e-06, "loss": 0.1565, "step": 1090 }, { "epoch": 2.13, "grad_norm": 1.4939497887170976, "learning_rate": 4.966175828595919e-06, "loss": 0.1718, "step": 1091 }, { "epoch": 2.13, "grad_norm": 1.4865876555004593, "learning_rate": 4.961841003423187e-06, "loss": 0.1739, "step": 1092 }, { "epoch": 2.13, "grad_norm": 1.5240970230481368, "learning_rate": 4.95750346034173e-06, "loss": 0.1612, "step": 1093 }, { "epoch": 2.14, "grad_norm": 1.6368518911374574, "learning_rate": 4.953163207416067e-06, "loss": 0.1574, "step": 1094 }, { "epoch": 2.14, "grad_norm": 1.4829963682643221, "learning_rate": 4.948820252715757e-06, "loss": 0.1621, "step": 1095 }, { "epoch": 2.14, "grad_norm": 1.631283022405469, "learning_rate": 4.944474604315381e-06, "loss": 0.1249, "step": 1096 }, { "epoch": 2.14, "grad_norm": 1.4877803057738876, "learning_rate": 4.9401262702945304e-06, "loss": 0.1707, "step": 1097 }, { "epoch": 2.14, "grad_norm": 1.5943201787073402, "learning_rate": 4.935775258737787e-06, "loss": 0.155, "step": 1098 }, { "epoch": 2.15, "grad_norm": 1.4726575274226479, "learning_rate": 4.931421577734711e-06, "loss": 0.1329, "step": 1099 }, { "epoch": 2.15, "grad_norm": 1.5211084055231698, "learning_rate": 4.927065235379828e-06, "loss": 0.1382, "step": 1100 }, { "epoch": 2.15, "grad_norm": 1.5023363074433393, "learning_rate": 4.922706239772611e-06, "loss": 0.1727, "step": 1101 }, { "epoch": 2.15, "grad_norm": 1.5784270656249473, "learning_rate": 4.918344599017464e-06, "loss": 0.1775, "step": 1102 }, { "epoch": 2.15, "grad_norm": 1.3706693557323213, "learning_rate": 4.913980321223712e-06, "loss": 0.1174, "step": 1103 }, { "epoch": 2.16, "grad_norm": 1.5593675679362624, "learning_rate": 4.9096134145055806e-06, "loss": 0.1788, "step": 1104 }, { "epoch": 2.16, "grad_norm": 1.525897532924995, "learning_rate": 4.905243886982183e-06, "loss": 0.1426, "step": 1105 }, { "epoch": 2.16, "grad_norm": 1.4872580739479742, "learning_rate": 4.900871746777507e-06, "loss": 0.1625, "step": 1106 }, { "epoch": 2.16, "grad_norm": 1.4527697628939023, "learning_rate": 4.896497002020397e-06, "loss": 0.1679, "step": 1107 }, { "epoch": 2.16, "grad_norm": 1.4149424139347593, "learning_rate": 4.892119660844538e-06, "loss": 0.168, "step": 1108 }, { "epoch": 2.17, "grad_norm": 1.5181775324988525, "learning_rate": 4.8877397313884485e-06, "loss": 0.1248, "step": 1109 }, { "epoch": 2.17, "grad_norm": 1.614187280172174, "learning_rate": 4.883357221795449e-06, "loss": 0.1631, "step": 1110 }, { "epoch": 2.17, "grad_norm": 1.5002556101724447, "learning_rate": 4.878972140213669e-06, "loss": 0.1496, "step": 1111 }, { "epoch": 2.17, "grad_norm": 1.5552324995505677, "learning_rate": 4.87458449479601e-06, "loss": 0.1457, "step": 1112 }, { "epoch": 2.17, "grad_norm": 1.461699588189261, "learning_rate": 4.8701942937001455e-06, "loss": 0.1564, "step": 1113 }, { "epoch": 2.18, "grad_norm": 1.5538114493814832, "learning_rate": 4.865801545088499e-06, "loss": 0.1654, "step": 1114 }, { "epoch": 2.18, "grad_norm": 1.3438443978433399, "learning_rate": 4.8614062571282305e-06, "loss": 0.0958, "step": 1115 }, { "epoch": 2.18, "grad_norm": 1.5564511508832684, "learning_rate": 4.857008437991222e-06, "loss": 0.1513, "step": 1116 }, { "epoch": 2.18, "grad_norm": 1.563755482072096, "learning_rate": 4.852608095854062e-06, "loss": 0.1434, "step": 1117 }, { "epoch": 2.18, "grad_norm": 1.5181387936040769, "learning_rate": 4.848205238898028e-06, "loss": 0.1526, "step": 1118 }, { "epoch": 2.19, "grad_norm": 1.6069791994801264, "learning_rate": 4.843799875309074e-06, "loss": 0.1661, "step": 1119 }, { "epoch": 2.19, "grad_norm": 1.605490865717672, "learning_rate": 4.8393920132778144e-06, "loss": 0.1852, "step": 1120 }, { "epoch": 2.19, "grad_norm": 1.6853060134664788, "learning_rate": 4.834981660999509e-06, "loss": 0.1215, "step": 1121 }, { "epoch": 2.19, "grad_norm": 1.6623528904266727, "learning_rate": 4.830568826674048e-06, "loss": 0.1107, "step": 1122 }, { "epoch": 2.19, "grad_norm": 1.7268387224718045, "learning_rate": 4.826153518505937e-06, "loss": 0.1351, "step": 1123 }, { "epoch": 2.2, "grad_norm": 1.449553062103938, "learning_rate": 4.821735744704276e-06, "loss": 0.1168, "step": 1124 }, { "epoch": 2.2, "grad_norm": 1.4267966708660862, "learning_rate": 4.817315513482755e-06, "loss": 0.151, "step": 1125 }, { "epoch": 2.2, "grad_norm": 1.578058169676317, "learning_rate": 4.812892833059633e-06, "loss": 0.092, "step": 1126 }, { "epoch": 2.2, "grad_norm": 1.666122903839035, "learning_rate": 4.808467711657718e-06, "loss": 0.1535, "step": 1127 }, { "epoch": 2.2, "grad_norm": 1.6159678880581645, "learning_rate": 4.804040157504361e-06, "loss": 0.1509, "step": 1128 }, { "epoch": 2.21, "grad_norm": 1.5047526748282665, "learning_rate": 4.7996101788314315e-06, "loss": 0.139, "step": 1129 }, { "epoch": 2.21, "grad_norm": 1.6164441077075358, "learning_rate": 4.795177783875312e-06, "loss": 0.1721, "step": 1130 }, { "epoch": 2.21, "grad_norm": 1.427595450855479, "learning_rate": 4.7907429808768716e-06, "loss": 0.1289, "step": 1131 }, { "epoch": 2.21, "grad_norm": 1.5427688546283438, "learning_rate": 4.786305778081462e-06, "loss": 0.1046, "step": 1132 }, { "epoch": 2.21, "grad_norm": 1.3633654864963831, "learning_rate": 4.7818661837388945e-06, "loss": 0.1271, "step": 1133 }, { "epoch": 2.21, "grad_norm": 1.398408324057982, "learning_rate": 4.777424206103426e-06, "loss": 0.1181, "step": 1134 }, { "epoch": 2.22, "grad_norm": 1.4895155057554086, "learning_rate": 4.772979853433746e-06, "loss": 0.131, "step": 1135 }, { "epoch": 2.22, "grad_norm": 1.5062428237009944, "learning_rate": 4.7685331339929555e-06, "loss": 0.1568, "step": 1136 }, { "epoch": 2.22, "grad_norm": 1.5935793974212835, "learning_rate": 4.764084056048564e-06, "loss": 0.1271, "step": 1137 }, { "epoch": 2.22, "grad_norm": 1.4097392421984842, "learning_rate": 4.759632627872458e-06, "loss": 0.0961, "step": 1138 }, { "epoch": 2.22, "grad_norm": 1.4791262826854417, "learning_rate": 4.755178857740899e-06, "loss": 0.1188, "step": 1139 }, { "epoch": 2.23, "grad_norm": 1.5204003329203983, "learning_rate": 4.750722753934501e-06, "loss": 0.1337, "step": 1140 }, { "epoch": 2.23, "grad_norm": 1.5528692797555401, "learning_rate": 4.746264324738215e-06, "loss": 0.094, "step": 1141 }, { "epoch": 2.23, "grad_norm": 1.562956849259058, "learning_rate": 4.741803578441318e-06, "loss": 0.1296, "step": 1142 }, { "epoch": 2.23, "grad_norm": 1.8262792533307466, "learning_rate": 4.737340523337393e-06, "loss": 0.1657, "step": 1143 }, { "epoch": 2.23, "grad_norm": 1.5095721863727827, "learning_rate": 4.732875167724318e-06, "loss": 0.133, "step": 1144 }, { "epoch": 2.24, "grad_norm": 1.558002508566382, "learning_rate": 4.728407519904245e-06, "loss": 0.118, "step": 1145 }, { "epoch": 2.24, "grad_norm": 1.5217137393391562, "learning_rate": 4.723937588183593e-06, "loss": 0.1413, "step": 1146 }, { "epoch": 2.24, "grad_norm": 1.450154227740711, "learning_rate": 4.71946538087302e-06, "loss": 0.1214, "step": 1147 }, { "epoch": 2.24, "grad_norm": 1.3881950152268336, "learning_rate": 4.714990906287423e-06, "loss": 0.0953, "step": 1148 }, { "epoch": 2.24, "grad_norm": 1.5491689685116705, "learning_rate": 4.710514172745907e-06, "loss": 0.1415, "step": 1149 }, { "epoch": 2.25, "grad_norm": 1.4474001441901558, "learning_rate": 4.706035188571782e-06, "loss": 0.1304, "step": 1150 }, { "epoch": 2.25, "grad_norm": 1.3402102027168823, "learning_rate": 4.70155396209254e-06, "loss": 0.1056, "step": 1151 }, { "epoch": 2.25, "grad_norm": 1.4823550744634768, "learning_rate": 4.697070501639841e-06, "loss": 0.1202, "step": 1152 }, { "epoch": 2.25, "grad_norm": 1.5763049979511132, "learning_rate": 4.692584815549502e-06, "loss": 0.1525, "step": 1153 }, { "epoch": 2.25, "grad_norm": 1.500084656859413, "learning_rate": 4.688096912161476e-06, "loss": 0.1313, "step": 1154 }, { "epoch": 2.26, "grad_norm": 1.6116431894468515, "learning_rate": 4.683606799819838e-06, "loss": 0.0864, "step": 1155 }, { "epoch": 2.26, "grad_norm": 1.5757980859869531, "learning_rate": 4.67911448687277e-06, "loss": 0.089, "step": 1156 }, { "epoch": 2.26, "grad_norm": 1.4926148102554084, "learning_rate": 4.674619981672548e-06, "loss": 0.101, "step": 1157 }, { "epoch": 2.26, "grad_norm": 1.5704327391808552, "learning_rate": 4.67012329257552e-06, "loss": 0.1107, "step": 1158 }, { "epoch": 2.26, "grad_norm": 1.8069818177843149, "learning_rate": 4.665624427942096e-06, "loss": 0.1255, "step": 1159 }, { "epoch": 2.27, "grad_norm": 1.7494995669777578, "learning_rate": 4.661123396136733e-06, "loss": 0.0975, "step": 1160 }, { "epoch": 2.27, "grad_norm": 1.718358802991872, "learning_rate": 4.656620205527914e-06, "loss": 0.1118, "step": 1161 }, { "epoch": 2.27, "grad_norm": 1.5322418401236837, "learning_rate": 4.652114864488136e-06, "loss": 0.1228, "step": 1162 }, { "epoch": 2.27, "grad_norm": 1.7981822121222637, "learning_rate": 4.647607381393899e-06, "loss": 0.1305, "step": 1163 }, { "epoch": 2.27, "grad_norm": 1.5136652342567989, "learning_rate": 4.643097764625678e-06, "loss": 0.1237, "step": 1164 }, { "epoch": 2.28, "grad_norm": 1.6334326264714492, "learning_rate": 4.638586022567921e-06, "loss": 0.1089, "step": 1165 }, { "epoch": 2.28, "grad_norm": 1.717196976217118, "learning_rate": 4.634072163609024e-06, "loss": 0.0939, "step": 1166 }, { "epoch": 2.28, "grad_norm": 1.4981746760123513, "learning_rate": 4.62955619614132e-06, "loss": 0.103, "step": 1167 }, { "epoch": 2.28, "grad_norm": 1.5872694925294109, "learning_rate": 4.625038128561065e-06, "loss": 0.1338, "step": 1168 }, { "epoch": 2.28, "grad_norm": 1.6240274362085532, "learning_rate": 4.620517969268416e-06, "loss": 0.1247, "step": 1169 }, { "epoch": 2.29, "grad_norm": 1.4615491824194708, "learning_rate": 4.615995726667416e-06, "loss": 0.0983, "step": 1170 }, { "epoch": 2.29, "grad_norm": 1.6405944042237226, "learning_rate": 4.61147140916599e-06, "loss": 0.0774, "step": 1171 }, { "epoch": 2.29, "grad_norm": 1.6183742413455224, "learning_rate": 4.606945025175914e-06, "loss": 0.1197, "step": 1172 }, { "epoch": 2.29, "grad_norm": 1.5644650856703728, "learning_rate": 4.602416583112809e-06, "loss": 0.1187, "step": 1173 }, { "epoch": 2.29, "grad_norm": 1.7751392940772348, "learning_rate": 4.597886091396121e-06, "loss": 0.1234, "step": 1174 }, { "epoch": 2.29, "grad_norm": 1.752101631130338, "learning_rate": 4.593353558449106e-06, "loss": 0.1199, "step": 1175 }, { "epoch": 2.3, "grad_norm": 1.3619442689060266, "learning_rate": 4.588818992698818e-06, "loss": 0.0821, "step": 1176 }, { "epoch": 2.3, "grad_norm": 1.9197534217407775, "learning_rate": 4.58428240257609e-06, "loss": 0.088, "step": 1177 }, { "epoch": 2.3, "grad_norm": 1.8502589456327885, "learning_rate": 4.579743796515515e-06, "loss": 0.1234, "step": 1178 }, { "epoch": 2.3, "grad_norm": 1.6816692470335188, "learning_rate": 4.5752031829554385e-06, "loss": 0.1229, "step": 1179 }, { "epoch": 2.3, "grad_norm": 1.6064996138953025, "learning_rate": 4.570660570337937e-06, "loss": 0.0886, "step": 1180 }, { "epoch": 2.31, "grad_norm": 1.9841478529478989, "learning_rate": 4.566115967108803e-06, "loss": 0.0812, "step": 1181 }, { "epoch": 2.31, "grad_norm": 1.6947278947270288, "learning_rate": 4.561569381717531e-06, "loss": 0.134, "step": 1182 }, { "epoch": 2.31, "grad_norm": 1.593034703441953, "learning_rate": 4.557020822617298e-06, "loss": 0.1232, "step": 1183 }, { "epoch": 2.31, "grad_norm": 1.5440533606508509, "learning_rate": 4.552470298264956e-06, "loss": 0.11, "step": 1184 }, { "epoch": 2.31, "grad_norm": 1.674003544611226, "learning_rate": 4.547917817121006e-06, "loss": 0.1039, "step": 1185 }, { "epoch": 2.32, "grad_norm": 1.5382598789245914, "learning_rate": 4.54336338764959e-06, "loss": 0.1068, "step": 1186 }, { "epoch": 2.32, "grad_norm": 1.4676820006582285, "learning_rate": 4.5388070183184695e-06, "loss": 0.0869, "step": 1187 }, { "epoch": 2.32, "grad_norm": 1.6259579184912778, "learning_rate": 4.534248717599016e-06, "loss": 0.0951, "step": 1188 }, { "epoch": 2.32, "grad_norm": 1.683114289539866, "learning_rate": 4.52968849396619e-06, "loss": 0.104, "step": 1189 }, { "epoch": 2.32, "grad_norm": 1.8242911264510855, "learning_rate": 4.525126355898528e-06, "loss": 0.136, "step": 1190 }, { "epoch": 2.33, "grad_norm": 1.5706340345694934, "learning_rate": 4.520562311878125e-06, "loss": 0.1116, "step": 1191 }, { "epoch": 2.33, "grad_norm": 1.654667386539333, "learning_rate": 4.5159963703906175e-06, "loss": 0.0779, "step": 1192 }, { "epoch": 2.33, "grad_norm": 1.4978755280711196, "learning_rate": 4.511428539925177e-06, "loss": 0.111, "step": 1193 }, { "epoch": 2.33, "grad_norm": 1.701546806160673, "learning_rate": 4.50685882897448e-06, "loss": 0.14, "step": 1194 }, { "epoch": 2.33, "grad_norm": 1.5592382308470962, "learning_rate": 4.502287246034701e-06, "loss": 0.0882, "step": 1195 }, { "epoch": 2.34, "grad_norm": 1.6702981394059562, "learning_rate": 4.497713799605498e-06, "loss": 0.0997, "step": 1196 }, { "epoch": 2.34, "grad_norm": 1.7646057803896993, "learning_rate": 4.493138498189989e-06, "loss": 0.1035, "step": 1197 }, { "epoch": 2.34, "grad_norm": 1.5132271332822134, "learning_rate": 4.488561350294743e-06, "loss": 0.0942, "step": 1198 }, { "epoch": 2.34, "grad_norm": 1.6641946140096262, "learning_rate": 4.483982364429766e-06, "loss": 0.1002, "step": 1199 }, { "epoch": 2.34, "grad_norm": 1.6462737290272296, "learning_rate": 4.479401549108473e-06, "loss": 0.1162, "step": 1200 }, { "epoch": 2.35, "grad_norm": 1.4946104635630506, "learning_rate": 4.474818912847685e-06, "loss": 0.0762, "step": 1201 }, { "epoch": 2.35, "grad_norm": 1.560843177737139, "learning_rate": 4.470234464167612e-06, "loss": 0.1113, "step": 1202 }, { "epoch": 2.35, "grad_norm": 1.6368254198530996, "learning_rate": 4.465648211591828e-06, "loss": 0.0722, "step": 1203 }, { "epoch": 2.35, "grad_norm": 1.4827350746619476, "learning_rate": 4.4610601636472636e-06, "loss": 0.0883, "step": 1204 }, { "epoch": 2.35, "grad_norm": 1.6676308908341626, "learning_rate": 4.456470328864186e-06, "loss": 0.1182, "step": 1205 }, { "epoch": 2.36, "grad_norm": 1.7689551488698976, "learning_rate": 4.451878715776184e-06, "loss": 0.1152, "step": 1206 }, { "epoch": 2.36, "grad_norm": 1.5798910123181773, "learning_rate": 4.447285332920157e-06, "loss": 0.1574, "step": 1207 }, { "epoch": 2.36, "grad_norm": 1.5804818241514036, "learning_rate": 4.442690188836292e-06, "loss": 0.1038, "step": 1208 }, { "epoch": 2.36, "grad_norm": 1.6729778097276342, "learning_rate": 4.438093292068047e-06, "loss": 0.1218, "step": 1209 }, { "epoch": 2.36, "grad_norm": 1.5243886837823166, "learning_rate": 4.433494651162144e-06, "loss": 0.0814, "step": 1210 }, { "epoch": 2.37, "grad_norm": 1.5268552452886017, "learning_rate": 4.428894274668547e-06, "loss": 0.0955, "step": 1211 }, { "epoch": 2.37, "grad_norm": 1.6839198519507084, "learning_rate": 4.424292171140445e-06, "loss": 0.1117, "step": 1212 }, { "epoch": 2.37, "grad_norm": 1.6753146961319445, "learning_rate": 4.419688349134237e-06, "loss": 0.0912, "step": 1213 }, { "epoch": 2.37, "grad_norm": 1.6220133269457033, "learning_rate": 4.4150828172095205e-06, "loss": 0.0851, "step": 1214 }, { "epoch": 2.37, "grad_norm": 1.7155102757229121, "learning_rate": 4.410475583929069e-06, "loss": 0.1071, "step": 1215 }, { "epoch": 2.38, "grad_norm": 1.550351368259341, "learning_rate": 4.405866657858823e-06, "loss": 0.0829, "step": 1216 }, { "epoch": 2.38, "grad_norm": 1.6437917585618995, "learning_rate": 4.401256047567866e-06, "loss": 0.1061, "step": 1217 }, { "epoch": 2.38, "grad_norm": 1.6412436749832247, "learning_rate": 4.396643761628414e-06, "loss": 0.0957, "step": 1218 }, { "epoch": 2.38, "grad_norm": 1.5631272597398203, "learning_rate": 4.392029808615802e-06, "loss": 0.109, "step": 1219 }, { "epoch": 2.38, "grad_norm": 1.6660120897533321, "learning_rate": 4.387414197108459e-06, "loss": 0.0939, "step": 1220 }, { "epoch": 2.38, "grad_norm": 1.5947612428807583, "learning_rate": 4.382796935687899e-06, "loss": 0.0913, "step": 1221 }, { "epoch": 2.39, "grad_norm": 1.790214905362722, "learning_rate": 4.378178032938711e-06, "loss": 0.1304, "step": 1222 }, { "epoch": 2.39, "grad_norm": 1.5118538663467238, "learning_rate": 4.373557497448522e-06, "loss": 0.0653, "step": 1223 }, { "epoch": 2.39, "grad_norm": 1.5213689813418945, "learning_rate": 4.368935337808006e-06, "loss": 0.1121, "step": 1224 }, { "epoch": 2.39, "grad_norm": 1.6116445772325667, "learning_rate": 4.364311562610854e-06, "loss": 0.0715, "step": 1225 }, { "epoch": 2.39, "grad_norm": 1.7208962898973539, "learning_rate": 4.359686180453757e-06, "loss": 0.1099, "step": 1226 }, { "epoch": 2.4, "grad_norm": 1.558397043604805, "learning_rate": 4.355059199936396e-06, "loss": 0.1026, "step": 1227 }, { "epoch": 2.4, "grad_norm": 1.5739397296503126, "learning_rate": 4.350430629661424e-06, "loss": 0.1149, "step": 1228 }, { "epoch": 2.4, "grad_norm": 1.6672153168946704, "learning_rate": 4.34580047823445e-06, "loss": 0.0944, "step": 1229 }, { "epoch": 2.4, "grad_norm": 1.6768027040339215, "learning_rate": 4.341168754264023e-06, "loss": 0.1229, "step": 1230 }, { "epoch": 2.4, "grad_norm": 1.5214330895708958, "learning_rate": 4.336535466361616e-06, "loss": 0.1101, "step": 1231 }, { "epoch": 2.41, "grad_norm": 1.602379396040259, "learning_rate": 4.3319006231416055e-06, "loss": 0.0986, "step": 1232 }, { "epoch": 2.41, "grad_norm": 1.428461140185046, "learning_rate": 4.327264233221266e-06, "loss": 0.096, "step": 1233 }, { "epoch": 2.41, "grad_norm": 1.4417281459197508, "learning_rate": 4.3226263052207435e-06, "loss": 0.0882, "step": 1234 }, { "epoch": 2.41, "grad_norm": 1.530533492223422, "learning_rate": 4.317986847763045e-06, "loss": 0.08, "step": 1235 }, { "epoch": 2.41, "grad_norm": 1.5163352314719005, "learning_rate": 4.313345869474022e-06, "loss": 0.0796, "step": 1236 }, { "epoch": 2.42, "grad_norm": 1.4312423967586982, "learning_rate": 4.308703378982349e-06, "loss": 0.0791, "step": 1237 }, { "epoch": 2.42, "grad_norm": 1.313225033343894, "learning_rate": 4.3040593849195195e-06, "loss": 0.0667, "step": 1238 }, { "epoch": 2.42, "grad_norm": 1.3942065584392183, "learning_rate": 4.299413895919817e-06, "loss": 0.0707, "step": 1239 }, { "epoch": 2.42, "grad_norm": 1.2987059288153924, "learning_rate": 4.294766920620306e-06, "loss": 0.041, "step": 1240 }, { "epoch": 2.42, "grad_norm": 1.5292215940667377, "learning_rate": 4.290118467660815e-06, "loss": 0.0876, "step": 1241 }, { "epoch": 2.43, "grad_norm": 1.526726220799809, "learning_rate": 4.285468545683919e-06, "loss": 0.0747, "step": 1242 }, { "epoch": 2.43, "grad_norm": 1.8560500297656553, "learning_rate": 4.280817163334925e-06, "loss": 0.0857, "step": 1243 }, { "epoch": 2.43, "grad_norm": 1.8559456941031165, "learning_rate": 4.276164329261853e-06, "loss": 0.0789, "step": 1244 }, { "epoch": 2.43, "grad_norm": 1.5911679191622483, "learning_rate": 4.2715100521154245e-06, "loss": 0.0612, "step": 1245 }, { "epoch": 2.43, "grad_norm": 1.9470027537411805, "learning_rate": 4.266854340549044e-06, "loss": 0.0865, "step": 1246 }, { "epoch": 2.44, "grad_norm": 2.0180850000549757, "learning_rate": 4.262197203218782e-06, "loss": 0.0896, "step": 1247 }, { "epoch": 2.44, "grad_norm": 1.9373155017926644, "learning_rate": 4.25753864878336e-06, "loss": 0.0773, "step": 1248 }, { "epoch": 2.44, "grad_norm": 2.070667184554768, "learning_rate": 4.252878685904134e-06, "loss": 0.1063, "step": 1249 }, { "epoch": 2.44, "grad_norm": 1.7551722363170412, "learning_rate": 4.248217323245079e-06, "loss": 0.0811, "step": 1250 }, { "epoch": 2.44, "grad_norm": 2.0904961927268353, "learning_rate": 4.243554569472773e-06, "loss": 0.0812, "step": 1251 }, { "epoch": 2.45, "grad_norm": 1.6129650924493708, "learning_rate": 4.238890433256378e-06, "loss": 0.0787, "step": 1252 }, { "epoch": 2.45, "grad_norm": 1.497511996066989, "learning_rate": 4.234224923267631e-06, "loss": 0.0781, "step": 1253 }, { "epoch": 2.45, "grad_norm": 1.722830827857964, "learning_rate": 4.2295580481808165e-06, "loss": 0.1019, "step": 1254 }, { "epoch": 2.45, "grad_norm": 1.7314388476248024, "learning_rate": 4.224889816672765e-06, "loss": 0.0966, "step": 1255 }, { "epoch": 2.45, "grad_norm": 1.5338264199770952, "learning_rate": 4.220220237422822e-06, "loss": 0.098, "step": 1256 }, { "epoch": 2.46, "grad_norm": 1.5218966797638824, "learning_rate": 4.215549319112843e-06, "loss": 0.1257, "step": 1257 }, { "epoch": 2.46, "grad_norm": 1.5580221022788898, "learning_rate": 4.21087707042717e-06, "loss": 0.0893, "step": 1258 }, { "epoch": 2.46, "grad_norm": 1.3755959101156179, "learning_rate": 4.206203500052622e-06, "loss": 0.092, "step": 1259 }, { "epoch": 2.46, "grad_norm": 1.4998312583832756, "learning_rate": 4.201528616678472e-06, "loss": 0.0993, "step": 1260 }, { "epoch": 2.46, "grad_norm": 1.5469793400570984, "learning_rate": 4.196852428996435e-06, "loss": 0.0769, "step": 1261 }, { "epoch": 2.46, "grad_norm": 1.5160201163019345, "learning_rate": 4.192174945700656e-06, "loss": 0.1063, "step": 1262 }, { "epoch": 2.47, "grad_norm": 1.4611715630352688, "learning_rate": 4.187496175487679e-06, "loss": 0.0806, "step": 1263 }, { "epoch": 2.47, "grad_norm": 1.5210988161657837, "learning_rate": 4.182816127056449e-06, "loss": 0.1106, "step": 1264 }, { "epoch": 2.47, "grad_norm": 1.5649558029050388, "learning_rate": 4.178134809108284e-06, "loss": 0.1173, "step": 1265 }, { "epoch": 2.47, "grad_norm": 1.6770619985208617, "learning_rate": 4.173452230346864e-06, "loss": 0.1267, "step": 1266 }, { "epoch": 2.47, "grad_norm": 1.686726375532015, "learning_rate": 4.168768399478211e-06, "loss": 0.094, "step": 1267 }, { "epoch": 2.48, "grad_norm": 1.4463746830792894, "learning_rate": 4.1640833252106775e-06, "loss": 0.0838, "step": 1268 }, { "epoch": 2.48, "grad_norm": 1.5986930497358538, "learning_rate": 4.1593970162549244e-06, "loss": 0.1015, "step": 1269 }, { "epoch": 2.48, "grad_norm": 1.7812986102399275, "learning_rate": 4.154709481323912e-06, "loss": 0.0953, "step": 1270 }, { "epoch": 2.48, "grad_norm": 1.7515252320638828, "learning_rate": 4.150020729132878e-06, "loss": 0.0944, "step": 1271 }, { "epoch": 2.48, "grad_norm": 1.6626578317258074, "learning_rate": 4.1453307683993216e-06, "loss": 0.0857, "step": 1272 }, { "epoch": 2.49, "grad_norm": 1.3945906562844175, "learning_rate": 4.140639607842994e-06, "loss": 0.094, "step": 1273 }, { "epoch": 2.49, "grad_norm": 1.4540359466996529, "learning_rate": 4.135947256185871e-06, "loss": 0.0905, "step": 1274 }, { "epoch": 2.49, "grad_norm": 1.6497891155839808, "learning_rate": 4.131253722152147e-06, "loss": 0.1077, "step": 1275 }, { "epoch": 2.49, "grad_norm": 1.6199556814773366, "learning_rate": 4.1265590144682155e-06, "loss": 0.0635, "step": 1276 }, { "epoch": 2.49, "grad_norm": 1.582375527811755, "learning_rate": 4.121863141862647e-06, "loss": 0.0718, "step": 1277 }, { "epoch": 2.5, "grad_norm": 1.7760656784351658, "learning_rate": 4.117166113066182e-06, "loss": 0.0849, "step": 1278 }, { "epoch": 2.5, "grad_norm": 1.6364538475744579, "learning_rate": 4.112467936811711e-06, "loss": 0.1047, "step": 1279 }, { "epoch": 2.5, "grad_norm": 1.4994182750324199, "learning_rate": 4.107768621834257e-06, "loss": 0.0963, "step": 1280 }, { "epoch": 2.5, "grad_norm": 1.4994685982889278, "learning_rate": 4.103068176870958e-06, "loss": 0.0692, "step": 1281 }, { "epoch": 2.5, "grad_norm": 1.587730191950317, "learning_rate": 4.098366610661054e-06, "loss": 0.0606, "step": 1282 }, { "epoch": 2.51, "grad_norm": 1.6729733155283568, "learning_rate": 4.093663931945873e-06, "loss": 0.0639, "step": 1283 }, { "epoch": 2.51, "grad_norm": 1.7374101194350589, "learning_rate": 4.088960149468808e-06, "loss": 0.105, "step": 1284 }, { "epoch": 2.51, "grad_norm": 1.735254752129886, "learning_rate": 4.084255271975304e-06, "loss": 0.1195, "step": 1285 }, { "epoch": 2.51, "grad_norm": 1.562450109025653, "learning_rate": 4.079549308212843e-06, "loss": 0.1007, "step": 1286 }, { "epoch": 2.51, "grad_norm": 1.4921214935534568, "learning_rate": 4.074842266930927e-06, "loss": 0.0697, "step": 1287 }, { "epoch": 2.52, "grad_norm": 1.4972489597370693, "learning_rate": 4.070134156881061e-06, "loss": 0.0762, "step": 1288 }, { "epoch": 2.52, "grad_norm": 1.5346788957149748, "learning_rate": 4.065424986816736e-06, "loss": 0.0907, "step": 1289 }, { "epoch": 2.52, "grad_norm": 1.4273638279438452, "learning_rate": 4.060714765493415e-06, "loss": 0.0997, "step": 1290 }, { "epoch": 2.52, "grad_norm": 1.5844881390143686, "learning_rate": 4.0560035016685145e-06, "loss": 0.1185, "step": 1291 }, { "epoch": 2.52, "grad_norm": 1.6453268968369996, "learning_rate": 4.051291204101393e-06, "loss": 0.0694, "step": 1292 }, { "epoch": 2.53, "grad_norm": 1.5031488446357693, "learning_rate": 4.046577881553324e-06, "loss": 0.0679, "step": 1293 }, { "epoch": 2.53, "grad_norm": 1.4883521844344179, "learning_rate": 4.041863542787494e-06, "loss": 0.0854, "step": 1294 }, { "epoch": 2.53, "grad_norm": 1.705354877563369, "learning_rate": 4.037148196568974e-06, "loss": 0.0831, "step": 1295 }, { "epoch": 2.53, "grad_norm": 1.4790125507389078, "learning_rate": 4.032431851664708e-06, "loss": 0.1085, "step": 1296 }, { "epoch": 2.53, "grad_norm": 1.4093397539914418, "learning_rate": 4.027714516843502e-06, "loss": 0.0947, "step": 1297 }, { "epoch": 2.54, "grad_norm": 1.4025362641586037, "learning_rate": 4.022996200875997e-06, "loss": 0.078, "step": 1298 }, { "epoch": 2.54, "grad_norm": 1.3924139576153802, "learning_rate": 4.01827691253466e-06, "loss": 0.0582, "step": 1299 }, { "epoch": 2.54, "grad_norm": 1.629344053535201, "learning_rate": 4.013556660593766e-06, "loss": 0.0755, "step": 1300 }, { "epoch": 2.54, "grad_norm": 1.4800145202730481, "learning_rate": 4.008835453829383e-06, "loss": 0.0945, "step": 1301 }, { "epoch": 2.54, "grad_norm": 1.7104235284655005, "learning_rate": 4.004113301019351e-06, "loss": 0.0859, "step": 1302 }, { "epoch": 2.54, "grad_norm": 1.6143651608334988, "learning_rate": 3.999390210943271e-06, "loss": 0.0861, "step": 1303 }, { "epoch": 2.55, "grad_norm": 1.5417155911780138, "learning_rate": 3.9946661923824864e-06, "loss": 0.0674, "step": 1304 }, { "epoch": 2.55, "grad_norm": 1.7384603065997861, "learning_rate": 3.989941254120068e-06, "loss": 0.1044, "step": 1305 }, { "epoch": 2.55, "grad_norm": 1.6526680767570352, "learning_rate": 3.9852154049407935e-06, "loss": 0.0895, "step": 1306 }, { "epoch": 2.55, "grad_norm": 1.5817498876315408, "learning_rate": 3.980488653631138e-06, "loss": 0.0851, "step": 1307 }, { "epoch": 2.55, "grad_norm": 1.6040205633923772, "learning_rate": 3.97576100897925e-06, "loss": 0.0909, "step": 1308 }, { "epoch": 2.56, "grad_norm": 1.602556488063956, "learning_rate": 3.9710324797749415e-06, "loss": 0.0779, "step": 1309 }, { "epoch": 2.56, "grad_norm": 1.7194616038507824, "learning_rate": 3.96630307480967e-06, "loss": 0.0784, "step": 1310 }, { "epoch": 2.56, "grad_norm": 2.191883135507353, "learning_rate": 3.961572802876516e-06, "loss": 0.1041, "step": 1311 }, { "epoch": 2.56, "grad_norm": 1.5809597464524947, "learning_rate": 3.956841672770181e-06, "loss": 0.065, "step": 1312 }, { "epoch": 2.56, "grad_norm": 1.5918812825376956, "learning_rate": 3.952109693286952e-06, "loss": 0.1279, "step": 1313 }, { "epoch": 2.57, "grad_norm": 1.5630895658907993, "learning_rate": 3.947376873224701e-06, "loss": 0.1049, "step": 1314 }, { "epoch": 2.57, "grad_norm": 1.5533079124641018, "learning_rate": 3.942643221382863e-06, "loss": 0.0937, "step": 1315 }, { "epoch": 2.57, "grad_norm": 1.5058666236871736, "learning_rate": 3.937908746562417e-06, "loss": 0.0941, "step": 1316 }, { "epoch": 2.57, "grad_norm": 1.4599875063973582, "learning_rate": 3.9331734575658735e-06, "loss": 0.1004, "step": 1317 }, { "epoch": 2.57, "grad_norm": 1.4349195286194572, "learning_rate": 3.928437363197257e-06, "loss": 0.0978, "step": 1318 }, { "epoch": 2.58, "grad_norm": 1.453663489601979, "learning_rate": 3.923700472262088e-06, "loss": 0.0834, "step": 1319 }, { "epoch": 2.58, "grad_norm": 1.5619811605972937, "learning_rate": 3.918962793567368e-06, "loss": 0.0953, "step": 1320 }, { "epoch": 2.58, "grad_norm": 1.504454575260288, "learning_rate": 3.914224335921568e-06, "loss": 0.0664, "step": 1321 }, { "epoch": 2.58, "grad_norm": 1.7231988376387637, "learning_rate": 3.909485108134598e-06, "loss": 0.0891, "step": 1322 }, { "epoch": 2.58, "grad_norm": 1.5613198625931897, "learning_rate": 3.90474511901781e-06, "loss": 0.0956, "step": 1323 }, { "epoch": 2.59, "grad_norm": 1.8329467972354605, "learning_rate": 3.900004377383963e-06, "loss": 0.0625, "step": 1324 }, { "epoch": 2.59, "grad_norm": 1.5463355857797432, "learning_rate": 3.89526289204722e-06, "loss": 0.0976, "step": 1325 }, { "epoch": 2.59, "grad_norm": 1.664315947837471, "learning_rate": 3.890520671823126e-06, "loss": 0.0764, "step": 1326 }, { "epoch": 2.59, "grad_norm": 1.5938578151955212, "learning_rate": 3.8857777255285915e-06, "loss": 0.0896, "step": 1327 }, { "epoch": 2.59, "grad_norm": 1.607714182686459, "learning_rate": 3.881034061981876e-06, "loss": 0.0679, "step": 1328 }, { "epoch": 2.6, "grad_norm": 1.5672283774680182, "learning_rate": 3.876289690002576e-06, "loss": 0.083, "step": 1329 }, { "epoch": 2.6, "grad_norm": 1.7216958646083738, "learning_rate": 3.871544618411602e-06, "loss": 0.0626, "step": 1330 }, { "epoch": 2.6, "grad_norm": 1.6589066333091629, "learning_rate": 3.866798856031164e-06, "loss": 0.0602, "step": 1331 }, { "epoch": 2.6, "grad_norm": 1.6110958738457173, "learning_rate": 3.862052411684763e-06, "loss": 0.0692, "step": 1332 }, { "epoch": 2.6, "grad_norm": 1.6695787129014241, "learning_rate": 3.85730529419716e-06, "loss": 0.0906, "step": 1333 }, { "epoch": 2.61, "grad_norm": 1.5446531362757248, "learning_rate": 3.852557512394371e-06, "loss": 0.0843, "step": 1334 }, { "epoch": 2.61, "grad_norm": 1.68006740058084, "learning_rate": 3.8478090751036495e-06, "loss": 0.0706, "step": 1335 }, { "epoch": 2.61, "grad_norm": 1.6018922121576877, "learning_rate": 3.843059991153463e-06, "loss": 0.0769, "step": 1336 }, { "epoch": 2.61, "grad_norm": 1.6107572917568804, "learning_rate": 3.838310269373483e-06, "loss": 0.1206, "step": 1337 }, { "epoch": 2.61, "grad_norm": 1.4379215528830023, "learning_rate": 3.83355991859457e-06, "loss": 0.0615, "step": 1338 }, { "epoch": 2.62, "grad_norm": 1.4462029739161868, "learning_rate": 3.828808947648751e-06, "loss": 0.0974, "step": 1339 }, { "epoch": 2.62, "grad_norm": 1.371860836625029, "learning_rate": 3.824057365369205e-06, "loss": 0.0836, "step": 1340 }, { "epoch": 2.62, "grad_norm": 1.6770099723929373, "learning_rate": 3.8193051805902496e-06, "loss": 0.0814, "step": 1341 }, { "epoch": 2.62, "grad_norm": 1.4595270483865816, "learning_rate": 3.8145524021473225e-06, "loss": 0.0746, "step": 1342 }, { "epoch": 2.62, "grad_norm": 1.6068819314625746, "learning_rate": 3.809799038876965e-06, "loss": 0.0986, "step": 1343 }, { "epoch": 2.62, "grad_norm": 1.4343321740538089, "learning_rate": 3.805045099616804e-06, "loss": 0.0673, "step": 1344 }, { "epoch": 2.63, "grad_norm": 1.5303663615844416, "learning_rate": 3.80029059320554e-06, "loss": 0.0977, "step": 1345 }, { "epoch": 2.63, "grad_norm": 1.542579495759221, "learning_rate": 3.7955355284829257e-06, "loss": 0.0808, "step": 1346 }, { "epoch": 2.63, "grad_norm": 1.4932921229668423, "learning_rate": 3.7907799142897547e-06, "loss": 0.0702, "step": 1347 }, { "epoch": 2.63, "grad_norm": 1.6985118522701903, "learning_rate": 3.786023759467839e-06, "loss": 0.0998, "step": 1348 }, { "epoch": 2.63, "grad_norm": 1.5507078534899073, "learning_rate": 3.7812670728599973e-06, "loss": 0.0728, "step": 1349 }, { "epoch": 2.64, "grad_norm": 1.4566692318142076, "learning_rate": 3.776509863310037e-06, "loss": 0.0705, "step": 1350 }, { "epoch": 2.64, "grad_norm": 1.538968408274816, "learning_rate": 3.771752139662736e-06, "loss": 0.0923, "step": 1351 }, { "epoch": 2.64, "grad_norm": 1.6257921691545252, "learning_rate": 3.766993910763834e-06, "loss": 0.0949, "step": 1352 }, { "epoch": 2.64, "grad_norm": 1.375594846983006, "learning_rate": 3.7622351854600005e-06, "loss": 0.073, "step": 1353 }, { "epoch": 2.64, "grad_norm": 1.4770482831740919, "learning_rate": 3.7574759725988363e-06, "loss": 0.0748, "step": 1354 }, { "epoch": 2.65, "grad_norm": 1.623827335631092, "learning_rate": 3.7527162810288446e-06, "loss": 0.0771, "step": 1355 }, { "epoch": 2.65, "grad_norm": 1.6192181127692729, "learning_rate": 3.7479561195994195e-06, "loss": 0.0663, "step": 1356 }, { "epoch": 2.65, "grad_norm": 1.7566037483491797, "learning_rate": 3.743195497160829e-06, "loss": 0.0892, "step": 1357 }, { "epoch": 2.65, "grad_norm": 1.5290994169247902, "learning_rate": 3.7384344225641987e-06, "loss": 0.0822, "step": 1358 }, { "epoch": 2.65, "grad_norm": 1.524336620759879, "learning_rate": 3.733672904661492e-06, "loss": 0.0664, "step": 1359 }, { "epoch": 2.66, "grad_norm": 1.7230976851073332, "learning_rate": 3.728910952305501e-06, "loss": 0.0922, "step": 1360 }, { "epoch": 2.66, "grad_norm": 1.6267033968496738, "learning_rate": 3.724148574349822e-06, "loss": 0.0965, "step": 1361 }, { "epoch": 2.66, "grad_norm": 1.652298003974374, "learning_rate": 3.719385779648844e-06, "loss": 0.0972, "step": 1362 }, { "epoch": 2.66, "grad_norm": 1.3997250701773818, "learning_rate": 3.71462257705773e-06, "loss": 0.0751, "step": 1363 }, { "epoch": 2.66, "grad_norm": 1.552922328707782, "learning_rate": 3.7098589754324037e-06, "loss": 0.1085, "step": 1364 }, { "epoch": 2.67, "grad_norm": 1.7445631854018921, "learning_rate": 3.7050949836295268e-06, "loss": 0.086, "step": 1365 }, { "epoch": 2.67, "grad_norm": 1.5024124978296538, "learning_rate": 3.700330610506491e-06, "loss": 0.0848, "step": 1366 }, { "epoch": 2.67, "grad_norm": 1.5315379108574816, "learning_rate": 3.695565864921392e-06, "loss": 0.1166, "step": 1367 }, { "epoch": 2.67, "grad_norm": 1.3541312383239168, "learning_rate": 3.6908007557330225e-06, "loss": 0.0673, "step": 1368 }, { "epoch": 2.67, "grad_norm": 1.3740785379206029, "learning_rate": 3.6860352918008482e-06, "loss": 0.1055, "step": 1369 }, { "epoch": 2.68, "grad_norm": 1.670848913075325, "learning_rate": 3.6812694819849964e-06, "loss": 0.0866, "step": 1370 }, { "epoch": 2.68, "grad_norm": 1.502774814621381, "learning_rate": 3.6765033351462366e-06, "loss": 0.0976, "step": 1371 }, { "epoch": 2.68, "grad_norm": 1.5188642283034317, "learning_rate": 3.6717368601459635e-06, "loss": 0.1083, "step": 1372 }, { "epoch": 2.68, "grad_norm": 1.4955409917516185, "learning_rate": 3.6669700658461837e-06, "loss": 0.0924, "step": 1373 }, { "epoch": 2.68, "grad_norm": 1.4097217802478634, "learning_rate": 3.662202961109498e-06, "loss": 0.0738, "step": 1374 }, { "epoch": 2.69, "grad_norm": 1.7179729803961836, "learning_rate": 3.657435554799083e-06, "loss": 0.095, "step": 1375 }, { "epoch": 2.69, "grad_norm": 1.5546914389530369, "learning_rate": 3.6526678557786763e-06, "loss": 0.1027, "step": 1376 }, { "epoch": 2.69, "grad_norm": 1.491729438297257, "learning_rate": 3.6478998729125588e-06, "loss": 0.1016, "step": 1377 }, { "epoch": 2.69, "grad_norm": 1.6449987525159309, "learning_rate": 3.643131615065542e-06, "loss": 0.0705, "step": 1378 }, { "epoch": 2.69, "grad_norm": 1.4163889827970118, "learning_rate": 3.6383630911029457e-06, "loss": 0.0633, "step": 1379 }, { "epoch": 2.7, "grad_norm": 1.3393367069360846, "learning_rate": 3.633594309890586e-06, "loss": 0.0784, "step": 1380 }, { "epoch": 2.7, "grad_norm": 1.4669138577029817, "learning_rate": 3.628825280294756e-06, "loss": 0.0893, "step": 1381 }, { "epoch": 2.7, "grad_norm": 1.4869962737873503, "learning_rate": 3.6240560111822124e-06, "loss": 0.0712, "step": 1382 }, { "epoch": 2.7, "grad_norm": 1.5951841823326853, "learning_rate": 3.619286511420156e-06, "loss": 0.1164, "step": 1383 }, { "epoch": 2.7, "grad_norm": 1.4622396905094492, "learning_rate": 3.6145167898762167e-06, "loss": 0.0684, "step": 1384 }, { "epoch": 2.71, "grad_norm": 1.4183919396828149, "learning_rate": 3.609746855418437e-06, "loss": 0.0579, "step": 1385 }, { "epoch": 2.71, "grad_norm": 1.5672043595791931, "learning_rate": 3.6049767169152543e-06, "loss": 0.0921, "step": 1386 }, { "epoch": 2.71, "grad_norm": 1.590264159715172, "learning_rate": 3.6002063832354873e-06, "loss": 0.0797, "step": 1387 }, { "epoch": 2.71, "grad_norm": 1.3982543872692372, "learning_rate": 3.595435863248315e-06, "loss": 0.0567, "step": 1388 }, { "epoch": 2.71, "grad_norm": 1.5806261540956388, "learning_rate": 3.5906651658232647e-06, "loss": 0.0876, "step": 1389 }, { "epoch": 2.71, "grad_norm": 1.6480506549216978, "learning_rate": 3.585894299830193e-06, "loss": 0.0954, "step": 1390 }, { "epoch": 2.72, "grad_norm": 1.5747846641491954, "learning_rate": 3.5811232741392703e-06, "loss": 0.093, "step": 1391 }, { "epoch": 2.72, "grad_norm": 1.4668506490161863, "learning_rate": 3.576352097620964e-06, "loss": 0.0833, "step": 1392 }, { "epoch": 2.72, "grad_norm": 1.500462862478817, "learning_rate": 3.571580779146021e-06, "loss": 0.0819, "step": 1393 }, { "epoch": 2.72, "grad_norm": 1.5468130011034882, "learning_rate": 3.5668093275854518e-06, "loss": 0.0708, "step": 1394 }, { "epoch": 2.72, "grad_norm": 1.713056509957107, "learning_rate": 3.5620377518105167e-06, "loss": 0.1016, "step": 1395 }, { "epoch": 2.73, "grad_norm": 1.4959107823001978, "learning_rate": 3.557266060692704e-06, "loss": 0.0799, "step": 1396 }, { "epoch": 2.73, "grad_norm": 1.5883225444902183, "learning_rate": 3.5524942631037195e-06, "loss": 0.0667, "step": 1397 }, { "epoch": 2.73, "grad_norm": 1.6216218325239806, "learning_rate": 3.547722367915463e-06, "loss": 0.118, "step": 1398 }, { "epoch": 2.73, "grad_norm": 1.5676039536931974, "learning_rate": 3.5429503840000197e-06, "loss": 0.0863, "step": 1399 }, { "epoch": 2.73, "grad_norm": 1.5079612985422632, "learning_rate": 3.5381783202296382e-06, "loss": 0.0965, "step": 1400 }, { "epoch": 2.74, "grad_norm": 1.6323132507856537, "learning_rate": 3.533406185476716e-06, "loss": 0.1017, "step": 1401 }, { "epoch": 2.74, "grad_norm": 1.5952159288634953, "learning_rate": 3.5286339886137804e-06, "loss": 0.0794, "step": 1402 }, { "epoch": 2.74, "grad_norm": 1.5046472969578655, "learning_rate": 3.5238617385134766e-06, "loss": 0.0822, "step": 1403 }, { "epoch": 2.74, "grad_norm": 1.5072418095915459, "learning_rate": 3.5190894440485483e-06, "loss": 0.0914, "step": 1404 }, { "epoch": 2.74, "grad_norm": 1.4573932989846177, "learning_rate": 3.5143171140918213e-06, "loss": 0.0842, "step": 1405 }, { "epoch": 2.75, "grad_norm": 1.4962574154653685, "learning_rate": 3.509544757516189e-06, "loss": 0.0552, "step": 1406 }, { "epoch": 2.75, "grad_norm": 1.4375695845111809, "learning_rate": 3.5047723831945895e-06, "loss": 0.0916, "step": 1407 }, { "epoch": 2.75, "grad_norm": 1.5762820317674286, "learning_rate": 3.5e-06, "loss": 0.0889, "step": 1408 }, { "epoch": 2.75, "grad_norm": 1.5641061799526137, "learning_rate": 3.4952276168054104e-06, "loss": 0.0763, "step": 1409 }, { "epoch": 2.75, "grad_norm": 1.3550967085439272, "learning_rate": 3.490455242483811e-06, "loss": 0.063, "step": 1410 }, { "epoch": 2.76, "grad_norm": 1.7758728951887086, "learning_rate": 3.485682885908178e-06, "loss": 0.091, "step": 1411 }, { "epoch": 2.76, "grad_norm": 1.6585199295260362, "learning_rate": 3.480910555951451e-06, "loss": 0.0805, "step": 1412 }, { "epoch": 2.76, "grad_norm": 1.561623477211864, "learning_rate": 3.476138261486524e-06, "loss": 0.0457, "step": 1413 }, { "epoch": 2.76, "grad_norm": 1.5179149335995548, "learning_rate": 3.471366011386221e-06, "loss": 0.0593, "step": 1414 }, { "epoch": 2.76, "grad_norm": 1.5918720739639565, "learning_rate": 3.466593814523285e-06, "loss": 0.0772, "step": 1415 }, { "epoch": 2.77, "grad_norm": 1.687865535327758, "learning_rate": 3.461821679770362e-06, "loss": 0.0859, "step": 1416 }, { "epoch": 2.77, "grad_norm": 1.7205229387147076, "learning_rate": 3.4570496159999806e-06, "loss": 0.0753, "step": 1417 }, { "epoch": 2.77, "grad_norm": 1.5594021169569958, "learning_rate": 3.452277632084538e-06, "loss": 0.0873, "step": 1418 }, { "epoch": 2.77, "grad_norm": 1.6646891267148354, "learning_rate": 3.4475057368962812e-06, "loss": 0.0992, "step": 1419 }, { "epoch": 2.77, "grad_norm": 1.683681594740457, "learning_rate": 3.442733939307296e-06, "loss": 0.0853, "step": 1420 }, { "epoch": 2.78, "grad_norm": 1.5490588102324472, "learning_rate": 3.4379622481894836e-06, "loss": 0.0923, "step": 1421 }, { "epoch": 2.78, "grad_norm": 1.5600166677535956, "learning_rate": 3.433190672414549e-06, "loss": 0.0696, "step": 1422 }, { "epoch": 2.78, "grad_norm": 1.612338314956105, "learning_rate": 3.4284192208539802e-06, "loss": 0.0697, "step": 1423 }, { "epoch": 2.78, "grad_norm": 1.5831873692983573, "learning_rate": 3.4236479023790363e-06, "loss": 0.0778, "step": 1424 }, { "epoch": 2.78, "grad_norm": 1.8296176503323507, "learning_rate": 3.4188767258607296e-06, "loss": 0.0765, "step": 1425 }, { "epoch": 2.79, "grad_norm": 1.444479019782247, "learning_rate": 3.4141057001698072e-06, "loss": 0.0937, "step": 1426 }, { "epoch": 2.79, "grad_norm": 1.4793955009931807, "learning_rate": 3.4093348341767356e-06, "loss": 0.1092, "step": 1427 }, { "epoch": 2.79, "grad_norm": 1.6102707161111898, "learning_rate": 3.4045641367516857e-06, "loss": 0.0936, "step": 1428 }, { "epoch": 2.79, "grad_norm": 1.4326607434489098, "learning_rate": 3.3997936167645135e-06, "loss": 0.0777, "step": 1429 }, { "epoch": 2.79, "grad_norm": 1.3924744861644025, "learning_rate": 3.395023283084745e-06, "loss": 0.0612, "step": 1430 }, { "epoch": 2.79, "grad_norm": 1.5201923774909092, "learning_rate": 3.3902531445815646e-06, "loss": 0.0824, "step": 1431 }, { "epoch": 2.8, "grad_norm": 1.4143097955807562, "learning_rate": 3.3854832101237836e-06, "loss": 0.0587, "step": 1432 }, { "epoch": 2.8, "grad_norm": 1.4321922727460117, "learning_rate": 3.3807134885798448e-06, "loss": 0.1063, "step": 1433 }, { "epoch": 2.8, "grad_norm": 1.5684976900713017, "learning_rate": 3.3759439888177883e-06, "loss": 0.1018, "step": 1434 }, { "epoch": 2.8, "grad_norm": 1.6221081959959445, "learning_rate": 3.3711747197052438e-06, "loss": 0.109, "step": 1435 }, { "epoch": 2.8, "grad_norm": 1.4215988737571634, "learning_rate": 3.366405690109414e-06, "loss": 0.0972, "step": 1436 }, { "epoch": 2.81, "grad_norm": 1.3176402840511312, "learning_rate": 3.3616369088970542e-06, "loss": 0.0684, "step": 1437 }, { "epoch": 2.81, "grad_norm": 1.3342999032681389, "learning_rate": 3.3568683849344583e-06, "loss": 0.0934, "step": 1438 }, { "epoch": 2.81, "grad_norm": 1.5389230393719409, "learning_rate": 3.3521001270874403e-06, "loss": 0.0726, "step": 1439 }, { "epoch": 2.81, "grad_norm": 1.371203086372357, "learning_rate": 3.3473321442213245e-06, "loss": 0.0986, "step": 1440 }, { "epoch": 2.81, "grad_norm": 1.5038483062929893, "learning_rate": 3.342564445200917e-06, "loss": 0.0756, "step": 1441 }, { "epoch": 2.82, "grad_norm": 1.4715552985790075, "learning_rate": 3.3377970388905024e-06, "loss": 0.126, "step": 1442 }, { "epoch": 2.82, "grad_norm": 1.6701007049338823, "learning_rate": 3.333029934153817e-06, "loss": 0.0927, "step": 1443 }, { "epoch": 2.82, "grad_norm": 1.398541389213539, "learning_rate": 3.328263139854037e-06, "loss": 0.0963, "step": 1444 }, { "epoch": 2.82, "grad_norm": 1.4597718068270518, "learning_rate": 3.323496664853764e-06, "loss": 0.0902, "step": 1445 }, { "epoch": 2.82, "grad_norm": 1.4754248523800102, "learning_rate": 3.3187305180150035e-06, "loss": 0.0914, "step": 1446 }, { "epoch": 2.83, "grad_norm": 1.4276947518942273, "learning_rate": 3.3139647081991513e-06, "loss": 0.0794, "step": 1447 }, { "epoch": 2.83, "grad_norm": 1.6776976146764024, "learning_rate": 3.3091992442669774e-06, "loss": 0.0607, "step": 1448 }, { "epoch": 2.83, "grad_norm": 1.5515958189969175, "learning_rate": 3.304434135078609e-06, "loss": 0.0771, "step": 1449 }, { "epoch": 2.83, "grad_norm": 1.591078079637085, "learning_rate": 3.2996693894935104e-06, "loss": 0.1231, "step": 1450 }, { "epoch": 2.83, "grad_norm": 1.5861220623463244, "learning_rate": 3.294905016370474e-06, "loss": 0.0789, "step": 1451 }, { "epoch": 2.84, "grad_norm": 1.590274058076155, "learning_rate": 3.290141024567597e-06, "loss": 0.0759, "step": 1452 }, { "epoch": 2.84, "grad_norm": 1.4432310545467195, "learning_rate": 3.28537742294227e-06, "loss": 0.1019, "step": 1453 }, { "epoch": 2.84, "grad_norm": 1.9366565701471272, "learning_rate": 3.280614220351157e-06, "loss": 0.0867, "step": 1454 }, { "epoch": 2.84, "grad_norm": 1.3206286490048222, "learning_rate": 3.275851425650178e-06, "loss": 0.079, "step": 1455 }, { "epoch": 2.84, "grad_norm": 1.549166300747664, "learning_rate": 3.271089047694499e-06, "loss": 0.0581, "step": 1456 }, { "epoch": 2.85, "grad_norm": 1.328507928071646, "learning_rate": 3.2663270953385075e-06, "loss": 0.067, "step": 1457 }, { "epoch": 2.85, "grad_norm": 1.6817282028556062, "learning_rate": 3.2615655774358025e-06, "loss": 0.0558, "step": 1458 }, { "epoch": 2.85, "grad_norm": 1.668451421027869, "learning_rate": 3.2568045028391715e-06, "loss": 0.08, "step": 1459 }, { "epoch": 2.85, "grad_norm": 1.6381163813558237, "learning_rate": 3.252043880400581e-06, "loss": 0.0831, "step": 1460 }, { "epoch": 2.85, "grad_norm": 1.6845530068496068, "learning_rate": 3.2472837189711557e-06, "loss": 0.0864, "step": 1461 }, { "epoch": 2.86, "grad_norm": 2.122632137760492, "learning_rate": 3.2425240274011644e-06, "loss": 0.0673, "step": 1462 }, { "epoch": 2.86, "grad_norm": 1.775725726227819, "learning_rate": 3.2377648145400002e-06, "loss": 0.0742, "step": 1463 }, { "epoch": 2.86, "grad_norm": 1.7368297881443122, "learning_rate": 3.2330060892361665e-06, "loss": 0.0776, "step": 1464 }, { "epoch": 2.86, "grad_norm": 2.002149256378852, "learning_rate": 3.2282478603372634e-06, "loss": 0.1179, "step": 1465 }, { "epoch": 2.86, "grad_norm": 1.574072399310232, "learning_rate": 3.2234901366899633e-06, "loss": 0.0766, "step": 1466 }, { "epoch": 2.87, "grad_norm": 1.5021666415968773, "learning_rate": 3.218732927140004e-06, "loss": 0.0808, "step": 1467 }, { "epoch": 2.87, "grad_norm": 1.6495342623683718, "learning_rate": 3.2139762405321623e-06, "loss": 0.0938, "step": 1468 }, { "epoch": 2.87, "grad_norm": 1.491410408938515, "learning_rate": 3.2092200857102456e-06, "loss": 0.0739, "step": 1469 }, { "epoch": 2.87, "grad_norm": 1.5613505104238472, "learning_rate": 3.204464471517074e-06, "loss": 0.1035, "step": 1470 }, { "epoch": 2.87, "grad_norm": 1.4697581515867277, "learning_rate": 3.1997094067944606e-06, "loss": 0.0868, "step": 1471 }, { "epoch": 2.88, "grad_norm": 1.6717513221216425, "learning_rate": 3.1949549003831962e-06, "loss": 0.0875, "step": 1472 }, { "epoch": 2.88, "grad_norm": 1.34427507228404, "learning_rate": 3.1902009611230357e-06, "loss": 0.0556, "step": 1473 }, { "epoch": 2.88, "grad_norm": 1.6004712239927035, "learning_rate": 3.1854475978526774e-06, "loss": 0.0925, "step": 1474 }, { "epoch": 2.88, "grad_norm": 1.540156471401566, "learning_rate": 3.18069481940975e-06, "loss": 0.1121, "step": 1475 }, { "epoch": 2.88, "grad_norm": 1.4867984930738523, "learning_rate": 3.1759426346307963e-06, "loss": 0.0738, "step": 1476 }, { "epoch": 2.88, "grad_norm": 1.4715728041445604, "learning_rate": 3.1711910523512493e-06, "loss": 0.0893, "step": 1477 }, { "epoch": 2.89, "grad_norm": 1.547474881510477, "learning_rate": 3.16644008140543e-06, "loss": 0.0737, "step": 1478 }, { "epoch": 2.89, "grad_norm": 1.648722656447873, "learning_rate": 3.161689730626517e-06, "loss": 0.0929, "step": 1479 }, { "epoch": 2.89, "grad_norm": 1.4350672907326014, "learning_rate": 3.1569400088465375e-06, "loss": 0.0752, "step": 1480 }, { "epoch": 2.89, "grad_norm": 1.6509026600628938, "learning_rate": 3.152190924896351e-06, "loss": 0.1091, "step": 1481 }, { "epoch": 2.89, "grad_norm": 1.607382839443433, "learning_rate": 3.1474424876056288e-06, "loss": 0.0633, "step": 1482 }, { "epoch": 2.9, "grad_norm": 1.7840526184888605, "learning_rate": 3.142694705802841e-06, "loss": 0.1334, "step": 1483 }, { "epoch": 2.9, "grad_norm": 1.5165198439268468, "learning_rate": 3.137947588315237e-06, "loss": 0.0965, "step": 1484 }, { "epoch": 2.9, "grad_norm": 1.5312032154695827, "learning_rate": 3.1332011439688366e-06, "loss": 0.0695, "step": 1485 }, { "epoch": 2.9, "grad_norm": 1.3454094991881984, "learning_rate": 3.1284553815883995e-06, "loss": 0.0656, "step": 1486 }, { "epoch": 2.9, "grad_norm": 1.7372003271574008, "learning_rate": 3.123710309997425e-06, "loss": 0.1132, "step": 1487 }, { "epoch": 2.91, "grad_norm": 1.6067959294314385, "learning_rate": 3.118965938018125e-06, "loss": 0.0864, "step": 1488 }, { "epoch": 2.91, "grad_norm": 1.6125471914325296, "learning_rate": 3.1142222744714093e-06, "loss": 0.1062, "step": 1489 }, { "epoch": 2.91, "grad_norm": 1.5333223356871235, "learning_rate": 3.1094793281768745e-06, "loss": 0.09, "step": 1490 }, { "epoch": 2.91, "grad_norm": 1.532683081146072, "learning_rate": 3.1047371079527805e-06, "loss": 0.0954, "step": 1491 }, { "epoch": 2.91, "grad_norm": 1.52557105508598, "learning_rate": 3.099995622616037e-06, "loss": 0.0702, "step": 1492 }, { "epoch": 2.92, "grad_norm": 1.5023995404826884, "learning_rate": 3.0952548809821907e-06, "loss": 0.1118, "step": 1493 }, { "epoch": 2.92, "grad_norm": 1.5422593357094734, "learning_rate": 3.090514891865402e-06, "loss": 0.0732, "step": 1494 }, { "epoch": 2.92, "grad_norm": 1.7448421067531403, "learning_rate": 3.085775664078433e-06, "loss": 0.0706, "step": 1495 }, { "epoch": 2.92, "grad_norm": 1.625128474274894, "learning_rate": 3.0810372064326317e-06, "loss": 0.0795, "step": 1496 }, { "epoch": 2.92, "grad_norm": 1.5601260960152818, "learning_rate": 3.0762995277379124e-06, "loss": 0.0652, "step": 1497 }, { "epoch": 2.93, "grad_norm": 1.6368284624497802, "learning_rate": 3.0715626368027436e-06, "loss": 0.1006, "step": 1498 }, { "epoch": 2.93, "grad_norm": 1.560121070860711, "learning_rate": 3.0668265424341264e-06, "loss": 0.0804, "step": 1499 }, { "epoch": 2.93, "grad_norm": 1.5210429259163014, "learning_rate": 3.0620912534375834e-06, "loss": 0.0808, "step": 1500 }, { "epoch": 2.93, "grad_norm": 1.6130438984629951, "learning_rate": 3.0573567786171366e-06, "loss": 0.1012, "step": 1501 }, { "epoch": 2.93, "grad_norm": 1.5950741166501616, "learning_rate": 3.052623126775298e-06, "loss": 0.1039, "step": 1502 }, { "epoch": 2.94, "grad_norm": 1.5177496830879404, "learning_rate": 3.0478903067130487e-06, "loss": 0.0909, "step": 1503 }, { "epoch": 2.94, "grad_norm": 1.4551725200327674, "learning_rate": 3.0431583272298204e-06, "loss": 0.0875, "step": 1504 }, { "epoch": 2.94, "grad_norm": 1.7420819789118747, "learning_rate": 3.0384271971234837e-06, "loss": 0.0844, "step": 1505 }, { "epoch": 2.94, "grad_norm": 1.5947304486116545, "learning_rate": 3.0336969251903305e-06, "loss": 0.1103, "step": 1506 }, { "epoch": 2.94, "grad_norm": 1.6097034884649462, "learning_rate": 3.0289675202250584e-06, "loss": 0.0882, "step": 1507 }, { "epoch": 2.95, "grad_norm": 1.4855980349553142, "learning_rate": 3.0242389910207505e-06, "loss": 0.0931, "step": 1508 }, { "epoch": 2.95, "grad_norm": 1.7094262325369265, "learning_rate": 3.0195113463688624e-06, "loss": 0.0714, "step": 1509 }, { "epoch": 2.95, "grad_norm": 1.6743710713127697, "learning_rate": 3.0147845950592064e-06, "loss": 0.1044, "step": 1510 }, { "epoch": 2.95, "grad_norm": 1.5465292425645538, "learning_rate": 3.0100587458799325e-06, "loss": 0.1127, "step": 1511 }, { "epoch": 2.95, "grad_norm": 1.5157753250949075, "learning_rate": 3.0053338076175147e-06, "loss": 0.1079, "step": 1512 }, { "epoch": 2.96, "grad_norm": 1.4405571020025567, "learning_rate": 3.0006097890567304e-06, "loss": 0.0546, "step": 1513 }, { "epoch": 2.96, "grad_norm": 1.3785756886783784, "learning_rate": 2.99588669898065e-06, "loss": 0.0782, "step": 1514 }, { "epoch": 2.96, "grad_norm": 1.506091562233513, "learning_rate": 2.991164546170618e-06, "loss": 0.0886, "step": 1515 }, { "epoch": 2.96, "grad_norm": 1.2932369260013736, "learning_rate": 2.986443339406234e-06, "loss": 0.057, "step": 1516 }, { "epoch": 2.96, "grad_norm": 1.4496295623595679, "learning_rate": 2.9817230874653398e-06, "loss": 0.0545, "step": 1517 }, { "epoch": 2.96, "grad_norm": 1.4721681453479794, "learning_rate": 2.977003799124003e-06, "loss": 0.0866, "step": 1518 }, { "epoch": 2.97, "grad_norm": 1.3609260671197703, "learning_rate": 2.972285483156498e-06, "loss": 0.0887, "step": 1519 }, { "epoch": 2.97, "grad_norm": 1.7018618837105501, "learning_rate": 2.9675681483352915e-06, "loss": 0.1105, "step": 1520 }, { "epoch": 2.97, "grad_norm": 1.5884785655684515, "learning_rate": 2.9628518034310278e-06, "loss": 0.0684, "step": 1521 }, { "epoch": 2.97, "grad_norm": 1.7796812594619784, "learning_rate": 2.9581364572125067e-06, "loss": 0.0801, "step": 1522 }, { "epoch": 2.97, "grad_norm": 1.642206107571221, "learning_rate": 2.953422118446676e-06, "loss": 0.0889, "step": 1523 }, { "epoch": 2.98, "grad_norm": 1.6199307253268247, "learning_rate": 2.948708795898608e-06, "loss": 0.0994, "step": 1524 }, { "epoch": 2.98, "grad_norm": 1.8035329043105781, "learning_rate": 2.9439964983314854e-06, "loss": 0.0883, "step": 1525 }, { "epoch": 2.98, "grad_norm": 1.4734972457617244, "learning_rate": 2.9392852345065854e-06, "loss": 0.0742, "step": 1526 }, { "epoch": 2.98, "grad_norm": 1.5519765119030835, "learning_rate": 2.9345750131832646e-06, "loss": 0.0893, "step": 1527 }, { "epoch": 2.98, "grad_norm": 1.6027844337492263, "learning_rate": 2.9298658431189395e-06, "loss": 0.1084, "step": 1528 }, { "epoch": 2.99, "grad_norm": 1.6093685512088525, "learning_rate": 2.925157733069072e-06, "loss": 0.0706, "step": 1529 }, { "epoch": 2.99, "grad_norm": 1.6239087074163843, "learning_rate": 2.9204506917871577e-06, "loss": 0.1161, "step": 1530 }, { "epoch": 2.99, "grad_norm": 1.5268202060718372, "learning_rate": 2.9157447280246964e-06, "loss": 0.0664, "step": 1531 }, { "epoch": 2.99, "grad_norm": 1.4345895104960076, "learning_rate": 2.9110398505311927e-06, "loss": 0.0762, "step": 1532 }, { "epoch": 2.99, "grad_norm": 1.6306802288794346, "learning_rate": 2.906336068054127e-06, "loss": 0.0992, "step": 1533 }, { "epoch": 3.0, "grad_norm": 1.5093499734618516, "learning_rate": 2.9016333893389455e-06, "loss": 0.09, "step": 1534 }, { "epoch": 3.0, "grad_norm": 1.8134969570199961, "learning_rate": 2.896931823129043e-06, "loss": 0.0833, "step": 1535 }, { "epoch": 3.0, "grad_norm": 1.517904720991008, "learning_rate": 2.8922313781657437e-06, "loss": 0.0873, "step": 1536 }, { "epoch": 3.0, "grad_norm": 1.5403906888215277, "learning_rate": 2.8875320631882885e-06, "loss": 0.0737, "step": 1537 }, { "epoch": 3.0, "grad_norm": 1.3424509303122174, "learning_rate": 2.8828338869338175e-06, "loss": 0.0427, "step": 1538 }, { "epoch": 3.01, "grad_norm": 1.3466689629935742, "learning_rate": 2.8781368581373545e-06, "loss": 0.0834, "step": 1539 }, { "epoch": 3.01, "grad_norm": 1.4526732633915531, "learning_rate": 2.873440985531786e-06, "loss": 0.0883, "step": 1540 }, { "epoch": 3.01, "grad_norm": 1.4220512848171871, "learning_rate": 2.8687462778478535e-06, "loss": 0.0587, "step": 1541 }, { "epoch": 3.01, "grad_norm": 1.426185630268188, "learning_rate": 2.864052743814129e-06, "loss": 0.0625, "step": 1542 }, { "epoch": 3.01, "grad_norm": 1.5314446846203247, "learning_rate": 2.859360392157007e-06, "loss": 0.0926, "step": 1543 }, { "epoch": 3.02, "grad_norm": 1.5101453492935433, "learning_rate": 2.8546692316006783e-06, "loss": 0.0854, "step": 1544 }, { "epoch": 3.02, "grad_norm": 1.2326748874761617, "learning_rate": 2.8499792708671227e-06, "loss": 0.0593, "step": 1545 }, { "epoch": 3.02, "grad_norm": 1.536560288578872, "learning_rate": 2.845290518676088e-06, "loss": 0.0878, "step": 1546 }, { "epoch": 3.02, "grad_norm": 1.5361715859024911, "learning_rate": 2.840602983745075e-06, "loss": 0.089, "step": 1547 }, { "epoch": 3.02, "grad_norm": 1.5604100665329068, "learning_rate": 2.8359166747893237e-06, "loss": 0.082, "step": 1548 }, { "epoch": 3.03, "grad_norm": 1.3621652884955202, "learning_rate": 2.8312316005217896e-06, "loss": 0.0788, "step": 1549 }, { "epoch": 3.03, "grad_norm": 1.6110982193632222, "learning_rate": 2.826547769653137e-06, "loss": 0.0763, "step": 1550 }, { "epoch": 3.03, "grad_norm": 1.5049981734310156, "learning_rate": 2.821865190891716e-06, "loss": 0.0888, "step": 1551 }, { "epoch": 3.03, "grad_norm": 1.471759078065554, "learning_rate": 2.8171838729435513e-06, "loss": 0.073, "step": 1552 }, { "epoch": 3.03, "grad_norm": 1.5566277196848248, "learning_rate": 2.8125038245123216e-06, "loss": 0.0806, "step": 1553 }, { "epoch": 3.04, "grad_norm": 1.3606020043466527, "learning_rate": 2.8078250542993445e-06, "loss": 0.0553, "step": 1554 }, { "epoch": 3.04, "grad_norm": 1.6627351541908966, "learning_rate": 2.8031475710035637e-06, "loss": 0.0724, "step": 1555 }, { "epoch": 3.04, "grad_norm": 1.677851142488937, "learning_rate": 2.7984713833215282e-06, "loss": 0.0982, "step": 1556 }, { "epoch": 3.04, "grad_norm": 1.654990103424566, "learning_rate": 2.793796499947379e-06, "loss": 0.0996, "step": 1557 }, { "epoch": 3.04, "grad_norm": 1.1656371385453586, "learning_rate": 2.7891229295728308e-06, "loss": 0.0377, "step": 1558 }, { "epoch": 3.04, "grad_norm": 1.31312513933071, "learning_rate": 2.7844506808871573e-06, "loss": 0.0476, "step": 1559 }, { "epoch": 3.05, "grad_norm": 1.3443744162727564, "learning_rate": 2.779779762577178e-06, "loss": 0.0676, "step": 1560 }, { "epoch": 3.05, "grad_norm": 1.5665672348758652, "learning_rate": 2.7751101833272356e-06, "loss": 0.1044, "step": 1561 }, { "epoch": 3.05, "grad_norm": 1.2930008452303443, "learning_rate": 2.7704419518191834e-06, "loss": 0.0537, "step": 1562 }, { "epoch": 3.05, "grad_norm": 1.4159072186133492, "learning_rate": 2.7657750767323693e-06, "loss": 0.072, "step": 1563 }, { "epoch": 3.05, "grad_norm": 1.364235675089893, "learning_rate": 2.761109566743622e-06, "loss": 0.0805, "step": 1564 }, { "epoch": 3.06, "grad_norm": 1.4762952196599886, "learning_rate": 2.756445430527228e-06, "loss": 0.0774, "step": 1565 }, { "epoch": 3.06, "grad_norm": 1.5187018410035822, "learning_rate": 2.751782676754922e-06, "loss": 0.0899, "step": 1566 }, { "epoch": 3.06, "grad_norm": 1.3565904789477965, "learning_rate": 2.7471213140958667e-06, "loss": 0.0708, "step": 1567 }, { "epoch": 3.06, "grad_norm": 1.4266828777329732, "learning_rate": 2.7424613512166398e-06, "loss": 0.0872, "step": 1568 }, { "epoch": 3.06, "grad_norm": 1.404737356450187, "learning_rate": 2.7378027967812183e-06, "loss": 0.0486, "step": 1569 }, { "epoch": 3.07, "grad_norm": 1.4088450984829584, "learning_rate": 2.733145659450956e-06, "loss": 0.0653, "step": 1570 }, { "epoch": 3.07, "grad_norm": 1.5393987979275823, "learning_rate": 2.728489947884575e-06, "loss": 0.0755, "step": 1571 }, { "epoch": 3.07, "grad_norm": 1.4496643659336674, "learning_rate": 2.7238356707381474e-06, "loss": 0.0749, "step": 1572 }, { "epoch": 3.07, "grad_norm": 1.533584212736249, "learning_rate": 2.7191828366650756e-06, "loss": 0.0734, "step": 1573 }, { "epoch": 3.07, "grad_norm": 1.5587432447823275, "learning_rate": 2.7145314543160805e-06, "loss": 0.0842, "step": 1574 }, { "epoch": 3.08, "grad_norm": 1.5384980899975529, "learning_rate": 2.709881532339186e-06, "loss": 0.0713, "step": 1575 }, { "epoch": 3.08, "grad_norm": 1.3794430144460212, "learning_rate": 2.705233079379694e-06, "loss": 0.0541, "step": 1576 }, { "epoch": 3.08, "grad_norm": 1.475387948056582, "learning_rate": 2.7005861040801835e-06, "loss": 0.0688, "step": 1577 }, { "epoch": 3.08, "grad_norm": 1.5896037301934423, "learning_rate": 2.695940615080481e-06, "loss": 0.087, "step": 1578 }, { "epoch": 3.08, "grad_norm": 1.4206388305004616, "learning_rate": 2.691296621017651e-06, "loss": 0.0734, "step": 1579 }, { "epoch": 3.09, "grad_norm": 1.619064703733071, "learning_rate": 2.6866541305259794e-06, "loss": 0.075, "step": 1580 }, { "epoch": 3.09, "grad_norm": 1.5219148182626288, "learning_rate": 2.682013152236955e-06, "loss": 0.061, "step": 1581 }, { "epoch": 3.09, "grad_norm": 1.652863123758761, "learning_rate": 2.677373694779257e-06, "loss": 0.0637, "step": 1582 }, { "epoch": 3.09, "grad_norm": 1.4414637660207736, "learning_rate": 2.6727357667787335e-06, "loss": 0.0441, "step": 1583 }, { "epoch": 3.09, "grad_norm": 1.5285913709293644, "learning_rate": 2.6680993768583944e-06, "loss": 0.0545, "step": 1584 }, { "epoch": 3.1, "grad_norm": 1.4273914358234476, "learning_rate": 2.663464533638385e-06, "loss": 0.0677, "step": 1585 }, { "epoch": 3.1, "grad_norm": 1.6138846666209605, "learning_rate": 2.6588312457359768e-06, "loss": 0.0907, "step": 1586 }, { "epoch": 3.1, "grad_norm": 1.4712847980553567, "learning_rate": 2.6541995217655503e-06, "loss": 0.0538, "step": 1587 }, { "epoch": 3.1, "grad_norm": 1.3870263214519563, "learning_rate": 2.6495693703385765e-06, "loss": 0.0629, "step": 1588 }, { "epoch": 3.1, "grad_norm": 1.4105026545717672, "learning_rate": 2.644940800063605e-06, "loss": 0.0643, "step": 1589 }, { "epoch": 3.11, "grad_norm": 1.5681692207255595, "learning_rate": 2.640313819546244e-06, "loss": 0.0611, "step": 1590 }, { "epoch": 3.11, "grad_norm": 1.5369127459051386, "learning_rate": 2.635688437389146e-06, "loss": 0.0584, "step": 1591 }, { "epoch": 3.11, "grad_norm": 1.6474355672697563, "learning_rate": 2.6310646621919923e-06, "loss": 0.0708, "step": 1592 }, { "epoch": 3.11, "grad_norm": 1.4229372528149518, "learning_rate": 2.626442502551478e-06, "loss": 0.0447, "step": 1593 }, { "epoch": 3.11, "grad_norm": 1.7402678011768629, "learning_rate": 2.6218219670612907e-06, "loss": 0.0679, "step": 1594 }, { "epoch": 3.12, "grad_norm": 1.435537699031508, "learning_rate": 2.6172030643121005e-06, "loss": 0.0561, "step": 1595 }, { "epoch": 3.12, "grad_norm": 1.5715933358727971, "learning_rate": 2.612585802891542e-06, "loss": 0.0642, "step": 1596 }, { "epoch": 3.12, "grad_norm": 1.5348765899222288, "learning_rate": 2.607970191384199e-06, "loss": 0.0554, "step": 1597 }, { "epoch": 3.12, "grad_norm": 1.6820658193644205, "learning_rate": 2.6033562383715864e-06, "loss": 0.0613, "step": 1598 }, { "epoch": 3.12, "grad_norm": 1.3312324444438222, "learning_rate": 2.598743952432134e-06, "loss": 0.0603, "step": 1599 }, { "epoch": 3.12, "grad_norm": 1.3376052957654978, "learning_rate": 2.594133342141177e-06, "loss": 0.0453, "step": 1600 }, { "epoch": 3.13, "grad_norm": 1.8738345331306114, "learning_rate": 2.58952441607093e-06, "loss": 0.0659, "step": 1601 }, { "epoch": 3.13, "grad_norm": 1.6301576021031483, "learning_rate": 2.5849171827904802e-06, "loss": 0.0594, "step": 1602 }, { "epoch": 3.13, "grad_norm": 1.650670853596828, "learning_rate": 2.580311650865764e-06, "loss": 0.0634, "step": 1603 }, { "epoch": 3.13, "grad_norm": 1.4836420482068362, "learning_rate": 2.575707828859556e-06, "loss": 0.0763, "step": 1604 }, { "epoch": 3.13, "grad_norm": 1.3687258159771252, "learning_rate": 2.571105725331453e-06, "loss": 0.0605, "step": 1605 }, { "epoch": 3.14, "grad_norm": 1.4506165956702661, "learning_rate": 2.566505348837856e-06, "loss": 0.0555, "step": 1606 }, { "epoch": 3.14, "grad_norm": 1.433048393972533, "learning_rate": 2.5619067079319538e-06, "loss": 0.0643, "step": 1607 }, { "epoch": 3.14, "grad_norm": 1.7614516832357316, "learning_rate": 2.5573098111637088e-06, "loss": 0.0446, "step": 1608 }, { "epoch": 3.14, "grad_norm": 1.6611813923351346, "learning_rate": 2.5527146670798423e-06, "loss": 0.0676, "step": 1609 }, { "epoch": 3.14, "grad_norm": 1.3862459587194498, "learning_rate": 2.5481212842238152e-06, "loss": 0.0559, "step": 1610 }, { "epoch": 3.15, "grad_norm": 1.3169042463699052, "learning_rate": 2.543529671135816e-06, "loss": 0.0458, "step": 1611 }, { "epoch": 3.15, "grad_norm": 1.357713573235298, "learning_rate": 2.5389398363527376e-06, "loss": 0.051, "step": 1612 }, { "epoch": 3.15, "grad_norm": 1.4336068820114432, "learning_rate": 2.5343517884081723e-06, "loss": 0.0749, "step": 1613 }, { "epoch": 3.15, "grad_norm": 1.4525794379251935, "learning_rate": 2.5297655358323877e-06, "loss": 0.0751, "step": 1614 }, { "epoch": 3.15, "grad_norm": 1.455139096067688, "learning_rate": 2.5251810871523144e-06, "loss": 0.0468, "step": 1615 }, { "epoch": 3.16, "grad_norm": 1.3972124984993979, "learning_rate": 2.5205984508915277e-06, "loss": 0.0679, "step": 1616 }, { "epoch": 3.16, "grad_norm": 1.3731534781398844, "learning_rate": 2.516017635570235e-06, "loss": 0.0529, "step": 1617 }, { "epoch": 3.16, "grad_norm": 1.5654003417196503, "learning_rate": 2.5114386497052563e-06, "loss": 0.0685, "step": 1618 }, { "epoch": 3.16, "grad_norm": 1.4922679541724462, "learning_rate": 2.506861501810011e-06, "loss": 0.0749, "step": 1619 }, { "epoch": 3.16, "grad_norm": 1.6015006891952992, "learning_rate": 2.502286200394503e-06, "loss": 0.0727, "step": 1620 }, { "epoch": 3.17, "grad_norm": 1.457208149943652, "learning_rate": 2.4977127539652985e-06, "loss": 0.0428, "step": 1621 }, { "epoch": 3.17, "grad_norm": 1.4172379776842394, "learning_rate": 2.4931411710255205e-06, "loss": 0.0608, "step": 1622 }, { "epoch": 3.17, "grad_norm": 1.5584582994933427, "learning_rate": 2.488571460074823e-06, "loss": 0.0578, "step": 1623 }, { "epoch": 3.17, "grad_norm": 1.384034848461642, "learning_rate": 2.4840036296093824e-06, "loss": 0.055, "step": 1624 }, { "epoch": 3.17, "grad_norm": 1.3387067109351594, "learning_rate": 2.479437688121876e-06, "loss": 0.0661, "step": 1625 }, { "epoch": 3.18, "grad_norm": 1.6609859162218565, "learning_rate": 2.4748736441014728e-06, "loss": 0.0671, "step": 1626 }, { "epoch": 3.18, "grad_norm": 1.3255248453720263, "learning_rate": 2.4703115060338096e-06, "loss": 0.038, "step": 1627 }, { "epoch": 3.18, "grad_norm": 1.4402940554136192, "learning_rate": 2.465751282400983e-06, "loss": 0.0629, "step": 1628 }, { "epoch": 3.18, "grad_norm": 1.4454146989973915, "learning_rate": 2.4611929816815317e-06, "loss": 0.0623, "step": 1629 }, { "epoch": 3.18, "grad_norm": 1.4070694637952748, "learning_rate": 2.456636612350411e-06, "loss": 0.0637, "step": 1630 }, { "epoch": 3.19, "grad_norm": 1.5128033886001513, "learning_rate": 2.4520821828789946e-06, "loss": 0.0674, "step": 1631 }, { "epoch": 3.19, "grad_norm": 1.5972814799438317, "learning_rate": 2.4475297017350445e-06, "loss": 0.0758, "step": 1632 }, { "epoch": 3.19, "grad_norm": 1.5730669914010618, "learning_rate": 2.4429791773827017e-06, "loss": 0.0448, "step": 1633 }, { "epoch": 3.19, "grad_norm": 1.6859115871385666, "learning_rate": 2.43843061828247e-06, "loss": 0.0431, "step": 1634 }, { "epoch": 3.19, "grad_norm": 1.6490275023943044, "learning_rate": 2.4338840328911975e-06, "loss": 0.0534, "step": 1635 }, { "epoch": 3.2, "grad_norm": 1.3830075112404876, "learning_rate": 2.4293394296620625e-06, "loss": 0.0425, "step": 1636 }, { "epoch": 3.2, "grad_norm": 1.4578688244048899, "learning_rate": 2.4247968170445606e-06, "loss": 0.0638, "step": 1637 }, { "epoch": 3.2, "grad_norm": 1.8652433402223119, "learning_rate": 2.420256203484486e-06, "loss": 0.0337, "step": 1638 }, { "epoch": 3.2, "grad_norm": 1.9869311217847496, "learning_rate": 2.4157175974239114e-06, "loss": 0.062, "step": 1639 }, { "epoch": 3.2, "grad_norm": 1.7982896354453677, "learning_rate": 2.411181007301182e-06, "loss": 0.057, "step": 1640 }, { "epoch": 3.21, "grad_norm": 1.430613220524837, "learning_rate": 2.4066464415508943e-06, "loss": 0.0579, "step": 1641 }, { "epoch": 3.21, "grad_norm": 1.4010114402870557, "learning_rate": 2.40211390860388e-06, "loss": 0.0664, "step": 1642 }, { "epoch": 3.21, "grad_norm": 1.4208830902622809, "learning_rate": 2.3975834168871913e-06, "loss": 0.0529, "step": 1643 }, { "epoch": 3.21, "grad_norm": 1.5383739385004014, "learning_rate": 2.393054974824086e-06, "loss": 0.035, "step": 1644 }, { "epoch": 3.21, "grad_norm": 1.3152548685772911, "learning_rate": 2.3885285908340092e-06, "loss": 0.0493, "step": 1645 }, { "epoch": 3.21, "grad_norm": 1.4312546328412254, "learning_rate": 2.384004273332583e-06, "loss": 0.0516, "step": 1646 }, { "epoch": 3.22, "grad_norm": 1.3434575777432731, "learning_rate": 2.379482030731586e-06, "loss": 0.0509, "step": 1647 }, { "epoch": 3.22, "grad_norm": 1.5039588123115974, "learning_rate": 2.374961871438935e-06, "loss": 0.0621, "step": 1648 }, { "epoch": 3.22, "grad_norm": 1.5752990693117233, "learning_rate": 2.3704438038586796e-06, "loss": 0.0546, "step": 1649 }, { "epoch": 3.22, "grad_norm": 1.202941753679949, "learning_rate": 2.3659278363909763e-06, "loss": 0.0345, "step": 1650 }, { "epoch": 3.22, "grad_norm": 1.4193536349464553, "learning_rate": 2.36141397743208e-06, "loss": 0.0446, "step": 1651 }, { "epoch": 3.23, "grad_norm": 1.4070991332612404, "learning_rate": 2.356902235374323e-06, "loss": 0.0543, "step": 1652 }, { "epoch": 3.23, "grad_norm": 1.4898407159490978, "learning_rate": 2.352392618606102e-06, "loss": 0.0353, "step": 1653 }, { "epoch": 3.23, "grad_norm": 1.5135419546308646, "learning_rate": 2.3478851355118637e-06, "loss": 0.0528, "step": 1654 }, { "epoch": 3.23, "grad_norm": 1.7502216605612293, "learning_rate": 2.343379794472087e-06, "loss": 0.0715, "step": 1655 }, { "epoch": 3.23, "grad_norm": 1.427448207031386, "learning_rate": 2.3388766038632686e-06, "loss": 0.0483, "step": 1656 }, { "epoch": 3.24, "grad_norm": 1.4759424268703707, "learning_rate": 2.3343755720579052e-06, "loss": 0.0455, "step": 1657 }, { "epoch": 3.24, "grad_norm": 1.589196859590701, "learning_rate": 2.329876707424481e-06, "loss": 0.0598, "step": 1658 }, { "epoch": 3.24, "grad_norm": 1.3329303160381591, "learning_rate": 2.3253800183274527e-06, "loss": 0.0453, "step": 1659 }, { "epoch": 3.24, "grad_norm": 1.347881561169112, "learning_rate": 2.32088551312723e-06, "loss": 0.038, "step": 1660 }, { "epoch": 3.24, "grad_norm": 1.4789682523812104, "learning_rate": 2.3163932001801625e-06, "loss": 0.053, "step": 1661 }, { "epoch": 3.25, "grad_norm": 1.3584401789620215, "learning_rate": 2.311903087838524e-06, "loss": 0.0519, "step": 1662 }, { "epoch": 3.25, "grad_norm": 1.3050068587223715, "learning_rate": 2.307415184450498e-06, "loss": 0.0394, "step": 1663 }, { "epoch": 3.25, "grad_norm": 1.4189224170387396, "learning_rate": 2.3029294983601598e-06, "loss": 0.0441, "step": 1664 }, { "epoch": 3.25, "grad_norm": 1.4373123636755594, "learning_rate": 2.298446037907462e-06, "loss": 0.0569, "step": 1665 }, { "epoch": 3.25, "grad_norm": 1.4912994260139012, "learning_rate": 2.2939648114282187e-06, "loss": 0.0573, "step": 1666 }, { "epoch": 3.26, "grad_norm": 1.8425398410130411, "learning_rate": 2.289485827254093e-06, "loss": 0.0335, "step": 1667 }, { "epoch": 3.26, "grad_norm": 1.4885070531674733, "learning_rate": 2.2850090937125775e-06, "loss": 0.0334, "step": 1668 }, { "epoch": 3.26, "grad_norm": 1.4645394056638676, "learning_rate": 2.2805346191269795e-06, "loss": 0.036, "step": 1669 }, { "epoch": 3.26, "grad_norm": 1.4031924216205796, "learning_rate": 2.276062411816407e-06, "loss": 0.0399, "step": 1670 }, { "epoch": 3.26, "grad_norm": 1.654017575200224, "learning_rate": 2.2715924800957543e-06, "loss": 0.0438, "step": 1671 }, { "epoch": 3.27, "grad_norm": 1.7280099038472334, "learning_rate": 2.2671248322756827e-06, "loss": 0.0395, "step": 1672 }, { "epoch": 3.27, "grad_norm": 1.5294378603279764, "learning_rate": 2.2626594766626067e-06, "loss": 0.0452, "step": 1673 }, { "epoch": 3.27, "grad_norm": 1.4699622224892321, "learning_rate": 2.258196421558684e-06, "loss": 0.0464, "step": 1674 }, { "epoch": 3.27, "grad_norm": 1.6100626187639282, "learning_rate": 2.2537356752617857e-06, "loss": 0.0498, "step": 1675 }, { "epoch": 3.27, "grad_norm": 1.40005757992126, "learning_rate": 2.2492772460655e-06, "loss": 0.0525, "step": 1676 }, { "epoch": 3.28, "grad_norm": 1.7849782648046026, "learning_rate": 2.244821142259101e-06, "loss": 0.0412, "step": 1677 }, { "epoch": 3.28, "grad_norm": 1.9752991036442078, "learning_rate": 2.240367372127541e-06, "loss": 0.0408, "step": 1678 }, { "epoch": 3.28, "grad_norm": 1.3233812227146757, "learning_rate": 2.2359159439514365e-06, "loss": 0.0362, "step": 1679 }, { "epoch": 3.28, "grad_norm": 1.548989953558539, "learning_rate": 2.2314668660070444e-06, "loss": 0.0571, "step": 1680 }, { "epoch": 3.28, "grad_norm": 1.4661824884587544, "learning_rate": 2.2270201465662547e-06, "loss": 0.0476, "step": 1681 }, { "epoch": 3.29, "grad_norm": 1.2736154790795098, "learning_rate": 2.2225757938965737e-06, "loss": 0.0357, "step": 1682 }, { "epoch": 3.29, "grad_norm": 1.774782298650735, "learning_rate": 2.218133816261106e-06, "loss": 0.0295, "step": 1683 }, { "epoch": 3.29, "grad_norm": 1.4900424340084304, "learning_rate": 2.213694221918538e-06, "loss": 0.0469, "step": 1684 }, { "epoch": 3.29, "grad_norm": 1.3220144252477737, "learning_rate": 2.2092570191231287e-06, "loss": 0.037, "step": 1685 }, { "epoch": 3.29, "grad_norm": 1.650387816890382, "learning_rate": 2.2048222161246893e-06, "loss": 0.046, "step": 1686 }, { "epoch": 3.29, "grad_norm": 1.5091250116945136, "learning_rate": 2.200389821168569e-06, "loss": 0.0416, "step": 1687 }, { "epoch": 3.3, "grad_norm": 1.105530086509642, "learning_rate": 2.1959598424956397e-06, "loss": 0.0303, "step": 1688 }, { "epoch": 3.3, "grad_norm": 1.572120295123975, "learning_rate": 2.191532288342282e-06, "loss": 0.0337, "step": 1689 }, { "epoch": 3.3, "grad_norm": 1.6849574884496388, "learning_rate": 2.187107166940367e-06, "loss": 0.0503, "step": 1690 }, { "epoch": 3.3, "grad_norm": 1.4930389647679347, "learning_rate": 2.1826844865172438e-06, "loss": 0.0453, "step": 1691 }, { "epoch": 3.3, "grad_norm": 1.6600935532215522, "learning_rate": 2.178264255295725e-06, "loss": 0.0356, "step": 1692 }, { "epoch": 3.31, "grad_norm": 2.0275138468558076, "learning_rate": 2.1738464814940648e-06, "loss": 0.0312, "step": 1693 }, { "epoch": 3.31, "grad_norm": 1.523403031860925, "learning_rate": 2.1694311733259525e-06, "loss": 0.0497, "step": 1694 }, { "epoch": 3.31, "grad_norm": 1.5338382992954314, "learning_rate": 2.165018339000491e-06, "loss": 0.0502, "step": 1695 }, { "epoch": 3.31, "grad_norm": 1.4005633617151687, "learning_rate": 2.160607986722186e-06, "loss": 0.045, "step": 1696 }, { "epoch": 3.31, "grad_norm": 1.4369902264455205, "learning_rate": 2.1562001246909267e-06, "loss": 0.0367, "step": 1697 }, { "epoch": 3.32, "grad_norm": 1.2881434188611325, "learning_rate": 2.151794761101972e-06, "loss": 0.0389, "step": 1698 }, { "epoch": 3.32, "grad_norm": 1.2868017472360667, "learning_rate": 2.147391904145938e-06, "loss": 0.0308, "step": 1699 }, { "epoch": 3.32, "grad_norm": 1.3127054010644577, "learning_rate": 2.1429915620087775e-06, "loss": 0.0376, "step": 1700 }, { "epoch": 3.32, "grad_norm": 1.3433787512570294, "learning_rate": 2.1385937428717707e-06, "loss": 0.0405, "step": 1701 }, { "epoch": 3.32, "grad_norm": 1.4044129323664742, "learning_rate": 2.134198454911503e-06, "loss": 0.0502, "step": 1702 }, { "epoch": 3.33, "grad_norm": 1.1657063078328977, "learning_rate": 2.1298057062998557e-06, "loss": 0.0407, "step": 1703 }, { "epoch": 3.33, "grad_norm": 1.398833517857331, "learning_rate": 2.125415505203991e-06, "loss": 0.0286, "step": 1704 }, { "epoch": 3.33, "grad_norm": 1.2485733337400788, "learning_rate": 2.121027859786332e-06, "loss": 0.0411, "step": 1705 }, { "epoch": 3.33, "grad_norm": 1.5365787455327982, "learning_rate": 2.1166427782045506e-06, "loss": 0.0562, "step": 1706 }, { "epoch": 3.33, "grad_norm": 1.2576021249677778, "learning_rate": 2.1122602686115522e-06, "loss": 0.0346, "step": 1707 }, { "epoch": 3.34, "grad_norm": 1.3397321038792354, "learning_rate": 2.1078803391554616e-06, "loss": 0.0369, "step": 1708 }, { "epoch": 3.34, "grad_norm": 1.6140493860359857, "learning_rate": 2.1035029979796034e-06, "loss": 0.0379, "step": 1709 }, { "epoch": 3.34, "grad_norm": 1.254410389345508, "learning_rate": 2.099128253222494e-06, "loss": 0.0337, "step": 1710 }, { "epoch": 3.34, "grad_norm": 1.3292539567138497, "learning_rate": 2.094756113017818e-06, "loss": 0.0371, "step": 1711 }, { "epoch": 3.34, "grad_norm": 1.5459900972852356, "learning_rate": 2.09038658549442e-06, "loss": 0.0432, "step": 1712 }, { "epoch": 3.35, "grad_norm": 1.1850787122072177, "learning_rate": 2.0860196787762884e-06, "loss": 0.0301, "step": 1713 }, { "epoch": 3.35, "grad_norm": 1.3680077655395422, "learning_rate": 2.0816554009825357e-06, "loss": 0.0405, "step": 1714 }, { "epoch": 3.35, "grad_norm": 1.312478142243202, "learning_rate": 2.0772937602273888e-06, "loss": 0.0241, "step": 1715 }, { "epoch": 3.35, "grad_norm": 1.1998597467325445, "learning_rate": 2.0729347646201717e-06, "loss": 0.0316, "step": 1716 }, { "epoch": 3.35, "grad_norm": 1.3279051354659375, "learning_rate": 2.0685784222652893e-06, "loss": 0.0439, "step": 1717 }, { "epoch": 3.36, "grad_norm": 1.5722297553236857, "learning_rate": 2.064224741262213e-06, "loss": 0.0408, "step": 1718 }, { "epoch": 3.36, "grad_norm": 1.4342065926481347, "learning_rate": 2.0598737297054707e-06, "loss": 0.0698, "step": 1719 }, { "epoch": 3.36, "grad_norm": 1.3023348382189455, "learning_rate": 2.0555253956846186e-06, "loss": 0.0376, "step": 1720 }, { "epoch": 3.36, "grad_norm": 1.5333608584387983, "learning_rate": 2.0511797472842434e-06, "loss": 0.0431, "step": 1721 }, { "epoch": 3.36, "grad_norm": 1.3322668127073134, "learning_rate": 2.046836792583933e-06, "loss": 0.028, "step": 1722 }, { "epoch": 3.37, "grad_norm": 1.4609591159668522, "learning_rate": 2.0424965396582706e-06, "loss": 0.0368, "step": 1723 }, { "epoch": 3.37, "grad_norm": 1.3048107323719635, "learning_rate": 2.0381589965768126e-06, "loss": 0.0436, "step": 1724 }, { "epoch": 3.37, "grad_norm": 1.6695410635737038, "learning_rate": 2.03382417140408e-06, "loss": 0.0317, "step": 1725 }, { "epoch": 3.37, "grad_norm": 1.511836462348901, "learning_rate": 2.0294920721995417e-06, "loss": 0.0315, "step": 1726 }, { "epoch": 3.37, "grad_norm": 1.2674559701399313, "learning_rate": 2.0251627070175925e-06, "loss": 0.035, "step": 1727 }, { "epoch": 3.38, "grad_norm": 1.4912653482931688, "learning_rate": 2.0208360839075525e-06, "loss": 0.0311, "step": 1728 }, { "epoch": 3.38, "grad_norm": 1.4706907738097028, "learning_rate": 2.0165122109136357e-06, "loss": 0.0403, "step": 1729 }, { "epoch": 3.38, "grad_norm": 1.4660159152590735, "learning_rate": 2.0121910960749458e-06, "loss": 0.0383, "step": 1730 }, { "epoch": 3.38, "grad_norm": 1.3778513046479635, "learning_rate": 2.0078727474254598e-06, "loss": 0.0495, "step": 1731 }, { "epoch": 3.38, "grad_norm": 1.3269852433103433, "learning_rate": 2.0035571729940133e-06, "loss": 0.0306, "step": 1732 }, { "epoch": 3.38, "grad_norm": 1.4107022541478014, "learning_rate": 1.9992443808042776e-06, "loss": 0.0355, "step": 1733 }, { "epoch": 3.39, "grad_norm": 1.446635379140195, "learning_rate": 1.994934378874757e-06, "loss": 0.0504, "step": 1734 }, { "epoch": 3.39, "grad_norm": 1.3088777937523284, "learning_rate": 1.9906271752187675e-06, "loss": 0.0248, "step": 1735 }, { "epoch": 3.39, "grad_norm": 1.339246741485156, "learning_rate": 1.9863227778444176e-06, "loss": 0.0432, "step": 1736 }, { "epoch": 3.39, "grad_norm": 1.640626851046982, "learning_rate": 1.982021194754606e-06, "loss": 0.0322, "step": 1737 }, { "epoch": 3.39, "grad_norm": 1.4874723990793082, "learning_rate": 1.9777224339469897e-06, "loss": 0.0405, "step": 1738 }, { "epoch": 3.4, "grad_norm": 1.2247556085059215, "learning_rate": 1.9734265034139883e-06, "loss": 0.0367, "step": 1739 }, { "epoch": 3.4, "grad_norm": 1.4372031220046606, "learning_rate": 1.9691334111427506e-06, "loss": 0.0423, "step": 1740 }, { "epoch": 3.4, "grad_norm": 1.5537666503864995, "learning_rate": 1.964843165115154e-06, "loss": 0.0331, "step": 1741 }, { "epoch": 3.4, "grad_norm": 1.4391102400504001, "learning_rate": 1.960555773307781e-06, "loss": 0.0489, "step": 1742 }, { "epoch": 3.4, "grad_norm": 1.3100983807319064, "learning_rate": 1.9562712436919092e-06, "loss": 0.0392, "step": 1743 }, { "epoch": 3.41, "grad_norm": 1.5508857004488024, "learning_rate": 1.951989584233496e-06, "loss": 0.0335, "step": 1744 }, { "epoch": 3.41, "grad_norm": 1.1658875383456886, "learning_rate": 1.9477108028931577e-06, "loss": 0.0342, "step": 1745 }, { "epoch": 3.41, "grad_norm": 1.1519980958323126, "learning_rate": 1.943434907626164e-06, "loss": 0.0326, "step": 1746 }, { "epoch": 3.41, "grad_norm": 1.188445990386364, "learning_rate": 1.9391619063824164e-06, "loss": 0.0267, "step": 1747 }, { "epoch": 3.41, "grad_norm": 1.0491534983893338, "learning_rate": 1.934891807106438e-06, "loss": 0.0261, "step": 1748 }, { "epoch": 3.42, "grad_norm": 1.293309784055279, "learning_rate": 1.930624617737352e-06, "loss": 0.0295, "step": 1749 }, { "epoch": 3.42, "grad_norm": 1.1372130595249952, "learning_rate": 1.9263603462088765e-06, "loss": 0.0259, "step": 1750 }, { "epoch": 3.42, "grad_norm": 1.222889798291941, "learning_rate": 1.9220990004493035e-06, "loss": 0.0306, "step": 1751 }, { "epoch": 3.42, "grad_norm": 1.0167658534073558, "learning_rate": 1.917840588381481e-06, "loss": 0.0138, "step": 1752 }, { "epoch": 3.42, "grad_norm": 1.1782714238704282, "learning_rate": 1.9135851179228076e-06, "loss": 0.034, "step": 1753 }, { "epoch": 3.43, "grad_norm": 1.1983246589212688, "learning_rate": 1.9093325969852126e-06, "loss": 0.0303, "step": 1754 }, { "epoch": 3.43, "grad_norm": 1.3223461238163856, "learning_rate": 1.905083033475138e-06, "loss": 0.031, "step": 1755 }, { "epoch": 3.43, "grad_norm": 1.5082969604138547, "learning_rate": 1.9008364352935276e-06, "loss": 0.0365, "step": 1756 }, { "epoch": 3.43, "grad_norm": 1.2168132103188354, "learning_rate": 1.896592810335817e-06, "loss": 0.0232, "step": 1757 }, { "epoch": 3.43, "grad_norm": 1.3429754823680504, "learning_rate": 1.8923521664919068e-06, "loss": 0.0321, "step": 1758 }, { "epoch": 3.44, "grad_norm": 1.5305162323797437, "learning_rate": 1.8881145116461597e-06, "loss": 0.0376, "step": 1759 }, { "epoch": 3.44, "grad_norm": 1.5622772355291104, "learning_rate": 1.8838798536773821e-06, "loss": 0.0286, "step": 1760 }, { "epoch": 3.44, "grad_norm": 1.4532900251024907, "learning_rate": 1.8796482004588025e-06, "loss": 0.0335, "step": 1761 }, { "epoch": 3.44, "grad_norm": 1.2782930602083256, "learning_rate": 1.875419559858069e-06, "loss": 0.0284, "step": 1762 }, { "epoch": 3.44, "grad_norm": 1.686205900793192, "learning_rate": 1.8711939397372273e-06, "loss": 0.034, "step": 1763 }, { "epoch": 3.45, "grad_norm": 1.3600293055103865, "learning_rate": 1.8669713479527048e-06, "loss": 0.0295, "step": 1764 }, { "epoch": 3.45, "grad_norm": 1.1921043106659506, "learning_rate": 1.8627517923552982e-06, "loss": 0.026, "step": 1765 }, { "epoch": 3.45, "grad_norm": 1.6341278007792666, "learning_rate": 1.8585352807901644e-06, "loss": 0.0402, "step": 1766 }, { "epoch": 3.45, "grad_norm": 1.1830851389632717, "learning_rate": 1.8543218210967937e-06, "loss": 0.0279, "step": 1767 }, { "epoch": 3.45, "grad_norm": 1.523828347871911, "learning_rate": 1.850111421109008e-06, "loss": 0.0406, "step": 1768 }, { "epoch": 3.46, "grad_norm": 1.5035956398730266, "learning_rate": 1.8459040886549394e-06, "loss": 0.046, "step": 1769 }, { "epoch": 3.46, "grad_norm": 1.4452298985528904, "learning_rate": 1.8416998315570125e-06, "loss": 0.0316, "step": 1770 }, { "epoch": 3.46, "grad_norm": 1.4516687409882034, "learning_rate": 1.8374986576319388e-06, "loss": 0.0359, "step": 1771 }, { "epoch": 3.46, "grad_norm": 1.3566686280797562, "learning_rate": 1.8333005746906976e-06, "loss": 0.0374, "step": 1772 }, { "epoch": 3.46, "grad_norm": 1.293263453968416, "learning_rate": 1.8291055905385179e-06, "loss": 0.0246, "step": 1773 }, { "epoch": 3.46, "grad_norm": 1.348613838373495, "learning_rate": 1.8249137129748679e-06, "loss": 0.0395, "step": 1774 }, { "epoch": 3.47, "grad_norm": 1.1781512818723012, "learning_rate": 1.8207249497934416e-06, "loss": 0.0265, "step": 1775 }, { "epoch": 3.47, "grad_norm": 1.2649744619482655, "learning_rate": 1.8165393087821438e-06, "loss": 0.0402, "step": 1776 }, { "epoch": 3.47, "grad_norm": 1.5374056856951261, "learning_rate": 1.8123567977230706e-06, "loss": 0.0494, "step": 1777 }, { "epoch": 3.47, "grad_norm": 1.3081220761828096, "learning_rate": 1.8081774243925025e-06, "loss": 0.0471, "step": 1778 }, { "epoch": 3.47, "grad_norm": 1.39953967058408, "learning_rate": 1.8040011965608827e-06, "loss": 0.0352, "step": 1779 }, { "epoch": 3.48, "grad_norm": 1.1979153717946065, "learning_rate": 1.7998281219928094e-06, "loss": 0.0333, "step": 1780 }, { "epoch": 3.48, "grad_norm": 1.304004685709425, "learning_rate": 1.7956582084470179e-06, "loss": 0.0389, "step": 1781 }, { "epoch": 3.48, "grad_norm": 1.1263609597051047, "learning_rate": 1.7914914636763638e-06, "loss": 0.027, "step": 1782 }, { "epoch": 3.48, "grad_norm": 1.4839746229030135, "learning_rate": 1.7873278954278112e-06, "loss": 0.034, "step": 1783 }, { "epoch": 3.48, "grad_norm": 1.2787216679636682, "learning_rate": 1.783167511442422e-06, "loss": 0.0299, "step": 1784 }, { "epoch": 3.49, "grad_norm": 1.0871356208649121, "learning_rate": 1.7790103194553362e-06, "loss": 0.0339, "step": 1785 }, { "epoch": 3.49, "grad_norm": 1.1069290484030598, "learning_rate": 1.7748563271957563e-06, "loss": 0.0307, "step": 1786 }, { "epoch": 3.49, "grad_norm": 1.4091296853811481, "learning_rate": 1.7707055423869382e-06, "loss": 0.0447, "step": 1787 }, { "epoch": 3.49, "grad_norm": 1.2811033084978496, "learning_rate": 1.7665579727461771e-06, "loss": 0.0242, "step": 1788 }, { "epoch": 3.49, "grad_norm": 1.2169032104549915, "learning_rate": 1.762413625984784e-06, "loss": 0.0296, "step": 1789 }, { "epoch": 3.5, "grad_norm": 1.164460574932742, "learning_rate": 1.7582725098080826e-06, "loss": 0.0295, "step": 1790 }, { "epoch": 3.5, "grad_norm": 1.2344816884063627, "learning_rate": 1.7541346319153915e-06, "loss": 0.0336, "step": 1791 }, { "epoch": 3.5, "grad_norm": 1.0862394537376805, "learning_rate": 1.7500000000000008e-06, "loss": 0.0348, "step": 1792 }, { "epoch": 3.5, "grad_norm": 1.0628556706154573, "learning_rate": 1.7458686217491734e-06, "loss": 0.021, "step": 1793 }, { "epoch": 3.5, "grad_norm": 1.2561599327526862, "learning_rate": 1.741740504844121e-06, "loss": 0.0261, "step": 1794 }, { "epoch": 3.51, "grad_norm": 1.26067717291248, "learning_rate": 1.7376156569599887e-06, "loss": 0.0199, "step": 1795 }, { "epoch": 3.51, "grad_norm": 1.5338604363223554, "learning_rate": 1.7334940857658472e-06, "loss": 0.0401, "step": 1796 }, { "epoch": 3.51, "grad_norm": 1.469495656392644, "learning_rate": 1.729375798924675e-06, "loss": 0.0481, "step": 1797 }, { "epoch": 3.51, "grad_norm": 1.2191555981981106, "learning_rate": 1.7252608040933402e-06, "loss": 0.0391, "step": 1798 }, { "epoch": 3.51, "grad_norm": 1.325956482463199, "learning_rate": 1.721149108922594e-06, "loss": 0.0249, "step": 1799 }, { "epoch": 3.52, "grad_norm": 1.1333787560154776, "learning_rate": 1.7170407210570539e-06, "loss": 0.0253, "step": 1800 }, { "epoch": 3.52, "grad_norm": 1.100693348147773, "learning_rate": 1.712935648135184e-06, "loss": 0.0286, "step": 1801 }, { "epoch": 3.52, "grad_norm": 1.1862772914719861, "learning_rate": 1.7088338977892866e-06, "loss": 0.0379, "step": 1802 }, { "epoch": 3.52, "grad_norm": 1.361455198634858, "learning_rate": 1.7047354776454897e-06, "loss": 0.046, "step": 1803 }, { "epoch": 3.52, "grad_norm": 1.2611036632648365, "learning_rate": 1.700640395323724e-06, "loss": 0.0235, "step": 1804 }, { "epoch": 3.53, "grad_norm": 1.232489268126895, "learning_rate": 1.6965486584377205e-06, "loss": 0.0243, "step": 1805 }, { "epoch": 3.53, "grad_norm": 1.1611886844386596, "learning_rate": 1.692460274594987e-06, "loss": 0.0266, "step": 1806 }, { "epoch": 3.53, "grad_norm": 1.358483245201853, "learning_rate": 1.6883752513967963e-06, "loss": 0.0295, "step": 1807 }, { "epoch": 3.53, "grad_norm": 1.398672240483201, "learning_rate": 1.6842935964381741e-06, "loss": 0.0392, "step": 1808 }, { "epoch": 3.53, "grad_norm": 1.1194897412571263, "learning_rate": 1.6802153173078865e-06, "loss": 0.0343, "step": 1809 }, { "epoch": 3.54, "grad_norm": 1.2167556922585687, "learning_rate": 1.6761404215884189e-06, "loss": 0.0264, "step": 1810 }, { "epoch": 3.54, "grad_norm": 1.4185657417297393, "learning_rate": 1.6720689168559663e-06, "loss": 0.0231, "step": 1811 }, { "epoch": 3.54, "grad_norm": 1.3140538165720745, "learning_rate": 1.6680008106804213e-06, "loss": 0.0245, "step": 1812 }, { "epoch": 3.54, "grad_norm": 1.1654513352404396, "learning_rate": 1.6639361106253595e-06, "loss": 0.0353, "step": 1813 }, { "epoch": 3.54, "grad_norm": 1.329420140134874, "learning_rate": 1.6598748242480173e-06, "loss": 0.0328, "step": 1814 }, { "epoch": 3.54, "grad_norm": 1.214656691453147, "learning_rate": 1.6558169590992901e-06, "loss": 0.0284, "step": 1815 }, { "epoch": 3.55, "grad_norm": 1.0939410879758171, "learning_rate": 1.651762522723712e-06, "loss": 0.0217, "step": 1816 }, { "epoch": 3.55, "grad_norm": 1.269027340519018, "learning_rate": 1.6477115226594378e-06, "loss": 0.032, "step": 1817 }, { "epoch": 3.55, "grad_norm": 1.274197911183843, "learning_rate": 1.643663966438239e-06, "loss": 0.0283, "step": 1818 }, { "epoch": 3.55, "grad_norm": 1.2729419659481929, "learning_rate": 1.6396198615854799e-06, "loss": 0.0277, "step": 1819 }, { "epoch": 3.55, "grad_norm": 1.3991809351788196, "learning_rate": 1.6355792156201085e-06, "loss": 0.0325, "step": 1820 }, { "epoch": 3.56, "grad_norm": 1.1975407284317257, "learning_rate": 1.6315420360546436e-06, "loss": 0.0243, "step": 1821 }, { "epoch": 3.56, "grad_norm": 1.2707818118900829, "learning_rate": 1.6275083303951604e-06, "loss": 0.0254, "step": 1822 }, { "epoch": 3.56, "grad_norm": 1.3909990564078634, "learning_rate": 1.62347810614127e-06, "loss": 0.0464, "step": 1823 }, { "epoch": 3.56, "grad_norm": 1.5766699033067544, "learning_rate": 1.619451370786116e-06, "loss": 0.0257, "step": 1824 }, { "epoch": 3.56, "grad_norm": 1.5093340937875748, "learning_rate": 1.6154281318163542e-06, "loss": 0.0524, "step": 1825 }, { "epoch": 3.57, "grad_norm": 1.3630079326895153, "learning_rate": 1.6114083967121365e-06, "loss": 0.0395, "step": 1826 }, { "epoch": 3.57, "grad_norm": 1.1952123841165598, "learning_rate": 1.607392172947105e-06, "loss": 0.0306, "step": 1827 }, { "epoch": 3.57, "grad_norm": 1.3752324351829306, "learning_rate": 1.60337946798837e-06, "loss": 0.035, "step": 1828 }, { "epoch": 3.57, "grad_norm": 1.2574062044931313, "learning_rate": 1.5993702892964996e-06, "loss": 0.0361, "step": 1829 }, { "epoch": 3.57, "grad_norm": 1.1982351742297714, "learning_rate": 1.5953646443255076e-06, "loss": 0.0337, "step": 1830 }, { "epoch": 3.58, "grad_norm": 1.1792295345050055, "learning_rate": 1.591362540522838e-06, "loss": 0.0269, "step": 1831 }, { "epoch": 3.58, "grad_norm": 1.2732944336598087, "learning_rate": 1.5873639853293484e-06, "loss": 0.0354, "step": 1832 }, { "epoch": 3.58, "grad_norm": 1.1532537334003756, "learning_rate": 1.5833689861793e-06, "loss": 0.0235, "step": 1833 }, { "epoch": 3.58, "grad_norm": 1.2883618503411318, "learning_rate": 1.5793775505003446e-06, "loss": 0.0285, "step": 1834 }, { "epoch": 3.58, "grad_norm": 1.1657520993243211, "learning_rate": 1.5753896857135043e-06, "loss": 0.0381, "step": 1835 }, { "epoch": 3.59, "grad_norm": 1.228675624243318, "learning_rate": 1.5714053992331667e-06, "loss": 0.0209, "step": 1836 }, { "epoch": 3.59, "grad_norm": 1.157138083872014, "learning_rate": 1.5674246984670614e-06, "loss": 0.0341, "step": 1837 }, { "epoch": 3.59, "grad_norm": 1.3453024428551257, "learning_rate": 1.5634475908162573e-06, "loss": 0.0259, "step": 1838 }, { "epoch": 3.59, "grad_norm": 1.198959865451635, "learning_rate": 1.5594740836751365e-06, "loss": 0.0313, "step": 1839 }, { "epoch": 3.59, "grad_norm": 1.1803248138757985, "learning_rate": 1.5555041844313931e-06, "loss": 0.0222, "step": 1840 }, { "epoch": 3.6, "grad_norm": 1.0922762273443871, "learning_rate": 1.5515379004660076e-06, "loss": 0.0308, "step": 1841 }, { "epoch": 3.6, "grad_norm": 1.2879015392974262, "learning_rate": 1.5475752391532423e-06, "loss": 0.0244, "step": 1842 }, { "epoch": 3.6, "grad_norm": 1.1738282134843503, "learning_rate": 1.5436162078606252e-06, "loss": 0.0197, "step": 1843 }, { "epoch": 3.6, "grad_norm": 1.1602413315048294, "learning_rate": 1.5396608139489307e-06, "loss": 0.0212, "step": 1844 }, { "epoch": 3.6, "grad_norm": 1.1561400826162904, "learning_rate": 1.5357090647721752e-06, "loss": 0.0297, "step": 1845 }, { "epoch": 3.61, "grad_norm": 1.1766033918371301, "learning_rate": 1.5317609676775944e-06, "loss": 0.029, "step": 1846 }, { "epoch": 3.61, "grad_norm": 1.1357685159180064, "learning_rate": 1.5278165300056381e-06, "loss": 0.0198, "step": 1847 }, { "epoch": 3.61, "grad_norm": 1.1514475756408453, "learning_rate": 1.5238757590899485e-06, "loss": 0.0251, "step": 1848 }, { "epoch": 3.61, "grad_norm": 1.2837036551225771, "learning_rate": 1.5199386622573537e-06, "loss": 0.0419, "step": 1849 }, { "epoch": 3.61, "grad_norm": 1.2556775522923334, "learning_rate": 1.5160052468278497e-06, "loss": 0.0241, "step": 1850 }, { "epoch": 3.62, "grad_norm": 1.1264568399109551, "learning_rate": 1.5120755201145856e-06, "loss": 0.0342, "step": 1851 }, { "epoch": 3.62, "grad_norm": 1.5140866954568868, "learning_rate": 1.5081494894238554e-06, "loss": 0.0329, "step": 1852 }, { "epoch": 3.62, "grad_norm": 1.530337834985548, "learning_rate": 1.504227162055082e-06, "loss": 0.0243, "step": 1853 }, { "epoch": 3.62, "grad_norm": 1.350407789132536, "learning_rate": 1.500308545300799e-06, "loss": 0.0251, "step": 1854 }, { "epoch": 3.62, "grad_norm": 1.3898368716266143, "learning_rate": 1.4963936464466426e-06, "loss": 0.035, "step": 1855 }, { "epoch": 3.62, "grad_norm": 1.210495935917907, "learning_rate": 1.4924824727713396e-06, "loss": 0.0209, "step": 1856 }, { "epoch": 3.63, "grad_norm": 1.2677442297900536, "learning_rate": 1.4885750315466856e-06, "loss": 0.0328, "step": 1857 }, { "epoch": 3.63, "grad_norm": 1.4330053854229505, "learning_rate": 1.4846713300375413e-06, "loss": 0.0279, "step": 1858 }, { "epoch": 3.63, "grad_norm": 1.199046935673201, "learning_rate": 1.4807713755018133e-06, "loss": 0.0202, "step": 1859 }, { "epoch": 3.63, "grad_norm": 1.374861772103957, "learning_rate": 1.4768751751904387e-06, "loss": 0.0341, "step": 1860 }, { "epoch": 3.63, "grad_norm": 1.128463223244296, "learning_rate": 1.472982736347378e-06, "loss": 0.0224, "step": 1861 }, { "epoch": 3.64, "grad_norm": 0.8825009679592597, "learning_rate": 1.4690940662095984e-06, "loss": 0.0196, "step": 1862 }, { "epoch": 3.64, "grad_norm": 1.3064189795249164, "learning_rate": 1.4652091720070573e-06, "loss": 0.0316, "step": 1863 }, { "epoch": 3.64, "grad_norm": 1.1360100501672261, "learning_rate": 1.4613280609626928e-06, "loss": 0.0287, "step": 1864 }, { "epoch": 3.64, "grad_norm": 1.052091132610523, "learning_rate": 1.4574507402924117e-06, "loss": 0.0232, "step": 1865 }, { "epoch": 3.64, "grad_norm": 1.2367589272367954, "learning_rate": 1.4535772172050692e-06, "loss": 0.0243, "step": 1866 }, { "epoch": 3.65, "grad_norm": 1.3044128398311285, "learning_rate": 1.449707498902464e-06, "loss": 0.0259, "step": 1867 }, { "epoch": 3.65, "grad_norm": 1.144479854424098, "learning_rate": 1.4458415925793196e-06, "loss": 0.0186, "step": 1868 }, { "epoch": 3.65, "grad_norm": 1.3505817048439366, "learning_rate": 1.4419795054232702e-06, "loss": 0.0339, "step": 1869 }, { "epoch": 3.65, "grad_norm": 1.1655777626146104, "learning_rate": 1.4381212446148507e-06, "loss": 0.0269, "step": 1870 }, { "epoch": 3.65, "grad_norm": 1.1986929777672048, "learning_rate": 1.4342668173274843e-06, "loss": 0.0215, "step": 1871 }, { "epoch": 3.66, "grad_norm": 1.2989255584287611, "learning_rate": 1.4304162307274625e-06, "loss": 0.0291, "step": 1872 }, { "epoch": 3.66, "grad_norm": 1.273186785294215, "learning_rate": 1.4265694919739373e-06, "loss": 0.0295, "step": 1873 }, { "epoch": 3.66, "grad_norm": 1.3346271137900423, "learning_rate": 1.422726608218908e-06, "loss": 0.0304, "step": 1874 }, { "epoch": 3.66, "grad_norm": 1.0699078574245535, "learning_rate": 1.4188875866072074e-06, "loss": 0.0258, "step": 1875 }, { "epoch": 3.66, "grad_norm": 1.3792386622734427, "learning_rate": 1.4150524342764833e-06, "loss": 0.0412, "step": 1876 }, { "epoch": 3.67, "grad_norm": 1.2646531669190992, "learning_rate": 1.4112211583571942e-06, "loss": 0.0237, "step": 1877 }, { "epoch": 3.67, "grad_norm": 1.0546089898789466, "learning_rate": 1.4073937659725903e-06, "loss": 0.0269, "step": 1878 }, { "epoch": 3.67, "grad_norm": 1.1821836110791397, "learning_rate": 1.4035702642386989e-06, "loss": 0.0438, "step": 1879 }, { "epoch": 3.67, "grad_norm": 1.0172570219879105, "learning_rate": 1.399750660264317e-06, "loss": 0.0189, "step": 1880 }, { "epoch": 3.67, "grad_norm": 1.2417944132495493, "learning_rate": 1.3959349611509929e-06, "loss": 0.0402, "step": 1881 }, { "epoch": 3.68, "grad_norm": 1.1702292276808366, "learning_rate": 1.3921231739930136e-06, "loss": 0.0242, "step": 1882 }, { "epoch": 3.68, "grad_norm": 1.5179202674777859, "learning_rate": 1.3883153058773957e-06, "loss": 0.0379, "step": 1883 }, { "epoch": 3.68, "grad_norm": 1.2618008078282108, "learning_rate": 1.384511363883869e-06, "loss": 0.0401, "step": 1884 }, { "epoch": 3.68, "grad_norm": 1.3953750674675314, "learning_rate": 1.380711355084861e-06, "loss": 0.0335, "step": 1885 }, { "epoch": 3.68, "grad_norm": 1.1237800060844048, "learning_rate": 1.3769152865454887e-06, "loss": 0.023, "step": 1886 }, { "epoch": 3.69, "grad_norm": 1.4028018926332042, "learning_rate": 1.3731231653235445e-06, "loss": 0.032, "step": 1887 }, { "epoch": 3.69, "grad_norm": 1.2778446346777779, "learning_rate": 1.3693349984694776e-06, "loss": 0.0359, "step": 1888 }, { "epoch": 3.69, "grad_norm": 1.1283049233380502, "learning_rate": 1.3655507930263885e-06, "loss": 0.0343, "step": 1889 }, { "epoch": 3.69, "grad_norm": 1.2482123056849932, "learning_rate": 1.3617705560300144e-06, "loss": 0.0223, "step": 1890 }, { "epoch": 3.69, "grad_norm": 0.9976449812997137, "learning_rate": 1.3579942945087064e-06, "loss": 0.018, "step": 1891 }, { "epoch": 3.7, "grad_norm": 1.038831075795062, "learning_rate": 1.3542220154834316e-06, "loss": 0.0251, "step": 1892 }, { "epoch": 3.7, "grad_norm": 1.160238245277424, "learning_rate": 1.3504537259677512e-06, "loss": 0.0304, "step": 1893 }, { "epoch": 3.7, "grad_norm": 1.2000080072457897, "learning_rate": 1.3466894329678065e-06, "loss": 0.0215, "step": 1894 }, { "epoch": 3.7, "grad_norm": 1.3046340669199037, "learning_rate": 1.3429291434823101e-06, "loss": 0.0432, "step": 1895 }, { "epoch": 3.7, "grad_norm": 1.0340527897428735, "learning_rate": 1.339172864502533e-06, "loss": 0.0211, "step": 1896 }, { "epoch": 3.71, "grad_norm": 1.0224965093329759, "learning_rate": 1.3354206030122852e-06, "loss": 0.0159, "step": 1897 }, { "epoch": 3.71, "grad_norm": 1.1958986300915877, "learning_rate": 1.3316723659879105e-06, "loss": 0.0375, "step": 1898 }, { "epoch": 3.71, "grad_norm": 1.3869385943279, "learning_rate": 1.3279281603982706e-06, "loss": 0.0268, "step": 1899 }, { "epoch": 3.71, "grad_norm": 0.9923128351448146, "learning_rate": 1.32418799320473e-06, "loss": 0.0156, "step": 1900 }, { "epoch": 3.71, "grad_norm": 1.1206343679267545, "learning_rate": 1.3204518713611436e-06, "loss": 0.0237, "step": 1901 }, { "epoch": 3.71, "grad_norm": 1.2149691755511396, "learning_rate": 1.316719801813849e-06, "loss": 0.0323, "step": 1902 }, { "epoch": 3.72, "grad_norm": 1.2329474169136723, "learning_rate": 1.3129917915016482e-06, "loss": 0.0301, "step": 1903 }, { "epoch": 3.72, "grad_norm": 1.2093762273457238, "learning_rate": 1.3092678473557933e-06, "loss": 0.028, "step": 1904 }, { "epoch": 3.72, "grad_norm": 1.0292062618454563, "learning_rate": 1.3055479762999807e-06, "loss": 0.024, "step": 1905 }, { "epoch": 3.72, "grad_norm": 0.9831064103766358, "learning_rate": 1.3018321852503304e-06, "loss": 0.0197, "step": 1906 }, { "epoch": 3.72, "grad_norm": 1.1907685530372774, "learning_rate": 1.2981204811153784e-06, "loss": 0.0311, "step": 1907 }, { "epoch": 3.73, "grad_norm": 1.1880925642993387, "learning_rate": 1.294412870796064e-06, "loss": 0.0244, "step": 1908 }, { "epoch": 3.73, "grad_norm": 1.3161743784894093, "learning_rate": 1.2907093611857113e-06, "loss": 0.0239, "step": 1909 }, { "epoch": 3.73, "grad_norm": 1.3183253729184619, "learning_rate": 1.287009959170021e-06, "loss": 0.0425, "step": 1910 }, { "epoch": 3.73, "grad_norm": 1.175826787424256, "learning_rate": 1.283314671627059e-06, "loss": 0.0262, "step": 1911 }, { "epoch": 3.73, "grad_norm": 1.14243912961959, "learning_rate": 1.2796235054272411e-06, "loss": 0.0303, "step": 1912 }, { "epoch": 3.74, "grad_norm": 1.3627453645467191, "learning_rate": 1.2759364674333183e-06, "loss": 0.0368, "step": 1913 }, { "epoch": 3.74, "grad_norm": 1.1126270092799495, "learning_rate": 1.2722535645003675e-06, "loss": 0.0239, "step": 1914 }, { "epoch": 3.74, "grad_norm": 1.2216527603244398, "learning_rate": 1.26857480347578e-06, "loss": 0.026, "step": 1915 }, { "epoch": 3.74, "grad_norm": 1.3979237616701123, "learning_rate": 1.2649001911992413e-06, "loss": 0.0304, "step": 1916 }, { "epoch": 3.74, "grad_norm": 1.2582260302211332, "learning_rate": 1.2612297345027284e-06, "loss": 0.0279, "step": 1917 }, { "epoch": 3.75, "grad_norm": 0.9827021082973235, "learning_rate": 1.2575634402104883e-06, "loss": 0.0142, "step": 1918 }, { "epoch": 3.75, "grad_norm": 1.1355653189059722, "learning_rate": 1.2539013151390298e-06, "loss": 0.0301, "step": 1919 }, { "epoch": 3.75, "grad_norm": 1.2100676379129354, "learning_rate": 1.2502433660971122e-06, "loss": 0.0275, "step": 1920 }, { "epoch": 3.75, "grad_norm": 1.3780602526811492, "learning_rate": 1.2465895998857306e-06, "loss": 0.0274, "step": 1921 }, { "epoch": 3.75, "grad_norm": 1.1485938781992373, "learning_rate": 1.2429400232980989e-06, "loss": 0.02, "step": 1922 }, { "epoch": 3.76, "grad_norm": 1.369533264798539, "learning_rate": 1.2392946431196465e-06, "loss": 0.0304, "step": 1923 }, { "epoch": 3.76, "grad_norm": 1.226825404461134, "learning_rate": 1.2356534661279994e-06, "loss": 0.029, "step": 1924 }, { "epoch": 3.76, "grad_norm": 1.0469630919341775, "learning_rate": 1.2320164990929661e-06, "loss": 0.0119, "step": 1925 }, { "epoch": 3.76, "grad_norm": 1.0552888503087257, "learning_rate": 1.2283837487765322e-06, "loss": 0.0162, "step": 1926 }, { "epoch": 3.76, "grad_norm": 1.1892297746636973, "learning_rate": 1.22475522193284e-06, "loss": 0.0253, "step": 1927 }, { "epoch": 3.77, "grad_norm": 1.2975529575338485, "learning_rate": 1.2211309253081786e-06, "loss": 0.0284, "step": 1928 }, { "epoch": 3.77, "grad_norm": 1.1595093901156592, "learning_rate": 1.2175108656409762e-06, "loss": 0.023, "step": 1929 }, { "epoch": 3.77, "grad_norm": 1.214142459483469, "learning_rate": 1.213895049661782e-06, "loss": 0.0286, "step": 1930 }, { "epoch": 3.77, "grad_norm": 1.1982582243859345, "learning_rate": 1.2102834840932523e-06, "loss": 0.0322, "step": 1931 }, { "epoch": 3.77, "grad_norm": 1.2088200734823387, "learning_rate": 1.2066761756501436e-06, "loss": 0.0302, "step": 1932 }, { "epoch": 3.78, "grad_norm": 1.291669853962898, "learning_rate": 1.2030731310392987e-06, "loss": 0.0337, "step": 1933 }, { "epoch": 3.78, "grad_norm": 1.1388501965558617, "learning_rate": 1.1994743569596289e-06, "loss": 0.0239, "step": 1934 }, { "epoch": 3.78, "grad_norm": 1.068972612355637, "learning_rate": 1.195879860102109e-06, "loss": 0.0165, "step": 1935 }, { "epoch": 3.78, "grad_norm": 1.1403158518046737, "learning_rate": 1.192289647149759e-06, "loss": 0.0204, "step": 1936 }, { "epoch": 3.78, "grad_norm": 1.4783200162399968, "learning_rate": 1.188703724777637e-06, "loss": 0.0224, "step": 1937 }, { "epoch": 3.79, "grad_norm": 1.0822791337227284, "learning_rate": 1.1851220996528198e-06, "loss": 0.0298, "step": 1938 }, { "epoch": 3.79, "grad_norm": 1.239659742550553, "learning_rate": 1.1815447784343984e-06, "loss": 0.0397, "step": 1939 }, { "epoch": 3.79, "grad_norm": 1.3151225644833235, "learning_rate": 1.1779717677734615e-06, "loss": 0.0287, "step": 1940 }, { "epoch": 3.79, "grad_norm": 1.1093182432530133, "learning_rate": 1.17440307431308e-06, "loss": 0.0254, "step": 1941 }, { "epoch": 3.79, "grad_norm": 1.1013845237533624, "learning_rate": 1.1708387046883027e-06, "loss": 0.0189, "step": 1942 }, { "epoch": 3.79, "grad_norm": 1.2214596882360482, "learning_rate": 1.1672786655261346e-06, "loss": 0.0273, "step": 1943 }, { "epoch": 3.8, "grad_norm": 1.293804374706724, "learning_rate": 1.1637229634455348e-06, "loss": 0.0203, "step": 1944 }, { "epoch": 3.8, "grad_norm": 1.2170331482683112, "learning_rate": 1.160171605057393e-06, "loss": 0.0387, "step": 1945 }, { "epoch": 3.8, "grad_norm": 1.2853577023443115, "learning_rate": 1.1566245969645276e-06, "loss": 0.0338, "step": 1946 }, { "epoch": 3.8, "grad_norm": 1.3848598880701, "learning_rate": 1.1530819457616656e-06, "loss": 0.0378, "step": 1947 }, { "epoch": 3.8, "grad_norm": 1.2336876995171338, "learning_rate": 1.1495436580354353e-06, "loss": 0.033, "step": 1948 }, { "epoch": 3.81, "grad_norm": 1.0237828333672152, "learning_rate": 1.1460097403643532e-06, "loss": 0.0196, "step": 1949 }, { "epoch": 3.81, "grad_norm": 1.3932171470760517, "learning_rate": 1.142480199318807e-06, "loss": 0.0358, "step": 1950 }, { "epoch": 3.81, "grad_norm": 1.423984403681504, "learning_rate": 1.1389550414610507e-06, "loss": 0.024, "step": 1951 }, { "epoch": 3.81, "grad_norm": 1.1085037746789896, "learning_rate": 1.135434273345189e-06, "loss": 0.0347, "step": 1952 }, { "epoch": 3.81, "grad_norm": 1.1836148485662752, "learning_rate": 1.1319179015171633e-06, "loss": 0.0235, "step": 1953 }, { "epoch": 3.82, "grad_norm": 1.129763527892518, "learning_rate": 1.1284059325147396e-06, "loss": 0.0499, "step": 1954 }, { "epoch": 3.82, "grad_norm": 1.4117732036351196, "learning_rate": 1.1248983728675037e-06, "loss": 0.0301, "step": 1955 }, { "epoch": 3.82, "grad_norm": 1.1408377633258076, "learning_rate": 1.1213952290968368e-06, "loss": 0.0341, "step": 1956 }, { "epoch": 3.82, "grad_norm": 1.3750215163239812, "learning_rate": 1.1178965077159144e-06, "loss": 0.036, "step": 1957 }, { "epoch": 3.82, "grad_norm": 1.0794847468669242, "learning_rate": 1.1144022152296895e-06, "loss": 0.0286, "step": 1958 }, { "epoch": 3.83, "grad_norm": 1.010044436737873, "learning_rate": 1.110912358134877e-06, "loss": 0.025, "step": 1959 }, { "epoch": 3.83, "grad_norm": 1.2548304737843625, "learning_rate": 1.1074269429199503e-06, "loss": 0.0198, "step": 1960 }, { "epoch": 3.83, "grad_norm": 1.2237905740853667, "learning_rate": 1.1039459760651216e-06, "loss": 0.0217, "step": 1961 }, { "epoch": 3.83, "grad_norm": 1.2678855804796023, "learning_rate": 1.1004694640423325e-06, "loss": 0.0464, "step": 1962 }, { "epoch": 3.83, "grad_norm": 0.9101783051336376, "learning_rate": 1.0969974133152416e-06, "loss": 0.0212, "step": 1963 }, { "epoch": 3.84, "grad_norm": 1.103411070047276, "learning_rate": 1.093529830339214e-06, "loss": 0.0222, "step": 1964 }, { "epoch": 3.84, "grad_norm": 1.1171698746858267, "learning_rate": 1.09006672156131e-06, "loss": 0.0413, "step": 1965 }, { "epoch": 3.84, "grad_norm": 1.0367031425331994, "learning_rate": 1.0866080934202657e-06, "loss": 0.0248, "step": 1966 }, { "epoch": 3.84, "grad_norm": 1.0508402348096133, "learning_rate": 1.0831539523464935e-06, "loss": 0.0264, "step": 1967 }, { "epoch": 3.84, "grad_norm": 1.1303159520438908, "learning_rate": 1.0797043047620575e-06, "loss": 0.0171, "step": 1968 }, { "epoch": 3.85, "grad_norm": 1.0399463783347411, "learning_rate": 1.0762591570806703e-06, "loss": 0.0228, "step": 1969 }, { "epoch": 3.85, "grad_norm": 1.2463384480790431, "learning_rate": 1.072818515707679e-06, "loss": 0.0159, "step": 1970 }, { "epoch": 3.85, "grad_norm": 1.2350130795920797, "learning_rate": 1.0693823870400503e-06, "loss": 0.0259, "step": 1971 }, { "epoch": 3.85, "grad_norm": 1.3975162779301797, "learning_rate": 1.0659507774663595e-06, "loss": 0.0299, "step": 1972 }, { "epoch": 3.85, "grad_norm": 1.2391630926192763, "learning_rate": 1.0625236933667838e-06, "loss": 0.0303, "step": 1973 }, { "epoch": 3.86, "grad_norm": 1.2424588440010942, "learning_rate": 1.0591011411130844e-06, "loss": 0.0258, "step": 1974 }, { "epoch": 3.86, "grad_norm": 1.318485822476663, "learning_rate": 1.0556831270685953e-06, "loss": 0.0224, "step": 1975 }, { "epoch": 3.86, "grad_norm": 1.16795848898562, "learning_rate": 1.0522696575882148e-06, "loss": 0.0245, "step": 1976 }, { "epoch": 3.86, "grad_norm": 1.2426046866622853, "learning_rate": 1.048860739018393e-06, "loss": 0.0372, "step": 1977 }, { "epoch": 3.86, "grad_norm": 1.2371654481595447, "learning_rate": 1.0454563776971147e-06, "loss": 0.022, "step": 1978 }, { "epoch": 3.87, "grad_norm": 1.080894866883366, "learning_rate": 1.042056579953895e-06, "loss": 0.0268, "step": 1979 }, { "epoch": 3.87, "grad_norm": 1.1468003730509742, "learning_rate": 1.0386613521097656e-06, "loss": 0.0327, "step": 1980 }, { "epoch": 3.87, "grad_norm": 1.3514762669757572, "learning_rate": 1.0352707004772549e-06, "loss": 0.0251, "step": 1981 }, { "epoch": 3.87, "grad_norm": 1.1969762916666522, "learning_rate": 1.0318846313603895e-06, "loss": 0.0371, "step": 1982 }, { "epoch": 3.87, "grad_norm": 1.0791771284519454, "learning_rate": 1.0285031510546756e-06, "loss": 0.0252, "step": 1983 }, { "epoch": 3.88, "grad_norm": 1.2414121032735026, "learning_rate": 1.0251262658470838e-06, "loss": 0.0287, "step": 1984 }, { "epoch": 3.88, "grad_norm": 0.8861760218531249, "learning_rate": 1.0217539820160445e-06, "loss": 0.0142, "step": 1985 }, { "epoch": 3.88, "grad_norm": 1.436000440219443, "learning_rate": 1.0183863058314338e-06, "loss": 0.0299, "step": 1986 }, { "epoch": 3.88, "grad_norm": 1.228757867802478, "learning_rate": 1.0150232435545569e-06, "loss": 0.0386, "step": 1987 }, { "epoch": 3.88, "grad_norm": 1.323870334674649, "learning_rate": 1.0116648014381442e-06, "loss": 0.0251, "step": 1988 }, { "epoch": 3.88, "grad_norm": 1.153135040409466, "learning_rate": 1.0083109857263376e-06, "loss": 0.0291, "step": 1989 }, { "epoch": 3.89, "grad_norm": 1.0844958859669123, "learning_rate": 1.0049618026546712e-06, "loss": 0.0209, "step": 1990 }, { "epoch": 3.89, "grad_norm": 1.3900139309694965, "learning_rate": 1.001617258450071e-06, "loss": 0.0311, "step": 1991 }, { "epoch": 3.89, "grad_norm": 1.0982457101373109, "learning_rate": 9.982773593308383e-07, "loss": 0.0241, "step": 1992 }, { "epoch": 3.89, "grad_norm": 1.1190965343069772, "learning_rate": 9.94942111506635e-07, "loss": 0.0326, "step": 1993 }, { "epoch": 3.89, "grad_norm": 1.4006601927443192, "learning_rate": 9.916115211784778e-07, "loss": 0.0189, "step": 1994 }, { "epoch": 3.9, "grad_norm": 1.4127453799199627, "learning_rate": 9.882855945387237e-07, "loss": 0.0442, "step": 1995 }, { "epoch": 3.9, "grad_norm": 1.1825615974746688, "learning_rate": 9.849643377710566e-07, "loss": 0.0344, "step": 1996 }, { "epoch": 3.9, "grad_norm": 1.1686500680161216, "learning_rate": 9.816477570504808e-07, "loss": 0.02, "step": 1997 }, { "epoch": 3.9, "grad_norm": 1.0633206288292212, "learning_rate": 9.78335858543306e-07, "loss": 0.0221, "step": 1998 }, { "epoch": 3.9, "grad_norm": 1.2444975652689307, "learning_rate": 9.750286484071358e-07, "loss": 0.0369, "step": 1999 }, { "epoch": 3.91, "grad_norm": 1.1650526740424076, "learning_rate": 9.717261327908557e-07, "loss": 0.0258, "step": 2000 }, { "epoch": 3.91, "grad_norm": 1.245126036264225, "learning_rate": 9.684283178346259e-07, "loss": 0.0355, "step": 2001 }, { "epoch": 3.91, "grad_norm": 1.3258770940648212, "learning_rate": 9.651352096698663e-07, "loss": 0.0369, "step": 2002 }, { "epoch": 3.91, "grad_norm": 1.139002576245059, "learning_rate": 9.61846814419243e-07, "loss": 0.0331, "step": 2003 }, { "epoch": 3.91, "grad_norm": 1.2528658131724384, "learning_rate": 9.585631381966645e-07, "loss": 0.0225, "step": 2004 }, { "epoch": 3.92, "grad_norm": 1.270974004702528, "learning_rate": 9.552841871072603e-07, "loss": 0.0449, "step": 2005 }, { "epoch": 3.92, "grad_norm": 1.2088756318964737, "learning_rate": 9.520099672473782e-07, "loss": 0.0247, "step": 2006 }, { "epoch": 3.92, "grad_norm": 1.4389756528059545, "learning_rate": 9.487404847045695e-07, "loss": 0.0223, "step": 2007 }, { "epoch": 3.92, "grad_norm": 1.1702066103774558, "learning_rate": 9.454757455575762e-07, "loss": 0.0206, "step": 2008 }, { "epoch": 3.92, "grad_norm": 0.9252817137552869, "learning_rate": 9.422157558763201e-07, "loss": 0.0175, "step": 2009 }, { "epoch": 3.93, "grad_norm": 1.2935298646783868, "learning_rate": 9.389605217218959e-07, "loss": 0.035, "step": 2010 }, { "epoch": 3.93, "grad_norm": 1.041314239226113, "learning_rate": 9.357100491465556e-07, "loss": 0.0249, "step": 2011 }, { "epoch": 3.93, "grad_norm": 1.1202295720370559, "learning_rate": 9.324643441936959e-07, "loss": 0.0276, "step": 2012 }, { "epoch": 3.93, "grad_norm": 1.4120869113541024, "learning_rate": 9.292234128978525e-07, "loss": 0.0327, "step": 2013 }, { "epoch": 3.93, "grad_norm": 1.2040482121906777, "learning_rate": 9.25987261284685e-07, "loss": 0.0352, "step": 2014 }, { "epoch": 3.94, "grad_norm": 1.1190745036920822, "learning_rate": 9.227558953709638e-07, "loss": 0.0338, "step": 2015 }, { "epoch": 3.94, "grad_norm": 1.2317143122745648, "learning_rate": 9.195293211645661e-07, "loss": 0.0282, "step": 2016 }, { "epoch": 3.94, "grad_norm": 1.2160277172143172, "learning_rate": 9.163075446644564e-07, "loss": 0.0241, "step": 2017 }, { "epoch": 3.94, "grad_norm": 1.3489146307224882, "learning_rate": 9.130905718606795e-07, "loss": 0.0392, "step": 2018 }, { "epoch": 3.94, "grad_norm": 1.22184867412566, "learning_rate": 9.098784087343511e-07, "loss": 0.0293, "step": 2019 }, { "epoch": 3.95, "grad_norm": 1.1284183694047383, "learning_rate": 9.066710612576439e-07, "loss": 0.0283, "step": 2020 }, { "epoch": 3.95, "grad_norm": 1.088057439223084, "learning_rate": 9.034685353937748e-07, "loss": 0.0181, "step": 2021 }, { "epoch": 3.95, "grad_norm": 1.4050480506860974, "learning_rate": 9.002708370969993e-07, "loss": 0.0386, "step": 2022 }, { "epoch": 3.95, "grad_norm": 1.3315903040728985, "learning_rate": 8.97077972312597e-07, "loss": 0.0398, "step": 2023 }, { "epoch": 3.95, "grad_norm": 1.3376471530285083, "learning_rate": 8.938899469768581e-07, "loss": 0.0385, "step": 2024 }, { "epoch": 3.96, "grad_norm": 1.0758571394040741, "learning_rate": 8.907067670170782e-07, "loss": 0.0131, "step": 2025 }, { "epoch": 3.96, "grad_norm": 1.114287760767581, "learning_rate": 8.875284383515417e-07, "loss": 0.0254, "step": 2026 }, { "epoch": 3.96, "grad_norm": 1.1281104600760914, "learning_rate": 8.843549668895162e-07, "loss": 0.0286, "step": 2027 }, { "epoch": 3.96, "grad_norm": 1.0042863979284826, "learning_rate": 8.811863585312348e-07, "loss": 0.0164, "step": 2028 }, { "epoch": 3.96, "grad_norm": 1.2700633264039356, "learning_rate": 8.780226191678929e-07, "loss": 0.0146, "step": 2029 }, { "epoch": 3.96, "grad_norm": 1.2207481557060824, "learning_rate": 8.748637546816303e-07, "loss": 0.0299, "step": 2030 }, { "epoch": 3.97, "grad_norm": 1.187032099322897, "learning_rate": 8.717097709455242e-07, "loss": 0.0329, "step": 2031 }, { "epoch": 3.97, "grad_norm": 1.357106066750458, "learning_rate": 8.685606738235796e-07, "loss": 0.0395, "step": 2032 }, { "epoch": 3.97, "grad_norm": 1.0339868641956136, "learning_rate": 8.654164691707113e-07, "loss": 0.0183, "step": 2033 }, { "epoch": 3.97, "grad_norm": 1.1085003474978818, "learning_rate": 8.622771628327429e-07, "loss": 0.021, "step": 2034 }, { "epoch": 3.97, "grad_norm": 1.1806093623359606, "learning_rate": 8.591427606463867e-07, "loss": 0.0277, "step": 2035 }, { "epoch": 3.98, "grad_norm": 1.17177376012875, "learning_rate": 8.560132684392404e-07, "loss": 0.0321, "step": 2036 }, { "epoch": 3.98, "grad_norm": 1.6169743135593246, "learning_rate": 8.528886920297698e-07, "loss": 0.0277, "step": 2037 }, { "epoch": 3.98, "grad_norm": 1.140328619814552, "learning_rate": 8.49769037227304e-07, "loss": 0.0213, "step": 2038 }, { "epoch": 3.98, "grad_norm": 1.195645997398998, "learning_rate": 8.466543098320205e-07, "loss": 0.029, "step": 2039 }, { "epoch": 3.98, "grad_norm": 1.3533166672547217, "learning_rate": 8.435445156349334e-07, "loss": 0.0399, "step": 2040 }, { "epoch": 3.99, "grad_norm": 1.1173989016253878, "learning_rate": 8.404396604178883e-07, "loss": 0.0191, "step": 2041 }, { "epoch": 3.99, "grad_norm": 1.230772572488412, "learning_rate": 8.373397499535475e-07, "loss": 0.0444, "step": 2042 }, { "epoch": 3.99, "grad_norm": 1.0543064343907143, "learning_rate": 8.342447900053779e-07, "loss": 0.0184, "step": 2043 }, { "epoch": 3.99, "grad_norm": 1.2591324770618777, "learning_rate": 8.311547863276417e-07, "loss": 0.0262, "step": 2044 }, { "epoch": 3.99, "grad_norm": 1.178987855783076, "learning_rate": 8.280697446653906e-07, "loss": 0.03, "step": 2045 }, { "epoch": 4.0, "grad_norm": 1.3371991604999345, "learning_rate": 8.249896707544451e-07, "loss": 0.0277, "step": 2046 }, { "epoch": 4.0, "grad_norm": 1.3059639803565715, "learning_rate": 8.219145703213937e-07, "loss": 0.0213, "step": 2047 }, { "epoch": 4.0, "grad_norm": 1.12982799431942, "learning_rate": 8.188444490835774e-07, "loss": 0.0264, "step": 2048 }, { "epoch": 4.0, "grad_norm": 1.1484659571902813, "learning_rate": 8.157793127490769e-07, "loss": 0.0229, "step": 2049 }, { "epoch": 4.0, "grad_norm": 0.9610241679350182, "learning_rate": 8.127191670167078e-07, "loss": 0.0112, "step": 2050 }, { "epoch": 4.01, "grad_norm": 1.0112149023567367, "learning_rate": 8.096640175760066e-07, "loss": 0.0286, "step": 2051 }, { "epoch": 4.01, "grad_norm": 1.2080308119199783, "learning_rate": 8.066138701072195e-07, "loss": 0.0331, "step": 2052 }, { "epoch": 4.01, "grad_norm": 1.1975246232842343, "learning_rate": 8.035687302812919e-07, "loss": 0.0155, "step": 2053 }, { "epoch": 4.01, "grad_norm": 0.9550960031835776, "learning_rate": 8.005286037598621e-07, "loss": 0.017, "step": 2054 }, { "epoch": 4.01, "grad_norm": 1.2856161720851422, "learning_rate": 7.974934961952433e-07, "loss": 0.0333, "step": 2055 }, { "epoch": 4.02, "grad_norm": 1.0585801447884915, "learning_rate": 7.944634132304205e-07, "loss": 0.0261, "step": 2056 }, { "epoch": 4.02, "grad_norm": 0.8673539654914161, "learning_rate": 7.914383604990372e-07, "loss": 0.0186, "step": 2057 }, { "epoch": 4.02, "grad_norm": 1.1710925588826158, "learning_rate": 7.884183436253804e-07, "loss": 0.0315, "step": 2058 }, { "epoch": 4.02, "grad_norm": 1.044517159340853, "learning_rate": 7.854033682243785e-07, "loss": 0.0313, "step": 2059 }, { "epoch": 4.02, "grad_norm": 1.0489523681398114, "learning_rate": 7.823934399015856e-07, "loss": 0.0282, "step": 2060 }, { "epoch": 4.03, "grad_norm": 0.9625443848014793, "learning_rate": 7.793885642531703e-07, "loss": 0.0269, "step": 2061 }, { "epoch": 4.03, "grad_norm": 1.1277037453130168, "learning_rate": 7.763887468659081e-07, "loss": 0.0215, "step": 2062 }, { "epoch": 4.03, "grad_norm": 1.1585548826582217, "learning_rate": 7.733939933171702e-07, "loss": 0.0301, "step": 2063 }, { "epoch": 4.03, "grad_norm": 0.9858374321384156, "learning_rate": 7.704043091749143e-07, "loss": 0.0211, "step": 2064 }, { "epoch": 4.03, "grad_norm": 1.1064706998669787, "learning_rate": 7.674196999976693e-07, "loss": 0.0237, "step": 2065 }, { "epoch": 4.04, "grad_norm": 0.8908615326152374, "learning_rate": 7.644401713345332e-07, "loss": 0.0167, "step": 2066 }, { "epoch": 4.04, "grad_norm": 1.2802981263412527, "learning_rate": 7.614657287251531e-07, "loss": 0.0231, "step": 2067 }, { "epoch": 4.04, "grad_norm": 1.2132337358706469, "learning_rate": 7.584963776997237e-07, "loss": 0.0317, "step": 2068 }, { "epoch": 4.04, "grad_norm": 1.2196253328801046, "learning_rate": 7.555321237789723e-07, "loss": 0.0356, "step": 2069 }, { "epoch": 4.04, "grad_norm": 0.6393654259681957, "learning_rate": 7.525729724741495e-07, "loss": 0.0074, "step": 2070 }, { "epoch": 4.04, "grad_norm": 0.6941653161597121, "learning_rate": 7.496189292870161e-07, "loss": 0.0108, "step": 2071 }, { "epoch": 4.05, "grad_norm": 1.1683988200864264, "learning_rate": 7.466699997098405e-07, "loss": 0.0247, "step": 2072 }, { "epoch": 4.05, "grad_norm": 1.3336717122443975, "learning_rate": 7.437261892253815e-07, "loss": 0.0381, "step": 2073 }, { "epoch": 4.05, "grad_norm": 0.9071823972667638, "learning_rate": 7.407875033068782e-07, "loss": 0.017, "step": 2074 }, { "epoch": 4.05, "grad_norm": 1.2997444557249338, "learning_rate": 7.378539474180453e-07, "loss": 0.0242, "step": 2075 }, { "epoch": 4.05, "grad_norm": 1.077456109077788, "learning_rate": 7.349255270130589e-07, "loss": 0.0301, "step": 2076 }, { "epoch": 4.06, "grad_norm": 1.1788421362891677, "learning_rate": 7.320022475365443e-07, "loss": 0.0257, "step": 2077 }, { "epoch": 4.06, "grad_norm": 1.0902527263272932, "learning_rate": 7.290841144235711e-07, "loss": 0.0294, "step": 2078 }, { "epoch": 4.06, "grad_norm": 0.9375569604989108, "learning_rate": 7.261711330996429e-07, "loss": 0.0208, "step": 2079 }, { "epoch": 4.06, "grad_norm": 1.1215385319404456, "learning_rate": 7.232633089806773e-07, "loss": 0.0326, "step": 2080 }, { "epoch": 4.06, "grad_norm": 1.0578040175329957, "learning_rate": 7.203606474730107e-07, "loss": 0.015, "step": 2081 }, { "epoch": 4.07, "grad_norm": 0.925236628828232, "learning_rate": 7.174631539733795e-07, "loss": 0.0182, "step": 2082 }, { "epoch": 4.07, "grad_norm": 1.1248328982186209, "learning_rate": 7.145708338689079e-07, "loss": 0.0245, "step": 2083 }, { "epoch": 4.07, "grad_norm": 1.0517802488826553, "learning_rate": 7.116836925371055e-07, "loss": 0.024, "step": 2084 }, { "epoch": 4.07, "grad_norm": 1.0697945691216544, "learning_rate": 7.088017353458533e-07, "loss": 0.0237, "step": 2085 }, { "epoch": 4.07, "grad_norm": 0.9919826285969846, "learning_rate": 7.059249676533898e-07, "loss": 0.0246, "step": 2086 }, { "epoch": 4.08, "grad_norm": 1.0022662199194194, "learning_rate": 7.03053394808309e-07, "loss": 0.0209, "step": 2087 }, { "epoch": 4.08, "grad_norm": 0.8703516312788839, "learning_rate": 7.001870221495463e-07, "loss": 0.0159, "step": 2088 }, { "epoch": 4.08, "grad_norm": 0.9631491756284972, "learning_rate": 6.973258550063658e-07, "loss": 0.021, "step": 2089 }, { "epoch": 4.08, "grad_norm": 1.3291851850469065, "learning_rate": 6.944698986983546e-07, "loss": 0.0316, "step": 2090 }, { "epoch": 4.08, "grad_norm": 0.9608665360922574, "learning_rate": 6.91619158535414e-07, "loss": 0.0253, "step": 2091 }, { "epoch": 4.09, "grad_norm": 1.203089866628453, "learning_rate": 6.88773639817743e-07, "loss": 0.0263, "step": 2092 }, { "epoch": 4.09, "grad_norm": 1.118270717638135, "learning_rate": 6.859333478358361e-07, "loss": 0.0185, "step": 2093 }, { "epoch": 4.09, "grad_norm": 1.0759907397828548, "learning_rate": 6.830982878704702e-07, "loss": 0.0166, "step": 2094 }, { "epoch": 4.09, "grad_norm": 0.9374855310193274, "learning_rate": 6.802684651926911e-07, "loss": 0.0113, "step": 2095 }, { "epoch": 4.09, "grad_norm": 0.9701904600504303, "learning_rate": 6.774438850638107e-07, "loss": 0.0136, "step": 2096 }, { "epoch": 4.1, "grad_norm": 0.8445318246441742, "learning_rate": 6.74624552735393e-07, "loss": 0.0206, "step": 2097 }, { "epoch": 4.1, "grad_norm": 1.0316168891461546, "learning_rate": 6.718104734492447e-07, "loss": 0.0269, "step": 2098 }, { "epoch": 4.1, "grad_norm": 0.9970077321231461, "learning_rate": 6.69001652437404e-07, "loss": 0.0172, "step": 2099 }, { "epoch": 4.1, "grad_norm": 0.9315240609248536, "learning_rate": 6.661980949221356e-07, "loss": 0.0206, "step": 2100 }, { "epoch": 4.1, "grad_norm": 1.0404537715453244, "learning_rate": 6.633998061159187e-07, "loss": 0.0192, "step": 2101 }, { "epoch": 4.11, "grad_norm": 0.9705253428732203, "learning_rate": 6.606067912214323e-07, "loss": 0.0192, "step": 2102 }, { "epoch": 4.11, "grad_norm": 0.8737659401364778, "learning_rate": 6.578190554315545e-07, "loss": 0.0134, "step": 2103 }, { "epoch": 4.11, "grad_norm": 1.2779103501114688, "learning_rate": 6.550366039293471e-07, "loss": 0.0219, "step": 2104 }, { "epoch": 4.11, "grad_norm": 0.7984436439584951, "learning_rate": 6.522594418880442e-07, "loss": 0.0104, "step": 2105 }, { "epoch": 4.11, "grad_norm": 1.1618232851321377, "learning_rate": 6.494875744710507e-07, "loss": 0.0198, "step": 2106 }, { "epoch": 4.12, "grad_norm": 1.0100686740742302, "learning_rate": 6.467210068319233e-07, "loss": 0.0187, "step": 2107 }, { "epoch": 4.12, "grad_norm": 1.1166902033263455, "learning_rate": 6.439597441143655e-07, "loss": 0.0188, "step": 2108 }, { "epoch": 4.12, "grad_norm": 1.0079221534817127, "learning_rate": 6.412037914522204e-07, "loss": 0.0155, "step": 2109 }, { "epoch": 4.12, "grad_norm": 1.2331852717875365, "learning_rate": 6.384531539694574e-07, "loss": 0.0192, "step": 2110 }, { "epoch": 4.12, "grad_norm": 0.9485093970370633, "learning_rate": 6.357078367801617e-07, "loss": 0.0193, "step": 2111 }, { "epoch": 4.12, "grad_norm": 0.7558898252009129, "learning_rate": 6.329678449885283e-07, "loss": 0.0105, "step": 2112 }, { "epoch": 4.13, "grad_norm": 1.2212086221284981, "learning_rate": 6.302331836888529e-07, "loss": 0.0217, "step": 2113 }, { "epoch": 4.13, "grad_norm": 0.9387409586587668, "learning_rate": 6.275038579655167e-07, "loss": 0.0168, "step": 2114 }, { "epoch": 4.13, "grad_norm": 1.3029542301437902, "learning_rate": 6.24779872892984e-07, "loss": 0.0202, "step": 2115 }, { "epoch": 4.13, "grad_norm": 1.0072933701975189, "learning_rate": 6.22061233535788e-07, "loss": 0.0246, "step": 2116 }, { "epoch": 4.13, "grad_norm": 1.0291044686875792, "learning_rate": 6.193479449485223e-07, "loss": 0.02, "step": 2117 }, { "epoch": 4.14, "grad_norm": 1.0668147469534583, "learning_rate": 6.166400121758337e-07, "loss": 0.0158, "step": 2118 }, { "epoch": 4.14, "grad_norm": 0.9032127026769448, "learning_rate": 6.139374402524123e-07, "loss": 0.0182, "step": 2119 }, { "epoch": 4.14, "grad_norm": 1.0153077282405487, "learning_rate": 6.112402342029767e-07, "loss": 0.0114, "step": 2120 }, { "epoch": 4.14, "grad_norm": 0.918013345754676, "learning_rate": 6.08548399042274e-07, "loss": 0.0204, "step": 2121 }, { "epoch": 4.14, "grad_norm": 1.1792356258331511, "learning_rate": 6.058619397750635e-07, "loss": 0.0227, "step": 2122 }, { "epoch": 4.15, "grad_norm": 0.8603578273585719, "learning_rate": 6.03180861396108e-07, "loss": 0.0124, "step": 2123 }, { "epoch": 4.15, "grad_norm": 1.0060790320272608, "learning_rate": 6.005051688901686e-07, "loss": 0.0157, "step": 2124 }, { "epoch": 4.15, "grad_norm": 1.040674015717714, "learning_rate": 5.978348672319908e-07, "loss": 0.024, "step": 2125 }, { "epoch": 4.15, "grad_norm": 1.068212256237597, "learning_rate": 5.951699613862985e-07, "loss": 0.0245, "step": 2126 }, { "epoch": 4.15, "grad_norm": 1.0899567145980977, "learning_rate": 5.925104563077817e-07, "loss": 0.0135, "step": 2127 }, { "epoch": 4.16, "grad_norm": 1.0886060062717233, "learning_rate": 5.898563569410913e-07, "loss": 0.0228, "step": 2128 }, { "epoch": 4.16, "grad_norm": 1.0075055088453753, "learning_rate": 5.87207668220828e-07, "loss": 0.0169, "step": 2129 }, { "epoch": 4.16, "grad_norm": 1.1447859362674846, "learning_rate": 5.845643950715289e-07, "loss": 0.019, "step": 2130 }, { "epoch": 4.16, "grad_norm": 1.1322555700035328, "learning_rate": 5.819265424076679e-07, "loss": 0.025, "step": 2131 }, { "epoch": 4.16, "grad_norm": 1.0566614425528043, "learning_rate": 5.79294115133635e-07, "loss": 0.0256, "step": 2132 }, { "epoch": 4.17, "grad_norm": 0.8677293782001028, "learning_rate": 5.766671181437387e-07, "loss": 0.0105, "step": 2133 }, { "epoch": 4.17, "grad_norm": 1.0088998011289378, "learning_rate": 5.740455563221866e-07, "loss": 0.0162, "step": 2134 }, { "epoch": 4.17, "grad_norm": 1.2000616329590743, "learning_rate": 5.714294345430853e-07, "loss": 0.0167, "step": 2135 }, { "epoch": 4.17, "grad_norm": 0.8992850040309159, "learning_rate": 5.688187576704227e-07, "loss": 0.0154, "step": 2136 }, { "epoch": 4.17, "grad_norm": 1.0267740180772373, "learning_rate": 5.662135305580667e-07, "loss": 0.0213, "step": 2137 }, { "epoch": 4.18, "grad_norm": 1.2032345667068143, "learning_rate": 5.636137580497524e-07, "loss": 0.0171, "step": 2138 }, { "epoch": 4.18, "grad_norm": 0.8570802984798628, "learning_rate": 5.610194449790711e-07, "loss": 0.01, "step": 2139 }, { "epoch": 4.18, "grad_norm": 0.7808615049421546, "learning_rate": 5.584305961694664e-07, "loss": 0.0166, "step": 2140 }, { "epoch": 4.18, "grad_norm": 0.879551792021926, "learning_rate": 5.558472164342222e-07, "loss": 0.0208, "step": 2141 }, { "epoch": 4.18, "grad_norm": 0.9522656622462153, "learning_rate": 5.532693105764526e-07, "loss": 0.0189, "step": 2142 }, { "epoch": 4.19, "grad_norm": 0.9059760203157562, "learning_rate": 5.506968833890943e-07, "loss": 0.0204, "step": 2143 }, { "epoch": 4.19, "grad_norm": 0.9168371908479395, "learning_rate": 5.481299396549007e-07, "loss": 0.0247, "step": 2144 }, { "epoch": 4.19, "grad_norm": 0.9864164349372855, "learning_rate": 5.455684841464266e-07, "loss": 0.0107, "step": 2145 }, { "epoch": 4.19, "grad_norm": 1.1827660345920197, "learning_rate": 5.43012521626025e-07, "loss": 0.0133, "step": 2146 }, { "epoch": 4.19, "grad_norm": 0.8290531734611689, "learning_rate": 5.404620568458372e-07, "loss": 0.0124, "step": 2147 }, { "epoch": 4.2, "grad_norm": 0.8702898316365009, "learning_rate": 5.379170945477797e-07, "loss": 0.0123, "step": 2148 }, { "epoch": 4.2, "grad_norm": 1.0071981613612635, "learning_rate": 5.353776394635403e-07, "loss": 0.0209, "step": 2149 }, { "epoch": 4.2, "grad_norm": 1.1598219167967225, "learning_rate": 5.328436963145696e-07, "loss": 0.0093, "step": 2150 }, { "epoch": 4.2, "grad_norm": 1.0616878060306303, "learning_rate": 5.303152698120663e-07, "loss": 0.0185, "step": 2151 }, { "epoch": 4.2, "grad_norm": 1.1992828815512218, "learning_rate": 5.277923646569743e-07, "loss": 0.0191, "step": 2152 }, { "epoch": 4.21, "grad_norm": 0.9782484184334774, "learning_rate": 5.252749855399728e-07, "loss": 0.0167, "step": 2153 }, { "epoch": 4.21, "grad_norm": 1.055280810959499, "learning_rate": 5.227631371414648e-07, "loss": 0.0213, "step": 2154 }, { "epoch": 4.21, "grad_norm": 1.077364177072198, "learning_rate": 5.202568241315718e-07, "loss": 0.0187, "step": 2155 }, { "epoch": 4.21, "grad_norm": 1.0152414858571617, "learning_rate": 5.177560511701249e-07, "loss": 0.0105, "step": 2156 }, { "epoch": 4.21, "grad_norm": 0.9913225426981442, "learning_rate": 5.152608229066519e-07, "loss": 0.0147, "step": 2157 }, { "epoch": 4.21, "grad_norm": 0.8732488599096355, "learning_rate": 5.127711439803733e-07, "loss": 0.0143, "step": 2158 }, { "epoch": 4.22, "grad_norm": 0.831360405622544, "learning_rate": 5.10287019020193e-07, "loss": 0.0165, "step": 2159 }, { "epoch": 4.22, "grad_norm": 0.9630319573826622, "learning_rate": 5.078084526446877e-07, "loss": 0.0187, "step": 2160 }, { "epoch": 4.22, "grad_norm": 0.8631822855336664, "learning_rate": 5.053354494620977e-07, "loss": 0.016, "step": 2161 }, { "epoch": 4.22, "grad_norm": 0.91197878932227, "learning_rate": 5.028680140703231e-07, "loss": 0.0123, "step": 2162 }, { "epoch": 4.22, "grad_norm": 0.7973573258022357, "learning_rate": 5.004061510569114e-07, "loss": 0.0106, "step": 2163 }, { "epoch": 4.23, "grad_norm": 0.8285376844819713, "learning_rate": 4.97949864999048e-07, "loss": 0.0151, "step": 2164 }, { "epoch": 4.23, "grad_norm": 1.0092792165838163, "learning_rate": 4.954991604635503e-07, "loss": 0.0086, "step": 2165 }, { "epoch": 4.23, "grad_norm": 0.8581751904595035, "learning_rate": 4.930540420068608e-07, "loss": 0.015, "step": 2166 }, { "epoch": 4.23, "grad_norm": 1.0591963567567573, "learning_rate": 4.906145141750314e-07, "loss": 0.0204, "step": 2167 }, { "epoch": 4.23, "grad_norm": 1.4518797366216056, "learning_rate": 4.881805815037239e-07, "loss": 0.0155, "step": 2168 }, { "epoch": 4.24, "grad_norm": 1.0900443072678878, "learning_rate": 4.857522485181948e-07, "loss": 0.0131, "step": 2169 }, { "epoch": 4.24, "grad_norm": 1.1552390166818334, "learning_rate": 4.833295197332904e-07, "loss": 0.018, "step": 2170 }, { "epoch": 4.24, "grad_norm": 0.8817923888941194, "learning_rate": 4.809123996534373e-07, "loss": 0.0135, "step": 2171 }, { "epoch": 4.24, "grad_norm": 0.7140190711770258, "learning_rate": 4.785008927726359e-07, "loss": 0.0096, "step": 2172 }, { "epoch": 4.24, "grad_norm": 0.8514139941895789, "learning_rate": 4.7609500357444654e-07, "loss": 0.015, "step": 2173 }, { "epoch": 4.25, "grad_norm": 1.0174839786915952, "learning_rate": 4.736947365319881e-07, "loss": 0.0171, "step": 2174 }, { "epoch": 4.25, "grad_norm": 0.7205771191371261, "learning_rate": 4.7130009610792695e-07, "loss": 0.0115, "step": 2175 }, { "epoch": 4.25, "grad_norm": 0.8930412121515744, "learning_rate": 4.6891108675446453e-07, "loss": 0.0125, "step": 2176 }, { "epoch": 4.25, "grad_norm": 0.8144198904162193, "learning_rate": 4.665277129133368e-07, "loss": 0.0164, "step": 2177 }, { "epoch": 4.25, "grad_norm": 0.9848459953472141, "learning_rate": 4.6414997901580083e-07, "loss": 0.0172, "step": 2178 }, { "epoch": 4.26, "grad_norm": 0.8732986254080565, "learning_rate": 4.61777889482625e-07, "loss": 0.0075, "step": 2179 }, { "epoch": 4.26, "grad_norm": 0.9592469611343994, "learning_rate": 4.59411448724087e-07, "loss": 0.0097, "step": 2180 }, { "epoch": 4.26, "grad_norm": 1.0828416289142737, "learning_rate": 4.5705066113996144e-07, "loss": 0.0111, "step": 2181 }, { "epoch": 4.26, "grad_norm": 0.9411559605673818, "learning_rate": 4.5469553111951026e-07, "loss": 0.0109, "step": 2182 }, { "epoch": 4.26, "grad_norm": 1.049179964882657, "learning_rate": 4.5234606304147895e-07, "loss": 0.0119, "step": 2183 }, { "epoch": 4.27, "grad_norm": 0.8283664317076926, "learning_rate": 4.500022612740856e-07, "loss": 0.0066, "step": 2184 }, { "epoch": 4.27, "grad_norm": 0.6540431277626987, "learning_rate": 4.4766413017501164e-07, "loss": 0.0108, "step": 2185 }, { "epoch": 4.27, "grad_norm": 0.7888559835457778, "learning_rate": 4.453316740913976e-07, "loss": 0.0129, "step": 2186 }, { "epoch": 4.27, "grad_norm": 1.0901093483810134, "learning_rate": 4.430048973598325e-07, "loss": 0.0145, "step": 2187 }, { "epoch": 4.27, "grad_norm": 0.8659463713180632, "learning_rate": 4.406838043063446e-07, "loss": 0.0165, "step": 2188 }, { "epoch": 4.28, "grad_norm": 1.1579199221584278, "learning_rate": 4.383683992463951e-07, "loss": 0.0135, "step": 2189 }, { "epoch": 4.28, "grad_norm": 0.9287669414172782, "learning_rate": 4.3605868648487136e-07, "loss": 0.0091, "step": 2190 }, { "epoch": 4.28, "grad_norm": 0.7301881300464877, "learning_rate": 4.3375467031607726e-07, "loss": 0.0089, "step": 2191 }, { "epoch": 4.28, "grad_norm": 0.9420463886077, "learning_rate": 4.314563550237231e-07, "loss": 0.0164, "step": 2192 }, { "epoch": 4.28, "grad_norm": 0.7404505020952566, "learning_rate": 4.291637448809228e-07, "loss": 0.0123, "step": 2193 }, { "epoch": 4.29, "grad_norm": 0.8012822336492341, "learning_rate": 4.268768441501807e-07, "loss": 0.0099, "step": 2194 }, { "epoch": 4.29, "grad_norm": 1.13966697056065, "learning_rate": 4.24595657083387e-07, "loss": 0.0073, "step": 2195 }, { "epoch": 4.29, "grad_norm": 1.009548132929698, "learning_rate": 4.2232018792181037e-07, "loss": 0.0128, "step": 2196 }, { "epoch": 4.29, "grad_norm": 0.6722560822747327, "learning_rate": 4.200504408960861e-07, "loss": 0.0086, "step": 2197 }, { "epoch": 4.29, "grad_norm": 0.789844120193937, "learning_rate": 4.177864202262105e-07, "loss": 0.0106, "step": 2198 }, { "epoch": 4.29, "grad_norm": 0.6302197996607267, "learning_rate": 4.155281301215353e-07, "loss": 0.0096, "step": 2199 }, { "epoch": 4.3, "grad_norm": 0.7192137074893635, "learning_rate": 4.132755747807577e-07, "loss": 0.0089, "step": 2200 }, { "epoch": 4.3, "grad_norm": 0.7271212124583992, "learning_rate": 4.1102875839191017e-07, "loss": 0.0077, "step": 2201 }, { "epoch": 4.3, "grad_norm": 0.791016713311569, "learning_rate": 4.087876851323568e-07, "loss": 0.0116, "step": 2202 }, { "epoch": 4.3, "grad_norm": 0.7524147532236568, "learning_rate": 4.0655235916878516e-07, "loss": 0.011, "step": 2203 }, { "epoch": 4.3, "grad_norm": 0.9882814300830517, "learning_rate": 4.0432278465719386e-07, "loss": 0.007, "step": 2204 }, { "epoch": 4.31, "grad_norm": 1.114489532583681, "learning_rate": 4.0209896574289155e-07, "loss": 0.0065, "step": 2205 }, { "epoch": 4.31, "grad_norm": 0.8267079342270753, "learning_rate": 3.9988090656048367e-07, "loss": 0.0115, "step": 2206 }, { "epoch": 4.31, "grad_norm": 1.0101368114508396, "learning_rate": 3.976686112338672e-07, "loss": 0.0158, "step": 2207 }, { "epoch": 4.31, "grad_norm": 0.8850718938757224, "learning_rate": 3.95462083876224e-07, "loss": 0.0107, "step": 2208 }, { "epoch": 4.31, "grad_norm": 1.446636765282233, "learning_rate": 3.932613285900116e-07, "loss": 0.0093, "step": 2209 }, { "epoch": 4.32, "grad_norm": 0.7255671053075252, "learning_rate": 3.9106634946695387e-07, "loss": 0.0097, "step": 2210 }, { "epoch": 4.32, "grad_norm": 0.7788323779379877, "learning_rate": 3.888771505880383e-07, "loss": 0.0073, "step": 2211 }, { "epoch": 4.32, "grad_norm": 0.8556265233136966, "learning_rate": 3.8669373602350414e-07, "loss": 0.0076, "step": 2212 }, { "epoch": 4.32, "grad_norm": 0.8227397428225579, "learning_rate": 3.845161098328354e-07, "loss": 0.0118, "step": 2213 }, { "epoch": 4.32, "grad_norm": 0.8551888411068671, "learning_rate": 3.823442760647562e-07, "loss": 0.0128, "step": 2214 }, { "epoch": 4.33, "grad_norm": 0.769880655958475, "learning_rate": 3.8017823875721947e-07, "loss": 0.0112, "step": 2215 }, { "epoch": 4.33, "grad_norm": 0.7873954623721464, "learning_rate": 3.7801800193740066e-07, "loss": 0.0059, "step": 2216 }, { "epoch": 4.33, "grad_norm": 0.8570414474013865, "learning_rate": 3.7586356962169313e-07, "loss": 0.0145, "step": 2217 }, { "epoch": 4.33, "grad_norm": 1.082579918340218, "learning_rate": 3.7371494581569677e-07, "loss": 0.0148, "step": 2218 }, { "epoch": 4.33, "grad_norm": 0.6739386365624194, "learning_rate": 3.715721345142115e-07, "loss": 0.0067, "step": 2219 }, { "epoch": 4.34, "grad_norm": 0.7457646942550853, "learning_rate": 3.6943513970123184e-07, "loss": 0.01, "step": 2220 }, { "epoch": 4.34, "grad_norm": 1.3477428822370063, "learning_rate": 3.673039653499374e-07, "loss": 0.0078, "step": 2221 }, { "epoch": 4.34, "grad_norm": 0.8625696315749538, "learning_rate": 3.651786154226854e-07, "loss": 0.0095, "step": 2222 }, { "epoch": 4.34, "grad_norm": 0.691613804452843, "learning_rate": 3.630590938710062e-07, "loss": 0.0087, "step": 2223 }, { "epoch": 4.34, "grad_norm": 0.9760974334311598, "learning_rate": 3.609454046355911e-07, "loss": 0.0106, "step": 2224 }, { "epoch": 4.35, "grad_norm": 0.9101297781390008, "learning_rate": 3.588375516462901e-07, "loss": 0.0074, "step": 2225 }, { "epoch": 4.35, "grad_norm": 0.6194613119266024, "learning_rate": 3.5673553882209986e-07, "loss": 0.0092, "step": 2226 }, { "epoch": 4.35, "grad_norm": 0.9089215273020887, "learning_rate": 3.5463937007116125e-07, "loss": 0.0067, "step": 2227 }, { "epoch": 4.35, "grad_norm": 0.6609003743491086, "learning_rate": 3.525490492907494e-07, "loss": 0.0076, "step": 2228 }, { "epoch": 4.35, "grad_norm": 0.8894387747801777, "learning_rate": 3.5046458036726355e-07, "loss": 0.011, "step": 2229 }, { "epoch": 4.36, "grad_norm": 0.8750683764953999, "learning_rate": 3.483859671762278e-07, "loss": 0.0098, "step": 2230 }, { "epoch": 4.36, "grad_norm": 0.9934424939964446, "learning_rate": 3.4631321358227384e-07, "loss": 0.0242, "step": 2231 }, { "epoch": 4.36, "grad_norm": 0.6166188592433546, "learning_rate": 3.442463234391441e-07, "loss": 0.0093, "step": 2232 }, { "epoch": 4.36, "grad_norm": 0.8222979059957632, "learning_rate": 3.421853005896751e-07, "loss": 0.0096, "step": 2233 }, { "epoch": 4.36, "grad_norm": 0.6447523929536818, "learning_rate": 3.401301488657978e-07, "loss": 0.0063, "step": 2234 }, { "epoch": 4.37, "grad_norm": 0.9778189465447806, "learning_rate": 3.380808720885251e-07, "loss": 0.0093, "step": 2235 }, { "epoch": 4.37, "grad_norm": 0.6641283856905676, "learning_rate": 3.3603747406794833e-07, "loss": 0.0111, "step": 2236 }, { "epoch": 4.37, "grad_norm": 0.7499215657358077, "learning_rate": 3.3399995860322934e-07, "loss": 0.007, "step": 2237 }, { "epoch": 4.37, "grad_norm": 0.9443326081449869, "learning_rate": 3.3196832948259083e-07, "loss": 0.0083, "step": 2238 }, { "epoch": 4.37, "grad_norm": 0.7366114684978433, "learning_rate": 3.2994259048331295e-07, "loss": 0.008, "step": 2239 }, { "epoch": 4.38, "grad_norm": 0.7853917775668857, "learning_rate": 3.279227453717252e-07, "loss": 0.0071, "step": 2240 }, { "epoch": 4.38, "grad_norm": 0.7103601268262298, "learning_rate": 3.2590879790319744e-07, "loss": 0.01, "step": 2241 }, { "epoch": 4.38, "grad_norm": 0.7371842481055698, "learning_rate": 3.23900751822135e-07, "loss": 0.0099, "step": 2242 }, { "epoch": 4.38, "grad_norm": 0.8793705854679112, "learning_rate": 3.2189861086197146e-07, "loss": 0.0135, "step": 2243 }, { "epoch": 4.38, "grad_norm": 0.8705773477541504, "learning_rate": 3.1990237874516066e-07, "loss": 0.0081, "step": 2244 }, { "epoch": 4.38, "grad_norm": 0.9387249449778714, "learning_rate": 3.1791205918317164e-07, "loss": 0.0102, "step": 2245 }, { "epoch": 4.39, "grad_norm": 0.6996641577702766, "learning_rate": 3.1592765587648043e-07, "loss": 0.0133, "step": 2246 }, { "epoch": 4.39, "grad_norm": 0.5566991788153055, "learning_rate": 3.1394917251456133e-07, "loss": 0.0048, "step": 2247 }, { "epoch": 4.39, "grad_norm": 0.8090183727097974, "learning_rate": 3.1197661277588436e-07, "loss": 0.0138, "step": 2248 }, { "epoch": 4.39, "grad_norm": 0.9957574443233609, "learning_rate": 3.100099803279063e-07, "loss": 0.009, "step": 2249 }, { "epoch": 4.39, "grad_norm": 0.9794344041591084, "learning_rate": 3.0804927882706196e-07, "loss": 0.0098, "step": 2250 }, { "epoch": 4.4, "grad_norm": 0.6461253529014759, "learning_rate": 3.0609451191875913e-07, "loss": 0.0083, "step": 2251 }, { "epoch": 4.4, "grad_norm": 0.8696848167858718, "learning_rate": 3.0414568323737346e-07, "loss": 0.0108, "step": 2252 }, { "epoch": 4.4, "grad_norm": 0.9714292508957464, "learning_rate": 3.0220279640623946e-07, "loss": 0.0083, "step": 2253 }, { "epoch": 4.4, "grad_norm": 0.8400090683333469, "learning_rate": 3.002658550376426e-07, "loss": 0.0139, "step": 2254 }, { "epoch": 4.4, "grad_norm": 1.0293919265968725, "learning_rate": 2.983348627328177e-07, "loss": 0.0131, "step": 2255 }, { "epoch": 4.41, "grad_norm": 0.8166965167107386, "learning_rate": 2.964098230819351e-07, "loss": 0.0071, "step": 2256 }, { "epoch": 4.41, "grad_norm": 0.9118547669771392, "learning_rate": 2.9449073966410027e-07, "loss": 0.0093, "step": 2257 }, { "epoch": 4.41, "grad_norm": 0.5838547345648152, "learning_rate": 2.925776160473445e-07, "loss": 0.0083, "step": 2258 }, { "epoch": 4.41, "grad_norm": 0.7923936644081419, "learning_rate": 2.906704557886173e-07, "loss": 0.0071, "step": 2259 }, { "epoch": 4.41, "grad_norm": 0.6866354248360457, "learning_rate": 2.887692624337806e-07, "loss": 0.0067, "step": 2260 }, { "epoch": 4.42, "grad_norm": 0.7772728517290615, "learning_rate": 2.8687403951760417e-07, "loss": 0.0082, "step": 2261 }, { "epoch": 4.42, "grad_norm": 0.6937349593771819, "learning_rate": 2.8498479056375656e-07, "loss": 0.0072, "step": 2262 }, { "epoch": 4.42, "grad_norm": 0.6576530831371276, "learning_rate": 2.831015190847978e-07, "loss": 0.0067, "step": 2263 }, { "epoch": 4.42, "grad_norm": 0.43586071696543294, "learning_rate": 2.812242285821771e-07, "loss": 0.0033, "step": 2264 }, { "epoch": 4.42, "grad_norm": 0.6362336809228495, "learning_rate": 2.793529225462219e-07, "loss": 0.009, "step": 2265 }, { "epoch": 4.43, "grad_norm": 0.6988421293408299, "learning_rate": 2.774876044561331e-07, "loss": 0.0076, "step": 2266 }, { "epoch": 4.43, "grad_norm": 0.5668875036160492, "learning_rate": 2.7562827777997873e-07, "loss": 0.0083, "step": 2267 }, { "epoch": 4.43, "grad_norm": 0.6771407496753742, "learning_rate": 2.7377494597468916e-07, "loss": 0.0077, "step": 2268 }, { "epoch": 4.43, "grad_norm": 0.6295523643690123, "learning_rate": 2.719276124860448e-07, "loss": 0.0053, "step": 2269 }, { "epoch": 4.43, "grad_norm": 0.5964840505387873, "learning_rate": 2.700862807486774e-07, "loss": 0.006, "step": 2270 }, { "epoch": 4.44, "grad_norm": 0.7692968655209426, "learning_rate": 2.682509541860595e-07, "loss": 0.0095, "step": 2271 }, { "epoch": 4.44, "grad_norm": 0.9487395787159527, "learning_rate": 2.664216362104964e-07, "loss": 0.0064, "step": 2272 }, { "epoch": 4.44, "grad_norm": 0.755187624203599, "learning_rate": 2.6459833022312473e-07, "loss": 0.0072, "step": 2273 }, { "epoch": 4.44, "grad_norm": 0.6744060303068414, "learning_rate": 2.6278103961390257e-07, "loss": 0.0054, "step": 2274 }, { "epoch": 4.44, "grad_norm": 0.767050522982307, "learning_rate": 2.6096976776160246e-07, "loss": 0.0066, "step": 2275 }, { "epoch": 4.45, "grad_norm": 0.5867720147074326, "learning_rate": 2.591645180338085e-07, "loss": 0.0061, "step": 2276 }, { "epoch": 4.45, "grad_norm": 0.734349362618803, "learning_rate": 2.573652937869088e-07, "loss": 0.0065, "step": 2277 }, { "epoch": 4.45, "grad_norm": 0.781615927926399, "learning_rate": 2.555720983660852e-07, "loss": 0.0106, "step": 2278 }, { "epoch": 4.45, "grad_norm": 0.7026241037573061, "learning_rate": 2.5378493510531367e-07, "loss": 0.0079, "step": 2279 }, { "epoch": 4.45, "grad_norm": 0.7438702173405503, "learning_rate": 2.5200380732735444e-07, "loss": 0.0098, "step": 2280 }, { "epoch": 4.46, "grad_norm": 0.8068372629906113, "learning_rate": 2.502287183437458e-07, "loss": 0.0114, "step": 2281 }, { "epoch": 4.46, "grad_norm": 0.8405000112322127, "learning_rate": 2.4845967145479826e-07, "loss": 0.0076, "step": 2282 }, { "epoch": 4.46, "grad_norm": 0.797026634058468, "learning_rate": 2.4669666994959026e-07, "loss": 0.0074, "step": 2283 }, { "epoch": 4.46, "grad_norm": 0.8960697048565445, "learning_rate": 2.4493971710595773e-07, "loss": 0.009, "step": 2284 }, { "epoch": 4.46, "grad_norm": 1.7012192995953372, "learning_rate": 2.431888161904926e-07, "loss": 0.0075, "step": 2285 }, { "epoch": 4.46, "grad_norm": 0.7240418435536967, "learning_rate": 2.4144397045853586e-07, "loss": 0.0104, "step": 2286 }, { "epoch": 4.47, "grad_norm": 0.9670555749206023, "learning_rate": 2.397051831541677e-07, "loss": 0.0067, "step": 2287 }, { "epoch": 4.47, "grad_norm": 0.8995110046249202, "learning_rate": 2.3797245751020545e-07, "loss": 0.0118, "step": 2288 }, { "epoch": 4.47, "grad_norm": 0.784770712645654, "learning_rate": 2.3624579674819684e-07, "loss": 0.0114, "step": 2289 }, { "epoch": 4.47, "grad_norm": 0.7971098072622336, "learning_rate": 2.3452520407841404e-07, "loss": 0.0139, "step": 2290 }, { "epoch": 4.47, "grad_norm": 0.7242871097636779, "learning_rate": 2.3281068269984535e-07, "loss": 0.0069, "step": 2291 }, { "epoch": 4.48, "grad_norm": 0.924483551364942, "learning_rate": 2.3110223580019317e-07, "loss": 0.0086, "step": 2292 }, { "epoch": 4.48, "grad_norm": 0.7024241552474204, "learning_rate": 2.2939986655586364e-07, "loss": 0.0085, "step": 2293 }, { "epoch": 4.48, "grad_norm": 0.5427915249580988, "learning_rate": 2.2770357813196568e-07, "loss": 0.0069, "step": 2294 }, { "epoch": 4.48, "grad_norm": 0.6367411862334439, "learning_rate": 2.260133736823014e-07, "loss": 0.0085, "step": 2295 }, { "epoch": 4.48, "grad_norm": 0.6666320778608873, "learning_rate": 2.2432925634936062e-07, "loss": 0.0082, "step": 2296 }, { "epoch": 4.49, "grad_norm": 0.7118547690724517, "learning_rate": 2.2265122926431585e-07, "loss": 0.011, "step": 2297 }, { "epoch": 4.49, "grad_norm": 0.568950044212232, "learning_rate": 2.2097929554701795e-07, "loss": 0.0077, "step": 2298 }, { "epoch": 4.49, "grad_norm": 0.754215859109306, "learning_rate": 2.1931345830598803e-07, "loss": 0.0115, "step": 2299 }, { "epoch": 4.49, "grad_norm": 0.6556296032657493, "learning_rate": 2.176537206384112e-07, "loss": 0.0041, "step": 2300 }, { "epoch": 4.49, "grad_norm": 0.719610241908642, "learning_rate": 2.160000856301331e-07, "loss": 0.0084, "step": 2301 }, { "epoch": 4.5, "grad_norm": 0.6150238202425997, "learning_rate": 2.143525563556541e-07, "loss": 0.007, "step": 2302 }, { "epoch": 4.5, "grad_norm": 0.5963808727637655, "learning_rate": 2.127111358781198e-07, "loss": 0.0092, "step": 2303 }, { "epoch": 4.5, "grad_norm": 0.5707409095395829, "learning_rate": 2.1107582724932088e-07, "loss": 0.012, "step": 2304 }, { "epoch": 4.5, "grad_norm": 0.5956602086259971, "learning_rate": 2.0944663350968328e-07, "loss": 0.0045, "step": 2305 }, { "epoch": 4.5, "grad_norm": 0.4831961169480734, "learning_rate": 2.078235576882631e-07, "loss": 0.0052, "step": 2306 }, { "epoch": 4.51, "grad_norm": 0.5504734614233882, "learning_rate": 2.0620660280274355e-07, "loss": 0.0054, "step": 2307 }, { "epoch": 4.51, "grad_norm": 0.7205856057226538, "learning_rate": 2.0459577185942756e-07, "loss": 0.0116, "step": 2308 }, { "epoch": 4.51, "grad_norm": 0.8701075273513083, "learning_rate": 2.0299106785323e-07, "loss": 0.0143, "step": 2309 }, { "epoch": 4.51, "grad_norm": 0.7528571021010002, "learning_rate": 2.0139249376767654e-07, "loss": 0.0125, "step": 2310 }, { "epoch": 4.51, "grad_norm": 0.6976829421121338, "learning_rate": 1.998000525748958e-07, "loss": 0.0062, "step": 2311 }, { "epoch": 4.52, "grad_norm": 0.6063664380604492, "learning_rate": 1.9821374723561168e-07, "loss": 0.0063, "step": 2312 }, { "epoch": 4.52, "grad_norm": 1.4640443763034952, "learning_rate": 1.9663358069914292e-07, "loss": 0.0091, "step": 2313 }, { "epoch": 4.52, "grad_norm": 0.8189861813300339, "learning_rate": 1.9505955590339224e-07, "loss": 0.0113, "step": 2314 }, { "epoch": 4.52, "grad_norm": 0.7223203698740699, "learning_rate": 1.934916757748455e-07, "loss": 0.0144, "step": 2315 }, { "epoch": 4.52, "grad_norm": 0.4333191304928938, "learning_rate": 1.9192994322856282e-07, "loss": 0.0049, "step": 2316 }, { "epoch": 4.53, "grad_norm": 0.6741294501992041, "learning_rate": 1.903743611681759e-07, "loss": 0.0067, "step": 2317 }, { "epoch": 4.53, "grad_norm": 0.645421631142225, "learning_rate": 1.888249324858786e-07, "loss": 0.0085, "step": 2318 }, { "epoch": 4.53, "grad_norm": 0.6147342949883855, "learning_rate": 1.8728166006242702e-07, "loss": 0.0058, "step": 2319 }, { "epoch": 4.53, "grad_norm": 0.7249099792091177, "learning_rate": 1.8574454676713047e-07, "loss": 0.0085, "step": 2320 }, { "epoch": 4.53, "grad_norm": 0.7219034557803944, "learning_rate": 1.8421359545784576e-07, "loss": 0.0103, "step": 2321 }, { "epoch": 4.54, "grad_norm": 0.685979436148072, "learning_rate": 1.826888089809748e-07, "loss": 0.0063, "step": 2322 }, { "epoch": 4.54, "grad_norm": 0.6779000136451813, "learning_rate": 1.8117019017145636e-07, "loss": 0.0059, "step": 2323 }, { "epoch": 4.54, "grad_norm": 0.5099072241033692, "learning_rate": 1.7965774185276317e-07, "loss": 0.0055, "step": 2324 }, { "epoch": 4.54, "grad_norm": 0.6917903533333499, "learning_rate": 1.7815146683689398e-07, "loss": 0.0095, "step": 2325 }, { "epoch": 4.54, "grad_norm": 0.9394603738641661, "learning_rate": 1.7665136792437163e-07, "loss": 0.0092, "step": 2326 }, { "epoch": 4.54, "grad_norm": 0.6324900720350459, "learning_rate": 1.7515744790423538e-07, "loss": 0.0077, "step": 2327 }, { "epoch": 4.55, "grad_norm": 0.5635482931139341, "learning_rate": 1.736697095540361e-07, "loss": 0.0054, "step": 2328 }, { "epoch": 4.55, "grad_norm": 0.5199890912356024, "learning_rate": 1.7218815563983176e-07, "loss": 0.0068, "step": 2329 }, { "epoch": 4.55, "grad_norm": 0.6451483273029964, "learning_rate": 1.7071278891618263e-07, "loss": 0.0075, "step": 2330 }, { "epoch": 4.55, "grad_norm": 0.7537947834024276, "learning_rate": 1.692436121261448e-07, "loss": 0.0085, "step": 2331 }, { "epoch": 4.55, "grad_norm": 0.586675889409422, "learning_rate": 1.6778062800126503e-07, "loss": 0.0071, "step": 2332 }, { "epoch": 4.56, "grad_norm": 0.7250285731300471, "learning_rate": 1.6632383926157883e-07, "loss": 0.0067, "step": 2333 }, { "epoch": 4.56, "grad_norm": 0.6251724551629074, "learning_rate": 1.6487324861560043e-07, "loss": 0.0056, "step": 2334 }, { "epoch": 4.56, "grad_norm": 0.8215757190972992, "learning_rate": 1.6342885876032148e-07, "loss": 0.0115, "step": 2335 }, { "epoch": 4.56, "grad_norm": 0.764454242246146, "learning_rate": 1.6199067238120613e-07, "loss": 0.0052, "step": 2336 }, { "epoch": 4.56, "grad_norm": 0.8128899293679218, "learning_rate": 1.6055869215218199e-07, "loss": 0.0141, "step": 2337 }, { "epoch": 4.57, "grad_norm": 0.886813674742381, "learning_rate": 1.5913292073564023e-07, "loss": 0.0101, "step": 2338 }, { "epoch": 4.57, "grad_norm": 0.7738508744731303, "learning_rate": 1.577133607824281e-07, "loss": 0.0076, "step": 2339 }, { "epoch": 4.57, "grad_norm": 0.6419682850939733, "learning_rate": 1.563000149318439e-07, "loss": 0.0085, "step": 2340 }, { "epoch": 4.57, "grad_norm": 0.7022627438153904, "learning_rate": 1.548928858116309e-07, "loss": 0.0096, "step": 2341 }, { "epoch": 4.57, "grad_norm": 0.6910246386777852, "learning_rate": 1.534919760379771e-07, "loss": 0.0082, "step": 2342 }, { "epoch": 4.58, "grad_norm": 0.6776127367589944, "learning_rate": 1.5209728821550488e-07, "loss": 0.0068, "step": 2343 }, { "epoch": 4.58, "grad_norm": 0.6297088398925065, "learning_rate": 1.5070882493726911e-07, "loss": 0.0084, "step": 2344 }, { "epoch": 4.58, "grad_norm": 0.7309378248296826, "learning_rate": 1.4932658878475274e-07, "loss": 0.0069, "step": 2345 }, { "epoch": 4.58, "grad_norm": 0.7074389511179021, "learning_rate": 1.4795058232785913e-07, "loss": 0.0065, "step": 2346 }, { "epoch": 4.58, "grad_norm": 0.8027277695075743, "learning_rate": 1.465808081249112e-07, "loss": 0.0118, "step": 2347 }, { "epoch": 4.59, "grad_norm": 0.5323675955098753, "learning_rate": 1.4521726872264334e-07, "loss": 0.0048, "step": 2348 }, { "epoch": 4.59, "grad_norm": 0.6765860849905462, "learning_rate": 1.4385996665619865e-07, "loss": 0.0087, "step": 2349 }, { "epoch": 4.59, "grad_norm": 0.6984539442096124, "learning_rate": 1.4250890444912235e-07, "loss": 0.0058, "step": 2350 }, { "epoch": 4.59, "grad_norm": 0.6071108254892154, "learning_rate": 1.4116408461335976e-07, "loss": 0.0081, "step": 2351 }, { "epoch": 4.59, "grad_norm": 0.7969568426827094, "learning_rate": 1.398255096492499e-07, "loss": 0.0056, "step": 2352 }, { "epoch": 4.6, "grad_norm": 0.5615807402621124, "learning_rate": 1.3849318204551976e-07, "loss": 0.0084, "step": 2353 }, { "epoch": 4.6, "grad_norm": 0.49449073836627777, "learning_rate": 1.3716710427928297e-07, "loss": 0.0039, "step": 2354 }, { "epoch": 4.6, "grad_norm": 0.789003226420277, "learning_rate": 1.358472788160312e-07, "loss": 0.0044, "step": 2355 }, { "epoch": 4.6, "grad_norm": 0.5026079776603285, "learning_rate": 1.3453370810963294e-07, "loss": 0.0053, "step": 2356 }, { "epoch": 4.6, "grad_norm": 0.5624160676711326, "learning_rate": 1.332263946023285e-07, "loss": 0.0074, "step": 2357 }, { "epoch": 4.61, "grad_norm": 0.7311263659636638, "learning_rate": 1.3192534072472216e-07, "loss": 0.0087, "step": 2358 }, { "epoch": 4.61, "grad_norm": 0.4998872895937923, "learning_rate": 1.3063054889578118e-07, "loss": 0.0046, "step": 2359 }, { "epoch": 4.61, "grad_norm": 0.6157896586062911, "learning_rate": 1.2934202152283052e-07, "loss": 0.0056, "step": 2360 }, { "epoch": 4.61, "grad_norm": 0.7034386969562145, "learning_rate": 1.2805976100154875e-07, "loss": 0.0132, "step": 2361 }, { "epoch": 4.61, "grad_norm": 0.5461820659579241, "learning_rate": 1.2678376971596057e-07, "loss": 0.0056, "step": 2362 }, { "epoch": 4.62, "grad_norm": 0.6128927286855306, "learning_rate": 1.2551405003843678e-07, "loss": 0.0107, "step": 2363 }, { "epoch": 4.62, "grad_norm": 0.5908380008093889, "learning_rate": 1.242506043296871e-07, "loss": 0.0082, "step": 2364 }, { "epoch": 4.62, "grad_norm": 0.5391734636977055, "learning_rate": 1.2299343493875598e-07, "loss": 0.0042, "step": 2365 }, { "epoch": 4.62, "grad_norm": 0.5117862800976363, "learning_rate": 1.2174254420301934e-07, "loss": 0.005, "step": 2366 }, { "epoch": 4.62, "grad_norm": 0.7847698567038798, "learning_rate": 1.204979344481802e-07, "loss": 0.0103, "step": 2367 }, { "epoch": 4.62, "grad_norm": 0.6588301141924102, "learning_rate": 1.192596079882613e-07, "loss": 0.0054, "step": 2368 }, { "epoch": 4.63, "grad_norm": 0.8010396626893873, "learning_rate": 1.1802756712560553e-07, "loss": 0.0092, "step": 2369 }, { "epoch": 4.63, "grad_norm": 0.4677511790138229, "learning_rate": 1.1680181415086965e-07, "loss": 0.0053, "step": 2370 }, { "epoch": 4.63, "grad_norm": 0.4067797866748764, "learning_rate": 1.1558235134301776e-07, "loss": 0.0048, "step": 2371 }, { "epoch": 4.63, "grad_norm": 0.5898043109195867, "learning_rate": 1.1436918096932042e-07, "loss": 0.0077, "step": 2372 }, { "epoch": 4.63, "grad_norm": 0.46064824385201414, "learning_rate": 1.1316230528534892e-07, "loss": 0.0054, "step": 2373 }, { "epoch": 4.64, "grad_norm": 0.4473588902315319, "learning_rate": 1.1196172653497061e-07, "loss": 0.006, "step": 2374 }, { "epoch": 4.64, "grad_norm": 0.7269399338737582, "learning_rate": 1.1076744695034606e-07, "loss": 0.0086, "step": 2375 }, { "epoch": 4.64, "grad_norm": 0.6530034745906561, "learning_rate": 1.095794687519242e-07, "loss": 0.0085, "step": 2376 }, { "epoch": 4.64, "grad_norm": 0.7618873597098633, "learning_rate": 1.0839779414843786e-07, "loss": 0.0071, "step": 2377 }, { "epoch": 4.64, "grad_norm": 0.5396529620091278, "learning_rate": 1.0722242533689924e-07, "loss": 0.0059, "step": 2378 }, { "epoch": 4.65, "grad_norm": 0.712482411956327, "learning_rate": 1.0605336450259867e-07, "loss": 0.0052, "step": 2379 }, { "epoch": 4.65, "grad_norm": 0.6490666934045675, "learning_rate": 1.0489061381909609e-07, "loss": 0.0039, "step": 2380 }, { "epoch": 4.65, "grad_norm": 0.4342401255073375, "learning_rate": 1.0373417544822106e-07, "loss": 0.0065, "step": 2381 }, { "epoch": 4.65, "grad_norm": 0.5834670933858809, "learning_rate": 1.025840515400665e-07, "loss": 0.0069, "step": 2382 }, { "epoch": 4.65, "grad_norm": 0.5760685268030562, "learning_rate": 1.0144024423298487e-07, "loss": 0.0053, "step": 2383 }, { "epoch": 4.66, "grad_norm": 0.7433831742773176, "learning_rate": 1.0030275565358499e-07, "loss": 0.0067, "step": 2384 }, { "epoch": 4.66, "grad_norm": 0.9737709768277468, "learning_rate": 9.91715879167278e-08, "loss": 0.0074, "step": 2385 }, { "epoch": 4.66, "grad_norm": 0.7362827406544866, "learning_rate": 9.804674312552214e-08, "loss": 0.0075, "step": 2386 }, { "epoch": 4.66, "grad_norm": 0.7866304270572921, "learning_rate": 9.692822337132074e-08, "loss": 0.0072, "step": 2387 }, { "epoch": 4.66, "grad_norm": 0.7420939279109842, "learning_rate": 9.581603073371642e-08, "loss": 0.0116, "step": 2388 }, { "epoch": 4.67, "grad_norm": 0.40234617723778915, "learning_rate": 9.471016728053976e-08, "loss": 0.005, "step": 2389 }, { "epoch": 4.67, "grad_norm": 0.5368273526476893, "learning_rate": 9.361063506785172e-08, "loss": 0.0073, "step": 2390 }, { "epoch": 4.67, "grad_norm": 0.7478582873971582, "learning_rate": 9.251743613994395e-08, "loss": 0.0144, "step": 2391 }, { "epoch": 4.67, "grad_norm": 0.5255466615714839, "learning_rate": 9.143057252933229e-08, "loss": 0.0056, "step": 2392 }, { "epoch": 4.67, "grad_norm": 0.7608294662174574, "learning_rate": 9.035004625675319e-08, "loss": 0.0116, "step": 2393 }, { "epoch": 4.68, "grad_norm": 0.42159445701887793, "learning_rate": 8.927585933116144e-08, "loss": 0.0048, "step": 2394 }, { "epoch": 4.68, "grad_norm": 0.6388485192444422, "learning_rate": 8.82080137497243e-08, "loss": 0.009, "step": 2395 }, { "epoch": 4.68, "grad_norm": 0.6307294632184463, "learning_rate": 8.714651149782038e-08, "loss": 0.0106, "step": 2396 }, { "epoch": 4.68, "grad_norm": 0.6533021612487023, "learning_rate": 8.609135454903332e-08, "loss": 0.0086, "step": 2397 }, { "epoch": 4.68, "grad_norm": 0.571430565523492, "learning_rate": 8.504254486515039e-08, "loss": 0.0064, "step": 2398 }, { "epoch": 4.69, "grad_norm": 0.7628405279520145, "learning_rate": 8.400008439615653e-08, "loss": 0.0085, "step": 2399 }, { "epoch": 4.69, "grad_norm": 0.6354102811530968, "learning_rate": 8.296397508023323e-08, "loss": 0.0091, "step": 2400 }, { "epoch": 4.69, "grad_norm": 0.6066979253058663, "learning_rate": 8.193421884375312e-08, "loss": 0.0113, "step": 2401 }, { "epoch": 4.69, "grad_norm": 0.813720125773974, "learning_rate": 8.091081760127683e-08, "loss": 0.0065, "step": 2402 }, { "epoch": 4.69, "grad_norm": 0.44982274093368324, "learning_rate": 7.989377325554986e-08, "loss": 0.0054, "step": 2403 }, { "epoch": 4.7, "grad_norm": 0.5635965844738847, "learning_rate": 7.888308769749875e-08, "loss": 0.0079, "step": 2404 }, { "epoch": 4.7, "grad_norm": 0.6861004140428949, "learning_rate": 7.787876280622674e-08, "loss": 0.0086, "step": 2405 }, { "epoch": 4.7, "grad_norm": 0.8074093410916349, "learning_rate": 7.688080044901191e-08, "loss": 0.0061, "step": 2406 }, { "epoch": 4.7, "grad_norm": 0.7908883711230914, "learning_rate": 7.588920248130359e-08, "loss": 0.0135, "step": 2407 }, { "epoch": 4.7, "grad_norm": 0.4879579716104912, "learning_rate": 7.490397074671583e-08, "loss": 0.005, "step": 2408 }, { "epoch": 4.71, "grad_norm": 0.47379497276955934, "learning_rate": 7.392510707702892e-08, "loss": 0.0055, "step": 2409 }, { "epoch": 4.71, "grad_norm": 0.7724665091097025, "learning_rate": 7.2952613292182e-08, "loss": 0.0122, "step": 2410 }, { "epoch": 4.71, "grad_norm": 0.623941249627725, "learning_rate": 7.19864912002715e-08, "loss": 0.0076, "step": 2411 }, { "epoch": 4.71, "grad_norm": 0.32465434001442367, "learning_rate": 7.102674259754693e-08, "loss": 0.0039, "step": 2412 }, { "epoch": 4.71, "grad_norm": 0.5781388109864856, "learning_rate": 7.007336926840846e-08, "loss": 0.0057, "step": 2413 }, { "epoch": 4.71, "grad_norm": 0.5924228164628174, "learning_rate": 6.912637298540347e-08, "loss": 0.008, "step": 2414 }, { "epoch": 4.72, "grad_norm": 0.5411622635263504, "learning_rate": 6.818575550922112e-08, "loss": 0.008, "step": 2415 }, { "epoch": 4.72, "grad_norm": 0.7097326424413541, "learning_rate": 6.72515185886935e-08, "loss": 0.0071, "step": 2416 }, { "epoch": 4.72, "grad_norm": 0.6441263132474666, "learning_rate": 6.632366396078782e-08, "loss": 0.0073, "step": 2417 }, { "epoch": 4.72, "grad_norm": 0.49035759693180636, "learning_rate": 6.540219335060493e-08, "loss": 0.006, "step": 2418 }, { "epoch": 4.72, "grad_norm": 0.6444942771866286, "learning_rate": 6.44871084713785e-08, "loss": 0.0074, "step": 2419 }, { "epoch": 4.73, "grad_norm": 0.5388160812617293, "learning_rate": 6.357841102446649e-08, "loss": 0.0067, "step": 2420 }, { "epoch": 4.73, "grad_norm": 0.5870660472369815, "learning_rate": 6.267610269935419e-08, "loss": 0.0055, "step": 2421 }, { "epoch": 4.73, "grad_norm": 0.9814288988854892, "learning_rate": 6.178018517364503e-08, "loss": 0.0159, "step": 2422 }, { "epoch": 4.73, "grad_norm": 0.5973559345416917, "learning_rate": 6.089066011306354e-08, "loss": 0.0073, "step": 2423 }, { "epoch": 4.73, "grad_norm": 0.7074882943657331, "learning_rate": 6.000752917144614e-08, "loss": 0.0092, "step": 2424 }, { "epoch": 4.74, "grad_norm": 0.715158675873135, "learning_rate": 5.9130793990743004e-08, "loss": 0.0099, "step": 2425 }, { "epoch": 4.74, "grad_norm": 0.5781003802790617, "learning_rate": 5.8260456201012664e-08, "loss": 0.0072, "step": 2426 }, { "epoch": 4.74, "grad_norm": 0.5897956078844387, "learning_rate": 5.73965174204189e-08, "loss": 0.0062, "step": 2427 }, { "epoch": 4.74, "grad_norm": 0.6179543916382112, "learning_rate": 5.653897925522877e-08, "loss": 0.0091, "step": 2428 }, { "epoch": 4.74, "grad_norm": 0.7016893821207093, "learning_rate": 5.5687843299809524e-08, "loss": 0.008, "step": 2429 }, { "epoch": 4.75, "grad_norm": 0.43385360213059615, "learning_rate": 5.4843111136623545e-08, "loss": 0.0036, "step": 2430 }, { "epoch": 4.75, "grad_norm": 0.5129467256906228, "learning_rate": 5.400478433622835e-08, "loss": 0.0073, "step": 2431 }, { "epoch": 4.75, "grad_norm": 0.5587226465837027, "learning_rate": 5.3172864457271926e-08, "loss": 0.0065, "step": 2432 }, { "epoch": 4.75, "grad_norm": 0.6435223019206732, "learning_rate": 5.2347353046490795e-08, "loss": 0.007, "step": 2433 }, { "epoch": 4.75, "grad_norm": 0.4163067357677234, "learning_rate": 5.1528251638705724e-08, "loss": 0.0043, "step": 2434 }, { "epoch": 4.76, "grad_norm": 0.5666160943907882, "learning_rate": 5.071556175682057e-08, "loss": 0.0067, "step": 2435 }, { "epoch": 4.76, "grad_norm": 0.6759769983340403, "learning_rate": 4.990928491181839e-08, "loss": 0.0098, "step": 2436 }, { "epoch": 4.76, "grad_norm": 0.4149986106615274, "learning_rate": 4.9109422602758746e-08, "loss": 0.0032, "step": 2437 }, { "epoch": 4.76, "grad_norm": 0.3546879432707271, "learning_rate": 4.83159763167757e-08, "loss": 0.0036, "step": 2438 }, { "epoch": 4.76, "grad_norm": 0.6433217161335806, "learning_rate": 4.752894752907283e-08, "loss": 0.0062, "step": 2439 }, { "epoch": 4.77, "grad_norm": 0.7040632890152784, "learning_rate": 4.674833770292358e-08, "loss": 0.0095, "step": 2440 }, { "epoch": 4.77, "grad_norm": 0.5057220801434031, "learning_rate": 4.597414828966661e-08, "loss": 0.0054, "step": 2441 }, { "epoch": 4.77, "grad_norm": 0.6907903075857185, "learning_rate": 4.5206380728703474e-08, "loss": 0.0088, "step": 2442 }, { "epoch": 4.77, "grad_norm": 0.6291797468352361, "learning_rate": 4.444503644749548e-08, "loss": 0.0089, "step": 2443 }, { "epoch": 4.77, "grad_norm": 0.5436655514901892, "learning_rate": 4.369011686156293e-08, "loss": 0.0075, "step": 2444 }, { "epoch": 4.78, "grad_norm": 0.8213098552263824, "learning_rate": 4.294162337447932e-08, "loss": 0.011, "step": 2445 }, { "epoch": 4.78, "grad_norm": 0.5190747399939006, "learning_rate": 4.2199557377871676e-08, "loss": 0.0059, "step": 2446 }, { "epoch": 4.78, "grad_norm": 0.6462969229678435, "learning_rate": 4.146392025141671e-08, "loss": 0.0051, "step": 2447 }, { "epoch": 4.78, "grad_norm": 0.5858487755420504, "learning_rate": 4.073471336283768e-08, "loss": 0.0052, "step": 2448 }, { "epoch": 4.78, "grad_norm": 0.36386478777058745, "learning_rate": 4.0011938067902874e-08, "loss": 0.0042, "step": 2449 }, { "epoch": 4.79, "grad_norm": 0.5593622928101735, "learning_rate": 3.929559571042324e-08, "loss": 0.0099, "step": 2450 }, { "epoch": 4.79, "grad_norm": 0.7504241124477065, "learning_rate": 3.85856876222489e-08, "loss": 0.0149, "step": 2451 }, { "epoch": 4.79, "grad_norm": 0.590886491522491, "learning_rate": 3.788221512326645e-08, "loss": 0.0082, "step": 2452 }, { "epoch": 4.79, "grad_norm": 0.5596647503727203, "learning_rate": 3.718517952139894e-08, "loss": 0.0078, "step": 2453 }, { "epoch": 4.79, "grad_norm": 0.4414940494361714, "learning_rate": 3.6494582112600036e-08, "loss": 0.0048, "step": 2454 }, { "epoch": 4.79, "grad_norm": 0.6008700480900856, "learning_rate": 3.5810424180853674e-08, "loss": 0.0079, "step": 2455 }, { "epoch": 4.8, "grad_norm": 0.597744799132511, "learning_rate": 3.5132706998172444e-08, "loss": 0.0052, "step": 2456 }, { "epoch": 4.8, "grad_norm": 0.7165035937486808, "learning_rate": 3.4461431824592604e-08, "loss": 0.014, "step": 2457 }, { "epoch": 4.8, "grad_norm": 0.6400067118704326, "learning_rate": 3.3796599908173244e-08, "loss": 0.0097, "step": 2458 }, { "epoch": 4.8, "grad_norm": 0.7575196750183034, "learning_rate": 3.3138212484994764e-08, "loss": 0.01, "step": 2459 }, { "epoch": 4.8, "grad_norm": 0.6646579824838726, "learning_rate": 3.248627077915578e-08, "loss": 0.0111, "step": 2460 }, { "epoch": 4.81, "grad_norm": 0.5377334679964209, "learning_rate": 3.1840776002769965e-08, "loss": 0.0058, "step": 2461 }, { "epoch": 4.81, "grad_norm": 0.9058421382877891, "learning_rate": 3.1201729355964934e-08, "loss": 0.0117, "step": 2462 }, { "epoch": 4.81, "grad_norm": 0.7598461308407503, "learning_rate": 3.0569132026880276e-08, "loss": 0.007, "step": 2463 }, { "epoch": 4.81, "grad_norm": 0.7268803344535921, "learning_rate": 2.994298519166366e-08, "loss": 0.0126, "step": 2464 }, { "epoch": 4.81, "grad_norm": 0.5819843362666477, "learning_rate": 2.9323290014470483e-08, "loss": 0.0054, "step": 2465 }, { "epoch": 4.82, "grad_norm": 0.8223315939296895, "learning_rate": 2.871004764746149e-08, "loss": 0.0231, "step": 2466 }, { "epoch": 4.82, "grad_norm": 0.6040033535248468, "learning_rate": 2.8103259230798925e-08, "loss": 0.0081, "step": 2467 }, { "epoch": 4.82, "grad_norm": 0.7567079147835991, "learning_rate": 2.7502925892646135e-08, "loss": 0.0118, "step": 2468 }, { "epoch": 4.82, "grad_norm": 0.7457675428462858, "learning_rate": 2.6909048749165607e-08, "loss": 0.0088, "step": 2469 }, { "epoch": 4.82, "grad_norm": 0.5907784123672692, "learning_rate": 2.6321628904515114e-08, "loss": 0.0104, "step": 2470 }, { "epoch": 4.83, "grad_norm": 0.5875252869517189, "learning_rate": 2.5740667450847297e-08, "loss": 0.0076, "step": 2471 }, { "epoch": 4.83, "grad_norm": 0.4603332048690236, "learning_rate": 2.5166165468307356e-08, "loss": 0.0043, "step": 2472 }, { "epoch": 4.83, "grad_norm": 0.5883497307592532, "learning_rate": 2.45981240250307e-08, "loss": 0.0057, "step": 2473 }, { "epoch": 4.83, "grad_norm": 0.8394940697821736, "learning_rate": 2.403654417714024e-08, "loss": 0.0168, "step": 2474 }, { "epoch": 4.83, "grad_norm": 0.5743702333717633, "learning_rate": 2.3481426968747165e-08, "loss": 0.006, "step": 2475 }, { "epoch": 4.84, "grad_norm": 0.5829076615903376, "learning_rate": 2.293277343194472e-08, "loss": 0.0067, "step": 2476 }, { "epoch": 4.84, "grad_norm": 0.8031426490510309, "learning_rate": 2.2390584586810147e-08, "loss": 0.0176, "step": 2477 }, { "epoch": 4.84, "grad_norm": 0.5735485488632319, "learning_rate": 2.1854861441401195e-08, "loss": 0.0082, "step": 2478 }, { "epoch": 4.84, "grad_norm": 0.5645845490321463, "learning_rate": 2.132560499175379e-08, "loss": 0.0086, "step": 2479 }, { "epoch": 4.84, "grad_norm": 0.41918971491254015, "learning_rate": 2.0802816221881235e-08, "loss": 0.0043, "step": 2480 }, { "epoch": 4.85, "grad_norm": 0.6400080794020628, "learning_rate": 2.0286496103771922e-08, "loss": 0.0072, "step": 2481 }, { "epoch": 4.85, "grad_norm": 0.5518028219228481, "learning_rate": 1.9776645597386564e-08, "loss": 0.0034, "step": 2482 }, { "epoch": 4.85, "grad_norm": 0.6760728388731301, "learning_rate": 1.927326565065862e-08, "loss": 0.0081, "step": 2483 }, { "epoch": 4.85, "grad_norm": 0.5708848881600952, "learning_rate": 1.8776357199490778e-08, "loss": 0.0084, "step": 2484 }, { "epoch": 4.85, "grad_norm": 0.629903912307707, "learning_rate": 1.8285921167753403e-08, "loss": 0.0089, "step": 2485 }, { "epoch": 4.86, "grad_norm": 0.4932786505842596, "learning_rate": 1.780195846728261e-08, "loss": 0.0043, "step": 2486 }, { "epoch": 4.86, "grad_norm": 0.38084008605713293, "learning_rate": 1.732446999788023e-08, "loss": 0.0049, "step": 2487 }, { "epoch": 4.86, "grad_norm": 0.5594494886081243, "learning_rate": 1.6853456647311137e-08, "loss": 0.0071, "step": 2488 }, { "epoch": 4.86, "grad_norm": 0.7968424290887429, "learning_rate": 1.638891929129932e-08, "loss": 0.0128, "step": 2489 }, { "epoch": 4.86, "grad_norm": 0.8026723743226606, "learning_rate": 1.593085879353062e-08, "loss": 0.0069, "step": 2490 }, { "epoch": 4.87, "grad_norm": 0.5908927668269276, "learning_rate": 1.5479276005648467e-08, "loss": 0.0091, "step": 2491 }, { "epoch": 4.87, "grad_norm": 0.617116103122423, "learning_rate": 1.5034171767251135e-08, "loss": 0.0098, "step": 2492 }, { "epoch": 4.87, "grad_norm": 0.5847791881717312, "learning_rate": 1.459554690589332e-08, "loss": 0.0067, "step": 2493 }, { "epoch": 4.87, "grad_norm": 0.7353151955237581, "learning_rate": 1.4163402237083011e-08, "loss": 0.0125, "step": 2494 }, { "epoch": 4.87, "grad_norm": 0.5620116100754905, "learning_rate": 1.3737738564278789e-08, "loss": 0.0084, "step": 2495 }, { "epoch": 4.88, "grad_norm": 0.6416885597619675, "learning_rate": 1.3318556678890592e-08, "loss": 0.0092, "step": 2496 }, { "epoch": 4.88, "grad_norm": 0.5003108860029222, "learning_rate": 1.2905857360276996e-08, "loss": 0.004, "step": 2497 }, { "epoch": 4.88, "grad_norm": 0.5082574549335531, "learning_rate": 1.2499641375743664e-08, "loss": 0.0066, "step": 2498 }, { "epoch": 4.88, "grad_norm": 0.7022078299904477, "learning_rate": 1.2099909480542181e-08, "loss": 0.0129, "step": 2499 }, { "epoch": 4.88, "grad_norm": 0.7343750477953438, "learning_rate": 1.1706662417868885e-08, "loss": 0.008, "step": 2500 }, { "epoch": 4.88, "grad_norm": 0.6902620466123787, "learning_rate": 1.1319900918863313e-08, "loss": 0.0094, "step": 2501 }, { "epoch": 4.89, "grad_norm": 0.5015932761028837, "learning_rate": 1.0939625702607036e-08, "loss": 0.006, "step": 2502 }, { "epoch": 4.89, "grad_norm": 1.0128866519516877, "learning_rate": 1.0565837476121332e-08, "loss": 0.0088, "step": 2503 }, { "epoch": 4.89, "grad_norm": 0.6049428036050116, "learning_rate": 1.0198536934366786e-08, "loss": 0.0084, "step": 2504 }, { "epoch": 4.89, "grad_norm": 0.7093547025357841, "learning_rate": 9.837724760242916e-09, "loss": 0.0122, "step": 2505 }, { "epoch": 4.89, "grad_norm": 0.4102008018794113, "learning_rate": 9.483401624584276e-09, "loss": 0.0038, "step": 2506 }, { "epoch": 4.9, "grad_norm": 0.8037876778280714, "learning_rate": 9.135568186162012e-09, "loss": 0.0173, "step": 2507 }, { "epoch": 4.9, "grad_norm": 0.7400040294979336, "learning_rate": 8.794225091680763e-09, "loss": 0.0133, "step": 2508 }, { "epoch": 4.9, "grad_norm": 0.5216234601575764, "learning_rate": 8.459372975777868e-09, "loss": 0.0055, "step": 2509 }, { "epoch": 4.9, "grad_norm": 0.6209520383520167, "learning_rate": 8.13101246102338e-09, "loss": 0.0068, "step": 2510 }, { "epoch": 4.9, "grad_norm": 0.709914128557043, "learning_rate": 7.809144157916947e-09, "loss": 0.0139, "step": 2511 }, { "epoch": 4.91, "grad_norm": 0.5606710948766461, "learning_rate": 7.493768664887822e-09, "loss": 0.0065, "step": 2512 }, { "epoch": 4.91, "grad_norm": 0.7821002542003698, "learning_rate": 7.18488656829408e-09, "loss": 0.013, "step": 2513 }, { "epoch": 4.91, "grad_norm": 0.7110982125432198, "learning_rate": 6.882498442420282e-09, "loss": 0.0117, "step": 2514 }, { "epoch": 4.91, "grad_norm": 0.6803672647108202, "learning_rate": 6.586604849477873e-09, "loss": 0.0127, "step": 2515 }, { "epoch": 4.91, "grad_norm": 0.5481096416510047, "learning_rate": 6.2972063396032336e-09, "loss": 0.0055, "step": 2516 }, { "epoch": 4.92, "grad_norm": 0.8016408253988913, "learning_rate": 6.0143034508565175e-09, "loss": 0.0174, "step": 2517 }, { "epoch": 4.92, "grad_norm": 0.5181442214803618, "learning_rate": 5.737896709221257e-09, "loss": 0.0064, "step": 2518 }, { "epoch": 4.92, "grad_norm": 0.4478982128994506, "learning_rate": 5.467986628603205e-09, "loss": 0.0045, "step": 2519 }, { "epoch": 4.92, "grad_norm": 0.49028023405870635, "learning_rate": 5.204573710829163e-09, "loss": 0.0065, "step": 2520 }, { "epoch": 4.92, "grad_norm": 0.4644531569330712, "learning_rate": 4.947658445645819e-09, "loss": 0.0051, "step": 2521 }, { "epoch": 4.93, "grad_norm": 0.7539058597251096, "learning_rate": 4.697241310720135e-09, "loss": 0.0127, "step": 2522 }, { "epoch": 4.93, "grad_norm": 0.5961465793094147, "learning_rate": 4.453322771636236e-09, "loss": 0.0088, "step": 2523 }, { "epoch": 4.93, "grad_norm": 0.5643052017647299, "learning_rate": 4.2159032818965825e-09, "loss": 0.0092, "step": 2524 }, { "epoch": 4.93, "grad_norm": 0.6686739409448493, "learning_rate": 3.984983282920795e-09, "loss": 0.0103, "step": 2525 }, { "epoch": 4.93, "grad_norm": 0.6762848681205267, "learning_rate": 3.760563204042944e-09, "loss": 0.0131, "step": 2526 }, { "epoch": 4.94, "grad_norm": 0.6261200383967055, "learning_rate": 3.5426434625138724e-09, "loss": 0.0123, "step": 2527 }, { "epoch": 4.94, "grad_norm": 0.5478710329511033, "learning_rate": 3.331224463497706e-09, "loss": 0.0088, "step": 2528 }, { "epoch": 4.94, "grad_norm": 0.506257338463317, "learning_rate": 3.126306600072626e-09, "loss": 0.0067, "step": 2529 }, { "epoch": 4.94, "grad_norm": 0.7423105521603498, "learning_rate": 2.9278902532293148e-09, "loss": 0.015, "step": 2530 }, { "epoch": 4.94, "grad_norm": 0.6719684539688784, "learning_rate": 2.7359757918709593e-09, "loss": 0.0097, "step": 2531 }, { "epoch": 4.95, "grad_norm": 0.6248719295989908, "learning_rate": 2.5505635728116927e-09, "loss": 0.0106, "step": 2532 }, { "epoch": 4.95, "grad_norm": 0.44691156211009997, "learning_rate": 2.371653940776597e-09, "loss": 0.0042, "step": 2533 }, { "epoch": 4.95, "grad_norm": 0.6803814455764753, "learning_rate": 2.199247228401702e-09, "loss": 0.0117, "step": 2534 }, { "epoch": 4.95, "grad_norm": 0.7646627954120644, "learning_rate": 2.0333437562316535e-09, "loss": 0.0151, "step": 2535 }, { "epoch": 4.95, "grad_norm": 0.6561545216096235, "learning_rate": 1.873943832720104e-09, "loss": 0.0122, "step": 2536 }, { "epoch": 4.96, "grad_norm": 0.458676167205206, "learning_rate": 1.7210477542297098e-09, "loss": 0.0037, "step": 2537 }, { "epoch": 4.96, "grad_norm": 0.6255993192870032, "learning_rate": 1.5746558050298009e-09, "loss": 0.0092, "step": 2538 }, { "epoch": 4.96, "grad_norm": 0.6098144460854176, "learning_rate": 1.4347682572983244e-09, "loss": 0.0099, "step": 2539 }, { "epoch": 4.96, "grad_norm": 0.46680440057369726, "learning_rate": 1.3013853711191237e-09, "loss": 0.0051, "step": 2540 }, { "epoch": 4.96, "grad_norm": 0.39336148817996, "learning_rate": 1.1745073944827156e-09, "loss": 0.0036, "step": 2541 }, { "epoch": 4.96, "grad_norm": 0.6217222573154036, "learning_rate": 1.054134563285125e-09, "loss": 0.0103, "step": 2542 }, { "epoch": 4.97, "grad_norm": 0.7765375852591551, "learning_rate": 9.402671013282738e-10, "loss": 0.014, "step": 2543 }, { "epoch": 4.97, "grad_norm": 0.765536414620917, "learning_rate": 8.329052203180364e-10, "loss": 0.0145, "step": 2544 }, { "epoch": 4.97, "grad_norm": 0.5439211639886006, "learning_rate": 7.320491198665735e-10, "loss": 0.0063, "step": 2545 }, { "epoch": 4.97, "grad_norm": 0.4566196445414991, "learning_rate": 6.376989874884443e-10, "loss": 0.0071, "step": 2546 }, { "epoch": 4.97, "grad_norm": 0.5345307876553409, "learning_rate": 5.498549986033274e-10, "loss": 0.009, "step": 2547 }, { "epoch": 4.98, "grad_norm": 0.6431793667378372, "learning_rate": 4.685173165336897e-10, "loss": 0.0118, "step": 2548 }, { "epoch": 4.98, "grad_norm": 0.5546944352896954, "learning_rate": 3.9368609250595154e-10, "loss": 0.0077, "step": 2549 }, { "epoch": 4.98, "grad_norm": 0.586695724635473, "learning_rate": 3.253614656489323e-10, "loss": 0.0069, "step": 2550 }, { "epoch": 4.98, "grad_norm": 0.5688268201208754, "learning_rate": 2.6354356299423954e-10, "loss": 0.0096, "step": 2551 }, { "epoch": 4.98, "grad_norm": 0.716840436744899, "learning_rate": 2.0823249947587997e-10, "loss": 0.0145, "step": 2552 }, { "epoch": 4.99, "grad_norm": 0.539301350441711, "learning_rate": 1.5942837793025965e-10, "loss": 0.0061, "step": 2553 }, { "epoch": 4.99, "grad_norm": 0.8006199558084057, "learning_rate": 1.1713128909618397e-10, "loss": 0.0197, "step": 2554 }, { "epoch": 4.99, "grad_norm": 0.43925492152264967, "learning_rate": 8.134131161330327e-11, "loss": 0.0049, "step": 2555 }, { "epoch": 4.99, "grad_norm": 0.6484968919471484, "learning_rate": 5.205851202444434e-11, "loss": 0.009, "step": 2556 }, { "epoch": 4.99, "grad_norm": 0.6876739610623027, "learning_rate": 2.92829447728904e-11, "loss": 0.01, "step": 2557 }, { "epoch": 5.0, "grad_norm": 0.5755598805868147, "learning_rate": 1.3014652203546806e-11, "loss": 0.0082, "step": 2558 }, { "epoch": 5.0, "grad_norm": 0.697212696464128, "learning_rate": 3.25366456332965e-12, "loss": 0.0061, "step": 2559 }, { "epoch": 5.0, "grad_norm": 0.6190211719717219, "learning_rate": 0.0, "loss": 0.0094, "step": 2560 }, { "epoch": 5.0, "step": 2560, "total_flos": 0.0, "train_loss": 0.16601209418440704, "train_runtime": 5278.4751, "train_samples_per_second": 15.536, "train_steps_per_second": 0.485 } ], "logging_steps": 1.0, "max_steps": 2560, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }