diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4914 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.995695839311334, + "eval_steps": 500, + "global_step": 696, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00430416068866571, + "grad_norm": 5.716188907623291, + "learning_rate": 1.4285714285714287e-07, + "loss": 0.7934, + "step": 1 + }, + { + "epoch": 0.00860832137733142, + "grad_norm": 5.855449199676514, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.8156, + "step": 2 + }, + { + "epoch": 0.01291248206599713, + "grad_norm": 5.917946815490723, + "learning_rate": 4.285714285714286e-07, + "loss": 0.8216, + "step": 3 + }, + { + "epoch": 0.01721664275466284, + "grad_norm": 6.008902072906494, + "learning_rate": 5.714285714285715e-07, + "loss": 0.8243, + "step": 4 + }, + { + "epoch": 0.021520803443328552, + "grad_norm": 6.13193941116333, + "learning_rate": 7.142857142857143e-07, + "loss": 0.8402, + "step": 5 + }, + { + "epoch": 0.02582496413199426, + "grad_norm": 5.711304664611816, + "learning_rate": 8.571428571428572e-07, + "loss": 0.7946, + "step": 6 + }, + { + "epoch": 0.03012912482065997, + "grad_norm": 5.6730499267578125, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.8204, + "step": 7 + }, + { + "epoch": 0.03443328550932568, + "grad_norm": 5.452306747436523, + "learning_rate": 1.142857142857143e-06, + "loss": 0.8125, + "step": 8 + }, + { + "epoch": 0.03873744619799139, + "grad_norm": 4.5766520500183105, + "learning_rate": 1.2857142857142856e-06, + "loss": 0.7809, + "step": 9 + }, + { + "epoch": 0.043041606886657105, + "grad_norm": 4.3750481605529785, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.7808, + "step": 10 + }, + { + "epoch": 0.047345767575322814, + "grad_norm": 3.954547166824341, + "learning_rate": 1.5714285714285714e-06, + "loss": 0.7559, + "step": 11 + }, + { + "epoch": 0.05164992826398852, + "grad_norm": 2.556342363357544, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.734, + "step": 12 + }, + { + "epoch": 0.05595408895265423, + "grad_norm": 2.303656578063965, + "learning_rate": 1.8571428571428573e-06, + "loss": 0.7309, + "step": 13 + }, + { + "epoch": 0.06025824964131994, + "grad_norm": 2.232937812805176, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7365, + "step": 14 + }, + { + "epoch": 0.06456241032998565, + "grad_norm": 1.9870636463165283, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.7036, + "step": 15 + }, + { + "epoch": 0.06886657101865136, + "grad_norm": 1.6483427286148071, + "learning_rate": 2.285714285714286e-06, + "loss": 0.7078, + "step": 16 + }, + { + "epoch": 0.07317073170731707, + "grad_norm": 2.942274808883667, + "learning_rate": 2.428571428571429e-06, + "loss": 0.695, + "step": 17 + }, + { + "epoch": 0.07747489239598278, + "grad_norm": 3.4096434116363525, + "learning_rate": 2.571428571428571e-06, + "loss": 0.69, + "step": 18 + }, + { + "epoch": 0.08177905308464849, + "grad_norm": 3.4464306831359863, + "learning_rate": 2.7142857142857144e-06, + "loss": 0.6966, + "step": 19 + }, + { + "epoch": 0.08608321377331421, + "grad_norm": 3.2228825092315674, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.6711, + "step": 20 + }, + { + "epoch": 0.09038737446197992, + "grad_norm": 3.1530938148498535, + "learning_rate": 3e-06, + "loss": 0.7096, + "step": 21 + }, + { + "epoch": 0.09469153515064563, + "grad_norm": 2.6545050144195557, + "learning_rate": 3.142857142857143e-06, + "loss": 0.6677, + "step": 22 + }, + { + "epoch": 0.09899569583931134, + "grad_norm": 1.9846737384796143, + "learning_rate": 3.285714285714286e-06, + "loss": 0.7003, + "step": 23 + }, + { + "epoch": 0.10329985652797705, + "grad_norm": 1.4019083976745605, + "learning_rate": 3.428571428571429e-06, + "loss": 0.6554, + "step": 24 + }, + { + "epoch": 0.10760401721664276, + "grad_norm": 1.2153626680374146, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.6168, + "step": 25 + }, + { + "epoch": 0.11190817790530846, + "grad_norm": 1.1019742488861084, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.6308, + "step": 26 + }, + { + "epoch": 0.11621233859397417, + "grad_norm": 1.05548894405365, + "learning_rate": 3.857142857142858e-06, + "loss": 0.6174, + "step": 27 + }, + { + "epoch": 0.12051649928263988, + "grad_norm": 1.1632239818572998, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6314, + "step": 28 + }, + { + "epoch": 0.12482065997130559, + "grad_norm": 1.0276652574539185, + "learning_rate": 4.1428571428571435e-06, + "loss": 0.5832, + "step": 29 + }, + { + "epoch": 0.1291248206599713, + "grad_norm": 0.8600188493728638, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.6304, + "step": 30 + }, + { + "epoch": 0.133428981348637, + "grad_norm": 0.6312229037284851, + "learning_rate": 4.428571428571429e-06, + "loss": 0.6215, + "step": 31 + }, + { + "epoch": 0.13773314203730272, + "grad_norm": 0.6575469970703125, + "learning_rate": 4.571428571428572e-06, + "loss": 0.5946, + "step": 32 + }, + { + "epoch": 0.14203730272596843, + "grad_norm": 0.8034297227859497, + "learning_rate": 4.714285714285715e-06, + "loss": 0.5832, + "step": 33 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 0.8478543162345886, + "learning_rate": 4.857142857142858e-06, + "loss": 0.5804, + "step": 34 + }, + { + "epoch": 0.15064562410329985, + "grad_norm": 0.6925229430198669, + "learning_rate": 5e-06, + "loss": 0.5901, + "step": 35 + }, + { + "epoch": 0.15494978479196556, + "grad_norm": 0.637363076210022, + "learning_rate": 5.142857142857142e-06, + "loss": 0.6189, + "step": 36 + }, + { + "epoch": 0.15925394548063126, + "grad_norm": 0.6068949699401855, + "learning_rate": 5.285714285714286e-06, + "loss": 0.5708, + "step": 37 + }, + { + "epoch": 0.16355810616929697, + "grad_norm": 0.7009698748588562, + "learning_rate": 5.428571428571429e-06, + "loss": 0.5692, + "step": 38 + }, + { + "epoch": 0.1678622668579627, + "grad_norm": 0.5943602919578552, + "learning_rate": 5.571428571428572e-06, + "loss": 0.5631, + "step": 39 + }, + { + "epoch": 0.17216642754662842, + "grad_norm": 0.6478841304779053, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5581, + "step": 40 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.5670358538627625, + "learning_rate": 5.857142857142858e-06, + "loss": 0.5764, + "step": 41 + }, + { + "epoch": 0.18077474892395984, + "grad_norm": 0.47529032826423645, + "learning_rate": 6e-06, + "loss": 0.5248, + "step": 42 + }, + { + "epoch": 0.18507890961262555, + "grad_norm": 0.664316713809967, + "learning_rate": 6.142857142857144e-06, + "loss": 0.5609, + "step": 43 + }, + { + "epoch": 0.18938307030129126, + "grad_norm": 0.5606933236122131, + "learning_rate": 6.285714285714286e-06, + "loss": 0.5283, + "step": 44 + }, + { + "epoch": 0.19368723098995697, + "grad_norm": 0.5545839667320251, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.5505, + "step": 45 + }, + { + "epoch": 0.19799139167862267, + "grad_norm": 0.5529095530509949, + "learning_rate": 6.571428571428572e-06, + "loss": 0.5599, + "step": 46 + }, + { + "epoch": 0.20229555236728838, + "grad_norm": 0.5661711096763611, + "learning_rate": 6.714285714285714e-06, + "loss": 0.5741, + "step": 47 + }, + { + "epoch": 0.2065997130559541, + "grad_norm": 0.535641610622406, + "learning_rate": 6.857142857142858e-06, + "loss": 0.5149, + "step": 48 + }, + { + "epoch": 0.2109038737446198, + "grad_norm": 0.5135163068771362, + "learning_rate": 7e-06, + "loss": 0.5407, + "step": 49 + }, + { + "epoch": 0.2152080344332855, + "grad_norm": 0.46351608633995056, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.5325, + "step": 50 + }, + { + "epoch": 0.21951219512195122, + "grad_norm": 0.49139636754989624, + "learning_rate": 7.285714285714286e-06, + "loss": 0.5325, + "step": 51 + }, + { + "epoch": 0.22381635581061693, + "grad_norm": 0.4905931353569031, + "learning_rate": 7.428571428571429e-06, + "loss": 0.5468, + "step": 52 + }, + { + "epoch": 0.22812051649928264, + "grad_norm": 0.46065065264701843, + "learning_rate": 7.571428571428572e-06, + "loss": 0.5144, + "step": 53 + }, + { + "epoch": 0.23242467718794835, + "grad_norm": 0.42439892888069153, + "learning_rate": 7.714285714285716e-06, + "loss": 0.5478, + "step": 54 + }, + { + "epoch": 0.23672883787661406, + "grad_norm": 0.4570675194263458, + "learning_rate": 7.857142857142858e-06, + "loss": 0.5162, + "step": 55 + }, + { + "epoch": 0.24103299856527977, + "grad_norm": 0.36280250549316406, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4896, + "step": 56 + }, + { + "epoch": 0.24533715925394547, + "grad_norm": 0.42444807291030884, + "learning_rate": 8.142857142857143e-06, + "loss": 0.5184, + "step": 57 + }, + { + "epoch": 0.24964131994261118, + "grad_norm": 0.45756882429122925, + "learning_rate": 8.285714285714287e-06, + "loss": 0.5381, + "step": 58 + }, + { + "epoch": 0.2539454806312769, + "grad_norm": 0.42676690220832825, + "learning_rate": 8.428571428571429e-06, + "loss": 0.5344, + "step": 59 + }, + { + "epoch": 0.2582496413199426, + "grad_norm": 0.44801363348960876, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5177, + "step": 60 + }, + { + "epoch": 0.26255380200860834, + "grad_norm": 0.42160654067993164, + "learning_rate": 8.714285714285715e-06, + "loss": 0.5148, + "step": 61 + }, + { + "epoch": 0.266857962697274, + "grad_norm": 0.42496350407600403, + "learning_rate": 8.857142857142858e-06, + "loss": 0.5086, + "step": 62 + }, + { + "epoch": 0.27116212338593976, + "grad_norm": 0.3931051194667816, + "learning_rate": 9e-06, + "loss": 0.5029, + "step": 63 + }, + { + "epoch": 0.27546628407460544, + "grad_norm": 0.4256575405597687, + "learning_rate": 9.142857142857144e-06, + "loss": 0.5208, + "step": 64 + }, + { + "epoch": 0.2797704447632712, + "grad_norm": 0.4178571403026581, + "learning_rate": 9.285714285714288e-06, + "loss": 0.5016, + "step": 65 + }, + { + "epoch": 0.28407460545193686, + "grad_norm": 0.4503072202205658, + "learning_rate": 9.42857142857143e-06, + "loss": 0.4973, + "step": 66 + }, + { + "epoch": 0.2883787661406026, + "grad_norm": 0.4047144055366516, + "learning_rate": 9.571428571428573e-06, + "loss": 0.5038, + "step": 67 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 0.5516330003738403, + "learning_rate": 9.714285714285715e-06, + "loss": 0.5281, + "step": 68 + }, + { + "epoch": 0.296987087517934, + "grad_norm": 0.43566030263900757, + "learning_rate": 9.857142857142859e-06, + "loss": 0.5086, + "step": 69 + }, + { + "epoch": 0.3012912482065997, + "grad_norm": 0.41416284441947937, + "learning_rate": 1e-05, + "loss": 0.5203, + "step": 70 + }, + { + "epoch": 0.30559540889526543, + "grad_norm": 0.47125673294067383, + "learning_rate": 9.999937036309402e-06, + "loss": 0.498, + "step": 71 + }, + { + "epoch": 0.3098995695839311, + "grad_norm": 0.38754016160964966, + "learning_rate": 9.999748146823376e-06, + "loss": 0.4988, + "step": 72 + }, + { + "epoch": 0.31420373027259685, + "grad_norm": 0.4142284393310547, + "learning_rate": 9.999433336299195e-06, + "loss": 0.4885, + "step": 73 + }, + { + "epoch": 0.31850789096126253, + "grad_norm": 0.41517406702041626, + "learning_rate": 9.99899261266551e-06, + "loss": 0.4906, + "step": 74 + }, + { + "epoch": 0.32281205164992827, + "grad_norm": 0.3982820212841034, + "learning_rate": 9.99842598702216e-06, + "loss": 0.5063, + "step": 75 + }, + { + "epoch": 0.32711621233859395, + "grad_norm": 0.3950318396091461, + "learning_rate": 9.997733473639876e-06, + "loss": 0.4885, + "step": 76 + }, + { + "epoch": 0.3314203730272597, + "grad_norm": 0.40950194001197815, + "learning_rate": 9.996915089959942e-06, + "loss": 0.4966, + "step": 77 + }, + { + "epoch": 0.3357245337159254, + "grad_norm": 0.4177294373512268, + "learning_rate": 9.995970856593739e-06, + "loss": 0.5168, + "step": 78 + }, + { + "epoch": 0.3400286944045911, + "grad_norm": 0.4307166039943695, + "learning_rate": 9.994900797322233e-06, + "loss": 0.4935, + "step": 79 + }, + { + "epoch": 0.34433285509325684, + "grad_norm": 0.452928751707077, + "learning_rate": 9.993704939095376e-06, + "loss": 0.5037, + "step": 80 + }, + { + "epoch": 0.3486370157819225, + "grad_norm": 0.4000836908817291, + "learning_rate": 9.99238331203143e-06, + "loss": 0.4974, + "step": 81 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.3994639813899994, + "learning_rate": 9.9909359494162e-06, + "loss": 0.4869, + "step": 82 + }, + { + "epoch": 0.35724533715925394, + "grad_norm": 0.40204912424087524, + "learning_rate": 9.989362887702203e-06, + "loss": 0.4933, + "step": 83 + }, + { + "epoch": 0.3615494978479197, + "grad_norm": 0.4123329818248749, + "learning_rate": 9.987664166507749e-06, + "loss": 0.4969, + "step": 84 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 0.40340450406074524, + "learning_rate": 9.985839828615937e-06, + "loss": 0.5092, + "step": 85 + }, + { + "epoch": 0.3701578192252511, + "grad_norm": 0.3741258382797241, + "learning_rate": 9.983889919973586e-06, + "loss": 0.4925, + "step": 86 + }, + { + "epoch": 0.3744619799139168, + "grad_norm": 0.40813329815864563, + "learning_rate": 9.981814489690077e-06, + "loss": 0.4614, + "step": 87 + }, + { + "epoch": 0.3787661406025825, + "grad_norm": 0.4270995259284973, + "learning_rate": 9.979613590036108e-06, + "loss": 0.5068, + "step": 88 + }, + { + "epoch": 0.3830703012912482, + "grad_norm": 0.4126576781272888, + "learning_rate": 9.977287276442385e-06, + "loss": 0.5127, + "step": 89 + }, + { + "epoch": 0.38737446197991393, + "grad_norm": 0.41353291273117065, + "learning_rate": 9.974835607498224e-06, + "loss": 0.4714, + "step": 90 + }, + { + "epoch": 0.3916786226685796, + "grad_norm": 0.43727749586105347, + "learning_rate": 9.972258644950074e-06, + "loss": 0.5083, + "step": 91 + }, + { + "epoch": 0.39598278335724535, + "grad_norm": 0.4587208032608032, + "learning_rate": 9.969556453699966e-06, + "loss": 0.4958, + "step": 92 + }, + { + "epoch": 0.40028694404591103, + "grad_norm": 0.3831060528755188, + "learning_rate": 9.966729101803872e-06, + "loss": 0.4657, + "step": 93 + }, + { + "epoch": 0.40459110473457677, + "grad_norm": 0.4357382357120514, + "learning_rate": 9.963776660469996e-06, + "loss": 0.4826, + "step": 94 + }, + { + "epoch": 0.40889526542324245, + "grad_norm": 0.4969067871570587, + "learning_rate": 9.960699204056978e-06, + "loss": 0.4997, + "step": 95 + }, + { + "epoch": 0.4131994261119082, + "grad_norm": 0.4527207612991333, + "learning_rate": 9.957496810072027e-06, + "loss": 0.5259, + "step": 96 + }, + { + "epoch": 0.41750358680057387, + "grad_norm": 0.4595460891723633, + "learning_rate": 9.954169559168958e-06, + "loss": 0.5006, + "step": 97 + }, + { + "epoch": 0.4218077474892396, + "grad_norm": 0.5569490790367126, + "learning_rate": 9.95071753514617e-06, + "loss": 0.5234, + "step": 98 + }, + { + "epoch": 0.4261119081779053, + "grad_norm": 0.3885694146156311, + "learning_rate": 9.947140824944533e-06, + "loss": 0.4944, + "step": 99 + }, + { + "epoch": 0.430416068866571, + "grad_norm": 0.4560917019844055, + "learning_rate": 9.943439518645193e-06, + "loss": 0.4932, + "step": 100 + }, + { + "epoch": 0.4347202295552367, + "grad_norm": 0.4711724519729614, + "learning_rate": 9.939613709467317e-06, + "loss": 0.4882, + "step": 101 + }, + { + "epoch": 0.43902439024390244, + "grad_norm": 0.39771363139152527, + "learning_rate": 9.935663493765726e-06, + "loss": 0.4464, + "step": 102 + }, + { + "epoch": 0.4433285509325681, + "grad_norm": 0.4277415871620178, + "learning_rate": 9.93158897102849e-06, + "loss": 0.4761, + "step": 103 + }, + { + "epoch": 0.44763271162123386, + "grad_norm": 0.5095153450965881, + "learning_rate": 9.9273902438744e-06, + "loss": 0.48, + "step": 104 + }, + { + "epoch": 0.4519368723098996, + "grad_norm": 0.5148225426673889, + "learning_rate": 9.923067418050399e-06, + "loss": 0.4836, + "step": 105 + }, + { + "epoch": 0.4562410329985653, + "grad_norm": 0.48084282875061035, + "learning_rate": 9.918620602428916e-06, + "loss": 0.4873, + "step": 106 + }, + { + "epoch": 0.460545193687231, + "grad_norm": 0.47235897183418274, + "learning_rate": 9.91404990900512e-06, + "loss": 0.4939, + "step": 107 + }, + { + "epoch": 0.4648493543758967, + "grad_norm": 0.4564887285232544, + "learning_rate": 9.909355452894098e-06, + "loss": 0.4857, + "step": 108 + }, + { + "epoch": 0.46915351506456243, + "grad_norm": 0.5023767948150635, + "learning_rate": 9.904537352327968e-06, + "loss": 0.504, + "step": 109 + }, + { + "epoch": 0.4734576757532281, + "grad_norm": 0.44549015164375305, + "learning_rate": 9.899595728652883e-06, + "loss": 0.476, + "step": 110 + }, + { + "epoch": 0.47776183644189385, + "grad_norm": 0.4371887743473053, + "learning_rate": 9.894530706325994e-06, + "loss": 0.4842, + "step": 111 + }, + { + "epoch": 0.48206599713055953, + "grad_norm": 0.5438153147697449, + "learning_rate": 9.889342412912296e-06, + "loss": 0.4878, + "step": 112 + }, + { + "epoch": 0.48637015781922527, + "grad_norm": 0.5181624293327332, + "learning_rate": 9.88403097908143e-06, + "loss": 0.4924, + "step": 113 + }, + { + "epoch": 0.49067431850789095, + "grad_norm": 0.4505234658718109, + "learning_rate": 9.878596538604388e-06, + "loss": 0.5077, + "step": 114 + }, + { + "epoch": 0.4949784791965567, + "grad_norm": 0.546039342880249, + "learning_rate": 9.87303922835014e-06, + "loss": 0.5147, + "step": 115 + }, + { + "epoch": 0.49928263988522237, + "grad_norm": 0.45228955149650574, + "learning_rate": 9.867359188282193e-06, + "loss": 0.4933, + "step": 116 + }, + { + "epoch": 0.503586800573888, + "grad_norm": 0.4075925052165985, + "learning_rate": 9.861556561455061e-06, + "loss": 0.4635, + "step": 117 + }, + { + "epoch": 0.5078909612625538, + "grad_norm": 0.5900917649269104, + "learning_rate": 9.855631494010661e-06, + "loss": 0.495, + "step": 118 + }, + { + "epoch": 0.5121951219512195, + "grad_norm": 0.5058228969573975, + "learning_rate": 9.849584135174642e-06, + "loss": 0.5013, + "step": 119 + }, + { + "epoch": 0.5164992826398852, + "grad_norm": 0.526889443397522, + "learning_rate": 9.843414637252615e-06, + "loss": 0.4774, + "step": 120 + }, + { + "epoch": 0.5208034433285509, + "grad_norm": 0.5015537142753601, + "learning_rate": 9.837123155626323e-06, + "loss": 0.4835, + "step": 121 + }, + { + "epoch": 0.5251076040172167, + "grad_norm": 0.405774861574173, + "learning_rate": 9.830709848749727e-06, + "loss": 0.4766, + "step": 122 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.5407970547676086, + "learning_rate": 9.824174878145017e-06, + "loss": 0.4937, + "step": 123 + }, + { + "epoch": 0.533715925394548, + "grad_norm": 0.47976917028427124, + "learning_rate": 9.817518408398536e-06, + "loss": 0.4854, + "step": 124 + }, + { + "epoch": 0.5380200860832137, + "grad_norm": 0.49154919385910034, + "learning_rate": 9.810740607156647e-06, + "loss": 0.4837, + "step": 125 + }, + { + "epoch": 0.5423242467718795, + "grad_norm": 0.4680994749069214, + "learning_rate": 9.803841645121505e-06, + "loss": 0.4777, + "step": 126 + }, + { + "epoch": 0.5466284074605452, + "grad_norm": 0.47835245728492737, + "learning_rate": 9.796821696046748e-06, + "loss": 0.4878, + "step": 127 + }, + { + "epoch": 0.5509325681492109, + "grad_norm": 0.5058233737945557, + "learning_rate": 9.78968093673314e-06, + "loss": 0.4846, + "step": 128 + }, + { + "epoch": 0.5552367288378766, + "grad_norm": 0.4391472637653351, + "learning_rate": 9.782419547024108e-06, + "loss": 0.4835, + "step": 129 + }, + { + "epoch": 0.5595408895265424, + "grad_norm": 0.5265413522720337, + "learning_rate": 9.775037709801206e-06, + "loss": 0.5061, + "step": 130 + }, + { + "epoch": 0.563845050215208, + "grad_norm": 0.48637405037879944, + "learning_rate": 9.76753561097952e-06, + "loss": 0.4811, + "step": 131 + }, + { + "epoch": 0.5681492109038737, + "grad_norm": 0.4475197494029999, + "learning_rate": 9.759913439502982e-06, + "loss": 0.4518, + "step": 132 + }, + { + "epoch": 0.5724533715925395, + "grad_norm": 0.4481680393218994, + "learning_rate": 9.752171387339612e-06, + "loss": 0.4864, + "step": 133 + }, + { + "epoch": 0.5767575322812052, + "grad_norm": 0.4027429521083832, + "learning_rate": 9.74430964947668e-06, + "loss": 0.4815, + "step": 134 + }, + { + "epoch": 0.5810616929698709, + "grad_norm": 0.4797030985355377, + "learning_rate": 9.736328423915797e-06, + "loss": 0.4953, + "step": 135 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 0.4120473861694336, + "learning_rate": 9.728227911667934e-06, + "loss": 0.4841, + "step": 136 + }, + { + "epoch": 0.5896700143472023, + "grad_norm": 0.46151697635650635, + "learning_rate": 9.720008316748344e-06, + "loss": 0.4704, + "step": 137 + }, + { + "epoch": 0.593974175035868, + "grad_norm": 0.4600050151348114, + "learning_rate": 9.711669846171443e-06, + "loss": 0.4652, + "step": 138 + }, + { + "epoch": 0.5982783357245337, + "grad_norm": 0.4155121147632599, + "learning_rate": 9.703212709945583e-06, + "loss": 0.4754, + "step": 139 + }, + { + "epoch": 0.6025824964131994, + "grad_norm": 0.4314727783203125, + "learning_rate": 9.694637121067764e-06, + "loss": 0.4822, + "step": 140 + }, + { + "epoch": 0.6068866571018652, + "grad_norm": 0.5324988961219788, + "learning_rate": 9.685943295518283e-06, + "loss": 0.4795, + "step": 141 + }, + { + "epoch": 0.6111908177905309, + "grad_norm": 0.45600083470344543, + "learning_rate": 9.677131452255272e-06, + "loss": 0.4697, + "step": 142 + }, + { + "epoch": 0.6154949784791965, + "grad_norm": 0.5205792188644409, + "learning_rate": 9.668201813209202e-06, + "loss": 0.4803, + "step": 143 + }, + { + "epoch": 0.6197991391678622, + "grad_norm": 0.4811612069606781, + "learning_rate": 9.659154603277283e-06, + "loss": 0.4715, + "step": 144 + }, + { + "epoch": 0.624103299856528, + "grad_norm": 0.4459843933582306, + "learning_rate": 9.649990050317806e-06, + "loss": 0.4775, + "step": 145 + }, + { + "epoch": 0.6284074605451937, + "grad_norm": 0.5670981407165527, + "learning_rate": 9.640708385144403e-06, + "loss": 0.4789, + "step": 146 + }, + { + "epoch": 0.6327116212338594, + "grad_norm": 0.5041674971580505, + "learning_rate": 9.631309841520233e-06, + "loss": 0.4936, + "step": 147 + }, + { + "epoch": 0.6370157819225251, + "grad_norm": 0.5009613633155823, + "learning_rate": 9.62179465615209e-06, + "loss": 0.5072, + "step": 148 + }, + { + "epoch": 0.6413199426111909, + "grad_norm": 0.5727151036262512, + "learning_rate": 9.612163068684453e-06, + "loss": 0.4764, + "step": 149 + }, + { + "epoch": 0.6456241032998565, + "grad_norm": 0.6043273210525513, + "learning_rate": 9.602415321693434e-06, + "loss": 0.4818, + "step": 150 + }, + { + "epoch": 0.6499282639885222, + "grad_norm": 0.445709764957428, + "learning_rate": 9.592551660680687e-06, + "loss": 0.465, + "step": 151 + }, + { + "epoch": 0.6542324246771879, + "grad_norm": 0.6107316613197327, + "learning_rate": 9.582572334067213e-06, + "loss": 0.49, + "step": 152 + }, + { + "epoch": 0.6585365853658537, + "grad_norm": 0.4649333655834198, + "learning_rate": 9.572477593187101e-06, + "loss": 0.4583, + "step": 153 + }, + { + "epoch": 0.6628407460545194, + "grad_norm": 0.499577134847641, + "learning_rate": 9.562267692281212e-06, + "loss": 0.4864, + "step": 154 + }, + { + "epoch": 0.667144906743185, + "grad_norm": 0.5428804755210876, + "learning_rate": 9.551942888490759e-06, + "loss": 0.4746, + "step": 155 + }, + { + "epoch": 0.6714490674318508, + "grad_norm": 0.48842427134513855, + "learning_rate": 9.541503441850844e-06, + "loss": 0.5099, + "step": 156 + }, + { + "epoch": 0.6757532281205165, + "grad_norm": 0.4386967420578003, + "learning_rate": 9.530949615283902e-06, + "loss": 0.478, + "step": 157 + }, + { + "epoch": 0.6800573888091822, + "grad_norm": 0.48663437366485596, + "learning_rate": 9.520281674593084e-06, + "loss": 0.4968, + "step": 158 + }, + { + "epoch": 0.6843615494978479, + "grad_norm": 0.4161711633205414, + "learning_rate": 9.509499888455554e-06, + "loss": 0.4686, + "step": 159 + }, + { + "epoch": 0.6886657101865137, + "grad_norm": 0.4658878445625305, + "learning_rate": 9.498604528415731e-06, + "loss": 0.4669, + "step": 160 + }, + { + "epoch": 0.6929698708751794, + "grad_norm": 0.33149388432502747, + "learning_rate": 9.487595868878447e-06, + "loss": 0.4801, + "step": 161 + }, + { + "epoch": 0.697274031563845, + "grad_norm": 0.4957387447357178, + "learning_rate": 9.476474187102033e-06, + "loss": 0.4546, + "step": 162 + }, + { + "epoch": 0.7015781922525107, + "grad_norm": 0.4245833158493042, + "learning_rate": 9.465239763191345e-06, + "loss": 0.4939, + "step": 163 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.4390011131763458, + "learning_rate": 9.453892880090696e-06, + "loss": 0.4745, + "step": 164 + }, + { + "epoch": 0.7101865136298422, + "grad_norm": 0.5233902931213379, + "learning_rate": 9.442433823576741e-06, + "loss": 0.488, + "step": 165 + }, + { + "epoch": 0.7144906743185079, + "grad_norm": 0.4004855155944824, + "learning_rate": 9.430862882251279e-06, + "loss": 0.4666, + "step": 166 + }, + { + "epoch": 0.7187948350071736, + "grad_norm": 0.48547080159187317, + "learning_rate": 9.419180347533976e-06, + "loss": 0.475, + "step": 167 + }, + { + "epoch": 0.7230989956958394, + "grad_norm": 0.5068047046661377, + "learning_rate": 9.40738651365503e-06, + "loss": 0.4952, + "step": 168 + }, + { + "epoch": 0.727403156384505, + "grad_norm": 0.49898117780685425, + "learning_rate": 9.395481677647767e-06, + "loss": 0.4736, + "step": 169 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.41030001640319824, + "learning_rate": 9.38346613934115e-06, + "loss": 0.4796, + "step": 170 + }, + { + "epoch": 0.7360114777618364, + "grad_norm": 0.47655925154685974, + "learning_rate": 9.371340201352234e-06, + "loss": 0.4819, + "step": 171 + }, + { + "epoch": 0.7403156384505022, + "grad_norm": 0.4729374945163727, + "learning_rate": 9.359104169078541e-06, + "loss": 0.4819, + "step": 172 + }, + { + "epoch": 0.7446197991391679, + "grad_norm": 0.45545288920402527, + "learning_rate": 9.346758350690373e-06, + "loss": 0.4931, + "step": 173 + }, + { + "epoch": 0.7489239598278336, + "grad_norm": 0.4552542269229889, + "learning_rate": 9.334303057123044e-06, + "loss": 0.4707, + "step": 174 + }, + { + "epoch": 0.7532281205164992, + "grad_norm": 0.45603325963020325, + "learning_rate": 9.321738602069057e-06, + "loss": 0.4824, + "step": 175 + }, + { + "epoch": 0.757532281205165, + "grad_norm": 0.5039983987808228, + "learning_rate": 9.309065301970193e-06, + "loss": 0.4862, + "step": 176 + }, + { + "epoch": 0.7618364418938307, + "grad_norm": 0.4347105622291565, + "learning_rate": 9.296283476009551e-06, + "loss": 0.481, + "step": 177 + }, + { + "epoch": 0.7661406025824964, + "grad_norm": 0.507379949092865, + "learning_rate": 9.283393446103506e-06, + "loss": 0.4724, + "step": 178 + }, + { + "epoch": 0.7704447632711621, + "grad_norm": 0.44477155804634094, + "learning_rate": 9.270395536893599e-06, + "loss": 0.4549, + "step": 179 + }, + { + "epoch": 0.7747489239598279, + "grad_norm": 0.4589724540710449, + "learning_rate": 9.257290075738365e-06, + "loss": 0.4739, + "step": 180 + }, + { + "epoch": 0.7790530846484935, + "grad_norm": 0.3952377140522003, + "learning_rate": 9.244077392705085e-06, + "loss": 0.4385, + "step": 181 + }, + { + "epoch": 0.7833572453371592, + "grad_norm": 0.4503597021102905, + "learning_rate": 9.23075782056147e-06, + "loss": 0.4746, + "step": 182 + }, + { + "epoch": 0.787661406025825, + "grad_norm": 0.44907307624816895, + "learning_rate": 9.217331694767291e-06, + "loss": 0.4657, + "step": 183 + }, + { + "epoch": 0.7919655667144907, + "grad_norm": 0.4200761318206787, + "learning_rate": 9.20379935346592e-06, + "loss": 0.4742, + "step": 184 + }, + { + "epoch": 0.7962697274031564, + "grad_norm": 0.4928026795387268, + "learning_rate": 9.190161137475814e-06, + "loss": 0.4548, + "step": 185 + }, + { + "epoch": 0.8005738880918221, + "grad_norm": 0.41791635751724243, + "learning_rate": 9.176417390281944e-06, + "loss": 0.4717, + "step": 186 + }, + { + "epoch": 0.8048780487804879, + "grad_norm": 0.4696124792098999, + "learning_rate": 9.162568458027122e-06, + "loss": 0.4748, + "step": 187 + }, + { + "epoch": 0.8091822094691535, + "grad_norm": 0.41637182235717773, + "learning_rate": 9.148614689503307e-06, + "loss": 0.4905, + "step": 188 + }, + { + "epoch": 0.8134863701578192, + "grad_norm": 0.42210012674331665, + "learning_rate": 9.134556436142801e-06, + "loss": 0.4865, + "step": 189 + }, + { + "epoch": 0.8177905308464849, + "grad_norm": 0.3945121169090271, + "learning_rate": 9.120394052009412e-06, + "loss": 0.4805, + "step": 190 + }, + { + "epoch": 0.8220946915351507, + "grad_norm": 0.4084663987159729, + "learning_rate": 9.10612789378953e-06, + "loss": 0.4546, + "step": 191 + }, + { + "epoch": 0.8263988522238164, + "grad_norm": 0.39801859855651855, + "learning_rate": 9.091758320783139e-06, + "loss": 0.4813, + "step": 192 + }, + { + "epoch": 0.830703012912482, + "grad_norm": 0.374301940202713, + "learning_rate": 9.077285694894786e-06, + "loss": 0.4629, + "step": 193 + }, + { + "epoch": 0.8350071736011477, + "grad_norm": 0.34360548853874207, + "learning_rate": 9.062710380624439e-06, + "loss": 0.437, + "step": 194 + }, + { + "epoch": 0.8393113342898135, + "grad_norm": 0.40194225311279297, + "learning_rate": 9.048032745058335e-06, + "loss": 0.4503, + "step": 195 + }, + { + "epoch": 0.8436154949784792, + "grad_norm": 0.38807958364486694, + "learning_rate": 9.033253157859715e-06, + "loss": 0.4817, + "step": 196 + }, + { + "epoch": 0.8479196556671449, + "grad_norm": 0.3624962270259857, + "learning_rate": 9.018371991259516e-06, + "loss": 0.4466, + "step": 197 + }, + { + "epoch": 0.8522238163558106, + "grad_norm": 0.40797358751296997, + "learning_rate": 9.003389620047012e-06, + "loss": 0.4774, + "step": 198 + }, + { + "epoch": 0.8565279770444764, + "grad_norm": 0.40521928668022156, + "learning_rate": 8.988306421560354e-06, + "loss": 0.4807, + "step": 199 + }, + { + "epoch": 0.860832137733142, + "grad_norm": 0.4247490465641022, + "learning_rate": 8.973122775677078e-06, + "loss": 0.4626, + "step": 200 + }, + { + "epoch": 0.8651362984218077, + "grad_norm": 0.49826568365097046, + "learning_rate": 8.957839064804542e-06, + "loss": 0.4802, + "step": 201 + }, + { + "epoch": 0.8694404591104734, + "grad_norm": 0.4205709397792816, + "learning_rate": 8.942455673870278e-06, + "loss": 0.4867, + "step": 202 + }, + { + "epoch": 0.8737446197991392, + "grad_norm": 0.42230424284935, + "learning_rate": 8.926972990312314e-06, + "loss": 0.4918, + "step": 203 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 0.4043838381767273, + "learning_rate": 8.91139140406941e-06, + "loss": 0.4936, + "step": 204 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.38208943605422974, + "learning_rate": 8.895711307571235e-06, + "loss": 0.4485, + "step": 205 + }, + { + "epoch": 0.8866571018651362, + "grad_norm": 0.41558074951171875, + "learning_rate": 8.879933095728485e-06, + "loss": 0.4658, + "step": 206 + }, + { + "epoch": 0.890961262553802, + "grad_norm": 0.3710712492465973, + "learning_rate": 8.864057165922944e-06, + "loss": 0.4531, + "step": 207 + }, + { + "epoch": 0.8952654232424677, + "grad_norm": 0.43846508860588074, + "learning_rate": 8.848083917997463e-06, + "loss": 0.4754, + "step": 208 + }, + { + "epoch": 0.8995695839311334, + "grad_norm": 0.44018861651420593, + "learning_rate": 8.832013754245895e-06, + "loss": 0.4828, + "step": 209 + }, + { + "epoch": 0.9038737446197992, + "grad_norm": 0.47094517946243286, + "learning_rate": 8.815847079402972e-06, + "loss": 0.4455, + "step": 210 + }, + { + "epoch": 0.9081779053084649, + "grad_norm": 0.40249302983283997, + "learning_rate": 8.799584300634096e-06, + "loss": 0.4841, + "step": 211 + }, + { + "epoch": 0.9124820659971306, + "grad_norm": 0.4624997675418854, + "learning_rate": 8.783225827525098e-06, + "loss": 0.4533, + "step": 212 + }, + { + "epoch": 0.9167862266857962, + "grad_norm": 0.4431861639022827, + "learning_rate": 8.766772072071911e-06, + "loss": 0.4676, + "step": 213 + }, + { + "epoch": 0.921090387374462, + "grad_norm": 0.4086032211780548, + "learning_rate": 8.750223448670204e-06, + "loss": 0.4625, + "step": 214 + }, + { + "epoch": 0.9253945480631277, + "grad_norm": 0.496051162481308, + "learning_rate": 8.733580374104936e-06, + "loss": 0.4569, + "step": 215 + }, + { + "epoch": 0.9296987087517934, + "grad_norm": 0.545372724533081, + "learning_rate": 8.716843267539868e-06, + "loss": 0.4738, + "step": 216 + }, + { + "epoch": 0.9340028694404591, + "grad_norm": 0.45664075016975403, + "learning_rate": 8.700012550507e-06, + "loss": 0.4887, + "step": 217 + }, + { + "epoch": 0.9383070301291249, + "grad_norm": 0.4230859577655792, + "learning_rate": 8.683088646895955e-06, + "loss": 0.4349, + "step": 218 + }, + { + "epoch": 0.9426111908177905, + "grad_norm": 0.453646719455719, + "learning_rate": 8.666071982943306e-06, + "loss": 0.453, + "step": 219 + }, + { + "epoch": 0.9469153515064562, + "grad_norm": 0.38937658071517944, + "learning_rate": 8.648962987221837e-06, + "loss": 0.4767, + "step": 220 + }, + { + "epoch": 0.9512195121951219, + "grad_norm": 0.4459855258464813, + "learning_rate": 8.631762090629756e-06, + "loss": 0.4814, + "step": 221 + }, + { + "epoch": 0.9555236728837877, + "grad_norm": 0.3893451392650604, + "learning_rate": 8.614469726379833e-06, + "loss": 0.4761, + "step": 222 + }, + { + "epoch": 0.9598278335724534, + "grad_norm": 0.37297987937927246, + "learning_rate": 8.597086329988498e-06, + "loss": 0.4673, + "step": 223 + }, + { + "epoch": 0.9641319942611191, + "grad_norm": 0.46203163266181946, + "learning_rate": 8.579612339264867e-06, + "loss": 0.4946, + "step": 224 + }, + { + "epoch": 0.9684361549497847, + "grad_norm": 0.42642268538475037, + "learning_rate": 8.562048194299719e-06, + "loss": 0.4776, + "step": 225 + }, + { + "epoch": 0.9727403156384505, + "grad_norm": 0.3741326928138733, + "learning_rate": 8.544394337454409e-06, + "loss": 0.4632, + "step": 226 + }, + { + "epoch": 0.9770444763271162, + "grad_norm": 0.39677464962005615, + "learning_rate": 8.52665121334973e-06, + "loss": 0.4821, + "step": 227 + }, + { + "epoch": 0.9813486370157819, + "grad_norm": 0.42544877529144287, + "learning_rate": 8.508819268854713e-06, + "loss": 0.4893, + "step": 228 + }, + { + "epoch": 0.9856527977044476, + "grad_norm": 0.4025561213493347, + "learning_rate": 8.49089895307537e-06, + "loss": 0.4682, + "step": 229 + }, + { + "epoch": 0.9899569583931134, + "grad_norm": 0.4395465552806854, + "learning_rate": 8.472890717343391e-06, + "loss": 0.4558, + "step": 230 + }, + { + "epoch": 0.994261119081779, + "grad_norm": 0.3767266571521759, + "learning_rate": 8.454795015204767e-06, + "loss": 0.4616, + "step": 231 + }, + { + "epoch": 0.9985652797704447, + "grad_norm": 0.42802631855010986, + "learning_rate": 8.436612302408376e-06, + "loss": 0.4594, + "step": 232 + }, + { + "epoch": 1.0028694404591105, + "grad_norm": 0.7791727781295776, + "learning_rate": 8.418343036894497e-06, + "loss": 0.7338, + "step": 233 + }, + { + "epoch": 1.007173601147776, + "grad_norm": 0.446108341217041, + "learning_rate": 8.399987678783285e-06, + "loss": 0.4693, + "step": 234 + }, + { + "epoch": 1.011477761836442, + "grad_norm": 0.44180288910865784, + "learning_rate": 8.381546690363174e-06, + "loss": 0.4326, + "step": 235 + }, + { + "epoch": 1.0157819225251077, + "grad_norm": 0.4678577184677124, + "learning_rate": 8.36302053607924e-06, + "loss": 0.4729, + "step": 236 + }, + { + "epoch": 1.0200860832137733, + "grad_norm": 0.4438740015029907, + "learning_rate": 8.344409682521499e-06, + "loss": 0.4454, + "step": 237 + }, + { + "epoch": 1.024390243902439, + "grad_norm": 0.46316948533058167, + "learning_rate": 8.325714598413169e-06, + "loss": 0.4322, + "step": 238 + }, + { + "epoch": 1.0286944045911048, + "grad_norm": 0.3941633403301239, + "learning_rate": 8.306935754598838e-06, + "loss": 0.4176, + "step": 239 + }, + { + "epoch": 1.0329985652797704, + "grad_norm": 0.4438393712043762, + "learning_rate": 8.288073624032634e-06, + "loss": 0.4339, + "step": 240 + }, + { + "epoch": 1.0373027259684362, + "grad_norm": 0.440329372882843, + "learning_rate": 8.269128681766296e-06, + "loss": 0.4535, + "step": 241 + }, + { + "epoch": 1.0416068866571018, + "grad_norm": 0.4060475826263428, + "learning_rate": 8.250101404937223e-06, + "loss": 0.4235, + "step": 242 + }, + { + "epoch": 1.0459110473457676, + "grad_norm": 0.41474077105522156, + "learning_rate": 8.230992272756438e-06, + "loss": 0.4531, + "step": 243 + }, + { + "epoch": 1.0502152080344334, + "grad_norm": 0.44152936339378357, + "learning_rate": 8.211801766496537e-06, + "loss": 0.4601, + "step": 244 + }, + { + "epoch": 1.054519368723099, + "grad_norm": 0.42042216658592224, + "learning_rate": 8.192530369479562e-06, + "loss": 0.4376, + "step": 245 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.49132099747657776, + "learning_rate": 8.17317856706482e-06, + "loss": 0.4598, + "step": 246 + }, + { + "epoch": 1.0631276901004305, + "grad_norm": 0.44164198637008667, + "learning_rate": 8.153746846636675e-06, + "loss": 0.427, + "step": 247 + }, + { + "epoch": 1.067431850789096, + "grad_norm": 0.4028110206127167, + "learning_rate": 8.13423569759226e-06, + "loss": 0.435, + "step": 248 + }, + { + "epoch": 1.0717360114777619, + "grad_norm": 0.41196516156196594, + "learning_rate": 8.114645611329152e-06, + "loss": 0.4283, + "step": 249 + }, + { + "epoch": 1.0760401721664274, + "grad_norm": 0.42356353998184204, + "learning_rate": 8.094977081233006e-06, + "loss": 0.4165, + "step": 250 + }, + { + "epoch": 1.0803443328550932, + "grad_norm": 0.44623392820358276, + "learning_rate": 8.075230602665118e-06, + "loss": 0.4703, + "step": 251 + }, + { + "epoch": 1.084648493543759, + "grad_norm": 0.42129871249198914, + "learning_rate": 8.055406672949957e-06, + "loss": 0.449, + "step": 252 + }, + { + "epoch": 1.0889526542324246, + "grad_norm": 0.44216564297676086, + "learning_rate": 8.03550579136263e-06, + "loss": 0.4292, + "step": 253 + }, + { + "epoch": 1.0932568149210904, + "grad_norm": 0.4126872718334198, + "learning_rate": 8.015528459116321e-06, + "loss": 0.4383, + "step": 254 + }, + { + "epoch": 1.0975609756097562, + "grad_norm": 0.44940850138664246, + "learning_rate": 7.995475179349657e-06, + "loss": 0.4499, + "step": 255 + }, + { + "epoch": 1.1018651362984218, + "grad_norm": 0.4903542995452881, + "learning_rate": 7.975346457114034e-06, + "loss": 0.4388, + "step": 256 + }, + { + "epoch": 1.1061692969870875, + "grad_norm": 0.5634150505065918, + "learning_rate": 7.955142799360914e-06, + "loss": 0.4649, + "step": 257 + }, + { + "epoch": 1.1104734576757531, + "grad_norm": 0.3710925877094269, + "learning_rate": 7.934864714929036e-06, + "loss": 0.4136, + "step": 258 + }, + { + "epoch": 1.114777618364419, + "grad_norm": 0.5903193354606628, + "learning_rate": 7.914512714531612e-06, + "loss": 0.4384, + "step": 259 + }, + { + "epoch": 1.1190817790530847, + "grad_norm": 0.435581773519516, + "learning_rate": 7.894087310743468e-06, + "loss": 0.4433, + "step": 260 + }, + { + "epoch": 1.1233859397417503, + "grad_norm": 0.3968298137187958, + "learning_rate": 7.873589017988124e-06, + "loss": 0.4362, + "step": 261 + }, + { + "epoch": 1.127690100430416, + "grad_norm": 0.38424989581108093, + "learning_rate": 7.853018352524845e-06, + "loss": 0.4054, + "step": 262 + }, + { + "epoch": 1.1319942611190819, + "grad_norm": 0.41407302021980286, + "learning_rate": 7.832375832435637e-06, + "loss": 0.4461, + "step": 263 + }, + { + "epoch": 1.1362984218077474, + "grad_norm": 0.40711095929145813, + "learning_rate": 7.811661977612202e-06, + "loss": 0.4898, + "step": 264 + }, + { + "epoch": 1.1406025824964132, + "grad_norm": 0.41025951504707336, + "learning_rate": 7.790877309742833e-06, + "loss": 0.4188, + "step": 265 + }, + { + "epoch": 1.144906743185079, + "grad_norm": 0.49598750472068787, + "learning_rate": 7.770022352299294e-06, + "loss": 0.4233, + "step": 266 + }, + { + "epoch": 1.1492109038737446, + "grad_norm": 0.41980111598968506, + "learning_rate": 7.749097630523618e-06, + "loss": 0.449, + "step": 267 + }, + { + "epoch": 1.1535150645624104, + "grad_norm": 0.37297967076301575, + "learning_rate": 7.728103671414889e-06, + "loss": 0.444, + "step": 268 + }, + { + "epoch": 1.157819225251076, + "grad_norm": 0.4400876462459564, + "learning_rate": 7.707041003715962e-06, + "loss": 0.4478, + "step": 269 + }, + { + "epoch": 1.1621233859397417, + "grad_norm": 0.40423017740249634, + "learning_rate": 7.685910157900158e-06, + "loss": 0.4579, + "step": 270 + }, + { + "epoch": 1.1664275466284075, + "grad_norm": 0.3636547029018402, + "learning_rate": 7.66471166615789e-06, + "loss": 0.4137, + "step": 271 + }, + { + "epoch": 1.170731707317073, + "grad_norm": 0.37857159972190857, + "learning_rate": 7.643446062383273e-06, + "loss": 0.4582, + "step": 272 + }, + { + "epoch": 1.175035868005739, + "grad_norm": 0.37849536538124084, + "learning_rate": 7.622113882160658e-06, + "loss": 0.4222, + "step": 273 + }, + { + "epoch": 1.1793400286944047, + "grad_norm": 0.36707597970962524, + "learning_rate": 7.600715662751166e-06, + "loss": 0.4181, + "step": 274 + }, + { + "epoch": 1.1836441893830703, + "grad_norm": 0.4041919708251953, + "learning_rate": 7.579251943079145e-06, + "loss": 0.4115, + "step": 275 + }, + { + "epoch": 1.187948350071736, + "grad_norm": 0.4002331793308258, + "learning_rate": 7.557723263718596e-06, + "loss": 0.4649, + "step": 276 + }, + { + "epoch": 1.1922525107604018, + "grad_norm": 0.38895294070243835, + "learning_rate": 7.536130166879561e-06, + "loss": 0.4328, + "step": 277 + }, + { + "epoch": 1.1965566714490674, + "grad_norm": 0.35401999950408936, + "learning_rate": 7.514473196394467e-06, + "loss": 0.4379, + "step": 278 + }, + { + "epoch": 1.2008608321377332, + "grad_norm": 0.4220169484615326, + "learning_rate": 7.492752897704432e-06, + "loss": 0.4216, + "step": 279 + }, + { + "epoch": 1.2051649928263988, + "grad_norm": 0.39525651931762695, + "learning_rate": 7.470969817845518e-06, + "loss": 0.4707, + "step": 280 + }, + { + "epoch": 1.2094691535150646, + "grad_norm": 0.4009878635406494, + "learning_rate": 7.4491245054349716e-06, + "loss": 0.4229, + "step": 281 + }, + { + "epoch": 1.2137733142037304, + "grad_norm": 0.4798336923122406, + "learning_rate": 7.427217510657383e-06, + "loss": 0.4593, + "step": 282 + }, + { + "epoch": 1.218077474892396, + "grad_norm": 0.3721896708011627, + "learning_rate": 7.405249385250854e-06, + "loss": 0.4281, + "step": 283 + }, + { + "epoch": 1.2223816355810617, + "grad_norm": 0.40978696942329407, + "learning_rate": 7.383220682493081e-06, + "loss": 0.4507, + "step": 284 + }, + { + "epoch": 1.2266857962697273, + "grad_norm": 0.4719083905220032, + "learning_rate": 7.361131957187435e-06, + "loss": 0.4228, + "step": 285 + }, + { + "epoch": 1.230989956958393, + "grad_norm": 0.38987070322036743, + "learning_rate": 7.338983765648985e-06, + "loss": 0.4602, + "step": 286 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.4016108512878418, + "learning_rate": 7.31677666569048e-06, + "loss": 0.4095, + "step": 287 + }, + { + "epoch": 1.2395982783357244, + "grad_norm": 0.4143194854259491, + "learning_rate": 7.294511216608308e-06, + "loss": 0.4236, + "step": 288 + }, + { + "epoch": 1.2439024390243902, + "grad_norm": 0.4156009554862976, + "learning_rate": 7.272187979168408e-06, + "loss": 0.4422, + "step": 289 + }, + { + "epoch": 1.248206599713056, + "grad_norm": 0.4072224199771881, + "learning_rate": 7.249807515592149e-06, + "loss": 0.389, + "step": 290 + }, + { + "epoch": 1.2525107604017216, + "grad_norm": 0.37442606687545776, + "learning_rate": 7.227370389542161e-06, + "loss": 0.4358, + "step": 291 + }, + { + "epoch": 1.2568149210903874, + "grad_norm": 0.40852388739585876, + "learning_rate": 7.2048771661081515e-06, + "loss": 0.4325, + "step": 292 + }, + { + "epoch": 1.2611190817790532, + "grad_norm": 0.39829540252685547, + "learning_rate": 7.182328411792664e-06, + "loss": 0.4262, + "step": 293 + }, + { + "epoch": 1.2654232424677188, + "grad_norm": 0.3537748456001282, + "learning_rate": 7.159724694496815e-06, + "loss": 0.4488, + "step": 294 + }, + { + "epoch": 1.2697274031563845, + "grad_norm": 0.3861173689365387, + "learning_rate": 7.137066583505987e-06, + "loss": 0.4491, + "step": 295 + }, + { + "epoch": 1.2740315638450501, + "grad_norm": 0.38991910219192505, + "learning_rate": 7.114354649475499e-06, + "loss": 0.4399, + "step": 296 + }, + { + "epoch": 1.278335724533716, + "grad_norm": 0.3736940622329712, + "learning_rate": 7.091589464416225e-06, + "loss": 0.4123, + "step": 297 + }, + { + "epoch": 1.2826398852223817, + "grad_norm": 0.35150569677352905, + "learning_rate": 7.068771601680191e-06, + "loss": 0.3991, + "step": 298 + }, + { + "epoch": 1.2869440459110473, + "grad_norm": 0.38645830750465393, + "learning_rate": 7.04590163594614e-06, + "loss": 0.45, + "step": 299 + }, + { + "epoch": 1.291248206599713, + "grad_norm": 0.45290979743003845, + "learning_rate": 7.022980143205046e-06, + "loss": 0.4321, + "step": 300 + }, + { + "epoch": 1.2955523672883786, + "grad_norm": 0.3679206669330597, + "learning_rate": 7.000007700745622e-06, + "loss": 0.4263, + "step": 301 + }, + { + "epoch": 1.2998565279770444, + "grad_norm": 0.3815595507621765, + "learning_rate": 6.976984887139775e-06, + "loss": 0.453, + "step": 302 + }, + { + "epoch": 1.3041606886657102, + "grad_norm": 0.4131976068019867, + "learning_rate": 6.9539122822280246e-06, + "loss": 0.4471, + "step": 303 + }, + { + "epoch": 1.308464849354376, + "grad_norm": 0.3435700237751007, + "learning_rate": 6.930790467104916e-06, + "loss": 0.4367, + "step": 304 + }, + { + "epoch": 1.3127690100430416, + "grad_norm": 0.36125117540359497, + "learning_rate": 6.907620024104377e-06, + "loss": 0.38, + "step": 305 + }, + { + "epoch": 1.3170731707317074, + "grad_norm": 0.35791122913360596, + "learning_rate": 6.884401536785045e-06, + "loss": 0.4911, + "step": 306 + }, + { + "epoch": 1.321377331420373, + "grad_norm": 0.33072197437286377, + "learning_rate": 6.861135589915583e-06, + "loss": 0.429, + "step": 307 + }, + { + "epoch": 1.3256814921090387, + "grad_norm": 0.3363823890686035, + "learning_rate": 6.837822769459942e-06, + "loss": 0.4131, + "step": 308 + }, + { + "epoch": 1.3299856527977045, + "grad_norm": 0.3425028324127197, + "learning_rate": 6.814463662562609e-06, + "loss": 0.4293, + "step": 309 + }, + { + "epoch": 1.33428981348637, + "grad_norm": 0.3752707839012146, + "learning_rate": 6.791058857533814e-06, + "loss": 0.4421, + "step": 310 + }, + { + "epoch": 1.338593974175036, + "grad_norm": 0.3743986189365387, + "learning_rate": 6.767608943834721e-06, + "loss": 0.4438, + "step": 311 + }, + { + "epoch": 1.3428981348637015, + "grad_norm": 0.36510851979255676, + "learning_rate": 6.744114512062571e-06, + "loss": 0.4087, + "step": 312 + }, + { + "epoch": 1.3472022955523673, + "grad_norm": 0.4204147160053253, + "learning_rate": 6.720576153935818e-06, + "loss": 0.4873, + "step": 313 + }, + { + "epoch": 1.351506456241033, + "grad_norm": 0.3257698118686676, + "learning_rate": 6.696994462279223e-06, + "loss": 0.4085, + "step": 314 + }, + { + "epoch": 1.3558106169296988, + "grad_norm": 0.40403416752815247, + "learning_rate": 6.673370031008919e-06, + "loss": 0.4829, + "step": 315 + }, + { + "epoch": 1.3601147776183644, + "grad_norm": 0.3832758665084839, + "learning_rate": 6.6497034551174585e-06, + "loss": 0.421, + "step": 316 + }, + { + "epoch": 1.3644189383070302, + "grad_norm": 0.35449251532554626, + "learning_rate": 6.625995330658828e-06, + "loss": 0.4128, + "step": 317 + }, + { + "epoch": 1.3687230989956958, + "grad_norm": 0.37400197982788086, + "learning_rate": 6.602246254733431e-06, + "loss": 0.4257, + "step": 318 + }, + { + "epoch": 1.3730272596843616, + "grad_norm": 0.35631948709487915, + "learning_rate": 6.578456825473055e-06, + "loss": 0.4299, + "step": 319 + }, + { + "epoch": 1.3773314203730274, + "grad_norm": 0.33373865485191345, + "learning_rate": 6.554627642025807e-06, + "loss": 0.4412, + "step": 320 + }, + { + "epoch": 1.381635581061693, + "grad_norm": 0.38946038484573364, + "learning_rate": 6.53075930454102e-06, + "loss": 0.3981, + "step": 321 + }, + { + "epoch": 1.3859397417503587, + "grad_norm": 0.3750453591346741, + "learning_rate": 6.506852414154138e-06, + "loss": 0.461, + "step": 322 + }, + { + "epoch": 1.3902439024390243, + "grad_norm": 0.3329356014728546, + "learning_rate": 6.482907572971584e-06, + "loss": 0.3842, + "step": 323 + }, + { + "epoch": 1.39454806312769, + "grad_norm": 0.352209210395813, + "learning_rate": 6.4589253840555856e-06, + "loss": 0.4419, + "step": 324 + }, + { + "epoch": 1.3988522238163559, + "grad_norm": 0.3593578636646271, + "learning_rate": 6.434906451408991e-06, + "loss": 0.403, + "step": 325 + }, + { + "epoch": 1.4031563845050214, + "grad_norm": 0.4038272500038147, + "learning_rate": 6.41085137996006e-06, + "loss": 0.472, + "step": 326 + }, + { + "epoch": 1.4074605451936872, + "grad_norm": 0.30662405490875244, + "learning_rate": 6.386760775547221e-06, + "loss": 0.406, + "step": 327 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.3446487486362457, + "learning_rate": 6.362635244903818e-06, + "loss": 0.4379, + "step": 328 + }, + { + "epoch": 1.4160688665710186, + "grad_norm": 0.3683339059352875, + "learning_rate": 6.338475395642834e-06, + "loss": 0.4427, + "step": 329 + }, + { + "epoch": 1.4203730272596844, + "grad_norm": 0.3524536192417145, + "learning_rate": 6.314281836241573e-06, + "loss": 0.4482, + "step": 330 + }, + { + "epoch": 1.4246771879483502, + "grad_norm": 0.381590336561203, + "learning_rate": 6.2900551760263564e-06, + "loss": 0.4505, + "step": 331 + }, + { + "epoch": 1.4289813486370158, + "grad_norm": 0.3800303041934967, + "learning_rate": 6.265796025157154e-06, + "loss": 0.4342, + "step": 332 + }, + { + "epoch": 1.4332855093256816, + "grad_norm": 0.357076495885849, + "learning_rate": 6.241504994612237e-06, + "loss": 0.4149, + "step": 333 + }, + { + "epoch": 1.4375896700143471, + "grad_norm": 0.432861328125, + "learning_rate": 6.217182696172776e-06, + "loss": 0.4795, + "step": 334 + }, + { + "epoch": 1.441893830703013, + "grad_norm": 0.3767949342727661, + "learning_rate": 6.192829742407442e-06, + "loss": 0.3823, + "step": 335 + }, + { + "epoch": 1.4461979913916787, + "grad_norm": 0.3795372247695923, + "learning_rate": 6.168446746656973e-06, + "loss": 0.4557, + "step": 336 + }, + { + "epoch": 1.4505021520803443, + "grad_norm": 0.4189460277557373, + "learning_rate": 6.144034323018728e-06, + "loss": 0.4635, + "step": 337 + }, + { + "epoch": 1.45480631276901, + "grad_norm": 0.3688943684101105, + "learning_rate": 6.119593086331225e-06, + "loss": 0.372, + "step": 338 + }, + { + "epoch": 1.4591104734576756, + "grad_norm": 0.3465519845485687, + "learning_rate": 6.095123652158648e-06, + "loss": 0.4509, + "step": 339 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.42170581221580505, + "learning_rate": 6.070626636775349e-06, + "loss": 0.4685, + "step": 340 + }, + { + "epoch": 1.4677187948350072, + "grad_norm": 0.3757818639278412, + "learning_rate": 6.046102657150328e-06, + "loss": 0.455, + "step": 341 + }, + { + "epoch": 1.472022955523673, + "grad_norm": 0.3707659840583801, + "learning_rate": 6.021552330931693e-06, + "loss": 0.4176, + "step": 342 + }, + { + "epoch": 1.4763271162123386, + "grad_norm": 0.3855435848236084, + "learning_rate": 5.996976276431097e-06, + "loss": 0.4216, + "step": 343 + }, + { + "epoch": 1.4806312769010044, + "grad_norm": 0.37388697266578674, + "learning_rate": 5.972375112608182e-06, + "loss": 0.4122, + "step": 344 + }, + { + "epoch": 1.48493543758967, + "grad_norm": 0.38593074679374695, + "learning_rate": 5.947749459054972e-06, + "loss": 0.4459, + "step": 345 + }, + { + "epoch": 1.4892395982783357, + "grad_norm": 0.4058336615562439, + "learning_rate": 5.923099935980278e-06, + "loss": 0.4616, + "step": 346 + }, + { + "epoch": 1.4935437589670015, + "grad_norm": 0.3796309232711792, + "learning_rate": 5.898427164194084e-06, + "loss": 0.4336, + "step": 347 + }, + { + "epoch": 1.497847919655667, + "grad_norm": 0.3734838366508484, + "learning_rate": 5.8737317650918905e-06, + "loss": 0.4126, + "step": 348 + }, + { + "epoch": 1.502152080344333, + "grad_norm": 0.3841530680656433, + "learning_rate": 5.849014360639087e-06, + "loss": 0.4702, + "step": 349 + }, + { + "epoch": 1.5064562410329985, + "grad_norm": 0.3562658727169037, + "learning_rate": 5.824275573355278e-06, + "loss": 0.4034, + "step": 350 + }, + { + "epoch": 1.5107604017216643, + "grad_norm": 0.3666515350341797, + "learning_rate": 5.799516026298601e-06, + "loss": 0.4229, + "step": 351 + }, + { + "epoch": 1.51506456241033, + "grad_norm": 0.4212346374988556, + "learning_rate": 5.7747363430500395e-06, + "loss": 0.4376, + "step": 352 + }, + { + "epoch": 1.5193687230989958, + "grad_norm": 0.3742626905441284, + "learning_rate": 5.74993714769772e-06, + "loss": 0.4227, + "step": 353 + }, + { + "epoch": 1.5236728837876614, + "grad_norm": 0.4419170618057251, + "learning_rate": 5.725119064821185e-06, + "loss": 0.4358, + "step": 354 + }, + { + "epoch": 1.527977044476327, + "grad_norm": 0.35307490825653076, + "learning_rate": 5.700282719475672e-06, + "loss": 0.4292, + "step": 355 + }, + { + "epoch": 1.5322812051649928, + "grad_norm": 0.38687315583229065, + "learning_rate": 5.675428737176367e-06, + "loss": 0.398, + "step": 356 + }, + { + "epoch": 1.5365853658536586, + "grad_norm": 0.4023524224758148, + "learning_rate": 5.65055774388265e-06, + "loss": 0.4389, + "step": 357 + }, + { + "epoch": 1.5408895265423244, + "grad_norm": 0.3677155375480652, + "learning_rate": 5.625670365982332e-06, + "loss": 0.3917, + "step": 358 + }, + { + "epoch": 1.54519368723099, + "grad_norm": 0.3998677432537079, + "learning_rate": 5.600767230275878e-06, + "loss": 0.4452, + "step": 359 + }, + { + "epoch": 1.5494978479196555, + "grad_norm": 0.358852356672287, + "learning_rate": 5.575848963960621e-06, + "loss": 0.4241, + "step": 360 + }, + { + "epoch": 1.5538020086083213, + "grad_norm": 0.32302260398864746, + "learning_rate": 5.5509161946149635e-06, + "loss": 0.4185, + "step": 361 + }, + { + "epoch": 1.558106169296987, + "grad_norm": 0.4188165068626404, + "learning_rate": 5.525969550182577e-06, + "loss": 0.4654, + "step": 362 + }, + { + "epoch": 1.5624103299856529, + "grad_norm": 0.39718538522720337, + "learning_rate": 5.501009658956583e-06, + "loss": 0.4181, + "step": 363 + }, + { + "epoch": 1.5667144906743187, + "grad_norm": 0.3609788715839386, + "learning_rate": 5.4760371495637256e-06, + "loss": 0.4091, + "step": 364 + }, + { + "epoch": 1.5710186513629842, + "grad_norm": 0.35094860196113586, + "learning_rate": 5.451052650948549e-06, + "loss": 0.4354, + "step": 365 + }, + { + "epoch": 1.5753228120516498, + "grad_norm": 0.3525417149066925, + "learning_rate": 5.426056792357552e-06, + "loss": 0.4212, + "step": 366 + }, + { + "epoch": 1.5796269727403156, + "grad_norm": 0.36507681012153625, + "learning_rate": 5.40105020332333e-06, + "loss": 0.412, + "step": 367 + }, + { + "epoch": 1.5839311334289814, + "grad_norm": 0.3849915564060211, + "learning_rate": 5.376033513648743e-06, + "loss": 0.4572, + "step": 368 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.3586712181568146, + "learning_rate": 5.3510073533910344e-06, + "loss": 0.4308, + "step": 369 + }, + { + "epoch": 1.5925394548063128, + "grad_norm": 0.3649207651615143, + "learning_rate": 5.325972352845965e-06, + "loss": 0.4256, + "step": 370 + }, + { + "epoch": 1.5968436154949783, + "grad_norm": 0.3403324484825134, + "learning_rate": 5.30092914253195e-06, + "loss": 0.4058, + "step": 371 + }, + { + "epoch": 1.6011477761836441, + "grad_norm": 0.3676261603832245, + "learning_rate": 5.2758783531741655e-06, + "loss": 0.4122, + "step": 372 + }, + { + "epoch": 1.60545193687231, + "grad_norm": 0.34809696674346924, + "learning_rate": 5.25082061568867e-06, + "loss": 0.4101, + "step": 373 + }, + { + "epoch": 1.6097560975609757, + "grad_norm": 0.3801212012767792, + "learning_rate": 5.225756561166521e-06, + "loss": 0.3959, + "step": 374 + }, + { + "epoch": 1.6140602582496413, + "grad_norm": 0.39335742592811584, + "learning_rate": 5.200686820857862e-06, + "loss": 0.4715, + "step": 375 + }, + { + "epoch": 1.618364418938307, + "grad_norm": 0.33622682094573975, + "learning_rate": 5.175612026156045e-06, + "loss": 0.4125, + "step": 376 + }, + { + "epoch": 1.6226685796269726, + "grad_norm": 0.39736634492874146, + "learning_rate": 5.150532808581718e-06, + "loss": 0.4685, + "step": 377 + }, + { + "epoch": 1.6269727403156384, + "grad_norm": 0.32356390357017517, + "learning_rate": 5.125449799766916e-06, + "loss": 0.3995, + "step": 378 + }, + { + "epoch": 1.6312769010043042, + "grad_norm": 0.3594512939453125, + "learning_rate": 5.100363631439162e-06, + "loss": 0.4497, + "step": 379 + }, + { + "epoch": 1.63558106169297, + "grad_norm": 0.4013254642486572, + "learning_rate": 5.075274935405554e-06, + "loss": 0.4195, + "step": 380 + }, + { + "epoch": 1.6398852223816356, + "grad_norm": 0.3409561514854431, + "learning_rate": 5.0501843435368495e-06, + "loss": 0.4051, + "step": 381 + }, + { + "epoch": 1.6441893830703012, + "grad_norm": 0.3325178325176239, + "learning_rate": 5.025092487751552e-06, + "loss": 0.4421, + "step": 382 + }, + { + "epoch": 1.648493543758967, + "grad_norm": 0.3224661946296692, + "learning_rate": 5e-06, + "loss": 0.4121, + "step": 383 + }, + { + "epoch": 1.6527977044476327, + "grad_norm": 0.3134234845638275, + "learning_rate": 4.974907512248451e-06, + "loss": 0.4057, + "step": 384 + }, + { + "epoch": 1.6571018651362985, + "grad_norm": 0.351633220911026, + "learning_rate": 4.949815656463151e-06, + "loss": 0.4283, + "step": 385 + }, + { + "epoch": 1.661406025824964, + "grad_norm": 0.3710573613643646, + "learning_rate": 4.924725064594448e-06, + "loss": 0.4705, + "step": 386 + }, + { + "epoch": 1.6657101865136297, + "grad_norm": 0.33121249079704285, + "learning_rate": 4.89963636856084e-06, + "loss": 0.409, + "step": 387 + }, + { + "epoch": 1.6700143472022955, + "grad_norm": 0.3725872039794922, + "learning_rate": 4.874550200233085e-06, + "loss": 0.4838, + "step": 388 + }, + { + "epoch": 1.6743185078909613, + "grad_norm": 0.33540400862693787, + "learning_rate": 4.8494671914182835e-06, + "loss": 0.3972, + "step": 389 + }, + { + "epoch": 1.678622668579627, + "grad_norm": 0.3285572826862335, + "learning_rate": 4.824387973843957e-06, + "loss": 0.4366, + "step": 390 + }, + { + "epoch": 1.6829268292682928, + "grad_norm": 0.338501900434494, + "learning_rate": 4.7993131791421385e-06, + "loss": 0.4344, + "step": 391 + }, + { + "epoch": 1.6872309899569584, + "grad_norm": 0.3199228048324585, + "learning_rate": 4.7742434388334815e-06, + "loss": 0.4171, + "step": 392 + }, + { + "epoch": 1.691535150645624, + "grad_norm": 0.3764464855194092, + "learning_rate": 4.749179384311331e-06, + "loss": 0.3928, + "step": 393 + }, + { + "epoch": 1.6958393113342898, + "grad_norm": 0.3166040778160095, + "learning_rate": 4.724121646825838e-06, + "loss": 0.4239, + "step": 394 + }, + { + "epoch": 1.7001434720229556, + "grad_norm": 0.40935027599334717, + "learning_rate": 4.699070857468052e-06, + "loss": 0.4921, + "step": 395 + }, + { + "epoch": 1.7044476327116214, + "grad_norm": 0.3424462080001831, + "learning_rate": 4.674027647154037e-06, + "loss": 0.4045, + "step": 396 + }, + { + "epoch": 1.708751793400287, + "grad_norm": 0.3410986065864563, + "learning_rate": 4.648992646608968e-06, + "loss": 0.4157, + "step": 397 + }, + { + "epoch": 1.7130559540889525, + "grad_norm": 0.3328428864479065, + "learning_rate": 4.623966486351257e-06, + "loss": 0.4137, + "step": 398 + }, + { + "epoch": 1.7173601147776183, + "grad_norm": 0.3369297385215759, + "learning_rate": 4.598949796676672e-06, + "loss": 0.4417, + "step": 399 + }, + { + "epoch": 1.721664275466284, + "grad_norm": 0.3781452476978302, + "learning_rate": 4.573943207642452e-06, + "loss": 0.4371, + "step": 400 + }, + { + "epoch": 1.7259684361549499, + "grad_norm": 0.31260576844215393, + "learning_rate": 4.548947349051452e-06, + "loss": 0.4133, + "step": 401 + }, + { + "epoch": 1.7302725968436155, + "grad_norm": 0.3287489712238312, + "learning_rate": 4.523962850436276e-06, + "loss": 0.3952, + "step": 402 + }, + { + "epoch": 1.7345767575322812, + "grad_norm": 0.357338011264801, + "learning_rate": 4.498990341043419e-06, + "loss": 0.4311, + "step": 403 + }, + { + "epoch": 1.7388809182209468, + "grad_norm": 0.33581385016441345, + "learning_rate": 4.474030449817423e-06, + "loss": 0.4234, + "step": 404 + }, + { + "epoch": 1.7431850789096126, + "grad_norm": 0.32600918412208557, + "learning_rate": 4.449083805385037e-06, + "loss": 0.4112, + "step": 405 + }, + { + "epoch": 1.7474892395982784, + "grad_norm": 0.33547377586364746, + "learning_rate": 4.424151036039381e-06, + "loss": 0.4262, + "step": 406 + }, + { + "epoch": 1.7517934002869442, + "grad_norm": 0.3635512888431549, + "learning_rate": 4.3992327697241225e-06, + "loss": 0.4538, + "step": 407 + }, + { + "epoch": 1.7560975609756098, + "grad_norm": 0.34227895736694336, + "learning_rate": 4.3743296340176694e-06, + "loss": 0.4468, + "step": 408 + }, + { + "epoch": 1.7604017216642753, + "grad_norm": 0.280943363904953, + "learning_rate": 4.3494422561173515e-06, + "loss": 0.3928, + "step": 409 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.36496126651763916, + "learning_rate": 4.3245712628236356e-06, + "loss": 0.4792, + "step": 410 + }, + { + "epoch": 1.769010043041607, + "grad_norm": 0.34355199337005615, + "learning_rate": 4.299717280524329e-06, + "loss": 0.3907, + "step": 411 + }, + { + "epoch": 1.7733142037302727, + "grad_norm": 0.3443046510219574, + "learning_rate": 4.274880935178817e-06, + "loss": 0.433, + "step": 412 + }, + { + "epoch": 1.7776183644189383, + "grad_norm": 0.3240680694580078, + "learning_rate": 4.250062852302283e-06, + "loss": 0.4214, + "step": 413 + }, + { + "epoch": 1.781922525107604, + "grad_norm": 0.3351806700229645, + "learning_rate": 4.225263656949961e-06, + "loss": 0.4287, + "step": 414 + }, + { + "epoch": 1.7862266857962696, + "grad_norm": 0.30206426978111267, + "learning_rate": 4.200483973701401e-06, + "loss": 0.3835, + "step": 415 + }, + { + "epoch": 1.7905308464849354, + "grad_norm": 0.3471432328224182, + "learning_rate": 4.175724426644724e-06, + "loss": 0.4838, + "step": 416 + }, + { + "epoch": 1.7948350071736012, + "grad_norm": 0.30823495984077454, + "learning_rate": 4.150985639360914e-06, + "loss": 0.3922, + "step": 417 + }, + { + "epoch": 1.799139167862267, + "grad_norm": 0.3459091782569885, + "learning_rate": 4.12626823490811e-06, + "loss": 0.444, + "step": 418 + }, + { + "epoch": 1.8034433285509326, + "grad_norm": 0.35498541593551636, + "learning_rate": 4.1015728358059185e-06, + "loss": 0.4414, + "step": 419 + }, + { + "epoch": 1.8077474892395982, + "grad_norm": 0.32903069257736206, + "learning_rate": 4.076900064019721e-06, + "loss": 0.4428, + "step": 420 + }, + { + "epoch": 1.812051649928264, + "grad_norm": 0.3732050955295563, + "learning_rate": 4.052250540945029e-06, + "loss": 0.4471, + "step": 421 + }, + { + "epoch": 1.8163558106169297, + "grad_norm": 0.30928969383239746, + "learning_rate": 4.02762488739182e-06, + "loss": 0.3788, + "step": 422 + }, + { + "epoch": 1.8206599713055955, + "grad_norm": 0.3515232503414154, + "learning_rate": 4.003023723568903e-06, + "loss": 0.4587, + "step": 423 + }, + { + "epoch": 1.824964131994261, + "grad_norm": 0.32477593421936035, + "learning_rate": 3.978447669068309e-06, + "loss": 0.4536, + "step": 424 + }, + { + "epoch": 1.8292682926829267, + "grad_norm": 0.35645440220832825, + "learning_rate": 3.953897342849673e-06, + "loss": 0.3964, + "step": 425 + }, + { + "epoch": 1.8335724533715925, + "grad_norm": 0.3410753309726715, + "learning_rate": 3.929373363224654e-06, + "loss": 0.4749, + "step": 426 + }, + { + "epoch": 1.8378766140602583, + "grad_norm": 0.3486350178718567, + "learning_rate": 3.904876347841354e-06, + "loss": 0.4494, + "step": 427 + }, + { + "epoch": 1.842180774748924, + "grad_norm": 0.3229897916316986, + "learning_rate": 3.8804069136687775e-06, + "loss": 0.4181, + "step": 428 + }, + { + "epoch": 1.8464849354375896, + "grad_norm": 0.3268226087093353, + "learning_rate": 3.8559656769812746e-06, + "loss": 0.4477, + "step": 429 + }, + { + "epoch": 1.8507890961262554, + "grad_norm": 0.31242066621780396, + "learning_rate": 3.8315532533430285e-06, + "loss": 0.4122, + "step": 430 + }, + { + "epoch": 1.855093256814921, + "grad_norm": 0.3421877920627594, + "learning_rate": 3.8071702575925594e-06, + "loss": 0.4484, + "step": 431 + }, + { + "epoch": 1.8593974175035868, + "grad_norm": 0.31596559286117554, + "learning_rate": 3.7828173038272266e-06, + "loss": 0.3822, + "step": 432 + }, + { + "epoch": 1.8637015781922526, + "grad_norm": 0.32587170600891113, + "learning_rate": 3.7584950053877646e-06, + "loss": 0.4536, + "step": 433 + }, + { + "epoch": 1.8680057388809184, + "grad_norm": 0.3373381197452545, + "learning_rate": 3.7342039748428473e-06, + "loss": 0.3904, + "step": 434 + }, + { + "epoch": 1.872309899569584, + "grad_norm": 0.33278679847717285, + "learning_rate": 3.709944823973647e-06, + "loss": 0.3923, + "step": 435 + }, + { + "epoch": 1.8766140602582495, + "grad_norm": 0.3359312415122986, + "learning_rate": 3.685718163758427e-06, + "loss": 0.464, + "step": 436 + }, + { + "epoch": 1.8809182209469153, + "grad_norm": 0.3137849271297455, + "learning_rate": 3.6615246043571674e-06, + "loss": 0.3741, + "step": 437 + }, + { + "epoch": 1.885222381635581, + "grad_norm": 0.3578011095523834, + "learning_rate": 3.6373647550961834e-06, + "loss": 0.4495, + "step": 438 + }, + { + "epoch": 1.8895265423242469, + "grad_norm": 0.39701202511787415, + "learning_rate": 3.61323922445278e-06, + "loss": 0.4643, + "step": 439 + }, + { + "epoch": 1.8938307030129125, + "grad_norm": 0.3316022455692291, + "learning_rate": 3.5891486200399413e-06, + "loss": 0.3997, + "step": 440 + }, + { + "epoch": 1.8981348637015782, + "grad_norm": 0.3504200279712677, + "learning_rate": 3.5650935485910103e-06, + "loss": 0.4349, + "step": 441 + }, + { + "epoch": 1.9024390243902438, + "grad_norm": 0.33614853024482727, + "learning_rate": 3.5410746159444165e-06, + "loss": 0.4419, + "step": 442 + }, + { + "epoch": 1.9067431850789096, + "grad_norm": 0.3440527319908142, + "learning_rate": 3.5170924270284166e-06, + "loss": 0.4213, + "step": 443 + }, + { + "epoch": 1.9110473457675754, + "grad_norm": 0.3421337902545929, + "learning_rate": 3.4931475858458634e-06, + "loss": 0.43, + "step": 444 + }, + { + "epoch": 1.9153515064562412, + "grad_norm": 0.35556983947753906, + "learning_rate": 3.469240695458983e-06, + "loss": 0.3998, + "step": 445 + }, + { + "epoch": 1.9196556671449068, + "grad_norm": 0.35267797112464905, + "learning_rate": 3.445372357974194e-06, + "loss": 0.4149, + "step": 446 + }, + { + "epoch": 1.9239598278335723, + "grad_norm": 0.331310898065567, + "learning_rate": 3.4215431745269463e-06, + "loss": 0.4086, + "step": 447 + }, + { + "epoch": 1.9282639885222381, + "grad_norm": 0.3272462785243988, + "learning_rate": 3.397753745266571e-06, + "loss": 0.4314, + "step": 448 + }, + { + "epoch": 1.932568149210904, + "grad_norm": 0.31959936022758484, + "learning_rate": 3.374004669341173e-06, + "loss": 0.4381, + "step": 449 + }, + { + "epoch": 1.9368723098995697, + "grad_norm": 0.340181827545166, + "learning_rate": 3.350296544882543e-06, + "loss": 0.4569, + "step": 450 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.3325444459915161, + "learning_rate": 3.326629968991083e-06, + "loss": 0.4094, + "step": 451 + }, + { + "epoch": 1.9454806312769009, + "grad_norm": 0.32819491624832153, + "learning_rate": 3.303005537720778e-06, + "loss": 0.4435, + "step": 452 + }, + { + "epoch": 1.9497847919655666, + "grad_norm": 0.3121723234653473, + "learning_rate": 3.2794238460641837e-06, + "loss": 0.4096, + "step": 453 + }, + { + "epoch": 1.9540889526542324, + "grad_norm": 0.3106290400028229, + "learning_rate": 3.255885487937431e-06, + "loss": 0.3645, + "step": 454 + }, + { + "epoch": 1.9583931133428982, + "grad_norm": 0.3480536937713623, + "learning_rate": 3.2323910561652798e-06, + "loss": 0.4206, + "step": 455 + }, + { + "epoch": 1.9626972740315638, + "grad_norm": 0.3264220654964447, + "learning_rate": 3.2089411424661864e-06, + "loss": 0.4522, + "step": 456 + }, + { + "epoch": 1.9670014347202296, + "grad_norm": 0.29471534490585327, + "learning_rate": 3.185536337437393e-06, + "loss": 0.4188, + "step": 457 + }, + { + "epoch": 1.9713055954088952, + "grad_norm": 0.31906241178512573, + "learning_rate": 3.1621772305400603e-06, + "loss": 0.4532, + "step": 458 + }, + { + "epoch": 1.975609756097561, + "grad_norm": 0.30077633261680603, + "learning_rate": 3.138864410084419e-06, + "loss": 0.4052, + "step": 459 + }, + { + "epoch": 1.9799139167862267, + "grad_norm": 0.30726587772369385, + "learning_rate": 3.1155984632149565e-06, + "loss": 0.3927, + "step": 460 + }, + { + "epoch": 1.9842180774748925, + "grad_norm": 0.3261038661003113, + "learning_rate": 3.0923799758956265e-06, + "loss": 0.4457, + "step": 461 + }, + { + "epoch": 1.988522238163558, + "grad_norm": 0.2910383939743042, + "learning_rate": 3.0692095328950843e-06, + "loss": 0.404, + "step": 462 + }, + { + "epoch": 1.9928263988522237, + "grad_norm": 0.2886095345020294, + "learning_rate": 3.0460877177719763e-06, + "loss": 0.4016, + "step": 463 + }, + { + "epoch": 1.9971305595408895, + "grad_norm": 0.31026142835617065, + "learning_rate": 3.023015112860228e-06, + "loss": 0.4471, + "step": 464 + }, + { + "epoch": 2.0014347202295553, + "grad_norm": 0.670920193195343, + "learning_rate": 2.9999922992543777e-06, + "loss": 0.6493, + "step": 465 + }, + { + "epoch": 2.005738880918221, + "grad_norm": 0.3565112054347992, + "learning_rate": 2.977019856794955e-06, + "loss": 0.3958, + "step": 466 + }, + { + "epoch": 2.010043041606887, + "grad_norm": 0.38834989070892334, + "learning_rate": 2.9540983640538635e-06, + "loss": 0.4022, + "step": 467 + }, + { + "epoch": 2.014347202295552, + "grad_norm": 0.35440927743911743, + "learning_rate": 2.93122839831981e-06, + "loss": 0.4266, + "step": 468 + }, + { + "epoch": 2.018651362984218, + "grad_norm": 0.3439231216907501, + "learning_rate": 2.908410535583777e-06, + "loss": 0.4115, + "step": 469 + }, + { + "epoch": 2.022955523672884, + "grad_norm": 0.30981284379959106, + "learning_rate": 2.8856453505245018e-06, + "loss": 0.3427, + "step": 470 + }, + { + "epoch": 2.0272596843615496, + "grad_norm": 0.3863826096057892, + "learning_rate": 2.8629334164940127e-06, + "loss": 0.407, + "step": 471 + }, + { + "epoch": 2.0315638450502154, + "grad_norm": 0.39452433586120605, + "learning_rate": 2.840275305503186e-06, + "loss": 0.3971, + "step": 472 + }, + { + "epoch": 2.0358680057388807, + "grad_norm": 0.35829833149909973, + "learning_rate": 2.817671588207338e-06, + "loss": 0.4088, + "step": 473 + }, + { + "epoch": 2.0401721664275465, + "grad_norm": 0.31108009815216064, + "learning_rate": 2.7951228338918506e-06, + "loss": 0.3774, + "step": 474 + }, + { + "epoch": 2.0444763271162123, + "grad_norm": 0.4098561704158783, + "learning_rate": 2.77262961045784e-06, + "loss": 0.4484, + "step": 475 + }, + { + "epoch": 2.048780487804878, + "grad_norm": 0.3637750744819641, + "learning_rate": 2.7501924844078538e-06, + "loss": 0.3895, + "step": 476 + }, + { + "epoch": 2.053084648493544, + "grad_norm": 0.3582223653793335, + "learning_rate": 2.7278120208315927e-06, + "loss": 0.4144, + "step": 477 + }, + { + "epoch": 2.0573888091822097, + "grad_norm": 0.3238336443901062, + "learning_rate": 2.7054887833916933e-06, + "loss": 0.3799, + "step": 478 + }, + { + "epoch": 2.061692969870875, + "grad_norm": 0.32167771458625793, + "learning_rate": 2.6832233343095225e-06, + "loss": 0.3839, + "step": 479 + }, + { + "epoch": 2.065997130559541, + "grad_norm": 0.30850154161453247, + "learning_rate": 2.6610162343510183e-06, + "loss": 0.3808, + "step": 480 + }, + { + "epoch": 2.0703012912482066, + "grad_norm": 0.38707244396209717, + "learning_rate": 2.6388680428125657e-06, + "loss": 0.4356, + "step": 481 + }, + { + "epoch": 2.0746054519368724, + "grad_norm": 0.32622581720352173, + "learning_rate": 2.616779317506921e-06, + "loss": 0.3889, + "step": 482 + }, + { + "epoch": 2.078909612625538, + "grad_norm": 0.35619160532951355, + "learning_rate": 2.594750614749148e-06, + "loss": 0.4568, + "step": 483 + }, + { + "epoch": 2.0832137733142035, + "grad_norm": 0.3068256378173828, + "learning_rate": 2.572782489342617e-06, + "loss": 0.3612, + "step": 484 + }, + { + "epoch": 2.0875179340028693, + "grad_norm": 0.3251802623271942, + "learning_rate": 2.5508754945650305e-06, + "loss": 0.4148, + "step": 485 + }, + { + "epoch": 2.091822094691535, + "grad_norm": 0.28180989623069763, + "learning_rate": 2.5290301821544826e-06, + "loss": 0.3438, + "step": 486 + }, + { + "epoch": 2.096126255380201, + "grad_norm": 0.3368677794933319, + "learning_rate": 2.5072471022955703e-06, + "loss": 0.4339, + "step": 487 + }, + { + "epoch": 2.1004304160688667, + "grad_norm": 0.3352259397506714, + "learning_rate": 2.4855268036055346e-06, + "loss": 0.4137, + "step": 488 + }, + { + "epoch": 2.104734576757532, + "grad_norm": 0.3315313160419464, + "learning_rate": 2.4638698331204404e-06, + "loss": 0.4156, + "step": 489 + }, + { + "epoch": 2.109038737446198, + "grad_norm": 0.28988370299339294, + "learning_rate": 2.4422767362814045e-06, + "loss": 0.3509, + "step": 490 + }, + { + "epoch": 2.1133428981348636, + "grad_norm": 0.31385964155197144, + "learning_rate": 2.420748056920856e-06, + "loss": 0.3794, + "step": 491 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.3286735713481903, + "learning_rate": 2.3992843372488357e-06, + "loss": 0.4653, + "step": 492 + }, + { + "epoch": 2.1219512195121952, + "grad_norm": 0.29086706042289734, + "learning_rate": 2.3778861178393453e-06, + "loss": 0.3721, + "step": 493 + }, + { + "epoch": 2.126255380200861, + "grad_norm": 0.3042701482772827, + "learning_rate": 2.3565539376167295e-06, + "loss": 0.3946, + "step": 494 + }, + { + "epoch": 2.1305595408895264, + "grad_norm": 0.29335102438926697, + "learning_rate": 2.3352883338421085e-06, + "loss": 0.383, + "step": 495 + }, + { + "epoch": 2.134863701578192, + "grad_norm": 0.3304314911365509, + "learning_rate": 2.3140898420998425e-06, + "loss": 0.4103, + "step": 496 + }, + { + "epoch": 2.139167862266858, + "grad_norm": 0.31538569927215576, + "learning_rate": 2.2929589962840375e-06, + "loss": 0.4033, + "step": 497 + }, + { + "epoch": 2.1434720229555237, + "grad_norm": 0.3131985366344452, + "learning_rate": 2.271896328585114e-06, + "loss": 0.387, + "step": 498 + }, + { + "epoch": 2.1477761836441895, + "grad_norm": 0.3355647027492523, + "learning_rate": 2.2509023694763844e-06, + "loss": 0.3912, + "step": 499 + }, + { + "epoch": 2.152080344332855, + "grad_norm": 0.2865258753299713, + "learning_rate": 2.2299776477007073e-06, + "loss": 0.3855, + "step": 500 + }, + { + "epoch": 2.1563845050215207, + "grad_norm": 0.29301708936691284, + "learning_rate": 2.2091226902571673e-06, + "loss": 0.3829, + "step": 501 + }, + { + "epoch": 2.1606886657101865, + "grad_norm": 0.3320524990558624, + "learning_rate": 2.1883380223878004e-06, + "loss": 0.4205, + "step": 502 + }, + { + "epoch": 2.1649928263988523, + "grad_norm": 0.32117411494255066, + "learning_rate": 2.1676241675643627e-06, + "loss": 0.4102, + "step": 503 + }, + { + "epoch": 2.169296987087518, + "grad_norm": 0.3084646165370941, + "learning_rate": 2.1469816474751566e-06, + "loss": 0.408, + "step": 504 + }, + { + "epoch": 2.173601147776184, + "grad_norm": 0.30409905314445496, + "learning_rate": 2.1264109820118783e-06, + "loss": 0.3981, + "step": 505 + }, + { + "epoch": 2.177905308464849, + "grad_norm": 0.28728926181793213, + "learning_rate": 2.105912689256533e-06, + "loss": 0.3768, + "step": 506 + }, + { + "epoch": 2.182209469153515, + "grad_norm": 0.30246901512145996, + "learning_rate": 2.0854872854683877e-06, + "loss": 0.3924, + "step": 507 + }, + { + "epoch": 2.186513629842181, + "grad_norm": 0.31316882371902466, + "learning_rate": 2.0651352850709656e-06, + "loss": 0.4331, + "step": 508 + }, + { + "epoch": 2.1908177905308466, + "grad_norm": 0.31881269812583923, + "learning_rate": 2.0448572006390875e-06, + "loss": 0.3617, + "step": 509 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.30993711948394775, + "learning_rate": 2.0246535428859652e-06, + "loss": 0.4185, + "step": 510 + }, + { + "epoch": 2.1994261119081777, + "grad_norm": 0.32910722494125366, + "learning_rate": 2.0045248206503454e-06, + "loss": 0.4396, + "step": 511 + }, + { + "epoch": 2.2037302725968435, + "grad_norm": 0.31692197918891907, + "learning_rate": 1.984471540883679e-06, + "loss": 0.4397, + "step": 512 + }, + { + "epoch": 2.2080344332855093, + "grad_norm": 0.26367130875587463, + "learning_rate": 1.964494208637369e-06, + "loss": 0.3652, + "step": 513 + }, + { + "epoch": 2.212338593974175, + "grad_norm": 0.2917594611644745, + "learning_rate": 1.9445933270500444e-06, + "loss": 0.4279, + "step": 514 + }, + { + "epoch": 2.216642754662841, + "grad_norm": 0.34075671434402466, + "learning_rate": 1.9247693973348834e-06, + "loss": 0.4294, + "step": 515 + }, + { + "epoch": 2.2209469153515062, + "grad_norm": 0.2909022271633148, + "learning_rate": 1.905022918766995e-06, + "loss": 0.3783, + "step": 516 + }, + { + "epoch": 2.225251076040172, + "grad_norm": 0.3097451627254486, + "learning_rate": 1.8853543886708498e-06, + "loss": 0.4208, + "step": 517 + }, + { + "epoch": 2.229555236728838, + "grad_norm": 0.2984945774078369, + "learning_rate": 1.8657643024077431e-06, + "loss": 0.4076, + "step": 518 + }, + { + "epoch": 2.2338593974175036, + "grad_norm": 0.2853682041168213, + "learning_rate": 1.8462531533633238e-06, + "loss": 0.3759, + "step": 519 + }, + { + "epoch": 2.2381635581061694, + "grad_norm": 0.2992599606513977, + "learning_rate": 1.8268214329351797e-06, + "loss": 0.3984, + "step": 520 + }, + { + "epoch": 2.242467718794835, + "grad_norm": 0.28934013843536377, + "learning_rate": 1.8074696305204397e-06, + "loss": 0.399, + "step": 521 + }, + { + "epoch": 2.2467718794835005, + "grad_norm": 0.30769601464271545, + "learning_rate": 1.7881982335034625e-06, + "loss": 0.4008, + "step": 522 + }, + { + "epoch": 2.2510760401721663, + "grad_norm": 0.3330642879009247, + "learning_rate": 1.7690077272435636e-06, + "loss": 0.4239, + "step": 523 + }, + { + "epoch": 2.255380200860832, + "grad_norm": 0.28844162821769714, + "learning_rate": 1.7498985950627794e-06, + "loss": 0.4153, + "step": 524 + }, + { + "epoch": 2.259684361549498, + "grad_norm": 0.2728285789489746, + "learning_rate": 1.7308713182337044e-06, + "loss": 0.4052, + "step": 525 + }, + { + "epoch": 2.2639885222381637, + "grad_norm": 0.32398754358291626, + "learning_rate": 1.7119263759673677e-06, + "loss": 0.4153, + "step": 526 + }, + { + "epoch": 2.2682926829268295, + "grad_norm": 0.30444568395614624, + "learning_rate": 1.6930642454011647e-06, + "loss": 0.3952, + "step": 527 + }, + { + "epoch": 2.272596843615495, + "grad_norm": 0.2704574167728424, + "learning_rate": 1.6742854015868349e-06, + "loss": 0.3686, + "step": 528 + }, + { + "epoch": 2.2769010043041606, + "grad_norm": 0.30521517992019653, + "learning_rate": 1.655590317478501e-06, + "loss": 0.4278, + "step": 529 + }, + { + "epoch": 2.2812051649928264, + "grad_norm": 0.28752925992012024, + "learning_rate": 1.6369794639207626e-06, + "loss": 0.4023, + "step": 530 + }, + { + "epoch": 2.2855093256814922, + "grad_norm": 0.2987408936023712, + "learning_rate": 1.6184533096368277e-06, + "loss": 0.414, + "step": 531 + }, + { + "epoch": 2.289813486370158, + "grad_norm": 0.28139418363571167, + "learning_rate": 1.6000123212167158e-06, + "loss": 0.3826, + "step": 532 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 0.3225473165512085, + "learning_rate": 1.581656963105504e-06, + "loss": 0.4275, + "step": 533 + }, + { + "epoch": 2.298421807747489, + "grad_norm": 0.2836931049823761, + "learning_rate": 1.5633876975916261e-06, + "loss": 0.3899, + "step": 534 + }, + { + "epoch": 2.302725968436155, + "grad_norm": 0.31612229347229004, + "learning_rate": 1.5452049847952338e-06, + "loss": 0.3921, + "step": 535 + }, + { + "epoch": 2.3070301291248207, + "grad_norm": 0.2987075448036194, + "learning_rate": 1.5271092826566108e-06, + "loss": 0.4058, + "step": 536 + }, + { + "epoch": 2.3113342898134865, + "grad_norm": 0.27526581287384033, + "learning_rate": 1.5091010469246303e-06, + "loss": 0.3879, + "step": 537 + }, + { + "epoch": 2.315638450502152, + "grad_norm": 0.28727975487709045, + "learning_rate": 1.4911807311452874e-06, + "loss": 0.389, + "step": 538 + }, + { + "epoch": 2.3199426111908177, + "grad_norm": 0.294012188911438, + "learning_rate": 1.4733487866502698e-06, + "loss": 0.3775, + "step": 539 + }, + { + "epoch": 2.3242467718794835, + "grad_norm": 0.29785969853401184, + "learning_rate": 1.4556056625455922e-06, + "loss": 0.404, + "step": 540 + }, + { + "epoch": 2.3285509325681493, + "grad_norm": 0.322480171918869, + "learning_rate": 1.4379518057002834e-06, + "loss": 0.4039, + "step": 541 + }, + { + "epoch": 2.332855093256815, + "grad_norm": 0.29027432203292847, + "learning_rate": 1.4203876607351347e-06, + "loss": 0.4156, + "step": 542 + }, + { + "epoch": 2.3371592539454804, + "grad_norm": 0.30819234251976013, + "learning_rate": 1.4029136700115031e-06, + "loss": 0.4104, + "step": 543 + }, + { + "epoch": 2.341463414634146, + "grad_norm": 0.29122260212898254, + "learning_rate": 1.3855302736201686e-06, + "loss": 0.3695, + "step": 544 + }, + { + "epoch": 2.345767575322812, + "grad_norm": 0.2812859117984772, + "learning_rate": 1.3682379093702447e-06, + "loss": 0.3635, + "step": 545 + }, + { + "epoch": 2.350071736011478, + "grad_norm": 0.3218896687030792, + "learning_rate": 1.3510370127781635e-06, + "loss": 0.4021, + "step": 546 + }, + { + "epoch": 2.3543758967001436, + "grad_norm": 0.3179861009120941, + "learning_rate": 1.3339280170566959e-06, + "loss": 0.3962, + "step": 547 + }, + { + "epoch": 2.3586800573888094, + "grad_norm": 0.2889609932899475, + "learning_rate": 1.3169113531040462e-06, + "loss": 0.4131, + "step": 548 + }, + { + "epoch": 2.3629842180774747, + "grad_norm": 0.2856752574443817, + "learning_rate": 1.2999874494930004e-06, + "loss": 0.3709, + "step": 549 + }, + { + "epoch": 2.3672883787661405, + "grad_norm": 0.3044317364692688, + "learning_rate": 1.2831567324601325e-06, + "loss": 0.3988, + "step": 550 + }, + { + "epoch": 2.3715925394548063, + "grad_norm": 0.3010687828063965, + "learning_rate": 1.266419625895064e-06, + "loss": 0.4016, + "step": 551 + }, + { + "epoch": 2.375896700143472, + "grad_norm": 0.3065761923789978, + "learning_rate": 1.2497765513297976e-06, + "loss": 0.4227, + "step": 552 + }, + { + "epoch": 2.380200860832138, + "grad_norm": 0.28684499859809875, + "learning_rate": 1.2332279279280907e-06, + "loss": 0.377, + "step": 553 + }, + { + "epoch": 2.3845050215208037, + "grad_norm": 0.2905068099498749, + "learning_rate": 1.2167741724749026e-06, + "loss": 0.389, + "step": 554 + }, + { + "epoch": 2.388809182209469, + "grad_norm": 0.3115670680999756, + "learning_rate": 1.2004156993659028e-06, + "loss": 0.4171, + "step": 555 + }, + { + "epoch": 2.393113342898135, + "grad_norm": 0.29904720187187195, + "learning_rate": 1.1841529205970281e-06, + "loss": 0.3995, + "step": 556 + }, + { + "epoch": 2.3974175035868006, + "grad_norm": 0.2813907861709595, + "learning_rate": 1.1679862457541052e-06, + "loss": 0.3995, + "step": 557 + }, + { + "epoch": 2.4017216642754664, + "grad_norm": 0.3307220935821533, + "learning_rate": 1.1519160820025382e-06, + "loss": 0.412, + "step": 558 + }, + { + "epoch": 2.406025824964132, + "grad_norm": 0.30656975507736206, + "learning_rate": 1.1359428340770567e-06, + "loss": 0.4061, + "step": 559 + }, + { + "epoch": 2.4103299856527975, + "grad_norm": 0.30962732434272766, + "learning_rate": 1.1200669042715163e-06, + "loss": 0.4016, + "step": 560 + }, + { + "epoch": 2.4146341463414633, + "grad_norm": 0.29920291900634766, + "learning_rate": 1.104288692428766e-06, + "loss": 0.3848, + "step": 561 + }, + { + "epoch": 2.418938307030129, + "grad_norm": 0.32098498940467834, + "learning_rate": 1.0886085959305915e-06, + "loss": 0.4224, + "step": 562 + }, + { + "epoch": 2.423242467718795, + "grad_norm": 0.28412872552871704, + "learning_rate": 1.0730270096876876e-06, + "loss": 0.3759, + "step": 563 + }, + { + "epoch": 2.4275466284074607, + "grad_norm": 0.336220920085907, + "learning_rate": 1.057544326129723e-06, + "loss": 0.4437, + "step": 564 + }, + { + "epoch": 2.431850789096126, + "grad_norm": 0.2975854277610779, + "learning_rate": 1.0421609351954599e-06, + "loss": 0.3788, + "step": 565 + }, + { + "epoch": 2.436154949784792, + "grad_norm": 0.2922147214412689, + "learning_rate": 1.026877224322923e-06, + "loss": 0.4049, + "step": 566 + }, + { + "epoch": 2.4404591104734576, + "grad_norm": 0.3053329288959503, + "learning_rate": 1.0116935784396482e-06, + "loss": 0.4196, + "step": 567 + }, + { + "epoch": 2.4447632711621234, + "grad_norm": 0.2925123870372772, + "learning_rate": 9.966103799529891e-07, + "loss": 0.3872, + "step": 568 + }, + { + "epoch": 2.4490674318507892, + "grad_norm": 0.25717809796333313, + "learning_rate": 9.816280087404851e-07, + "loss": 0.3512, + "step": 569 + }, + { + "epoch": 2.4533715925394546, + "grad_norm": 0.2897495925426483, + "learning_rate": 9.66746842140287e-07, + "loss": 0.451, + "step": 570 + }, + { + "epoch": 2.4576757532281204, + "grad_norm": 0.279236376285553, + "learning_rate": 9.519672549416659e-07, + "loss": 0.3622, + "step": 571 + }, + { + "epoch": 2.461979913916786, + "grad_norm": 0.29578229784965515, + "learning_rate": 9.372896193755621e-07, + "loss": 0.419, + "step": 572 + }, + { + "epoch": 2.466284074605452, + "grad_norm": 0.28431037068367004, + "learning_rate": 9.227143051052162e-07, + "loss": 0.3895, + "step": 573 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.30916470289230347, + "learning_rate": 9.082416792168608e-07, + "loss": 0.3914, + "step": 574 + }, + { + "epoch": 2.4748923959827835, + "grad_norm": 0.2944723963737488, + "learning_rate": 8.938721062104727e-07, + "loss": 0.4186, + "step": 575 + }, + { + "epoch": 2.479196556671449, + "grad_norm": 0.2983667254447937, + "learning_rate": 8.7960594799059e-07, + "loss": 0.4174, + "step": 576 + }, + { + "epoch": 2.4835007173601147, + "grad_norm": 0.3032497763633728, + "learning_rate": 8.654435638572e-07, + "loss": 0.3805, + "step": 577 + }, + { + "epoch": 2.4878048780487805, + "grad_norm": 0.3096209466457367, + "learning_rate": 8.513853104966951e-07, + "loss": 0.4586, + "step": 578 + }, + { + "epoch": 2.4921090387374463, + "grad_norm": 0.285931259393692, + "learning_rate": 8.374315419728784e-07, + "loss": 0.365, + "step": 579 + }, + { + "epoch": 2.496413199426112, + "grad_norm": 0.2848215401172638, + "learning_rate": 8.235826097180566e-07, + "loss": 0.4182, + "step": 580 + }, + { + "epoch": 2.500717360114778, + "grad_norm": 0.299034059047699, + "learning_rate": 8.098388625241854e-07, + "loss": 0.4259, + "step": 581 + }, + { + "epoch": 2.505021520803443, + "grad_norm": 0.29463037848472595, + "learning_rate": 7.962006465340821e-07, + "loss": 0.3974, + "step": 582 + }, + { + "epoch": 2.509325681492109, + "grad_norm": 0.2773649990558624, + "learning_rate": 7.8266830523271e-07, + "loss": 0.3681, + "step": 583 + }, + { + "epoch": 2.513629842180775, + "grad_norm": 0.2890886068344116, + "learning_rate": 7.692421794385313e-07, + "loss": 0.4115, + "step": 584 + }, + { + "epoch": 2.5179340028694406, + "grad_norm": 0.2763752043247223, + "learning_rate": 7.559226072949166e-07, + "loss": 0.3985, + "step": 585 + }, + { + "epoch": 2.5222381635581064, + "grad_norm": 0.28498420119285583, + "learning_rate": 7.427099242616348e-07, + "loss": 0.3819, + "step": 586 + }, + { + "epoch": 2.5265423242467717, + "grad_norm": 0.2716716527938843, + "learning_rate": 7.296044631064014e-07, + "loss": 0.4024, + "step": 587 + }, + { + "epoch": 2.5308464849354375, + "grad_norm": 0.295413613319397, + "learning_rate": 7.166065538964955e-07, + "loss": 0.4236, + "step": 588 + }, + { + "epoch": 2.5351506456241033, + "grad_norm": 0.27543219923973083, + "learning_rate": 7.037165239904514e-07, + "loss": 0.41, + "step": 589 + }, + { + "epoch": 2.539454806312769, + "grad_norm": 0.2738160789012909, + "learning_rate": 6.909346980298093e-07, + "loss": 0.4002, + "step": 590 + }, + { + "epoch": 2.543758967001435, + "grad_norm": 0.2854576110839844, + "learning_rate": 6.782613979309443e-07, + "loss": 0.3978, + "step": 591 + }, + { + "epoch": 2.5480631276901002, + "grad_norm": 0.2689387798309326, + "learning_rate": 6.656969428769567e-07, + "loss": 0.3963, + "step": 592 + }, + { + "epoch": 2.552367288378766, + "grad_norm": 0.2699349522590637, + "learning_rate": 6.532416493096272e-07, + "loss": 0.4085, + "step": 593 + }, + { + "epoch": 2.556671449067432, + "grad_norm": 0.25454413890838623, + "learning_rate": 6.408958309214597e-07, + "loss": 0.3766, + "step": 594 + }, + { + "epoch": 2.5609756097560976, + "grad_norm": 0.3066865801811218, + "learning_rate": 6.286597986477683e-07, + "loss": 0.4144, + "step": 595 + }, + { + "epoch": 2.5652797704447634, + "grad_norm": 0.2773991823196411, + "learning_rate": 6.165338606588517e-07, + "loss": 0.4158, + "step": 596 + }, + { + "epoch": 2.5695839311334288, + "grad_norm": 0.28849199414253235, + "learning_rate": 6.045183223522339e-07, + "loss": 0.3781, + "step": 597 + }, + { + "epoch": 2.5738880918220945, + "grad_norm": 0.32376885414123535, + "learning_rate": 5.926134863449712e-07, + "loss": 0.3767, + "step": 598 + }, + { + "epoch": 2.5781922525107603, + "grad_norm": 0.2920165956020355, + "learning_rate": 5.808196524660253e-07, + "loss": 0.4263, + "step": 599 + }, + { + "epoch": 2.582496413199426, + "grad_norm": 0.2848154604434967, + "learning_rate": 5.691371177487215e-07, + "loss": 0.3821, + "step": 600 + }, + { + "epoch": 2.586800573888092, + "grad_norm": 0.28486862778663635, + "learning_rate": 5.575661764232593e-07, + "loss": 0.4167, + "step": 601 + }, + { + "epoch": 2.5911047345767573, + "grad_norm": 0.28236979246139526, + "learning_rate": 5.461071199093048e-07, + "loss": 0.3902, + "step": 602 + }, + { + "epoch": 2.5954088952654235, + "grad_norm": 0.2632196545600891, + "learning_rate": 5.347602368086563e-07, + "loss": 0.3849, + "step": 603 + }, + { + "epoch": 2.599713055954089, + "grad_norm": 0.29081490635871887, + "learning_rate": 5.235258128979676e-07, + "loss": 0.4345, + "step": 604 + }, + { + "epoch": 2.6040172166427547, + "grad_norm": 0.2874118983745575, + "learning_rate": 5.124041311215544e-07, + "loss": 0.3843, + "step": 605 + }, + { + "epoch": 2.6083213773314204, + "grad_norm": 0.2967386841773987, + "learning_rate": 5.0139547158427e-07, + "loss": 0.3959, + "step": 606 + }, + { + "epoch": 2.6126255380200862, + "grad_norm": 0.27277684211730957, + "learning_rate": 4.905001115444475e-07, + "loss": 0.4323, + "step": 607 + }, + { + "epoch": 2.616929698708752, + "grad_norm": 0.2767053544521332, + "learning_rate": 4.797183254069176e-07, + "loss": 0.3858, + "step": 608 + }, + { + "epoch": 2.6212338593974174, + "grad_norm": 0.28239718079566956, + "learning_rate": 4.690503847160982e-07, + "loss": 0.4297, + "step": 609 + }, + { + "epoch": 2.625538020086083, + "grad_norm": 0.2802097201347351, + "learning_rate": 4.5849655814915683e-07, + "loss": 0.3975, + "step": 610 + }, + { + "epoch": 2.629842180774749, + "grad_norm": 0.2911134958267212, + "learning_rate": 4.4805711150924304e-07, + "loss": 0.3648, + "step": 611 + }, + { + "epoch": 2.6341463414634148, + "grad_norm": 0.30303090810775757, + "learning_rate": 4.3773230771879004e-07, + "loss": 0.4397, + "step": 612 + }, + { + "epoch": 2.6384505021520805, + "grad_norm": 0.29142701625823975, + "learning_rate": 4.2752240681290027e-07, + "loss": 0.3775, + "step": 613 + }, + { + "epoch": 2.642754662840746, + "grad_norm": 0.28723323345184326, + "learning_rate": 4.1742766593278974e-07, + "loss": 0.4052, + "step": 614 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.2821033000946045, + "learning_rate": 4.074483393193135e-07, + "loss": 0.3822, + "step": 615 + }, + { + "epoch": 2.6513629842180775, + "grad_norm": 0.28537237644195557, + "learning_rate": 3.9758467830656623e-07, + "loss": 0.3862, + "step": 616 + }, + { + "epoch": 2.6556671449067433, + "grad_norm": 0.29671528935432434, + "learning_rate": 3.8783693131554836e-07, + "loss": 0.3844, + "step": 617 + }, + { + "epoch": 2.659971305595409, + "grad_norm": 0.6448749303817749, + "learning_rate": 3.782053438479094e-07, + "loss": 0.4243, + "step": 618 + }, + { + "epoch": 2.6642754662840744, + "grad_norm": 0.280321329832077, + "learning_rate": 3.686901584797675e-07, + "loss": 0.3809, + "step": 619 + }, + { + "epoch": 2.66857962697274, + "grad_norm": 0.2735440135002136, + "learning_rate": 3.5929161485559694e-07, + "loss": 0.3873, + "step": 620 + }, + { + "epoch": 2.672883787661406, + "grad_norm": 0.2726655602455139, + "learning_rate": 3.5000994968219406e-07, + "loss": 0.3545, + "step": 621 + }, + { + "epoch": 2.677187948350072, + "grad_norm": 0.30105406045913696, + "learning_rate": 3.4084539672271764e-07, + "loss": 0.4052, + "step": 622 + }, + { + "epoch": 2.6814921090387376, + "grad_norm": 0.2675243318080902, + "learning_rate": 3.3179818679079936e-07, + "loss": 0.3843, + "step": 623 + }, + { + "epoch": 2.685796269727403, + "grad_norm": 0.281837522983551, + "learning_rate": 3.228685477447291e-07, + "loss": 0.3866, + "step": 624 + }, + { + "epoch": 2.6901004304160687, + "grad_norm": 0.27979665994644165, + "learning_rate": 3.140567044817172e-07, + "loss": 0.3993, + "step": 625 + }, + { + "epoch": 2.6944045911047345, + "grad_norm": 0.2708576023578644, + "learning_rate": 3.0536287893223603e-07, + "loss": 0.3876, + "step": 626 + }, + { + "epoch": 2.6987087517934003, + "grad_norm": 0.2992185950279236, + "learning_rate": 2.967872900544194e-07, + "loss": 0.3943, + "step": 627 + }, + { + "epoch": 2.703012912482066, + "grad_norm": 0.2894390821456909, + "learning_rate": 2.883301538285582e-07, + "loss": 0.4108, + "step": 628 + }, + { + "epoch": 2.7073170731707314, + "grad_norm": 0.27265292406082153, + "learning_rate": 2.799916832516575e-07, + "loss": 0.3659, + "step": 629 + }, + { + "epoch": 2.7116212338593977, + "grad_norm": 0.2490788698196411, + "learning_rate": 2.717720883320685e-07, + "loss": 0.3805, + "step": 630 + }, + { + "epoch": 2.715925394548063, + "grad_norm": 0.29212287068367004, + "learning_rate": 2.6367157608420347e-07, + "loss": 0.417, + "step": 631 + }, + { + "epoch": 2.720229555236729, + "grad_norm": 0.2623874247074127, + "learning_rate": 2.556903505233216e-07, + "loss": 0.3635, + "step": 632 + }, + { + "epoch": 2.7245337159253946, + "grad_norm": 0.28832924365997314, + "learning_rate": 2.4782861266038904e-07, + "loss": 0.4361, + "step": 633 + }, + { + "epoch": 2.7288378766140604, + "grad_norm": 0.2711504101753235, + "learning_rate": 2.4008656049701875e-07, + "loss": 0.4118, + "step": 634 + }, + { + "epoch": 2.733142037302726, + "grad_norm": 0.2761891782283783, + "learning_rate": 2.3246438902048196e-07, + "loss": 0.3608, + "step": 635 + }, + { + "epoch": 2.7374461979913915, + "grad_norm": 0.2847788631916046, + "learning_rate": 2.2496229019879635e-07, + "loss": 0.4244, + "step": 636 + }, + { + "epoch": 2.7417503586800573, + "grad_norm": 0.27423596382141113, + "learning_rate": 2.175804529758929e-07, + "loss": 0.4173, + "step": 637 + }, + { + "epoch": 2.746054519368723, + "grad_norm": 0.27934539318084717, + "learning_rate": 2.1031906326685946e-07, + "loss": 0.399, + "step": 638 + }, + { + "epoch": 2.750358680057389, + "grad_norm": 0.27947354316711426, + "learning_rate": 2.0317830395325255e-07, + "loss": 0.3802, + "step": 639 + }, + { + "epoch": 2.7546628407460547, + "grad_norm": 0.29531410336494446, + "learning_rate": 1.9615835487849677e-07, + "loss": 0.3867, + "step": 640 + }, + { + "epoch": 2.75896700143472, + "grad_norm": 0.29944562911987305, + "learning_rate": 1.8925939284335225e-07, + "loss": 0.3963, + "step": 641 + }, + { + "epoch": 2.763271162123386, + "grad_norm": 0.2796535789966583, + "learning_rate": 1.824815916014644e-07, + "loss": 0.4116, + "step": 642 + }, + { + "epoch": 2.7675753228120517, + "grad_norm": 0.2771192789077759, + "learning_rate": 1.7582512185498446e-07, + "loss": 0.3818, + "step": 643 + }, + { + "epoch": 2.7718794835007174, + "grad_norm": 0.2795744240283966, + "learning_rate": 1.6929015125027314e-07, + "loss": 0.4179, + "step": 644 + }, + { + "epoch": 2.7761836441893832, + "grad_norm": 0.2809208035469055, + "learning_rate": 1.6287684437367724e-07, + "loss": 0.4116, + "step": 645 + }, + { + "epoch": 2.7804878048780486, + "grad_norm": 0.31142884492874146, + "learning_rate": 1.5658536274738623e-07, + "loss": 0.4216, + "step": 646 + }, + { + "epoch": 2.7847919655667144, + "grad_norm": 0.3072478175163269, + "learning_rate": 1.504158648253584e-07, + "loss": 0.4378, + "step": 647 + }, + { + "epoch": 2.78909612625538, + "grad_norm": 0.2876189947128296, + "learning_rate": 1.443685059893396e-07, + "loss": 0.4068, + "step": 648 + }, + { + "epoch": 2.793400286944046, + "grad_norm": 0.28023388981819153, + "learning_rate": 1.3844343854494123e-07, + "loss": 0.39, + "step": 649 + }, + { + "epoch": 2.7977044476327118, + "grad_norm": 0.2727784514427185, + "learning_rate": 1.3264081171780797e-07, + "loss": 0.3683, + "step": 650 + }, + { + "epoch": 2.802008608321377, + "grad_norm": 0.28944262862205505, + "learning_rate": 1.2696077164986e-07, + "loss": 0.3965, + "step": 651 + }, + { + "epoch": 2.806312769010043, + "grad_norm": 0.27676907181739807, + "learning_rate": 1.2140346139561277e-07, + "loss": 0.4027, + "step": 652 + }, + { + "epoch": 2.8106169296987087, + "grad_norm": 0.4450412094593048, + "learning_rate": 1.1596902091857043e-07, + "loss": 0.4236, + "step": 653 + }, + { + "epoch": 2.8149210903873745, + "grad_norm": 0.27743786573410034, + "learning_rate": 1.1065758708770468e-07, + "loss": 0.4033, + "step": 654 + }, + { + "epoch": 2.8192252510760403, + "grad_norm": 0.26591548323631287, + "learning_rate": 1.0546929367400705e-07, + "loss": 0.3805, + "step": 655 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.28023067116737366, + "learning_rate": 1.004042713471165e-07, + "loss": 0.3917, + "step": 656 + }, + { + "epoch": 2.827833572453372, + "grad_norm": 0.2958438992500305, + "learning_rate": 9.546264767203328e-08, + "loss": 0.4197, + "step": 657 + }, + { + "epoch": 2.832137733142037, + "grad_norm": 0.26877066493034363, + "learning_rate": 9.064454710590253e-08, + "loss": 0.3731, + "step": 658 + }, + { + "epoch": 2.836441893830703, + "grad_norm": 0.2890089750289917, + "learning_rate": 8.595009099488238e-08, + "loss": 0.3894, + "step": 659 + }, + { + "epoch": 2.840746054519369, + "grad_norm": 0.27975407242774963, + "learning_rate": 8.137939757108526e-08, + "loss": 0.4, + "step": 660 + }, + { + "epoch": 2.8450502152080346, + "grad_norm": 0.3021981418132782, + "learning_rate": 7.693258194960252e-08, + "loss": 0.427, + "step": 661 + }, + { + "epoch": 2.8493543758967004, + "grad_norm": 0.2643302381038666, + "learning_rate": 7.260975612560173e-08, + "loss": 0.3931, + "step": 662 + }, + { + "epoch": 2.8536585365853657, + "grad_norm": 0.31636062264442444, + "learning_rate": 6.84110289715112e-08, + "loss": 0.3934, + "step": 663 + }, + { + "epoch": 2.8579626972740315, + "grad_norm": 0.29942587018013, + "learning_rate": 6.433650623427379e-08, + "loss": 0.4066, + "step": 664 + }, + { + "epoch": 2.8622668579626973, + "grad_norm": 0.3114181160926819, + "learning_rate": 6.038629053268464e-08, + "loss": 0.4122, + "step": 665 + }, + { + "epoch": 2.866571018651363, + "grad_norm": 0.2796793282032013, + "learning_rate": 5.6560481354807625e-08, + "loss": 0.3884, + "step": 666 + }, + { + "epoch": 2.870875179340029, + "grad_norm": 0.2825842797756195, + "learning_rate": 5.285917505546967e-08, + "loss": 0.4035, + "step": 667 + }, + { + "epoch": 2.8751793400286942, + "grad_norm": 0.28558847308158875, + "learning_rate": 4.928246485383148e-08, + "loss": 0.3689, + "step": 668 + }, + { + "epoch": 2.87948350071736, + "grad_norm": 0.26647746562957764, + "learning_rate": 4.583044083104282e-08, + "loss": 0.389, + "step": 669 + }, + { + "epoch": 2.883787661406026, + "grad_norm": 0.29167062044143677, + "learning_rate": 4.250318992797375e-08, + "loss": 0.4015, + "step": 670 + }, + { + "epoch": 2.8880918220946916, + "grad_norm": 0.2896280288696289, + "learning_rate": 3.9300795943021943e-08, + "loss": 0.3971, + "step": 671 + }, + { + "epoch": 2.8923959827833574, + "grad_norm": 0.26931023597717285, + "learning_rate": 3.622333953000601e-08, + "loss": 0.369, + "step": 672 + }, + { + "epoch": 2.8967001434720228, + "grad_norm": 0.28395572304725647, + "learning_rate": 3.3270898196129944e-08, + "loss": 0.403, + "step": 673 + }, + { + "epoch": 2.9010043041606886, + "grad_norm": 0.29338788986206055, + "learning_rate": 3.0443546300035764e-08, + "loss": 0.4154, + "step": 674 + }, + { + "epoch": 2.9053084648493543, + "grad_norm": 0.29898518323898315, + "learning_rate": 2.77413550499267e-08, + "loss": 0.4208, + "step": 675 + }, + { + "epoch": 2.90961262553802, + "grad_norm": 0.26712504029273987, + "learning_rate": 2.516439250177749e-08, + "loss": 0.3764, + "step": 676 + }, + { + "epoch": 2.913916786226686, + "grad_norm": 0.2811000347137451, + "learning_rate": 2.2712723557616335e-08, + "loss": 0.3865, + "step": 677 + }, + { + "epoch": 2.9182209469153513, + "grad_norm": 0.2783554792404175, + "learning_rate": 2.038640996389285e-08, + "loss": 0.4393, + "step": 678 + }, + { + "epoch": 2.922525107604017, + "grad_norm": 0.27642354369163513, + "learning_rate": 1.818551030992377e-08, + "loss": 0.377, + "step": 679 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.30214452743530273, + "learning_rate": 1.6110080026414123e-08, + "loss": 0.3919, + "step": 680 + }, + { + "epoch": 2.9311334289813487, + "grad_norm": 0.2953914999961853, + "learning_rate": 1.4160171384064447e-08, + "loss": 0.4057, + "step": 681 + }, + { + "epoch": 2.9354375896700144, + "grad_norm": 0.28328144550323486, + "learning_rate": 1.2335833492252425e-08, + "loss": 0.4022, + "step": 682 + }, + { + "epoch": 2.93974175035868, + "grad_norm": 0.2768747806549072, + "learning_rate": 1.063711229779718e-08, + "loss": 0.3835, + "step": 683 + }, + { + "epoch": 2.944045911047346, + "grad_norm": 0.2623550295829773, + "learning_rate": 9.06405058380022e-09, + "loss": 0.3517, + "step": 684 + }, + { + "epoch": 2.9483500717360114, + "grad_norm": 0.29734596610069275, + "learning_rate": 7.61668796857018e-09, + "loss": 0.4198, + "step": 685 + }, + { + "epoch": 2.952654232424677, + "grad_norm": 0.27858829498291016, + "learning_rate": 6.295060904623618e-09, + "loss": 0.3796, + "step": 686 + }, + { + "epoch": 2.956958393113343, + "grad_norm": 0.2829955518245697, + "learning_rate": 5.099202677767978e-09, + "loss": 0.4188, + "step": 687 + }, + { + "epoch": 2.9612625538020088, + "grad_norm": 0.2835357189178467, + "learning_rate": 4.02914340626226e-09, + "loss": 0.42, + "step": 688 + }, + { + "epoch": 2.9655667144906745, + "grad_norm": 0.28330615162849426, + "learning_rate": 3.0849100400587307e-09, + "loss": 0.3739, + "step": 689 + }, + { + "epoch": 2.96987087517934, + "grad_norm": 0.30792394280433655, + "learning_rate": 2.2665263601240328e-09, + "loss": 0.4282, + "step": 690 + }, + { + "epoch": 2.9741750358680057, + "grad_norm": 0.2824040651321411, + "learning_rate": 1.5740129778413215e-09, + "loss": 0.409, + "step": 691 + }, + { + "epoch": 2.9784791965566715, + "grad_norm": 0.2838318943977356, + "learning_rate": 1.0073873344895735e-09, + "loss": 0.383, + "step": 692 + }, + { + "epoch": 2.9827833572453373, + "grad_norm": 0.2872404456138611, + "learning_rate": 5.666637008061582e-10, + "loss": 0.3953, + "step": 693 + }, + { + "epoch": 2.987087517934003, + "grad_norm": 0.2668392062187195, + "learning_rate": 2.5185317662490547e-10, + "loss": 0.383, + "step": 694 + }, + { + "epoch": 2.9913916786226684, + "grad_norm": 0.2843668460845947, + "learning_rate": 6.296369059854978e-11, + "loss": 0.4103, + "step": 695 + }, + { + "epoch": 2.995695839311334, + "grad_norm": 0.27907493710517883, + "learning_rate": 0.0, + "loss": 0.3768, + "step": 696 + }, + { + "epoch": 2.995695839311334, + "step": 696, + "total_flos": 1029628835528704.0, + "train_loss": 0.45175294526692095, + "train_runtime": 14742.7033, + "train_samples_per_second": 4.534, + "train_steps_per_second": 0.047 + } + ], + "logging_steps": 1.0, + "max_steps": 696, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1029628835528704.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}