diff --git "a/checkpoint-3880/trainer_state.json" "b/checkpoint-3880/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3880/trainer_state.json" @@ -0,0 +1,6823 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 3880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002061855670103093, + "grad_norm": 4.401242256164551, + "learning_rate": 2.061855670103093e-06, + "loss": 5.7831, + "step": 4 + }, + { + "epoch": 0.004123711340206186, + "grad_norm": 4.165046691894531, + "learning_rate": 4.123711340206186e-06, + "loss": 5.6527, + "step": 8 + }, + { + "epoch": 0.006185567010309278, + "grad_norm": 3.4843931198120117, + "learning_rate": 6.185567010309279e-06, + "loss": 5.5896, + "step": 12 + }, + { + "epoch": 0.008247422680412371, + "grad_norm": 3.29345965385437, + "learning_rate": 8.247422680412371e-06, + "loss": 5.3939, + "step": 16 + }, + { + "epoch": 0.010309278350515464, + "grad_norm": 3.1640310287475586, + "learning_rate": 1.0309278350515464e-05, + "loss": 5.0285, + "step": 20 + }, + { + "epoch": 0.012371134020618556, + "grad_norm": 3.238795757293701, + "learning_rate": 1.2371134020618558e-05, + "loss": 5.3506, + "step": 24 + }, + { + "epoch": 0.01443298969072165, + "grad_norm": 2.786606550216675, + "learning_rate": 1.4432989690721649e-05, + "loss": 4.8622, + "step": 28 + }, + { + "epoch": 0.016494845360824743, + "grad_norm": 2.689506769180298, + "learning_rate": 1.6494845360824743e-05, + "loss": 4.7998, + "step": 32 + }, + { + "epoch": 0.018556701030927835, + "grad_norm": 3.6669228076934814, + "learning_rate": 1.8556701030927837e-05, + "loss": 4.8852, + "step": 36 + }, + { + "epoch": 0.020618556701030927, + "grad_norm": 2.614325761795044, + "learning_rate": 2.0618556701030927e-05, + "loss": 4.5481, + "step": 40 + }, + { + "epoch": 0.02268041237113402, + "grad_norm": 2.6969540119171143, + "learning_rate": 2.268041237113402e-05, + "loss": 4.4287, + "step": 44 + }, + { + "epoch": 0.024742268041237112, + "grad_norm": 2.5721373558044434, + "learning_rate": 2.4742268041237116e-05, + "loss": 4.2744, + "step": 48 + }, + { + "epoch": 0.026804123711340205, + "grad_norm": 2.7030205726623535, + "learning_rate": 2.6804123711340206e-05, + "loss": 4.2424, + "step": 52 + }, + { + "epoch": 0.0288659793814433, + "grad_norm": 2.5551116466522217, + "learning_rate": 2.8865979381443297e-05, + "loss": 4.1868, + "step": 56 + }, + { + "epoch": 0.030927835051546393, + "grad_norm": 2.6516504287719727, + "learning_rate": 3.0927835051546395e-05, + "loss": 3.9569, + "step": 60 + }, + { + "epoch": 0.032989690721649485, + "grad_norm": 2.516420602798462, + "learning_rate": 3.2989690721649485e-05, + "loss": 4.0263, + "step": 64 + }, + { + "epoch": 0.03505154639175258, + "grad_norm": 2.4037609100341797, + "learning_rate": 3.5051546391752576e-05, + "loss": 4.001, + "step": 68 + }, + { + "epoch": 0.03711340206185567, + "grad_norm": 2.3336949348449707, + "learning_rate": 3.7113402061855674e-05, + "loss": 3.7654, + "step": 72 + }, + { + "epoch": 0.03917525773195876, + "grad_norm": 2.5022168159484863, + "learning_rate": 3.9175257731958764e-05, + "loss": 3.7868, + "step": 76 + }, + { + "epoch": 0.041237113402061855, + "grad_norm": 2.496216297149658, + "learning_rate": 4.1237113402061855e-05, + "loss": 3.7257, + "step": 80 + }, + { + "epoch": 0.04329896907216495, + "grad_norm": 2.3523142337799072, + "learning_rate": 4.329896907216495e-05, + "loss": 3.6547, + 
"step": 84 + }, + { + "epoch": 0.04536082474226804, + "grad_norm": 2.4950451850891113, + "learning_rate": 4.536082474226804e-05, + "loss": 3.5265, + "step": 88 + }, + { + "epoch": 0.04742268041237113, + "grad_norm": 2.5210957527160645, + "learning_rate": 4.7422680412371134e-05, + "loss": 3.5772, + "step": 92 + }, + { + "epoch": 0.049484536082474224, + "grad_norm": 2.3704540729522705, + "learning_rate": 4.948453608247423e-05, + "loss": 3.6708, + "step": 96 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 2.361452102661133, + "learning_rate": 5.1546391752577315e-05, + "loss": 3.4935, + "step": 100 + }, + { + "epoch": 0.05360824742268041, + "grad_norm": 2.4770519733428955, + "learning_rate": 5.360824742268041e-05, + "loss": 3.4953, + "step": 104 + }, + { + "epoch": 0.05567010309278351, + "grad_norm": 2.6673684120178223, + "learning_rate": 5.567010309278351e-05, + "loss": 3.4362, + "step": 108 + }, + { + "epoch": 0.0577319587628866, + "grad_norm": 2.3836777210235596, + "learning_rate": 5.7731958762886594e-05, + "loss": 3.4341, + "step": 112 + }, + { + "epoch": 0.05979381443298969, + "grad_norm": 2.4742355346679688, + "learning_rate": 5.979381443298969e-05, + "loss": 3.4443, + "step": 116 + }, + { + "epoch": 0.061855670103092786, + "grad_norm": 2.513885974884033, + "learning_rate": 6.185567010309279e-05, + "loss": 3.3867, + "step": 120 + }, + { + "epoch": 0.06391752577319587, + "grad_norm": 2.633009433746338, + "learning_rate": 6.391752577319587e-05, + "loss": 3.3804, + "step": 124 + }, + { + "epoch": 0.06597938144329897, + "grad_norm": 2.7852959632873535, + "learning_rate": 6.597938144329897e-05, + "loss": 3.4522, + "step": 128 + }, + { + "epoch": 0.06804123711340206, + "grad_norm": 2.9940905570983887, + "learning_rate": 6.804123711340207e-05, + "loss": 3.3606, + "step": 132 + }, + { + "epoch": 0.07010309278350516, + "grad_norm": 2.4730873107910156, + "learning_rate": 7.010309278350515e-05, + "loss": 3.2392, + "step": 136 + }, + { + "epoch": 0.07216494845360824, + "grad_norm": 2.1951076984405518, + "learning_rate": 7.216494845360825e-05, + "loss": 3.4019, + "step": 140 + }, + { + "epoch": 0.07422680412371134, + "grad_norm": 1.9874905347824097, + "learning_rate": 7.422680412371135e-05, + "loss": 3.1746, + "step": 144 + }, + { + "epoch": 0.07628865979381444, + "grad_norm": 2.2700319290161133, + "learning_rate": 7.628865979381443e-05, + "loss": 3.1944, + "step": 148 + }, + { + "epoch": 0.07835051546391752, + "grad_norm": 2.2276201248168945, + "learning_rate": 7.835051546391753e-05, + "loss": 3.2753, + "step": 152 + }, + { + "epoch": 0.08041237113402062, + "grad_norm": 2.150624990463257, + "learning_rate": 8.041237113402063e-05, + "loss": 3.2761, + "step": 156 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 2.2154808044433594, + "learning_rate": 8.247422680412371e-05, + "loss": 3.1575, + "step": 160 + }, + { + "epoch": 0.08453608247422681, + "grad_norm": 2.6334950923919678, + "learning_rate": 8.453608247422681e-05, + "loss": 3.185, + "step": 164 + }, + { + "epoch": 0.0865979381443299, + "grad_norm": 2.70011305809021, + "learning_rate": 8.65979381443299e-05, + "loss": 3.1316, + "step": 168 + }, + { + "epoch": 0.088659793814433, + "grad_norm": 2.533686637878418, + "learning_rate": 8.865979381443299e-05, + "loss": 3.1387, + "step": 172 + }, + { + "epoch": 0.09072164948453608, + "grad_norm": 2.154860019683838, + "learning_rate": 9.072164948453609e-05, + "loss": 3.1592, + "step": 176 + }, + { + "epoch": 0.09278350515463918, + "grad_norm": 2.1940603256225586, + "learning_rate": 
9.278350515463918e-05, + "loss": 3.1773, + "step": 180 + }, + { + "epoch": 0.09484536082474226, + "grad_norm": 2.257709264755249, + "learning_rate": 9.484536082474227e-05, + "loss": 3.1052, + "step": 184 + }, + { + "epoch": 0.09690721649484536, + "grad_norm": 2.08589768409729, + "learning_rate": 9.690721649484537e-05, + "loss": 3.0373, + "step": 188 + }, + { + "epoch": 0.09896907216494845, + "grad_norm": 2.060812473297119, + "learning_rate": 9.896907216494846e-05, + "loss": 3.112, + "step": 192 + }, + { + "epoch": 0.10103092783505155, + "grad_norm": 2.3649418354034424, + "learning_rate": 9.999992735780168e-05, + "loss": 3.198, + "step": 196 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 2.339231252670288, + "learning_rate": 9.999934622148157e-05, + "loss": 3.1736, + "step": 200 + }, + { + "epoch": 0.10515463917525773, + "grad_norm": 2.037454128265381, + "learning_rate": 9.999818395559577e-05, + "loss": 3.1006, + "step": 204 + }, + { + "epoch": 0.10721649484536082, + "grad_norm": 2.343181848526001, + "learning_rate": 9.999644057365295e-05, + "loss": 3.1055, + "step": 208 + }, + { + "epoch": 0.10927835051546392, + "grad_norm": 2.3469326496124268, + "learning_rate": 9.999411609591603e-05, + "loss": 3.0211, + "step": 212 + }, + { + "epoch": 0.11134020618556702, + "grad_norm": 2.2599899768829346, + "learning_rate": 9.999121054940182e-05, + "loss": 3.0236, + "step": 216 + }, + { + "epoch": 0.1134020618556701, + "grad_norm": 2.114220142364502, + "learning_rate": 9.998772396788072e-05, + "loss": 3.0895, + "step": 220 + }, + { + "epoch": 0.1154639175257732, + "grad_norm": 2.0050344467163086, + "learning_rate": 9.998365639187638e-05, + "loss": 3.0542, + "step": 224 + }, + { + "epoch": 0.11752577319587629, + "grad_norm": 2.1245343685150146, + "learning_rate": 9.997900786866519e-05, + "loss": 3.0631, + "step": 228 + }, + { + "epoch": 0.11958762886597939, + "grad_norm": 2.0271108150482178, + "learning_rate": 9.997377845227576e-05, + "loss": 3.1621, + "step": 232 + }, + { + "epoch": 0.12164948453608247, + "grad_norm": 2.075441837310791, + "learning_rate": 9.996796820348822e-05, + "loss": 2.9906, + "step": 236 + }, + { + "epoch": 0.12371134020618557, + "grad_norm": 2.118668556213379, + "learning_rate": 9.996157718983362e-05, + "loss": 3.0718, + "step": 240 + }, + { + "epoch": 0.12577319587628866, + "grad_norm": 2.0540637969970703, + "learning_rate": 9.995460548559307e-05, + "loss": 3.077, + "step": 244 + }, + { + "epoch": 0.12783505154639174, + "grad_norm": 2.2028098106384277, + "learning_rate": 9.99470531717969e-05, + "loss": 3.1064, + "step": 248 + }, + { + "epoch": 0.12989690721649486, + "grad_norm": 2.0607352256774902, + "learning_rate": 9.993892033622374e-05, + "loss": 3.0582, + "step": 252 + }, + { + "epoch": 0.13195876288659794, + "grad_norm": 2.054838180541992, + "learning_rate": 9.993020707339939e-05, + "loss": 2.991, + "step": 256 + }, + { + "epoch": 0.13402061855670103, + "grad_norm": 1.9367263317108154, + "learning_rate": 9.992091348459591e-05, + "loss": 2.9583, + "step": 260 + }, + { + "epoch": 0.1360824742268041, + "grad_norm": 1.9889065027236938, + "learning_rate": 9.99110396778303e-05, + "loss": 2.9845, + "step": 264 + }, + { + "epoch": 0.13814432989690723, + "grad_norm": 2.249467134475708, + "learning_rate": 9.990058576786325e-05, + "loss": 3.0511, + "step": 268 + }, + { + "epoch": 0.1402061855670103, + "grad_norm": 2.011195659637451, + "learning_rate": 9.98895518761979e-05, + "loss": 2.9925, + "step": 272 + }, + { + "epoch": 0.1422680412371134, + "grad_norm": 
2.0568485260009766, + "learning_rate": 9.987793813107833e-05, + "loss": 2.9491, + "step": 276 + }, + { + "epoch": 0.14432989690721648, + "grad_norm": 1.9748948812484741, + "learning_rate": 9.986574466748812e-05, + "loss": 3.0667, + "step": 280 + }, + { + "epoch": 0.1463917525773196, + "grad_norm": 2.234513282775879, + "learning_rate": 9.985297162714877e-05, + "loss": 2.8514, + "step": 284 + }, + { + "epoch": 0.14845360824742268, + "grad_norm": 1.9163928031921387, + "learning_rate": 9.983961915851804e-05, + "loss": 3.1231, + "step": 288 + }, + { + "epoch": 0.15051546391752577, + "grad_norm": 1.9212491512298584, + "learning_rate": 9.982568741678823e-05, + "loss": 2.9029, + "step": 292 + }, + { + "epoch": 0.15257731958762888, + "grad_norm": 2.065171241760254, + "learning_rate": 9.981117656388445e-05, + "loss": 2.9171, + "step": 296 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 1.979385495185852, + "learning_rate": 9.979608676846258e-05, + "loss": 3.0767, + "step": 300 + }, + { + "epoch": 0.15670103092783505, + "grad_norm": 1.9221059083938599, + "learning_rate": 9.978041820590743e-05, + "loss": 3.0627, + "step": 304 + }, + { + "epoch": 0.15876288659793814, + "grad_norm": 2.0437042713165283, + "learning_rate": 9.97641710583307e-05, + "loss": 2.938, + "step": 308 + }, + { + "epoch": 0.16082474226804125, + "grad_norm": 1.9110627174377441, + "learning_rate": 9.974734551456881e-05, + "loss": 2.9462, + "step": 312 + }, + { + "epoch": 0.16288659793814433, + "grad_norm": 1.8492796421051025, + "learning_rate": 9.972994177018074e-05, + "loss": 3.0006, + "step": 316 + }, + { + "epoch": 0.16494845360824742, + "grad_norm": 1.9782150983810425, + "learning_rate": 9.971196002744575e-05, + "loss": 2.9449, + "step": 320 + }, + { + "epoch": 0.1670103092783505, + "grad_norm": 1.8963398933410645, + "learning_rate": 9.969340049536099e-05, + "loss": 2.9932, + "step": 324 + }, + { + "epoch": 0.16907216494845362, + "grad_norm": 2.0604023933410645, + "learning_rate": 9.967426338963917e-05, + "loss": 2.8904, + "step": 328 + }, + { + "epoch": 0.1711340206185567, + "grad_norm": 2.0546135902404785, + "learning_rate": 9.965454893270592e-05, + "loss": 3.0603, + "step": 332 + }, + { + "epoch": 0.1731958762886598, + "grad_norm": 1.9946964979171753, + "learning_rate": 9.963425735369736e-05, + "loss": 2.917, + "step": 336 + }, + { + "epoch": 0.17525773195876287, + "grad_norm": 1.8940315246582031, + "learning_rate": 9.961338888845725e-05, + "loss": 2.9194, + "step": 340 + }, + { + "epoch": 0.177319587628866, + "grad_norm": 1.9344205856323242, + "learning_rate": 9.959194377953447e-05, + "loss": 2.9884, + "step": 344 + }, + { + "epoch": 0.17938144329896907, + "grad_norm": 1.9542304277420044, + "learning_rate": 9.956992227617995e-05, + "loss": 2.9051, + "step": 348 + }, + { + "epoch": 0.18144329896907216, + "grad_norm": 1.9588041305541992, + "learning_rate": 9.954732463434402e-05, + "loss": 2.7897, + "step": 352 + }, + { + "epoch": 0.18350515463917524, + "grad_norm": 1.7251778841018677, + "learning_rate": 9.952415111667324e-05, + "loss": 2.7511, + "step": 356 + }, + { + "epoch": 0.18556701030927836, + "grad_norm": 1.930525302886963, + "learning_rate": 9.950040199250746e-05, + "loss": 2.8544, + "step": 360 + }, + { + "epoch": 0.18762886597938144, + "grad_norm": 2.0370941162109375, + "learning_rate": 9.947607753787667e-05, + "loss": 3.0097, + "step": 364 + }, + { + "epoch": 0.18969072164948453, + "grad_norm": 2.0568158626556396, + "learning_rate": 9.945117803549774e-05, + "loss": 2.8412, + "step": 368 + }, + { + "epoch": 
0.19175257731958764, + "grad_norm": 1.9520659446716309, + "learning_rate": 9.942570377477121e-05, + "loss": 2.8904, + "step": 372 + }, + { + "epoch": 0.19381443298969073, + "grad_norm": 1.9777973890304565, + "learning_rate": 9.939965505177786e-05, + "loss": 2.9093, + "step": 376 + }, + { + "epoch": 0.1958762886597938, + "grad_norm": 2.0709102153778076, + "learning_rate": 9.937303216927534e-05, + "loss": 2.7659, + "step": 380 + }, + { + "epoch": 0.1979381443298969, + "grad_norm": 1.919057846069336, + "learning_rate": 9.934583543669453e-05, + "loss": 2.8857, + "step": 384 + }, + { + "epoch": 0.2, + "grad_norm": 1.9447135925292969, + "learning_rate": 9.931806517013612e-05, + "loss": 2.7969, + "step": 388 + }, + { + "epoch": 0.2020618556701031, + "grad_norm": 1.9707754850387573, + "learning_rate": 9.928972169236676e-05, + "loss": 2.9706, + "step": 392 + }, + { + "epoch": 0.20412371134020618, + "grad_norm": 2.1095566749572754, + "learning_rate": 9.926080533281543e-05, + "loss": 2.8079, + "step": 396 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 1.9369230270385742, + "learning_rate": 9.923131642756954e-05, + "loss": 2.759, + "step": 400 + }, + { + "epoch": 0.20824742268041238, + "grad_norm": 1.8451541662216187, + "learning_rate": 9.920125531937107e-05, + "loss": 2.8428, + "step": 404 + }, + { + "epoch": 0.21030927835051547, + "grad_norm": 1.8733794689178467, + "learning_rate": 9.917062235761259e-05, + "loss": 2.7717, + "step": 408 + }, + { + "epoch": 0.21237113402061855, + "grad_norm": 1.8459867238998413, + "learning_rate": 9.913941789833311e-05, + "loss": 2.8008, + "step": 412 + }, + { + "epoch": 0.21443298969072164, + "grad_norm": 1.8867549896240234, + "learning_rate": 9.910764230421406e-05, + "loss": 2.7071, + "step": 416 + }, + { + "epoch": 0.21649484536082475, + "grad_norm": 1.8722797632217407, + "learning_rate": 9.907529594457504e-05, + "loss": 2.8076, + "step": 420 + }, + { + "epoch": 0.21855670103092784, + "grad_norm": 1.9473061561584473, + "learning_rate": 9.904237919536945e-05, + "loss": 2.8613, + "step": 424 + }, + { + "epoch": 0.22061855670103092, + "grad_norm": 1.8926331996917725, + "learning_rate": 9.900889243918024e-05, + "loss": 2.7264, + "step": 428 + }, + { + "epoch": 0.22268041237113403, + "grad_norm": 2.013500928878784, + "learning_rate": 9.897483606521536e-05, + "loss": 2.7307, + "step": 432 + }, + { + "epoch": 0.22474226804123712, + "grad_norm": 1.7798408269882202, + "learning_rate": 9.894021046930333e-05, + "loss": 2.7089, + "step": 436 + }, + { + "epoch": 0.2268041237113402, + "grad_norm": 1.8650320768356323, + "learning_rate": 9.890501605388853e-05, + "loss": 2.812, + "step": 440 + }, + { + "epoch": 0.2288659793814433, + "grad_norm": 1.8471648693084717, + "learning_rate": 9.886925322802663e-05, + "loss": 2.8819, + "step": 444 + }, + { + "epoch": 0.2309278350515464, + "grad_norm": 1.7936201095581055, + "learning_rate": 9.883292240737978e-05, + "loss": 2.8319, + "step": 448 + }, + { + "epoch": 0.2329896907216495, + "grad_norm": 1.89383065700531, + "learning_rate": 9.87960240142118e-05, + "loss": 2.7643, + "step": 452 + }, + { + "epoch": 0.23505154639175257, + "grad_norm": 1.7932506799697876, + "learning_rate": 9.875855847738319e-05, + "loss": 2.7604, + "step": 456 + }, + { + "epoch": 0.23711340206185566, + "grad_norm": 1.8939586877822876, + "learning_rate": 9.872052623234632e-05, + "loss": 2.6743, + "step": 460 + }, + { + "epoch": 0.23917525773195877, + "grad_norm": 1.8440996408462524, + "learning_rate": 9.868192772114016e-05, + "loss": 2.6986, + "step": 464 + }, 
+ { + "epoch": 0.24123711340206186, + "grad_norm": 2.047183036804199, + "learning_rate": 9.864276339238534e-05, + "loss": 2.7008, + "step": 468 + }, + { + "epoch": 0.24329896907216494, + "grad_norm": 1.7253282070159912, + "learning_rate": 9.860303370127876e-05, + "loss": 2.7944, + "step": 472 + }, + { + "epoch": 0.24536082474226803, + "grad_norm": 1.8298146724700928, + "learning_rate": 9.856273910958847e-05, + "loss": 2.637, + "step": 476 + }, + { + "epoch": 0.24742268041237114, + "grad_norm": 1.7853765487670898, + "learning_rate": 9.852188008564813e-05, + "loss": 2.7162, + "step": 480 + }, + { + "epoch": 0.24948453608247423, + "grad_norm": 1.8834904432296753, + "learning_rate": 9.84804571043517e-05, + "loss": 2.8762, + "step": 484 + }, + { + "epoch": 0.2515463917525773, + "grad_norm": 1.9652595520019531, + "learning_rate": 9.843847064714785e-05, + "loss": 2.7794, + "step": 488 + }, + { + "epoch": 0.2536082474226804, + "grad_norm": 1.983886957168579, + "learning_rate": 9.839592120203441e-05, + "loss": 2.8331, + "step": 492 + }, + { + "epoch": 0.2556701030927835, + "grad_norm": 1.9896190166473389, + "learning_rate": 9.835280926355261e-05, + "loss": 2.8015, + "step": 496 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 1.70455002784729, + "learning_rate": 9.83091353327815e-05, + "loss": 2.7135, + "step": 500 + }, + { + "epoch": 0.2597938144329897, + "grad_norm": 2.0131478309631348, + "learning_rate": 9.826489991733194e-05, + "loss": 2.7678, + "step": 504 + }, + { + "epoch": 0.2618556701030928, + "grad_norm": 2.189298152923584, + "learning_rate": 9.822010353134081e-05, + "loss": 2.8352, + "step": 508 + }, + { + "epoch": 0.2639175257731959, + "grad_norm": 2.022023916244507, + "learning_rate": 9.817474669546501e-05, + "loss": 2.7118, + "step": 512 + }, + { + "epoch": 0.26597938144329897, + "grad_norm": 2.1792008876800537, + "learning_rate": 9.812882993687539e-05, + "loss": 2.8367, + "step": 516 + }, + { + "epoch": 0.26804123711340205, + "grad_norm": 1.8317099809646606, + "learning_rate": 9.808235378925066e-05, + "loss": 2.7326, + "step": 520 + }, + { + "epoch": 0.27010309278350514, + "grad_norm": 1.7824715375900269, + "learning_rate": 9.803531879277113e-05, + "loss": 2.7147, + "step": 524 + }, + { + "epoch": 0.2721649484536082, + "grad_norm": 1.7565789222717285, + "learning_rate": 9.798772549411252e-05, + "loss": 2.8029, + "step": 528 + }, + { + "epoch": 0.27422680412371137, + "grad_norm": 1.805213212966919, + "learning_rate": 9.793957444643951e-05, + "loss": 2.6505, + "step": 532 + }, + { + "epoch": 0.27628865979381445, + "grad_norm": 1.8401093482971191, + "learning_rate": 9.789086620939936e-05, + "loss": 2.7262, + "step": 536 + }, + { + "epoch": 0.27835051546391754, + "grad_norm": 1.7650929689407349, + "learning_rate": 9.784160134911541e-05, + "loss": 2.6988, + "step": 540 + }, + { + "epoch": 0.2804123711340206, + "grad_norm": 1.902664065361023, + "learning_rate": 9.77917804381805e-05, + "loss": 2.7028, + "step": 544 + }, + { + "epoch": 0.2824742268041237, + "grad_norm": 1.6840468645095825, + "learning_rate": 9.774140405565024e-05, + "loss": 2.7141, + "step": 548 + }, + { + "epoch": 0.2845360824742268, + "grad_norm": 1.8542104959487915, + "learning_rate": 9.769047278703644e-05, + "loss": 2.7512, + "step": 552 + }, + { + "epoch": 0.2865979381443299, + "grad_norm": 1.9419035911560059, + "learning_rate": 9.763898722430015e-05, + "loss": 2.7085, + "step": 556 + }, + { + "epoch": 0.28865979381443296, + "grad_norm": 1.8914318084716797, + "learning_rate": 9.758694796584483e-05, + "loss": 
2.7119, + "step": 560 + }, + { + "epoch": 0.2907216494845361, + "grad_norm": 2.028513193130493, + "learning_rate": 9.753435561650946e-05, + "loss": 2.7293, + "step": 564 + }, + { + "epoch": 0.2927835051546392, + "grad_norm": 1.7596354484558105, + "learning_rate": 9.748121078756137e-05, + "loss": 2.6624, + "step": 568 + }, + { + "epoch": 0.2948453608247423, + "grad_norm": 1.955361247062683, + "learning_rate": 9.742751409668929e-05, + "loss": 2.6958, + "step": 572 + }, + { + "epoch": 0.29690721649484536, + "grad_norm": 1.980281949043274, + "learning_rate": 9.737326616799605e-05, + "loss": 2.6921, + "step": 576 + }, + { + "epoch": 0.29896907216494845, + "grad_norm": 2.0537736415863037, + "learning_rate": 9.731846763199144e-05, + "loss": 2.7734, + "step": 580 + }, + { + "epoch": 0.30103092783505153, + "grad_norm": 2.0544698238372803, + "learning_rate": 9.726311912558474e-05, + "loss": 2.622, + "step": 584 + }, + { + "epoch": 0.3030927835051546, + "grad_norm": 1.8842084407806396, + "learning_rate": 9.720722129207746e-05, + "loss": 2.6809, + "step": 588 + }, + { + "epoch": 0.30515463917525776, + "grad_norm": 1.9250359535217285, + "learning_rate": 9.715077478115574e-05, + "loss": 2.7359, + "step": 592 + }, + { + "epoch": 0.30721649484536084, + "grad_norm": 1.7907671928405762, + "learning_rate": 9.709378024888292e-05, + "loss": 2.7575, + "step": 596 + }, + { + "epoch": 0.30927835051546393, + "grad_norm": 1.8203668594360352, + "learning_rate": 9.703623835769178e-05, + "loss": 2.7181, + "step": 600 + }, + { + "epoch": 0.311340206185567, + "grad_norm": 1.837925672531128, + "learning_rate": 9.697814977637696e-05, + "loss": 2.743, + "step": 604 + }, + { + "epoch": 0.3134020618556701, + "grad_norm": 1.7200803756713867, + "learning_rate": 9.691951518008715e-05, + "loss": 2.6844, + "step": 608 + }, + { + "epoch": 0.3154639175257732, + "grad_norm": 1.9397820234298706, + "learning_rate": 9.686033525031719e-05, + "loss": 2.5767, + "step": 612 + }, + { + "epoch": 0.31752577319587627, + "grad_norm": 1.9038435220718384, + "learning_rate": 9.680061067490021e-05, + "loss": 2.6503, + "step": 616 + }, + { + "epoch": 0.31958762886597936, + "grad_norm": 1.9289252758026123, + "learning_rate": 9.674034214799964e-05, + "loss": 2.6878, + "step": 620 + }, + { + "epoch": 0.3216494845360825, + "grad_norm": 1.7512463331222534, + "learning_rate": 9.667953037010108e-05, + "loss": 2.7551, + "step": 624 + }, + { + "epoch": 0.3237113402061856, + "grad_norm": 1.9182549715042114, + "learning_rate": 9.661817604800421e-05, + "loss": 2.7986, + "step": 628 + }, + { + "epoch": 0.32577319587628867, + "grad_norm": 1.8200740814208984, + "learning_rate": 9.655627989481458e-05, + "loss": 2.6923, + "step": 632 + }, + { + "epoch": 0.32783505154639175, + "grad_norm": 1.8738125562667847, + "learning_rate": 9.649384262993525e-05, + "loss": 2.7518, + "step": 636 + }, + { + "epoch": 0.32989690721649484, + "grad_norm": 1.9261345863342285, + "learning_rate": 9.64308649790586e-05, + "loss": 2.6037, + "step": 640 + }, + { + "epoch": 0.3319587628865979, + "grad_norm": 1.7359293699264526, + "learning_rate": 9.636734767415763e-05, + "loss": 2.7102, + "step": 644 + }, + { + "epoch": 0.334020618556701, + "grad_norm": 2.001077175140381, + "learning_rate": 9.630329145347767e-05, + "loss": 2.7068, + "step": 648 + }, + { + "epoch": 0.33608247422680415, + "grad_norm": 1.7505935430526733, + "learning_rate": 9.623869706152777e-05, + "loss": 2.6348, + "step": 652 + }, + { + "epoch": 0.33814432989690724, + "grad_norm": 1.9153965711593628, + "learning_rate": 
9.617356524907193e-05, + "loss": 2.6874, + "step": 656 + }, + { + "epoch": 0.3402061855670103, + "grad_norm": 1.805038332939148, + "learning_rate": 9.61078967731205e-05, + "loss": 2.5985, + "step": 660 + }, + { + "epoch": 0.3422680412371134, + "grad_norm": 1.7926819324493408, + "learning_rate": 9.604169239692133e-05, + "loss": 2.7203, + "step": 664 + }, + { + "epoch": 0.3443298969072165, + "grad_norm": 1.8701540231704712, + "learning_rate": 9.597495288995089e-05, + "loss": 2.6819, + "step": 668 + }, + { + "epoch": 0.3463917525773196, + "grad_norm": 2.0685527324676514, + "learning_rate": 9.590767902790529e-05, + "loss": 2.7401, + "step": 672 + }, + { + "epoch": 0.34845360824742266, + "grad_norm": 1.7549872398376465, + "learning_rate": 9.583987159269143e-05, + "loss": 2.5833, + "step": 676 + }, + { + "epoch": 0.35051546391752575, + "grad_norm": 1.8161414861679077, + "learning_rate": 9.577153137241765e-05, + "loss": 2.7274, + "step": 680 + }, + { + "epoch": 0.3525773195876289, + "grad_norm": 1.7566990852355957, + "learning_rate": 9.570265916138484e-05, + "loss": 2.7536, + "step": 684 + }, + { + "epoch": 0.354639175257732, + "grad_norm": 1.8565185070037842, + "learning_rate": 9.563325576007701e-05, + "loss": 2.6785, + "step": 688 + }, + { + "epoch": 0.35670103092783506, + "grad_norm": 1.7326369285583496, + "learning_rate": 9.556332197515207e-05, + "loss": 2.7096, + "step": 692 + }, + { + "epoch": 0.35876288659793815, + "grad_norm": 1.705234408378601, + "learning_rate": 9.549285861943247e-05, + "loss": 2.7314, + "step": 696 + }, + { + "epoch": 0.36082474226804123, + "grad_norm": 1.9541505575180054, + "learning_rate": 9.542186651189569e-05, + "loss": 2.6204, + "step": 700 + }, + { + "epoch": 0.3628865979381443, + "grad_norm": 1.8346929550170898, + "learning_rate": 9.535034647766476e-05, + "loss": 2.6913, + "step": 704 + }, + { + "epoch": 0.3649484536082474, + "grad_norm": 1.9132497310638428, + "learning_rate": 9.527829934799869e-05, + "loss": 2.7041, + "step": 708 + }, + { + "epoch": 0.3670103092783505, + "grad_norm": 1.763213038444519, + "learning_rate": 9.520572596028278e-05, + "loss": 2.6216, + "step": 712 + }, + { + "epoch": 0.36907216494845363, + "grad_norm": 1.9224257469177246, + "learning_rate": 9.513262715801887e-05, + "loss": 2.6632, + "step": 716 + }, + { + "epoch": 0.3711340206185567, + "grad_norm": 1.8817148208618164, + "learning_rate": 9.505900379081559e-05, + "loss": 2.6868, + "step": 720 + }, + { + "epoch": 0.3731958762886598, + "grad_norm": 1.7395341396331787, + "learning_rate": 9.498485671437842e-05, + "loss": 2.5171, + "step": 724 + }, + { + "epoch": 0.3752577319587629, + "grad_norm": 1.7571380138397217, + "learning_rate": 9.491018679049981e-05, + "loss": 2.6194, + "step": 728 + }, + { + "epoch": 0.37731958762886597, + "grad_norm": 1.8208237886428833, + "learning_rate": 9.48349948870491e-05, + "loss": 2.6101, + "step": 732 + }, + { + "epoch": 0.37938144329896906, + "grad_norm": 1.888909935951233, + "learning_rate": 9.47592818779625e-05, + "loss": 2.6569, + "step": 736 + }, + { + "epoch": 0.38144329896907214, + "grad_norm": 1.7806968688964844, + "learning_rate": 9.468304864323288e-05, + "loss": 2.5329, + "step": 740 + }, + { + "epoch": 0.3835051546391753, + "grad_norm": 1.8010936975479126, + "learning_rate": 9.460629606889952e-05, + "loss": 2.6971, + "step": 744 + }, + { + "epoch": 0.38556701030927837, + "grad_norm": 1.7562525272369385, + "learning_rate": 9.452902504703793e-05, + "loss": 2.676, + "step": 748 + }, + { + "epoch": 0.38762886597938145, + "grad_norm": 
1.828466534614563, + "learning_rate": 9.445123647574936e-05, + "loss": 2.5952, + "step": 752 + }, + { + "epoch": 0.38969072164948454, + "grad_norm": 1.7644858360290527, + "learning_rate": 9.437293125915037e-05, + "loss": 2.6475, + "step": 756 + }, + { + "epoch": 0.3917525773195876, + "grad_norm": 1.6723216772079468, + "learning_rate": 9.429411030736242e-05, + "loss": 2.6823, + "step": 760 + }, + { + "epoch": 0.3938144329896907, + "grad_norm": 1.7757784128189087, + "learning_rate": 9.421477453650118e-05, + "loss": 2.5711, + "step": 764 + }, + { + "epoch": 0.3958762886597938, + "grad_norm": 1.647575855255127, + "learning_rate": 9.413492486866598e-05, + "loss": 2.6141, + "step": 768 + }, + { + "epoch": 0.3979381443298969, + "grad_norm": 1.9347645044326782, + "learning_rate": 9.405456223192897e-05, + "loss": 2.6708, + "step": 772 + }, + { + "epoch": 0.4, + "grad_norm": 1.838582158088684, + "learning_rate": 9.397368756032445e-05, + "loss": 2.7496, + "step": 776 + }, + { + "epoch": 0.4020618556701031, + "grad_norm": 1.8382269144058228, + "learning_rate": 9.389230179383801e-05, + "loss": 2.5654, + "step": 780 + }, + { + "epoch": 0.4041237113402062, + "grad_norm": 1.8177680969238281, + "learning_rate": 9.381040587839548e-05, + "loss": 2.6553, + "step": 784 + }, + { + "epoch": 0.4061855670103093, + "grad_norm": 1.7643057107925415, + "learning_rate": 9.372800076585207e-05, + "loss": 2.5951, + "step": 788 + }, + { + "epoch": 0.40824742268041236, + "grad_norm": 1.8181430101394653, + "learning_rate": 9.364508741398127e-05, + "loss": 2.6452, + "step": 792 + }, + { + "epoch": 0.41030927835051545, + "grad_norm": 1.8445159196853638, + "learning_rate": 9.356166678646366e-05, + "loss": 2.6172, + "step": 796 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 1.6617608070373535, + "learning_rate": 9.347773985287578e-05, + "loss": 2.5781, + "step": 800 + }, + { + "epoch": 0.4144329896907217, + "grad_norm": 1.9522396326065063, + "learning_rate": 9.339330758867883e-05, + "loss": 2.6309, + "step": 804 + }, + { + "epoch": 0.41649484536082476, + "grad_norm": 2.5656485557556152, + "learning_rate": 9.330837097520738e-05, + "loss": 2.6967, + "step": 808 + }, + { + "epoch": 0.41855670103092785, + "grad_norm": 1.6885048151016235, + "learning_rate": 9.322293099965784e-05, + "loss": 2.617, + "step": 812 + }, + { + "epoch": 0.42061855670103093, + "grad_norm": 1.7637752294540405, + "learning_rate": 9.313698865507713e-05, + "loss": 2.6538, + "step": 816 + }, + { + "epoch": 0.422680412371134, + "grad_norm": 1.7737189531326294, + "learning_rate": 9.305054494035106e-05, + "loss": 2.6113, + "step": 820 + }, + { + "epoch": 0.4247422680412371, + "grad_norm": 1.8070309162139893, + "learning_rate": 9.296360086019272e-05, + "loss": 2.5118, + "step": 824 + }, + { + "epoch": 0.4268041237113402, + "grad_norm": 1.73598313331604, + "learning_rate": 9.287615742513086e-05, + "loss": 2.5826, + "step": 828 + }, + { + "epoch": 0.4288659793814433, + "grad_norm": 1.688819169998169, + "learning_rate": 9.278821565149806e-05, + "loss": 2.5375, + "step": 832 + }, + { + "epoch": 0.4309278350515464, + "grad_norm": 1.7314485311508179, + "learning_rate": 9.269977656141898e-05, + "loss": 2.5819, + "step": 836 + }, + { + "epoch": 0.4329896907216495, + "grad_norm": 1.657240390777588, + "learning_rate": 9.261084118279847e-05, + "loss": 2.6643, + "step": 840 + }, + { + "epoch": 0.4350515463917526, + "grad_norm": 1.7966091632843018, + "learning_rate": 9.25214105493096e-05, + "loss": 2.7022, + "step": 844 + }, + { + "epoch": 0.43711340206185567, + 
"grad_norm": 1.732354760169983, + "learning_rate": 9.243148570038164e-05, + "loss": 2.6098, + "step": 848 + }, + { + "epoch": 0.43917525773195876, + "grad_norm": 1.737281322479248, + "learning_rate": 9.234106768118809e-05, + "loss": 2.5879, + "step": 852 + }, + { + "epoch": 0.44123711340206184, + "grad_norm": 1.7843495607376099, + "learning_rate": 9.225015754263431e-05, + "loss": 2.5793, + "step": 856 + }, + { + "epoch": 0.44329896907216493, + "grad_norm": 1.7821077108383179, + "learning_rate": 9.215875634134552e-05, + "loss": 2.6578, + "step": 860 + }, + { + "epoch": 0.44536082474226807, + "grad_norm": 1.8063935041427612, + "learning_rate": 9.206686513965445e-05, + "loss": 2.7075, + "step": 864 + }, + { + "epoch": 0.44742268041237115, + "grad_norm": 1.5605168342590332, + "learning_rate": 9.19744850055889e-05, + "loss": 2.5988, + "step": 868 + }, + { + "epoch": 0.44948453608247424, + "grad_norm": 2.078798294067383, + "learning_rate": 9.188161701285949e-05, + "loss": 2.6126, + "step": 872 + }, + { + "epoch": 0.4515463917525773, + "grad_norm": 1.7260762453079224, + "learning_rate": 9.178826224084705e-05, + "loss": 2.6352, + "step": 876 + }, + { + "epoch": 0.4536082474226804, + "grad_norm": 1.7088130712509155, + "learning_rate": 9.169442177459011e-05, + "loss": 2.6324, + "step": 880 + }, + { + "epoch": 0.4556701030927835, + "grad_norm": 1.8762470483779907, + "learning_rate": 9.160009670477234e-05, + "loss": 2.517, + "step": 884 + }, + { + "epoch": 0.4577319587628866, + "grad_norm": 1.9028972387313843, + "learning_rate": 9.150528812770981e-05, + "loss": 2.5386, + "step": 888 + }, + { + "epoch": 0.45979381443298967, + "grad_norm": 1.5713077783584595, + "learning_rate": 9.140999714533827e-05, + "loss": 2.6347, + "step": 892 + }, + { + "epoch": 0.4618556701030928, + "grad_norm": 1.603164792060852, + "learning_rate": 9.131422486520034e-05, + "loss": 2.5138, + "step": 896 + }, + { + "epoch": 0.4639175257731959, + "grad_norm": 1.7028288841247559, + "learning_rate": 9.121797240043267e-05, + "loss": 2.5366, + "step": 900 + }, + { + "epoch": 0.465979381443299, + "grad_norm": 1.7099529504776, + "learning_rate": 9.11212408697529e-05, + "loss": 2.6465, + "step": 904 + }, + { + "epoch": 0.46804123711340206, + "grad_norm": 1.7134875059127808, + "learning_rate": 9.102403139744683e-05, + "loss": 2.6331, + "step": 908 + }, + { + "epoch": 0.47010309278350515, + "grad_norm": 1.6669530868530273, + "learning_rate": 9.092634511335519e-05, + "loss": 2.5381, + "step": 912 + }, + { + "epoch": 0.47216494845360824, + "grad_norm": 1.7473549842834473, + "learning_rate": 9.082818315286055e-05, + "loss": 2.584, + "step": 916 + }, + { + "epoch": 0.4742268041237113, + "grad_norm": 1.673309087753296, + "learning_rate": 9.07295466568742e-05, + "loss": 2.4649, + "step": 920 + }, + { + "epoch": 0.4762886597938144, + "grad_norm": 1.8044261932373047, + "learning_rate": 9.063043677182283e-05, + "loss": 2.6512, + "step": 924 + }, + { + "epoch": 0.47835051546391755, + "grad_norm": 1.7258899211883545, + "learning_rate": 9.053085464963518e-05, + "loss": 2.5977, + "step": 928 + }, + { + "epoch": 0.48041237113402063, + "grad_norm": 1.7187895774841309, + "learning_rate": 9.043080144772868e-05, + "loss": 2.5948, + "step": 932 + }, + { + "epoch": 0.4824742268041237, + "grad_norm": 1.797176718711853, + "learning_rate": 9.033027832899601e-05, + "loss": 2.659, + "step": 936 + }, + { + "epoch": 0.4845360824742268, + "grad_norm": 1.6816799640655518, + "learning_rate": 9.022928646179159e-05, + "loss": 2.6123, + "step": 940 + }, + { + "epoch": 
0.4865979381443299, + "grad_norm": 1.7294647693634033, + "learning_rate": 9.012782701991795e-05, + "loss": 2.5824, + "step": 944 + }, + { + "epoch": 0.488659793814433, + "grad_norm": 1.6819491386413574, + "learning_rate": 9.002590118261216e-05, + "loss": 2.5717, + "step": 948 + }, + { + "epoch": 0.49072164948453606, + "grad_norm": 1.667133092880249, + "learning_rate": 8.992351013453204e-05, + "loss": 2.6282, + "step": 952 + }, + { + "epoch": 0.4927835051546392, + "grad_norm": 1.6454589366912842, + "learning_rate": 8.982065506574247e-05, + "loss": 2.5939, + "step": 956 + }, + { + "epoch": 0.4948453608247423, + "grad_norm": 1.8332362174987793, + "learning_rate": 8.971733717170148e-05, + "loss": 2.4949, + "step": 960 + }, + { + "epoch": 0.49690721649484537, + "grad_norm": 6.3988037109375, + "learning_rate": 8.961355765324648e-05, + "loss": 2.6143, + "step": 964 + }, + { + "epoch": 0.49896907216494846, + "grad_norm": 1.6663529872894287, + "learning_rate": 8.950931771658014e-05, + "loss": 2.5308, + "step": 968 + }, + { + "epoch": 0.5010309278350515, + "grad_norm": 1.6272313594818115, + "learning_rate": 8.940461857325647e-05, + "loss": 2.4458, + "step": 972 + }, + { + "epoch": 0.5030927835051546, + "grad_norm": 1.758596658706665, + "learning_rate": 8.929946144016677e-05, + "loss": 2.517, + "step": 976 + }, + { + "epoch": 0.5051546391752577, + "grad_norm": 1.6491873264312744, + "learning_rate": 8.919384753952538e-05, + "loss": 2.5408, + "step": 980 + }, + { + "epoch": 0.5072164948453608, + "grad_norm": 1.5984055995941162, + "learning_rate": 8.908777809885557e-05, + "loss": 2.5359, + "step": 984 + }, + { + "epoch": 0.5092783505154639, + "grad_norm": 1.651795744895935, + "learning_rate": 8.898125435097521e-05, + "loss": 2.5151, + "step": 988 + }, + { + "epoch": 0.511340206185567, + "grad_norm": 1.6190180778503418, + "learning_rate": 8.887427753398248e-05, + "loss": 2.6733, + "step": 992 + }, + { + "epoch": 0.51340206185567, + "grad_norm": 1.6881654262542725, + "learning_rate": 8.876684889124145e-05, + "loss": 2.5783, + "step": 996 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 1.786693811416626, + "learning_rate": 8.865896967136766e-05, + "loss": 2.6061, + "step": 1000 + }, + { + "epoch": 0.5175257731958763, + "grad_norm": 1.841579556465149, + "learning_rate": 8.855064112821361e-05, + "loss": 2.6615, + "step": 1004 + }, + { + "epoch": 0.5195876288659794, + "grad_norm": 1.5715938806533813, + "learning_rate": 8.844186452085412e-05, + "loss": 2.6044, + "step": 1008 + }, + { + "epoch": 0.5216494845360825, + "grad_norm": 1.616853952407837, + "learning_rate": 8.83326411135718e-05, + "loss": 2.538, + "step": 1012 + }, + { + "epoch": 0.5237113402061856, + "grad_norm": 1.7380198240280151, + "learning_rate": 8.822297217584225e-05, + "loss": 2.5838, + "step": 1016 + }, + { + "epoch": 0.5257731958762887, + "grad_norm": 1.703635334968567, + "learning_rate": 8.81128589823194e-05, + "loss": 2.5035, + "step": 1020 + }, + { + "epoch": 0.5278350515463918, + "grad_norm": 1.9964070320129395, + "learning_rate": 8.80023028128206e-05, + "loss": 2.544, + "step": 1024 + }, + { + "epoch": 0.5298969072164949, + "grad_norm": 1.5779422521591187, + "learning_rate": 8.789130495231186e-05, + "loss": 2.6569, + "step": 1028 + }, + { + "epoch": 0.5319587628865979, + "grad_norm": 1.771633505821228, + "learning_rate": 8.77798666908928e-05, + "loss": 2.5315, + "step": 1032 + }, + { + "epoch": 0.534020618556701, + "grad_norm": 1.7763731479644775, + "learning_rate": 8.766798932378172e-05, + "loss": 2.5138, + "step": 1036 + }, + { 
+ "epoch": 0.5360824742268041, + "grad_norm": 1.699804425239563, + "learning_rate": 8.755567415130058e-05, + "loss": 2.5168, + "step": 1040 + }, + { + "epoch": 0.5381443298969072, + "grad_norm": 1.8998210430145264, + "learning_rate": 8.744292247885975e-05, + "loss": 2.4795, + "step": 1044 + }, + { + "epoch": 0.5402061855670103, + "grad_norm": 1.7309194803237915, + "learning_rate": 8.732973561694297e-05, + "loss": 2.5866, + "step": 1048 + }, + { + "epoch": 0.5422680412371134, + "grad_norm": 1.667391061782837, + "learning_rate": 8.721611488109212e-05, + "loss": 2.5016, + "step": 1052 + }, + { + "epoch": 0.5443298969072164, + "grad_norm": 1.7690322399139404, + "learning_rate": 8.710206159189182e-05, + "loss": 2.6189, + "step": 1056 + }, + { + "epoch": 0.5463917525773195, + "grad_norm": 1.6563546657562256, + "learning_rate": 8.698757707495417e-05, + "loss": 2.4559, + "step": 1060 + }, + { + "epoch": 0.5484536082474227, + "grad_norm": 1.8325282335281372, + "learning_rate": 8.687266266090333e-05, + "loss": 2.546, + "step": 1064 + }, + { + "epoch": 0.5505154639175258, + "grad_norm": 1.6227245330810547, + "learning_rate": 8.675731968536002e-05, + "loss": 2.5709, + "step": 1068 + }, + { + "epoch": 0.5525773195876289, + "grad_norm": 1.6123030185699463, + "learning_rate": 8.664154948892607e-05, + "loss": 2.5047, + "step": 1072 + }, + { + "epoch": 0.554639175257732, + "grad_norm": 1.631011724472046, + "learning_rate": 8.65253534171687e-05, + "loss": 2.4568, + "step": 1076 + }, + { + "epoch": 0.5567010309278351, + "grad_norm": 1.746801495552063, + "learning_rate": 8.640873282060506e-05, + "loss": 2.5825, + "step": 1080 + }, + { + "epoch": 0.5587628865979382, + "grad_norm": 1.727461338043213, + "learning_rate": 8.629168905468641e-05, + "loss": 2.5724, + "step": 1084 + }, + { + "epoch": 0.5608247422680412, + "grad_norm": 1.7059168815612793, + "learning_rate": 8.617422347978239e-05, + "loss": 2.5469, + "step": 1088 + }, + { + "epoch": 0.5628865979381443, + "grad_norm": 1.6282740831375122, + "learning_rate": 8.605633746116519e-05, + "loss": 2.5503, + "step": 1092 + }, + { + "epoch": 0.5649484536082474, + "grad_norm": 1.5870437622070312, + "learning_rate": 8.593803236899379e-05, + "loss": 2.5664, + "step": 1096 + }, + { + "epoch": 0.5670103092783505, + "grad_norm": 1.7030541896820068, + "learning_rate": 8.581930957829786e-05, + "loss": 2.4574, + "step": 1100 + }, + { + "epoch": 0.5690721649484536, + "grad_norm": 1.6299339532852173, + "learning_rate": 8.570017046896197e-05, + "loss": 2.5025, + "step": 1104 + }, + { + "epoch": 0.5711340206185567, + "grad_norm": 1.648901104927063, + "learning_rate": 8.558061642570936e-05, + "loss": 2.5028, + "step": 1108 + }, + { + "epoch": 0.5731958762886598, + "grad_norm": 1.5893465280532837, + "learning_rate": 8.5460648838086e-05, + "loss": 2.4432, + "step": 1112 + }, + { + "epoch": 0.5752577319587628, + "grad_norm": 1.686439871788025, + "learning_rate": 8.534026910044435e-05, + "loss": 2.4683, + "step": 1116 + }, + { + "epoch": 0.5773195876288659, + "grad_norm": 1.687406301498413, + "learning_rate": 8.521947861192723e-05, + "loss": 2.4717, + "step": 1120 + }, + { + "epoch": 0.5793814432989691, + "grad_norm": 1.7713639736175537, + "learning_rate": 8.509827877645144e-05, + "loss": 2.6082, + "step": 1124 + }, + { + "epoch": 0.5814432989690722, + "grad_norm": 1.7374292612075806, + "learning_rate": 8.49766710026916e-05, + "loss": 2.4267, + "step": 1128 + }, + { + "epoch": 0.5835051546391753, + "grad_norm": 1.5629856586456299, + "learning_rate": 8.48546567040636e-05, + "loss": 
2.4698, + "step": 1132 + }, + { + "epoch": 0.5855670103092784, + "grad_norm": 1.6608953475952148, + "learning_rate": 8.473223729870834e-05, + "loss": 2.4221, + "step": 1136 + }, + { + "epoch": 0.5876288659793815, + "grad_norm": 1.6824475526809692, + "learning_rate": 8.460941420947514e-05, + "loss": 2.5411, + "step": 1140 + }, + { + "epoch": 0.5896907216494846, + "grad_norm": 1.590063214302063, + "learning_rate": 8.448618886390522e-05, + "loss": 2.4957, + "step": 1144 + }, + { + "epoch": 0.5917525773195876, + "grad_norm": 1.6329950094223022, + "learning_rate": 8.436256269421515e-05, + "loss": 2.489, + "step": 1148 + }, + { + "epoch": 0.5938144329896907, + "grad_norm": 1.7138408422470093, + "learning_rate": 8.423853713728016e-05, + "loss": 2.4355, + "step": 1152 + }, + { + "epoch": 0.5958762886597938, + "grad_norm": 1.707245111465454, + "learning_rate": 8.411411363461745e-05, + "loss": 2.4351, + "step": 1156 + }, + { + "epoch": 0.5979381443298969, + "grad_norm": 1.6063274145126343, + "learning_rate": 8.398929363236948e-05, + "loss": 2.4771, + "step": 1160 + }, + { + "epoch": 0.6, + "grad_norm": 1.62711763381958, + "learning_rate": 8.386407858128706e-05, + "loss": 2.4136, + "step": 1164 + }, + { + "epoch": 0.6020618556701031, + "grad_norm": 1.7263373136520386, + "learning_rate": 8.373846993671261e-05, + "loss": 2.6003, + "step": 1168 + }, + { + "epoch": 0.6041237113402061, + "grad_norm": 1.6953507661819458, + "learning_rate": 8.361246915856314e-05, + "loss": 2.5567, + "step": 1172 + }, + { + "epoch": 0.6061855670103092, + "grad_norm": 1.6107468605041504, + "learning_rate": 8.348607771131336e-05, + "loss": 2.3889, + "step": 1176 + }, + { + "epoch": 0.6082474226804123, + "grad_norm": 1.5828678607940674, + "learning_rate": 8.335929706397863e-05, + "loss": 2.4063, + "step": 1180 + }, + { + "epoch": 0.6103092783505155, + "grad_norm": 1.6693129539489746, + "learning_rate": 8.323212869009782e-05, + "loss": 2.4474, + "step": 1184 + }, + { + "epoch": 0.6123711340206186, + "grad_norm": 1.575134515762329, + "learning_rate": 8.310457406771635e-05, + "loss": 2.5156, + "step": 1188 + }, + { + "epoch": 0.6144329896907217, + "grad_norm": 1.600196123123169, + "learning_rate": 8.297663467936882e-05, + "loss": 2.4151, + "step": 1192 + }, + { + "epoch": 0.6164948453608248, + "grad_norm": 1.6348146200180054, + "learning_rate": 8.28483120120619e-05, + "loss": 2.4336, + "step": 1196 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 1.6424046754837036, + "learning_rate": 8.271960755725702e-05, + "loss": 2.45, + "step": 1200 + }, + { + "epoch": 0.6206185567010309, + "grad_norm": 1.6451953649520874, + "learning_rate": 8.2590522810853e-05, + "loss": 2.4414, + "step": 1204 + }, + { + "epoch": 0.622680412371134, + "grad_norm": 1.6768877506256104, + "learning_rate": 8.246105927316874e-05, + "loss": 2.459, + "step": 1208 + }, + { + "epoch": 0.6247422680412371, + "grad_norm": 1.7277884483337402, + "learning_rate": 8.233121844892568e-05, + "loss": 2.4398, + "step": 1212 + }, + { + "epoch": 0.6268041237113402, + "grad_norm": 1.6326000690460205, + "learning_rate": 8.220100184723038e-05, + "loss": 2.5278, + "step": 1216 + }, + { + "epoch": 0.6288659793814433, + "grad_norm": 1.6363636255264282, + "learning_rate": 8.2070410981557e-05, + "loss": 2.466, + "step": 1220 + }, + { + "epoch": 0.6309278350515464, + "grad_norm": 1.7234989404678345, + "learning_rate": 8.193944736972963e-05, + "loss": 2.4129, + "step": 1224 + }, + { + "epoch": 0.6329896907216495, + "grad_norm": 1.6150864362716675, + "learning_rate": 
8.180811253390472e-05, + "loss": 2.4872, + "step": 1228 + }, + { + "epoch": 0.6350515463917525, + "grad_norm": 1.7305302619934082, + "learning_rate": 8.167640800055335e-05, + "loss": 2.5246, + "step": 1232 + }, + { + "epoch": 0.6371134020618556, + "grad_norm": 1.7068026065826416, + "learning_rate": 8.15443353004435e-05, + "loss": 2.4428, + "step": 1236 + }, + { + "epoch": 0.6391752577319587, + "grad_norm": 1.7080657482147217, + "learning_rate": 8.141189596862225e-05, + "loss": 2.3974, + "step": 1240 + }, + { + "epoch": 0.6412371134020619, + "grad_norm": 1.690262794494629, + "learning_rate": 8.127909154439796e-05, + "loss": 2.5146, + "step": 1244 + }, + { + "epoch": 0.643298969072165, + "grad_norm": 1.7586865425109863, + "learning_rate": 8.114592357132236e-05, + "loss": 2.459, + "step": 1248 + }, + { + "epoch": 0.6453608247422681, + "grad_norm": 1.6623564958572388, + "learning_rate": 8.101239359717263e-05, + "loss": 2.4779, + "step": 1252 + }, + { + "epoch": 0.6474226804123712, + "grad_norm": 1.694476842880249, + "learning_rate": 8.087850317393335e-05, + "loss": 2.4863, + "step": 1256 + }, + { + "epoch": 0.6494845360824743, + "grad_norm": 1.7273415327072144, + "learning_rate": 8.074425385777857e-05, + "loss": 2.5528, + "step": 1260 + }, + { + "epoch": 0.6515463917525773, + "grad_norm": 1.61915922164917, + "learning_rate": 8.06096472090536e-05, + "loss": 2.5224, + "step": 1264 + }, + { + "epoch": 0.6536082474226804, + "grad_norm": 1.64993417263031, + "learning_rate": 8.047468479225699e-05, + "loss": 2.475, + "step": 1268 + }, + { + "epoch": 0.6556701030927835, + "grad_norm": 1.6789292097091675, + "learning_rate": 8.033936817602225e-05, + "loss": 2.5414, + "step": 1272 + }, + { + "epoch": 0.6577319587628866, + "grad_norm": 1.534625768661499, + "learning_rate": 8.020369893309969e-05, + "loss": 2.5186, + "step": 1276 + }, + { + "epoch": 0.6597938144329897, + "grad_norm": 1.6226372718811035, + "learning_rate": 8.006767864033805e-05, + "loss": 2.5079, + "step": 1280 + }, + { + "epoch": 0.6618556701030928, + "grad_norm": 1.6352249383926392, + "learning_rate": 7.993130887866631e-05, + "loss": 2.4257, + "step": 1284 + }, + { + "epoch": 0.6639175257731958, + "grad_norm": 1.6394164562225342, + "learning_rate": 7.97945912330752e-05, + "loss": 2.4692, + "step": 1288 + }, + { + "epoch": 0.6659793814432989, + "grad_norm": 1.7224717140197754, + "learning_rate": 7.965752729259881e-05, + "loss": 2.5426, + "step": 1292 + }, + { + "epoch": 0.668041237113402, + "grad_norm": 1.662453532218933, + "learning_rate": 7.952011865029614e-05, + "loss": 2.5418, + "step": 1296 + }, + { + "epoch": 0.6701030927835051, + "grad_norm": 1.5507217645645142, + "learning_rate": 7.938236690323255e-05, + "loss": 2.3688, + "step": 1300 + }, + { + "epoch": 0.6721649484536083, + "grad_norm": 1.6199603080749512, + "learning_rate": 7.924427365246125e-05, + "loss": 2.4501, + "step": 1304 + }, + { + "epoch": 0.6742268041237114, + "grad_norm": 1.5297261476516724, + "learning_rate": 7.910584050300465e-05, + "loss": 2.4709, + "step": 1308 + }, + { + "epoch": 0.6762886597938145, + "grad_norm": 1.5516424179077148, + "learning_rate": 7.896706906383568e-05, + "loss": 2.4583, + "step": 1312 + }, + { + "epoch": 0.6783505154639176, + "grad_norm": 1.5144871473312378, + "learning_rate": 7.882796094785918e-05, + "loss": 2.4077, + "step": 1316 + }, + { + "epoch": 0.6804123711340206, + "grad_norm": 1.6823511123657227, + "learning_rate": 7.868851777189306e-05, + "loss": 2.4394, + "step": 1320 + }, + { + "epoch": 0.6824742268041237, + "grad_norm": 
1.5894042253494263, + "learning_rate": 7.854874115664957e-05, + "loss": 2.4256, + "step": 1324 + }, + { + "epoch": 0.6845360824742268, + "grad_norm": 1.682361125946045, + "learning_rate": 7.84086327267164e-05, + "loss": 2.4082, + "step": 1328 + }, + { + "epoch": 0.6865979381443299, + "grad_norm": 1.6730499267578125, + "learning_rate": 7.826819411053787e-05, + "loss": 2.43, + "step": 1332 + }, + { + "epoch": 0.688659793814433, + "grad_norm": 1.5576122999191284, + "learning_rate": 7.812742694039599e-05, + "loss": 2.4109, + "step": 1336 + }, + { + "epoch": 0.6907216494845361, + "grad_norm": 1.735579013824463, + "learning_rate": 7.798633285239141e-05, + "loss": 2.5218, + "step": 1340 + }, + { + "epoch": 0.6927835051546392, + "grad_norm": 1.5843349695205688, + "learning_rate": 7.784491348642452e-05, + "loss": 2.4067, + "step": 1344 + }, + { + "epoch": 0.6948453608247422, + "grad_norm": 1.5844374895095825, + "learning_rate": 7.770317048617631e-05, + "loss": 2.4879, + "step": 1348 + }, + { + "epoch": 0.6969072164948453, + "grad_norm": 1.6641557216644287, + "learning_rate": 7.756110549908924e-05, + "loss": 2.4427, + "step": 1352 + }, + { + "epoch": 0.6989690721649484, + "grad_norm": 1.567859172821045, + "learning_rate": 7.741872017634824e-05, + "loss": 2.458, + "step": 1356 + }, + { + "epoch": 0.7010309278350515, + "grad_norm": 1.649179220199585, + "learning_rate": 7.727601617286132e-05, + "loss": 2.4483, + "step": 1360 + }, + { + "epoch": 0.7030927835051546, + "grad_norm": 1.6399873495101929, + "learning_rate": 7.713299514724051e-05, + "loss": 2.5019, + "step": 1364 + }, + { + "epoch": 0.7051546391752578, + "grad_norm": 1.724725365638733, + "learning_rate": 7.698965876178246e-05, + "loss": 2.4014, + "step": 1368 + }, + { + "epoch": 0.7072164948453609, + "grad_norm": 1.734067440032959, + "learning_rate": 7.68460086824492e-05, + "loss": 2.3332, + "step": 1372 + }, + { + "epoch": 0.709278350515464, + "grad_norm": 1.5960577726364136, + "learning_rate": 7.67020465788487e-05, + "loss": 2.4735, + "step": 1376 + }, + { + "epoch": 0.711340206185567, + "grad_norm": 1.6822943687438965, + "learning_rate": 7.655777412421554e-05, + "loss": 2.515, + "step": 1380 + }, + { + "epoch": 0.7134020618556701, + "grad_norm": 1.7459498643875122, + "learning_rate": 7.641319299539145e-05, + "loss": 2.4009, + "step": 1384 + }, + { + "epoch": 0.7154639175257732, + "grad_norm": 1.5728472471237183, + "learning_rate": 7.626830487280573e-05, + "loss": 2.3888, + "step": 1388 + }, + { + "epoch": 0.7175257731958763, + "grad_norm": 1.621801495552063, + "learning_rate": 7.612311144045588e-05, + "loss": 2.4036, + "step": 1392 + }, + { + "epoch": 0.7195876288659794, + "grad_norm": 1.6353496313095093, + "learning_rate": 7.597761438588784e-05, + "loss": 2.4504, + "step": 1396 + }, + { + "epoch": 0.7216494845360825, + "grad_norm": 1.5395163297653198, + "learning_rate": 7.583181540017656e-05, + "loss": 2.3526, + "step": 1400 + }, + { + "epoch": 0.7237113402061855, + "grad_norm": 1.6341197490692139, + "learning_rate": 7.568571617790624e-05, + "loss": 2.4689, + "step": 1404 + }, + { + "epoch": 0.7257731958762886, + "grad_norm": 1.5934208631515503, + "learning_rate": 7.553931841715061e-05, + "loss": 2.5371, + "step": 1408 + }, + { + "epoch": 0.7278350515463917, + "grad_norm": 1.6311752796173096, + "learning_rate": 7.539262381945323e-05, + "loss": 2.4693, + "step": 1412 + }, + { + "epoch": 0.7298969072164948, + "grad_norm": 1.6778818368911743, + "learning_rate": 7.52456340898078e-05, + "loss": 2.4041, + "step": 1416 + }, + { + "epoch": 
0.7319587628865979, + "grad_norm": 1.5758112668991089, + "learning_rate": 7.509835093663821e-05, + "loss": 2.4328, + "step": 1420 + }, + { + "epoch": 0.734020618556701, + "grad_norm": 1.6166560649871826, + "learning_rate": 7.495077607177872e-05, + "loss": 2.386, + "step": 1424 + }, + { + "epoch": 0.7360824742268042, + "grad_norm": 1.4999052286148071, + "learning_rate": 7.48029112104541e-05, + "loss": 2.3724, + "step": 1428 + }, + { + "epoch": 0.7381443298969073, + "grad_norm": 1.6186021566390991, + "learning_rate": 7.465475807125968e-05, + "loss": 2.3181, + "step": 1432 + }, + { + "epoch": 0.7402061855670103, + "grad_norm": 1.7516733407974243, + "learning_rate": 7.450631837614138e-05, + "loss": 2.4565, + "step": 1436 + }, + { + "epoch": 0.7422680412371134, + "grad_norm": 1.647589087486267, + "learning_rate": 7.435759385037565e-05, + "loss": 2.4047, + "step": 1440 + }, + { + "epoch": 0.7443298969072165, + "grad_norm": 1.5819942951202393, + "learning_rate": 7.420858622254946e-05, + "loss": 2.455, + "step": 1444 + }, + { + "epoch": 0.7463917525773196, + "grad_norm": 1.5764845609664917, + "learning_rate": 7.405929722454026e-05, + "loss": 2.3826, + "step": 1448 + }, + { + "epoch": 0.7484536082474227, + "grad_norm": 1.6311144828796387, + "learning_rate": 7.39097285914957e-05, + "loss": 2.3991, + "step": 1452 + }, + { + "epoch": 0.7505154639175258, + "grad_norm": 1.5981923341751099, + "learning_rate": 7.375988206181365e-05, + "loss": 2.4517, + "step": 1456 + }, + { + "epoch": 0.7525773195876289, + "grad_norm": 1.5369350910186768, + "learning_rate": 7.360975937712185e-05, + "loss": 2.4181, + "step": 1460 + }, + { + "epoch": 0.7546391752577319, + "grad_norm": 1.6119554042816162, + "learning_rate": 7.345936228225769e-05, + "loss": 2.4652, + "step": 1464 + }, + { + "epoch": 0.756701030927835, + "grad_norm": 1.7482960224151611, + "learning_rate": 7.330869252524804e-05, + "loss": 2.4213, + "step": 1468 + }, + { + "epoch": 0.7587628865979381, + "grad_norm": 1.676172137260437, + "learning_rate": 7.315775185728877e-05, + "loss": 2.3816, + "step": 1472 + }, + { + "epoch": 0.7608247422680412, + "grad_norm": 4.144953727722168, + "learning_rate": 7.300654203272454e-05, + "loss": 2.412, + "step": 1476 + }, + { + "epoch": 0.7628865979381443, + "grad_norm": 1.5259236097335815, + "learning_rate": 7.285506480902831e-05, + "loss": 2.4274, + "step": 1480 + }, + { + "epoch": 0.7649484536082474, + "grad_norm": 1.5517241954803467, + "learning_rate": 7.270332194678097e-05, + "loss": 2.3633, + "step": 1484 + }, + { + "epoch": 0.7670103092783506, + "grad_norm": 1.583932638168335, + "learning_rate": 7.255131520965087e-05, + "loss": 2.3964, + "step": 1488 + }, + { + "epoch": 0.7690721649484537, + "grad_norm": 1.5848530530929565, + "learning_rate": 7.239904636437325e-05, + "loss": 2.42, + "step": 1492 + }, + { + "epoch": 0.7711340206185567, + "grad_norm": 1.5512683391571045, + "learning_rate": 7.224651718072984e-05, + "loss": 2.4654, + "step": 1496 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 1.5761094093322754, + "learning_rate": 7.20937294315282e-05, + "loss": 2.4011, + "step": 1500 + }, + { + "epoch": 0.7752577319587629, + "grad_norm": 1.519158124923706, + "learning_rate": 7.194068489258109e-05, + "loss": 2.4315, + "step": 1504 + }, + { + "epoch": 0.777319587628866, + "grad_norm": 1.5774180889129639, + "learning_rate": 7.178738534268591e-05, + "loss": 2.4667, + "step": 1508 + }, + { + "epoch": 0.7793814432989691, + "grad_norm": 1.6394808292388916, + "learning_rate": 7.163383256360398e-05, + "loss": 2.4861, + 
"step": 1512 + }, + { + "epoch": 0.7814432989690722, + "grad_norm": 1.549655795097351, + "learning_rate": 7.14800283400398e-05, + "loss": 2.3323, + "step": 1516 + }, + { + "epoch": 0.7835051546391752, + "grad_norm": 1.6154853105545044, + "learning_rate": 7.132597445962042e-05, + "loss": 2.5378, + "step": 1520 + }, + { + "epoch": 0.7855670103092783, + "grad_norm": 1.6021648645401, + "learning_rate": 7.117167271287453e-05, + "loss": 2.4249, + "step": 1524 + }, + { + "epoch": 0.7876288659793814, + "grad_norm": 1.665974736213684, + "learning_rate": 7.101712489321169e-05, + "loss": 2.4566, + "step": 1528 + }, + { + "epoch": 0.7896907216494845, + "grad_norm": 1.6290616989135742, + "learning_rate": 7.086233279690158e-05, + "loss": 2.3565, + "step": 1532 + }, + { + "epoch": 0.7917525773195876, + "grad_norm": 1.5475256443023682, + "learning_rate": 7.070729822305298e-05, + "loss": 2.4542, + "step": 1536 + }, + { + "epoch": 0.7938144329896907, + "grad_norm": 1.6090917587280273, + "learning_rate": 7.055202297359293e-05, + "loss": 2.4498, + "step": 1540 + }, + { + "epoch": 0.7958762886597938, + "grad_norm": 1.5616388320922852, + "learning_rate": 7.039650885324582e-05, + "loss": 2.3708, + "step": 1544 + }, + { + "epoch": 0.797938144329897, + "grad_norm": 1.711113452911377, + "learning_rate": 7.024075766951233e-05, + "loss": 2.5234, + "step": 1548 + }, + { + "epoch": 0.8, + "grad_norm": 1.6016921997070312, + "learning_rate": 7.008477123264848e-05, + "loss": 2.4755, + "step": 1552 + }, + { + "epoch": 0.8020618556701031, + "grad_norm": 1.5505585670471191, + "learning_rate": 6.99285513556446e-05, + "loss": 2.3435, + "step": 1556 + }, + { + "epoch": 0.8041237113402062, + "grad_norm": 1.532699704170227, + "learning_rate": 6.977209985420419e-05, + "loss": 2.4878, + "step": 1560 + }, + { + "epoch": 0.8061855670103093, + "grad_norm": 2.197500705718994, + "learning_rate": 6.961541854672293e-05, + "loss": 2.3707, + "step": 1564 + }, + { + "epoch": 0.8082474226804124, + "grad_norm": 1.6112059354782104, + "learning_rate": 6.945850925426742e-05, + "loss": 2.4803, + "step": 1568 + }, + { + "epoch": 0.8103092783505155, + "grad_norm": 1.6318986415863037, + "learning_rate": 6.930137380055403e-05, + "loss": 2.315, + "step": 1572 + }, + { + "epoch": 0.8123711340206186, + "grad_norm": 1.607735514640808, + "learning_rate": 6.914401401192789e-05, + "loss": 2.3804, + "step": 1576 + }, + { + "epoch": 0.8144329896907216, + "grad_norm": 1.474547266960144, + "learning_rate": 6.898643171734137e-05, + "loss": 2.3736, + "step": 1580 + }, + { + "epoch": 0.8164948453608247, + "grad_norm": 1.9736120700836182, + "learning_rate": 6.882862874833305e-05, + "loss": 2.3804, + "step": 1584 + }, + { + "epoch": 0.8185567010309278, + "grad_norm": 1.5962013006210327, + "learning_rate": 6.867060693900631e-05, + "loss": 2.4503, + "step": 1588 + }, + { + "epoch": 0.8206185567010309, + "grad_norm": 1.565443754196167, + "learning_rate": 6.851236812600808e-05, + "loss": 2.4805, + "step": 1592 + }, + { + "epoch": 0.822680412371134, + "grad_norm": 1.6651315689086914, + "learning_rate": 6.835391414850748e-05, + "loss": 2.4496, + "step": 1596 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 1.5796141624450684, + "learning_rate": 6.819524684817438e-05, + "loss": 2.4143, + "step": 1600 + }, + { + "epoch": 0.8268041237113402, + "grad_norm": 1.513759732246399, + "learning_rate": 6.803636806915812e-05, + "loss": 2.4383, + "step": 1604 + }, + { + "epoch": 0.8288659793814434, + "grad_norm": 1.556221842765808, + "learning_rate": 6.787727965806591e-05, + 
"loss": 2.4026, + "step": 1608 + }, + { + "epoch": 0.8309278350515464, + "grad_norm": 1.5572640895843506, + "learning_rate": 6.771798346394157e-05, + "loss": 2.4022, + "step": 1612 + }, + { + "epoch": 0.8329896907216495, + "grad_norm": 1.602418065071106, + "learning_rate": 6.755848133824383e-05, + "loss": 2.4555, + "step": 1616 + }, + { + "epoch": 0.8350515463917526, + "grad_norm": 1.676102638244629, + "learning_rate": 6.739877513482497e-05, + "loss": 2.3695, + "step": 1620 + }, + { + "epoch": 0.8371134020618557, + "grad_norm": 1.748834252357483, + "learning_rate": 6.72388667099092e-05, + "loss": 2.2801, + "step": 1624 + }, + { + "epoch": 0.8391752577319588, + "grad_norm": 1.5271378755569458, + "learning_rate": 6.707875792207108e-05, + "loss": 2.2845, + "step": 1628 + }, + { + "epoch": 0.8412371134020619, + "grad_norm": 1.7049288749694824, + "learning_rate": 6.691845063221393e-05, + "loss": 2.4126, + "step": 1632 + }, + { + "epoch": 0.843298969072165, + "grad_norm": 1.590855360031128, + "learning_rate": 6.675794670354826e-05, + "loss": 2.3571, + "step": 1636 + }, + { + "epoch": 0.845360824742268, + "grad_norm": 1.7120413780212402, + "learning_rate": 6.659724800157002e-05, + "loss": 2.4111, + "step": 1640 + }, + { + "epoch": 0.8474226804123711, + "grad_norm": 1.5834614038467407, + "learning_rate": 6.643635639403897e-05, + "loss": 2.4045, + "step": 1644 + }, + { + "epoch": 0.8494845360824742, + "grad_norm": 1.4899585247039795, + "learning_rate": 6.627527375095697e-05, + "loss": 2.2914, + "step": 1648 + }, + { + "epoch": 0.8515463917525773, + "grad_norm": 1.573357105255127, + "learning_rate": 6.611400194454619e-05, + "loss": 2.3746, + "step": 1652 + }, + { + "epoch": 0.8536082474226804, + "grad_norm": 1.528746247291565, + "learning_rate": 6.595254284922748e-05, + "loss": 2.3568, + "step": 1656 + }, + { + "epoch": 0.8556701030927835, + "grad_norm": 1.6635682582855225, + "learning_rate": 6.579089834159844e-05, + "loss": 2.3868, + "step": 1660 + }, + { + "epoch": 0.8577319587628865, + "grad_norm": 1.5613417625427246, + "learning_rate": 6.562907030041168e-05, + "loss": 2.377, + "step": 1664 + }, + { + "epoch": 0.8597938144329897, + "grad_norm": 1.6135239601135254, + "learning_rate": 6.546706060655298e-05, + "loss": 2.4542, + "step": 1668 + }, + { + "epoch": 0.8618556701030928, + "grad_norm": 1.5151220560073853, + "learning_rate": 6.530487114301944e-05, + "loss": 2.2057, + "step": 1672 + }, + { + "epoch": 0.8639175257731959, + "grad_norm": 1.6280299425125122, + "learning_rate": 6.514250379489753e-05, + "loss": 2.3594, + "step": 1676 + }, + { + "epoch": 0.865979381443299, + "grad_norm": 1.5538045167922974, + "learning_rate": 6.49799604493413e-05, + "loss": 2.4155, + "step": 1680 + }, + { + "epoch": 0.8680412371134021, + "grad_norm": 1.5412131547927856, + "learning_rate": 6.481724299555029e-05, + "loss": 2.3285, + "step": 1684 + }, + { + "epoch": 0.8701030927835052, + "grad_norm": 1.7468520402908325, + "learning_rate": 6.465435332474768e-05, + "loss": 2.2616, + "step": 1688 + }, + { + "epoch": 0.8721649484536083, + "grad_norm": 1.4656267166137695, + "learning_rate": 6.449129333015834e-05, + "loss": 2.2932, + "step": 1692 + }, + { + "epoch": 0.8742268041237113, + "grad_norm": 1.554957628250122, + "learning_rate": 6.432806490698671e-05, + "loss": 2.4188, + "step": 1696 + }, + { + "epoch": 0.8762886597938144, + "grad_norm": 1.6073874235153198, + "learning_rate": 6.416466995239485e-05, + "loss": 2.2739, + "step": 1700 + }, + { + "epoch": 0.8783505154639175, + "grad_norm": 1.6184226274490356, + 
"learning_rate": 6.400111036548037e-05, + "loss": 2.3879, + "step": 1704 + }, + { + "epoch": 0.8804123711340206, + "grad_norm": 1.5735539197921753, + "learning_rate": 6.383738804725438e-05, + "loss": 2.3191, + "step": 1708 + }, + { + "epoch": 0.8824742268041237, + "grad_norm": 1.6026270389556885, + "learning_rate": 6.36735049006193e-05, + "loss": 2.5497, + "step": 1712 + }, + { + "epoch": 0.8845360824742268, + "grad_norm": 1.6282131671905518, + "learning_rate": 6.350946283034696e-05, + "loss": 2.4152, + "step": 1716 + }, + { + "epoch": 0.8865979381443299, + "grad_norm": 1.553600549697876, + "learning_rate": 6.334526374305615e-05, + "loss": 2.3417, + "step": 1720 + }, + { + "epoch": 0.8886597938144329, + "grad_norm": 1.5668585300445557, + "learning_rate": 6.318090954719074e-05, + "loss": 2.3811, + "step": 1724 + }, + { + "epoch": 0.8907216494845361, + "grad_norm": 1.7151823043823242, + "learning_rate": 6.301640215299735e-05, + "loss": 2.2925, + "step": 1728 + }, + { + "epoch": 0.8927835051546392, + "grad_norm": 1.5615097284317017, + "learning_rate": 6.285174347250322e-05, + "loss": 2.3591, + "step": 1732 + }, + { + "epoch": 0.8948453608247423, + "grad_norm": 1.6054131984710693, + "learning_rate": 6.26869354194939e-05, + "loss": 2.3084, + "step": 1736 + }, + { + "epoch": 0.8969072164948454, + "grad_norm": 1.606719732284546, + "learning_rate": 6.252197990949108e-05, + "loss": 2.4081, + "step": 1740 + }, + { + "epoch": 0.8989690721649485, + "grad_norm": 1.6772600412368774, + "learning_rate": 6.235687885973032e-05, + "loss": 2.3733, + "step": 1744 + }, + { + "epoch": 0.9010309278350516, + "grad_norm": 1.508259654045105, + "learning_rate": 6.219163418913872e-05, + "loss": 2.3318, + "step": 1748 + }, + { + "epoch": 0.9030927835051547, + "grad_norm": 1.5906274318695068, + "learning_rate": 6.202624781831268e-05, + "loss": 2.297, + "step": 1752 + }, + { + "epoch": 0.9051546391752577, + "grad_norm": 1.5747168064117432, + "learning_rate": 6.186072166949552e-05, + "loss": 2.4498, + "step": 1756 + }, + { + "epoch": 0.9072164948453608, + "grad_norm": 1.5119340419769287, + "learning_rate": 6.16950576665552e-05, + "loss": 2.3045, + "step": 1760 + }, + { + "epoch": 0.9092783505154639, + "grad_norm": 1.5936625003814697, + "learning_rate": 6.152925773496189e-05, + "loss": 2.3989, + "step": 1764 + }, + { + "epoch": 0.911340206185567, + "grad_norm": 1.6385735273361206, + "learning_rate": 6.13633238017656e-05, + "loss": 2.3415, + "step": 1768 + }, + { + "epoch": 0.9134020618556701, + "grad_norm": 1.6806796789169312, + "learning_rate": 6.119725779557386e-05, + "loss": 2.37, + "step": 1772 + }, + { + "epoch": 0.9154639175257732, + "grad_norm": 1.5404062271118164, + "learning_rate": 6.103106164652924e-05, + "loss": 2.476, + "step": 1776 + }, + { + "epoch": 0.9175257731958762, + "grad_norm": 1.5629997253417969, + "learning_rate": 6.086473728628691e-05, + "loss": 2.3843, + "step": 1780 + }, + { + "epoch": 0.9195876288659793, + "grad_norm": 1.5114765167236328, + "learning_rate": 6.069828664799221e-05, + "loss": 2.3972, + "step": 1784 + }, + { + "epoch": 0.9216494845360824, + "grad_norm": 1.5878251791000366, + "learning_rate": 6.053171166625817e-05, + "loss": 2.3686, + "step": 1788 + }, + { + "epoch": 0.9237113402061856, + "grad_norm": 1.6494956016540527, + "learning_rate": 6.036501427714304e-05, + "loss": 2.4627, + "step": 1792 + }, + { + "epoch": 0.9257731958762887, + "grad_norm": 1.5250719785690308, + "learning_rate": 6.0198196418127804e-05, + "loss": 2.332, + "step": 1796 + }, + { + "epoch": 0.9278350515463918, + 
"grad_norm": 1.5805870294570923, + "learning_rate": 6.0031260028093596e-05, + "loss": 2.2881, + "step": 1800 + }, + { + "epoch": 0.9298969072164949, + "grad_norm": 1.6690597534179688, + "learning_rate": 5.98642070472992e-05, + "loss": 2.2946, + "step": 1804 + }, + { + "epoch": 0.931958762886598, + "grad_norm": 1.5214745998382568, + "learning_rate": 5.969703941735858e-05, + "loss": 2.3984, + "step": 1808 + }, + { + "epoch": 0.934020618556701, + "grad_norm": 1.6358650922775269, + "learning_rate": 5.952975908121815e-05, + "loss": 2.3758, + "step": 1812 + }, + { + "epoch": 0.9360824742268041, + "grad_norm": 1.9057590961456299, + "learning_rate": 5.936236798313431e-05, + "loss": 2.3347, + "step": 1816 + }, + { + "epoch": 0.9381443298969072, + "grad_norm": 1.6489713191986084, + "learning_rate": 5.9194868068650845e-05, + "loss": 2.2552, + "step": 1820 + }, + { + "epoch": 0.9402061855670103, + "grad_norm": 1.5380582809448242, + "learning_rate": 5.902726128457625e-05, + "loss": 2.2898, + "step": 1824 + }, + { + "epoch": 0.9422680412371134, + "grad_norm": 1.5303738117218018, + "learning_rate": 5.885954957896115e-05, + "loss": 2.2896, + "step": 1828 + }, + { + "epoch": 0.9443298969072165, + "grad_norm": 1.577243685722351, + "learning_rate": 5.8691734901075634e-05, + "loss": 2.3281, + "step": 1832 + }, + { + "epoch": 0.9463917525773196, + "grad_norm": 1.545732021331787, + "learning_rate": 5.852381920138663e-05, + "loss": 2.2197, + "step": 1836 + }, + { + "epoch": 0.9484536082474226, + "grad_norm": 1.5541272163391113, + "learning_rate": 5.835580443153522e-05, + "loss": 2.2776, + "step": 1840 + }, + { + "epoch": 0.9505154639175257, + "grad_norm": 1.5543440580368042, + "learning_rate": 5.818769254431395e-05, + "loss": 2.3597, + "step": 1844 + }, + { + "epoch": 0.9525773195876288, + "grad_norm": 1.5079783201217651, + "learning_rate": 5.80194854936441e-05, + "loss": 2.3476, + "step": 1848 + }, + { + "epoch": 0.954639175257732, + "grad_norm": 1.503811240196228, + "learning_rate": 5.7851185234553064e-05, + "loss": 2.3318, + "step": 1852 + }, + { + "epoch": 0.9567010309278351, + "grad_norm": 1.532036542892456, + "learning_rate": 5.768279372315153e-05, + "loss": 2.2278, + "step": 1856 + }, + { + "epoch": 0.9587628865979382, + "grad_norm": 1.5316864252090454, + "learning_rate": 5.7514312916610814e-05, + "loss": 2.3835, + "step": 1860 + }, + { + "epoch": 0.9608247422680413, + "grad_norm": 1.5309964418411255, + "learning_rate": 5.7345744773140086e-05, + "loss": 2.3779, + "step": 1864 + }, + { + "epoch": 0.9628865979381444, + "grad_norm": 1.577561855316162, + "learning_rate": 5.71770912519636e-05, + "loss": 2.3334, + "step": 1868 + }, + { + "epoch": 0.9649484536082474, + "grad_norm": 1.5817878246307373, + "learning_rate": 5.7008354313297926e-05, + "loss": 2.28, + "step": 1872 + }, + { + "epoch": 0.9670103092783505, + "grad_norm": 1.5670493841171265, + "learning_rate": 5.683953591832922e-05, + "loss": 2.304, + "step": 1876 + }, + { + "epoch": 0.9690721649484536, + "grad_norm": 1.5366458892822266, + "learning_rate": 5.667063802919032e-05, + "loss": 2.4577, + "step": 1880 + }, + { + "epoch": 0.9711340206185567, + "grad_norm": 1.5330809354782104, + "learning_rate": 5.6501662608938014e-05, + "loss": 2.4178, + "step": 1884 + }, + { + "epoch": 0.9731958762886598, + "grad_norm": 1.4714388847351074, + "learning_rate": 5.633261162153027e-05, + "loss": 2.3252, + "step": 1888 + }, + { + "epoch": 0.9752577319587629, + "grad_norm": 1.5558996200561523, + "learning_rate": 5.6163487031803305e-05, + "loss": 2.3597, + "step": 1892 + 
}, + { + "epoch": 0.977319587628866, + "grad_norm": 1.5362873077392578, + "learning_rate": 5.5994290805448826e-05, + "loss": 2.2979, + "step": 1896 + }, + { + "epoch": 0.979381443298969, + "grad_norm": 1.5119233131408691, + "learning_rate": 5.582502490899111e-05, + "loss": 2.3582, + "step": 1900 + }, + { + "epoch": 0.9814432989690721, + "grad_norm": 1.6467958688735962, + "learning_rate": 5.565569130976422e-05, + "loss": 2.3366, + "step": 1904 + }, + { + "epoch": 0.9835051546391752, + "grad_norm": 1.6580411195755005, + "learning_rate": 5.548629197588913e-05, + "loss": 2.2901, + "step": 1908 + }, + { + "epoch": 0.9855670103092784, + "grad_norm": 1.4866169691085815, + "learning_rate": 5.5316828876250795e-05, + "loss": 2.3313, + "step": 1912 + }, + { + "epoch": 0.9876288659793815, + "grad_norm": 1.6460485458374023, + "learning_rate": 5.514730398047533e-05, + "loss": 2.2917, + "step": 1916 + }, + { + "epoch": 0.9896907216494846, + "grad_norm": 1.4895657300949097, + "learning_rate": 5.497771925890706e-05, + "loss": 2.3709, + "step": 1920 + }, + { + "epoch": 0.9917525773195877, + "grad_norm": 1.4158916473388672, + "learning_rate": 5.48080766825857e-05, + "loss": 2.1582, + "step": 1924 + }, + { + "epoch": 0.9938144329896907, + "grad_norm": 1.611509919166565, + "learning_rate": 5.463837822322333e-05, + "loss": 2.5048, + "step": 1928 + }, + { + "epoch": 0.9958762886597938, + "grad_norm": 1.6175845861434937, + "learning_rate": 5.446862585318161e-05, + "loss": 2.3836, + "step": 1932 + }, + { + "epoch": 0.9979381443298969, + "grad_norm": 1.6315233707427979, + "learning_rate": 5.429882154544875e-05, + "loss": 2.2926, + "step": 1936 + }, + { + "epoch": 1.0, + "grad_norm": 4.427371025085449, + "learning_rate": 5.4128967273616625e-05, + "loss": 2.3215, + "step": 1940 + }, + { + "epoch": 1.002061855670103, + "grad_norm": 1.421981692314148, + "learning_rate": 5.395906501185783e-05, + "loss": 1.8189, + "step": 1944 + }, + { + "epoch": 1.0041237113402062, + "grad_norm": 1.4998725652694702, + "learning_rate": 5.3789116734902746e-05, + "loss": 1.9157, + "step": 1948 + }, + { + "epoch": 1.0061855670103093, + "grad_norm": 1.4507402181625366, + "learning_rate": 5.361912441801654e-05, + "loss": 1.9084, + "step": 1952 + }, + { + "epoch": 1.0082474226804123, + "grad_norm": 1.5483083724975586, + "learning_rate": 5.344909003697634e-05, + "loss": 1.824, + "step": 1956 + }, + { + "epoch": 1.0103092783505154, + "grad_norm": 1.579430103302002, + "learning_rate": 5.327901556804803e-05, + "loss": 1.8283, + "step": 1960 + }, + { + "epoch": 1.0123711340206185, + "grad_norm": 1.5646827220916748, + "learning_rate": 5.310890298796353e-05, + "loss": 1.8367, + "step": 1964 + }, + { + "epoch": 1.0144329896907216, + "grad_norm": 1.6156511306762695, + "learning_rate": 5.293875427389771e-05, + "loss": 1.9887, + "step": 1968 + }, + { + "epoch": 1.0164948453608247, + "grad_norm": 1.416155457496643, + "learning_rate": 5.2768571403445366e-05, + "loss": 1.7009, + "step": 1972 + }, + { + "epoch": 1.0185567010309278, + "grad_norm": 1.4358748197555542, + "learning_rate": 5.259835635459833e-05, + "loss": 1.7943, + "step": 1976 + }, + { + "epoch": 1.0206185567010309, + "grad_norm": 1.567254900932312, + "learning_rate": 5.242811110572242e-05, + "loss": 1.8559, + "step": 1980 + }, + { + "epoch": 1.022680412371134, + "grad_norm": 1.6107474565505981, + "learning_rate": 5.2257837635534446e-05, + "loss": 1.8663, + "step": 1984 + }, + { + "epoch": 1.024742268041237, + "grad_norm": 1.8279513120651245, + "learning_rate": 5.208753792307923e-05, + "loss": 
1.85, + "step": 1988 + }, + { + "epoch": 1.02680412371134, + "grad_norm": 1.7236356735229492, + "learning_rate": 5.191721394770667e-05, + "loss": 1.806, + "step": 1992 + }, + { + "epoch": 1.0288659793814432, + "grad_norm": 1.4513978958129883, + "learning_rate": 5.174686768904855e-05, + "loss": 1.881, + "step": 1996 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 1.497959852218628, + "learning_rate": 5.15765011269957e-05, + "loss": 1.776, + "step": 2000 + }, + { + "epoch": 1.0329896907216496, + "grad_norm": 1.4081003665924072, + "learning_rate": 5.140611624167497e-05, + "loss": 1.7823, + "step": 2004 + }, + { + "epoch": 1.0350515463917527, + "grad_norm": 1.4467487335205078, + "learning_rate": 5.1235715013426146e-05, + "loss": 1.7799, + "step": 2008 + }, + { + "epoch": 1.0371134020618558, + "grad_norm": 1.518298625946045, + "learning_rate": 5.1065299422778936e-05, + "loss": 1.8807, + "step": 2012 + }, + { + "epoch": 1.0391752577319588, + "grad_norm": 1.3984616994857788, + "learning_rate": 5.0894871450430004e-05, + "loss": 1.8241, + "step": 2016 + }, + { + "epoch": 1.041237113402062, + "grad_norm": 1.5757567882537842, + "learning_rate": 5.072443307721992e-05, + "loss": 1.8525, + "step": 2020 + }, + { + "epoch": 1.043298969072165, + "grad_norm": 1.7042030096054077, + "learning_rate": 5.0553986284110164e-05, + "loss": 1.7814, + "step": 2024 + }, + { + "epoch": 1.045360824742268, + "grad_norm": 1.5050081014633179, + "learning_rate": 5.0383533052160045e-05, + "loss": 1.8751, + "step": 2028 + }, + { + "epoch": 1.0474226804123712, + "grad_norm": 1.522011160850525, + "learning_rate": 5.02130753625037e-05, + "loss": 1.8369, + "step": 2032 + }, + { + "epoch": 1.0494845360824743, + "grad_norm": 1.5450692176818848, + "learning_rate": 5.004261519632713e-05, + "loss": 1.8428, + "step": 2036 + }, + { + "epoch": 1.0515463917525774, + "grad_norm": 2.67858624458313, + "learning_rate": 4.987215453484507e-05, + "loss": 1.7921, + "step": 2040 + }, + { + "epoch": 1.0536082474226804, + "grad_norm": 1.628353476524353, + "learning_rate": 4.9701695359278035e-05, + "loss": 1.862, + "step": 2044 + }, + { + "epoch": 1.0556701030927835, + "grad_norm": 1.5472396612167358, + "learning_rate": 4.953123965082927e-05, + "loss": 1.8028, + "step": 2048 + }, + { + "epoch": 1.0577319587628866, + "grad_norm": 1.4193578958511353, + "learning_rate": 4.936078939066169e-05, + "loss": 1.7504, + "step": 2052 + }, + { + "epoch": 1.0597938144329897, + "grad_norm": 1.6026136875152588, + "learning_rate": 4.919034655987493e-05, + "loss": 1.8197, + "step": 2056 + }, + { + "epoch": 1.0618556701030928, + "grad_norm": 1.494009256362915, + "learning_rate": 4.901991313948227e-05, + "loss": 1.813, + "step": 2060 + }, + { + "epoch": 1.0639175257731959, + "grad_norm": 1.569420337677002, + "learning_rate": 4.884949111038757e-05, + "loss": 1.8011, + "step": 2064 + }, + { + "epoch": 1.065979381443299, + "grad_norm": 1.5701764822006226, + "learning_rate": 4.8679082453362354e-05, + "loss": 1.793, + "step": 2068 + }, + { + "epoch": 1.068041237113402, + "grad_norm": 1.4102658033370972, + "learning_rate": 4.850868914902269e-05, + "loss": 1.7566, + "step": 2072 + }, + { + "epoch": 1.0701030927835051, + "grad_norm": 1.5660536289215088, + "learning_rate": 4.833831317780621e-05, + "loss": 1.8408, + "step": 2076 + }, + { + "epoch": 1.0721649484536082, + "grad_norm": 1.5886646509170532, + "learning_rate": 4.81679565199491e-05, + "loss": 1.8117, + "step": 2080 + }, + { + "epoch": 1.0742268041237113, + "grad_norm": 1.531605839729309, + "learning_rate": 
4.799762115546304e-05, + "loss": 1.8381, + "step": 2084 + }, + { + "epoch": 1.0762886597938144, + "grad_norm": 1.4816917181015015, + "learning_rate": 4.782730906411225e-05, + "loss": 1.8265, + "step": 2088 + }, + { + "epoch": 1.0783505154639175, + "grad_norm": 1.535863995552063, + "learning_rate": 4.7657022225390454e-05, + "loss": 1.7533, + "step": 2092 + }, + { + "epoch": 1.0804123711340206, + "grad_norm": 1.5383597612380981, + "learning_rate": 4.748676261849787e-05, + "loss": 1.8174, + "step": 2096 + }, + { + "epoch": 1.0824742268041236, + "grad_norm": 1.5802308320999146, + "learning_rate": 4.731653222231819e-05, + "loss": 1.814, + "step": 2100 + }, + { + "epoch": 1.0845360824742267, + "grad_norm": 1.6186000108718872, + "learning_rate": 4.7146333015395606e-05, + "loss": 1.836, + "step": 2104 + }, + { + "epoch": 1.0865979381443298, + "grad_norm": 1.4844691753387451, + "learning_rate": 4.6976166975911815e-05, + "loss": 1.7533, + "step": 2108 + }, + { + "epoch": 1.088659793814433, + "grad_norm": 1.4020297527313232, + "learning_rate": 4.6806036081663026e-05, + "loss": 1.7584, + "step": 2112 + }, + { + "epoch": 1.090721649484536, + "grad_norm": 1.5726157426834106, + "learning_rate": 4.663594231003692e-05, + "loss": 1.78, + "step": 2116 + }, + { + "epoch": 1.0927835051546393, + "grad_norm": 1.5943951606750488, + "learning_rate": 4.646588763798976e-05, + "loss": 1.8729, + "step": 2120 + }, + { + "epoch": 1.0948453608247424, + "grad_norm": 1.499718427658081, + "learning_rate": 4.629587404202331e-05, + "loss": 1.7568, + "step": 2124 + }, + { + "epoch": 1.0969072164948455, + "grad_norm": 1.5948656797409058, + "learning_rate": 4.6125903498162e-05, + "loss": 1.7359, + "step": 2128 + }, + { + "epoch": 1.0989690721649485, + "grad_norm": 1.4080160856246948, + "learning_rate": 4.59559779819298e-05, + "loss": 1.691, + "step": 2132 + }, + { + "epoch": 1.1010309278350516, + "grad_norm": 1.509520411491394, + "learning_rate": 4.5786099468327365e-05, + "loss": 1.8097, + "step": 2136 + }, + { + "epoch": 1.1030927835051547, + "grad_norm": 1.4405332803726196, + "learning_rate": 4.561626993180904e-05, + "loss": 1.7825, + "step": 2140 + }, + { + "epoch": 1.1051546391752578, + "grad_norm": 1.5002368688583374, + "learning_rate": 4.5446491346259975e-05, + "loss": 1.7178, + "step": 2144 + }, + { + "epoch": 1.1072164948453609, + "grad_norm": 1.4368022680282593, + "learning_rate": 4.5276765684973024e-05, + "loss": 1.7831, + "step": 2148 + }, + { + "epoch": 1.109278350515464, + "grad_norm": 1.4888081550598145, + "learning_rate": 4.510709492062601e-05, + "loss": 1.7647, + "step": 2152 + }, + { + "epoch": 1.111340206185567, + "grad_norm": 1.5033565759658813, + "learning_rate": 4.4937481025258674e-05, + "loss": 1.8698, + "step": 2156 + }, + { + "epoch": 1.1134020618556701, + "grad_norm": 1.4232312440872192, + "learning_rate": 4.476792597024976e-05, + "loss": 1.9078, + "step": 2160 + }, + { + "epoch": 1.1154639175257732, + "grad_norm": 1.4969874620437622, + "learning_rate": 4.4598431726294175e-05, + "loss": 1.7898, + "step": 2164 + }, + { + "epoch": 1.1175257731958763, + "grad_norm": 1.5777771472930908, + "learning_rate": 4.442900026337998e-05, + "loss": 1.7817, + "step": 2168 + }, + { + "epoch": 1.1195876288659794, + "grad_norm": 2.387895345687866, + "learning_rate": 4.425963355076558e-05, + "loss": 1.7712, + "step": 2172 + }, + { + "epoch": 1.1216494845360825, + "grad_norm": 1.4322453737258911, + "learning_rate": 4.409033355695683e-05, + "loss": 1.7369, + "step": 2176 + }, + { + "epoch": 1.1237113402061856, + "grad_norm": 
1.4894506931304932, + "learning_rate": 4.392110224968403e-05, + "loss": 1.7687, + "step": 2180 + }, + { + "epoch": 1.1257731958762887, + "grad_norm": 1.5217411518096924, + "learning_rate": 4.3751941595879265e-05, + "loss": 1.8199, + "step": 2184 + }, + { + "epoch": 1.1278350515463917, + "grad_norm": 1.5253269672393799, + "learning_rate": 4.358285356165338e-05, + "loss": 1.8539, + "step": 2188 + }, + { + "epoch": 1.1298969072164948, + "grad_norm": 1.4793150424957275, + "learning_rate": 4.34138401122732e-05, + "loss": 1.805, + "step": 2192 + }, + { + "epoch": 1.131958762886598, + "grad_norm": 1.5564053058624268, + "learning_rate": 4.3244903212138634e-05, + "loss": 1.7565, + "step": 2196 + }, + { + "epoch": 1.134020618556701, + "grad_norm": 1.4678430557250977, + "learning_rate": 4.307604482475993e-05, + "loss": 1.6859, + "step": 2200 + }, + { + "epoch": 1.136082474226804, + "grad_norm": 1.4880309104919434, + "learning_rate": 4.290726691273477e-05, + "loss": 1.7329, + "step": 2204 + }, + { + "epoch": 1.1381443298969072, + "grad_norm": 1.8314374685287476, + "learning_rate": 4.27385714377255e-05, + "loss": 1.7669, + "step": 2208 + }, + { + "epoch": 1.1402061855670103, + "grad_norm": 1.5519520044326782, + "learning_rate": 4.256996036043631e-05, + "loss": 1.7761, + "step": 2212 + }, + { + "epoch": 1.1422680412371133, + "grad_norm": 1.4087693691253662, + "learning_rate": 4.240143564059045e-05, + "loss": 1.743, + "step": 2216 + }, + { + "epoch": 1.1443298969072164, + "grad_norm": 1.5219017267227173, + "learning_rate": 4.2232999236907524e-05, + "loss": 1.8374, + "step": 2220 + }, + { + "epoch": 1.1463917525773195, + "grad_norm": 1.6253538131713867, + "learning_rate": 4.206465310708058e-05, + "loss": 1.841, + "step": 2224 + }, + { + "epoch": 1.1484536082474226, + "grad_norm": 1.4791438579559326, + "learning_rate": 4.189639920775346e-05, + "loss": 1.6623, + "step": 2228 + }, + { + "epoch": 1.1505154639175257, + "grad_norm": 1.4935940504074097, + "learning_rate": 4.172823949449807e-05, + "loss": 1.7644, + "step": 2232 + }, + { + "epoch": 1.1525773195876288, + "grad_norm": 1.6122525930404663, + "learning_rate": 4.156017592179158e-05, + "loss": 1.8102, + "step": 2236 + }, + { + "epoch": 1.1546391752577319, + "grad_norm": 1.6047066450119019, + "learning_rate": 4.139221044299376e-05, + "loss": 1.8315, + "step": 2240 + }, + { + "epoch": 1.156701030927835, + "grad_norm": 1.4906078577041626, + "learning_rate": 4.122434501032423e-05, + "loss": 1.7467, + "step": 2244 + }, + { + "epoch": 1.158762886597938, + "grad_norm": 2.4341137409210205, + "learning_rate": 4.105658157483983e-05, + "loss": 1.828, + "step": 2248 + }, + { + "epoch": 1.1608247422680413, + "grad_norm": 1.490577220916748, + "learning_rate": 4.088892208641188e-05, + "loss": 1.792, + "step": 2252 + }, + { + "epoch": 1.1628865979381444, + "grad_norm": 1.4628297090530396, + "learning_rate": 4.072136849370363e-05, + "loss": 1.765, + "step": 2256 + }, + { + "epoch": 1.1649484536082475, + "grad_norm": 1.4456610679626465, + "learning_rate": 4.055392274414743e-05, + "loss": 1.8382, + "step": 2260 + }, + { + "epoch": 1.1670103092783506, + "grad_norm": 1.5569915771484375, + "learning_rate": 4.0386586783922256e-05, + "loss": 1.8598, + "step": 2264 + }, + { + "epoch": 1.1690721649484537, + "grad_norm": 1.5139881372451782, + "learning_rate": 4.021936255793101e-05, + "loss": 1.7757, + "step": 2268 + }, + { + "epoch": 1.1711340206185568, + "grad_norm": 1.5133105516433716, + "learning_rate": 4.005225200977796e-05, + "loss": 1.7901, + "step": 2272 + }, + { + "epoch": 
1.1731958762886598, + "grad_norm": 1.4687427282333374, + "learning_rate": 3.9885257081746104e-05, + "loss": 1.8429, + "step": 2276 + }, + { + "epoch": 1.175257731958763, + "grad_norm": 1.58542001247406, + "learning_rate": 3.97183797147746e-05, + "loss": 1.724, + "step": 2280 + }, + { + "epoch": 1.177319587628866, + "grad_norm": 1.4827735424041748, + "learning_rate": 3.955162184843625e-05, + "loss": 1.7716, + "step": 2284 + }, + { + "epoch": 1.179381443298969, + "grad_norm": 1.5209972858428955, + "learning_rate": 3.938498542091494e-05, + "loss": 1.8294, + "step": 2288 + }, + { + "epoch": 1.1814432989690722, + "grad_norm": 1.5304073095321655, + "learning_rate": 3.9218472368983084e-05, + "loss": 1.8146, + "step": 2292 + }, + { + "epoch": 1.1835051546391753, + "grad_norm": 1.4535797834396362, + "learning_rate": 3.905208462797914e-05, + "loss": 1.6667, + "step": 2296 + }, + { + "epoch": 1.1855670103092784, + "grad_norm": 1.5424301624298096, + "learning_rate": 3.888582413178509e-05, + "loss": 1.8117, + "step": 2300 + }, + { + "epoch": 1.1876288659793814, + "grad_norm": 1.5533775091171265, + "learning_rate": 3.8719692812804034e-05, + "loss": 1.779, + "step": 2304 + }, + { + "epoch": 1.1896907216494845, + "grad_norm": 1.5786696672439575, + "learning_rate": 3.8553692601937636e-05, + "loss": 1.8124, + "step": 2308 + }, + { + "epoch": 1.1917525773195876, + "grad_norm": 1.9105570316314697, + "learning_rate": 3.8387825428563706e-05, + "loss": 1.7474, + "step": 2312 + }, + { + "epoch": 1.1938144329896907, + "grad_norm": 1.4653416872024536, + "learning_rate": 3.822209322051384e-05, + "loss": 1.7107, + "step": 2316 + }, + { + "epoch": 1.1958762886597938, + "grad_norm": 1.6114842891693115, + "learning_rate": 3.805649790405094e-05, + "loss": 1.8546, + "step": 2320 + }, + { + "epoch": 1.1979381443298969, + "grad_norm": 1.525752305984497, + "learning_rate": 3.789104140384686e-05, + "loss": 1.7314, + "step": 2324 + }, + { + "epoch": 1.2, + "grad_norm": 1.5032085180282593, + "learning_rate": 3.772572564296005e-05, + "loss": 1.884, + "step": 2328 + }, + { + "epoch": 1.202061855670103, + "grad_norm": 1.60364830493927, + "learning_rate": 3.7560552542813125e-05, + "loss": 1.7848, + "step": 2332 + }, + { + "epoch": 1.2041237113402061, + "grad_norm": 1.538434624671936, + "learning_rate": 3.739552402317064e-05, + "loss": 1.7257, + "step": 2336 + }, + { + "epoch": 1.2061855670103092, + "grad_norm": 1.4421312808990479, + "learning_rate": 3.723064200211674e-05, + "loss": 1.6767, + "step": 2340 + }, + { + "epoch": 1.2082474226804123, + "grad_norm": 1.5462045669555664, + "learning_rate": 3.706590839603278e-05, + "loss": 1.7877, + "step": 2344 + }, + { + "epoch": 1.2103092783505154, + "grad_norm": 1.4567534923553467, + "learning_rate": 3.6901325119575194e-05, + "loss": 1.7842, + "step": 2348 + }, + { + "epoch": 1.2123711340206185, + "grad_norm": 2.0819544792175293, + "learning_rate": 3.673689408565317e-05, + "loss": 1.7632, + "step": 2352 + }, + { + "epoch": 1.2144329896907216, + "grad_norm": 1.5570515394210815, + "learning_rate": 3.657261720540639e-05, + "loss": 1.7674, + "step": 2356 + }, + { + "epoch": 1.2164948453608249, + "grad_norm": 1.6284637451171875, + "learning_rate": 3.640849638818286e-05, + "loss": 1.7434, + "step": 2360 + }, + { + "epoch": 1.218556701030928, + "grad_norm": 1.4900153875350952, + "learning_rate": 3.6244533541516713e-05, + "loss": 1.7535, + "step": 2364 + }, + { + "epoch": 1.220618556701031, + "grad_norm": 1.5037250518798828, + "learning_rate": 3.608073057110601e-05, + "loss": 1.7759, + "step": 
2368 + }, + { + "epoch": 1.2226804123711341, + "grad_norm": 5.1282219886779785, + "learning_rate": 3.591708938079068e-05, + "loss": 1.7459, + "step": 2372 + }, + { + "epoch": 1.2247422680412372, + "grad_norm": 1.5764257907867432, + "learning_rate": 3.575361187253019e-05, + "loss": 1.8166, + "step": 2376 + }, + { + "epoch": 1.2268041237113403, + "grad_norm": 1.4848092794418335, + "learning_rate": 3.55902999463817e-05, + "loss": 1.7265, + "step": 2380 + }, + { + "epoch": 1.2288659793814434, + "grad_norm": 1.583311915397644, + "learning_rate": 3.5427155500477795e-05, + "loss": 1.7755, + "step": 2384 + }, + { + "epoch": 1.2309278350515465, + "grad_norm": 1.9810250997543335, + "learning_rate": 3.526418043100448e-05, + "loss": 1.8337, + "step": 2388 + }, + { + "epoch": 1.2329896907216495, + "grad_norm": 1.5246814489364624, + "learning_rate": 3.510137663217915e-05, + "loss": 1.7465, + "step": 2392 + }, + { + "epoch": 1.2350515463917526, + "grad_norm": 1.4694105386734009, + "learning_rate": 3.4938745996228586e-05, + "loss": 1.7879, + "step": 2396 + }, + { + "epoch": 1.2371134020618557, + "grad_norm": 1.551598072052002, + "learning_rate": 3.47762904133669e-05, + "loss": 1.7552, + "step": 2400 + }, + { + "epoch": 1.2391752577319588, + "grad_norm": 1.4880276918411255, + "learning_rate": 3.4614011771773644e-05, + "loss": 1.7289, + "step": 2404 + }, + { + "epoch": 1.2412371134020619, + "grad_norm": 1.4992886781692505, + "learning_rate": 3.445191195757179e-05, + "loss": 1.7777, + "step": 2408 + }, + { + "epoch": 1.243298969072165, + "grad_norm": 1.5845531225204468, + "learning_rate": 3.428999285480586e-05, + "loss": 1.7689, + "step": 2412 + }, + { + "epoch": 1.245360824742268, + "grad_norm": 1.5566850900650024, + "learning_rate": 3.412825634542005e-05, + "loss": 1.7837, + "step": 2416 + }, + { + "epoch": 1.2474226804123711, + "grad_norm": 1.5015497207641602, + "learning_rate": 3.396670430923628e-05, + "loss": 1.7094, + "step": 2420 + }, + { + "epoch": 1.2494845360824742, + "grad_norm": 1.4368141889572144, + "learning_rate": 3.380533862393236e-05, + "loss": 1.7203, + "step": 2424 + }, + { + "epoch": 1.2515463917525773, + "grad_norm": 1.4907705783843994, + "learning_rate": 3.364416116502027e-05, + "loss": 1.7477, + "step": 2428 + }, + { + "epoch": 1.2536082474226804, + "grad_norm": 2.8708791732788086, + "learning_rate": 3.348317380582423e-05, + "loss": 1.7573, + "step": 2432 + }, + { + "epoch": 1.2556701030927835, + "grad_norm": 1.5418363809585571, + "learning_rate": 3.332237841745898e-05, + "loss": 1.7518, + "step": 2436 + }, + { + "epoch": 1.2577319587628866, + "grad_norm": 1.4115269184112549, + "learning_rate": 3.3161776868808036e-05, + "loss": 1.7183, + "step": 2440 + }, + { + "epoch": 1.2597938144329897, + "grad_norm": 1.5407153367996216, + "learning_rate": 3.300137102650197e-05, + "loss": 1.7318, + "step": 2444 + }, + { + "epoch": 1.2618556701030927, + "grad_norm": 1.5344724655151367, + "learning_rate": 3.284116275489672e-05, + "loss": 1.7652, + "step": 2448 + }, + { + "epoch": 1.2639175257731958, + "grad_norm": 1.4697588682174683, + "learning_rate": 3.268115391605189e-05, + "loss": 1.7609, + "step": 2452 + }, + { + "epoch": 1.265979381443299, + "grad_norm": 1.5398411750793457, + "learning_rate": 3.2521346369709135e-05, + "loss": 1.7362, + "step": 2456 + }, + { + "epoch": 1.268041237113402, + "grad_norm": 1.5244160890579224, + "learning_rate": 3.236174197327055e-05, + "loss": 1.7365, + "step": 2460 + }, + { + "epoch": 1.270103092783505, + "grad_norm": 1.5290905237197876, + "learning_rate": 
3.220234258177707e-05, + "loss": 1.7776, + "step": 2464 + }, + { + "epoch": 1.2721649484536082, + "grad_norm": 1.4930541515350342, + "learning_rate": 3.2043150047886894e-05, + "loss": 1.7694, + "step": 2468 + }, + { + "epoch": 1.2742268041237113, + "grad_norm": 1.489328384399414, + "learning_rate": 3.188416622185397e-05, + "loss": 1.8005, + "step": 2472 + }, + { + "epoch": 1.2762886597938143, + "grad_norm": 1.4238892793655396, + "learning_rate": 3.1725392951506504e-05, + "loss": 1.6492, + "step": 2476 + }, + { + "epoch": 1.2783505154639174, + "grad_norm": 1.566266417503357, + "learning_rate": 3.156683208222544e-05, + "loss": 1.719, + "step": 2480 + }, + { + "epoch": 1.2804123711340205, + "grad_norm": 1.5726135969161987, + "learning_rate": 3.140848545692309e-05, + "loss": 1.7275, + "step": 2484 + }, + { + "epoch": 1.2824742268041236, + "grad_norm": 1.5072052478790283, + "learning_rate": 3.12503549160216e-05, + "loss": 1.7749, + "step": 2488 + }, + { + "epoch": 1.2845360824742267, + "grad_norm": 1.540624976158142, + "learning_rate": 3.109244229743167e-05, + "loss": 1.8149, + "step": 2492 + }, + { + "epoch": 1.2865979381443298, + "grad_norm": 1.9997202157974243, + "learning_rate": 3.0934749436531116e-05, + "loss": 1.7481, + "step": 2496 + }, + { + "epoch": 1.2886597938144329, + "grad_norm": 1.4298206567764282, + "learning_rate": 3.0777278166143616e-05, + "loss": 1.7337, + "step": 2500 + }, + { + "epoch": 1.2907216494845362, + "grad_norm": 1.5551060438156128, + "learning_rate": 3.062003031651727e-05, + "loss": 1.8913, + "step": 2504 + }, + { + "epoch": 1.2927835051546392, + "grad_norm": 2.138140916824341, + "learning_rate": 3.0463007715303483e-05, + "loss": 1.7596, + "step": 2508 + }, + { + "epoch": 1.2948453608247423, + "grad_norm": 1.5046266317367554, + "learning_rate": 3.0306212187535653e-05, + "loss": 1.8142, + "step": 2512 + }, + { + "epoch": 1.2969072164948454, + "grad_norm": 1.426291823387146, + "learning_rate": 3.014964555560792e-05, + "loss": 1.7556, + "step": 2516 + }, + { + "epoch": 1.2989690721649485, + "grad_norm": 1.474760890007019, + "learning_rate": 2.9993309639254092e-05, + "loss": 1.8039, + "step": 2520 + }, + { + "epoch": 1.3010309278350516, + "grad_norm": 1.5093271732330322, + "learning_rate": 2.9837206255526373e-05, + "loss": 1.7842, + "step": 2524 + }, + { + "epoch": 1.3030927835051547, + "grad_norm": 1.5908783674240112, + "learning_rate": 2.9681337218774313e-05, + "loss": 1.846, + "step": 2528 + }, + { + "epoch": 1.3051546391752578, + "grad_norm": 1.6266577243804932, + "learning_rate": 2.952570434062371e-05, + "loss": 1.7245, + "step": 2532 + }, + { + "epoch": 1.3072164948453608, + "grad_norm": 1.5125893354415894, + "learning_rate": 2.9370309429955595e-05, + "loss": 1.749, + "step": 2536 + }, + { + "epoch": 1.309278350515464, + "grad_norm": 1.4698082208633423, + "learning_rate": 2.9215154292885062e-05, + "loss": 1.8698, + "step": 2540 + }, + { + "epoch": 1.311340206185567, + "grad_norm": 1.5628530979156494, + "learning_rate": 2.9060240732740467e-05, + "loss": 1.7998, + "step": 2544 + }, + { + "epoch": 1.31340206185567, + "grad_norm": 1.476222276687622, + "learning_rate": 2.890557055004236e-05, + "loss": 1.6143, + "step": 2548 + }, + { + "epoch": 1.3154639175257732, + "grad_norm": 1.5117274522781372, + "learning_rate": 2.875114554248256e-05, + "loss": 1.6858, + "step": 2552 + }, + { + "epoch": 1.3175257731958763, + "grad_norm": 1.5859644412994385, + "learning_rate": 2.8596967504903283e-05, + "loss": 1.8203, + "step": 2556 + }, + { + "epoch": 1.3195876288659794, + 
"grad_norm": 1.470045566558838, + "learning_rate": 2.844303822927627e-05, + "loss": 1.6019, + "step": 2560 + }, + { + "epoch": 1.3216494845360824, + "grad_norm": 1.5127053260803223, + "learning_rate": 2.828935950468201e-05, + "loss": 1.7554, + "step": 2564 + }, + { + "epoch": 1.3237113402061855, + "grad_norm": 1.5124480724334717, + "learning_rate": 2.8135933117288872e-05, + "loss": 1.7442, + "step": 2568 + }, + { + "epoch": 1.3257731958762886, + "grad_norm": 1.54438316822052, + "learning_rate": 2.7982760850332316e-05, + "loss": 1.7874, + "step": 2572 + }, + { + "epoch": 1.3278350515463917, + "grad_norm": 1.460144281387329, + "learning_rate": 2.7829844484094318e-05, + "loss": 1.6712, + "step": 2576 + }, + { + "epoch": 1.3298969072164948, + "grad_norm": 1.5282338857650757, + "learning_rate": 2.767718579588252e-05, + "loss": 1.7423, + "step": 2580 + }, + { + "epoch": 1.3319587628865979, + "grad_norm": 1.6081974506378174, + "learning_rate": 2.7524786560009707e-05, + "loss": 1.715, + "step": 2584 + }, + { + "epoch": 1.334020618556701, + "grad_norm": 2.33564829826355, + "learning_rate": 2.737264854777306e-05, + "loss": 1.769, + "step": 2588 + }, + { + "epoch": 1.3360824742268043, + "grad_norm": 1.538969874382019, + "learning_rate": 2.7220773527433625e-05, + "loss": 1.732, + "step": 2592 + }, + { + "epoch": 1.3381443298969073, + "grad_norm": 1.4337162971496582, + "learning_rate": 2.7069163264195784e-05, + "loss": 1.7052, + "step": 2596 + }, + { + "epoch": 1.3402061855670104, + "grad_norm": 1.4599614143371582, + "learning_rate": 2.691781952018671e-05, + "loss": 1.7344, + "step": 2600 + }, + { + "epoch": 1.3422680412371135, + "grad_norm": 1.4979077577590942, + "learning_rate": 2.6766744054435883e-05, + "loss": 1.7518, + "step": 2604 + }, + { + "epoch": 1.3443298969072166, + "grad_norm": 1.5297064781188965, + "learning_rate": 2.6615938622854664e-05, + "loss": 1.733, + "step": 2608 + }, + { + "epoch": 1.3463917525773197, + "grad_norm": 1.50015127658844, + "learning_rate": 2.6465404978215857e-05, + "loss": 1.6939, + "step": 2612 + }, + { + "epoch": 1.3484536082474228, + "grad_norm": 1.5101268291473389, + "learning_rate": 2.6315144870133384e-05, + "loss": 1.6588, + "step": 2616 + }, + { + "epoch": 1.3505154639175259, + "grad_norm": 1.506935715675354, + "learning_rate": 2.6165160045041904e-05, + "loss": 1.7295, + "step": 2620 + }, + { + "epoch": 1.352577319587629, + "grad_norm": 1.9392681121826172, + "learning_rate": 2.6015452246176537e-05, + "loss": 1.7802, + "step": 2624 + }, + { + "epoch": 1.354639175257732, + "grad_norm": 1.5243134498596191, + "learning_rate": 2.586602321355257e-05, + "loss": 1.6937, + "step": 2628 + }, + { + "epoch": 1.3567010309278351, + "grad_norm": 1.5403796434402466, + "learning_rate": 2.571687468394536e-05, + "loss": 1.7298, + "step": 2632 + }, + { + "epoch": 1.3587628865979382, + "grad_norm": 1.4517806768417358, + "learning_rate": 2.5568008390869915e-05, + "loss": 1.6566, + "step": 2636 + }, + { + "epoch": 1.3608247422680413, + "grad_norm": 1.4871841669082642, + "learning_rate": 2.5419426064560957e-05, + "loss": 1.7271, + "step": 2640 + }, + { + "epoch": 1.3628865979381444, + "grad_norm": 1.4928479194641113, + "learning_rate": 2.5271129431952727e-05, + "loss": 1.7903, + "step": 2644 + }, + { + "epoch": 1.3649484536082475, + "grad_norm": 1.517552137374878, + "learning_rate": 2.5123120216658903e-05, + "loss": 1.803, + "step": 2648 + }, + { + "epoch": 1.3670103092783505, + "grad_norm": 1.4770328998565674, + "learning_rate": 2.497540013895262e-05, + "loss": 1.7156, + "step": 2652 
+ }, + { + "epoch": 1.3690721649484536, + "grad_norm": 1.518353819847107, + "learning_rate": 2.482797091574642e-05, + "loss": 1.6648, + "step": 2656 + }, + { + "epoch": 1.3711340206185567, + "grad_norm": 1.5015162229537964, + "learning_rate": 2.4680834260572304e-05, + "loss": 1.7288, + "step": 2660 + }, + { + "epoch": 1.3731958762886598, + "grad_norm": 1.5462713241577148, + "learning_rate": 2.4533991883561868e-05, + "loss": 1.8064, + "step": 2664 + }, + { + "epoch": 1.3752577319587629, + "grad_norm": 1.4880030155181885, + "learning_rate": 2.4387445491426327e-05, + "loss": 1.7886, + "step": 2668 + }, + { + "epoch": 1.377319587628866, + "grad_norm": 1.5615061521530151, + "learning_rate": 2.424119678743677e-05, + "loss": 1.643, + "step": 2672 + }, + { + "epoch": 1.379381443298969, + "grad_norm": 1.5558855533599854, + "learning_rate": 2.4095247471404387e-05, + "loss": 1.7335, + "step": 2676 + }, + { + "epoch": 1.3814432989690721, + "grad_norm": 1.5481622219085693, + "learning_rate": 2.3949599239660585e-05, + "loss": 1.7311, + "step": 2680 + }, + { + "epoch": 1.3835051546391752, + "grad_norm": 1.471251130104065, + "learning_rate": 2.380425378503738e-05, + "loss": 1.711, + "step": 2684 + }, + { + "epoch": 1.3855670103092783, + "grad_norm": 1.4730710983276367, + "learning_rate": 2.365921279684767e-05, + "loss": 1.6672, + "step": 2688 + }, + { + "epoch": 1.3876288659793814, + "grad_norm": 1.4712542295455933, + "learning_rate": 2.3514477960865627e-05, + "loss": 1.628, + "step": 2692 + }, + { + "epoch": 1.3896907216494845, + "grad_norm": 1.4011591672897339, + "learning_rate": 2.33700509593071e-05, + "loss": 1.6386, + "step": 2696 + }, + { + "epoch": 1.3917525773195876, + "grad_norm": 1.4686334133148193, + "learning_rate": 2.322593347081005e-05, + "loss": 1.6377, + "step": 2700 + }, + { + "epoch": 1.3938144329896907, + "grad_norm": 1.5750575065612793, + "learning_rate": 2.308212717041505e-05, + "loss": 1.7587, + "step": 2704 + }, + { + "epoch": 1.3958762886597937, + "grad_norm": 1.5493422746658325, + "learning_rate": 2.2938633729545816e-05, + "loss": 1.7323, + "step": 2708 + }, + { + "epoch": 1.3979381443298968, + "grad_norm": 1.4504088163375854, + "learning_rate": 2.279545481598977e-05, + "loss": 1.7473, + "step": 2712 + }, + { + "epoch": 1.4, + "grad_norm": 1.5989000797271729, + "learning_rate": 2.2652592093878666e-05, + "loss": 1.7669, + "step": 2716 + }, + { + "epoch": 1.402061855670103, + "grad_norm": 1.5275282859802246, + "learning_rate": 2.2510047223669233e-05, + "loss": 1.8059, + "step": 2720 + }, + { + "epoch": 1.404123711340206, + "grad_norm": 1.5826573371887207, + "learning_rate": 2.2367821862123944e-05, + "loss": 1.7529, + "step": 2724 + }, + { + "epoch": 1.4061855670103092, + "grad_norm": 1.5159717798233032, + "learning_rate": 2.2225917662291663e-05, + "loss": 1.6796, + "step": 2728 + }, + { + "epoch": 1.4082474226804123, + "grad_norm": 1.537111759185791, + "learning_rate": 2.2084336273488443e-05, + "loss": 1.6562, + "step": 2732 + }, + { + "epoch": 1.4103092783505153, + "grad_norm": 1.5044008493423462, + "learning_rate": 2.1943079341278427e-05, + "loss": 1.6649, + "step": 2736 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 1.5891298055648804, + "learning_rate": 2.180214850745467e-05, + "loss": 1.7776, + "step": 2740 + }, + { + "epoch": 1.4144329896907217, + "grad_norm": 1.5218358039855957, + "learning_rate": 2.166154541002011e-05, + "loss": 1.6714, + "step": 2744 + }, + { + "epoch": 1.4164948453608248, + "grad_norm": 1.467380404472351, + "learning_rate": 2.152127168316843e-05, + 
"loss": 1.6669, + "step": 2748 + }, + { + "epoch": 1.418556701030928, + "grad_norm": 1.4052174091339111, + "learning_rate": 2.1381328957265152e-05, + "loss": 1.5786, + "step": 2752 + }, + { + "epoch": 1.420618556701031, + "grad_norm": 1.505531907081604, + "learning_rate": 2.1241718858828653e-05, + "loss": 1.7733, + "step": 2756 + }, + { + "epoch": 1.422680412371134, + "grad_norm": 1.9099241495132446, + "learning_rate": 2.110244301051128e-05, + "loss": 1.6561, + "step": 2760 + }, + { + "epoch": 1.4247422680412372, + "grad_norm": 1.6298562288284302, + "learning_rate": 2.0963503031080415e-05, + "loss": 1.8327, + "step": 2764 + }, + { + "epoch": 1.4268041237113402, + "grad_norm": 1.4871844053268433, + "learning_rate": 2.0824900535399834e-05, + "loss": 1.726, + "step": 2768 + }, + { + "epoch": 1.4288659793814433, + "grad_norm": 1.4929816722869873, + "learning_rate": 2.068663713441073e-05, + "loss": 1.7089, + "step": 2772 + }, + { + "epoch": 1.4309278350515464, + "grad_norm": 1.5244849920272827, + "learning_rate": 2.054871443511313e-05, + "loss": 1.724, + "step": 2776 + }, + { + "epoch": 1.4329896907216495, + "grad_norm": 1.3867377042770386, + "learning_rate": 2.0411134040547154e-05, + "loss": 1.6098, + "step": 2780 + }, + { + "epoch": 1.4350515463917526, + "grad_norm": 1.552032470703125, + "learning_rate": 2.0273897549774416e-05, + "loss": 1.7457, + "step": 2784 + }, + { + "epoch": 1.4371134020618557, + "grad_norm": 1.4442955255508423, + "learning_rate": 2.0137006557859396e-05, + "loss": 1.7092, + "step": 2788 + }, + { + "epoch": 1.4391752577319588, + "grad_norm": 1.504491925239563, + "learning_rate": 2.000046265585099e-05, + "loss": 1.6646, + "step": 2792 + }, + { + "epoch": 1.4412371134020618, + "grad_norm": 1.5086458921432495, + "learning_rate": 1.986426743076391e-05, + "loss": 1.738, + "step": 2796 + }, + { + "epoch": 1.443298969072165, + "grad_norm": 1.4446042776107788, + "learning_rate": 1.9728422465560275e-05, + "loss": 1.6802, + "step": 2800 + }, + { + "epoch": 1.445360824742268, + "grad_norm": 1.5059337615966797, + "learning_rate": 1.9592929339131243e-05, + "loss": 1.5487, + "step": 2804 + }, + { + "epoch": 1.447422680412371, + "grad_norm": 1.4459666013717651, + "learning_rate": 1.9457789626278657e-05, + "loss": 1.6508, + "step": 2808 + }, + { + "epoch": 1.4494845360824742, + "grad_norm": 1.757739543914795, + "learning_rate": 1.9323004897696673e-05, + "loss": 1.78, + "step": 2812 + }, + { + "epoch": 1.4515463917525773, + "grad_norm": 1.5750166177749634, + "learning_rate": 1.9188576719953633e-05, + "loss": 1.7814, + "step": 2816 + }, + { + "epoch": 1.4536082474226804, + "grad_norm": 1.4686044454574585, + "learning_rate": 1.9054506655473724e-05, + "loss": 1.6798, + "step": 2820 + }, + { + "epoch": 1.4556701030927834, + "grad_norm": 1.4810421466827393, + "learning_rate": 1.892079626251888e-05, + "loss": 1.653, + "step": 2824 + }, + { + "epoch": 1.4577319587628865, + "grad_norm": 1.4591325521469116, + "learning_rate": 1.8787447095170702e-05, + "loss": 1.7042, + "step": 2828 + }, + { + "epoch": 1.4597938144329896, + "grad_norm": 1.4883931875228882, + "learning_rate": 1.8654460703312266e-05, + "loss": 1.7581, + "step": 2832 + }, + { + "epoch": 1.461855670103093, + "grad_norm": 1.4359487295150757, + "learning_rate": 1.852183863261032e-05, + "loss": 1.6085, + "step": 2836 + }, + { + "epoch": 1.463917525773196, + "grad_norm": 1.4442452192306519, + "learning_rate": 1.8389582424497136e-05, + "loss": 1.6518, + "step": 2840 + }, + { + "epoch": 1.465979381443299, + "grad_norm": 1.5850920677185059, + 
"learning_rate": 1.825769361615266e-05, + "loss": 1.7989, + "step": 2844 + }, + { + "epoch": 1.4680412371134022, + "grad_norm": 1.4874738454818726, + "learning_rate": 1.8126173740486664e-05, + "loss": 1.7408, + "step": 2848 + }, + { + "epoch": 1.4701030927835053, + "grad_norm": 1.5400047302246094, + "learning_rate": 1.799502432612089e-05, + "loss": 1.7256, + "step": 2852 + }, + { + "epoch": 1.4721649484536083, + "grad_norm": 1.4548730850219727, + "learning_rate": 1.7864246897371316e-05, + "loss": 1.7029, + "step": 2856 + }, + { + "epoch": 1.4742268041237114, + "grad_norm": 1.4862723350524902, + "learning_rate": 1.773384297423043e-05, + "loss": 1.6857, + "step": 2860 + }, + { + "epoch": 1.4762886597938145, + "grad_norm": 1.4751530885696411, + "learning_rate": 1.760381407234955e-05, + "loss": 1.6826, + "step": 2864 + }, + { + "epoch": 1.4783505154639176, + "grad_norm": 1.4540187120437622, + "learning_rate": 1.7474161703021218e-05, + "loss": 1.6407, + "step": 2868 + }, + { + "epoch": 1.4804123711340207, + "grad_norm": 1.55220365524292, + "learning_rate": 1.734488737316165e-05, + "loss": 1.7539, + "step": 2872 + }, + { + "epoch": 1.4824742268041238, + "grad_norm": 1.4643796682357788, + "learning_rate": 1.721599258529319e-05, + "loss": 1.7111, + "step": 2876 + }, + { + "epoch": 1.4845360824742269, + "grad_norm": 1.6646052598953247, + "learning_rate": 1.708747883752685e-05, + "loss": 1.7609, + "step": 2880 + }, + { + "epoch": 1.48659793814433, + "grad_norm": 1.3996634483337402, + "learning_rate": 1.695934762354497e-05, + "loss": 1.6767, + "step": 2884 + }, + { + "epoch": 1.488659793814433, + "grad_norm": 1.4679023027420044, + "learning_rate": 1.6831600432583727e-05, + "loss": 1.6345, + "step": 2888 + }, + { + "epoch": 1.4907216494845361, + "grad_norm": 1.4191251993179321, + "learning_rate": 1.6704238749415957e-05, + "loss": 1.689, + "step": 2892 + }, + { + "epoch": 1.4927835051546392, + "grad_norm": 1.6434361934661865, + "learning_rate": 1.6577264054333768e-05, + "loss": 1.7641, + "step": 2896 + }, + { + "epoch": 1.4948453608247423, + "grad_norm": 1.5503323078155518, + "learning_rate": 1.6450677823131472e-05, + "loss": 1.6991, + "step": 2900 + }, + { + "epoch": 1.4969072164948454, + "grad_norm": 1.7964752912521362, + "learning_rate": 1.6324481527088302e-05, + "loss": 1.707, + "step": 2904 + }, + { + "epoch": 1.4989690721649485, + "grad_norm": 1.470279574394226, + "learning_rate": 1.6198676632951464e-05, + "loss": 1.7253, + "step": 2908 + }, + { + "epoch": 1.5010309278350515, + "grad_norm": 1.5459221601486206, + "learning_rate": 1.6073264602918918e-05, + "loss": 1.6298, + "step": 2912 + }, + { + "epoch": 1.5030927835051546, + "grad_norm": 1.4339414834976196, + "learning_rate": 1.5948246894622497e-05, + "loss": 1.6155, + "step": 2916 + }, + { + "epoch": 1.5051546391752577, + "grad_norm": 1.5148845911026, + "learning_rate": 1.582362496111094e-05, + "loss": 1.7269, + "step": 2920 + }, + { + "epoch": 1.5072164948453608, + "grad_norm": 1.8622448444366455, + "learning_rate": 1.569940025083297e-05, + "loss": 1.6962, + "step": 2924 + }, + { + "epoch": 1.5092783505154639, + "grad_norm": 1.5076379776000977, + "learning_rate": 1.5575574207620517e-05, + "loss": 1.6369, + "step": 2928 + }, + { + "epoch": 1.511340206185567, + "grad_norm": 1.511597752571106, + "learning_rate": 1.5452148270671894e-05, + "loss": 1.7466, + "step": 2932 + }, + { + "epoch": 1.51340206185567, + "grad_norm": 1.4678515195846558, + "learning_rate": 1.5329123874535085e-05, + "loss": 1.7594, + "step": 2936 + }, + { + "epoch": 
1.5154639175257731, + "grad_norm": 1.545318841934204, + "learning_rate": 1.5206502449091054e-05, + "loss": 1.7206, + "step": 2940 + }, + { + "epoch": 1.5175257731958762, + "grad_norm": 1.458013653755188, + "learning_rate": 1.5084285419537153e-05, + "loss": 1.7171, + "step": 2944 + }, + { + "epoch": 1.5195876288659793, + "grad_norm": 1.4504350423812866, + "learning_rate": 1.4962474206370541e-05, + "loss": 1.7554, + "step": 2948 + }, + { + "epoch": 1.5216494845360824, + "grad_norm": 1.4534108638763428, + "learning_rate": 1.4841070225371673e-05, + "loss": 1.6591, + "step": 2952 + }, + { + "epoch": 1.5237113402061855, + "grad_norm": 1.5048794746398926, + "learning_rate": 1.4720074887587887e-05, + "loss": 1.6382, + "step": 2956 + }, + { + "epoch": 1.5257731958762886, + "grad_norm": 1.566748857498169, + "learning_rate": 1.4599489599316896e-05, + "loss": 1.73, + "step": 2960 + }, + { + "epoch": 1.5278350515463917, + "grad_norm": 1.5701326131820679, + "learning_rate": 1.4479315762090562e-05, + "loss": 1.694, + "step": 2964 + }, + { + "epoch": 1.5298969072164947, + "grad_norm": 1.5283926725387573, + "learning_rate": 1.4359554772658552e-05, + "loss": 1.6844, + "step": 2968 + }, + { + "epoch": 1.5319587628865978, + "grad_norm": 1.5629454851150513, + "learning_rate": 1.4240208022972074e-05, + "loss": 1.7401, + "step": 2972 + }, + { + "epoch": 1.534020618556701, + "grad_norm": 1.537468671798706, + "learning_rate": 1.4121276900167796e-05, + "loss": 1.7075, + "step": 2976 + }, + { + "epoch": 1.536082474226804, + "grad_norm": 1.488130807876587, + "learning_rate": 1.400276278655162e-05, + "loss": 1.7203, + "step": 2980 + }, + { + "epoch": 1.538144329896907, + "grad_norm": 1.385069727897644, + "learning_rate": 1.3884667059582656e-05, + "loss": 1.6419, + "step": 2984 + }, + { + "epoch": 1.5402061855670102, + "grad_norm": 1.4615826606750488, + "learning_rate": 1.3766991091857246e-05, + "loss": 1.7021, + "step": 2988 + }, + { + "epoch": 1.5422680412371133, + "grad_norm": 1.5640617609024048, + "learning_rate": 1.3649736251092898e-05, + "loss": 1.7353, + "step": 2992 + }, + { + "epoch": 1.5443298969072163, + "grad_norm": 1.561741590499878, + "learning_rate": 1.353290390011258e-05, + "loss": 1.6911, + "step": 2996 + }, + { + "epoch": 1.5463917525773194, + "grad_norm": 1.5131316184997559, + "learning_rate": 1.3416495396828694e-05, + "loss": 1.6645, + "step": 3000 + }, + { + "epoch": 1.5484536082474227, + "grad_norm": 1.5184184312820435, + "learning_rate": 1.3300512094227391e-05, + "loss": 1.6424, + "step": 3004 + }, + { + "epoch": 1.5505154639175258, + "grad_norm": 1.4697967767715454, + "learning_rate": 1.3184955340352828e-05, + "loss": 1.7499, + "step": 3008 + }, + { + "epoch": 1.552577319587629, + "grad_norm": 1.62870192527771, + "learning_rate": 1.3069826478291486e-05, + "loss": 1.676, + "step": 3012 + }, + { + "epoch": 1.554639175257732, + "grad_norm": 1.531766653060913, + "learning_rate": 1.2955126846156573e-05, + "loss": 1.6913, + "step": 3016 + }, + { + "epoch": 1.556701030927835, + "grad_norm": 2.1785354614257812, + "learning_rate": 1.2840857777072468e-05, + "loss": 1.6899, + "step": 3020 + }, + { + "epoch": 1.5587628865979382, + "grad_norm": 1.403219223022461, + "learning_rate": 1.2727020599159235e-05, + "loss": 1.6918, + "step": 3024 + }, + { + "epoch": 1.5608247422680412, + "grad_norm": 1.5640937089920044, + "learning_rate": 1.2613616635517161e-05, + "loss": 1.6787, + "step": 3028 + }, + { + "epoch": 1.5628865979381443, + "grad_norm": 1.4356787204742432, + "learning_rate": 1.2500647204211402e-05, + 
"loss": 1.6923, + "step": 3032 + }, + { + "epoch": 1.5649484536082474, + "grad_norm": 1.4457017183303833, + "learning_rate": 1.238811361825667e-05, + "loss": 1.6461, + "step": 3036 + }, + { + "epoch": 1.5670103092783505, + "grad_norm": 2.227945566177368, + "learning_rate": 1.2276017185601912e-05, + "loss": 1.7129, + "step": 3040 + }, + { + "epoch": 1.5690721649484536, + "grad_norm": 1.4856458902359009, + "learning_rate": 1.2164359209115234e-05, + "loss": 1.6834, + "step": 3044 + }, + { + "epoch": 1.5711340206185567, + "grad_norm": 1.5406701564788818, + "learning_rate": 1.2053140986568612e-05, + "loss": 1.6185, + "step": 3048 + }, + { + "epoch": 1.5731958762886598, + "grad_norm": 1.554215908050537, + "learning_rate": 1.1942363810622909e-05, + "loss": 1.6404, + "step": 3052 + }, + { + "epoch": 1.5752577319587628, + "grad_norm": 1.6283259391784668, + "learning_rate": 1.1832028968812774e-05, + "loss": 1.6818, + "step": 3056 + }, + { + "epoch": 1.577319587628866, + "grad_norm": 1.4719226360321045, + "learning_rate": 1.172213774353177e-05, + "loss": 1.6619, + "step": 3060 + }, + { + "epoch": 1.5793814432989692, + "grad_norm": 1.5907206535339355, + "learning_rate": 1.1612691412017373e-05, + "loss": 1.6963, + "step": 3064 + }, + { + "epoch": 1.5814432989690723, + "grad_norm": 1.426649570465088, + "learning_rate": 1.1503691246336234e-05, + "loss": 1.7719, + "step": 3068 + }, + { + "epoch": 1.5835051546391754, + "grad_norm": 1.5006170272827148, + "learning_rate": 1.1395138513369286e-05, + "loss": 1.632, + "step": 3072 + }, + { + "epoch": 1.5855670103092785, + "grad_norm": 1.4917073249816895, + "learning_rate": 1.1287034474797065e-05, + "loss": 1.7659, + "step": 3076 + }, + { + "epoch": 1.5876288659793816, + "grad_norm": 1.4859620332717896, + "learning_rate": 1.117938038708507e-05, + "loss": 1.7446, + "step": 3080 + }, + { + "epoch": 1.5896907216494847, + "grad_norm": 1.4105993509292603, + "learning_rate": 1.1072177501469128e-05, + "loss": 1.6825, + "step": 3084 + }, + { + "epoch": 1.5917525773195877, + "grad_norm": 1.499107837677002, + "learning_rate": 1.0965427063940853e-05, + "loss": 1.7547, + "step": 3088 + }, + { + "epoch": 1.5938144329896908, + "grad_norm": 1.5192339420318604, + "learning_rate": 1.0859130315233174e-05, + "loss": 1.6612, + "step": 3092 + }, + { + "epoch": 1.595876288659794, + "grad_norm": 1.4649794101715088, + "learning_rate": 1.0753288490805918e-05, + "loss": 1.6028, + "step": 3096 + }, + { + "epoch": 1.597938144329897, + "grad_norm": 1.538676381111145, + "learning_rate": 1.064790282083144e-05, + "loss": 1.5495, + "step": 3100 + }, + { + "epoch": 1.6, + "grad_norm": 1.5038882493972778, + "learning_rate": 1.0542974530180327e-05, + "loss": 1.7314, + "step": 3104 + }, + { + "epoch": 1.6020618556701032, + "grad_norm": 1.5052063465118408, + "learning_rate": 1.0438504838407165e-05, + "loss": 1.6695, + "step": 3108 + }, + { + "epoch": 1.6041237113402063, + "grad_norm": 1.5474481582641602, + "learning_rate": 1.0334494959736347e-05, + "loss": 1.6846, + "step": 3112 + }, + { + "epoch": 1.6061855670103093, + "grad_norm": 1.4491521120071411, + "learning_rate": 1.0230946103048022e-05, + "loss": 1.601, + "step": 3116 + }, + { + "epoch": 1.6082474226804124, + "grad_norm": 1.4010319709777832, + "learning_rate": 1.012785947186397e-05, + "loss": 1.701, + "step": 3120 + }, + { + "epoch": 1.6103092783505155, + "grad_norm": 1.4654611349105835, + "learning_rate": 1.002523626433361e-05, + "loss": 1.6416, + "step": 3124 + }, + { + "epoch": 1.6123711340206186, + "grad_norm": 1.5731595754623413, + 
"learning_rate": 9.92307767322016e-06, + "loss": 1.7372, + "step": 3128 + }, + { + "epoch": 1.6144329896907217, + "grad_norm": 1.546984076499939, + "learning_rate": 9.821384885886676e-06, + "loss": 1.7713, + "step": 3132 + }, + { + "epoch": 1.6164948453608248, + "grad_norm": 1.4809237718582153, + "learning_rate": 9.720159084282355e-06, + "loss": 1.6343, + "step": 3136 + }, + { + "epoch": 1.6185567010309279, + "grad_norm": 1.392882227897644, + "learning_rate": 9.619401444928683e-06, + "loss": 1.6566, + "step": 3140 + }, + { + "epoch": 1.620618556701031, + "grad_norm": 1.4652302265167236, + "learning_rate": 9.519113138905838e-06, + "loss": 1.7887, + "step": 3144 + }, + { + "epoch": 1.622680412371134, + "grad_norm": 1.5458775758743286, + "learning_rate": 9.419295331839061e-06, + "loss": 1.731, + "step": 3148 + }, + { + "epoch": 1.6247422680412371, + "grad_norm": 1.5418813228607178, + "learning_rate": 9.319949183885108e-06, + "loss": 1.6952, + "step": 3152 + }, + { + "epoch": 1.6268041237113402, + "grad_norm": 1.5179048776626587, + "learning_rate": 9.221075849718713e-06, + "loss": 1.7358, + "step": 3156 + }, + { + "epoch": 1.6288659793814433, + "grad_norm": 1.53363037109375, + "learning_rate": 9.1226764785193e-06, + "loss": 1.6322, + "step": 3160 + }, + { + "epoch": 1.6309278350515464, + "grad_norm": 1.4504185914993286, + "learning_rate": 9.024752213957482e-06, + "loss": 1.6023, + "step": 3164 + }, + { + "epoch": 1.6329896907216495, + "grad_norm": 1.5878132581710815, + "learning_rate": 8.927304194181857e-06, + "loss": 1.7398, + "step": 3168 + }, + { + "epoch": 1.6350515463917525, + "grad_norm": 1.533153772354126, + "learning_rate": 8.83033355180573e-06, + "loss": 1.766, + "step": 3172 + }, + { + "epoch": 1.6371134020618556, + "grad_norm": 1.4958655834197998, + "learning_rate": 8.73384141389399e-06, + "loss": 1.6772, + "step": 3176 + }, + { + "epoch": 1.6391752577319587, + "grad_norm": 1.4422835111618042, + "learning_rate": 8.63782890194998e-06, + "loss": 1.6434, + "step": 3180 + }, + { + "epoch": 1.6412371134020618, + "grad_norm": 1.6030116081237793, + "learning_rate": 8.5422971319025e-06, + "loss": 1.5916, + "step": 3184 + }, + { + "epoch": 1.6432989690721649, + "grad_norm": 1.4988054037094116, + "learning_rate": 8.447247214092768e-06, + "loss": 1.7199, + "step": 3188 + }, + { + "epoch": 1.645360824742268, + "grad_norm": 1.5074199438095093, + "learning_rate": 8.35268025326158e-06, + "loss": 1.6741, + "step": 3192 + }, + { + "epoch": 1.647422680412371, + "grad_norm": 1.4830085039138794, + "learning_rate": 8.25859734853645e-06, + "loss": 1.637, + "step": 3196 + }, + { + "epoch": 1.6494845360824741, + "grad_norm": 1.6016579866409302, + "learning_rate": 8.164999593418826e-06, + "loss": 1.6371, + "step": 3200 + }, + { + "epoch": 1.6515463917525772, + "grad_norm": 1.4918121099472046, + "learning_rate": 8.071888075771378e-06, + "loss": 1.6303, + "step": 3204 + }, + { + "epoch": 1.6536082474226803, + "grad_norm": 1.42795991897583, + "learning_rate": 7.979263877805394e-06, + "loss": 1.6543, + "step": 3208 + }, + { + "epoch": 1.6556701030927834, + "grad_norm": 1.5172784328460693, + "learning_rate": 7.887128076068134e-06, + "loss": 1.71, + "step": 3212 + }, + { + "epoch": 1.6577319587628865, + "grad_norm": 1.493607521057129, + "learning_rate": 7.795481741430393e-06, + "loss": 1.6678, + "step": 3216 + }, + { + "epoch": 1.6597938144329896, + "grad_norm": 1.5664525032043457, + "learning_rate": 7.704325939073958e-06, + "loss": 1.7857, + "step": 3220 + }, + { + "epoch": 1.6618556701030927, + "grad_norm": 
1.5040996074676514, + "learning_rate": 7.613661728479321e-06, + "loss": 1.8031, + "step": 3224 + }, + { + "epoch": 1.6639175257731957, + "grad_norm": 1.462308645248413, + "learning_rate": 7.52349016341335e-06, + "loss": 1.7017, + "step": 3228 + }, + { + "epoch": 1.6659793814432988, + "grad_norm": 1.4423810243606567, + "learning_rate": 7.433812291916992e-06, + "loss": 1.7352, + "step": 3232 + }, + { + "epoch": 1.668041237113402, + "grad_norm": 1.4877485036849976, + "learning_rate": 7.344629156293114e-06, + "loss": 1.6151, + "step": 3236 + }, + { + "epoch": 1.670103092783505, + "grad_norm": 1.4941354990005493, + "learning_rate": 7.255941793094423e-06, + "loss": 1.6128, + "step": 3240 + }, + { + "epoch": 1.6721649484536083, + "grad_norm": 1.5927449464797974, + "learning_rate": 7.167751233111358e-06, + "loss": 1.706, + "step": 3244 + }, + { + "epoch": 1.6742268041237114, + "grad_norm": 1.6141644716262817, + "learning_rate": 7.080058501360171e-06, + "loss": 1.6318, + "step": 3248 + }, + { + "epoch": 1.6762886597938145, + "grad_norm": 1.5904043912887573, + "learning_rate": 6.9928646170709656e-06, + "loss": 1.6387, + "step": 3252 + }, + { + "epoch": 1.6783505154639176, + "grad_norm": 1.4306963682174683, + "learning_rate": 6.906170593675876e-06, + "loss": 1.6231, + "step": 3256 + }, + { + "epoch": 1.6804123711340206, + "grad_norm": 1.6362873315811157, + "learning_rate": 6.819977438797281e-06, + "loss": 1.7401, + "step": 3260 + }, + { + "epoch": 1.6824742268041237, + "grad_norm": 1.4902920722961426, + "learning_rate": 6.734286154236091e-06, + "loss": 1.676, + "step": 3264 + }, + { + "epoch": 1.6845360824742268, + "grad_norm": 1.499423623085022, + "learning_rate": 6.649097735960108e-06, + "loss": 1.643, + "step": 3268 + }, + { + "epoch": 1.68659793814433, + "grad_norm": 1.4293397665023804, + "learning_rate": 6.564413174092443e-06, + "loss": 1.6775, + "step": 3272 + }, + { + "epoch": 1.688659793814433, + "grad_norm": 1.5098601579666138, + "learning_rate": 6.480233452900036e-06, + "loss": 1.5725, + "step": 3276 + }, + { + "epoch": 1.690721649484536, + "grad_norm": 1.6427370309829712, + "learning_rate": 6.396559550782177e-06, + "loss": 1.7431, + "step": 3280 + }, + { + "epoch": 1.6927835051546392, + "grad_norm": 1.4890693426132202, + "learning_rate": 6.313392440259136e-06, + "loss": 1.6801, + "step": 3284 + }, + { + "epoch": 1.6948453608247422, + "grad_norm": 1.4871695041656494, + "learning_rate": 6.230733087960888e-06, + "loss": 1.7111, + "step": 3288 + }, + { + "epoch": 1.6969072164948453, + "grad_norm": 1.6153513193130493, + "learning_rate": 6.14858245461587e-06, + "loss": 1.7601, + "step": 3292 + }, + { + "epoch": 1.6989690721649484, + "grad_norm": 1.5377050638198853, + "learning_rate": 6.066941495039796e-06, + "loss": 1.6788, + "step": 3296 + }, + { + "epoch": 1.7010309278350515, + "grad_norm": 1.4935017824172974, + "learning_rate": 5.985811158124599e-06, + "loss": 1.7559, + "step": 3300 + }, + { + "epoch": 1.7030927835051546, + "grad_norm": 1.4736721515655518, + "learning_rate": 5.905192386827352e-06, + "loss": 1.731, + "step": 3304 + }, + { + "epoch": 1.705154639175258, + "grad_norm": 1.5431056022644043, + "learning_rate": 5.825086118159329e-06, + "loss": 1.6675, + "step": 3308 + }, + { + "epoch": 1.707216494845361, + "grad_norm": 1.7205324172973633, + "learning_rate": 5.745493283175146e-06, + "loss": 1.6615, + "step": 3312 + }, + { + "epoch": 1.709278350515464, + "grad_norm": 1.5387599468231201, + "learning_rate": 5.666414806961856e-06, + "loss": 1.7063, + "step": 3316 + }, + { + "epoch": 
1.7113402061855671, + "grad_norm": 1.43263578414917, + "learning_rate": 5.58785160862832e-06, + "loss": 1.6548, + "step": 3320 + }, + { + "epoch": 1.7134020618556702, + "grad_norm": 1.5297513008117676, + "learning_rate": 5.50980460129441e-06, + "loss": 1.6402, + "step": 3324 + }, + { + "epoch": 1.7154639175257733, + "grad_norm": 1.521336555480957, + "learning_rate": 5.432274692080464e-06, + "loss": 1.6973, + "step": 3328 + }, + { + "epoch": 1.7175257731958764, + "grad_norm": 1.4514061212539673, + "learning_rate": 5.355262782096709e-06, + "loss": 1.6918, + "step": 3332 + }, + { + "epoch": 1.7195876288659795, + "grad_norm": 1.48029363155365, + "learning_rate": 5.278769766432801e-06, + "loss": 1.7185, + "step": 3336 + }, + { + "epoch": 1.7216494845360826, + "grad_norm": 1.4712656736373901, + "learning_rate": 5.202796534147436e-06, + "loss": 1.6631, + "step": 3340 + }, + { + "epoch": 1.7237113402061857, + "grad_norm": 1.4765143394470215, + "learning_rate": 5.127343968257969e-06, + "loss": 1.6347, + "step": 3344 + }, + { + "epoch": 1.7257731958762887, + "grad_norm": 1.5265347957611084, + "learning_rate": 5.05241294573024e-06, + "loss": 1.7666, + "step": 3348 + }, + { + "epoch": 1.7278350515463918, + "grad_norm": 1.5441725254058838, + "learning_rate": 4.978004337468256e-06, + "loss": 1.6875, + "step": 3352 + }, + { + "epoch": 1.729896907216495, + "grad_norm": 1.4827260971069336, + "learning_rate": 4.904119008304175e-06, + "loss": 1.63, + "step": 3356 + }, + { + "epoch": 1.731958762886598, + "grad_norm": 1.488405466079712, + "learning_rate": 4.830757816988219e-06, + "loss": 1.6433, + "step": 3360 + }, + { + "epoch": 1.734020618556701, + "grad_norm": 1.5134429931640625, + "learning_rate": 4.757921616178662e-06, + "loss": 1.5377, + "step": 3364 + }, + { + "epoch": 1.7360824742268042, + "grad_norm": 1.469539999961853, + "learning_rate": 4.685611252431998e-06, + "loss": 1.6067, + "step": 3368 + }, + { + "epoch": 1.7381443298969073, + "grad_norm": 1.5114245414733887, + "learning_rate": 4.6138275661930075e-06, + "loss": 1.6444, + "step": 3372 + }, + { + "epoch": 1.7402061855670103, + "grad_norm": 1.5059422254562378, + "learning_rate": 4.54257139178505e-06, + "loss": 1.7052, + "step": 3376 + }, + { + "epoch": 1.7422680412371134, + "grad_norm": 1.5071382522583008, + "learning_rate": 4.4718435574003514e-06, + "loss": 1.7082, + "step": 3380 + }, + { + "epoch": 1.7443298969072165, + "grad_norm": 1.500165581703186, + "learning_rate": 4.401644885090356e-06, + "loss": 1.6129, + "step": 3384 + }, + { + "epoch": 1.7463917525773196, + "grad_norm": 1.5666110515594482, + "learning_rate": 4.331976190756226e-06, + "loss": 1.7796, + "step": 3388 + }, + { + "epoch": 1.7484536082474227, + "grad_norm": 1.4586031436920166, + "learning_rate": 4.2628382841393145e-06, + "loss": 1.6676, + "step": 3392 + }, + { + "epoch": 1.7505154639175258, + "grad_norm": 1.5573093891143799, + "learning_rate": 4.194231968811757e-06, + "loss": 1.7022, + "step": 3396 + }, + { + "epoch": 1.7525773195876289, + "grad_norm": 1.5714795589447021, + "learning_rate": 4.126158042167139e-06, + "loss": 1.7316, + "step": 3400 + }, + { + "epoch": 1.754639175257732, + "grad_norm": 1.4940961599349976, + "learning_rate": 4.058617295411243e-06, + "loss": 1.6096, + "step": 3404 + }, + { + "epoch": 1.756701030927835, + "grad_norm": 1.6426243782043457, + "learning_rate": 3.991610513552829e-06, + "loss": 1.6747, + "step": 3408 + }, + { + "epoch": 1.7587628865979381, + "grad_norm": 1.5700215101242065, + "learning_rate": 3.925138475394513e-06, + "loss": 1.7568, + 
"step": 3412 + }, + { + "epoch": 1.7608247422680412, + "grad_norm": 7.2929534912109375, + "learning_rate": 3.859201953523739e-06, + "loss": 1.8217, + "step": 3416 + }, + { + "epoch": 1.7628865979381443, + "grad_norm": 1.4354559183120728, + "learning_rate": 3.7938017143037685e-06, + "loss": 1.6345, + "step": 3420 + }, + { + "epoch": 1.7649484536082474, + "grad_norm": 1.439919114112854, + "learning_rate": 3.728938517864794e-06, + "loss": 1.6007, + "step": 3424 + }, + { + "epoch": 1.7670103092783505, + "grad_norm": 1.496739387512207, + "learning_rate": 3.6646131180951006e-06, + "loss": 1.7252, + "step": 3428 + }, + { + "epoch": 1.7690721649484535, + "grad_norm": 1.4635599851608276, + "learning_rate": 3.6008262626322843e-06, + "loss": 1.647, + "step": 3432 + }, + { + "epoch": 1.7711340206185566, + "grad_norm": 1.4412612915039062, + "learning_rate": 3.5375786928546097e-06, + "loss": 1.6424, + "step": 3436 + }, + { + "epoch": 1.7731958762886597, + "grad_norm": 1.5021198987960815, + "learning_rate": 3.474871143872338e-06, + "loss": 1.6989, + "step": 3440 + }, + { + "epoch": 1.7752577319587628, + "grad_norm": 1.5215860605239868, + "learning_rate": 3.412704344519213e-06, + "loss": 1.59, + "step": 3444 + }, + { + "epoch": 1.7773195876288659, + "grad_norm": 1.4903206825256348, + "learning_rate": 3.3510790173439775e-06, + "loss": 1.6253, + "step": 3448 + }, + { + "epoch": 1.779381443298969, + "grad_norm": 1.4466136693954468, + "learning_rate": 3.2899958786019926e-06, + "loss": 1.6756, + "step": 3452 + }, + { + "epoch": 1.781443298969072, + "grad_norm": 1.7129707336425781, + "learning_rate": 3.229455638246898e-06, + "loss": 1.6763, + "step": 3456 + }, + { + "epoch": 1.7835051546391751, + "grad_norm": 1.4847286939620972, + "learning_rate": 3.16945899992237e-06, + "loss": 1.7183, + "step": 3460 + }, + { + "epoch": 1.7855670103092782, + "grad_norm": 1.4779984951019287, + "learning_rate": 3.1100066609539347e-06, + "loss": 1.7174, + "step": 3464 + }, + { + "epoch": 1.7876288659793813, + "grad_norm": 1.5085519552230835, + "learning_rate": 3.0510993123408604e-06, + "loss": 1.6961, + "step": 3468 + }, + { + "epoch": 1.7896907216494844, + "grad_norm": 1.5164207220077515, + "learning_rate": 2.992737638748144e-06, + "loss": 1.5474, + "step": 3472 + }, + { + "epoch": 1.7917525773195875, + "grad_norm": 1.5061603784561157, + "learning_rate": 2.9349223184985288e-06, + "loss": 1.6408, + "step": 3476 + }, + { + "epoch": 1.7938144329896906, + "grad_norm": 1.4625122547149658, + "learning_rate": 2.8776540235646463e-06, + "loss": 1.6354, + "step": 3480 + }, + { + "epoch": 1.7958762886597937, + "grad_norm": 1.4752477407455444, + "learning_rate": 2.8209334195611836e-06, + "loss": 1.7005, + "step": 3484 + }, + { + "epoch": 1.797938144329897, + "grad_norm": 1.5524309873580933, + "learning_rate": 2.764761165737162e-06, + "loss": 1.6028, + "step": 3488 + }, + { + "epoch": 1.8, + "grad_norm": 1.4997243881225586, + "learning_rate": 2.7091379149682685e-06, + "loss": 1.6429, + "step": 3492 + }, + { + "epoch": 1.8020618556701031, + "grad_norm": 1.5430649518966675, + "learning_rate": 2.654064313749266e-06, + "loss": 1.7119, + "step": 3496 + }, + { + "epoch": 1.8041237113402062, + "grad_norm": 1.5093491077423096, + "learning_rate": 2.5995410021864787e-06, + "loss": 1.6618, + "step": 3500 + }, + { + "epoch": 1.8061855670103093, + "grad_norm": 1.492345929145813, + "learning_rate": 2.545568613990362e-06, + "loss": 1.6848, + "step": 3504 + }, + { + "epoch": 1.8082474226804124, + "grad_norm": 1.542244553565979, + "learning_rate": 
2.4921477764681457e-06, + "loss": 1.7365, + "step": 3508 + }, + { + "epoch": 1.8103092783505155, + "grad_norm": 1.4747071266174316, + "learning_rate": 2.4392791105164846e-06, + "loss": 1.6802, + "step": 3512 + }, + { + "epoch": 1.8123711340206186, + "grad_norm": 1.4944431781768799, + "learning_rate": 2.386963230614325e-06, + "loss": 1.6634, + "step": 3516 + }, + { + "epoch": 1.8144329896907216, + "grad_norm": 1.4797273874282837, + "learning_rate": 2.335200744815708e-06, + "loss": 1.7111, + "step": 3520 + }, + { + "epoch": 1.8164948453608247, + "grad_norm": 1.4942731857299805, + "learning_rate": 2.283992254742706e-06, + "loss": 1.7994, + "step": 3524 + }, + { + "epoch": 1.8185567010309278, + "grad_norm": 1.5504451990127563, + "learning_rate": 2.2333383555784737e-06, + "loss": 1.7724, + "step": 3528 + }, + { + "epoch": 1.820618556701031, + "grad_norm": 1.5781549215316772, + "learning_rate": 2.1832396360602693e-06, + "loss": 1.695, + "step": 3532 + }, + { + "epoch": 1.822680412371134, + "grad_norm": 1.7646689414978027, + "learning_rate": 2.1336966784726465e-06, + "loss": 1.7736, + "step": 3536 + }, + { + "epoch": 1.824742268041237, + "grad_norm": 1.4293087720870972, + "learning_rate": 2.0847100586406877e-06, + "loss": 1.608, + "step": 3540 + }, + { + "epoch": 1.8268041237113402, + "grad_norm": 1.5219712257385254, + "learning_rate": 2.0362803459232914e-06, + "loss": 1.6738, + "step": 3544 + }, + { + "epoch": 1.8288659793814435, + "grad_norm": 1.4095622301101685, + "learning_rate": 1.9884081032065737e-06, + "loss": 1.6249, + "step": 3548 + }, + { + "epoch": 1.8309278350515465, + "grad_norm": 3.822401285171509, + "learning_rate": 1.9410938868973327e-06, + "loss": 1.7157, + "step": 3552 + }, + { + "epoch": 1.8329896907216496, + "grad_norm": 1.7696354389190674, + "learning_rate": 1.894338246916555e-06, + "loss": 1.6029, + "step": 3556 + }, + { + "epoch": 1.8350515463917527, + "grad_norm": 1.427482008934021, + "learning_rate": 1.848141726693048e-06, + "loss": 1.5971, + "step": 3560 + }, + { + "epoch": 1.8371134020618558, + "grad_norm": 1.6495184898376465, + "learning_rate": 1.802504863157095e-06, + "loss": 1.6344, + "step": 3564 + }, + { + "epoch": 1.839175257731959, + "grad_norm": 1.4954864978790283, + "learning_rate": 1.7574281867342502e-06, + "loss": 1.7968, + "step": 3568 + }, + { + "epoch": 1.841237113402062, + "grad_norm": 1.578550934791565, + "learning_rate": 1.7129122213391523e-06, + "loss": 1.5697, + "step": 3572 + }, + { + "epoch": 1.843298969072165, + "grad_norm": 1.8335249423980713, + "learning_rate": 1.6689574843694433e-06, + "loss": 1.6689, + "step": 3576 + }, + { + "epoch": 1.8453608247422681, + "grad_norm": 1.4485599994659424, + "learning_rate": 1.6255644866997378e-06, + "loss": 1.606, + "step": 3580 + }, + { + "epoch": 1.8474226804123712, + "grad_norm": 1.5148478746414185, + "learning_rate": 1.5827337326757174e-06, + "loss": 1.7588, + "step": 3584 + }, + { + "epoch": 1.8494845360824743, + "grad_norm": 1.4764747619628906, + "learning_rate": 1.5404657201082363e-06, + "loss": 1.6629, + "step": 3588 + }, + { + "epoch": 1.8515463917525774, + "grad_norm": 1.4880666732788086, + "learning_rate": 1.4987609402675573e-06, + "loss": 1.6173, + "step": 3592 + }, + { + "epoch": 1.8536082474226805, + "grad_norm": 1.5709367990493774, + "learning_rate": 1.4576198778776195e-06, + "loss": 1.6901, + "step": 3596 + }, + { + "epoch": 1.8556701030927836, + "grad_norm": 1.4975337982177734, + "learning_rate": 1.4170430111104472e-06, + "loss": 1.7681, + "step": 3600 + }, + { + "epoch": 1.8577319587628867, + 
"grad_norm": 1.6037880182266235, + "learning_rate": 1.377030811580543e-06, + "loss": 1.7217, + "step": 3604 + }, + { + "epoch": 1.8597938144329897, + "grad_norm": 1.5378186702728271, + "learning_rate": 1.3375837443394157e-06, + "loss": 1.7365, + "step": 3608 + }, + { + "epoch": 1.8618556701030928, + "grad_norm": 1.527868628501892, + "learning_rate": 1.298702267870211e-06, + "loss": 1.6275, + "step": 3612 + }, + { + "epoch": 1.863917525773196, + "grad_norm": 1.530971646308899, + "learning_rate": 1.2603868340823444e-06, + "loss": 1.7038, + "step": 3616 + }, + { + "epoch": 1.865979381443299, + "grad_norm": 1.5488992929458618, + "learning_rate": 1.2226378883062716e-06, + "loss": 1.6488, + "step": 3620 + }, + { + "epoch": 1.868041237113402, + "grad_norm": 1.4633854627609253, + "learning_rate": 1.185455869288299e-06, + "loss": 1.7151, + "step": 3624 + }, + { + "epoch": 1.8701030927835052, + "grad_norm": 1.463667631149292, + "learning_rate": 1.1488412091854917e-06, + "loss": 1.7143, + "step": 3628 + }, + { + "epoch": 1.8721649484536083, + "grad_norm": 1.5287612676620483, + "learning_rate": 1.112794333560635e-06, + "loss": 1.7644, + "step": 3632 + }, + { + "epoch": 1.8742268041237113, + "grad_norm": 1.5180222988128662, + "learning_rate": 1.0773156613773206e-06, + "loss": 1.6957, + "step": 3636 + }, + { + "epoch": 1.8762886597938144, + "grad_norm": 1.490254282951355, + "learning_rate": 1.0424056049950392e-06, + "loss": 1.6732, + "step": 3640 + }, + { + "epoch": 1.8783505154639175, + "grad_norm": 1.6682502031326294, + "learning_rate": 1.008064570164413e-06, + "loss": 1.6909, + "step": 3644 + }, + { + "epoch": 1.8804123711340206, + "grad_norm": 1.5792622566223145, + "learning_rate": 9.742929560224879e-07, + "loss": 1.7305, + "step": 3648 + }, + { + "epoch": 1.8824742268041237, + "grad_norm": 1.510890245437622, + "learning_rate": 9.410911550880475e-07, + "loss": 1.5359, + "step": 3652 + }, + { + "epoch": 1.8845360824742268, + "grad_norm": 1.5118955373764038, + "learning_rate": 9.084595532571127e-07, + "loss": 1.7005, + "step": 3656 + }, + { + "epoch": 1.8865979381443299, + "grad_norm": 1.5225536823272705, + "learning_rate": 8.763985297984167e-07, + "loss": 1.699, + "step": 3660 + }, + { + "epoch": 1.888659793814433, + "grad_norm": 1.539660930633545, + "learning_rate": 8.449084573489918e-07, + "loss": 1.7721, + "step": 3664 + }, + { + "epoch": 1.890721649484536, + "grad_norm": 1.3900866508483887, + "learning_rate": 8.13989701909873e-07, + "loss": 1.53, + "step": 3668 + }, + { + "epoch": 1.8927835051546391, + "grad_norm": 1.519671082496643, + "learning_rate": 7.836426228418292e-07, + "loss": 1.6317, + "step": 3672 + }, + { + "epoch": 1.8948453608247422, + "grad_norm": 1.4589074850082397, + "learning_rate": 7.538675728611555e-07, + "loss": 1.6417, + "step": 3676 + }, + { + "epoch": 1.8969072164948453, + "grad_norm": 1.5633991956710815, + "learning_rate": 7.246648980356153e-07, + "loss": 1.6138, + "step": 3680 + }, + { + "epoch": 1.8989690721649484, + "grad_norm": 1.5480862855911255, + "learning_rate": 6.960349377804043e-07, + "loss": 1.6623, + "step": 3684 + }, + { + "epoch": 1.9010309278350515, + "grad_norm": 1.5118290185928345, + "learning_rate": 6.679780248541933e-07, + "loss": 1.8283, + "step": 3688 + }, + { + "epoch": 1.9030927835051545, + "grad_norm": 1.479387879371643, + "learning_rate": 6.404944853552974e-07, + "loss": 1.7512, + "step": 3692 + }, + { + "epoch": 1.9051546391752576, + "grad_norm": 1.4344489574432373, + "learning_rate": 6.135846387178235e-07, + "loss": 1.6923, + "step": 3696 + }, + 
{ + "epoch": 1.9072164948453607, + "grad_norm": 1.4317504167556763, + "learning_rate": 5.872487977080232e-07, + "loss": 1.6294, + "step": 3700 + }, + { + "epoch": 1.9092783505154638, + "grad_norm": 1.4593110084533691, + "learning_rate": 5.614872684206074e-07, + "loss": 1.6398, + "step": 3704 + }, + { + "epoch": 1.9113402061855669, + "grad_norm": 1.5218007564544678, + "learning_rate": 5.36300350275215e-07, + "loss": 1.7071, + "step": 3708 + }, + { + "epoch": 1.91340206185567, + "grad_norm": 1.5086115598678589, + "learning_rate": 5.116883360129388e-07, + "loss": 1.626, + "step": 3712 + }, + { + "epoch": 1.915463917525773, + "grad_norm": 1.75813627243042, + "learning_rate": 4.876515116928993e-07, + "loss": 1.7443, + "step": 3716 + }, + { + "epoch": 1.9175257731958761, + "grad_norm": 1.46547269821167, + "learning_rate": 4.641901566889317e-07, + "loss": 1.5604, + "step": 3720 + }, + { + "epoch": 1.9195876288659792, + "grad_norm": 1.5154309272766113, + "learning_rate": 4.413045436863439e-07, + "loss": 1.6684, + "step": 3724 + }, + { + "epoch": 1.9216494845360823, + "grad_norm": 1.4802361726760864, + "learning_rate": 4.189949386787462e-07, + "loss": 1.6636, + "step": 3728 + }, + { + "epoch": 1.9237113402061856, + "grad_norm": 1.56373131275177, + "learning_rate": 3.9726160096494325e-07, + "loss": 1.6667, + "step": 3732 + }, + { + "epoch": 1.9257731958762887, + "grad_norm": 1.4294252395629883, + "learning_rate": 3.761047831459419e-07, + "loss": 1.6603, + "step": 3736 + }, + { + "epoch": 1.9278350515463918, + "grad_norm": 1.5142799615859985, + "learning_rate": 3.555247311220089e-07, + "loss": 1.6348, + "step": 3740 + }, + { + "epoch": 1.9298969072164949, + "grad_norm": 1.4750471115112305, + "learning_rate": 3.355216840898012e-07, + "loss": 1.6561, + "step": 3744 + }, + { + "epoch": 1.931958762886598, + "grad_norm": 1.4013311862945557, + "learning_rate": 3.160958745395959e-07, + "loss": 1.6969, + "step": 3748 + }, + { + "epoch": 1.934020618556701, + "grad_norm": 1.6206867694854736, + "learning_rate": 2.9724752825259774e-07, + "loss": 1.6877, + "step": 3752 + }, + { + "epoch": 1.9360824742268041, + "grad_norm": 1.5444731712341309, + "learning_rate": 2.7897686429829703e-07, + "loss": 1.7525, + "step": 3756 + }, + { + "epoch": 1.9381443298969072, + "grad_norm": 1.6207637786865234, + "learning_rate": 2.612840950319273e-07, + "loss": 1.6376, + "step": 3760 + }, + { + "epoch": 1.9402061855670103, + "grad_norm": 1.4988189935684204, + "learning_rate": 2.441694260920002e-07, + "loss": 1.6437, + "step": 3764 + }, + { + "epoch": 1.9422680412371134, + "grad_norm": 1.4861098527908325, + "learning_rate": 2.276330563979301e-07, + "loss": 1.636, + "step": 3768 + }, + { + "epoch": 1.9443298969072165, + "grad_norm": 1.4013590812683105, + "learning_rate": 2.1167517814768557e-07, + "loss": 1.5653, + "step": 3772 + }, + { + "epoch": 1.9463917525773196, + "grad_norm": 1.5663115978240967, + "learning_rate": 1.9629597681559698e-07, + "loss": 1.7384, + "step": 3776 + }, + { + "epoch": 1.9484536082474226, + "grad_norm": 1.4937258958816528, + "learning_rate": 1.814956311501692e-07, + "loss": 1.6718, + "step": 3780 + }, + { + "epoch": 1.9505154639175257, + "grad_norm": 1.5502136945724487, + "learning_rate": 1.6727431317202224e-07, + "loss": 1.6496, + "step": 3784 + }, + { + "epoch": 1.9525773195876288, + "grad_norm": 1.40053391456604, + "learning_rate": 1.5363218817188162e-07, + "loss": 1.6056, + "step": 3788 + }, + { + "epoch": 1.9546391752577321, + "grad_norm": 1.4303723573684692, + "learning_rate": 1.405694147086689e-07, + 
"loss": 1.6602, + "step": 3792 + }, + { + "epoch": 1.9567010309278352, + "grad_norm": 1.4497610330581665, + "learning_rate": 1.28086144607642e-07, + "loss": 1.6338, + "step": 3796 + }, + { + "epoch": 1.9587628865979383, + "grad_norm": 1.5628520250320435, + "learning_rate": 1.1618252295864107e-07, + "loss": 1.7174, + "step": 3800 + }, + { + "epoch": 1.9608247422680414, + "grad_norm": 1.4401839971542358, + "learning_rate": 1.0485868811441757e-07, + "loss": 1.6945, + "step": 3804 + }, + { + "epoch": 1.9628865979381445, + "grad_norm": 1.5296087265014648, + "learning_rate": 9.411477168898563e-08, + "loss": 1.7328, + "step": 3808 + }, + { + "epoch": 1.9649484536082475, + "grad_norm": 1.556199312210083, + "learning_rate": 8.395089855613437e-08, + "loss": 1.7358, + "step": 3812 + }, + { + "epoch": 1.9670103092783506, + "grad_norm": 1.5304512977600098, + "learning_rate": 7.436718684794563e-08, + "loss": 1.7665, + "step": 3816 + }, + { + "epoch": 1.9690721649484537, + "grad_norm": 1.4749436378479004, + "learning_rate": 6.536374795344524e-08, + "loss": 1.6267, + "step": 3820 + }, + { + "epoch": 1.9711340206185568, + "grad_norm": 1.5322349071502686, + "learning_rate": 5.694068651729279e-08, + "loss": 1.789, + "step": 3824 + }, + { + "epoch": 1.97319587628866, + "grad_norm": 1.553791880607605, + "learning_rate": 4.9098100438566e-08, + "loss": 1.6623, + "step": 3828 + }, + { + "epoch": 1.975257731958763, + "grad_norm": 1.543352484703064, + "learning_rate": 4.183608086963386e-08, + "loss": 1.7421, + "step": 3832 + }, + { + "epoch": 1.977319587628866, + "grad_norm": 1.5451158285140991, + "learning_rate": 3.515471221507416e-08, + "loss": 1.7402, + "step": 3836 + }, + { + "epoch": 1.9793814432989691, + "grad_norm": 1.4942976236343384, + "learning_rate": 2.90540721307353e-08, + "loss": 1.6397, + "step": 3840 + }, + { + "epoch": 1.9814432989690722, + "grad_norm": 1.4658242464065552, + "learning_rate": 2.3534231522781557e-08, + "loss": 1.6033, + "step": 3844 + }, + { + "epoch": 1.9835051546391753, + "grad_norm": 1.4824728965759277, + "learning_rate": 1.8595254546904806e-08, + "loss": 1.6109, + "step": 3848 + }, + { + "epoch": 1.9855670103092784, + "grad_norm": 1.3989909887313843, + "learning_rate": 1.4237198607564007e-08, + "loss": 1.599, + "step": 3852 + }, + { + "epoch": 1.9876288659793815, + "grad_norm": 1.497368574142456, + "learning_rate": 1.0460114357330187e-08, + "loss": 1.6775, + "step": 3856 + }, + { + "epoch": 1.9896907216494846, + "grad_norm": 1.4998079538345337, + "learning_rate": 7.264045696281363e-09, + "loss": 1.6132, + "step": 3860 + }, + { + "epoch": 1.9917525773195877, + "grad_norm": 1.5205719470977783, + "learning_rate": 4.649029771502944e-09, + "loss": 1.6882, + "step": 3864 + }, + { + "epoch": 1.9938144329896907, + "grad_norm": 1.5549403429031372, + "learning_rate": 2.6150969766547406e-09, + "loss": 1.7583, + "step": 3868 + }, + { + "epoch": 1.9958762886597938, + "grad_norm": 1.5483360290527344, + "learning_rate": 1.162270951615696e-09, + "loss": 1.6231, + "step": 3872 + }, + { + "epoch": 1.997938144329897, + "grad_norm": 1.444972276687622, + "learning_rate": 2.9056858220632974e-10, + "loss": 1.7232, + "step": 3876 + }, + { + "epoch": 2.0, + "grad_norm": 4.258011817932129, + "learning_rate": 0.0, + "loss": 1.6374, + "step": 3880 + } + ], + "logging_steps": 4, + "max_steps": 3880, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1940, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": 
false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.564802121202401e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}