{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002061855670103093, "grad_norm": 4.401242256164551, "learning_rate": 2.061855670103093e-06, "loss": 5.7831, "step": 4 }, { "epoch": 0.004123711340206186, "grad_norm": 4.165046691894531, "learning_rate": 4.123711340206186e-06, "loss": 5.6527, "step": 8 }, { "epoch": 0.006185567010309278, "grad_norm": 3.4843931198120117, "learning_rate": 6.185567010309279e-06, "loss": 5.5896, "step": 12 }, { "epoch": 0.008247422680412371, "grad_norm": 3.29345965385437, "learning_rate": 8.247422680412371e-06, "loss": 5.3939, "step": 16 }, { "epoch": 0.010309278350515464, "grad_norm": 3.1640310287475586, "learning_rate": 1.0309278350515464e-05, "loss": 5.0285, "step": 20 }, { "epoch": 0.012371134020618556, "grad_norm": 3.238795757293701, "learning_rate": 1.2371134020618558e-05, "loss": 5.3506, "step": 24 }, { "epoch": 0.01443298969072165, "grad_norm": 2.786606550216675, "learning_rate": 1.4432989690721649e-05, "loss": 4.8622, "step": 28 }, { "epoch": 0.016494845360824743, "grad_norm": 2.689506769180298, "learning_rate": 1.6494845360824743e-05, "loss": 4.7998, "step": 32 }, { "epoch": 0.018556701030927835, "grad_norm": 3.6669228076934814, "learning_rate": 1.8556701030927837e-05, "loss": 4.8852, "step": 36 }, { "epoch": 0.020618556701030927, "grad_norm": 2.614325761795044, "learning_rate": 2.0618556701030927e-05, "loss": 4.5481, "step": 40 }, { "epoch": 0.02268041237113402, "grad_norm": 2.6969540119171143, "learning_rate": 2.268041237113402e-05, "loss": 4.4287, "step": 44 }, { "epoch": 0.024742268041237112, "grad_norm": 2.5721373558044434, "learning_rate": 2.4742268041237116e-05, "loss": 4.2744, "step": 48 }, { "epoch": 0.026804123711340205, "grad_norm": 2.7030205726623535, "learning_rate": 2.6804123711340206e-05, "loss": 4.2424, "step": 52 }, { "epoch": 0.0288659793814433, "grad_norm": 2.5551116466522217, "learning_rate": 2.8865979381443297e-05, "loss": 4.1868, "step": 56 }, { "epoch": 0.030927835051546393, "grad_norm": 2.6516504287719727, "learning_rate": 3.0927835051546395e-05, "loss": 3.9569, "step": 60 }, { "epoch": 0.032989690721649485, "grad_norm": 2.516420602798462, "learning_rate": 3.2989690721649485e-05, "loss": 4.0263, "step": 64 }, { "epoch": 0.03505154639175258, "grad_norm": 2.4037609100341797, "learning_rate": 3.5051546391752576e-05, "loss": 4.001, "step": 68 }, { "epoch": 0.03711340206185567, "grad_norm": 2.3336949348449707, "learning_rate": 3.7113402061855674e-05, "loss": 3.7654, "step": 72 }, { "epoch": 0.03917525773195876, "grad_norm": 2.5022168159484863, "learning_rate": 3.9175257731958764e-05, "loss": 3.7868, "step": 76 }, { "epoch": 0.041237113402061855, "grad_norm": 2.496216297149658, "learning_rate": 4.1237113402061855e-05, "loss": 3.7257, "step": 80 }, { "epoch": 0.04329896907216495, "grad_norm": 2.3523142337799072, "learning_rate": 4.329896907216495e-05, "loss": 3.6547, "step": 84 }, { "epoch": 0.04536082474226804, "grad_norm": 2.4950451850891113, "learning_rate": 4.536082474226804e-05, "loss": 3.5265, "step": 88 }, { "epoch": 0.04742268041237113, "grad_norm": 2.5210957527160645, "learning_rate": 4.7422680412371134e-05, "loss": 3.5772, "step": 92 }, { "epoch": 0.049484536082474224, "grad_norm": 2.3704540729522705, "learning_rate": 4.948453608247423e-05, "loss": 3.6708, "step": 96 }, { "epoch": 0.05154639175257732, "grad_norm": 2.361452102661133, "learning_rate": 5.1546391752577315e-05, "loss": 3.4935, "step": 100 }, { "epoch": 0.05360824742268041, "grad_norm": 2.4770519733428955, "learning_rate": 5.360824742268041e-05, "loss": 3.4953, "step": 104 }, { "epoch": 0.05567010309278351, "grad_norm": 2.6673684120178223, "learning_rate": 5.567010309278351e-05, "loss": 3.4362, "step": 108 }, { "epoch": 0.0577319587628866, "grad_norm": 2.3836777210235596, "learning_rate": 5.7731958762886594e-05, "loss": 3.4341, "step": 112 }, { "epoch": 0.05979381443298969, "grad_norm": 2.4742355346679688, "learning_rate": 5.979381443298969e-05, "loss": 3.4443, "step": 116 }, { "epoch": 0.061855670103092786, "grad_norm": 2.513885974884033, "learning_rate": 6.185567010309279e-05, "loss": 3.3867, "step": 120 }, { "epoch": 0.06391752577319587, "grad_norm": 2.633009433746338, "learning_rate": 6.391752577319587e-05, "loss": 3.3804, "step": 124 }, { "epoch": 0.06597938144329897, "grad_norm": 2.7852959632873535, "learning_rate": 6.597938144329897e-05, "loss": 3.4522, "step": 128 }, { "epoch": 0.06804123711340206, "grad_norm": 2.9940905570983887, "learning_rate": 6.804123711340207e-05, "loss": 3.3606, "step": 132 }, { "epoch": 0.07010309278350516, "grad_norm": 2.4730873107910156, "learning_rate": 7.010309278350515e-05, "loss": 3.2392, "step": 136 }, { "epoch": 0.07216494845360824, "grad_norm": 2.1951076984405518, "learning_rate": 7.216494845360825e-05, "loss": 3.4019, "step": 140 }, { "epoch": 0.07422680412371134, "grad_norm": 1.9874905347824097, "learning_rate": 7.422680412371135e-05, "loss": 3.1746, "step": 144 }, { "epoch": 0.07628865979381444, "grad_norm": 2.2700319290161133, "learning_rate": 7.628865979381443e-05, "loss": 3.1944, "step": 148 }, { "epoch": 0.07835051546391752, "grad_norm": 2.2276201248168945, "learning_rate": 7.835051546391753e-05, "loss": 3.2753, "step": 152 }, { "epoch": 0.08041237113402062, "grad_norm": 2.150624990463257, "learning_rate": 8.041237113402063e-05, "loss": 3.2761, "step": 156 }, { "epoch": 0.08247422680412371, "grad_norm": 2.2154808044433594, "learning_rate": 8.247422680412371e-05, "loss": 3.1575, "step": 160 }, { "epoch": 0.08453608247422681, "grad_norm": 2.6334950923919678, "learning_rate": 8.453608247422681e-05, "loss": 3.185, "step": 164 }, { "epoch": 0.0865979381443299, "grad_norm": 2.70011305809021, "learning_rate": 8.65979381443299e-05, "loss": 3.1316, "step": 168 }, { "epoch": 0.088659793814433, "grad_norm": 2.533686637878418, "learning_rate": 8.865979381443299e-05, "loss": 3.1387, "step": 172 }, { "epoch": 0.09072164948453608, "grad_norm": 2.154860019683838, "learning_rate": 9.072164948453609e-05, "loss": 3.1592, "step": 176 }, { "epoch": 0.09278350515463918, "grad_norm": 2.1940603256225586, "learning_rate": 9.278350515463918e-05, "loss": 3.1773, "step": 180 }, { "epoch": 0.09484536082474226, "grad_norm": 2.257709264755249, "learning_rate": 9.484536082474227e-05, "loss": 3.1052, "step": 184 }, { "epoch": 0.09690721649484536, "grad_norm": 2.08589768409729, "learning_rate": 9.690721649484537e-05, "loss": 3.0373, "step": 188 }, { "epoch": 0.09896907216494845, "grad_norm": 2.060812473297119, "learning_rate": 9.896907216494846e-05, "loss": 3.112, "step": 192 }, { "epoch": 0.10103092783505155, "grad_norm": 2.3649418354034424, "learning_rate": 9.999992735780168e-05, "loss": 3.198, "step": 196 }, { "epoch": 0.10309278350515463, "grad_norm": 2.339231252670288, "learning_rate": 9.999934622148157e-05, "loss": 3.1736, "step": 200 }, { "epoch": 0.10515463917525773, "grad_norm": 2.037454128265381, "learning_rate": 9.999818395559577e-05, "loss": 3.1006, "step": 204 }, { "epoch": 0.10721649484536082, "grad_norm": 2.343181848526001, "learning_rate": 9.999644057365295e-05, "loss": 3.1055, "step": 208 }, { "epoch": 0.10927835051546392, "grad_norm": 2.3469326496124268, "learning_rate": 9.999411609591603e-05, "loss": 3.0211, "step": 212 }, { "epoch": 0.11134020618556702, "grad_norm": 2.2599899768829346, "learning_rate": 9.999121054940182e-05, "loss": 3.0236, "step": 216 }, { "epoch": 0.1134020618556701, "grad_norm": 2.114220142364502, "learning_rate": 9.998772396788072e-05, "loss": 3.0895, "step": 220 }, { "epoch": 0.1154639175257732, "grad_norm": 2.0050344467163086, "learning_rate": 9.998365639187638e-05, "loss": 3.0542, "step": 224 }, { "epoch": 0.11752577319587629, "grad_norm": 2.1245343685150146, "learning_rate": 9.997900786866519e-05, "loss": 3.0631, "step": 228 }, { "epoch": 0.11958762886597939, "grad_norm": 2.0271108150482178, "learning_rate": 9.997377845227576e-05, "loss": 3.1621, "step": 232 }, { "epoch": 0.12164948453608247, "grad_norm": 2.075441837310791, "learning_rate": 9.996796820348822e-05, "loss": 2.9906, "step": 236 }, { "epoch": 0.12371134020618557, "grad_norm": 2.118668556213379, "learning_rate": 9.996157718983362e-05, "loss": 3.0718, "step": 240 }, { "epoch": 0.12577319587628866, "grad_norm": 2.0540637969970703, "learning_rate": 9.995460548559307e-05, "loss": 3.077, "step": 244 }, { "epoch": 0.12783505154639174, "grad_norm": 2.2028098106384277, "learning_rate": 9.99470531717969e-05, "loss": 3.1064, "step": 248 }, { "epoch": 0.12989690721649486, "grad_norm": 2.0607352256774902, "learning_rate": 9.993892033622374e-05, "loss": 3.0582, "step": 252 }, { "epoch": 0.13195876288659794, "grad_norm": 2.054838180541992, "learning_rate": 9.993020707339939e-05, "loss": 2.991, "step": 256 }, { "epoch": 0.13402061855670103, "grad_norm": 1.9367263317108154, "learning_rate": 9.992091348459591e-05, "loss": 2.9583, "step": 260 }, { "epoch": 0.1360824742268041, "grad_norm": 1.9889065027236938, "learning_rate": 9.99110396778303e-05, "loss": 2.9845, "step": 264 }, { "epoch": 0.13814432989690723, "grad_norm": 2.249467134475708, "learning_rate": 9.990058576786325e-05, "loss": 3.0511, "step": 268 }, { "epoch": 0.1402061855670103, "grad_norm": 2.011195659637451, "learning_rate": 9.98895518761979e-05, "loss": 2.9925, "step": 272 }, { "epoch": 0.1422680412371134, "grad_norm": 2.0568485260009766, "learning_rate": 9.987793813107833e-05, "loss": 2.9491, "step": 276 }, { "epoch": 0.14432989690721648, "grad_norm": 1.9748948812484741, "learning_rate": 9.986574466748812e-05, "loss": 3.0667, "step": 280 }, { "epoch": 0.1463917525773196, "grad_norm": 2.234513282775879, "learning_rate": 9.985297162714877e-05, "loss": 2.8514, "step": 284 }, { "epoch": 0.14845360824742268, "grad_norm": 1.9163928031921387, "learning_rate": 9.983961915851804e-05, "loss": 3.1231, "step": 288 }, { "epoch": 0.15051546391752577, "grad_norm": 1.9212491512298584, "learning_rate": 9.982568741678823e-05, "loss": 2.9029, "step": 292 }, { "epoch": 0.15257731958762888, "grad_norm": 2.065171241760254, "learning_rate": 9.981117656388445e-05, "loss": 2.9171, "step": 296 }, { "epoch": 0.15463917525773196, "grad_norm": 1.979385495185852, "learning_rate": 9.979608676846258e-05, "loss": 3.0767, "step": 300 }, { "epoch": 0.15670103092783505, "grad_norm": 1.9221059083938599, "learning_rate": 9.978041820590743e-05, "loss": 3.0627, "step": 304 }, { "epoch": 0.15876288659793814, "grad_norm": 2.0437042713165283, "learning_rate": 9.97641710583307e-05, "loss": 2.938, "step": 308 }, { "epoch": 0.16082474226804125, "grad_norm": 1.9110627174377441, "learning_rate": 9.974734551456881e-05, "loss": 2.9462, "step": 312 }, { "epoch": 0.16288659793814433, "grad_norm": 1.8492796421051025, "learning_rate": 9.972994177018074e-05, "loss": 3.0006, "step": 316 }, { "epoch": 0.16494845360824742, "grad_norm": 1.9782150983810425, "learning_rate": 9.971196002744575e-05, "loss": 2.9449, "step": 320 }, { "epoch": 0.1670103092783505, "grad_norm": 1.8963398933410645, "learning_rate": 9.969340049536099e-05, "loss": 2.9932, "step": 324 }, { "epoch": 0.16907216494845362, "grad_norm": 2.0604023933410645, "learning_rate": 9.967426338963917e-05, "loss": 2.8904, "step": 328 }, { "epoch": 0.1711340206185567, "grad_norm": 2.0546135902404785, "learning_rate": 9.965454893270592e-05, "loss": 3.0603, "step": 332 }, { "epoch": 0.1731958762886598, "grad_norm": 1.9946964979171753, "learning_rate": 9.963425735369736e-05, "loss": 2.917, "step": 336 }, { "epoch": 0.17525773195876287, "grad_norm": 1.8940315246582031, "learning_rate": 9.961338888845725e-05, "loss": 2.9194, "step": 340 }, { "epoch": 0.177319587628866, "grad_norm": 1.9344205856323242, "learning_rate": 9.959194377953447e-05, "loss": 2.9884, "step": 344 }, { "epoch": 0.17938144329896907, "grad_norm": 1.9542304277420044, "learning_rate": 9.956992227617995e-05, "loss": 2.9051, "step": 348 }, { "epoch": 0.18144329896907216, "grad_norm": 1.9588041305541992, "learning_rate": 9.954732463434402e-05, "loss": 2.7897, "step": 352 }, { "epoch": 0.18350515463917524, "grad_norm": 1.7251778841018677, "learning_rate": 9.952415111667324e-05, "loss": 2.7511, "step": 356 }, { "epoch": 0.18556701030927836, "grad_norm": 1.930525302886963, "learning_rate": 9.950040199250746e-05, "loss": 2.8544, "step": 360 }, { "epoch": 0.18762886597938144, "grad_norm": 2.0370941162109375, "learning_rate": 9.947607753787667e-05, "loss": 3.0097, "step": 364 }, { "epoch": 0.18969072164948453, "grad_norm": 2.0568158626556396, "learning_rate": 9.945117803549774e-05, "loss": 2.8412, "step": 368 }, { "epoch": 0.19175257731958764, "grad_norm": 1.9520659446716309, "learning_rate": 9.942570377477121e-05, "loss": 2.8904, "step": 372 }, { "epoch": 0.19381443298969073, "grad_norm": 1.9777973890304565, "learning_rate": 9.939965505177786e-05, "loss": 2.9093, "step": 376 }, { "epoch": 0.1958762886597938, "grad_norm": 2.0709102153778076, "learning_rate": 9.937303216927534e-05, "loss": 2.7659, "step": 380 }, { "epoch": 0.1979381443298969, "grad_norm": 1.919057846069336, "learning_rate": 9.934583543669453e-05, "loss": 2.8857, "step": 384 }, { "epoch": 0.2, "grad_norm": 1.9447135925292969, "learning_rate": 9.931806517013612e-05, "loss": 2.7969, "step": 388 }, { "epoch": 0.2020618556701031, "grad_norm": 1.9707754850387573, "learning_rate": 9.928972169236676e-05, "loss": 2.9706, "step": 392 }, { "epoch": 0.20412371134020618, "grad_norm": 2.1095566749572754, "learning_rate": 9.926080533281543e-05, "loss": 2.8079, "step": 396 }, { "epoch": 0.20618556701030927, "grad_norm": 1.9369230270385742, "learning_rate": 9.923131642756954e-05, "loss": 2.759, "step": 400 }, { "epoch": 0.20824742268041238, "grad_norm": 1.8451541662216187, "learning_rate": 9.920125531937107e-05, "loss": 2.8428, "step": 404 }, { "epoch": 0.21030927835051547, "grad_norm": 1.8733794689178467, "learning_rate": 9.917062235761259e-05, "loss": 2.7717, "step": 408 }, { "epoch": 0.21237113402061855, "grad_norm": 1.8459867238998413, "learning_rate": 9.913941789833311e-05, "loss": 2.8008, "step": 412 }, { "epoch": 0.21443298969072164, "grad_norm": 1.8867549896240234, "learning_rate": 9.910764230421406e-05, "loss": 2.7071, "step": 416 }, { "epoch": 0.21649484536082475, "grad_norm": 1.8722797632217407, "learning_rate": 9.907529594457504e-05, "loss": 2.8076, "step": 420 }, { "epoch": 0.21855670103092784, "grad_norm": 1.9473061561584473, "learning_rate": 9.904237919536945e-05, "loss": 2.8613, "step": 424 }, { "epoch": 0.22061855670103092, "grad_norm": 1.8926331996917725, "learning_rate": 9.900889243918024e-05, "loss": 2.7264, "step": 428 }, { "epoch": 0.22268041237113403, "grad_norm": 2.013500928878784, "learning_rate": 9.897483606521536e-05, "loss": 2.7307, "step": 432 }, { "epoch": 0.22474226804123712, "grad_norm": 1.7798408269882202, "learning_rate": 9.894021046930333e-05, "loss": 2.7089, "step": 436 }, { "epoch": 0.2268041237113402, "grad_norm": 1.8650320768356323, "learning_rate": 9.890501605388853e-05, "loss": 2.812, "step": 440 }, { "epoch": 0.2288659793814433, "grad_norm": 1.8471648693084717, "learning_rate": 9.886925322802663e-05, "loss": 2.8819, "step": 444 }, { "epoch": 0.2309278350515464, "grad_norm": 1.7936201095581055, "learning_rate": 9.883292240737978e-05, "loss": 2.8319, "step": 448 }, { "epoch": 0.2329896907216495, "grad_norm": 1.89383065700531, "learning_rate": 9.87960240142118e-05, "loss": 2.7643, "step": 452 }, { "epoch": 0.23505154639175257, "grad_norm": 1.7932506799697876, "learning_rate": 9.875855847738319e-05, "loss": 2.7604, "step": 456 }, { "epoch": 0.23711340206185566, "grad_norm": 1.8939586877822876, "learning_rate": 9.872052623234632e-05, "loss": 2.6743, "step": 460 }, { "epoch": 0.23917525773195877, "grad_norm": 1.8440996408462524, "learning_rate": 9.868192772114016e-05, "loss": 2.6986, "step": 464 }, { "epoch": 0.24123711340206186, "grad_norm": 2.047183036804199, "learning_rate": 9.864276339238534e-05, "loss": 2.7008, "step": 468 }, { "epoch": 0.24329896907216494, "grad_norm": 1.7253282070159912, "learning_rate": 9.860303370127876e-05, "loss": 2.7944, "step": 472 }, { "epoch": 0.24536082474226803, "grad_norm": 1.8298146724700928, "learning_rate": 9.856273910958847e-05, "loss": 2.637, "step": 476 }, { "epoch": 0.24742268041237114, "grad_norm": 1.7853765487670898, "learning_rate": 9.852188008564813e-05, "loss": 2.7162, "step": 480 }, { "epoch": 0.24948453608247423, "grad_norm": 1.8834904432296753, "learning_rate": 9.84804571043517e-05, "loss": 2.8762, "step": 484 }, { "epoch": 0.2515463917525773, "grad_norm": 1.9652595520019531, "learning_rate": 9.843847064714785e-05, "loss": 2.7794, "step": 488 }, { "epoch": 0.2536082474226804, "grad_norm": 1.983886957168579, "learning_rate": 9.839592120203441e-05, "loss": 2.8331, "step": 492 }, { "epoch": 0.2556701030927835, "grad_norm": 1.9896190166473389, "learning_rate": 9.835280926355261e-05, "loss": 2.8015, "step": 496 }, { "epoch": 0.25773195876288657, "grad_norm": 1.70455002784729, "learning_rate": 9.83091353327815e-05, "loss": 2.7135, "step": 500 }, { "epoch": 0.2597938144329897, "grad_norm": 2.0131478309631348, "learning_rate": 9.826489991733194e-05, "loss": 2.7678, "step": 504 }, { "epoch": 0.2618556701030928, "grad_norm": 2.189298152923584, "learning_rate": 9.822010353134081e-05, "loss": 2.8352, "step": 508 }, { "epoch": 0.2639175257731959, "grad_norm": 2.022023916244507, "learning_rate": 9.817474669546501e-05, "loss": 2.7118, "step": 512 }, { "epoch": 0.26597938144329897, "grad_norm": 2.1792008876800537, "learning_rate": 9.812882993687539e-05, "loss": 2.8367, "step": 516 }, { "epoch": 0.26804123711340205, "grad_norm": 1.8317099809646606, "learning_rate": 9.808235378925066e-05, "loss": 2.7326, "step": 520 }, { "epoch": 0.27010309278350514, "grad_norm": 1.7824715375900269, "learning_rate": 9.803531879277113e-05, "loss": 2.7147, "step": 524 }, { "epoch": 0.2721649484536082, "grad_norm": 1.7565789222717285, "learning_rate": 9.798772549411252e-05, "loss": 2.8029, "step": 528 }, { "epoch": 0.27422680412371137, "grad_norm": 1.805213212966919, "learning_rate": 9.793957444643951e-05, "loss": 2.6505, "step": 532 }, { "epoch": 0.27628865979381445, "grad_norm": 1.8401093482971191, "learning_rate": 9.789086620939936e-05, "loss": 2.7262, "step": 536 }, { "epoch": 0.27835051546391754, "grad_norm": 1.7650929689407349, "learning_rate": 9.784160134911541e-05, "loss": 2.6988, "step": 540 }, { "epoch": 0.2804123711340206, "grad_norm": 1.902664065361023, "learning_rate": 9.77917804381805e-05, "loss": 2.7028, "step": 544 }, { "epoch": 0.2824742268041237, "grad_norm": 1.6840468645095825, "learning_rate": 9.774140405565024e-05, "loss": 2.7141, "step": 548 }, { "epoch": 0.2845360824742268, "grad_norm": 1.8542104959487915, "learning_rate": 9.769047278703644e-05, "loss": 2.7512, "step": 552 }, { "epoch": 0.2865979381443299, "grad_norm": 1.9419035911560059, "learning_rate": 9.763898722430015e-05, "loss": 2.7085, "step": 556 }, { "epoch": 0.28865979381443296, "grad_norm": 1.8914318084716797, "learning_rate": 9.758694796584483e-05, "loss": 2.7119, "step": 560 }, { "epoch": 0.2907216494845361, "grad_norm": 2.028513193130493, "learning_rate": 9.753435561650946e-05, "loss": 2.7293, "step": 564 }, { "epoch": 0.2927835051546392, "grad_norm": 1.7596354484558105, "learning_rate": 9.748121078756137e-05, "loss": 2.6624, "step": 568 }, { "epoch": 0.2948453608247423, "grad_norm": 1.955361247062683, "learning_rate": 9.742751409668929e-05, "loss": 2.6958, "step": 572 }, { "epoch": 0.29690721649484536, "grad_norm": 1.980281949043274, "learning_rate": 9.737326616799605e-05, "loss": 2.6921, "step": 576 }, { "epoch": 0.29896907216494845, "grad_norm": 2.0537736415863037, "learning_rate": 9.731846763199144e-05, "loss": 2.7734, "step": 580 }, { "epoch": 0.30103092783505153, "grad_norm": 2.0544698238372803, "learning_rate": 9.726311912558474e-05, "loss": 2.622, "step": 584 }, { "epoch": 0.3030927835051546, "grad_norm": 1.8842084407806396, "learning_rate": 9.720722129207746e-05, "loss": 2.6809, "step": 588 }, { "epoch": 0.30515463917525776, "grad_norm": 1.9250359535217285, "learning_rate": 9.715077478115574e-05, "loss": 2.7359, "step": 592 }, { "epoch": 0.30721649484536084, "grad_norm": 1.7907671928405762, "learning_rate": 9.709378024888292e-05, "loss": 2.7575, "step": 596 }, { "epoch": 0.30927835051546393, "grad_norm": 1.8203668594360352, "learning_rate": 9.703623835769178e-05, "loss": 2.7181, "step": 600 }, { "epoch": 0.311340206185567, "grad_norm": 1.837925672531128, "learning_rate": 9.697814977637696e-05, "loss": 2.743, "step": 604 }, { "epoch": 0.3134020618556701, "grad_norm": 1.7200803756713867, "learning_rate": 9.691951518008715e-05, "loss": 2.6844, "step": 608 }, { "epoch": 0.3154639175257732, "grad_norm": 1.9397820234298706, "learning_rate": 9.686033525031719e-05, "loss": 2.5767, "step": 612 }, { "epoch": 0.31752577319587627, "grad_norm": 1.9038435220718384, "learning_rate": 9.680061067490021e-05, "loss": 2.6503, "step": 616 }, { "epoch": 0.31958762886597936, "grad_norm": 1.9289252758026123, "learning_rate": 9.674034214799964e-05, "loss": 2.6878, "step": 620 }, { "epoch": 0.3216494845360825, "grad_norm": 1.7512463331222534, "learning_rate": 9.667953037010108e-05, "loss": 2.7551, "step": 624 }, { "epoch": 0.3237113402061856, "grad_norm": 1.9182549715042114, "learning_rate": 9.661817604800421e-05, "loss": 2.7986, "step": 628 }, { "epoch": 0.32577319587628867, "grad_norm": 1.8200740814208984, "learning_rate": 9.655627989481458e-05, "loss": 2.6923, "step": 632 }, { "epoch": 0.32783505154639175, "grad_norm": 1.8738125562667847, "learning_rate": 9.649384262993525e-05, "loss": 2.7518, "step": 636 }, { "epoch": 0.32989690721649484, "grad_norm": 1.9261345863342285, "learning_rate": 9.64308649790586e-05, "loss": 2.6037, "step": 640 }, { "epoch": 0.3319587628865979, "grad_norm": 1.7359293699264526, "learning_rate": 9.636734767415763e-05, "loss": 2.7102, "step": 644 }, { "epoch": 0.334020618556701, "grad_norm": 2.001077175140381, "learning_rate": 9.630329145347767e-05, "loss": 2.7068, "step": 648 }, { "epoch": 0.33608247422680415, "grad_norm": 1.7505935430526733, "learning_rate": 9.623869706152777e-05, "loss": 2.6348, "step": 652 }, { "epoch": 0.33814432989690724, "grad_norm": 1.9153965711593628, "learning_rate": 9.617356524907193e-05, "loss": 2.6874, "step": 656 }, { "epoch": 0.3402061855670103, "grad_norm": 1.805038332939148, "learning_rate": 9.61078967731205e-05, "loss": 2.5985, "step": 660 }, { "epoch": 0.3422680412371134, "grad_norm": 1.7926819324493408, "learning_rate": 9.604169239692133e-05, "loss": 2.7203, "step": 664 }, { "epoch": 0.3443298969072165, "grad_norm": 1.8701540231704712, "learning_rate": 9.597495288995089e-05, "loss": 2.6819, "step": 668 }, { "epoch": 0.3463917525773196, "grad_norm": 2.0685527324676514, "learning_rate": 9.590767902790529e-05, "loss": 2.7401, "step": 672 }, { "epoch": 0.34845360824742266, "grad_norm": 1.7549872398376465, "learning_rate": 9.583987159269143e-05, "loss": 2.5833, "step": 676 }, { "epoch": 0.35051546391752575, "grad_norm": 1.8161414861679077, "learning_rate": 9.577153137241765e-05, "loss": 2.7274, "step": 680 }, { "epoch": 0.3525773195876289, "grad_norm": 1.7566990852355957, "learning_rate": 9.570265916138484e-05, "loss": 2.7536, "step": 684 }, { "epoch": 0.354639175257732, "grad_norm": 1.8565185070037842, "learning_rate": 9.563325576007701e-05, "loss": 2.6785, "step": 688 }, { "epoch": 0.35670103092783506, "grad_norm": 1.7326369285583496, "learning_rate": 9.556332197515207e-05, "loss": 2.7096, "step": 692 }, { "epoch": 0.35876288659793815, "grad_norm": 1.705234408378601, "learning_rate": 9.549285861943247e-05, "loss": 2.7314, "step": 696 }, { "epoch": 0.36082474226804123, "grad_norm": 1.9541505575180054, "learning_rate": 9.542186651189569e-05, "loss": 2.6204, "step": 700 }, { "epoch": 0.3628865979381443, "grad_norm": 1.8346929550170898, "learning_rate": 9.535034647766476e-05, "loss": 2.6913, "step": 704 }, { "epoch": 0.3649484536082474, "grad_norm": 1.9132497310638428, "learning_rate": 9.527829934799869e-05, "loss": 2.7041, "step": 708 }, { "epoch": 0.3670103092783505, "grad_norm": 1.763213038444519, "learning_rate": 9.520572596028278e-05, "loss": 2.6216, "step": 712 }, { "epoch": 0.36907216494845363, "grad_norm": 1.9224257469177246, "learning_rate": 9.513262715801887e-05, "loss": 2.6632, "step": 716 }, { "epoch": 0.3711340206185567, "grad_norm": 1.8817148208618164, "learning_rate": 9.505900379081559e-05, "loss": 2.6868, "step": 720 }, { "epoch": 0.3731958762886598, "grad_norm": 1.7395341396331787, "learning_rate": 9.498485671437842e-05, "loss": 2.5171, "step": 724 }, { "epoch": 0.3752577319587629, "grad_norm": 1.7571380138397217, "learning_rate": 9.491018679049981e-05, "loss": 2.6194, "step": 728 }, { "epoch": 0.37731958762886597, "grad_norm": 1.8208237886428833, "learning_rate": 9.48349948870491e-05, "loss": 2.6101, "step": 732 }, { "epoch": 0.37938144329896906, "grad_norm": 1.888909935951233, "learning_rate": 9.47592818779625e-05, "loss": 2.6569, "step": 736 }, { "epoch": 0.38144329896907214, "grad_norm": 1.7806968688964844, "learning_rate": 9.468304864323288e-05, "loss": 2.5329, "step": 740 }, { "epoch": 0.3835051546391753, "grad_norm": 1.8010936975479126, "learning_rate": 9.460629606889952e-05, "loss": 2.6971, "step": 744 }, { "epoch": 0.38556701030927837, "grad_norm": 1.7562525272369385, "learning_rate": 9.452902504703793e-05, "loss": 2.676, "step": 748 }, { "epoch": 0.38762886597938145, "grad_norm": 1.828466534614563, "learning_rate": 9.445123647574936e-05, "loss": 2.5952, "step": 752 }, { "epoch": 0.38969072164948454, "grad_norm": 1.7644858360290527, "learning_rate": 9.437293125915037e-05, "loss": 2.6475, "step": 756 }, { "epoch": 0.3917525773195876, "grad_norm": 1.6723216772079468, "learning_rate": 9.429411030736242e-05, "loss": 2.6823, "step": 760 }, { "epoch": 0.3938144329896907, "grad_norm": 1.7757784128189087, "learning_rate": 9.421477453650118e-05, "loss": 2.5711, "step": 764 }, { "epoch": 0.3958762886597938, "grad_norm": 1.647575855255127, "learning_rate": 9.413492486866598e-05, "loss": 2.6141, "step": 768 }, { "epoch": 0.3979381443298969, "grad_norm": 1.9347645044326782, "learning_rate": 9.405456223192897e-05, "loss": 2.6708, "step": 772 }, { "epoch": 0.4, "grad_norm": 1.838582158088684, "learning_rate": 9.397368756032445e-05, "loss": 2.7496, "step": 776 }, { "epoch": 0.4020618556701031, "grad_norm": 1.8382269144058228, "learning_rate": 9.389230179383801e-05, "loss": 2.5654, "step": 780 }, { "epoch": 0.4041237113402062, "grad_norm": 1.8177680969238281, "learning_rate": 9.381040587839548e-05, "loss": 2.6553, "step": 784 }, { "epoch": 0.4061855670103093, "grad_norm": 1.7643057107925415, "learning_rate": 9.372800076585207e-05, "loss": 2.5951, "step": 788 }, { "epoch": 0.40824742268041236, "grad_norm": 1.8181430101394653, "learning_rate": 9.364508741398127e-05, "loss": 2.6452, "step": 792 }, { "epoch": 0.41030927835051545, "grad_norm": 1.8445159196853638, "learning_rate": 9.356166678646366e-05, "loss": 2.6172, "step": 796 }, { "epoch": 0.41237113402061853, "grad_norm": 1.6617608070373535, "learning_rate": 9.347773985287578e-05, "loss": 2.5781, "step": 800 }, { "epoch": 0.4144329896907217, "grad_norm": 1.9522396326065063, "learning_rate": 9.339330758867883e-05, "loss": 2.6309, "step": 804 }, { "epoch": 0.41649484536082476, "grad_norm": 2.5656485557556152, "learning_rate": 9.330837097520738e-05, "loss": 2.6967, "step": 808 }, { "epoch": 0.41855670103092785, "grad_norm": 1.6885048151016235, "learning_rate": 9.322293099965784e-05, "loss": 2.617, "step": 812 }, { "epoch": 0.42061855670103093, "grad_norm": 1.7637752294540405, "learning_rate": 9.313698865507713e-05, "loss": 2.6538, "step": 816 }, { "epoch": 0.422680412371134, "grad_norm": 1.7737189531326294, "learning_rate": 9.305054494035106e-05, "loss": 2.6113, "step": 820 }, { "epoch": 0.4247422680412371, "grad_norm": 1.8070309162139893, "learning_rate": 9.296360086019272e-05, "loss": 2.5118, "step": 824 }, { "epoch": 0.4268041237113402, "grad_norm": 1.73598313331604, "learning_rate": 9.287615742513086e-05, "loss": 2.5826, "step": 828 }, { "epoch": 0.4288659793814433, "grad_norm": 1.688819169998169, "learning_rate": 9.278821565149806e-05, "loss": 2.5375, "step": 832 }, { "epoch": 0.4309278350515464, "grad_norm": 1.7314485311508179, "learning_rate": 9.269977656141898e-05, "loss": 2.5819, "step": 836 }, { "epoch": 0.4329896907216495, "grad_norm": 1.657240390777588, "learning_rate": 9.261084118279847e-05, "loss": 2.6643, "step": 840 }, { "epoch": 0.4350515463917526, "grad_norm": 1.7966091632843018, "learning_rate": 9.25214105493096e-05, "loss": 2.7022, "step": 844 }, { "epoch": 0.43711340206185567, "grad_norm": 1.732354760169983, "learning_rate": 9.243148570038164e-05, "loss": 2.6098, "step": 848 }, { "epoch": 0.43917525773195876, "grad_norm": 1.737281322479248, "learning_rate": 9.234106768118809e-05, "loss": 2.5879, "step": 852 }, { "epoch": 0.44123711340206184, "grad_norm": 1.7843495607376099, "learning_rate": 9.225015754263431e-05, "loss": 2.5793, "step": 856 }, { "epoch": 0.44329896907216493, "grad_norm": 1.7821077108383179, "learning_rate": 9.215875634134552e-05, "loss": 2.6578, "step": 860 }, { "epoch": 0.44536082474226807, "grad_norm": 1.8063935041427612, "learning_rate": 9.206686513965445e-05, "loss": 2.7075, "step": 864 }, { "epoch": 0.44742268041237115, "grad_norm": 1.5605168342590332, "learning_rate": 9.19744850055889e-05, "loss": 2.5988, "step": 868 }, { "epoch": 0.44948453608247424, "grad_norm": 2.078798294067383, "learning_rate": 9.188161701285949e-05, "loss": 2.6126, "step": 872 }, { "epoch": 0.4515463917525773, "grad_norm": 1.7260762453079224, "learning_rate": 9.178826224084705e-05, "loss": 2.6352, "step": 876 }, { "epoch": 0.4536082474226804, "grad_norm": 1.7088130712509155, "learning_rate": 9.169442177459011e-05, "loss": 2.6324, "step": 880 }, { "epoch": 0.4556701030927835, "grad_norm": 1.8762470483779907, "learning_rate": 9.160009670477234e-05, "loss": 2.517, "step": 884 }, { "epoch": 0.4577319587628866, "grad_norm": 1.9028972387313843, "learning_rate": 9.150528812770981e-05, "loss": 2.5386, "step": 888 }, { "epoch": 0.45979381443298967, "grad_norm": 1.5713077783584595, "learning_rate": 9.140999714533827e-05, "loss": 2.6347, "step": 892 }, { "epoch": 0.4618556701030928, "grad_norm": 1.603164792060852, "learning_rate": 9.131422486520034e-05, "loss": 2.5138, "step": 896 }, { "epoch": 0.4639175257731959, "grad_norm": 1.7028288841247559, "learning_rate": 9.121797240043267e-05, "loss": 2.5366, "step": 900 }, { "epoch": 0.465979381443299, "grad_norm": 1.7099529504776, "learning_rate": 9.11212408697529e-05, "loss": 2.6465, "step": 904 }, { "epoch": 0.46804123711340206, "grad_norm": 1.7134875059127808, "learning_rate": 9.102403139744683e-05, "loss": 2.6331, "step": 908 }, { "epoch": 0.47010309278350515, "grad_norm": 1.6669530868530273, "learning_rate": 9.092634511335519e-05, "loss": 2.5381, "step": 912 }, { "epoch": 0.47216494845360824, "grad_norm": 1.7473549842834473, "learning_rate": 9.082818315286055e-05, "loss": 2.584, "step": 916 }, { "epoch": 0.4742268041237113, "grad_norm": 1.673309087753296, "learning_rate": 9.07295466568742e-05, "loss": 2.4649, "step": 920 }, { "epoch": 0.4762886597938144, "grad_norm": 1.8044261932373047, "learning_rate": 9.063043677182283e-05, "loss": 2.6512, "step": 924 }, { "epoch": 0.47835051546391755, "grad_norm": 1.7258899211883545, "learning_rate": 9.053085464963518e-05, "loss": 2.5977, "step": 928 }, { "epoch": 0.48041237113402063, "grad_norm": 1.7187895774841309, "learning_rate": 9.043080144772868e-05, "loss": 2.5948, "step": 932 }, { "epoch": 0.4824742268041237, "grad_norm": 1.797176718711853, "learning_rate": 9.033027832899601e-05, "loss": 2.659, "step": 936 }, { "epoch": 0.4845360824742268, "grad_norm": 1.6816799640655518, "learning_rate": 9.022928646179159e-05, "loss": 2.6123, "step": 940 }, { "epoch": 0.4865979381443299, "grad_norm": 1.7294647693634033, "learning_rate": 9.012782701991795e-05, "loss": 2.5824, "step": 944 }, { "epoch": 0.488659793814433, "grad_norm": 1.6819491386413574, "learning_rate": 9.002590118261216e-05, "loss": 2.5717, "step": 948 }, { "epoch": 0.49072164948453606, "grad_norm": 1.667133092880249, "learning_rate": 8.992351013453204e-05, "loss": 2.6282, "step": 952 }, { "epoch": 0.4927835051546392, "grad_norm": 1.6454589366912842, "learning_rate": 8.982065506574247e-05, "loss": 2.5939, "step": 956 }, { "epoch": 0.4948453608247423, "grad_norm": 1.8332362174987793, "learning_rate": 8.971733717170148e-05, "loss": 2.4949, "step": 960 }, { "epoch": 0.49690721649484537, "grad_norm": 6.3988037109375, "learning_rate": 8.961355765324648e-05, "loss": 2.6143, "step": 964 }, { "epoch": 0.49896907216494846, "grad_norm": 1.6663529872894287, "learning_rate": 8.950931771658014e-05, "loss": 2.5308, "step": 968 }, { "epoch": 0.5010309278350515, "grad_norm": 1.6272313594818115, "learning_rate": 8.940461857325647e-05, "loss": 2.4458, "step": 972 }, { "epoch": 0.5030927835051546, "grad_norm": 1.758596658706665, "learning_rate": 8.929946144016677e-05, "loss": 2.517, "step": 976 }, { "epoch": 0.5051546391752577, "grad_norm": 1.6491873264312744, "learning_rate": 8.919384753952538e-05, "loss": 2.5408, "step": 980 }, { "epoch": 0.5072164948453608, "grad_norm": 1.5984055995941162, "learning_rate": 8.908777809885557e-05, "loss": 2.5359, "step": 984 }, { "epoch": 0.5092783505154639, "grad_norm": 1.651795744895935, "learning_rate": 8.898125435097521e-05, "loss": 2.5151, "step": 988 }, { "epoch": 0.511340206185567, "grad_norm": 1.6190180778503418, "learning_rate": 8.887427753398248e-05, "loss": 2.6733, "step": 992 }, { "epoch": 0.51340206185567, "grad_norm": 1.6881654262542725, "learning_rate": 8.876684889124145e-05, "loss": 2.5783, "step": 996 }, { "epoch": 0.5154639175257731, "grad_norm": 1.786693811416626, "learning_rate": 8.865896967136766e-05, "loss": 2.6061, "step": 1000 }, { "epoch": 0.5175257731958763, "grad_norm": 1.841579556465149, "learning_rate": 8.855064112821361e-05, "loss": 2.6615, "step": 1004 }, { "epoch": 0.5195876288659794, "grad_norm": 1.5715938806533813, "learning_rate": 8.844186452085412e-05, "loss": 2.6044, "step": 1008 }, { "epoch": 0.5216494845360825, "grad_norm": 1.616853952407837, "learning_rate": 8.83326411135718e-05, "loss": 2.538, "step": 1012 }, { "epoch": 0.5237113402061856, "grad_norm": 1.7380198240280151, "learning_rate": 8.822297217584225e-05, "loss": 2.5838, "step": 1016 }, { "epoch": 0.5257731958762887, "grad_norm": 1.703635334968567, "learning_rate": 8.81128589823194e-05, "loss": 2.5035, "step": 1020 }, { "epoch": 0.5278350515463918, "grad_norm": 1.9964070320129395, "learning_rate": 8.80023028128206e-05, "loss": 2.544, "step": 1024 }, { "epoch": 0.5298969072164949, "grad_norm": 1.5779422521591187, "learning_rate": 8.789130495231186e-05, "loss": 2.6569, "step": 1028 }, { "epoch": 0.5319587628865979, "grad_norm": 1.771633505821228, "learning_rate": 8.77798666908928e-05, "loss": 2.5315, "step": 1032 }, { "epoch": 0.534020618556701, "grad_norm": 1.7763731479644775, "learning_rate": 8.766798932378172e-05, "loss": 2.5138, "step": 1036 }, { "epoch": 0.5360824742268041, "grad_norm": 1.699804425239563, "learning_rate": 8.755567415130058e-05, "loss": 2.5168, "step": 1040 }, { "epoch": 0.5381443298969072, "grad_norm": 1.8998210430145264, "learning_rate": 8.744292247885975e-05, "loss": 2.4795, "step": 1044 }, { "epoch": 0.5402061855670103, "grad_norm": 1.7309194803237915, "learning_rate": 8.732973561694297e-05, "loss": 2.5866, "step": 1048 }, { "epoch": 0.5422680412371134, "grad_norm": 1.667391061782837, "learning_rate": 8.721611488109212e-05, "loss": 2.5016, "step": 1052 }, { "epoch": 0.5443298969072164, "grad_norm": 1.7690322399139404, "learning_rate": 8.710206159189182e-05, "loss": 2.6189, "step": 1056 }, { "epoch": 0.5463917525773195, "grad_norm": 1.6563546657562256, "learning_rate": 8.698757707495417e-05, "loss": 2.4559, "step": 1060 }, { "epoch": 0.5484536082474227, "grad_norm": 1.8325282335281372, "learning_rate": 8.687266266090333e-05, "loss": 2.546, "step": 1064 }, { "epoch": 0.5505154639175258, "grad_norm": 1.6227245330810547, "learning_rate": 8.675731968536002e-05, "loss": 2.5709, "step": 1068 }, { "epoch": 0.5525773195876289, "grad_norm": 1.6123030185699463, "learning_rate": 8.664154948892607e-05, "loss": 2.5047, "step": 1072 }, { "epoch": 0.554639175257732, "grad_norm": 1.631011724472046, "learning_rate": 8.65253534171687e-05, "loss": 2.4568, "step": 1076 }, { "epoch": 0.5567010309278351, "grad_norm": 1.746801495552063, "learning_rate": 8.640873282060506e-05, "loss": 2.5825, "step": 1080 }, { "epoch": 0.5587628865979382, "grad_norm": 1.727461338043213, "learning_rate": 8.629168905468641e-05, "loss": 2.5724, "step": 1084 }, { "epoch": 0.5608247422680412, "grad_norm": 1.7059168815612793, "learning_rate": 8.617422347978239e-05, "loss": 2.5469, "step": 1088 }, { "epoch": 0.5628865979381443, "grad_norm": 1.6282740831375122, "learning_rate": 8.605633746116519e-05, "loss": 2.5503, "step": 1092 }, { "epoch": 0.5649484536082474, "grad_norm": 1.5870437622070312, "learning_rate": 8.593803236899379e-05, "loss": 2.5664, "step": 1096 }, { "epoch": 0.5670103092783505, "grad_norm": 1.7030541896820068, "learning_rate": 8.581930957829786e-05, "loss": 2.4574, "step": 1100 }, { "epoch": 0.5690721649484536, "grad_norm": 1.6299339532852173, "learning_rate": 8.570017046896197e-05, "loss": 2.5025, "step": 1104 }, { "epoch": 0.5711340206185567, "grad_norm": 1.648901104927063, "learning_rate": 8.558061642570936e-05, "loss": 2.5028, "step": 1108 }, { "epoch": 0.5731958762886598, "grad_norm": 1.5893465280532837, "learning_rate": 8.5460648838086e-05, "loss": 2.4432, "step": 1112 }, { "epoch": 0.5752577319587628, "grad_norm": 1.686439871788025, "learning_rate": 8.534026910044435e-05, "loss": 2.4683, "step": 1116 }, { "epoch": 0.5773195876288659, "grad_norm": 1.687406301498413, "learning_rate": 8.521947861192723e-05, "loss": 2.4717, "step": 1120 }, { "epoch": 0.5793814432989691, "grad_norm": 1.7713639736175537, "learning_rate": 8.509827877645144e-05, "loss": 2.6082, "step": 1124 }, { "epoch": 0.5814432989690722, "grad_norm": 1.7374292612075806, "learning_rate": 8.49766710026916e-05, "loss": 2.4267, "step": 1128 }, { "epoch": 0.5835051546391753, "grad_norm": 1.5629856586456299, "learning_rate": 8.48546567040636e-05, "loss": 2.4698, "step": 1132 }, { "epoch": 0.5855670103092784, "grad_norm": 1.6608953475952148, "learning_rate": 8.473223729870834e-05, "loss": 2.4221, "step": 1136 }, { "epoch": 0.5876288659793815, "grad_norm": 1.6824475526809692, "learning_rate": 8.460941420947514e-05, "loss": 2.5411, "step": 1140 }, { "epoch": 0.5896907216494846, "grad_norm": 1.590063214302063, "learning_rate": 8.448618886390522e-05, "loss": 2.4957, "step": 1144 }, { "epoch": 0.5917525773195876, "grad_norm": 1.6329950094223022, "learning_rate": 8.436256269421515e-05, "loss": 2.489, "step": 1148 }, { "epoch": 0.5938144329896907, "grad_norm": 1.7138408422470093, "learning_rate": 8.423853713728016e-05, "loss": 2.4355, "step": 1152 }, { "epoch": 0.5958762886597938, "grad_norm": 1.707245111465454, "learning_rate": 8.411411363461745e-05, "loss": 2.4351, "step": 1156 }, { "epoch": 0.5979381443298969, "grad_norm": 1.6063274145126343, "learning_rate": 8.398929363236948e-05, "loss": 2.4771, "step": 1160 }, { "epoch": 0.6, "grad_norm": 1.62711763381958, "learning_rate": 8.386407858128706e-05, "loss": 2.4136, "step": 1164 }, { "epoch": 0.6020618556701031, "grad_norm": 1.7263373136520386, "learning_rate": 8.373846993671261e-05, "loss": 2.6003, "step": 1168 }, { "epoch": 0.6041237113402061, "grad_norm": 1.6953507661819458, "learning_rate": 8.361246915856314e-05, "loss": 2.5567, "step": 1172 }, { "epoch": 0.6061855670103092, "grad_norm": 1.6107468605041504, "learning_rate": 8.348607771131336e-05, "loss": 2.3889, "step": 1176 }, { "epoch": 0.6082474226804123, "grad_norm": 1.5828678607940674, "learning_rate": 8.335929706397863e-05, "loss": 2.4063, "step": 1180 }, { "epoch": 0.6103092783505155, "grad_norm": 1.6693129539489746, "learning_rate": 8.323212869009782e-05, "loss": 2.4474, "step": 1184 }, { "epoch": 0.6123711340206186, "grad_norm": 1.575134515762329, "learning_rate": 8.310457406771635e-05, "loss": 2.5156, "step": 1188 }, { "epoch": 0.6144329896907217, "grad_norm": 1.600196123123169, "learning_rate": 8.297663467936882e-05, "loss": 2.4151, "step": 1192 }, { "epoch": 0.6164948453608248, "grad_norm": 1.6348146200180054, "learning_rate": 8.28483120120619e-05, "loss": 2.4336, "step": 1196 }, { "epoch": 0.6185567010309279, "grad_norm": 1.6424046754837036, "learning_rate": 8.271960755725702e-05, "loss": 2.45, "step": 1200 }, { "epoch": 0.6206185567010309, "grad_norm": 1.6451953649520874, "learning_rate": 8.2590522810853e-05, "loss": 2.4414, "step": 1204 }, { "epoch": 0.622680412371134, "grad_norm": 1.6768877506256104, "learning_rate": 8.246105927316874e-05, "loss": 2.459, "step": 1208 }, { "epoch": 0.6247422680412371, "grad_norm": 1.7277884483337402, "learning_rate": 8.233121844892568e-05, "loss": 2.4398, "step": 1212 }, { "epoch": 0.6268041237113402, "grad_norm": 1.6326000690460205, "learning_rate": 8.220100184723038e-05, "loss": 2.5278, "step": 1216 }, { "epoch": 0.6288659793814433, "grad_norm": 1.6363636255264282, "learning_rate": 8.2070410981557e-05, "loss": 2.466, "step": 1220 }, { "epoch": 0.6309278350515464, "grad_norm": 1.7234989404678345, "learning_rate": 8.193944736972963e-05, "loss": 2.4129, "step": 1224 }, { "epoch": 0.6329896907216495, "grad_norm": 1.6150864362716675, "learning_rate": 8.180811253390472e-05, "loss": 2.4872, "step": 1228 }, { "epoch": 0.6350515463917525, "grad_norm": 1.7305302619934082, "learning_rate": 8.167640800055335e-05, "loss": 2.5246, "step": 1232 }, { "epoch": 0.6371134020618556, "grad_norm": 1.7068026065826416, "learning_rate": 8.15443353004435e-05, "loss": 2.4428, "step": 1236 }, { "epoch": 0.6391752577319587, "grad_norm": 1.7080657482147217, "learning_rate": 8.141189596862225e-05, "loss": 2.3974, "step": 1240 }, { "epoch": 0.6412371134020619, "grad_norm": 1.690262794494629, "learning_rate": 8.127909154439796e-05, "loss": 2.5146, "step": 1244 }, { "epoch": 0.643298969072165, "grad_norm": 1.7586865425109863, "learning_rate": 8.114592357132236e-05, "loss": 2.459, "step": 1248 }, { "epoch": 0.6453608247422681, "grad_norm": 1.6623564958572388, "learning_rate": 8.101239359717263e-05, "loss": 2.4779, "step": 1252 }, { "epoch": 0.6474226804123712, "grad_norm": 1.694476842880249, "learning_rate": 8.087850317393335e-05, "loss": 2.4863, "step": 1256 }, { "epoch": 0.6494845360824743, "grad_norm": 1.7273415327072144, "learning_rate": 8.074425385777857e-05, "loss": 2.5528, "step": 1260 }, { "epoch": 0.6515463917525773, "grad_norm": 1.61915922164917, "learning_rate": 8.06096472090536e-05, "loss": 2.5224, "step": 1264 }, { "epoch": 0.6536082474226804, "grad_norm": 1.64993417263031, "learning_rate": 8.047468479225699e-05, "loss": 2.475, "step": 1268 }, { "epoch": 0.6556701030927835, "grad_norm": 1.6789292097091675, "learning_rate": 8.033936817602225e-05, "loss": 2.5414, "step": 1272 }, { "epoch": 0.6577319587628866, "grad_norm": 1.534625768661499, "learning_rate": 8.020369893309969e-05, "loss": 2.5186, "step": 1276 }, { "epoch": 0.6597938144329897, "grad_norm": 1.6226372718811035, "learning_rate": 8.006767864033805e-05, "loss": 2.5079, "step": 1280 }, { "epoch": 0.6618556701030928, "grad_norm": 1.6352249383926392, "learning_rate": 7.993130887866631e-05, "loss": 2.4257, "step": 1284 }, { "epoch": 0.6639175257731958, "grad_norm": 1.6394164562225342, "learning_rate": 7.97945912330752e-05, "loss": 2.4692, "step": 1288 }, { "epoch": 0.6659793814432989, "grad_norm": 1.7224717140197754, "learning_rate": 7.965752729259881e-05, "loss": 2.5426, "step": 1292 }, { "epoch": 0.668041237113402, "grad_norm": 1.662453532218933, "learning_rate": 7.952011865029614e-05, "loss": 2.5418, "step": 1296 }, { "epoch": 0.6701030927835051, "grad_norm": 1.5507217645645142, "learning_rate": 7.938236690323255e-05, "loss": 2.3688, "step": 1300 }, { "epoch": 0.6721649484536083, "grad_norm": 1.6199603080749512, "learning_rate": 7.924427365246125e-05, "loss": 2.4501, "step": 1304 }, { "epoch": 0.6742268041237114, "grad_norm": 1.5297261476516724, "learning_rate": 7.910584050300465e-05, "loss": 2.4709, "step": 1308 }, { "epoch": 0.6762886597938145, "grad_norm": 1.5516424179077148, "learning_rate": 7.896706906383568e-05, "loss": 2.4583, "step": 1312 }, { "epoch": 0.6783505154639176, "grad_norm": 1.5144871473312378, "learning_rate": 7.882796094785918e-05, "loss": 2.4077, "step": 1316 }, { "epoch": 0.6804123711340206, "grad_norm": 1.6823511123657227, "learning_rate": 7.868851777189306e-05, "loss": 2.4394, "step": 1320 }, { "epoch": 0.6824742268041237, "grad_norm": 1.5894042253494263, "learning_rate": 7.854874115664957e-05, "loss": 2.4256, "step": 1324 }, { "epoch": 0.6845360824742268, "grad_norm": 1.682361125946045, "learning_rate": 7.84086327267164e-05, "loss": 2.4082, "step": 1328 }, { "epoch": 0.6865979381443299, "grad_norm": 1.6730499267578125, "learning_rate": 7.826819411053787e-05, "loss": 2.43, "step": 1332 }, { "epoch": 0.688659793814433, "grad_norm": 1.5576122999191284, "learning_rate": 7.812742694039599e-05, "loss": 2.4109, "step": 1336 }, { "epoch": 0.6907216494845361, "grad_norm": 1.735579013824463, "learning_rate": 7.798633285239141e-05, "loss": 2.5218, "step": 1340 }, { "epoch": 0.6927835051546392, "grad_norm": 1.5843349695205688, "learning_rate": 7.784491348642452e-05, "loss": 2.4067, "step": 1344 }, { "epoch": 0.6948453608247422, "grad_norm": 1.5844374895095825, "learning_rate": 7.770317048617631e-05, "loss": 2.4879, "step": 1348 }, { "epoch": 0.6969072164948453, "grad_norm": 1.6641557216644287, "learning_rate": 7.756110549908924e-05, "loss": 2.4427, "step": 1352 }, { "epoch": 0.6989690721649484, "grad_norm": 1.567859172821045, "learning_rate": 7.741872017634824e-05, "loss": 2.458, "step": 1356 }, { "epoch": 0.7010309278350515, "grad_norm": 1.649179220199585, "learning_rate": 7.727601617286132e-05, "loss": 2.4483, "step": 1360 }, { "epoch": 0.7030927835051546, "grad_norm": 1.6399873495101929, "learning_rate": 7.713299514724051e-05, "loss": 2.5019, "step": 1364 }, { "epoch": 0.7051546391752578, "grad_norm": 1.724725365638733, "learning_rate": 7.698965876178246e-05, "loss": 2.4014, "step": 1368 }, { "epoch": 0.7072164948453609, "grad_norm": 1.734067440032959, "learning_rate": 7.68460086824492e-05, "loss": 2.3332, "step": 1372 }, { "epoch": 0.709278350515464, "grad_norm": 1.5960577726364136, "learning_rate": 7.67020465788487e-05, "loss": 2.4735, "step": 1376 }, { "epoch": 0.711340206185567, "grad_norm": 1.6822943687438965, "learning_rate": 7.655777412421554e-05, "loss": 2.515, "step": 1380 }, { "epoch": 0.7134020618556701, "grad_norm": 1.7459498643875122, "learning_rate": 7.641319299539145e-05, "loss": 2.4009, "step": 1384 }, { "epoch": 0.7154639175257732, "grad_norm": 1.5728472471237183, "learning_rate": 7.626830487280573e-05, "loss": 2.3888, "step": 1388 }, { "epoch": 0.7175257731958763, "grad_norm": 1.621801495552063, "learning_rate": 7.612311144045588e-05, "loss": 2.4036, "step": 1392 }, { "epoch": 0.7195876288659794, "grad_norm": 1.6353496313095093, "learning_rate": 7.597761438588784e-05, "loss": 2.4504, "step": 1396 }, { "epoch": 0.7216494845360825, "grad_norm": 1.5395163297653198, "learning_rate": 7.583181540017656e-05, "loss": 2.3526, "step": 1400 }, { "epoch": 0.7237113402061855, "grad_norm": 1.6341197490692139, "learning_rate": 7.568571617790624e-05, "loss": 2.4689, "step": 1404 }, { "epoch": 0.7257731958762886, "grad_norm": 1.5934208631515503, "learning_rate": 7.553931841715061e-05, "loss": 2.5371, "step": 1408 }, { "epoch": 0.7278350515463917, "grad_norm": 1.6311752796173096, "learning_rate": 7.539262381945323e-05, "loss": 2.4693, "step": 1412 }, { "epoch": 0.7298969072164948, "grad_norm": 1.6778818368911743, "learning_rate": 7.52456340898078e-05, "loss": 2.4041, "step": 1416 }, { "epoch": 0.7319587628865979, "grad_norm": 1.5758112668991089, "learning_rate": 7.509835093663821e-05, "loss": 2.4328, "step": 1420 }, { "epoch": 0.734020618556701, "grad_norm": 1.6166560649871826, "learning_rate": 7.495077607177872e-05, "loss": 2.386, "step": 1424 }, { "epoch": 0.7360824742268042, "grad_norm": 1.4999052286148071, "learning_rate": 7.48029112104541e-05, "loss": 2.3724, "step": 1428 }, { "epoch": 0.7381443298969073, "grad_norm": 1.6186021566390991, "learning_rate": 7.465475807125968e-05, "loss": 2.3181, "step": 1432 }, { "epoch": 0.7402061855670103, "grad_norm": 1.7516733407974243, "learning_rate": 7.450631837614138e-05, "loss": 2.4565, "step": 1436 }, { "epoch": 0.7422680412371134, "grad_norm": 1.647589087486267, "learning_rate": 7.435759385037565e-05, "loss": 2.4047, "step": 1440 }, { "epoch": 0.7443298969072165, "grad_norm": 1.5819942951202393, "learning_rate": 7.420858622254946e-05, "loss": 2.455, "step": 1444 }, { "epoch": 0.7463917525773196, "grad_norm": 1.5764845609664917, "learning_rate": 7.405929722454026e-05, "loss": 2.3826, "step": 1448 }, { "epoch": 0.7484536082474227, "grad_norm": 1.6311144828796387, "learning_rate": 7.39097285914957e-05, "loss": 2.3991, "step": 1452 }, { "epoch": 0.7505154639175258, "grad_norm": 1.5981923341751099, "learning_rate": 7.375988206181365e-05, "loss": 2.4517, "step": 1456 }, { "epoch": 0.7525773195876289, "grad_norm": 1.5369350910186768, "learning_rate": 7.360975937712185e-05, "loss": 2.4181, "step": 1460 }, { "epoch": 0.7546391752577319, "grad_norm": 1.6119554042816162, "learning_rate": 7.345936228225769e-05, "loss": 2.4652, "step": 1464 }, { "epoch": 0.756701030927835, "grad_norm": 1.7482960224151611, "learning_rate": 7.330869252524804e-05, "loss": 2.4213, "step": 1468 }, { "epoch": 0.7587628865979381, "grad_norm": 1.676172137260437, "learning_rate": 7.315775185728877e-05, "loss": 2.3816, "step": 1472 }, { "epoch": 0.7608247422680412, "grad_norm": 4.144953727722168, "learning_rate": 7.300654203272454e-05, "loss": 2.412, "step": 1476 }, { "epoch": 0.7628865979381443, "grad_norm": 1.5259236097335815, "learning_rate": 7.285506480902831e-05, "loss": 2.4274, "step": 1480 }, { "epoch": 0.7649484536082474, "grad_norm": 1.5517241954803467, "learning_rate": 7.270332194678097e-05, "loss": 2.3633, "step": 1484 }, { "epoch": 0.7670103092783506, "grad_norm": 1.583932638168335, "learning_rate": 7.255131520965087e-05, "loss": 2.3964, "step": 1488 }, { "epoch": 0.7690721649484537, "grad_norm": 1.5848530530929565, "learning_rate": 7.239904636437325e-05, "loss": 2.42, "step": 1492 }, { "epoch": 0.7711340206185567, "grad_norm": 1.5512683391571045, "learning_rate": 7.224651718072984e-05, "loss": 2.4654, "step": 1496 }, { "epoch": 0.7731958762886598, "grad_norm": 1.5761094093322754, "learning_rate": 7.20937294315282e-05, "loss": 2.4011, "step": 1500 }, { "epoch": 0.7752577319587629, "grad_norm": 1.519158124923706, "learning_rate": 7.194068489258109e-05, "loss": 2.4315, "step": 1504 }, { "epoch": 0.777319587628866, "grad_norm": 1.5774180889129639, "learning_rate": 7.178738534268591e-05, "loss": 2.4667, "step": 1508 }, { "epoch": 0.7793814432989691, "grad_norm": 1.6394808292388916, "learning_rate": 7.163383256360398e-05, "loss": 2.4861, "step": 1512 }, { "epoch": 0.7814432989690722, "grad_norm": 1.549655795097351, "learning_rate": 7.14800283400398e-05, "loss": 2.3323, "step": 1516 }, { "epoch": 0.7835051546391752, "grad_norm": 1.6154853105545044, "learning_rate": 7.132597445962042e-05, "loss": 2.5378, "step": 1520 }, { "epoch": 0.7855670103092783, "grad_norm": 1.6021648645401, "learning_rate": 7.117167271287453e-05, "loss": 2.4249, "step": 1524 }, { "epoch": 0.7876288659793814, "grad_norm": 1.665974736213684, "learning_rate": 7.101712489321169e-05, "loss": 2.4566, "step": 1528 }, { "epoch": 0.7896907216494845, "grad_norm": 1.6290616989135742, "learning_rate": 7.086233279690158e-05, "loss": 2.3565, "step": 1532 }, { "epoch": 0.7917525773195876, "grad_norm": 1.5475256443023682, "learning_rate": 7.070729822305298e-05, "loss": 2.4542, "step": 1536 }, { "epoch": 0.7938144329896907, "grad_norm": 1.6090917587280273, "learning_rate": 7.055202297359293e-05, "loss": 2.4498, "step": 1540 }, { "epoch": 0.7958762886597938, "grad_norm": 1.5616388320922852, "learning_rate": 7.039650885324582e-05, "loss": 2.3708, "step": 1544 }, { "epoch": 0.797938144329897, "grad_norm": 1.711113452911377, "learning_rate": 7.024075766951233e-05, "loss": 2.5234, "step": 1548 }, { "epoch": 0.8, "grad_norm": 1.6016921997070312, "learning_rate": 7.008477123264848e-05, "loss": 2.4755, "step": 1552 }, { "epoch": 0.8020618556701031, "grad_norm": 1.5505585670471191, "learning_rate": 6.99285513556446e-05, "loss": 2.3435, "step": 1556 }, { "epoch": 0.8041237113402062, "grad_norm": 1.532699704170227, "learning_rate": 6.977209985420419e-05, "loss": 2.4878, "step": 1560 }, { "epoch": 0.8061855670103093, "grad_norm": 2.197500705718994, "learning_rate": 6.961541854672293e-05, "loss": 2.3707, "step": 1564 }, { "epoch": 0.8082474226804124, "grad_norm": 1.6112059354782104, "learning_rate": 6.945850925426742e-05, "loss": 2.4803, "step": 1568 }, { "epoch": 0.8103092783505155, "grad_norm": 1.6318986415863037, "learning_rate": 6.930137380055403e-05, "loss": 2.315, "step": 1572 }, { "epoch": 0.8123711340206186, "grad_norm": 1.607735514640808, "learning_rate": 6.914401401192789e-05, "loss": 2.3804, "step": 1576 }, { "epoch": 0.8144329896907216, "grad_norm": 1.474547266960144, "learning_rate": 6.898643171734137e-05, "loss": 2.3736, "step": 1580 }, { "epoch": 0.8164948453608247, "grad_norm": 1.9736120700836182, "learning_rate": 6.882862874833305e-05, "loss": 2.3804, "step": 1584 }, { "epoch": 0.8185567010309278, "grad_norm": 1.5962013006210327, "learning_rate": 6.867060693900631e-05, "loss": 2.4503, "step": 1588 }, { "epoch": 0.8206185567010309, "grad_norm": 1.565443754196167, "learning_rate": 6.851236812600808e-05, "loss": 2.4805, "step": 1592 }, { "epoch": 0.822680412371134, "grad_norm": 1.6651315689086914, "learning_rate": 6.835391414850748e-05, "loss": 2.4496, "step": 1596 }, { "epoch": 0.8247422680412371, "grad_norm": 1.5796141624450684, "learning_rate": 6.819524684817438e-05, "loss": 2.4143, "step": 1600 }, { "epoch": 0.8268041237113402, "grad_norm": 1.513759732246399, "learning_rate": 6.803636806915812e-05, "loss": 2.4383, "step": 1604 }, { "epoch": 0.8288659793814434, "grad_norm": 1.556221842765808, "learning_rate": 6.787727965806591e-05, "loss": 2.4026, "step": 1608 }, { "epoch": 0.8309278350515464, "grad_norm": 1.5572640895843506, "learning_rate": 6.771798346394157e-05, "loss": 2.4022, "step": 1612 }, { "epoch": 0.8329896907216495, "grad_norm": 1.602418065071106, "learning_rate": 6.755848133824383e-05, "loss": 2.4555, "step": 1616 }, { "epoch": 0.8350515463917526, "grad_norm": 1.676102638244629, "learning_rate": 6.739877513482497e-05, "loss": 2.3695, "step": 1620 }, { "epoch": 0.8371134020618557, "grad_norm": 1.748834252357483, "learning_rate": 6.72388667099092e-05, "loss": 2.2801, "step": 1624 }, { "epoch": 0.8391752577319588, "grad_norm": 1.5271378755569458, "learning_rate": 6.707875792207108e-05, "loss": 2.2845, "step": 1628 }, { "epoch": 0.8412371134020619, "grad_norm": 1.7049288749694824, "learning_rate": 6.691845063221393e-05, "loss": 2.4126, "step": 1632 }, { "epoch": 0.843298969072165, "grad_norm": 1.590855360031128, "learning_rate": 6.675794670354826e-05, "loss": 2.3571, "step": 1636 }, { "epoch": 0.845360824742268, "grad_norm": 1.7120413780212402, "learning_rate": 6.659724800157002e-05, "loss": 2.4111, "step": 1640 }, { "epoch": 0.8474226804123711, "grad_norm": 1.5834614038467407, "learning_rate": 6.643635639403897e-05, "loss": 2.4045, "step": 1644 }, { "epoch": 0.8494845360824742, "grad_norm": 1.4899585247039795, "learning_rate": 6.627527375095697e-05, "loss": 2.2914, "step": 1648 }, { "epoch": 0.8515463917525773, "grad_norm": 1.573357105255127, "learning_rate": 6.611400194454619e-05, "loss": 2.3746, "step": 1652 }, { "epoch": 0.8536082474226804, "grad_norm": 1.528746247291565, "learning_rate": 6.595254284922748e-05, "loss": 2.3568, "step": 1656 }, { "epoch": 0.8556701030927835, "grad_norm": 1.6635682582855225, "learning_rate": 6.579089834159844e-05, "loss": 2.3868, "step": 1660 }, { "epoch": 0.8577319587628865, "grad_norm": 1.5613417625427246, "learning_rate": 6.562907030041168e-05, "loss": 2.377, "step": 1664 }, { "epoch": 0.8597938144329897, "grad_norm": 1.6135239601135254, "learning_rate": 6.546706060655298e-05, "loss": 2.4542, "step": 1668 }, { "epoch": 0.8618556701030928, "grad_norm": 1.5151220560073853, "learning_rate": 6.530487114301944e-05, "loss": 2.2057, "step": 1672 }, { "epoch": 0.8639175257731959, "grad_norm": 1.6280299425125122, "learning_rate": 6.514250379489753e-05, "loss": 2.3594, "step": 1676 }, { "epoch": 0.865979381443299, "grad_norm": 1.5538045167922974, "learning_rate": 6.49799604493413e-05, "loss": 2.4155, "step": 1680 }, { "epoch": 0.8680412371134021, "grad_norm": 1.5412131547927856, "learning_rate": 6.481724299555029e-05, "loss": 2.3285, "step": 1684 }, { "epoch": 0.8701030927835052, "grad_norm": 1.7468520402908325, "learning_rate": 6.465435332474768e-05, "loss": 2.2616, "step": 1688 }, { "epoch": 0.8721649484536083, "grad_norm": 1.4656267166137695, "learning_rate": 6.449129333015834e-05, "loss": 2.2932, "step": 1692 }, { "epoch": 0.8742268041237113, "grad_norm": 1.554957628250122, "learning_rate": 6.432806490698671e-05, "loss": 2.4188, "step": 1696 }, { "epoch": 0.8762886597938144, "grad_norm": 1.6073874235153198, "learning_rate": 6.416466995239485e-05, "loss": 2.2739, "step": 1700 }, { "epoch": 0.8783505154639175, "grad_norm": 1.6184226274490356, "learning_rate": 6.400111036548037e-05, "loss": 2.3879, "step": 1704 }, { "epoch": 0.8804123711340206, "grad_norm": 1.5735539197921753, "learning_rate": 6.383738804725438e-05, "loss": 2.3191, "step": 1708 }, { "epoch": 0.8824742268041237, "grad_norm": 1.6026270389556885, "learning_rate": 6.36735049006193e-05, "loss": 2.5497, "step": 1712 }, { "epoch": 0.8845360824742268, "grad_norm": 1.6282131671905518, "learning_rate": 6.350946283034696e-05, "loss": 2.4152, "step": 1716 }, { "epoch": 0.8865979381443299, "grad_norm": 1.553600549697876, "learning_rate": 6.334526374305615e-05, "loss": 2.3417, "step": 1720 }, { "epoch": 0.8886597938144329, "grad_norm": 1.5668585300445557, "learning_rate": 6.318090954719074e-05, "loss": 2.3811, "step": 1724 }, { "epoch": 0.8907216494845361, "grad_norm": 1.7151823043823242, "learning_rate": 6.301640215299735e-05, "loss": 2.2925, "step": 1728 }, { "epoch": 0.8927835051546392, "grad_norm": 1.5615097284317017, "learning_rate": 6.285174347250322e-05, "loss": 2.3591, "step": 1732 }, { "epoch": 0.8948453608247423, "grad_norm": 1.6054131984710693, "learning_rate": 6.26869354194939e-05, "loss": 2.3084, "step": 1736 }, { "epoch": 0.8969072164948454, "grad_norm": 1.606719732284546, "learning_rate": 6.252197990949108e-05, "loss": 2.4081, "step": 1740 }, { "epoch": 0.8989690721649485, "grad_norm": 1.6772600412368774, "learning_rate": 6.235687885973032e-05, "loss": 2.3733, "step": 1744 }, { "epoch": 0.9010309278350516, "grad_norm": 1.508259654045105, "learning_rate": 6.219163418913872e-05, "loss": 2.3318, "step": 1748 }, { "epoch": 0.9030927835051547, "grad_norm": 1.5906274318695068, "learning_rate": 6.202624781831268e-05, "loss": 2.297, "step": 1752 }, { "epoch": 0.9051546391752577, "grad_norm": 1.5747168064117432, "learning_rate": 6.186072166949552e-05, "loss": 2.4498, "step": 1756 }, { "epoch": 0.9072164948453608, "grad_norm": 1.5119340419769287, "learning_rate": 6.16950576665552e-05, "loss": 2.3045, "step": 1760 }, { "epoch": 0.9092783505154639, "grad_norm": 1.5936625003814697, "learning_rate": 6.152925773496189e-05, "loss": 2.3989, "step": 1764 }, { "epoch": 0.911340206185567, "grad_norm": 1.6385735273361206, "learning_rate": 6.13633238017656e-05, "loss": 2.3415, "step": 1768 }, { "epoch": 0.9134020618556701, "grad_norm": 1.6806796789169312, "learning_rate": 6.119725779557386e-05, "loss": 2.37, "step": 1772 }, { "epoch": 0.9154639175257732, "grad_norm": 1.5404062271118164, "learning_rate": 6.103106164652924e-05, "loss": 2.476, "step": 1776 }, { "epoch": 0.9175257731958762, "grad_norm": 1.5629997253417969, "learning_rate": 6.086473728628691e-05, "loss": 2.3843, "step": 1780 }, { "epoch": 0.9195876288659793, "grad_norm": 1.5114765167236328, "learning_rate": 6.069828664799221e-05, "loss": 2.3972, "step": 1784 }, { "epoch": 0.9216494845360824, "grad_norm": 1.5878251791000366, "learning_rate": 6.053171166625817e-05, "loss": 2.3686, "step": 1788 }, { "epoch": 0.9237113402061856, "grad_norm": 1.6494956016540527, "learning_rate": 6.036501427714304e-05, "loss": 2.4627, "step": 1792 }, { "epoch": 0.9257731958762887, "grad_norm": 1.5250719785690308, "learning_rate": 6.0198196418127804e-05, "loss": 2.332, "step": 1796 }, { "epoch": 0.9278350515463918, "grad_norm": 1.5805870294570923, "learning_rate": 6.0031260028093596e-05, "loss": 2.2881, "step": 1800 }, { "epoch": 0.9298969072164949, "grad_norm": 1.6690597534179688, "learning_rate": 5.98642070472992e-05, "loss": 2.2946, "step": 1804 }, { "epoch": 0.931958762886598, "grad_norm": 1.5214745998382568, "learning_rate": 5.969703941735858e-05, "loss": 2.3984, "step": 1808 }, { "epoch": 0.934020618556701, "grad_norm": 1.6358650922775269, "learning_rate": 5.952975908121815e-05, "loss": 2.3758, "step": 1812 }, { "epoch": 0.9360824742268041, "grad_norm": 1.9057590961456299, "learning_rate": 5.936236798313431e-05, "loss": 2.3347, "step": 1816 }, { "epoch": 0.9381443298969072, "grad_norm": 1.6489713191986084, "learning_rate": 5.9194868068650845e-05, "loss": 2.2552, "step": 1820 }, { "epoch": 0.9402061855670103, "grad_norm": 1.5380582809448242, "learning_rate": 5.902726128457625e-05, "loss": 2.2898, "step": 1824 }, { "epoch": 0.9422680412371134, "grad_norm": 1.5303738117218018, "learning_rate": 5.885954957896115e-05, "loss": 2.2896, "step": 1828 }, { "epoch": 0.9443298969072165, "grad_norm": 1.577243685722351, "learning_rate": 5.8691734901075634e-05, "loss": 2.3281, "step": 1832 }, { "epoch": 0.9463917525773196, "grad_norm": 1.545732021331787, "learning_rate": 5.852381920138663e-05, "loss": 2.2197, "step": 1836 }, { "epoch": 0.9484536082474226, "grad_norm": 1.5541272163391113, "learning_rate": 5.835580443153522e-05, "loss": 2.2776, "step": 1840 }, { "epoch": 0.9505154639175257, "grad_norm": 1.5543440580368042, "learning_rate": 5.818769254431395e-05, "loss": 2.3597, "step": 1844 }, { "epoch": 0.9525773195876288, "grad_norm": 1.5079783201217651, "learning_rate": 5.80194854936441e-05, "loss": 2.3476, "step": 1848 }, { "epoch": 0.954639175257732, "grad_norm": 1.503811240196228, "learning_rate": 5.7851185234553064e-05, "loss": 2.3318, "step": 1852 }, { "epoch": 0.9567010309278351, "grad_norm": 1.532036542892456, "learning_rate": 5.768279372315153e-05, "loss": 2.2278, "step": 1856 }, { "epoch": 0.9587628865979382, "grad_norm": 1.5316864252090454, "learning_rate": 5.7514312916610814e-05, "loss": 2.3835, "step": 1860 }, { "epoch": 0.9608247422680413, "grad_norm": 1.5309964418411255, "learning_rate": 5.7345744773140086e-05, "loss": 2.3779, "step": 1864 }, { "epoch": 0.9628865979381444, "grad_norm": 1.577561855316162, "learning_rate": 5.71770912519636e-05, "loss": 2.3334, "step": 1868 }, { "epoch": 0.9649484536082474, "grad_norm": 1.5817878246307373, "learning_rate": 5.7008354313297926e-05, "loss": 2.28, "step": 1872 }, { "epoch": 0.9670103092783505, "grad_norm": 1.5670493841171265, "learning_rate": 5.683953591832922e-05, "loss": 2.304, "step": 1876 }, { "epoch": 0.9690721649484536, "grad_norm": 1.5366458892822266, "learning_rate": 5.667063802919032e-05, "loss": 2.4577, "step": 1880 }, { "epoch": 0.9711340206185567, "grad_norm": 1.5330809354782104, "learning_rate": 5.6501662608938014e-05, "loss": 2.4178, "step": 1884 }, { "epoch": 0.9731958762886598, "grad_norm": 1.4714388847351074, "learning_rate": 5.633261162153027e-05, "loss": 2.3252, "step": 1888 }, { "epoch": 0.9752577319587629, "grad_norm": 1.5558996200561523, "learning_rate": 5.6163487031803305e-05, "loss": 2.3597, "step": 1892 }, { "epoch": 0.977319587628866, "grad_norm": 1.5362873077392578, "learning_rate": 5.5994290805448826e-05, "loss": 2.2979, "step": 1896 }, { "epoch": 0.979381443298969, "grad_norm": 1.5119233131408691, "learning_rate": 5.582502490899111e-05, "loss": 2.3582, "step": 1900 }, { "epoch": 0.9814432989690721, "grad_norm": 1.6467958688735962, "learning_rate": 5.565569130976422e-05, "loss": 2.3366, "step": 1904 }, { "epoch": 0.9835051546391752, "grad_norm": 1.6580411195755005, "learning_rate": 5.548629197588913e-05, "loss": 2.2901, "step": 1908 }, { "epoch": 0.9855670103092784, "grad_norm": 1.4866169691085815, "learning_rate": 5.5316828876250795e-05, "loss": 2.3313, "step": 1912 }, { "epoch": 0.9876288659793815, "grad_norm": 1.6460485458374023, "learning_rate": 5.514730398047533e-05, "loss": 2.2917, "step": 1916 }, { "epoch": 0.9896907216494846, "grad_norm": 1.4895657300949097, "learning_rate": 5.497771925890706e-05, "loss": 2.3709, "step": 1920 }, { "epoch": 0.9917525773195877, "grad_norm": 1.4158916473388672, "learning_rate": 5.48080766825857e-05, "loss": 2.1582, "step": 1924 }, { "epoch": 0.9938144329896907, "grad_norm": 1.611509919166565, "learning_rate": 5.463837822322333e-05, "loss": 2.5048, "step": 1928 }, { "epoch": 0.9958762886597938, "grad_norm": 1.6175845861434937, "learning_rate": 5.446862585318161e-05, "loss": 2.3836, "step": 1932 }, { "epoch": 0.9979381443298969, "grad_norm": 1.6315233707427979, "learning_rate": 5.429882154544875e-05, "loss": 2.2926, "step": 1936 }, { "epoch": 1.0, "grad_norm": 4.427371025085449, "learning_rate": 5.4128967273616625e-05, "loss": 2.3215, "step": 1940 } ], "logging_steps": 4, "max_steps": 3880, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1940, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2824010606012006e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }