diff --git "a/checkpoint-3022/trainer_state.json" "b/checkpoint-3022/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3022/trainer_state.json" @@ -0,0 +1,5318 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 3022, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0026472534745201853, + "grad_norm": 24.50491714477539, + "learning_rate": 2.631578947368421e-06, + "loss": 6.5473, + "step": 4 + }, + { + "epoch": 0.005294506949040371, + "grad_norm": 17.426511764526367, + "learning_rate": 5.263157894736842e-06, + "loss": 6.2116, + "step": 8 + }, + { + "epoch": 0.007941760423560556, + "grad_norm": 6.35976505279541, + "learning_rate": 7.894736842105263e-06, + "loss": 5.7967, + "step": 12 + }, + { + "epoch": 0.010589013898080741, + "grad_norm": 5.454939842224121, + "learning_rate": 1.0526315789473684e-05, + "loss": 5.3365, + "step": 16 + }, + { + "epoch": 0.013236267372600927, + "grad_norm": 4.607099533081055, + "learning_rate": 1.3157894736842106e-05, + "loss": 4.7105, + "step": 20 + }, + { + "epoch": 0.01588352084712111, + "grad_norm": 3.6498019695281982, + "learning_rate": 1.5789473684210526e-05, + "loss": 4.286, + "step": 24 + }, + { + "epoch": 0.018530774321641297, + "grad_norm": 4.196900844573975, + "learning_rate": 1.8421052631578947e-05, + "loss": 4.3134, + "step": 28 + }, + { + "epoch": 0.021178027796161483, + "grad_norm": 3.617469072341919, + "learning_rate": 2.105263157894737e-05, + "loss": 3.7494, + "step": 32 + }, + { + "epoch": 0.02382528127068167, + "grad_norm": 3.05267333984375, + "learning_rate": 2.368421052631579e-05, + "loss": 3.8046, + "step": 36 + }, + { + "epoch": 0.026472534745201854, + "grad_norm": 2.607614517211914, + "learning_rate": 2.6315789473684212e-05, + "loss": 3.385, + "step": 40 + }, + { + "epoch": 0.02911978821972204, + "grad_norm": 2.536888837814331, + "learning_rate": 2.8947368421052634e-05, + "loss": 3.3516, + "step": 44 + }, + { + "epoch": 0.03176704169424222, + "grad_norm": 2.315871000289917, + "learning_rate": 3.157894736842105e-05, + "loss": 3.0795, + "step": 48 + }, + { + "epoch": 0.03441429516876241, + "grad_norm": 2.3058571815490723, + "learning_rate": 3.421052631578947e-05, + "loss": 3.0708, + "step": 52 + }, + { + "epoch": 0.037061548643282594, + "grad_norm": 2.067796230316162, + "learning_rate": 3.6842105263157895e-05, + "loss": 2.8311, + "step": 56 + }, + { + "epoch": 0.03970880211780278, + "grad_norm": 1.9578440189361572, + "learning_rate": 3.9473684210526316e-05, + "loss": 2.696, + "step": 60 + }, + { + "epoch": 0.042356055592322965, + "grad_norm": 2.043933629989624, + "learning_rate": 4.210526315789474e-05, + "loss": 2.7501, + "step": 64 + }, + { + "epoch": 0.04500330906684315, + "grad_norm": 1.82830810546875, + "learning_rate": 4.473684210526316e-05, + "loss": 2.5058, + "step": 68 + }, + { + "epoch": 0.04765056254136334, + "grad_norm": 1.8841806650161743, + "learning_rate": 4.736842105263158e-05, + "loss": 2.5708, + "step": 72 + }, + { + "epoch": 0.05029781601588352, + "grad_norm": 1.9775539636611938, + "learning_rate": 5e-05, + "loss": 2.6332, + "step": 76 + }, + { + "epoch": 0.05294506949040371, + "grad_norm": 1.7908610105514526, + "learning_rate": 5.2631578947368424e-05, + "loss": 2.5441, + "step": 80 + }, + { + "epoch": 0.05559232296492389, + "grad_norm": 1.977647066116333, + "learning_rate": 5.526315789473685e-05, + "loss": 2.3617, + "step": 84 + }, + { + "epoch": 0.05823957643944408, + "grad_norm": 2.008470296859741, + "learning_rate": 5.789473684210527e-05, + "loss": 2.3994, + "step": 88 + }, + { + "epoch": 0.06088682991396426, + "grad_norm": 2.070720911026001, + "learning_rate": 6.052631578947369e-05, + "loss": 2.3509, + "step": 92 + }, + { + "epoch": 0.06353408338848444, + "grad_norm": 2.0442869663238525, + "learning_rate": 6.31578947368421e-05, + "loss": 2.35, + "step": 96 + }, + { + "epoch": 0.06618133686300463, + "grad_norm": 1.8274725675582886, + "learning_rate": 6.578947368421054e-05, + "loss": 2.2802, + "step": 100 + }, + { + "epoch": 0.06882859033752482, + "grad_norm": 1.9744892120361328, + "learning_rate": 6.842105263157895e-05, + "loss": 2.4711, + "step": 104 + }, + { + "epoch": 0.071475843812045, + "grad_norm": 1.881946086883545, + "learning_rate": 7.105263157894737e-05, + "loss": 2.3495, + "step": 108 + }, + { + "epoch": 0.07412309728656519, + "grad_norm": 1.7632906436920166, + "learning_rate": 7.368421052631579e-05, + "loss": 2.1906, + "step": 112 + }, + { + "epoch": 0.07677035076108538, + "grad_norm": 1.8465447425842285, + "learning_rate": 7.631578947368422e-05, + "loss": 2.4193, + "step": 116 + }, + { + "epoch": 0.07941760423560557, + "grad_norm": 1.978273868560791, + "learning_rate": 7.894736842105263e-05, + "loss": 2.3229, + "step": 120 + }, + { + "epoch": 0.08206485771012574, + "grad_norm": 1.9878270626068115, + "learning_rate": 8.157894736842105e-05, + "loss": 2.3028, + "step": 124 + }, + { + "epoch": 0.08471211118464593, + "grad_norm": 1.7065322399139404, + "learning_rate": 8.421052631578948e-05, + "loss": 2.244, + "step": 128 + }, + { + "epoch": 0.08735936465916612, + "grad_norm": 1.8170701265335083, + "learning_rate": 8.68421052631579e-05, + "loss": 2.1112, + "step": 132 + }, + { + "epoch": 0.0900066181336863, + "grad_norm": 1.9288476705551147, + "learning_rate": 8.947368421052632e-05, + "loss": 2.3551, + "step": 136 + }, + { + "epoch": 0.09265387160820648, + "grad_norm": 1.8695253133773804, + "learning_rate": 9.210526315789474e-05, + "loss": 2.2814, + "step": 140 + }, + { + "epoch": 0.09530112508272667, + "grad_norm": 1.7066093683242798, + "learning_rate": 9.473684210526316e-05, + "loss": 1.9036, + "step": 144 + }, + { + "epoch": 0.09794837855724686, + "grad_norm": 1.8588757514953613, + "learning_rate": 9.736842105263158e-05, + "loss": 2.0139, + "step": 148 + }, + { + "epoch": 0.10059563203176704, + "grad_norm": 1.789518117904663, + "learning_rate": 0.0001, + "loss": 2.1809, + "step": 152 + }, + { + "epoch": 0.10324288550628723, + "grad_norm": 1.9242740869522095, + "learning_rate": 9.999952071344157e-05, + "loss": 2.301, + "step": 156 + }, + { + "epoch": 0.10589013898080742, + "grad_norm": 1.7974549531936646, + "learning_rate": 9.999808286295485e-05, + "loss": 2.2312, + "step": 160 + }, + { + "epoch": 0.10853739245532759, + "grad_norm": 1.7276233434677124, + "learning_rate": 9.999568647610555e-05, + "loss": 2.1109, + "step": 164 + }, + { + "epoch": 0.11118464592984778, + "grad_norm": 1.8286519050598145, + "learning_rate": 9.999233159883593e-05, + "loss": 2.0782, + "step": 168 + }, + { + "epoch": 0.11383189940436797, + "grad_norm": 1.919313907623291, + "learning_rate": 9.998801829546386e-05, + "loss": 2.0693, + "step": 172 + }, + { + "epoch": 0.11647915287888816, + "grad_norm": 1.6544960737228394, + "learning_rate": 9.998274664868173e-05, + "loss": 2.0982, + "step": 176 + }, + { + "epoch": 0.11912640635340833, + "grad_norm": 1.8223872184753418, + "learning_rate": 9.997651675955466e-05, + "loss": 2.1379, + "step": 180 + }, + { + "epoch": 0.12177365982792852, + "grad_norm": 1.7743052244186401, + "learning_rate": 9.996932874751877e-05, + "loss": 2.0637, + "step": 184 + }, + { + "epoch": 0.12442091330244871, + "grad_norm": 1.7228261232376099, + "learning_rate": 9.996118275037873e-05, + "loss": 2.1696, + "step": 188 + }, + { + "epoch": 0.1270681667769689, + "grad_norm": 1.6266913414001465, + "learning_rate": 9.995207892430524e-05, + "loss": 2.1247, + "step": 192 + }, + { + "epoch": 0.12971542025148908, + "grad_norm": 1.8206615447998047, + "learning_rate": 9.994201744383196e-05, + "loss": 2.1831, + "step": 196 + }, + { + "epoch": 0.13236267372600927, + "grad_norm": 1.943579912185669, + "learning_rate": 9.993099850185216e-05, + "loss": 1.9262, + "step": 200 + }, + { + "epoch": 0.13500992720052946, + "grad_norm": 1.89098060131073, + "learning_rate": 9.991902230961511e-05, + "loss": 2.2636, + "step": 204 + }, + { + "epoch": 0.13765718067504965, + "grad_norm": 1.8418017625808716, + "learning_rate": 9.99060890967219e-05, + "loss": 2.2454, + "step": 208 + }, + { + "epoch": 0.14030443414956983, + "grad_norm": 1.7433375120162964, + "learning_rate": 9.989219911112113e-05, + "loss": 2.2591, + "step": 212 + }, + { + "epoch": 0.14295168762409, + "grad_norm": 1.885964035987854, + "learning_rate": 9.987735261910417e-05, + "loss": 1.9402, + "step": 216 + }, + { + "epoch": 0.14559894109861019, + "grad_norm": 1.626397728919983, + "learning_rate": 9.986154990529995e-05, + "loss": 2.119, + "step": 220 + }, + { + "epoch": 0.14824619457313037, + "grad_norm": 1.5490047931671143, + "learning_rate": 9.984479127266961e-05, + "loss": 1.8635, + "step": 224 + }, + { + "epoch": 0.15089344804765056, + "grad_norm": 1.5588316917419434, + "learning_rate": 9.982707704250065e-05, + "loss": 1.8135, + "step": 228 + }, + { + "epoch": 0.15354070152217075, + "grad_norm": 1.9416462182998657, + "learning_rate": 9.980840755440075e-05, + "loss": 2.2288, + "step": 232 + }, + { + "epoch": 0.15618795499669094, + "grad_norm": 1.5774728059768677, + "learning_rate": 9.978878316629133e-05, + "loss": 1.9254, + "step": 236 + }, + { + "epoch": 0.15883520847121113, + "grad_norm": 1.6661707162857056, + "learning_rate": 9.976820425440058e-05, + "loss": 2.0111, + "step": 240 + }, + { + "epoch": 0.1614824619457313, + "grad_norm": 1.5805509090423584, + "learning_rate": 9.974667121325634e-05, + "loss": 2.0657, + "step": 244 + }, + { + "epoch": 0.16412971542025148, + "grad_norm": 1.7854478359222412, + "learning_rate": 9.972418445567851e-05, + "loss": 1.8586, + "step": 248 + }, + { + "epoch": 0.16677696889477167, + "grad_norm": 1.61441171169281, + "learning_rate": 9.97007444127711e-05, + "loss": 1.9234, + "step": 252 + }, + { + "epoch": 0.16942422236929186, + "grad_norm": 2.154454469680786, + "learning_rate": 9.967635153391401e-05, + "loss": 1.949, + "step": 256 + }, + { + "epoch": 0.17207147584381205, + "grad_norm": 1.5182636976242065, + "learning_rate": 9.965100628675441e-05, + "loss": 2.013, + "step": 260 + }, + { + "epoch": 0.17471872931833224, + "grad_norm": 1.751714825630188, + "learning_rate": 9.962470915719775e-05, + "loss": 1.9629, + "step": 264 + }, + { + "epoch": 0.17736598279285243, + "grad_norm": 1.5807703733444214, + "learning_rate": 9.959746064939846e-05, + "loss": 1.8705, + "step": 268 + }, + { + "epoch": 0.1800132362673726, + "grad_norm": 1.7142225503921509, + "learning_rate": 9.956926128575026e-05, + "loss": 2.0033, + "step": 272 + }, + { + "epoch": 0.18266048974189278, + "grad_norm": 1.555530309677124, + "learning_rate": 9.954011160687622e-05, + "loss": 1.8995, + "step": 276 + }, + { + "epoch": 0.18530774321641297, + "grad_norm": 1.5679693222045898, + "learning_rate": 9.951001217161829e-05, + "loss": 2.042, + "step": 280 + }, + { + "epoch": 0.18795499669093316, + "grad_norm": 1.6399418115615845, + "learning_rate": 9.947896355702666e-05, + "loss": 2.0388, + "step": 284 + }, + { + "epoch": 0.19060225016545335, + "grad_norm": 1.7505602836608887, + "learning_rate": 9.944696635834867e-05, + "loss": 1.9648, + "step": 288 + }, + { + "epoch": 0.19324950363997354, + "grad_norm": 1.4888848066329956, + "learning_rate": 9.941402118901744e-05, + "loss": 1.8595, + "step": 292 + }, + { + "epoch": 0.19589675711449372, + "grad_norm": 1.4739277362823486, + "learning_rate": 9.938012868064e-05, + "loss": 1.7959, + "step": 296 + }, + { + "epoch": 0.1985440105890139, + "grad_norm": 1.5393471717834473, + "learning_rate": 9.934528948298533e-05, + "loss": 1.9469, + "step": 300 + }, + { + "epoch": 0.20119126406353408, + "grad_norm": 1.5673627853393555, + "learning_rate": 9.930950426397179e-05, + "loss": 1.9332, + "step": 304 + }, + { + "epoch": 0.20383851753805426, + "grad_norm": 1.6461111307144165, + "learning_rate": 9.927277370965435e-05, + "loss": 1.8055, + "step": 308 + }, + { + "epoch": 0.20648577101257445, + "grad_norm": 1.5950462818145752, + "learning_rate": 9.923509852421145e-05, + "loss": 1.8414, + "step": 312 + }, + { + "epoch": 0.20913302448709464, + "grad_norm": 1.433727741241455, + "learning_rate": 9.919647942993148e-05, + "loss": 1.9514, + "step": 316 + }, + { + "epoch": 0.21178027796161483, + "grad_norm": 1.445776343345642, + "learning_rate": 9.915691716719898e-05, + "loss": 1.7297, + "step": 320 + }, + { + "epoch": 0.21442753143613502, + "grad_norm": 1.9325745105743408, + "learning_rate": 9.911641249448036e-05, + "loss": 1.9855, + "step": 324 + }, + { + "epoch": 0.21707478491065518, + "grad_norm": 1.494813323020935, + "learning_rate": 9.907496618830942e-05, + "loss": 1.7916, + "step": 328 + }, + { + "epoch": 0.21972203838517537, + "grad_norm": 1.4863932132720947, + "learning_rate": 9.903257904327249e-05, + "loss": 1.8029, + "step": 332 + }, + { + "epoch": 0.22236929185969556, + "grad_norm": 1.594827651977539, + "learning_rate": 9.898925187199308e-05, + "loss": 1.9516, + "step": 336 + }, + { + "epoch": 0.22501654533421575, + "grad_norm": 1.5738781690597534, + "learning_rate": 9.894498550511646e-05, + "loss": 1.8997, + "step": 340 + }, + { + "epoch": 0.22766379880873594, + "grad_norm": 1.5598024129867554, + "learning_rate": 9.88997807912936e-05, + "loss": 1.9249, + "step": 344 + }, + { + "epoch": 0.23031105228325613, + "grad_norm": 1.4761321544647217, + "learning_rate": 9.885363859716497e-05, + "loss": 1.7571, + "step": 348 + }, + { + "epoch": 0.23295830575777632, + "grad_norm": 1.4266904592514038, + "learning_rate": 9.88065598073439e-05, + "loss": 1.9811, + "step": 352 + }, + { + "epoch": 0.23560555923229648, + "grad_norm": 1.5371057987213135, + "learning_rate": 9.875854532439964e-05, + "loss": 1.8021, + "step": 356 + }, + { + "epoch": 0.23825281270681667, + "grad_norm": 1.380096673965454, + "learning_rate": 9.870959606884004e-05, + "loss": 1.8223, + "step": 360 + }, + { + "epoch": 0.24090006618133686, + "grad_norm": 1.632664680480957, + "learning_rate": 9.865971297909393e-05, + "loss": 2.006, + "step": 364 + }, + { + "epoch": 0.24354731965585705, + "grad_norm": 1.3765276670455933, + "learning_rate": 9.860889701149307e-05, + "loss": 1.7893, + "step": 368 + }, + { + "epoch": 0.24619457313037724, + "grad_norm": 1.5789958238601685, + "learning_rate": 9.855714914025384e-05, + "loss": 1.9381, + "step": 372 + }, + { + "epoch": 0.24884182660489743, + "grad_norm": 1.8294042348861694, + "learning_rate": 9.850447035745866e-05, + "loss": 1.8584, + "step": 376 + }, + { + "epoch": 0.2514890800794176, + "grad_norm": 1.5388972759246826, + "learning_rate": 9.845086167303679e-05, + "loss": 1.8763, + "step": 380 + }, + { + "epoch": 0.2541363335539378, + "grad_norm": 1.5301390886306763, + "learning_rate": 9.839632411474513e-05, + "loss": 2.0612, + "step": 384 + }, + { + "epoch": 0.256783587028458, + "grad_norm": 1.546277642250061, + "learning_rate": 9.83408587281484e-05, + "loss": 1.9085, + "step": 388 + }, + { + "epoch": 0.25943084050297816, + "grad_norm": 1.5818853378295898, + "learning_rate": 9.828446657659918e-05, + "loss": 2.0181, + "step": 392 + }, + { + "epoch": 0.26207809397749837, + "grad_norm": 1.2648255825042725, + "learning_rate": 9.82271487412175e-05, + "loss": 1.6947, + "step": 396 + }, + { + "epoch": 0.26472534745201853, + "grad_norm": 1.541934847831726, + "learning_rate": 9.816890632087006e-05, + "loss": 1.8053, + "step": 400 + }, + { + "epoch": 0.2673726009265387, + "grad_norm": 1.5966472625732422, + "learning_rate": 9.810974043214922e-05, + "loss": 1.8733, + "step": 404 + }, + { + "epoch": 0.2700198544010589, + "grad_norm": 1.5871154069900513, + "learning_rate": 9.804965220935161e-05, + "loss": 1.896, + "step": 408 + }, + { + "epoch": 0.2726671078755791, + "grad_norm": 1.4850573539733887, + "learning_rate": 9.798864280445632e-05, + "loss": 1.8494, + "step": 412 + }, + { + "epoch": 0.2753143613500993, + "grad_norm": 1.4737725257873535, + "learning_rate": 9.792671338710285e-05, + "loss": 1.8145, + "step": 416 + }, + { + "epoch": 0.27796161482461945, + "grad_norm": 1.5895408391952515, + "learning_rate": 9.786386514456872e-05, + "loss": 1.9279, + "step": 420 + }, + { + "epoch": 0.28060886829913967, + "grad_norm": 1.522838830947876, + "learning_rate": 9.780009928174661e-05, + "loss": 1.9103, + "step": 424 + }, + { + "epoch": 0.28325612177365983, + "grad_norm": 1.4890238046646118, + "learning_rate": 9.773541702112137e-05, + "loss": 1.9306, + "step": 428 + }, + { + "epoch": 0.28590337524818, + "grad_norm": 1.5047945976257324, + "learning_rate": 9.766981960274653e-05, + "loss": 1.8459, + "step": 432 + }, + { + "epoch": 0.2885506287227002, + "grad_norm": 1.4997539520263672, + "learning_rate": 9.760330828422053e-05, + "loss": 1.7442, + "step": 436 + }, + { + "epoch": 0.29119788219722037, + "grad_norm": 1.389294981956482, + "learning_rate": 9.753588434066258e-05, + "loss": 1.9077, + "step": 440 + }, + { + "epoch": 0.2938451356717406, + "grad_norm": 1.3641945123672485, + "learning_rate": 9.746754906468832e-05, + "loss": 1.8979, + "step": 444 + }, + { + "epoch": 0.29649238914626075, + "grad_norm": 1.5315138101577759, + "learning_rate": 9.73983037663849e-05, + "loss": 1.8207, + "step": 448 + }, + { + "epoch": 0.29913964262078097, + "grad_norm": 1.5057647228240967, + "learning_rate": 9.732814977328592e-05, + "loss": 1.911, + "step": 452 + }, + { + "epoch": 0.3017868960953011, + "grad_norm": 1.368912696838379, + "learning_rate": 9.725708843034605e-05, + "loss": 1.8377, + "step": 456 + }, + { + "epoch": 0.3044341495698213, + "grad_norm": 1.389817714691162, + "learning_rate": 9.718512109991514e-05, + "loss": 1.7907, + "step": 460 + }, + { + "epoch": 0.3070814030443415, + "grad_norm": 1.7318735122680664, + "learning_rate": 9.711224916171215e-05, + "loss": 1.9412, + "step": 464 + }, + { + "epoch": 0.30972865651886167, + "grad_norm": 1.4791710376739502, + "learning_rate": 9.703847401279871e-05, + "loss": 1.7754, + "step": 468 + }, + { + "epoch": 0.3123759099933819, + "grad_norm": 1.3618526458740234, + "learning_rate": 9.69637970675523e-05, + "loss": 1.73, + "step": 472 + }, + { + "epoch": 0.31502316346790205, + "grad_norm": 1.5649083852767944, + "learning_rate": 9.688821975763918e-05, + "loss": 1.9635, + "step": 476 + }, + { + "epoch": 0.31767041694242226, + "grad_norm": 1.3701534271240234, + "learning_rate": 9.681174353198687e-05, + "loss": 1.6581, + "step": 480 + }, + { + "epoch": 0.3203176704169424, + "grad_norm": 1.4764872789382935, + "learning_rate": 9.673436985675645e-05, + "loss": 1.794, + "step": 484 + }, + { + "epoch": 0.3229649238914626, + "grad_norm": 1.4432624578475952, + "learning_rate": 9.665610021531447e-05, + "loss": 1.9016, + "step": 488 + }, + { + "epoch": 0.3256121773659828, + "grad_norm": 1.572975993156433, + "learning_rate": 9.657693610820437e-05, + "loss": 2.035, + "step": 492 + }, + { + "epoch": 0.32825943084050296, + "grad_norm": 1.5382163524627686, + "learning_rate": 9.649687905311785e-05, + "loss": 2.0041, + "step": 496 + }, + { + "epoch": 0.3309066843150232, + "grad_norm": 1.3413423299789429, + "learning_rate": 9.641593058486574e-05, + "loss": 1.7448, + "step": 500 + }, + { + "epoch": 0.33355393778954334, + "grad_norm": 1.4374409914016724, + "learning_rate": 9.633409225534855e-05, + "loss": 1.7816, + "step": 504 + }, + { + "epoch": 0.33620119126406356, + "grad_norm": 1.4096835851669312, + "learning_rate": 9.625136563352671e-05, + "loss": 1.772, + "step": 508 + }, + { + "epoch": 0.3388484447385837, + "grad_norm": 2.1890769004821777, + "learning_rate": 9.616775230539057e-05, + "loss": 1.8641, + "step": 512 + }, + { + "epoch": 0.3414956982131039, + "grad_norm": 1.4621169567108154, + "learning_rate": 9.608325387392986e-05, + "loss": 1.7406, + "step": 516 + }, + { + "epoch": 0.3441429516876241, + "grad_norm": 1.4140963554382324, + "learning_rate": 9.599787195910313e-05, + "loss": 1.6127, + "step": 520 + }, + { + "epoch": 0.34679020516214426, + "grad_norm": 1.459409236907959, + "learning_rate": 9.591160819780649e-05, + "loss": 1.7579, + "step": 524 + }, + { + "epoch": 0.3494374586366645, + "grad_norm": 1.7444220781326294, + "learning_rate": 9.582446424384242e-05, + "loss": 1.8177, + "step": 528 + }, + { + "epoch": 0.35208471211118464, + "grad_norm": 1.4114232063293457, + "learning_rate": 9.573644176788794e-05, + "loss": 1.7955, + "step": 532 + }, + { + "epoch": 0.35473196558570486, + "grad_norm": 1.4076716899871826, + "learning_rate": 9.564754245746264e-05, + "loss": 1.9122, + "step": 536 + }, + { + "epoch": 0.357379219060225, + "grad_norm": 1.4209445714950562, + "learning_rate": 9.555776801689632e-05, + "loss": 1.8108, + "step": 540 + }, + { + "epoch": 0.3600264725347452, + "grad_norm": 1.5626829862594604, + "learning_rate": 9.546712016729624e-05, + "loss": 1.9285, + "step": 544 + }, + { + "epoch": 0.3626737260092654, + "grad_norm": 1.4253438711166382, + "learning_rate": 9.537560064651427e-05, + "loss": 1.6505, + "step": 548 + }, + { + "epoch": 0.36532097948378556, + "grad_norm": 1.447141170501709, + "learning_rate": 9.528321120911346e-05, + "loss": 1.8303, + "step": 552 + }, + { + "epoch": 0.3679682329583058, + "grad_norm": 1.4913408756256104, + "learning_rate": 9.51899536263344e-05, + "loss": 1.8382, + "step": 556 + }, + { + "epoch": 0.37061548643282594, + "grad_norm": 1.5191394090652466, + "learning_rate": 9.509582968606136e-05, + "loss": 1.7477, + "step": 560 + }, + { + "epoch": 0.37326273990734615, + "grad_norm": 1.3612414598464966, + "learning_rate": 9.500084119278788e-05, + "loss": 1.7101, + "step": 564 + }, + { + "epoch": 0.3759099933818663, + "grad_norm": 1.3365185260772705, + "learning_rate": 9.49049899675823e-05, + "loss": 1.8855, + "step": 568 + }, + { + "epoch": 0.3785572468563865, + "grad_norm": 1.4907687902450562, + "learning_rate": 9.480827784805278e-05, + "loss": 1.8158, + "step": 572 + }, + { + "epoch": 0.3812045003309067, + "grad_norm": 1.2549834251403809, + "learning_rate": 9.471070668831208e-05, + "loss": 1.6304, + "step": 576 + }, + { + "epoch": 0.38385175380542685, + "grad_norm": 1.6914743185043335, + "learning_rate": 9.4612278358942e-05, + "loss": 1.6976, + "step": 580 + }, + { + "epoch": 0.38649900727994707, + "grad_norm": 1.5349342823028564, + "learning_rate": 9.451299474695754e-05, + "loss": 1.7323, + "step": 584 + }, + { + "epoch": 0.38914626075446723, + "grad_norm": 1.4379171133041382, + "learning_rate": 9.441285775577075e-05, + "loss": 1.7762, + "step": 588 + }, + { + "epoch": 0.39179351422898745, + "grad_norm": 1.360475778579712, + "learning_rate": 9.431186930515419e-05, + "loss": 1.7328, + "step": 592 + }, + { + "epoch": 0.3944407677035076, + "grad_norm": 1.4364429712295532, + "learning_rate": 9.421003133120412e-05, + "loss": 1.7363, + "step": 596 + }, + { + "epoch": 0.3970880211780278, + "grad_norm": 1.4598385095596313, + "learning_rate": 9.410734578630343e-05, + "loss": 1.6917, + "step": 600 + }, + { + "epoch": 0.399735274652548, + "grad_norm": 1.3313078880310059, + "learning_rate": 9.400381463908416e-05, + "loss": 1.8008, + "step": 604 + }, + { + "epoch": 0.40238252812706815, + "grad_norm": 1.5070075988769531, + "learning_rate": 9.389943987438983e-05, + "loss": 1.669, + "step": 608 + }, + { + "epoch": 0.40502978160158837, + "grad_norm": 1.3858133554458618, + "learning_rate": 9.379422349323728e-05, + "loss": 1.6599, + "step": 612 + }, + { + "epoch": 0.40767703507610853, + "grad_norm": 1.3775012493133545, + "learning_rate": 9.368816751277843e-05, + "loss": 1.628, + "step": 616 + }, + { + "epoch": 0.41032428855062875, + "grad_norm": 1.3733761310577393, + "learning_rate": 9.358127396626147e-05, + "loss": 1.6797, + "step": 620 + }, + { + "epoch": 0.4129715420251489, + "grad_norm": 1.760237455368042, + "learning_rate": 9.347354490299205e-05, + "loss": 1.7479, + "step": 624 + }, + { + "epoch": 0.41561879549966907, + "grad_norm": 1.2483643293380737, + "learning_rate": 9.336498238829384e-05, + "loss": 1.6595, + "step": 628 + }, + { + "epoch": 0.4182660489741893, + "grad_norm": 2.099116563796997, + "learning_rate": 9.325558850346897e-05, + "loss": 1.6933, + "step": 632 + }, + { + "epoch": 0.42091330244870945, + "grad_norm": 1.3913215398788452, + "learning_rate": 9.31453653457582e-05, + "loss": 1.6433, + "step": 636 + }, + { + "epoch": 0.42356055592322966, + "grad_norm": 1.3813973665237427, + "learning_rate": 9.303431502830065e-05, + "loss": 1.6652, + "step": 640 + }, + { + "epoch": 0.4262078093977498, + "grad_norm": 1.496819019317627, + "learning_rate": 9.292243968009331e-05, + "loss": 1.747, + "step": 644 + }, + { + "epoch": 0.42885506287227004, + "grad_norm": 1.37201988697052, + "learning_rate": 9.280974144595018e-05, + "loss": 1.6331, + "step": 648 + }, + { + "epoch": 0.4315023163467902, + "grad_norm": 1.505353331565857, + "learning_rate": 9.269622248646124e-05, + "loss": 1.7717, + "step": 652 + }, + { + "epoch": 0.43414956982131037, + "grad_norm": 1.8498897552490234, + "learning_rate": 9.258188497795093e-05, + "loss": 1.6643, + "step": 656 + }, + { + "epoch": 0.4367968232958306, + "grad_norm": 1.2886799573898315, + "learning_rate": 9.24667311124365e-05, + "loss": 1.777, + "step": 660 + }, + { + "epoch": 0.43944407677035074, + "grad_norm": 1.283218502998352, + "learning_rate": 9.23507630975859e-05, + "loss": 1.6958, + "step": 664 + }, + { + "epoch": 0.44209133024487096, + "grad_norm": 1.3919546604156494, + "learning_rate": 9.223398315667561e-05, + "loss": 1.6515, + "step": 668 + }, + { + "epoch": 0.4447385837193911, + "grad_norm": 1.4083247184753418, + "learning_rate": 9.211639352854787e-05, + "loss": 1.7531, + "step": 672 + }, + { + "epoch": 0.44738583719391134, + "grad_norm": 1.2739989757537842, + "learning_rate": 9.199799646756777e-05, + "loss": 1.7694, + "step": 676 + }, + { + "epoch": 0.4500330906684315, + "grad_norm": 1.4435306787490845, + "learning_rate": 9.187879424358014e-05, + "loss": 1.8044, + "step": 680 + }, + { + "epoch": 0.45268034414295166, + "grad_norm": 1.4848833084106445, + "learning_rate": 9.17587891418659e-05, + "loss": 1.6531, + "step": 684 + }, + { + "epoch": 0.4553275976174719, + "grad_norm": 1.527485966682434, + "learning_rate": 9.163798346309837e-05, + "loss": 1.8783, + "step": 688 + }, + { + "epoch": 0.45797485109199204, + "grad_norm": 1.2369976043701172, + "learning_rate": 9.151637952329903e-05, + "loss": 1.5479, + "step": 692 + }, + { + "epoch": 0.46062210456651226, + "grad_norm": 1.4693775177001953, + "learning_rate": 9.139397965379327e-05, + "loss": 1.7891, + "step": 696 + }, + { + "epoch": 0.4632693580410324, + "grad_norm": 1.6788188219070435, + "learning_rate": 9.127078620116556e-05, + "loss": 1.7637, + "step": 700 + }, + { + "epoch": 0.46591661151555264, + "grad_norm": 1.3309741020202637, + "learning_rate": 9.114680152721453e-05, + "loss": 1.6053, + "step": 704 + }, + { + "epoch": 0.4685638649900728, + "grad_norm": 1.509023904800415, + "learning_rate": 9.102202800890772e-05, + "loss": 1.8784, + "step": 708 + }, + { + "epoch": 0.47121111846459296, + "grad_norm": 1.3232872486114502, + "learning_rate": 9.089646803833589e-05, + "loss": 1.6745, + "step": 712 + }, + { + "epoch": 0.4738583719391132, + "grad_norm": 1.3540325164794922, + "learning_rate": 9.077012402266731e-05, + "loss": 1.6668, + "step": 716 + }, + { + "epoch": 0.47650562541363334, + "grad_norm": 1.3100489377975464, + "learning_rate": 9.064299838410152e-05, + "loss": 1.6188, + "step": 720 + }, + { + "epoch": 0.47915287888815355, + "grad_norm": 1.3783172369003296, + "learning_rate": 9.051509355982293e-05, + "loss": 1.6491, + "step": 724 + }, + { + "epoch": 0.4818001323626737, + "grad_norm": 1.27851402759552, + "learning_rate": 9.038641200195404e-05, + "loss": 1.8925, + "step": 728 + }, + { + "epoch": 0.48444738583719393, + "grad_norm": 1.4370380640029907, + "learning_rate": 9.025695617750848e-05, + "loss": 1.7996, + "step": 732 + }, + { + "epoch": 0.4870946393117141, + "grad_norm": 1.4078205823898315, + "learning_rate": 9.012672856834373e-05, + "loss": 1.8554, + "step": 736 + }, + { + "epoch": 0.48974189278623426, + "grad_norm": 1.3553669452667236, + "learning_rate": 8.999573167111348e-05, + "loss": 1.5417, + "step": 740 + }, + { + "epoch": 0.4923891462607545, + "grad_norm": 1.4759166240692139, + "learning_rate": 8.986396799721983e-05, + "loss": 1.6143, + "step": 744 + }, + { + "epoch": 0.49503639973527463, + "grad_norm": 1.3601372241973877, + "learning_rate": 8.973144007276508e-05, + "loss": 1.7011, + "step": 748 + }, + { + "epoch": 0.49768365320979485, + "grad_norm": 1.425181269645691, + "learning_rate": 8.959815043850336e-05, + "loss": 1.672, + "step": 752 + }, + { + "epoch": 0.500330906684315, + "grad_norm": 1.440303921699524, + "learning_rate": 8.946410164979184e-05, + "loss": 1.8008, + "step": 756 + }, + { + "epoch": 0.5029781601588352, + "grad_norm": 1.4576961994171143, + "learning_rate": 8.932929627654185e-05, + "loss": 1.5234, + "step": 760 + }, + { + "epoch": 0.5056254136333554, + "grad_norm": 1.3088816404342651, + "learning_rate": 8.919373690316952e-05, + "loss": 1.701, + "step": 764 + }, + { + "epoch": 0.5082726671078756, + "grad_norm": 3.7521555423736572, + "learning_rate": 8.905742612854628e-05, + "loss": 1.6714, + "step": 768 + }, + { + "epoch": 0.5109199205823958, + "grad_norm": 1.4540220499038696, + "learning_rate": 8.892036656594898e-05, + "loss": 1.6276, + "step": 772 + }, + { + "epoch": 0.513567174056916, + "grad_norm": 1.3043605089187622, + "learning_rate": 8.87825608430099e-05, + "loss": 1.635, + "step": 776 + }, + { + "epoch": 0.5162144275314361, + "grad_norm": 1.3931020498275757, + "learning_rate": 8.864401160166624e-05, + "loss": 1.5822, + "step": 780 + }, + { + "epoch": 0.5188616810059563, + "grad_norm": 1.3738582134246826, + "learning_rate": 8.85047214981096e-05, + "loss": 1.694, + "step": 784 + }, + { + "epoch": 0.5215089344804765, + "grad_norm": 1.3968422412872314, + "learning_rate": 8.83646932027349e-05, + "loss": 1.6673, + "step": 788 + }, + { + "epoch": 0.5241561879549967, + "grad_norm": 1.4195423126220703, + "learning_rate": 8.822392940008937e-05, + "loss": 1.5422, + "step": 792 + }, + { + "epoch": 0.5268034414295168, + "grad_norm": 1.2660058736801147, + "learning_rate": 8.808243278882094e-05, + "loss": 1.4875, + "step": 796 + }, + { + "epoch": 0.5294506949040371, + "grad_norm": 1.3500608205795288, + "learning_rate": 8.794020608162656e-05, + "loss": 1.6946, + "step": 800 + }, + { + "epoch": 0.5320979483785573, + "grad_norm": 1.6274265050888062, + "learning_rate": 8.779725200520021e-05, + "loss": 1.6943, + "step": 804 + }, + { + "epoch": 0.5347452018530774, + "grad_norm": 1.2186963558197021, + "learning_rate": 8.765357330018056e-05, + "loss": 1.4563, + "step": 808 + }, + { + "epoch": 0.5373924553275976, + "grad_norm": 1.501142978668213, + "learning_rate": 8.750917272109848e-05, + "loss": 1.6729, + "step": 812 + }, + { + "epoch": 0.5400397088021178, + "grad_norm": 1.372517466545105, + "learning_rate": 8.736405303632427e-05, + "loss": 1.636, + "step": 816 + }, + { + "epoch": 0.542686962276638, + "grad_norm": 1.4448741674423218, + "learning_rate": 8.721821702801449e-05, + "loss": 1.6977, + "step": 820 + }, + { + "epoch": 0.5453342157511581, + "grad_norm": 1.4774208068847656, + "learning_rate": 8.707166749205866e-05, + "loss": 1.7892, + "step": 824 + }, + { + "epoch": 0.5479814692256784, + "grad_norm": 1.3137487173080444, + "learning_rate": 8.692440723802571e-05, + "loss": 1.5086, + "step": 828 + }, + { + "epoch": 0.5506287227001986, + "grad_norm": 1.4480420351028442, + "learning_rate": 8.677643908911007e-05, + "loss": 1.6694, + "step": 832 + }, + { + "epoch": 0.5532759761747187, + "grad_norm": 1.4660981893539429, + "learning_rate": 8.662776588207747e-05, + "loss": 1.632, + "step": 836 + }, + { + "epoch": 0.5559232296492389, + "grad_norm": 1.2639222145080566, + "learning_rate": 8.647839046721076e-05, + "loss": 1.5101, + "step": 840 + }, + { + "epoch": 0.5585704831237591, + "grad_norm": 1.3556458950042725, + "learning_rate": 8.632831570825508e-05, + "loss": 1.7912, + "step": 844 + }, + { + "epoch": 0.5612177365982793, + "grad_norm": 1.2261251211166382, + "learning_rate": 8.617754448236298e-05, + "loss": 1.6547, + "step": 848 + }, + { + "epoch": 0.5638649900727994, + "grad_norm": 1.2850754261016846, + "learning_rate": 8.602607968003935e-05, + "loss": 1.5365, + "step": 852 + }, + { + "epoch": 0.5665122435473197, + "grad_norm": 1.3346043825149536, + "learning_rate": 8.587392420508598e-05, + "loss": 1.6175, + "step": 856 + }, + { + "epoch": 0.5691594970218399, + "grad_norm": 1.5381152629852295, + "learning_rate": 8.572108097454578e-05, + "loss": 1.7967, + "step": 860 + }, + { + "epoch": 0.57180675049636, + "grad_norm": 1.2237263917922974, + "learning_rate": 8.556755291864701e-05, + "loss": 1.6057, + "step": 864 + }, + { + "epoch": 0.5744540039708802, + "grad_norm": 1.233619213104248, + "learning_rate": 8.541334298074701e-05, + "loss": 1.7107, + "step": 868 + }, + { + "epoch": 0.5771012574454004, + "grad_norm": 1.2423778772354126, + "learning_rate": 8.525845411727581e-05, + "loss": 1.4729, + "step": 872 + }, + { + "epoch": 0.5797485109199206, + "grad_norm": 7.3384480476379395, + "learning_rate": 8.51028892976794e-05, + "loss": 1.6363, + "step": 876 + }, + { + "epoch": 0.5823957643944407, + "grad_norm": 1.3198407888412476, + "learning_rate": 8.494665150436288e-05, + "loss": 1.646, + "step": 880 + }, + { + "epoch": 0.585043017868961, + "grad_norm": 1.172568678855896, + "learning_rate": 8.478974373263318e-05, + "loss": 1.4356, + "step": 884 + }, + { + "epoch": 0.5876902713434812, + "grad_norm": 1.4879450798034668, + "learning_rate": 8.463216899064179e-05, + "loss": 1.7847, + "step": 888 + }, + { + "epoch": 0.5903375248180013, + "grad_norm": 1.3998438119888306, + "learning_rate": 8.447393029932692e-05, + "loss": 1.7818, + "step": 892 + }, + { + "epoch": 0.5929847782925215, + "grad_norm": 1.3567726612091064, + "learning_rate": 8.431503069235565e-05, + "loss": 1.5539, + "step": 896 + }, + { + "epoch": 0.5956320317670417, + "grad_norm": 1.4983903169631958, + "learning_rate": 8.415547321606584e-05, + "loss": 1.6477, + "step": 900 + }, + { + "epoch": 0.5982792852415619, + "grad_norm": 1.2646454572677612, + "learning_rate": 8.399526092940768e-05, + "loss": 1.6087, + "step": 904 + }, + { + "epoch": 0.600926538716082, + "grad_norm": 1.4137752056121826, + "learning_rate": 8.38343969038849e-05, + "loss": 1.7626, + "step": 908 + }, + { + "epoch": 0.6035737921906023, + "grad_norm": 1.4016697406768799, + "learning_rate": 8.367288422349617e-05, + "loss": 1.6947, + "step": 912 + }, + { + "epoch": 0.6062210456651225, + "grad_norm": 1.331425666809082, + "learning_rate": 8.351072598467576e-05, + "loss": 1.6358, + "step": 916 + }, + { + "epoch": 0.6088682991396426, + "grad_norm": 1.2292309999465942, + "learning_rate": 8.334792529623419e-05, + "loss": 1.4613, + "step": 920 + }, + { + "epoch": 0.6115155526141628, + "grad_norm": 1.3756728172302246, + "learning_rate": 8.318448527929877e-05, + "loss": 1.5771, + "step": 924 + }, + { + "epoch": 0.614162806088683, + "grad_norm": 1.4124281406402588, + "learning_rate": 8.302040906725361e-05, + "loss": 1.7364, + "step": 928 + }, + { + "epoch": 0.6168100595632032, + "grad_norm": 1.298540472984314, + "learning_rate": 8.285569980567964e-05, + "loss": 1.6394, + "step": 932 + }, + { + "epoch": 0.6194573130377233, + "grad_norm": 1.3905584812164307, + "learning_rate": 8.269036065229427e-05, + "loss": 1.7034, + "step": 936 + }, + { + "epoch": 0.6221045665122436, + "grad_norm": 1.4072821140289307, + "learning_rate": 8.252439477689082e-05, + "loss": 1.6315, + "step": 940 + }, + { + "epoch": 0.6247518199867638, + "grad_norm": 1.239159345626831, + "learning_rate": 8.235780536127787e-05, + "loss": 1.5178, + "step": 944 + }, + { + "epoch": 0.6273990734612839, + "grad_norm": 1.3636091947555542, + "learning_rate": 8.21905955992181e-05, + "loss": 1.6564, + "step": 948 + }, + { + "epoch": 0.6300463269358041, + "grad_norm": 1.3506637811660767, + "learning_rate": 8.202276869636713e-05, + "loss": 1.646, + "step": 952 + }, + { + "epoch": 0.6326935804103243, + "grad_norm": 1.4368304014205933, + "learning_rate": 8.185432787021216e-05, + "loss": 1.5073, + "step": 956 + }, + { + "epoch": 0.6353408338848445, + "grad_norm": 1.3278450965881348, + "learning_rate": 8.168527635001015e-05, + "loss": 1.5203, + "step": 960 + }, + { + "epoch": 0.6379880873593646, + "grad_norm": 1.2450168132781982, + "learning_rate": 8.151561737672591e-05, + "loss": 1.7171, + "step": 964 + }, + { + "epoch": 0.6406353408338848, + "grad_norm": 1.2755018472671509, + "learning_rate": 8.134535420297008e-05, + "loss": 1.5675, + "step": 968 + }, + { + "epoch": 0.6432825943084051, + "grad_norm": 1.3066191673278809, + "learning_rate": 8.117449009293668e-05, + "loss": 1.6525, + "step": 972 + }, + { + "epoch": 0.6459298477829252, + "grad_norm": 1.2875075340270996, + "learning_rate": 8.100302832234056e-05, + "loss": 1.6484, + "step": 976 + }, + { + "epoch": 0.6485771012574454, + "grad_norm": 1.5069595575332642, + "learning_rate": 8.083097217835461e-05, + "loss": 1.6251, + "step": 980 + }, + { + "epoch": 0.6512243547319656, + "grad_norm": 1.334075927734375, + "learning_rate": 8.065832495954668e-05, + "loss": 1.743, + "step": 984 + }, + { + "epoch": 0.6538716082064858, + "grad_norm": 1.3219469785690308, + "learning_rate": 8.048508997581647e-05, + "loss": 1.6345, + "step": 988 + }, + { + "epoch": 0.6565188616810059, + "grad_norm": 1.4275529384613037, + "learning_rate": 8.03112705483319e-05, + "loss": 1.7515, + "step": 992 + }, + { + "epoch": 0.6591661151555261, + "grad_norm": 1.349526286125183, + "learning_rate": 8.013687000946561e-05, + "loss": 1.5209, + "step": 996 + }, + { + "epoch": 0.6618133686300464, + "grad_norm": 1.3620506525039673, + "learning_rate": 7.996189170273096e-05, + "loss": 1.6789, + "step": 1000 + }, + { + "epoch": 0.6644606221045665, + "grad_norm": 1.2079874277114868, + "learning_rate": 7.978633898271795e-05, + "loss": 1.3453, + "step": 1004 + }, + { + "epoch": 0.6671078755790867, + "grad_norm": 1.3527398109436035, + "learning_rate": 7.961021521502895e-05, + "loss": 1.5927, + "step": 1008 + }, + { + "epoch": 0.6697551290536069, + "grad_norm": 1.3048250675201416, + "learning_rate": 7.943352377621414e-05, + "loss": 1.643, + "step": 1012 + }, + { + "epoch": 0.6724023825281271, + "grad_norm": 1.2111921310424805, + "learning_rate": 7.925626805370678e-05, + "loss": 1.4432, + "step": 1016 + }, + { + "epoch": 0.6750496360026472, + "grad_norm": 1.3531336784362793, + "learning_rate": 7.907845144575829e-05, + "loss": 1.6235, + "step": 1020 + }, + { + "epoch": 0.6776968894771674, + "grad_norm": 1.204720139503479, + "learning_rate": 7.890007736137307e-05, + "loss": 1.5377, + "step": 1024 + }, + { + "epoch": 0.6803441429516877, + "grad_norm": 1.3632683753967285, + "learning_rate": 7.872114922024313e-05, + "loss": 1.5758, + "step": 1028 + }, + { + "epoch": 0.6829913964262078, + "grad_norm": 1.4058332443237305, + "learning_rate": 7.854167045268264e-05, + "loss": 1.4645, + "step": 1032 + }, + { + "epoch": 0.685638649900728, + "grad_norm": 1.2490967512130737, + "learning_rate": 7.836164449956199e-05, + "loss": 1.5723, + "step": 1036 + }, + { + "epoch": 0.6882859033752482, + "grad_norm": 1.3228312730789185, + "learning_rate": 7.818107481224198e-05, + "loss": 1.466, + "step": 1040 + }, + { + "epoch": 0.6909331568497684, + "grad_norm": 1.3664582967758179, + "learning_rate": 7.799996485250755e-05, + "loss": 1.4823, + "step": 1044 + }, + { + "epoch": 0.6935804103242885, + "grad_norm": 1.1946579217910767, + "learning_rate": 7.781831809250151e-05, + "loss": 1.6093, + "step": 1048 + }, + { + "epoch": 0.6962276637988087, + "grad_norm": 1.3534433841705322, + "learning_rate": 7.763613801465786e-05, + "loss": 1.5823, + "step": 1052 + }, + { + "epoch": 0.698874917273329, + "grad_norm": 1.275877833366394, + "learning_rate": 7.745342811163507e-05, + "loss": 1.508, + "step": 1056 + }, + { + "epoch": 0.7015221707478491, + "grad_norm": 1.2870965003967285, + "learning_rate": 7.727019188624922e-05, + "loss": 1.6452, + "step": 1060 + }, + { + "epoch": 0.7041694242223693, + "grad_norm": 1.2805050611495972, + "learning_rate": 7.708643285140667e-05, + "loss": 1.7463, + "step": 1064 + }, + { + "epoch": 0.7068166776968895, + "grad_norm": 1.331794261932373, + "learning_rate": 7.690215453003684e-05, + "loss": 1.4428, + "step": 1068 + }, + { + "epoch": 0.7094639311714097, + "grad_norm": 1.3701887130737305, + "learning_rate": 7.671736045502462e-05, + "loss": 1.6868, + "step": 1072 + }, + { + "epoch": 0.7121111846459298, + "grad_norm": 1.3474302291870117, + "learning_rate": 7.653205416914267e-05, + "loss": 1.4919, + "step": 1076 + }, + { + "epoch": 0.71475843812045, + "grad_norm": 1.6028352975845337, + "learning_rate": 7.634623922498348e-05, + "loss": 1.5958, + "step": 1080 + }, + { + "epoch": 0.7174056915949703, + "grad_norm": 1.2263597249984741, + "learning_rate": 7.615991918489125e-05, + "loss": 1.7238, + "step": 1084 + }, + { + "epoch": 0.7200529450694904, + "grad_norm": 1.4178084135055542, + "learning_rate": 7.597309762089359e-05, + "loss": 1.48, + "step": 1088 + }, + { + "epoch": 0.7227001985440106, + "grad_norm": 1.3942856788635254, + "learning_rate": 7.57857781146331e-05, + "loss": 1.5336, + "step": 1092 + }, + { + "epoch": 0.7253474520185308, + "grad_norm": 1.2155961990356445, + "learning_rate": 7.559796425729863e-05, + "loss": 1.4977, + "step": 1096 + }, + { + "epoch": 0.727994705493051, + "grad_norm": 1.3590655326843262, + "learning_rate": 7.540965964955649e-05, + "loss": 1.6736, + "step": 1100 + }, + { + "epoch": 0.7306419589675711, + "grad_norm": 1.1585520505905151, + "learning_rate": 7.522086790148133e-05, + "loss": 1.6883, + "step": 1104 + }, + { + "epoch": 0.7332892124420913, + "grad_norm": 1.2694188356399536, + "learning_rate": 7.503159263248709e-05, + "loss": 1.657, + "step": 1108 + }, + { + "epoch": 0.7359364659166115, + "grad_norm": 1.2413800954818726, + "learning_rate": 7.484183747125742e-05, + "loss": 1.4757, + "step": 1112 + }, + { + "epoch": 0.7385837193911317, + "grad_norm": 1.1527191400527954, + "learning_rate": 7.46516060556763e-05, + "loss": 1.5628, + "step": 1116 + }, + { + "epoch": 0.7412309728656519, + "grad_norm": 1.5187007188796997, + "learning_rate": 7.446090203275809e-05, + "loss": 1.6387, + "step": 1120 + }, + { + "epoch": 0.7438782263401721, + "grad_norm": 1.3278498649597168, + "learning_rate": 7.426972905857781e-05, + "loss": 1.5212, + "step": 1124 + }, + { + "epoch": 0.7465254798146923, + "grad_norm": 1.4994242191314697, + "learning_rate": 7.407809079820094e-05, + "loss": 1.7582, + "step": 1128 + }, + { + "epoch": 0.7491727332892124, + "grad_norm": 1.2623709440231323, + "learning_rate": 7.388599092561315e-05, + "loss": 1.6223, + "step": 1132 + }, + { + "epoch": 0.7518199867637326, + "grad_norm": 1.3785511255264282, + "learning_rate": 7.369343312364993e-05, + "loss": 1.5051, + "step": 1136 + }, + { + "epoch": 0.7544672402382528, + "grad_norm": 1.2472020387649536, + "learning_rate": 7.350042108392594e-05, + "loss": 1.419, + "step": 1140 + }, + { + "epoch": 0.757114493712773, + "grad_norm": 1.6892167329788208, + "learning_rate": 7.330695850676421e-05, + "loss": 1.5718, + "step": 1144 + }, + { + "epoch": 0.7597617471872932, + "grad_norm": 1.4521297216415405, + "learning_rate": 7.311304910112525e-05, + "loss": 1.6383, + "step": 1148 + }, + { + "epoch": 0.7624090006618134, + "grad_norm": 1.450149655342102, + "learning_rate": 7.291869658453594e-05, + "loss": 1.771, + "step": 1152 + }, + { + "epoch": 0.7650562541363336, + "grad_norm": 1.3068790435791016, + "learning_rate": 7.272390468301821e-05, + "loss": 1.6414, + "step": 1156 + }, + { + "epoch": 0.7677035076108537, + "grad_norm": 1.1887469291687012, + "learning_rate": 7.252867713101771e-05, + "loss": 1.3455, + "step": 1160 + }, + { + "epoch": 0.7703507610853739, + "grad_norm": 1.2392699718475342, + "learning_rate": 7.233301767133205e-05, + "loss": 1.5139, + "step": 1164 + }, + { + "epoch": 0.7729980145598941, + "grad_norm": 1.353925347328186, + "learning_rate": 7.213693005503924e-05, + "loss": 1.6324, + "step": 1168 + }, + { + "epoch": 0.7756452680344142, + "grad_norm": 1.2792888879776, + "learning_rate": 7.194041804142557e-05, + "loss": 1.69, + "step": 1172 + }, + { + "epoch": 0.7782925215089345, + "grad_norm": 1.1825402975082397, + "learning_rate": 7.174348539791375e-05, + "loss": 1.3613, + "step": 1176 + }, + { + "epoch": 0.7809397749834547, + "grad_norm": 1.2615066766738892, + "learning_rate": 7.154613589999054e-05, + "loss": 1.6972, + "step": 1180 + }, + { + "epoch": 0.7835870284579749, + "grad_norm": 1.239867091178894, + "learning_rate": 7.13483733311344e-05, + "loss": 1.403, + "step": 1184 + }, + { + "epoch": 0.786234281932495, + "grad_norm": 1.3656786680221558, + "learning_rate": 7.115020148274295e-05, + "loss": 1.6528, + "step": 1188 + }, + { + "epoch": 0.7888815354070152, + "grad_norm": 1.2590436935424805, + "learning_rate": 7.095162415406034e-05, + "loss": 1.5411, + "step": 1192 + }, + { + "epoch": 0.7915287888815354, + "grad_norm": 1.2784417867660522, + "learning_rate": 7.075264515210435e-05, + "loss": 1.5618, + "step": 1196 + }, + { + "epoch": 0.7941760423560555, + "grad_norm": 1.3260300159454346, + "learning_rate": 7.055326829159341e-05, + "loss": 1.5295, + "step": 1200 + }, + { + "epoch": 0.7968232958305758, + "grad_norm": 5.832207202911377, + "learning_rate": 7.03534973948735e-05, + "loss": 1.5864, + "step": 1204 + }, + { + "epoch": 0.799470549305096, + "grad_norm": 1.2828547954559326, + "learning_rate": 7.015333629184484e-05, + "loss": 1.5081, + "step": 1208 + }, + { + "epoch": 0.8021178027796162, + "grad_norm": 1.2997095584869385, + "learning_rate": 6.995278881988847e-05, + "loss": 1.5827, + "step": 1212 + }, + { + "epoch": 0.8047650562541363, + "grad_norm": 1.2829680442810059, + "learning_rate": 6.975185882379271e-05, + "loss": 1.4565, + "step": 1216 + }, + { + "epoch": 0.8074123097286565, + "grad_norm": 1.3034470081329346, + "learning_rate": 6.955055015567942e-05, + "loss": 1.4973, + "step": 1220 + }, + { + "epoch": 0.8100595632031767, + "grad_norm": 1.170404314994812, + "learning_rate": 6.934886667493012e-05, + "loss": 1.4518, + "step": 1224 + }, + { + "epoch": 0.8127068166776968, + "grad_norm": 1.2815779447555542, + "learning_rate": 6.914681224811208e-05, + "loss": 1.546, + "step": 1228 + }, + { + "epoch": 0.8153540701522171, + "grad_norm": 1.227200984954834, + "learning_rate": 6.894439074890414e-05, + "loss": 1.5478, + "step": 1232 + }, + { + "epoch": 0.8180013236267373, + "grad_norm": 1.2927132844924927, + "learning_rate": 6.874160605802244e-05, + "loss": 1.6184, + "step": 1236 + }, + { + "epoch": 0.8206485771012575, + "grad_norm": 1.2327131032943726, + "learning_rate": 6.853846206314605e-05, + "loss": 1.5553, + "step": 1240 + }, + { + "epoch": 0.8232958305757776, + "grad_norm": 1.1886876821517944, + "learning_rate": 6.833496265884241e-05, + "loss": 1.4956, + "step": 1244 + }, + { + "epoch": 0.8259430840502978, + "grad_norm": 1.4828628301620483, + "learning_rate": 6.813111174649269e-05, + "loss": 1.7339, + "step": 1248 + }, + { + "epoch": 0.828590337524818, + "grad_norm": 1.2269375324249268, + "learning_rate": 6.792691323421698e-05, + "loss": 1.5712, + "step": 1252 + }, + { + "epoch": 0.8312375909993381, + "grad_norm": 1.4898347854614258, + "learning_rate": 6.772237103679937e-05, + "loss": 1.6172, + "step": 1256 + }, + { + "epoch": 0.8338848444738584, + "grad_norm": 1.1373467445373535, + "learning_rate": 6.751748907561288e-05, + "loss": 1.3869, + "step": 1260 + }, + { + "epoch": 0.8365320979483786, + "grad_norm": 1.2607003450393677, + "learning_rate": 6.731227127854434e-05, + "loss": 1.5501, + "step": 1264 + }, + { + "epoch": 0.8391793514228988, + "grad_norm": 1.357080340385437, + "learning_rate": 6.710672157991899e-05, + "loss": 1.5804, + "step": 1268 + }, + { + "epoch": 0.8418266048974189, + "grad_norm": 1.300445318222046, + "learning_rate": 6.690084392042513e-05, + "loss": 1.4547, + "step": 1272 + }, + { + "epoch": 0.8444738583719391, + "grad_norm": 1.281031608581543, + "learning_rate": 6.669464224703861e-05, + "loss": 1.5843, + "step": 1276 + }, + { + "epoch": 0.8471211118464593, + "grad_norm": 1.2201812267303467, + "learning_rate": 6.648812051294697e-05, + "loss": 1.4422, + "step": 1280 + }, + { + "epoch": 0.8497683653209794, + "grad_norm": 1.2445136308670044, + "learning_rate": 6.628128267747391e-05, + "loss": 1.5826, + "step": 1284 + }, + { + "epoch": 0.8524156187954997, + "grad_norm": 1.383170247077942, + "learning_rate": 6.607413270600319e-05, + "loss": 1.6194, + "step": 1288 + }, + { + "epoch": 0.8550628722700199, + "grad_norm": 1.370076060295105, + "learning_rate": 6.586667456990267e-05, + "loss": 1.6408, + "step": 1292 + }, + { + "epoch": 0.8577101257445401, + "grad_norm": 1.293721318244934, + "learning_rate": 6.565891224644822e-05, + "loss": 1.5066, + "step": 1296 + }, + { + "epoch": 0.8603573792190602, + "grad_norm": 1.4381659030914307, + "learning_rate": 6.545084971874738e-05, + "loss": 1.5161, + "step": 1300 + }, + { + "epoch": 0.8630046326935804, + "grad_norm": 1.3525183200836182, + "learning_rate": 6.524249097566306e-05, + "loss": 1.6022, + "step": 1304 + }, + { + "epoch": 0.8656518861681006, + "grad_norm": 1.1742914915084839, + "learning_rate": 6.503384001173707e-05, + "loss": 1.3307, + "step": 1308 + }, + { + "epoch": 0.8682991396426207, + "grad_norm": 1.275770664215088, + "learning_rate": 6.48249008271135e-05, + "loss": 1.5092, + "step": 1312 + }, + { + "epoch": 0.870946393117141, + "grad_norm": 1.3267558813095093, + "learning_rate": 6.461567742746206e-05, + "loss": 1.6288, + "step": 1316 + }, + { + "epoch": 0.8735936465916612, + "grad_norm": 1.1977699995040894, + "learning_rate": 6.440617382390128e-05, + "loss": 1.5567, + "step": 1320 + }, + { + "epoch": 0.8762409000661814, + "grad_norm": 1.1399099826812744, + "learning_rate": 6.419639403292161e-05, + "loss": 1.5925, + "step": 1324 + }, + { + "epoch": 0.8788881535407015, + "grad_norm": 1.3445255756378174, + "learning_rate": 6.398634207630841e-05, + "loss": 1.5288, + "step": 1328 + }, + { + "epoch": 0.8815354070152217, + "grad_norm": 1.2953174114227295, + "learning_rate": 6.377602198106483e-05, + "loss": 1.5119, + "step": 1332 + }, + { + "epoch": 0.8841826604897419, + "grad_norm": 1.2466961145401, + "learning_rate": 6.356543777933468e-05, + "loss": 1.4559, + "step": 1336 + }, + { + "epoch": 0.886829913964262, + "grad_norm": 1.410008430480957, + "learning_rate": 6.335459350832504e-05, + "loss": 1.6239, + "step": 1340 + }, + { + "epoch": 0.8894771674387822, + "grad_norm": 1.2374393939971924, + "learning_rate": 6.314349321022893e-05, + "loss": 1.4162, + "step": 1344 + }, + { + "epoch": 0.8921244209133025, + "grad_norm": 1.3700758218765259, + "learning_rate": 6.293214093214775e-05, + "loss": 1.4784, + "step": 1348 + }, + { + "epoch": 0.8947716743878227, + "grad_norm": 1.345596432685852, + "learning_rate": 6.272054072601374e-05, + "loss": 1.5489, + "step": 1352 + }, + { + "epoch": 0.8974189278623428, + "grad_norm": 1.1666315793991089, + "learning_rate": 6.250869664851227e-05, + "loss": 1.3515, + "step": 1356 + }, + { + "epoch": 0.900066181336863, + "grad_norm": 1.2450063228607178, + "learning_rate": 6.229661276100412e-05, + "loss": 1.4763, + "step": 1360 + }, + { + "epoch": 0.9027134348113832, + "grad_norm": 1.1888995170593262, + "learning_rate": 6.208429312944754e-05, + "loss": 1.4322, + "step": 1364 + }, + { + "epoch": 0.9053606882859033, + "grad_norm": 1.3319921493530273, + "learning_rate": 6.187174182432033e-05, + "loss": 1.5044, + "step": 1368 + }, + { + "epoch": 0.9080079417604235, + "grad_norm": 1.2023800611495972, + "learning_rate": 6.165896292054187e-05, + "loss": 1.5033, + "step": 1372 + }, + { + "epoch": 0.9106551952349438, + "grad_norm": 1.3017017841339111, + "learning_rate": 6.14459604973949e-05, + "loss": 1.4683, + "step": 1376 + }, + { + "epoch": 0.913302448709464, + "grad_norm": 1.2657389640808105, + "learning_rate": 6.12327386384473e-05, + "loss": 1.5533, + "step": 1380 + }, + { + "epoch": 0.9159497021839841, + "grad_norm": 1.3227919340133667, + "learning_rate": 6.101930143147395e-05, + "loss": 1.5239, + "step": 1384 + }, + { + "epoch": 0.9185969556585043, + "grad_norm": 1.3174325227737427, + "learning_rate": 6.080565296837821e-05, + "loss": 1.5259, + "step": 1388 + }, + { + "epoch": 0.9212442091330245, + "grad_norm": 1.2424542903900146, + "learning_rate": 6.059179734511356e-05, + "loss": 1.3573, + "step": 1392 + }, + { + "epoch": 0.9238914626075446, + "grad_norm": 1.2109280824661255, + "learning_rate": 6.037773866160502e-05, + "loss": 1.3831, + "step": 1396 + }, + { + "epoch": 0.9265387160820648, + "grad_norm": 1.2729474306106567, + "learning_rate": 6.0163481021670575e-05, + "loss": 1.674, + "step": 1400 + }, + { + "epoch": 0.9291859695565851, + "grad_norm": 1.1736104488372803, + "learning_rate": 5.994902853294251e-05, + "loss": 1.4935, + "step": 1404 + }, + { + "epoch": 0.9318332230311053, + "grad_norm": 1.3021750450134277, + "learning_rate": 5.973438530678861e-05, + "loss": 1.6066, + "step": 1408 + }, + { + "epoch": 0.9344804765056254, + "grad_norm": 1.3625566959381104, + "learning_rate": 5.951955545823342e-05, + "loss": 1.629, + "step": 1412 + }, + { + "epoch": 0.9371277299801456, + "grad_norm": 1.1946239471435547, + "learning_rate": 5.930454310587929e-05, + "loss": 1.4444, + "step": 1416 + }, + { + "epoch": 0.9397749834546658, + "grad_norm": 1.4337393045425415, + "learning_rate": 5.9089352371827446e-05, + "loss": 1.6888, + "step": 1420 + }, + { + "epoch": 0.9424222369291859, + "grad_norm": 1.3422842025756836, + "learning_rate": 5.8873987381598924e-05, + "loss": 1.6227, + "step": 1424 + }, + { + "epoch": 0.9450694904037061, + "grad_norm": 1.2459781169891357, + "learning_rate": 5.865845226405553e-05, + "loss": 1.4704, + "step": 1428 + }, + { + "epoch": 0.9477167438782264, + "grad_norm": 1.5130873918533325, + "learning_rate": 5.844275115132064e-05, + "loss": 1.5029, + "step": 1432 + }, + { + "epoch": 0.9503639973527466, + "grad_norm": 1.127805471420288, + "learning_rate": 5.822688817870004e-05, + "loss": 1.5289, + "step": 1436 + }, + { + "epoch": 0.9530112508272667, + "grad_norm": 1.283653736114502, + "learning_rate": 5.801086748460255e-05, + "loss": 1.545, + "step": 1440 + }, + { + "epoch": 0.9556585043017869, + "grad_norm": 1.290038824081421, + "learning_rate": 5.7794693210460804e-05, + "loss": 1.5588, + "step": 1444 + }, + { + "epoch": 0.9583057577763071, + "grad_norm": 1.2246005535125732, + "learning_rate": 5.757836950065172e-05, + "loss": 1.4577, + "step": 1448 + }, + { + "epoch": 0.9609530112508272, + "grad_norm": 1.3036789894104004, + "learning_rate": 5.736190050241719e-05, + "loss": 1.6891, + "step": 1452 + }, + { + "epoch": 0.9636002647253474, + "grad_norm": 1.2149336338043213, + "learning_rate": 5.714529036578443e-05, + "loss": 1.4114, + "step": 1456 + }, + { + "epoch": 0.9662475181998676, + "grad_norm": 1.1539721488952637, + "learning_rate": 5.692854324348653e-05, + "loss": 1.5497, + "step": 1460 + }, + { + "epoch": 0.9688947716743879, + "grad_norm": 1.3573237657546997, + "learning_rate": 5.6711663290882776e-05, + "loss": 1.4812, + "step": 1464 + }, + { + "epoch": 0.971542025148908, + "grad_norm": 1.338049292564392, + "learning_rate": 5.649465466587902e-05, + "loss": 1.6043, + "step": 1468 + }, + { + "epoch": 0.9741892786234282, + "grad_norm": 1.3066737651824951, + "learning_rate": 5.627752152884794e-05, + "loss": 1.582, + "step": 1472 + }, + { + "epoch": 0.9768365320979484, + "grad_norm": 1.2373597621917725, + "learning_rate": 5.606026804254931e-05, + "loss": 1.4099, + "step": 1476 + }, + { + "epoch": 0.9794837855724685, + "grad_norm": 1.1805121898651123, + "learning_rate": 5.584289837205012e-05, + "loss": 1.3914, + "step": 1480 + }, + { + "epoch": 0.9821310390469887, + "grad_norm": 1.3286755084991455, + "learning_rate": 5.5625416684644874e-05, + "loss": 1.4803, + "step": 1484 + }, + { + "epoch": 0.984778292521509, + "grad_norm": 1.2538626194000244, + "learning_rate": 5.540782714977549e-05, + "loss": 1.5063, + "step": 1488 + }, + { + "epoch": 0.9874255459960292, + "grad_norm": 1.2164523601531982, + "learning_rate": 5.51901339389516e-05, + "loss": 1.3555, + "step": 1492 + }, + { + "epoch": 0.9900727994705493, + "grad_norm": 1.217489242553711, + "learning_rate": 5.4972341225670354e-05, + "loss": 1.4818, + "step": 1496 + }, + { + "epoch": 0.9927200529450695, + "grad_norm": 1.170462727546692, + "learning_rate": 5.4754453185336586e-05, + "loss": 1.5693, + "step": 1500 + }, + { + "epoch": 0.9953673064195897, + "grad_norm": 1.2590230703353882, + "learning_rate": 5.453647399518262e-05, + "loss": 1.3735, + "step": 1504 + }, + { + "epoch": 0.9980145598941098, + "grad_norm": 1.1807870864868164, + "learning_rate": 5.431840783418832e-05, + "loss": 1.3643, + "step": 1508 + }, + { + "epoch": 1.00066181336863, + "grad_norm": 1.1421568393707275, + "learning_rate": 5.410025888300087e-05, + "loss": 1.4336, + "step": 1512 + }, + { + "epoch": 1.0033090668431501, + "grad_norm": 1.161148190498352, + "learning_rate": 5.388203132385467e-05, + "loss": 1.2284, + "step": 1516 + }, + { + "epoch": 1.0059563203176705, + "grad_norm": 1.129975438117981, + "learning_rate": 5.366372934049114e-05, + "loss": 1.2385, + "step": 1520 + }, + { + "epoch": 1.0086035737921906, + "grad_norm": 1.0889602899551392, + "learning_rate": 5.3445357118078545e-05, + "loss": 1.0735, + "step": 1524 + }, + { + "epoch": 1.011250827266711, + "grad_norm": 1.2157572507858276, + "learning_rate": 5.322691884313172e-05, + "loss": 1.1803, + "step": 1528 + }, + { + "epoch": 1.013898080741231, + "grad_norm": 1.1153740882873535, + "learning_rate": 5.300841870343183e-05, + "loss": 1.0574, + "step": 1532 + }, + { + "epoch": 1.016545334215751, + "grad_norm": 1.1907968521118164, + "learning_rate": 5.2789860887946066e-05, + "loss": 1.0691, + "step": 1536 + }, + { + "epoch": 1.0191925876902714, + "grad_norm": 1.1797744035720825, + "learning_rate": 5.257124958674736e-05, + "loss": 1.1063, + "step": 1540 + }, + { + "epoch": 1.0218398411647915, + "grad_norm": 1.0647462606430054, + "learning_rate": 5.235258899093406e-05, + "loss": 1.0512, + "step": 1544 + }, + { + "epoch": 1.0244870946393116, + "grad_norm": 1.1768978834152222, + "learning_rate": 5.213388329254949e-05, + "loss": 1.197, + "step": 1548 + }, + { + "epoch": 1.027134348113832, + "grad_norm": 1.282067060470581, + "learning_rate": 5.191513668450178e-05, + "loss": 1.231, + "step": 1552 + }, + { + "epoch": 1.029781601588352, + "grad_norm": 1.3294609785079956, + "learning_rate": 5.1696353360483216e-05, + "loss": 1.2719, + "step": 1556 + }, + { + "epoch": 1.0324288550628722, + "grad_norm": 1.187889814376831, + "learning_rate": 5.1477537514890116e-05, + "loss": 1.2815, + "step": 1560 + }, + { + "epoch": 1.0350761085373925, + "grad_norm": 1.152590036392212, + "learning_rate": 5.125869334274219e-05, + "loss": 1.126, + "step": 1564 + }, + { + "epoch": 1.0377233620119126, + "grad_norm": 1.1706854104995728, + "learning_rate": 5.103982503960224e-05, + "loss": 1.22, + "step": 1568 + }, + { + "epoch": 1.0403706154864327, + "grad_norm": 1.1738533973693848, + "learning_rate": 5.082093680149571e-05, + "loss": 1.2386, + "step": 1572 + }, + { + "epoch": 1.043017868960953, + "grad_norm": 1.299540400505066, + "learning_rate": 5.060203282483022e-05, + "loss": 1.2308, + "step": 1576 + }, + { + "epoch": 1.0456651224354732, + "grad_norm": 1.1205031871795654, + "learning_rate": 5.038311730631509e-05, + "loss": 1.1254, + "step": 1580 + }, + { + "epoch": 1.0483123759099935, + "grad_norm": 1.1589237451553345, + "learning_rate": 5.016419444288096e-05, + "loss": 1.046, + "step": 1584 + }, + { + "epoch": 1.0509596293845136, + "grad_norm": 1.1844594478607178, + "learning_rate": 4.9945268431599245e-05, + "loss": 1.1835, + "step": 1588 + }, + { + "epoch": 1.0536068828590337, + "grad_norm": 1.319905400276184, + "learning_rate": 4.972634346960173e-05, + "loss": 1.2235, + "step": 1592 + }, + { + "epoch": 1.056254136333554, + "grad_norm": 1.1240413188934326, + "learning_rate": 4.950742375400007e-05, + "loss": 1.0733, + "step": 1596 + }, + { + "epoch": 1.0589013898080741, + "grad_norm": 1.27524995803833, + "learning_rate": 4.9288513481805374e-05, + "loss": 1.1595, + "step": 1600 + }, + { + "epoch": 1.0615486432825942, + "grad_norm": 1.2067784070968628, + "learning_rate": 4.906961684984767e-05, + "loss": 1.1771, + "step": 1604 + }, + { + "epoch": 1.0641958967571146, + "grad_norm": 1.154008150100708, + "learning_rate": 4.8850738054695486e-05, + "loss": 1.1934, + "step": 1608 + }, + { + "epoch": 1.0668431502316347, + "grad_norm": 1.1568691730499268, + "learning_rate": 4.863188129257539e-05, + "loss": 1.1032, + "step": 1612 + }, + { + "epoch": 1.0694904037061548, + "grad_norm": 1.1935631036758423, + "learning_rate": 4.8413050759291585e-05, + "loss": 1.1457, + "step": 1616 + }, + { + "epoch": 1.072137657180675, + "grad_norm": 1.1685223579406738, + "learning_rate": 4.8194250650145374e-05, + "loss": 1.0371, + "step": 1620 + }, + { + "epoch": 1.0747849106551952, + "grad_norm": 1.2918758392333984, + "learning_rate": 4.797548515985481e-05, + "loss": 1.1128, + "step": 1624 + }, + { + "epoch": 1.0774321641297153, + "grad_norm": 1.232910394668579, + "learning_rate": 4.775675848247427e-05, + "loss": 1.0407, + "step": 1628 + }, + { + "epoch": 1.0800794176042356, + "grad_norm": 1.27483069896698, + "learning_rate": 4.7538074811313975e-05, + "loss": 1.1523, + "step": 1632 + }, + { + "epoch": 1.0827266710787558, + "grad_norm": 1.2089005708694458, + "learning_rate": 4.731943833885973e-05, + "loss": 1.0901, + "step": 1636 + }, + { + "epoch": 1.0853739245532759, + "grad_norm": 1.272049069404602, + "learning_rate": 4.7100853256692406e-05, + "loss": 1.1968, + "step": 1640 + }, + { + "epoch": 1.0880211780277962, + "grad_norm": 1.1610321998596191, + "learning_rate": 4.6882323755407706e-05, + "loss": 1.0379, + "step": 1644 + }, + { + "epoch": 1.0906684315023163, + "grad_norm": 1.0861836671829224, + "learning_rate": 4.666385402453568e-05, + "loss": 1.1274, + "step": 1648 + }, + { + "epoch": 1.0933156849768366, + "grad_norm": 1.2042131423950195, + "learning_rate": 4.644544825246059e-05, + "loss": 1.1502, + "step": 1652 + }, + { + "epoch": 1.0959629384513567, + "grad_norm": 1.5976825952529907, + "learning_rate": 4.622711062634046e-05, + "loss": 1.1527, + "step": 1656 + }, + { + "epoch": 1.0986101919258768, + "grad_norm": 1.2653815746307373, + "learning_rate": 4.600884533202686e-05, + "loss": 1.0946, + "step": 1660 + }, + { + "epoch": 1.1012574454003972, + "grad_norm": 1.129782795906067, + "learning_rate": 4.579065655398465e-05, + "loss": 1.1376, + "step": 1664 + }, + { + "epoch": 1.1039046988749173, + "grad_norm": 1.0471429824829102, + "learning_rate": 4.5572548475211805e-05, + "loss": 1.1488, + "step": 1668 + }, + { + "epoch": 1.1065519523494374, + "grad_norm": 1.281714916229248, + "learning_rate": 4.535452527715911e-05, + "loss": 1.2245, + "step": 1672 + }, + { + "epoch": 1.1091992058239577, + "grad_norm": 1.1683017015457153, + "learning_rate": 4.5136591139650105e-05, + "loss": 1.1307, + "step": 1676 + }, + { + "epoch": 1.1118464592984778, + "grad_norm": 1.1847896575927734, + "learning_rate": 4.491875024080088e-05, + "loss": 1.0821, + "step": 1680 + }, + { + "epoch": 1.114493712772998, + "grad_norm": 1.196803331375122, + "learning_rate": 4.470100675694007e-05, + "loss": 1.0633, + "step": 1684 + }, + { + "epoch": 1.1171409662475182, + "grad_norm": 1.1869444847106934, + "learning_rate": 4.4483364862528646e-05, + "loss": 1.1864, + "step": 1688 + }, + { + "epoch": 1.1197882197220383, + "grad_norm": 1.221575140953064, + "learning_rate": 4.4265828730079987e-05, + "loss": 1.0547, + "step": 1692 + }, + { + "epoch": 1.1224354731965587, + "grad_norm": 1.164784550666809, + "learning_rate": 4.404840253007987e-05, + "loss": 1.1614, + "step": 1696 + }, + { + "epoch": 1.1250827266710788, + "grad_norm": 1.0524084568023682, + "learning_rate": 4.3831090430906484e-05, + "loss": 1.1285, + "step": 1700 + }, + { + "epoch": 1.1277299801455989, + "grad_norm": 1.6504064798355103, + "learning_rate": 4.361389659875058e-05, + "loss": 1.1689, + "step": 1704 + }, + { + "epoch": 1.1303772336201192, + "grad_norm": 1.1136175394058228, + "learning_rate": 4.339682519753551e-05, + "loss": 1.0815, + "step": 1708 + }, + { + "epoch": 1.1330244870946393, + "grad_norm": 1.1745281219482422, + "learning_rate": 4.3179880388837496e-05, + "loss": 1.1722, + "step": 1712 + }, + { + "epoch": 1.1356717405691594, + "grad_norm": 1.0880483388900757, + "learning_rate": 4.2963066331805725e-05, + "loss": 1.0361, + "step": 1716 + }, + { + "epoch": 1.1383189940436798, + "grad_norm": 1.137968897819519, + "learning_rate": 4.2746387183082755e-05, + "loss": 1.1, + "step": 1720 + }, + { + "epoch": 1.1409662475181999, + "grad_norm": 1.2682772874832153, + "learning_rate": 4.252984709672473e-05, + "loss": 1.134, + "step": 1724 + }, + { + "epoch": 1.14361350099272, + "grad_norm": 1.128180742263794, + "learning_rate": 4.231345022412174e-05, + "loss": 1.0812, + "step": 1728 + }, + { + "epoch": 1.1462607544672403, + "grad_norm": 1.0430972576141357, + "learning_rate": 4.2097200713918264e-05, + "loss": 1.034, + "step": 1732 + }, + { + "epoch": 1.1489080079417604, + "grad_norm": 1.1832259893417358, + "learning_rate": 4.188110271193371e-05, + "loss": 1.1422, + "step": 1736 + }, + { + "epoch": 1.1515552614162807, + "grad_norm": 1.1320624351501465, + "learning_rate": 4.1665160361082704e-05, + "loss": 1.0688, + "step": 1740 + }, + { + "epoch": 1.1542025148908008, + "grad_norm": 1.2752870321273804, + "learning_rate": 4.144937780129594e-05, + "loss": 1.1926, + "step": 1744 + }, + { + "epoch": 1.156849768365321, + "grad_norm": 1.2092264890670776, + "learning_rate": 4.123375916944061e-05, + "loss": 1.0973, + "step": 1748 + }, + { + "epoch": 1.159497021839841, + "grad_norm": 1.1710125207901, + "learning_rate": 4.101830859924124e-05, + "loss": 1.2602, + "step": 1752 + }, + { + "epoch": 1.1621442753143614, + "grad_norm": 1.4670571088790894, + "learning_rate": 4.080303022120025e-05, + "loss": 1.2005, + "step": 1756 + }, + { + "epoch": 1.1647915287888815, + "grad_norm": 1.1942548751831055, + "learning_rate": 4.058792816251902e-05, + "loss": 1.2164, + "step": 1760 + }, + { + "epoch": 1.1674387822634018, + "grad_norm": 1.2230584621429443, + "learning_rate": 4.037300654701856e-05, + "loss": 1.0395, + "step": 1764 + }, + { + "epoch": 1.170086035737922, + "grad_norm": 1.3117454051971436, + "learning_rate": 4.015826949506049e-05, + "loss": 1.1848, + "step": 1768 + }, + { + "epoch": 1.172733289212442, + "grad_norm": 1.2102235555648804, + "learning_rate": 3.994372112346812e-05, + "loss": 1.1349, + "step": 1772 + }, + { + "epoch": 1.1753805426869623, + "grad_norm": 1.3425853252410889, + "learning_rate": 3.9729365545447514e-05, + "loss": 1.1756, + "step": 1776 + }, + { + "epoch": 1.1780277961614825, + "grad_norm": 1.1865317821502686, + "learning_rate": 3.9515206870508534e-05, + "loss": 1.1298, + "step": 1780 + }, + { + "epoch": 1.1806750496360026, + "grad_norm": 1.0945122241973877, + "learning_rate": 3.930124920438616e-05, + "loss": 1.1275, + "step": 1784 + }, + { + "epoch": 1.1833223031105229, + "grad_norm": 1.2114017009735107, + "learning_rate": 3.908749664896171e-05, + "loss": 1.1958, + "step": 1788 + }, + { + "epoch": 1.185969556585043, + "grad_norm": 1.1771973371505737, + "learning_rate": 3.887395330218429e-05, + "loss": 1.0868, + "step": 1792 + }, + { + "epoch": 1.188616810059563, + "grad_norm": 1.2639689445495605, + "learning_rate": 3.866062325799209e-05, + "loss": 1.213, + "step": 1796 + }, + { + "epoch": 1.1912640635340834, + "grad_norm": 1.1774057149887085, + "learning_rate": 3.844751060623404e-05, + "loss": 1.0974, + "step": 1800 + }, + { + "epoch": 1.1939113170086035, + "grad_norm": 1.1269370317459106, + "learning_rate": 3.823461943259132e-05, + "loss": 1.1296, + "step": 1804 + }, + { + "epoch": 1.1965585704831239, + "grad_norm": 1.2880319356918335, + "learning_rate": 3.802195381849901e-05, + "loss": 1.1121, + "step": 1808 + }, + { + "epoch": 1.199205823957644, + "grad_norm": 1.1425657272338867, + "learning_rate": 3.7809517841067976e-05, + "loss": 1.0818, + "step": 1812 + }, + { + "epoch": 1.201853077432164, + "grad_norm": 1.1727538108825684, + "learning_rate": 3.759731557300652e-05, + "loss": 1.025, + "step": 1816 + }, + { + "epoch": 1.2045003309066844, + "grad_norm": 1.2917152643203735, + "learning_rate": 3.738535108254246e-05, + "loss": 1.21, + "step": 1820 + }, + { + "epoch": 1.2071475843812045, + "grad_norm": 1.1989338397979736, + "learning_rate": 3.7173628433345006e-05, + "loss": 1.1712, + "step": 1824 + }, + { + "epoch": 1.2097948378557246, + "grad_norm": 1.2029826641082764, + "learning_rate": 3.696215168444699e-05, + "loss": 1.1146, + "step": 1828 + }, + { + "epoch": 1.212442091330245, + "grad_norm": 1.173412561416626, + "learning_rate": 3.675092489016693e-05, + "loss": 1.1237, + "step": 1832 + }, + { + "epoch": 1.215089344804765, + "grad_norm": 1.250653862953186, + "learning_rate": 3.6539952100031326e-05, + "loss": 1.1326, + "step": 1836 + }, + { + "epoch": 1.2177365982792852, + "grad_norm": 1.1222728490829468, + "learning_rate": 3.632923735869711e-05, + "loss": 1.1575, + "step": 1840 + }, + { + "epoch": 1.2203838517538055, + "grad_norm": 1.098129153251648, + "learning_rate": 3.611878470587402e-05, + "loss": 1.1357, + "step": 1844 + }, + { + "epoch": 1.2230311052283256, + "grad_norm": 1.2261312007904053, + "learning_rate": 3.5908598176247124e-05, + "loss": 1.075, + "step": 1848 + }, + { + "epoch": 1.225678358702846, + "grad_norm": 1.145168423652649, + "learning_rate": 3.569868179939958e-05, + "loss": 1.1333, + "step": 1852 + }, + { + "epoch": 1.228325612177366, + "grad_norm": 1.1339921951293945, + "learning_rate": 3.5489039599735294e-05, + "loss": 1.0158, + "step": 1856 + }, + { + "epoch": 1.2309728656518861, + "grad_norm": 1.2139281034469604, + "learning_rate": 3.5279675596401846e-05, + "loss": 1.1726, + "step": 1860 + }, + { + "epoch": 1.2336201191264062, + "grad_norm": 1.2778246402740479, + "learning_rate": 3.5070593803213267e-05, + "loss": 1.182, + "step": 1864 + }, + { + "epoch": 1.2362673726009266, + "grad_norm": 1.2227150201797485, + "learning_rate": 3.4861798228573325e-05, + "loss": 1.0037, + "step": 1868 + }, + { + "epoch": 1.2389146260754467, + "grad_norm": 1.2715504169464111, + "learning_rate": 3.465329287539852e-05, + "loss": 1.21, + "step": 1872 + }, + { + "epoch": 1.241561879549967, + "grad_norm": 1.300766944885254, + "learning_rate": 3.444508174104136e-05, + "loss": 1.1, + "step": 1876 + }, + { + "epoch": 1.244209133024487, + "grad_norm": 1.1540982723236084, + "learning_rate": 3.423716881721375e-05, + "loss": 1.1127, + "step": 1880 + }, + { + "epoch": 1.2468563864990072, + "grad_norm": 1.4233511686325073, + "learning_rate": 3.402955808991052e-05, + "loss": 1.1692, + "step": 1884 + }, + { + "epoch": 1.2495036399735275, + "grad_norm": 1.2163995504379272, + "learning_rate": 3.382225353933288e-05, + "loss": 1.0856, + "step": 1888 + }, + { + "epoch": 1.2521508934480476, + "grad_norm": 1.2361574172973633, + "learning_rate": 3.3615259139812225e-05, + "loss": 1.2024, + "step": 1892 + }, + { + "epoch": 1.254798146922568, + "grad_norm": 1.0741496086120605, + "learning_rate": 3.340857885973388e-05, + "loss": 1.0447, + "step": 1896 + }, + { + "epoch": 1.257445400397088, + "grad_norm": 1.1579320430755615, + "learning_rate": 3.320221666146107e-05, + "loss": 1.0772, + "step": 1900 + }, + { + "epoch": 1.2600926538716082, + "grad_norm": 1.2062878608703613, + "learning_rate": 3.299617650125889e-05, + "loss": 1.1011, + "step": 1904 + }, + { + "epoch": 1.2627399073461283, + "grad_norm": 1.2862952947616577, + "learning_rate": 3.279046232921852e-05, + "loss": 1.2596, + "step": 1908 + }, + { + "epoch": 1.2653871608206486, + "grad_norm": 1.2335329055786133, + "learning_rate": 3.2585078089181464e-05, + "loss": 1.2462, + "step": 1912 + }, + { + "epoch": 1.2680344142951687, + "grad_norm": 1.0968290567398071, + "learning_rate": 3.238002771866391e-05, + "loss": 1.0543, + "step": 1916 + }, + { + "epoch": 1.270681667769689, + "grad_norm": 1.06516695022583, + "learning_rate": 3.217531514878136e-05, + "loss": 1.1669, + "step": 1920 + }, + { + "epoch": 1.2733289212442092, + "grad_norm": 1.1616246700286865, + "learning_rate": 3.1970944304173126e-05, + "loss": 1.2252, + "step": 1924 + }, + { + "epoch": 1.2759761747187293, + "grad_norm": 1.1696902513504028, + "learning_rate": 3.176691910292715e-05, + "loss": 1.2329, + "step": 1928 + }, + { + "epoch": 1.2786234281932494, + "grad_norm": 1.210041880607605, + "learning_rate": 3.156324345650488e-05, + "loss": 1.3271, + "step": 1932 + }, + { + "epoch": 1.2812706816677697, + "grad_norm": 1.0774304866790771, + "learning_rate": 3.1359921269666324e-05, + "loss": 1.0306, + "step": 1936 + }, + { + "epoch": 1.2839179351422898, + "grad_norm": 1.166651725769043, + "learning_rate": 3.1156956440395136e-05, + "loss": 1.021, + "step": 1940 + }, + { + "epoch": 1.2865651886168101, + "grad_norm": 1.2745511531829834, + "learning_rate": 3.095435285982387e-05, + "loss": 1.1301, + "step": 1944 + }, + { + "epoch": 1.2892124420913302, + "grad_norm": 1.0762966871261597, + "learning_rate": 3.075211441215944e-05, + "loss": 1.0831, + "step": 1948 + }, + { + "epoch": 1.2918596955658503, + "grad_norm": 1.298743486404419, + "learning_rate": 3.055024497460867e-05, + "loss": 1.1705, + "step": 1952 + }, + { + "epoch": 1.2945069490403707, + "grad_norm": 1.243034839630127, + "learning_rate": 3.0348748417303823e-05, + "loss": 1.1282, + "step": 1956 + }, + { + "epoch": 1.2971542025148908, + "grad_norm": 1.2496618032455444, + "learning_rate": 3.0147628603228594e-05, + "loss": 1.0639, + "step": 1960 + }, + { + "epoch": 1.299801455989411, + "grad_norm": 1.141508936882019, + "learning_rate": 2.9946889388143913e-05, + "loss": 1.1297, + "step": 1964 + }, + { + "epoch": 1.3024487094639312, + "grad_norm": 1.188610553741455, + "learning_rate": 2.974653462051411e-05, + "loss": 1.1628, + "step": 1968 + }, + { + "epoch": 1.3050959629384513, + "grad_norm": 1.1807959079742432, + "learning_rate": 2.9546568141433006e-05, + "loss": 1.0527, + "step": 1972 + }, + { + "epoch": 1.3077432164129714, + "grad_norm": 1.1804313659667969, + "learning_rate": 2.9346993784550474e-05, + "loss": 1.196, + "step": 1976 + }, + { + "epoch": 1.3103904698874917, + "grad_norm": 1.1646931171417236, + "learning_rate": 2.9147815375998766e-05, + "loss": 1.0773, + "step": 1980 + }, + { + "epoch": 1.3130377233620119, + "grad_norm": 1.4130630493164062, + "learning_rate": 2.8949036734319247e-05, + "loss": 1.2183, + "step": 1984 + }, + { + "epoch": 1.3156849768365322, + "grad_norm": 1.1829743385314941, + "learning_rate": 2.8750661670389135e-05, + "loss": 1.1457, + "step": 1988 + }, + { + "epoch": 1.3183322303110523, + "grad_norm": 1.1480798721313477, + "learning_rate": 2.8552693987348532e-05, + "loss": 1.0502, + "step": 1992 + }, + { + "epoch": 1.3209794837855724, + "grad_norm": 1.1411528587341309, + "learning_rate": 2.835513748052738e-05, + "loss": 1.1938, + "step": 1996 + }, + { + "epoch": 1.3236267372600927, + "grad_norm": 1.1550084352493286, + "learning_rate": 2.815799593737285e-05, + "loss": 1.1577, + "step": 2000 + }, + { + "epoch": 1.3262739907346128, + "grad_norm": 1.1829745769500732, + "learning_rate": 2.7961273137376566e-05, + "loss": 1.097, + "step": 2004 + }, + { + "epoch": 1.3289212442091332, + "grad_norm": 1.229865312576294, + "learning_rate": 2.7764972852002323e-05, + "loss": 1.0721, + "step": 2008 + }, + { + "epoch": 1.3315684976836533, + "grad_norm": 1.1786168813705444, + "learning_rate": 2.7569098844613616e-05, + "loss": 1.094, + "step": 2012 + }, + { + "epoch": 1.3342157511581734, + "grad_norm": 1.4941198825836182, + "learning_rate": 2.7373654870401634e-05, + "loss": 1.2017, + "step": 2016 + }, + { + "epoch": 1.3368630046326935, + "grad_norm": 1.1714154481887817, + "learning_rate": 2.7178644676313143e-05, + "loss": 0.9992, + "step": 2020 + }, + { + "epoch": 1.3395102581072138, + "grad_norm": 1.2153651714324951, + "learning_rate": 2.698407200097872e-05, + "loss": 1.1801, + "step": 2024 + }, + { + "epoch": 1.342157511581734, + "grad_norm": 1.2198010683059692, + "learning_rate": 2.6789940574641102e-05, + "loss": 1.0585, + "step": 2028 + }, + { + "epoch": 1.3448047650562542, + "grad_norm": 1.2211023569107056, + "learning_rate": 2.6596254119083656e-05, + "loss": 1.111, + "step": 2032 + }, + { + "epoch": 1.3474520185307743, + "grad_norm": 1.2999107837677002, + "learning_rate": 2.6403016347558894e-05, + "loss": 1.1344, + "step": 2036 + }, + { + "epoch": 1.3500992720052944, + "grad_norm": 1.181583046913147, + "learning_rate": 2.6210230964717513e-05, + "loss": 1.0638, + "step": 2040 + }, + { + "epoch": 1.3527465254798146, + "grad_norm": 1.1883265972137451, + "learning_rate": 2.6017901666537216e-05, + "loss": 1.0218, + "step": 2044 + }, + { + "epoch": 1.3553937789543349, + "grad_norm": 1.2537999153137207, + "learning_rate": 2.5826032140251943e-05, + "loss": 1.0679, + "step": 2048 + }, + { + "epoch": 1.358041032428855, + "grad_norm": 1.1566420793533325, + "learning_rate": 2.563462606428101e-05, + "loss": 1.116, + "step": 2052 + }, + { + "epoch": 1.3606882859033753, + "grad_norm": 1.1046433448791504, + "learning_rate": 2.5443687108158836e-05, + "loss": 1.0058, + "step": 2056 + }, + { + "epoch": 1.3633355393778954, + "grad_norm": 1.307966709136963, + "learning_rate": 2.525321893246444e-05, + "loss": 1.2426, + "step": 2060 + }, + { + "epoch": 1.3659827928524155, + "grad_norm": 1.0436811447143555, + "learning_rate": 2.5063225188751273e-05, + "loss": 1.0737, + "step": 2064 + }, + { + "epoch": 1.3686300463269359, + "grad_norm": 1.0671106576919556, + "learning_rate": 2.4873709519477202e-05, + "loss": 1.083, + "step": 2068 + }, + { + "epoch": 1.371277299801456, + "grad_norm": 1.3584109544754028, + "learning_rate": 2.4684675557934767e-05, + "loss": 1.0333, + "step": 2072 + }, + { + "epoch": 1.3739245532759763, + "grad_norm": 1.180293321609497, + "learning_rate": 2.4496126928181467e-05, + "loss": 1.0714, + "step": 2076 + }, + { + "epoch": 1.3765718067504964, + "grad_norm": 1.102691888809204, + "learning_rate": 2.4308067244970228e-05, + "loss": 1.0386, + "step": 2080 + }, + { + "epoch": 1.3792190602250165, + "grad_norm": 1.156723976135254, + "learning_rate": 2.4120500113680177e-05, + "loss": 1.0593, + "step": 2084 + }, + { + "epoch": 1.3818663136995366, + "grad_norm": 1.2727686166763306, + "learning_rate": 2.3933429130247538e-05, + "loss": 1.2251, + "step": 2088 + }, + { + "epoch": 1.384513567174057, + "grad_norm": 1.213897466659546, + "learning_rate": 2.3746857881096584e-05, + "loss": 1.0509, + "step": 2092 + }, + { + "epoch": 1.387160820648577, + "grad_norm": 1.1525429487228394, + "learning_rate": 2.3560789943071033e-05, + "loss": 1.0187, + "step": 2096 + }, + { + "epoch": 1.3898080741230974, + "grad_norm": 1.1950461864471436, + "learning_rate": 2.3375228883365334e-05, + "loss": 1.0912, + "step": 2100 + }, + { + "epoch": 1.3924553275976175, + "grad_norm": 1.1531497240066528, + "learning_rate": 2.319017825945633e-05, + "loss": 1.128, + "step": 2104 + }, + { + "epoch": 1.3951025810721376, + "grad_norm": 1.2713518142700195, + "learning_rate": 2.300564161903511e-05, + "loss": 1.0656, + "step": 2108 + }, + { + "epoch": 1.397749834546658, + "grad_norm": 1.1415860652923584, + "learning_rate": 2.282162249993895e-05, + "loss": 1.1084, + "step": 2112 + }, + { + "epoch": 1.400397088021178, + "grad_norm": 1.114864468574524, + "learning_rate": 2.263812443008343e-05, + "loss": 1.0531, + "step": 2116 + }, + { + "epoch": 1.4030443414956983, + "grad_norm": 1.3787562847137451, + "learning_rate": 2.245515092739488e-05, + "loss": 1.072, + "step": 2120 + }, + { + "epoch": 1.4056915949702184, + "grad_norm": 1.014003872871399, + "learning_rate": 2.2272705499742925e-05, + "loss": 1.0156, + "step": 2124 + }, + { + "epoch": 1.4083388484447386, + "grad_norm": 1.1538441181182861, + "learning_rate": 2.209079164487323e-05, + "loss": 1.0101, + "step": 2128 + }, + { + "epoch": 1.4109861019192587, + "grad_norm": 1.2096091508865356, + "learning_rate": 2.1909412850340394e-05, + "loss": 1.0201, + "step": 2132 + }, + { + "epoch": 1.413633355393779, + "grad_norm": 1.1149653196334839, + "learning_rate": 2.1728572593441133e-05, + "loss": 1.1124, + "step": 2136 + }, + { + "epoch": 1.416280608868299, + "grad_norm": 1.3355867862701416, + "learning_rate": 2.154827434114765e-05, + "loss": 1.1943, + "step": 2140 + }, + { + "epoch": 1.4189278623428194, + "grad_norm": 1.2160899639129639, + "learning_rate": 2.1368521550041066e-05, + "loss": 1.1481, + "step": 2144 + }, + { + "epoch": 1.4215751158173395, + "grad_norm": 1.163010597229004, + "learning_rate": 2.1189317666245285e-05, + "loss": 1.0703, + "step": 2148 + }, + { + "epoch": 1.4242223692918596, + "grad_norm": 1.1877809762954712, + "learning_rate": 2.1010666125360767e-05, + "loss": 1.1211, + "step": 2152 + }, + { + "epoch": 1.42686962276638, + "grad_norm": 1.4443504810333252, + "learning_rate": 2.083257035239885e-05, + "loss": 1.2918, + "step": 2156 + }, + { + "epoch": 1.4295168762409, + "grad_norm": 1.2549368143081665, + "learning_rate": 2.0655033761715897e-05, + "loss": 1.1117, + "step": 2160 + }, + { + "epoch": 1.4321641297154202, + "grad_norm": 1.2271883487701416, + "learning_rate": 2.0478059756948002e-05, + "loss": 1.1452, + "step": 2164 + }, + { + "epoch": 1.4348113831899405, + "grad_norm": 1.2357865571975708, + "learning_rate": 2.0301651730945627e-05, + "loss": 1.0594, + "step": 2168 + }, + { + "epoch": 1.4374586366644606, + "grad_norm": 1.08621346950531, + "learning_rate": 2.0125813065708566e-05, + "loss": 1.0332, + "step": 2172 + }, + { + "epoch": 1.4401058901389807, + "grad_norm": 1.1553773880004883, + "learning_rate": 1.9950547132321183e-05, + "loss": 1.0823, + "step": 2176 + }, + { + "epoch": 1.442753143613501, + "grad_norm": 1.2597051858901978, + "learning_rate": 1.9775857290887757e-05, + "loss": 1.0197, + "step": 2180 + }, + { + "epoch": 1.4454003970880211, + "grad_norm": 1.2433415651321411, + "learning_rate": 1.9601746890467965e-05, + "loss": 1.0602, + "step": 2184 + }, + { + "epoch": 1.4480476505625415, + "grad_norm": 1.3405801057815552, + "learning_rate": 1.942821926901279e-05, + "loss": 1.1459, + "step": 2188 + }, + { + "epoch": 1.4506949040370616, + "grad_norm": 1.1183578968048096, + "learning_rate": 1.9255277753300487e-05, + "loss": 1.08, + "step": 2192 + }, + { + "epoch": 1.4533421575115817, + "grad_norm": 1.011930227279663, + "learning_rate": 1.9082925658872853e-05, + "loss": 1.0511, + "step": 2196 + }, + { + "epoch": 1.4559894109861018, + "grad_norm": 1.1752732992172241, + "learning_rate": 1.8911166289971545e-05, + "loss": 1.0437, + "step": 2200 + }, + { + "epoch": 1.4586366644606221, + "grad_norm": 1.1920056343078613, + "learning_rate": 1.8740002939474822e-05, + "loss": 1.0756, + "step": 2204 + }, + { + "epoch": 1.4612839179351422, + "grad_norm": 1.1798444986343384, + "learning_rate": 1.856943888883444e-05, + "loss": 1.0473, + "step": 2208 + }, + { + "epoch": 1.4639311714096626, + "grad_norm": 1.4702142477035522, + "learning_rate": 1.8399477408012643e-05, + "loss": 1.0968, + "step": 2212 + }, + { + "epoch": 1.4665784248841827, + "grad_norm": 1.2086206674575806, + "learning_rate": 1.82301217554196e-05, + "loss": 1.0752, + "step": 2216 + }, + { + "epoch": 1.4692256783587028, + "grad_norm": 1.2675915956497192, + "learning_rate": 1.8061375177850774e-05, + "loss": 1.1505, + "step": 2220 + }, + { + "epoch": 1.471872931833223, + "grad_norm": 1.1746258735656738, + "learning_rate": 1.7893240910424876e-05, + "loss": 1.0708, + "step": 2224 + }, + { + "epoch": 1.4745201853077432, + "grad_norm": 1.2071187496185303, + "learning_rate": 1.772572217652163e-05, + "loss": 1.085, + "step": 2228 + }, + { + "epoch": 1.4771674387822635, + "grad_norm": 1.321071743965149, + "learning_rate": 1.755882218772018e-05, + "loss": 1.1952, + "step": 2232 + }, + { + "epoch": 1.4798146922567836, + "grad_norm": 1.1357455253601074, + "learning_rate": 1.7392544143737355e-05, + "loss": 0.9572, + "step": 2236 + }, + { + "epoch": 1.4824619457313037, + "grad_norm": 1.1780970096588135, + "learning_rate": 1.7226891232366394e-05, + "loss": 0.9885, + "step": 2240 + }, + { + "epoch": 1.4851091992058238, + "grad_norm": 1.017472505569458, + "learning_rate": 1.7061866629415862e-05, + "loss": 1.0184, + "step": 2244 + }, + { + "epoch": 1.4877564526803442, + "grad_norm": 1.0961604118347168, + "learning_rate": 1.6897473498648765e-05, + "loss": 1.0232, + "step": 2248 + }, + { + "epoch": 1.4904037061548643, + "grad_norm": 1.187002182006836, + "learning_rate": 1.673371499172174e-05, + "loss": 0.9823, + "step": 2252 + }, + { + "epoch": 1.4930509596293846, + "grad_norm": 1.1367725133895874, + "learning_rate": 1.6570594248124875e-05, + "loss": 1.0288, + "step": 2256 + }, + { + "epoch": 1.4956982131039047, + "grad_norm": 1.129102110862732, + "learning_rate": 1.640811439512136e-05, + "loss": 1.0688, + "step": 2260 + }, + { + "epoch": 1.4983454665784248, + "grad_norm": 1.1886552572250366, + "learning_rate": 1.6246278547687604e-05, + "loss": 1.0209, + "step": 2264 + }, + { + "epoch": 1.500992720052945, + "grad_norm": 1.2786222696304321, + "learning_rate": 1.6085089808453408e-05, + "loss": 1.1101, + "step": 2268 + }, + { + "epoch": 1.5036399735274653, + "grad_norm": 1.2403247356414795, + "learning_rate": 1.592455126764264e-05, + "loss": 1.0919, + "step": 2272 + }, + { + "epoch": 1.5062872270019856, + "grad_norm": 1.1364173889160156, + "learning_rate": 1.5764666003013905e-05, + "loss": 1.0854, + "step": 2276 + }, + { + "epoch": 1.5089344804765057, + "grad_norm": 1.0539426803588867, + "learning_rate": 1.560543707980152e-05, + "loss": 1.014, + "step": 2280 + }, + { + "epoch": 1.5115817339510258, + "grad_norm": 1.2470543384552002, + "learning_rate": 1.544686755065677e-05, + "loss": 1.0845, + "step": 2284 + }, + { + "epoch": 1.514228987425546, + "grad_norm": 1.3111423254013062, + "learning_rate": 1.5288960455589447e-05, + "loss": 1.1363, + "step": 2288 + }, + { + "epoch": 1.5168762409000662, + "grad_norm": 1.076616883277893, + "learning_rate": 1.5131718821909435e-05, + "loss": 1.0104, + "step": 2292 + }, + { + "epoch": 1.5195234943745863, + "grad_norm": 1.082895040512085, + "learning_rate": 1.4975145664168839e-05, + "loss": 1.0468, + "step": 2296 + }, + { + "epoch": 1.5221707478491067, + "grad_norm": 1.2314468622207642, + "learning_rate": 1.4819243984104015e-05, + "loss": 1.0802, + "step": 2300 + }, + { + "epoch": 1.5248180013236268, + "grad_norm": 1.7986695766448975, + "learning_rate": 1.4664016770578182e-05, + "loss": 1.0324, + "step": 2304 + }, + { + "epoch": 1.5274652547981469, + "grad_norm": 1.2059293985366821, + "learning_rate": 1.4509466999523985e-05, + "loss": 1.0119, + "step": 2308 + }, + { + "epoch": 1.530112508272667, + "grad_norm": 1.1547520160675049, + "learning_rate": 1.4355597633886575e-05, + "loss": 1.0348, + "step": 2312 + }, + { + "epoch": 1.5327597617471873, + "grad_norm": 1.1303229331970215, + "learning_rate": 1.4202411623566685e-05, + "loss": 0.9453, + "step": 2316 + }, + { + "epoch": 1.5354070152217076, + "grad_norm": 1.3329232931137085, + "learning_rate": 1.4049911905364128e-05, + "loss": 1.1958, + "step": 2320 + }, + { + "epoch": 1.5380542686962277, + "grad_norm": 1.2855108976364136, + "learning_rate": 1.3898101402921516e-05, + "loss": 1.1197, + "step": 2324 + }, + { + "epoch": 1.5407015221707479, + "grad_norm": 1.1098500490188599, + "learning_rate": 1.3746983026668198e-05, + "loss": 1.0392, + "step": 2328 + }, + { + "epoch": 1.543348775645268, + "grad_norm": 1.232391119003296, + "learning_rate": 1.359655967376442e-05, + "loss": 1.0877, + "step": 2332 + }, + { + "epoch": 1.545996029119788, + "grad_norm": 1.2778176069259644, + "learning_rate": 1.3446834228045812e-05, + "loss": 1.0646, + "step": 2336 + }, + { + "epoch": 1.5486432825943084, + "grad_norm": 1.0760436058044434, + "learning_rate": 1.3297809559968133e-05, + "loss": 1.0476, + "step": 2340 + }, + { + "epoch": 1.5512905360688287, + "grad_norm": 1.0470277070999146, + "learning_rate": 1.3149488526552201e-05, + "loss": 0.9706, + "step": 2344 + }, + { + "epoch": 1.5539377895433488, + "grad_norm": 1.3804305791854858, + "learning_rate": 1.3001873971329121e-05, + "loss": 1.0437, + "step": 2348 + }, + { + "epoch": 1.556585043017869, + "grad_norm": 1.1428264379501343, + "learning_rate": 1.2854968724285754e-05, + "loss": 1.0923, + "step": 2352 + }, + { + "epoch": 1.559232296492389, + "grad_norm": 1.1798884868621826, + "learning_rate": 1.270877560181054e-05, + "loss": 1.1306, + "step": 2356 + }, + { + "epoch": 1.5618795499669094, + "grad_norm": 1.1382559537887573, + "learning_rate": 1.2563297406639395e-05, + "loss": 1.1029, + "step": 2360 + }, + { + "epoch": 1.5645268034414295, + "grad_norm": 1.0915166139602661, + "learning_rate": 1.2418536927802094e-05, + "loss": 0.9779, + "step": 2364 + }, + { + "epoch": 1.5671740569159498, + "grad_norm": 1.1595373153686523, + "learning_rate": 1.2274496940568664e-05, + "loss": 1.1744, + "step": 2368 + }, + { + "epoch": 1.56982131039047, + "grad_norm": 1.1752400398254395, + "learning_rate": 1.213118020639633e-05, + "loss": 1.0246, + "step": 2372 + }, + { + "epoch": 1.57246856386499, + "grad_norm": 1.064510464668274, + "learning_rate": 1.1988589472876438e-05, + "loss": 1.1571, + "step": 2376 + }, + { + "epoch": 1.5751158173395101, + "grad_norm": 1.2771798372268677, + "learning_rate": 1.184672747368189e-05, + "loss": 1.0656, + "step": 2380 + }, + { + "epoch": 1.5777630708140304, + "grad_norm": 1.2218413352966309, + "learning_rate": 1.1705596928514645e-05, + "loss": 1.0626, + "step": 2384 + }, + { + "epoch": 1.5804103242885508, + "grad_norm": 1.0653800964355469, + "learning_rate": 1.1565200543053623e-05, + "loss": 1.0626, + "step": 2388 + }, + { + "epoch": 1.5830575777630709, + "grad_norm": 1.2271225452423096, + "learning_rate": 1.1425541008902851e-05, + "loss": 1.1017, + "step": 2392 + }, + { + "epoch": 1.585704831237591, + "grad_norm": 1.1287221908569336, + "learning_rate": 1.128662100353985e-05, + "loss": 0.9612, + "step": 2396 + }, + { + "epoch": 1.588352084712111, + "grad_norm": 1.1722044944763184, + "learning_rate": 1.1148443190264246e-05, + "loss": 0.9906, + "step": 2400 + }, + { + "epoch": 1.5909993381866314, + "grad_norm": 1.3099933862686157, + "learning_rate": 1.1011010218146777e-05, + "loss": 1.0637, + "step": 2404 + }, + { + "epoch": 1.5936465916611515, + "grad_norm": 1.1737853288650513, + "learning_rate": 1.0874324721978501e-05, + "loss": 1.082, + "step": 2408 + }, + { + "epoch": 1.5962938451356719, + "grad_norm": 1.258298635482788, + "learning_rate": 1.0738389322220276e-05, + "loss": 1.0151, + "step": 2412 + }, + { + "epoch": 1.598941098610192, + "grad_norm": 1.198495864868164, + "learning_rate": 1.0603206624952482e-05, + "loss": 1.0566, + "step": 2416 + }, + { + "epoch": 1.601588352084712, + "grad_norm": 1.1976563930511475, + "learning_rate": 1.0468779221825103e-05, + "loss": 1.1149, + "step": 2420 + }, + { + "epoch": 1.6042356055592322, + "grad_norm": 1.0899832248687744, + "learning_rate": 1.0335109690008055e-05, + "loss": 1.0187, + "step": 2424 + }, + { + "epoch": 1.6068828590337525, + "grad_norm": 1.3058562278747559, + "learning_rate": 1.0202200592141703e-05, + "loss": 1.1494, + "step": 2428 + }, + { + "epoch": 1.6095301125082728, + "grad_norm": 1.304995059967041, + "learning_rate": 1.0070054476287849e-05, + "loss": 1.1067, + "step": 2432 + }, + { + "epoch": 1.612177365982793, + "grad_norm": 1.2065619230270386, + "learning_rate": 9.938673875880755e-06, + "loss": 1.03, + "step": 2436 + }, + { + "epoch": 1.614824619457313, + "grad_norm": 1.3018181324005127, + "learning_rate": 9.808061309678634e-06, + "loss": 1.1286, + "step": 2440 + }, + { + "epoch": 1.6174718729318331, + "grad_norm": 1.257094144821167, + "learning_rate": 9.678219281715412e-06, + "loss": 1.2452, + "step": 2444 + }, + { + "epoch": 1.6201191264063532, + "grad_norm": 1.1389868259429932, + "learning_rate": 9.549150281252633e-06, + "loss": 1.1589, + "step": 2448 + }, + { + "epoch": 1.6227663798808736, + "grad_norm": 1.2208179235458374, + "learning_rate": 9.420856782731774e-06, + "loss": 1.0969, + "step": 2452 + }, + { + "epoch": 1.625413633355394, + "grad_norm": 1.2272435426712036, + "learning_rate": 9.293341245726794e-06, + "loss": 0.9552, + "step": 2456 + }, + { + "epoch": 1.628060886829914, + "grad_norm": 1.1400785446166992, + "learning_rate": 9.16660611489702e-06, + "loss": 0.9583, + "step": 2460 + }, + { + "epoch": 1.6307081403044341, + "grad_norm": 1.1277272701263428, + "learning_rate": 9.040653819940259e-06, + "loss": 1.0511, + "step": 2464 + }, + { + "epoch": 1.6333553937789542, + "grad_norm": 1.1486189365386963, + "learning_rate": 8.915486775546173e-06, + "loss": 0.9686, + "step": 2468 + }, + { + "epoch": 1.6360026472534746, + "grad_norm": 1.1076239347457886, + "learning_rate": 8.791107381350027e-06, + "loss": 0.9773, + "step": 2472 + }, + { + "epoch": 1.6386499007279947, + "grad_norm": 1.0638751983642578, + "learning_rate": 8.6675180218867e-06, + "loss": 1.0176, + "step": 2476 + }, + { + "epoch": 1.641297154202515, + "grad_norm": 1.201035499572754, + "learning_rate": 8.544721066544964e-06, + "loss": 1.0009, + "step": 2480 + }, + { + "epoch": 1.643944407677035, + "grad_norm": 1.2673206329345703, + "learning_rate": 8.422718869522006e-06, + "loss": 1.1548, + "step": 2484 + }, + { + "epoch": 1.6465916611515552, + "grad_norm": 1.1903181076049805, + "learning_rate": 8.30151376977834e-06, + "loss": 1.0678, + "step": 2488 + }, + { + "epoch": 1.6492389146260753, + "grad_norm": 1.1597754955291748, + "learning_rate": 8.181108090993001e-06, + "loss": 1.0756, + "step": 2492 + }, + { + "epoch": 1.6518861681005956, + "grad_norm": 1.142747163772583, + "learning_rate": 8.061504141518888e-06, + "loss": 1.1026, + "step": 2496 + }, + { + "epoch": 1.654533421575116, + "grad_norm": 1.187888741493225, + "learning_rate": 7.942704214338648e-06, + "loss": 1.0138, + "step": 2500 + }, + { + "epoch": 1.657180675049636, + "grad_norm": 1.1005282402038574, + "learning_rate": 7.824710587020596e-06, + "loss": 1.015, + "step": 2504 + }, + { + "epoch": 1.6598279285241562, + "grad_norm": 1.2265509366989136, + "learning_rate": 7.707525521675097e-06, + "loss": 1.3109, + "step": 2508 + }, + { + "epoch": 1.6624751819986763, + "grad_norm": 1.1046435832977295, + "learning_rate": 7.591151264911239e-06, + "loss": 1.0726, + "step": 2512 + }, + { + "epoch": 1.6651224354731966, + "grad_norm": 1.1124870777130127, + "learning_rate": 7.475590047793712e-06, + "loss": 1.0319, + "step": 2516 + }, + { + "epoch": 1.6677696889477167, + "grad_norm": 1.0768115520477295, + "learning_rate": 7.360844085800023e-06, + "loss": 0.9718, + "step": 2520 + }, + { + "epoch": 1.670416942422237, + "grad_norm": 1.1033765077590942, + "learning_rate": 7.246915578778046e-06, + "loss": 0.9838, + "step": 2524 + }, + { + "epoch": 1.6730641958967571, + "grad_norm": 1.1125131845474243, + "learning_rate": 7.133806710903884e-06, + "loss": 0.9366, + "step": 2528 + }, + { + "epoch": 1.6757114493712773, + "grad_norm": 1.0644124746322632, + "learning_rate": 7.0215196506399515e-06, + "loss": 0.9442, + "step": 2532 + }, + { + "epoch": 1.6783587028457974, + "grad_norm": 1.4144614934921265, + "learning_rate": 6.910056550693356e-06, + "loss": 1.0511, + "step": 2536 + }, + { + "epoch": 1.6810059563203177, + "grad_norm": 1.1880645751953125, + "learning_rate": 6.799419547974739e-06, + "loss": 1.069, + "step": 2540 + }, + { + "epoch": 1.683653209794838, + "grad_norm": 1.2131253480911255, + "learning_rate": 6.6896107635572414e-06, + "loss": 1.11, + "step": 2544 + }, + { + "epoch": 1.6863004632693581, + "grad_norm": 1.1012145280838013, + "learning_rate": 6.580632302635831e-06, + "loss": 1.0216, + "step": 2548 + }, + { + "epoch": 1.6889477167438782, + "grad_norm": 1.4158655405044556, + "learning_rate": 6.472486254486954e-06, + "loss": 0.989, + "step": 2552 + }, + { + "epoch": 1.6915949702183983, + "grad_norm": 1.168895959854126, + "learning_rate": 6.36517469242851e-06, + "loss": 1.1558, + "step": 2556 + }, + { + "epoch": 1.6942422236929184, + "grad_norm": 1.180389642715454, + "learning_rate": 6.258699673780083e-06, + "loss": 1.0815, + "step": 2560 + }, + { + "epoch": 1.6968894771674388, + "grad_norm": 1.186112642288208, + "learning_rate": 6.15306323982347e-06, + "loss": 1.0766, + "step": 2564 + }, + { + "epoch": 1.699536730641959, + "grad_norm": 1.3972220420837402, + "learning_rate": 6.04826741576357e-06, + "loss": 0.933, + "step": 2568 + }, + { + "epoch": 1.7021839841164792, + "grad_norm": 1.0709800720214844, + "learning_rate": 5.944314210689611e-06, + "loss": 0.9295, + "step": 2572 + }, + { + "epoch": 1.7048312375909993, + "grad_norm": 1.131684422492981, + "learning_rate": 5.841205617536516e-06, + "loss": 1.0127, + "step": 2576 + }, + { + "epoch": 1.7074784910655194, + "grad_norm": 1.1289499998092651, + "learning_rate": 5.738943613046821e-06, + "loss": 1.0566, + "step": 2580 + }, + { + "epoch": 1.7101257445400397, + "grad_norm": 1.0850427150726318, + "learning_rate": 5.637530157732673e-06, + "loss": 0.929, + "step": 2584 + }, + { + "epoch": 1.7127729980145598, + "grad_norm": 1.3074991703033447, + "learning_rate": 5.536967195838333e-06, + "loss": 1.1549, + "step": 2588 + }, + { + "epoch": 1.7154202514890802, + "grad_norm": 1.286634922027588, + "learning_rate": 5.437256655302814e-06, + "loss": 1.0361, + "step": 2592 + }, + { + "epoch": 1.7180675049636003, + "grad_norm": 1.098363995552063, + "learning_rate": 5.338400447723008e-06, + "loss": 1.0157, + "step": 2596 + }, + { + "epoch": 1.7207147584381204, + "grad_norm": 1.2663805484771729, + "learning_rate": 5.240400468316975e-06, + "loss": 1.0805, + "step": 2600 + }, + { + "epoch": 1.7233620119126405, + "grad_norm": 1.2380725145339966, + "learning_rate": 5.143258595887607e-06, + "loss": 1.0504, + "step": 2604 + }, + { + "epoch": 1.7260092653871608, + "grad_norm": 1.534725546836853, + "learning_rate": 5.046976692786665e-06, + "loss": 1.0683, + "step": 2608 + }, + { + "epoch": 1.7286565188616811, + "grad_norm": 1.2903854846954346, + "learning_rate": 4.951556604879048e-06, + "loss": 1.1924, + "step": 2612 + }, + { + "epoch": 1.7313037723362013, + "grad_norm": 1.378965139389038, + "learning_rate": 4.857000161507353e-06, + "loss": 1.1261, + "step": 2616 + }, + { + "epoch": 1.7339510258107214, + "grad_norm": 1.3099424839019775, + "learning_rate": 4.763309175456876e-06, + "loss": 1.1385, + "step": 2620 + }, + { + "epoch": 1.7365982792852415, + "grad_norm": 1.1315497159957886, + "learning_rate": 4.67048544292083e-06, + "loss": 1.0022, + "step": 2624 + }, + { + "epoch": 1.7392455327597618, + "grad_norm": 1.0618172883987427, + "learning_rate": 4.5785307434659195e-06, + "loss": 0.933, + "step": 2628 + }, + { + "epoch": 1.741892786234282, + "grad_norm": 1.1535784006118774, + "learning_rate": 4.487446839998194e-06, + "loss": 1.0693, + "step": 2632 + }, + { + "epoch": 1.7445400397088022, + "grad_norm": 1.208883285522461, + "learning_rate": 4.397235478729262e-06, + "loss": 1.0487, + "step": 2636 + }, + { + "epoch": 1.7471872931833223, + "grad_norm": 1.079362392425537, + "learning_rate": 4.307898389142867e-06, + "loss": 1.0225, + "step": 2640 + }, + { + "epoch": 1.7498345466578424, + "grad_norm": 1.1642612218856812, + "learning_rate": 4.21943728396163e-06, + "loss": 1.0915, + "step": 2644 + }, + { + "epoch": 1.7524818001323625, + "grad_norm": 1.202144742012024, + "learning_rate": 4.1318538591143204e-06, + "loss": 0.9903, + "step": 2648 + }, + { + "epoch": 1.7551290536068829, + "grad_norm": 1.182325839996338, + "learning_rate": 4.045149793703257e-06, + "loss": 1.0321, + "step": 2652 + }, + { + "epoch": 1.7577763070814032, + "grad_norm": 1.1768420934677124, + "learning_rate": 3.959326749972159e-06, + "loss": 1.0065, + "step": 2656 + }, + { + "epoch": 1.7604235605559233, + "grad_norm": 1.1037213802337646, + "learning_rate": 3.8743863732742855e-06, + "loss": 1.0145, + "step": 2660 + }, + { + "epoch": 1.7630708140304434, + "grad_norm": 1.0442618131637573, + "learning_rate": 3.790330292040878e-06, + "loss": 0.9401, + "step": 2664 + }, + { + "epoch": 1.7657180675049635, + "grad_norm": 1.2205618619918823, + "learning_rate": 3.7071601177499193e-06, + "loss": 1.0445, + "step": 2668 + }, + { + "epoch": 1.7683653209794836, + "grad_norm": 0.982466995716095, + "learning_rate": 3.6248774448952695e-06, + "loss": 0.9302, + "step": 2672 + }, + { + "epoch": 1.771012574454004, + "grad_norm": 1.2503985166549683, + "learning_rate": 3.5434838509560974e-06, + "loss": 0.9465, + "step": 2676 + }, + { + "epoch": 1.7736598279285243, + "grad_norm": 1.2538197040557861, + "learning_rate": 3.4629808963666355e-06, + "loss": 1.1634, + "step": 2680 + }, + { + "epoch": 1.7763070814030444, + "grad_norm": 1.1053706407546997, + "learning_rate": 3.3833701244862347e-06, + "loss": 0.9964, + "step": 2684 + }, + { + "epoch": 1.7789543348775645, + "grad_norm": 1.2324868440628052, + "learning_rate": 3.304653061569807e-06, + "loss": 1.009, + "step": 2688 + }, + { + "epoch": 1.7816015883520846, + "grad_norm": 1.1064050197601318, + "learning_rate": 3.226831216738568e-06, + "loss": 0.9975, + "step": 2692 + }, + { + "epoch": 1.784248841826605, + "grad_norm": 1.1996777057647705, + "learning_rate": 3.149906081951076e-06, + "loss": 1.1181, + "step": 2696 + }, + { + "epoch": 1.786896095301125, + "grad_norm": 1.0701042413711548, + "learning_rate": 3.0738791319746606e-06, + "loss": 0.9735, + "step": 2700 + }, + { + "epoch": 1.7895433487756454, + "grad_norm": 1.426613211631775, + "learning_rate": 2.9987518243571266e-06, + "loss": 1.0882, + "step": 2704 + }, + { + "epoch": 1.7921906022501655, + "grad_norm": 1.1900283098220825, + "learning_rate": 2.924525599398831e-06, + "loss": 1.0896, + "step": 2708 + }, + { + "epoch": 1.7948378557246856, + "grad_norm": 1.203924536705017, + "learning_rate": 2.8512018801250428e-06, + "loss": 1.0041, + "step": 2712 + }, + { + "epoch": 1.7974851091992057, + "grad_norm": 1.1849395036697388, + "learning_rate": 2.7787820722586844e-06, + "loss": 1.018, + "step": 2716 + }, + { + "epoch": 1.800132362673726, + "grad_norm": 1.3121761083602905, + "learning_rate": 2.707267564193383e-06, + "loss": 1.0887, + "step": 2720 + }, + { + "epoch": 1.8027796161482463, + "grad_norm": 1.0863194465637207, + "learning_rate": 2.636659726966817e-06, + "loss": 0.9601, + "step": 2724 + }, + { + "epoch": 1.8054268696227664, + "grad_norm": 1.2052465677261353, + "learning_rate": 2.5669599142344958e-06, + "loss": 1.1252, + "step": 2728 + }, + { + "epoch": 1.8080741230972865, + "grad_norm": 1.2324072122573853, + "learning_rate": 2.4981694622437545e-06, + "loss": 1.0962, + "step": 2732 + }, + { + "epoch": 1.8107213765718067, + "grad_norm": 1.1981109380722046, + "learning_rate": 2.4302896898081516e-06, + "loss": 1.1382, + "step": 2736 + }, + { + "epoch": 1.813368630046327, + "grad_norm": 1.0790292024612427, + "learning_rate": 2.3633218982821724e-06, + "loss": 1.0246, + "step": 2740 + }, + { + "epoch": 1.816015883520847, + "grad_norm": 1.188328504562378, + "learning_rate": 2.2972673715363268e-06, + "loss": 1.1037, + "step": 2744 + }, + { + "epoch": 1.8186631369953674, + "grad_norm": 2.650550365447998, + "learning_rate": 2.232127375932491e-06, + "loss": 0.9985, + "step": 2748 + }, + { + "epoch": 1.8213103904698875, + "grad_norm": 1.209547758102417, + "learning_rate": 2.1679031602996168e-06, + "loss": 1.0379, + "step": 2752 + }, + { + "epoch": 1.8239576439444076, + "grad_norm": 1.2373130321502686, + "learning_rate": 2.104595955909844e-06, + "loss": 1.1138, + "step": 2756 + }, + { + "epoch": 1.8266048974189277, + "grad_norm": 1.1303315162658691, + "learning_rate": 2.042206976454869e-06, + "loss": 1.0872, + "step": 2760 + }, + { + "epoch": 1.829252150893448, + "grad_norm": 1.1631232500076294, + "learning_rate": 1.980737418022649e-06, + "loss": 0.9993, + "step": 2764 + }, + { + "epoch": 1.8318994043679684, + "grad_norm": 0.9920935034751892, + "learning_rate": 1.9201884590745122e-06, + "loss": 0.9902, + "step": 2768 + }, + { + "epoch": 1.8345466578424885, + "grad_norm": 1.1404036283493042, + "learning_rate": 1.8605612604225387e-06, + "loss": 0.9403, + "step": 2772 + }, + { + "epoch": 1.8371939113170086, + "grad_norm": 1.3009891510009766, + "learning_rate": 1.8018569652073381e-06, + "loss": 1.065, + "step": 2776 + }, + { + "epoch": 1.8398411647915287, + "grad_norm": 1.0856890678405762, + "learning_rate": 1.7440766988760793e-06, + "loss": 1.0082, + "step": 2780 + }, + { + "epoch": 1.8424884182660488, + "grad_norm": 1.2409597635269165, + "learning_rate": 1.6872215691609684e-06, + "loss": 1.2227, + "step": 2784 + }, + { + "epoch": 1.8451356717405691, + "grad_norm": 1.229095458984375, + "learning_rate": 1.631292666057982e-06, + "loss": 1.1196, + "step": 2788 + }, + { + "epoch": 1.8477829252150895, + "grad_norm": 1.1981017589569092, + "learning_rate": 1.5762910618059789e-06, + "loss": 1.1182, + "step": 2792 + }, + { + "epoch": 1.8504301786896096, + "grad_norm": 1.2496317625045776, + "learning_rate": 1.5222178108661444e-06, + "loss": 1.011, + "step": 2796 + }, + { + "epoch": 1.8530774321641297, + "grad_norm": 1.3405871391296387, + "learning_rate": 1.469073949901778e-06, + "loss": 0.9571, + "step": 2800 + }, + { + "epoch": 1.8557246856386498, + "grad_norm": 1.1392794847488403, + "learning_rate": 1.4168604977583989e-06, + "loss": 0.9235, + "step": 2804 + }, + { + "epoch": 1.8583719391131701, + "grad_norm": 1.3417925834655762, + "learning_rate": 1.3655784554442385e-06, + "loss": 0.9861, + "step": 2808 + }, + { + "epoch": 1.8610191925876902, + "grad_norm": 1.2177116870880127, + "learning_rate": 1.3152288061110518e-06, + "loss": 1.0414, + "step": 2812 + }, + { + "epoch": 1.8636664460622105, + "grad_norm": 1.18758225440979, + "learning_rate": 1.2658125150352361e-06, + "loss": 1.0958, + "step": 2816 + }, + { + "epoch": 1.8663136995367307, + "grad_norm": 1.068544864654541, + "learning_rate": 1.2173305295993477e-06, + "loss": 0.8817, + "step": 2820 + }, + { + "epoch": 1.8689609530112508, + "grad_norm": 1.0975282192230225, + "learning_rate": 1.169783779273953e-06, + "loss": 0.9843, + "step": 2824 + }, + { + "epoch": 1.8716082064857709, + "grad_norm": 1.1519986391067505, + "learning_rate": 1.1231731755997954e-06, + "loss": 1.1748, + "step": 2828 + }, + { + "epoch": 1.8742554599602912, + "grad_norm": 1.3243839740753174, + "learning_rate": 1.0774996121702908e-06, + "loss": 1.0024, + "step": 2832 + }, + { + "epoch": 1.8769027134348115, + "grad_norm": 1.1130131483078003, + "learning_rate": 1.0327639646144415e-06, + "loss": 0.9669, + "step": 2836 + }, + { + "epoch": 1.8795499669093316, + "grad_norm": 1.2060186862945557, + "learning_rate": 9.889670905800397e-07, + "loss": 0.9385, + "step": 2840 + }, + { + "epoch": 1.8821972203838517, + "grad_norm": 1.1549471616744995, + "learning_rate": 9.461098297172011e-07, + "loss": 0.9559, + "step": 2844 + }, + { + "epoch": 1.8848444738583718, + "grad_norm": 1.1581448316574097, + "learning_rate": 9.041930036622903e-07, + "loss": 1.069, + "step": 2848 + }, + { + "epoch": 1.8874917273328922, + "grad_norm": 1.1043188571929932, + "learning_rate": 8.632174160221496e-07, + "loss": 1.0042, + "step": 2852 + }, + { + "epoch": 1.8901389808074123, + "grad_norm": 1.1459840536117554, + "learning_rate": 8.231838523587277e-07, + "loss": 0.9267, + "step": 2856 + }, + { + "epoch": 1.8927862342819326, + "grad_norm": 1.2066096067428589, + "learning_rate": 7.840930801739754e-07, + "loss": 1.0465, + "step": 2860 + }, + { + "epoch": 1.8954334877564527, + "grad_norm": 1.2505649328231812, + "learning_rate": 7.459458488951632e-07, + "loss": 1.0685, + "step": 2864 + }, + { + "epoch": 1.8980807412309728, + "grad_norm": 1.138899564743042, + "learning_rate": 7.087428898604975e-07, + "loss": 1.0052, + "step": 2868 + }, + { + "epoch": 1.900727994705493, + "grad_norm": 1.1179523468017578, + "learning_rate": 6.724849163050995e-07, + "loss": 0.9854, + "step": 2872 + }, + { + "epoch": 1.9033752481800132, + "grad_norm": 1.3499395847320557, + "learning_rate": 6.37172623347354e-07, + "loss": 1.0413, + "step": 2876 + }, + { + "epoch": 1.9060225016545336, + "grad_norm": 1.0739634037017822, + "learning_rate": 6.02806687975549e-07, + "loss": 1.1554, + "step": 2880 + }, + { + "epoch": 1.9086697551290537, + "grad_norm": 1.0829598903656006, + "learning_rate": 5.693877690349292e-07, + "loss": 1.0416, + "step": 2884 + }, + { + "epoch": 1.9113170086035738, + "grad_norm": 1.0071786642074585, + "learning_rate": 5.369165072150239e-07, + "loss": 0.929, + "step": 2888 + }, + { + "epoch": 1.913964262078094, + "grad_norm": 1.1580030918121338, + "learning_rate": 5.053935250374176e-07, + "loss": 1.0629, + "step": 2892 + }, + { + "epoch": 1.916611515552614, + "grad_norm": 1.2572953701019287, + "learning_rate": 4.7481942684378113e-07, + "loss": 1.1105, + "step": 2896 + }, + { + "epoch": 1.9192587690271343, + "grad_norm": 1.1861546039581299, + "learning_rate": 4.451947987842764e-07, + "loss": 1.0511, + "step": 2900 + }, + { + "epoch": 1.9219060225016547, + "grad_norm": 1.1360516548156738, + "learning_rate": 4.165202088063425e-07, + "loss": 1.0623, + "step": 2904 + }, + { + "epoch": 1.9245532759761748, + "grad_norm": 1.1186720132827759, + "learning_rate": 3.8879620664381e-07, + "loss": 0.9999, + "step": 2908 + }, + { + "epoch": 1.9272005294506949, + "grad_norm": 1.2490679025650024, + "learning_rate": 3.620233238063375e-07, + "loss": 1.0442, + "step": 2912 + }, + { + "epoch": 1.929847782925215, + "grad_norm": 1.309167504310608, + "learning_rate": 3.362020735692417e-07, + "loss": 1.1706, + "step": 2916 + }, + { + "epoch": 1.9324950363997353, + "grad_norm": 1.1864930391311646, + "learning_rate": 3.1133295096364977e-07, + "loss": 1.0731, + "step": 2920 + }, + { + "epoch": 1.9351422898742554, + "grad_norm": 1.1746701002120972, + "learning_rate": 2.87416432767007e-07, + "loss": 1.0544, + "step": 2924 + }, + { + "epoch": 1.9377895433487757, + "grad_norm": 1.272407054901123, + "learning_rate": 2.644529774939397e-07, + "loss": 1.0909, + "step": 2928 + }, + { + "epoch": 1.9404367968232958, + "grad_norm": 1.1303869485855103, + "learning_rate": 2.4244302538746766e-07, + "loss": 0.9551, + "step": 2932 + }, + { + "epoch": 1.943084050297816, + "grad_norm": 1.0882586240768433, + "learning_rate": 2.2138699841056655e-07, + "loss": 0.9893, + "step": 2936 + }, + { + "epoch": 1.945731303772336, + "grad_norm": 1.2608906030654907, + "learning_rate": 2.012853002380466e-07, + "loss": 1.0569, + "step": 2940 + }, + { + "epoch": 1.9483785572468564, + "grad_norm": 1.2106075286865234, + "learning_rate": 1.8213831624887545e-07, + "loss": 0.9922, + "step": 2944 + }, + { + "epoch": 1.9510258107213767, + "grad_norm": 1.1815046072006226, + "learning_rate": 1.6394641351872297e-07, + "loss": 1.0113, + "step": 2948 + }, + { + "epoch": 1.9536730641958968, + "grad_norm": 1.1953189373016357, + "learning_rate": 1.4670994081297795e-07, + "loss": 1.0361, + "step": 2952 + }, + { + "epoch": 1.956320317670417, + "grad_norm": 1.0204826593399048, + "learning_rate": 1.3042922858002015e-07, + "loss": 0.9583, + "step": 2956 + }, + { + "epoch": 1.958967571144937, + "grad_norm": 1.1778640747070312, + "learning_rate": 1.1510458894490871e-07, + "loss": 1.0795, + "step": 2960 + }, + { + "epoch": 1.9616148246194574, + "grad_norm": 1.1050951480865479, + "learning_rate": 1.0073631570340358e-07, + "loss": 0.947, + "step": 2964 + }, + { + "epoch": 1.9642620780939775, + "grad_norm": 1.4342139959335327, + "learning_rate": 8.732468431630892e-08, + "loss": 0.9858, + "step": 2968 + }, + { + "epoch": 1.9669093315684978, + "grad_norm": 1.3275805711746216, + "learning_rate": 7.486995190420509e-08, + "loss": 1.0232, + "step": 2972 + }, + { + "epoch": 1.969556585043018, + "grad_norm": 1.862630844116211, + "learning_rate": 6.337235724254154e-08, + "loss": 1.1036, + "step": 2976 + }, + { + "epoch": 1.972203838517538, + "grad_norm": 1.1249923706054688, + "learning_rate": 5.2832120757007054e-08, + "loss": 1.1517, + "step": 2980 + }, + { + "epoch": 1.974851091992058, + "grad_norm": 1.4025081396102905, + "learning_rate": 4.324944451934987e-08, + "loss": 1.1827, + "step": 2984 + }, + { + "epoch": 1.9774983454665784, + "grad_norm": 1.2881486415863037, + "learning_rate": 3.4624512243497386e-08, + "loss": 0.9921, + "step": 2988 + }, + { + "epoch": 1.9801455989410988, + "grad_norm": 1.256659746170044, + "learning_rate": 2.6957489281997926e-08, + "loss": 1.0058, + "step": 2992 + }, + { + "epoch": 1.9827928524156189, + "grad_norm": 1.2083126306533813, + "learning_rate": 2.0248522622906552e-08, + "loss": 1.0364, + "step": 2996 + }, + { + "epoch": 1.985440105890139, + "grad_norm": 1.2300423383712769, + "learning_rate": 1.4497740886920685e-08, + "loss": 1.056, + "step": 3000 + }, + { + "epoch": 1.988087359364659, + "grad_norm": 1.1946439743041992, + "learning_rate": 9.70525432493763e-09, + "loss": 1.1, + "step": 3004 + }, + { + "epoch": 1.9907346128391792, + "grad_norm": 1.1879000663757324, + "learning_rate": 5.8711548159229305e-09, + "loss": 0.9764, + "step": 3008 + }, + { + "epoch": 1.9933818663136995, + "grad_norm": 1.7793687582015991, + "learning_rate": 2.9955158651839845e-09, + "loss": 1.0218, + "step": 3012 + }, + { + "epoch": 1.9960291197882198, + "grad_norm": 1.2599058151245117, + "learning_rate": 1.0783926029211966e-09, + "loss": 1.0414, + "step": 3016 + }, + { + "epoch": 1.99867637326274, + "grad_norm": 1.2598057985305786, + "learning_rate": 1.1982178318437066e-10, + "loss": 1.1198, + "step": 3020 + } + ], + "logging_steps": 4, + "max_steps": 3022, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1511, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.115408240900833e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}