diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12831 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1827, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005473453749315818, + "grad_norm": 6.498785495758057, + "learning_rate": 9.999992607989888e-06, + "loss": 3.6359, + "step": 1 + }, + { + "epoch": 0.0010946907498631637, + "grad_norm": 6.897223472595215, + "learning_rate": 9.999970431981408e-06, + "loss": 4.6947, + "step": 2 + }, + { + "epoch": 0.0016420361247947454, + "grad_norm": 7.065066337585449, + "learning_rate": 9.999933472040129e-06, + "loss": 4.3915, + "step": 3 + }, + { + "epoch": 0.0021893814997263274, + "grad_norm": 6.855522155761719, + "learning_rate": 9.999881728275334e-06, + "loss": 3.9864, + "step": 4 + }, + { + "epoch": 0.002736726874657909, + "grad_norm": 9.54251480102539, + "learning_rate": 9.99981520084002e-06, + "loss": 3.7584, + "step": 5 + }, + { + "epoch": 0.003284072249589491, + "grad_norm": 9.166783332824707, + "learning_rate": 9.999733889930897e-06, + "loss": 4.3282, + "step": 6 + }, + { + "epoch": 0.0038314176245210726, + "grad_norm": 9.869710922241211, + "learning_rate": 9.999637795788383e-06, + "loss": 3.745, + "step": 7 + }, + { + "epoch": 0.004378762999452655, + "grad_norm": 10.634393692016602, + "learning_rate": 9.999526918696613e-06, + "loss": 3.2058, + "step": 8 + }, + { + "epoch": 0.0049261083743842365, + "grad_norm": 11.015477180480957, + "learning_rate": 9.999401258983426e-06, + "loss": 4.0171, + "step": 9 + }, + { + "epoch": 0.005473453749315818, + "grad_norm": 13.21458911895752, + "learning_rate": 9.999260817020373e-06, + "loss": 2.6568, + "step": 10 + }, + { + "epoch": 0.0060207991242474, + "grad_norm": 11.63002872467041, + "learning_rate": 9.999105593222714e-06, + "loss": 2.5763, + "step": 11 + }, + { + "epoch": 0.006568144499178982, + "grad_norm": 6.239895820617676, + "learning_rate": 9.998935588049414e-06, + "loss": 2.3638, + "step": 12 + }, + { + "epoch": 0.0071154898741105635, + "grad_norm": 16.099512100219727, + "learning_rate": 9.998750802003148e-06, + "loss": 3.0956, + "step": 13 + }, + { + "epoch": 0.007662835249042145, + "grad_norm": 9.356679916381836, + "learning_rate": 9.99855123563029e-06, + "loss": 2.1751, + "step": 14 + }, + { + "epoch": 0.008210180623973728, + "grad_norm": 7.839269638061523, + "learning_rate": 9.99833688952092e-06, + "loss": 2.5912, + "step": 15 + }, + { + "epoch": 0.00875752599890531, + "grad_norm": 10.665407180786133, + "learning_rate": 9.998107764308815e-06, + "loss": 2.2888, + "step": 16 + }, + { + "epoch": 0.009304871373836891, + "grad_norm": 8.104930877685547, + "learning_rate": 9.997863860671457e-06, + "loss": 1.4194, + "step": 17 + }, + { + "epoch": 0.009852216748768473, + "grad_norm": 5.245537757873535, + "learning_rate": 9.997605179330018e-06, + "loss": 2.3864, + "step": 18 + }, + { + "epoch": 0.010399562123700055, + "grad_norm": 9.132122039794922, + "learning_rate": 9.99733172104937e-06, + "loss": 2.0268, + "step": 19 + }, + { + "epoch": 0.010946907498631636, + "grad_norm": 6.973398208618164, + "learning_rate": 9.997043486638076e-06, + "loss": 1.4236, + "step": 20 + }, + { + "epoch": 0.011494252873563218, + "grad_norm": 4.4784159660339355, + "learning_rate": 9.996740476948386e-06, + "loss": 1.0039, + "step": 21 + }, + { + "epoch": 0.0120415982484948, + "grad_norm": 4.953526020050049, + "learning_rate": 9.996422692876242e-06, + "loss": 1.2214, + "step": 22 + }, + { + "epoch": 0.012588943623426382, + "grad_norm": 9.643818855285645, + "learning_rate": 9.996090135361269e-06, + "loss": 1.1022, + "step": 23 + }, + { + "epoch": 0.013136288998357963, + "grad_norm": 7.458937644958496, + "learning_rate": 9.995742805386775e-06, + "loss": 1.861, + "step": 24 + }, + { + "epoch": 0.013683634373289545, + "grad_norm": 3.3844003677368164, + "learning_rate": 9.995380703979744e-06, + "loss": 0.3331, + "step": 25 + }, + { + "epoch": 0.014230979748221127, + "grad_norm": 4.211463928222656, + "learning_rate": 9.995003832210843e-06, + "loss": 0.3263, + "step": 26 + }, + { + "epoch": 0.014778325123152709, + "grad_norm": 5.280601978302002, + "learning_rate": 9.994612191194407e-06, + "loss": 0.6696, + "step": 27 + }, + { + "epoch": 0.01532567049808429, + "grad_norm": 1.4705464839935303, + "learning_rate": 9.994205782088438e-06, + "loss": 0.1472, + "step": 28 + }, + { + "epoch": 0.015873015873015872, + "grad_norm": 4.857443332672119, + "learning_rate": 9.993784606094612e-06, + "loss": 0.3761, + "step": 29 + }, + { + "epoch": 0.016420361247947456, + "grad_norm": 1.6145434379577637, + "learning_rate": 9.993348664458263e-06, + "loss": 0.0976, + "step": 30 + }, + { + "epoch": 0.016967706622879036, + "grad_norm": 0.5500306487083435, + "learning_rate": 9.992897958468386e-06, + "loss": 0.0244, + "step": 31 + }, + { + "epoch": 0.01751505199781062, + "grad_norm": 1.4634612798690796, + "learning_rate": 9.992432489457626e-06, + "loss": 0.0901, + "step": 32 + }, + { + "epoch": 0.0180623973727422, + "grad_norm": 0.18377567827701569, + "learning_rate": 9.991952258802288e-06, + "loss": 0.0113, + "step": 33 + }, + { + "epoch": 0.018609742747673783, + "grad_norm": 0.2462746500968933, + "learning_rate": 9.99145726792232e-06, + "loss": 0.0164, + "step": 34 + }, + { + "epoch": 0.019157088122605363, + "grad_norm": 0.3507516384124756, + "learning_rate": 9.990947518281312e-06, + "loss": 0.0084, + "step": 35 + }, + { + "epoch": 0.019704433497536946, + "grad_norm": 0.05946524441242218, + "learning_rate": 9.990423011386489e-06, + "loss": 0.0025, + "step": 36 + }, + { + "epoch": 0.020251778872468526, + "grad_norm": 0.29986003041267395, + "learning_rate": 9.989883748788724e-06, + "loss": 0.0155, + "step": 37 + }, + { + "epoch": 0.02079912424740011, + "grad_norm": 0.036302998661994934, + "learning_rate": 9.989329732082504e-06, + "loss": 0.0018, + "step": 38 + }, + { + "epoch": 0.021346469622331693, + "grad_norm": 0.18141533434391022, + "learning_rate": 9.98876096290595e-06, + "loss": 0.0096, + "step": 39 + }, + { + "epoch": 0.021893814997263273, + "grad_norm": 3.4637537002563477, + "learning_rate": 9.988177442940803e-06, + "loss": 1.3062, + "step": 40 + }, + { + "epoch": 0.022441160372194856, + "grad_norm": 0.18254823982715607, + "learning_rate": 9.987579173912413e-06, + "loss": 0.0111, + "step": 41 + }, + { + "epoch": 0.022988505747126436, + "grad_norm": 5.862061023712158, + "learning_rate": 9.986966157589751e-06, + "loss": 1.1001, + "step": 42 + }, + { + "epoch": 0.02353585112205802, + "grad_norm": 0.026641806587576866, + "learning_rate": 9.986338395785377e-06, + "loss": 0.0012, + "step": 43 + }, + { + "epoch": 0.0240831964969896, + "grad_norm": 1.614923119544983, + "learning_rate": 9.985695890355467e-06, + "loss": 0.0334, + "step": 44 + }, + { + "epoch": 0.024630541871921183, + "grad_norm": 0.09787950664758682, + "learning_rate": 9.98503864319978e-06, + "loss": 0.0042, + "step": 45 + }, + { + "epoch": 0.025177887246852763, + "grad_norm": 0.04275774955749512, + "learning_rate": 9.98436665626167e-06, + "loss": 0.0019, + "step": 46 + }, + { + "epoch": 0.025725232621784347, + "grad_norm": 0.03735092282295227, + "learning_rate": 9.983679931528068e-06, + "loss": 0.0018, + "step": 47 + }, + { + "epoch": 0.026272577996715927, + "grad_norm": 0.07040659338235855, + "learning_rate": 9.982978471029485e-06, + "loss": 0.0039, + "step": 48 + }, + { + "epoch": 0.02681992337164751, + "grad_norm": 0.015653884038329124, + "learning_rate": 9.982262276840002e-06, + "loss": 0.0007, + "step": 49 + }, + { + "epoch": 0.02736726874657909, + "grad_norm": 1.0839862823486328, + "learning_rate": 9.981531351077266e-06, + "loss": 0.0126, + "step": 50 + }, + { + "epoch": 0.027914614121510674, + "grad_norm": 0.02633080445230007, + "learning_rate": 9.980785695902481e-06, + "loss": 0.0009, + "step": 51 + }, + { + "epoch": 0.028461959496442254, + "grad_norm": 7.923893928527832, + "learning_rate": 9.980025313520403e-06, + "loss": 0.2134, + "step": 52 + }, + { + "epoch": 0.029009304871373837, + "grad_norm": 6.742312431335449, + "learning_rate": 9.979250206179333e-06, + "loss": 0.4451, + "step": 53 + }, + { + "epoch": 0.029556650246305417, + "grad_norm": 0.008141374215483665, + "learning_rate": 9.978460376171113e-06, + "loss": 0.0004, + "step": 54 + }, + { + "epoch": 0.030103995621237, + "grad_norm": 0.006501413881778717, + "learning_rate": 9.977655825831114e-06, + "loss": 0.0003, + "step": 55 + }, + { + "epoch": 0.03065134099616858, + "grad_norm": 0.04851650074124336, + "learning_rate": 9.976836557538234e-06, + "loss": 0.0023, + "step": 56 + }, + { + "epoch": 0.031198686371100164, + "grad_norm": 0.008002854883670807, + "learning_rate": 9.97600257371489e-06, + "loss": 0.0003, + "step": 57 + }, + { + "epoch": 0.031746031746031744, + "grad_norm": 0.0038702317979186773, + "learning_rate": 9.975153876827008e-06, + "loss": 0.0002, + "step": 58 + }, + { + "epoch": 0.03229337712096333, + "grad_norm": 0.006718830205500126, + "learning_rate": 9.974290469384019e-06, + "loss": 0.0003, + "step": 59 + }, + { + "epoch": 0.03284072249589491, + "grad_norm": 0.011266892775893211, + "learning_rate": 9.973412353938847e-06, + "loss": 0.0005, + "step": 60 + }, + { + "epoch": 0.033388067870826495, + "grad_norm": 0.07028752565383911, + "learning_rate": 9.97251953308791e-06, + "loss": 0.0024, + "step": 61 + }, + { + "epoch": 0.03393541324575807, + "grad_norm": 0.022092929109930992, + "learning_rate": 9.971612009471105e-06, + "loss": 0.0008, + "step": 62 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 0.024217141792178154, + "learning_rate": 9.970689785771798e-06, + "loss": 0.0011, + "step": 63 + }, + { + "epoch": 0.03503010399562124, + "grad_norm": 0.021001331508159637, + "learning_rate": 9.969752864716828e-06, + "loss": 0.0009, + "step": 64 + }, + { + "epoch": 0.03557744937055282, + "grad_norm": 0.01548935379832983, + "learning_rate": 9.968801249076484e-06, + "loss": 0.0007, + "step": 65 + }, + { + "epoch": 0.0361247947454844, + "grad_norm": 5.745925426483154, + "learning_rate": 9.967834941664508e-06, + "loss": 1.477, + "step": 66 + }, + { + "epoch": 0.03667214012041598, + "grad_norm": 0.02599332295358181, + "learning_rate": 9.96685394533808e-06, + "loss": 0.0009, + "step": 67 + }, + { + "epoch": 0.037219485495347565, + "grad_norm": 7.921395778656006, + "learning_rate": 9.965858262997817e-06, + "loss": 1.0394, + "step": 68 + }, + { + "epoch": 0.03776683087027915, + "grad_norm": 1.6005064249038696, + "learning_rate": 9.964847897587753e-06, + "loss": 0.0348, + "step": 69 + }, + { + "epoch": 0.038314176245210725, + "grad_norm": 0.01647241599857807, + "learning_rate": 9.963822852095344e-06, + "loss": 0.0007, + "step": 70 + }, + { + "epoch": 0.03886152162014231, + "grad_norm": 0.01896149478852749, + "learning_rate": 9.962783129551447e-06, + "loss": 0.0007, + "step": 71 + }, + { + "epoch": 0.03940886699507389, + "grad_norm": 3.117213487625122, + "learning_rate": 9.961728733030318e-06, + "loss": 0.1582, + "step": 72 + }, + { + "epoch": 0.039956212370005476, + "grad_norm": 0.005239859223365784, + "learning_rate": 9.9606596656496e-06, + "loss": 0.0002, + "step": 73 + }, + { + "epoch": 0.04050355774493705, + "grad_norm": 0.010306102223694324, + "learning_rate": 9.959575930570318e-06, + "loss": 0.0005, + "step": 74 + }, + { + "epoch": 0.041050903119868636, + "grad_norm": 0.005997025407850742, + "learning_rate": 9.958477530996862e-06, + "loss": 0.0002, + "step": 75 + }, + { + "epoch": 0.04159824849480022, + "grad_norm": 0.00372750754468143, + "learning_rate": 9.957364470176986e-06, + "loss": 0.0002, + "step": 76 + }, + { + "epoch": 0.0421455938697318, + "grad_norm": 0.021045928820967674, + "learning_rate": 9.95623675140179e-06, + "loss": 0.0009, + "step": 77 + }, + { + "epoch": 0.042692939244663386, + "grad_norm": 6.066615581512451, + "learning_rate": 9.955094378005723e-06, + "loss": 0.912, + "step": 78 + }, + { + "epoch": 0.04324028461959496, + "grad_norm": 0.002792640123516321, + "learning_rate": 9.953937353366551e-06, + "loss": 0.0001, + "step": 79 + }, + { + "epoch": 0.043787629994526546, + "grad_norm": 0.03558952733874321, + "learning_rate": 9.952765680905378e-06, + "loss": 0.0017, + "step": 80 + }, + { + "epoch": 0.04433497536945813, + "grad_norm": 0.23785309493541718, + "learning_rate": 9.951579364086603e-06, + "loss": 0.0061, + "step": 81 + }, + { + "epoch": 0.04488232074438971, + "grad_norm": 0.005056190770119429, + "learning_rate": 9.950378406417935e-06, + "loss": 0.0002, + "step": 82 + }, + { + "epoch": 0.04542966611932129, + "grad_norm": 0.005788102280348539, + "learning_rate": 9.949162811450373e-06, + "loss": 0.0003, + "step": 83 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 0.027935905382037163, + "learning_rate": 9.947932582778188e-06, + "loss": 0.0011, + "step": 84 + }, + { + "epoch": 0.046524356869184456, + "grad_norm": 0.10212934762239456, + "learning_rate": 9.946687724038929e-06, + "loss": 0.003, + "step": 85 + }, + { + "epoch": 0.04707170224411604, + "grad_norm": 0.0034679218661040068, + "learning_rate": 9.945428238913396e-06, + "loss": 0.0002, + "step": 86 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.006359061226248741, + "learning_rate": 9.944154131125643e-06, + "loss": 0.0003, + "step": 87 + }, + { + "epoch": 0.0481663929939792, + "grad_norm": 0.011999201029539108, + "learning_rate": 9.942865404442955e-06, + "loss": 0.0006, + "step": 88 + }, + { + "epoch": 0.04871373836891078, + "grad_norm": 5.890190601348877, + "learning_rate": 9.941562062675848e-06, + "loss": 0.572, + "step": 89 + }, + { + "epoch": 0.04926108374384237, + "grad_norm": 3.3754000663757324, + "learning_rate": 9.940244109678043e-06, + "loss": 0.2577, + "step": 90 + }, + { + "epoch": 0.04980842911877394, + "grad_norm": 0.003820559708401561, + "learning_rate": 9.938911549346473e-06, + "loss": 0.0002, + "step": 91 + }, + { + "epoch": 0.05035577449370553, + "grad_norm": 0.18967939913272858, + "learning_rate": 9.937564385621254e-06, + "loss": 0.0082, + "step": 92 + }, + { + "epoch": 0.05090311986863711, + "grad_norm": 0.007278892211616039, + "learning_rate": 9.936202622485687e-06, + "loss": 0.0004, + "step": 93 + }, + { + "epoch": 0.051450465243568694, + "grad_norm": 6.680490016937256, + "learning_rate": 9.93482626396624e-06, + "loss": 0.8037, + "step": 94 + }, + { + "epoch": 0.05199781061850027, + "grad_norm": 0.09887053072452545, + "learning_rate": 9.933435314132534e-06, + "loss": 0.0048, + "step": 95 + }, + { + "epoch": 0.052545155993431854, + "grad_norm": 3.9046106338500977, + "learning_rate": 9.932029777097333e-06, + "loss": 0.3514, + "step": 96 + }, + { + "epoch": 0.05309250136836344, + "grad_norm": 0.015004507265985012, + "learning_rate": 9.93060965701654e-06, + "loss": 0.0007, + "step": 97 + }, + { + "epoch": 0.05363984674329502, + "grad_norm": 0.1083778440952301, + "learning_rate": 9.929174958089167e-06, + "loss": 0.0059, + "step": 98 + }, + { + "epoch": 0.054187192118226604, + "grad_norm": 0.0027883078437298536, + "learning_rate": 9.927725684557339e-06, + "loss": 0.0001, + "step": 99 + }, + { + "epoch": 0.05473453749315818, + "grad_norm": 0.03726564347743988, + "learning_rate": 9.926261840706275e-06, + "loss": 0.0019, + "step": 100 + }, + { + "epoch": 0.055281882868089764, + "grad_norm": 0.016526643186807632, + "learning_rate": 9.924783430864273e-06, + "loss": 0.0008, + "step": 101 + }, + { + "epoch": 0.05582922824302135, + "grad_norm": 0.009402341209352016, + "learning_rate": 9.923290459402701e-06, + "loss": 0.0004, + "step": 102 + }, + { + "epoch": 0.05637657361795293, + "grad_norm": 0.012751290574669838, + "learning_rate": 9.921782930735985e-06, + "loss": 0.0006, + "step": 103 + }, + { + "epoch": 0.05692391899288451, + "grad_norm": 0.0394417904317379, + "learning_rate": 9.92026084932159e-06, + "loss": 0.0019, + "step": 104 + }, + { + "epoch": 0.05747126436781609, + "grad_norm": 0.02528909221291542, + "learning_rate": 9.918724219660013e-06, + "loss": 0.0013, + "step": 105 + }, + { + "epoch": 0.058018609742747675, + "grad_norm": 0.0044928742572665215, + "learning_rate": 9.917173046294769e-06, + "loss": 0.0002, + "step": 106 + }, + { + "epoch": 0.05856595511767926, + "grad_norm": 0.011569995433092117, + "learning_rate": 9.91560733381237e-06, + "loss": 0.0005, + "step": 107 + }, + { + "epoch": 0.059113300492610835, + "grad_norm": 0.005935823544859886, + "learning_rate": 9.914027086842323e-06, + "loss": 0.0003, + "step": 108 + }, + { + "epoch": 0.05966064586754242, + "grad_norm": 0.0639987364411354, + "learning_rate": 9.912432310057108e-06, + "loss": 0.0031, + "step": 109 + }, + { + "epoch": 0.060207991242474, + "grad_norm": 0.2608802318572998, + "learning_rate": 9.910823008172168e-06, + "loss": 0.011, + "step": 110 + }, + { + "epoch": 0.060755336617405585, + "grad_norm": 0.0046829953789711, + "learning_rate": 9.909199185945893e-06, + "loss": 0.0002, + "step": 111 + }, + { + "epoch": 0.06130268199233716, + "grad_norm": 0.008316273801028728, + "learning_rate": 9.907560848179607e-06, + "loss": 0.0003, + "step": 112 + }, + { + "epoch": 0.061850027367268745, + "grad_norm": 0.012634074315428734, + "learning_rate": 9.905907999717551e-06, + "loss": 0.0004, + "step": 113 + }, + { + "epoch": 0.06239737274220033, + "grad_norm": 0.2069849669933319, + "learning_rate": 9.90424064544688e-06, + "loss": 0.0092, + "step": 114 + }, + { + "epoch": 0.06294471811713191, + "grad_norm": 4.1578803062438965, + "learning_rate": 9.902558790297631e-06, + "loss": 0.3153, + "step": 115 + }, + { + "epoch": 0.06349206349206349, + "grad_norm": 0.3431181013584137, + "learning_rate": 9.900862439242719e-06, + "loss": 0.0144, + "step": 116 + }, + { + "epoch": 0.06403940886699508, + "grad_norm": 0.18196187913417816, + "learning_rate": 9.899151597297923e-06, + "loss": 0.0085, + "step": 117 + }, + { + "epoch": 0.06458675424192666, + "grad_norm": 0.011420561000704765, + "learning_rate": 9.897426269521868e-06, + "loss": 0.0005, + "step": 118 + }, + { + "epoch": 0.06513409961685823, + "grad_norm": 0.20257817208766937, + "learning_rate": 9.895686461016007e-06, + "loss": 0.0096, + "step": 119 + }, + { + "epoch": 0.06568144499178982, + "grad_norm": 0.0031977526377886534, + "learning_rate": 9.893932176924616e-06, + "loss": 0.0002, + "step": 120 + }, + { + "epoch": 0.0662287903667214, + "grad_norm": 0.0007180224638432264, + "learning_rate": 9.892163422434767e-06, + "loss": 0.0001, + "step": 121 + }, + { + "epoch": 0.06677613574165299, + "grad_norm": 5.340695381164551, + "learning_rate": 9.890380202776323e-06, + "loss": 0.503, + "step": 122 + }, + { + "epoch": 0.06732348111658457, + "grad_norm": 0.0025924837682396173, + "learning_rate": 9.888582523221912e-06, + "loss": 0.0001, + "step": 123 + }, + { + "epoch": 0.06787082649151614, + "grad_norm": 2.3252947330474854, + "learning_rate": 9.886770389086923e-06, + "loss": 0.2639, + "step": 124 + }, + { + "epoch": 0.06841817186644773, + "grad_norm": 0.004119732417166233, + "learning_rate": 9.884943805729481e-06, + "loss": 0.0002, + "step": 125 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 0.06781020015478134, + "learning_rate": 9.883102778550434e-06, + "loss": 0.0022, + "step": 126 + }, + { + "epoch": 0.06951286261631089, + "grad_norm": 4.811086654663086, + "learning_rate": 9.88124731299334e-06, + "loss": 0.4408, + "step": 127 + }, + { + "epoch": 0.07006020799124248, + "grad_norm": 0.008488141000270844, + "learning_rate": 9.879377414544444e-06, + "loss": 0.0003, + "step": 128 + }, + { + "epoch": 0.07060755336617405, + "grad_norm": 0.007522579748183489, + "learning_rate": 9.877493088732672e-06, + "loss": 0.0004, + "step": 129 + }, + { + "epoch": 0.07115489874110564, + "grad_norm": 0.011271845549345016, + "learning_rate": 9.875594341129607e-06, + "loss": 0.0005, + "step": 130 + }, + { + "epoch": 0.07170224411603722, + "grad_norm": 0.87716144323349, + "learning_rate": 9.873681177349473e-06, + "loss": 0.0288, + "step": 131 + }, + { + "epoch": 0.0722495894909688, + "grad_norm": 0.021911056712269783, + "learning_rate": 9.871753603049117e-06, + "loss": 0.0009, + "step": 132 + }, + { + "epoch": 0.07279693486590039, + "grad_norm": 0.0031497348099946976, + "learning_rate": 9.869811623928001e-06, + "loss": 0.0001, + "step": 133 + }, + { + "epoch": 0.07334428024083196, + "grad_norm": 1.4936490058898926, + "learning_rate": 9.86785524572818e-06, + "loss": 0.1167, + "step": 134 + }, + { + "epoch": 0.07389162561576355, + "grad_norm": 0.11308709532022476, + "learning_rate": 9.865884474234275e-06, + "loss": 0.0048, + "step": 135 + }, + { + "epoch": 0.07443897099069513, + "grad_norm": 0.008609531447291374, + "learning_rate": 9.863899315273475e-06, + "loss": 0.0004, + "step": 136 + }, + { + "epoch": 0.0749863163656267, + "grad_norm": 0.35647737979888916, + "learning_rate": 9.861899774715504e-06, + "loss": 0.0147, + "step": 137 + }, + { + "epoch": 0.0755336617405583, + "grad_norm": 6.980875492095947, + "learning_rate": 9.859885858472614e-06, + "loss": 0.8246, + "step": 138 + }, + { + "epoch": 0.07608100711548987, + "grad_norm": 0.002529384568333626, + "learning_rate": 9.857857572499559e-06, + "loss": 0.0001, + "step": 139 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 7.442034721374512, + "learning_rate": 9.855814922793583e-06, + "loss": 0.3612, + "step": 140 + }, + { + "epoch": 0.07717569786535304, + "grad_norm": 3.36582350730896, + "learning_rate": 9.853757915394403e-06, + "loss": 0.2024, + "step": 141 + }, + { + "epoch": 0.07772304324028462, + "grad_norm": 0.004160053096711636, + "learning_rate": 9.851686556384182e-06, + "loss": 0.0002, + "step": 142 + }, + { + "epoch": 0.07827038861521621, + "grad_norm": 0.0015179223846644163, + "learning_rate": 9.849600851887528e-06, + "loss": 0.0001, + "step": 143 + }, + { + "epoch": 0.07881773399014778, + "grad_norm": 5.545576095581055, + "learning_rate": 9.847500808071458e-06, + "loss": 0.596, + "step": 144 + }, + { + "epoch": 0.07936507936507936, + "grad_norm": 0.01277522835880518, + "learning_rate": 9.84538643114539e-06, + "loss": 0.0007, + "step": 145 + }, + { + "epoch": 0.07991242474001095, + "grad_norm": 0.0032218769192695618, + "learning_rate": 9.843257727361124e-06, + "loss": 0.0002, + "step": 146 + }, + { + "epoch": 0.08045977011494253, + "grad_norm": 0.009027930907905102, + "learning_rate": 9.841114703012817e-06, + "loss": 0.0005, + "step": 147 + }, + { + "epoch": 0.0810071154898741, + "grad_norm": 3.7516531944274902, + "learning_rate": 9.838957364436973e-06, + "loss": 0.551, + "step": 148 + }, + { + "epoch": 0.0815544608648057, + "grad_norm": 0.01132990699261427, + "learning_rate": 9.836785718012422e-06, + "loss": 0.0005, + "step": 149 + }, + { + "epoch": 0.08210180623973727, + "grad_norm": 0.011869224719703197, + "learning_rate": 9.834599770160296e-06, + "loss": 0.0005, + "step": 150 + }, + { + "epoch": 0.08264915161466886, + "grad_norm": 0.011585843749344349, + "learning_rate": 9.832399527344012e-06, + "loss": 0.0005, + "step": 151 + }, + { + "epoch": 0.08319649698960044, + "grad_norm": 0.10732828080654144, + "learning_rate": 9.830184996069259e-06, + "loss": 0.0033, + "step": 152 + }, + { + "epoch": 0.08374384236453201, + "grad_norm": 4.160403251647949, + "learning_rate": 9.82795618288397e-06, + "loss": 0.4624, + "step": 153 + }, + { + "epoch": 0.0842911877394636, + "grad_norm": 0.5677698254585266, + "learning_rate": 9.82571309437831e-06, + "loss": 0.0172, + "step": 154 + }, + { + "epoch": 0.08483853311439518, + "grad_norm": 0.013016915880143642, + "learning_rate": 9.823455737184655e-06, + "loss": 0.0004, + "step": 155 + }, + { + "epoch": 0.08538587848932677, + "grad_norm": 0.02737693302333355, + "learning_rate": 9.821184117977564e-06, + "loss": 0.0014, + "step": 156 + }, + { + "epoch": 0.08593322386425835, + "grad_norm": 0.002458269475027919, + "learning_rate": 9.81889824347377e-06, + "loss": 0.0001, + "step": 157 + }, + { + "epoch": 0.08648056923918993, + "grad_norm": 0.21147428452968597, + "learning_rate": 9.816598120432159e-06, + "loss": 0.0073, + "step": 158 + }, + { + "epoch": 0.08702791461412152, + "grad_norm": 0.007792654912918806, + "learning_rate": 9.81428375565374e-06, + "loss": 0.0004, + "step": 159 + }, + { + "epoch": 0.08757525998905309, + "grad_norm": 10.54150390625, + "learning_rate": 9.811955155981641e-06, + "loss": 0.621, + "step": 160 + }, + { + "epoch": 0.08812260536398467, + "grad_norm": 0.33467328548431396, + "learning_rate": 9.809612328301071e-06, + "loss": 0.0132, + "step": 161 + }, + { + "epoch": 0.08866995073891626, + "grad_norm": 0.025340264663100243, + "learning_rate": 9.807255279539313e-06, + "loss": 0.0009, + "step": 162 + }, + { + "epoch": 0.08921729611384784, + "grad_norm": 2.977665901184082, + "learning_rate": 9.8048840166657e-06, + "loss": 0.0356, + "step": 163 + }, + { + "epoch": 0.08976464148877943, + "grad_norm": 0.02018612250685692, + "learning_rate": 9.80249854669159e-06, + "loss": 0.001, + "step": 164 + }, + { + "epoch": 0.090311986863711, + "grad_norm": 0.024333668872714043, + "learning_rate": 9.80009887667035e-06, + "loss": 0.0011, + "step": 165 + }, + { + "epoch": 0.09085933223864258, + "grad_norm": 0.02486356534063816, + "learning_rate": 9.797685013697336e-06, + "loss": 0.0008, + "step": 166 + }, + { + "epoch": 0.09140667761357417, + "grad_norm": 0.009108465164899826, + "learning_rate": 9.795256964909868e-06, + "loss": 0.0004, + "step": 167 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 0.004747547209262848, + "learning_rate": 9.792814737487207e-06, + "loss": 0.0002, + "step": 168 + }, + { + "epoch": 0.09250136836343732, + "grad_norm": 2.016352653503418, + "learning_rate": 9.790358338650546e-06, + "loss": 0.1343, + "step": 169 + }, + { + "epoch": 0.09304871373836891, + "grad_norm": 0.003539568977430463, + "learning_rate": 9.787887775662969e-06, + "loss": 0.0002, + "step": 170 + }, + { + "epoch": 0.09359605911330049, + "grad_norm": 0.01274325605481863, + "learning_rate": 9.78540305582945e-06, + "loss": 0.0005, + "step": 171 + }, + { + "epoch": 0.09414340448823208, + "grad_norm": 0.8760842680931091, + "learning_rate": 9.78290418649682e-06, + "loss": 0.0427, + "step": 172 + }, + { + "epoch": 0.09469074986316366, + "grad_norm": 0.008066395297646523, + "learning_rate": 9.780391175053744e-06, + "loss": 0.0003, + "step": 173 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 3.9684042930603027, + "learning_rate": 9.777864028930705e-06, + "loss": 0.5087, + "step": 174 + }, + { + "epoch": 0.09578544061302682, + "grad_norm": 1.021899938583374, + "learning_rate": 9.775322755599979e-06, + "loss": 0.0392, + "step": 175 + }, + { + "epoch": 0.0963327859879584, + "grad_norm": 0.6588534712791443, + "learning_rate": 9.77276736257561e-06, + "loss": 0.0338, + "step": 176 + }, + { + "epoch": 0.09688013136288999, + "grad_norm": 0.017111433669924736, + "learning_rate": 9.7701978574134e-06, + "loss": 0.0005, + "step": 177 + }, + { + "epoch": 0.09742747673782157, + "grad_norm": 0.010617710649967194, + "learning_rate": 9.76761424771087e-06, + "loss": 0.0004, + "step": 178 + }, + { + "epoch": 0.09797482211275314, + "grad_norm": 0.0035194838419556618, + "learning_rate": 9.765016541107247e-06, + "loss": 0.0002, + "step": 179 + }, + { + "epoch": 0.09852216748768473, + "grad_norm": 0.0017975402297452092, + "learning_rate": 9.762404745283439e-06, + "loss": 0.0001, + "step": 180 + }, + { + "epoch": 0.09906951286261631, + "grad_norm": 0.007557627744972706, + "learning_rate": 9.759778867962017e-06, + "loss": 0.0002, + "step": 181 + }, + { + "epoch": 0.09961685823754789, + "grad_norm": 1.118253469467163, + "learning_rate": 9.757138916907184e-06, + "loss": 0.0372, + "step": 182 + }, + { + "epoch": 0.10016420361247948, + "grad_norm": 2.939692258834839, + "learning_rate": 9.754484899924762e-06, + "loss": 0.3909, + "step": 183 + }, + { + "epoch": 0.10071154898741105, + "grad_norm": 0.13486160337924957, + "learning_rate": 9.751816824862152e-06, + "loss": 0.0047, + "step": 184 + }, + { + "epoch": 0.10125889436234264, + "grad_norm": 5.695981025695801, + "learning_rate": 9.749134699608336e-06, + "loss": 0.1484, + "step": 185 + }, + { + "epoch": 0.10180623973727422, + "grad_norm": 0.004681828897446394, + "learning_rate": 9.746438532093827e-06, + "loss": 0.0002, + "step": 186 + }, + { + "epoch": 0.1023535851122058, + "grad_norm": 0.007142484653741121, + "learning_rate": 9.74372833029067e-06, + "loss": 0.0003, + "step": 187 + }, + { + "epoch": 0.10290093048713739, + "grad_norm": 0.016086289659142494, + "learning_rate": 9.741004102212395e-06, + "loss": 0.0008, + "step": 188 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 0.0052831051871180534, + "learning_rate": 9.738265855914014e-06, + "loss": 0.0003, + "step": 189 + }, + { + "epoch": 0.10399562123700054, + "grad_norm": 0.4634156823158264, + "learning_rate": 9.735513599491982e-06, + "loss": 0.0108, + "step": 190 + }, + { + "epoch": 0.10454296661193213, + "grad_norm": 0.004725305829197168, + "learning_rate": 9.732747341084185e-06, + "loss": 0.0002, + "step": 191 + }, + { + "epoch": 0.10509031198686371, + "grad_norm": 0.0022925010416656733, + "learning_rate": 9.729967088869907e-06, + "loss": 0.0001, + "step": 192 + }, + { + "epoch": 0.1056376573617953, + "grad_norm": 0.0022578220814466476, + "learning_rate": 9.727172851069807e-06, + "loss": 0.0001, + "step": 193 + }, + { + "epoch": 0.10618500273672687, + "grad_norm": 0.005604981444776058, + "learning_rate": 9.7243646359459e-06, + "loss": 0.0002, + "step": 194 + }, + { + "epoch": 0.10673234811165845, + "grad_norm": 0.0032916353084146976, + "learning_rate": 9.721542451801526e-06, + "loss": 0.0001, + "step": 195 + }, + { + "epoch": 0.10727969348659004, + "grad_norm": 0.009358298033475876, + "learning_rate": 9.718706306981332e-06, + "loss": 0.0004, + "step": 196 + }, + { + "epoch": 0.10782703886152162, + "grad_norm": 0.0041911546140909195, + "learning_rate": 9.715856209871243e-06, + "loss": 0.0002, + "step": 197 + }, + { + "epoch": 0.10837438423645321, + "grad_norm": 0.003506641834974289, + "learning_rate": 9.712992168898436e-06, + "loss": 0.0002, + "step": 198 + }, + { + "epoch": 0.10892172961138478, + "grad_norm": 4.677643775939941, + "learning_rate": 9.71011419253132e-06, + "loss": 0.6282, + "step": 199 + }, + { + "epoch": 0.10946907498631636, + "grad_norm": 0.02117490954697132, + "learning_rate": 9.707222289279508e-06, + "loss": 0.0008, + "step": 200 + }, + { + "epoch": 0.11001642036124795, + "grad_norm": 0.003004522994160652, + "learning_rate": 9.704316467693789e-06, + "loss": 0.0001, + "step": 201 + }, + { + "epoch": 0.11056376573617953, + "grad_norm": 0.003951009828597307, + "learning_rate": 9.701396736366108e-06, + "loss": 0.0001, + "step": 202 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.005045830737799406, + "learning_rate": 9.698463103929542e-06, + "loss": 0.0003, + "step": 203 + }, + { + "epoch": 0.1116584564860427, + "grad_norm": 0.001341570750810206, + "learning_rate": 9.695515579058265e-06, + "loss": 0.0001, + "step": 204 + }, + { + "epoch": 0.11220580186097427, + "grad_norm": 0.007657123729586601, + "learning_rate": 9.692554170467529e-06, + "loss": 0.0002, + "step": 205 + }, + { + "epoch": 0.11275314723590586, + "grad_norm": 0.10520672798156738, + "learning_rate": 9.689578886913641e-06, + "loss": 0.0032, + "step": 206 + }, + { + "epoch": 0.11330049261083744, + "grad_norm": 0.001422167639248073, + "learning_rate": 9.686589737193929e-06, + "loss": 0.0001, + "step": 207 + }, + { + "epoch": 0.11384783798576902, + "grad_norm": 0.00470214756205678, + "learning_rate": 9.683586730146727e-06, + "loss": 0.0001, + "step": 208 + }, + { + "epoch": 0.1143951833607006, + "grad_norm": 4.35874080657959, + "learning_rate": 9.680569874651336e-06, + "loss": 0.0846, + "step": 209 + }, + { + "epoch": 0.11494252873563218, + "grad_norm": 0.006586556322872639, + "learning_rate": 9.677539179628005e-06, + "loss": 0.0004, + "step": 210 + }, + { + "epoch": 0.11548987411056376, + "grad_norm": 0.004729663487523794, + "learning_rate": 9.674494654037909e-06, + "loss": 0.0001, + "step": 211 + }, + { + "epoch": 0.11603721948549535, + "grad_norm": 0.015061924234032631, + "learning_rate": 9.67143630688311e-06, + "loss": 0.0006, + "step": 212 + }, + { + "epoch": 0.11658456486042693, + "grad_norm": 0.009670287370681763, + "learning_rate": 9.668364147206542e-06, + "loss": 0.0004, + "step": 213 + }, + { + "epoch": 0.11713191023535852, + "grad_norm": 7.630548477172852, + "learning_rate": 9.665278184091981e-06, + "loss": 0.928, + "step": 214 + }, + { + "epoch": 0.11767925561029009, + "grad_norm": 0.011364479549229145, + "learning_rate": 9.662178426664014e-06, + "loss": 0.0004, + "step": 215 + }, + { + "epoch": 0.11822660098522167, + "grad_norm": 0.042831990867853165, + "learning_rate": 9.659064884088017e-06, + "loss": 0.0014, + "step": 216 + }, + { + "epoch": 0.11877394636015326, + "grad_norm": 0.004559504333883524, + "learning_rate": 9.655937565570124e-06, + "loss": 0.0002, + "step": 217 + }, + { + "epoch": 0.11932129173508484, + "grad_norm": 0.0060465335845947266, + "learning_rate": 9.652796480357203e-06, + "loss": 0.0002, + "step": 218 + }, + { + "epoch": 0.11986863711001643, + "grad_norm": 0.002930757123976946, + "learning_rate": 9.649641637736829e-06, + "loss": 0.0002, + "step": 219 + }, + { + "epoch": 0.120415982484948, + "grad_norm": 4.215134143829346, + "learning_rate": 9.646473047037252e-06, + "loss": 0.5299, + "step": 220 + }, + { + "epoch": 0.12096332785987958, + "grad_norm": 7.453273296356201, + "learning_rate": 9.643290717627376e-06, + "loss": 0.6441, + "step": 221 + }, + { + "epoch": 0.12151067323481117, + "grad_norm": 0.01627381704747677, + "learning_rate": 9.640094658916723e-06, + "loss": 0.0009, + "step": 222 + }, + { + "epoch": 0.12205801860974275, + "grad_norm": 0.003751779207959771, + "learning_rate": 9.636884880355412e-06, + "loss": 0.0001, + "step": 223 + }, + { + "epoch": 0.12260536398467432, + "grad_norm": 0.0018137397710233927, + "learning_rate": 9.63366139143413e-06, + "loss": 0.0001, + "step": 224 + }, + { + "epoch": 0.12315270935960591, + "grad_norm": 0.9278050661087036, + "learning_rate": 9.630424201684105e-06, + "loss": 0.0315, + "step": 225 + }, + { + "epoch": 0.12370005473453749, + "grad_norm": 0.004502156283706427, + "learning_rate": 9.62717332067707e-06, + "loss": 0.0002, + "step": 226 + }, + { + "epoch": 0.12424740010946908, + "grad_norm": 0.047333408147096634, + "learning_rate": 9.623908758025243e-06, + "loss": 0.0007, + "step": 227 + }, + { + "epoch": 0.12479474548440066, + "grad_norm": 0.08214745670557022, + "learning_rate": 9.620630523381295e-06, + "loss": 0.0029, + "step": 228 + }, + { + "epoch": 0.12534209085933223, + "grad_norm": 0.00138851348310709, + "learning_rate": 9.617338626438326e-06, + "loss": 0.0001, + "step": 229 + }, + { + "epoch": 0.12588943623426382, + "grad_norm": 4.332123279571533, + "learning_rate": 9.61403307692983e-06, + "loss": 0.772, + "step": 230 + }, + { + "epoch": 0.12643678160919541, + "grad_norm": 6.4362874031066895, + "learning_rate": 9.610713884629667e-06, + "loss": 0.9625, + "step": 231 + }, + { + "epoch": 0.12698412698412698, + "grad_norm": 0.004271378740668297, + "learning_rate": 9.60738105935204e-06, + "loss": 0.0002, + "step": 232 + }, + { + "epoch": 0.12753147235905857, + "grad_norm": 0.005139256827533245, + "learning_rate": 9.604034610951458e-06, + "loss": 0.0002, + "step": 233 + }, + { + "epoch": 0.12807881773399016, + "grad_norm": 0.005095008760690689, + "learning_rate": 9.600674549322716e-06, + "loss": 0.0002, + "step": 234 + }, + { + "epoch": 0.12862616310892172, + "grad_norm": 5.136434078216553, + "learning_rate": 9.597300884400858e-06, + "loss": 0.943, + "step": 235 + }, + { + "epoch": 0.1291735084838533, + "grad_norm": 0.04719545692205429, + "learning_rate": 9.593913626161148e-06, + "loss": 0.002, + "step": 236 + }, + { + "epoch": 0.1297208538587849, + "grad_norm": 0.013157535344362259, + "learning_rate": 9.590512784619045e-06, + "loss": 0.0004, + "step": 237 + }, + { + "epoch": 0.13026819923371646, + "grad_norm": 4.134466648101807, + "learning_rate": 9.587098369830171e-06, + "loss": 0.9442, + "step": 238 + }, + { + "epoch": 0.13081554460864805, + "grad_norm": 0.0218354444950819, + "learning_rate": 9.583670391890285e-06, + "loss": 0.0013, + "step": 239 + }, + { + "epoch": 0.13136288998357964, + "grad_norm": 0.025551216676831245, + "learning_rate": 9.580228860935242e-06, + "loss": 0.001, + "step": 240 + }, + { + "epoch": 0.1319102353585112, + "grad_norm": 0.9895619750022888, + "learning_rate": 9.576773787140974e-06, + "loss": 0.0341, + "step": 241 + }, + { + "epoch": 0.1324575807334428, + "grad_norm": 4.647812366485596, + "learning_rate": 9.57330518072346e-06, + "loss": 0.2535, + "step": 242 + }, + { + "epoch": 0.1330049261083744, + "grad_norm": 0.027296705171465874, + "learning_rate": 9.569823051938689e-06, + "loss": 0.0012, + "step": 243 + }, + { + "epoch": 0.13355227148330598, + "grad_norm": 0.030924847349524498, + "learning_rate": 9.566327411082634e-06, + "loss": 0.001, + "step": 244 + }, + { + "epoch": 0.13409961685823754, + "grad_norm": 0.00540255568921566, + "learning_rate": 9.562818268491216e-06, + "loss": 0.0003, + "step": 245 + }, + { + "epoch": 0.13464696223316913, + "grad_norm": 0.014651118777692318, + "learning_rate": 9.559295634540287e-06, + "loss": 0.0006, + "step": 246 + }, + { + "epoch": 0.13519430760810072, + "grad_norm": 0.0771181657910347, + "learning_rate": 9.555759519645584e-06, + "loss": 0.0025, + "step": 247 + }, + { + "epoch": 0.13574165298303228, + "grad_norm": 0.036993276327848434, + "learning_rate": 9.552209934262703e-06, + "loss": 0.0015, + "step": 248 + }, + { + "epoch": 0.13628899835796388, + "grad_norm": 0.014038922265172005, + "learning_rate": 9.548646888887076e-06, + "loss": 0.0007, + "step": 249 + }, + { + "epoch": 0.13683634373289547, + "grad_norm": 0.045163851231336594, + "learning_rate": 9.54507039405393e-06, + "loss": 0.0018, + "step": 250 + }, + { + "epoch": 0.13738368910782703, + "grad_norm": 4.474141597747803, + "learning_rate": 9.541480460338255e-06, + "loss": 0.7153, + "step": 251 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.494176149368286, + "learning_rate": 9.537877098354787e-06, + "loss": 0.1233, + "step": 252 + }, + { + "epoch": 0.1384783798576902, + "grad_norm": 0.03589564189314842, + "learning_rate": 9.534260318757956e-06, + "loss": 0.0011, + "step": 253 + }, + { + "epoch": 0.13902572523262177, + "grad_norm": 0.1724754422903061, + "learning_rate": 9.530630132241876e-06, + "loss": 0.0075, + "step": 254 + }, + { + "epoch": 0.13957307060755336, + "grad_norm": 0.8134304881095886, + "learning_rate": 9.526986549540292e-06, + "loss": 0.1045, + "step": 255 + }, + { + "epoch": 0.14012041598248495, + "grad_norm": 0.19864898920059204, + "learning_rate": 9.523329581426568e-06, + "loss": 0.0081, + "step": 256 + }, + { + "epoch": 0.14066776135741654, + "grad_norm": 0.04995502904057503, + "learning_rate": 9.519659238713642e-06, + "loss": 0.0017, + "step": 257 + }, + { + "epoch": 0.1412151067323481, + "grad_norm": 1.2576309442520142, + "learning_rate": 9.515975532253994e-06, + "loss": 0.2187, + "step": 258 + }, + { + "epoch": 0.1417624521072797, + "grad_norm": 0.04681723564863205, + "learning_rate": 9.512278472939627e-06, + "loss": 0.0018, + "step": 259 + }, + { + "epoch": 0.1423097974822113, + "grad_norm": 0.8970088958740234, + "learning_rate": 9.508568071702016e-06, + "loss": 0.1183, + "step": 260 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.037138741463422775, + "learning_rate": 9.504844339512096e-06, + "loss": 0.0013, + "step": 261 + }, + { + "epoch": 0.14340448823207444, + "grad_norm": 0.30721408128738403, + "learning_rate": 9.50110728738021e-06, + "loss": 0.0112, + "step": 262 + }, + { + "epoch": 0.14395183360700603, + "grad_norm": 0.038838911801576614, + "learning_rate": 9.49735692635609e-06, + "loss": 0.0013, + "step": 263 + }, + { + "epoch": 0.1444991789819376, + "grad_norm": 8.811029434204102, + "learning_rate": 9.493593267528818e-06, + "loss": 1.0208, + "step": 264 + }, + { + "epoch": 0.14504652435686918, + "grad_norm": 0.03491875156760216, + "learning_rate": 9.489816322026796e-06, + "loss": 0.0013, + "step": 265 + }, + { + "epoch": 0.14559386973180077, + "grad_norm": 0.3128603398799896, + "learning_rate": 9.486026101017711e-06, + "loss": 0.01, + "step": 266 + }, + { + "epoch": 0.14614121510673234, + "grad_norm": 0.012525409460067749, + "learning_rate": 9.482222615708506e-06, + "loss": 0.0006, + "step": 267 + }, + { + "epoch": 0.14668856048166393, + "grad_norm": 3.2567107677459717, + "learning_rate": 9.478405877345339e-06, + "loss": 0.2549, + "step": 268 + }, + { + "epoch": 0.14723590585659552, + "grad_norm": 0.11480681598186493, + "learning_rate": 9.474575897213558e-06, + "loss": 0.0059, + "step": 269 + }, + { + "epoch": 0.1477832512315271, + "grad_norm": 0.0677303895354271, + "learning_rate": 9.470732686637665e-06, + "loss": 0.0028, + "step": 270 + }, + { + "epoch": 0.14833059660645867, + "grad_norm": 0.010763351805508137, + "learning_rate": 9.466876256981279e-06, + "loss": 0.0004, + "step": 271 + }, + { + "epoch": 0.14887794198139026, + "grad_norm": 0.17489972710609436, + "learning_rate": 9.463006619647109e-06, + "loss": 0.0061, + "step": 272 + }, + { + "epoch": 0.14942528735632185, + "grad_norm": 0.33539700508117676, + "learning_rate": 9.459123786076911e-06, + "loss": 0.0116, + "step": 273 + }, + { + "epoch": 0.1499726327312534, + "grad_norm": 0.004740248434245586, + "learning_rate": 9.455227767751467e-06, + "loss": 0.0002, + "step": 274 + }, + { + "epoch": 0.150519978106185, + "grad_norm": 0.0359928198158741, + "learning_rate": 9.451318576190538e-06, + "loss": 0.0015, + "step": 275 + }, + { + "epoch": 0.1510673234811166, + "grad_norm": 0.03405028209090233, + "learning_rate": 9.447396222952837e-06, + "loss": 0.0012, + "step": 276 + }, + { + "epoch": 0.15161466885604816, + "grad_norm": 0.010478261858224869, + "learning_rate": 9.443460719635993e-06, + "loss": 0.0004, + "step": 277 + }, + { + "epoch": 0.15216201423097975, + "grad_norm": 0.10749869793653488, + "learning_rate": 9.43951207787652e-06, + "loss": 0.0048, + "step": 278 + }, + { + "epoch": 0.15270935960591134, + "grad_norm": 0.021480495110154152, + "learning_rate": 9.435550309349776e-06, + "loss": 0.001, + "step": 279 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.012531301937997341, + "learning_rate": 9.431575425769938e-06, + "loss": 0.0005, + "step": 280 + }, + { + "epoch": 0.1538040503557745, + "grad_norm": 3.6380410194396973, + "learning_rate": 9.427587438889954e-06, + "loss": 0.4189, + "step": 281 + }, + { + "epoch": 0.15435139573070608, + "grad_norm": 3.237821340560913, + "learning_rate": 9.423586360501521e-06, + "loss": 0.2345, + "step": 282 + }, + { + "epoch": 0.15489874110563764, + "grad_norm": 0.009717877954244614, + "learning_rate": 9.419572202435044e-06, + "loss": 0.0004, + "step": 283 + }, + { + "epoch": 0.15544608648056923, + "grad_norm": 0.018061315640807152, + "learning_rate": 9.415544976559601e-06, + "loss": 0.0007, + "step": 284 + }, + { + "epoch": 0.15599343185550082, + "grad_norm": 3.929448366165161, + "learning_rate": 9.411504694782909e-06, + "loss": 0.5633, + "step": 285 + }, + { + "epoch": 0.15654077723043242, + "grad_norm": 0.016803627833724022, + "learning_rate": 9.407451369051293e-06, + "loss": 0.0006, + "step": 286 + }, + { + "epoch": 0.15708812260536398, + "grad_norm": 0.011305440217256546, + "learning_rate": 9.40338501134964e-06, + "loss": 0.0005, + "step": 287 + }, + { + "epoch": 0.15763546798029557, + "grad_norm": 0.8303712606430054, + "learning_rate": 9.399305633701372e-06, + "loss": 0.0285, + "step": 288 + }, + { + "epoch": 0.15818281335522716, + "grad_norm": 0.009512335993349552, + "learning_rate": 9.395213248168414e-06, + "loss": 0.0004, + "step": 289 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 0.004181810654699802, + "learning_rate": 9.391107866851143e-06, + "loss": 0.0002, + "step": 290 + }, + { + "epoch": 0.1592775041050903, + "grad_norm": 0.12350267171859741, + "learning_rate": 9.38698950188837e-06, + "loss": 0.004, + "step": 291 + }, + { + "epoch": 0.1598248494800219, + "grad_norm": 0.04803336039185524, + "learning_rate": 9.382858165457291e-06, + "loss": 0.0014, + "step": 292 + }, + { + "epoch": 0.16037219485495346, + "grad_norm": 2.1996352672576904, + "learning_rate": 9.378713869773462e-06, + "loss": 0.1333, + "step": 293 + }, + { + "epoch": 0.16091954022988506, + "grad_norm": 0.014819197356700897, + "learning_rate": 9.374556627090749e-06, + "loss": 0.0008, + "step": 294 + }, + { + "epoch": 0.16146688560481665, + "grad_norm": 0.012402137741446495, + "learning_rate": 9.370386449701306e-06, + "loss": 0.0006, + "step": 295 + }, + { + "epoch": 0.1620142309797482, + "grad_norm": 0.05959853157401085, + "learning_rate": 9.366203349935531e-06, + "loss": 0.0025, + "step": 296 + }, + { + "epoch": 0.1625615763546798, + "grad_norm": 0.006656737066805363, + "learning_rate": 9.36200734016203e-06, + "loss": 0.0003, + "step": 297 + }, + { + "epoch": 0.1631089217296114, + "grad_norm": 0.019398879259824753, + "learning_rate": 9.35779843278758e-06, + "loss": 0.0005, + "step": 298 + }, + { + "epoch": 0.16365626710454298, + "grad_norm": 0.022218007594347, + "learning_rate": 9.353576640257096e-06, + "loss": 0.0008, + "step": 299 + }, + { + "epoch": 0.16420361247947454, + "grad_norm": 0.023649122565984726, + "learning_rate": 9.349341975053593e-06, + "loss": 0.001, + "step": 300 + }, + { + "epoch": 0.16475095785440613, + "grad_norm": 1.304732084274292, + "learning_rate": 9.345094449698143e-06, + "loss": 0.0465, + "step": 301 + }, + { + "epoch": 0.16529830322933772, + "grad_norm": 0.006432794965803623, + "learning_rate": 9.34083407674985e-06, + "loss": 0.0003, + "step": 302 + }, + { + "epoch": 0.16584564860426929, + "grad_norm": 0.4880763590335846, + "learning_rate": 9.336560868805799e-06, + "loss": 0.0188, + "step": 303 + }, + { + "epoch": 0.16639299397920088, + "grad_norm": 0.007425636053085327, + "learning_rate": 9.33227483850103e-06, + "loss": 0.0002, + "step": 304 + }, + { + "epoch": 0.16694033935413247, + "grad_norm": 0.057750653475522995, + "learning_rate": 9.327975998508496e-06, + "loss": 0.0008, + "step": 305 + }, + { + "epoch": 0.16748768472906403, + "grad_norm": 0.035874757915735245, + "learning_rate": 9.32366436153902e-06, + "loss": 0.0017, + "step": 306 + }, + { + "epoch": 0.16803503010399562, + "grad_norm": 0.006478885188698769, + "learning_rate": 9.319339940341272e-06, + "loss": 0.0003, + "step": 307 + }, + { + "epoch": 0.1685823754789272, + "grad_norm": 0.05246639624238014, + "learning_rate": 9.315002747701716e-06, + "loss": 0.0021, + "step": 308 + }, + { + "epoch": 0.16912972085385877, + "grad_norm": 0.019346576184034348, + "learning_rate": 9.310652796444581e-06, + "loss": 0.001, + "step": 309 + }, + { + "epoch": 0.16967706622879036, + "grad_norm": 0.0032381361816078424, + "learning_rate": 9.306290099431822e-06, + "loss": 0.0001, + "step": 310 + }, + { + "epoch": 0.17022441160372195, + "grad_norm": 0.002722548320889473, + "learning_rate": 9.301914669563077e-06, + "loss": 0.0002, + "step": 311 + }, + { + "epoch": 0.17077175697865354, + "grad_norm": 0.13813874125480652, + "learning_rate": 9.297526519775637e-06, + "loss": 0.0055, + "step": 312 + }, + { + "epoch": 0.1713191023535851, + "grad_norm": 0.014929019846022129, + "learning_rate": 9.293125663044399e-06, + "loss": 0.0002, + "step": 313 + }, + { + "epoch": 0.1718664477285167, + "grad_norm": 0.016266122460365295, + "learning_rate": 9.288712112381834e-06, + "loss": 0.0008, + "step": 314 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 0.002234517829492688, + "learning_rate": 9.284285880837947e-06, + "loss": 0.0001, + "step": 315 + }, + { + "epoch": 0.17296113847837985, + "grad_norm": 0.054850004613399506, + "learning_rate": 9.279846981500237e-06, + "loss": 0.0029, + "step": 316 + }, + { + "epoch": 0.17350848385331144, + "grad_norm": 0.0061068604700267315, + "learning_rate": 9.275395427493662e-06, + "loss": 0.0003, + "step": 317 + }, + { + "epoch": 0.17405582922824303, + "grad_norm": 0.005567502696067095, + "learning_rate": 9.27093123198059e-06, + "loss": 0.0002, + "step": 318 + }, + { + "epoch": 0.1746031746031746, + "grad_norm": 2.0154500007629395, + "learning_rate": 9.266454408160779e-06, + "loss": 0.2103, + "step": 319 + }, + { + "epoch": 0.17515051997810618, + "grad_norm": 0.006063805893063545, + "learning_rate": 9.261964969271315e-06, + "loss": 0.0002, + "step": 320 + }, + { + "epoch": 0.17569786535303777, + "grad_norm": 0.007771521806716919, + "learning_rate": 9.257462928586589e-06, + "loss": 0.0004, + "step": 321 + }, + { + "epoch": 0.17624521072796934, + "grad_norm": 0.0036675152368843555, + "learning_rate": 9.252948299418255e-06, + "loss": 0.0001, + "step": 322 + }, + { + "epoch": 0.17679255610290093, + "grad_norm": 0.003957046661525965, + "learning_rate": 9.248421095115185e-06, + "loss": 0.0002, + "step": 323 + }, + { + "epoch": 0.17733990147783252, + "grad_norm": 0.008962135761976242, + "learning_rate": 9.243881329063436e-06, + "loss": 0.0003, + "step": 324 + }, + { + "epoch": 0.17788724685276408, + "grad_norm": 0.4392240047454834, + "learning_rate": 9.239329014686207e-06, + "loss": 0.0103, + "step": 325 + }, + { + "epoch": 0.17843459222769567, + "grad_norm": 0.003559105796739459, + "learning_rate": 9.2347641654438e-06, + "loss": 0.0001, + "step": 326 + }, + { + "epoch": 0.17898193760262726, + "grad_norm": 0.009468616917729378, + "learning_rate": 9.230186794833578e-06, + "loss": 0.0005, + "step": 327 + }, + { + "epoch": 0.17952928297755885, + "grad_norm": 0.025194939225912094, + "learning_rate": 9.225596916389929e-06, + "loss": 0.0012, + "step": 328 + }, + { + "epoch": 0.18007662835249041, + "grad_norm": 0.03295588493347168, + "learning_rate": 9.220994543684225e-06, + "loss": 0.0013, + "step": 329 + }, + { + "epoch": 0.180623973727422, + "grad_norm": 0.025521008297801018, + "learning_rate": 9.216379690324782e-06, + "loss": 0.0012, + "step": 330 + }, + { + "epoch": 0.1811713191023536, + "grad_norm": 0.004285611677914858, + "learning_rate": 9.211752369956814e-06, + "loss": 0.0002, + "step": 331 + }, + { + "epoch": 0.18171866447728516, + "grad_norm": 0.010934761725366116, + "learning_rate": 9.207112596262404e-06, + "loss": 0.0005, + "step": 332 + }, + { + "epoch": 0.18226600985221675, + "grad_norm": 4.3676371574401855, + "learning_rate": 9.202460382960449e-06, + "loss": 1.2057, + "step": 333 + }, + { + "epoch": 0.18281335522714834, + "grad_norm": 0.023604271933436394, + "learning_rate": 9.197795743806634e-06, + "loss": 0.0009, + "step": 334 + }, + { + "epoch": 0.1833607006020799, + "grad_norm": 0.09049314260482788, + "learning_rate": 9.193118692593385e-06, + "loss": 0.0027, + "step": 335 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 0.05073768272995949, + "learning_rate": 9.188429243149824e-06, + "loss": 0.002, + "step": 336 + }, + { + "epoch": 0.18445539135194308, + "grad_norm": 0.010885998606681824, + "learning_rate": 9.183727409341737e-06, + "loss": 0.0002, + "step": 337 + }, + { + "epoch": 0.18500273672687464, + "grad_norm": 0.0029311482794582844, + "learning_rate": 9.179013205071518e-06, + "loss": 0.0001, + "step": 338 + }, + { + "epoch": 0.18555008210180624, + "grad_norm": 4.0211181640625, + "learning_rate": 9.174286644278154e-06, + "loss": 0.849, + "step": 339 + }, + { + "epoch": 0.18609742747673783, + "grad_norm": 0.001774416770786047, + "learning_rate": 9.169547740937152e-06, + "loss": 0.0001, + "step": 340 + }, + { + "epoch": 0.18664477285166942, + "grad_norm": 0.0026573315262794495, + "learning_rate": 9.164796509060526e-06, + "loss": 0.0001, + "step": 341 + }, + { + "epoch": 0.18719211822660098, + "grad_norm": 5.1158905029296875, + "learning_rate": 9.160032962696734e-06, + "loss": 0.8304, + "step": 342 + }, + { + "epoch": 0.18773946360153257, + "grad_norm": 1.3284435272216797, + "learning_rate": 9.155257115930651e-06, + "loss": 0.0391, + "step": 343 + }, + { + "epoch": 0.18828680897646416, + "grad_norm": 0.004486436489969492, + "learning_rate": 9.15046898288352e-06, + "loss": 0.0002, + "step": 344 + }, + { + "epoch": 0.18883415435139572, + "grad_norm": 3.2804343700408936, + "learning_rate": 9.145668577712911e-06, + "loss": 0.2478, + "step": 345 + }, + { + "epoch": 0.1893814997263273, + "grad_norm": 0.017663557082414627, + "learning_rate": 9.140855914612683e-06, + "loss": 0.0008, + "step": 346 + }, + { + "epoch": 0.1899288451012589, + "grad_norm": 0.003916335292160511, + "learning_rate": 9.136031007812937e-06, + "loss": 0.0002, + "step": 347 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.0020537979435175657, + "learning_rate": 9.131193871579975e-06, + "loss": 0.0001, + "step": 348 + }, + { + "epoch": 0.19102353585112206, + "grad_norm": 0.01471023727208376, + "learning_rate": 9.126344520216264e-06, + "loss": 0.0007, + "step": 349 + }, + { + "epoch": 0.19157088122605365, + "grad_norm": 0.00609373115003109, + "learning_rate": 9.121482968060384e-06, + "loss": 0.0002, + "step": 350 + }, + { + "epoch": 0.1921182266009852, + "grad_norm": 0.004218249581754208, + "learning_rate": 9.116609229486992e-06, + "loss": 0.0002, + "step": 351 + }, + { + "epoch": 0.1926655719759168, + "grad_norm": 0.17256547510623932, + "learning_rate": 9.11172331890678e-06, + "loss": 0.0087, + "step": 352 + }, + { + "epoch": 0.1932129173508484, + "grad_norm": 0.33836865425109863, + "learning_rate": 9.106825250766424e-06, + "loss": 0.0185, + "step": 353 + }, + { + "epoch": 0.19376026272577998, + "grad_norm": 0.016286412253975868, + "learning_rate": 9.101915039548557e-06, + "loss": 0.0008, + "step": 354 + }, + { + "epoch": 0.19430760810071154, + "grad_norm": 2.213080883026123, + "learning_rate": 9.096992699771707e-06, + "loss": 0.3761, + "step": 355 + }, + { + "epoch": 0.19485495347564313, + "grad_norm": 0.03972277790307999, + "learning_rate": 9.092058245990271e-06, + "loss": 0.0017, + "step": 356 + }, + { + "epoch": 0.19540229885057472, + "grad_norm": 0.028462251648306847, + "learning_rate": 9.08711169279446e-06, + "loss": 0.0013, + "step": 357 + }, + { + "epoch": 0.1959496442255063, + "grad_norm": 0.04045112803578377, + "learning_rate": 9.082153054810263e-06, + "loss": 0.0016, + "step": 358 + }, + { + "epoch": 0.19649698960043788, + "grad_norm": 3.064483642578125, + "learning_rate": 9.077182346699402e-06, + "loss": 0.1838, + "step": 359 + }, + { + "epoch": 0.19704433497536947, + "grad_norm": 0.020319310948252678, + "learning_rate": 9.072199583159285e-06, + "loss": 0.0008, + "step": 360 + }, + { + "epoch": 0.19759168035030103, + "grad_norm": 0.004529138095676899, + "learning_rate": 9.067204778922968e-06, + "loss": 0.0002, + "step": 361 + }, + { + "epoch": 0.19813902572523262, + "grad_norm": 0.010784142650663853, + "learning_rate": 9.062197948759112e-06, + "loss": 0.0004, + "step": 362 + }, + { + "epoch": 0.1986863711001642, + "grad_norm": 0.02503327652812004, + "learning_rate": 9.057179107471926e-06, + "loss": 0.001, + "step": 363 + }, + { + "epoch": 0.19923371647509577, + "grad_norm": 0.024941373616456985, + "learning_rate": 9.052148269901145e-06, + "loss": 0.0011, + "step": 364 + }, + { + "epoch": 0.19978106185002736, + "grad_norm": 2.407067060470581, + "learning_rate": 9.047105450921968e-06, + "loss": 0.3134, + "step": 365 + }, + { + "epoch": 0.20032840722495895, + "grad_norm": 3.505232810974121, + "learning_rate": 9.042050665445024e-06, + "loss": 0.1596, + "step": 366 + }, + { + "epoch": 0.20087575259989054, + "grad_norm": 0.01830684021115303, + "learning_rate": 9.03698392841632e-06, + "loss": 0.0009, + "step": 367 + }, + { + "epoch": 0.2014230979748221, + "grad_norm": 0.03134550899267197, + "learning_rate": 9.031905254817209e-06, + "loss": 0.0011, + "step": 368 + }, + { + "epoch": 0.2019704433497537, + "grad_norm": 0.03162311762571335, + "learning_rate": 9.026814659664331e-06, + "loss": 0.0013, + "step": 369 + }, + { + "epoch": 0.2025177887246853, + "grad_norm": 0.05484980717301369, + "learning_rate": 9.021712158009578e-06, + "loss": 0.0025, + "step": 370 + }, + { + "epoch": 0.20306513409961685, + "grad_norm": 0.14212408661842346, + "learning_rate": 9.01659776494005e-06, + "loss": 0.0053, + "step": 371 + }, + { + "epoch": 0.20361247947454844, + "grad_norm": 9.681675910949707, + "learning_rate": 9.011471495578e-06, + "loss": 0.4362, + "step": 372 + }, + { + "epoch": 0.20415982484948003, + "grad_norm": 0.023493144661188126, + "learning_rate": 9.006333365080808e-06, + "loss": 0.0013, + "step": 373 + }, + { + "epoch": 0.2047071702244116, + "grad_norm": 0.14610296487808228, + "learning_rate": 9.001183388640915e-06, + "loss": 0.004, + "step": 374 + }, + { + "epoch": 0.20525451559934318, + "grad_norm": 0.017246192321181297, + "learning_rate": 8.996021581485795e-06, + "loss": 0.0006, + "step": 375 + }, + { + "epoch": 0.20580186097427478, + "grad_norm": 0.023940537124872208, + "learning_rate": 8.990847958877897e-06, + "loss": 0.0009, + "step": 376 + }, + { + "epoch": 0.20634920634920634, + "grad_norm": 0.1029309406876564, + "learning_rate": 8.985662536114614e-06, + "loss": 0.0049, + "step": 377 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 0.00324469106271863, + "learning_rate": 8.98046532852822e-06, + "loss": 0.0001, + "step": 378 + }, + { + "epoch": 0.20744389709906952, + "grad_norm": 0.04180926829576492, + "learning_rate": 8.975256351485842e-06, + "loss": 0.0015, + "step": 379 + }, + { + "epoch": 0.20799124247400108, + "grad_norm": 1.3598195314407349, + "learning_rate": 8.970035620389404e-06, + "loss": 0.083, + "step": 380 + }, + { + "epoch": 0.20853858784893267, + "grad_norm": 0.015766430646181107, + "learning_rate": 8.964803150675583e-06, + "loss": 0.0006, + "step": 381 + }, + { + "epoch": 0.20908593322386426, + "grad_norm": 0.024671118706464767, + "learning_rate": 8.95955895781577e-06, + "loss": 0.0008, + "step": 382 + }, + { + "epoch": 0.20963327859879585, + "grad_norm": 0.07474718987941742, + "learning_rate": 8.954303057316014e-06, + "loss": 0.004, + "step": 383 + }, + { + "epoch": 0.21018062397372742, + "grad_norm": 0.26028382778167725, + "learning_rate": 8.949035464716984e-06, + "loss": 0.011, + "step": 384 + }, + { + "epoch": 0.210727969348659, + "grad_norm": 0.024138959124684334, + "learning_rate": 8.943756195593916e-06, + "loss": 0.001, + "step": 385 + }, + { + "epoch": 0.2112753147235906, + "grad_norm": 0.01153239980340004, + "learning_rate": 8.938465265556576e-06, + "loss": 0.0004, + "step": 386 + }, + { + "epoch": 0.21182266009852216, + "grad_norm": 0.006579091772437096, + "learning_rate": 8.93316269024921e-06, + "loss": 0.0003, + "step": 387 + }, + { + "epoch": 0.21237000547345375, + "grad_norm": 0.023460250347852707, + "learning_rate": 8.92784848535049e-06, + "loss": 0.0009, + "step": 388 + }, + { + "epoch": 0.21291735084838534, + "grad_norm": 3.0115833282470703, + "learning_rate": 8.92252266657348e-06, + "loss": 0.4978, + "step": 389 + }, + { + "epoch": 0.2134646962233169, + "grad_norm": 1.2171604633331299, + "learning_rate": 8.917185249665583e-06, + "loss": 0.0114, + "step": 390 + }, + { + "epoch": 0.2140120415982485, + "grad_norm": 0.004426921717822552, + "learning_rate": 8.911836250408494e-06, + "loss": 0.0002, + "step": 391 + }, + { + "epoch": 0.21455938697318008, + "grad_norm": 0.022343961521983147, + "learning_rate": 8.90647568461816e-06, + "loss": 0.0009, + "step": 392 + }, + { + "epoch": 0.21510673234811165, + "grad_norm": 0.03706027567386627, + "learning_rate": 8.901103568144715e-06, + "loss": 0.0013, + "step": 393 + }, + { + "epoch": 0.21565407772304324, + "grad_norm": 0.10373899340629578, + "learning_rate": 8.895719916872463e-06, + "loss": 0.0031, + "step": 394 + }, + { + "epoch": 0.21620142309797483, + "grad_norm": 3.4080610275268555, + "learning_rate": 8.8903247467198e-06, + "loss": 0.5519, + "step": 395 + }, + { + "epoch": 0.21674876847290642, + "grad_norm": 0.0201762355864048, + "learning_rate": 8.88491807363919e-06, + "loss": 0.0007, + "step": 396 + }, + { + "epoch": 0.21729611384783798, + "grad_norm": 0.04070815071463585, + "learning_rate": 8.879499913617107e-06, + "loss": 0.0017, + "step": 397 + }, + { + "epoch": 0.21784345922276957, + "grad_norm": 0.9010310173034668, + "learning_rate": 8.874070282673985e-06, + "loss": 0.0271, + "step": 398 + }, + { + "epoch": 0.21839080459770116, + "grad_norm": 0.16603036224842072, + "learning_rate": 8.868629196864182e-06, + "loss": 0.0057, + "step": 399 + }, + { + "epoch": 0.21893814997263272, + "grad_norm": 0.010245956480503082, + "learning_rate": 8.863176672275921e-06, + "loss": 0.0005, + "step": 400 + }, + { + "epoch": 0.2194854953475643, + "grad_norm": 0.02312450110912323, + "learning_rate": 8.857712725031247e-06, + "loss": 0.0011, + "step": 401 + }, + { + "epoch": 0.2200328407224959, + "grad_norm": 5.873071193695068, + "learning_rate": 8.852237371285984e-06, + "loss": 0.2469, + "step": 402 + }, + { + "epoch": 0.22058018609742747, + "grad_norm": 0.7911509871482849, + "learning_rate": 8.84675062722968e-06, + "loss": 0.0259, + "step": 403 + }, + { + "epoch": 0.22112753147235906, + "grad_norm": 0.06114649772644043, + "learning_rate": 8.841252509085561e-06, + "loss": 0.0026, + "step": 404 + }, + { + "epoch": 0.22167487684729065, + "grad_norm": 0.01913605071604252, + "learning_rate": 8.835743033110482e-06, + "loss": 0.0007, + "step": 405 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.3251266479492188, + "learning_rate": 8.83022221559489e-06, + "loss": 0.309, + "step": 406 + }, + { + "epoch": 0.2227695675971538, + "grad_norm": 0.04632899910211563, + "learning_rate": 8.824690072862758e-06, + "loss": 0.002, + "step": 407 + }, + { + "epoch": 0.2233169129720854, + "grad_norm": 0.23529572784900665, + "learning_rate": 8.819146621271546e-06, + "loss": 0.0107, + "step": 408 + }, + { + "epoch": 0.22386425834701698, + "grad_norm": 0.005736156366765499, + "learning_rate": 8.813591877212157e-06, + "loss": 0.0003, + "step": 409 + }, + { + "epoch": 0.22441160372194854, + "grad_norm": 1.7037173509597778, + "learning_rate": 8.80802585710888e-06, + "loss": 0.0472, + "step": 410 + }, + { + "epoch": 0.22495894909688013, + "grad_norm": 0.1383262276649475, + "learning_rate": 8.802448577419343e-06, + "loss": 0.0057, + "step": 411 + }, + { + "epoch": 0.22550629447181172, + "grad_norm": 0.003472214797511697, + "learning_rate": 8.796860054634471e-06, + "loss": 0.0002, + "step": 412 + }, + { + "epoch": 0.2260536398467433, + "grad_norm": 0.006429862696677446, + "learning_rate": 8.791260305278434e-06, + "loss": 0.0003, + "step": 413 + }, + { + "epoch": 0.22660098522167488, + "grad_norm": 0.2246072143316269, + "learning_rate": 8.78564934590859e-06, + "loss": 0.0117, + "step": 414 + }, + { + "epoch": 0.22714833059660647, + "grad_norm": 0.005228335503488779, + "learning_rate": 8.780027193115444e-06, + "loss": 0.0002, + "step": 415 + }, + { + "epoch": 0.22769567597153803, + "grad_norm": 0.6419472098350525, + "learning_rate": 8.774393863522606e-06, + "loss": 0.0406, + "step": 416 + }, + { + "epoch": 0.22824302134646962, + "grad_norm": 0.9705145955085754, + "learning_rate": 8.768749373786722e-06, + "loss": 0.0587, + "step": 417 + }, + { + "epoch": 0.2287903667214012, + "grad_norm": 0.0039025377482175827, + "learning_rate": 8.763093740597447e-06, + "loss": 0.0002, + "step": 418 + }, + { + "epoch": 0.22933771209633277, + "grad_norm": 0.08845080435276031, + "learning_rate": 8.757426980677377e-06, + "loss": 0.0032, + "step": 419 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.005025999154895544, + "learning_rate": 8.751749110782013e-06, + "loss": 0.0002, + "step": 420 + }, + { + "epoch": 0.23043240284619596, + "grad_norm": 0.0018684992101043463, + "learning_rate": 8.746060147699701e-06, + "loss": 0.0001, + "step": 421 + }, + { + "epoch": 0.23097974822112752, + "grad_norm": 0.0019199148518964648, + "learning_rate": 8.740360108251592e-06, + "loss": 0.0001, + "step": 422 + }, + { + "epoch": 0.2315270935960591, + "grad_norm": 13.037283897399902, + "learning_rate": 8.734649009291586e-06, + "loss": 1.4691, + "step": 423 + }, + { + "epoch": 0.2320744389709907, + "grad_norm": 0.003358412766829133, + "learning_rate": 8.72892686770628e-06, + "loss": 0.0002, + "step": 424 + }, + { + "epoch": 0.2326217843459223, + "grad_norm": 0.014812062494456768, + "learning_rate": 8.72319370041493e-06, + "loss": 0.0007, + "step": 425 + }, + { + "epoch": 0.23316912972085385, + "grad_norm": 0.0031867721118032932, + "learning_rate": 8.717449524369386e-06, + "loss": 0.0002, + "step": 426 + }, + { + "epoch": 0.23371647509578544, + "grad_norm": 0.6433345079421997, + "learning_rate": 8.71169435655405e-06, + "loss": 0.0296, + "step": 427 + }, + { + "epoch": 0.23426382047071703, + "grad_norm": 0.0018559806048870087, + "learning_rate": 8.705928213985827e-06, + "loss": 0.0001, + "step": 428 + }, + { + "epoch": 0.2348111658456486, + "grad_norm": 0.02164350636303425, + "learning_rate": 8.700151113714071e-06, + "loss": 0.001, + "step": 429 + }, + { + "epoch": 0.23535851122058019, + "grad_norm": 0.0030800742097198963, + "learning_rate": 8.694363072820535e-06, + "loss": 0.0002, + "step": 430 + }, + { + "epoch": 0.23590585659551178, + "grad_norm": 0.11861609667539597, + "learning_rate": 8.688564108419321e-06, + "loss": 0.0041, + "step": 431 + }, + { + "epoch": 0.23645320197044334, + "grad_norm": 0.01565435156226158, + "learning_rate": 8.68275423765683e-06, + "loss": 0.0009, + "step": 432 + }, + { + "epoch": 0.23700054734537493, + "grad_norm": 0.024173466488718987, + "learning_rate": 8.676933477711714e-06, + "loss": 0.0012, + "step": 433 + }, + { + "epoch": 0.23754789272030652, + "grad_norm": 0.003673731815069914, + "learning_rate": 8.671101845794816e-06, + "loss": 0.0001, + "step": 434 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 2.293191909790039, + "learning_rate": 8.665259359149132e-06, + "loss": 0.0733, + "step": 435 + }, + { + "epoch": 0.23864258347016967, + "grad_norm": 0.02679836004972458, + "learning_rate": 8.65940603504975e-06, + "loss": 0.0016, + "step": 436 + }, + { + "epoch": 0.23918992884510126, + "grad_norm": 0.006045842077583075, + "learning_rate": 8.653541890803798e-06, + "loss": 0.0003, + "step": 437 + }, + { + "epoch": 0.23973727422003285, + "grad_norm": 0.009248864836990833, + "learning_rate": 8.647666943750405e-06, + "loss": 0.0005, + "step": 438 + }, + { + "epoch": 0.24028461959496442, + "grad_norm": 0.001006833277642727, + "learning_rate": 8.641781211260641e-06, + "loss": 0.0001, + "step": 439 + }, + { + "epoch": 0.240831964969896, + "grad_norm": 3.960789680480957, + "learning_rate": 8.635884710737458e-06, + "loss": 0.1464, + "step": 440 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 0.0017514158971607685, + "learning_rate": 8.629977459615655e-06, + "loss": 0.0001, + "step": 441 + }, + { + "epoch": 0.24192665571975916, + "grad_norm": 0.0009652414591982961, + "learning_rate": 8.624059475361818e-06, + "loss": 0.0001, + "step": 442 + }, + { + "epoch": 0.24247400109469075, + "grad_norm": 0.0015257166232913733, + "learning_rate": 8.618130775474262e-06, + "loss": 0.0001, + "step": 443 + }, + { + "epoch": 0.24302134646962234, + "grad_norm": 0.0018407206516712904, + "learning_rate": 8.612191377482995e-06, + "loss": 0.0001, + "step": 444 + }, + { + "epoch": 0.2435686918445539, + "grad_norm": 0.002263927599415183, + "learning_rate": 8.606241298949651e-06, + "loss": 0.0001, + "step": 445 + }, + { + "epoch": 0.2441160372194855, + "grad_norm": 0.008505883626639843, + "learning_rate": 8.600280557467448e-06, + "loss": 0.0004, + "step": 446 + }, + { + "epoch": 0.24466338259441708, + "grad_norm": 0.0033861438278108835, + "learning_rate": 8.594309170661128e-06, + "loss": 0.0002, + "step": 447 + }, + { + "epoch": 0.24521072796934865, + "grad_norm": 8.31734561920166, + "learning_rate": 8.588327156186915e-06, + "loss": 0.2658, + "step": 448 + }, + { + "epoch": 0.24575807334428024, + "grad_norm": 0.001098312553949654, + "learning_rate": 8.58233453173245e-06, + "loss": 0.0001, + "step": 449 + }, + { + "epoch": 0.24630541871921183, + "grad_norm": 6.409268379211426, + "learning_rate": 8.576331315016753e-06, + "loss": 0.4498, + "step": 450 + }, + { + "epoch": 0.24685276409414342, + "grad_norm": 0.0007233834476210177, + "learning_rate": 8.570317523790155e-06, + "loss": 0.0, + "step": 451 + }, + { + "epoch": 0.24740010946907498, + "grad_norm": 0.003745045978575945, + "learning_rate": 8.564293175834261e-06, + "loss": 0.0001, + "step": 452 + }, + { + "epoch": 0.24794745484400657, + "grad_norm": 0.0014460004167631269, + "learning_rate": 8.558258288961887e-06, + "loss": 0.0001, + "step": 453 + }, + { + "epoch": 0.24849480021893816, + "grad_norm": 0.2529180347919464, + "learning_rate": 8.552212881017012e-06, + "loss": 0.0083, + "step": 454 + }, + { + "epoch": 0.24904214559386972, + "grad_norm": 0.01214388944208622, + "learning_rate": 8.546156969874723e-06, + "loss": 0.0007, + "step": 455 + }, + { + "epoch": 0.24958949096880131, + "grad_norm": 0.0014981742715463042, + "learning_rate": 8.540090573441159e-06, + "loss": 0.0001, + "step": 456 + }, + { + "epoch": 0.2501368363437329, + "grad_norm": 0.001638900488615036, + "learning_rate": 8.534013709653469e-06, + "loss": 0.0001, + "step": 457 + }, + { + "epoch": 0.25068418171866447, + "grad_norm": 0.05050064995884895, + "learning_rate": 8.527926396479746e-06, + "loss": 0.002, + "step": 458 + }, + { + "epoch": 0.2512315270935961, + "grad_norm": 0.0042986744083464146, + "learning_rate": 8.521828651918983e-06, + "loss": 0.0002, + "step": 459 + }, + { + "epoch": 0.25177887246852765, + "grad_norm": 0.012750094756484032, + "learning_rate": 8.515720494001016e-06, + "loss": 0.0006, + "step": 460 + }, + { + "epoch": 0.2523262178434592, + "grad_norm": 0.008734261617064476, + "learning_rate": 8.509601940786472e-06, + "loss": 0.0004, + "step": 461 + }, + { + "epoch": 0.25287356321839083, + "grad_norm": 0.0008263569907285273, + "learning_rate": 8.503473010366713e-06, + "loss": 0.0001, + "step": 462 + }, + { + "epoch": 0.2534209085933224, + "grad_norm": 0.0020867646671831608, + "learning_rate": 8.497333720863786e-06, + "loss": 0.0001, + "step": 463 + }, + { + "epoch": 0.25396825396825395, + "grad_norm": 0.0010886021191254258, + "learning_rate": 8.491184090430365e-06, + "loss": 0.0001, + "step": 464 + }, + { + "epoch": 0.2545155993431856, + "grad_norm": 0.6500905752182007, + "learning_rate": 8.485024137249705e-06, + "loss": 0.0172, + "step": 465 + }, + { + "epoch": 0.25506294471811713, + "grad_norm": 0.26900380849838257, + "learning_rate": 8.478853879535578e-06, + "loss": 0.0092, + "step": 466 + }, + { + "epoch": 0.2556102900930487, + "grad_norm": 0.09617394208908081, + "learning_rate": 8.472673335532226e-06, + "loss": 0.0046, + "step": 467 + }, + { + "epoch": 0.2561576354679803, + "grad_norm": 3.6172397136688232, + "learning_rate": 8.46648252351431e-06, + "loss": 0.9964, + "step": 468 + }, + { + "epoch": 0.2567049808429119, + "grad_norm": 0.005054984707385302, + "learning_rate": 8.460281461786848e-06, + "loss": 0.0002, + "step": 469 + }, + { + "epoch": 0.25725232621784344, + "grad_norm": 3.7851755619049072, + "learning_rate": 8.454070168685162e-06, + "loss": 0.4502, + "step": 470 + }, + { + "epoch": 0.25779967159277506, + "grad_norm": 0.00290496414527297, + "learning_rate": 8.447848662574828e-06, + "loss": 0.0002, + "step": 471 + }, + { + "epoch": 0.2583470169677066, + "grad_norm": 0.42932426929473877, + "learning_rate": 8.441616961851624e-06, + "loss": 0.0123, + "step": 472 + }, + { + "epoch": 0.2588943623426382, + "grad_norm": 3.719259738922119, + "learning_rate": 8.435375084941464e-06, + "loss": 0.4588, + "step": 473 + }, + { + "epoch": 0.2594417077175698, + "grad_norm": 0.07546942681074142, + "learning_rate": 8.429123050300357e-06, + "loss": 0.0038, + "step": 474 + }, + { + "epoch": 0.25998905309250137, + "grad_norm": 0.01607462391257286, + "learning_rate": 8.422860876414344e-06, + "loss": 0.0009, + "step": 475 + }, + { + "epoch": 0.26053639846743293, + "grad_norm": 0.0010948505951091647, + "learning_rate": 8.416588581799447e-06, + "loss": 0.0001, + "step": 476 + }, + { + "epoch": 0.26108374384236455, + "grad_norm": 3.172180652618408, + "learning_rate": 8.41030618500161e-06, + "loss": 0.0499, + "step": 477 + }, + { + "epoch": 0.2616310892172961, + "grad_norm": 0.002868334762752056, + "learning_rate": 8.404013704596653e-06, + "loss": 0.0001, + "step": 478 + }, + { + "epoch": 0.26217843459222767, + "grad_norm": 0.0034138199407607317, + "learning_rate": 8.3977111591902e-06, + "loss": 0.0002, + "step": 479 + }, + { + "epoch": 0.2627257799671593, + "grad_norm": 0.11267642676830292, + "learning_rate": 8.391398567417653e-06, + "loss": 0.0031, + "step": 480 + }, + { + "epoch": 0.26327312534209085, + "grad_norm": 0.06314973533153534, + "learning_rate": 8.385075947944101e-06, + "loss": 0.0035, + "step": 481 + }, + { + "epoch": 0.2638204707170224, + "grad_norm": 0.0866016298532486, + "learning_rate": 8.378743319464293e-06, + "loss": 0.004, + "step": 482 + }, + { + "epoch": 0.26436781609195403, + "grad_norm": 4.425952434539795, + "learning_rate": 8.372400700702569e-06, + "loss": 1.0502, + "step": 483 + }, + { + "epoch": 0.2649151614668856, + "grad_norm": 0.011416326276957989, + "learning_rate": 8.366048110412817e-06, + "loss": 0.0006, + "step": 484 + }, + { + "epoch": 0.2654625068418172, + "grad_norm": 0.006862805690616369, + "learning_rate": 8.359685567378392e-06, + "loss": 0.0004, + "step": 485 + }, + { + "epoch": 0.2660098522167488, + "grad_norm": 0.13846512138843536, + "learning_rate": 8.353313090412093e-06, + "loss": 0.0073, + "step": 486 + }, + { + "epoch": 0.26655719759168034, + "grad_norm": 0.003636781359091401, + "learning_rate": 8.346930698356083e-06, + "loss": 0.0002, + "step": 487 + }, + { + "epoch": 0.26710454296661196, + "grad_norm": 0.03555435314774513, + "learning_rate": 8.340538410081846e-06, + "loss": 0.0017, + "step": 488 + }, + { + "epoch": 0.2676518883415435, + "grad_norm": 0.004512585233896971, + "learning_rate": 8.334136244490128e-06, + "loss": 0.0002, + "step": 489 + }, + { + "epoch": 0.2681992337164751, + "grad_norm": 0.3295449912548065, + "learning_rate": 8.327724220510873e-06, + "loss": 0.02, + "step": 490 + }, + { + "epoch": 0.2687465790914067, + "grad_norm": 0.06443799287080765, + "learning_rate": 8.321302357103183e-06, + "loss": 0.0036, + "step": 491 + }, + { + "epoch": 0.26929392446633826, + "grad_norm": 0.01787388324737549, + "learning_rate": 8.314870673255248e-06, + "loss": 0.0009, + "step": 492 + }, + { + "epoch": 0.2698412698412698, + "grad_norm": 0.018945252522826195, + "learning_rate": 8.308429187984298e-06, + "loss": 0.001, + "step": 493 + }, + { + "epoch": 0.27038861521620144, + "grad_norm": 0.006173065863549709, + "learning_rate": 8.301977920336542e-06, + "loss": 0.0002, + "step": 494 + }, + { + "epoch": 0.270935960591133, + "grad_norm": 2.493547201156616, + "learning_rate": 8.295516889387115e-06, + "loss": 0.0723, + "step": 495 + }, + { + "epoch": 0.27148330596606457, + "grad_norm": 2.351107120513916, + "learning_rate": 8.289046114240019e-06, + "loss": 0.4947, + "step": 496 + }, + { + "epoch": 0.2720306513409962, + "grad_norm": 0.0063329474069178104, + "learning_rate": 8.282565614028068e-06, + "loss": 0.0003, + "step": 497 + }, + { + "epoch": 0.27257799671592775, + "grad_norm": 0.00849709752947092, + "learning_rate": 8.276075407912831e-06, + "loss": 0.0003, + "step": 498 + }, + { + "epoch": 0.2731253420908593, + "grad_norm": 0.01383709441870451, + "learning_rate": 8.269575515084577e-06, + "loss": 0.0007, + "step": 499 + }, + { + "epoch": 0.27367268746579093, + "grad_norm": 0.006640062667429447, + "learning_rate": 8.263065954762212e-06, + "loss": 0.0003, + "step": 500 + }, + { + "epoch": 0.2742200328407225, + "grad_norm": 0.008006825111806393, + "learning_rate": 8.256546746193237e-06, + "loss": 0.0004, + "step": 501 + }, + { + "epoch": 0.27476737821565406, + "grad_norm": 0.03927391767501831, + "learning_rate": 8.250017908653666e-06, + "loss": 0.0022, + "step": 502 + }, + { + "epoch": 0.2753147235905857, + "grad_norm": 4.720467567443848, + "learning_rate": 8.243479461447999e-06, + "loss": 0.2078, + "step": 503 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 4.6646552085876465, + "learning_rate": 8.23693142390914e-06, + "loss": 0.8793, + "step": 504 + }, + { + "epoch": 0.2764094143404488, + "grad_norm": 4.503306865692139, + "learning_rate": 8.230373815398352e-06, + "loss": 0.4268, + "step": 505 + }, + { + "epoch": 0.2769567597153804, + "grad_norm": 0.004571705125272274, + "learning_rate": 8.2238066553052e-06, + "loss": 0.0002, + "step": 506 + }, + { + "epoch": 0.277504105090312, + "grad_norm": 0.039737485349178314, + "learning_rate": 8.21722996304749e-06, + "loss": 0.0016, + "step": 507 + }, + { + "epoch": 0.27805145046524354, + "grad_norm": 0.02338779717683792, + "learning_rate": 8.210643758071211e-06, + "loss": 0.0009, + "step": 508 + }, + { + "epoch": 0.27859879584017516, + "grad_norm": 0.36878153681755066, + "learning_rate": 8.20404805985048e-06, + "loss": 0.0202, + "step": 509 + }, + { + "epoch": 0.2791461412151067, + "grad_norm": 0.009232649579644203, + "learning_rate": 8.197442887887488e-06, + "loss": 0.0005, + "step": 510 + }, + { + "epoch": 0.2796934865900383, + "grad_norm": 0.2624075710773468, + "learning_rate": 8.19082826171243e-06, + "loss": 0.0159, + "step": 511 + }, + { + "epoch": 0.2802408319649699, + "grad_norm": 2.7321317195892334, + "learning_rate": 8.184204200883458e-06, + "loss": 0.4077, + "step": 512 + }, + { + "epoch": 0.28078817733990147, + "grad_norm": 0.011593791656196117, + "learning_rate": 8.177570724986627e-06, + "loss": 0.0006, + "step": 513 + }, + { + "epoch": 0.2813355227148331, + "grad_norm": 0.05139797925949097, + "learning_rate": 8.170927853635824e-06, + "loss": 0.0024, + "step": 514 + }, + { + "epoch": 0.28188286808976465, + "grad_norm": 0.06633631885051727, + "learning_rate": 8.164275606472716e-06, + "loss": 0.004, + "step": 515 + }, + { + "epoch": 0.2824302134646962, + "grad_norm": 0.12292248010635376, + "learning_rate": 8.157614003166695e-06, + "loss": 0.0054, + "step": 516 + }, + { + "epoch": 0.28297755883962783, + "grad_norm": 0.01715407706797123, + "learning_rate": 8.150943063414815e-06, + "loss": 0.0008, + "step": 517 + }, + { + "epoch": 0.2835249042145594, + "grad_norm": 1.500406265258789, + "learning_rate": 8.144262806941743e-06, + "loss": 0.1147, + "step": 518 + }, + { + "epoch": 0.28407224958949095, + "grad_norm": 0.012590247206389904, + "learning_rate": 8.137573253499683e-06, + "loss": 0.0006, + "step": 519 + }, + { + "epoch": 0.2846195949644226, + "grad_norm": 0.009325980208814144, + "learning_rate": 8.130874422868335e-06, + "loss": 0.0004, + "step": 520 + }, + { + "epoch": 0.28516694033935414, + "grad_norm": 0.009420981630682945, + "learning_rate": 8.124166334854831e-06, + "loss": 0.0004, + "step": 521 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.010938627645373344, + "learning_rate": 8.117449009293668e-06, + "loss": 0.0005, + "step": 522 + }, + { + "epoch": 0.2862616310892173, + "grad_norm": 2.296308755874634, + "learning_rate": 8.110722466046666e-06, + "loss": 0.4474, + "step": 523 + }, + { + "epoch": 0.2868089764641489, + "grad_norm": 0.030201993882656097, + "learning_rate": 8.103986725002893e-06, + "loss": 0.0014, + "step": 524 + }, + { + "epoch": 0.28735632183908044, + "grad_norm": 0.0328528992831707, + "learning_rate": 8.097241806078616e-06, + "loss": 0.0017, + "step": 525 + }, + { + "epoch": 0.28790366721401206, + "grad_norm": 0.06072849780321121, + "learning_rate": 8.090487729217238e-06, + "loss": 0.0035, + "step": 526 + }, + { + "epoch": 0.2884510125889436, + "grad_norm": 0.040568139404058456, + "learning_rate": 8.083724514389242e-06, + "loss": 0.0022, + "step": 527 + }, + { + "epoch": 0.2889983579638752, + "grad_norm": 0.03415596857666969, + "learning_rate": 8.076952181592125e-06, + "loss": 0.0016, + "step": 528 + }, + { + "epoch": 0.2895457033388068, + "grad_norm": 0.04164343699812889, + "learning_rate": 8.070170750850354e-06, + "loss": 0.0018, + "step": 529 + }, + { + "epoch": 0.29009304871373837, + "grad_norm": 0.1452062726020813, + "learning_rate": 8.063380242215289e-06, + "loss": 0.0096, + "step": 530 + }, + { + "epoch": 0.29064039408866993, + "grad_norm": 0.057472992688417435, + "learning_rate": 8.05658067576513e-06, + "loss": 0.0027, + "step": 531 + }, + { + "epoch": 0.29118773946360155, + "grad_norm": 0.0152335399761796, + "learning_rate": 8.049772071604864e-06, + "loss": 0.0007, + "step": 532 + }, + { + "epoch": 0.2917350848385331, + "grad_norm": 0.09490533173084259, + "learning_rate": 8.042954449866203e-06, + "loss": 0.0056, + "step": 533 + }, + { + "epoch": 0.2922824302134647, + "grad_norm": 0.025839099660515785, + "learning_rate": 8.036127830707515e-06, + "loss": 0.0011, + "step": 534 + }, + { + "epoch": 0.2928297755883963, + "grad_norm": 0.02462073415517807, + "learning_rate": 8.029292234313777e-06, + "loss": 0.001, + "step": 535 + }, + { + "epoch": 0.29337712096332785, + "grad_norm": 1.0036952495574951, + "learning_rate": 8.022447680896505e-06, + "loss": 0.0427, + "step": 536 + }, + { + "epoch": 0.2939244663382594, + "grad_norm": 0.014462477527558804, + "learning_rate": 8.015594190693705e-06, + "loss": 0.0007, + "step": 537 + }, + { + "epoch": 0.29447181171319103, + "grad_norm": 0.03679874539375305, + "learning_rate": 8.008731783969803e-06, + "loss": 0.0016, + "step": 538 + }, + { + "epoch": 0.2950191570881226, + "grad_norm": 0.036358315497636795, + "learning_rate": 8.001860481015594e-06, + "loss": 0.0018, + "step": 539 + }, + { + "epoch": 0.2955665024630542, + "grad_norm": 2.7873520851135254, + "learning_rate": 7.99498030214817e-06, + "loss": 0.447, + "step": 540 + }, + { + "epoch": 0.2961138478379858, + "grad_norm": 0.1810341328382492, + "learning_rate": 7.988091267710873e-06, + "loss": 0.0074, + "step": 541 + }, + { + "epoch": 0.29666119321291734, + "grad_norm": 0.1556759476661682, + "learning_rate": 7.981193398073232e-06, + "loss": 0.0121, + "step": 542 + }, + { + "epoch": 0.29720853858784896, + "grad_norm": 3.2317357063293457, + "learning_rate": 7.97428671363089e-06, + "loss": 0.6869, + "step": 543 + }, + { + "epoch": 0.2977558839627805, + "grad_norm": 0.01287668477743864, + "learning_rate": 7.967371234805563e-06, + "loss": 0.0006, + "step": 544 + }, + { + "epoch": 0.2983032293377121, + "grad_norm": 0.13862210512161255, + "learning_rate": 7.960446982044964e-06, + "loss": 0.0088, + "step": 545 + }, + { + "epoch": 0.2988505747126437, + "grad_norm": 0.16142381727695465, + "learning_rate": 7.953513975822755e-06, + "loss": 0.012, + "step": 546 + }, + { + "epoch": 0.29939792008757526, + "grad_norm": 0.12508529424667358, + "learning_rate": 7.946572236638477e-06, + "loss": 0.0083, + "step": 547 + }, + { + "epoch": 0.2999452654625068, + "grad_norm": 0.02337013930082321, + "learning_rate": 7.939621785017488e-06, + "loss": 0.0012, + "step": 548 + }, + { + "epoch": 0.30049261083743845, + "grad_norm": 0.03486739471554756, + "learning_rate": 7.932662641510915e-06, + "loss": 0.0017, + "step": 549 + }, + { + "epoch": 0.30103995621237, + "grad_norm": 0.131191223859787, + "learning_rate": 7.925694826695582e-06, + "loss": 0.0056, + "step": 550 + }, + { + "epoch": 0.30158730158730157, + "grad_norm": 0.058438822627067566, + "learning_rate": 7.918718361173951e-06, + "loss": 0.004, + "step": 551 + }, + { + "epoch": 0.3021346469622332, + "grad_norm": 0.16554298996925354, + "learning_rate": 7.911733265574061e-06, + "loss": 0.0104, + "step": 552 + }, + { + "epoch": 0.30268199233716475, + "grad_norm": 0.017998240888118744, + "learning_rate": 7.904739560549475e-06, + "loss": 0.0009, + "step": 553 + }, + { + "epoch": 0.3032293377120963, + "grad_norm": 0.03947608172893524, + "learning_rate": 7.897737266779207e-06, + "loss": 0.0023, + "step": 554 + }, + { + "epoch": 0.30377668308702793, + "grad_norm": 0.07796397805213928, + "learning_rate": 7.890726404967665e-06, + "loss": 0.0037, + "step": 555 + }, + { + "epoch": 0.3043240284619595, + "grad_norm": 4.119320869445801, + "learning_rate": 7.883706995844598e-06, + "loss": 0.9868, + "step": 556 + }, + { + "epoch": 0.30487137383689106, + "grad_norm": 0.01604565419256687, + "learning_rate": 7.87667906016502e-06, + "loss": 0.0007, + "step": 557 + }, + { + "epoch": 0.3054187192118227, + "grad_norm": 1.9092696905136108, + "learning_rate": 7.869642618709162e-06, + "loss": 0.4158, + "step": 558 + }, + { + "epoch": 0.30596606458675424, + "grad_norm": 0.2341250330209732, + "learning_rate": 7.8625976922824e-06, + "loss": 0.0119, + "step": 559 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.07946468889713287, + "learning_rate": 7.855544301715203e-06, + "loss": 0.0048, + "step": 560 + }, + { + "epoch": 0.3070607553366174, + "grad_norm": 0.12750709056854248, + "learning_rate": 7.848482467863062e-06, + "loss": 0.0057, + "step": 561 + }, + { + "epoch": 0.307608100711549, + "grad_norm": 0.2701328992843628, + "learning_rate": 7.841412211606439e-06, + "loss": 0.0146, + "step": 562 + }, + { + "epoch": 0.30815544608648054, + "grad_norm": 4.954006195068359, + "learning_rate": 7.834333553850694e-06, + "loss": 0.7856, + "step": 563 + }, + { + "epoch": 0.30870279146141216, + "grad_norm": 0.25672805309295654, + "learning_rate": 7.827246515526035e-06, + "loss": 0.017, + "step": 564 + }, + { + "epoch": 0.3092501368363437, + "grad_norm": 0.02571881376206875, + "learning_rate": 7.82015111758744e-06, + "loss": 0.0013, + "step": 565 + }, + { + "epoch": 0.3097974822112753, + "grad_norm": 0.05526258423924446, + "learning_rate": 7.813047381014613e-06, + "loss": 0.0026, + "step": 566 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 1.4756897687911987, + "learning_rate": 7.805935326811913e-06, + "loss": 0.048, + "step": 567 + }, + { + "epoch": 0.31089217296113847, + "grad_norm": 0.09228195250034332, + "learning_rate": 7.798814976008286e-06, + "loss": 0.0043, + "step": 568 + }, + { + "epoch": 0.3114395183360701, + "grad_norm": 0.4741865396499634, + "learning_rate": 7.791686349657219e-06, + "loss": 0.0377, + "step": 569 + }, + { + "epoch": 0.31198686371100165, + "grad_norm": 0.24840384721755981, + "learning_rate": 7.78454946883666e-06, + "loss": 0.0219, + "step": 570 + }, + { + "epoch": 0.3125342090859332, + "grad_norm": 0.13321872055530548, + "learning_rate": 7.777404354648967e-06, + "loss": 0.0069, + "step": 571 + }, + { + "epoch": 0.31308155446086483, + "grad_norm": 2.217121124267578, + "learning_rate": 7.770251028220844e-06, + "loss": 0.6064, + "step": 572 + }, + { + "epoch": 0.3136288998357964, + "grad_norm": 0.048713941127061844, + "learning_rate": 7.763089510703276e-06, + "loss": 0.0025, + "step": 573 + }, + { + "epoch": 0.31417624521072796, + "grad_norm": 0.128813236951828, + "learning_rate": 7.755919823271466e-06, + "loss": 0.0072, + "step": 574 + }, + { + "epoch": 0.3147235905856596, + "grad_norm": 0.03228011354804039, + "learning_rate": 7.748741987124773e-06, + "loss": 0.0017, + "step": 575 + }, + { + "epoch": 0.31527093596059114, + "grad_norm": 0.12434003502130508, + "learning_rate": 7.741556023486655e-06, + "loss": 0.0071, + "step": 576 + }, + { + "epoch": 0.3158182813355227, + "grad_norm": 0.05032031983137131, + "learning_rate": 7.734361953604596e-06, + "loss": 0.0021, + "step": 577 + }, + { + "epoch": 0.3163656267104543, + "grad_norm": 0.03245704621076584, + "learning_rate": 7.727159798750054e-06, + "loss": 0.0015, + "step": 578 + }, + { + "epoch": 0.3169129720853859, + "grad_norm": 0.09331010282039642, + "learning_rate": 7.719949580218387e-06, + "loss": 0.0056, + "step": 579 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.04344509541988373, + "learning_rate": 7.712731319328798e-06, + "loss": 0.002, + "step": 580 + }, + { + "epoch": 0.31800766283524906, + "grad_norm": 0.016618408262729645, + "learning_rate": 7.70550503742427e-06, + "loss": 0.0008, + "step": 581 + }, + { + "epoch": 0.3185550082101806, + "grad_norm": 0.018471628427505493, + "learning_rate": 7.698270755871506e-06, + "loss": 0.0008, + "step": 582 + }, + { + "epoch": 0.3191023535851122, + "grad_norm": 0.05424068868160248, + "learning_rate": 7.691028496060856e-06, + "loss": 0.0026, + "step": 583 + }, + { + "epoch": 0.3196496989600438, + "grad_norm": 0.28279751539230347, + "learning_rate": 7.683778279406261e-06, + "loss": 0.0188, + "step": 584 + }, + { + "epoch": 0.32019704433497537, + "grad_norm": 0.041227713227272034, + "learning_rate": 7.676520127345198e-06, + "loss": 0.0019, + "step": 585 + }, + { + "epoch": 0.32074438970990693, + "grad_norm": 2.211728096008301, + "learning_rate": 7.669254061338591e-06, + "loss": 0.4059, + "step": 586 + }, + { + "epoch": 0.32129173508483855, + "grad_norm": 2.1793460845947266, + "learning_rate": 7.66198010287078e-06, + "loss": 0.3825, + "step": 587 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 0.633358359336853, + "learning_rate": 7.654698273449435e-06, + "loss": 0.0113, + "step": 588 + }, + { + "epoch": 0.3223864258347017, + "grad_norm": 0.018418803811073303, + "learning_rate": 7.647408594605495e-06, + "loss": 0.001, + "step": 589 + }, + { + "epoch": 0.3229337712096333, + "grad_norm": 0.019276276230812073, + "learning_rate": 7.640111087893114e-06, + "loss": 0.001, + "step": 590 + }, + { + "epoch": 0.32348111658456485, + "grad_norm": 0.017273103818297386, + "learning_rate": 7.632805774889589e-06, + "loss": 0.0009, + "step": 591 + }, + { + "epoch": 0.3240284619594964, + "grad_norm": 0.0777779296040535, + "learning_rate": 7.625492677195298e-06, + "loss": 0.0038, + "step": 592 + }, + { + "epoch": 0.32457580733442803, + "grad_norm": 0.1303318440914154, + "learning_rate": 7.6181718164336415e-06, + "loss": 0.0071, + "step": 593 + }, + { + "epoch": 0.3251231527093596, + "grad_norm": 0.18606723845005035, + "learning_rate": 7.610843214250964e-06, + "loss": 0.011, + "step": 594 + }, + { + "epoch": 0.32567049808429116, + "grad_norm": 1.8400012254714966, + "learning_rate": 7.603506892316513e-06, + "loss": 0.1371, + "step": 595 + }, + { + "epoch": 0.3262178434592228, + "grad_norm": 0.019331350922584534, + "learning_rate": 7.5961628723223505e-06, + "loss": 0.0008, + "step": 596 + }, + { + "epoch": 0.32676518883415434, + "grad_norm": 0.02180156670510769, + "learning_rate": 7.588811175983305e-06, + "loss": 0.0011, + "step": 597 + }, + { + "epoch": 0.32731253420908596, + "grad_norm": 0.49529480934143066, + "learning_rate": 7.581451825036903e-06, + "loss": 0.0257, + "step": 598 + }, + { + "epoch": 0.3278598795840175, + "grad_norm": 0.08971985429525375, + "learning_rate": 7.574084841243302e-06, + "loss": 0.0044, + "step": 599 + }, + { + "epoch": 0.3284072249589491, + "grad_norm": 1.5169864892959595, + "learning_rate": 7.5667102463852314e-06, + "loss": 0.4389, + "step": 600 + }, + { + "epoch": 0.3289545703338807, + "grad_norm": 0.16540849208831787, + "learning_rate": 7.55932806226792e-06, + "loss": 0.0108, + "step": 601 + }, + { + "epoch": 0.32950191570881227, + "grad_norm": 0.01784505322575569, + "learning_rate": 7.551938310719043e-06, + "loss": 0.0009, + "step": 602 + }, + { + "epoch": 0.33004926108374383, + "grad_norm": 0.02484745904803276, + "learning_rate": 7.5445410135886455e-06, + "loss": 0.0013, + "step": 603 + }, + { + "epoch": 0.33059660645867545, + "grad_norm": 0.00939080398529768, + "learning_rate": 7.537136192749086e-06, + "loss": 0.0005, + "step": 604 + }, + { + "epoch": 0.331143951833607, + "grad_norm": 0.006826468743383884, + "learning_rate": 7.529723870094969e-06, + "loss": 0.0004, + "step": 605 + }, + { + "epoch": 0.33169129720853857, + "grad_norm": 0.5507104396820068, + "learning_rate": 7.522304067543082e-06, + "loss": 0.0289, + "step": 606 + }, + { + "epoch": 0.3322386425834702, + "grad_norm": 0.03984001278877258, + "learning_rate": 7.514876807032323e-06, + "loss": 0.0015, + "step": 607 + }, + { + "epoch": 0.33278598795840175, + "grad_norm": 0.010095106437802315, + "learning_rate": 7.507442110523649e-06, + "loss": 0.0005, + "step": 608 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.014431447722017765, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0006, + "step": 609 + }, + { + "epoch": 0.33388067870826493, + "grad_norm": 2.57116436958313, + "learning_rate": 7.492550497466239e-06, + "loss": 0.5546, + "step": 610 + }, + { + "epoch": 0.3344280240831965, + "grad_norm": 1.8662731647491455, + "learning_rate": 7.485093624949085e-06, + "loss": 0.2615, + "step": 611 + }, + { + "epoch": 0.33497536945812806, + "grad_norm": 0.3963170051574707, + "learning_rate": 7.477629404497048e-06, + "loss": 0.0106, + "step": 612 + }, + { + "epoch": 0.3355227148330597, + "grad_norm": 0.02827736735343933, + "learning_rate": 7.470157858180365e-06, + "loss": 0.0016, + "step": 613 + }, + { + "epoch": 0.33607006020799124, + "grad_norm": 0.1261405646800995, + "learning_rate": 7.462679008090935e-06, + "loss": 0.0064, + "step": 614 + }, + { + "epoch": 0.3366174055829228, + "grad_norm": 0.2770984470844269, + "learning_rate": 7.455192876342253e-06, + "loss": 0.0154, + "step": 615 + }, + { + "epoch": 0.3371647509578544, + "grad_norm": 0.019770491868257523, + "learning_rate": 7.447699485069342e-06, + "loss": 0.0009, + "step": 616 + }, + { + "epoch": 0.337712096332786, + "grad_norm": 0.01447894237935543, + "learning_rate": 7.440198856428693e-06, + "loss": 0.0007, + "step": 617 + }, + { + "epoch": 0.33825944170771755, + "grad_norm": 0.048531387001276016, + "learning_rate": 7.432691012598196e-06, + "loss": 0.0016, + "step": 618 + }, + { + "epoch": 0.33880678708264916, + "grad_norm": 2.619145393371582, + "learning_rate": 7.42517597577707e-06, + "loss": 0.5279, + "step": 619 + }, + { + "epoch": 0.3393541324575807, + "grad_norm": 0.17820040881633759, + "learning_rate": 7.41765376818581e-06, + "loss": 0.0093, + "step": 620 + }, + { + "epoch": 0.3399014778325123, + "grad_norm": 0.08188218623399734, + "learning_rate": 7.4101244120661105e-06, + "loss": 0.0038, + "step": 621 + }, + { + "epoch": 0.3404488232074439, + "grad_norm": 2.2620508670806885, + "learning_rate": 7.4025879296807975e-06, + "loss": 0.2475, + "step": 622 + }, + { + "epoch": 0.34099616858237547, + "grad_norm": 0.013281342573463917, + "learning_rate": 7.395044343313777e-06, + "loss": 0.0006, + "step": 623 + }, + { + "epoch": 0.3415435139573071, + "grad_norm": 0.21208718419075012, + "learning_rate": 7.387493675269955e-06, + "loss": 0.0121, + "step": 624 + }, + { + "epoch": 0.34209085933223865, + "grad_norm": 0.11422328650951385, + "learning_rate": 7.379935947875177e-06, + "loss": 0.0067, + "step": 625 + }, + { + "epoch": 0.3426382047071702, + "grad_norm": 0.015390058048069477, + "learning_rate": 7.372371183476159e-06, + "loss": 0.0006, + "step": 626 + }, + { + "epoch": 0.34318555008210183, + "grad_norm": 0.022556424140930176, + "learning_rate": 7.36479940444043e-06, + "loss": 0.0009, + "step": 627 + }, + { + "epoch": 0.3437328954570334, + "grad_norm": 1.4255520105361938, + "learning_rate": 7.3572206331562575e-06, + "loss": 0.1016, + "step": 628 + }, + { + "epoch": 0.34428024083196496, + "grad_norm": 0.017720935866236687, + "learning_rate": 7.349634892032582e-06, + "loss": 0.0008, + "step": 629 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 0.021948307752609253, + "learning_rate": 7.342042203498952e-06, + "loss": 0.001, + "step": 630 + }, + { + "epoch": 0.34537493158182814, + "grad_norm": 0.22511789202690125, + "learning_rate": 7.334442590005462e-06, + "loss": 0.0121, + "step": 631 + }, + { + "epoch": 0.3459222769567597, + "grad_norm": 0.012582589872181416, + "learning_rate": 7.3268360740226785e-06, + "loss": 0.0005, + "step": 632 + }, + { + "epoch": 0.3464696223316913, + "grad_norm": 0.0653594508767128, + "learning_rate": 7.319222678041578e-06, + "loss": 0.0024, + "step": 633 + }, + { + "epoch": 0.3470169677066229, + "grad_norm": 0.005534766241908073, + "learning_rate": 7.311602424573483e-06, + "loss": 0.0002, + "step": 634 + }, + { + "epoch": 0.34756431308155444, + "grad_norm": 0.023336416110396385, + "learning_rate": 7.3039753361499885e-06, + "loss": 0.0012, + "step": 635 + }, + { + "epoch": 0.34811165845648606, + "grad_norm": 0.08834053575992584, + "learning_rate": 7.2963414353229e-06, + "loss": 0.0037, + "step": 636 + }, + { + "epoch": 0.3486590038314176, + "grad_norm": 0.028155624866485596, + "learning_rate": 7.288700744664167e-06, + "loss": 0.0007, + "step": 637 + }, + { + "epoch": 0.3492063492063492, + "grad_norm": 1.5263876914978027, + "learning_rate": 7.281053286765816e-06, + "loss": 0.0783, + "step": 638 + }, + { + "epoch": 0.3497536945812808, + "grad_norm": 0.007886398583650589, + "learning_rate": 7.273399084239878e-06, + "loss": 0.0004, + "step": 639 + }, + { + "epoch": 0.35030103995621237, + "grad_norm": 0.08708926290273666, + "learning_rate": 7.265738159718332e-06, + "loss": 0.0051, + "step": 640 + }, + { + "epoch": 0.35084838533114393, + "grad_norm": 0.005664496682584286, + "learning_rate": 7.258070535853031e-06, + "loss": 0.0002, + "step": 641 + }, + { + "epoch": 0.35139573070607555, + "grad_norm": 0.06383395940065384, + "learning_rate": 7.250396235315634e-06, + "loss": 0.0022, + "step": 642 + }, + { + "epoch": 0.3519430760810071, + "grad_norm": 1.0289912223815918, + "learning_rate": 7.242715280797547e-06, + "loss": 0.0553, + "step": 643 + }, + { + "epoch": 0.3524904214559387, + "grad_norm": 0.017405716702342033, + "learning_rate": 7.235027695009846e-06, + "loss": 0.0006, + "step": 644 + }, + { + "epoch": 0.3530377668308703, + "grad_norm": 0.2064036875963211, + "learning_rate": 7.2273335006832144e-06, + "loss": 0.0124, + "step": 645 + }, + { + "epoch": 0.35358511220580185, + "grad_norm": 0.0043057892471551895, + "learning_rate": 7.219632720567879e-06, + "loss": 0.0002, + "step": 646 + }, + { + "epoch": 0.3541324575807334, + "grad_norm": 0.015848618000745773, + "learning_rate": 7.211925377433537e-06, + "loss": 0.0008, + "step": 647 + }, + { + "epoch": 0.35467980295566504, + "grad_norm": 0.006341234780848026, + "learning_rate": 7.204211494069292e-06, + "loss": 0.0003, + "step": 648 + }, + { + "epoch": 0.3552271483305966, + "grad_norm": 0.2039993554353714, + "learning_rate": 7.196491093283585e-06, + "loss": 0.012, + "step": 649 + }, + { + "epoch": 0.35577449370552816, + "grad_norm": 0.01146707870066166, + "learning_rate": 7.188764197904129e-06, + "loss": 0.0005, + "step": 650 + }, + { + "epoch": 0.3563218390804598, + "grad_norm": 1.2888134717941284, + "learning_rate": 7.181030830777838e-06, + "loss": 0.0737, + "step": 651 + }, + { + "epoch": 0.35686918445539134, + "grad_norm": 0.007257033605128527, + "learning_rate": 7.173291014770765e-06, + "loss": 0.0003, + "step": 652 + }, + { + "epoch": 0.35741652983032296, + "grad_norm": 0.09801796823740005, + "learning_rate": 7.165544772768027e-06, + "loss": 0.0047, + "step": 653 + }, + { + "epoch": 0.3579638752052545, + "grad_norm": 0.0024445722810924053, + "learning_rate": 7.157792127673747e-06, + "loss": 0.0001, + "step": 654 + }, + { + "epoch": 0.3585112205801861, + "grad_norm": 0.02576364018023014, + "learning_rate": 7.150033102410975e-06, + "loss": 0.0013, + "step": 655 + }, + { + "epoch": 0.3590585659551177, + "grad_norm": 5.51544713973999, + "learning_rate": 7.142267719921629e-06, + "loss": 0.6532, + "step": 656 + }, + { + "epoch": 0.35960591133004927, + "grad_norm": 0.09530629217624664, + "learning_rate": 7.134496003166423e-06, + "loss": 0.0057, + "step": 657 + }, + { + "epoch": 0.36015325670498083, + "grad_norm": 0.03840158134698868, + "learning_rate": 7.1267179751248005e-06, + "loss": 0.0012, + "step": 658 + }, + { + "epoch": 0.36070060207991245, + "grad_norm": 0.00522483279928565, + "learning_rate": 7.118933658794868e-06, + "loss": 0.0002, + "step": 659 + }, + { + "epoch": 0.361247947454844, + "grad_norm": 0.2433670610189438, + "learning_rate": 7.111143077193321e-06, + "loss": 0.0168, + "step": 660 + }, + { + "epoch": 0.36179529282977557, + "grad_norm": 0.08343702554702759, + "learning_rate": 7.103346253355383e-06, + "loss": 0.0018, + "step": 661 + }, + { + "epoch": 0.3623426382047072, + "grad_norm": 0.006387744098901749, + "learning_rate": 7.0955432103347355e-06, + "loss": 0.0003, + "step": 662 + }, + { + "epoch": 0.36288998357963875, + "grad_norm": 0.0036262532230466604, + "learning_rate": 7.087733971203448e-06, + "loss": 0.0002, + "step": 663 + }, + { + "epoch": 0.3634373289545703, + "grad_norm": 0.10606978833675385, + "learning_rate": 7.0799185590519086e-06, + "loss": 0.006, + "step": 664 + }, + { + "epoch": 0.36398467432950193, + "grad_norm": 0.13019196689128876, + "learning_rate": 7.0720969969887595e-06, + "loss": 0.0045, + "step": 665 + }, + { + "epoch": 0.3645320197044335, + "grad_norm": 0.10207099467515945, + "learning_rate": 7.06426930814083e-06, + "loss": 0.0027, + "step": 666 + }, + { + "epoch": 0.36507936507936506, + "grad_norm": 0.00420819316059351, + "learning_rate": 7.056435515653059e-06, + "loss": 0.0002, + "step": 667 + }, + { + "epoch": 0.3656267104542967, + "grad_norm": 0.09126473218202591, + "learning_rate": 7.048595642688436e-06, + "loss": 0.0053, + "step": 668 + }, + { + "epoch": 0.36617405582922824, + "grad_norm": 0.004398214165121317, + "learning_rate": 7.040749712427932e-06, + "loss": 0.0002, + "step": 669 + }, + { + "epoch": 0.3667214012041598, + "grad_norm": 0.00617770291864872, + "learning_rate": 7.032897748070423e-06, + "loss": 0.0003, + "step": 670 + }, + { + "epoch": 0.3672687465790914, + "grad_norm": 0.10934311896562576, + "learning_rate": 7.0250397728326295e-06, + "loss": 0.0063, + "step": 671 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 0.004969688132405281, + "learning_rate": 7.017175809949044e-06, + "loss": 0.0002, + "step": 672 + }, + { + "epoch": 0.36836343732895455, + "grad_norm": 2.228437900543213, + "learning_rate": 7.009305882671864e-06, + "loss": 0.2851, + "step": 673 + }, + { + "epoch": 0.36891078270388616, + "grad_norm": 0.007352550979703665, + "learning_rate": 7.001430014270921e-06, + "loss": 0.0003, + "step": 674 + }, + { + "epoch": 0.3694581280788177, + "grad_norm": 0.00255901413038373, + "learning_rate": 6.993548228033618e-06, + "loss": 0.0001, + "step": 675 + }, + { + "epoch": 0.3700054734537493, + "grad_norm": 3.149127960205078, + "learning_rate": 6.9856605472648494e-06, + "loss": 0.5427, + "step": 676 + }, + { + "epoch": 0.3705528188286809, + "grad_norm": 0.5940586924552917, + "learning_rate": 6.977766995286943e-06, + "loss": 0.0356, + "step": 677 + }, + { + "epoch": 0.37110016420361247, + "grad_norm": 2.30926775932312, + "learning_rate": 6.969867595439586e-06, + "loss": 0.4709, + "step": 678 + }, + { + "epoch": 0.3716475095785441, + "grad_norm": 0.16510634124279022, + "learning_rate": 6.961962371079752e-06, + "loss": 0.0081, + "step": 679 + }, + { + "epoch": 0.37219485495347565, + "grad_norm": 0.05402740463614464, + "learning_rate": 6.954051345581645e-06, + "loss": 0.0016, + "step": 680 + }, + { + "epoch": 0.3727422003284072, + "grad_norm": 0.004461527802050114, + "learning_rate": 6.946134542336615e-06, + "loss": 0.0003, + "step": 681 + }, + { + "epoch": 0.37328954570333883, + "grad_norm": 3.6112473011016846, + "learning_rate": 6.938211984753097e-06, + "loss": 0.3841, + "step": 682 + }, + { + "epoch": 0.3738368910782704, + "grad_norm": 0.008113270625472069, + "learning_rate": 6.930283696256543e-06, + "loss": 0.0003, + "step": 683 + }, + { + "epoch": 0.37438423645320196, + "grad_norm": 0.15984749794006348, + "learning_rate": 6.922349700289348e-06, + "loss": 0.0107, + "step": 684 + }, + { + "epoch": 0.3749315818281336, + "grad_norm": 0.006165068130940199, + "learning_rate": 6.914410020310782e-06, + "loss": 0.0003, + "step": 685 + }, + { + "epoch": 0.37547892720306514, + "grad_norm": 0.030400443822145462, + "learning_rate": 6.906464679796927e-06, + "loss": 0.0015, + "step": 686 + }, + { + "epoch": 0.3760262725779967, + "grad_norm": 0.005032232962548733, + "learning_rate": 6.898513702240592e-06, + "loss": 0.0002, + "step": 687 + }, + { + "epoch": 0.3765736179529283, + "grad_norm": 0.19334912300109863, + "learning_rate": 6.890557111151266e-06, + "loss": 0.0082, + "step": 688 + }, + { + "epoch": 0.3771209633278599, + "grad_norm": 0.022890301421284676, + "learning_rate": 6.882594930055024e-06, + "loss": 0.0015, + "step": 689 + }, + { + "epoch": 0.37766830870279144, + "grad_norm": 2.6308631896972656, + "learning_rate": 6.8746271824944774e-06, + "loss": 0.6082, + "step": 690 + }, + { + "epoch": 0.37821565407772306, + "grad_norm": 0.004297096747905016, + "learning_rate": 6.8666538920286965e-06, + "loss": 0.0002, + "step": 691 + }, + { + "epoch": 0.3787629994526546, + "grad_norm": 4.101519584655762, + "learning_rate": 6.858675082233135e-06, + "loss": 0.2531, + "step": 692 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 2.1500627994537354, + "learning_rate": 6.850690776699574e-06, + "loss": 0.3575, + "step": 693 + }, + { + "epoch": 0.3798576902025178, + "grad_norm": 0.4142707586288452, + "learning_rate": 6.842700999036036e-06, + "loss": 0.0254, + "step": 694 + }, + { + "epoch": 0.38040503557744937, + "grad_norm": 0.1441921442747116, + "learning_rate": 6.834705772866732e-06, + "loss": 0.0101, + "step": 695 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.016346001997590065, + "learning_rate": 6.8267051218319766e-06, + "loss": 0.0006, + "step": 696 + }, + { + "epoch": 0.38149972632731255, + "grad_norm": 0.017199577763676643, + "learning_rate": 6.8186990695881275e-06, + "loss": 0.0008, + "step": 697 + }, + { + "epoch": 0.3820470717022441, + "grad_norm": 0.009311121888458729, + "learning_rate": 6.810687639807514e-06, + "loss": 0.0004, + "step": 698 + }, + { + "epoch": 0.3825944170771757, + "grad_norm": 1.2476736307144165, + "learning_rate": 6.802670856178362e-06, + "loss": 0.1364, + "step": 699 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 1.8799997568130493, + "learning_rate": 6.79464874240473e-06, + "loss": 0.3712, + "step": 700 + }, + { + "epoch": 0.38368910782703886, + "grad_norm": 1.3058347702026367, + "learning_rate": 6.7866213222064385e-06, + "loss": 0.2762, + "step": 701 + }, + { + "epoch": 0.3842364532019704, + "grad_norm": 0.01594836823642254, + "learning_rate": 6.7785886193189936e-06, + "loss": 0.0007, + "step": 702 + }, + { + "epoch": 0.38478379857690204, + "grad_norm": 0.012764069251716137, + "learning_rate": 6.770550657493525e-06, + "loss": 0.0005, + "step": 703 + }, + { + "epoch": 0.3853311439518336, + "grad_norm": 1.4421093463897705, + "learning_rate": 6.76250746049671e-06, + "loss": 0.0832, + "step": 704 + }, + { + "epoch": 0.38587848932676516, + "grad_norm": 0.002905939007177949, + "learning_rate": 6.754459052110707e-06, + "loss": 0.0001, + "step": 705 + }, + { + "epoch": 0.3864258347016968, + "grad_norm": 2.3534743785858154, + "learning_rate": 6.7464054561330805e-06, + "loss": 0.4796, + "step": 706 + }, + { + "epoch": 0.38697318007662834, + "grad_norm": 0.04106984660029411, + "learning_rate": 6.7383466963767386e-06, + "loss": 0.0018, + "step": 707 + }, + { + "epoch": 0.38752052545155996, + "grad_norm": 0.007894037291407585, + "learning_rate": 6.730282796669853e-06, + "loss": 0.0004, + "step": 708 + }, + { + "epoch": 0.3880678708264915, + "grad_norm": 0.36676910519599915, + "learning_rate": 6.722213780855795e-06, + "loss": 0.0271, + "step": 709 + }, + { + "epoch": 0.3886152162014231, + "grad_norm": 0.5855551958084106, + "learning_rate": 6.714139672793063e-06, + "loss": 0.0555, + "step": 710 + }, + { + "epoch": 0.3891625615763547, + "grad_norm": 0.7090444564819336, + "learning_rate": 6.7060604963552125e-06, + "loss": 0.0533, + "step": 711 + }, + { + "epoch": 0.38970990695128627, + "grad_norm": 0.22933818399906158, + "learning_rate": 6.697976275430786e-06, + "loss": 0.0136, + "step": 712 + }, + { + "epoch": 0.39025725232621783, + "grad_norm": 0.344751238822937, + "learning_rate": 6.6898870339232405e-06, + "loss": 0.0281, + "step": 713 + }, + { + "epoch": 0.39080459770114945, + "grad_norm": 0.16770361363887787, + "learning_rate": 6.681792795750876e-06, + "loss": 0.008, + "step": 714 + }, + { + "epoch": 0.391351943076081, + "grad_norm": 0.016307057812809944, + "learning_rate": 6.673693584846771e-06, + "loss": 0.0006, + "step": 715 + }, + { + "epoch": 0.3918992884510126, + "grad_norm": 0.3089595139026642, + "learning_rate": 6.665589425158705e-06, + "loss": 0.0221, + "step": 716 + }, + { + "epoch": 0.3924466338259442, + "grad_norm": 0.05962289124727249, + "learning_rate": 6.657480340649088e-06, + "loss": 0.0036, + "step": 717 + }, + { + "epoch": 0.39299397920087575, + "grad_norm": 0.13849852979183197, + "learning_rate": 6.649366355294895e-06, + "loss": 0.0091, + "step": 718 + }, + { + "epoch": 0.3935413245758073, + "grad_norm": 0.2673451602458954, + "learning_rate": 6.641247493087591e-06, + "loss": 0.0216, + "step": 719 + }, + { + "epoch": 0.39408866995073893, + "grad_norm": 0.02874746359884739, + "learning_rate": 6.633123778033061e-06, + "loss": 0.0012, + "step": 720 + }, + { + "epoch": 0.3946360153256705, + "grad_norm": 0.04847461357712746, + "learning_rate": 6.624995234151539e-06, + "loss": 0.0025, + "step": 721 + }, + { + "epoch": 0.39518336070060206, + "grad_norm": 4.155821800231934, + "learning_rate": 6.616861885477535e-06, + "loss": 0.2413, + "step": 722 + }, + { + "epoch": 0.3957307060755337, + "grad_norm": 0.025641655549407005, + "learning_rate": 6.608723756059768e-06, + "loss": 0.0011, + "step": 723 + }, + { + "epoch": 0.39627805145046524, + "grad_norm": 0.10781197994947433, + "learning_rate": 6.600580869961091e-06, + "loss": 0.0063, + "step": 724 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 0.01060369610786438, + "learning_rate": 6.592433251258423e-06, + "loss": 0.0005, + "step": 725 + }, + { + "epoch": 0.3973727422003284, + "grad_norm": 0.153262197971344, + "learning_rate": 6.5842809240426765e-06, + "loss": 0.0087, + "step": 726 + }, + { + "epoch": 0.39792008757526, + "grad_norm": 0.017993014305830002, + "learning_rate": 6.576123912418686e-06, + "loss": 0.0008, + "step": 727 + }, + { + "epoch": 0.39846743295019155, + "grad_norm": 0.828525722026825, + "learning_rate": 6.567962240505136e-06, + "loss": 0.0422, + "step": 728 + }, + { + "epoch": 0.39901477832512317, + "grad_norm": 2.484757900238037, + "learning_rate": 6.559795932434489e-06, + "loss": 0.4287, + "step": 729 + }, + { + "epoch": 0.3995621237000547, + "grad_norm": 0.007983885705471039, + "learning_rate": 6.551625012352921e-06, + "loss": 0.0003, + "step": 730 + }, + { + "epoch": 0.4001094690749863, + "grad_norm": 0.3982315957546234, + "learning_rate": 6.543449504420241e-06, + "loss": 0.0341, + "step": 731 + }, + { + "epoch": 0.4006568144499179, + "grad_norm": 0.08536089211702347, + "learning_rate": 6.535269432809821e-06, + "loss": 0.0056, + "step": 732 + }, + { + "epoch": 0.40120415982484947, + "grad_norm": 1.6352207660675049, + "learning_rate": 6.5270848217085325e-06, + "loss": 0.1665, + "step": 733 + }, + { + "epoch": 0.4017515051997811, + "grad_norm": 0.01377387810498476, + "learning_rate": 6.518895695316666e-06, + "loss": 0.0007, + "step": 734 + }, + { + "epoch": 0.40229885057471265, + "grad_norm": 0.30763447284698486, + "learning_rate": 6.510702077847864e-06, + "loss": 0.0218, + "step": 735 + }, + { + "epoch": 0.4028461959496442, + "grad_norm": 0.09024977684020996, + "learning_rate": 6.502503993529048e-06, + "loss": 0.0051, + "step": 736 + }, + { + "epoch": 0.40339354132457583, + "grad_norm": 0.08540990948677063, + "learning_rate": 6.494301466600345e-06, + "loss": 0.0054, + "step": 737 + }, + { + "epoch": 0.4039408866995074, + "grad_norm": 0.011326467618346214, + "learning_rate": 6.486094521315022e-06, + "loss": 0.0005, + "step": 738 + }, + { + "epoch": 0.40448823207443896, + "grad_norm": 0.0065598557703197, + "learning_rate": 6.477883181939406e-06, + "loss": 0.0003, + "step": 739 + }, + { + "epoch": 0.4050355774493706, + "grad_norm": 0.012220407836139202, + "learning_rate": 6.469667472752821e-06, + "loss": 0.0005, + "step": 740 + }, + { + "epoch": 0.40558292282430214, + "grad_norm": 0.004128745291382074, + "learning_rate": 6.461447418047506e-06, + "loss": 0.0002, + "step": 741 + }, + { + "epoch": 0.4061302681992337, + "grad_norm": 0.039554912596940994, + "learning_rate": 6.453223042128556e-06, + "loss": 0.0018, + "step": 742 + }, + { + "epoch": 0.4066776135741653, + "grad_norm": 0.009743032976984978, + "learning_rate": 6.444994369313835e-06, + "loss": 0.0004, + "step": 743 + }, + { + "epoch": 0.4072249589490969, + "grad_norm": 0.0073027294129133224, + "learning_rate": 6.4367614239339185e-06, + "loss": 0.0003, + "step": 744 + }, + { + "epoch": 0.40777230432402845, + "grad_norm": 0.01663777604699135, + "learning_rate": 6.428524230332012e-06, + "loss": 0.0007, + "step": 745 + }, + { + "epoch": 0.40831964969896006, + "grad_norm": 3.9854557514190674, + "learning_rate": 6.420282812863881e-06, + "loss": 0.2017, + "step": 746 + }, + { + "epoch": 0.4088669950738916, + "grad_norm": 0.20734180510044098, + "learning_rate": 6.412037195897786e-06, + "loss": 0.011, + "step": 747 + }, + { + "epoch": 0.4094143404488232, + "grad_norm": 0.004745072685182095, + "learning_rate": 6.403787403814399e-06, + "loss": 0.0003, + "step": 748 + }, + { + "epoch": 0.4099616858237548, + "grad_norm": 2.455504894256592, + "learning_rate": 6.395533461006736e-06, + "loss": 0.3553, + "step": 749 + }, + { + "epoch": 0.41050903119868637, + "grad_norm": 0.005224741529673338, + "learning_rate": 6.387275391880091e-06, + "loss": 0.0002, + "step": 750 + }, + { + "epoch": 0.41105637657361793, + "grad_norm": 0.002598387422040105, + "learning_rate": 6.379013220851956e-06, + "loss": 0.0001, + "step": 751 + }, + { + "epoch": 0.41160372194854955, + "grad_norm": 0.00500458711758256, + "learning_rate": 6.370746972351952e-06, + "loss": 0.0002, + "step": 752 + }, + { + "epoch": 0.4121510673234811, + "grad_norm": 0.055540215224027634, + "learning_rate": 6.362476670821755e-06, + "loss": 0.0032, + "step": 753 + }, + { + "epoch": 0.4126984126984127, + "grad_norm": 0.009147963486611843, + "learning_rate": 6.354202340715027e-06, + "loss": 0.0003, + "step": 754 + }, + { + "epoch": 0.4132457580733443, + "grad_norm": 0.008779282681643963, + "learning_rate": 6.345924006497339e-06, + "loss": 0.0004, + "step": 755 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.08578510582447052, + "learning_rate": 6.337641692646106e-06, + "loss": 0.005, + "step": 756 + }, + { + "epoch": 0.4143404488232074, + "grad_norm": 2.1286394596099854, + "learning_rate": 6.329355423650504e-06, + "loss": 0.4173, + "step": 757 + }, + { + "epoch": 0.41488779419813904, + "grad_norm": 0.005920462775975466, + "learning_rate": 6.321065224011408e-06, + "loss": 0.0003, + "step": 758 + }, + { + "epoch": 0.4154351395730706, + "grad_norm": 0.0021460726857185364, + "learning_rate": 6.312771118241314e-06, + "loss": 0.0001, + "step": 759 + }, + { + "epoch": 0.41598248494800216, + "grad_norm": 0.02204807847738266, + "learning_rate": 6.3044731308642685e-06, + "loss": 0.0012, + "step": 760 + }, + { + "epoch": 0.4165298303229338, + "grad_norm": 0.010705336928367615, + "learning_rate": 6.296171286415791e-06, + "loss": 0.0005, + "step": 761 + }, + { + "epoch": 0.41707717569786534, + "grad_norm": 0.004768910817801952, + "learning_rate": 6.287865609442812e-06, + "loss": 0.0002, + "step": 762 + }, + { + "epoch": 0.41762452107279696, + "grad_norm": 2.3559985160827637, + "learning_rate": 6.2795561245035895e-06, + "loss": 0.1317, + "step": 763 + }, + { + "epoch": 0.4181718664477285, + "grad_norm": 0.27280962467193604, + "learning_rate": 6.271242856167642e-06, + "loss": 0.013, + "step": 764 + }, + { + "epoch": 0.4187192118226601, + "grad_norm": 0.030251115560531616, + "learning_rate": 6.262925829015675e-06, + "loss": 0.0017, + "step": 765 + }, + { + "epoch": 0.4192665571975917, + "grad_norm": 0.08326046913862228, + "learning_rate": 6.254605067639509e-06, + "loss": 0.002, + "step": 766 + }, + { + "epoch": 0.41981390257252327, + "grad_norm": 0.0023234374821186066, + "learning_rate": 6.246280596642004e-06, + "loss": 0.0001, + "step": 767 + }, + { + "epoch": 0.42036124794745483, + "grad_norm": 0.01413232646882534, + "learning_rate": 6.23795244063699e-06, + "loss": 0.0005, + "step": 768 + }, + { + "epoch": 0.42090859332238645, + "grad_norm": 0.00254431227222085, + "learning_rate": 6.229620624249189e-06, + "loss": 0.0001, + "step": 769 + }, + { + "epoch": 0.421455938697318, + "grad_norm": 0.11062794923782349, + "learning_rate": 6.221285172114156e-06, + "loss": 0.0069, + "step": 770 + }, + { + "epoch": 0.4220032840722496, + "grad_norm": 0.07323046773672104, + "learning_rate": 6.212946108878185e-06, + "loss": 0.0045, + "step": 771 + }, + { + "epoch": 0.4225506294471812, + "grad_norm": 0.0021918388083577156, + "learning_rate": 6.204603459198252e-06, + "loss": 0.0001, + "step": 772 + }, + { + "epoch": 0.42309797482211275, + "grad_norm": 2.624161720275879, + "learning_rate": 6.196257247741939e-06, + "loss": 0.5911, + "step": 773 + }, + { + "epoch": 0.4236453201970443, + "grad_norm": 0.00811794027686119, + "learning_rate": 6.187907499187357e-06, + "loss": 0.0004, + "step": 774 + }, + { + "epoch": 0.42419266557197594, + "grad_norm": 0.041902635246515274, + "learning_rate": 6.179554238223076e-06, + "loss": 0.0018, + "step": 775 + }, + { + "epoch": 0.4247400109469075, + "grad_norm": 0.05677172169089317, + "learning_rate": 6.171197489548051e-06, + "loss": 0.0025, + "step": 776 + }, + { + "epoch": 0.42528735632183906, + "grad_norm": 0.007656686939299107, + "learning_rate": 6.162837277871553e-06, + "loss": 0.0003, + "step": 777 + }, + { + "epoch": 0.4258347016967707, + "grad_norm": 0.21811577677726746, + "learning_rate": 6.1544736279130865e-06, + "loss": 0.0146, + "step": 778 + }, + { + "epoch": 0.42638204707170224, + "grad_norm": 0.0371868871152401, + "learning_rate": 6.146106564402329e-06, + "loss": 0.002, + "step": 779 + }, + { + "epoch": 0.4269293924466338, + "grad_norm": 0.5981050729751587, + "learning_rate": 6.1377361120790445e-06, + "loss": 0.0398, + "step": 780 + }, + { + "epoch": 0.4274767378215654, + "grad_norm": 0.005229018162935972, + "learning_rate": 6.129362295693022e-06, + "loss": 0.0002, + "step": 781 + }, + { + "epoch": 0.428024083196497, + "grad_norm": 0.00975254736840725, + "learning_rate": 6.120985140003996e-06, + "loss": 0.0005, + "step": 782 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.9124345779418945, + "learning_rate": 6.112604669781572e-06, + "loss": 0.6284, + "step": 783 + }, + { + "epoch": 0.42911877394636017, + "grad_norm": 0.0032201369758695364, + "learning_rate": 6.104220909805162e-06, + "loss": 0.0002, + "step": 784 + }, + { + "epoch": 0.42966611932129173, + "grad_norm": 0.007221321575343609, + "learning_rate": 6.095833884863897e-06, + "loss": 0.0003, + "step": 785 + }, + { + "epoch": 0.4302134646962233, + "grad_norm": 0.06140093877911568, + "learning_rate": 6.08744361975657e-06, + "loss": 0.0029, + "step": 786 + }, + { + "epoch": 0.4307608100711549, + "grad_norm": 0.006075866986066103, + "learning_rate": 6.07905013929155e-06, + "loss": 0.0003, + "step": 787 + }, + { + "epoch": 0.43130815544608647, + "grad_norm": 0.10445165634155273, + "learning_rate": 6.0706534682867125e-06, + "loss": 0.0059, + "step": 788 + }, + { + "epoch": 0.4318555008210181, + "grad_norm": 0.09483800083398819, + "learning_rate": 6.062253631569368e-06, + "loss": 0.0064, + "step": 789 + }, + { + "epoch": 0.43240284619594965, + "grad_norm": 0.22675608098506927, + "learning_rate": 6.053850653976191e-06, + "loss": 0.0166, + "step": 790 + }, + { + "epoch": 0.4329501915708812, + "grad_norm": 0.08051242679357529, + "learning_rate": 6.045444560353136e-06, + "loss": 0.0055, + "step": 791 + }, + { + "epoch": 0.43349753694581283, + "grad_norm": 0.1378559172153473, + "learning_rate": 6.037035375555376e-06, + "loss": 0.0096, + "step": 792 + }, + { + "epoch": 0.4340448823207444, + "grad_norm": 0.004898442886769772, + "learning_rate": 6.028623124447224e-06, + "loss": 0.0002, + "step": 793 + }, + { + "epoch": 0.43459222769567596, + "grad_norm": 0.07252166420221329, + "learning_rate": 6.020207831902056e-06, + "loss": 0.0041, + "step": 794 + }, + { + "epoch": 0.4351395730706076, + "grad_norm": 0.011364106088876724, + "learning_rate": 6.011789522802242e-06, + "loss": 0.0005, + "step": 795 + }, + { + "epoch": 0.43568691844553914, + "grad_norm": 0.03275460749864578, + "learning_rate": 6.003368222039078e-06, + "loss": 0.0016, + "step": 796 + }, + { + "epoch": 0.4362342638204707, + "grad_norm": 0.6348550319671631, + "learning_rate": 5.994943954512694e-06, + "loss": 0.0458, + "step": 797 + }, + { + "epoch": 0.4367816091954023, + "grad_norm": 1.7934825420379639, + "learning_rate": 5.986516745132e-06, + "loss": 0.2173, + "step": 798 + }, + { + "epoch": 0.4373289545703339, + "grad_norm": 0.004570923279970884, + "learning_rate": 5.978086618814606e-06, + "loss": 0.0002, + "step": 799 + }, + { + "epoch": 0.43787629994526545, + "grad_norm": 0.013217059895396233, + "learning_rate": 5.96965360048674e-06, + "loss": 0.0006, + "step": 800 + }, + { + "epoch": 0.43842364532019706, + "grad_norm": 0.8358885049819946, + "learning_rate": 5.961217715083185e-06, + "loss": 0.0486, + "step": 801 + }, + { + "epoch": 0.4389709906951286, + "grad_norm": 0.08642356842756271, + "learning_rate": 5.952778987547203e-06, + "loss": 0.0054, + "step": 802 + }, + { + "epoch": 0.4395183360700602, + "grad_norm": 0.09967074543237686, + "learning_rate": 5.944337442830457e-06, + "loss": 0.0039, + "step": 803 + }, + { + "epoch": 0.4400656814449918, + "grad_norm": 2.0377254486083984, + "learning_rate": 5.935893105892938e-06, + "loss": 0.2651, + "step": 804 + }, + { + "epoch": 0.44061302681992337, + "grad_norm": 0.0057347621768713, + "learning_rate": 5.927446001702899e-06, + "loss": 0.0003, + "step": 805 + }, + { + "epoch": 0.44116037219485493, + "grad_norm": 0.003230379894375801, + "learning_rate": 5.918996155236771e-06, + "loss": 0.0002, + "step": 806 + }, + { + "epoch": 0.44170771756978655, + "grad_norm": 2.145993709564209, + "learning_rate": 5.9105435914790935e-06, + "loss": 0.4657, + "step": 807 + }, + { + "epoch": 0.4422550629447181, + "grad_norm": 0.038496311753988266, + "learning_rate": 5.902088335422442e-06, + "loss": 0.0021, + "step": 808 + }, + { + "epoch": 0.4428024083196497, + "grad_norm": 0.006777400616556406, + "learning_rate": 5.893630412067351e-06, + "loss": 0.0003, + "step": 809 + }, + { + "epoch": 0.4433497536945813, + "grad_norm": 0.04935964569449425, + "learning_rate": 5.885169846422242e-06, + "loss": 0.0018, + "step": 810 + }, + { + "epoch": 0.44389709906951286, + "grad_norm": 0.011123371310532093, + "learning_rate": 5.876706663503352e-06, + "loss": 0.0005, + "step": 811 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.03125881776213646, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.0015, + "step": 812 + }, + { + "epoch": 0.44499178981937604, + "grad_norm": 0.004140671342611313, + "learning_rate": 5.859772545947782e-06, + "loss": 0.0002, + "step": 813 + }, + { + "epoch": 0.4455391351943076, + "grad_norm": 0.002423380734398961, + "learning_rate": 5.85130166138197e-06, + "loss": 0.0001, + "step": 814 + }, + { + "epoch": 0.44608648056923916, + "grad_norm": 0.004158463794738054, + "learning_rate": 5.8428282596839625e-06, + "loss": 0.0002, + "step": 815 + }, + { + "epoch": 0.4466338259441708, + "grad_norm": 0.08258446305990219, + "learning_rate": 5.834352365907946e-06, + "loss": 0.003, + "step": 816 + }, + { + "epoch": 0.44718117131910234, + "grad_norm": 0.004286042880266905, + "learning_rate": 5.82587400511548e-06, + "loss": 0.0002, + "step": 817 + }, + { + "epoch": 0.44772851669403396, + "grad_norm": 0.006518741603940725, + "learning_rate": 5.817393202375416e-06, + "loss": 0.0003, + "step": 818 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 0.33282148838043213, + "learning_rate": 5.808909982763825e-06, + "loss": 0.0305, + "step": 819 + }, + { + "epoch": 0.4488232074438971, + "grad_norm": 0.04650212079286575, + "learning_rate": 5.800424371363924e-06, + "loss": 0.0022, + "step": 820 + }, + { + "epoch": 0.4493705528188287, + "grad_norm": 0.0025998507626354694, + "learning_rate": 5.791936393266004e-06, + "loss": 0.0002, + "step": 821 + }, + { + "epoch": 0.44991789819376027, + "grad_norm": 0.4078565239906311, + "learning_rate": 5.783446073567353e-06, + "loss": 0.0282, + "step": 822 + }, + { + "epoch": 0.45046524356869183, + "grad_norm": 0.03496750444173813, + "learning_rate": 5.774953437372181e-06, + "loss": 0.0011, + "step": 823 + }, + { + "epoch": 0.45101258894362345, + "grad_norm": 0.15228895843029022, + "learning_rate": 5.766458509791553e-06, + "loss": 0.0082, + "step": 824 + }, + { + "epoch": 0.451559934318555, + "grad_norm": 0.026087448000907898, + "learning_rate": 5.757961315943303e-06, + "loss": 0.0013, + "step": 825 + }, + { + "epoch": 0.4521072796934866, + "grad_norm": 0.1130935400724411, + "learning_rate": 5.749461880951966e-06, + "loss": 0.0067, + "step": 826 + }, + { + "epoch": 0.4526546250684182, + "grad_norm": 0.06168792396783829, + "learning_rate": 5.7409602299487085e-06, + "loss": 0.0035, + "step": 827 + }, + { + "epoch": 0.45320197044334976, + "grad_norm": 2.115023612976074, + "learning_rate": 5.732456388071247e-06, + "loss": 0.4833, + "step": 828 + }, + { + "epoch": 0.4537493158182813, + "grad_norm": 0.0042996820993721485, + "learning_rate": 5.723950380463774e-06, + "loss": 0.0002, + "step": 829 + }, + { + "epoch": 0.45429666119321294, + "grad_norm": 0.003995430190116167, + "learning_rate": 5.715442232276887e-06, + "loss": 0.0002, + "step": 830 + }, + { + "epoch": 0.4548440065681445, + "grad_norm": 0.006226207595318556, + "learning_rate": 5.706931968667514e-06, + "loss": 0.0003, + "step": 831 + }, + { + "epoch": 0.45539135194307606, + "grad_norm": 0.2677818238735199, + "learning_rate": 5.6984196147988365e-06, + "loss": 0.017, + "step": 832 + }, + { + "epoch": 0.4559386973180077, + "grad_norm": 0.025894341990351677, + "learning_rate": 5.689905195840216e-06, + "loss": 0.001, + "step": 833 + }, + { + "epoch": 0.45648604269293924, + "grad_norm": 0.11859169602394104, + "learning_rate": 5.681388736967124e-06, + "loss": 0.007, + "step": 834 + }, + { + "epoch": 0.4570333880678708, + "grad_norm": 0.002087466651573777, + "learning_rate": 5.672870263361057e-06, + "loss": 0.0001, + "step": 835 + }, + { + "epoch": 0.4575807334428024, + "grad_norm": 1.965535283088684, + "learning_rate": 5.6643498002094725e-06, + "loss": 0.1036, + "step": 836 + }, + { + "epoch": 0.458128078817734, + "grad_norm": 0.26401904225349426, + "learning_rate": 5.655827372705712e-06, + "loss": 0.0155, + "step": 837 + }, + { + "epoch": 0.45867542419266555, + "grad_norm": 1.1218135356903076, + "learning_rate": 5.647303006048924e-06, + "loss": 0.0511, + "step": 838 + }, + { + "epoch": 0.45922276956759717, + "grad_norm": 0.003350113518536091, + "learning_rate": 5.638776725443989e-06, + "loss": 0.0002, + "step": 839 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.15477736294269562, + "learning_rate": 5.630248556101448e-06, + "loss": 0.0096, + "step": 840 + }, + { + "epoch": 0.4603174603174603, + "grad_norm": 1.8222780227661133, + "learning_rate": 5.621718523237427e-06, + "loss": 0.1304, + "step": 841 + }, + { + "epoch": 0.4608648056923919, + "grad_norm": 0.47362422943115234, + "learning_rate": 5.613186652073561e-06, + "loss": 0.0269, + "step": 842 + }, + { + "epoch": 0.4614121510673235, + "grad_norm": 0.0013261528220027685, + "learning_rate": 5.604652967836922e-06, + "loss": 0.0001, + "step": 843 + }, + { + "epoch": 0.46195949644225504, + "grad_norm": 2.21683669090271, + "learning_rate": 5.596117495759943e-06, + "loss": 0.4404, + "step": 844 + }, + { + "epoch": 0.46250684181718665, + "grad_norm": 0.5288764834403992, + "learning_rate": 5.58758026108034e-06, + "loss": 0.0157, + "step": 845 + }, + { + "epoch": 0.4630541871921182, + "grad_norm": 0.0019743768498301506, + "learning_rate": 5.579041289041045e-06, + "loss": 0.0001, + "step": 846 + }, + { + "epoch": 0.46360153256704983, + "grad_norm": 0.005323327146470547, + "learning_rate": 5.570500604890124e-06, + "loss": 0.0002, + "step": 847 + }, + { + "epoch": 0.4641488779419814, + "grad_norm": 5.332734107971191, + "learning_rate": 5.561958233880707e-06, + "loss": 0.1358, + "step": 848 + }, + { + "epoch": 0.46469622331691296, + "grad_norm": 0.08172398060560226, + "learning_rate": 5.55341420127091e-06, + "loss": 0.0038, + "step": 849 + }, + { + "epoch": 0.4652435686918446, + "grad_norm": 0.009597660973668098, + "learning_rate": 5.544868532323766e-06, + "loss": 0.0004, + "step": 850 + }, + { + "epoch": 0.46579091406677614, + "grad_norm": 0.0016919386107474566, + "learning_rate": 5.536321252307141e-06, + "loss": 0.0001, + "step": 851 + }, + { + "epoch": 0.4663382594417077, + "grad_norm": 0.0031833848915994167, + "learning_rate": 5.527772386493667e-06, + "loss": 0.0002, + "step": 852 + }, + { + "epoch": 0.4668856048166393, + "grad_norm": 0.0018079314613714814, + "learning_rate": 5.519221960160666e-06, + "loss": 0.0001, + "step": 853 + }, + { + "epoch": 0.4674329501915709, + "grad_norm": 0.7406489253044128, + "learning_rate": 5.510669998590074e-06, + "loss": 0.0509, + "step": 854 + }, + { + "epoch": 0.46798029556650245, + "grad_norm": 0.05459873005747795, + "learning_rate": 5.502116527068363e-06, + "loss": 0.0019, + "step": 855 + }, + { + "epoch": 0.46852764094143406, + "grad_norm": 0.00270917359739542, + "learning_rate": 5.493561570886473e-06, + "loss": 0.0001, + "step": 856 + }, + { + "epoch": 0.4690749863163656, + "grad_norm": 4.040920734405518, + "learning_rate": 5.485005155339736e-06, + "loss": 0.1073, + "step": 857 + }, + { + "epoch": 0.4696223316912972, + "grad_norm": 0.7534052133560181, + "learning_rate": 5.4764473057277925e-06, + "loss": 0.0456, + "step": 858 + }, + { + "epoch": 0.4701696770662288, + "grad_norm": 0.030180972069501877, + "learning_rate": 5.467888047354528e-06, + "loss": 0.0013, + "step": 859 + }, + { + "epoch": 0.47071702244116037, + "grad_norm": 0.021435558795928955, + "learning_rate": 5.4593274055279935e-06, + "loss": 0.0008, + "step": 860 + }, + { + "epoch": 0.47126436781609193, + "grad_norm": 0.0014348177937790751, + "learning_rate": 5.450765405560328e-06, + "loss": 0.0001, + "step": 861 + }, + { + "epoch": 0.47181171319102355, + "grad_norm": 0.0019303731387481093, + "learning_rate": 5.442202072767686e-06, + "loss": 0.0001, + "step": 862 + }, + { + "epoch": 0.4723590585659551, + "grad_norm": 3.485708236694336, + "learning_rate": 5.433637432470169e-06, + "loss": 0.5884, + "step": 863 + }, + { + "epoch": 0.4729064039408867, + "grad_norm": 0.06138644367456436, + "learning_rate": 5.425071509991737e-06, + "loss": 0.0042, + "step": 864 + }, + { + "epoch": 0.4734537493158183, + "grad_norm": 0.009944554418325424, + "learning_rate": 5.4165043306601436e-06, + "loss": 0.0004, + "step": 865 + }, + { + "epoch": 0.47400109469074986, + "grad_norm": 0.0024830952752381563, + "learning_rate": 5.407935919806862e-06, + "loss": 0.0001, + "step": 866 + }, + { + "epoch": 0.4745484400656814, + "grad_norm": 0.001542185782454908, + "learning_rate": 5.399366302767003e-06, + "loss": 0.0001, + "step": 867 + }, + { + "epoch": 0.47509578544061304, + "grad_norm": 1.6952769756317139, + "learning_rate": 5.390795504879243e-06, + "loss": 0.1442, + "step": 868 + }, + { + "epoch": 0.4756431308155446, + "grad_norm": 0.005125279538333416, + "learning_rate": 5.382223551485754e-06, + "loss": 0.0003, + "step": 869 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.0007472229772247374, + "learning_rate": 5.373650467932122e-06, + "loss": 0.0001, + "step": 870 + }, + { + "epoch": 0.4767378215654078, + "grad_norm": 1.5565142631530762, + "learning_rate": 5.3650762795672755e-06, + "loss": 0.0455, + "step": 871 + }, + { + "epoch": 0.47728516694033934, + "grad_norm": 1.660844087600708, + "learning_rate": 5.356501011743408e-06, + "loss": 0.0606, + "step": 872 + }, + { + "epoch": 0.47783251231527096, + "grad_norm": 0.0034840109292417765, + "learning_rate": 5.347924689815906e-06, + "loss": 0.0002, + "step": 873 + }, + { + "epoch": 0.4783798576902025, + "grad_norm": 0.9910545945167542, + "learning_rate": 5.3393473391432745e-06, + "loss": 0.0963, + "step": 874 + }, + { + "epoch": 0.4789272030651341, + "grad_norm": 1.6280092000961304, + "learning_rate": 5.330768985087059e-06, + "loss": 0.1113, + "step": 875 + }, + { + "epoch": 0.4794745484400657, + "grad_norm": 0.340638130903244, + "learning_rate": 5.32218965301177e-06, + "loss": 0.0135, + "step": 876 + }, + { + "epoch": 0.48002189381499727, + "grad_norm": 0.04405590519309044, + "learning_rate": 5.313609368284813e-06, + "loss": 0.0019, + "step": 877 + }, + { + "epoch": 0.48056923918992883, + "grad_norm": 0.05662679672241211, + "learning_rate": 5.305028156276405e-06, + "loss": 0.0024, + "step": 878 + }, + { + "epoch": 0.48111658456486045, + "grad_norm": 0.001325729419477284, + "learning_rate": 5.296446042359512e-06, + "loss": 0.0001, + "step": 879 + }, + { + "epoch": 0.481663929939792, + "grad_norm": 1.2463085651397705, + "learning_rate": 5.2878630519097605e-06, + "loss": 0.0509, + "step": 880 + }, + { + "epoch": 0.4822112753147236, + "grad_norm": 0.0031781319994479418, + "learning_rate": 5.279279210305373e-06, + "loss": 0.0002, + "step": 881 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 0.024987971410155296, + "learning_rate": 5.270694542927089e-06, + "loss": 0.0012, + "step": 882 + }, + { + "epoch": 0.48330596606458676, + "grad_norm": 0.005412014201283455, + "learning_rate": 5.262109075158084e-06, + "loss": 0.0003, + "step": 883 + }, + { + "epoch": 0.4838533114395183, + "grad_norm": 0.05908210948109627, + "learning_rate": 5.2535228323839046e-06, + "loss": 0.0035, + "step": 884 + }, + { + "epoch": 0.48440065681444994, + "grad_norm": 0.04235808923840523, + "learning_rate": 5.2449358399923885e-06, + "loss": 0.0017, + "step": 885 + }, + { + "epoch": 0.4849480021893815, + "grad_norm": 0.00043109082616865635, + "learning_rate": 5.236348123373593e-06, + "loss": 0.0001, + "step": 886 + }, + { + "epoch": 0.48549534756431306, + "grad_norm": 0.5738411545753479, + "learning_rate": 5.227759707919707e-06, + "loss": 0.0308, + "step": 887 + }, + { + "epoch": 0.4860426929392447, + "grad_norm": 0.04854976013302803, + "learning_rate": 5.219170619024996e-06, + "loss": 0.0028, + "step": 888 + }, + { + "epoch": 0.48659003831417624, + "grad_norm": 0.0018312755273655057, + "learning_rate": 5.2105808820857126e-06, + "loss": 0.0001, + "step": 889 + }, + { + "epoch": 0.4871373836891078, + "grad_norm": 1.7619683742523193, + "learning_rate": 5.201990522500027e-06, + "loss": 0.4159, + "step": 890 + }, + { + "epoch": 0.4876847290640394, + "grad_norm": 0.019682124257087708, + "learning_rate": 5.193399565667945e-06, + "loss": 0.0008, + "step": 891 + }, + { + "epoch": 0.488232074438971, + "grad_norm": 0.0009164040675386786, + "learning_rate": 5.184808036991246e-06, + "loss": 0.0001, + "step": 892 + }, + { + "epoch": 0.48877941981390255, + "grad_norm": 0.0016248204046860337, + "learning_rate": 5.1762159618733954e-06, + "loss": 0.0001, + "step": 893 + }, + { + "epoch": 0.48932676518883417, + "grad_norm": 0.015387938357889652, + "learning_rate": 5.167623365719474e-06, + "loss": 0.0004, + "step": 894 + }, + { + "epoch": 0.48987411056376573, + "grad_norm": 0.0004146917490288615, + "learning_rate": 5.1590302739361096e-06, + "loss": 0.0, + "step": 895 + }, + { + "epoch": 0.4904214559386973, + "grad_norm": 0.0017720222240313888, + "learning_rate": 5.150436711931387e-06, + "loss": 0.0001, + "step": 896 + }, + { + "epoch": 0.4909688013136289, + "grad_norm": 0.0020141142886132, + "learning_rate": 5.1418427051147855e-06, + "loss": 0.0001, + "step": 897 + }, + { + "epoch": 0.4915161466885605, + "grad_norm": 0.0037431721575558186, + "learning_rate": 5.1332482788971005e-06, + "loss": 0.0002, + "step": 898 + }, + { + "epoch": 0.49206349206349204, + "grad_norm": 0.05117342248558998, + "learning_rate": 5.1246534586903655e-06, + "loss": 0.0015, + "step": 899 + }, + { + "epoch": 0.49261083743842365, + "grad_norm": 0.006282655987888575, + "learning_rate": 5.116058269907779e-06, + "loss": 0.0003, + "step": 900 + }, + { + "epoch": 0.4931581828133552, + "grad_norm": 0.001593268709257245, + "learning_rate": 5.107462737963631e-06, + "loss": 0.0001, + "step": 901 + }, + { + "epoch": 0.49370552818828684, + "grad_norm": 0.0004761675954796374, + "learning_rate": 5.098866888273224e-06, + "loss": 0.0001, + "step": 902 + }, + { + "epoch": 0.4942528735632184, + "grad_norm": 0.0016890882980078459, + "learning_rate": 5.090270746252803e-06, + "loss": 0.0001, + "step": 903 + }, + { + "epoch": 0.49480021893814996, + "grad_norm": 0.04701818525791168, + "learning_rate": 5.081674337319473e-06, + "loss": 0.0024, + "step": 904 + }, + { + "epoch": 0.4953475643130816, + "grad_norm": 1.8550165891647339, + "learning_rate": 5.073077686891132e-06, + "loss": 0.5051, + "step": 905 + }, + { + "epoch": 0.49589490968801314, + "grad_norm": 2.175128936767578, + "learning_rate": 5.0644808203863926e-06, + "loss": 0.448, + "step": 906 + }, + { + "epoch": 0.4964422550629447, + "grad_norm": 0.008438384160399437, + "learning_rate": 5.055883763224502e-06, + "loss": 0.0005, + "step": 907 + }, + { + "epoch": 0.4969896004378763, + "grad_norm": 0.0014406866393983364, + "learning_rate": 5.047286540825273e-06, + "loss": 0.0001, + "step": 908 + }, + { + "epoch": 0.4975369458128079, + "grad_norm": 0.02875029854476452, + "learning_rate": 5.038689178609011e-06, + "loss": 0.0009, + "step": 909 + }, + { + "epoch": 0.49808429118773945, + "grad_norm": 0.0011718091554939747, + "learning_rate": 5.030091701996428e-06, + "loss": 0.0001, + "step": 910 + }, + { + "epoch": 0.49863163656267107, + "grad_norm": 0.037694063037633896, + "learning_rate": 5.021494136408578e-06, + "loss": 0.0019, + "step": 911 + }, + { + "epoch": 0.49917898193760263, + "grad_norm": 0.0005312002613209188, + "learning_rate": 5.012896507266779e-06, + "loss": 0.0, + "step": 912 + }, + { + "epoch": 0.4997263273125342, + "grad_norm": 3.6381168365478516, + "learning_rate": 5.0042988399925365e-06, + "loss": 1.3302, + "step": 913 + }, + { + "epoch": 0.5002736726874658, + "grad_norm": 0.2435426563024521, + "learning_rate": 4.995701160007466e-06, + "loss": 0.0075, + "step": 914 + }, + { + "epoch": 0.5008210180623974, + "grad_norm": 0.001816119416616857, + "learning_rate": 4.987103492733221e-06, + "loss": 0.0001, + "step": 915 + }, + { + "epoch": 0.5013683634373289, + "grad_norm": 0.0019303924636915326, + "learning_rate": 4.9785058635914234e-06, + "loss": 0.0001, + "step": 916 + }, + { + "epoch": 0.5019157088122606, + "grad_norm": 0.16450561583042145, + "learning_rate": 4.9699082980035735e-06, + "loss": 0.0094, + "step": 917 + }, + { + "epoch": 0.5024630541871922, + "grad_norm": 0.005837967619299889, + "learning_rate": 4.96131082139099e-06, + "loss": 0.0003, + "step": 918 + }, + { + "epoch": 0.5030103995621237, + "grad_norm": 0.2567005157470703, + "learning_rate": 4.952713459174728e-06, + "loss": 0.01, + "step": 919 + }, + { + "epoch": 0.5035577449370553, + "grad_norm": 0.0013026647502556443, + "learning_rate": 4.944116236775499e-06, + "loss": 0.0001, + "step": 920 + }, + { + "epoch": 0.5041050903119869, + "grad_norm": 0.0013909480767324567, + "learning_rate": 4.935519179613607e-06, + "loss": 0.0001, + "step": 921 + }, + { + "epoch": 0.5046524356869184, + "grad_norm": 0.044553812593221664, + "learning_rate": 4.9269223131088685e-06, + "loss": 0.0019, + "step": 922 + }, + { + "epoch": 0.50519978106185, + "grad_norm": 0.034393515437841415, + "learning_rate": 4.9183256626805276e-06, + "loss": 0.0018, + "step": 923 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 0.04988914355635643, + "learning_rate": 4.909729253747197e-06, + "loss": 0.0024, + "step": 924 + }, + { + "epoch": 0.5062944718117132, + "grad_norm": 0.30242329835891724, + "learning_rate": 4.901133111726777e-06, + "loss": 0.0205, + "step": 925 + }, + { + "epoch": 0.5068418171866448, + "grad_norm": 2.0898563861846924, + "learning_rate": 4.892537262036371e-06, + "loss": 0.1189, + "step": 926 + }, + { + "epoch": 0.5073891625615764, + "grad_norm": 0.27229753136634827, + "learning_rate": 4.883941730092222e-06, + "loss": 0.0146, + "step": 927 + }, + { + "epoch": 0.5079365079365079, + "grad_norm": 0.033172935247421265, + "learning_rate": 4.875346541309637e-06, + "loss": 0.0017, + "step": 928 + }, + { + "epoch": 0.5084838533114395, + "grad_norm": 3.3485257625579834, + "learning_rate": 4.866751721102901e-06, + "loss": 0.7556, + "step": 929 + }, + { + "epoch": 0.5090311986863711, + "grad_norm": 0.032600533217191696, + "learning_rate": 4.858157294885215e-06, + "loss": 0.0015, + "step": 930 + }, + { + "epoch": 0.5095785440613027, + "grad_norm": 0.017700565978884697, + "learning_rate": 4.8495632880686155e-06, + "loss": 0.0009, + "step": 931 + }, + { + "epoch": 0.5101258894362343, + "grad_norm": 0.0006235113833099604, + "learning_rate": 4.840969726063892e-06, + "loss": 0.0001, + "step": 932 + }, + { + "epoch": 0.5106732348111659, + "grad_norm": 0.005358157679438591, + "learning_rate": 4.832376634280526e-06, + "loss": 0.0003, + "step": 933 + }, + { + "epoch": 0.5112205801860974, + "grad_norm": 0.0027646853122860193, + "learning_rate": 4.823784038126608e-06, + "loss": 0.0002, + "step": 934 + }, + { + "epoch": 0.511767925561029, + "grad_norm": 0.012461655773222446, + "learning_rate": 4.8151919630087565e-06, + "loss": 0.0005, + "step": 935 + }, + { + "epoch": 0.5123152709359606, + "grad_norm": 0.0028007435612380505, + "learning_rate": 4.806600434332056e-06, + "loss": 0.0001, + "step": 936 + }, + { + "epoch": 0.5128626163108921, + "grad_norm": 0.02188277803361416, + "learning_rate": 4.7980094774999765e-06, + "loss": 0.0009, + "step": 937 + }, + { + "epoch": 0.5134099616858238, + "grad_norm": 0.003332045627757907, + "learning_rate": 4.789419117914288e-06, + "loss": 0.0002, + "step": 938 + }, + { + "epoch": 0.5139573070607554, + "grad_norm": 0.0016168846050277352, + "learning_rate": 4.780829380975004e-06, + "loss": 0.0001, + "step": 939 + }, + { + "epoch": 0.5145046524356869, + "grad_norm": 0.0034641437232494354, + "learning_rate": 4.772240292080295e-06, + "loss": 0.0001, + "step": 940 + }, + { + "epoch": 0.5150519978106185, + "grad_norm": 0.043745800852775574, + "learning_rate": 4.76365187662641e-06, + "loss": 0.0026, + "step": 941 + }, + { + "epoch": 0.5155993431855501, + "grad_norm": 0.00854698196053505, + "learning_rate": 4.755064160007612e-06, + "loss": 0.0005, + "step": 942 + }, + { + "epoch": 0.5161466885604816, + "grad_norm": 0.0028649435844272375, + "learning_rate": 4.746477167616098e-06, + "loss": 0.0001, + "step": 943 + }, + { + "epoch": 0.5166940339354132, + "grad_norm": 2.511202812194824, + "learning_rate": 4.737890924841918e-06, + "loss": 0.4582, + "step": 944 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 0.02500765584409237, + "learning_rate": 4.729305457072913e-06, + "loss": 0.0009, + "step": 945 + }, + { + "epoch": 0.5177887246852764, + "grad_norm": 0.021063437685370445, + "learning_rate": 4.7207207896946275e-06, + "loss": 0.001, + "step": 946 + }, + { + "epoch": 0.518336070060208, + "grad_norm": 0.004225156735628843, + "learning_rate": 4.712136948090241e-06, + "loss": 0.0002, + "step": 947 + }, + { + "epoch": 0.5188834154351396, + "grad_norm": 0.004292478784918785, + "learning_rate": 4.70355395764049e-06, + "loss": 0.0002, + "step": 948 + }, + { + "epoch": 0.5194307608100711, + "grad_norm": 0.003189288079738617, + "learning_rate": 4.694971843723596e-06, + "loss": 0.0002, + "step": 949 + }, + { + "epoch": 0.5199781061850027, + "grad_norm": 0.19809499382972717, + "learning_rate": 4.68639063171519e-06, + "loss": 0.0099, + "step": 950 + }, + { + "epoch": 0.5205254515599343, + "grad_norm": 0.07779690623283386, + "learning_rate": 4.677810346988231e-06, + "loss": 0.0042, + "step": 951 + }, + { + "epoch": 0.5210727969348659, + "grad_norm": 0.0021239686757326126, + "learning_rate": 4.6692310149129425e-06, + "loss": 0.0001, + "step": 952 + }, + { + "epoch": 0.5216201423097975, + "grad_norm": 0.0036316164769232273, + "learning_rate": 4.660652660856726e-06, + "loss": 0.0002, + "step": 953 + }, + { + "epoch": 0.5221674876847291, + "grad_norm": 0.06568455696105957, + "learning_rate": 4.6520753101840945e-06, + "loss": 0.0039, + "step": 954 + }, + { + "epoch": 0.5227148330596606, + "grad_norm": 0.0028860997408628464, + "learning_rate": 4.643498988256595e-06, + "loss": 0.0001, + "step": 955 + }, + { + "epoch": 0.5232621784345922, + "grad_norm": 1.3472000360488892, + "learning_rate": 4.634923720432727e-06, + "loss": 0.0462, + "step": 956 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 0.023126907646656036, + "learning_rate": 4.626349532067879e-06, + "loss": 0.0012, + "step": 957 + }, + { + "epoch": 0.5243568691844553, + "grad_norm": 2.742182970046997, + "learning_rate": 4.617776448514248e-06, + "loss": 0.3063, + "step": 958 + }, + { + "epoch": 0.524904214559387, + "grad_norm": 0.01860973611474037, + "learning_rate": 4.609204495120759e-06, + "loss": 0.0009, + "step": 959 + }, + { + "epoch": 0.5254515599343186, + "grad_norm": 0.00370593904517591, + "learning_rate": 4.600633697232999e-06, + "loss": 0.0002, + "step": 960 + }, + { + "epoch": 0.5259989053092501, + "grad_norm": 0.002041914500296116, + "learning_rate": 4.59206408019314e-06, + "loss": 0.0001, + "step": 961 + }, + { + "epoch": 0.5265462506841817, + "grad_norm": 3.0877633094787598, + "learning_rate": 4.583495669339857e-06, + "loss": 0.2035, + "step": 962 + }, + { + "epoch": 0.5270935960591133, + "grad_norm": 2.1611199378967285, + "learning_rate": 4.574928490008264e-06, + "loss": 0.4401, + "step": 963 + }, + { + "epoch": 0.5276409414340448, + "grad_norm": 0.003639899892732501, + "learning_rate": 4.566362567529834e-06, + "loss": 0.0002, + "step": 964 + }, + { + "epoch": 0.5281882868089764, + "grad_norm": 0.005092285107821226, + "learning_rate": 4.557797927232315e-06, + "loss": 0.0002, + "step": 965 + }, + { + "epoch": 0.5287356321839081, + "grad_norm": 0.007063568569719791, + "learning_rate": 4.549234594439674e-06, + "loss": 0.0003, + "step": 966 + }, + { + "epoch": 0.5292829775588396, + "grad_norm": 0.029361741617321968, + "learning_rate": 4.54067259447201e-06, + "loss": 0.0014, + "step": 967 + }, + { + "epoch": 0.5298303229337712, + "grad_norm": 0.0071528819389641285, + "learning_rate": 4.532111952645474e-06, + "loss": 0.0003, + "step": 968 + }, + { + "epoch": 0.5303776683087028, + "grad_norm": 0.0060792481526732445, + "learning_rate": 4.523552694272208e-06, + "loss": 0.0003, + "step": 969 + }, + { + "epoch": 0.5309250136836344, + "grad_norm": 0.03650781884789467, + "learning_rate": 4.514994844660265e-06, + "loss": 0.0023, + "step": 970 + }, + { + "epoch": 0.5314723590585659, + "grad_norm": 0.018084436655044556, + "learning_rate": 4.506438429113528e-06, + "loss": 0.0009, + "step": 971 + }, + { + "epoch": 0.5320197044334976, + "grad_norm": 0.007830976508557796, + "learning_rate": 4.497883472931639e-06, + "loss": 0.0003, + "step": 972 + }, + { + "epoch": 0.5325670498084292, + "grad_norm": 2.4423887729644775, + "learning_rate": 4.489330001409929e-06, + "loss": 0.268, + "step": 973 + }, + { + "epoch": 0.5331143951833607, + "grad_norm": 0.015997666865587234, + "learning_rate": 4.480778039839336e-06, + "loss": 0.0007, + "step": 974 + }, + { + "epoch": 0.5336617405582923, + "grad_norm": 0.00490145618095994, + "learning_rate": 4.472227613506334e-06, + "loss": 0.0003, + "step": 975 + }, + { + "epoch": 0.5342090859332239, + "grad_norm": 0.03548009321093559, + "learning_rate": 4.4636787476928605e-06, + "loss": 0.0013, + "step": 976 + }, + { + "epoch": 0.5347564313081554, + "grad_norm": 0.0016715583624318242, + "learning_rate": 4.455131467676235e-06, + "loss": 0.0001, + "step": 977 + }, + { + "epoch": 0.535303776683087, + "grad_norm": 3.094775915145874, + "learning_rate": 4.446585798729091e-06, + "loss": 0.1449, + "step": 978 + }, + { + "epoch": 0.5358511220580187, + "grad_norm": 0.08493595570325851, + "learning_rate": 4.438041766119293e-06, + "loss": 0.0045, + "step": 979 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.6037715673446655, + "learning_rate": 4.429499395109877e-06, + "loss": 0.0485, + "step": 980 + }, + { + "epoch": 0.5369458128078818, + "grad_norm": 1.0072492361068726, + "learning_rate": 4.4209587109589565e-06, + "loss": 0.1076, + "step": 981 + }, + { + "epoch": 0.5374931581828134, + "grad_norm": 0.00744901318103075, + "learning_rate": 4.412419738919661e-06, + "loss": 0.0003, + "step": 982 + }, + { + "epoch": 0.5380405035577449, + "grad_norm": 0.002214734675362706, + "learning_rate": 4.40388250424006e-06, + "loss": 0.0001, + "step": 983 + }, + { + "epoch": 0.5385878489326765, + "grad_norm": 0.004368369467556477, + "learning_rate": 4.395347032163079e-06, + "loss": 0.0003, + "step": 984 + }, + { + "epoch": 0.5391351943076081, + "grad_norm": 2.0607409477233887, + "learning_rate": 4.38681334792644e-06, + "loss": 0.394, + "step": 985 + }, + { + "epoch": 0.5396825396825397, + "grad_norm": 5.10407829284668, + "learning_rate": 4.3782814767625755e-06, + "loss": 0.2882, + "step": 986 + }, + { + "epoch": 0.5402298850574713, + "grad_norm": 0.00618614861741662, + "learning_rate": 4.369751443898554e-06, + "loss": 0.0003, + "step": 987 + }, + { + "epoch": 0.5407772304324029, + "grad_norm": 3.2158572673797607, + "learning_rate": 4.361223274556012e-06, + "loss": 0.6837, + "step": 988 + }, + { + "epoch": 0.5413245758073344, + "grad_norm": 0.01705230213701725, + "learning_rate": 4.3526969939510785e-06, + "loss": 0.0007, + "step": 989 + }, + { + "epoch": 0.541871921182266, + "grad_norm": 0.0015571071999147534, + "learning_rate": 4.3441726272942895e-06, + "loss": 0.0001, + "step": 990 + }, + { + "epoch": 0.5424192665571976, + "grad_norm": 0.010929671116173267, + "learning_rate": 4.335650199790528e-06, + "loss": 0.0004, + "step": 991 + }, + { + "epoch": 0.5429666119321291, + "grad_norm": 2.3123574256896973, + "learning_rate": 4.327129736638946e-06, + "loss": 0.1457, + "step": 992 + }, + { + "epoch": 0.5435139573070608, + "grad_norm": 2.3527815341949463, + "learning_rate": 4.318611263032878e-06, + "loss": 0.4302, + "step": 993 + }, + { + "epoch": 0.5440613026819924, + "grad_norm": 0.42728134989738464, + "learning_rate": 4.310094804159784e-06, + "loss": 0.0214, + "step": 994 + }, + { + "epoch": 0.5446086480569239, + "grad_norm": 0.01541733555495739, + "learning_rate": 4.301580385201166e-06, + "loss": 0.0008, + "step": 995 + }, + { + "epoch": 0.5451559934318555, + "grad_norm": 0.02181682363152504, + "learning_rate": 4.293068031332488e-06, + "loss": 0.0011, + "step": 996 + }, + { + "epoch": 0.5457033388067871, + "grad_norm": 0.0581437423825264, + "learning_rate": 4.284557767723114e-06, + "loss": 0.0028, + "step": 997 + }, + { + "epoch": 0.5462506841817186, + "grad_norm": 0.013395448215305805, + "learning_rate": 4.2760496195362285e-06, + "loss": 0.0006, + "step": 998 + }, + { + "epoch": 0.5467980295566502, + "grad_norm": 1.165745735168457, + "learning_rate": 4.267543611928755e-06, + "loss": 0.0997, + "step": 999 + }, + { + "epoch": 0.5473453749315819, + "grad_norm": 0.03456006571650505, + "learning_rate": 4.259039770051292e-06, + "loss": 0.0021, + "step": 1000 + }, + { + "epoch": 0.5478927203065134, + "grad_norm": 0.2218150794506073, + "learning_rate": 4.250538119048036e-06, + "loss": 0.018, + "step": 1001 + }, + { + "epoch": 0.548440065681445, + "grad_norm": 0.00396326370537281, + "learning_rate": 4.2420386840567e-06, + "loss": 0.0002, + "step": 1002 + }, + { + "epoch": 0.5489874110563766, + "grad_norm": 0.19975824654102325, + "learning_rate": 4.233541490208448e-06, + "loss": 0.0134, + "step": 1003 + }, + { + "epoch": 0.5495347564313081, + "grad_norm": 0.012520854361355305, + "learning_rate": 4.22504656262782e-06, + "loss": 0.0004, + "step": 1004 + }, + { + "epoch": 0.5500821018062397, + "grad_norm": 2.170820951461792, + "learning_rate": 4.2165539264326495e-06, + "loss": 0.5925, + "step": 1005 + }, + { + "epoch": 0.5506294471811713, + "grad_norm": 0.004577749874442816, + "learning_rate": 4.208063606733999e-06, + "loss": 0.0002, + "step": 1006 + }, + { + "epoch": 0.5511767925561029, + "grad_norm": 0.103189617395401, + "learning_rate": 4.199575628636078e-06, + "loss": 0.006, + "step": 1007 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.004631855525076389, + "learning_rate": 4.191090017236177e-06, + "loss": 0.0002, + "step": 1008 + }, + { + "epoch": 0.5522714833059661, + "grad_norm": 1.1117832660675049, + "learning_rate": 4.182606797624585e-06, + "loss": 0.1485, + "step": 1009 + }, + { + "epoch": 0.5528188286808976, + "grad_norm": 0.0021044062450528145, + "learning_rate": 4.1741259948845206e-06, + "loss": 0.0001, + "step": 1010 + }, + { + "epoch": 0.5533661740558292, + "grad_norm": 1.9612685441970825, + "learning_rate": 4.165647634092055e-06, + "loss": 0.3809, + "step": 1011 + }, + { + "epoch": 0.5539135194307608, + "grad_norm": 0.0078058550134301186, + "learning_rate": 4.157171740316039e-06, + "loss": 0.0003, + "step": 1012 + }, + { + "epoch": 0.5544608648056923, + "grad_norm": 0.09045789390802383, + "learning_rate": 4.148698338618031e-06, + "loss": 0.0039, + "step": 1013 + }, + { + "epoch": 0.555008210180624, + "grad_norm": 0.003742016153410077, + "learning_rate": 4.14022745405222e-06, + "loss": 0.0002, + "step": 1014 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.004482665564864874, + "learning_rate": 4.131759111665349e-06, + "loss": 0.0002, + "step": 1015 + }, + { + "epoch": 0.5561029009304871, + "grad_norm": 1.2101866006851196, + "learning_rate": 4.123293336496651e-06, + "loss": 0.0996, + "step": 1016 + }, + { + "epoch": 0.5566502463054187, + "grad_norm": 0.001449257368221879, + "learning_rate": 4.114830153577759e-06, + "loss": 0.0001, + "step": 1017 + }, + { + "epoch": 0.5571975916803503, + "grad_norm": 0.12253908812999725, + "learning_rate": 4.10636958793265e-06, + "loss": 0.0076, + "step": 1018 + }, + { + "epoch": 0.5577449370552818, + "grad_norm": 0.010266436263918877, + "learning_rate": 4.0979116645775606e-06, + "loss": 0.0005, + "step": 1019 + }, + { + "epoch": 0.5582922824302134, + "grad_norm": 0.06671036034822464, + "learning_rate": 4.089456408520908e-06, + "loss": 0.0034, + "step": 1020 + }, + { + "epoch": 0.5588396278051451, + "grad_norm": 3.1648943424224854, + "learning_rate": 4.0810038447632296e-06, + "loss": 0.5862, + "step": 1021 + }, + { + "epoch": 0.5593869731800766, + "grad_norm": 0.08388476818799973, + "learning_rate": 4.072553998297103e-06, + "loss": 0.0056, + "step": 1022 + }, + { + "epoch": 0.5599343185550082, + "grad_norm": 0.07822927832603455, + "learning_rate": 4.064106894107064e-06, + "loss": 0.0041, + "step": 1023 + }, + { + "epoch": 0.5604816639299398, + "grad_norm": 0.06111420318484306, + "learning_rate": 4.055662557169545e-06, + "loss": 0.0031, + "step": 1024 + }, + { + "epoch": 0.5610290093048714, + "grad_norm": 0.21099752187728882, + "learning_rate": 4.047221012452798e-06, + "loss": 0.0146, + "step": 1025 + }, + { + "epoch": 0.5615763546798029, + "grad_norm": 0.0014589588390663266, + "learning_rate": 4.0387822849168165e-06, + "loss": 0.0001, + "step": 1026 + }, + { + "epoch": 0.5621237000547346, + "grad_norm": 0.00508915726095438, + "learning_rate": 4.030346399513261e-06, + "loss": 0.0002, + "step": 1027 + }, + { + "epoch": 0.5626710454296662, + "grad_norm": 0.0018473148811608553, + "learning_rate": 4.021913381185394e-06, + "loss": 0.0001, + "step": 1028 + }, + { + "epoch": 0.5632183908045977, + "grad_norm": 0.12212100625038147, + "learning_rate": 4.013483254868001e-06, + "loss": 0.0089, + "step": 1029 + }, + { + "epoch": 0.5637657361795293, + "grad_norm": 3.567409038543701, + "learning_rate": 4.005056045487307e-06, + "loss": 0.2606, + "step": 1030 + }, + { + "epoch": 0.5643130815544609, + "grad_norm": 0.11344505101442337, + "learning_rate": 3.996631777960923e-06, + "loss": 0.0075, + "step": 1031 + }, + { + "epoch": 0.5648604269293924, + "grad_norm": 0.007242574356496334, + "learning_rate": 3.9882104771977585e-06, + "loss": 0.0004, + "step": 1032 + }, + { + "epoch": 0.565407772304324, + "grad_norm": 0.20782946050167084, + "learning_rate": 3.979792168097946e-06, + "loss": 0.0083, + "step": 1033 + }, + { + "epoch": 0.5659551176792557, + "grad_norm": 0.049997005611658096, + "learning_rate": 3.971376875552777e-06, + "loss": 0.0024, + "step": 1034 + }, + { + "epoch": 0.5665024630541872, + "grad_norm": 0.010352439247071743, + "learning_rate": 3.962964624444625e-06, + "loss": 0.0005, + "step": 1035 + }, + { + "epoch": 0.5670498084291188, + "grad_norm": 0.12570813298225403, + "learning_rate": 3.9545554396468655e-06, + "loss": 0.0063, + "step": 1036 + }, + { + "epoch": 0.5675971538040504, + "grad_norm": 0.005166537594050169, + "learning_rate": 3.946149346023811e-06, + "loss": 0.0002, + "step": 1037 + }, + { + "epoch": 0.5681444991789819, + "grad_norm": 0.004181237425655127, + "learning_rate": 3.937746368430633e-06, + "loss": 0.0002, + "step": 1038 + }, + { + "epoch": 0.5686918445539135, + "grad_norm": 0.14119568467140198, + "learning_rate": 3.929346531713289e-06, + "loss": 0.0089, + "step": 1039 + }, + { + "epoch": 0.5692391899288451, + "grad_norm": 0.03071342408657074, + "learning_rate": 3.920949860708452e-06, + "loss": 0.0015, + "step": 1040 + }, + { + "epoch": 0.5697865353037767, + "grad_norm": 0.003992474637925625, + "learning_rate": 3.912556380243431e-06, + "loss": 0.0002, + "step": 1041 + }, + { + "epoch": 0.5703338806787083, + "grad_norm": 0.011039703153073788, + "learning_rate": 3.9041661151361045e-06, + "loss": 0.0006, + "step": 1042 + }, + { + "epoch": 0.5708812260536399, + "grad_norm": 0.01362689770758152, + "learning_rate": 3.89577909019484e-06, + "loss": 0.0006, + "step": 1043 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.016619103029370308, + "learning_rate": 3.887395330218429e-06, + "loss": 0.0007, + "step": 1044 + }, + { + "epoch": 0.571975916803503, + "grad_norm": 0.0034893490374088287, + "learning_rate": 3.879014859996006e-06, + "loss": 0.0002, + "step": 1045 + }, + { + "epoch": 0.5725232621784346, + "grad_norm": 2.8486711978912354, + "learning_rate": 3.8706377043069785e-06, + "loss": 0.1652, + "step": 1046 + }, + { + "epoch": 0.5730706075533661, + "grad_norm": 0.027762796729803085, + "learning_rate": 3.862263887920957e-06, + "loss": 0.0013, + "step": 1047 + }, + { + "epoch": 0.5736179529282978, + "grad_norm": 0.09105691313743591, + "learning_rate": 3.853893435597673e-06, + "loss": 0.0061, + "step": 1048 + }, + { + "epoch": 0.5741652983032294, + "grad_norm": 0.00692678801715374, + "learning_rate": 3.8455263720869134e-06, + "loss": 0.0003, + "step": 1049 + }, + { + "epoch": 0.5747126436781609, + "grad_norm": 1.1762075424194336, + "learning_rate": 3.8371627221284495e-06, + "loss": 0.0687, + "step": 1050 + }, + { + "epoch": 0.5752599890530925, + "grad_norm": 0.14164984226226807, + "learning_rate": 3.82880251045195e-06, + "loss": 0.0088, + "step": 1051 + }, + { + "epoch": 0.5758073344280241, + "grad_norm": 0.00480201980099082, + "learning_rate": 3.820445761776925e-06, + "loss": 0.0002, + "step": 1052 + }, + { + "epoch": 0.5763546798029556, + "grad_norm": 0.3967006802558899, + "learning_rate": 3.8120925008126457e-06, + "loss": 0.0231, + "step": 1053 + }, + { + "epoch": 0.5769020251778872, + "grad_norm": 0.01882929727435112, + "learning_rate": 3.8037427522580627e-06, + "loss": 0.0009, + "step": 1054 + }, + { + "epoch": 0.5774493705528189, + "grad_norm": 0.30294129252433777, + "learning_rate": 3.7953965408017483e-06, + "loss": 0.0205, + "step": 1055 + }, + { + "epoch": 0.5779967159277504, + "grad_norm": 0.07470423728227615, + "learning_rate": 3.7870538911218176e-06, + "loss": 0.0045, + "step": 1056 + }, + { + "epoch": 0.578544061302682, + "grad_norm": 1.4331607818603516, + "learning_rate": 3.7787148278858453e-06, + "loss": 0.1268, + "step": 1057 + }, + { + "epoch": 0.5790914066776136, + "grad_norm": 0.0020402672234922647, + "learning_rate": 3.77037937575081e-06, + "loss": 0.0001, + "step": 1058 + }, + { + "epoch": 0.5796387520525451, + "grad_norm": 1.464131236076355, + "learning_rate": 3.762047559363013e-06, + "loss": 0.0974, + "step": 1059 + }, + { + "epoch": 0.5801860974274767, + "grad_norm": 0.7270787358283997, + "learning_rate": 3.753719403357997e-06, + "loss": 0.0442, + "step": 1060 + }, + { + "epoch": 0.5807334428024084, + "grad_norm": 0.006362659856677055, + "learning_rate": 3.745394932360491e-06, + "loss": 0.0003, + "step": 1061 + }, + { + "epoch": 0.5812807881773399, + "grad_norm": 0.0032875300385057926, + "learning_rate": 3.7370741709843263e-06, + "loss": 0.0002, + "step": 1062 + }, + { + "epoch": 0.5818281335522715, + "grad_norm": 0.0008078487007878721, + "learning_rate": 3.728757143832359e-06, + "loss": 0.0001, + "step": 1063 + }, + { + "epoch": 0.5823754789272031, + "grad_norm": 0.03340085595846176, + "learning_rate": 3.7204438754964113e-06, + "loss": 0.0015, + "step": 1064 + }, + { + "epoch": 0.5829228243021346, + "grad_norm": 3.268207550048828, + "learning_rate": 3.7121343905571897e-06, + "loss": 0.5775, + "step": 1065 + }, + { + "epoch": 0.5834701696770662, + "grad_norm": 0.003012469271197915, + "learning_rate": 3.70382871358421e-06, + "loss": 0.0002, + "step": 1066 + }, + { + "epoch": 0.5840175150519978, + "grad_norm": 0.0514325350522995, + "learning_rate": 3.695526869135733e-06, + "loss": 0.0029, + "step": 1067 + }, + { + "epoch": 0.5845648604269293, + "grad_norm": 0.0025664675049483776, + "learning_rate": 3.6872288817586883e-06, + "loss": 0.0001, + "step": 1068 + }, + { + "epoch": 0.585112205801861, + "grad_norm": 0.051885541528463364, + "learning_rate": 3.678934775988594e-06, + "loss": 0.0031, + "step": 1069 + }, + { + "epoch": 0.5856595511767926, + "grad_norm": 0.11281336098909378, + "learning_rate": 3.6706445763494976e-06, + "loss": 0.0066, + "step": 1070 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 0.03128824383020401, + "learning_rate": 3.662358307353897e-06, + "loss": 0.0016, + "step": 1071 + }, + { + "epoch": 0.5867542419266557, + "grad_norm": 0.04773644730448723, + "learning_rate": 3.6540759935026627e-06, + "loss": 0.0028, + "step": 1072 + }, + { + "epoch": 0.5873015873015873, + "grad_norm": 0.023716391995549202, + "learning_rate": 3.6457976592849753e-06, + "loss": 0.0012, + "step": 1073 + }, + { + "epoch": 0.5878489326765188, + "grad_norm": 0.10552944242954254, + "learning_rate": 3.637523329178247e-06, + "loss": 0.0061, + "step": 1074 + }, + { + "epoch": 0.5883962780514504, + "grad_norm": 0.005593111272901297, + "learning_rate": 3.6292530276480493e-06, + "loss": 0.0003, + "step": 1075 + }, + { + "epoch": 0.5889436234263821, + "grad_norm": 0.054677512496709824, + "learning_rate": 3.6209867791480446e-06, + "loss": 0.0035, + "step": 1076 + }, + { + "epoch": 0.5894909688013136, + "grad_norm": 0.1693568378686905, + "learning_rate": 3.6127246081199107e-06, + "loss": 0.0135, + "step": 1077 + }, + { + "epoch": 0.5900383141762452, + "grad_norm": 0.04031054675579071, + "learning_rate": 3.6044665389932663e-06, + "loss": 0.0022, + "step": 1078 + }, + { + "epoch": 0.5905856595511768, + "grad_norm": 0.0014612111262977123, + "learning_rate": 3.596212596185603e-06, + "loss": 0.0001, + "step": 1079 + }, + { + "epoch": 0.5911330049261084, + "grad_norm": 3.7056195735931396, + "learning_rate": 3.587962804102214e-06, + "loss": 0.7452, + "step": 1080 + }, + { + "epoch": 0.5916803503010399, + "grad_norm": 0.028046006336808205, + "learning_rate": 3.5797171871361203e-06, + "loss": 0.0013, + "step": 1081 + }, + { + "epoch": 0.5922276956759716, + "grad_norm": 0.16697245836257935, + "learning_rate": 3.57147576966799e-06, + "loss": 0.0103, + "step": 1082 + }, + { + "epoch": 0.5927750410509032, + "grad_norm": 0.16656920313835144, + "learning_rate": 3.5632385760660828e-06, + "loss": 0.0086, + "step": 1083 + }, + { + "epoch": 0.5933223864258347, + "grad_norm": 0.0012677456252276897, + "learning_rate": 3.5550056306861667e-06, + "loss": 0.0001, + "step": 1084 + }, + { + "epoch": 0.5938697318007663, + "grad_norm": 0.023179659619927406, + "learning_rate": 3.5467769578714455e-06, + "loss": 0.0012, + "step": 1085 + }, + { + "epoch": 0.5944170771756979, + "grad_norm": 0.007856342010200024, + "learning_rate": 3.5385525819524933e-06, + "loss": 0.0004, + "step": 1086 + }, + { + "epoch": 0.5949644225506294, + "grad_norm": 0.0038795997388660908, + "learning_rate": 3.530332527247181e-06, + "loss": 0.0002, + "step": 1087 + }, + { + "epoch": 0.595511767925561, + "grad_norm": 0.057348210364580154, + "learning_rate": 3.5221168180605946e-06, + "loss": 0.0036, + "step": 1088 + }, + { + "epoch": 0.5960591133004927, + "grad_norm": 0.0044167679734528065, + "learning_rate": 3.5139054786849787e-06, + "loss": 0.0002, + "step": 1089 + }, + { + "epoch": 0.5966064586754242, + "grad_norm": 0.0067407819442451, + "learning_rate": 3.5056985333996566e-06, + "loss": 0.0003, + "step": 1090 + }, + { + "epoch": 0.5971538040503558, + "grad_norm": 0.006905603222548962, + "learning_rate": 3.4974960064709534e-06, + "loss": 0.0003, + "step": 1091 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 1.9404516220092773, + "learning_rate": 3.489297922152136e-06, + "loss": 0.2303, + "step": 1092 + }, + { + "epoch": 0.5982484948002189, + "grad_norm": 0.05944973602890968, + "learning_rate": 3.4811043046833353e-06, + "loss": 0.0033, + "step": 1093 + }, + { + "epoch": 0.5987958401751505, + "grad_norm": 0.1706078201532364, + "learning_rate": 3.4729151782914683e-06, + "loss": 0.0122, + "step": 1094 + }, + { + "epoch": 0.5993431855500821, + "grad_norm": 0.0021984418854117393, + "learning_rate": 3.4647305671901797e-06, + "loss": 0.0001, + "step": 1095 + }, + { + "epoch": 0.5998905309250137, + "grad_norm": 0.017191147431731224, + "learning_rate": 3.456550495579762e-06, + "loss": 0.0007, + "step": 1096 + }, + { + "epoch": 0.6004378762999453, + "grad_norm": 2.794296979904175, + "learning_rate": 3.44837498764708e-06, + "loss": 0.4829, + "step": 1097 + }, + { + "epoch": 0.6009852216748769, + "grad_norm": 0.3021494448184967, + "learning_rate": 3.440204067565511e-06, + "loss": 0.0199, + "step": 1098 + }, + { + "epoch": 0.6015325670498084, + "grad_norm": 0.0023824695963412523, + "learning_rate": 3.432037759494867e-06, + "loss": 0.0002, + "step": 1099 + }, + { + "epoch": 0.60207991242474, + "grad_norm": 0.0029570909682661295, + "learning_rate": 3.4238760875813155e-06, + "loss": 0.0002, + "step": 1100 + }, + { + "epoch": 0.6026272577996716, + "grad_norm": 2.326396942138672, + "learning_rate": 3.4157190759573243e-06, + "loss": 0.5833, + "step": 1101 + }, + { + "epoch": 0.6031746031746031, + "grad_norm": 0.21300382912158966, + "learning_rate": 3.4075667487415785e-06, + "loss": 0.0169, + "step": 1102 + }, + { + "epoch": 0.6037219485495348, + "grad_norm": 0.054834965616464615, + "learning_rate": 3.3994191300389103e-06, + "loss": 0.0028, + "step": 1103 + }, + { + "epoch": 0.6042692939244664, + "grad_norm": 0.0037344787269830704, + "learning_rate": 3.391276243940234e-06, + "loss": 0.0002, + "step": 1104 + }, + { + "epoch": 0.6048166392993979, + "grad_norm": 0.0019440932665020227, + "learning_rate": 3.3831381145224667e-06, + "loss": 0.0001, + "step": 1105 + }, + { + "epoch": 0.6053639846743295, + "grad_norm": 0.0031255807261914015, + "learning_rate": 3.375004765848463e-06, + "loss": 0.0002, + "step": 1106 + }, + { + "epoch": 0.6059113300492611, + "grad_norm": 0.04441095516085625, + "learning_rate": 3.3668762219669393e-06, + "loss": 0.0024, + "step": 1107 + }, + { + "epoch": 0.6064586754241926, + "grad_norm": 0.5900030732154846, + "learning_rate": 3.3587525069124093e-06, + "loss": 0.0258, + "step": 1108 + }, + { + "epoch": 0.6070060207991242, + "grad_norm": 0.0020720800384879112, + "learning_rate": 3.350633644705107e-06, + "loss": 0.0001, + "step": 1109 + }, + { + "epoch": 0.6075533661740559, + "grad_norm": 0.04347815364599228, + "learning_rate": 3.3425196593509135e-06, + "loss": 0.0021, + "step": 1110 + }, + { + "epoch": 0.6081007115489874, + "grad_norm": 1.996151089668274, + "learning_rate": 3.334410574841298e-06, + "loss": 0.0879, + "step": 1111 + }, + { + "epoch": 0.608648056923919, + "grad_norm": 0.0028991817962378263, + "learning_rate": 3.3263064151532303e-06, + "loss": 0.0002, + "step": 1112 + }, + { + "epoch": 0.6091954022988506, + "grad_norm": 0.14750750362873077, + "learning_rate": 3.3182072042491244e-06, + "loss": 0.009, + "step": 1113 + }, + { + "epoch": 0.6097427476737821, + "grad_norm": 0.17952144145965576, + "learning_rate": 3.310112966076762e-06, + "loss": 0.0127, + "step": 1114 + }, + { + "epoch": 0.6102900930487137, + "grad_norm": 0.0014247623039409518, + "learning_rate": 3.3020237245692154e-06, + "loss": 0.0001, + "step": 1115 + }, + { + "epoch": 0.6108374384236454, + "grad_norm": 0.0048622991889715195, + "learning_rate": 3.293939503644788e-06, + "loss": 0.0003, + "step": 1116 + }, + { + "epoch": 0.6113847837985769, + "grad_norm": 0.004406985826790333, + "learning_rate": 3.285860327206939e-06, + "loss": 0.0002, + "step": 1117 + }, + { + "epoch": 0.6119321291735085, + "grad_norm": 0.0034555234014987946, + "learning_rate": 3.277786219144207e-06, + "loss": 0.0002, + "step": 1118 + }, + { + "epoch": 0.6124794745484401, + "grad_norm": 0.21532057225704193, + "learning_rate": 3.2697172033301485e-06, + "loss": 0.0077, + "step": 1119 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.05346650257706642, + "learning_rate": 3.2616533036232635e-06, + "loss": 0.003, + "step": 1120 + }, + { + "epoch": 0.6135741652983032, + "grad_norm": 0.001968423603102565, + "learning_rate": 3.2535945438669203e-06, + "loss": 0.0001, + "step": 1121 + }, + { + "epoch": 0.6141215106732348, + "grad_norm": 0.0016493768198415637, + "learning_rate": 3.245540947889294e-06, + "loss": 0.0001, + "step": 1122 + }, + { + "epoch": 0.6146688560481663, + "grad_norm": 0.06369902193546295, + "learning_rate": 3.2374925395032926e-06, + "loss": 0.0034, + "step": 1123 + }, + { + "epoch": 0.615216201423098, + "grad_norm": 0.0024321821983903646, + "learning_rate": 3.229449342506477e-06, + "loss": 0.0001, + "step": 1124 + }, + { + "epoch": 0.6157635467980296, + "grad_norm": 0.014392325654625893, + "learning_rate": 3.2214113806810077e-06, + "loss": 0.0008, + "step": 1125 + }, + { + "epoch": 0.6163108921729611, + "grad_norm": 0.0016964362002909184, + "learning_rate": 3.2133786777935645e-06, + "loss": 0.0001, + "step": 1126 + }, + { + "epoch": 0.6168582375478927, + "grad_norm": 0.01708538644015789, + "learning_rate": 3.205351257595272e-06, + "loss": 0.0009, + "step": 1127 + }, + { + "epoch": 0.6174055829228243, + "grad_norm": 0.0007502536755055189, + "learning_rate": 3.197329143821639e-06, + "loss": 0.0001, + "step": 1128 + }, + { + "epoch": 0.6179529282977558, + "grad_norm": 2.2683279514312744, + "learning_rate": 3.189312360192489e-06, + "loss": 0.2333, + "step": 1129 + }, + { + "epoch": 0.6185002736726875, + "grad_norm": 0.001995793776586652, + "learning_rate": 3.181300930411874e-06, + "loss": 0.0001, + "step": 1130 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.02659580484032631, + "learning_rate": 3.173294878168025e-06, + "loss": 0.0017, + "step": 1131 + }, + { + "epoch": 0.6195949644225506, + "grad_norm": 1.8708947896957397, + "learning_rate": 3.165294227133271e-06, + "loss": 0.1571, + "step": 1132 + }, + { + "epoch": 0.6201423097974822, + "grad_norm": 0.0021453495137393475, + "learning_rate": 3.157299000963966e-06, + "loss": 0.0001, + "step": 1133 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 0.011976787820458412, + "learning_rate": 3.149309223300428e-06, + "loss": 0.0006, + "step": 1134 + }, + { + "epoch": 0.6212370005473454, + "grad_norm": 0.5774064064025879, + "learning_rate": 3.141324917766866e-06, + "loss": 0.0201, + "step": 1135 + }, + { + "epoch": 0.6217843459222769, + "grad_norm": 0.023726046085357666, + "learning_rate": 3.1333461079713056e-06, + "loss": 0.0006, + "step": 1136 + }, + { + "epoch": 0.6223316912972086, + "grad_norm": 0.23722946643829346, + "learning_rate": 3.1253728175055242e-06, + "loss": 0.0162, + "step": 1137 + }, + { + "epoch": 0.6228790366721402, + "grad_norm": 0.0039423611015081406, + "learning_rate": 3.1174050699449776e-06, + "loss": 0.0002, + "step": 1138 + }, + { + "epoch": 0.6234263820470717, + "grad_norm": 0.012701333500444889, + "learning_rate": 3.109442888848736e-06, + "loss": 0.0006, + "step": 1139 + }, + { + "epoch": 0.6239737274220033, + "grad_norm": 0.01759052835404873, + "learning_rate": 3.1014862977594083e-06, + "loss": 0.0007, + "step": 1140 + }, + { + "epoch": 0.6245210727969349, + "grad_norm": 0.0015605682274326682, + "learning_rate": 3.093535320203074e-06, + "loss": 0.0001, + "step": 1141 + }, + { + "epoch": 0.6250684181718664, + "grad_norm": 2.1218316555023193, + "learning_rate": 3.0855899796892188e-06, + "loss": 0.0915, + "step": 1142 + }, + { + "epoch": 0.625615763546798, + "grad_norm": 0.024543100968003273, + "learning_rate": 3.0776502997106526e-06, + "loss": 0.0011, + "step": 1143 + }, + { + "epoch": 0.6261631089217297, + "grad_norm": 0.0035148372408002615, + "learning_rate": 3.0697163037434573e-06, + "loss": 0.0002, + "step": 1144 + }, + { + "epoch": 0.6267104542966612, + "grad_norm": 0.00434495834633708, + "learning_rate": 3.061788015246905e-06, + "loss": 0.0002, + "step": 1145 + }, + { + "epoch": 0.6272577996715928, + "grad_norm": 0.030992476269602776, + "learning_rate": 3.0538654576633865e-06, + "loss": 0.0015, + "step": 1146 + }, + { + "epoch": 0.6278051450465244, + "grad_norm": 0.006613498087972403, + "learning_rate": 3.045948654418356e-06, + "loss": 0.0004, + "step": 1147 + }, + { + "epoch": 0.6283524904214559, + "grad_norm": 0.0013742909068241715, + "learning_rate": 3.0380376289202497e-06, + "loss": 0.0001, + "step": 1148 + }, + { + "epoch": 0.6288998357963875, + "grad_norm": 0.006594480946660042, + "learning_rate": 3.0301324045604163e-06, + "loss": 0.0003, + "step": 1149 + }, + { + "epoch": 0.6294471811713191, + "grad_norm": 0.0012503663310781121, + "learning_rate": 3.0222330047130572e-06, + "loss": 0.0001, + "step": 1150 + }, + { + "epoch": 0.6299945265462507, + "grad_norm": 0.024310488253831863, + "learning_rate": 3.0143394527351522e-06, + "loss": 0.0014, + "step": 1151 + }, + { + "epoch": 0.6305418719211823, + "grad_norm": 0.001007006736472249, + "learning_rate": 3.0064517719663833e-06, + "loss": 0.0001, + "step": 1152 + }, + { + "epoch": 0.6310892172961139, + "grad_norm": 2.5153512954711914, + "learning_rate": 2.9985699857290788e-06, + "loss": 0.5184, + "step": 1153 + }, + { + "epoch": 0.6316365626710454, + "grad_norm": 0.002309981267899275, + "learning_rate": 2.990694117328139e-06, + "loss": 0.0001, + "step": 1154 + }, + { + "epoch": 0.632183908045977, + "grad_norm": 2.245021104812622, + "learning_rate": 2.982824190050958e-06, + "loss": 0.5133, + "step": 1155 + }, + { + "epoch": 0.6327312534209086, + "grad_norm": 0.000964795530308038, + "learning_rate": 2.9749602271673717e-06, + "loss": 0.0001, + "step": 1156 + }, + { + "epoch": 0.6332785987958401, + "grad_norm": 2.1958580017089844, + "learning_rate": 2.967102251929579e-06, + "loss": 0.28, + "step": 1157 + }, + { + "epoch": 0.6338259441707718, + "grad_norm": 0.7682985067367554, + "learning_rate": 2.959250287572069e-06, + "loss": 0.0413, + "step": 1158 + }, + { + "epoch": 0.6343732895457034, + "grad_norm": 0.0010651570046320558, + "learning_rate": 2.9514043573115635e-06, + "loss": 0.0001, + "step": 1159 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.04936596006155014, + "learning_rate": 2.9435644843469434e-06, + "loss": 0.0021, + "step": 1160 + }, + { + "epoch": 0.6354679802955665, + "grad_norm": 0.0013516498729586601, + "learning_rate": 2.935730691859172e-06, + "loss": 0.0001, + "step": 1161 + }, + { + "epoch": 0.6360153256704981, + "grad_norm": 0.0008485048892907798, + "learning_rate": 2.927903003011241e-06, + "loss": 0.0001, + "step": 1162 + }, + { + "epoch": 0.6365626710454296, + "grad_norm": 0.0014783508377149701, + "learning_rate": 2.920081440948094e-06, + "loss": 0.0001, + "step": 1163 + }, + { + "epoch": 0.6371100164203612, + "grad_norm": 3.501877546310425, + "learning_rate": 2.912266028796554e-06, + "loss": 0.4404, + "step": 1164 + }, + { + "epoch": 0.6376573617952929, + "grad_norm": 2.515601634979248, + "learning_rate": 2.9044567896652666e-06, + "loss": 0.1112, + "step": 1165 + }, + { + "epoch": 0.6382047071702244, + "grad_norm": 0.296254962682724, + "learning_rate": 2.8966537466446186e-06, + "loss": 0.0086, + "step": 1166 + }, + { + "epoch": 0.638752052545156, + "grad_norm": 0.001710059237666428, + "learning_rate": 2.888856922806682e-06, + "loss": 0.0001, + "step": 1167 + }, + { + "epoch": 0.6392993979200876, + "grad_norm": 1.275455355644226, + "learning_rate": 2.881066341205133e-06, + "loss": 0.0864, + "step": 1168 + }, + { + "epoch": 0.6398467432950191, + "grad_norm": 0.019870832562446594, + "learning_rate": 2.8732820248752016e-06, + "loss": 0.001, + "step": 1169 + }, + { + "epoch": 0.6403940886699507, + "grad_norm": 1.7314554452896118, + "learning_rate": 2.8655039968335774e-06, + "loss": 0.2768, + "step": 1170 + }, + { + "epoch": 0.6409414340448824, + "grad_norm": 2.4131124019622803, + "learning_rate": 2.8577322800783717e-06, + "loss": 0.6951, + "step": 1171 + }, + { + "epoch": 0.6414887794198139, + "grad_norm": 0.004058561287820339, + "learning_rate": 2.849966897589026e-06, + "loss": 0.0002, + "step": 1172 + }, + { + "epoch": 0.6420361247947455, + "grad_norm": 0.12226421386003494, + "learning_rate": 2.842207872326255e-06, + "loss": 0.0061, + "step": 1173 + }, + { + "epoch": 0.6425834701696771, + "grad_norm": 0.0881904885172844, + "learning_rate": 2.8344552272319727e-06, + "loss": 0.0048, + "step": 1174 + }, + { + "epoch": 0.6431308155446086, + "grad_norm": 0.9335182905197144, + "learning_rate": 2.826708985229238e-06, + "loss": 0.2188, + "step": 1175 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 0.035821665078401566, + "learning_rate": 2.8189691692221627e-06, + "loss": 0.0014, + "step": 1176 + }, + { + "epoch": 0.6442255062944718, + "grad_norm": 1.6445341110229492, + "learning_rate": 2.811235802095873e-06, + "loss": 0.1193, + "step": 1177 + }, + { + "epoch": 0.6447728516694033, + "grad_norm": 0.01749396324157715, + "learning_rate": 2.803508906716417e-06, + "loss": 0.0009, + "step": 1178 + }, + { + "epoch": 0.645320197044335, + "grad_norm": 1.3020265102386475, + "learning_rate": 2.7957885059307097e-06, + "loss": 0.0759, + "step": 1179 + }, + { + "epoch": 0.6458675424192666, + "grad_norm": 0.004729298409074545, + "learning_rate": 2.7880746225664623e-06, + "loss": 0.0002, + "step": 1180 + }, + { + "epoch": 0.6464148877941981, + "grad_norm": 0.27152588963508606, + "learning_rate": 2.780367279432123e-06, + "loss": 0.0131, + "step": 1181 + }, + { + "epoch": 0.6469622331691297, + "grad_norm": 0.004669299814850092, + "learning_rate": 2.7726664993167864e-06, + "loss": 0.0002, + "step": 1182 + }, + { + "epoch": 0.6475095785440613, + "grad_norm": 0.0026488250587135553, + "learning_rate": 2.7649723049901554e-06, + "loss": 0.0001, + "step": 1183 + }, + { + "epoch": 0.6480569239189928, + "grad_norm": 0.004148549400269985, + "learning_rate": 2.7572847192024544e-06, + "loss": 0.0003, + "step": 1184 + }, + { + "epoch": 0.6486042692939245, + "grad_norm": 0.08109666407108307, + "learning_rate": 2.749603764684367e-06, + "loss": 0.0042, + "step": 1185 + }, + { + "epoch": 0.6491516146688561, + "grad_norm": 0.011724872514605522, + "learning_rate": 2.7419294641469718e-06, + "loss": 0.0006, + "step": 1186 + }, + { + "epoch": 0.6496989600437876, + "grad_norm": 0.004057840444147587, + "learning_rate": 2.73426184028167e-06, + "loss": 0.0002, + "step": 1187 + }, + { + "epoch": 0.6502463054187192, + "grad_norm": 0.2738402485847473, + "learning_rate": 2.7266009157601226e-06, + "loss": 0.0188, + "step": 1188 + }, + { + "epoch": 0.6507936507936508, + "grad_norm": 0.0022387555800378323, + "learning_rate": 2.718946713234185e-06, + "loss": 0.0001, + "step": 1189 + }, + { + "epoch": 0.6513409961685823, + "grad_norm": 1.4459933042526245, + "learning_rate": 2.711299255335833e-06, + "loss": 0.1288, + "step": 1190 + }, + { + "epoch": 0.6518883415435139, + "grad_norm": 0.24324724078178406, + "learning_rate": 2.703658564677101e-06, + "loss": 0.0167, + "step": 1191 + }, + { + "epoch": 0.6524356869184456, + "grad_norm": 0.0050564659759402275, + "learning_rate": 2.696024663850013e-06, + "loss": 0.0003, + "step": 1192 + }, + { + "epoch": 0.6529830322933772, + "grad_norm": 0.014988393522799015, + "learning_rate": 2.688397575426517e-06, + "loss": 0.0007, + "step": 1193 + }, + { + "epoch": 0.6535303776683087, + "grad_norm": 0.13157279789447784, + "learning_rate": 2.680777321958424e-06, + "loss": 0.0071, + "step": 1194 + }, + { + "epoch": 0.6540777230432403, + "grad_norm": 5.4224419593811035, + "learning_rate": 2.6731639259773235e-06, + "loss": 0.8259, + "step": 1195 + }, + { + "epoch": 0.6546250684181719, + "grad_norm": 0.04328963905572891, + "learning_rate": 2.6655574099945403e-06, + "loss": 0.0024, + "step": 1196 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 0.005359618458896875, + "learning_rate": 2.65795779650105e-06, + "loss": 0.0003, + "step": 1197 + }, + { + "epoch": 0.655719759168035, + "grad_norm": 0.07889597117900848, + "learning_rate": 2.6503651079674207e-06, + "loss": 0.0045, + "step": 1198 + }, + { + "epoch": 0.6562671045429667, + "grad_norm": 0.09890451282262802, + "learning_rate": 2.642779366843743e-06, + "loss": 0.0067, + "step": 1199 + }, + { + "epoch": 0.6568144499178982, + "grad_norm": 0.0020398315973579884, + "learning_rate": 2.6352005955595715e-06, + "loss": 0.0001, + "step": 1200 + }, + { + "epoch": 0.6573617952928298, + "grad_norm": 0.032349683344364166, + "learning_rate": 2.6276288165238416e-06, + "loss": 0.002, + "step": 1201 + }, + { + "epoch": 0.6579091406677614, + "grad_norm": 0.049246739596128464, + "learning_rate": 2.620064052124825e-06, + "loss": 0.0029, + "step": 1202 + }, + { + "epoch": 0.6584564860426929, + "grad_norm": 0.04094109684228897, + "learning_rate": 2.612506324730046e-06, + "loss": 0.0021, + "step": 1203 + }, + { + "epoch": 0.6590038314176245, + "grad_norm": 0.013727393932640553, + "learning_rate": 2.6049556566862234e-06, + "loss": 0.0006, + "step": 1204 + }, + { + "epoch": 0.6595511767925561, + "grad_norm": 0.11889801919460297, + "learning_rate": 2.597412070319201e-06, + "loss": 0.0068, + "step": 1205 + }, + { + "epoch": 0.6600985221674877, + "grad_norm": 2.300886631011963, + "learning_rate": 2.589875587933892e-06, + "loss": 0.2447, + "step": 1206 + }, + { + "epoch": 0.6606458675424193, + "grad_norm": 0.0018244871171191335, + "learning_rate": 2.582346231814189e-06, + "loss": 0.0001, + "step": 1207 + }, + { + "epoch": 0.6611932129173509, + "grad_norm": 0.28391480445861816, + "learning_rate": 2.57482402422293e-06, + "loss": 0.0223, + "step": 1208 + }, + { + "epoch": 0.6617405582922824, + "grad_norm": 0.03892725333571434, + "learning_rate": 2.567308987401806e-06, + "loss": 0.0016, + "step": 1209 + }, + { + "epoch": 0.662287903667214, + "grad_norm": 0.029897578060626984, + "learning_rate": 2.5598011435713077e-06, + "loss": 0.0018, + "step": 1210 + }, + { + "epoch": 0.6628352490421456, + "grad_norm": 0.11231425404548645, + "learning_rate": 2.552300514930657e-06, + "loss": 0.0071, + "step": 1211 + }, + { + "epoch": 0.6633825944170771, + "grad_norm": 0.6822946667671204, + "learning_rate": 2.5448071236577493e-06, + "loss": 0.0485, + "step": 1212 + }, + { + "epoch": 0.6639299397920088, + "grad_norm": 0.011305405758321285, + "learning_rate": 2.5373209919090657e-06, + "loss": 0.0006, + "step": 1213 + }, + { + "epoch": 0.6644772851669404, + "grad_norm": 0.011048182845115662, + "learning_rate": 2.5298421418196363e-06, + "loss": 0.0005, + "step": 1214 + }, + { + "epoch": 0.6650246305418719, + "grad_norm": 0.01335719134658575, + "learning_rate": 2.522370595502954e-06, + "loss": 0.0006, + "step": 1215 + }, + { + "epoch": 0.6655719759168035, + "grad_norm": 0.004644239321351051, + "learning_rate": 2.5149063750509166e-06, + "loss": 0.0002, + "step": 1216 + }, + { + "epoch": 0.6661193212917351, + "grad_norm": 0.06332351267337799, + "learning_rate": 2.507449502533762e-06, + "loss": 0.0033, + "step": 1217 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.057875651866197586, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.0026, + "step": 1218 + }, + { + "epoch": 0.6672140120415982, + "grad_norm": 0.0002122679288731888, + "learning_rate": 2.4925578894763524e-06, + "loss": 0.0, + "step": 1219 + }, + { + "epoch": 0.6677613574165299, + "grad_norm": 0.01880548521876335, + "learning_rate": 2.485123192967677e-06, + "loss": 0.001, + "step": 1220 + }, + { + "epoch": 0.6683087027914614, + "grad_norm": 0.0019574714824557304, + "learning_rate": 2.4776959324569193e-06, + "loss": 0.0001, + "step": 1221 + }, + { + "epoch": 0.668856048166393, + "grad_norm": 0.028929315507411957, + "learning_rate": 2.4702761299050314e-06, + "loss": 0.0012, + "step": 1222 + }, + { + "epoch": 0.6694033935413246, + "grad_norm": 0.04426882788538933, + "learning_rate": 2.462863807250915e-06, + "loss": 0.0027, + "step": 1223 + }, + { + "epoch": 0.6699507389162561, + "grad_norm": 0.005717442370951176, + "learning_rate": 2.4554589864113566e-06, + "loss": 0.0003, + "step": 1224 + }, + { + "epoch": 0.6704980842911877, + "grad_norm": 1.8422279357910156, + "learning_rate": 2.4480616892809593e-06, + "loss": 0.2233, + "step": 1225 + }, + { + "epoch": 0.6710454296661194, + "grad_norm": 0.23395343124866486, + "learning_rate": 2.4406719377320808e-06, + "loss": 0.0146, + "step": 1226 + }, + { + "epoch": 0.6715927750410509, + "grad_norm": 0.001357329892925918, + "learning_rate": 2.4332897536147728e-06, + "loss": 0.0001, + "step": 1227 + }, + { + "epoch": 0.6721401204159825, + "grad_norm": 0.07431194186210632, + "learning_rate": 2.425915158756699e-06, + "loss": 0.0045, + "step": 1228 + }, + { + "epoch": 0.6726874657909141, + "grad_norm": 2.2066705226898193, + "learning_rate": 2.418548174963099e-06, + "loss": 0.5243, + "step": 1229 + }, + { + "epoch": 0.6732348111658456, + "grad_norm": 0.0052291397005319595, + "learning_rate": 2.411188824016697e-06, + "loss": 0.0003, + "step": 1230 + }, + { + "epoch": 0.6737821565407772, + "grad_norm": 0.0007067082915455103, + "learning_rate": 2.4038371276776525e-06, + "loss": 0.0001, + "step": 1231 + }, + { + "epoch": 0.6743295019157088, + "grad_norm": 3.0892605781555176, + "learning_rate": 2.396493107683488e-06, + "loss": 0.8147, + "step": 1232 + }, + { + "epoch": 0.6748768472906403, + "grad_norm": 0.0053380937315523624, + "learning_rate": 2.3891567857490373e-06, + "loss": 0.0003, + "step": 1233 + }, + { + "epoch": 0.675424192665572, + "grad_norm": 2.618467092514038, + "learning_rate": 2.38182818356636e-06, + "loss": 0.5732, + "step": 1234 + }, + { + "epoch": 0.6759715380405036, + "grad_norm": 0.14668996632099152, + "learning_rate": 2.374507322804702e-06, + "loss": 0.0059, + "step": 1235 + }, + { + "epoch": 0.6765188834154351, + "grad_norm": 0.495018869638443, + "learning_rate": 2.3671942251104125e-06, + "loss": 0.0381, + "step": 1236 + }, + { + "epoch": 0.6770662287903667, + "grad_norm": 0.0009198206826113164, + "learning_rate": 2.359888912106888e-06, + "loss": 0.0001, + "step": 1237 + }, + { + "epoch": 0.6776135741652983, + "grad_norm": 0.005943139083683491, + "learning_rate": 2.3525914053945054e-06, + "loss": 0.0003, + "step": 1238 + }, + { + "epoch": 0.6781609195402298, + "grad_norm": 0.1299518346786499, + "learning_rate": 2.345301726550567e-06, + "loss": 0.0094, + "step": 1239 + }, + { + "epoch": 0.6787082649151615, + "grad_norm": 0.013089645653963089, + "learning_rate": 2.3380198971292195e-06, + "loss": 0.0006, + "step": 1240 + }, + { + "epoch": 0.6792556102900931, + "grad_norm": 2.11917781829834, + "learning_rate": 2.3307459386614095e-06, + "loss": 0.3652, + "step": 1241 + }, + { + "epoch": 0.6798029556650246, + "grad_norm": 0.020909443497657776, + "learning_rate": 2.323479872654805e-06, + "loss": 0.0011, + "step": 1242 + }, + { + "epoch": 0.6803503010399562, + "grad_norm": 0.004706669598817825, + "learning_rate": 2.316221720593739e-06, + "loss": 0.0003, + "step": 1243 + }, + { + "epoch": 0.6808976464148878, + "grad_norm": 0.010048504918813705, + "learning_rate": 2.3089715039391447e-06, + "loss": 0.0005, + "step": 1244 + }, + { + "epoch": 0.6814449917898193, + "grad_norm": 0.32672691345214844, + "learning_rate": 2.301729244128496e-06, + "loss": 0.0203, + "step": 1245 + }, + { + "epoch": 0.6819923371647509, + "grad_norm": 0.37050729990005493, + "learning_rate": 2.2944949625757295e-06, + "loss": 0.0143, + "step": 1246 + }, + { + "epoch": 0.6825396825396826, + "grad_norm": 0.07902567833662033, + "learning_rate": 2.2872686806712037e-06, + "loss": 0.0046, + "step": 1247 + }, + { + "epoch": 0.6830870279146142, + "grad_norm": 0.0054732682183384895, + "learning_rate": 2.2800504197816147e-06, + "loss": 0.0003, + "step": 1248 + }, + { + "epoch": 0.6836343732895457, + "grad_norm": 0.002054156269878149, + "learning_rate": 2.2728402012499477e-06, + "loss": 0.0001, + "step": 1249 + }, + { + "epoch": 0.6841817186644773, + "grad_norm": 2.3521831035614014, + "learning_rate": 2.265638046395405e-06, + "loss": 0.297, + "step": 1250 + }, + { + "epoch": 0.6847290640394089, + "grad_norm": 4.510025501251221, + "learning_rate": 2.2584439765133453e-06, + "loss": 0.15, + "step": 1251 + }, + { + "epoch": 0.6852764094143404, + "grad_norm": 0.0020269020460546017, + "learning_rate": 2.251258012875228e-06, + "loss": 0.0001, + "step": 1252 + }, + { + "epoch": 0.685823754789272, + "grad_norm": 0.1867758184671402, + "learning_rate": 2.244080176728536e-06, + "loss": 0.0132, + "step": 1253 + }, + { + "epoch": 0.6863711001642037, + "grad_norm": 0.021715736016631126, + "learning_rate": 2.2369104892967253e-06, + "loss": 0.001, + "step": 1254 + }, + { + "epoch": 0.6869184455391352, + "grad_norm": 0.9083892107009888, + "learning_rate": 2.229748971779157e-06, + "loss": 0.0824, + "step": 1255 + }, + { + "epoch": 0.6874657909140668, + "grad_norm": 0.001186563284136355, + "learning_rate": 2.2225956453510345e-06, + "loss": 0.0001, + "step": 1256 + }, + { + "epoch": 0.6880131362889984, + "grad_norm": 2.4314286708831787, + "learning_rate": 2.2154505311633406e-06, + "loss": 0.3183, + "step": 1257 + }, + { + "epoch": 0.6885604816639299, + "grad_norm": 0.2594597041606903, + "learning_rate": 2.208313650342784e-06, + "loss": 0.0179, + "step": 1258 + }, + { + "epoch": 0.6891078270388615, + "grad_norm": 0.1705789715051651, + "learning_rate": 2.2011850239917136e-06, + "loss": 0.0117, + "step": 1259 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.010192793793976307, + "learning_rate": 2.1940646731880887e-06, + "loss": 0.0004, + "step": 1260 + }, + { + "epoch": 0.6902025177887247, + "grad_norm": 0.008953562937676907, + "learning_rate": 2.186952618985387e-06, + "loss": 0.0003, + "step": 1261 + }, + { + "epoch": 0.6907498631636563, + "grad_norm": 0.022463621571660042, + "learning_rate": 2.1798488824125613e-06, + "loss": 0.0013, + "step": 1262 + }, + { + "epoch": 0.6912972085385879, + "grad_norm": 0.06705313920974731, + "learning_rate": 2.1727534844739658e-06, + "loss": 0.0032, + "step": 1263 + }, + { + "epoch": 0.6918445539135194, + "grad_norm": 0.2966002821922302, + "learning_rate": 2.1656664461493073e-06, + "loss": 0.0223, + "step": 1264 + }, + { + "epoch": 0.692391899288451, + "grad_norm": 0.06438577175140381, + "learning_rate": 2.1585877883935617e-06, + "loss": 0.0037, + "step": 1265 + }, + { + "epoch": 0.6929392446633826, + "grad_norm": 0.01615588553249836, + "learning_rate": 2.151517532136939e-06, + "loss": 0.0008, + "step": 1266 + }, + { + "epoch": 0.6934865900383141, + "grad_norm": 0.004233787767589092, + "learning_rate": 2.1444556982847996e-06, + "loss": 0.0002, + "step": 1267 + }, + { + "epoch": 0.6940339354132458, + "grad_norm": 0.6990953683853149, + "learning_rate": 2.137402307717602e-06, + "loss": 0.0346, + "step": 1268 + }, + { + "epoch": 0.6945812807881774, + "grad_norm": 2.0177769660949707, + "learning_rate": 2.1303573812908383e-06, + "loss": 0.2455, + "step": 1269 + }, + { + "epoch": 0.6951286261631089, + "grad_norm": 0.14173780381679535, + "learning_rate": 2.1233209398349817e-06, + "loss": 0.0088, + "step": 1270 + }, + { + "epoch": 0.6956759715380405, + "grad_norm": 0.018130486831068993, + "learning_rate": 2.1162930041554026e-06, + "loss": 0.001, + "step": 1271 + }, + { + "epoch": 0.6962233169129721, + "grad_norm": 0.023228706791996956, + "learning_rate": 2.109273595032335e-06, + "loss": 0.0009, + "step": 1272 + }, + { + "epoch": 0.6967706622879036, + "grad_norm": 0.06960176676511765, + "learning_rate": 2.1022627332207944e-06, + "loss": 0.0034, + "step": 1273 + }, + { + "epoch": 0.6973180076628352, + "grad_norm": 0.01118812058120966, + "learning_rate": 2.095260439450526e-06, + "loss": 0.0005, + "step": 1274 + }, + { + "epoch": 0.6978653530377669, + "grad_norm": 0.0013710305793210864, + "learning_rate": 2.0882667344259384e-06, + "loss": 0.0001, + "step": 1275 + }, + { + "epoch": 0.6984126984126984, + "grad_norm": 0.009152219630777836, + "learning_rate": 2.081281638826052e-06, + "loss": 0.0005, + "step": 1276 + }, + { + "epoch": 0.69896004378763, + "grad_norm": 1.5331132411956787, + "learning_rate": 2.0743051733044184e-06, + "loss": 0.2018, + "step": 1277 + }, + { + "epoch": 0.6995073891625616, + "grad_norm": 0.006271702703088522, + "learning_rate": 2.0673373584890847e-06, + "loss": 0.0003, + "step": 1278 + }, + { + "epoch": 0.7000547345374931, + "grad_norm": 0.005529694724828005, + "learning_rate": 2.0603782149825126e-06, + "loss": 0.0003, + "step": 1279 + }, + { + "epoch": 0.7006020799124247, + "grad_norm": 0.3179608881473541, + "learning_rate": 2.053427763361525e-06, + "loss": 0.0212, + "step": 1280 + }, + { + "epoch": 0.7011494252873564, + "grad_norm": 0.007049943320453167, + "learning_rate": 2.0464860241772454e-06, + "loss": 0.0003, + "step": 1281 + }, + { + "epoch": 0.7016967706622879, + "grad_norm": 0.42016157507896423, + "learning_rate": 2.0395530179550365e-06, + "loss": 0.0337, + "step": 1282 + }, + { + "epoch": 0.7022441160372195, + "grad_norm": 0.049554865807294846, + "learning_rate": 2.0326287651944392e-06, + "loss": 0.003, + "step": 1283 + }, + { + "epoch": 0.7027914614121511, + "grad_norm": 2.7460994720458984, + "learning_rate": 2.0257132863691108e-06, + "loss": 0.5833, + "step": 1284 + }, + { + "epoch": 0.7033388067870826, + "grad_norm": 0.8982927799224854, + "learning_rate": 2.01880660192677e-06, + "loss": 0.0693, + "step": 1285 + }, + { + "epoch": 0.7038861521620142, + "grad_norm": 0.12810498476028442, + "learning_rate": 2.011908732289127e-06, + "loss": 0.0062, + "step": 1286 + }, + { + "epoch": 0.7044334975369458, + "grad_norm": 0.001164785004220903, + "learning_rate": 2.0050196978518323e-06, + "loss": 0.0001, + "step": 1287 + }, + { + "epoch": 0.7049808429118773, + "grad_norm": 2.3705291748046875, + "learning_rate": 1.998139518984409e-06, + "loss": 0.5842, + "step": 1288 + }, + { + "epoch": 0.705528188286809, + "grad_norm": 1.3433531522750854, + "learning_rate": 1.9912682160301986e-06, + "loss": 0.2473, + "step": 1289 + }, + { + "epoch": 0.7060755336617406, + "grad_norm": 1.682137131690979, + "learning_rate": 1.9844058093062962e-06, + "loss": 0.3179, + "step": 1290 + }, + { + "epoch": 0.7066228790366721, + "grad_norm": 0.007880594581365585, + "learning_rate": 1.977552319103498e-06, + "loss": 0.0004, + "step": 1291 + }, + { + "epoch": 0.7071702244116037, + "grad_norm": 0.005036715883761644, + "learning_rate": 1.970707765686225e-06, + "loss": 0.0002, + "step": 1292 + }, + { + "epoch": 0.7077175697865353, + "grad_norm": 0.1129072830080986, + "learning_rate": 1.963872169292486e-06, + "loss": 0.0074, + "step": 1293 + }, + { + "epoch": 0.7082649151614668, + "grad_norm": 0.06477373838424683, + "learning_rate": 1.957045550133798e-06, + "loss": 0.0037, + "step": 1294 + }, + { + "epoch": 0.7088122605363985, + "grad_norm": 0.0029708503279834986, + "learning_rate": 1.9502279283951363e-06, + "loss": 0.0002, + "step": 1295 + }, + { + "epoch": 0.7093596059113301, + "grad_norm": 0.043561290949583054, + "learning_rate": 1.943419324234871e-06, + "loss": 0.0025, + "step": 1296 + }, + { + "epoch": 0.7099069512862616, + "grad_norm": 0.03608259931206703, + "learning_rate": 1.9366197577847144e-06, + "loss": 0.0017, + "step": 1297 + }, + { + "epoch": 0.7104542966611932, + "grad_norm": 0.017917588353157043, + "learning_rate": 1.929829249149646e-06, + "loss": 0.0009, + "step": 1298 + }, + { + "epoch": 0.7110016420361248, + "grad_norm": 0.0025389937218278646, + "learning_rate": 1.923047818407875e-06, + "loss": 0.0002, + "step": 1299 + }, + { + "epoch": 0.7115489874110563, + "grad_norm": 0.004917440470308065, + "learning_rate": 1.916275485610761e-06, + "loss": 0.0002, + "step": 1300 + }, + { + "epoch": 0.7120963327859879, + "grad_norm": 0.46073776483535767, + "learning_rate": 1.909512270782764e-06, + "loss": 0.0267, + "step": 1301 + }, + { + "epoch": 0.7126436781609196, + "grad_norm": 0.06749262660741806, + "learning_rate": 1.9027581939213852e-06, + "loss": 0.0034, + "step": 1302 + }, + { + "epoch": 0.7131910235358512, + "grad_norm": 0.1648429036140442, + "learning_rate": 1.8960132749971077e-06, + "loss": 0.0105, + "step": 1303 + }, + { + "epoch": 0.7137383689107827, + "grad_norm": 0.003856829833239317, + "learning_rate": 1.8892775339533354e-06, + "loss": 0.0002, + "step": 1304 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.19861356914043427, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.0131, + "step": 1305 + }, + { + "epoch": 0.7148330596606459, + "grad_norm": 0.003126499243080616, + "learning_rate": 1.8758336651451697e-06, + "loss": 0.0002, + "step": 1306 + }, + { + "epoch": 0.7153804050355774, + "grad_norm": 0.003605087986215949, + "learning_rate": 1.8691255771316664e-06, + "loss": 0.0002, + "step": 1307 + }, + { + "epoch": 0.715927750410509, + "grad_norm": 0.2548067271709442, + "learning_rate": 1.8624267465003176e-06, + "loss": 0.0152, + "step": 1308 + }, + { + "epoch": 0.7164750957854407, + "grad_norm": 0.00206976430490613, + "learning_rate": 1.8557371930582579e-06, + "loss": 0.0001, + "step": 1309 + }, + { + "epoch": 0.7170224411603722, + "grad_norm": 0.002555719343945384, + "learning_rate": 1.8490569365851846e-06, + "loss": 0.0001, + "step": 1310 + }, + { + "epoch": 0.7175697865353038, + "grad_norm": 0.00730897206813097, + "learning_rate": 1.8423859968333063e-06, + "loss": 0.0003, + "step": 1311 + }, + { + "epoch": 0.7181171319102354, + "grad_norm": 0.09046211838722229, + "learning_rate": 1.8357243935272856e-06, + "loss": 0.0052, + "step": 1312 + }, + { + "epoch": 0.7186644772851669, + "grad_norm": 0.5083016157150269, + "learning_rate": 1.8290721463641782e-06, + "loss": 0.0485, + "step": 1313 + }, + { + "epoch": 0.7192118226600985, + "grad_norm": 0.18554258346557617, + "learning_rate": 1.8224292750133743e-06, + "loss": 0.0146, + "step": 1314 + }, + { + "epoch": 0.7197591680350302, + "grad_norm": 0.005190784577280283, + "learning_rate": 1.8157957991165415e-06, + "loss": 0.0002, + "step": 1315 + }, + { + "epoch": 0.7203065134099617, + "grad_norm": 0.06379152089357376, + "learning_rate": 1.8091717382875723e-06, + "loss": 0.0035, + "step": 1316 + }, + { + "epoch": 0.7208538587848933, + "grad_norm": 0.04362674430012703, + "learning_rate": 1.8025571121125141e-06, + "loss": 0.0024, + "step": 1317 + }, + { + "epoch": 0.7214012041598249, + "grad_norm": 0.0061056301929056644, + "learning_rate": 1.7959519401495208e-06, + "loss": 0.0003, + "step": 1318 + }, + { + "epoch": 0.7219485495347564, + "grad_norm": 0.0022033448331058025, + "learning_rate": 1.7893562419287908e-06, + "loss": 0.0001, + "step": 1319 + }, + { + "epoch": 0.722495894909688, + "grad_norm": 0.0009400215349160135, + "learning_rate": 1.7827700369525125e-06, + "loss": 0.0001, + "step": 1320 + }, + { + "epoch": 0.7230432402846196, + "grad_norm": 2.527238368988037, + "learning_rate": 1.7761933446948004e-06, + "loss": 0.1914, + "step": 1321 + }, + { + "epoch": 0.7235905856595511, + "grad_norm": 0.1244870126247406, + "learning_rate": 1.7696261846016505e-06, + "loss": 0.0085, + "step": 1322 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 0.04909505695104599, + "learning_rate": 1.7630685760908623e-06, + "loss": 0.0022, + "step": 1323 + }, + { + "epoch": 0.7246852764094144, + "grad_norm": 2.485495090484619, + "learning_rate": 1.756520538552003e-06, + "loss": 0.1524, + "step": 1324 + }, + { + "epoch": 0.7252326217843459, + "grad_norm": 0.0722983255982399, + "learning_rate": 1.749982091346335e-06, + "loss": 0.0033, + "step": 1325 + }, + { + "epoch": 0.7257799671592775, + "grad_norm": 0.003003154881298542, + "learning_rate": 1.7434532538067655e-06, + "loss": 0.0002, + "step": 1326 + }, + { + "epoch": 0.7263273125342091, + "grad_norm": 0.0019068497931584716, + "learning_rate": 1.736934045237787e-06, + "loss": 0.0001, + "step": 1327 + }, + { + "epoch": 0.7268746579091406, + "grad_norm": 0.004010593984276056, + "learning_rate": 1.7304244849154256e-06, + "loss": 0.0002, + "step": 1328 + }, + { + "epoch": 0.7274220032840722, + "grad_norm": 0.15621091425418854, + "learning_rate": 1.72392459208717e-06, + "loss": 0.0071, + "step": 1329 + }, + { + "epoch": 0.7279693486590039, + "grad_norm": 0.39699360728263855, + "learning_rate": 1.7174343859719334e-06, + "loss": 0.0214, + "step": 1330 + }, + { + "epoch": 0.7285166940339354, + "grad_norm": 0.01176412496715784, + "learning_rate": 1.7109538857599829e-06, + "loss": 0.0006, + "step": 1331 + }, + { + "epoch": 0.729064039408867, + "grad_norm": 0.0023852819576859474, + "learning_rate": 1.7044831106128867e-06, + "loss": 0.0001, + "step": 1332 + }, + { + "epoch": 0.7296113847837986, + "grad_norm": 0.011824233457446098, + "learning_rate": 1.6980220796634583e-06, + "loss": 0.0005, + "step": 1333 + }, + { + "epoch": 0.7301587301587301, + "grad_norm": 0.0041591511107981205, + "learning_rate": 1.6915708120157042e-06, + "loss": 0.0002, + "step": 1334 + }, + { + "epoch": 0.7307060755336617, + "grad_norm": 1.6555798053741455, + "learning_rate": 1.6851293267447527e-06, + "loss": 0.2759, + "step": 1335 + }, + { + "epoch": 0.7312534209085934, + "grad_norm": 0.056029561907052994, + "learning_rate": 1.6786976428968188e-06, + "loss": 0.003, + "step": 1336 + }, + { + "epoch": 0.7318007662835249, + "grad_norm": 0.07709532231092453, + "learning_rate": 1.6722757794891287e-06, + "loss": 0.0043, + "step": 1337 + }, + { + "epoch": 0.7323481116584565, + "grad_norm": 0.049780573695898056, + "learning_rate": 1.6658637555098744e-06, + "loss": 0.0029, + "step": 1338 + }, + { + "epoch": 0.7328954570333881, + "grad_norm": 1.2633681297302246, + "learning_rate": 1.6594615899181526e-06, + "loss": 0.1035, + "step": 1339 + }, + { + "epoch": 0.7334428024083196, + "grad_norm": 0.0017923095729202032, + "learning_rate": 1.653069301643918e-06, + "loss": 0.0001, + "step": 1340 + }, + { + "epoch": 0.7339901477832512, + "grad_norm": 0.06065789982676506, + "learning_rate": 1.6466869095879079e-06, + "loss": 0.003, + "step": 1341 + }, + { + "epoch": 0.7345374931581828, + "grad_norm": 0.004060553852468729, + "learning_rate": 1.6403144326216085e-06, + "loss": 0.0001, + "step": 1342 + }, + { + "epoch": 0.7350848385331143, + "grad_norm": 1.5886331796646118, + "learning_rate": 1.6339518895871853e-06, + "loss": 0.2001, + "step": 1343 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 0.005949839483946562, + "learning_rate": 1.627599299297431e-06, + "loss": 0.0003, + "step": 1344 + }, + { + "epoch": 0.7361795292829776, + "grad_norm": 0.004496109671890736, + "learning_rate": 1.6212566805357094e-06, + "loss": 0.0002, + "step": 1345 + }, + { + "epoch": 0.7367268746579091, + "grad_norm": 0.006199996452778578, + "learning_rate": 1.6149240520559023e-06, + "loss": 0.0002, + "step": 1346 + }, + { + "epoch": 0.7372742200328407, + "grad_norm": 0.002357217948883772, + "learning_rate": 1.6086014325823485e-06, + "loss": 0.0001, + "step": 1347 + }, + { + "epoch": 0.7378215654077723, + "grad_norm": 0.015918180346488953, + "learning_rate": 1.6022888408097991e-06, + "loss": 0.0008, + "step": 1348 + }, + { + "epoch": 0.7383689107827038, + "grad_norm": 0.0009474227554164827, + "learning_rate": 1.5959862954033495e-06, + "loss": 0.0001, + "step": 1349 + }, + { + "epoch": 0.7389162561576355, + "grad_norm": 0.08809763938188553, + "learning_rate": 1.589693814998391e-06, + "loss": 0.0051, + "step": 1350 + }, + { + "epoch": 0.7394636015325671, + "grad_norm": 2.6322948932647705, + "learning_rate": 1.5834114182005544e-06, + "loss": 0.346, + "step": 1351 + }, + { + "epoch": 0.7400109469074986, + "grad_norm": 0.17066499590873718, + "learning_rate": 1.577139123585657e-06, + "loss": 0.0112, + "step": 1352 + }, + { + "epoch": 0.7405582922824302, + "grad_norm": 0.0018952427199110389, + "learning_rate": 1.5708769496996445e-06, + "loss": 0.0001, + "step": 1353 + }, + { + "epoch": 0.7411056376573618, + "grad_norm": 0.005605690646916628, + "learning_rate": 1.5646249150585368e-06, + "loss": 0.0003, + "step": 1354 + }, + { + "epoch": 0.7416529830322933, + "grad_norm": 0.07938912510871887, + "learning_rate": 1.5583830381483789e-06, + "loss": 0.0039, + "step": 1355 + }, + { + "epoch": 0.7422003284072249, + "grad_norm": 2.0162599086761475, + "learning_rate": 1.552151337425173e-06, + "loss": 0.4149, + "step": 1356 + }, + { + "epoch": 0.7427476737821566, + "grad_norm": 0.0025566776748746634, + "learning_rate": 1.5459298313148402e-06, + "loss": 0.0002, + "step": 1357 + }, + { + "epoch": 0.7432950191570882, + "grad_norm": 0.301851361989975, + "learning_rate": 1.5397185382131524e-06, + "loss": 0.0229, + "step": 1358 + }, + { + "epoch": 0.7438423645320197, + "grad_norm": 0.005030965898185968, + "learning_rate": 1.533517476485691e-06, + "loss": 0.0002, + "step": 1359 + }, + { + "epoch": 0.7443897099069513, + "grad_norm": 0.05721491575241089, + "learning_rate": 1.5273266644677737e-06, + "loss": 0.0034, + "step": 1360 + }, + { + "epoch": 0.7449370552818829, + "grad_norm": 0.005043413024395704, + "learning_rate": 1.521146120464424e-06, + "loss": 0.0002, + "step": 1361 + }, + { + "epoch": 0.7454844006568144, + "grad_norm": 0.010818198323249817, + "learning_rate": 1.514975862750297e-06, + "loss": 0.0006, + "step": 1362 + }, + { + "epoch": 0.746031746031746, + "grad_norm": 0.010304316878318787, + "learning_rate": 1.5088159095696365e-06, + "loss": 0.0005, + "step": 1363 + }, + { + "epoch": 0.7465790914066777, + "grad_norm": 0.08233784884214401, + "learning_rate": 1.5026662791362145e-06, + "loss": 0.0043, + "step": 1364 + }, + { + "epoch": 0.7471264367816092, + "grad_norm": 0.00964348390698433, + "learning_rate": 1.4965269896332884e-06, + "loss": 0.0005, + "step": 1365 + }, + { + "epoch": 0.7476737821565408, + "grad_norm": 0.005296952556818724, + "learning_rate": 1.4903980592135281e-06, + "loss": 0.0003, + "step": 1366 + }, + { + "epoch": 0.7482211275314724, + "grad_norm": 0.00572380842640996, + "learning_rate": 1.4842795059989845e-06, + "loss": 0.0003, + "step": 1367 + }, + { + "epoch": 0.7487684729064039, + "grad_norm": 0.0025136778131127357, + "learning_rate": 1.4781713480810184e-06, + "loss": 0.0001, + "step": 1368 + }, + { + "epoch": 0.7493158182813355, + "grad_norm": 0.002252168720588088, + "learning_rate": 1.472073603520256e-06, + "loss": 0.0001, + "step": 1369 + }, + { + "epoch": 0.7498631636562672, + "grad_norm": 0.12772350013256073, + "learning_rate": 1.4659862903465322e-06, + "loss": 0.0059, + "step": 1370 + }, + { + "epoch": 0.7504105090311987, + "grad_norm": 0.006171985529363155, + "learning_rate": 1.4599094265588432e-06, + "loss": 0.0004, + "step": 1371 + }, + { + "epoch": 0.7509578544061303, + "grad_norm": 0.2971583902835846, + "learning_rate": 1.4538430301252783e-06, + "loss": 0.0203, + "step": 1372 + }, + { + "epoch": 0.7515051997810619, + "grad_norm": 0.01382134947925806, + "learning_rate": 1.4477871189829872e-06, + "loss": 0.0008, + "step": 1373 + }, + { + "epoch": 0.7520525451559934, + "grad_norm": 2.908761978149414, + "learning_rate": 1.4417417110381126e-06, + "loss": 0.6791, + "step": 1374 + }, + { + "epoch": 0.752599890530925, + "grad_norm": 0.004314839839935303, + "learning_rate": 1.4357068241657396e-06, + "loss": 0.0002, + "step": 1375 + }, + { + "epoch": 0.7531472359058566, + "grad_norm": 1.0610297918319702, + "learning_rate": 1.4296824762098465e-06, + "loss": 0.0323, + "step": 1376 + }, + { + "epoch": 0.7536945812807881, + "grad_norm": 0.031116381287574768, + "learning_rate": 1.4236686849832497e-06, + "loss": 0.0017, + "step": 1377 + }, + { + "epoch": 0.7542419266557198, + "grad_norm": 2.27239990234375, + "learning_rate": 1.4176654682675518e-06, + "loss": 0.4945, + "step": 1378 + }, + { + "epoch": 0.7547892720306514, + "grad_norm": 0.006444338243454695, + "learning_rate": 1.411672843813086e-06, + "loss": 0.0003, + "step": 1379 + }, + { + "epoch": 0.7553366174055829, + "grad_norm": 0.016596568748354912, + "learning_rate": 1.405690829338872e-06, + "loss": 0.0006, + "step": 1380 + }, + { + "epoch": 0.7558839627805145, + "grad_norm": 0.0022818318102508783, + "learning_rate": 1.3997194425325533e-06, + "loss": 0.0001, + "step": 1381 + }, + { + "epoch": 0.7564313081554461, + "grad_norm": 0.25383079051971436, + "learning_rate": 1.39375870105035e-06, + "loss": 0.0177, + "step": 1382 + }, + { + "epoch": 0.7569786535303776, + "grad_norm": 0.0013886289671063423, + "learning_rate": 1.3878086225170067e-06, + "loss": 0.0001, + "step": 1383 + }, + { + "epoch": 0.7575259989053093, + "grad_norm": 0.12911343574523926, + "learning_rate": 1.3818692245257398e-06, + "loss": 0.0084, + "step": 1384 + }, + { + "epoch": 0.7580733442802409, + "grad_norm": 0.09496220201253891, + "learning_rate": 1.3759405246381841e-06, + "loss": 0.0055, + "step": 1385 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 0.11710565537214279, + "learning_rate": 1.370022540384347e-06, + "loss": 0.0087, + "step": 1386 + }, + { + "epoch": 0.759168035030104, + "grad_norm": 0.001593872788362205, + "learning_rate": 1.364115289262543e-06, + "loss": 0.0001, + "step": 1387 + }, + { + "epoch": 0.7597153804050356, + "grad_norm": 0.006262289825826883, + "learning_rate": 1.358218788739361e-06, + "loss": 0.0003, + "step": 1388 + }, + { + "epoch": 0.7602627257799671, + "grad_norm": 0.004391077905893326, + "learning_rate": 1.352333056249595e-06, + "loss": 0.0003, + "step": 1389 + }, + { + "epoch": 0.7608100711548987, + "grad_norm": 0.012612469494342804, + "learning_rate": 1.3464581091962037e-06, + "loss": 0.0006, + "step": 1390 + }, + { + "epoch": 0.7613574165298304, + "grad_norm": 0.24891842901706696, + "learning_rate": 1.340593964950252e-06, + "loss": 0.0156, + "step": 1391 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.0030374748166650534, + "learning_rate": 1.3347406408508695e-06, + "loss": 0.0002, + "step": 1392 + }, + { + "epoch": 0.7624521072796935, + "grad_norm": 0.028773490339517593, + "learning_rate": 1.3288981542051844e-06, + "loss": 0.0016, + "step": 1393 + }, + { + "epoch": 0.7629994526546251, + "grad_norm": 0.051260218024253845, + "learning_rate": 1.3230665222882872e-06, + "loss": 0.0023, + "step": 1394 + }, + { + "epoch": 0.7635467980295566, + "grad_norm": 0.10326068848371506, + "learning_rate": 1.3172457623431706e-06, + "loss": 0.0064, + "step": 1395 + }, + { + "epoch": 0.7640941434044882, + "grad_norm": 0.09019647538661957, + "learning_rate": 1.3114358915806808e-06, + "loss": 0.0056, + "step": 1396 + }, + { + "epoch": 0.7646414887794198, + "grad_norm": 0.06393663585186005, + "learning_rate": 1.3056369271794656e-06, + "loss": 0.0038, + "step": 1397 + }, + { + "epoch": 0.7651888341543513, + "grad_norm": 0.0010087719419971108, + "learning_rate": 1.2998488862859305e-06, + "loss": 0.0001, + "step": 1398 + }, + { + "epoch": 0.765736179529283, + "grad_norm": 0.24810169637203217, + "learning_rate": 1.2940717860141734e-06, + "loss": 0.0203, + "step": 1399 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.0025587116833776236, + "learning_rate": 1.2883056434459506e-06, + "loss": 0.0001, + "step": 1400 + }, + { + "epoch": 0.7668308702791461, + "grad_norm": 0.17504432797431946, + "learning_rate": 1.2825504756306156e-06, + "loss": 0.0092, + "step": 1401 + }, + { + "epoch": 0.7673782156540777, + "grad_norm": 0.003361350391060114, + "learning_rate": 1.2768062995850716e-06, + "loss": 0.0002, + "step": 1402 + }, + { + "epoch": 0.7679255610290093, + "grad_norm": 0.004303762689232826, + "learning_rate": 1.2710731322937198e-06, + "loss": 0.0002, + "step": 1403 + }, + { + "epoch": 0.7684729064039408, + "grad_norm": 0.0592142678797245, + "learning_rate": 1.2653509907084171e-06, + "loss": 0.0027, + "step": 1404 + }, + { + "epoch": 0.7690202517788725, + "grad_norm": 0.002017846331000328, + "learning_rate": 1.2596398917484088e-06, + "loss": 0.0001, + "step": 1405 + }, + { + "epoch": 0.7695675971538041, + "grad_norm": 0.06095453351736069, + "learning_rate": 1.2539398523003e-06, + "loss": 0.0035, + "step": 1406 + }, + { + "epoch": 0.7701149425287356, + "grad_norm": 0.05319780856370926, + "learning_rate": 1.2482508892179884e-06, + "loss": 0.0021, + "step": 1407 + }, + { + "epoch": 0.7706622879036672, + "grad_norm": 0.10127472877502441, + "learning_rate": 1.2425730193226237e-06, + "loss": 0.0067, + "step": 1408 + }, + { + "epoch": 0.7712096332785988, + "grad_norm": 0.00393831729888916, + "learning_rate": 1.2369062594025549e-06, + "loss": 0.0002, + "step": 1409 + }, + { + "epoch": 0.7717569786535303, + "grad_norm": 0.132782444357872, + "learning_rate": 1.2312506262132795e-06, + "loss": 0.0068, + "step": 1410 + }, + { + "epoch": 0.7723043240284619, + "grad_norm": 0.0037302267737686634, + "learning_rate": 1.2256061364773958e-06, + "loss": 0.0002, + "step": 1411 + }, + { + "epoch": 0.7728516694033936, + "grad_norm": 0.09026055783033371, + "learning_rate": 1.2199728068845574e-06, + "loss": 0.0045, + "step": 1412 + }, + { + "epoch": 0.7733990147783252, + "grad_norm": 0.028559090569615364, + "learning_rate": 1.214350654091413e-06, + "loss": 0.0015, + "step": 1413 + }, + { + "epoch": 0.7739463601532567, + "grad_norm": 0.10447684675455093, + "learning_rate": 1.2087396947215678e-06, + "loss": 0.0068, + "step": 1414 + }, + { + "epoch": 0.7744937055281883, + "grad_norm": 0.004746158141642809, + "learning_rate": 1.2031399453655296e-06, + "loss": 0.0002, + "step": 1415 + }, + { + "epoch": 0.7750410509031199, + "grad_norm": 0.0731428861618042, + "learning_rate": 1.1975514225806573e-06, + "loss": 0.004, + "step": 1416 + }, + { + "epoch": 0.7755883962780514, + "grad_norm": 0.010456275194883347, + "learning_rate": 1.191974142891123e-06, + "loss": 0.0005, + "step": 1417 + }, + { + "epoch": 0.776135741652983, + "grad_norm": 0.11829491704702377, + "learning_rate": 1.1864081227878438e-06, + "loss": 0.0091, + "step": 1418 + }, + { + "epoch": 0.7766830870279147, + "grad_norm": 2.1715807914733887, + "learning_rate": 1.1808533787284543e-06, + "loss": 0.1799, + "step": 1419 + }, + { + "epoch": 0.7772304324028462, + "grad_norm": 0.0016666334122419357, + "learning_rate": 1.1753099271372432e-06, + "loss": 0.0001, + "step": 1420 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.0010274969972670078, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.0001, + "step": 1421 + }, + { + "epoch": 0.7783251231527094, + "grad_norm": 0.001577872666530311, + "learning_rate": 1.1642569668895171e-06, + "loss": 0.0001, + "step": 1422 + }, + { + "epoch": 0.7788724685276409, + "grad_norm": 0.0030261666979640722, + "learning_rate": 1.1587474909144419e-06, + "loss": 0.0002, + "step": 1423 + }, + { + "epoch": 0.7794198139025725, + "grad_norm": 0.0762784481048584, + "learning_rate": 1.1532493727703214e-06, + "loss": 0.0037, + "step": 1424 + }, + { + "epoch": 0.7799671592775042, + "grad_norm": 2.675801992416382, + "learning_rate": 1.1477626287140164e-06, + "loss": 0.1384, + "step": 1425 + }, + { + "epoch": 0.7805145046524357, + "grad_norm": 0.04527585953474045, + "learning_rate": 1.1422872749687542e-06, + "loss": 0.0028, + "step": 1426 + }, + { + "epoch": 0.7810618500273673, + "grad_norm": 0.005558805540204048, + "learning_rate": 1.136823327724081e-06, + "loss": 0.0003, + "step": 1427 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 0.27077722549438477, + "learning_rate": 1.1313708031358183e-06, + "loss": 0.0131, + "step": 1428 + }, + { + "epoch": 0.7821565407772304, + "grad_norm": 0.025989117100834846, + "learning_rate": 1.1259297173260158e-06, + "loss": 0.0013, + "step": 1429 + }, + { + "epoch": 0.782703886152162, + "grad_norm": 0.026661472395062447, + "learning_rate": 1.1205000863828936e-06, + "loss": 0.0014, + "step": 1430 + }, + { + "epoch": 0.7832512315270936, + "grad_norm": 0.001215687021613121, + "learning_rate": 1.1150819263608098e-06, + "loss": 0.0001, + "step": 1431 + }, + { + "epoch": 0.7837985769020251, + "grad_norm": 0.0007609071908518672, + "learning_rate": 1.1096752532802007e-06, + "loss": 0.0001, + "step": 1432 + }, + { + "epoch": 0.7843459222769568, + "grad_norm": 0.014400236308574677, + "learning_rate": 1.104280083127539e-06, + "loss": 0.0008, + "step": 1433 + }, + { + "epoch": 0.7848932676518884, + "grad_norm": 2.9239320755004883, + "learning_rate": 1.0988964318552848e-06, + "loss": 0.1112, + "step": 1434 + }, + { + "epoch": 0.7854406130268199, + "grad_norm": 0.001832630136050284, + "learning_rate": 1.0935243153818437e-06, + "loss": 0.0001, + "step": 1435 + }, + { + "epoch": 0.7859879584017515, + "grad_norm": 1.0065650939941406, + "learning_rate": 1.0881637495915055e-06, + "loss": 0.0454, + "step": 1436 + }, + { + "epoch": 0.7865353037766831, + "grad_norm": 0.00475548068061471, + "learning_rate": 1.0828147503344177e-06, + "loss": 0.0002, + "step": 1437 + }, + { + "epoch": 0.7870826491516146, + "grad_norm": 0.0034952079877257347, + "learning_rate": 1.077477333426521e-06, + "loss": 0.0002, + "step": 1438 + }, + { + "epoch": 0.7876299945265463, + "grad_norm": 0.47412243485450745, + "learning_rate": 1.072151514649512e-06, + "loss": 0.0393, + "step": 1439 + }, + { + "epoch": 0.7881773399014779, + "grad_norm": 0.0020336441230028868, + "learning_rate": 1.0668373097507922e-06, + "loss": 0.0001, + "step": 1440 + }, + { + "epoch": 0.7887246852764094, + "grad_norm": 0.40038520097732544, + "learning_rate": 1.061534734443425e-06, + "loss": 0.0264, + "step": 1441 + }, + { + "epoch": 0.789272030651341, + "grad_norm": 1.4300894737243652, + "learning_rate": 1.0562438044060846e-06, + "loss": 0.0774, + "step": 1442 + }, + { + "epoch": 0.7898193760262726, + "grad_norm": 0.001692896126769483, + "learning_rate": 1.0509645352830178e-06, + "loss": 0.0001, + "step": 1443 + }, + { + "epoch": 0.7903667214012041, + "grad_norm": 0.05487797036767006, + "learning_rate": 1.0456969426839869e-06, + "loss": 0.0028, + "step": 1444 + }, + { + "epoch": 0.7909140667761357, + "grad_norm": 2.1209144592285156, + "learning_rate": 1.040441042184231e-06, + "loss": 0.5426, + "step": 1445 + }, + { + "epoch": 0.7914614121510674, + "grad_norm": 0.001598983071744442, + "learning_rate": 1.035196849324418e-06, + "loss": 0.0001, + "step": 1446 + }, + { + "epoch": 0.7920087575259989, + "grad_norm": 0.7857943773269653, + "learning_rate": 1.0299643796105985e-06, + "loss": 0.0651, + "step": 1447 + }, + { + "epoch": 0.7925561029009305, + "grad_norm": 0.0008221364114433527, + "learning_rate": 1.0247436485141605e-06, + "loss": 0.0001, + "step": 1448 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 2.5932040214538574, + "learning_rate": 1.0195346714717813e-06, + "loss": 0.1756, + "step": 1449 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 2.633694648742676, + "learning_rate": 1.0143374638853892e-06, + "loss": 0.4555, + "step": 1450 + }, + { + "epoch": 0.7941981390257252, + "grad_norm": 0.0076095834374427795, + "learning_rate": 1.0091520411221028e-06, + "loss": 0.0004, + "step": 1451 + }, + { + "epoch": 0.7947454844006568, + "grad_norm": 0.0011043306440114975, + "learning_rate": 1.0039784185142065e-06, + "loss": 0.0001, + "step": 1452 + }, + { + "epoch": 0.7952928297755884, + "grad_norm": 0.0019398522563278675, + "learning_rate": 9.988166113590857e-07, + "loss": 0.0001, + "step": 1453 + }, + { + "epoch": 0.79584017515052, + "grad_norm": 3.7462940216064453, + "learning_rate": 9.936666349191936e-07, + "loss": 0.2701, + "step": 1454 + }, + { + "epoch": 0.7963875205254516, + "grad_norm": 0.921699047088623, + "learning_rate": 9.88528504422e-07, + "loss": 0.0569, + "step": 1455 + }, + { + "epoch": 0.7969348659003831, + "grad_norm": 0.0006251604063436389, + "learning_rate": 9.834022350599538e-07, + "loss": 0.0, + "step": 1456 + }, + { + "epoch": 0.7974822112753147, + "grad_norm": 0.0008446245919913054, + "learning_rate": 9.78287841990423e-07, + "loss": 0.0001, + "step": 1457 + }, + { + "epoch": 0.7980295566502463, + "grad_norm": 0.10622086375951767, + "learning_rate": 9.731853403356705e-07, + "loss": 0.0066, + "step": 1458 + }, + { + "epoch": 0.7985769020251778, + "grad_norm": 0.0006699857767671347, + "learning_rate": 9.68094745182792e-07, + "loss": 0.0001, + "step": 1459 + }, + { + "epoch": 0.7991242474001095, + "grad_norm": 1.367194652557373, + "learning_rate": 9.630160715836805e-07, + "loss": 0.2876, + "step": 1460 + }, + { + "epoch": 0.7996715927750411, + "grad_norm": 0.030433079227805138, + "learning_rate": 9.579493345549772e-07, + "loss": 0.0015, + "step": 1461 + }, + { + "epoch": 0.8002189381499726, + "grad_norm": 0.0018854098161682487, + "learning_rate": 9.528945490780339e-07, + "loss": 0.0001, + "step": 1462 + }, + { + "epoch": 0.8007662835249042, + "grad_norm": 0.006027139723300934, + "learning_rate": 9.47851730098856e-07, + "loss": 0.0003, + "step": 1463 + }, + { + "epoch": 0.8013136288998358, + "grad_norm": 0.7355122566223145, + "learning_rate": 9.428208925280746e-07, + "loss": 0.0696, + "step": 1464 + }, + { + "epoch": 0.8018609742747673, + "grad_norm": 2.0000030994415283, + "learning_rate": 9.378020512408903e-07, + "loss": 0.0859, + "step": 1465 + }, + { + "epoch": 0.8024083196496989, + "grad_norm": 2.8742313385009766, + "learning_rate": 9.327952210770319e-07, + "loss": 0.5494, + "step": 1466 + }, + { + "epoch": 0.8029556650246306, + "grad_norm": 0.16476808488368988, + "learning_rate": 9.278004168407151e-07, + "loss": 0.0098, + "step": 1467 + }, + { + "epoch": 0.8035030103995622, + "grad_norm": 0.003957406617701054, + "learning_rate": 9.228176533005984e-07, + "loss": 0.0002, + "step": 1468 + }, + { + "epoch": 0.8040503557744937, + "grad_norm": 0.00669764494523406, + "learning_rate": 9.178469451897376e-07, + "loss": 0.0003, + "step": 1469 + }, + { + "epoch": 0.8045977011494253, + "grad_norm": 0.005619878880679607, + "learning_rate": 9.128883072055411e-07, + "loss": 0.0003, + "step": 1470 + }, + { + "epoch": 0.8051450465243569, + "grad_norm": 3.6218132972717285, + "learning_rate": 9.079417540097307e-07, + "loss": 0.7148, + "step": 1471 + }, + { + "epoch": 0.8056923918992884, + "grad_norm": 0.005683012772351503, + "learning_rate": 9.030073002282941e-07, + "loss": 0.0002, + "step": 1472 + }, + { + "epoch": 0.80623973727422, + "grad_norm": 0.005043988116085529, + "learning_rate": 8.980849604514453e-07, + "loss": 0.0003, + "step": 1473 + }, + { + "epoch": 0.8067870826491517, + "grad_norm": 0.5382112860679626, + "learning_rate": 8.931747492335758e-07, + "loss": 0.0256, + "step": 1474 + }, + { + "epoch": 0.8073344280240832, + "grad_norm": 0.46263471245765686, + "learning_rate": 8.882766810932214e-07, + "loss": 0.0309, + "step": 1475 + }, + { + "epoch": 0.8078817733990148, + "grad_norm": 0.0011781550711020827, + "learning_rate": 8.833907705130091e-07, + "loss": 0.0001, + "step": 1476 + }, + { + "epoch": 0.8084291187739464, + "grad_norm": 0.006095638498663902, + "learning_rate": 8.785170319396174e-07, + "loss": 0.0003, + "step": 1477 + }, + { + "epoch": 0.8089764641488779, + "grad_norm": 0.03239554166793823, + "learning_rate": 8.736554797837376e-07, + "loss": 0.0009, + "step": 1478 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.000690631044562906, + "learning_rate": 8.688061284200266e-07, + "loss": 0.0001, + "step": 1479 + }, + { + "epoch": 0.8100711548987412, + "grad_norm": 1.924971580505371, + "learning_rate": 8.639689921870642e-07, + "loss": 0.0994, + "step": 1480 + }, + { + "epoch": 0.8106185002736727, + "grad_norm": 0.0030510155484080315, + "learning_rate": 8.591440853873184e-07, + "loss": 0.0002, + "step": 1481 + }, + { + "epoch": 0.8111658456486043, + "grad_norm": 0.010571463964879513, + "learning_rate": 8.543314222870891e-07, + "loss": 0.0005, + "step": 1482 + }, + { + "epoch": 0.8117131910235359, + "grad_norm": 0.7898827791213989, + "learning_rate": 8.495310171164805e-07, + "loss": 0.1773, + "step": 1483 + }, + { + "epoch": 0.8122605363984674, + "grad_norm": 1.2340046167373657, + "learning_rate": 8.447428840693489e-07, + "loss": 0.184, + "step": 1484 + }, + { + "epoch": 0.812807881773399, + "grad_norm": 0.004051442723721266, + "learning_rate": 8.399670373032665e-07, + "loss": 0.0002, + "step": 1485 + }, + { + "epoch": 0.8133552271483306, + "grad_norm": 0.0019139654468744993, + "learning_rate": 8.35203490939474e-07, + "loss": 0.0001, + "step": 1486 + }, + { + "epoch": 0.8139025725232621, + "grad_norm": 0.00680742971599102, + "learning_rate": 8.304522590628489e-07, + "loss": 0.0003, + "step": 1487 + }, + { + "epoch": 0.8144499178981938, + "grad_norm": 0.1572553664445877, + "learning_rate": 8.257133557218471e-07, + "loss": 0.0091, + "step": 1488 + }, + { + "epoch": 0.8149972632731254, + "grad_norm": 0.00778138916939497, + "learning_rate": 8.209867949284822e-07, + "loss": 0.0005, + "step": 1489 + }, + { + "epoch": 0.8155446086480569, + "grad_norm": 1.3799161911010742, + "learning_rate": 8.162725906582658e-07, + "loss": 0.3446, + "step": 1490 + }, + { + "epoch": 0.8160919540229885, + "grad_norm": 0.007187338080257177, + "learning_rate": 8.115707568501768e-07, + "loss": 0.0003, + "step": 1491 + }, + { + "epoch": 0.8166392993979201, + "grad_norm": 1.3800119161605835, + "learning_rate": 8.068813074066151e-07, + "loss": 0.1066, + "step": 1492 + }, + { + "epoch": 0.8171866447728516, + "grad_norm": 0.10583969950675964, + "learning_rate": 8.022042561933674e-07, + "loss": 0.0055, + "step": 1493 + }, + { + "epoch": 0.8177339901477833, + "grad_norm": 0.013661573641002178, + "learning_rate": 7.975396170395522e-07, + "loss": 0.0007, + "step": 1494 + }, + { + "epoch": 0.8182813355227149, + "grad_norm": 0.1210581511259079, + "learning_rate": 7.928874037375983e-07, + "loss": 0.0075, + "step": 1495 + }, + { + "epoch": 0.8188286808976464, + "grad_norm": 0.005312152206897736, + "learning_rate": 7.882476300431868e-07, + "loss": 0.0002, + "step": 1496 + }, + { + "epoch": 0.819376026272578, + "grad_norm": 0.026137027889490128, + "learning_rate": 7.836203096752193e-07, + "loss": 0.0012, + "step": 1497 + }, + { + "epoch": 0.8199233716475096, + "grad_norm": 1.857779622077942, + "learning_rate": 7.790054563157745e-07, + "loss": 0.3924, + "step": 1498 + }, + { + "epoch": 0.8204707170224411, + "grad_norm": 0.026395171880722046, + "learning_rate": 7.744030836100724e-07, + "loss": 0.0014, + "step": 1499 + }, + { + "epoch": 0.8210180623973727, + "grad_norm": 0.014956077560782433, + "learning_rate": 7.698132051664236e-07, + "loss": 0.0007, + "step": 1500 + }, + { + "epoch": 0.8215654077723044, + "grad_norm": 1.9142413139343262, + "learning_rate": 7.652358345562016e-07, + "loss": 0.2772, + "step": 1501 + }, + { + "epoch": 0.8221127531472359, + "grad_norm": 0.0007567325956188142, + "learning_rate": 7.606709853137939e-07, + "loss": 0.0001, + "step": 1502 + }, + { + "epoch": 0.8226600985221675, + "grad_norm": 0.0009599222685210407, + "learning_rate": 7.561186709365653e-07, + "loss": 0.0001, + "step": 1503 + }, + { + "epoch": 0.8232074438970991, + "grad_norm": 0.13788969814777374, + "learning_rate": 7.515789048848171e-07, + "loss": 0.0076, + "step": 1504 + }, + { + "epoch": 0.8237547892720306, + "grad_norm": 0.005967889446765184, + "learning_rate": 7.470517005817473e-07, + "loss": 0.0002, + "step": 1505 + }, + { + "epoch": 0.8243021346469622, + "grad_norm": 0.001601370982825756, + "learning_rate": 7.425370714134122e-07, + "loss": 0.0001, + "step": 1506 + }, + { + "epoch": 0.8248494800218938, + "grad_norm": 0.0028195439372211695, + "learning_rate": 7.380350307286865e-07, + "loss": 0.0001, + "step": 1507 + }, + { + "epoch": 0.8253968253968254, + "grad_norm": 0.016938157379627228, + "learning_rate": 7.33545591839222e-07, + "loss": 0.0009, + "step": 1508 + }, + { + "epoch": 0.825944170771757, + "grad_norm": 0.003891898086294532, + "learning_rate": 7.290687680194092e-07, + "loss": 0.0002, + "step": 1509 + }, + { + "epoch": 0.8264915161466886, + "grad_norm": 2.533268928527832, + "learning_rate": 7.246045725063394e-07, + "loss": 0.2119, + "step": 1510 + }, + { + "epoch": 0.8270388615216201, + "grad_norm": 0.0018059737049043179, + "learning_rate": 7.201530184997635e-07, + "loss": 0.0001, + "step": 1511 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.017416877672076225, + "learning_rate": 7.157141191620548e-07, + "loss": 0.0008, + "step": 1512 + }, + { + "epoch": 0.8281335522714833, + "grad_norm": 0.04424600675702095, + "learning_rate": 7.112878876181673e-07, + "loss": 0.0027, + "step": 1513 + }, + { + "epoch": 0.8286808976464148, + "grad_norm": 0.04984992370009422, + "learning_rate": 7.068743369556042e-07, + "loss": 0.0025, + "step": 1514 + }, + { + "epoch": 0.8292282430213465, + "grad_norm": 0.01709463633596897, + "learning_rate": 7.024734802243649e-07, + "loss": 0.0007, + "step": 1515 + }, + { + "epoch": 0.8297755883962781, + "grad_norm": 1.7468068599700928, + "learning_rate": 6.980853304369239e-07, + "loss": 0.2574, + "step": 1516 + }, + { + "epoch": 0.8303229337712096, + "grad_norm": 0.002679870929569006, + "learning_rate": 6.937099005681792e-07, + "loss": 0.0001, + "step": 1517 + }, + { + "epoch": 0.8308702791461412, + "grad_norm": 0.001303765457123518, + "learning_rate": 6.8934720355542e-07, + "loss": 0.0001, + "step": 1518 + }, + { + "epoch": 0.8314176245210728, + "grad_norm": 1.4065778255462646, + "learning_rate": 6.849972522982845e-07, + "loss": 0.2669, + "step": 1519 + }, + { + "epoch": 0.8319649698960043, + "grad_norm": 0.0015437363181263208, + "learning_rate": 6.806600596587299e-07, + "loss": 0.0001, + "step": 1520 + }, + { + "epoch": 0.8325123152709359, + "grad_norm": 0.0023011069279164076, + "learning_rate": 6.763356384609809e-07, + "loss": 0.0001, + "step": 1521 + }, + { + "epoch": 0.8330596606458676, + "grad_norm": 0.0042620436288416386, + "learning_rate": 6.720240014915063e-07, + "loss": 0.0002, + "step": 1522 + }, + { + "epoch": 0.8336070060207992, + "grad_norm": 0.1685761958360672, + "learning_rate": 6.677251614989699e-07, + "loss": 0.0096, + "step": 1523 + }, + { + "epoch": 0.8341543513957307, + "grad_norm": 0.22431378066539764, + "learning_rate": 6.634391311942024e-07, + "loss": 0.0173, + "step": 1524 + }, + { + "epoch": 0.8347016967706623, + "grad_norm": 2.5347378253936768, + "learning_rate": 6.591659232501507e-07, + "loss": 0.3766, + "step": 1525 + }, + { + "epoch": 0.8352490421455939, + "grad_norm": 0.06536459177732468, + "learning_rate": 6.549055503018575e-07, + "loss": 0.0037, + "step": 1526 + }, + { + "epoch": 0.8357963875205254, + "grad_norm": 0.044048044830560684, + "learning_rate": 6.506580249464089e-07, + "loss": 0.0023, + "step": 1527 + }, + { + "epoch": 0.836343732895457, + "grad_norm": 3.255218029022217, + "learning_rate": 6.464233597429054e-07, + "loss": 0.3087, + "step": 1528 + }, + { + "epoch": 0.8368910782703887, + "grad_norm": 0.05638457089662552, + "learning_rate": 6.42201567212421e-07, + "loss": 0.0029, + "step": 1529 + }, + { + "epoch": 0.8374384236453202, + "grad_norm": 0.021221015602350235, + "learning_rate": 6.379926598379727e-07, + "loss": 0.0012, + "step": 1530 + }, + { + "epoch": 0.8379857690202518, + "grad_norm": 0.0007102249655872583, + "learning_rate": 6.337966500644699e-07, + "loss": 0.0001, + "step": 1531 + }, + { + "epoch": 0.8385331143951834, + "grad_norm": 2.1397488117218018, + "learning_rate": 6.296135502986944e-07, + "loss": 0.11, + "step": 1532 + }, + { + "epoch": 0.8390804597701149, + "grad_norm": 0.005235095042735338, + "learning_rate": 6.254433729092518e-07, + "loss": 0.0002, + "step": 1533 + }, + { + "epoch": 0.8396278051450465, + "grad_norm": 0.006238035392016172, + "learning_rate": 6.212861302265393e-07, + "loss": 0.0003, + "step": 1534 + }, + { + "epoch": 0.8401751505199782, + "grad_norm": 0.030654199421405792, + "learning_rate": 6.171418345427088e-07, + "loss": 0.0019, + "step": 1535 + }, + { + "epoch": 0.8407224958949097, + "grad_norm": 1.587546706199646, + "learning_rate": 6.130104981116314e-07, + "loss": 0.2527, + "step": 1536 + }, + { + "epoch": 0.8412698412698413, + "grad_norm": 0.003837469732388854, + "learning_rate": 6.088921331488568e-07, + "loss": 0.0002, + "step": 1537 + }, + { + "epoch": 0.8418171866447729, + "grad_norm": 0.10386902838945389, + "learning_rate": 6.04786751831587e-07, + "loss": 0.006, + "step": 1538 + }, + { + "epoch": 0.8423645320197044, + "grad_norm": 0.0031864827033132315, + "learning_rate": 6.006943662986275e-07, + "loss": 0.0002, + "step": 1539 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 1.3453859090805054, + "learning_rate": 5.966149886503614e-07, + "loss": 0.2179, + "step": 1540 + }, + { + "epoch": 0.8434592227695676, + "grad_norm": 0.002893652068451047, + "learning_rate": 5.925486309487083e-07, + "loss": 0.0002, + "step": 1541 + }, + { + "epoch": 0.8440065681444991, + "grad_norm": 0.003957205452024937, + "learning_rate": 5.884953052170917e-07, + "loss": 0.0002, + "step": 1542 + }, + { + "epoch": 0.8445539135194308, + "grad_norm": 0.011327473446726799, + "learning_rate": 5.844550234404012e-07, + "loss": 0.0006, + "step": 1543 + }, + { + "epoch": 0.8451012588943624, + "grad_norm": 0.0016348304925486445, + "learning_rate": 5.804277975649574e-07, + "loss": 0.0001, + "step": 1544 + }, + { + "epoch": 0.8456486042692939, + "grad_norm": 0.018026404082775116, + "learning_rate": 5.764136394984809e-07, + "loss": 0.0006, + "step": 1545 + }, + { + "epoch": 0.8461959496442255, + "grad_norm": 0.06777101755142212, + "learning_rate": 5.724125611100467e-07, + "loss": 0.0028, + "step": 1546 + }, + { + "epoch": 0.8467432950191571, + "grad_norm": 0.005540602374821901, + "learning_rate": 5.684245742300625e-07, + "loss": 0.0002, + "step": 1547 + }, + { + "epoch": 0.8472906403940886, + "grad_norm": 0.04115450754761696, + "learning_rate": 5.644496906502233e-07, + "loss": 0.0019, + "step": 1548 + }, + { + "epoch": 0.8478379857690203, + "grad_norm": 2.1989943981170654, + "learning_rate": 5.60487922123481e-07, + "loss": 0.2341, + "step": 1549 + }, + { + "epoch": 0.8483853311439519, + "grad_norm": 0.006643370725214481, + "learning_rate": 5.565392803640069e-07, + "loss": 0.0003, + "step": 1550 + }, + { + "epoch": 0.8489326765188834, + "grad_norm": 0.14747342467308044, + "learning_rate": 5.526037770471649e-07, + "loss": 0.0081, + "step": 1551 + }, + { + "epoch": 0.849480021893815, + "grad_norm": 0.05744752287864685, + "learning_rate": 5.486814238094629e-07, + "loss": 0.0032, + "step": 1552 + }, + { + "epoch": 0.8500273672687466, + "grad_norm": 0.02027791179716587, + "learning_rate": 5.447722322485333e-07, + "loss": 0.001, + "step": 1553 + }, + { + "epoch": 0.8505747126436781, + "grad_norm": 0.0011862594401463866, + "learning_rate": 5.408762139230889e-07, + "loss": 0.0001, + "step": 1554 + }, + { + "epoch": 0.8511220580186097, + "grad_norm": 0.052090659737586975, + "learning_rate": 5.369933803528926e-07, + "loss": 0.0035, + "step": 1555 + }, + { + "epoch": 0.8516694033935414, + "grad_norm": 2.1469361782073975, + "learning_rate": 5.331237430187214e-07, + "loss": 0.1273, + "step": 1556 + }, + { + "epoch": 0.8522167487684729, + "grad_norm": 0.02452508918941021, + "learning_rate": 5.292673133623372e-07, + "loss": 0.001, + "step": 1557 + }, + { + "epoch": 0.8527640941434045, + "grad_norm": 0.0034144010860472918, + "learning_rate": 5.254241027864432e-07, + "loss": 0.0002, + "step": 1558 + }, + { + "epoch": 0.8533114395183361, + "grad_norm": 0.14860154688358307, + "learning_rate": 5.215941226546628e-07, + "loss": 0.0095, + "step": 1559 + }, + { + "epoch": 0.8538587848932676, + "grad_norm": 0.0034301080740988255, + "learning_rate": 5.177773842914963e-07, + "loss": 0.0002, + "step": 1560 + }, + { + "epoch": 0.8544061302681992, + "grad_norm": 0.001376735046505928, + "learning_rate": 5.139738989822901e-07, + "loss": 0.0001, + "step": 1561 + }, + { + "epoch": 0.8549534756431308, + "grad_norm": 0.06300047785043716, + "learning_rate": 5.101836779732044e-07, + "loss": 0.003, + "step": 1562 + }, + { + "epoch": 0.8555008210180624, + "grad_norm": 0.10772773623466492, + "learning_rate": 5.064067324711836e-07, + "loss": 0.0074, + "step": 1563 + }, + { + "epoch": 0.856048166392994, + "grad_norm": 3.098621368408203, + "learning_rate": 5.026430736439102e-07, + "loss": 0.8447, + "step": 1564 + }, + { + "epoch": 0.8565955117679256, + "grad_norm": 0.020347200334072113, + "learning_rate": 4.988927126197901e-07, + "loss": 0.0012, + "step": 1565 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.004135144874453545, + "learning_rate": 4.951556604879049e-07, + "loss": 0.0002, + "step": 1566 + }, + { + "epoch": 0.8576902025177887, + "grad_norm": 0.13048239052295685, + "learning_rate": 4.91431928297984e-07, + "loss": 0.0085, + "step": 1567 + }, + { + "epoch": 0.8582375478927203, + "grad_norm": 0.0032235216349363327, + "learning_rate": 4.877215270603752e-07, + "loss": 0.0002, + "step": 1568 + }, + { + "epoch": 0.8587848932676518, + "grad_norm": 0.7628779411315918, + "learning_rate": 4.840244677460076e-07, + "loss": 0.0259, + "step": 1569 + }, + { + "epoch": 0.8593322386425835, + "grad_norm": 0.03926008939743042, + "learning_rate": 4.803407612863603e-07, + "loss": 0.0022, + "step": 1570 + }, + { + "epoch": 0.8598795840175151, + "grad_norm": 0.003697637002915144, + "learning_rate": 4.7667041857343276e-07, + "loss": 0.0002, + "step": 1571 + }, + { + "epoch": 0.8604269293924466, + "grad_norm": 0.007138053886592388, + "learning_rate": 4.730134504597084e-07, + "loss": 0.0003, + "step": 1572 + }, + { + "epoch": 0.8609742747673782, + "grad_norm": 0.024003850296139717, + "learning_rate": 4.69369867758126e-07, + "loss": 0.0011, + "step": 1573 + }, + { + "epoch": 0.8615216201423098, + "grad_norm": 0.013022515922784805, + "learning_rate": 4.6573968124204506e-07, + "loss": 0.0006, + "step": 1574 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 0.025911865755915642, + "learning_rate": 4.6212290164521554e-07, + "loss": 0.0011, + "step": 1575 + }, + { + "epoch": 0.8626163108921729, + "grad_norm": 1.6997803449630737, + "learning_rate": 4.585195396617464e-07, + "loss": 0.0852, + "step": 1576 + }, + { + "epoch": 0.8631636562671046, + "grad_norm": 0.0011990342754870653, + "learning_rate": 4.549296059460717e-07, + "loss": 0.0001, + "step": 1577 + }, + { + "epoch": 0.8637110016420362, + "grad_norm": 2.725243330001831, + "learning_rate": 4.5135311111292435e-07, + "loss": 0.4184, + "step": 1578 + }, + { + "epoch": 0.8642583470169677, + "grad_norm": 0.001103404094465077, + "learning_rate": 4.477900657372969e-07, + "loss": 0.0001, + "step": 1579 + }, + { + "epoch": 0.8648056923918993, + "grad_norm": 0.18278087675571442, + "learning_rate": 4.442404803544176e-07, + "loss": 0.0114, + "step": 1580 + }, + { + "epoch": 0.8653530377668309, + "grad_norm": 0.0011701801558956504, + "learning_rate": 4.407043654597126e-07, + "loss": 0.0001, + "step": 1581 + }, + { + "epoch": 0.8659003831417624, + "grad_norm": 0.0017612545052543283, + "learning_rate": 4.371817315087845e-07, + "loss": 0.0001, + "step": 1582 + }, + { + "epoch": 0.866447728516694, + "grad_norm": 2.4826674461364746, + "learning_rate": 4.336725889173676e-07, + "loss": 0.5371, + "step": 1583 + }, + { + "epoch": 0.8669950738916257, + "grad_norm": 0.010181195102632046, + "learning_rate": 4.3017694806131163e-07, + "loss": 0.0004, + "step": 1584 + }, + { + "epoch": 0.8675424192665572, + "grad_norm": 0.003270001383498311, + "learning_rate": 4.266948192765402e-07, + "loss": 0.0002, + "step": 1585 + }, + { + "epoch": 0.8680897646414888, + "grad_norm": 0.3261997699737549, + "learning_rate": 4.2322621285902697e-07, + "loss": 0.0192, + "step": 1586 + }, + { + "epoch": 0.8686371100164204, + "grad_norm": 0.004310634918510914, + "learning_rate": 4.1977113906475965e-07, + "loss": 0.0002, + "step": 1587 + }, + { + "epoch": 0.8691844553913519, + "grad_norm": 0.009598495438694954, + "learning_rate": 4.163296081097168e-07, + "loss": 0.0005, + "step": 1588 + }, + { + "epoch": 0.8697318007662835, + "grad_norm": 0.004435474518686533, + "learning_rate": 4.1290163016982855e-07, + "loss": 0.0002, + "step": 1589 + }, + { + "epoch": 0.8702791461412152, + "grad_norm": 0.001486207009293139, + "learning_rate": 4.0948721538095593e-07, + "loss": 0.0001, + "step": 1590 + }, + { + "epoch": 0.8708264915161467, + "grad_norm": 0.12514939904212952, + "learning_rate": 4.060863738388532e-07, + "loss": 0.0074, + "step": 1591 + }, + { + "epoch": 0.8713738368910783, + "grad_norm": 0.006256112828850746, + "learning_rate": 4.026991155991433e-07, + "loss": 0.0003, + "step": 1592 + }, + { + "epoch": 0.8719211822660099, + "grad_norm": 0.023347727954387665, + "learning_rate": 3.9932545067728366e-07, + "loss": 0.0012, + "step": 1593 + }, + { + "epoch": 0.8724685276409414, + "grad_norm": 0.06584785878658295, + "learning_rate": 3.9596538904854263e-07, + "loss": 0.0032, + "step": 1594 + }, + { + "epoch": 0.873015873015873, + "grad_norm": 0.08080962300300598, + "learning_rate": 3.9261894064796136e-07, + "loss": 0.0051, + "step": 1595 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 0.002836831146851182, + "learning_rate": 3.8928611537033424e-07, + "loss": 0.0002, + "step": 1596 + }, + { + "epoch": 0.8741105637657361, + "grad_norm": 0.002772917505353689, + "learning_rate": 3.859669230701718e-07, + "loss": 0.0001, + "step": 1597 + }, + { + "epoch": 0.8746579091406678, + "grad_norm": 2.930410385131836, + "learning_rate": 3.8266137356167466e-07, + "loss": 0.1096, + "step": 1598 + }, + { + "epoch": 0.8752052545155994, + "grad_norm": 0.0008128105546347797, + "learning_rate": 3.7936947661870616e-07, + "loss": 0.0001, + "step": 1599 + }, + { + "epoch": 0.8757525998905309, + "grad_norm": 0.005456219427287579, + "learning_rate": 3.760912419747592e-07, + "loss": 0.0003, + "step": 1600 + }, + { + "epoch": 0.8762999452654625, + "grad_norm": 0.014633177779614925, + "learning_rate": 3.728266793229307e-07, + "loss": 0.0007, + "step": 1601 + }, + { + "epoch": 0.8768472906403941, + "grad_norm": 2.828840732574463, + "learning_rate": 3.695757983158954e-07, + "loss": 0.5272, + "step": 1602 + }, + { + "epoch": 0.8773946360153256, + "grad_norm": 0.0014424819964915514, + "learning_rate": 3.663386085658693e-07, + "loss": 0.0001, + "step": 1603 + }, + { + "epoch": 0.8779419813902573, + "grad_norm": 0.10093405842781067, + "learning_rate": 3.631151196445887e-07, + "loss": 0.0051, + "step": 1604 + }, + { + "epoch": 0.8784893267651889, + "grad_norm": 0.0005839603254571557, + "learning_rate": 3.5990534108327926e-07, + "loss": 0.0, + "step": 1605 + }, + { + "epoch": 0.8790366721401204, + "grad_norm": 0.022238705307245255, + "learning_rate": 3.567092823726259e-07, + "loss": 0.0011, + "step": 1606 + }, + { + "epoch": 0.879584017515052, + "grad_norm": 0.14515052735805511, + "learning_rate": 3.5352695296274884e-07, + "loss": 0.0091, + "step": 1607 + }, + { + "epoch": 0.8801313628899836, + "grad_norm": 0.0035123212728649378, + "learning_rate": 3.5035836226317177e-07, + "loss": 0.0002, + "step": 1608 + }, + { + "epoch": 0.8806787082649151, + "grad_norm": 1.9213814735412598, + "learning_rate": 3.4720351964279863e-07, + "loss": 0.3315, + "step": 1609 + }, + { + "epoch": 0.8812260536398467, + "grad_norm": 0.0053222388960421085, + "learning_rate": 3.4406243442987765e-07, + "loss": 0.0003, + "step": 1610 + }, + { + "epoch": 0.8817733990147784, + "grad_norm": 0.0038527492433786392, + "learning_rate": 3.409351159119845e-07, + "loss": 0.0002, + "step": 1611 + }, + { + "epoch": 0.8823207443897099, + "grad_norm": 0.0030256537720561028, + "learning_rate": 3.3782157333598687e-07, + "loss": 0.0002, + "step": 1612 + }, + { + "epoch": 0.8828680897646415, + "grad_norm": 0.01982075721025467, + "learning_rate": 3.347218159080201e-07, + "loss": 0.001, + "step": 1613 + }, + { + "epoch": 0.8834154351395731, + "grad_norm": 0.6527382135391235, + "learning_rate": 3.3163585279345823e-07, + "loss": 0.0446, + "step": 1614 + }, + { + "epoch": 0.8839627805145046, + "grad_norm": 0.09229915589094162, + "learning_rate": 3.2856369311689174e-07, + "loss": 0.0065, + "step": 1615 + }, + { + "epoch": 0.8845101258894362, + "grad_norm": 0.045364126563072205, + "learning_rate": 3.2550534596209217e-07, + "loss": 0.002, + "step": 1616 + }, + { + "epoch": 0.8850574712643678, + "grad_norm": 4.158183574676514, + "learning_rate": 3.224608203719953e-07, + "loss": 0.1492, + "step": 1617 + }, + { + "epoch": 0.8856048166392994, + "grad_norm": 1.665842056274414, + "learning_rate": 3.1943012534866536e-07, + "loss": 0.2008, + "step": 1618 + }, + { + "epoch": 0.886152162014231, + "grad_norm": 2.252728223800659, + "learning_rate": 3.164132698532735e-07, + "loss": 0.3352, + "step": 1619 + }, + { + "epoch": 0.8866995073891626, + "grad_norm": 0.2482650727033615, + "learning_rate": 3.134102628060698e-07, + "loss": 0.0132, + "step": 1620 + }, + { + "epoch": 0.8872468527640941, + "grad_norm": 0.015394785441458225, + "learning_rate": 3.1042111308636047e-07, + "loss": 0.0006, + "step": 1621 + }, + { + "epoch": 0.8877941981390257, + "grad_norm": 0.17973975837230682, + "learning_rate": 3.074458295324717e-07, + "loss": 0.011, + "step": 1622 + }, + { + "epoch": 0.8883415435139573, + "grad_norm": 0.006913541350513697, + "learning_rate": 3.0448442094173634e-07, + "loss": 0.0004, + "step": 1623 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.447867751121521, + "learning_rate": 3.015368960704584e-07, + "loss": 0.1321, + "step": 1624 + }, + { + "epoch": 0.8894362342638205, + "grad_norm": 0.08996415138244629, + "learning_rate": 2.98603263633892e-07, + "loss": 0.0063, + "step": 1625 + }, + { + "epoch": 0.8899835796387521, + "grad_norm": 0.0666528046131134, + "learning_rate": 2.9568353230621185e-07, + "loss": 0.0035, + "step": 1626 + }, + { + "epoch": 0.8905309250136836, + "grad_norm": 0.01801203191280365, + "learning_rate": 2.9277771072049433e-07, + "loss": 0.0008, + "step": 1627 + }, + { + "epoch": 0.8910782703886152, + "grad_norm": 0.004558671731501818, + "learning_rate": 2.898858074686806e-07, + "loss": 0.0002, + "step": 1628 + }, + { + "epoch": 0.8916256157635468, + "grad_norm": 0.13645006716251373, + "learning_rate": 2.8700783110156507e-07, + "loss": 0.0044, + "step": 1629 + }, + { + "epoch": 0.8921729611384783, + "grad_norm": 2.16013503074646, + "learning_rate": 2.841437901287586e-07, + "loss": 0.4084, + "step": 1630 + }, + { + "epoch": 0.89272030651341, + "grad_norm": 0.23281130194664001, + "learning_rate": 2.812936930186688e-07, + "loss": 0.0129, + "step": 1631 + }, + { + "epoch": 0.8932676518883416, + "grad_norm": 0.05298805981874466, + "learning_rate": 2.784575481984747e-07, + "loss": 0.0028, + "step": 1632 + }, + { + "epoch": 0.8938149972632731, + "grad_norm": 1.8807384967803955, + "learning_rate": 2.756353640541021e-07, + "loss": 0.3986, + "step": 1633 + }, + { + "epoch": 0.8943623426382047, + "grad_norm": 0.4551723003387451, + "learning_rate": 2.728271489301937e-07, + "loss": 0.0514, + "step": 1634 + }, + { + "epoch": 0.8949096880131363, + "grad_norm": 1.692553162574768, + "learning_rate": 2.700329111300937e-07, + "loss": 0.1173, + "step": 1635 + }, + { + "epoch": 0.8954570333880679, + "grad_norm": 0.0010673926444724202, + "learning_rate": 2.672526589158153e-07, + "loss": 0.0001, + "step": 1636 + }, + { + "epoch": 0.8960043787629994, + "grad_norm": 0.026626106351614, + "learning_rate": 2.644864005080183e-07, + "loss": 0.0013, + "step": 1637 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 0.0034642897080630064, + "learning_rate": 2.617341440859883e-07, + "loss": 0.0002, + "step": 1638 + }, + { + "epoch": 0.8970990695128627, + "grad_norm": 4.151165962219238, + "learning_rate": 2.5899589778760614e-07, + "loss": 0.3504, + "step": 1639 + }, + { + "epoch": 0.8976464148877942, + "grad_norm": 1.1645855903625488, + "learning_rate": 2.5627166970933257e-07, + "loss": 0.0355, + "step": 1640 + }, + { + "epoch": 0.8981937602627258, + "grad_norm": 0.024651098996400833, + "learning_rate": 2.535614679061732e-07, + "loss": 0.0012, + "step": 1641 + }, + { + "epoch": 0.8987411056376574, + "grad_norm": 2.9519765377044678, + "learning_rate": 2.5086530039166615e-07, + "loss": 0.6242, + "step": 1642 + }, + { + "epoch": 0.8992884510125889, + "grad_norm": 0.0605325847864151, + "learning_rate": 2.4818317513784886e-07, + "loss": 0.0029, + "step": 1643 + }, + { + "epoch": 0.8998357963875205, + "grad_norm": 0.002007205504924059, + "learning_rate": 2.4551510007524035e-07, + "loss": 0.0001, + "step": 1644 + }, + { + "epoch": 0.9003831417624522, + "grad_norm": 4.2551140785217285, + "learning_rate": 2.428610830928152e-07, + "loss": 0.5177, + "step": 1645 + }, + { + "epoch": 0.9009304871373837, + "grad_norm": 0.015494787134230137, + "learning_rate": 2.402211320379838e-07, + "loss": 0.0008, + "step": 1646 + }, + { + "epoch": 0.9014778325123153, + "grad_norm": 0.02462865225970745, + "learning_rate": 2.3759525471656163e-07, + "loss": 0.0013, + "step": 1647 + }, + { + "epoch": 0.9020251778872469, + "grad_norm": 0.11775901913642883, + "learning_rate": 2.3498345889275465e-07, + "loss": 0.0072, + "step": 1648 + }, + { + "epoch": 0.9025725232621784, + "grad_norm": 0.009577536955475807, + "learning_rate": 2.3238575228913152e-07, + "loss": 0.0005, + "step": 1649 + }, + { + "epoch": 0.90311986863711, + "grad_norm": 0.25204992294311523, + "learning_rate": 2.2980214258660038e-07, + "loss": 0.0117, + "step": 1650 + }, + { + "epoch": 0.9036672140120416, + "grad_norm": 0.04809905216097832, + "learning_rate": 2.2723263742438938e-07, + "loss": 0.0021, + "step": 1651 + }, + { + "epoch": 0.9042145593869731, + "grad_norm": 0.001318107359111309, + "learning_rate": 2.2467724440002336e-07, + "loss": 0.0001, + "step": 1652 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.005981622263789177, + "learning_rate": 2.2213597106929608e-07, + "loss": 0.0003, + "step": 1653 + }, + { + "epoch": 0.9053092501368364, + "grad_norm": 2.8996164798736572, + "learning_rate": 2.1960882494625692e-07, + "loss": 0.7142, + "step": 1654 + }, + { + "epoch": 0.9058565955117679, + "grad_norm": 0.01610439084470272, + "learning_rate": 2.1709581350318089e-07, + "loss": 0.0008, + "step": 1655 + }, + { + "epoch": 0.9064039408866995, + "grad_norm": 0.02391280047595501, + "learning_rate": 2.1459694417055033e-07, + "loss": 0.0013, + "step": 1656 + }, + { + "epoch": 0.9069512862616311, + "grad_norm": 3.049898624420166, + "learning_rate": 2.1211222433703217e-07, + "loss": 0.1284, + "step": 1657 + }, + { + "epoch": 0.9074986316365626, + "grad_norm": 0.0016047470271587372, + "learning_rate": 2.0964166134945674e-07, + "loss": 0.0001, + "step": 1658 + }, + { + "epoch": 0.9080459770114943, + "grad_norm": 0.00969348568469286, + "learning_rate": 2.0718526251279346e-07, + "loss": 0.0005, + "step": 1659 + }, + { + "epoch": 0.9085933223864259, + "grad_norm": 0.007032374385744333, + "learning_rate": 2.0474303509013361e-07, + "loss": 0.0004, + "step": 1660 + }, + { + "epoch": 0.9091406677613574, + "grad_norm": 0.07196672260761261, + "learning_rate": 2.0231498630266467e-07, + "loss": 0.0036, + "step": 1661 + }, + { + "epoch": 0.909688013136289, + "grad_norm": 0.009290819987654686, + "learning_rate": 1.999011233296505e-07, + "loss": 0.0004, + "step": 1662 + }, + { + "epoch": 0.9102353585112206, + "grad_norm": 0.0019148091087117791, + "learning_rate": 1.9750145330841186e-07, + "loss": 0.0001, + "step": 1663 + }, + { + "epoch": 0.9107827038861521, + "grad_norm": 0.00228285463526845, + "learning_rate": 1.9511598333430194e-07, + "loss": 0.0001, + "step": 1664 + }, + { + "epoch": 0.9113300492610837, + "grad_norm": 0.0029851458966732025, + "learning_rate": 1.9274472046068805e-07, + "loss": 0.0002, + "step": 1665 + }, + { + "epoch": 0.9118773946360154, + "grad_norm": 0.14858005940914154, + "learning_rate": 1.9038767169893058e-07, + "loss": 0.0082, + "step": 1666 + }, + { + "epoch": 0.9124247400109469, + "grad_norm": 0.0019396482966840267, + "learning_rate": 1.8804484401836077e-07, + "loss": 0.0001, + "step": 1667 + }, + { + "epoch": 0.9129720853858785, + "grad_norm": 0.0032133522909134626, + "learning_rate": 1.857162443462601e-07, + "loss": 0.0002, + "step": 1668 + }, + { + "epoch": 0.9135194307608101, + "grad_norm": 0.012534908019006252, + "learning_rate": 1.834018795678427e-07, + "loss": 0.0006, + "step": 1669 + }, + { + "epoch": 0.9140667761357416, + "grad_norm": 0.0011830313596874475, + "learning_rate": 1.8110175652623075e-07, + "loss": 0.0001, + "step": 1670 + }, + { + "epoch": 0.9146141215106732, + "grad_norm": 1.939831256866455, + "learning_rate": 1.7881588202243782e-07, + "loss": 0.4046, + "step": 1671 + }, + { + "epoch": 0.9151614668856048, + "grad_norm": 0.04055565223097801, + "learning_rate": 1.7654426281534576e-07, + "loss": 0.002, + "step": 1672 + }, + { + "epoch": 0.9157088122605364, + "grad_norm": 0.02508675493299961, + "learning_rate": 1.7428690562169003e-07, + "loss": 0.0015, + "step": 1673 + }, + { + "epoch": 0.916256157635468, + "grad_norm": 0.008100957609713078, + "learning_rate": 1.7204381711603046e-07, + "loss": 0.0004, + "step": 1674 + }, + { + "epoch": 0.9168035030103996, + "grad_norm": 0.0611899308860302, + "learning_rate": 1.698150039307428e-07, + "loss": 0.0033, + "step": 1675 + }, + { + "epoch": 0.9173508483853311, + "grad_norm": 0.14495496451854706, + "learning_rate": 1.6760047265598933e-07, + "loss": 0.0075, + "step": 1676 + }, + { + "epoch": 0.9178981937602627, + "grad_norm": 0.16413192451000214, + "learning_rate": 1.6540022983970505e-07, + "loss": 0.0083, + "step": 1677 + }, + { + "epoch": 0.9184455391351943, + "grad_norm": 0.006359891500324011, + "learning_rate": 1.632142819875776e-07, + "loss": 0.0003, + "step": 1678 + }, + { + "epoch": 0.9189928845101258, + "grad_norm": 0.27122026681900024, + "learning_rate": 1.610426355630268e-07, + "loss": 0.0159, + "step": 1679 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.011510318145155907, + "learning_rate": 1.5888529698718347e-07, + "loss": 0.0005, + "step": 1680 + }, + { + "epoch": 0.9200875752599891, + "grad_norm": 0.7412816286087036, + "learning_rate": 1.5674227263887732e-07, + "loss": 0.0462, + "step": 1681 + }, + { + "epoch": 0.9206349206349206, + "grad_norm": 2.5012526512145996, + "learning_rate": 1.5461356885461077e-07, + "loss": 0.2889, + "step": 1682 + }, + { + "epoch": 0.9211822660098522, + "grad_norm": 2.3169915676116943, + "learning_rate": 1.524991919285429e-07, + "loss": 0.2376, + "step": 1683 + }, + { + "epoch": 0.9217296113847838, + "grad_norm": 0.22899430990219116, + "learning_rate": 1.503991481124728e-07, + "loss": 0.0108, + "step": 1684 + }, + { + "epoch": 0.9222769567597153, + "grad_norm": 0.017632320523262024, + "learning_rate": 1.48313443615819e-07, + "loss": 0.0008, + "step": 1685 + }, + { + "epoch": 0.922824302134647, + "grad_norm": 0.76988285779953, + "learning_rate": 1.4624208460559897e-07, + "loss": 0.0616, + "step": 1686 + }, + { + "epoch": 0.9233716475095786, + "grad_norm": 0.002816552296280861, + "learning_rate": 1.4418507720641794e-07, + "loss": 0.0001, + "step": 1687 + }, + { + "epoch": 0.9239189928845101, + "grad_norm": 0.04683569818735123, + "learning_rate": 1.4214242750044238e-07, + "loss": 0.0021, + "step": 1688 + }, + { + "epoch": 0.9244663382594417, + "grad_norm": 0.009630477987229824, + "learning_rate": 1.401141415273871e-07, + "loss": 0.0005, + "step": 1689 + }, + { + "epoch": 0.9250136836343733, + "grad_norm": 0.005564996041357517, + "learning_rate": 1.3810022528449597e-07, + "loss": 0.0003, + "step": 1690 + }, + { + "epoch": 0.9255610290093049, + "grad_norm": 0.035956043750047684, + "learning_rate": 1.3610068472652615e-07, + "loss": 0.0018, + "step": 1691 + }, + { + "epoch": 0.9261083743842364, + "grad_norm": 2.302034854888916, + "learning_rate": 1.3411552576572562e-07, + "loss": 0.3058, + "step": 1692 + }, + { + "epoch": 0.926655719759168, + "grad_norm": 0.0033738012425601482, + "learning_rate": 1.3214475427182182e-07, + "loss": 0.0002, + "step": 1693 + }, + { + "epoch": 0.9272030651340997, + "grad_norm": 2.1513004302978516, + "learning_rate": 1.3018837607199909e-07, + "loss": 0.3423, + "step": 1694 + }, + { + "epoch": 0.9277504105090312, + "grad_norm": 0.06418784707784653, + "learning_rate": 1.2824639695088403e-07, + "loss": 0.0028, + "step": 1695 + }, + { + "epoch": 0.9282977558839628, + "grad_norm": 0.0017216246342286468, + "learning_rate": 1.2631882265052908e-07, + "loss": 0.0001, + "step": 1696 + }, + { + "epoch": 0.9288451012588944, + "grad_norm": 0.025960400700569153, + "learning_rate": 1.2440565887039347e-07, + "loss": 0.0011, + "step": 1697 + }, + { + "epoch": 0.9293924466338259, + "grad_norm": 0.027499064803123474, + "learning_rate": 1.2250691126732772e-07, + "loss": 0.0012, + "step": 1698 + }, + { + "epoch": 0.9299397920087575, + "grad_norm": 0.011645481921732426, + "learning_rate": 1.2062258545555649e-07, + "loss": 0.0007, + "step": 1699 + }, + { + "epoch": 0.9304871373836892, + "grad_norm": 2.105513334274292, + "learning_rate": 1.1875268700666187e-07, + "loss": 0.1763, + "step": 1700 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 0.10536600649356842, + "learning_rate": 1.1689722144956672e-07, + "loss": 0.0074, + "step": 1701 + }, + { + "epoch": 0.9315818281335523, + "grad_norm": 0.02680140547454357, + "learning_rate": 1.1505619427051973e-07, + "loss": 0.0012, + "step": 1702 + }, + { + "epoch": 0.9321291735084839, + "grad_norm": 0.010312930680811405, + "learning_rate": 1.1322961091307705e-07, + "loss": 0.0006, + "step": 1703 + }, + { + "epoch": 0.9326765188834154, + "grad_norm": 0.03216090425848961, + "learning_rate": 1.1141747677808845e-07, + "loss": 0.0015, + "step": 1704 + }, + { + "epoch": 0.933223864258347, + "grad_norm": 0.0016830548411235213, + "learning_rate": 1.0961979722367789e-07, + "loss": 0.0001, + "step": 1705 + }, + { + "epoch": 0.9337712096332786, + "grad_norm": 0.003885602578520775, + "learning_rate": 1.0783657756523347e-07, + "loss": 0.0002, + "step": 1706 + }, + { + "epoch": 0.9343185550082101, + "grad_norm": 0.004555482417345047, + "learning_rate": 1.0606782307538532e-07, + "loss": 0.0002, + "step": 1707 + }, + { + "epoch": 0.9348659003831418, + "grad_norm": 0.0035665256436914206, + "learning_rate": 1.0431353898399388e-07, + "loss": 0.0002, + "step": 1708 + }, + { + "epoch": 0.9354132457580734, + "grad_norm": 0.0027156653814017773, + "learning_rate": 1.0257373047813324e-07, + "loss": 0.0001, + "step": 1709 + }, + { + "epoch": 0.9359605911330049, + "grad_norm": 0.007819607853889465, + "learning_rate": 1.008484027020773e-07, + "loss": 0.0004, + "step": 1710 + }, + { + "epoch": 0.9365079365079365, + "grad_norm": 0.10022959858179092, + "learning_rate": 9.913756075728088e-08, + "loss": 0.0063, + "step": 1711 + }, + { + "epoch": 0.9370552818828681, + "grad_norm": 0.0077483695931732655, + "learning_rate": 9.744120970236914e-08, + "loss": 0.0003, + "step": 1712 + }, + { + "epoch": 0.9376026272577996, + "grad_norm": 1.458129644393921, + "learning_rate": 9.575935455311935e-08, + "loss": 0.1238, + "step": 1713 + }, + { + "epoch": 0.9381499726327313, + "grad_norm": 0.000479287700727582, + "learning_rate": 9.409200028244803e-08, + "loss": 0.0001, + "step": 1714 + }, + { + "epoch": 0.9386973180076629, + "grad_norm": 0.0025392011739313602, + "learning_rate": 9.243915182039431e-08, + "loss": 0.0001, + "step": 1715 + }, + { + "epoch": 0.9392446633825944, + "grad_norm": 1.5274864435195923, + "learning_rate": 9.08008140541089e-08, + "loss": 0.1416, + "step": 1716 + }, + { + "epoch": 0.939792008757526, + "grad_norm": 2.8007185459136963, + "learning_rate": 8.917699182783346e-08, + "loss": 0.5872, + "step": 1717 + }, + { + "epoch": 0.9403393541324576, + "grad_norm": 0.030282529070973396, + "learning_rate": 8.756768994289289e-08, + "loss": 0.0015, + "step": 1718 + }, + { + "epoch": 0.9408866995073891, + "grad_norm": 0.013260996900498867, + "learning_rate": 8.597291315767808e-08, + "loss": 0.0007, + "step": 1719 + }, + { + "epoch": 0.9414340448823207, + "grad_norm": 0.27540886402130127, + "learning_rate": 8.439266618763098e-08, + "loss": 0.0146, + "step": 1720 + }, + { + "epoch": 0.9419813902572524, + "grad_norm": 0.20886638760566711, + "learning_rate": 8.282695370523175e-08, + "loss": 0.0109, + "step": 1721 + }, + { + "epoch": 0.9425287356321839, + "grad_norm": 0.006277718581259251, + "learning_rate": 8.127578033998663e-08, + "loss": 0.0003, + "step": 1722 + }, + { + "epoch": 0.9430760810071155, + "grad_norm": 0.31388920545578003, + "learning_rate": 7.973915067840954e-08, + "loss": 0.0189, + "step": 1723 + }, + { + "epoch": 0.9436234263820471, + "grad_norm": 0.028467781841754913, + "learning_rate": 7.821706926401496e-08, + "loss": 0.0014, + "step": 1724 + }, + { + "epoch": 0.9441707717569786, + "grad_norm": 0.10319659858942032, + "learning_rate": 7.670954059729896e-08, + "loss": 0.0057, + "step": 1725 + }, + { + "epoch": 0.9447181171319102, + "grad_norm": 1.4109545946121216, + "learning_rate": 7.521656913572817e-08, + "loss": 0.242, + "step": 1726 + }, + { + "epoch": 0.9452654625068418, + "grad_norm": 2.2406203746795654, + "learning_rate": 7.373815929372586e-08, + "loss": 0.2529, + "step": 1727 + }, + { + "epoch": 0.9458128078817734, + "grad_norm": 3.631880760192871, + "learning_rate": 7.227431544266194e-08, + "loss": 0.139, + "step": 1728 + }, + { + "epoch": 0.946360153256705, + "grad_norm": 0.0025556655600667, + "learning_rate": 7.082504191083417e-08, + "loss": 0.0002, + "step": 1729 + }, + { + "epoch": 0.9469074986316366, + "grad_norm": 0.007169991731643677, + "learning_rate": 6.939034298346192e-08, + "loss": 0.0003, + "step": 1730 + }, + { + "epoch": 0.9474548440065681, + "grad_norm": 0.005089812446385622, + "learning_rate": 6.797022290266741e-08, + "loss": 0.0002, + "step": 1731 + }, + { + "epoch": 0.9480021893814997, + "grad_norm": 0.04695936292409897, + "learning_rate": 6.656468586746789e-08, + "loss": 0.002, + "step": 1732 + }, + { + "epoch": 0.9485495347564313, + "grad_norm": 0.23610833287239075, + "learning_rate": 6.517373603376176e-08, + "loss": 0.0122, + "step": 1733 + }, + { + "epoch": 0.9490968801313628, + "grad_norm": 0.13638249039649963, + "learning_rate": 6.379737751431415e-08, + "loss": 0.0083, + "step": 1734 + }, + { + "epoch": 0.9496442255062945, + "grad_norm": 0.9663567543029785, + "learning_rate": 6.243561437874745e-08, + "loss": 0.0553, + "step": 1735 + }, + { + "epoch": 0.9501915708812261, + "grad_norm": 0.03125873580574989, + "learning_rate": 6.108845065352864e-08, + "loss": 0.0015, + "step": 1736 + }, + { + "epoch": 0.9507389162561576, + "grad_norm": 1.2737540006637573, + "learning_rate": 5.97558903219575e-08, + "loss": 0.0837, + "step": 1737 + }, + { + "epoch": 0.9512862616310892, + "grad_norm": 0.003754893783479929, + "learning_rate": 5.843793732415282e-08, + "loss": 0.0002, + "step": 1738 + }, + { + "epoch": 0.9518336070060208, + "grad_norm": 0.014718581922352314, + "learning_rate": 5.713459555704404e-08, + "loss": 0.0007, + "step": 1739 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 2.769636869430542, + "learning_rate": 5.584586887435739e-08, + "loss": 0.2708, + "step": 1740 + }, + { + "epoch": 0.952928297755884, + "grad_norm": 2.3508386611938477, + "learning_rate": 5.457176108660478e-08, + "loss": 0.5224, + "step": 1741 + }, + { + "epoch": 0.9534756431308156, + "grad_norm": 0.01864919811487198, + "learning_rate": 5.331227596107325e-08, + "loss": 0.0009, + "step": 1742 + }, + { + "epoch": 0.9540229885057471, + "grad_norm": 0.022848375141620636, + "learning_rate": 5.206741722181385e-08, + "loss": 0.0012, + "step": 1743 + }, + { + "epoch": 0.9545703338806787, + "grad_norm": 0.8749512434005737, + "learning_rate": 5.0837188549628934e-08, + "loss": 0.036, + "step": 1744 + }, + { + "epoch": 0.9551176792556103, + "grad_norm": 0.026627959683537483, + "learning_rate": 4.9621593582065416e-08, + "loss": 0.0015, + "step": 1745 + }, + { + "epoch": 0.9556650246305419, + "grad_norm": 1.6593846082687378, + "learning_rate": 4.842063591339763e-08, + "loss": 0.3074, + "step": 1746 + }, + { + "epoch": 0.9562123700054734, + "grad_norm": 1.2600449323654175, + "learning_rate": 4.723431909462339e-08, + "loss": 0.059, + "step": 1747 + }, + { + "epoch": 0.956759715380405, + "grad_norm": 0.00087725929915905, + "learning_rate": 4.606264663344851e-08, + "loss": 0.0001, + "step": 1748 + }, + { + "epoch": 0.9573070607553367, + "grad_norm": 0.08448724448680878, + "learning_rate": 4.490562199427839e-08, + "loss": 0.0044, + "step": 1749 + }, + { + "epoch": 0.9578544061302682, + "grad_norm": 2.0202221870422363, + "learning_rate": 4.376324859820924e-08, + "loss": 0.2994, + "step": 1750 + }, + { + "epoch": 0.9584017515051998, + "grad_norm": 2.787095785140991, + "learning_rate": 4.2635529823014664e-08, + "loss": 0.5762, + "step": 1751 + }, + { + "epoch": 0.9589490968801314, + "grad_norm": 0.17930848896503448, + "learning_rate": 4.1522469003137946e-08, + "loss": 0.0087, + "step": 1752 + }, + { + "epoch": 0.9594964422550629, + "grad_norm": 1.5180494785308838, + "learning_rate": 4.0424069429682024e-08, + "loss": 0.3157, + "step": 1753 + }, + { + "epoch": 0.9600437876299945, + "grad_norm": 0.0052284374833106995, + "learning_rate": 3.9340334350399525e-08, + "loss": 0.0002, + "step": 1754 + }, + { + "epoch": 0.9605911330049262, + "grad_norm": 0.0017134948866441846, + "learning_rate": 3.82712669696822e-08, + "loss": 0.0001, + "step": 1755 + }, + { + "epoch": 0.9611384783798577, + "grad_norm": 2.9423959255218506, + "learning_rate": 3.721687044855315e-08, + "loss": 0.1727, + "step": 1756 + }, + { + "epoch": 0.9616858237547893, + "grad_norm": 0.25030413269996643, + "learning_rate": 3.617714790465576e-08, + "loss": 0.0137, + "step": 1757 + }, + { + "epoch": 0.9622331691297209, + "grad_norm": 2.802866220474243, + "learning_rate": 3.515210241224698e-08, + "loss": 0.4985, + "step": 1758 + }, + { + "epoch": 0.9627805145046524, + "grad_norm": 0.20648542046546936, + "learning_rate": 3.4141737002184036e-08, + "loss": 0.011, + "step": 1759 + }, + { + "epoch": 0.963327859879584, + "grad_norm": 0.24056246876716614, + "learning_rate": 3.3146054661920556e-08, + "loss": 0.0174, + "step": 1760 + }, + { + "epoch": 0.9638752052545156, + "grad_norm": 0.2007961869239807, + "learning_rate": 3.216505833549377e-08, + "loss": 0.0112, + "step": 1761 + }, + { + "epoch": 0.9644225506294472, + "grad_norm": 0.24780268967151642, + "learning_rate": 3.1198750923517316e-08, + "loss": 0.0167, + "step": 1762 + }, + { + "epoch": 0.9649698960043788, + "grad_norm": 0.002492027822881937, + "learning_rate": 3.0247135283172914e-08, + "loss": 0.0001, + "step": 1763 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.00669819675385952, + "learning_rate": 2.9310214228202016e-08, + "loss": 0.0003, + "step": 1764 + }, + { + "epoch": 0.9660645867542419, + "grad_norm": 0.03622516244649887, + "learning_rate": 2.8387990528896404e-08, + "loss": 0.0023, + "step": 1765 + }, + { + "epoch": 0.9666119321291735, + "grad_norm": 0.022351013496518135, + "learning_rate": 2.7480466912090386e-08, + "loss": 0.0012, + "step": 1766 + }, + { + "epoch": 0.9671592775041051, + "grad_norm": 0.011430719867348671, + "learning_rate": 2.6587646061153604e-08, + "loss": 0.0005, + "step": 1767 + }, + { + "epoch": 0.9677066228790366, + "grad_norm": 0.03099585883319378, + "learning_rate": 2.5709530615983246e-08, + "loss": 0.0014, + "step": 1768 + }, + { + "epoch": 0.9682539682539683, + "grad_norm": 0.044385433197021484, + "learning_rate": 2.4846123172992953e-08, + "loss": 0.0021, + "step": 1769 + }, + { + "epoch": 0.9688013136288999, + "grad_norm": 0.0769612118601799, + "learning_rate": 2.3997426285110592e-08, + "loss": 0.0036, + "step": 1770 + }, + { + "epoch": 0.9693486590038314, + "grad_norm": 0.007691814098507166, + "learning_rate": 2.3163442461766604e-08, + "loss": 0.0004, + "step": 1771 + }, + { + "epoch": 0.969896004378763, + "grad_norm": 0.03668086975812912, + "learning_rate": 2.2344174168887346e-08, + "loss": 0.0018, + "step": 1772 + }, + { + "epoch": 0.9704433497536946, + "grad_norm": 0.03234627842903137, + "learning_rate": 2.153962382888841e-08, + "loss": 0.0016, + "step": 1773 + }, + { + "epoch": 0.9709906951286261, + "grad_norm": 1.0921375751495361, + "learning_rate": 2.0749793820667995e-08, + "loss": 0.2131, + "step": 1774 + }, + { + "epoch": 0.9715380405035577, + "grad_norm": 0.0049126651138067245, + "learning_rate": 1.9974686479597993e-08, + "loss": 0.0003, + "step": 1775 + }, + { + "epoch": 0.9720853858784894, + "grad_norm": 0.030328121036291122, + "learning_rate": 1.921430409752012e-08, + "loss": 0.0015, + "step": 1776 + }, + { + "epoch": 0.9726327312534209, + "grad_norm": 0.010860969312489033, + "learning_rate": 1.846864892273481e-08, + "loss": 0.0005, + "step": 1777 + }, + { + "epoch": 0.9731800766283525, + "grad_norm": 0.06575039029121399, + "learning_rate": 1.7737723159999e-08, + "loss": 0.0038, + "step": 1778 + }, + { + "epoch": 0.9737274220032841, + "grad_norm": 0.08788035809993744, + "learning_rate": 1.702152897051612e-08, + "loss": 0.0051, + "step": 1779 + }, + { + "epoch": 0.9742747673782156, + "grad_norm": 0.0011737667955458164, + "learning_rate": 1.632006847193335e-08, + "loss": 0.0001, + "step": 1780 + }, + { + "epoch": 0.9748221127531472, + "grad_norm": 0.009627328254282475, + "learning_rate": 1.563334373833103e-08, + "loss": 0.0004, + "step": 1781 + }, + { + "epoch": 0.9753694581280788, + "grad_norm": 0.17805936932563782, + "learning_rate": 1.496135680021993e-08, + "loss": 0.0106, + "step": 1782 + }, + { + "epoch": 0.9759168035030104, + "grad_norm": 0.009691783227026463, + "learning_rate": 1.4304109644533438e-08, + "loss": 0.0004, + "step": 1783 + }, + { + "epoch": 0.976464148877942, + "grad_norm": 0.001948626129887998, + "learning_rate": 1.3661604214623147e-08, + "loss": 0.0001, + "step": 1784 + }, + { + "epoch": 0.9770114942528736, + "grad_norm": 0.0033026135060936213, + "learning_rate": 1.3033842410251074e-08, + "loss": 0.0002, + "step": 1785 + }, + { + "epoch": 0.9775588396278051, + "grad_norm": 1.8588638305664062, + "learning_rate": 1.2420826087586324e-08, + "loss": 0.4028, + "step": 1786 + }, + { + "epoch": 0.9781061850027367, + "grad_norm": 0.002413011621683836, + "learning_rate": 1.182255705919788e-08, + "loss": 0.0001, + "step": 1787 + }, + { + "epoch": 0.9786535303776683, + "grad_norm": 0.0020728744566440582, + "learning_rate": 1.123903709404961e-08, + "loss": 0.0001, + "step": 1788 + }, + { + "epoch": 0.9792008757525998, + "grad_norm": 1.5994459390640259, + "learning_rate": 1.0670267917496923e-08, + "loss": 0.3538, + "step": 1789 + }, + { + "epoch": 0.9797482211275315, + "grad_norm": 0.06763313710689545, + "learning_rate": 1.011625121127735e-08, + "loss": 0.004, + "step": 1790 + }, + { + "epoch": 0.9802955665024631, + "grad_norm": 0.07340343296527863, + "learning_rate": 9.576988613511084e-09, + "loss": 0.0041, + "step": 1791 + }, + { + "epoch": 0.9808429118773946, + "grad_norm": 0.004961518570780754, + "learning_rate": 9.052481718690998e-09, + "loss": 0.0002, + "step": 1792 + }, + { + "epoch": 0.9813902572523262, + "grad_norm": 0.3102180063724518, + "learning_rate": 8.542732077680971e-09, + "loss": 0.0071, + "step": 1793 + }, + { + "epoch": 0.9819376026272578, + "grad_norm": 0.002447548322379589, + "learning_rate": 8.04774119771201e-09, + "loss": 0.0001, + "step": 1794 + }, + { + "epoch": 0.9824849480021893, + "grad_norm": 0.003043276723474264, + "learning_rate": 7.567510542373923e-09, + "loss": 0.0002, + "step": 1795 + }, + { + "epoch": 0.983032293377121, + "grad_norm": 0.10866506397724152, + "learning_rate": 7.102041531615867e-09, + "loss": 0.0067, + "step": 1796 + }, + { + "epoch": 0.9835796387520526, + "grad_norm": 0.11161591857671738, + "learning_rate": 6.65133554173747e-09, + "loss": 0.0061, + "step": 1797 + }, + { + "epoch": 0.9841269841269841, + "grad_norm": 0.057195790112018585, + "learning_rate": 6.215393905388278e-09, + "loss": 0.0026, + "step": 1798 + }, + { + "epoch": 0.9846743295019157, + "grad_norm": 0.009329413995146751, + "learning_rate": 5.794217911562205e-09, + "loss": 0.0005, + "step": 1799 + }, + { + "epoch": 0.9852216748768473, + "grad_norm": 0.030702760443091393, + "learning_rate": 5.387808805594752e-09, + "loss": 0.0012, + "step": 1800 + }, + { + "epoch": 0.9857690202517789, + "grad_norm": 1.5946861505508423, + "learning_rate": 4.996167789157457e-09, + "loss": 0.0419, + "step": 1801 + }, + { + "epoch": 0.9863163656267104, + "grad_norm": 2.047494649887085, + "learning_rate": 4.619296020256236e-09, + "loss": 0.4583, + "step": 1802 + }, + { + "epoch": 0.986863711001642, + "grad_norm": 0.001278059440664947, + "learning_rate": 4.257194613226379e-09, + "loss": 0.0001, + "step": 1803 + }, + { + "epoch": 0.9874110563765737, + "grad_norm": 0.005214928183704615, + "learning_rate": 3.9098646387319974e-09, + "loss": 0.0002, + "step": 1804 + }, + { + "epoch": 0.9879584017515052, + "grad_norm": 0.08367697894573212, + "learning_rate": 3.577307123759366e-09, + "loss": 0.005, + "step": 1805 + }, + { + "epoch": 0.9885057471264368, + "grad_norm": 0.03230416402220726, + "learning_rate": 3.2595230516152543e-09, + "loss": 0.0014, + "step": 1806 + }, + { + "epoch": 0.9890530925013684, + "grad_norm": 2.5367720127105713, + "learning_rate": 2.956513361925262e-09, + "loss": 0.2684, + "step": 1807 + }, + { + "epoch": 0.9896004378762999, + "grad_norm": 0.4515298008918762, + "learning_rate": 2.6682789506299322e-09, + "loss": 0.024, + "step": 1808 + }, + { + "epoch": 0.9901477832512315, + "grad_norm": 0.0051987855695188046, + "learning_rate": 2.3948206699819787e-09, + "loss": 0.0002, + "step": 1809 + }, + { + "epoch": 0.9906951286261632, + "grad_norm": 0.011141418479382992, + "learning_rate": 2.136139328543507e-09, + "loss": 0.0005, + "step": 1810 + }, + { + "epoch": 0.9912424740010947, + "grad_norm": 0.003159766783937812, + "learning_rate": 1.892235691184907e-09, + "loss": 0.0002, + "step": 1811 + }, + { + "epoch": 0.9917898193760263, + "grad_norm": 0.0010058737825602293, + "learning_rate": 1.6631104790809648e-09, + "loss": 0.0001, + "step": 1812 + }, + { + "epoch": 0.9923371647509579, + "grad_norm": 0.14787787199020386, + "learning_rate": 1.4487643697103092e-09, + "loss": 0.0097, + "step": 1813 + }, + { + "epoch": 0.9928845101258894, + "grad_norm": 0.05396431311964989, + "learning_rate": 1.2491979968526358e-09, + "loss": 0.0028, + "step": 1814 + }, + { + "epoch": 0.993431855500821, + "grad_norm": 0.008193233981728554, + "learning_rate": 1.0644119505864858e-09, + "loss": 0.0004, + "step": 1815 + }, + { + "epoch": 0.9939792008757526, + "grad_norm": 0.001475210883654654, + "learning_rate": 8.944067772881371e-10, + "loss": 0.0001, + "step": 1816 + }, + { + "epoch": 0.9945265462506842, + "grad_norm": 0.0464748851954937, + "learning_rate": 7.391829796288275e-10, + "loss": 0.0026, + "step": 1817 + }, + { + "epoch": 0.9950738916256158, + "grad_norm": 0.0010864358628168702, + "learning_rate": 5.987410165758656e-10, + "loss": 0.0001, + "step": 1818 + }, + { + "epoch": 0.9956212370005474, + "grad_norm": 0.10535825043916702, + "learning_rate": 4.730813033881898e-10, + "loss": 0.0065, + "step": 1819 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.10840434581041336, + "learning_rate": 3.6220421161692333e-10, + "loss": 0.0041, + "step": 1820 + }, + { + "epoch": 0.9967159277504105, + "grad_norm": 0.057124633342027664, + "learning_rate": 2.6611006910370884e-10, + "loss": 0.0028, + "step": 1821 + }, + { + "epoch": 0.9972632731253421, + "grad_norm": 0.3710714280605316, + "learning_rate": 1.847991599801535e-10, + "loss": 0.0237, + "step": 1822 + }, + { + "epoch": 0.9978106185002736, + "grad_norm": 0.09165342897176743, + "learning_rate": 1.1827172466727376e-10, + "loss": 0.0048, + "step": 1823 + }, + { + "epoch": 0.9983579638752053, + "grad_norm": 0.01384245976805687, + "learning_rate": 6.652795987271975e-11, + "loss": 0.0007, + "step": 1824 + }, + { + "epoch": 0.9989053092501369, + "grad_norm": 0.0028413371182978153, + "learning_rate": 2.9568018593550965e-11, + "loss": 0.0002, + "step": 1825 + }, + { + "epoch": 0.9994526546250684, + "grad_norm": 0.20825180411338806, + "learning_rate": 7.392010112350356e-12, + "loss": 0.0144, + "step": 1826 + }, + { + "epoch": 1.0, + "grad_norm": 0.01608341373503208, + "learning_rate": 0.0, + "loss": 0.0008, + "step": 1827 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.27450980392156865, + "eval_loss": 0.03834722936153412, + "eval_runtime": 1062.6197, + "eval_samples_per_second": 0.192, + "eval_steps_per_second": 0.192, + "step": 1827 + } + ], + "logging_steps": 1, + "max_steps": 1827, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.437810384269312e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}