diff --git "a/checkpoint-11700/trainer_state.json" "b/checkpoint-11700/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-11700/trainer_state.json" @@ -0,0 +1,9159 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.951564076690212, + "eval_steps": 100, + "global_step": 11700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025227043390514633, + "grad_norm": 2.0247561931610107, + "learning_rate": 4.995795492768248e-05, + "loss": 3.4981, + "step": 10 + }, + { + "epoch": 0.005045408678102927, + "grad_norm": 1.5878010988235474, + "learning_rate": 4.991590985536496e-05, + "loss": 0.2065, + "step": 20 + }, + { + "epoch": 0.0075681130171543895, + "grad_norm": 1.1973843574523926, + "learning_rate": 4.987386478304743e-05, + "loss": 0.1051, + "step": 30 + }, + { + "epoch": 0.010090817356205853, + "grad_norm": 1.2424113750457764, + "learning_rate": 4.983181971072991e-05, + "loss": 0.0835, + "step": 40 + }, + { + "epoch": 0.012613521695257316, + "grad_norm": 0.5049477219581604, + "learning_rate": 4.9789774638412376e-05, + "loss": 0.0712, + "step": 50 + }, + { + "epoch": 0.015136226034308779, + "grad_norm": 0.6395494937896729, + "learning_rate": 4.974772956609486e-05, + "loss": 0.0777, + "step": 60 + }, + { + "epoch": 0.017658930373360242, + "grad_norm": 0.5573010444641113, + "learning_rate": 4.970568449377733e-05, + "loss": 0.0627, + "step": 70 + }, + { + "epoch": 0.020181634712411706, + "grad_norm": 0.5474572777748108, + "learning_rate": 4.9663639421459806e-05, + "loss": 0.0553, + "step": 80 + }, + { + "epoch": 0.022704339051463168, + "grad_norm": 0.4932677149772644, + "learning_rate": 4.962159434914228e-05, + "loss": 0.0455, + "step": 90 + }, + { + "epoch": 0.025227043390514632, + "grad_norm": 0.4285220205783844, + "learning_rate": 4.957954927682476e-05, + "loss": 0.0403, + "step": 100 + }, + { + "epoch": 0.025227043390514632, + "eval_loss": 0.04899341240525246, + "eval_runtime": 21.0493, + "eval_samples_per_second": 83.708, + "eval_steps_per_second": 20.951, + "step": 100 + }, + { + "epoch": 0.027749747729566093, + "grad_norm": 0.5163519978523254, + "learning_rate": 4.953750420450724e-05, + "loss": 0.0511, + "step": 110 + }, + { + "epoch": 0.030272452068617558, + "grad_norm": 0.42003411054611206, + "learning_rate": 4.949545913218971e-05, + "loss": 0.0353, + "step": 120 + }, + { + "epoch": 0.03279515640766902, + "grad_norm": 0.35371342301368713, + "learning_rate": 4.945341405987218e-05, + "loss": 0.035, + "step": 130 + }, + { + "epoch": 0.035317860746720484, + "grad_norm": 0.4225226938724518, + "learning_rate": 4.941136898755466e-05, + "loss": 0.0368, + "step": 140 + }, + { + "epoch": 0.037840565085771945, + "grad_norm": 0.5139334201812744, + "learning_rate": 4.9369323915237136e-05, + "loss": 0.0411, + "step": 150 + }, + { + "epoch": 0.04036326942482341, + "grad_norm": 0.44391313195228577, + "learning_rate": 4.932727884291961e-05, + "loss": 0.0336, + "step": 160 + }, + { + "epoch": 0.042885973763874874, + "grad_norm": 0.4110734462738037, + "learning_rate": 4.928523377060209e-05, + "loss": 0.0353, + "step": 170 + }, + { + "epoch": 0.045408678102926335, + "grad_norm": 0.4065672755241394, + "learning_rate": 4.924318869828457e-05, + "loss": 0.0302, + "step": 180 + }, + { + "epoch": 0.0479313824419778, + "grad_norm": 0.3082253336906433, + "learning_rate": 4.920114362596704e-05, + "loss": 0.0335, + "step": 190 + }, + { + "epoch": 0.050454086781029264, + "grad_norm": 0.36431366205215454, + "learning_rate": 4.9159098553649516e-05, + "loss": 0.0359, + "step": 200 + }, + { + "epoch": 0.050454086781029264, + "eval_loss": 0.03114187717437744, + "eval_runtime": 20.9213, + "eval_samples_per_second": 84.22, + "eval_steps_per_second": 21.079, + "step": 200 + }, + { + "epoch": 0.052976791120080725, + "grad_norm": 0.3635391294956207, + "learning_rate": 4.911705348133199e-05, + "loss": 0.0314, + "step": 210 + }, + { + "epoch": 0.055499495459132187, + "grad_norm": 0.3692784905433655, + "learning_rate": 4.9075008409014465e-05, + "loss": 0.0355, + "step": 220 + }, + { + "epoch": 0.058022199798183655, + "grad_norm": 0.3028589189052582, + "learning_rate": 4.903296333669694e-05, + "loss": 0.0331, + "step": 230 + }, + { + "epoch": 0.060544904137235116, + "grad_norm": 0.31779277324676514, + "learning_rate": 4.8990918264379415e-05, + "loss": 0.0218, + "step": 240 + }, + { + "epoch": 0.06306760847628658, + "grad_norm": 0.36319565773010254, + "learning_rate": 4.8948873192061896e-05, + "loss": 0.02, + "step": 250 + }, + { + "epoch": 0.06559031281533804, + "grad_norm": 0.30493369698524475, + "learning_rate": 4.890682811974437e-05, + "loss": 0.0321, + "step": 260 + }, + { + "epoch": 0.0681130171543895, + "grad_norm": 0.4685748815536499, + "learning_rate": 4.8864783047426845e-05, + "loss": 0.0273, + "step": 270 + }, + { + "epoch": 0.07063572149344097, + "grad_norm": 0.45751261711120605, + "learning_rate": 4.882273797510932e-05, + "loss": 0.0293, + "step": 280 + }, + { + "epoch": 0.07315842583249244, + "grad_norm": 0.3978378474712372, + "learning_rate": 4.8780692902791795e-05, + "loss": 0.0279, + "step": 290 + }, + { + "epoch": 0.07568113017154389, + "grad_norm": 0.22009262442588806, + "learning_rate": 4.873864783047427e-05, + "loss": 0.0236, + "step": 300 + }, + { + "epoch": 0.07568113017154389, + "eval_loss": 0.027077585458755493, + "eval_runtime": 20.9736, + "eval_samples_per_second": 84.01, + "eval_steps_per_second": 21.026, + "step": 300 + }, + { + "epoch": 0.07820383451059536, + "grad_norm": 0.4900023937225342, + "learning_rate": 4.8696602758156744e-05, + "loss": 0.0269, + "step": 310 + }, + { + "epoch": 0.08072653884964683, + "grad_norm": 0.4146521985530853, + "learning_rate": 4.865455768583922e-05, + "loss": 0.0427, + "step": 320 + }, + { + "epoch": 0.08324924318869828, + "grad_norm": 0.3285127580165863, + "learning_rate": 4.86125126135217e-05, + "loss": 0.0272, + "step": 330 + }, + { + "epoch": 0.08577194752774975, + "grad_norm": 0.49365487694740295, + "learning_rate": 4.8570467541204175e-05, + "loss": 0.0263, + "step": 340 + }, + { + "epoch": 0.08829465186680122, + "grad_norm": 0.3131512701511383, + "learning_rate": 4.852842246888665e-05, + "loss": 0.0193, + "step": 350 + }, + { + "epoch": 0.09081735620585267, + "grad_norm": 0.29098790884017944, + "learning_rate": 4.8486377396569124e-05, + "loss": 0.0262, + "step": 360 + }, + { + "epoch": 0.09334006054490414, + "grad_norm": 0.2346099615097046, + "learning_rate": 4.84443323242516e-05, + "loss": 0.0203, + "step": 370 + }, + { + "epoch": 0.0958627648839556, + "grad_norm": 0.31069958209991455, + "learning_rate": 4.8402287251934074e-05, + "loss": 0.0199, + "step": 380 + }, + { + "epoch": 0.09838546922300706, + "grad_norm": 0.36899533867836, + "learning_rate": 4.836024217961655e-05, + "loss": 0.0236, + "step": 390 + }, + { + "epoch": 0.10090817356205853, + "grad_norm": 0.2813776135444641, + "learning_rate": 4.831819710729903e-05, + "loss": 0.019, + "step": 400 + }, + { + "epoch": 0.10090817356205853, + "eval_loss": 0.02430903911590576, + "eval_runtime": 20.9604, + "eval_samples_per_second": 84.063, + "eval_steps_per_second": 21.04, + "step": 400 + }, + { + "epoch": 0.10343087790111, + "grad_norm": 0.4171730577945709, + "learning_rate": 4.8276152034981504e-05, + "loss": 0.0179, + "step": 410 + }, + { + "epoch": 0.10595358224016145, + "grad_norm": 0.30979079008102417, + "learning_rate": 4.823410696266398e-05, + "loss": 0.0238, + "step": 420 + }, + { + "epoch": 0.10847628657921292, + "grad_norm": 0.2565608620643616, + "learning_rate": 4.8192061890346454e-05, + "loss": 0.0216, + "step": 430 + }, + { + "epoch": 0.11099899091826437, + "grad_norm": 0.2515753507614136, + "learning_rate": 4.815001681802893e-05, + "loss": 0.0187, + "step": 440 + }, + { + "epoch": 0.11352169525731584, + "grad_norm": 0.49704504013061523, + "learning_rate": 4.81079717457114e-05, + "loss": 0.0276, + "step": 450 + }, + { + "epoch": 0.11604439959636731, + "grad_norm": 0.215419739484787, + "learning_rate": 4.806592667339388e-05, + "loss": 0.0174, + "step": 460 + }, + { + "epoch": 0.11856710393541876, + "grad_norm": 0.3299199044704437, + "learning_rate": 4.802388160107635e-05, + "loss": 0.0229, + "step": 470 + }, + { + "epoch": 0.12108980827447023, + "grad_norm": 0.2970931828022003, + "learning_rate": 4.7981836528758834e-05, + "loss": 0.026, + "step": 480 + }, + { + "epoch": 0.1236125126135217, + "grad_norm": 0.250384658575058, + "learning_rate": 4.793979145644131e-05, + "loss": 0.0189, + "step": 490 + }, + { + "epoch": 0.12613521695257315, + "grad_norm": 0.2535875141620636, + "learning_rate": 4.789774638412378e-05, + "loss": 0.0202, + "step": 500 + }, + { + "epoch": 0.12613521695257315, + "eval_loss": 0.023427454754710197, + "eval_runtime": 20.8805, + "eval_samples_per_second": 84.385, + "eval_steps_per_second": 21.12, + "step": 500 + }, + { + "epoch": 0.12865792129162462, + "grad_norm": 0.3403398096561432, + "learning_rate": 4.7855701311806265e-05, + "loss": 0.0217, + "step": 510 + }, + { + "epoch": 0.1311806256306761, + "grad_norm": 0.3057946264743805, + "learning_rate": 4.781365623948873e-05, + "loss": 0.0191, + "step": 520 + }, + { + "epoch": 0.13370332996972756, + "grad_norm": 0.25819921493530273, + "learning_rate": 4.777161116717121e-05, + "loss": 0.0193, + "step": 530 + }, + { + "epoch": 0.136226034308779, + "grad_norm": 0.29612287878990173, + "learning_rate": 4.772956609485368e-05, + "loss": 0.0208, + "step": 540 + }, + { + "epoch": 0.13874873864783047, + "grad_norm": 0.2910136580467224, + "learning_rate": 4.768752102253616e-05, + "loss": 0.0173, + "step": 550 + }, + { + "epoch": 0.14127144298688193, + "grad_norm": 0.3340837359428406, + "learning_rate": 4.764547595021864e-05, + "loss": 0.0151, + "step": 560 + }, + { + "epoch": 0.1437941473259334, + "grad_norm": 0.3069872260093689, + "learning_rate": 4.760343087790111e-05, + "loss": 0.0166, + "step": 570 + }, + { + "epoch": 0.14631685166498487, + "grad_norm": 0.24297013878822327, + "learning_rate": 4.756138580558359e-05, + "loss": 0.0228, + "step": 580 + }, + { + "epoch": 0.14883955600403634, + "grad_norm": 0.28086113929748535, + "learning_rate": 4.751934073326607e-05, + "loss": 0.0221, + "step": 590 + }, + { + "epoch": 0.15136226034308778, + "grad_norm": 0.26318562030792236, + "learning_rate": 4.7477295660948536e-05, + "loss": 0.0281, + "step": 600 + }, + { + "epoch": 0.15136226034308778, + "eval_loss": 0.02024998515844345, + "eval_runtime": 20.8969, + "eval_samples_per_second": 84.319, + "eval_steps_per_second": 21.104, + "step": 600 + }, + { + "epoch": 0.15388496468213925, + "grad_norm": 0.27950412034988403, + "learning_rate": 4.743525058863101e-05, + "loss": 0.0169, + "step": 610 + }, + { + "epoch": 0.15640766902119072, + "grad_norm": 0.20953460037708282, + "learning_rate": 4.7393205516313486e-05, + "loss": 0.0213, + "step": 620 + }, + { + "epoch": 0.15893037336024218, + "grad_norm": 0.2499765157699585, + "learning_rate": 4.735116044399597e-05, + "loss": 0.0191, + "step": 630 + }, + { + "epoch": 0.16145307769929365, + "grad_norm": 0.3006986975669861, + "learning_rate": 4.730911537167844e-05, + "loss": 0.0269, + "step": 640 + }, + { + "epoch": 0.16397578203834512, + "grad_norm": 0.24447965621948242, + "learning_rate": 4.7267070299360917e-05, + "loss": 0.0193, + "step": 650 + }, + { + "epoch": 0.16649848637739656, + "grad_norm": 0.319516122341156, + "learning_rate": 4.722502522704339e-05, + "loss": 0.0222, + "step": 660 + }, + { + "epoch": 0.16902119071644803, + "grad_norm": 0.30482372641563416, + "learning_rate": 4.718298015472587e-05, + "loss": 0.0165, + "step": 670 + }, + { + "epoch": 0.1715438950554995, + "grad_norm": 0.18806371092796326, + "learning_rate": 4.714093508240835e-05, + "loss": 0.014, + "step": 680 + }, + { + "epoch": 0.17406659939455096, + "grad_norm": 0.21826079487800598, + "learning_rate": 4.7098890010090815e-05, + "loss": 0.0192, + "step": 690 + }, + { + "epoch": 0.17658930373360243, + "grad_norm": 0.2127252221107483, + "learning_rate": 4.70568449377733e-05, + "loss": 0.0142, + "step": 700 + }, + { + "epoch": 0.17658930373360243, + "eval_loss": 0.018979934975504875, + "eval_runtime": 20.8994, + "eval_samples_per_second": 84.309, + "eval_steps_per_second": 21.101, + "step": 700 + }, + { + "epoch": 0.17911200807265387, + "grad_norm": 0.23581889271736145, + "learning_rate": 4.701479986545577e-05, + "loss": 0.0199, + "step": 710 + }, + { + "epoch": 0.18163471241170534, + "grad_norm": 0.18842558562755585, + "learning_rate": 4.6972754793138246e-05, + "loss": 0.0194, + "step": 720 + }, + { + "epoch": 0.1841574167507568, + "grad_norm": 0.29515010118484497, + "learning_rate": 4.693070972082072e-05, + "loss": 0.0299, + "step": 730 + }, + { + "epoch": 0.18668012108980828, + "grad_norm": 0.27162402868270874, + "learning_rate": 4.68886646485032e-05, + "loss": 0.0227, + "step": 740 + }, + { + "epoch": 0.18920282542885974, + "grad_norm": 0.18802249431610107, + "learning_rate": 4.684661957618568e-05, + "loss": 0.0169, + "step": 750 + }, + { + "epoch": 0.1917255297679112, + "grad_norm": 0.34699660539627075, + "learning_rate": 4.680457450386815e-05, + "loss": 0.0159, + "step": 760 + }, + { + "epoch": 0.19424823410696265, + "grad_norm": 0.3048790693283081, + "learning_rate": 4.676252943155062e-05, + "loss": 0.0178, + "step": 770 + }, + { + "epoch": 0.19677093844601412, + "grad_norm": 0.2703554332256317, + "learning_rate": 4.67204843592331e-05, + "loss": 0.0136, + "step": 780 + }, + { + "epoch": 0.1992936427850656, + "grad_norm": 0.18560905754566193, + "learning_rate": 4.6678439286915575e-05, + "loss": 0.0202, + "step": 790 + }, + { + "epoch": 0.20181634712411706, + "grad_norm": 0.2768602967262268, + "learning_rate": 4.663639421459805e-05, + "loss": 0.0283, + "step": 800 + }, + { + "epoch": 0.20181634712411706, + "eval_loss": 0.01911783590912819, + "eval_runtime": 20.9061, + "eval_samples_per_second": 84.282, + "eval_steps_per_second": 21.094, + "step": 800 + }, + { + "epoch": 0.20433905146316853, + "grad_norm": 0.18893299996852875, + "learning_rate": 4.6594349142280525e-05, + "loss": 0.0205, + "step": 810 + }, + { + "epoch": 0.20686175580222, + "grad_norm": 0.24870939552783966, + "learning_rate": 4.6552304069963006e-05, + "loss": 0.0185, + "step": 820 + }, + { + "epoch": 0.20938446014127143, + "grad_norm": 0.2561938464641571, + "learning_rate": 4.651025899764548e-05, + "loss": 0.0144, + "step": 830 + }, + { + "epoch": 0.2119071644803229, + "grad_norm": 0.22478680312633514, + "learning_rate": 4.6468213925327956e-05, + "loss": 0.0213, + "step": 840 + }, + { + "epoch": 0.21442986881937437, + "grad_norm": 0.30591025948524475, + "learning_rate": 4.642616885301043e-05, + "loss": 0.0129, + "step": 850 + }, + { + "epoch": 0.21695257315842584, + "grad_norm": 0.21737821400165558, + "learning_rate": 4.6384123780692905e-05, + "loss": 0.0181, + "step": 860 + }, + { + "epoch": 0.2194752774974773, + "grad_norm": 0.20260506868362427, + "learning_rate": 4.634207870837538e-05, + "loss": 0.0144, + "step": 870 + }, + { + "epoch": 0.22199798183652875, + "grad_norm": 0.21997040510177612, + "learning_rate": 4.6300033636057854e-05, + "loss": 0.0215, + "step": 880 + }, + { + "epoch": 0.22452068617558021, + "grad_norm": 0.2595633864402771, + "learning_rate": 4.6257988563740336e-05, + "loss": 0.0159, + "step": 890 + }, + { + "epoch": 0.22704339051463168, + "grad_norm": 0.16551759839057922, + "learning_rate": 4.621594349142281e-05, + "loss": 0.0217, + "step": 900 + }, + { + "epoch": 0.22704339051463168, + "eval_loss": 0.017788389697670937, + "eval_runtime": 20.9214, + "eval_samples_per_second": 84.22, + "eval_steps_per_second": 21.079, + "step": 900 + }, + { + "epoch": 0.22956609485368315, + "grad_norm": 0.27989163994789124, + "learning_rate": 4.6173898419105285e-05, + "loss": 0.0166, + "step": 910 + }, + { + "epoch": 0.23208879919273462, + "grad_norm": 0.1843036413192749, + "learning_rate": 4.613185334678776e-05, + "loss": 0.0148, + "step": 920 + }, + { + "epoch": 0.2346115035317861, + "grad_norm": 0.2792811691761017, + "learning_rate": 4.6089808274470234e-05, + "loss": 0.014, + "step": 930 + }, + { + "epoch": 0.23713420787083753, + "grad_norm": 0.4182822108268738, + "learning_rate": 4.604776320215271e-05, + "loss": 0.013, + "step": 940 + }, + { + "epoch": 0.239656912209889, + "grad_norm": 0.17877671122550964, + "learning_rate": 4.6005718129835184e-05, + "loss": 0.0207, + "step": 950 + }, + { + "epoch": 0.24217961654894046, + "grad_norm": 0.21961210668087006, + "learning_rate": 4.596367305751766e-05, + "loss": 0.0168, + "step": 960 + }, + { + "epoch": 0.24470232088799193, + "grad_norm": 0.12489340454339981, + "learning_rate": 4.592162798520014e-05, + "loss": 0.0177, + "step": 970 + }, + { + "epoch": 0.2472250252270434, + "grad_norm": 0.24905265867710114, + "learning_rate": 4.5879582912882614e-05, + "loss": 0.0126, + "step": 980 + }, + { + "epoch": 0.24974772956609487, + "grad_norm": 0.14141976833343506, + "learning_rate": 4.583753784056509e-05, + "loss": 0.0102, + "step": 990 + }, + { + "epoch": 0.2522704339051463, + "grad_norm": 0.19035248458385468, + "learning_rate": 4.5795492768247564e-05, + "loss": 0.0143, + "step": 1000 + }, + { + "epoch": 0.2522704339051463, + "eval_loss": 0.016077525913715363, + "eval_runtime": 21.0175, + "eval_samples_per_second": 83.835, + "eval_steps_per_second": 20.983, + "step": 1000 + }, + { + "epoch": 0.2547931382441978, + "grad_norm": 0.2033461332321167, + "learning_rate": 4.575344769593004e-05, + "loss": 0.0172, + "step": 1010 + }, + { + "epoch": 0.25731584258324924, + "grad_norm": 0.17932486534118652, + "learning_rate": 4.571140262361251e-05, + "loss": 0.0223, + "step": 1020 + }, + { + "epoch": 0.2598385469223007, + "grad_norm": 0.16702575981616974, + "learning_rate": 4.566935755129499e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.2623612512613522, + "grad_norm": 0.24906021356582642, + "learning_rate": 4.562731247897747e-05, + "loss": 0.0136, + "step": 1040 + }, + { + "epoch": 0.2648839556004036, + "grad_norm": 0.2807481586933136, + "learning_rate": 4.5585267406659944e-05, + "loss": 0.0161, + "step": 1050 + }, + { + "epoch": 0.2674066599394551, + "grad_norm": 0.25573644042015076, + "learning_rate": 4.554322233434242e-05, + "loss": 0.0161, + "step": 1060 + }, + { + "epoch": 0.26992936427850656, + "grad_norm": 0.20996974408626556, + "learning_rate": 4.550117726202489e-05, + "loss": 0.0099, + "step": 1070 + }, + { + "epoch": 0.272452068617558, + "grad_norm": 0.18074114620685577, + "learning_rate": 4.545913218970737e-05, + "loss": 0.0142, + "step": 1080 + }, + { + "epoch": 0.2749747729566095, + "grad_norm": 0.11202214658260345, + "learning_rate": 4.541708711738984e-05, + "loss": 0.0148, + "step": 1090 + }, + { + "epoch": 0.27749747729566093, + "grad_norm": 1.3392621278762817, + "learning_rate": 4.537504204507232e-05, + "loss": 0.028, + "step": 1100 + }, + { + "epoch": 0.27749747729566093, + "eval_loss": 0.015062345191836357, + "eval_runtime": 20.9469, + "eval_samples_per_second": 84.117, + "eval_steps_per_second": 21.053, + "step": 1100 + }, + { + "epoch": 0.28002018163471243, + "grad_norm": 0.26927411556243896, + "learning_rate": 4.533299697275479e-05, + "loss": 0.0149, + "step": 1110 + }, + { + "epoch": 0.28254288597376387, + "grad_norm": 0.25918543338775635, + "learning_rate": 4.529095190043727e-05, + "loss": 0.0157, + "step": 1120 + }, + { + "epoch": 0.28506559031281536, + "grad_norm": 0.10137899965047836, + "learning_rate": 4.524890682811975e-05, + "loss": 0.0117, + "step": 1130 + }, + { + "epoch": 0.2875882946518668, + "grad_norm": 0.1916513592004776, + "learning_rate": 4.520686175580222e-05, + "loss": 0.0176, + "step": 1140 + }, + { + "epoch": 0.29011099899091825, + "grad_norm": 0.3005896210670471, + "learning_rate": 4.51648166834847e-05, + "loss": 0.0122, + "step": 1150 + }, + { + "epoch": 0.29263370332996974, + "grad_norm": 0.24127791821956635, + "learning_rate": 4.512277161116717e-05, + "loss": 0.0108, + "step": 1160 + }, + { + "epoch": 0.2951564076690212, + "grad_norm": 0.28272244334220886, + "learning_rate": 4.5080726538849647e-05, + "loss": 0.0119, + "step": 1170 + }, + { + "epoch": 0.2976791120080727, + "grad_norm": 0.36542513966560364, + "learning_rate": 4.503868146653212e-05, + "loss": 0.0122, + "step": 1180 + }, + { + "epoch": 0.3002018163471241, + "grad_norm": 0.26852190494537354, + "learning_rate": 4.49966363942146e-05, + "loss": 0.0162, + "step": 1190 + }, + { + "epoch": 0.30272452068617556, + "grad_norm": 0.26203736662864685, + "learning_rate": 4.495459132189708e-05, + "loss": 0.0125, + "step": 1200 + }, + { + "epoch": 0.30272452068617556, + "eval_loss": 0.0145474998280406, + "eval_runtime": 20.9732, + "eval_samples_per_second": 84.012, + "eval_steps_per_second": 21.027, + "step": 1200 + }, + { + "epoch": 0.30524722502522705, + "grad_norm": 0.31206783652305603, + "learning_rate": 4.491254624957955e-05, + "loss": 0.0241, + "step": 1210 + }, + { + "epoch": 0.3077699293642785, + "grad_norm": 0.17130957543849945, + "learning_rate": 4.487050117726203e-05, + "loss": 0.0174, + "step": 1220 + }, + { + "epoch": 0.31029263370333, + "grad_norm": 0.3070640563964844, + "learning_rate": 4.482845610494451e-05, + "loss": 0.023, + "step": 1230 + }, + { + "epoch": 0.31281533804238143, + "grad_norm": 0.5285329818725586, + "learning_rate": 4.4786411032626976e-05, + "loss": 0.0115, + "step": 1240 + }, + { + "epoch": 0.31533804238143287, + "grad_norm": 0.21449489891529083, + "learning_rate": 4.474436596030945e-05, + "loss": 0.0205, + "step": 1250 + }, + { + "epoch": 0.31786074672048437, + "grad_norm": 0.18218982219696045, + "learning_rate": 4.4702320887991925e-05, + "loss": 0.0062, + "step": 1260 + }, + { + "epoch": 0.3203834510595358, + "grad_norm": 0.03409017622470856, + "learning_rate": 4.466027581567441e-05, + "loss": 0.02, + "step": 1270 + }, + { + "epoch": 0.3229061553985873, + "grad_norm": 0.2536049783229828, + "learning_rate": 4.461823074335688e-05, + "loss": 0.0146, + "step": 1280 + }, + { + "epoch": 0.32542885973763874, + "grad_norm": 0.17619676887989044, + "learning_rate": 4.4576185671039356e-05, + "loss": 0.0074, + "step": 1290 + }, + { + "epoch": 0.32795156407669024, + "grad_norm": 0.1441410630941391, + "learning_rate": 4.453414059872183e-05, + "loss": 0.013, + "step": 1300 + }, + { + "epoch": 0.32795156407669024, + "eval_loss": 0.012744620442390442, + "eval_runtime": 21.0058, + "eval_samples_per_second": 83.882, + "eval_steps_per_second": 20.994, + "step": 1300 + }, + { + "epoch": 0.3304742684157417, + "grad_norm": 0.18683987855911255, + "learning_rate": 4.449209552640431e-05, + "loss": 0.0119, + "step": 1310 + }, + { + "epoch": 0.3329969727547931, + "grad_norm": 0.16165736317634583, + "learning_rate": 4.445005045408678e-05, + "loss": 0.0113, + "step": 1320 + }, + { + "epoch": 0.3355196770938446, + "grad_norm": 0.1178312599658966, + "learning_rate": 4.4408005381769255e-05, + "loss": 0.0113, + "step": 1330 + }, + { + "epoch": 0.33804238143289606, + "grad_norm": 0.1859322488307953, + "learning_rate": 4.436596030945173e-05, + "loss": 0.0104, + "step": 1340 + }, + { + "epoch": 0.34056508577194755, + "grad_norm": 0.49083656072616577, + "learning_rate": 4.432391523713421e-05, + "loss": 0.0095, + "step": 1350 + }, + { + "epoch": 0.343087790110999, + "grad_norm": 0.14915814995765686, + "learning_rate": 4.4281870164816686e-05, + "loss": 0.013, + "step": 1360 + }, + { + "epoch": 0.34561049445005043, + "grad_norm": 0.16166740655899048, + "learning_rate": 4.423982509249916e-05, + "loss": 0.0104, + "step": 1370 + }, + { + "epoch": 0.3481331987891019, + "grad_norm": 0.19710753858089447, + "learning_rate": 4.419778002018164e-05, + "loss": 0.0094, + "step": 1380 + }, + { + "epoch": 0.35065590312815337, + "grad_norm": 0.12713222205638885, + "learning_rate": 4.4155734947864116e-05, + "loss": 0.0122, + "step": 1390 + }, + { + "epoch": 0.35317860746720486, + "grad_norm": 0.11732326447963715, + "learning_rate": 4.411368987554659e-05, + "loss": 0.0127, + "step": 1400 + }, + { + "epoch": 0.35317860746720486, + "eval_loss": 0.010630101896822453, + "eval_runtime": 20.9885, + "eval_samples_per_second": 83.951, + "eval_steps_per_second": 21.012, + "step": 1400 + }, + { + "epoch": 0.3557013118062563, + "grad_norm": 0.16016925871372223, + "learning_rate": 4.407164480322906e-05, + "loss": 0.0035, + "step": 1410 + }, + { + "epoch": 0.35822401614530774, + "grad_norm": 0.11872086673974991, + "learning_rate": 4.402959973091154e-05, + "loss": 0.0049, + "step": 1420 + }, + { + "epoch": 0.36074672048435924, + "grad_norm": 0.15516729652881622, + "learning_rate": 4.3987554658594015e-05, + "loss": 0.0144, + "step": 1430 + }, + { + "epoch": 0.3632694248234107, + "grad_norm": 0.18500037491321564, + "learning_rate": 4.394550958627649e-05, + "loss": 0.0113, + "step": 1440 + }, + { + "epoch": 0.3657921291624622, + "grad_norm": 0.17393891513347626, + "learning_rate": 4.3903464513958964e-05, + "loss": 0.0102, + "step": 1450 + }, + { + "epoch": 0.3683148335015136, + "grad_norm": 0.24622410535812378, + "learning_rate": 4.3861419441641446e-05, + "loss": 0.0099, + "step": 1460 + }, + { + "epoch": 0.3708375378405651, + "grad_norm": 0.18613703548908234, + "learning_rate": 4.381937436932392e-05, + "loss": 0.0111, + "step": 1470 + }, + { + "epoch": 0.37336024217961655, + "grad_norm": 0.23599150776863098, + "learning_rate": 4.3777329297006395e-05, + "loss": 0.0152, + "step": 1480 + }, + { + "epoch": 0.375882946518668, + "grad_norm": 0.08963891863822937, + "learning_rate": 4.373528422468886e-05, + "loss": 0.0156, + "step": 1490 + }, + { + "epoch": 0.3784056508577195, + "grad_norm": 0.26133468747138977, + "learning_rate": 4.3693239152371344e-05, + "loss": 0.0185, + "step": 1500 + }, + { + "epoch": 0.3784056508577195, + "eval_loss": 0.010692655108869076, + "eval_runtime": 21.0428, + "eval_samples_per_second": 83.734, + "eval_steps_per_second": 20.957, + "step": 1500 + }, + { + "epoch": 0.38092835519677093, + "grad_norm": 0.07590801268815994, + "learning_rate": 4.365119408005382e-05, + "loss": 0.0102, + "step": 1510 + }, + { + "epoch": 0.3834510595358224, + "grad_norm": 0.047652024775743484, + "learning_rate": 4.3609149007736294e-05, + "loss": 0.0095, + "step": 1520 + }, + { + "epoch": 0.38597376387487387, + "grad_norm": 0.23399275541305542, + "learning_rate": 4.3567103935418775e-05, + "loss": 0.0089, + "step": 1530 + }, + { + "epoch": 0.3884964682139253, + "grad_norm": 0.2155078798532486, + "learning_rate": 4.352505886310125e-05, + "loss": 0.0115, + "step": 1540 + }, + { + "epoch": 0.3910191725529768, + "grad_norm": 0.09053190052509308, + "learning_rate": 4.3483013790783725e-05, + "loss": 0.0088, + "step": 1550 + }, + { + "epoch": 0.39354187689202824, + "grad_norm": 0.2110535055398941, + "learning_rate": 4.34409687184662e-05, + "loss": 0.0098, + "step": 1560 + }, + { + "epoch": 0.39606458123107974, + "grad_norm": 0.1765887439250946, + "learning_rate": 4.3398923646148674e-05, + "loss": 0.0072, + "step": 1570 + }, + { + "epoch": 0.3985872855701312, + "grad_norm": 0.3545493483543396, + "learning_rate": 4.335687857383115e-05, + "loss": 0.0132, + "step": 1580 + }, + { + "epoch": 0.4011099899091826, + "grad_norm": 0.06623344123363495, + "learning_rate": 4.331483350151362e-05, + "loss": 0.0069, + "step": 1590 + }, + { + "epoch": 0.4036326942482341, + "grad_norm": 0.16485914587974548, + "learning_rate": 4.32727884291961e-05, + "loss": 0.007, + "step": 1600 + }, + { + "epoch": 0.4036326942482341, + "eval_loss": 0.01017470471560955, + "eval_runtime": 20.8918, + "eval_samples_per_second": 84.339, + "eval_steps_per_second": 21.109, + "step": 1600 + }, + { + "epoch": 0.40615539858728555, + "grad_norm": 0.15467578172683716, + "learning_rate": 4.323074335687858e-05, + "loss": 0.0081, + "step": 1610 + }, + { + "epoch": 0.40867810292633705, + "grad_norm": 0.2580385208129883, + "learning_rate": 4.3188698284561054e-05, + "loss": 0.0112, + "step": 1620 + }, + { + "epoch": 0.4112008072653885, + "grad_norm": 0.010637140832841396, + "learning_rate": 4.314665321224353e-05, + "loss": 0.008, + "step": 1630 + }, + { + "epoch": 0.41372351160444, + "grad_norm": 0.26677659153938293, + "learning_rate": 4.3104608139926e-05, + "loss": 0.0094, + "step": 1640 + }, + { + "epoch": 0.4162462159434914, + "grad_norm": 0.2677707374095917, + "learning_rate": 4.306256306760848e-05, + "loss": 0.0127, + "step": 1650 + }, + { + "epoch": 0.41876892028254287, + "grad_norm": 0.13589175045490265, + "learning_rate": 4.302051799529095e-05, + "loss": 0.0056, + "step": 1660 + }, + { + "epoch": 0.42129162462159436, + "grad_norm": 0.10289867222309113, + "learning_rate": 4.297847292297343e-05, + "loss": 0.0109, + "step": 1670 + }, + { + "epoch": 0.4238143289606458, + "grad_norm": 0.07062846422195435, + "learning_rate": 4.29364278506559e-05, + "loss": 0.0048, + "step": 1680 + }, + { + "epoch": 0.4263370332996973, + "grad_norm": 0.16123530268669128, + "learning_rate": 4.289438277833838e-05, + "loss": 0.0052, + "step": 1690 + }, + { + "epoch": 0.42885973763874874, + "grad_norm": 0.13397027552127838, + "learning_rate": 4.285233770602086e-05, + "loss": 0.0048, + "step": 1700 + }, + { + "epoch": 0.42885973763874874, + "eval_loss": 0.010062881745398045, + "eval_runtime": 20.9947, + "eval_samples_per_second": 83.926, + "eval_steps_per_second": 21.005, + "step": 1700 + }, + { + "epoch": 0.4313824419778002, + "grad_norm": 0.04028566554188728, + "learning_rate": 4.281029263370333e-05, + "loss": 0.0101, + "step": 1710 + }, + { + "epoch": 0.4339051463168517, + "grad_norm": 0.06560038775205612, + "learning_rate": 4.276824756138581e-05, + "loss": 0.0033, + "step": 1720 + }, + { + "epoch": 0.4364278506559031, + "grad_norm": 0.16810742020606995, + "learning_rate": 4.272620248906828e-05, + "loss": 0.0054, + "step": 1730 + }, + { + "epoch": 0.4389505549949546, + "grad_norm": 0.015353145077824593, + "learning_rate": 4.268415741675076e-05, + "loss": 0.0058, + "step": 1740 + }, + { + "epoch": 0.44147325933400605, + "grad_norm": 0.2716507911682129, + "learning_rate": 4.264211234443323e-05, + "loss": 0.0114, + "step": 1750 + }, + { + "epoch": 0.4439959636730575, + "grad_norm": 0.10725341737270355, + "learning_rate": 4.260006727211571e-05, + "loss": 0.0095, + "step": 1760 + }, + { + "epoch": 0.446518668012109, + "grad_norm": 0.21090877056121826, + "learning_rate": 4.255802219979819e-05, + "loss": 0.0171, + "step": 1770 + }, + { + "epoch": 0.44904137235116043, + "grad_norm": 0.08791640400886536, + "learning_rate": 4.251597712748066e-05, + "loss": 0.0111, + "step": 1780 + }, + { + "epoch": 0.4515640766902119, + "grad_norm": 0.29180845618247986, + "learning_rate": 4.247393205516314e-05, + "loss": 0.0093, + "step": 1790 + }, + { + "epoch": 0.45408678102926336, + "grad_norm": 0.21628066897392273, + "learning_rate": 4.243188698284561e-05, + "loss": 0.0056, + "step": 1800 + }, + { + "epoch": 0.45408678102926336, + "eval_loss": 0.010144516825675964, + "eval_runtime": 20.9537, + "eval_samples_per_second": 84.09, + "eval_steps_per_second": 21.046, + "step": 1800 + }, + { + "epoch": 0.45660948536831486, + "grad_norm": 0.17846959829330444, + "learning_rate": 4.2389841910528086e-05, + "loss": 0.008, + "step": 1810 + }, + { + "epoch": 0.4591321897073663, + "grad_norm": 0.18932151794433594, + "learning_rate": 4.234779683821056e-05, + "loss": 0.0098, + "step": 1820 + }, + { + "epoch": 0.46165489404641774, + "grad_norm": 0.005480750929564238, + "learning_rate": 4.2305751765893035e-05, + "loss": 0.0099, + "step": 1830 + }, + { + "epoch": 0.46417759838546924, + "grad_norm": 0.02110099606215954, + "learning_rate": 4.226370669357552e-05, + "loss": 0.0089, + "step": 1840 + }, + { + "epoch": 0.4667003027245207, + "grad_norm": 0.12439311295747757, + "learning_rate": 4.222166162125799e-05, + "loss": 0.006, + "step": 1850 + }, + { + "epoch": 0.4692230070635722, + "grad_norm": 0.12683548033237457, + "learning_rate": 4.2179616548940466e-05, + "loss": 0.0062, + "step": 1860 + }, + { + "epoch": 0.4717457114026236, + "grad_norm": 0.10005199909210205, + "learning_rate": 4.213757147662295e-05, + "loss": 0.0064, + "step": 1870 + }, + { + "epoch": 0.47426841574167505, + "grad_norm": 0.101644366979599, + "learning_rate": 4.2095526404305416e-05, + "loss": 0.0076, + "step": 1880 + }, + { + "epoch": 0.47679112008072655, + "grad_norm": 0.09989798069000244, + "learning_rate": 4.205348133198789e-05, + "loss": 0.0048, + "step": 1890 + }, + { + "epoch": 0.479313824419778, + "grad_norm": 0.12589283287525177, + "learning_rate": 4.2011436259670365e-05, + "loss": 0.008, + "step": 1900 + }, + { + "epoch": 0.479313824419778, + "eval_loss": 0.00869656726717949, + "eval_runtime": 20.8804, + "eval_samples_per_second": 84.385, + "eval_steps_per_second": 21.12, + "step": 1900 + }, + { + "epoch": 0.4818365287588295, + "grad_norm": 0.3338044583797455, + "learning_rate": 4.1969391187352846e-05, + "loss": 0.014, + "step": 1910 + }, + { + "epoch": 0.4843592330978809, + "grad_norm": 0.1185823529958725, + "learning_rate": 4.192734611503532e-05, + "loss": 0.0072, + "step": 1920 + }, + { + "epoch": 0.48688193743693237, + "grad_norm": 0.2536753714084625, + "learning_rate": 4.1885301042717796e-05, + "loss": 0.0177, + "step": 1930 + }, + { + "epoch": 0.48940464177598386, + "grad_norm": 0.1340733915567398, + "learning_rate": 4.184325597040027e-05, + "loss": 0.0052, + "step": 1940 + }, + { + "epoch": 0.4919273461150353, + "grad_norm": 0.09943121671676636, + "learning_rate": 4.180121089808275e-05, + "loss": 0.0054, + "step": 1950 + }, + { + "epoch": 0.4944500504540868, + "grad_norm": 0.17324581742286682, + "learning_rate": 4.175916582576522e-05, + "loss": 0.0061, + "step": 1960 + }, + { + "epoch": 0.49697275479313824, + "grad_norm": 0.027863750234246254, + "learning_rate": 4.1717120753447694e-05, + "loss": 0.0093, + "step": 1970 + }, + { + "epoch": 0.49949545913218973, + "grad_norm": 0.016479160636663437, + "learning_rate": 4.167507568113017e-05, + "loss": 0.0036, + "step": 1980 + }, + { + "epoch": 0.5020181634712412, + "grad_norm": 0.06331757456064224, + "learning_rate": 4.163303060881265e-05, + "loss": 0.0074, + "step": 1990 + }, + { + "epoch": 0.5045408678102926, + "grad_norm": 0.028800033032894135, + "learning_rate": 4.1590985536495125e-05, + "loss": 0.0092, + "step": 2000 + }, + { + "epoch": 0.5045408678102926, + "eval_loss": 0.008615074679255486, + "eval_runtime": 20.9704, + "eval_samples_per_second": 84.023, + "eval_steps_per_second": 21.03, + "step": 2000 + }, + { + "epoch": 0.507063572149344, + "grad_norm": 0.11708183586597443, + "learning_rate": 4.15489404641776e-05, + "loss": 0.0028, + "step": 2010 + }, + { + "epoch": 0.5095862764883956, + "grad_norm": 0.0794278159737587, + "learning_rate": 4.1506895391860074e-05, + "loss": 0.0097, + "step": 2020 + }, + { + "epoch": 0.512108980827447, + "grad_norm": 0.06485351175069809, + "learning_rate": 4.1464850319542556e-05, + "loss": 0.0076, + "step": 2030 + }, + { + "epoch": 0.5146316851664985, + "grad_norm": 0.15527907013893127, + "learning_rate": 4.142280524722503e-05, + "loss": 0.013, + "step": 2040 + }, + { + "epoch": 0.5171543895055499, + "grad_norm": 0.20114894211292267, + "learning_rate": 4.13807601749075e-05, + "loss": 0.0117, + "step": 2050 + }, + { + "epoch": 0.5196770938446014, + "grad_norm": 0.12151603400707245, + "learning_rate": 4.133871510258998e-05, + "loss": 0.015, + "step": 2060 + }, + { + "epoch": 0.5221997981836529, + "grad_norm": 0.055018067359924316, + "learning_rate": 4.1296670030272455e-05, + "loss": 0.0079, + "step": 2070 + }, + { + "epoch": 0.5247225025227044, + "grad_norm": 0.16336438059806824, + "learning_rate": 4.125462495795493e-05, + "loss": 0.0072, + "step": 2080 + }, + { + "epoch": 0.5272452068617558, + "grad_norm": 0.2550767660140991, + "learning_rate": 4.1212579885637404e-05, + "loss": 0.013, + "step": 2090 + }, + { + "epoch": 0.5297679112008072, + "grad_norm": 0.040902867913246155, + "learning_rate": 4.1170534813319885e-05, + "loss": 0.0033, + "step": 2100 + }, + { + "epoch": 0.5297679112008072, + "eval_loss": 0.00901652593165636, + "eval_runtime": 21.0472, + "eval_samples_per_second": 83.717, + "eval_steps_per_second": 20.953, + "step": 2100 + }, + { + "epoch": 0.5322906155398587, + "grad_norm": 0.03393733501434326, + "learning_rate": 4.112848974100236e-05, + "loss": 0.0107, + "step": 2110 + }, + { + "epoch": 0.5348133198789102, + "grad_norm": 0.10513912886381149, + "learning_rate": 4.1086444668684835e-05, + "loss": 0.0069, + "step": 2120 + }, + { + "epoch": 0.5373360242179617, + "grad_norm": 0.012084727175533772, + "learning_rate": 4.10443995963673e-05, + "loss": 0.0082, + "step": 2130 + }, + { + "epoch": 0.5398587285570131, + "grad_norm": 0.07994495332241058, + "learning_rate": 4.1002354524049784e-05, + "loss": 0.0109, + "step": 2140 + }, + { + "epoch": 0.5423814328960646, + "grad_norm": 0.13017794489860535, + "learning_rate": 4.096030945173226e-05, + "loss": 0.0041, + "step": 2150 + }, + { + "epoch": 0.544904137235116, + "grad_norm": 0.0023468900471925735, + "learning_rate": 4.091826437941473e-05, + "loss": 0.0085, + "step": 2160 + }, + { + "epoch": 0.5474268415741675, + "grad_norm": 0.061518047004938126, + "learning_rate": 4.087621930709721e-05, + "loss": 0.011, + "step": 2170 + }, + { + "epoch": 0.549949545913219, + "grad_norm": 0.07088392227888107, + "learning_rate": 4.083417423477969e-05, + "loss": 0.0056, + "step": 2180 + }, + { + "epoch": 0.5524722502522704, + "grad_norm": 0.09153332561254501, + "learning_rate": 4.0792129162462164e-05, + "loss": 0.0059, + "step": 2190 + }, + { + "epoch": 0.5549949545913219, + "grad_norm": 0.15585757791996002, + "learning_rate": 4.075008409014464e-05, + "loss": 0.0041, + "step": 2200 + }, + { + "epoch": 0.5549949545913219, + "eval_loss": 0.008859611116349697, + "eval_runtime": 20.9351, + "eval_samples_per_second": 84.165, + "eval_steps_per_second": 21.065, + "step": 2200 + }, + { + "epoch": 0.5575176589303733, + "grad_norm": 0.14012502133846283, + "learning_rate": 4.070803901782711e-05, + "loss": 0.0052, + "step": 2210 + }, + { + "epoch": 0.5600403632694249, + "grad_norm": 0.18286050856113434, + "learning_rate": 4.066599394550959e-05, + "loss": 0.0117, + "step": 2220 + }, + { + "epoch": 0.5625630676084763, + "grad_norm": 0.12133604288101196, + "learning_rate": 4.062394887319206e-05, + "loss": 0.0064, + "step": 2230 + }, + { + "epoch": 0.5650857719475277, + "grad_norm": 0.006398872472345829, + "learning_rate": 4.058190380087454e-05, + "loss": 0.0032, + "step": 2240 + }, + { + "epoch": 0.5676084762865792, + "grad_norm": 0.0005545477033592761, + "learning_rate": 4.053985872855702e-05, + "loss": 0.004, + "step": 2250 + }, + { + "epoch": 0.5701311806256307, + "grad_norm": 0.16576875746250153, + "learning_rate": 4.0497813656239493e-05, + "loss": 0.0041, + "step": 2260 + }, + { + "epoch": 0.5726538849646822, + "grad_norm": 0.034229591488838196, + "learning_rate": 4.045576858392197e-05, + "loss": 0.0051, + "step": 2270 + }, + { + "epoch": 0.5751765893037336, + "grad_norm": 0.13495758175849915, + "learning_rate": 4.041372351160444e-05, + "loss": 0.0081, + "step": 2280 + }, + { + "epoch": 0.577699293642785, + "grad_norm": 0.20754534006118774, + "learning_rate": 4.037167843928692e-05, + "loss": 0.0129, + "step": 2290 + }, + { + "epoch": 0.5802219979818365, + "grad_norm": 0.12224958837032318, + "learning_rate": 4.032963336696939e-05, + "loss": 0.007, + "step": 2300 + }, + { + "epoch": 0.5802219979818365, + "eval_loss": 0.008081664331257343, + "eval_runtime": 20.9045, + "eval_samples_per_second": 84.288, + "eval_steps_per_second": 21.096, + "step": 2300 + }, + { + "epoch": 0.582744702320888, + "grad_norm": 0.20963284373283386, + "learning_rate": 4.028758829465187e-05, + "loss": 0.011, + "step": 2310 + }, + { + "epoch": 0.5852674066599395, + "grad_norm": 0.1182667464017868, + "learning_rate": 4.024554322233434e-05, + "loss": 0.0085, + "step": 2320 + }, + { + "epoch": 0.5877901109989909, + "grad_norm": 0.1626705825328827, + "learning_rate": 4.020349815001682e-05, + "loss": 0.0091, + "step": 2330 + }, + { + "epoch": 0.5903128153380424, + "grad_norm": 0.10798126459121704, + "learning_rate": 4.01614530776993e-05, + "loss": 0.009, + "step": 2340 + }, + { + "epoch": 0.5928355196770938, + "grad_norm": 0.03671824559569359, + "learning_rate": 4.011940800538177e-05, + "loss": 0.005, + "step": 2350 + }, + { + "epoch": 0.5953582240161454, + "grad_norm": 0.019325584173202515, + "learning_rate": 4.007736293306425e-05, + "loss": 0.0082, + "step": 2360 + }, + { + "epoch": 0.5978809283551968, + "grad_norm": 0.04128754511475563, + "learning_rate": 4.003531786074672e-05, + "loss": 0.0046, + "step": 2370 + }, + { + "epoch": 0.6004036326942482, + "grad_norm": 0.07875852286815643, + "learning_rate": 3.9993272788429196e-05, + "loss": 0.0283, + "step": 2380 + }, + { + "epoch": 0.6029263370332997, + "grad_norm": 0.11841381341218948, + "learning_rate": 3.995122771611167e-05, + "loss": 0.0052, + "step": 2390 + }, + { + "epoch": 0.6054490413723511, + "grad_norm": 0.14310500025749207, + "learning_rate": 3.990918264379415e-05, + "loss": 0.0027, + "step": 2400 + }, + { + "epoch": 0.6054490413723511, + "eval_loss": 0.008280658163130283, + "eval_runtime": 20.9355, + "eval_samples_per_second": 84.163, + "eval_steps_per_second": 21.065, + "step": 2400 + }, + { + "epoch": 0.6079717457114027, + "grad_norm": 0.1013203114271164, + "learning_rate": 3.986713757147663e-05, + "loss": 0.0071, + "step": 2410 + }, + { + "epoch": 0.6104944500504541, + "grad_norm": 0.09219915419816971, + "learning_rate": 3.98250924991591e-05, + "loss": 0.0128, + "step": 2420 + }, + { + "epoch": 0.6130171543895055, + "grad_norm": 0.21949277818202972, + "learning_rate": 3.9783047426841576e-05, + "loss": 0.0125, + "step": 2430 + }, + { + "epoch": 0.615539858728557, + "grad_norm": 0.04883907735347748, + "learning_rate": 3.974100235452405e-05, + "loss": 0.0133, + "step": 2440 + }, + { + "epoch": 0.6180625630676084, + "grad_norm": 0.28083309531211853, + "learning_rate": 3.9698957282206526e-05, + "loss": 0.0094, + "step": 2450 + }, + { + "epoch": 0.62058526740666, + "grad_norm": 0.1395656317472458, + "learning_rate": 3.9656912209889e-05, + "loss": 0.008, + "step": 2460 + }, + { + "epoch": 0.6231079717457114, + "grad_norm": 0.3387027084827423, + "learning_rate": 3.9614867137571475e-05, + "loss": 0.0108, + "step": 2470 + }, + { + "epoch": 0.6256306760847629, + "grad_norm": 0.12317987531423569, + "learning_rate": 3.9572822065253956e-05, + "loss": 0.0106, + "step": 2480 + }, + { + "epoch": 0.6281533804238143, + "grad_norm": 0.11516406387090683, + "learning_rate": 3.953077699293643e-05, + "loss": 0.0109, + "step": 2490 + }, + { + "epoch": 0.6306760847628657, + "grad_norm": 0.3164563775062561, + "learning_rate": 3.9488731920618906e-05, + "loss": 0.0122, + "step": 2500 + }, + { + "epoch": 0.6306760847628657, + "eval_loss": 0.0077254436910152435, + "eval_runtime": 20.9125, + "eval_samples_per_second": 84.256, + "eval_steps_per_second": 21.088, + "step": 2500 + }, + { + "epoch": 0.6331987891019173, + "grad_norm": 0.1707511991262436, + "learning_rate": 3.944668684830138e-05, + "loss": 0.009, + "step": 2510 + }, + { + "epoch": 0.6357214934409687, + "grad_norm": 0.08108045160770416, + "learning_rate": 3.9404641775983855e-05, + "loss": 0.012, + "step": 2520 + }, + { + "epoch": 0.6382441977800202, + "grad_norm": 0.1104462668299675, + "learning_rate": 3.936259670366633e-05, + "loss": 0.004, + "step": 2530 + }, + { + "epoch": 0.6407669021190716, + "grad_norm": 0.17339076101779938, + "learning_rate": 3.9320551631348804e-05, + "loss": 0.0056, + "step": 2540 + }, + { + "epoch": 0.643289606458123, + "grad_norm": 0.10303635895252228, + "learning_rate": 3.9278506559031286e-05, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6458123107971746, + "grad_norm": 0.009340761229395866, + "learning_rate": 3.923646148671376e-05, + "loss": 0.0071, + "step": 2560 + }, + { + "epoch": 0.648335015136226, + "grad_norm": 0.11290521174669266, + "learning_rate": 3.9194416414396235e-05, + "loss": 0.0064, + "step": 2570 + }, + { + "epoch": 0.6508577194752775, + "grad_norm": 0.13023380935192108, + "learning_rate": 3.915237134207871e-05, + "loss": 0.0047, + "step": 2580 + }, + { + "epoch": 0.6533804238143289, + "grad_norm": 0.027826808393001556, + "learning_rate": 3.911032626976119e-05, + "loss": 0.004, + "step": 2590 + }, + { + "epoch": 0.6559031281533805, + "grad_norm": 0.12674188613891602, + "learning_rate": 3.906828119744366e-05, + "loss": 0.0087, + "step": 2600 + }, + { + "epoch": 0.6559031281533805, + "eval_loss": 0.007544004824012518, + "eval_runtime": 20.9279, + "eval_samples_per_second": 84.194, + "eval_steps_per_second": 21.072, + "step": 2600 + }, + { + "epoch": 0.6584258324924319, + "grad_norm": 0.05906185507774353, + "learning_rate": 3.9026236125126134e-05, + "loss": 0.0112, + "step": 2610 + }, + { + "epoch": 0.6609485368314834, + "grad_norm": 0.02223772369325161, + "learning_rate": 3.898419105280861e-05, + "loss": 0.0061, + "step": 2620 + }, + { + "epoch": 0.6634712411705348, + "grad_norm": 0.1578211635351181, + "learning_rate": 3.894214598049109e-05, + "loss": 0.0065, + "step": 2630 + }, + { + "epoch": 0.6659939455095862, + "grad_norm": 0.0348033532500267, + "learning_rate": 3.8900100908173565e-05, + "loss": 0.0106, + "step": 2640 + }, + { + "epoch": 0.6685166498486378, + "grad_norm": 0.09289257973432541, + "learning_rate": 3.885805583585604e-05, + "loss": 0.0055, + "step": 2650 + }, + { + "epoch": 0.6710393541876892, + "grad_norm": 0.0011186335468664765, + "learning_rate": 3.8816010763538514e-05, + "loss": 0.0043, + "step": 2660 + }, + { + "epoch": 0.6735620585267407, + "grad_norm": 0.04303692653775215, + "learning_rate": 3.8773965691220995e-05, + "loss": 0.003, + "step": 2670 + }, + { + "epoch": 0.6760847628657921, + "grad_norm": 0.02356291376054287, + "learning_rate": 3.873192061890347e-05, + "loss": 0.004, + "step": 2680 + }, + { + "epoch": 0.6786074672048436, + "grad_norm": 0.23490223288536072, + "learning_rate": 3.868987554658594e-05, + "loss": 0.0087, + "step": 2690 + }, + { + "epoch": 0.6811301715438951, + "grad_norm": 0.18736770749092102, + "learning_rate": 3.864783047426841e-05, + "loss": 0.0101, + "step": 2700 + }, + { + "epoch": 0.6811301715438951, + "eval_loss": 0.007495530880987644, + "eval_runtime": 20.9691, + "eval_samples_per_second": 84.029, + "eval_steps_per_second": 21.031, + "step": 2700 + }, + { + "epoch": 0.6836528758829465, + "grad_norm": 0.06488362699747086, + "learning_rate": 3.8605785401950894e-05, + "loss": 0.0023, + "step": 2710 + }, + { + "epoch": 0.686175580221998, + "grad_norm": 0.11341580748558044, + "learning_rate": 3.856374032963337e-05, + "loss": 0.0075, + "step": 2720 + }, + { + "epoch": 0.6886982845610494, + "grad_norm": 0.018855459988117218, + "learning_rate": 3.852169525731584e-05, + "loss": 0.0072, + "step": 2730 + }, + { + "epoch": 0.6912209889001009, + "grad_norm": 0.002237241482362151, + "learning_rate": 3.8479650184998325e-05, + "loss": 0.0046, + "step": 2740 + }, + { + "epoch": 0.6937436932391524, + "grad_norm": 0.2180403620004654, + "learning_rate": 3.84376051126808e-05, + "loss": 0.008, + "step": 2750 + }, + { + "epoch": 0.6962663975782039, + "grad_norm": 0.09451308846473694, + "learning_rate": 3.8395560040363274e-05, + "loss": 0.0065, + "step": 2760 + }, + { + "epoch": 0.6987891019172553, + "grad_norm": 0.14188626408576965, + "learning_rate": 3.835351496804574e-05, + "loss": 0.0106, + "step": 2770 + }, + { + "epoch": 0.7013118062563067, + "grad_norm": 0.10723700374364853, + "learning_rate": 3.8311469895728223e-05, + "loss": 0.0101, + "step": 2780 + }, + { + "epoch": 0.7038345105953582, + "grad_norm": 0.09538406878709793, + "learning_rate": 3.82694248234107e-05, + "loss": 0.0101, + "step": 2790 + }, + { + "epoch": 0.7063572149344097, + "grad_norm": 0.013723093084990978, + "learning_rate": 3.822737975109317e-05, + "loss": 0.0064, + "step": 2800 + }, + { + "epoch": 0.7063572149344097, + "eval_loss": 0.007626931183040142, + "eval_runtime": 20.9136, + "eval_samples_per_second": 84.252, + "eval_steps_per_second": 21.087, + "step": 2800 + }, + { + "epoch": 0.7088799192734612, + "grad_norm": 0.08048822730779648, + "learning_rate": 3.818533467877565e-05, + "loss": 0.0046, + "step": 2810 + }, + { + "epoch": 0.7114026236125126, + "grad_norm": 0.2678566873073578, + "learning_rate": 3.814328960645813e-05, + "loss": 0.0111, + "step": 2820 + }, + { + "epoch": 0.713925327951564, + "grad_norm": 0.016534708440303802, + "learning_rate": 3.8101244534140604e-05, + "loss": 0.0105, + "step": 2830 + }, + { + "epoch": 0.7164480322906155, + "grad_norm": 0.17189861834049225, + "learning_rate": 3.805919946182308e-05, + "loss": 0.0124, + "step": 2840 + }, + { + "epoch": 0.718970736629667, + "grad_norm": 0.004572316538542509, + "learning_rate": 3.8017154389505546e-05, + "loss": 0.0054, + "step": 2850 + }, + { + "epoch": 0.7214934409687185, + "grad_norm": 0.02059135213494301, + "learning_rate": 3.797510931718803e-05, + "loss": 0.0108, + "step": 2860 + }, + { + "epoch": 0.7240161453077699, + "grad_norm": 0.11188461631536484, + "learning_rate": 3.79330642448705e-05, + "loss": 0.0052, + "step": 2870 + }, + { + "epoch": 0.7265388496468214, + "grad_norm": 0.0961654856801033, + "learning_rate": 3.789101917255298e-05, + "loss": 0.0092, + "step": 2880 + }, + { + "epoch": 0.7290615539858728, + "grad_norm": 0.004565075505524874, + "learning_rate": 3.784897410023546e-05, + "loss": 0.005, + "step": 2890 + }, + { + "epoch": 0.7315842583249244, + "grad_norm": 0.058100827038288116, + "learning_rate": 3.780692902791793e-05, + "loss": 0.0196, + "step": 2900 + }, + { + "epoch": 0.7315842583249244, + "eval_loss": 0.007003966718912125, + "eval_runtime": 20.9017, + "eval_samples_per_second": 84.299, + "eval_steps_per_second": 21.099, + "step": 2900 + }, + { + "epoch": 0.7341069626639758, + "grad_norm": 0.08321081846952438, + "learning_rate": 3.776488395560041e-05, + "loss": 0.0042, + "step": 2910 + }, + { + "epoch": 0.7366296670030272, + "grad_norm": 0.12256285548210144, + "learning_rate": 3.772283888328288e-05, + "loss": 0.0101, + "step": 2920 + }, + { + "epoch": 0.7391523713420787, + "grad_norm": 0.18364761769771576, + "learning_rate": 3.768079381096536e-05, + "loss": 0.0136, + "step": 2930 + }, + { + "epoch": 0.7416750756811302, + "grad_norm": 0.3442452847957611, + "learning_rate": 3.763874873864783e-05, + "loss": 0.0094, + "step": 2940 + }, + { + "epoch": 0.7441977800201817, + "grad_norm": 0.1985316276550293, + "learning_rate": 3.7596703666330306e-05, + "loss": 0.0058, + "step": 2950 + }, + { + "epoch": 0.7467204843592331, + "grad_norm": 0.000595409597735852, + "learning_rate": 3.755465859401278e-05, + "loss": 0.0212, + "step": 2960 + }, + { + "epoch": 0.7492431886982845, + "grad_norm": 0.23967699706554413, + "learning_rate": 3.751261352169526e-05, + "loss": 0.0064, + "step": 2970 + }, + { + "epoch": 0.751765893037336, + "grad_norm": 0.0012721189996227622, + "learning_rate": 3.747056844937774e-05, + "loss": 0.003, + "step": 2980 + }, + { + "epoch": 0.7542885973763875, + "grad_norm": 0.0369889996945858, + "learning_rate": 3.742852337706021e-05, + "loss": 0.0072, + "step": 2990 + }, + { + "epoch": 0.756811301715439, + "grad_norm": 0.07918387651443481, + "learning_rate": 3.7386478304742686e-05, + "loss": 0.0031, + "step": 3000 + }, + { + "epoch": 0.756811301715439, + "eval_loss": 0.007090387400239706, + "eval_runtime": 20.9252, + "eval_samples_per_second": 84.205, + "eval_steps_per_second": 21.075, + "step": 3000 + }, + { + "epoch": 0.7593340060544904, + "grad_norm": 0.08373123407363892, + "learning_rate": 3.734443323242516e-05, + "loss": 0.0033, + "step": 3010 + }, + { + "epoch": 0.7618567103935419, + "grad_norm": 0.06391701102256775, + "learning_rate": 3.7302388160107636e-05, + "loss": 0.0084, + "step": 3020 + }, + { + "epoch": 0.7643794147325933, + "grad_norm": 0.11146340519189835, + "learning_rate": 3.726034308779011e-05, + "loss": 0.0054, + "step": 3030 + }, + { + "epoch": 0.7669021190716448, + "grad_norm": 0.1086326614022255, + "learning_rate": 3.7218298015472585e-05, + "loss": 0.0028, + "step": 3040 + }, + { + "epoch": 0.7694248234106963, + "grad_norm": 0.029285268858075142, + "learning_rate": 3.7176252943155067e-05, + "loss": 0.0065, + "step": 3050 + }, + { + "epoch": 0.7719475277497477, + "grad_norm": 0.13605491816997528, + "learning_rate": 3.713420787083754e-05, + "loss": 0.0083, + "step": 3060 + }, + { + "epoch": 0.7744702320887992, + "grad_norm": 0.0959770679473877, + "learning_rate": 3.7092162798520016e-05, + "loss": 0.0078, + "step": 3070 + }, + { + "epoch": 0.7769929364278506, + "grad_norm": 0.022328553721308708, + "learning_rate": 3.705011772620249e-05, + "loss": 0.0065, + "step": 3080 + }, + { + "epoch": 0.7795156407669022, + "grad_norm": 0.0018502280581742525, + "learning_rate": 3.7008072653884965e-05, + "loss": 0.0107, + "step": 3090 + }, + { + "epoch": 0.7820383451059536, + "grad_norm": 0.020223820582032204, + "learning_rate": 3.696602758156744e-05, + "loss": 0.0014, + "step": 3100 + }, + { + "epoch": 0.7820383451059536, + "eval_loss": 0.007150140590965748, + "eval_runtime": 20.9003, + "eval_samples_per_second": 84.305, + "eval_steps_per_second": 21.1, + "step": 3100 + }, + { + "epoch": 0.784561049445005, + "grad_norm": 0.17186589539051056, + "learning_rate": 3.6923982509249915e-05, + "loss": 0.0043, + "step": 3110 + }, + { + "epoch": 0.7870837537840565, + "grad_norm": 0.054055992513895035, + "learning_rate": 3.6881937436932396e-05, + "loss": 0.0036, + "step": 3120 + }, + { + "epoch": 0.7896064581231079, + "grad_norm": 0.24574770033359528, + "learning_rate": 3.683989236461487e-05, + "loss": 0.0125, + "step": 3130 + }, + { + "epoch": 0.7921291624621595, + "grad_norm": 0.09889545291662216, + "learning_rate": 3.6797847292297345e-05, + "loss": 0.0071, + "step": 3140 + }, + { + "epoch": 0.7946518668012109, + "grad_norm": 0.1626003533601761, + "learning_rate": 3.675580221997982e-05, + "loss": 0.0037, + "step": 3150 + }, + { + "epoch": 0.7971745711402624, + "grad_norm": 0.1329105943441391, + "learning_rate": 3.6713757147662295e-05, + "loss": 0.007, + "step": 3160 + }, + { + "epoch": 0.7996972754793138, + "grad_norm": 0.18876679241657257, + "learning_rate": 3.667171207534477e-05, + "loss": 0.0121, + "step": 3170 + }, + { + "epoch": 0.8022199798183652, + "grad_norm": 0.1532873511314392, + "learning_rate": 3.6629667003027244e-05, + "loss": 0.0061, + "step": 3180 + }, + { + "epoch": 0.8047426841574168, + "grad_norm": 0.14677301049232483, + "learning_rate": 3.658762193070972e-05, + "loss": 0.0046, + "step": 3190 + }, + { + "epoch": 0.8072653884964682, + "grad_norm": 0.1844077706336975, + "learning_rate": 3.65455768583922e-05, + "loss": 0.0075, + "step": 3200 + }, + { + "epoch": 0.8072653884964682, + "eval_loss": 0.006863369140774012, + "eval_runtime": 20.9061, + "eval_samples_per_second": 84.282, + "eval_steps_per_second": 21.094, + "step": 3200 + }, + { + "epoch": 0.8097880928355197, + "grad_norm": 0.07923123240470886, + "learning_rate": 3.6503531786074675e-05, + "loss": 0.0023, + "step": 3210 + }, + { + "epoch": 0.8123107971745711, + "grad_norm": 0.003948994446545839, + "learning_rate": 3.646148671375715e-05, + "loss": 0.0036, + "step": 3220 + }, + { + "epoch": 0.8148335015136225, + "grad_norm": 0.18977274000644684, + "learning_rate": 3.641944164143963e-05, + "loss": 0.0058, + "step": 3230 + }, + { + "epoch": 0.8173562058526741, + "grad_norm": 0.13241952657699585, + "learning_rate": 3.63773965691221e-05, + "loss": 0.0048, + "step": 3240 + }, + { + "epoch": 0.8198789101917255, + "grad_norm": 0.10835881531238556, + "learning_rate": 3.6335351496804573e-05, + "loss": 0.0065, + "step": 3250 + }, + { + "epoch": 0.822401614530777, + "grad_norm": 0.10177090018987656, + "learning_rate": 3.629330642448705e-05, + "loss": 0.0089, + "step": 3260 + }, + { + "epoch": 0.8249243188698284, + "grad_norm": 0.11547684669494629, + "learning_rate": 3.625126135216953e-05, + "loss": 0.0069, + "step": 3270 + }, + { + "epoch": 0.82744702320888, + "grad_norm": 0.22250983119010925, + "learning_rate": 3.6209216279852004e-05, + "loss": 0.0128, + "step": 3280 + }, + { + "epoch": 0.8299697275479314, + "grad_norm": 0.05016797035932541, + "learning_rate": 3.616717120753448e-05, + "loss": 0.0041, + "step": 3290 + }, + { + "epoch": 0.8324924318869829, + "grad_norm": 0.19384223222732544, + "learning_rate": 3.6125126135216953e-05, + "loss": 0.0125, + "step": 3300 + }, + { + "epoch": 0.8324924318869829, + "eval_loss": 0.006563546601682901, + "eval_runtime": 20.8989, + "eval_samples_per_second": 84.311, + "eval_steps_per_second": 21.102, + "step": 3300 + }, + { + "epoch": 0.8350151362260343, + "grad_norm": 0.09639815986156464, + "learning_rate": 3.6083081062899435e-05, + "loss": 0.0064, + "step": 3310 + }, + { + "epoch": 0.8375378405650857, + "grad_norm": 0.006088436581194401, + "learning_rate": 3.60410359905819e-05, + "loss": 0.0072, + "step": 3320 + }, + { + "epoch": 0.8400605449041373, + "grad_norm": 0.20799516141414642, + "learning_rate": 3.599899091826438e-05, + "loss": 0.0094, + "step": 3330 + }, + { + "epoch": 0.8425832492431887, + "grad_norm": 0.0011494633508846164, + "learning_rate": 3.595694584594685e-05, + "loss": 0.0064, + "step": 3340 + }, + { + "epoch": 0.8451059535822402, + "grad_norm": 0.012169969268143177, + "learning_rate": 3.5914900773629334e-05, + "loss": 0.0077, + "step": 3350 + }, + { + "epoch": 0.8476286579212916, + "grad_norm": 0.15517696738243103, + "learning_rate": 3.587285570131181e-05, + "loss": 0.004, + "step": 3360 + }, + { + "epoch": 0.850151362260343, + "grad_norm": 0.1262129694223404, + "learning_rate": 3.583081062899428e-05, + "loss": 0.0123, + "step": 3370 + }, + { + "epoch": 0.8526740665993946, + "grad_norm": 0.05431267246603966, + "learning_rate": 3.578876555667676e-05, + "loss": 0.0012, + "step": 3380 + }, + { + "epoch": 0.855196770938446, + "grad_norm": 0.27199476957321167, + "learning_rate": 3.574672048435924e-05, + "loss": 0.0076, + "step": 3390 + }, + { + "epoch": 0.8577194752774975, + "grad_norm": 0.20499233901500702, + "learning_rate": 3.5704675412041714e-05, + "loss": 0.0045, + "step": 3400 + }, + { + "epoch": 0.8577194752774975, + "eval_loss": 0.006544772535562515, + "eval_runtime": 20.8969, + "eval_samples_per_second": 84.319, + "eval_steps_per_second": 21.104, + "step": 3400 + }, + { + "epoch": 0.8602421796165489, + "grad_norm": 0.08625713735818863, + "learning_rate": 3.566263033972418e-05, + "loss": 0.0099, + "step": 3410 + }, + { + "epoch": 0.8627648839556004, + "grad_norm": 0.11165639013051987, + "learning_rate": 3.562058526740666e-05, + "loss": 0.0026, + "step": 3420 + }, + { + "epoch": 0.8652875882946519, + "grad_norm": 0.0018256891053169966, + "learning_rate": 3.557854019508914e-05, + "loss": 0.0048, + "step": 3430 + }, + { + "epoch": 0.8678102926337034, + "grad_norm": 0.19064132869243622, + "learning_rate": 3.553649512277161e-05, + "loss": 0.0021, + "step": 3440 + }, + { + "epoch": 0.8703329969727548, + "grad_norm": 0.2267286479473114, + "learning_rate": 3.549445005045409e-05, + "loss": 0.0091, + "step": 3450 + }, + { + "epoch": 0.8728557013118062, + "grad_norm": 0.006103180348873138, + "learning_rate": 3.545240497813657e-05, + "loss": 0.0103, + "step": 3460 + }, + { + "epoch": 0.8753784056508577, + "grad_norm": 0.0026887240819633007, + "learning_rate": 3.541035990581904e-05, + "loss": 0.0048, + "step": 3470 + }, + { + "epoch": 0.8779011099899092, + "grad_norm": 0.06880487501621246, + "learning_rate": 3.536831483350152e-05, + "loss": 0.0031, + "step": 3480 + }, + { + "epoch": 0.8804238143289607, + "grad_norm": 0.14154627919197083, + "learning_rate": 3.5326269761183986e-05, + "loss": 0.0145, + "step": 3490 + }, + { + "epoch": 0.8829465186680121, + "grad_norm": 0.0016246146988123655, + "learning_rate": 3.528422468886647e-05, + "loss": 0.0026, + "step": 3500 + }, + { + "epoch": 0.8829465186680121, + "eval_loss": 0.007226438261568546, + "eval_runtime": 20.911, + "eval_samples_per_second": 84.262, + "eval_steps_per_second": 21.089, + "step": 3500 + }, + { + "epoch": 0.8854692230070635, + "grad_norm": 0.0016699236584827304, + "learning_rate": 3.524217961654894e-05, + "loss": 0.0058, + "step": 3510 + }, + { + "epoch": 0.887991927346115, + "grad_norm": 0.2493603378534317, + "learning_rate": 3.5200134544231416e-05, + "loss": 0.0113, + "step": 3520 + }, + { + "epoch": 0.8905146316851665, + "grad_norm": 0.1181974858045578, + "learning_rate": 3.515808947191389e-05, + "loss": 0.0059, + "step": 3530 + }, + { + "epoch": 0.893037336024218, + "grad_norm": 0.11245719343423843, + "learning_rate": 3.511604439959637e-05, + "loss": 0.0045, + "step": 3540 + }, + { + "epoch": 0.8955600403632694, + "grad_norm": 0.13200731575489044, + "learning_rate": 3.507399932727885e-05, + "loss": 0.0054, + "step": 3550 + }, + { + "epoch": 0.8980827447023209, + "grad_norm": 0.195307195186615, + "learning_rate": 3.503195425496132e-05, + "loss": 0.0055, + "step": 3560 + }, + { + "epoch": 0.9006054490413723, + "grad_norm": 0.08665880560874939, + "learning_rate": 3.4989909182643797e-05, + "loss": 0.0074, + "step": 3570 + }, + { + "epoch": 0.9031281533804238, + "grad_norm": 0.00980606209486723, + "learning_rate": 3.494786411032627e-05, + "loss": 0.0049, + "step": 3580 + }, + { + "epoch": 0.9056508577194753, + "grad_norm": 0.1497032195329666, + "learning_rate": 3.4905819038008746e-05, + "loss": 0.0055, + "step": 3590 + }, + { + "epoch": 0.9081735620585267, + "grad_norm": 0.09247948974370956, + "learning_rate": 3.486377396569122e-05, + "loss": 0.0022, + "step": 3600 + }, + { + "epoch": 0.9081735620585267, + "eval_loss": 0.0062293908558785915, + "eval_runtime": 20.8987, + "eval_samples_per_second": 84.312, + "eval_steps_per_second": 21.102, + "step": 3600 + }, + { + "epoch": 0.9106962663975782, + "grad_norm": 0.09304177761077881, + "learning_rate": 3.48217288933737e-05, + "loss": 0.0101, + "step": 3610 + }, + { + "epoch": 0.9132189707366297, + "grad_norm": 0.004028341267257929, + "learning_rate": 3.477968382105618e-05, + "loss": 0.0097, + "step": 3620 + }, + { + "epoch": 0.9157416750756812, + "grad_norm": 0.03493291139602661, + "learning_rate": 3.473763874873865e-05, + "loss": 0.0051, + "step": 3630 + }, + { + "epoch": 0.9182643794147326, + "grad_norm": 0.12278582155704498, + "learning_rate": 3.4695593676421126e-05, + "loss": 0.0106, + "step": 3640 + }, + { + "epoch": 0.920787083753784, + "grad_norm": 0.18783220648765564, + "learning_rate": 3.46535486041036e-05, + "loss": 0.0043, + "step": 3650 + }, + { + "epoch": 0.9233097880928355, + "grad_norm": 0.013721932657063007, + "learning_rate": 3.4611503531786075e-05, + "loss": 0.0031, + "step": 3660 + }, + { + "epoch": 0.925832492431887, + "grad_norm": 0.01255789864808321, + "learning_rate": 3.456945845946855e-05, + "loss": 0.0043, + "step": 3670 + }, + { + "epoch": 0.9283551967709385, + "grad_norm": 0.06713691353797913, + "learning_rate": 3.4527413387151025e-05, + "loss": 0.0024, + "step": 3680 + }, + { + "epoch": 0.9308779011099899, + "grad_norm": 0.007393176201730967, + "learning_rate": 3.4485368314833506e-05, + "loss": 0.0059, + "step": 3690 + }, + { + "epoch": 0.9334006054490414, + "grad_norm": 0.14634644985198975, + "learning_rate": 3.444332324251598e-05, + "loss": 0.0102, + "step": 3700 + }, + { + "epoch": 0.9334006054490414, + "eval_loss": 0.005810776725411415, + "eval_runtime": 20.8803, + "eval_samples_per_second": 84.386, + "eval_steps_per_second": 21.12, + "step": 3700 + }, + { + "epoch": 0.9359233097880928, + "grad_norm": 0.00905498769134283, + "learning_rate": 3.4401278170198455e-05, + "loss": 0.003, + "step": 3710 + }, + { + "epoch": 0.9384460141271443, + "grad_norm": 0.005104383919388056, + "learning_rate": 3.435923309788093e-05, + "loss": 0.0046, + "step": 3720 + }, + { + "epoch": 0.9409687184661958, + "grad_norm": 0.036407459527254105, + "learning_rate": 3.4317188025563405e-05, + "loss": 0.0052, + "step": 3730 + }, + { + "epoch": 0.9434914228052472, + "grad_norm": 0.12225465476512909, + "learning_rate": 3.427514295324588e-05, + "loss": 0.0011, + "step": 3740 + }, + { + "epoch": 0.9460141271442987, + "grad_norm": 0.002337078098207712, + "learning_rate": 3.4233097880928354e-05, + "loss": 0.0053, + "step": 3750 + }, + { + "epoch": 0.9485368314833501, + "grad_norm": 0.0018623026553541422, + "learning_rate": 3.4191052808610836e-05, + "loss": 0.0021, + "step": 3760 + }, + { + "epoch": 0.9510595358224017, + "grad_norm": 0.013399872928857803, + "learning_rate": 3.414900773629331e-05, + "loss": 0.0032, + "step": 3770 + }, + { + "epoch": 0.9535822401614531, + "grad_norm": 0.010270589962601662, + "learning_rate": 3.4106962663975785e-05, + "loss": 0.0083, + "step": 3780 + }, + { + "epoch": 0.9561049445005045, + "grad_norm": 0.07046973705291748, + "learning_rate": 3.406491759165826e-05, + "loss": 0.0014, + "step": 3790 + }, + { + "epoch": 0.958627648839556, + "grad_norm": 0.0009812767384573817, + "learning_rate": 3.4022872519340734e-05, + "loss": 0.0039, + "step": 3800 + }, + { + "epoch": 0.958627648839556, + "eval_loss": 0.005735259968787432, + "eval_runtime": 20.894, + "eval_samples_per_second": 84.331, + "eval_steps_per_second": 21.107, + "step": 3800 + }, + { + "epoch": 0.9611503531786074, + "grad_norm": 0.11425192654132843, + "learning_rate": 3.398082744702321e-05, + "loss": 0.0068, + "step": 3810 + }, + { + "epoch": 0.963673057517659, + "grad_norm": 0.07777775824069977, + "learning_rate": 3.3938782374705683e-05, + "loss": 0.0013, + "step": 3820 + }, + { + "epoch": 0.9661957618567104, + "grad_norm": 0.10662028938531876, + "learning_rate": 3.389673730238816e-05, + "loss": 0.0084, + "step": 3830 + }, + { + "epoch": 0.9687184661957619, + "grad_norm": 0.07375224679708481, + "learning_rate": 3.385469223007064e-05, + "loss": 0.008, + "step": 3840 + }, + { + "epoch": 0.9712411705348133, + "grad_norm": 0.03361163288354874, + "learning_rate": 3.3812647157753114e-05, + "loss": 0.0059, + "step": 3850 + }, + { + "epoch": 0.9737638748738647, + "grad_norm": 0.012914448976516724, + "learning_rate": 3.377060208543559e-05, + "loss": 0.0009, + "step": 3860 + }, + { + "epoch": 0.9762865792129163, + "grad_norm": 0.15875528752803802, + "learning_rate": 3.3728557013118064e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 0.9788092835519677, + "grad_norm": 0.08293774724006653, + "learning_rate": 3.368651194080054e-05, + "loss": 0.0032, + "step": 3880 + }, + { + "epoch": 0.9813319878910192, + "grad_norm": 0.176809623837471, + "learning_rate": 3.364446686848301e-05, + "loss": 0.0095, + "step": 3890 + }, + { + "epoch": 0.9838546922300706, + "grad_norm": 0.15428629517555237, + "learning_rate": 3.360242179616549e-05, + "loss": 0.009, + "step": 3900 + }, + { + "epoch": 0.9838546922300706, + "eval_loss": 0.005954666528850794, + "eval_runtime": 20.8681, + "eval_samples_per_second": 84.435, + "eval_steps_per_second": 21.133, + "step": 3900 + }, + { + "epoch": 0.986377396569122, + "grad_norm": 0.03709837794303894, + "learning_rate": 3.356037672384797e-05, + "loss": 0.0062, + "step": 3910 + }, + { + "epoch": 0.9889001009081736, + "grad_norm": 0.01939135603606701, + "learning_rate": 3.3518331651530444e-05, + "loss": 0.0045, + "step": 3920 + }, + { + "epoch": 0.991422805247225, + "grad_norm": 0.12339838594198227, + "learning_rate": 3.347628657921292e-05, + "loss": 0.0107, + "step": 3930 + }, + { + "epoch": 0.9939455095862765, + "grad_norm": 0.11743946373462677, + "learning_rate": 3.343424150689539e-05, + "loss": 0.0026, + "step": 3940 + }, + { + "epoch": 0.9964682139253279, + "grad_norm": 0.0008224299526773393, + "learning_rate": 3.3392196434577874e-05, + "loss": 0.0029, + "step": 3950 + }, + { + "epoch": 0.9989909182643795, + "grad_norm": 0.1629364788532257, + "learning_rate": 3.335015136226034e-05, + "loss": 0.0045, + "step": 3960 + }, + { + "epoch": 1.001513622603431, + "grad_norm": 0.015702659264206886, + "learning_rate": 3.330810628994282e-05, + "loss": 0.0018, + "step": 3970 + }, + { + "epoch": 1.0040363269424823, + "grad_norm": 0.010090239346027374, + "learning_rate": 3.326606121762529e-05, + "loss": 0.0016, + "step": 3980 + }, + { + "epoch": 1.0065590312815338, + "grad_norm": 0.1662328988313675, + "learning_rate": 3.322401614530777e-05, + "loss": 0.0028, + "step": 3990 + }, + { + "epoch": 1.0090817356205852, + "grad_norm": 0.028376251459121704, + "learning_rate": 3.318197107299025e-05, + "loss": 0.0012, + "step": 4000 + }, + { + "epoch": 1.0090817356205852, + "eval_loss": 0.005606195889413357, + "eval_runtime": 20.8958, + "eval_samples_per_second": 84.323, + "eval_steps_per_second": 21.105, + "step": 4000 + }, + { + "epoch": 1.0116044399596367, + "grad_norm": 0.18040278553962708, + "learning_rate": 3.313992600067272e-05, + "loss": 0.0031, + "step": 4010 + }, + { + "epoch": 1.014127144298688, + "grad_norm": 0.09627419710159302, + "learning_rate": 3.30978809283552e-05, + "loss": 0.005, + "step": 4020 + }, + { + "epoch": 1.0166498486377396, + "grad_norm": 0.02057558484375477, + "learning_rate": 3.305583585603768e-05, + "loss": 0.0017, + "step": 4030 + }, + { + "epoch": 1.0191725529767912, + "grad_norm": 0.0033118566498160362, + "learning_rate": 3.301379078372015e-05, + "loss": 0.0013, + "step": 4040 + }, + { + "epoch": 1.0216952573158427, + "grad_norm": 0.13773638010025024, + "learning_rate": 3.297174571140262e-05, + "loss": 0.0032, + "step": 4050 + }, + { + "epoch": 1.024217961654894, + "grad_norm": 0.11453539878129959, + "learning_rate": 3.2929700639085096e-05, + "loss": 0.0031, + "step": 4060 + }, + { + "epoch": 1.0267406659939455, + "grad_norm": 0.002913431962952018, + "learning_rate": 3.288765556676758e-05, + "loss": 0.0011, + "step": 4070 + }, + { + "epoch": 1.029263370332997, + "grad_norm": 0.1419718712568283, + "learning_rate": 3.284561049445005e-05, + "loss": 0.0091, + "step": 4080 + }, + { + "epoch": 1.0317860746720484, + "grad_norm": 0.007667609490454197, + "learning_rate": 3.2803565422132527e-05, + "loss": 0.0022, + "step": 4090 + }, + { + "epoch": 1.0343087790110999, + "grad_norm": 0.11158380657434464, + "learning_rate": 3.276152034981501e-05, + "loss": 0.0029, + "step": 4100 + }, + { + "epoch": 1.0343087790110999, + "eval_loss": 0.005591338500380516, + "eval_runtime": 20.8942, + "eval_samples_per_second": 84.33, + "eval_steps_per_second": 21.106, + "step": 4100 + }, + { + "epoch": 1.0368314833501513, + "grad_norm": 0.005254568066447973, + "learning_rate": 3.271947527749748e-05, + "loss": 0.0037, + "step": 4110 + }, + { + "epoch": 1.0393541876892027, + "grad_norm": 0.13042157888412476, + "learning_rate": 3.267743020517996e-05, + "loss": 0.0052, + "step": 4120 + }, + { + "epoch": 1.0418768920282542, + "grad_norm": 0.007648650091141462, + "learning_rate": 3.2635385132862425e-05, + "loss": 0.0037, + "step": 4130 + }, + { + "epoch": 1.0443995963673058, + "grad_norm": 0.18600983917713165, + "learning_rate": 3.259334006054491e-05, + "loss": 0.0036, + "step": 4140 + }, + { + "epoch": 1.0469223007063573, + "grad_norm": 0.00177775660995394, + "learning_rate": 3.255129498822738e-05, + "loss": 0.0026, + "step": 4150 + }, + { + "epoch": 1.0494450050454087, + "grad_norm": 0.055269479751586914, + "learning_rate": 3.2509249915909856e-05, + "loss": 0.0044, + "step": 4160 + }, + { + "epoch": 1.0519677093844602, + "grad_norm": 0.007563919294625521, + "learning_rate": 3.246720484359233e-05, + "loss": 0.0073, + "step": 4170 + }, + { + "epoch": 1.0544904137235116, + "grad_norm": 0.12277320772409439, + "learning_rate": 3.242515977127481e-05, + "loss": 0.0034, + "step": 4180 + }, + { + "epoch": 1.057013118062563, + "grad_norm": 0.1538102626800537, + "learning_rate": 3.238311469895729e-05, + "loss": 0.0029, + "step": 4190 + }, + { + "epoch": 1.0595358224016145, + "grad_norm": 0.23897789418697357, + "learning_rate": 3.234106962663976e-05, + "loss": 0.0026, + "step": 4200 + }, + { + "epoch": 1.0595358224016145, + "eval_loss": 0.005667256191372871, + "eval_runtime": 20.9021, + "eval_samples_per_second": 84.298, + "eval_steps_per_second": 21.098, + "step": 4200 + }, + { + "epoch": 1.062058526740666, + "grad_norm": 0.059796128422021866, + "learning_rate": 3.229902455432223e-05, + "loss": 0.0036, + "step": 4210 + }, + { + "epoch": 1.0645812310797174, + "grad_norm": 0.11254319548606873, + "learning_rate": 3.225697948200471e-05, + "loss": 0.0052, + "step": 4220 + }, + { + "epoch": 1.067103935418769, + "grad_norm": 0.008323262445628643, + "learning_rate": 3.2214934409687185e-05, + "loss": 0.0033, + "step": 4230 + }, + { + "epoch": 1.0696266397578205, + "grad_norm": 4.037764301756397e-05, + "learning_rate": 3.217288933736966e-05, + "loss": 0.0018, + "step": 4240 + }, + { + "epoch": 1.072149344096872, + "grad_norm": 0.07235367596149445, + "learning_rate": 3.213084426505214e-05, + "loss": 0.001, + "step": 4250 + }, + { + "epoch": 1.0746720484359233, + "grad_norm": 0.14224140346050262, + "learning_rate": 3.2088799192734616e-05, + "loss": 0.0021, + "step": 4260 + }, + { + "epoch": 1.0771947527749748, + "grad_norm": 0.15408071875572205, + "learning_rate": 3.204675412041709e-05, + "loss": 0.0036, + "step": 4270 + }, + { + "epoch": 1.0797174571140262, + "grad_norm": 0.0037913790438324213, + "learning_rate": 3.2004709048099566e-05, + "loss": 0.0053, + "step": 4280 + }, + { + "epoch": 1.0822401614530777, + "grad_norm": 0.13299870491027832, + "learning_rate": 3.196266397578204e-05, + "loss": 0.0028, + "step": 4290 + }, + { + "epoch": 1.084762865792129, + "grad_norm": 0.12634998559951782, + "learning_rate": 3.1920618903464515e-05, + "loss": 0.0041, + "step": 4300 + }, + { + "epoch": 1.084762865792129, + "eval_loss": 0.005686003249138594, + "eval_runtime": 20.8959, + "eval_samples_per_second": 84.323, + "eval_steps_per_second": 21.105, + "step": 4300 + }, + { + "epoch": 1.0872855701311805, + "grad_norm": 0.16738834977149963, + "learning_rate": 3.187857383114699e-05, + "loss": 0.003, + "step": 4310 + }, + { + "epoch": 1.089808274470232, + "grad_norm": 0.037017568945884705, + "learning_rate": 3.1836528758829464e-05, + "loss": 0.0025, + "step": 4320 + }, + { + "epoch": 1.0923309788092836, + "grad_norm": 0.07605406641960144, + "learning_rate": 3.1794483686511946e-05, + "loss": 0.0024, + "step": 4330 + }, + { + "epoch": 1.094853683148335, + "grad_norm": 0.011002608574926853, + "learning_rate": 3.175243861419442e-05, + "loss": 0.0017, + "step": 4340 + }, + { + "epoch": 1.0973763874873865, + "grad_norm": 0.0014461104292422533, + "learning_rate": 3.1710393541876895e-05, + "loss": 0.001, + "step": 4350 + }, + { + "epoch": 1.099899091826438, + "grad_norm": 0.04258690029382706, + "learning_rate": 3.166834846955937e-05, + "loss": 0.001, + "step": 4360 + }, + { + "epoch": 1.1024217961654894, + "grad_norm": 0.1580243706703186, + "learning_rate": 3.1626303397241844e-05, + "loss": 0.0063, + "step": 4370 + }, + { + "epoch": 1.1049445005045408, + "grad_norm": 0.0013237325474619865, + "learning_rate": 3.158425832492432e-05, + "loss": 0.0062, + "step": 4380 + }, + { + "epoch": 1.1074672048435923, + "grad_norm": 0.0012392470380291343, + "learning_rate": 3.1542213252606794e-05, + "loss": 0.0025, + "step": 4390 + }, + { + "epoch": 1.1099899091826437, + "grad_norm": 0.04316063970327377, + "learning_rate": 3.150016818028927e-05, + "loss": 0.0015, + "step": 4400 + }, + { + "epoch": 1.1099899091826437, + "eval_loss": 0.005799129139631987, + "eval_runtime": 20.8943, + "eval_samples_per_second": 84.329, + "eval_steps_per_second": 21.106, + "step": 4400 + }, + { + "epoch": 1.1125126135216952, + "grad_norm": 0.12705808877944946, + "learning_rate": 3.145812310797175e-05, + "loss": 0.0038, + "step": 4410 + }, + { + "epoch": 1.1150353178607468, + "grad_norm": 0.01606024242937565, + "learning_rate": 3.1416078035654224e-05, + "loss": 0.0035, + "step": 4420 + }, + { + "epoch": 1.1175580221997983, + "grad_norm": 0.0025201209355145693, + "learning_rate": 3.13740329633367e-05, + "loss": 0.0053, + "step": 4430 + }, + { + "epoch": 1.1200807265388497, + "grad_norm": 0.0011012004688382149, + "learning_rate": 3.1331987891019174e-05, + "loss": 0.0044, + "step": 4440 + }, + { + "epoch": 1.1226034308779012, + "grad_norm": 0.00029570693732239306, + "learning_rate": 3.128994281870165e-05, + "loss": 0.0037, + "step": 4450 + }, + { + "epoch": 1.1251261352169526, + "grad_norm": 0.028565967455506325, + "learning_rate": 3.124789774638412e-05, + "loss": 0.005, + "step": 4460 + }, + { + "epoch": 1.127648839556004, + "grad_norm": 0.09666335582733154, + "learning_rate": 3.12058526740666e-05, + "loss": 0.0025, + "step": 4470 + }, + { + "epoch": 1.1301715438950555, + "grad_norm": 0.0027206747326999903, + "learning_rate": 3.116380760174908e-05, + "loss": 0.0011, + "step": 4480 + }, + { + "epoch": 1.132694248234107, + "grad_norm": 0.012391073629260063, + "learning_rate": 3.1121762529431554e-05, + "loss": 0.0002, + "step": 4490 + }, + { + "epoch": 1.1352169525731584, + "grad_norm": 0.05354852229356766, + "learning_rate": 3.107971745711403e-05, + "loss": 0.0014, + "step": 4500 + }, + { + "epoch": 1.1352169525731584, + "eval_loss": 0.005535118281841278, + "eval_runtime": 20.8683, + "eval_samples_per_second": 84.434, + "eval_steps_per_second": 21.132, + "step": 4500 + }, + { + "epoch": 1.1377396569122098, + "grad_norm": 0.03663089498877525, + "learning_rate": 3.10376723847965e-05, + "loss": 0.0028, + "step": 4510 + }, + { + "epoch": 1.1402623612512612, + "grad_norm": 0.20328471064567566, + "learning_rate": 3.099562731247898e-05, + "loss": 0.0014, + "step": 4520 + }, + { + "epoch": 1.142785065590313, + "grad_norm": 0.19447293877601624, + "learning_rate": 3.095358224016145e-05, + "loss": 0.0046, + "step": 4530 + }, + { + "epoch": 1.1453077699293643, + "grad_norm": 0.02381107583642006, + "learning_rate": 3.091153716784393e-05, + "loss": 0.0049, + "step": 4540 + }, + { + "epoch": 1.1478304742684158, + "grad_norm": 0.010867373086512089, + "learning_rate": 3.08694920955264e-05, + "loss": 0.0014, + "step": 4550 + }, + { + "epoch": 1.1503531786074672, + "grad_norm": 0.09643906354904175, + "learning_rate": 3.082744702320888e-05, + "loss": 0.0077, + "step": 4560 + }, + { + "epoch": 1.1528758829465187, + "grad_norm": 0.0748005211353302, + "learning_rate": 3.078540195089136e-05, + "loss": 0.0006, + "step": 4570 + }, + { + "epoch": 1.15539858728557, + "grad_norm": 0.007943224161863327, + "learning_rate": 3.074335687857383e-05, + "loss": 0.0009, + "step": 4580 + }, + { + "epoch": 1.1579212916246215, + "grad_norm": 0.0026662801392376423, + "learning_rate": 3.0701311806256314e-05, + "loss": 0.0006, + "step": 4590 + }, + { + "epoch": 1.160443995963673, + "grad_norm": 0.007000184152275324, + "learning_rate": 3.065926673393878e-05, + "loss": 0.0029, + "step": 4600 + }, + { + "epoch": 1.160443995963673, + "eval_loss": 0.005535861011594534, + "eval_runtime": 20.911, + "eval_samples_per_second": 84.262, + "eval_steps_per_second": 21.089, + "step": 4600 + }, + { + "epoch": 1.1629667003027246, + "grad_norm": 0.010504338890314102, + "learning_rate": 3.0617221661621257e-05, + "loss": 0.0021, + "step": 4610 + }, + { + "epoch": 1.165489404641776, + "grad_norm": 0.0042596235871315, + "learning_rate": 3.057517658930373e-05, + "loss": 0.0025, + "step": 4620 + }, + { + "epoch": 1.1680121089808275, + "grad_norm": 0.1560107320547104, + "learning_rate": 3.053313151698621e-05, + "loss": 0.0053, + "step": 4630 + }, + { + "epoch": 1.170534813319879, + "grad_norm": 0.03363262489438057, + "learning_rate": 3.0491086444668687e-05, + "loss": 0.0021, + "step": 4640 + }, + { + "epoch": 1.1730575176589304, + "grad_norm": 0.024774545803666115, + "learning_rate": 3.0449041372351162e-05, + "loss": 0.0085, + "step": 4650 + }, + { + "epoch": 1.1755802219979818, + "grad_norm": 0.021040301769971848, + "learning_rate": 3.0406996300033637e-05, + "loss": 0.0015, + "step": 4660 + }, + { + "epoch": 1.1781029263370333, + "grad_norm": 0.09655909985303879, + "learning_rate": 3.0364951227716115e-05, + "loss": 0.0021, + "step": 4670 + }, + { + "epoch": 1.1806256306760847, + "grad_norm": 0.13116657733917236, + "learning_rate": 3.032290615539859e-05, + "loss": 0.0035, + "step": 4680 + }, + { + "epoch": 1.1831483350151362, + "grad_norm": 0.11077822744846344, + "learning_rate": 3.0280861083081064e-05, + "loss": 0.0019, + "step": 4690 + }, + { + "epoch": 1.1856710393541876, + "grad_norm": 0.0015139818424358964, + "learning_rate": 3.023881601076354e-05, + "loss": 0.0019, + "step": 4700 + }, + { + "epoch": 1.1856710393541876, + "eval_loss": 0.005410597659647465, + "eval_runtime": 20.9117, + "eval_samples_per_second": 84.259, + "eval_steps_per_second": 21.089, + "step": 4700 + }, + { + "epoch": 1.188193743693239, + "grad_norm": 0.24234654009342194, + "learning_rate": 3.0196770938446017e-05, + "loss": 0.0029, + "step": 4710 + }, + { + "epoch": 1.1907164480322907, + "grad_norm": 0.13478586077690125, + "learning_rate": 3.015472586612849e-05, + "loss": 0.0054, + "step": 4720 + }, + { + "epoch": 1.1932391523713421, + "grad_norm": 0.00044387555681169033, + "learning_rate": 3.0112680793810966e-05, + "loss": 0.0024, + "step": 4730 + }, + { + "epoch": 1.1957618567103936, + "grad_norm": 0.005823497194796801, + "learning_rate": 3.007063572149344e-05, + "loss": 0.0018, + "step": 4740 + }, + { + "epoch": 1.198284561049445, + "grad_norm": 0.03638460114598274, + "learning_rate": 3.002859064917592e-05, + "loss": 0.0029, + "step": 4750 + }, + { + "epoch": 1.2008072653884965, + "grad_norm": 0.019287308678030968, + "learning_rate": 2.9986545576858393e-05, + "loss": 0.0021, + "step": 4760 + }, + { + "epoch": 1.203329969727548, + "grad_norm": 0.10308881103992462, + "learning_rate": 2.9944500504540868e-05, + "loss": 0.0065, + "step": 4770 + }, + { + "epoch": 1.2058526740665994, + "grad_norm": 0.061113424599170685, + "learning_rate": 2.9902455432223346e-05, + "loss": 0.0017, + "step": 4780 + }, + { + "epoch": 1.2083753784056508, + "grad_norm": 0.07032662630081177, + "learning_rate": 2.986041035990582e-05, + "loss": 0.0029, + "step": 4790 + }, + { + "epoch": 1.2108980827447022, + "grad_norm": 0.09232667833566666, + "learning_rate": 2.9818365287588296e-05, + "loss": 0.0005, + "step": 4800 + }, + { + "epoch": 1.2108980827447022, + "eval_loss": 0.0057380907237529755, + "eval_runtime": 20.8889, + "eval_samples_per_second": 84.351, + "eval_steps_per_second": 21.112, + "step": 4800 + }, + { + "epoch": 1.213420787083754, + "grad_norm": 0.0020024082623422146, + "learning_rate": 2.977632021527077e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2159434914228053, + "grad_norm": 0.02490418404340744, + "learning_rate": 2.9734275142953248e-05, + "loss": 0.001, + "step": 4820 + }, + { + "epoch": 1.2184661957618568, + "grad_norm": 0.16153936088085175, + "learning_rate": 2.9692230070635723e-05, + "loss": 0.0021, + "step": 4830 + }, + { + "epoch": 1.2209889001009082, + "grad_norm": 0.1121373400092125, + "learning_rate": 2.9650184998318198e-05, + "loss": 0.002, + "step": 4840 + }, + { + "epoch": 1.2235116044399597, + "grad_norm": 0.0005092213395982981, + "learning_rate": 2.9608139926000672e-05, + "loss": 0.0031, + "step": 4850 + }, + { + "epoch": 1.226034308779011, + "grad_norm": 0.004732844419777393, + "learning_rate": 2.956609485368315e-05, + "loss": 0.0008, + "step": 4860 + }, + { + "epoch": 1.2285570131180625, + "grad_norm": 0.21401868760585785, + "learning_rate": 2.9524049781365625e-05, + "loss": 0.0103, + "step": 4870 + }, + { + "epoch": 1.231079717457114, + "grad_norm": 0.08474498987197876, + "learning_rate": 2.94820047090481e-05, + "loss": 0.0012, + "step": 4880 + }, + { + "epoch": 1.2336024217961654, + "grad_norm": 0.005474930163472891, + "learning_rate": 2.9439959636730574e-05, + "loss": 0.0018, + "step": 4890 + }, + { + "epoch": 1.2361251261352169, + "grad_norm": 0.20003733038902283, + "learning_rate": 2.9397914564413052e-05, + "loss": 0.0038, + "step": 4900 + }, + { + "epoch": 1.2361251261352169, + "eval_loss": 0.005515508819371462, + "eval_runtime": 20.9039, + "eval_samples_per_second": 84.29, + "eval_steps_per_second": 21.097, + "step": 4900 + }, + { + "epoch": 1.2386478304742683, + "grad_norm": 0.16607780754566193, + "learning_rate": 2.9355869492095527e-05, + "loss": 0.0037, + "step": 4910 + }, + { + "epoch": 1.24117053481332, + "grad_norm": 0.21575786173343658, + "learning_rate": 2.9313824419778e-05, + "loss": 0.0049, + "step": 4920 + }, + { + "epoch": 1.2436932391523714, + "grad_norm": 0.1419685035943985, + "learning_rate": 2.9271779347460483e-05, + "loss": 0.0056, + "step": 4930 + }, + { + "epoch": 1.2462159434914228, + "grad_norm": 0.0009386079618707299, + "learning_rate": 2.9229734275142954e-05, + "loss": 0.0015, + "step": 4940 + }, + { + "epoch": 1.2487386478304743, + "grad_norm": 0.011205712333321571, + "learning_rate": 2.918768920282543e-05, + "loss": 0.0026, + "step": 4950 + }, + { + "epoch": 1.2512613521695257, + "grad_norm": 0.21776536107063293, + "learning_rate": 2.9145644130507904e-05, + "loss": 0.0039, + "step": 4960 + }, + { + "epoch": 1.2537840565085772, + "grad_norm": 0.09818103164434433, + "learning_rate": 2.9103599058190385e-05, + "loss": 0.0011, + "step": 4970 + }, + { + "epoch": 1.2563067608476286, + "grad_norm": 0.16241490840911865, + "learning_rate": 2.9061553985872856e-05, + "loss": 0.002, + "step": 4980 + }, + { + "epoch": 1.25882946518668, + "grad_norm": 0.0026818953920155764, + "learning_rate": 2.901950891355533e-05, + "loss": 0.0021, + "step": 4990 + }, + { + "epoch": 1.2613521695257317, + "grad_norm": 0.026470551267266273, + "learning_rate": 2.8977463841237806e-05, + "loss": 0.006, + "step": 5000 + }, + { + "epoch": 1.2613521695257317, + "eval_loss": 0.00518006319180131, + "eval_runtime": 20.9271, + "eval_samples_per_second": 84.197, + "eval_steps_per_second": 21.073, + "step": 5000 + }, + { + "epoch": 1.2638748738647831, + "grad_norm": 0.0004209143517073244, + "learning_rate": 2.8935418768920287e-05, + "loss": 0.0033, + "step": 5010 + }, + { + "epoch": 1.2663975782038346, + "grad_norm": 0.0030910836067050695, + "learning_rate": 2.8893373696602762e-05, + "loss": 0.0024, + "step": 5020 + }, + { + "epoch": 1.268920282542886, + "grad_norm": 0.013859076425433159, + "learning_rate": 2.8851328624285233e-05, + "loss": 0.0073, + "step": 5030 + }, + { + "epoch": 1.2714429868819375, + "grad_norm": 0.0023835492320358753, + "learning_rate": 2.8809283551967708e-05, + "loss": 0.0023, + "step": 5040 + }, + { + "epoch": 1.273965691220989, + "grad_norm": 0.0017705514328554273, + "learning_rate": 2.876723847965019e-05, + "loss": 0.0009, + "step": 5050 + }, + { + "epoch": 1.2764883955600403, + "grad_norm": 0.09929084032773972, + "learning_rate": 2.8725193407332664e-05, + "loss": 0.0024, + "step": 5060 + }, + { + "epoch": 1.2790110998990918, + "grad_norm": 0.21266485750675201, + "learning_rate": 2.8683148335015135e-05, + "loss": 0.0051, + "step": 5070 + }, + { + "epoch": 1.2815338042381432, + "grad_norm": 0.045401476323604584, + "learning_rate": 2.864110326269761e-05, + "loss": 0.001, + "step": 5080 + }, + { + "epoch": 1.2840565085771947, + "grad_norm": 0.010040095075964928, + "learning_rate": 2.859905819038009e-05, + "loss": 0.0049, + "step": 5090 + }, + { + "epoch": 1.286579212916246, + "grad_norm": 0.1340843141078949, + "learning_rate": 2.8557013118062566e-05, + "loss": 0.0038, + "step": 5100 + }, + { + "epoch": 1.286579212916246, + "eval_loss": 0.0053547462448477745, + "eval_runtime": 20.8929, + "eval_samples_per_second": 84.335, + "eval_steps_per_second": 21.108, + "step": 5100 + }, + { + "epoch": 1.2891019172552975, + "grad_norm": 0.060051582753658295, + "learning_rate": 2.8514968045745037e-05, + "loss": 0.0005, + "step": 5110 + }, + { + "epoch": 1.2916246215943492, + "grad_norm": 0.000854416866786778, + "learning_rate": 2.847292297342752e-05, + "loss": 0.0023, + "step": 5120 + }, + { + "epoch": 1.2941473259334006, + "grad_norm": 0.002330298302695155, + "learning_rate": 2.8430877901109993e-05, + "loss": 0.0031, + "step": 5130 + }, + { + "epoch": 1.296670030272452, + "grad_norm": 0.08991765975952148, + "learning_rate": 2.8388832828792468e-05, + "loss": 0.001, + "step": 5140 + }, + { + "epoch": 1.2991927346115035, + "grad_norm": 0.1184747964143753, + "learning_rate": 2.834678775647494e-05, + "loss": 0.0023, + "step": 5150 + }, + { + "epoch": 1.301715438950555, + "grad_norm": 0.023154448717832565, + "learning_rate": 2.830474268415742e-05, + "loss": 0.0024, + "step": 5160 + }, + { + "epoch": 1.3042381432896064, + "grad_norm": 0.0035342529881745577, + "learning_rate": 2.8262697611839895e-05, + "loss": 0.0033, + "step": 5170 + }, + { + "epoch": 1.3067608476286579, + "grad_norm": 0.09643299877643585, + "learning_rate": 2.822065253952237e-05, + "loss": 0.0068, + "step": 5180 + }, + { + "epoch": 1.3092835519677095, + "grad_norm": 0.0010538576170802116, + "learning_rate": 2.817860746720484e-05, + "loss": 0.0032, + "step": 5190 + }, + { + "epoch": 1.311806256306761, + "grad_norm": 0.004331338219344616, + "learning_rate": 2.8136562394887323e-05, + "loss": 0.0065, + "step": 5200 + }, + { + "epoch": 1.311806256306761, + "eval_loss": 0.005469166673719883, + "eval_runtime": 20.887, + "eval_samples_per_second": 84.359, + "eval_steps_per_second": 21.114, + "step": 5200 + }, + { + "epoch": 1.3143289606458124, + "grad_norm": 0.11001147329807281, + "learning_rate": 2.8094517322569797e-05, + "loss": 0.0027, + "step": 5210 + }, + { + "epoch": 1.3168516649848638, + "grad_norm": 0.006123987026512623, + "learning_rate": 2.8052472250252272e-05, + "loss": 0.0048, + "step": 5220 + }, + { + "epoch": 1.3193743693239153, + "grad_norm": 0.018299918621778488, + "learning_rate": 2.8010427177934743e-05, + "loss": 0.0046, + "step": 5230 + }, + { + "epoch": 1.3218970736629667, + "grad_norm": 0.04792286828160286, + "learning_rate": 2.7968382105617225e-05, + "loss": 0.0026, + "step": 5240 + }, + { + "epoch": 1.3244197780020182, + "grad_norm": 0.0024629354011267424, + "learning_rate": 2.79263370332997e-05, + "loss": 0.0021, + "step": 5250 + }, + { + "epoch": 1.3269424823410696, + "grad_norm": 0.00013681373093277216, + "learning_rate": 2.7884291960982174e-05, + "loss": 0.0044, + "step": 5260 + }, + { + "epoch": 1.329465186680121, + "grad_norm": 0.10849064588546753, + "learning_rate": 2.7842246888664652e-05, + "loss": 0.0034, + "step": 5270 + }, + { + "epoch": 1.3319878910191725, + "grad_norm": 0.0731433853507042, + "learning_rate": 2.7800201816347127e-05, + "loss": 0.0026, + "step": 5280 + }, + { + "epoch": 1.334510595358224, + "grad_norm": 0.0010674018412828445, + "learning_rate": 2.77581567440296e-05, + "loss": 0.0009, + "step": 5290 + }, + { + "epoch": 1.3370332996972754, + "grad_norm": 0.17949962615966797, + "learning_rate": 2.7716111671712076e-05, + "loss": 0.0015, + "step": 5300 + }, + { + "epoch": 1.3370332996972754, + "eval_loss": 0.005221678409725428, + "eval_runtime": 20.8945, + "eval_samples_per_second": 84.328, + "eval_steps_per_second": 21.106, + "step": 5300 + }, + { + "epoch": 1.339556004036327, + "grad_norm": 0.0010082372464239597, + "learning_rate": 2.7674066599394554e-05, + "loss": 0.0003, + "step": 5310 + }, + { + "epoch": 1.3420787083753785, + "grad_norm": 0.00024729970027692616, + "learning_rate": 2.763202152707703e-05, + "loss": 0.0012, + "step": 5320 + }, + { + "epoch": 1.34460141271443, + "grad_norm": 0.09433750808238983, + "learning_rate": 2.7589976454759504e-05, + "loss": 0.0016, + "step": 5330 + }, + { + "epoch": 1.3471241170534813, + "grad_norm": 0.001336489338427782, + "learning_rate": 2.7547931382441978e-05, + "loss": 0.0024, + "step": 5340 + }, + { + "epoch": 1.3496468213925328, + "grad_norm": 0.012806025333702564, + "learning_rate": 2.7505886310124456e-05, + "loss": 0.0033, + "step": 5350 + }, + { + "epoch": 1.3521695257315842, + "grad_norm": 0.16509069502353668, + "learning_rate": 2.746384123780693e-05, + "loss": 0.0086, + "step": 5360 + }, + { + "epoch": 1.3546922300706357, + "grad_norm": 0.0008099581464193761, + "learning_rate": 2.7421796165489406e-05, + "loss": 0.0024, + "step": 5370 + }, + { + "epoch": 1.357214934409687, + "grad_norm": 0.004303140100091696, + "learning_rate": 2.737975109317188e-05, + "loss": 0.0027, + "step": 5380 + }, + { + "epoch": 1.3597376387487388, + "grad_norm": 0.00023327719827648252, + "learning_rate": 2.733770602085436e-05, + "loss": 0.0013, + "step": 5390 + }, + { + "epoch": 1.3622603430877902, + "grad_norm": 0.003809950314462185, + "learning_rate": 2.7295660948536833e-05, + "loss": 0.0005, + "step": 5400 + }, + { + "epoch": 1.3622603430877902, + "eval_loss": 0.005022699944674969, + "eval_runtime": 21.0476, + "eval_samples_per_second": 83.715, + "eval_steps_per_second": 20.952, + "step": 5400 + }, + { + "epoch": 1.3647830474268416, + "grad_norm": 0.00021514434774871916, + "learning_rate": 2.7253615876219308e-05, + "loss": 0.0028, + "step": 5410 + }, + { + "epoch": 1.367305751765893, + "grad_norm": 0.17706815898418427, + "learning_rate": 2.7211570803901782e-05, + "loss": 0.0014, + "step": 5420 + }, + { + "epoch": 1.3698284561049445, + "grad_norm": 0.004937909543514252, + "learning_rate": 2.716952573158426e-05, + "loss": 0.0031, + "step": 5430 + }, + { + "epoch": 1.372351160443996, + "grad_norm": 0.0958208441734314, + "learning_rate": 2.7127480659266735e-05, + "loss": 0.0033, + "step": 5440 + }, + { + "epoch": 1.3748738647830474, + "grad_norm": 0.06263504922389984, + "learning_rate": 2.708543558694921e-05, + "loss": 0.0001, + "step": 5450 + }, + { + "epoch": 1.3773965691220988, + "grad_norm": 0.003332935506477952, + "learning_rate": 2.7043390514631688e-05, + "loss": 0.0002, + "step": 5460 + }, + { + "epoch": 1.3799192734611503, + "grad_norm": 0.16171465814113617, + "learning_rate": 2.7001345442314162e-05, + "loss": 0.004, + "step": 5470 + }, + { + "epoch": 1.3824419778002017, + "grad_norm": 0.04109754040837288, + "learning_rate": 2.6959300369996637e-05, + "loss": 0.0055, + "step": 5480 + }, + { + "epoch": 1.3849646821392532, + "grad_norm": 0.0015252727316692472, + "learning_rate": 2.6917255297679112e-05, + "loss": 0.0038, + "step": 5490 + }, + { + "epoch": 1.3874873864783046, + "grad_norm": 0.0015239976346492767, + "learning_rate": 2.687521022536159e-05, + "loss": 0.001, + "step": 5500 + }, + { + "epoch": 1.3874873864783046, + "eval_loss": 0.005176006816327572, + "eval_runtime": 21.011, + "eval_samples_per_second": 83.861, + "eval_steps_per_second": 20.989, + "step": 5500 + }, + { + "epoch": 1.3900100908173563, + "grad_norm": 0.08292572945356369, + "learning_rate": 2.6833165153044064e-05, + "loss": 0.0015, + "step": 5510 + }, + { + "epoch": 1.3925327951564077, + "grad_norm": 0.011006727814674377, + "learning_rate": 2.679112008072654e-05, + "loss": 0.0019, + "step": 5520 + }, + { + "epoch": 1.3950554994954592, + "grad_norm": 0.15567320585250854, + "learning_rate": 2.6749075008409014e-05, + "loss": 0.0021, + "step": 5530 + }, + { + "epoch": 1.3975782038345106, + "grad_norm": 0.09949897229671478, + "learning_rate": 2.6707029936091492e-05, + "loss": 0.002, + "step": 5540 + }, + { + "epoch": 1.400100908173562, + "grad_norm": 0.07961593568325043, + "learning_rate": 2.6664984863773967e-05, + "loss": 0.0009, + "step": 5550 + }, + { + "epoch": 1.4026236125126135, + "grad_norm": 0.15322332084178925, + "learning_rate": 2.662293979145644e-05, + "loss": 0.0024, + "step": 5560 + }, + { + "epoch": 1.405146316851665, + "grad_norm": 0.1159447729587555, + "learning_rate": 2.6580894719138916e-05, + "loss": 0.0019, + "step": 5570 + }, + { + "epoch": 1.4076690211907166, + "grad_norm": 0.0029101588297635317, + "learning_rate": 2.6538849646821394e-05, + "loss": 0.0008, + "step": 5580 + }, + { + "epoch": 1.410191725529768, + "grad_norm": 0.0002611145027913153, + "learning_rate": 2.649680457450387e-05, + "loss": 0.0055, + "step": 5590 + }, + { + "epoch": 1.4127144298688195, + "grad_norm": 0.14663146436214447, + "learning_rate": 2.6454759502186343e-05, + "loss": 0.0039, + "step": 5600 + }, + { + "epoch": 1.4127144298688195, + "eval_loss": 0.005056018941104412, + "eval_runtime": 20.9323, + "eval_samples_per_second": 84.176, + "eval_steps_per_second": 21.068, + "step": 5600 + }, + { + "epoch": 1.415237134207871, + "grad_norm": 0.0049458956345915794, + "learning_rate": 2.6412714429868825e-05, + "loss": 0.001, + "step": 5610 + }, + { + "epoch": 1.4177598385469223, + "grad_norm": 0.3080619275569916, + "learning_rate": 2.6370669357551296e-05, + "loss": 0.0071, + "step": 5620 + }, + { + "epoch": 1.4202825428859738, + "grad_norm": 0.0023910084273666143, + "learning_rate": 2.632862428523377e-05, + "loss": 0.0017, + "step": 5630 + }, + { + "epoch": 1.4228052472250252, + "grad_norm": 0.0009933901019394398, + "learning_rate": 2.6286579212916245e-05, + "loss": 0.0044, + "step": 5640 + }, + { + "epoch": 1.4253279515640767, + "grad_norm": 0.02665986306965351, + "learning_rate": 2.6244534140598727e-05, + "loss": 0.0009, + "step": 5650 + }, + { + "epoch": 1.427850655903128, + "grad_norm": 0.17384563386440277, + "learning_rate": 2.6202489068281198e-05, + "loss": 0.0065, + "step": 5660 + }, + { + "epoch": 1.4303733602421795, + "grad_norm": 0.05648142844438553, + "learning_rate": 2.6160443995963673e-05, + "loss": 0.0016, + "step": 5670 + }, + { + "epoch": 1.432896064581231, + "grad_norm": 0.004266271833330393, + "learning_rate": 2.6118398923646147e-05, + "loss": 0.0028, + "step": 5680 + }, + { + "epoch": 1.4354187689202824, + "grad_norm": 0.020753854885697365, + "learning_rate": 2.607635385132863e-05, + "loss": 0.002, + "step": 5690 + }, + { + "epoch": 1.437941473259334, + "grad_norm": 0.08341605216264725, + "learning_rate": 2.6034308779011103e-05, + "loss": 0.0023, + "step": 5700 + }, + { + "epoch": 1.437941473259334, + "eval_loss": 0.004923286382108927, + "eval_runtime": 20.9235, + "eval_samples_per_second": 84.212, + "eval_steps_per_second": 21.077, + "step": 5700 + }, + { + "epoch": 1.4404641775983855, + "grad_norm": 0.007267744280397892, + "learning_rate": 2.5992263706693575e-05, + "loss": 0.003, + "step": 5710 + }, + { + "epoch": 1.442986881937437, + "grad_norm": 0.00982646644115448, + "learning_rate": 2.595021863437605e-05, + "loss": 0.0001, + "step": 5720 + }, + { + "epoch": 1.4455095862764884, + "grad_norm": 0.0013306884793564677, + "learning_rate": 2.590817356205853e-05, + "loss": 0.0019, + "step": 5730 + }, + { + "epoch": 1.4480322906155398, + "grad_norm": 0.037949543446302414, + "learning_rate": 2.5866128489741005e-05, + "loss": 0.0014, + "step": 5740 + }, + { + "epoch": 1.4505549949545913, + "grad_norm": 0.0034357199911028147, + "learning_rate": 2.5824083417423477e-05, + "loss": 0.0025, + "step": 5750 + }, + { + "epoch": 1.4530776992936427, + "grad_norm": 0.08490198105573654, + "learning_rate": 2.578203834510595e-05, + "loss": 0.0016, + "step": 5760 + }, + { + "epoch": 1.4556004036326944, + "grad_norm": 0.09188306331634521, + "learning_rate": 2.5739993272788433e-05, + "loss": 0.0006, + "step": 5770 + }, + { + "epoch": 1.4581231079717458, + "grad_norm": 0.3032228350639343, + "learning_rate": 2.5697948200470908e-05, + "loss": 0.0037, + "step": 5780 + }, + { + "epoch": 1.4606458123107973, + "grad_norm": 0.0033124187029898167, + "learning_rate": 2.565590312815338e-05, + "loss": 0.0003, + "step": 5790 + }, + { + "epoch": 1.4631685166498487, + "grad_norm": 0.18161827325820923, + "learning_rate": 2.561385805583586e-05, + "loss": 0.0035, + "step": 5800 + }, + { + "epoch": 1.4631685166498487, + "eval_loss": 0.0052693067118525505, + "eval_runtime": 20.897, + "eval_samples_per_second": 84.318, + "eval_steps_per_second": 21.103, + "step": 5800 + }, + { + "epoch": 1.4656912209889001, + "grad_norm": 0.005931831430643797, + "learning_rate": 2.5571812983518335e-05, + "loss": 0.0028, + "step": 5810 + }, + { + "epoch": 1.4682139253279516, + "grad_norm": 0.0015284974360838532, + "learning_rate": 2.552976791120081e-05, + "loss": 0.0014, + "step": 5820 + }, + { + "epoch": 1.470736629667003, + "grad_norm": 0.035359546542167664, + "learning_rate": 2.548772283888328e-05, + "loss": 0.0017, + "step": 5830 + }, + { + "epoch": 1.4732593340060545, + "grad_norm": 0.0004528906138148159, + "learning_rate": 2.5445677766565762e-05, + "loss": 0.003, + "step": 5840 + }, + { + "epoch": 1.475782038345106, + "grad_norm": 0.0017564162844792008, + "learning_rate": 2.5403632694248237e-05, + "loss": 0.0016, + "step": 5850 + }, + { + "epoch": 1.4783047426841573, + "grad_norm": 0.1506708413362503, + "learning_rate": 2.536158762193071e-05, + "loss": 0.0008, + "step": 5860 + }, + { + "epoch": 1.4808274470232088, + "grad_norm": 0.0023614871315658092, + "learning_rate": 2.5319542549613183e-05, + "loss": 0.0022, + "step": 5870 + }, + { + "epoch": 1.4833501513622602, + "grad_norm": 0.00034669501474127173, + "learning_rate": 2.5277497477295664e-05, + "loss": 0.0032, + "step": 5880 + }, + { + "epoch": 1.4858728557013117, + "grad_norm": 0.0016111385775730014, + "learning_rate": 2.523545240497814e-05, + "loss": 0.0037, + "step": 5890 + }, + { + "epoch": 1.4883955600403633, + "grad_norm": 0.00014663147157989442, + "learning_rate": 2.5193407332660614e-05, + "loss": 0.0008, + "step": 5900 + }, + { + "epoch": 1.4883955600403633, + "eval_loss": 0.004971860907971859, + "eval_runtime": 20.9024, + "eval_samples_per_second": 84.296, + "eval_steps_per_second": 21.098, + "step": 5900 + }, + { + "epoch": 1.4909182643794148, + "grad_norm": 0.15087057650089264, + "learning_rate": 2.5151362260343085e-05, + "loss": 0.0027, + "step": 5910 + }, + { + "epoch": 1.4934409687184662, + "grad_norm": 0.18127664923667908, + "learning_rate": 2.5109317188025566e-05, + "loss": 0.0042, + "step": 5920 + }, + { + "epoch": 1.4959636730575177, + "grad_norm": 0.069893017411232, + "learning_rate": 2.506727211570804e-05, + "loss": 0.0022, + "step": 5930 + }, + { + "epoch": 1.498486377396569, + "grad_norm": 0.00019991624867543578, + "learning_rate": 2.5025227043390516e-05, + "loss": 0.0017, + "step": 5940 + }, + { + "epoch": 1.5010090817356205, + "grad_norm": 0.09269930422306061, + "learning_rate": 2.498318197107299e-05, + "loss": 0.0028, + "step": 5950 + }, + { + "epoch": 1.5035317860746722, + "grad_norm": 0.06926806271076202, + "learning_rate": 2.494113689875547e-05, + "loss": 0.0039, + "step": 5960 + }, + { + "epoch": 1.5060544904137236, + "grad_norm": 0.03350943699479103, + "learning_rate": 2.4899091826437943e-05, + "loss": 0.0052, + "step": 5970 + }, + { + "epoch": 1.508577194752775, + "grad_norm": 0.008175240829586983, + "learning_rate": 2.4857046754120418e-05, + "loss": 0.0023, + "step": 5980 + }, + { + "epoch": 1.5110998990918265, + "grad_norm": 0.1838151216506958, + "learning_rate": 2.4815001681802892e-05, + "loss": 0.0031, + "step": 5990 + }, + { + "epoch": 1.513622603430878, + "grad_norm": 0.11169478297233582, + "learning_rate": 2.477295660948537e-05, + "loss": 0.0042, + "step": 6000 + }, + { + "epoch": 1.513622603430878, + "eval_loss": 0.004873940721154213, + "eval_runtime": 20.8974, + "eval_samples_per_second": 84.317, + "eval_steps_per_second": 21.103, + "step": 6000 + }, + { + "epoch": 1.5161453077699294, + "grad_norm": 0.0018095527775585651, + "learning_rate": 2.4730911537167845e-05, + "loss": 0.0023, + "step": 6010 + }, + { + "epoch": 1.5186680121089808, + "grad_norm": 0.0017755021108314395, + "learning_rate": 2.468886646485032e-05, + "loss": 0.0007, + "step": 6020 + }, + { + "epoch": 1.5211907164480323, + "grad_norm": 0.1006636768579483, + "learning_rate": 2.4646821392532794e-05, + "loss": 0.0054, + "step": 6030 + }, + { + "epoch": 1.5237134207870837, + "grad_norm": 0.17334707081317902, + "learning_rate": 2.4604776320215273e-05, + "loss": 0.0031, + "step": 6040 + }, + { + "epoch": 1.5262361251261352, + "grad_norm": 0.004185323137789965, + "learning_rate": 2.4562731247897747e-05, + "loss": 0.0018, + "step": 6050 + }, + { + "epoch": 1.5287588294651866, + "grad_norm": 0.06221470236778259, + "learning_rate": 2.4520686175580225e-05, + "loss": 0.0032, + "step": 6060 + }, + { + "epoch": 1.531281533804238, + "grad_norm": 0.10330460220575333, + "learning_rate": 2.4478641103262697e-05, + "loss": 0.0035, + "step": 6070 + }, + { + "epoch": 1.5338042381432895, + "grad_norm": 0.13589535653591156, + "learning_rate": 2.4436596030945175e-05, + "loss": 0.0016, + "step": 6080 + }, + { + "epoch": 1.536326942482341, + "grad_norm": 0.19307217001914978, + "learning_rate": 2.439455095862765e-05, + "loss": 0.0039, + "step": 6090 + }, + { + "epoch": 1.5388496468213926, + "grad_norm": 0.07404123246669769, + "learning_rate": 2.4352505886310127e-05, + "loss": 0.0008, + "step": 6100 + }, + { + "epoch": 1.5388496468213926, + "eval_loss": 0.005088960751891136, + "eval_runtime": 20.9119, + "eval_samples_per_second": 84.258, + "eval_steps_per_second": 21.088, + "step": 6100 + }, + { + "epoch": 1.541372351160444, + "grad_norm": 0.06749244034290314, + "learning_rate": 2.43104608139926e-05, + "loss": 0.0045, + "step": 6110 + }, + { + "epoch": 1.5438950554994955, + "grad_norm": 0.00822696927934885, + "learning_rate": 2.4268415741675077e-05, + "loss": 0.0022, + "step": 6120 + }, + { + "epoch": 1.546417759838547, + "grad_norm": 0.0362759605050087, + "learning_rate": 2.4226370669357555e-05, + "loss": 0.0018, + "step": 6130 + }, + { + "epoch": 1.5489404641775983, + "grad_norm": 0.0018353847553953528, + "learning_rate": 2.418432559704003e-05, + "loss": 0.0033, + "step": 6140 + }, + { + "epoch": 1.55146316851665, + "grad_norm": 0.0009886518819257617, + "learning_rate": 2.4142280524722504e-05, + "loss": 0.0033, + "step": 6150 + }, + { + "epoch": 1.5539858728557014, + "grad_norm": 0.005221598315984011, + "learning_rate": 2.410023545240498e-05, + "loss": 0.0042, + "step": 6160 + }, + { + "epoch": 1.5565085771947529, + "grad_norm": 0.02474922128021717, + "learning_rate": 2.4058190380087457e-05, + "loss": 0.0059, + "step": 6170 + }, + { + "epoch": 1.5590312815338043, + "grad_norm": 0.005371175706386566, + "learning_rate": 2.401614530776993e-05, + "loss": 0.004, + "step": 6180 + }, + { + "epoch": 1.5615539858728558, + "grad_norm": 0.1124267429113388, + "learning_rate": 2.3974100235452406e-05, + "loss": 0.0007, + "step": 6190 + }, + { + "epoch": 1.5640766902119072, + "grad_norm": 0.009999338537454605, + "learning_rate": 2.393205516313488e-05, + "loss": 0.0018, + "step": 6200 + }, + { + "epoch": 1.5640766902119072, + "eval_loss": 0.004953174851834774, + "eval_runtime": 20.8923, + "eval_samples_per_second": 84.337, + "eval_steps_per_second": 21.108, + "step": 6200 + }, + { + "epoch": 1.5665993945509586, + "grad_norm": 0.028597630560398102, + "learning_rate": 2.389001009081736e-05, + "loss": 0.0044, + "step": 6210 + }, + { + "epoch": 1.56912209889001, + "grad_norm": 0.0002847505093086511, + "learning_rate": 2.3847965018499833e-05, + "loss": 0.0013, + "step": 6220 + }, + { + "epoch": 1.5716448032290615, + "grad_norm": 0.12137818336486816, + "learning_rate": 2.3805919946182308e-05, + "loss": 0.0025, + "step": 6230 + }, + { + "epoch": 1.574167507568113, + "grad_norm": 0.05651724711060524, + "learning_rate": 2.3763874873864783e-05, + "loss": 0.0022, + "step": 6240 + }, + { + "epoch": 1.5766902119071644, + "grad_norm": 0.225179061293602, + "learning_rate": 2.372182980154726e-05, + "loss": 0.0012, + "step": 6250 + }, + { + "epoch": 1.5792129162462158, + "grad_norm": 0.0041250442154705524, + "learning_rate": 2.3679784729229735e-05, + "loss": 0.0055, + "step": 6260 + }, + { + "epoch": 1.5817356205852673, + "grad_norm": 0.008712096139788628, + "learning_rate": 2.363773965691221e-05, + "loss": 0.0014, + "step": 6270 + }, + { + "epoch": 1.5842583249243187, + "grad_norm": 0.007221329025924206, + "learning_rate": 2.3595694584594685e-05, + "loss": 0.0022, + "step": 6280 + }, + { + "epoch": 1.5867810292633702, + "grad_norm": 0.012200511991977692, + "learning_rate": 2.3553649512277163e-05, + "loss": 0.0012, + "step": 6290 + }, + { + "epoch": 1.5893037336024218, + "grad_norm": 0.0005704654031433165, + "learning_rate": 2.3511604439959638e-05, + "loss": 0.0013, + "step": 6300 + }, + { + "epoch": 1.5893037336024218, + "eval_loss": 0.0048631117679178715, + "eval_runtime": 20.8908, + "eval_samples_per_second": 84.344, + "eval_steps_per_second": 21.11, + "step": 6300 + }, + { + "epoch": 1.5918264379414733, + "grad_norm": 0.002790976082906127, + "learning_rate": 2.3469559367642112e-05, + "loss": 0.0013, + "step": 6310 + }, + { + "epoch": 1.5943491422805247, + "grad_norm": 0.0014387964038178325, + "learning_rate": 2.342751429532459e-05, + "loss": 0.0007, + "step": 6320 + }, + { + "epoch": 1.5968718466195762, + "grad_norm": 0.00022175043704919517, + "learning_rate": 2.3385469223007065e-05, + "loss": 0.0035, + "step": 6330 + }, + { + "epoch": 1.5993945509586278, + "grad_norm": 0.11953356117010117, + "learning_rate": 2.334342415068954e-05, + "loss": 0.0048, + "step": 6340 + }, + { + "epoch": 1.6019172552976793, + "grad_norm": 0.18491606414318085, + "learning_rate": 2.3301379078372014e-05, + "loss": 0.0038, + "step": 6350 + }, + { + "epoch": 1.6044399596367307, + "grad_norm": 0.19568416476249695, + "learning_rate": 2.3259334006054492e-05, + "loss": 0.0027, + "step": 6360 + }, + { + "epoch": 1.6069626639757821, + "grad_norm": 0.08327057212591171, + "learning_rate": 2.3217288933736967e-05, + "loss": 0.0012, + "step": 6370 + }, + { + "epoch": 1.6094853683148336, + "grad_norm": 0.03957786411046982, + "learning_rate": 2.3175243861419445e-05, + "loss": 0.0037, + "step": 6380 + }, + { + "epoch": 1.612008072653885, + "grad_norm": 0.003976314328610897, + "learning_rate": 2.3133198789101916e-05, + "loss": 0.0015, + "step": 6390 + }, + { + "epoch": 1.6145307769929365, + "grad_norm": 0.05154380202293396, + "learning_rate": 2.3091153716784394e-05, + "loss": 0.0003, + "step": 6400 + }, + { + "epoch": 1.6145307769929365, + "eval_loss": 0.0047143567353487015, + "eval_runtime": 20.8928, + "eval_samples_per_second": 84.335, + "eval_steps_per_second": 21.108, + "step": 6400 + }, + { + "epoch": 1.617053481331988, + "grad_norm": 0.0036455143708735704, + "learning_rate": 2.304910864446687e-05, + "loss": 0.0023, + "step": 6410 + }, + { + "epoch": 1.6195761856710393, + "grad_norm": 0.0702298954129219, + "learning_rate": 2.3007063572149347e-05, + "loss": 0.0019, + "step": 6420 + }, + { + "epoch": 1.6220988900100908, + "grad_norm": 0.002678563119843602, + "learning_rate": 2.296501849983182e-05, + "loss": 0.0023, + "step": 6430 + }, + { + "epoch": 1.6246215943491422, + "grad_norm": 0.002132556401193142, + "learning_rate": 2.2922973427514296e-05, + "loss": 0.0018, + "step": 6440 + }, + { + "epoch": 1.6271442986881937, + "grad_norm": 0.2592438757419586, + "learning_rate": 2.288092835519677e-05, + "loss": 0.0039, + "step": 6450 + }, + { + "epoch": 1.629667003027245, + "grad_norm": 0.07999824732542038, + "learning_rate": 2.283888328287925e-05, + "loss": 0.005, + "step": 6460 + }, + { + "epoch": 1.6321897073662965, + "grad_norm": 0.0010637440718710423, + "learning_rate": 2.2796838210561724e-05, + "loss": 0.0008, + "step": 6470 + }, + { + "epoch": 1.634712411705348, + "grad_norm": 0.2309955358505249, + "learning_rate": 2.27547931382442e-05, + "loss": 0.0033, + "step": 6480 + }, + { + "epoch": 1.6372351160443996, + "grad_norm": 0.003076382912695408, + "learning_rate": 2.2712748065926676e-05, + "loss": 0.0004, + "step": 6490 + }, + { + "epoch": 1.639757820383451, + "grad_norm": 0.13771465420722961, + "learning_rate": 2.267070299360915e-05, + "loss": 0.0016, + "step": 6500 + }, + { + "epoch": 1.639757820383451, + "eval_loss": 0.00458995345979929, + "eval_runtime": 20.8904, + "eval_samples_per_second": 84.345, + "eval_steps_per_second": 21.11, + "step": 6500 + }, + { + "epoch": 1.6422805247225025, + "grad_norm": 0.0014351740246638656, + "learning_rate": 2.2628657921291626e-05, + "loss": 0.0014, + "step": 6510 + }, + { + "epoch": 1.644803229061554, + "grad_norm": 0.08296415954828262, + "learning_rate": 2.25866128489741e-05, + "loss": 0.0012, + "step": 6520 + }, + { + "epoch": 1.6473259334006054, + "grad_norm": 0.00042011673212982714, + "learning_rate": 2.254456777665658e-05, + "loss": 0.0087, + "step": 6530 + }, + { + "epoch": 1.649848637739657, + "grad_norm": 0.01929500512778759, + "learning_rate": 2.2502522704339053e-05, + "loss": 0.0015, + "step": 6540 + }, + { + "epoch": 1.6523713420787085, + "grad_norm": 0.14432388544082642, + "learning_rate": 2.2460477632021528e-05, + "loss": 0.001, + "step": 6550 + }, + { + "epoch": 1.65489404641776, + "grad_norm": 0.0028279852122068405, + "learning_rate": 2.2418432559704003e-05, + "loss": 0.0022, + "step": 6560 + }, + { + "epoch": 1.6574167507568114, + "grad_norm": 0.12562096118927002, + "learning_rate": 2.237638748738648e-05, + "loss": 0.0023, + "step": 6570 + }, + { + "epoch": 1.6599394550958628, + "grad_norm": 0.12296324968338013, + "learning_rate": 2.2334342415068955e-05, + "loss": 0.0031, + "step": 6580 + }, + { + "epoch": 1.6624621594349143, + "grad_norm": 0.023628313094377518, + "learning_rate": 2.229229734275143e-05, + "loss": 0.0049, + "step": 6590 + }, + { + "epoch": 1.6649848637739657, + "grad_norm": 0.06037677451968193, + "learning_rate": 2.2250252270433905e-05, + "loss": 0.0015, + "step": 6600 + }, + { + "epoch": 1.6649848637739657, + "eval_loss": 0.004487240687012672, + "eval_runtime": 20.9059, + "eval_samples_per_second": 84.282, + "eval_steps_per_second": 21.094, + "step": 6600 + }, + { + "epoch": 1.6675075681130171, + "grad_norm": 0.038203973323106766, + "learning_rate": 2.2208207198116383e-05, + "loss": 0.0024, + "step": 6610 + }, + { + "epoch": 1.6700302724520686, + "grad_norm": 0.0006242411327548325, + "learning_rate": 2.2166162125798857e-05, + "loss": 0.0016, + "step": 6620 + }, + { + "epoch": 1.67255297679112, + "grad_norm": 0.24198785424232483, + "learning_rate": 2.2124117053481332e-05, + "loss": 0.0042, + "step": 6630 + }, + { + "epoch": 1.6750756811301715, + "grad_norm": 0.008689168840646744, + "learning_rate": 2.208207198116381e-05, + "loss": 0.0014, + "step": 6640 + }, + { + "epoch": 1.677598385469223, + "grad_norm": 0.0001791265094652772, + "learning_rate": 2.2040026908846285e-05, + "loss": 0.001, + "step": 6650 + }, + { + "epoch": 1.6801210898082743, + "grad_norm": 0.1187405064702034, + "learning_rate": 2.199798183652876e-05, + "loss": 0.0012, + "step": 6660 + }, + { + "epoch": 1.6826437941473258, + "grad_norm": 0.17541439831256866, + "learning_rate": 2.1955936764211234e-05, + "loss": 0.0015, + "step": 6670 + }, + { + "epoch": 1.6851664984863775, + "grad_norm": 0.025754814967513084, + "learning_rate": 2.1913891691893712e-05, + "loss": 0.0024, + "step": 6680 + }, + { + "epoch": 1.687689202825429, + "grad_norm": 0.16264697909355164, + "learning_rate": 2.1871846619576187e-05, + "loss": 0.0025, + "step": 6690 + }, + { + "epoch": 1.6902119071644803, + "grad_norm": 0.0034603734966367483, + "learning_rate": 2.1829801547258665e-05, + "loss": 0.0005, + "step": 6700 + }, + { + "epoch": 1.6902119071644803, + "eval_loss": 0.00464710732921958, + "eval_runtime": 20.9042, + "eval_samples_per_second": 84.289, + "eval_steps_per_second": 21.096, + "step": 6700 + }, + { + "epoch": 1.6927346115035318, + "grad_norm": 0.005144911352545023, + "learning_rate": 2.1787756474941136e-05, + "loss": 0.0005, + "step": 6710 + }, + { + "epoch": 1.6952573158425832, + "grad_norm": 0.2386636584997177, + "learning_rate": 2.1745711402623614e-05, + "loss": 0.0045, + "step": 6720 + }, + { + "epoch": 1.6977800201816349, + "grad_norm": 0.0006302748224698007, + "learning_rate": 2.170366633030609e-05, + "loss": 0.0003, + "step": 6730 + }, + { + "epoch": 1.7003027245206863, + "grad_norm": 0.04393825680017471, + "learning_rate": 2.1661621257988567e-05, + "loss": 0.0015, + "step": 6740 + }, + { + "epoch": 1.7028254288597378, + "grad_norm": 0.002832000143826008, + "learning_rate": 2.1619576185671038e-05, + "loss": 0.0029, + "step": 6750 + }, + { + "epoch": 1.7053481331987892, + "grad_norm": 0.16164688766002655, + "learning_rate": 2.1577531113353516e-05, + "loss": 0.0049, + "step": 6760 + }, + { + "epoch": 1.7078708375378406, + "grad_norm": 0.1447678804397583, + "learning_rate": 2.153548604103599e-05, + "loss": 0.0026, + "step": 6770 + }, + { + "epoch": 1.710393541876892, + "grad_norm": 0.005429640877991915, + "learning_rate": 2.149344096871847e-05, + "loss": 0.003, + "step": 6780 + }, + { + "epoch": 1.7129162462159435, + "grad_norm": 0.0007169700693339109, + "learning_rate": 2.145139589640094e-05, + "loss": 0.0012, + "step": 6790 + }, + { + "epoch": 1.715438950554995, + "grad_norm": 0.0009801093256101012, + "learning_rate": 2.1409350824083418e-05, + "loss": 0.0023, + "step": 6800 + }, + { + "epoch": 1.715438950554995, + "eval_loss": 0.0046570939011871815, + "eval_runtime": 20.8879, + "eval_samples_per_second": 84.355, + "eval_steps_per_second": 21.113, + "step": 6800 + }, + { + "epoch": 1.7179616548940464, + "grad_norm": 0.009192834608256817, + "learning_rate": 2.1367305751765896e-05, + "loss": 0.004, + "step": 6810 + }, + { + "epoch": 1.7204843592330978, + "grad_norm": 0.004604650661349297, + "learning_rate": 2.132526067944837e-05, + "loss": 0.0012, + "step": 6820 + }, + { + "epoch": 1.7230070635721493, + "grad_norm": 0.0017496418440714478, + "learning_rate": 2.1283215607130846e-05, + "loss": 0.0027, + "step": 6830 + }, + { + "epoch": 1.7255297679112007, + "grad_norm": 0.003268537111580372, + "learning_rate": 2.124117053481332e-05, + "loss": 0.0027, + "step": 6840 + }, + { + "epoch": 1.7280524722502522, + "grad_norm": 0.00015593957505188882, + "learning_rate": 2.11991254624958e-05, + "loss": 0.0034, + "step": 6850 + }, + { + "epoch": 1.7305751765893036, + "grad_norm": 0.0808212012052536, + "learning_rate": 2.1157080390178273e-05, + "loss": 0.0048, + "step": 6860 + }, + { + "epoch": 1.733097880928355, + "grad_norm": 0.24864982068538666, + "learning_rate": 2.1115035317860748e-05, + "loss": 0.0018, + "step": 6870 + }, + { + "epoch": 1.7356205852674067, + "grad_norm": 0.004026748705655336, + "learning_rate": 2.1072990245543222e-05, + "loss": 0.0033, + "step": 6880 + }, + { + "epoch": 1.7381432896064581, + "grad_norm": 0.017659470438957214, + "learning_rate": 2.10309451732257e-05, + "loss": 0.0016, + "step": 6890 + }, + { + "epoch": 1.7406659939455096, + "grad_norm": 0.004599269945174456, + "learning_rate": 2.0988900100908175e-05, + "loss": 0.0011, + "step": 6900 + }, + { + "epoch": 1.7406659939455096, + "eval_loss": 0.004562552087008953, + "eval_runtime": 20.8736, + "eval_samples_per_second": 84.413, + "eval_steps_per_second": 21.127, + "step": 6900 + }, + { + "epoch": 1.743188698284561, + "grad_norm": 0.00346059980802238, + "learning_rate": 2.094685502859065e-05, + "loss": 0.0026, + "step": 6910 + }, + { + "epoch": 1.7457114026236125, + "grad_norm": 0.03819597512483597, + "learning_rate": 2.0904809956273124e-05, + "loss": 0.0009, + "step": 6920 + }, + { + "epoch": 1.7482341069626641, + "grad_norm": 0.2355988621711731, + "learning_rate": 2.0862764883955602e-05, + "loss": 0.0041, + "step": 6930 + }, + { + "epoch": 1.7507568113017156, + "grad_norm": 0.0024086080957204103, + "learning_rate": 2.0820719811638077e-05, + "loss": 0.0029, + "step": 6940 + }, + { + "epoch": 1.753279515640767, + "grad_norm": 0.25141242146492004, + "learning_rate": 2.0778674739320552e-05, + "loss": 0.0027, + "step": 6950 + }, + { + "epoch": 1.7558022199798184, + "grad_norm": 0.08421680331230164, + "learning_rate": 2.0736629667003026e-05, + "loss": 0.0005, + "step": 6960 + }, + { + "epoch": 1.7583249243188699, + "grad_norm": 0.19552625715732574, + "learning_rate": 2.0694584594685504e-05, + "loss": 0.0014, + "step": 6970 + }, + { + "epoch": 1.7608476286579213, + "grad_norm": 0.0034374226815998554, + "learning_rate": 2.065253952236798e-05, + "loss": 0.0009, + "step": 6980 + }, + { + "epoch": 1.7633703329969728, + "grad_norm": 0.0022041252814233303, + "learning_rate": 2.0610494450050454e-05, + "loss": 0.0007, + "step": 6990 + }, + { + "epoch": 1.7658930373360242, + "grad_norm": 0.2155584841966629, + "learning_rate": 2.0568449377732932e-05, + "loss": 0.0025, + "step": 7000 + }, + { + "epoch": 1.7658930373360242, + "eval_loss": 0.004664108622819185, + "eval_runtime": 20.9248, + "eval_samples_per_second": 84.206, + "eval_steps_per_second": 21.075, + "step": 7000 + }, + { + "epoch": 1.7684157416750756, + "grad_norm": 0.00033274645102210343, + "learning_rate": 2.0526404305415406e-05, + "loss": 0.0011, + "step": 7010 + }, + { + "epoch": 1.770938446014127, + "grad_norm": 0.004436755087226629, + "learning_rate": 2.048435923309788e-05, + "loss": 0.0023, + "step": 7020 + }, + { + "epoch": 1.7734611503531785, + "grad_norm": 0.12980465590953827, + "learning_rate": 2.0442314160780356e-05, + "loss": 0.0012, + "step": 7030 + }, + { + "epoch": 1.77598385469223, + "grad_norm": 0.0001751563249854371, + "learning_rate": 2.0400269088462834e-05, + "loss": 0.0009, + "step": 7040 + }, + { + "epoch": 1.7785065590312814, + "grad_norm": 0.18885697424411774, + "learning_rate": 2.035822401614531e-05, + "loss": 0.0037, + "step": 7050 + }, + { + "epoch": 1.7810292633703328, + "grad_norm": 0.10697660595178604, + "learning_rate": 2.0316178943827787e-05, + "loss": 0.0016, + "step": 7060 + }, + { + "epoch": 1.7835519677093845, + "grad_norm": 0.02362459897994995, + "learning_rate": 2.0274133871510258e-05, + "loss": 0.0007, + "step": 7070 + }, + { + "epoch": 1.786074672048436, + "grad_norm": 8.634651749162003e-05, + "learning_rate": 2.0232088799192736e-05, + "loss": 0.0015, + "step": 7080 + }, + { + "epoch": 1.7885973763874874, + "grad_norm": 0.14645344018936157, + "learning_rate": 2.019004372687521e-05, + "loss": 0.0019, + "step": 7090 + }, + { + "epoch": 1.7911200807265388, + "grad_norm": 0.0008572082151658833, + "learning_rate": 2.014799865455769e-05, + "loss": 0.0022, + "step": 7100 + }, + { + "epoch": 1.7911200807265388, + "eval_loss": 0.0045646862126886845, + "eval_runtime": 20.8857, + "eval_samples_per_second": 84.364, + "eval_steps_per_second": 21.115, + "step": 7100 + }, + { + "epoch": 1.7936427850655903, + "grad_norm": 0.0013892538845539093, + "learning_rate": 2.010595358224016e-05, + "loss": 0.0028, + "step": 7110 + }, + { + "epoch": 1.796165489404642, + "grad_norm": 0.17493274807929993, + "learning_rate": 2.0063908509922638e-05, + "loss": 0.0026, + "step": 7120 + }, + { + "epoch": 1.7986881937436934, + "grad_norm": 0.0053392620757222176, + "learning_rate": 2.0021863437605113e-05, + "loss": 0.0006, + "step": 7130 + }, + { + "epoch": 1.8012108980827448, + "grad_norm": 0.05420933663845062, + "learning_rate": 1.997981836528759e-05, + "loss": 0.0003, + "step": 7140 + }, + { + "epoch": 1.8037336024217963, + "grad_norm": 0.00458677439019084, + "learning_rate": 1.9937773292970065e-05, + "loss": 0.0014, + "step": 7150 + }, + { + "epoch": 1.8062563067608477, + "grad_norm": 0.0370076522231102, + "learning_rate": 1.989572822065254e-05, + "loss": 0.0031, + "step": 7160 + }, + { + "epoch": 1.8087790110998991, + "grad_norm": 0.007424044422805309, + "learning_rate": 1.9853683148335018e-05, + "loss": 0.0027, + "step": 7170 + }, + { + "epoch": 1.8113017154389506, + "grad_norm": 0.03529626876115799, + "learning_rate": 1.9811638076017493e-05, + "loss": 0.0009, + "step": 7180 + }, + { + "epoch": 1.813824419778002, + "grad_norm": 0.15175025165081024, + "learning_rate": 1.9769593003699967e-05, + "loss": 0.0015, + "step": 7190 + }, + { + "epoch": 1.8163471241170535, + "grad_norm": 0.00071295554516837, + "learning_rate": 1.9727547931382442e-05, + "loss": 0.0006, + "step": 7200 + }, + { + "epoch": 1.8163471241170535, + "eval_loss": 0.004439685959368944, + "eval_runtime": 20.8976, + "eval_samples_per_second": 84.316, + "eval_steps_per_second": 21.103, + "step": 7200 + }, + { + "epoch": 1.818869828456105, + "grad_norm": 0.13802039623260498, + "learning_rate": 1.968550285906492e-05, + "loss": 0.0029, + "step": 7210 + }, + { + "epoch": 1.8213925327951563, + "grad_norm": 0.001995902741327882, + "learning_rate": 1.9643457786747395e-05, + "loss": 0.0021, + "step": 7220 + }, + { + "epoch": 1.8239152371342078, + "grad_norm": 0.0005990486242808402, + "learning_rate": 1.960141271442987e-05, + "loss": 0.0006, + "step": 7230 + }, + { + "epoch": 1.8264379414732592, + "grad_norm": 0.08083165436983109, + "learning_rate": 1.9559367642112344e-05, + "loss": 0.0025, + "step": 7240 + }, + { + "epoch": 1.8289606458123107, + "grad_norm": 0.15096262097358704, + "learning_rate": 1.9517322569794822e-05, + "loss": 0.0024, + "step": 7250 + }, + { + "epoch": 1.831483350151362, + "grad_norm": 0.0005141481524333358, + "learning_rate": 1.9475277497477297e-05, + "loss": 0.0047, + "step": 7260 + }, + { + "epoch": 1.8340060544904138, + "grad_norm": 0.0009411073406226933, + "learning_rate": 1.943323242515977e-05, + "loss": 0.0033, + "step": 7270 + }, + { + "epoch": 1.8365287588294652, + "grad_norm": 0.005096075590699911, + "learning_rate": 1.9391187352842246e-05, + "loss": 0.0023, + "step": 7280 + }, + { + "epoch": 1.8390514631685166, + "grad_norm": 0.0007901940844021738, + "learning_rate": 1.9349142280524724e-05, + "loss": 0.0012, + "step": 7290 + }, + { + "epoch": 1.841574167507568, + "grad_norm": 0.00019426460494287312, + "learning_rate": 1.93070972082072e-05, + "loss": 0.0007, + "step": 7300 + }, + { + "epoch": 1.841574167507568, + "eval_loss": 0.004412606358528137, + "eval_runtime": 20.9146, + "eval_samples_per_second": 84.247, + "eval_steps_per_second": 21.086, + "step": 7300 + }, + { + "epoch": 1.8440968718466195, + "grad_norm": 0.1782449334859848, + "learning_rate": 1.9265052135889674e-05, + "loss": 0.0031, + "step": 7310 + }, + { + "epoch": 1.8466195761856712, + "grad_norm": 0.01708620972931385, + "learning_rate": 1.922300706357215e-05, + "loss": 0.0014, + "step": 7320 + }, + { + "epoch": 1.8491422805247226, + "grad_norm": 0.06721550226211548, + "learning_rate": 1.9180961991254626e-05, + "loss": 0.0006, + "step": 7330 + }, + { + "epoch": 1.851664984863774, + "grad_norm": 0.16179241240024567, + "learning_rate": 1.91389169189371e-05, + "loss": 0.0031, + "step": 7340 + }, + { + "epoch": 1.8541876892028255, + "grad_norm": 0.0004393104463815689, + "learning_rate": 1.9096871846619576e-05, + "loss": 0.0006, + "step": 7350 + }, + { + "epoch": 1.856710393541877, + "grad_norm": 3.2668671337887645e-05, + "learning_rate": 1.9054826774302054e-05, + "loss": 0.0034, + "step": 7360 + }, + { + "epoch": 1.8592330978809284, + "grad_norm": 0.1319074034690857, + "learning_rate": 1.901278170198453e-05, + "loss": 0.0007, + "step": 7370 + }, + { + "epoch": 1.8617558022199798, + "grad_norm": 0.18538399040699005, + "learning_rate": 1.8970736629667006e-05, + "loss": 0.0022, + "step": 7380 + }, + { + "epoch": 1.8642785065590313, + "grad_norm": 0.05137573927640915, + "learning_rate": 1.8928691557349478e-05, + "loss": 0.0028, + "step": 7390 + }, + { + "epoch": 1.8668012108980827, + "grad_norm": 0.12319748103618622, + "learning_rate": 1.8886646485031956e-05, + "loss": 0.002, + "step": 7400 + }, + { + "epoch": 1.8668012108980827, + "eval_loss": 0.004506191238760948, + "eval_runtime": 20.8931, + "eval_samples_per_second": 84.334, + "eval_steps_per_second": 21.107, + "step": 7400 + }, + { + "epoch": 1.8693239152371341, + "grad_norm": 0.0010132059687748551, + "learning_rate": 1.884460141271443e-05, + "loss": 0.003, + "step": 7410 + }, + { + "epoch": 1.8718466195761856, + "grad_norm": 0.005542921368032694, + "learning_rate": 1.880255634039691e-05, + "loss": 0.0031, + "step": 7420 + }, + { + "epoch": 1.874369323915237, + "grad_norm": 0.06616313755512238, + "learning_rate": 1.876051126807938e-05, + "loss": 0.003, + "step": 7430 + }, + { + "epoch": 1.8768920282542885, + "grad_norm": 0.13730089366436005, + "learning_rate": 1.8718466195761858e-05, + "loss": 0.0035, + "step": 7440 + }, + { + "epoch": 1.87941473259334, + "grad_norm": 0.00045170748489908874, + "learning_rate": 1.8676421123444332e-05, + "loss": 0.0006, + "step": 7450 + }, + { + "epoch": 1.8819374369323916, + "grad_norm": 0.00024056418624240905, + "learning_rate": 1.863437605112681e-05, + "loss": 0.0032, + "step": 7460 + }, + { + "epoch": 1.884460141271443, + "grad_norm": 0.11974132061004639, + "learning_rate": 1.8592330978809282e-05, + "loss": 0.0003, + "step": 7470 + }, + { + "epoch": 1.8869828456104945, + "grad_norm": 0.17995594441890717, + "learning_rate": 1.855028590649176e-05, + "loss": 0.0026, + "step": 7480 + }, + { + "epoch": 1.889505549949546, + "grad_norm": 0.16754589974880219, + "learning_rate": 1.8508240834174238e-05, + "loss": 0.0024, + "step": 7490 + }, + { + "epoch": 1.8920282542885973, + "grad_norm": 0.0004354271513875574, + "learning_rate": 1.8466195761856713e-05, + "loss": 0.0054, + "step": 7500 + }, + { + "epoch": 1.8920282542885973, + "eval_loss": 0.004418503027409315, + "eval_runtime": 20.8963, + "eval_samples_per_second": 84.321, + "eval_steps_per_second": 21.104, + "step": 7500 + }, + { + "epoch": 1.894550958627649, + "grad_norm": 0.08341953903436661, + "learning_rate": 1.8424150689539187e-05, + "loss": 0.0018, + "step": 7510 + }, + { + "epoch": 1.8970736629667004, + "grad_norm": 0.027765339240431786, + "learning_rate": 1.8382105617221662e-05, + "loss": 0.0019, + "step": 7520 + }, + { + "epoch": 1.8995963673057519, + "grad_norm": 0.08241262286901474, + "learning_rate": 1.834006054490414e-05, + "loss": 0.0033, + "step": 7530 + }, + { + "epoch": 1.9021190716448033, + "grad_norm": 0.0011641500750556588, + "learning_rate": 1.8298015472586615e-05, + "loss": 0.0018, + "step": 7540 + }, + { + "epoch": 1.9046417759838548, + "grad_norm": 0.15536606311798096, + "learning_rate": 1.825597040026909e-05, + "loss": 0.0029, + "step": 7550 + }, + { + "epoch": 1.9071644803229062, + "grad_norm": 0.08957267552614212, + "learning_rate": 1.8213925327951564e-05, + "loss": 0.0015, + "step": 7560 + }, + { + "epoch": 1.9096871846619576, + "grad_norm": 0.001939677633345127, + "learning_rate": 1.8171880255634042e-05, + "loss": 0.0023, + "step": 7570 + }, + { + "epoch": 1.912209889001009, + "grad_norm": 0.0007268782937899232, + "learning_rate": 1.8129835183316517e-05, + "loss": 0.0009, + "step": 7580 + }, + { + "epoch": 1.9147325933400605, + "grad_norm": 0.0003874763788189739, + "learning_rate": 1.808779011099899e-05, + "loss": 0.0016, + "step": 7590 + }, + { + "epoch": 1.917255297679112, + "grad_norm": 0.077357217669487, + "learning_rate": 1.8045745038681466e-05, + "loss": 0.0021, + "step": 7600 + }, + { + "epoch": 1.917255297679112, + "eval_loss": 0.00432681106030941, + "eval_runtime": 20.8834, + "eval_samples_per_second": 84.373, + "eval_steps_per_second": 21.117, + "step": 7600 + }, + { + "epoch": 1.9197780020181634, + "grad_norm": 0.006311261095106602, + "learning_rate": 1.8003699966363944e-05, + "loss": 0.0035, + "step": 7610 + }, + { + "epoch": 1.9223007063572148, + "grad_norm": 0.0030232472345232964, + "learning_rate": 1.796165489404642e-05, + "loss": 0.0025, + "step": 7620 + }, + { + "epoch": 1.9248234106962663, + "grad_norm": 0.201024129986763, + "learning_rate": 1.7919609821728893e-05, + "loss": 0.0015, + "step": 7630 + }, + { + "epoch": 1.9273461150353177, + "grad_norm": 0.002884042216464877, + "learning_rate": 1.7877564749411368e-05, + "loss": 0.0018, + "step": 7640 + }, + { + "epoch": 1.9298688193743692, + "grad_norm": 0.23472699522972107, + "learning_rate": 1.7835519677093846e-05, + "loss": 0.0029, + "step": 7650 + }, + { + "epoch": 1.9323915237134208, + "grad_norm": 0.0487983413040638, + "learning_rate": 1.779347460477632e-05, + "loss": 0.0016, + "step": 7660 + }, + { + "epoch": 1.9349142280524723, + "grad_norm": 0.1196766048669815, + "learning_rate": 1.7751429532458795e-05, + "loss": 0.002, + "step": 7670 + }, + { + "epoch": 1.9374369323915237, + "grad_norm": 0.11802256107330322, + "learning_rate": 1.7709384460141273e-05, + "loss": 0.0016, + "step": 7680 + }, + { + "epoch": 1.9399596367305751, + "grad_norm": 0.00048595041153021157, + "learning_rate": 1.7667339387823748e-05, + "loss": 0.0021, + "step": 7690 + }, + { + "epoch": 1.9424823410696268, + "grad_norm": 0.0009480112348683178, + "learning_rate": 1.7625294315506226e-05, + "loss": 0.0025, + "step": 7700 + }, + { + "epoch": 1.9424823410696268, + "eval_loss": 0.0042783478274941444, + "eval_runtime": 20.8947, + "eval_samples_per_second": 84.328, + "eval_steps_per_second": 21.106, + "step": 7700 + }, + { + "epoch": 1.9450050454086782, + "grad_norm": 0.0010953915771096945, + "learning_rate": 1.7583249243188697e-05, + "loss": 0.0017, + "step": 7710 + }, + { + "epoch": 1.9475277497477297, + "grad_norm": 0.004912779200822115, + "learning_rate": 1.7541204170871175e-05, + "loss": 0.003, + "step": 7720 + }, + { + "epoch": 1.9500504540867811, + "grad_norm": 0.05038010701537132, + "learning_rate": 1.749915909855365e-05, + "loss": 0.0017, + "step": 7730 + }, + { + "epoch": 1.9525731584258326, + "grad_norm": 0.0019162135431542993, + "learning_rate": 1.7457114026236128e-05, + "loss": 0.0012, + "step": 7740 + }, + { + "epoch": 1.955095862764884, + "grad_norm": 0.09494713693857193, + "learning_rate": 1.74150689539186e-05, + "loss": 0.0008, + "step": 7750 + }, + { + "epoch": 1.9576185671039354, + "grad_norm": 0.0007395916618406773, + "learning_rate": 1.7373023881601078e-05, + "loss": 0.0, + "step": 7760 + }, + { + "epoch": 1.9601412714429869, + "grad_norm": 0.10083262622356415, + "learning_rate": 1.7330978809283552e-05, + "loss": 0.0012, + "step": 7770 + }, + { + "epoch": 1.9626639757820383, + "grad_norm": 0.06073877960443497, + "learning_rate": 1.728893373696603e-05, + "loss": 0.0021, + "step": 7780 + }, + { + "epoch": 1.9651866801210898, + "grad_norm": 0.0009476240957155824, + "learning_rate": 1.72468886646485e-05, + "loss": 0.0029, + "step": 7790 + }, + { + "epoch": 1.9677093844601412, + "grad_norm": 0.11249450594186783, + "learning_rate": 1.720484359233098e-05, + "loss": 0.0019, + "step": 7800 + }, + { + "epoch": 1.9677093844601412, + "eval_loss": 0.004121602047234774, + "eval_runtime": 20.9133, + "eval_samples_per_second": 84.253, + "eval_steps_per_second": 21.087, + "step": 7800 + }, + { + "epoch": 1.9702320887991926, + "grad_norm": 0.10980529338121414, + "learning_rate": 1.7162798520013454e-05, + "loss": 0.0031, + "step": 7810 + }, + { + "epoch": 1.972754793138244, + "grad_norm": 0.09414640069007874, + "learning_rate": 1.7120753447695932e-05, + "loss": 0.0012, + "step": 7820 + }, + { + "epoch": 1.9752774974772955, + "grad_norm": 0.1049361452460289, + "learning_rate": 1.7078708375378407e-05, + "loss": 0.0052, + "step": 7830 + }, + { + "epoch": 1.977800201816347, + "grad_norm": 0.06330663710832596, + "learning_rate": 1.703666330306088e-05, + "loss": 0.0043, + "step": 7840 + }, + { + "epoch": 1.9803229061553986, + "grad_norm": 0.000840139458887279, + "learning_rate": 1.699461823074336e-05, + "loss": 0.0007, + "step": 7850 + }, + { + "epoch": 1.98284561049445, + "grad_norm": 0.0014659571461379528, + "learning_rate": 1.6952573158425834e-05, + "loss": 0.0015, + "step": 7860 + }, + { + "epoch": 1.9853683148335015, + "grad_norm": 0.0011587137123569846, + "learning_rate": 1.691052808610831e-05, + "loss": 0.0009, + "step": 7870 + }, + { + "epoch": 1.987891019172553, + "grad_norm": 0.19755807518959045, + "learning_rate": 1.6868483013790784e-05, + "loss": 0.0028, + "step": 7880 + }, + { + "epoch": 1.9904137235116044, + "grad_norm": 0.08327340334653854, + "learning_rate": 1.6826437941473262e-05, + "loss": 0.0012, + "step": 7890 + }, + { + "epoch": 1.992936427850656, + "grad_norm": 0.0017620738362893462, + "learning_rate": 1.6784392869155736e-05, + "loss": 0.0013, + "step": 7900 + }, + { + "epoch": 1.992936427850656, + "eval_loss": 0.004206486977636814, + "eval_runtime": 20.8838, + "eval_samples_per_second": 84.372, + "eval_steps_per_second": 21.117, + "step": 7900 + }, + { + "epoch": 1.9954591321897075, + "grad_norm": 0.002144803060218692, + "learning_rate": 1.674234779683821e-05, + "loss": 0.0004, + "step": 7910 + }, + { + "epoch": 1.997981836528759, + "grad_norm": 0.21179014444351196, + "learning_rate": 1.6700302724520686e-05, + "loss": 0.0029, + "step": 7920 + }, + { + "epoch": 2.0005045408678104, + "grad_norm": 0.029519561678171158, + "learning_rate": 1.6658257652203164e-05, + "loss": 0.0002, + "step": 7930 + }, + { + "epoch": 2.003027245206862, + "grad_norm": 0.0001509381690993905, + "learning_rate": 1.661621257988564e-05, + "loss": 0.0008, + "step": 7940 + }, + { + "epoch": 2.0055499495459133, + "grad_norm": 0.0014306082157418132, + "learning_rate": 1.6574167507568113e-05, + "loss": 0.0007, + "step": 7950 + }, + { + "epoch": 2.0080726538849647, + "grad_norm": 0.0010387469083070755, + "learning_rate": 1.6532122435250588e-05, + "loss": 0.0005, + "step": 7960 + }, + { + "epoch": 2.010595358224016, + "grad_norm": 0.1410035490989685, + "learning_rate": 1.6490077362933066e-05, + "loss": 0.0002, + "step": 7970 + }, + { + "epoch": 2.0131180625630676, + "grad_norm": 0.00042714065057225525, + "learning_rate": 1.644803229061554e-05, + "loss": 0.0008, + "step": 7980 + }, + { + "epoch": 2.015640766902119, + "grad_norm": 0.0003534654970280826, + "learning_rate": 1.6405987218298015e-05, + "loss": 0.0014, + "step": 7990 + }, + { + "epoch": 2.0181634712411705, + "grad_norm": 0.13387076556682587, + "learning_rate": 1.6363942145980493e-05, + "loss": 0.0039, + "step": 8000 + }, + { + "epoch": 2.0181634712411705, + "eval_loss": 0.004306289833039045, + "eval_runtime": 20.8954, + "eval_samples_per_second": 84.325, + "eval_steps_per_second": 21.105, + "step": 8000 + }, + { + "epoch": 2.020686175580222, + "grad_norm": 0.12466511130332947, + "learning_rate": 1.6321897073662968e-05, + "loss": 0.001, + "step": 8010 + }, + { + "epoch": 2.0232088799192733, + "grad_norm": 0.00024663680233061314, + "learning_rate": 1.6279852001345443e-05, + "loss": 0.0002, + "step": 8020 + }, + { + "epoch": 2.025731584258325, + "grad_norm": 0.00923093967139721, + "learning_rate": 1.6237806929027917e-05, + "loss": 0.0006, + "step": 8030 + }, + { + "epoch": 2.028254288597376, + "grad_norm": 0.00024134966952260584, + "learning_rate": 1.6195761856710395e-05, + "loss": 0.0013, + "step": 8040 + }, + { + "epoch": 2.0307769929364277, + "grad_norm": 0.14056503772735596, + "learning_rate": 1.615371678439287e-05, + "loss": 0.0023, + "step": 8050 + }, + { + "epoch": 2.033299697275479, + "grad_norm": 0.17755192518234253, + "learning_rate": 1.6111671712075348e-05, + "loss": 0.0018, + "step": 8060 + }, + { + "epoch": 2.035822401614531, + "grad_norm": 0.07552599161863327, + "learning_rate": 1.606962663975782e-05, + "loss": 0.001, + "step": 8070 + }, + { + "epoch": 2.0383451059535824, + "grad_norm": 0.05230065807700157, + "learning_rate": 1.6027581567440297e-05, + "loss": 0.0006, + "step": 8080 + }, + { + "epoch": 2.040867810292634, + "grad_norm": 0.011521569453179836, + "learning_rate": 1.5985536495122772e-05, + "loss": 0.0005, + "step": 8090 + }, + { + "epoch": 2.0433905146316853, + "grad_norm": 0.00038396025775000453, + "learning_rate": 1.594349142280525e-05, + "loss": 0.0008, + "step": 8100 + }, + { + "epoch": 2.0433905146316853, + "eval_loss": 0.0042536817491054535, + "eval_runtime": 20.8933, + "eval_samples_per_second": 84.333, + "eval_steps_per_second": 21.107, + "step": 8100 + }, + { + "epoch": 2.0459132189707367, + "grad_norm": 0.0009189638658426702, + "learning_rate": 1.590144635048772e-05, + "loss": 0.0002, + "step": 8110 + }, + { + "epoch": 2.048435923309788, + "grad_norm": 0.00038111425237730145, + "learning_rate": 1.58594012781702e-05, + "loss": 0.0009, + "step": 8120 + }, + { + "epoch": 2.0509586276488396, + "grad_norm": 0.002369858091697097, + "learning_rate": 1.5817356205852674e-05, + "loss": 0.0001, + "step": 8130 + }, + { + "epoch": 2.053481331987891, + "grad_norm": 0.07348914444446564, + "learning_rate": 1.5775311133535152e-05, + "loss": 0.0018, + "step": 8140 + }, + { + "epoch": 2.0560040363269425, + "grad_norm": 0.0008833975298330188, + "learning_rate": 1.5733266061217623e-05, + "loss": 0.0005, + "step": 8150 + }, + { + "epoch": 2.058526740665994, + "grad_norm": 0.0008616661070846021, + "learning_rate": 1.56912209889001e-05, + "loss": 0.0002, + "step": 8160 + }, + { + "epoch": 2.0610494450050454, + "grad_norm": 0.003825935535132885, + "learning_rate": 1.564917591658258e-05, + "loss": 0.0009, + "step": 8170 + }, + { + "epoch": 2.063572149344097, + "grad_norm": 0.0037789177149534225, + "learning_rate": 1.5607130844265054e-05, + "loss": 0.0012, + "step": 8180 + }, + { + "epoch": 2.0660948536831483, + "grad_norm": 0.002533160848543048, + "learning_rate": 1.556508577194753e-05, + "loss": 0.0007, + "step": 8190 + }, + { + "epoch": 2.0686175580221997, + "grad_norm": 0.005100834183394909, + "learning_rate": 1.5523040699630003e-05, + "loss": 0.0005, + "step": 8200 + }, + { + "epoch": 2.0686175580221997, + "eval_loss": 0.004342484753578901, + "eval_runtime": 20.9079, + "eval_samples_per_second": 84.274, + "eval_steps_per_second": 21.093, + "step": 8200 + }, + { + "epoch": 2.071140262361251, + "grad_norm": 0.003723128465935588, + "learning_rate": 1.548099562731248e-05, + "loss": 0.0005, + "step": 8210 + }, + { + "epoch": 2.0736629667003026, + "grad_norm": 0.0664861872792244, + "learning_rate": 1.5438950554994956e-05, + "loss": 0.0007, + "step": 8220 + }, + { + "epoch": 2.076185671039354, + "grad_norm": 0.003986823838204145, + "learning_rate": 1.539690548267743e-05, + "loss": 0.0008, + "step": 8230 + }, + { + "epoch": 2.0787083753784055, + "grad_norm": 0.03816875070333481, + "learning_rate": 1.5354860410359905e-05, + "loss": 0.0017, + "step": 8240 + }, + { + "epoch": 2.081231079717457, + "grad_norm": 0.22215162217617035, + "learning_rate": 1.5312815338042384e-05, + "loss": 0.0027, + "step": 8250 + }, + { + "epoch": 2.0837537840565084, + "grad_norm": 0.13046610355377197, + "learning_rate": 1.5270770265724858e-05, + "loss": 0.0031, + "step": 8260 + }, + { + "epoch": 2.0862764883955602, + "grad_norm": 0.16013352572917938, + "learning_rate": 1.5228725193407335e-05, + "loss": 0.0013, + "step": 8270 + }, + { + "epoch": 2.0887991927346117, + "grad_norm": 0.01975584402680397, + "learning_rate": 1.5186680121089808e-05, + "loss": 0.0002, + "step": 8280 + }, + { + "epoch": 2.091321897073663, + "grad_norm": 0.02759338729083538, + "learning_rate": 1.5144635048772286e-05, + "loss": 0.0009, + "step": 8290 + }, + { + "epoch": 2.0938446014127146, + "grad_norm": 0.0007609634776599705, + "learning_rate": 1.5102589976454759e-05, + "loss": 0.0003, + "step": 8300 + }, + { + "epoch": 2.0938446014127146, + "eval_loss": 0.004117065574973822, + "eval_runtime": 20.9041, + "eval_samples_per_second": 84.29, + "eval_steps_per_second": 21.096, + "step": 8300 + }, + { + "epoch": 2.096367305751766, + "grad_norm": 0.0008400371880270541, + "learning_rate": 1.5060544904137237e-05, + "loss": 0.0014, + "step": 8310 + }, + { + "epoch": 2.0988900100908174, + "grad_norm": 0.0029490781016647816, + "learning_rate": 1.501849983181971e-05, + "loss": 0.0017, + "step": 8320 + }, + { + "epoch": 2.101412714429869, + "grad_norm": 0.14210307598114014, + "learning_rate": 1.4976454759502188e-05, + "loss": 0.0012, + "step": 8330 + }, + { + "epoch": 2.1039354187689203, + "grad_norm": 0.0019058181205764413, + "learning_rate": 1.4934409687184664e-05, + "loss": 0.0006, + "step": 8340 + }, + { + "epoch": 2.1064581231079718, + "grad_norm": 0.0012054074322804809, + "learning_rate": 1.4892364614867139e-05, + "loss": 0.0018, + "step": 8350 + }, + { + "epoch": 2.108980827447023, + "grad_norm": 0.0005894547794014215, + "learning_rate": 1.4850319542549615e-05, + "loss": 0.0025, + "step": 8360 + }, + { + "epoch": 2.1115035317860746, + "grad_norm": 0.0017818346386775374, + "learning_rate": 1.480827447023209e-05, + "loss": 0.0017, + "step": 8370 + }, + { + "epoch": 2.114026236125126, + "grad_norm": 0.0991104245185852, + "learning_rate": 1.4766229397914566e-05, + "loss": 0.001, + "step": 8380 + }, + { + "epoch": 2.1165489404641775, + "grad_norm": 0.0006472957320511341, + "learning_rate": 1.472418432559704e-05, + "loss": 0.0004, + "step": 8390 + }, + { + "epoch": 2.119071644803229, + "grad_norm": 0.03154408931732178, + "learning_rate": 1.4682139253279517e-05, + "loss": 0.001, + "step": 8400 + }, + { + "epoch": 2.119071644803229, + "eval_loss": 0.004066385794430971, + "eval_runtime": 20.8985, + "eval_samples_per_second": 84.312, + "eval_steps_per_second": 21.102, + "step": 8400 + }, + { + "epoch": 2.1215943491422804, + "grad_norm": 0.0002149187057511881, + "learning_rate": 1.4640094180961992e-05, + "loss": 0.0013, + "step": 8410 + }, + { + "epoch": 2.124117053481332, + "grad_norm": 0.11510289460420609, + "learning_rate": 1.4598049108644468e-05, + "loss": 0.0006, + "step": 8420 + }, + { + "epoch": 2.1266397578203833, + "grad_norm": 0.11964685469865799, + "learning_rate": 1.4556004036326943e-05, + "loss": 0.0003, + "step": 8430 + }, + { + "epoch": 2.1291624621594347, + "grad_norm": 0.1316743791103363, + "learning_rate": 1.4513958964009419e-05, + "loss": 0.0005, + "step": 8440 + }, + { + "epoch": 2.131685166498486, + "grad_norm": 3.9832641050452366e-05, + "learning_rate": 1.4471913891691894e-05, + "loss": 0.0, + "step": 8450 + }, + { + "epoch": 2.134207870837538, + "grad_norm": 0.00014946168812457472, + "learning_rate": 1.442986881937437e-05, + "loss": 0.0008, + "step": 8460 + }, + { + "epoch": 2.1367305751765895, + "grad_norm": 0.040535129606723785, + "learning_rate": 1.4387823747056845e-05, + "loss": 0.0007, + "step": 8470 + }, + { + "epoch": 2.139253279515641, + "grad_norm": 9.963886986952275e-05, + "learning_rate": 1.4345778674739321e-05, + "loss": 0.0008, + "step": 8480 + }, + { + "epoch": 2.1417759838546924, + "grad_norm": 0.00044994213385507464, + "learning_rate": 1.4303733602421796e-05, + "loss": 0.0004, + "step": 8490 + }, + { + "epoch": 2.144298688193744, + "grad_norm": 0.09494508057832718, + "learning_rate": 1.4261688530104272e-05, + "loss": 0.0008, + "step": 8500 + }, + { + "epoch": 2.144298688193744, + "eval_loss": 0.004108693916350603, + "eval_runtime": 20.9, + "eval_samples_per_second": 84.306, + "eval_steps_per_second": 21.1, + "step": 8500 + }, + { + "epoch": 2.1468213925327952, + "grad_norm": 0.001810372225008905, + "learning_rate": 1.4219643457786749e-05, + "loss": 0.0002, + "step": 8510 + }, + { + "epoch": 2.1493440968718467, + "grad_norm": 0.00013946890248917043, + "learning_rate": 1.4177598385469223e-05, + "loss": 0.0007, + "step": 8520 + }, + { + "epoch": 2.151866801210898, + "grad_norm": 0.003362849121913314, + "learning_rate": 1.4135553313151701e-05, + "loss": 0.0002, + "step": 8530 + }, + { + "epoch": 2.1543895055499496, + "grad_norm": 0.01551423966884613, + "learning_rate": 1.4093508240834174e-05, + "loss": 0.0003, + "step": 8540 + }, + { + "epoch": 2.156912209889001, + "grad_norm": 0.0713684931397438, + "learning_rate": 1.4051463168516652e-05, + "loss": 0.0005, + "step": 8550 + }, + { + "epoch": 2.1594349142280524, + "grad_norm": 0.0008997659897431731, + "learning_rate": 1.4009418096199125e-05, + "loss": 0.0008, + "step": 8560 + }, + { + "epoch": 2.161957618567104, + "grad_norm": 0.00035184502485208213, + "learning_rate": 1.3967373023881603e-05, + "loss": 0.0004, + "step": 8570 + }, + { + "epoch": 2.1644803229061553, + "grad_norm": 0.08870701491832733, + "learning_rate": 1.3925327951564076e-05, + "loss": 0.002, + "step": 8580 + }, + { + "epoch": 2.1670030272452068, + "grad_norm": 0.032671812921762466, + "learning_rate": 1.3883282879246554e-05, + "loss": 0.0001, + "step": 8590 + }, + { + "epoch": 2.169525731584258, + "grad_norm": 0.01390095055103302, + "learning_rate": 1.3841237806929027e-05, + "loss": 0.0016, + "step": 8600 + }, + { + "epoch": 2.169525731584258, + "eval_loss": 0.004325889516621828, + "eval_runtime": 20.8969, + "eval_samples_per_second": 84.319, + "eval_steps_per_second": 21.104, + "step": 8600 + }, + { + "epoch": 2.1720484359233097, + "grad_norm": 0.04760993644595146, + "learning_rate": 1.3799192734611505e-05, + "loss": 0.0016, + "step": 8610 + }, + { + "epoch": 2.174571140262361, + "grad_norm": 0.006453169509768486, + "learning_rate": 1.3757147662293978e-05, + "loss": 0.0002, + "step": 8620 + }, + { + "epoch": 2.1770938446014125, + "grad_norm": 0.04346233606338501, + "learning_rate": 1.3715102589976456e-05, + "loss": 0.0012, + "step": 8630 + }, + { + "epoch": 2.179616548940464, + "grad_norm": 0.010294480249285698, + "learning_rate": 1.367305751765893e-05, + "loss": 0.0003, + "step": 8640 + }, + { + "epoch": 2.182139253279516, + "grad_norm": 0.00014406938862521201, + "learning_rate": 1.3631012445341407e-05, + "loss": 0.0001, + "step": 8650 + }, + { + "epoch": 2.1846619576185673, + "grad_norm": 0.10597830265760422, + "learning_rate": 1.358896737302388e-05, + "loss": 0.0005, + "step": 8660 + }, + { + "epoch": 2.1871846619576187, + "grad_norm": 0.125259131193161, + "learning_rate": 1.3546922300706358e-05, + "loss": 0.0004, + "step": 8670 + }, + { + "epoch": 2.18970736629667, + "grad_norm": 0.00017470364400651306, + "learning_rate": 1.3504877228388835e-05, + "loss": 0.0005, + "step": 8680 + }, + { + "epoch": 2.1922300706357216, + "grad_norm": 0.1178673580288887, + "learning_rate": 1.346283215607131e-05, + "loss": 0.0005, + "step": 8690 + }, + { + "epoch": 2.194752774974773, + "grad_norm": 0.00032886656117625535, + "learning_rate": 1.3420787083753786e-05, + "loss": 0.0016, + "step": 8700 + }, + { + "epoch": 2.194752774974773, + "eval_loss": 0.004205956123769283, + "eval_runtime": 20.8455, + "eval_samples_per_second": 84.527, + "eval_steps_per_second": 21.156, + "step": 8700 + }, + { + "epoch": 2.1972754793138245, + "grad_norm": 0.00314329843968153, + "learning_rate": 1.337874201143626e-05, + "loss": 0.0009, + "step": 8710 + }, + { + "epoch": 2.199798183652876, + "grad_norm": 0.0048809046857059, + "learning_rate": 1.3336696939118737e-05, + "loss": 0.0013, + "step": 8720 + }, + { + "epoch": 2.2023208879919274, + "grad_norm": 0.0004060929059050977, + "learning_rate": 1.3294651866801211e-05, + "loss": 0.0001, + "step": 8730 + }, + { + "epoch": 2.204843592330979, + "grad_norm": 0.0005921365809626877, + "learning_rate": 1.3252606794483688e-05, + "loss": 0.0004, + "step": 8740 + }, + { + "epoch": 2.2073662966700303, + "grad_norm": 0.00011614049435593188, + "learning_rate": 1.3210561722166163e-05, + "loss": 0.0013, + "step": 8750 + }, + { + "epoch": 2.2098890010090817, + "grad_norm": 0.010343813337385654, + "learning_rate": 1.3168516649848639e-05, + "loss": 0.0003, + "step": 8760 + }, + { + "epoch": 2.212411705348133, + "grad_norm": 0.06460902839899063, + "learning_rate": 1.3126471577531114e-05, + "loss": 0.0004, + "step": 8770 + }, + { + "epoch": 2.2149344096871846, + "grad_norm": 0.0001900464267237112, + "learning_rate": 1.308442650521359e-05, + "loss": 0.0005, + "step": 8780 + }, + { + "epoch": 2.217457114026236, + "grad_norm": 0.24223244190216064, + "learning_rate": 1.3042381432896065e-05, + "loss": 0.0009, + "step": 8790 + }, + { + "epoch": 2.2199798183652875, + "grad_norm": 0.012591979466378689, + "learning_rate": 1.3000336360578541e-05, + "loss": 0.0015, + "step": 8800 + }, + { + "epoch": 2.2199798183652875, + "eval_loss": 0.00407541636377573, + "eval_runtime": 20.8574, + "eval_samples_per_second": 84.478, + "eval_steps_per_second": 21.144, + "step": 8800 + }, + { + "epoch": 2.222502522704339, + "grad_norm": 0.008851522579789162, + "learning_rate": 1.2958291288261016e-05, + "loss": 0.0001, + "step": 8810 + }, + { + "epoch": 2.2250252270433903, + "grad_norm": 0.0007781846798025072, + "learning_rate": 1.2916246215943492e-05, + "loss": 0.0002, + "step": 8820 + }, + { + "epoch": 2.227547931382442, + "grad_norm": 0.00011876798089360818, + "learning_rate": 1.2874201143625967e-05, + "loss": 0.0004, + "step": 8830 + }, + { + "epoch": 2.2300706357214937, + "grad_norm": 7.863504288252443e-05, + "learning_rate": 1.2832156071308443e-05, + "loss": 0.0006, + "step": 8840 + }, + { + "epoch": 2.232593340060545, + "grad_norm": 0.03496154770255089, + "learning_rate": 1.279011099899092e-05, + "loss": 0.0008, + "step": 8850 + }, + { + "epoch": 2.2351160443995965, + "grad_norm": 0.0036200936883687973, + "learning_rate": 1.2748065926673394e-05, + "loss": 0.0013, + "step": 8860 + }, + { + "epoch": 2.237638748738648, + "grad_norm": 0.1222803145647049, + "learning_rate": 1.2706020854355872e-05, + "loss": 0.0024, + "step": 8870 + }, + { + "epoch": 2.2401614530776994, + "grad_norm": 0.006895432714372873, + "learning_rate": 1.2663975782038345e-05, + "loss": 0.0005, + "step": 8880 + }, + { + "epoch": 2.242684157416751, + "grad_norm": 0.0022829826921224594, + "learning_rate": 1.2621930709720823e-05, + "loss": 0.0002, + "step": 8890 + }, + { + "epoch": 2.2452068617558023, + "grad_norm": 0.0012163568753749132, + "learning_rate": 1.2579885637403296e-05, + "loss": 0.0008, + "step": 8900 + }, + { + "epoch": 2.2452068617558023, + "eval_loss": 0.004117515403777361, + "eval_runtime": 20.8601, + "eval_samples_per_second": 84.468, + "eval_steps_per_second": 21.141, + "step": 8900 + }, + { + "epoch": 2.2477295660948537, + "grad_norm": 0.00023967861488927156, + "learning_rate": 1.2537840565085774e-05, + "loss": 0.0001, + "step": 8910 + }, + { + "epoch": 2.250252270433905, + "grad_norm": 0.002785157412290573, + "learning_rate": 1.2495795492768249e-05, + "loss": 0.0008, + "step": 8920 + }, + { + "epoch": 2.2527749747729566, + "grad_norm": 0.002094635972753167, + "learning_rate": 1.2453750420450725e-05, + "loss": 0.001, + "step": 8930 + }, + { + "epoch": 2.255297679112008, + "grad_norm": 0.17214207351207733, + "learning_rate": 1.24117053481332e-05, + "loss": 0.001, + "step": 8940 + }, + { + "epoch": 2.2578203834510595, + "grad_norm": 0.14640472829341888, + "learning_rate": 1.2369660275815676e-05, + "loss": 0.0009, + "step": 8950 + }, + { + "epoch": 2.260343087790111, + "grad_norm": 0.05824064090847969, + "learning_rate": 1.232761520349815e-05, + "loss": 0.0004, + "step": 8960 + }, + { + "epoch": 2.2628657921291624, + "grad_norm": 0.0001570479798829183, + "learning_rate": 1.2285570131180627e-05, + "loss": 0.0003, + "step": 8970 + }, + { + "epoch": 2.265388496468214, + "grad_norm": 0.008577616885304451, + "learning_rate": 1.2243525058863102e-05, + "loss": 0.0001, + "step": 8980 + }, + { + "epoch": 2.2679112008072653, + "grad_norm": 0.1447947919368744, + "learning_rate": 1.2201479986545578e-05, + "loss": 0.0011, + "step": 8990 + }, + { + "epoch": 2.2704339051463167, + "grad_norm": 0.00018846993043553084, + "learning_rate": 1.2159434914228053e-05, + "loss": 0.0002, + "step": 9000 + }, + { + "epoch": 2.2704339051463167, + "eval_loss": 0.004093192983418703, + "eval_runtime": 20.8777, + "eval_samples_per_second": 84.396, + "eval_steps_per_second": 21.123, + "step": 9000 + }, + { + "epoch": 2.272956609485368, + "grad_norm": 0.0007275242242030799, + "learning_rate": 1.211738984191053e-05, + "loss": 0.0011, + "step": 9010 + }, + { + "epoch": 2.2754793138244196, + "grad_norm": 0.010818258859217167, + "learning_rate": 1.2075344769593004e-05, + "loss": 0.0002, + "step": 9020 + }, + { + "epoch": 2.2780020181634715, + "grad_norm": 0.019404212012887, + "learning_rate": 1.203329969727548e-05, + "loss": 0.0007, + "step": 9030 + }, + { + "epoch": 2.2805247225025225, + "grad_norm": 0.07261942327022552, + "learning_rate": 1.1991254624957955e-05, + "loss": 0.0002, + "step": 9040 + }, + { + "epoch": 2.2830474268415744, + "grad_norm": 0.0001016618189169094, + "learning_rate": 1.1949209552640431e-05, + "loss": 0.0019, + "step": 9050 + }, + { + "epoch": 2.285570131180626, + "grad_norm": 0.00039705351809971035, + "learning_rate": 1.1907164480322906e-05, + "loss": 0.0002, + "step": 9060 + }, + { + "epoch": 2.2880928355196772, + "grad_norm": 0.019509321078658104, + "learning_rate": 1.1865119408005382e-05, + "loss": 0.0007, + "step": 9070 + }, + { + "epoch": 2.2906155398587287, + "grad_norm": 0.00039067715988494456, + "learning_rate": 1.1823074335687857e-05, + "loss": 0.0007, + "step": 9080 + }, + { + "epoch": 2.29313824419778, + "grad_norm": 0.006623209919780493, + "learning_rate": 1.1781029263370335e-05, + "loss": 0.0017, + "step": 9090 + }, + { + "epoch": 2.2956609485368316, + "grad_norm": 0.09274378418922424, + "learning_rate": 1.173898419105281e-05, + "loss": 0.0007, + "step": 9100 + }, + { + "epoch": 2.2956609485368316, + "eval_loss": 0.004082069266587496, + "eval_runtime": 20.8844, + "eval_samples_per_second": 84.369, + "eval_steps_per_second": 21.116, + "step": 9100 + }, + { + "epoch": 2.298183652875883, + "grad_norm": 0.00021790213941130787, + "learning_rate": 1.1696939118735286e-05, + "loss": 0.0006, + "step": 9110 + }, + { + "epoch": 2.3007063572149344, + "grad_norm": 0.11509310454130173, + "learning_rate": 1.165489404641776e-05, + "loss": 0.0008, + "step": 9120 + }, + { + "epoch": 2.303229061553986, + "grad_norm": 0.0007341225282289088, + "learning_rate": 1.1612848974100237e-05, + "loss": 0.0004, + "step": 9130 + }, + { + "epoch": 2.3057517658930373, + "grad_norm": 0.14291776716709137, + "learning_rate": 1.1570803901782712e-05, + "loss": 0.0023, + "step": 9140 + }, + { + "epoch": 2.3082744702320888, + "grad_norm": 9.565720392856747e-05, + "learning_rate": 1.1528758829465188e-05, + "loss": 0.0007, + "step": 9150 + }, + { + "epoch": 2.31079717457114, + "grad_norm": 3.3541025914018974e-05, + "learning_rate": 1.1486713757147663e-05, + "loss": 0.0013, + "step": 9160 + }, + { + "epoch": 2.3133198789101916, + "grad_norm": 0.0004649158217944205, + "learning_rate": 1.1444668684830139e-05, + "loss": 0.0013, + "step": 9170 + }, + { + "epoch": 2.315842583249243, + "grad_norm": 0.19510401785373688, + "learning_rate": 1.1402623612512614e-05, + "loss": 0.0009, + "step": 9180 + }, + { + "epoch": 2.3183652875882945, + "grad_norm": 0.06924453377723694, + "learning_rate": 1.136057854019509e-05, + "loss": 0.0009, + "step": 9190 + }, + { + "epoch": 2.320887991927346, + "grad_norm": 0.006778767332434654, + "learning_rate": 1.1318533467877565e-05, + "loss": 0.0, + "step": 9200 + }, + { + "epoch": 2.320887991927346, + "eval_loss": 0.004074608441442251, + "eval_runtime": 20.8629, + "eval_samples_per_second": 84.456, + "eval_steps_per_second": 21.138, + "step": 9200 + }, + { + "epoch": 2.3234106962663974, + "grad_norm": 0.0005497061647474766, + "learning_rate": 1.1276488395560041e-05, + "loss": 0.0007, + "step": 9210 + }, + { + "epoch": 2.3259334006054493, + "grad_norm": 0.2050006240606308, + "learning_rate": 1.1234443323242516e-05, + "loss": 0.0025, + "step": 9220 + }, + { + "epoch": 2.3284561049445003, + "grad_norm": 0.002956175012513995, + "learning_rate": 1.1192398250924992e-05, + "loss": 0.0013, + "step": 9230 + }, + { + "epoch": 2.330978809283552, + "grad_norm": 0.0013219810789451003, + "learning_rate": 1.1150353178607467e-05, + "loss": 0.0016, + "step": 9240 + }, + { + "epoch": 2.3335015136226036, + "grad_norm": 0.0003643881937023252, + "learning_rate": 1.1108308106289943e-05, + "loss": 0.0003, + "step": 9250 + }, + { + "epoch": 2.336024217961655, + "grad_norm": 0.00021895192912779748, + "learning_rate": 1.106626303397242e-05, + "loss": 0.0005, + "step": 9260 + }, + { + "epoch": 2.3385469223007065, + "grad_norm": 0.07140027731657028, + "learning_rate": 1.1024217961654896e-05, + "loss": 0.0003, + "step": 9270 + }, + { + "epoch": 2.341069626639758, + "grad_norm": 0.004080440849065781, + "learning_rate": 1.098217288933737e-05, + "loss": 0.0007, + "step": 9280 + }, + { + "epoch": 2.3435923309788094, + "grad_norm": 0.00017156251124106348, + "learning_rate": 1.0940127817019847e-05, + "loss": 0.001, + "step": 9290 + }, + { + "epoch": 2.346115035317861, + "grad_norm": 0.00018193376308772713, + "learning_rate": 1.0898082744702322e-05, + "loss": 0.0019, + "step": 9300 + }, + { + "epoch": 2.346115035317861, + "eval_loss": 0.00402724277228117, + "eval_runtime": 20.8513, + "eval_samples_per_second": 84.503, + "eval_steps_per_second": 21.15, + "step": 9300 + }, + { + "epoch": 2.3486377396569122, + "grad_norm": 3.1227758881868795e-05, + "learning_rate": 1.0856037672384798e-05, + "loss": 0.0001, + "step": 9310 + }, + { + "epoch": 2.3511604439959637, + "grad_norm": 0.0005630968371406198, + "learning_rate": 1.0813992600067273e-05, + "loss": 0.0013, + "step": 9320 + }, + { + "epoch": 2.353683148335015, + "grad_norm": 0.011226714588701725, + "learning_rate": 1.0771947527749749e-05, + "loss": 0.0026, + "step": 9330 + }, + { + "epoch": 2.3562058526740666, + "grad_norm": 0.0001727965282043442, + "learning_rate": 1.0729902455432224e-05, + "loss": 0.0007, + "step": 9340 + }, + { + "epoch": 2.358728557013118, + "grad_norm": 0.00026375442394055426, + "learning_rate": 1.06878573831147e-05, + "loss": 0.0005, + "step": 9350 + }, + { + "epoch": 2.3612512613521695, + "grad_norm": 0.058445390313863754, + "learning_rate": 1.0645812310797175e-05, + "loss": 0.0002, + "step": 9360 + }, + { + "epoch": 2.363773965691221, + "grad_norm": 0.0032288488000631332, + "learning_rate": 1.0603767238479651e-05, + "loss": 0.0009, + "step": 9370 + }, + { + "epoch": 2.3662966700302723, + "grad_norm": 0.0001218500838149339, + "learning_rate": 1.0561722166162126e-05, + "loss": 0.0003, + "step": 9380 + }, + { + "epoch": 2.3688193743693238, + "grad_norm": 0.0034417565912008286, + "learning_rate": 1.0519677093844602e-05, + "loss": 0.0001, + "step": 9390 + }, + { + "epoch": 2.371342078708375, + "grad_norm": 0.0020040010567754507, + "learning_rate": 1.0477632021527077e-05, + "loss": 0.0007, + "step": 9400 + }, + { + "epoch": 2.371342078708375, + "eval_loss": 0.00405073631554842, + "eval_runtime": 20.8534, + "eval_samples_per_second": 84.495, + "eval_steps_per_second": 21.148, + "step": 9400 + }, + { + "epoch": 2.3738647830474267, + "grad_norm": 0.158976748585701, + "learning_rate": 1.0435586949209553e-05, + "loss": 0.0011, + "step": 9410 + }, + { + "epoch": 2.376387487386478, + "grad_norm": 0.0006932442774996161, + "learning_rate": 1.0393541876892028e-05, + "loss": 0.0004, + "step": 9420 + }, + { + "epoch": 2.37891019172553, + "grad_norm": 0.000254453276284039, + "learning_rate": 1.0351496804574506e-05, + "loss": 0.0003, + "step": 9430 + }, + { + "epoch": 2.3814328960645814, + "grad_norm": 0.001546886982396245, + "learning_rate": 1.030945173225698e-05, + "loss": 0.0001, + "step": 9440 + }, + { + "epoch": 2.383955600403633, + "grad_norm": 0.004422788508236408, + "learning_rate": 1.0267406659939457e-05, + "loss": 0.0001, + "step": 9450 + }, + { + "epoch": 2.3864783047426843, + "grad_norm": 0.08441481739282608, + "learning_rate": 1.0225361587621931e-05, + "loss": 0.001, + "step": 9460 + }, + { + "epoch": 2.3890010090817357, + "grad_norm": 0.0022749004419893026, + "learning_rate": 1.0183316515304408e-05, + "loss": 0.0002, + "step": 9470 + }, + { + "epoch": 2.391523713420787, + "grad_norm": 0.0009072842076420784, + "learning_rate": 1.0141271442986882e-05, + "loss": 0.0007, + "step": 9480 + }, + { + "epoch": 2.3940464177598386, + "grad_norm": 0.0009312007459811866, + "learning_rate": 1.0099226370669359e-05, + "loss": 0.0004, + "step": 9490 + }, + { + "epoch": 2.39656912209889, + "grad_norm": 0.00628532562404871, + "learning_rate": 1.0057181298351834e-05, + "loss": 0.0004, + "step": 9500 + }, + { + "epoch": 2.39656912209889, + "eval_loss": 0.004011793062090874, + "eval_runtime": 20.8472, + "eval_samples_per_second": 84.52, + "eval_steps_per_second": 21.154, + "step": 9500 + }, + { + "epoch": 2.3990918264379415, + "grad_norm": 0.01351741049438715, + "learning_rate": 1.001513622603431e-05, + "loss": 0.0002, + "step": 9510 + }, + { + "epoch": 2.401614530776993, + "grad_norm": 0.1305786371231079, + "learning_rate": 9.973091153716785e-06, + "loss": 0.0007, + "step": 9520 + }, + { + "epoch": 2.4041372351160444, + "grad_norm": 0.0029213367961347103, + "learning_rate": 9.931046081399261e-06, + "loss": 0.0003, + "step": 9530 + }, + { + "epoch": 2.406659939455096, + "grad_norm": 0.004418868105858564, + "learning_rate": 9.889001009081736e-06, + "loss": 0.0007, + "step": 9540 + }, + { + "epoch": 2.4091826437941473, + "grad_norm": 0.022457575425505638, + "learning_rate": 9.846955936764212e-06, + "loss": 0.0005, + "step": 9550 + }, + { + "epoch": 2.4117053481331987, + "grad_norm": 0.0011523871216922998, + "learning_rate": 9.804910864446687e-06, + "loss": 0.0008, + "step": 9560 + }, + { + "epoch": 2.41422805247225, + "grad_norm": 0.00031501849298365414, + "learning_rate": 9.762865792129163e-06, + "loss": 0.0005, + "step": 9570 + }, + { + "epoch": 2.4167507568113016, + "grad_norm": 0.0037088736426085234, + "learning_rate": 9.720820719811638e-06, + "loss": 0.0012, + "step": 9580 + }, + { + "epoch": 2.419273461150353, + "grad_norm": 6.174742884468287e-05, + "learning_rate": 9.678775647494114e-06, + "loss": 0.0001, + "step": 9590 + }, + { + "epoch": 2.4217961654894045, + "grad_norm": 0.05361416935920715, + "learning_rate": 9.63673057517659e-06, + "loss": 0.0014, + "step": 9600 + }, + { + "epoch": 2.4217961654894045, + "eval_loss": 0.003927647601813078, + "eval_runtime": 20.8667, + "eval_samples_per_second": 84.441, + "eval_steps_per_second": 21.134, + "step": 9600 + }, + { + "epoch": 2.424318869828456, + "grad_norm": 0.0005672819679602981, + "learning_rate": 9.594685502859067e-06, + "loss": 0.0002, + "step": 9610 + }, + { + "epoch": 2.426841574167508, + "grad_norm": 0.03695020079612732, + "learning_rate": 9.552640430541541e-06, + "loss": 0.004, + "step": 9620 + }, + { + "epoch": 2.429364278506559, + "grad_norm": 0.17845889925956726, + "learning_rate": 9.510595358224018e-06, + "loss": 0.0011, + "step": 9630 + }, + { + "epoch": 2.4318869828456107, + "grad_norm": 0.0009999609319493175, + "learning_rate": 9.468550285906492e-06, + "loss": 0.0008, + "step": 9640 + }, + { + "epoch": 2.434409687184662, + "grad_norm": 0.10458512604236603, + "learning_rate": 9.426505213588969e-06, + "loss": 0.0007, + "step": 9650 + }, + { + "epoch": 2.4369323915237135, + "grad_norm": 0.00014382805966306478, + "learning_rate": 9.384460141271443e-06, + "loss": 0.0006, + "step": 9660 + }, + { + "epoch": 2.439455095862765, + "grad_norm": 0.2015506774187088, + "learning_rate": 9.34241506895392e-06, + "loss": 0.0022, + "step": 9670 + }, + { + "epoch": 2.4419778002018164, + "grad_norm": 0.0007017810712568462, + "learning_rate": 9.300369996636394e-06, + "loss": 0.0003, + "step": 9680 + }, + { + "epoch": 2.444500504540868, + "grad_norm": 0.003951243124902248, + "learning_rate": 9.25832492431887e-06, + "loss": 0.0016, + "step": 9690 + }, + { + "epoch": 2.4470232088799193, + "grad_norm": 0.0021652476862072945, + "learning_rate": 9.216279852001345e-06, + "loss": 0.0002, + "step": 9700 + }, + { + "epoch": 2.4470232088799193, + "eval_loss": 0.004020575433969498, + "eval_runtime": 20.8681, + "eval_samples_per_second": 84.435, + "eval_steps_per_second": 21.133, + "step": 9700 + }, + { + "epoch": 2.4495459132189707, + "grad_norm": 0.0001469567941967398, + "learning_rate": 9.174234779683822e-06, + "loss": 0.0004, + "step": 9710 + }, + { + "epoch": 2.452068617558022, + "grad_norm": 0.06326698511838913, + "learning_rate": 9.132189707366296e-06, + "loss": 0.0002, + "step": 9720 + }, + { + "epoch": 2.4545913218970736, + "grad_norm": 0.006055818870663643, + "learning_rate": 9.090144635048773e-06, + "loss": 0.0001, + "step": 9730 + }, + { + "epoch": 2.457114026236125, + "grad_norm": 0.006732003763318062, + "learning_rate": 9.048099562731247e-06, + "loss": 0.0003, + "step": 9740 + }, + { + "epoch": 2.4596367305751765, + "grad_norm": 0.0013023527571931481, + "learning_rate": 9.006054490413724e-06, + "loss": 0.0003, + "step": 9750 + }, + { + "epoch": 2.462159434914228, + "grad_norm": 0.00027046047034673393, + "learning_rate": 8.964009418096199e-06, + "loss": 0.0005, + "step": 9760 + }, + { + "epoch": 2.4646821392532794, + "grad_norm": 0.0663028210401535, + "learning_rate": 8.921964345778677e-06, + "loss": 0.0017, + "step": 9770 + }, + { + "epoch": 2.467204843592331, + "grad_norm": 0.06608086824417114, + "learning_rate": 8.879919273461151e-06, + "loss": 0.0016, + "step": 9780 + }, + { + "epoch": 2.4697275479313823, + "grad_norm": 0.1625903695821762, + "learning_rate": 8.837874201143628e-06, + "loss": 0.0028, + "step": 9790 + }, + { + "epoch": 2.4722502522704337, + "grad_norm": 0.0005290044355206192, + "learning_rate": 8.795829128826102e-06, + "loss": 0.0013, + "step": 9800 + }, + { + "epoch": 2.4722502522704337, + "eval_loss": 0.00388718512840569, + "eval_runtime": 20.8863, + "eval_samples_per_second": 84.362, + "eval_steps_per_second": 21.114, + "step": 9800 + }, + { + "epoch": 2.4747729566094856, + "grad_norm": 0.0725766122341156, + "learning_rate": 8.753784056508579e-06, + "loss": 0.0003, + "step": 9810 + }, + { + "epoch": 2.4772956609485366, + "grad_norm": 0.00025911492411978543, + "learning_rate": 8.711738984191053e-06, + "loss": 0.0007, + "step": 9820 + }, + { + "epoch": 2.4798183652875885, + "grad_norm": 0.005303604993969202, + "learning_rate": 8.66969391187353e-06, + "loss": 0.0005, + "step": 9830 + }, + { + "epoch": 2.48234106962664, + "grad_norm": 0.00039688186370767653, + "learning_rate": 8.627648839556004e-06, + "loss": 0.0006, + "step": 9840 + }, + { + "epoch": 2.4848637739656914, + "grad_norm": 0.010475796647369862, + "learning_rate": 8.58560376723848e-06, + "loss": 0.0005, + "step": 9850 + }, + { + "epoch": 2.487386478304743, + "grad_norm": 0.0014096908271312714, + "learning_rate": 8.543558694920955e-06, + "loss": 0.0009, + "step": 9860 + }, + { + "epoch": 2.4899091826437942, + "grad_norm": 0.0016413936391472816, + "learning_rate": 8.501513622603432e-06, + "loss": 0.0011, + "step": 9870 + }, + { + "epoch": 2.4924318869828457, + "grad_norm": 0.00043976728920824826, + "learning_rate": 8.459468550285906e-06, + "loss": 0.0001, + "step": 9880 + }, + { + "epoch": 2.494954591321897, + "grad_norm": 0.00024035267415456474, + "learning_rate": 8.417423477968383e-06, + "loss": 0.0023, + "step": 9890 + }, + { + "epoch": 2.4974772956609486, + "grad_norm": 0.0011502320412546396, + "learning_rate": 8.375378405650857e-06, + "loss": 0.0006, + "step": 9900 + }, + { + "epoch": 2.4974772956609486, + "eval_loss": 0.00393605325371027, + "eval_runtime": 20.8485, + "eval_samples_per_second": 84.515, + "eval_steps_per_second": 21.153, + "step": 9900 + }, + { + "epoch": 2.5, + "grad_norm": 0.00509544787928462, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0002, + "step": 9910 + }, + { + "epoch": 2.5025227043390514, + "grad_norm": 5.717107706004754e-05, + "learning_rate": 8.291288261015808e-06, + "loss": 0.0001, + "step": 9920 + }, + { + "epoch": 2.505045408678103, + "grad_norm": 0.08509433269500732, + "learning_rate": 8.249243188698285e-06, + "loss": 0.0003, + "step": 9930 + }, + { + "epoch": 2.5075681130171543, + "grad_norm": 0.0029674407560378313, + "learning_rate": 8.207198116380761e-06, + "loss": 0.0003, + "step": 9940 + }, + { + "epoch": 2.5100908173562058, + "grad_norm": 0.0014874001499265432, + "learning_rate": 8.165153044063237e-06, + "loss": 0.0012, + "step": 9950 + }, + { + "epoch": 2.512613521695257, + "grad_norm": 0.0054718488827347755, + "learning_rate": 8.123107971745712e-06, + "loss": 0.0004, + "step": 9960 + }, + { + "epoch": 2.5151362260343086, + "grad_norm": 0.021269412711262703, + "learning_rate": 8.081062899428188e-06, + "loss": 0.0005, + "step": 9970 + }, + { + "epoch": 2.51765893037336, + "grad_norm": 0.00045099278213456273, + "learning_rate": 8.039017827110663e-06, + "loss": 0.0003, + "step": 9980 + }, + { + "epoch": 2.5201816347124115, + "grad_norm": 5.915598012506962e-05, + "learning_rate": 7.99697275479314e-06, + "loss": 0.0002, + "step": 9990 + }, + { + "epoch": 2.5227043390514634, + "grad_norm": 9.444829629501328e-05, + "learning_rate": 7.954927682475614e-06, + "loss": 0.0001, + "step": 10000 + }, + { + "epoch": 2.5227043390514634, + "eval_loss": 0.0038506174460053444, + "eval_runtime": 20.868, + "eval_samples_per_second": 84.436, + "eval_steps_per_second": 21.133, + "step": 10000 + }, + { + "epoch": 2.5252270433905144, + "grad_norm": 0.00031684929854236543, + "learning_rate": 7.91288261015809e-06, + "loss": 0.0, + "step": 10010 + }, + { + "epoch": 2.5277497477295663, + "grad_norm": 0.00030053374939598143, + "learning_rate": 7.870837537840565e-06, + "loss": 0.0002, + "step": 10020 + }, + { + "epoch": 2.5302724520686173, + "grad_norm": 0.07218382507562637, + "learning_rate": 7.828792465523042e-06, + "loss": 0.0002, + "step": 10030 + }, + { + "epoch": 2.532795156407669, + "grad_norm": 9.184365626424551e-05, + "learning_rate": 7.786747393205516e-06, + "loss": 0.0, + "step": 10040 + }, + { + "epoch": 2.5353178607467206, + "grad_norm": 0.00037872057873755693, + "learning_rate": 7.744702320887993e-06, + "loss": 0.0028, + "step": 10050 + }, + { + "epoch": 2.537840565085772, + "grad_norm": 0.00029898545471951365, + "learning_rate": 7.702657248570467e-06, + "loss": 0.0009, + "step": 10060 + }, + { + "epoch": 2.5403632694248235, + "grad_norm": 0.0002952404029201716, + "learning_rate": 7.660612176252944e-06, + "loss": 0.0001, + "step": 10070 + }, + { + "epoch": 2.542885973763875, + "grad_norm": 0.06380714476108551, + "learning_rate": 7.618567103935418e-06, + "loss": 0.0005, + "step": 10080 + }, + { + "epoch": 2.5454086781029264, + "grad_norm": 0.2544499933719635, + "learning_rate": 7.576522031617894e-06, + "loss": 0.0029, + "step": 10090 + }, + { + "epoch": 2.547931382441978, + "grad_norm": 0.0004265220195520669, + "learning_rate": 7.53447695930037e-06, + "loss": 0.0001, + "step": 10100 + }, + { + "epoch": 2.547931382441978, + "eval_loss": 0.00382298999466002, + "eval_runtime": 20.8338, + "eval_samples_per_second": 84.574, + "eval_steps_per_second": 21.168, + "step": 10100 + }, + { + "epoch": 2.5504540867810293, + "grad_norm": 0.00022531530703417957, + "learning_rate": 7.4924318869828465e-06, + "loss": 0.0005, + "step": 10110 + }, + { + "epoch": 2.5529767911200807, + "grad_norm": 0.0006108383531682193, + "learning_rate": 7.450386814665322e-06, + "loss": 0.0012, + "step": 10120 + }, + { + "epoch": 2.555499495459132, + "grad_norm": 0.007859915494918823, + "learning_rate": 7.4083417423477975e-06, + "loss": 0.0002, + "step": 10130 + }, + { + "epoch": 2.5580221997981836, + "grad_norm": 0.07120391726493835, + "learning_rate": 7.366296670030273e-06, + "loss": 0.0018, + "step": 10140 + }, + { + "epoch": 2.560544904137235, + "grad_norm": 3.911245221388526e-05, + "learning_rate": 7.3242515977127486e-06, + "loss": 0.0001, + "step": 10150 + }, + { + "epoch": 2.5630676084762865, + "grad_norm": 0.00011251613614149392, + "learning_rate": 7.282206525395224e-06, + "loss": 0.0003, + "step": 10160 + }, + { + "epoch": 2.565590312815338, + "grad_norm": 2.7329329896019772e-05, + "learning_rate": 7.2401614530777e-06, + "loss": 0.0, + "step": 10170 + }, + { + "epoch": 2.5681130171543893, + "grad_norm": 0.015592537820339203, + "learning_rate": 7.198116380760175e-06, + "loss": 0.0001, + "step": 10180 + }, + { + "epoch": 2.570635721493441, + "grad_norm": 0.004030313808470964, + "learning_rate": 7.156071308442651e-06, + "loss": 0.0006, + "step": 10190 + }, + { + "epoch": 2.573158425832492, + "grad_norm": 0.00028404564363881946, + "learning_rate": 7.114026236125126e-06, + "loss": 0.0011, + "step": 10200 + }, + { + "epoch": 2.573158425832492, + "eval_loss": 0.0038917113561183214, + "eval_runtime": 20.8627, + "eval_samples_per_second": 84.457, + "eval_steps_per_second": 21.138, + "step": 10200 + }, + { + "epoch": 2.575681130171544, + "grad_norm": 0.015596185810863972, + "learning_rate": 7.071981163807602e-06, + "loss": 0.0008, + "step": 10210 + }, + { + "epoch": 2.578203834510595, + "grad_norm": 0.000240606430452317, + "learning_rate": 7.029936091490077e-06, + "loss": 0.0003, + "step": 10220 + }, + { + "epoch": 2.580726538849647, + "grad_norm": 0.03709765151143074, + "learning_rate": 6.987891019172553e-06, + "loss": 0.0015, + "step": 10230 + }, + { + "epoch": 2.5832492431886984, + "grad_norm": 0.038254085928201675, + "learning_rate": 6.945845946855028e-06, + "loss": 0.0004, + "step": 10240 + }, + { + "epoch": 2.58577194752775, + "grad_norm": 0.0966513380408287, + "learning_rate": 6.903800874537504e-06, + "loss": 0.0012, + "step": 10250 + }, + { + "epoch": 2.5882946518668013, + "grad_norm": 0.002003374509513378, + "learning_rate": 6.861755802219979e-06, + "loss": 0.0015, + "step": 10260 + }, + { + "epoch": 2.5908173562058527, + "grad_norm": 0.2246997058391571, + "learning_rate": 6.8197107299024555e-06, + "loss": 0.0013, + "step": 10270 + }, + { + "epoch": 2.593340060544904, + "grad_norm": 0.046293605118989944, + "learning_rate": 6.777665657584932e-06, + "loss": 0.0013, + "step": 10280 + }, + { + "epoch": 2.5958627648839556, + "grad_norm": 0.04472287371754646, + "learning_rate": 6.735620585267407e-06, + "loss": 0.0024, + "step": 10290 + }, + { + "epoch": 2.598385469223007, + "grad_norm": 0.013854089193046093, + "learning_rate": 6.693575512949883e-06, + "loss": 0.0007, + "step": 10300 + }, + { + "epoch": 2.598385469223007, + "eval_loss": 0.003853140166029334, + "eval_runtime": 20.8889, + "eval_samples_per_second": 84.351, + "eval_steps_per_second": 21.112, + "step": 10300 + }, + { + "epoch": 2.6009081735620585, + "grad_norm": 6.33889067103155e-05, + "learning_rate": 6.6515304406323584e-06, + "loss": 0.0, + "step": 10310 + }, + { + "epoch": 2.60343087790111, + "grad_norm": 0.028591223061084747, + "learning_rate": 6.609485368314834e-06, + "loss": 0.0007, + "step": 10320 + }, + { + "epoch": 2.6059535822401614, + "grad_norm": 0.015871459618210793, + "learning_rate": 6.5674402959973095e-06, + "loss": 0.0008, + "step": 10330 + }, + { + "epoch": 2.608476286579213, + "grad_norm": 0.0005443996633403003, + "learning_rate": 6.525395223679785e-06, + "loss": 0.002, + "step": 10340 + }, + { + "epoch": 2.6109989909182643, + "grad_norm": 0.11164157837629318, + "learning_rate": 6.4833501513622605e-06, + "loss": 0.0005, + "step": 10350 + }, + { + "epoch": 2.6135216952573157, + "grad_norm": 0.005891601089388132, + "learning_rate": 6.441305079044736e-06, + "loss": 0.0002, + "step": 10360 + }, + { + "epoch": 2.616044399596367, + "grad_norm": 0.00020742563356179744, + "learning_rate": 6.3992600067272115e-06, + "loss": 0.0014, + "step": 10370 + }, + { + "epoch": 2.618567103935419, + "grad_norm": 6.0241254686843604e-05, + "learning_rate": 6.357214934409687e-06, + "loss": 0.0002, + "step": 10380 + }, + { + "epoch": 2.62108980827447, + "grad_norm": 0.00019894012075383216, + "learning_rate": 6.3151698620921625e-06, + "loss": 0.0005, + "step": 10390 + }, + { + "epoch": 2.623612512613522, + "grad_norm": 0.13243666291236877, + "learning_rate": 6.273124789774638e-06, + "loss": 0.001, + "step": 10400 + }, + { + "epoch": 2.623612512613522, + "eval_loss": 0.0038242663722485304, + "eval_runtime": 20.9018, + "eval_samples_per_second": 84.299, + "eval_steps_per_second": 21.099, + "step": 10400 + }, + { + "epoch": 2.626135216952573, + "grad_norm": 0.12121517211198807, + "learning_rate": 6.231079717457114e-06, + "loss": 0.0008, + "step": 10410 + }, + { + "epoch": 2.628657921291625, + "grad_norm": 0.008915661834180355, + "learning_rate": 6.18903464513959e-06, + "loss": 0.0, + "step": 10420 + }, + { + "epoch": 2.6311806256306762, + "grad_norm": 0.005935797467827797, + "learning_rate": 6.1469895728220654e-06, + "loss": 0.0002, + "step": 10430 + }, + { + "epoch": 2.6337033299697277, + "grad_norm": 0.030999109148979187, + "learning_rate": 6.104944500504541e-06, + "loss": 0.0002, + "step": 10440 + }, + { + "epoch": 2.636226034308779, + "grad_norm": 0.13484105467796326, + "learning_rate": 6.0628994281870165e-06, + "loss": 0.0005, + "step": 10450 + }, + { + "epoch": 2.6387487386478305, + "grad_norm": 0.0004957011551596224, + "learning_rate": 6.020854355869492e-06, + "loss": 0.0006, + "step": 10460 + }, + { + "epoch": 2.641271442986882, + "grad_norm": 0.12348122149705887, + "learning_rate": 5.9788092835519675e-06, + "loss": 0.0005, + "step": 10470 + }, + { + "epoch": 2.6437941473259334, + "grad_norm": 9.011943620862439e-05, + "learning_rate": 5.936764211234443e-06, + "loss": 0.0006, + "step": 10480 + }, + { + "epoch": 2.646316851664985, + "grad_norm": 0.00026155789964832366, + "learning_rate": 5.8947191389169185e-06, + "loss": 0.0008, + "step": 10490 + }, + { + "epoch": 2.6488395560040363, + "grad_norm": 0.0001088874414563179, + "learning_rate": 5.852674066599395e-06, + "loss": 0.0011, + "step": 10500 + }, + { + "epoch": 2.6488395560040363, + "eval_loss": 0.0038149829488247633, + "eval_runtime": 20.9282, + "eval_samples_per_second": 84.193, + "eval_steps_per_second": 21.072, + "step": 10500 + }, + { + "epoch": 2.6513622603430878, + "grad_norm": 4.513943349593319e-05, + "learning_rate": 5.81062899428187e-06, + "loss": 0.0008, + "step": 10510 + }, + { + "epoch": 2.653884964682139, + "grad_norm": 0.00011352117144269869, + "learning_rate": 5.768583921964346e-06, + "loss": 0.0004, + "step": 10520 + }, + { + "epoch": 2.6564076690211906, + "grad_norm": 8.327852265210822e-05, + "learning_rate": 5.726538849646821e-06, + "loss": 0.0003, + "step": 10530 + }, + { + "epoch": 2.658930373360242, + "grad_norm": 0.08921755105257034, + "learning_rate": 5.684493777329297e-06, + "loss": 0.0014, + "step": 10540 + }, + { + "epoch": 2.6614530776992935, + "grad_norm": 0.00016910290287341923, + "learning_rate": 5.642448705011772e-06, + "loss": 0.0001, + "step": 10550 + }, + { + "epoch": 2.663975782038345, + "grad_norm": 7.930315769044682e-05, + "learning_rate": 5.600403632694248e-06, + "loss": 0.0001, + "step": 10560 + }, + { + "epoch": 2.666498486377397, + "grad_norm": 9.457457781536505e-05, + "learning_rate": 5.5583585603767234e-06, + "loss": 0.0, + "step": 10570 + }, + { + "epoch": 2.669021190716448, + "grad_norm": 0.00021707512496504933, + "learning_rate": 5.5163134880592e-06, + "loss": 0.0003, + "step": 10580 + }, + { + "epoch": 2.6715438950554997, + "grad_norm": 0.001966067822650075, + "learning_rate": 5.474268415741675e-06, + "loss": 0.0007, + "step": 10590 + }, + { + "epoch": 2.6740665993945507, + "grad_norm": 0.004528351593762636, + "learning_rate": 5.432223343424151e-06, + "loss": 0.0005, + "step": 10600 + }, + { + "epoch": 2.6740665993945507, + "eval_loss": 0.003807534696534276, + "eval_runtime": 20.87, + "eval_samples_per_second": 84.427, + "eval_steps_per_second": 21.131, + "step": 10600 + }, + { + "epoch": 2.6765893037336026, + "grad_norm": 0.00017803607624955475, + "learning_rate": 5.390178271106626e-06, + "loss": 0.0005, + "step": 10610 + }, + { + "epoch": 2.679112008072654, + "grad_norm": 3.115162326139398e-05, + "learning_rate": 5.348133198789102e-06, + "loss": 0.0, + "step": 10620 + }, + { + "epoch": 2.6816347124117055, + "grad_norm": 0.07494215667247772, + "learning_rate": 5.306088126471577e-06, + "loss": 0.0008, + "step": 10630 + }, + { + "epoch": 2.684157416750757, + "grad_norm": 0.07868482917547226, + "learning_rate": 5.264043054154053e-06, + "loss": 0.0005, + "step": 10640 + }, + { + "epoch": 2.6866801210898084, + "grad_norm": 0.01250834483653307, + "learning_rate": 5.221997981836528e-06, + "loss": 0.0001, + "step": 10650 + }, + { + "epoch": 2.68920282542886, + "grad_norm": 0.10575691610574722, + "learning_rate": 5.179952909519004e-06, + "loss": 0.001, + "step": 10660 + }, + { + "epoch": 2.6917255297679112, + "grad_norm": 0.16051237285137177, + "learning_rate": 5.13790783720148e-06, + "loss": 0.0006, + "step": 10670 + }, + { + "epoch": 2.6942482341069627, + "grad_norm": 4.039399209432304e-05, + "learning_rate": 5.095862764883956e-06, + "loss": 0.0001, + "step": 10680 + }, + { + "epoch": 2.696770938446014, + "grad_norm": 9.023944585351273e-05, + "learning_rate": 5.053817692566431e-06, + "loss": 0.0005, + "step": 10690 + }, + { + "epoch": 2.6992936427850656, + "grad_norm": 0.20219027996063232, + "learning_rate": 5.011772620248907e-06, + "loss": 0.0003, + "step": 10700 + }, + { + "epoch": 2.6992936427850656, + "eval_loss": 0.0038071214221417904, + "eval_runtime": 20.8909, + "eval_samples_per_second": 84.343, + "eval_steps_per_second": 21.11, + "step": 10700 + }, + { + "epoch": 2.701816347124117, + "grad_norm": 0.12794020771980286, + "learning_rate": 4.969727547931382e-06, + "loss": 0.0014, + "step": 10710 + }, + { + "epoch": 2.7043390514631684, + "grad_norm": 0.000275536032859236, + "learning_rate": 4.927682475613858e-06, + "loss": 0.0006, + "step": 10720 + }, + { + "epoch": 2.70686175580222, + "grad_norm": 0.0002630538656376302, + "learning_rate": 4.885637403296333e-06, + "loss": 0.0016, + "step": 10730 + }, + { + "epoch": 2.7093844601412713, + "grad_norm": 0.042821742594242096, + "learning_rate": 4.843592330978809e-06, + "loss": 0.0001, + "step": 10740 + }, + { + "epoch": 2.7119071644803228, + "grad_norm": 0.10874561965465546, + "learning_rate": 4.801547258661285e-06, + "loss": 0.0006, + "step": 10750 + }, + { + "epoch": 2.714429868819374, + "grad_norm": 0.00025562438531778753, + "learning_rate": 4.759502186343761e-06, + "loss": 0.0001, + "step": 10760 + }, + { + "epoch": 2.7169525731584256, + "grad_norm": 0.006827104836702347, + "learning_rate": 4.717457114026236e-06, + "loss": 0.001, + "step": 10770 + }, + { + "epoch": 2.7194752774974775, + "grad_norm": 0.005648414604365826, + "learning_rate": 4.675412041708712e-06, + "loss": 0.0004, + "step": 10780 + }, + { + "epoch": 2.7219979818365285, + "grad_norm": 0.001025490928441286, + "learning_rate": 4.633366969391187e-06, + "loss": 0.0018, + "step": 10790 + }, + { + "epoch": 2.7245206861755804, + "grad_norm": 0.0006745181744918227, + "learning_rate": 4.591321897073663e-06, + "loss": 0.0002, + "step": 10800 + }, + { + "epoch": 2.7245206861755804, + "eval_loss": 0.0037325455341488123, + "eval_runtime": 20.8462, + "eval_samples_per_second": 84.524, + "eval_steps_per_second": 21.155, + "step": 10800 + }, + { + "epoch": 2.727043390514632, + "grad_norm": 0.0065965172834694386, + "learning_rate": 4.549276824756138e-06, + "loss": 0.0001, + "step": 10810 + }, + { + "epoch": 2.7295660948536833, + "grad_norm": 0.0002903965360019356, + "learning_rate": 4.507231752438614e-06, + "loss": 0.0003, + "step": 10820 + }, + { + "epoch": 2.7320887991927347, + "grad_norm": 0.16553114354610443, + "learning_rate": 4.465186680121089e-06, + "loss": 0.0006, + "step": 10830 + }, + { + "epoch": 2.734611503531786, + "grad_norm": 0.0074982005171477795, + "learning_rate": 4.423141607803566e-06, + "loss": 0.0001, + "step": 10840 + }, + { + "epoch": 2.7371342078708376, + "grad_norm": 0.0002544449525885284, + "learning_rate": 4.381096535486041e-06, + "loss": 0.0005, + "step": 10850 + }, + { + "epoch": 2.739656912209889, + "grad_norm": 0.07092459499835968, + "learning_rate": 4.339051463168517e-06, + "loss": 0.0005, + "step": 10860 + }, + { + "epoch": 2.7421796165489405, + "grad_norm": 0.03416803479194641, + "learning_rate": 4.297006390850992e-06, + "loss": 0.0004, + "step": 10870 + }, + { + "epoch": 2.744702320887992, + "grad_norm": 0.0010244250297546387, + "learning_rate": 4.254961318533468e-06, + "loss": 0.0003, + "step": 10880 + }, + { + "epoch": 2.7472250252270434, + "grad_norm": 0.0003882810124196112, + "learning_rate": 4.212916246215943e-06, + "loss": 0.0005, + "step": 10890 + }, + { + "epoch": 2.749747729566095, + "grad_norm": 0.0006057489081285894, + "learning_rate": 4.170871173898419e-06, + "loss": 0.0009, + "step": 10900 + }, + { + "epoch": 2.749747729566095, + "eval_loss": 0.0038078054785728455, + "eval_runtime": 20.8522, + "eval_samples_per_second": 84.499, + "eval_steps_per_second": 21.149, + "step": 10900 + }, + { + "epoch": 2.7522704339051463, + "grad_norm": 0.00148275145329535, + "learning_rate": 4.128826101580894e-06, + "loss": 0.0001, + "step": 10910 + }, + { + "epoch": 2.7547931382441977, + "grad_norm": 0.062449101358652115, + "learning_rate": 4.086781029263371e-06, + "loss": 0.0017, + "step": 10920 + }, + { + "epoch": 2.757315842583249, + "grad_norm": 0.011098313145339489, + "learning_rate": 4.044735956945846e-06, + "loss": 0.0003, + "step": 10930 + }, + { + "epoch": 2.7598385469223006, + "grad_norm": 0.0001871915883384645, + "learning_rate": 4.002690884628322e-06, + "loss": 0.0001, + "step": 10940 + }, + { + "epoch": 2.762361251261352, + "grad_norm": 0.00037126371171325445, + "learning_rate": 3.960645812310797e-06, + "loss": 0.0, + "step": 10950 + }, + { + "epoch": 2.7648839556004035, + "grad_norm": 0.0006047156057320535, + "learning_rate": 3.918600739993273e-06, + "loss": 0.0012, + "step": 10960 + }, + { + "epoch": 2.7674066599394553, + "grad_norm": 0.00014337015454657376, + "learning_rate": 3.876555667675748e-06, + "loss": 0.0004, + "step": 10970 + }, + { + "epoch": 2.7699293642785063, + "grad_norm": 0.12640614807605743, + "learning_rate": 3.834510595358224e-06, + "loss": 0.0026, + "step": 10980 + }, + { + "epoch": 2.772452068617558, + "grad_norm": 0.00037311791675165296, + "learning_rate": 3.7924655230406996e-06, + "loss": 0.0, + "step": 10990 + }, + { + "epoch": 2.774974772956609, + "grad_norm": 0.00015324597188737243, + "learning_rate": 3.750420450723175e-06, + "loss": 0.0009, + "step": 11000 + }, + { + "epoch": 2.774974772956609, + "eval_loss": 0.0037996473256498575, + "eval_runtime": 20.8643, + "eval_samples_per_second": 84.451, + "eval_steps_per_second": 21.137, + "step": 11000 + }, + { + "epoch": 2.777497477295661, + "grad_norm": 0.10045702010393143, + "learning_rate": 3.7083753784056515e-06, + "loss": 0.0002, + "step": 11010 + }, + { + "epoch": 2.7800201816347125, + "grad_norm": 0.00010195528011536226, + "learning_rate": 3.666330306088127e-06, + "loss": 0.0012, + "step": 11020 + }, + { + "epoch": 2.782542885973764, + "grad_norm": 0.00022041058400645852, + "learning_rate": 3.6242852337706025e-06, + "loss": 0.0002, + "step": 11030 + }, + { + "epoch": 2.7850655903128154, + "grad_norm": 0.18306997418403625, + "learning_rate": 3.582240161453078e-06, + "loss": 0.0016, + "step": 11040 + }, + { + "epoch": 2.787588294651867, + "grad_norm": 6.177197064971551e-05, + "learning_rate": 3.5401950891355535e-06, + "loss": 0.0, + "step": 11050 + }, + { + "epoch": 2.7901109989909183, + "grad_norm": 0.0007134904735721648, + "learning_rate": 3.498150016818029e-06, + "loss": 0.0001, + "step": 11060 + }, + { + "epoch": 2.7926337033299697, + "grad_norm": 0.026096561923623085, + "learning_rate": 3.4561049445005045e-06, + "loss": 0.0004, + "step": 11070 + }, + { + "epoch": 2.795156407669021, + "grad_norm": 0.0005445599090307951, + "learning_rate": 3.41405987218298e-06, + "loss": 0.0, + "step": 11080 + }, + { + "epoch": 2.7976791120080726, + "grad_norm": 0.00022800432634539902, + "learning_rate": 3.3720147998654564e-06, + "loss": 0.0, + "step": 11090 + }, + { + "epoch": 2.800201816347124, + "grad_norm": 0.00023530615726485848, + "learning_rate": 3.329969727547932e-06, + "loss": 0.0004, + "step": 11100 + }, + { + "epoch": 2.800201816347124, + "eval_loss": 0.003767798189073801, + "eval_runtime": 20.8684, + "eval_samples_per_second": 84.434, + "eval_steps_per_second": 21.132, + "step": 11100 + }, + { + "epoch": 2.8027245206861755, + "grad_norm": 9.487225906923413e-05, + "learning_rate": 3.2879246552304074e-06, + "loss": 0.0, + "step": 11110 + }, + { + "epoch": 2.805247225025227, + "grad_norm": 0.07703667134046555, + "learning_rate": 3.245879582912883e-06, + "loss": 0.0009, + "step": 11120 + }, + { + "epoch": 2.8077699293642784, + "grad_norm": 0.09232014417648315, + "learning_rate": 3.2038345105953585e-06, + "loss": 0.0003, + "step": 11130 + }, + { + "epoch": 2.81029263370333, + "grad_norm": 0.0033814776688814163, + "learning_rate": 3.161789438277834e-06, + "loss": 0.0022, + "step": 11140 + }, + { + "epoch": 2.8128153380423813, + "grad_norm": 0.003383078845217824, + "learning_rate": 3.11974436596031e-06, + "loss": 0.0018, + "step": 11150 + }, + { + "epoch": 2.815338042381433, + "grad_norm": 0.0075791082344949245, + "learning_rate": 3.0776992936427854e-06, + "loss": 0.0001, + "step": 11160 + }, + { + "epoch": 2.817860746720484, + "grad_norm": 0.0005082746502012014, + "learning_rate": 3.035654221325261e-06, + "loss": 0.0004, + "step": 11170 + }, + { + "epoch": 2.820383451059536, + "grad_norm": 1.3200211469666101e-05, + "learning_rate": 2.9936091490077364e-06, + "loss": 0.0003, + "step": 11180 + }, + { + "epoch": 2.822906155398587, + "grad_norm": 0.0003332770138513297, + "learning_rate": 2.951564076690212e-06, + "loss": 0.0, + "step": 11190 + }, + { + "epoch": 2.825428859737639, + "grad_norm": 0.15700918436050415, + "learning_rate": 2.909519004372688e-06, + "loss": 0.0019, + "step": 11200 + }, + { + "epoch": 2.825428859737639, + "eval_loss": 0.0037862148601561785, + "eval_runtime": 20.8605, + "eval_samples_per_second": 84.466, + "eval_steps_per_second": 21.14, + "step": 11200 + }, + { + "epoch": 2.8279515640766903, + "grad_norm": 0.00048214950948022306, + "learning_rate": 2.8674739320551634e-06, + "loss": 0.0001, + "step": 11210 + }, + { + "epoch": 2.830474268415742, + "grad_norm": 0.0002856640494428575, + "learning_rate": 2.825428859737639e-06, + "loss": 0.0009, + "step": 11220 + }, + { + "epoch": 2.8329969727547932, + "grad_norm": 0.00010505354293854907, + "learning_rate": 2.7833837874201144e-06, + "loss": 0.0001, + "step": 11230 + }, + { + "epoch": 2.8355196770938447, + "grad_norm": 0.0005396956112235785, + "learning_rate": 2.7413387151025904e-06, + "loss": 0.0006, + "step": 11240 + }, + { + "epoch": 2.838042381432896, + "grad_norm": 0.03256835415959358, + "learning_rate": 2.699293642785066e-06, + "loss": 0.0006, + "step": 11250 + }, + { + "epoch": 2.8405650857719476, + "grad_norm": 4.0616308979224414e-05, + "learning_rate": 2.6572485704675414e-06, + "loss": 0.0005, + "step": 11260 + }, + { + "epoch": 2.843087790110999, + "grad_norm": 0.09430497884750366, + "learning_rate": 2.615203498150017e-06, + "loss": 0.0003, + "step": 11270 + }, + { + "epoch": 2.8456104944500504, + "grad_norm": 0.0003849182394333184, + "learning_rate": 2.573158425832493e-06, + "loss": 0.0, + "step": 11280 + }, + { + "epoch": 2.848133198789102, + "grad_norm": 0.0575651191174984, + "learning_rate": 2.5311133535149683e-06, + "loss": 0.0001, + "step": 11290 + }, + { + "epoch": 2.8506559031281533, + "grad_norm": 0.0015473919920623302, + "learning_rate": 2.489068281197444e-06, + "loss": 0.0007, + "step": 11300 + }, + { + "epoch": 2.8506559031281533, + "eval_loss": 0.0037519715260714293, + "eval_runtime": 20.8758, + "eval_samples_per_second": 84.404, + "eval_steps_per_second": 21.125, + "step": 11300 + }, + { + "epoch": 2.8531786074672048, + "grad_norm": 0.000955607567448169, + "learning_rate": 2.4470232088799194e-06, + "loss": 0.0011, + "step": 11310 + }, + { + "epoch": 2.855701311806256, + "grad_norm": 4.5967324695084244e-05, + "learning_rate": 2.4049781365623953e-06, + "loss": 0.0003, + "step": 11320 + }, + { + "epoch": 2.8582240161453076, + "grad_norm": 0.0006287918658927083, + "learning_rate": 2.362933064244871e-06, + "loss": 0.0008, + "step": 11330 + }, + { + "epoch": 2.860746720484359, + "grad_norm": 0.0023038501385599375, + "learning_rate": 2.3208879919273463e-06, + "loss": 0.0002, + "step": 11340 + }, + { + "epoch": 2.863269424823411, + "grad_norm": 0.00017235818086192012, + "learning_rate": 2.278842919609822e-06, + "loss": 0.0001, + "step": 11350 + }, + { + "epoch": 2.865792129162462, + "grad_norm": 0.0003580110496841371, + "learning_rate": 2.2367978472922973e-06, + "loss": 0.0001, + "step": 11360 + }, + { + "epoch": 2.868314833501514, + "grad_norm": 0.0013928780099377036, + "learning_rate": 2.1947527749747733e-06, + "loss": 0.0003, + "step": 11370 + }, + { + "epoch": 2.870837537840565, + "grad_norm": 0.11936355382204056, + "learning_rate": 2.152707702657249e-06, + "loss": 0.001, + "step": 11380 + }, + { + "epoch": 2.8733602421796167, + "grad_norm": 0.0003929549129679799, + "learning_rate": 2.1106626303397243e-06, + "loss": 0.0, + "step": 11390 + }, + { + "epoch": 2.875882946518668, + "grad_norm": 0.00011808017734438181, + "learning_rate": 2.0686175580222e-06, + "loss": 0.001, + "step": 11400 + }, + { + "epoch": 2.875882946518668, + "eval_loss": 0.0037574958987534046, + "eval_runtime": 20.8424, + "eval_samples_per_second": 84.539, + "eval_steps_per_second": 21.159, + "step": 11400 + }, + { + "epoch": 2.8784056508577196, + "grad_norm": 0.1294584721326828, + "learning_rate": 2.0265724857046758e-06, + "loss": 0.0006, + "step": 11410 + }, + { + "epoch": 2.880928355196771, + "grad_norm": 0.0016598176443949342, + "learning_rate": 1.9845274133871513e-06, + "loss": 0.0009, + "step": 11420 + }, + { + "epoch": 2.8834510595358225, + "grad_norm": 1.5323972547776066e-05, + "learning_rate": 1.9424823410696268e-06, + "loss": 0.0, + "step": 11430 + }, + { + "epoch": 2.885973763874874, + "grad_norm": 0.000323007203405723, + "learning_rate": 1.9004372687521023e-06, + "loss": 0.0001, + "step": 11440 + }, + { + "epoch": 2.8884964682139254, + "grad_norm": 0.0010820828611031175, + "learning_rate": 1.858392196434578e-06, + "loss": 0.001, + "step": 11450 + }, + { + "epoch": 2.891019172552977, + "grad_norm": 0.009386632591485977, + "learning_rate": 1.8163471241170535e-06, + "loss": 0.0, + "step": 11460 + }, + { + "epoch": 2.8935418768920282, + "grad_norm": 0.005150569602847099, + "learning_rate": 1.7743020517995292e-06, + "loss": 0.0001, + "step": 11470 + }, + { + "epoch": 2.8960645812310797, + "grad_norm": 5.517240060726181e-05, + "learning_rate": 1.7322569794820048e-06, + "loss": 0.0011, + "step": 11480 + }, + { + "epoch": 2.898587285570131, + "grad_norm": 0.16206330060958862, + "learning_rate": 1.6902119071644805e-06, + "loss": 0.001, + "step": 11490 + }, + { + "epoch": 2.9011099899091826, + "grad_norm": 0.00011850109876831993, + "learning_rate": 1.648166834846956e-06, + "loss": 0.001, + "step": 11500 + }, + { + "epoch": 2.9011099899091826, + "eval_loss": 0.0037340966518968344, + "eval_runtime": 20.8481, + "eval_samples_per_second": 84.516, + "eval_steps_per_second": 21.153, + "step": 11500 + }, + { + "epoch": 2.903632694248234, + "grad_norm": 0.0002364334650337696, + "learning_rate": 1.6061217625294317e-06, + "loss": 0.0, + "step": 11510 + }, + { + "epoch": 2.9061553985872854, + "grad_norm": 0.005173501092940569, + "learning_rate": 1.5640766902119072e-06, + "loss": 0.0005, + "step": 11520 + }, + { + "epoch": 2.908678102926337, + "grad_norm": 0.08454468101263046, + "learning_rate": 1.522031617894383e-06, + "loss": 0.0001, + "step": 11530 + }, + { + "epoch": 2.9112008072653888, + "grad_norm": 0.018247609958052635, + "learning_rate": 1.4799865455768585e-06, + "loss": 0.0004, + "step": 11540 + }, + { + "epoch": 2.9137235116044398, + "grad_norm": 0.08170945197343826, + "learning_rate": 1.4379414732593342e-06, + "loss": 0.0005, + "step": 11550 + }, + { + "epoch": 2.9162462159434916, + "grad_norm": 8.340697240782902e-05, + "learning_rate": 1.3958964009418097e-06, + "loss": 0.0006, + "step": 11560 + }, + { + "epoch": 2.9187689202825426, + "grad_norm": 0.0001018949769786559, + "learning_rate": 1.3538513286242854e-06, + "loss": 0.0009, + "step": 11570 + }, + { + "epoch": 2.9212916246215945, + "grad_norm": 0.0001912551961140707, + "learning_rate": 1.311806256306761e-06, + "loss": 0.0002, + "step": 11580 + }, + { + "epoch": 2.923814328960646, + "grad_norm": 0.0019354906398802996, + "learning_rate": 1.2697611839892367e-06, + "loss": 0.0, + "step": 11590 + }, + { + "epoch": 2.9263370332996974, + "grad_norm": 0.05732259526848793, + "learning_rate": 1.2277161116717122e-06, + "loss": 0.0013, + "step": 11600 + }, + { + "epoch": 2.9263370332996974, + "eval_loss": 0.0037303089629858732, + "eval_runtime": 20.8696, + "eval_samples_per_second": 84.429, + "eval_steps_per_second": 21.131, + "step": 11600 + }, + { + "epoch": 2.928859737638749, + "grad_norm": 0.04633721709251404, + "learning_rate": 1.185671039354188e-06, + "loss": 0.0002, + "step": 11610 + }, + { + "epoch": 2.9313824419778003, + "grad_norm": 0.000182849689736031, + "learning_rate": 1.1436259670366634e-06, + "loss": 0.0, + "step": 11620 + }, + { + "epoch": 2.9339051463168517, + "grad_norm": 9.736415813677013e-05, + "learning_rate": 1.1015808947191391e-06, + "loss": 0.0002, + "step": 11630 + }, + { + "epoch": 2.936427850655903, + "grad_norm": 0.00046163774095475674, + "learning_rate": 1.0595358224016146e-06, + "loss": 0.0011, + "step": 11640 + }, + { + "epoch": 2.9389505549949546, + "grad_norm": 5.9953119489364326e-05, + "learning_rate": 1.0174907500840902e-06, + "loss": 0.0003, + "step": 11650 + }, + { + "epoch": 2.941473259334006, + "grad_norm": 0.00025017280131578445, + "learning_rate": 9.754456777665659e-07, + "loss": 0.0011, + "step": 11660 + }, + { + "epoch": 2.9439959636730575, + "grad_norm": 0.142095148563385, + "learning_rate": 9.334006054490415e-07, + "loss": 0.0005, + "step": 11670 + }, + { + "epoch": 2.946518668012109, + "grad_norm": 0.00014950388867873698, + "learning_rate": 8.91355533131517e-07, + "loss": 0.0011, + "step": 11680 + }, + { + "epoch": 2.9490413723511604, + "grad_norm": 7.325205660890788e-05, + "learning_rate": 8.493104608139925e-07, + "loss": 0.0014, + "step": 11690 + }, + { + "epoch": 2.951564076690212, + "grad_norm": 0.00012016925757052377, + "learning_rate": 8.072653884964682e-07, + "loss": 0.0014, + "step": 11700 + }, + { + "epoch": 2.951564076690212, + "eval_loss": 0.0037244223058223724, + "eval_runtime": 20.8534, + "eval_samples_per_second": 84.495, + "eval_steps_per_second": 21.148, + "step": 11700 + } + ], + "logging_steps": 10, + "max_steps": 11892, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}