{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9767911200807267, "eval_steps": 100, "global_step": 11800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025227043390514633, "grad_norm": 2.0247561931610107, "learning_rate": 4.995795492768248e-05, "loss": 3.4981, "step": 10 }, { "epoch": 0.005045408678102927, "grad_norm": 1.5878010988235474, "learning_rate": 4.991590985536496e-05, "loss": 0.2065, "step": 20 }, { "epoch": 0.0075681130171543895, "grad_norm": 1.1973843574523926, "learning_rate": 4.987386478304743e-05, "loss": 0.1051, "step": 30 }, { "epoch": 0.010090817356205853, "grad_norm": 1.2424113750457764, "learning_rate": 4.983181971072991e-05, "loss": 0.0835, "step": 40 }, { "epoch": 0.012613521695257316, "grad_norm": 0.5049477219581604, "learning_rate": 4.9789774638412376e-05, "loss": 0.0712, "step": 50 }, { "epoch": 0.015136226034308779, "grad_norm": 0.6395494937896729, "learning_rate": 4.974772956609486e-05, "loss": 0.0777, "step": 60 }, { "epoch": 0.017658930373360242, "grad_norm": 0.5573010444641113, "learning_rate": 4.970568449377733e-05, "loss": 0.0627, "step": 70 }, { "epoch": 0.020181634712411706, "grad_norm": 0.5474572777748108, "learning_rate": 4.9663639421459806e-05, "loss": 0.0553, "step": 80 }, { "epoch": 0.022704339051463168, "grad_norm": 0.4932677149772644, "learning_rate": 4.962159434914228e-05, "loss": 0.0455, "step": 90 }, { "epoch": 0.025227043390514632, "grad_norm": 0.4285220205783844, "learning_rate": 4.957954927682476e-05, "loss": 0.0403, "step": 100 }, { "epoch": 0.025227043390514632, "eval_loss": 0.04899341240525246, "eval_runtime": 21.0493, "eval_samples_per_second": 83.708, "eval_steps_per_second": 20.951, "step": 100 }, { "epoch": 0.027749747729566093, "grad_norm": 0.5163519978523254, "learning_rate": 4.953750420450724e-05, "loss": 0.0511, "step": 110 }, { "epoch": 0.030272452068617558, "grad_norm": 0.42003411054611206, "learning_rate": 4.949545913218971e-05, "loss": 0.0353, "step": 120 }, { "epoch": 0.03279515640766902, "grad_norm": 0.35371342301368713, "learning_rate": 4.945341405987218e-05, "loss": 0.035, "step": 130 }, { "epoch": 0.035317860746720484, "grad_norm": 0.4225226938724518, "learning_rate": 4.941136898755466e-05, "loss": 0.0368, "step": 140 }, { "epoch": 0.037840565085771945, "grad_norm": 0.5139334201812744, "learning_rate": 4.9369323915237136e-05, "loss": 0.0411, "step": 150 }, { "epoch": 0.04036326942482341, "grad_norm": 0.44391313195228577, "learning_rate": 4.932727884291961e-05, "loss": 0.0336, "step": 160 }, { "epoch": 0.042885973763874874, "grad_norm": 0.4110734462738037, "learning_rate": 4.928523377060209e-05, "loss": 0.0353, "step": 170 }, { "epoch": 0.045408678102926335, "grad_norm": 0.4065672755241394, "learning_rate": 4.924318869828457e-05, "loss": 0.0302, "step": 180 }, { "epoch": 0.0479313824419778, "grad_norm": 0.3082253336906433, "learning_rate": 4.920114362596704e-05, "loss": 0.0335, "step": 190 }, { "epoch": 0.050454086781029264, "grad_norm": 0.36431366205215454, "learning_rate": 4.9159098553649516e-05, "loss": 0.0359, "step": 200 }, { "epoch": 0.050454086781029264, "eval_loss": 0.03114187717437744, "eval_runtime": 20.9213, "eval_samples_per_second": 84.22, "eval_steps_per_second": 21.079, "step": 200 }, { "epoch": 0.052976791120080725, "grad_norm": 0.3635391294956207, "learning_rate": 4.911705348133199e-05, "loss": 0.0314, "step": 210 }, { "epoch": 0.055499495459132187, "grad_norm": 0.3692784905433655, "learning_rate": 4.9075008409014465e-05, "loss": 0.0355, "step": 220 }, { "epoch": 0.058022199798183655, "grad_norm": 0.3028589189052582, "learning_rate": 4.903296333669694e-05, "loss": 0.0331, "step": 230 }, { "epoch": 0.060544904137235116, "grad_norm": 0.31779277324676514, "learning_rate": 4.8990918264379415e-05, "loss": 0.0218, "step": 240 }, { "epoch": 0.06306760847628658, "grad_norm": 0.36319565773010254, "learning_rate": 4.8948873192061896e-05, "loss": 0.02, "step": 250 }, { "epoch": 0.06559031281533804, "grad_norm": 0.30493369698524475, "learning_rate": 4.890682811974437e-05, "loss": 0.0321, "step": 260 }, { "epoch": 0.0681130171543895, "grad_norm": 0.4685748815536499, "learning_rate": 4.8864783047426845e-05, "loss": 0.0273, "step": 270 }, { "epoch": 0.07063572149344097, "grad_norm": 0.45751261711120605, "learning_rate": 4.882273797510932e-05, "loss": 0.0293, "step": 280 }, { "epoch": 0.07315842583249244, "grad_norm": 0.3978378474712372, "learning_rate": 4.8780692902791795e-05, "loss": 0.0279, "step": 290 }, { "epoch": 0.07568113017154389, "grad_norm": 0.22009262442588806, "learning_rate": 4.873864783047427e-05, "loss": 0.0236, "step": 300 }, { "epoch": 0.07568113017154389, "eval_loss": 0.027077585458755493, "eval_runtime": 20.9736, "eval_samples_per_second": 84.01, "eval_steps_per_second": 21.026, "step": 300 }, { "epoch": 0.07820383451059536, "grad_norm": 0.4900023937225342, "learning_rate": 4.8696602758156744e-05, "loss": 0.0269, "step": 310 }, { "epoch": 0.08072653884964683, "grad_norm": 0.4146521985530853, "learning_rate": 4.865455768583922e-05, "loss": 0.0427, "step": 320 }, { "epoch": 0.08324924318869828, "grad_norm": 0.3285127580165863, "learning_rate": 4.86125126135217e-05, "loss": 0.0272, "step": 330 }, { "epoch": 0.08577194752774975, "grad_norm": 0.49365487694740295, "learning_rate": 4.8570467541204175e-05, "loss": 0.0263, "step": 340 }, { "epoch": 0.08829465186680122, "grad_norm": 0.3131512701511383, "learning_rate": 4.852842246888665e-05, "loss": 0.0193, "step": 350 }, { "epoch": 0.09081735620585267, "grad_norm": 0.29098790884017944, "learning_rate": 4.8486377396569124e-05, "loss": 0.0262, "step": 360 }, { "epoch": 0.09334006054490414, "grad_norm": 0.2346099615097046, "learning_rate": 4.84443323242516e-05, "loss": 0.0203, "step": 370 }, { "epoch": 0.0958627648839556, "grad_norm": 0.31069958209991455, "learning_rate": 4.8402287251934074e-05, "loss": 0.0199, "step": 380 }, { "epoch": 0.09838546922300706, "grad_norm": 0.36899533867836, "learning_rate": 4.836024217961655e-05, "loss": 0.0236, "step": 390 }, { "epoch": 0.10090817356205853, "grad_norm": 0.2813776135444641, "learning_rate": 4.831819710729903e-05, "loss": 0.019, "step": 400 }, { "epoch": 0.10090817356205853, "eval_loss": 0.02430903911590576, "eval_runtime": 20.9604, "eval_samples_per_second": 84.063, "eval_steps_per_second": 21.04, "step": 400 }, { "epoch": 0.10343087790111, "grad_norm": 0.4171730577945709, "learning_rate": 4.8276152034981504e-05, "loss": 0.0179, "step": 410 }, { "epoch": 0.10595358224016145, "grad_norm": 0.30979079008102417, "learning_rate": 4.823410696266398e-05, "loss": 0.0238, "step": 420 }, { "epoch": 0.10847628657921292, "grad_norm": 0.2565608620643616, "learning_rate": 4.8192061890346454e-05, "loss": 0.0216, "step": 430 }, { "epoch": 0.11099899091826437, "grad_norm": 0.2515753507614136, "learning_rate": 4.815001681802893e-05, "loss": 0.0187, "step": 440 }, { "epoch": 0.11352169525731584, "grad_norm": 0.49704504013061523, "learning_rate": 4.81079717457114e-05, "loss": 0.0276, "step": 450 }, { "epoch": 0.11604439959636731, "grad_norm": 0.215419739484787, "learning_rate": 4.806592667339388e-05, "loss": 0.0174, "step": 460 }, { "epoch": 0.11856710393541876, "grad_norm": 0.3299199044704437, "learning_rate": 4.802388160107635e-05, "loss": 0.0229, "step": 470 }, { "epoch": 0.12108980827447023, "grad_norm": 0.2970931828022003, "learning_rate": 4.7981836528758834e-05, "loss": 0.026, "step": 480 }, { "epoch": 0.1236125126135217, "grad_norm": 0.250384658575058, "learning_rate": 4.793979145644131e-05, "loss": 0.0189, "step": 490 }, { "epoch": 0.12613521695257315, "grad_norm": 0.2535875141620636, "learning_rate": 4.789774638412378e-05, "loss": 0.0202, "step": 500 }, { "epoch": 0.12613521695257315, "eval_loss": 0.023427454754710197, "eval_runtime": 20.8805, "eval_samples_per_second": 84.385, "eval_steps_per_second": 21.12, "step": 500 }, { "epoch": 0.12865792129162462, "grad_norm": 0.3403398096561432, "learning_rate": 4.7855701311806265e-05, "loss": 0.0217, "step": 510 }, { "epoch": 0.1311806256306761, "grad_norm": 0.3057946264743805, "learning_rate": 4.781365623948873e-05, "loss": 0.0191, "step": 520 }, { "epoch": 0.13370332996972756, "grad_norm": 0.25819921493530273, "learning_rate": 4.777161116717121e-05, "loss": 0.0193, "step": 530 }, { "epoch": 0.136226034308779, "grad_norm": 0.29612287878990173, "learning_rate": 4.772956609485368e-05, "loss": 0.0208, "step": 540 }, { "epoch": 0.13874873864783047, "grad_norm": 0.2910136580467224, "learning_rate": 4.768752102253616e-05, "loss": 0.0173, "step": 550 }, { "epoch": 0.14127144298688193, "grad_norm": 0.3340837359428406, "learning_rate": 4.764547595021864e-05, "loss": 0.0151, "step": 560 }, { "epoch": 0.1437941473259334, "grad_norm": 0.3069872260093689, "learning_rate": 4.760343087790111e-05, "loss": 0.0166, "step": 570 }, { "epoch": 0.14631685166498487, "grad_norm": 0.24297013878822327, "learning_rate": 4.756138580558359e-05, "loss": 0.0228, "step": 580 }, { "epoch": 0.14883955600403634, "grad_norm": 0.28086113929748535, "learning_rate": 4.751934073326607e-05, "loss": 0.0221, "step": 590 }, { "epoch": 0.15136226034308778, "grad_norm": 0.26318562030792236, "learning_rate": 4.7477295660948536e-05, "loss": 0.0281, "step": 600 }, { "epoch": 0.15136226034308778, "eval_loss": 0.02024998515844345, "eval_runtime": 20.8969, "eval_samples_per_second": 84.319, "eval_steps_per_second": 21.104, "step": 600 }, { "epoch": 0.15388496468213925, "grad_norm": 0.27950412034988403, "learning_rate": 4.743525058863101e-05, "loss": 0.0169, "step": 610 }, { "epoch": 0.15640766902119072, "grad_norm": 0.20953460037708282, "learning_rate": 4.7393205516313486e-05, "loss": 0.0213, "step": 620 }, { "epoch": 0.15893037336024218, "grad_norm": 0.2499765157699585, "learning_rate": 4.735116044399597e-05, "loss": 0.0191, "step": 630 }, { "epoch": 0.16145307769929365, "grad_norm": 0.3006986975669861, "learning_rate": 4.730911537167844e-05, "loss": 0.0269, "step": 640 }, { "epoch": 0.16397578203834512, "grad_norm": 0.24447965621948242, "learning_rate": 4.7267070299360917e-05, "loss": 0.0193, "step": 650 }, { "epoch": 0.16649848637739656, "grad_norm": 0.319516122341156, "learning_rate": 4.722502522704339e-05, "loss": 0.0222, "step": 660 }, { "epoch": 0.16902119071644803, "grad_norm": 0.30482372641563416, "learning_rate": 4.718298015472587e-05, "loss": 0.0165, "step": 670 }, { "epoch": 0.1715438950554995, "grad_norm": 0.18806371092796326, "learning_rate": 4.714093508240835e-05, "loss": 0.014, "step": 680 }, { "epoch": 0.17406659939455096, "grad_norm": 0.21826079487800598, "learning_rate": 4.7098890010090815e-05, "loss": 0.0192, "step": 690 }, { "epoch": 0.17658930373360243, "grad_norm": 0.2127252221107483, "learning_rate": 4.70568449377733e-05, "loss": 0.0142, "step": 700 }, { "epoch": 0.17658930373360243, "eval_loss": 0.018979934975504875, "eval_runtime": 20.8994, "eval_samples_per_second": 84.309, "eval_steps_per_second": 21.101, "step": 700 }, { "epoch": 0.17911200807265387, "grad_norm": 0.23581889271736145, "learning_rate": 4.701479986545577e-05, "loss": 0.0199, "step": 710 }, { "epoch": 0.18163471241170534, "grad_norm": 0.18842558562755585, "learning_rate": 4.6972754793138246e-05, "loss": 0.0194, "step": 720 }, { "epoch": 0.1841574167507568, "grad_norm": 0.29515010118484497, "learning_rate": 4.693070972082072e-05, "loss": 0.0299, "step": 730 }, { "epoch": 0.18668012108980828, "grad_norm": 0.27162402868270874, "learning_rate": 4.68886646485032e-05, "loss": 0.0227, "step": 740 }, { "epoch": 0.18920282542885974, "grad_norm": 0.18802249431610107, "learning_rate": 4.684661957618568e-05, "loss": 0.0169, "step": 750 }, { "epoch": 0.1917255297679112, "grad_norm": 0.34699660539627075, "learning_rate": 4.680457450386815e-05, "loss": 0.0159, "step": 760 }, { "epoch": 0.19424823410696265, "grad_norm": 0.3048790693283081, "learning_rate": 4.676252943155062e-05, "loss": 0.0178, "step": 770 }, { "epoch": 0.19677093844601412, "grad_norm": 0.2703554332256317, "learning_rate": 4.67204843592331e-05, "loss": 0.0136, "step": 780 }, { "epoch": 0.1992936427850656, "grad_norm": 0.18560905754566193, "learning_rate": 4.6678439286915575e-05, "loss": 0.0202, "step": 790 }, { "epoch": 0.20181634712411706, "grad_norm": 0.2768602967262268, "learning_rate": 4.663639421459805e-05, "loss": 0.0283, "step": 800 }, { "epoch": 0.20181634712411706, "eval_loss": 0.01911783590912819, "eval_runtime": 20.9061, "eval_samples_per_second": 84.282, "eval_steps_per_second": 21.094, "step": 800 }, { "epoch": 0.20433905146316853, "grad_norm": 0.18893299996852875, "learning_rate": 4.6594349142280525e-05, "loss": 0.0205, "step": 810 }, { "epoch": 0.20686175580222, "grad_norm": 0.24870939552783966, "learning_rate": 4.6552304069963006e-05, "loss": 0.0185, "step": 820 }, { "epoch": 0.20938446014127143, "grad_norm": 0.2561938464641571, "learning_rate": 4.651025899764548e-05, "loss": 0.0144, "step": 830 }, { "epoch": 0.2119071644803229, "grad_norm": 0.22478680312633514, "learning_rate": 4.6468213925327956e-05, "loss": 0.0213, "step": 840 }, { "epoch": 0.21442986881937437, "grad_norm": 0.30591025948524475, "learning_rate": 4.642616885301043e-05, "loss": 0.0129, "step": 850 }, { "epoch": 0.21695257315842584, "grad_norm": 0.21737821400165558, "learning_rate": 4.6384123780692905e-05, "loss": 0.0181, "step": 860 }, { "epoch": 0.2194752774974773, "grad_norm": 0.20260506868362427, "learning_rate": 4.634207870837538e-05, "loss": 0.0144, "step": 870 }, { "epoch": 0.22199798183652875, "grad_norm": 0.21997040510177612, "learning_rate": 4.6300033636057854e-05, "loss": 0.0215, "step": 880 }, { "epoch": 0.22452068617558021, "grad_norm": 0.2595633864402771, "learning_rate": 4.6257988563740336e-05, "loss": 0.0159, "step": 890 }, { "epoch": 0.22704339051463168, "grad_norm": 0.16551759839057922, "learning_rate": 4.621594349142281e-05, "loss": 0.0217, "step": 900 }, { "epoch": 0.22704339051463168, "eval_loss": 0.017788389697670937, "eval_runtime": 20.9214, "eval_samples_per_second": 84.22, "eval_steps_per_second": 21.079, "step": 900 }, { "epoch": 0.22956609485368315, "grad_norm": 0.27989163994789124, "learning_rate": 4.6173898419105285e-05, "loss": 0.0166, "step": 910 }, { "epoch": 0.23208879919273462, "grad_norm": 0.1843036413192749, "learning_rate": 4.613185334678776e-05, "loss": 0.0148, "step": 920 }, { "epoch": 0.2346115035317861, "grad_norm": 0.2792811691761017, "learning_rate": 4.6089808274470234e-05, "loss": 0.014, "step": 930 }, { "epoch": 0.23713420787083753, "grad_norm": 0.4182822108268738, "learning_rate": 4.604776320215271e-05, "loss": 0.013, "step": 940 }, { "epoch": 0.239656912209889, "grad_norm": 0.17877671122550964, "learning_rate": 4.6005718129835184e-05, "loss": 0.0207, "step": 950 }, { "epoch": 0.24217961654894046, "grad_norm": 0.21961210668087006, "learning_rate": 4.596367305751766e-05, "loss": 0.0168, "step": 960 }, { "epoch": 0.24470232088799193, "grad_norm": 0.12489340454339981, "learning_rate": 4.592162798520014e-05, "loss": 0.0177, "step": 970 }, { "epoch": 0.2472250252270434, "grad_norm": 0.24905265867710114, "learning_rate": 4.5879582912882614e-05, "loss": 0.0126, "step": 980 }, { "epoch": 0.24974772956609487, "grad_norm": 0.14141976833343506, "learning_rate": 4.583753784056509e-05, "loss": 0.0102, "step": 990 }, { "epoch": 0.2522704339051463, "grad_norm": 0.19035248458385468, "learning_rate": 4.5795492768247564e-05, "loss": 0.0143, "step": 1000 }, { "epoch": 0.2522704339051463, "eval_loss": 0.016077525913715363, "eval_runtime": 21.0175, "eval_samples_per_second": 83.835, "eval_steps_per_second": 20.983, "step": 1000 }, { "epoch": 0.2547931382441978, "grad_norm": 0.2033461332321167, "learning_rate": 4.575344769593004e-05, "loss": 0.0172, "step": 1010 }, { "epoch": 0.25731584258324924, "grad_norm": 0.17932486534118652, "learning_rate": 4.571140262361251e-05, "loss": 0.0223, "step": 1020 }, { "epoch": 0.2598385469223007, "grad_norm": 0.16702575981616974, "learning_rate": 4.566935755129499e-05, "loss": 0.0117, "step": 1030 }, { "epoch": 0.2623612512613522, "grad_norm": 0.24906021356582642, "learning_rate": 4.562731247897747e-05, "loss": 0.0136, "step": 1040 }, { "epoch": 0.2648839556004036, "grad_norm": 0.2807481586933136, "learning_rate": 4.5585267406659944e-05, "loss": 0.0161, "step": 1050 }, { "epoch": 0.2674066599394551, "grad_norm": 0.25573644042015076, "learning_rate": 4.554322233434242e-05, "loss": 0.0161, "step": 1060 }, { "epoch": 0.26992936427850656, "grad_norm": 0.20996974408626556, "learning_rate": 4.550117726202489e-05, "loss": 0.0099, "step": 1070 }, { "epoch": 0.272452068617558, "grad_norm": 0.18074114620685577, "learning_rate": 4.545913218970737e-05, "loss": 0.0142, "step": 1080 }, { "epoch": 0.2749747729566095, "grad_norm": 0.11202214658260345, "learning_rate": 4.541708711738984e-05, "loss": 0.0148, "step": 1090 }, { "epoch": 0.27749747729566093, "grad_norm": 1.3392621278762817, "learning_rate": 4.537504204507232e-05, "loss": 0.028, "step": 1100 }, { "epoch": 0.27749747729566093, "eval_loss": 0.015062345191836357, "eval_runtime": 20.9469, "eval_samples_per_second": 84.117, "eval_steps_per_second": 21.053, "step": 1100 }, { "epoch": 0.28002018163471243, "grad_norm": 0.26927411556243896, "learning_rate": 4.533299697275479e-05, "loss": 0.0149, "step": 1110 }, { "epoch": 0.28254288597376387, "grad_norm": 0.25918543338775635, "learning_rate": 4.529095190043727e-05, "loss": 0.0157, "step": 1120 }, { "epoch": 0.28506559031281536, "grad_norm": 0.10137899965047836, "learning_rate": 4.524890682811975e-05, "loss": 0.0117, "step": 1130 }, { "epoch": 0.2875882946518668, "grad_norm": 0.1916513592004776, "learning_rate": 4.520686175580222e-05, "loss": 0.0176, "step": 1140 }, { "epoch": 0.29011099899091825, "grad_norm": 0.3005896210670471, "learning_rate": 4.51648166834847e-05, "loss": 0.0122, "step": 1150 }, { "epoch": 0.29263370332996974, "grad_norm": 0.24127791821956635, "learning_rate": 4.512277161116717e-05, "loss": 0.0108, "step": 1160 }, { "epoch": 0.2951564076690212, "grad_norm": 0.28272244334220886, "learning_rate": 4.5080726538849647e-05, "loss": 0.0119, "step": 1170 }, { "epoch": 0.2976791120080727, "grad_norm": 0.36542513966560364, "learning_rate": 4.503868146653212e-05, "loss": 0.0122, "step": 1180 }, { "epoch": 0.3002018163471241, "grad_norm": 0.26852190494537354, "learning_rate": 4.49966363942146e-05, "loss": 0.0162, "step": 1190 }, { "epoch": 0.30272452068617556, "grad_norm": 0.26203736662864685, "learning_rate": 4.495459132189708e-05, "loss": 0.0125, "step": 1200 }, { "epoch": 0.30272452068617556, "eval_loss": 0.0145474998280406, "eval_runtime": 20.9732, "eval_samples_per_second": 84.012, "eval_steps_per_second": 21.027, "step": 1200 }, { "epoch": 0.30524722502522705, "grad_norm": 0.31206783652305603, "learning_rate": 4.491254624957955e-05, "loss": 0.0241, "step": 1210 }, { "epoch": 0.3077699293642785, "grad_norm": 0.17130957543849945, "learning_rate": 4.487050117726203e-05, "loss": 0.0174, "step": 1220 }, { "epoch": 0.31029263370333, "grad_norm": 0.3070640563964844, "learning_rate": 4.482845610494451e-05, "loss": 0.023, "step": 1230 }, { "epoch": 0.31281533804238143, "grad_norm": 0.5285329818725586, "learning_rate": 4.4786411032626976e-05, "loss": 0.0115, "step": 1240 }, { "epoch": 0.31533804238143287, "grad_norm": 0.21449489891529083, "learning_rate": 4.474436596030945e-05, "loss": 0.0205, "step": 1250 }, { "epoch": 0.31786074672048437, "grad_norm": 0.18218982219696045, "learning_rate": 4.4702320887991925e-05, "loss": 0.0062, "step": 1260 }, { "epoch": 0.3203834510595358, "grad_norm": 0.03409017622470856, "learning_rate": 4.466027581567441e-05, "loss": 0.02, "step": 1270 }, { "epoch": 0.3229061553985873, "grad_norm": 0.2536049783229828, "learning_rate": 4.461823074335688e-05, "loss": 0.0146, "step": 1280 }, { "epoch": 0.32542885973763874, "grad_norm": 0.17619676887989044, "learning_rate": 4.4576185671039356e-05, "loss": 0.0074, "step": 1290 }, { "epoch": 0.32795156407669024, "grad_norm": 0.1441410630941391, "learning_rate": 4.453414059872183e-05, "loss": 0.013, "step": 1300 }, { "epoch": 0.32795156407669024, "eval_loss": 0.012744620442390442, "eval_runtime": 21.0058, "eval_samples_per_second": 83.882, "eval_steps_per_second": 20.994, "step": 1300 }, { "epoch": 0.3304742684157417, "grad_norm": 0.18683987855911255, "learning_rate": 4.449209552640431e-05, "loss": 0.0119, "step": 1310 }, { "epoch": 0.3329969727547931, "grad_norm": 0.16165736317634583, "learning_rate": 4.445005045408678e-05, "loss": 0.0113, "step": 1320 }, { "epoch": 0.3355196770938446, "grad_norm": 0.1178312599658966, "learning_rate": 4.4408005381769255e-05, "loss": 0.0113, "step": 1330 }, { "epoch": 0.33804238143289606, "grad_norm": 0.1859322488307953, "learning_rate": 4.436596030945173e-05, "loss": 0.0104, "step": 1340 }, { "epoch": 0.34056508577194755, "grad_norm": 0.49083656072616577, "learning_rate": 4.432391523713421e-05, "loss": 0.0095, "step": 1350 }, { "epoch": 0.343087790110999, "grad_norm": 0.14915814995765686, "learning_rate": 4.4281870164816686e-05, "loss": 0.013, "step": 1360 }, { "epoch": 0.34561049445005043, "grad_norm": 0.16166740655899048, "learning_rate": 4.423982509249916e-05, "loss": 0.0104, "step": 1370 }, { "epoch": 0.3481331987891019, "grad_norm": 0.19710753858089447, "learning_rate": 4.419778002018164e-05, "loss": 0.0094, "step": 1380 }, { "epoch": 0.35065590312815337, "grad_norm": 0.12713222205638885, "learning_rate": 4.4155734947864116e-05, "loss": 0.0122, "step": 1390 }, { "epoch": 0.35317860746720486, "grad_norm": 0.11732326447963715, "learning_rate": 4.411368987554659e-05, "loss": 0.0127, "step": 1400 }, { "epoch": 0.35317860746720486, "eval_loss": 0.010630101896822453, "eval_runtime": 20.9885, "eval_samples_per_second": 83.951, "eval_steps_per_second": 21.012, "step": 1400 }, { "epoch": 0.3557013118062563, "grad_norm": 0.16016925871372223, "learning_rate": 4.407164480322906e-05, "loss": 0.0035, "step": 1410 }, { "epoch": 0.35822401614530774, "grad_norm": 0.11872086673974991, "learning_rate": 4.402959973091154e-05, "loss": 0.0049, "step": 1420 }, { "epoch": 0.36074672048435924, "grad_norm": 0.15516729652881622, "learning_rate": 4.3987554658594015e-05, "loss": 0.0144, "step": 1430 }, { "epoch": 0.3632694248234107, "grad_norm": 0.18500037491321564, "learning_rate": 4.394550958627649e-05, "loss": 0.0113, "step": 1440 }, { "epoch": 0.3657921291624622, "grad_norm": 0.17393891513347626, "learning_rate": 4.3903464513958964e-05, "loss": 0.0102, "step": 1450 }, { "epoch": 0.3683148335015136, "grad_norm": 0.24622410535812378, "learning_rate": 4.3861419441641446e-05, "loss": 0.0099, "step": 1460 }, { "epoch": 0.3708375378405651, "grad_norm": 0.18613703548908234, "learning_rate": 4.381937436932392e-05, "loss": 0.0111, "step": 1470 }, { "epoch": 0.37336024217961655, "grad_norm": 0.23599150776863098, "learning_rate": 4.3777329297006395e-05, "loss": 0.0152, "step": 1480 }, { "epoch": 0.375882946518668, "grad_norm": 0.08963891863822937, "learning_rate": 4.373528422468886e-05, "loss": 0.0156, "step": 1490 }, { "epoch": 0.3784056508577195, "grad_norm": 0.26133468747138977, "learning_rate": 4.3693239152371344e-05, "loss": 0.0185, "step": 1500 }, { "epoch": 0.3784056508577195, "eval_loss": 0.010692655108869076, "eval_runtime": 21.0428, "eval_samples_per_second": 83.734, "eval_steps_per_second": 20.957, "step": 1500 }, { "epoch": 0.38092835519677093, "grad_norm": 0.07590801268815994, "learning_rate": 4.365119408005382e-05, "loss": 0.0102, "step": 1510 }, { "epoch": 0.3834510595358224, "grad_norm": 0.047652024775743484, "learning_rate": 4.3609149007736294e-05, "loss": 0.0095, "step": 1520 }, { "epoch": 0.38597376387487387, "grad_norm": 0.23399275541305542, "learning_rate": 4.3567103935418775e-05, "loss": 0.0089, "step": 1530 }, { "epoch": 0.3884964682139253, "grad_norm": 0.2155078798532486, "learning_rate": 4.352505886310125e-05, "loss": 0.0115, "step": 1540 }, { "epoch": 0.3910191725529768, "grad_norm": 0.09053190052509308, "learning_rate": 4.3483013790783725e-05, "loss": 0.0088, "step": 1550 }, { "epoch": 0.39354187689202824, "grad_norm": 0.2110535055398941, "learning_rate": 4.34409687184662e-05, "loss": 0.0098, "step": 1560 }, { "epoch": 0.39606458123107974, "grad_norm": 0.1765887439250946, "learning_rate": 4.3398923646148674e-05, "loss": 0.0072, "step": 1570 }, { "epoch": 0.3985872855701312, "grad_norm": 0.3545493483543396, "learning_rate": 4.335687857383115e-05, "loss": 0.0132, "step": 1580 }, { "epoch": 0.4011099899091826, "grad_norm": 0.06623344123363495, "learning_rate": 4.331483350151362e-05, "loss": 0.0069, "step": 1590 }, { "epoch": 0.4036326942482341, "grad_norm": 0.16485914587974548, "learning_rate": 4.32727884291961e-05, "loss": 0.007, "step": 1600 }, { "epoch": 0.4036326942482341, "eval_loss": 0.01017470471560955, "eval_runtime": 20.8918, "eval_samples_per_second": 84.339, "eval_steps_per_second": 21.109, "step": 1600 }, { "epoch": 0.40615539858728555, "grad_norm": 0.15467578172683716, "learning_rate": 4.323074335687858e-05, "loss": 0.0081, "step": 1610 }, { "epoch": 0.40867810292633705, "grad_norm": 0.2580385208129883, "learning_rate": 4.3188698284561054e-05, "loss": 0.0112, "step": 1620 }, { "epoch": 0.4112008072653885, "grad_norm": 0.010637140832841396, "learning_rate": 4.314665321224353e-05, "loss": 0.008, "step": 1630 }, { "epoch": 0.41372351160444, "grad_norm": 0.26677659153938293, "learning_rate": 4.3104608139926e-05, "loss": 0.0094, "step": 1640 }, { "epoch": 0.4162462159434914, "grad_norm": 0.2677707374095917, "learning_rate": 4.306256306760848e-05, "loss": 0.0127, "step": 1650 }, { "epoch": 0.41876892028254287, "grad_norm": 0.13589175045490265, "learning_rate": 4.302051799529095e-05, "loss": 0.0056, "step": 1660 }, { "epoch": 0.42129162462159436, "grad_norm": 0.10289867222309113, "learning_rate": 4.297847292297343e-05, "loss": 0.0109, "step": 1670 }, { "epoch": 0.4238143289606458, "grad_norm": 0.07062846422195435, "learning_rate": 4.29364278506559e-05, "loss": 0.0048, "step": 1680 }, { "epoch": 0.4263370332996973, "grad_norm": 0.16123530268669128, "learning_rate": 4.289438277833838e-05, "loss": 0.0052, "step": 1690 }, { "epoch": 0.42885973763874874, "grad_norm": 0.13397027552127838, "learning_rate": 4.285233770602086e-05, "loss": 0.0048, "step": 1700 }, { "epoch": 0.42885973763874874, "eval_loss": 0.010062881745398045, "eval_runtime": 20.9947, "eval_samples_per_second": 83.926, "eval_steps_per_second": 21.005, "step": 1700 }, { "epoch": 0.4313824419778002, "grad_norm": 0.04028566554188728, "learning_rate": 4.281029263370333e-05, "loss": 0.0101, "step": 1710 }, { "epoch": 0.4339051463168517, "grad_norm": 0.06560038775205612, "learning_rate": 4.276824756138581e-05, "loss": 0.0033, "step": 1720 }, { "epoch": 0.4364278506559031, "grad_norm": 0.16810742020606995, "learning_rate": 4.272620248906828e-05, "loss": 0.0054, "step": 1730 }, { "epoch": 0.4389505549949546, "grad_norm": 0.015353145077824593, "learning_rate": 4.268415741675076e-05, "loss": 0.0058, "step": 1740 }, { "epoch": 0.44147325933400605, "grad_norm": 0.2716507911682129, "learning_rate": 4.264211234443323e-05, "loss": 0.0114, "step": 1750 }, { "epoch": 0.4439959636730575, "grad_norm": 0.10725341737270355, "learning_rate": 4.260006727211571e-05, "loss": 0.0095, "step": 1760 }, { "epoch": 0.446518668012109, "grad_norm": 0.21090877056121826, "learning_rate": 4.255802219979819e-05, "loss": 0.0171, "step": 1770 }, { "epoch": 0.44904137235116043, "grad_norm": 0.08791640400886536, "learning_rate": 4.251597712748066e-05, "loss": 0.0111, "step": 1780 }, { "epoch": 0.4515640766902119, "grad_norm": 0.29180845618247986, "learning_rate": 4.247393205516314e-05, "loss": 0.0093, "step": 1790 }, { "epoch": 0.45408678102926336, "grad_norm": 0.21628066897392273, "learning_rate": 4.243188698284561e-05, "loss": 0.0056, "step": 1800 }, { "epoch": 0.45408678102926336, "eval_loss": 0.010144516825675964, "eval_runtime": 20.9537, "eval_samples_per_second": 84.09, "eval_steps_per_second": 21.046, "step": 1800 }, { "epoch": 0.45660948536831486, "grad_norm": 0.17846959829330444, "learning_rate": 4.2389841910528086e-05, "loss": 0.008, "step": 1810 }, { "epoch": 0.4591321897073663, "grad_norm": 0.18932151794433594, "learning_rate": 4.234779683821056e-05, "loss": 0.0098, "step": 1820 }, { "epoch": 0.46165489404641774, "grad_norm": 0.005480750929564238, "learning_rate": 4.2305751765893035e-05, "loss": 0.0099, "step": 1830 }, { "epoch": 0.46417759838546924, "grad_norm": 0.02110099606215954, "learning_rate": 4.226370669357552e-05, "loss": 0.0089, "step": 1840 }, { "epoch": 0.4667003027245207, "grad_norm": 0.12439311295747757, "learning_rate": 4.222166162125799e-05, "loss": 0.006, "step": 1850 }, { "epoch": 0.4692230070635722, "grad_norm": 0.12683548033237457, "learning_rate": 4.2179616548940466e-05, "loss": 0.0062, "step": 1860 }, { "epoch": 0.4717457114026236, "grad_norm": 0.10005199909210205, "learning_rate": 4.213757147662295e-05, "loss": 0.0064, "step": 1870 }, { "epoch": 0.47426841574167505, "grad_norm": 0.101644366979599, "learning_rate": 4.2095526404305416e-05, "loss": 0.0076, "step": 1880 }, { "epoch": 0.47679112008072655, "grad_norm": 0.09989798069000244, "learning_rate": 4.205348133198789e-05, "loss": 0.0048, "step": 1890 }, { "epoch": 0.479313824419778, "grad_norm": 0.12589283287525177, "learning_rate": 4.2011436259670365e-05, "loss": 0.008, "step": 1900 }, { "epoch": 0.479313824419778, "eval_loss": 0.00869656726717949, "eval_runtime": 20.8804, "eval_samples_per_second": 84.385, "eval_steps_per_second": 21.12, "step": 1900 }, { "epoch": 0.4818365287588295, "grad_norm": 0.3338044583797455, "learning_rate": 4.1969391187352846e-05, "loss": 0.014, "step": 1910 }, { "epoch": 0.4843592330978809, "grad_norm": 0.1185823529958725, "learning_rate": 4.192734611503532e-05, "loss": 0.0072, "step": 1920 }, { "epoch": 0.48688193743693237, "grad_norm": 0.2536753714084625, "learning_rate": 4.1885301042717796e-05, "loss": 0.0177, "step": 1930 }, { "epoch": 0.48940464177598386, "grad_norm": 0.1340733915567398, "learning_rate": 4.184325597040027e-05, "loss": 0.0052, "step": 1940 }, { "epoch": 0.4919273461150353, "grad_norm": 0.09943121671676636, "learning_rate": 4.180121089808275e-05, "loss": 0.0054, "step": 1950 }, { "epoch": 0.4944500504540868, "grad_norm": 0.17324581742286682, "learning_rate": 4.175916582576522e-05, "loss": 0.0061, "step": 1960 }, { "epoch": 0.49697275479313824, "grad_norm": 0.027863750234246254, "learning_rate": 4.1717120753447694e-05, "loss": 0.0093, "step": 1970 }, { "epoch": 0.49949545913218973, "grad_norm": 0.016479160636663437, "learning_rate": 4.167507568113017e-05, "loss": 0.0036, "step": 1980 }, { "epoch": 0.5020181634712412, "grad_norm": 0.06331757456064224, "learning_rate": 4.163303060881265e-05, "loss": 0.0074, "step": 1990 }, { "epoch": 0.5045408678102926, "grad_norm": 0.028800033032894135, "learning_rate": 4.1590985536495125e-05, "loss": 0.0092, "step": 2000 }, { "epoch": 0.5045408678102926, "eval_loss": 0.008615074679255486, "eval_runtime": 20.9704, "eval_samples_per_second": 84.023, "eval_steps_per_second": 21.03, "step": 2000 }, { "epoch": 0.507063572149344, "grad_norm": 0.11708183586597443, "learning_rate": 4.15489404641776e-05, "loss": 0.0028, "step": 2010 }, { "epoch": 0.5095862764883956, "grad_norm": 0.0794278159737587, "learning_rate": 4.1506895391860074e-05, "loss": 0.0097, "step": 2020 }, { "epoch": 0.512108980827447, "grad_norm": 0.06485351175069809, "learning_rate": 4.1464850319542556e-05, "loss": 0.0076, "step": 2030 }, { "epoch": 0.5146316851664985, "grad_norm": 0.15527907013893127, "learning_rate": 4.142280524722503e-05, "loss": 0.013, "step": 2040 }, { "epoch": 0.5171543895055499, "grad_norm": 0.20114894211292267, "learning_rate": 4.13807601749075e-05, "loss": 0.0117, "step": 2050 }, { "epoch": 0.5196770938446014, "grad_norm": 0.12151603400707245, "learning_rate": 4.133871510258998e-05, "loss": 0.015, "step": 2060 }, { "epoch": 0.5221997981836529, "grad_norm": 0.055018067359924316, "learning_rate": 4.1296670030272455e-05, "loss": 0.0079, "step": 2070 }, { "epoch": 0.5247225025227044, "grad_norm": 0.16336438059806824, "learning_rate": 4.125462495795493e-05, "loss": 0.0072, "step": 2080 }, { "epoch": 0.5272452068617558, "grad_norm": 0.2550767660140991, "learning_rate": 4.1212579885637404e-05, "loss": 0.013, "step": 2090 }, { "epoch": 0.5297679112008072, "grad_norm": 0.040902867913246155, "learning_rate": 4.1170534813319885e-05, "loss": 0.0033, "step": 2100 }, { "epoch": 0.5297679112008072, "eval_loss": 0.00901652593165636, "eval_runtime": 21.0472, "eval_samples_per_second": 83.717, "eval_steps_per_second": 20.953, "step": 2100 }, { "epoch": 0.5322906155398587, "grad_norm": 0.03393733501434326, "learning_rate": 4.112848974100236e-05, "loss": 0.0107, "step": 2110 }, { "epoch": 0.5348133198789102, "grad_norm": 0.10513912886381149, "learning_rate": 4.1086444668684835e-05, "loss": 0.0069, "step": 2120 }, { "epoch": 0.5373360242179617, "grad_norm": 0.012084727175533772, "learning_rate": 4.10443995963673e-05, "loss": 0.0082, "step": 2130 }, { "epoch": 0.5398587285570131, "grad_norm": 0.07994495332241058, "learning_rate": 4.1002354524049784e-05, "loss": 0.0109, "step": 2140 }, { "epoch": 0.5423814328960646, "grad_norm": 0.13017794489860535, "learning_rate": 4.096030945173226e-05, "loss": 0.0041, "step": 2150 }, { "epoch": 0.544904137235116, "grad_norm": 0.0023468900471925735, "learning_rate": 4.091826437941473e-05, "loss": 0.0085, "step": 2160 }, { "epoch": 0.5474268415741675, "grad_norm": 0.061518047004938126, "learning_rate": 4.087621930709721e-05, "loss": 0.011, "step": 2170 }, { "epoch": 0.549949545913219, "grad_norm": 0.07088392227888107, "learning_rate": 4.083417423477969e-05, "loss": 0.0056, "step": 2180 }, { "epoch": 0.5524722502522704, "grad_norm": 0.09153332561254501, "learning_rate": 4.0792129162462164e-05, "loss": 0.0059, "step": 2190 }, { "epoch": 0.5549949545913219, "grad_norm": 0.15585757791996002, "learning_rate": 4.075008409014464e-05, "loss": 0.0041, "step": 2200 }, { "epoch": 0.5549949545913219, "eval_loss": 0.008859611116349697, "eval_runtime": 20.9351, "eval_samples_per_second": 84.165, "eval_steps_per_second": 21.065, "step": 2200 }, { "epoch": 0.5575176589303733, "grad_norm": 0.14012502133846283, "learning_rate": 4.070803901782711e-05, "loss": 0.0052, "step": 2210 }, { "epoch": 0.5600403632694249, "grad_norm": 0.18286050856113434, "learning_rate": 4.066599394550959e-05, "loss": 0.0117, "step": 2220 }, { "epoch": 0.5625630676084763, "grad_norm": 0.12133604288101196, "learning_rate": 4.062394887319206e-05, "loss": 0.0064, "step": 2230 }, { "epoch": 0.5650857719475277, "grad_norm": 0.006398872472345829, "learning_rate": 4.058190380087454e-05, "loss": 0.0032, "step": 2240 }, { "epoch": 0.5676084762865792, "grad_norm": 0.0005545477033592761, "learning_rate": 4.053985872855702e-05, "loss": 0.004, "step": 2250 }, { "epoch": 0.5701311806256307, "grad_norm": 0.16576875746250153, "learning_rate": 4.0497813656239493e-05, "loss": 0.0041, "step": 2260 }, { "epoch": 0.5726538849646822, "grad_norm": 0.034229591488838196, "learning_rate": 4.045576858392197e-05, "loss": 0.0051, "step": 2270 }, { "epoch": 0.5751765893037336, "grad_norm": 0.13495758175849915, "learning_rate": 4.041372351160444e-05, "loss": 0.0081, "step": 2280 }, { "epoch": 0.577699293642785, "grad_norm": 0.20754534006118774, "learning_rate": 4.037167843928692e-05, "loss": 0.0129, "step": 2290 }, { "epoch": 0.5802219979818365, "grad_norm": 0.12224958837032318, "learning_rate": 4.032963336696939e-05, "loss": 0.007, "step": 2300 }, { "epoch": 0.5802219979818365, "eval_loss": 0.008081664331257343, "eval_runtime": 20.9045, "eval_samples_per_second": 84.288, "eval_steps_per_second": 21.096, "step": 2300 }, { "epoch": 0.582744702320888, "grad_norm": 0.20963284373283386, "learning_rate": 4.028758829465187e-05, "loss": 0.011, "step": 2310 }, { "epoch": 0.5852674066599395, "grad_norm": 0.1182667464017868, "learning_rate": 4.024554322233434e-05, "loss": 0.0085, "step": 2320 }, { "epoch": 0.5877901109989909, "grad_norm": 0.1626705825328827, "learning_rate": 4.020349815001682e-05, "loss": 0.0091, "step": 2330 }, { "epoch": 0.5903128153380424, "grad_norm": 0.10798126459121704, "learning_rate": 4.01614530776993e-05, "loss": 0.009, "step": 2340 }, { "epoch": 0.5928355196770938, "grad_norm": 0.03671824559569359, "learning_rate": 4.011940800538177e-05, "loss": 0.005, "step": 2350 }, { "epoch": 0.5953582240161454, "grad_norm": 0.019325584173202515, "learning_rate": 4.007736293306425e-05, "loss": 0.0082, "step": 2360 }, { "epoch": 0.5978809283551968, "grad_norm": 0.04128754511475563, "learning_rate": 4.003531786074672e-05, "loss": 0.0046, "step": 2370 }, { "epoch": 0.6004036326942482, "grad_norm": 0.07875852286815643, "learning_rate": 3.9993272788429196e-05, "loss": 0.0283, "step": 2380 }, { "epoch": 0.6029263370332997, "grad_norm": 0.11841381341218948, "learning_rate": 3.995122771611167e-05, "loss": 0.0052, "step": 2390 }, { "epoch": 0.6054490413723511, "grad_norm": 0.14310500025749207, "learning_rate": 3.990918264379415e-05, "loss": 0.0027, "step": 2400 }, { "epoch": 0.6054490413723511, "eval_loss": 0.008280658163130283, "eval_runtime": 20.9355, "eval_samples_per_second": 84.163, "eval_steps_per_second": 21.065, "step": 2400 }, { "epoch": 0.6079717457114027, "grad_norm": 0.1013203114271164, "learning_rate": 3.986713757147663e-05, "loss": 0.0071, "step": 2410 }, { "epoch": 0.6104944500504541, "grad_norm": 0.09219915419816971, "learning_rate": 3.98250924991591e-05, "loss": 0.0128, "step": 2420 }, { "epoch": 0.6130171543895055, "grad_norm": 0.21949277818202972, "learning_rate": 3.9783047426841576e-05, "loss": 0.0125, "step": 2430 }, { "epoch": 0.615539858728557, "grad_norm": 0.04883907735347748, "learning_rate": 3.974100235452405e-05, "loss": 0.0133, "step": 2440 }, { "epoch": 0.6180625630676084, "grad_norm": 0.28083309531211853, "learning_rate": 3.9698957282206526e-05, "loss": 0.0094, "step": 2450 }, { "epoch": 0.62058526740666, "grad_norm": 0.1395656317472458, "learning_rate": 3.9656912209889e-05, "loss": 0.008, "step": 2460 }, { "epoch": 0.6231079717457114, "grad_norm": 0.3387027084827423, "learning_rate": 3.9614867137571475e-05, "loss": 0.0108, "step": 2470 }, { "epoch": 0.6256306760847629, "grad_norm": 0.12317987531423569, "learning_rate": 3.9572822065253956e-05, "loss": 0.0106, "step": 2480 }, { "epoch": 0.6281533804238143, "grad_norm": 0.11516406387090683, "learning_rate": 3.953077699293643e-05, "loss": 0.0109, "step": 2490 }, { "epoch": 0.6306760847628657, "grad_norm": 0.3164563775062561, "learning_rate": 3.9488731920618906e-05, "loss": 0.0122, "step": 2500 }, { "epoch": 0.6306760847628657, "eval_loss": 0.0077254436910152435, "eval_runtime": 20.9125, "eval_samples_per_second": 84.256, "eval_steps_per_second": 21.088, "step": 2500 }, { "epoch": 0.6331987891019173, "grad_norm": 0.1707511991262436, "learning_rate": 3.944668684830138e-05, "loss": 0.009, "step": 2510 }, { "epoch": 0.6357214934409687, "grad_norm": 0.08108045160770416, "learning_rate": 3.9404641775983855e-05, "loss": 0.012, "step": 2520 }, { "epoch": 0.6382441977800202, "grad_norm": 0.1104462668299675, "learning_rate": 3.936259670366633e-05, "loss": 0.004, "step": 2530 }, { "epoch": 0.6407669021190716, "grad_norm": 0.17339076101779938, "learning_rate": 3.9320551631348804e-05, "loss": 0.0056, "step": 2540 }, { "epoch": 0.643289606458123, "grad_norm": 0.10303635895252228, "learning_rate": 3.9278506559031286e-05, "loss": 0.0078, "step": 2550 }, { "epoch": 0.6458123107971746, "grad_norm": 0.009340761229395866, "learning_rate": 3.923646148671376e-05, "loss": 0.0071, "step": 2560 }, { "epoch": 0.648335015136226, "grad_norm": 0.11290521174669266, "learning_rate": 3.9194416414396235e-05, "loss": 0.0064, "step": 2570 }, { "epoch": 0.6508577194752775, "grad_norm": 0.13023380935192108, "learning_rate": 3.915237134207871e-05, "loss": 0.0047, "step": 2580 }, { "epoch": 0.6533804238143289, "grad_norm": 0.027826808393001556, "learning_rate": 3.911032626976119e-05, "loss": 0.004, "step": 2590 }, { "epoch": 0.6559031281533805, "grad_norm": 0.12674188613891602, "learning_rate": 3.906828119744366e-05, "loss": 0.0087, "step": 2600 }, { "epoch": 0.6559031281533805, "eval_loss": 0.007544004824012518, "eval_runtime": 20.9279, "eval_samples_per_second": 84.194, "eval_steps_per_second": 21.072, "step": 2600 }, { "epoch": 0.6584258324924319, "grad_norm": 0.05906185507774353, "learning_rate": 3.9026236125126134e-05, "loss": 0.0112, "step": 2610 }, { "epoch": 0.6609485368314834, "grad_norm": 0.02223772369325161, "learning_rate": 3.898419105280861e-05, "loss": 0.0061, "step": 2620 }, { "epoch": 0.6634712411705348, "grad_norm": 0.1578211635351181, "learning_rate": 3.894214598049109e-05, "loss": 0.0065, "step": 2630 }, { "epoch": 0.6659939455095862, "grad_norm": 0.0348033532500267, "learning_rate": 3.8900100908173565e-05, "loss": 0.0106, "step": 2640 }, { "epoch": 0.6685166498486378, "grad_norm": 0.09289257973432541, "learning_rate": 3.885805583585604e-05, "loss": 0.0055, "step": 2650 }, { "epoch": 0.6710393541876892, "grad_norm": 0.0011186335468664765, "learning_rate": 3.8816010763538514e-05, "loss": 0.0043, "step": 2660 }, { "epoch": 0.6735620585267407, "grad_norm": 0.04303692653775215, "learning_rate": 3.8773965691220995e-05, "loss": 0.003, "step": 2670 }, { "epoch": 0.6760847628657921, "grad_norm": 0.02356291376054287, "learning_rate": 3.873192061890347e-05, "loss": 0.004, "step": 2680 }, { "epoch": 0.6786074672048436, "grad_norm": 0.23490223288536072, "learning_rate": 3.868987554658594e-05, "loss": 0.0087, "step": 2690 }, { "epoch": 0.6811301715438951, "grad_norm": 0.18736770749092102, "learning_rate": 3.864783047426841e-05, "loss": 0.0101, "step": 2700 }, { "epoch": 0.6811301715438951, "eval_loss": 0.007495530880987644, "eval_runtime": 20.9691, "eval_samples_per_second": 84.029, "eval_steps_per_second": 21.031, "step": 2700 }, { "epoch": 0.6836528758829465, "grad_norm": 0.06488362699747086, "learning_rate": 3.8605785401950894e-05, "loss": 0.0023, "step": 2710 }, { "epoch": 0.686175580221998, "grad_norm": 0.11341580748558044, "learning_rate": 3.856374032963337e-05, "loss": 0.0075, "step": 2720 }, { "epoch": 0.6886982845610494, "grad_norm": 0.018855459988117218, "learning_rate": 3.852169525731584e-05, "loss": 0.0072, "step": 2730 }, { "epoch": 0.6912209889001009, "grad_norm": 0.002237241482362151, "learning_rate": 3.8479650184998325e-05, "loss": 0.0046, "step": 2740 }, { "epoch": 0.6937436932391524, "grad_norm": 0.2180403620004654, "learning_rate": 3.84376051126808e-05, "loss": 0.008, "step": 2750 }, { "epoch": 0.6962663975782039, "grad_norm": 0.09451308846473694, "learning_rate": 3.8395560040363274e-05, "loss": 0.0065, "step": 2760 }, { "epoch": 0.6987891019172553, "grad_norm": 0.14188626408576965, "learning_rate": 3.835351496804574e-05, "loss": 0.0106, "step": 2770 }, { "epoch": 0.7013118062563067, "grad_norm": 0.10723700374364853, "learning_rate": 3.8311469895728223e-05, "loss": 0.0101, "step": 2780 }, { "epoch": 0.7038345105953582, "grad_norm": 0.09538406878709793, "learning_rate": 3.82694248234107e-05, "loss": 0.0101, "step": 2790 }, { "epoch": 0.7063572149344097, "grad_norm": 0.013723093084990978, "learning_rate": 3.822737975109317e-05, "loss": 0.0064, "step": 2800 }, { "epoch": 0.7063572149344097, "eval_loss": 0.007626931183040142, "eval_runtime": 20.9136, "eval_samples_per_second": 84.252, "eval_steps_per_second": 21.087, "step": 2800 }, { "epoch": 0.7088799192734612, "grad_norm": 0.08048822730779648, "learning_rate": 3.818533467877565e-05, "loss": 0.0046, "step": 2810 }, { "epoch": 0.7114026236125126, "grad_norm": 0.2678566873073578, "learning_rate": 3.814328960645813e-05, "loss": 0.0111, "step": 2820 }, { "epoch": 0.713925327951564, "grad_norm": 0.016534708440303802, "learning_rate": 3.8101244534140604e-05, "loss": 0.0105, "step": 2830 }, { "epoch": 0.7164480322906155, "grad_norm": 0.17189861834049225, "learning_rate": 3.805919946182308e-05, "loss": 0.0124, "step": 2840 }, { "epoch": 0.718970736629667, "grad_norm": 0.004572316538542509, "learning_rate": 3.8017154389505546e-05, "loss": 0.0054, "step": 2850 }, { "epoch": 0.7214934409687185, "grad_norm": 0.02059135213494301, "learning_rate": 3.797510931718803e-05, "loss": 0.0108, "step": 2860 }, { "epoch": 0.7240161453077699, "grad_norm": 0.11188461631536484, "learning_rate": 3.79330642448705e-05, "loss": 0.0052, "step": 2870 }, { "epoch": 0.7265388496468214, "grad_norm": 0.0961654856801033, "learning_rate": 3.789101917255298e-05, "loss": 0.0092, "step": 2880 }, { "epoch": 0.7290615539858728, "grad_norm": 0.004565075505524874, "learning_rate": 3.784897410023546e-05, "loss": 0.005, "step": 2890 }, { "epoch": 0.7315842583249244, "grad_norm": 0.058100827038288116, "learning_rate": 3.780692902791793e-05, "loss": 0.0196, "step": 2900 }, { "epoch": 0.7315842583249244, "eval_loss": 0.007003966718912125, "eval_runtime": 20.9017, "eval_samples_per_second": 84.299, "eval_steps_per_second": 21.099, "step": 2900 }, { "epoch": 0.7341069626639758, "grad_norm": 0.08321081846952438, "learning_rate": 3.776488395560041e-05, "loss": 0.0042, "step": 2910 }, { "epoch": 0.7366296670030272, "grad_norm": 0.12256285548210144, "learning_rate": 3.772283888328288e-05, "loss": 0.0101, "step": 2920 }, { "epoch": 0.7391523713420787, "grad_norm": 0.18364761769771576, "learning_rate": 3.768079381096536e-05, "loss": 0.0136, "step": 2930 }, { "epoch": 0.7416750756811302, "grad_norm": 0.3442452847957611, "learning_rate": 3.763874873864783e-05, "loss": 0.0094, "step": 2940 }, { "epoch": 0.7441977800201817, "grad_norm": 0.1985316276550293, "learning_rate": 3.7596703666330306e-05, "loss": 0.0058, "step": 2950 }, { "epoch": 0.7467204843592331, "grad_norm": 0.000595409597735852, "learning_rate": 3.755465859401278e-05, "loss": 0.0212, "step": 2960 }, { "epoch": 0.7492431886982845, "grad_norm": 0.23967699706554413, "learning_rate": 3.751261352169526e-05, "loss": 0.0064, "step": 2970 }, { "epoch": 0.751765893037336, "grad_norm": 0.0012721189996227622, "learning_rate": 3.747056844937774e-05, "loss": 0.003, "step": 2980 }, { "epoch": 0.7542885973763875, "grad_norm": 0.0369889996945858, "learning_rate": 3.742852337706021e-05, "loss": 0.0072, "step": 2990 }, { "epoch": 0.756811301715439, "grad_norm": 0.07918387651443481, "learning_rate": 3.7386478304742686e-05, "loss": 0.0031, "step": 3000 }, { "epoch": 0.756811301715439, "eval_loss": 0.007090387400239706, "eval_runtime": 20.9252, "eval_samples_per_second": 84.205, "eval_steps_per_second": 21.075, "step": 3000 }, { "epoch": 0.7593340060544904, "grad_norm": 0.08373123407363892, "learning_rate": 3.734443323242516e-05, "loss": 0.0033, "step": 3010 }, { "epoch": 0.7618567103935419, "grad_norm": 0.06391701102256775, "learning_rate": 3.7302388160107636e-05, "loss": 0.0084, "step": 3020 }, { "epoch": 0.7643794147325933, "grad_norm": 0.11146340519189835, "learning_rate": 3.726034308779011e-05, "loss": 0.0054, "step": 3030 }, { "epoch": 0.7669021190716448, "grad_norm": 0.1086326614022255, "learning_rate": 3.7218298015472585e-05, "loss": 0.0028, "step": 3040 }, { "epoch": 0.7694248234106963, "grad_norm": 0.029285268858075142, "learning_rate": 3.7176252943155067e-05, "loss": 0.0065, "step": 3050 }, { "epoch": 0.7719475277497477, "grad_norm": 0.13605491816997528, "learning_rate": 3.713420787083754e-05, "loss": 0.0083, "step": 3060 }, { "epoch": 0.7744702320887992, "grad_norm": 0.0959770679473877, "learning_rate": 3.7092162798520016e-05, "loss": 0.0078, "step": 3070 }, { "epoch": 0.7769929364278506, "grad_norm": 0.022328553721308708, "learning_rate": 3.705011772620249e-05, "loss": 0.0065, "step": 3080 }, { "epoch": 0.7795156407669022, "grad_norm": 0.0018502280581742525, "learning_rate": 3.7008072653884965e-05, "loss": 0.0107, "step": 3090 }, { "epoch": 0.7820383451059536, "grad_norm": 0.020223820582032204, "learning_rate": 3.696602758156744e-05, "loss": 0.0014, "step": 3100 }, { "epoch": 0.7820383451059536, "eval_loss": 0.007150140590965748, "eval_runtime": 20.9003, "eval_samples_per_second": 84.305, "eval_steps_per_second": 21.1, "step": 3100 }, { "epoch": 0.784561049445005, "grad_norm": 0.17186589539051056, "learning_rate": 3.6923982509249915e-05, "loss": 0.0043, "step": 3110 }, { "epoch": 0.7870837537840565, "grad_norm": 0.054055992513895035, "learning_rate": 3.6881937436932396e-05, "loss": 0.0036, "step": 3120 }, { "epoch": 0.7896064581231079, "grad_norm": 0.24574770033359528, "learning_rate": 3.683989236461487e-05, "loss": 0.0125, "step": 3130 }, { "epoch": 0.7921291624621595, "grad_norm": 0.09889545291662216, "learning_rate": 3.6797847292297345e-05, "loss": 0.0071, "step": 3140 }, { "epoch": 0.7946518668012109, "grad_norm": 0.1626003533601761, "learning_rate": 3.675580221997982e-05, "loss": 0.0037, "step": 3150 }, { "epoch": 0.7971745711402624, "grad_norm": 0.1329105943441391, "learning_rate": 3.6713757147662295e-05, "loss": 0.007, "step": 3160 }, { "epoch": 0.7996972754793138, "grad_norm": 0.18876679241657257, "learning_rate": 3.667171207534477e-05, "loss": 0.0121, "step": 3170 }, { "epoch": 0.8022199798183652, "grad_norm": 0.1532873511314392, "learning_rate": 3.6629667003027244e-05, "loss": 0.0061, "step": 3180 }, { "epoch": 0.8047426841574168, "grad_norm": 0.14677301049232483, "learning_rate": 3.658762193070972e-05, "loss": 0.0046, "step": 3190 }, { "epoch": 0.8072653884964682, "grad_norm": 0.1844077706336975, "learning_rate": 3.65455768583922e-05, "loss": 0.0075, "step": 3200 }, { "epoch": 0.8072653884964682, "eval_loss": 0.006863369140774012, "eval_runtime": 20.9061, "eval_samples_per_second": 84.282, "eval_steps_per_second": 21.094, "step": 3200 }, { "epoch": 0.8097880928355197, "grad_norm": 0.07923123240470886, "learning_rate": 3.6503531786074675e-05, "loss": 0.0023, "step": 3210 }, { "epoch": 0.8123107971745711, "grad_norm": 0.003948994446545839, "learning_rate": 3.646148671375715e-05, "loss": 0.0036, "step": 3220 }, { "epoch": 0.8148335015136225, "grad_norm": 0.18977274000644684, "learning_rate": 3.641944164143963e-05, "loss": 0.0058, "step": 3230 }, { "epoch": 0.8173562058526741, "grad_norm": 0.13241952657699585, "learning_rate": 3.63773965691221e-05, "loss": 0.0048, "step": 3240 }, { "epoch": 0.8198789101917255, "grad_norm": 0.10835881531238556, "learning_rate": 3.6335351496804573e-05, "loss": 0.0065, "step": 3250 }, { "epoch": 0.822401614530777, "grad_norm": 0.10177090018987656, "learning_rate": 3.629330642448705e-05, "loss": 0.0089, "step": 3260 }, { "epoch": 0.8249243188698284, "grad_norm": 0.11547684669494629, "learning_rate": 3.625126135216953e-05, "loss": 0.0069, "step": 3270 }, { "epoch": 0.82744702320888, "grad_norm": 0.22250983119010925, "learning_rate": 3.6209216279852004e-05, "loss": 0.0128, "step": 3280 }, { "epoch": 0.8299697275479314, "grad_norm": 0.05016797035932541, "learning_rate": 3.616717120753448e-05, "loss": 0.0041, "step": 3290 }, { "epoch": 0.8324924318869829, "grad_norm": 0.19384223222732544, "learning_rate": 3.6125126135216953e-05, "loss": 0.0125, "step": 3300 }, { "epoch": 0.8324924318869829, "eval_loss": 0.006563546601682901, "eval_runtime": 20.8989, "eval_samples_per_second": 84.311, "eval_steps_per_second": 21.102, "step": 3300 }, { "epoch": 0.8350151362260343, "grad_norm": 0.09639815986156464, "learning_rate": 3.6083081062899435e-05, "loss": 0.0064, "step": 3310 }, { "epoch": 0.8375378405650857, "grad_norm": 0.006088436581194401, "learning_rate": 3.60410359905819e-05, "loss": 0.0072, "step": 3320 }, { "epoch": 0.8400605449041373, "grad_norm": 0.20799516141414642, "learning_rate": 3.599899091826438e-05, "loss": 0.0094, "step": 3330 }, { "epoch": 0.8425832492431887, "grad_norm": 0.0011494633508846164, "learning_rate": 3.595694584594685e-05, "loss": 0.0064, "step": 3340 }, { "epoch": 0.8451059535822402, "grad_norm": 0.012169969268143177, "learning_rate": 3.5914900773629334e-05, "loss": 0.0077, "step": 3350 }, { "epoch": 0.8476286579212916, "grad_norm": 0.15517696738243103, "learning_rate": 3.587285570131181e-05, "loss": 0.004, "step": 3360 }, { "epoch": 0.850151362260343, "grad_norm": 0.1262129694223404, "learning_rate": 3.583081062899428e-05, "loss": 0.0123, "step": 3370 }, { "epoch": 0.8526740665993946, "grad_norm": 0.05431267246603966, "learning_rate": 3.578876555667676e-05, "loss": 0.0012, "step": 3380 }, { "epoch": 0.855196770938446, "grad_norm": 0.27199476957321167, "learning_rate": 3.574672048435924e-05, "loss": 0.0076, "step": 3390 }, { "epoch": 0.8577194752774975, "grad_norm": 0.20499233901500702, "learning_rate": 3.5704675412041714e-05, "loss": 0.0045, "step": 3400 }, { "epoch": 0.8577194752774975, "eval_loss": 0.006544772535562515, "eval_runtime": 20.8969, "eval_samples_per_second": 84.319, "eval_steps_per_second": 21.104, "step": 3400 }, { "epoch": 0.8602421796165489, "grad_norm": 0.08625713735818863, "learning_rate": 3.566263033972418e-05, "loss": 0.0099, "step": 3410 }, { "epoch": 0.8627648839556004, "grad_norm": 0.11165639013051987, "learning_rate": 3.562058526740666e-05, "loss": 0.0026, "step": 3420 }, { "epoch": 0.8652875882946519, "grad_norm": 0.0018256891053169966, "learning_rate": 3.557854019508914e-05, "loss": 0.0048, "step": 3430 }, { "epoch": 0.8678102926337034, "grad_norm": 0.19064132869243622, "learning_rate": 3.553649512277161e-05, "loss": 0.0021, "step": 3440 }, { "epoch": 0.8703329969727548, "grad_norm": 0.2267286479473114, "learning_rate": 3.549445005045409e-05, "loss": 0.0091, "step": 3450 }, { "epoch": 0.8728557013118062, "grad_norm": 0.006103180348873138, "learning_rate": 3.545240497813657e-05, "loss": 0.0103, "step": 3460 }, { "epoch": 0.8753784056508577, "grad_norm": 0.0026887240819633007, "learning_rate": 3.541035990581904e-05, "loss": 0.0048, "step": 3470 }, { "epoch": 0.8779011099899092, "grad_norm": 0.06880487501621246, "learning_rate": 3.536831483350152e-05, "loss": 0.0031, "step": 3480 }, { "epoch": 0.8804238143289607, "grad_norm": 0.14154627919197083, "learning_rate": 3.5326269761183986e-05, "loss": 0.0145, "step": 3490 }, { "epoch": 0.8829465186680121, "grad_norm": 0.0016246146988123655, "learning_rate": 3.528422468886647e-05, "loss": 0.0026, "step": 3500 }, { "epoch": 0.8829465186680121, "eval_loss": 0.007226438261568546, "eval_runtime": 20.911, "eval_samples_per_second": 84.262, "eval_steps_per_second": 21.089, "step": 3500 }, { "epoch": 0.8854692230070635, "grad_norm": 0.0016699236584827304, "learning_rate": 3.524217961654894e-05, "loss": 0.0058, "step": 3510 }, { "epoch": 0.887991927346115, "grad_norm": 0.2493603378534317, "learning_rate": 3.5200134544231416e-05, "loss": 0.0113, "step": 3520 }, { "epoch": 0.8905146316851665, "grad_norm": 0.1181974858045578, "learning_rate": 3.515808947191389e-05, "loss": 0.0059, "step": 3530 }, { "epoch": 0.893037336024218, "grad_norm": 0.11245719343423843, "learning_rate": 3.511604439959637e-05, "loss": 0.0045, "step": 3540 }, { "epoch": 0.8955600403632694, "grad_norm": 0.13200731575489044, "learning_rate": 3.507399932727885e-05, "loss": 0.0054, "step": 3550 }, { "epoch": 0.8980827447023209, "grad_norm": 0.195307195186615, "learning_rate": 3.503195425496132e-05, "loss": 0.0055, "step": 3560 }, { "epoch": 0.9006054490413723, "grad_norm": 0.08665880560874939, "learning_rate": 3.4989909182643797e-05, "loss": 0.0074, "step": 3570 }, { "epoch": 0.9031281533804238, "grad_norm": 0.00980606209486723, "learning_rate": 3.494786411032627e-05, "loss": 0.0049, "step": 3580 }, { "epoch": 0.9056508577194753, "grad_norm": 0.1497032195329666, "learning_rate": 3.4905819038008746e-05, "loss": 0.0055, "step": 3590 }, { "epoch": 0.9081735620585267, "grad_norm": 0.09247948974370956, "learning_rate": 3.486377396569122e-05, "loss": 0.0022, "step": 3600 }, { "epoch": 0.9081735620585267, "eval_loss": 0.0062293908558785915, "eval_runtime": 20.8987, "eval_samples_per_second": 84.312, "eval_steps_per_second": 21.102, "step": 3600 }, { "epoch": 0.9106962663975782, "grad_norm": 0.09304177761077881, "learning_rate": 3.48217288933737e-05, "loss": 0.0101, "step": 3610 }, { "epoch": 0.9132189707366297, "grad_norm": 0.004028341267257929, "learning_rate": 3.477968382105618e-05, "loss": 0.0097, "step": 3620 }, { "epoch": 0.9157416750756812, "grad_norm": 0.03493291139602661, "learning_rate": 3.473763874873865e-05, "loss": 0.0051, "step": 3630 }, { "epoch": 0.9182643794147326, "grad_norm": 0.12278582155704498, "learning_rate": 3.4695593676421126e-05, "loss": 0.0106, "step": 3640 }, { "epoch": 0.920787083753784, "grad_norm": 0.18783220648765564, "learning_rate": 3.46535486041036e-05, "loss": 0.0043, "step": 3650 }, { "epoch": 0.9233097880928355, "grad_norm": 0.013721932657063007, "learning_rate": 3.4611503531786075e-05, "loss": 0.0031, "step": 3660 }, { "epoch": 0.925832492431887, "grad_norm": 0.01255789864808321, "learning_rate": 3.456945845946855e-05, "loss": 0.0043, "step": 3670 }, { "epoch": 0.9283551967709385, "grad_norm": 0.06713691353797913, "learning_rate": 3.4527413387151025e-05, "loss": 0.0024, "step": 3680 }, { "epoch": 0.9308779011099899, "grad_norm": 0.007393176201730967, "learning_rate": 3.4485368314833506e-05, "loss": 0.0059, "step": 3690 }, { "epoch": 0.9334006054490414, "grad_norm": 0.14634644985198975, "learning_rate": 3.444332324251598e-05, "loss": 0.0102, "step": 3700 }, { "epoch": 0.9334006054490414, "eval_loss": 0.005810776725411415, "eval_runtime": 20.8803, "eval_samples_per_second": 84.386, "eval_steps_per_second": 21.12, "step": 3700 }, { "epoch": 0.9359233097880928, "grad_norm": 0.00905498769134283, "learning_rate": 3.4401278170198455e-05, "loss": 0.003, "step": 3710 }, { "epoch": 0.9384460141271443, "grad_norm": 0.005104383919388056, "learning_rate": 3.435923309788093e-05, "loss": 0.0046, "step": 3720 }, { "epoch": 0.9409687184661958, "grad_norm": 0.036407459527254105, "learning_rate": 3.4317188025563405e-05, "loss": 0.0052, "step": 3730 }, { "epoch": 0.9434914228052472, "grad_norm": 0.12225465476512909, "learning_rate": 3.427514295324588e-05, "loss": 0.0011, "step": 3740 }, { "epoch": 0.9460141271442987, "grad_norm": 0.002337078098207712, "learning_rate": 3.4233097880928354e-05, "loss": 0.0053, "step": 3750 }, { "epoch": 0.9485368314833501, "grad_norm": 0.0018623026553541422, "learning_rate": 3.4191052808610836e-05, "loss": 0.0021, "step": 3760 }, { "epoch": 0.9510595358224017, "grad_norm": 0.013399872928857803, "learning_rate": 3.414900773629331e-05, "loss": 0.0032, "step": 3770 }, { "epoch": 0.9535822401614531, "grad_norm": 0.010270589962601662, "learning_rate": 3.4106962663975785e-05, "loss": 0.0083, "step": 3780 }, { "epoch": 0.9561049445005045, "grad_norm": 0.07046973705291748, "learning_rate": 3.406491759165826e-05, "loss": 0.0014, "step": 3790 }, { "epoch": 0.958627648839556, "grad_norm": 0.0009812767384573817, "learning_rate": 3.4022872519340734e-05, "loss": 0.0039, "step": 3800 }, { "epoch": 0.958627648839556, "eval_loss": 0.005735259968787432, "eval_runtime": 20.894, "eval_samples_per_second": 84.331, "eval_steps_per_second": 21.107, "step": 3800 }, { "epoch": 0.9611503531786074, "grad_norm": 0.11425192654132843, "learning_rate": 3.398082744702321e-05, "loss": 0.0068, "step": 3810 }, { "epoch": 0.963673057517659, "grad_norm": 0.07777775824069977, "learning_rate": 3.3938782374705683e-05, "loss": 0.0013, "step": 3820 }, { "epoch": 0.9661957618567104, "grad_norm": 0.10662028938531876, "learning_rate": 3.389673730238816e-05, "loss": 0.0084, "step": 3830 }, { "epoch": 0.9687184661957619, "grad_norm": 0.07375224679708481, "learning_rate": 3.385469223007064e-05, "loss": 0.008, "step": 3840 }, { "epoch": 0.9712411705348133, "grad_norm": 0.03361163288354874, "learning_rate": 3.3812647157753114e-05, "loss": 0.0059, "step": 3850 }, { "epoch": 0.9737638748738647, "grad_norm": 0.012914448976516724, "learning_rate": 3.377060208543559e-05, "loss": 0.0009, "step": 3860 }, { "epoch": 0.9762865792129163, "grad_norm": 0.15875528752803802, "learning_rate": 3.3728557013118064e-05, "loss": 0.0089, "step": 3870 }, { "epoch": 0.9788092835519677, "grad_norm": 0.08293774724006653, "learning_rate": 3.368651194080054e-05, "loss": 0.0032, "step": 3880 }, { "epoch": 0.9813319878910192, "grad_norm": 0.176809623837471, "learning_rate": 3.364446686848301e-05, "loss": 0.0095, "step": 3890 }, { "epoch": 0.9838546922300706, "grad_norm": 0.15428629517555237, "learning_rate": 3.360242179616549e-05, "loss": 0.009, "step": 3900 }, { "epoch": 0.9838546922300706, "eval_loss": 0.005954666528850794, "eval_runtime": 20.8681, "eval_samples_per_second": 84.435, "eval_steps_per_second": 21.133, "step": 3900 }, { "epoch": 0.986377396569122, "grad_norm": 0.03709837794303894, "learning_rate": 3.356037672384797e-05, "loss": 0.0062, "step": 3910 }, { "epoch": 0.9889001009081736, "grad_norm": 0.01939135603606701, "learning_rate": 3.3518331651530444e-05, "loss": 0.0045, "step": 3920 }, { "epoch": 0.991422805247225, "grad_norm": 0.12339838594198227, "learning_rate": 3.347628657921292e-05, "loss": 0.0107, "step": 3930 }, { "epoch": 0.9939455095862765, "grad_norm": 0.11743946373462677, "learning_rate": 3.343424150689539e-05, "loss": 0.0026, "step": 3940 }, { "epoch": 0.9964682139253279, "grad_norm": 0.0008224299526773393, "learning_rate": 3.3392196434577874e-05, "loss": 0.0029, "step": 3950 }, { "epoch": 0.9989909182643795, "grad_norm": 0.1629364788532257, "learning_rate": 3.335015136226034e-05, "loss": 0.0045, "step": 3960 }, { "epoch": 1.001513622603431, "grad_norm": 0.015702659264206886, "learning_rate": 3.330810628994282e-05, "loss": 0.0018, "step": 3970 }, { "epoch": 1.0040363269424823, "grad_norm": 0.010090239346027374, "learning_rate": 3.326606121762529e-05, "loss": 0.0016, "step": 3980 }, { "epoch": 1.0065590312815338, "grad_norm": 0.1662328988313675, "learning_rate": 3.322401614530777e-05, "loss": 0.0028, "step": 3990 }, { "epoch": 1.0090817356205852, "grad_norm": 0.028376251459121704, "learning_rate": 3.318197107299025e-05, "loss": 0.0012, "step": 4000 }, { "epoch": 1.0090817356205852, "eval_loss": 0.005606195889413357, "eval_runtime": 20.8958, "eval_samples_per_second": 84.323, "eval_steps_per_second": 21.105, "step": 4000 }, { "epoch": 1.0116044399596367, "grad_norm": 0.18040278553962708, "learning_rate": 3.313992600067272e-05, "loss": 0.0031, "step": 4010 }, { "epoch": 1.014127144298688, "grad_norm": 0.09627419710159302, "learning_rate": 3.30978809283552e-05, "loss": 0.005, "step": 4020 }, { "epoch": 1.0166498486377396, "grad_norm": 0.02057558484375477, "learning_rate": 3.305583585603768e-05, "loss": 0.0017, "step": 4030 }, { "epoch": 1.0191725529767912, "grad_norm": 0.0033118566498160362, "learning_rate": 3.301379078372015e-05, "loss": 0.0013, "step": 4040 }, { "epoch": 1.0216952573158427, "grad_norm": 0.13773638010025024, "learning_rate": 3.297174571140262e-05, "loss": 0.0032, "step": 4050 }, { "epoch": 1.024217961654894, "grad_norm": 0.11453539878129959, "learning_rate": 3.2929700639085096e-05, "loss": 0.0031, "step": 4060 }, { "epoch": 1.0267406659939455, "grad_norm": 0.002913431962952018, "learning_rate": 3.288765556676758e-05, "loss": 0.0011, "step": 4070 }, { "epoch": 1.029263370332997, "grad_norm": 0.1419718712568283, "learning_rate": 3.284561049445005e-05, "loss": 0.0091, "step": 4080 }, { "epoch": 1.0317860746720484, "grad_norm": 0.007667609490454197, "learning_rate": 3.2803565422132527e-05, "loss": 0.0022, "step": 4090 }, { "epoch": 1.0343087790110999, "grad_norm": 0.11158380657434464, "learning_rate": 3.276152034981501e-05, "loss": 0.0029, "step": 4100 }, { "epoch": 1.0343087790110999, "eval_loss": 0.005591338500380516, "eval_runtime": 20.8942, "eval_samples_per_second": 84.33, "eval_steps_per_second": 21.106, "step": 4100 }, { "epoch": 1.0368314833501513, "grad_norm": 0.005254568066447973, "learning_rate": 3.271947527749748e-05, "loss": 0.0037, "step": 4110 }, { "epoch": 1.0393541876892027, "grad_norm": 0.13042157888412476, "learning_rate": 3.267743020517996e-05, "loss": 0.0052, "step": 4120 }, { "epoch": 1.0418768920282542, "grad_norm": 0.007648650091141462, "learning_rate": 3.2635385132862425e-05, "loss": 0.0037, "step": 4130 }, { "epoch": 1.0443995963673058, "grad_norm": 0.18600983917713165, "learning_rate": 3.259334006054491e-05, "loss": 0.0036, "step": 4140 }, { "epoch": 1.0469223007063573, "grad_norm": 0.00177775660995394, "learning_rate": 3.255129498822738e-05, "loss": 0.0026, "step": 4150 }, { "epoch": 1.0494450050454087, "grad_norm": 0.055269479751586914, "learning_rate": 3.2509249915909856e-05, "loss": 0.0044, "step": 4160 }, { "epoch": 1.0519677093844602, "grad_norm": 0.007563919294625521, "learning_rate": 3.246720484359233e-05, "loss": 0.0073, "step": 4170 }, { "epoch": 1.0544904137235116, "grad_norm": 0.12277320772409439, "learning_rate": 3.242515977127481e-05, "loss": 0.0034, "step": 4180 }, { "epoch": 1.057013118062563, "grad_norm": 0.1538102626800537, "learning_rate": 3.238311469895729e-05, "loss": 0.0029, "step": 4190 }, { "epoch": 1.0595358224016145, "grad_norm": 0.23897789418697357, "learning_rate": 3.234106962663976e-05, "loss": 0.0026, "step": 4200 }, { "epoch": 1.0595358224016145, "eval_loss": 0.005667256191372871, "eval_runtime": 20.9021, "eval_samples_per_second": 84.298, "eval_steps_per_second": 21.098, "step": 4200 }, { "epoch": 1.062058526740666, "grad_norm": 0.059796128422021866, "learning_rate": 3.229902455432223e-05, "loss": 0.0036, "step": 4210 }, { "epoch": 1.0645812310797174, "grad_norm": 0.11254319548606873, "learning_rate": 3.225697948200471e-05, "loss": 0.0052, "step": 4220 }, { "epoch": 1.067103935418769, "grad_norm": 0.008323262445628643, "learning_rate": 3.2214934409687185e-05, "loss": 0.0033, "step": 4230 }, { "epoch": 1.0696266397578205, "grad_norm": 4.037764301756397e-05, "learning_rate": 3.217288933736966e-05, "loss": 0.0018, "step": 4240 }, { "epoch": 1.072149344096872, "grad_norm": 0.07235367596149445, "learning_rate": 3.213084426505214e-05, "loss": 0.001, "step": 4250 }, { "epoch": 1.0746720484359233, "grad_norm": 0.14224140346050262, "learning_rate": 3.2088799192734616e-05, "loss": 0.0021, "step": 4260 }, { "epoch": 1.0771947527749748, "grad_norm": 0.15408071875572205, "learning_rate": 3.204675412041709e-05, "loss": 0.0036, "step": 4270 }, { "epoch": 1.0797174571140262, "grad_norm": 0.0037913790438324213, "learning_rate": 3.2004709048099566e-05, "loss": 0.0053, "step": 4280 }, { "epoch": 1.0822401614530777, "grad_norm": 0.13299870491027832, "learning_rate": 3.196266397578204e-05, "loss": 0.0028, "step": 4290 }, { "epoch": 1.084762865792129, "grad_norm": 0.12634998559951782, "learning_rate": 3.1920618903464515e-05, "loss": 0.0041, "step": 4300 }, { "epoch": 1.084762865792129, "eval_loss": 0.005686003249138594, "eval_runtime": 20.8959, "eval_samples_per_second": 84.323, "eval_steps_per_second": 21.105, "step": 4300 }, { "epoch": 1.0872855701311805, "grad_norm": 0.16738834977149963, "learning_rate": 3.187857383114699e-05, "loss": 0.003, "step": 4310 }, { "epoch": 1.089808274470232, "grad_norm": 0.037017568945884705, "learning_rate": 3.1836528758829464e-05, "loss": 0.0025, "step": 4320 }, { "epoch": 1.0923309788092836, "grad_norm": 0.07605406641960144, "learning_rate": 3.1794483686511946e-05, "loss": 0.0024, "step": 4330 }, { "epoch": 1.094853683148335, "grad_norm": 0.011002608574926853, "learning_rate": 3.175243861419442e-05, "loss": 0.0017, "step": 4340 }, { "epoch": 1.0973763874873865, "grad_norm": 0.0014461104292422533, "learning_rate": 3.1710393541876895e-05, "loss": 0.001, "step": 4350 }, { "epoch": 1.099899091826438, "grad_norm": 0.04258690029382706, "learning_rate": 3.166834846955937e-05, "loss": 0.001, "step": 4360 }, { "epoch": 1.1024217961654894, "grad_norm": 0.1580243706703186, "learning_rate": 3.1626303397241844e-05, "loss": 0.0063, "step": 4370 }, { "epoch": 1.1049445005045408, "grad_norm": 0.0013237325474619865, "learning_rate": 3.158425832492432e-05, "loss": 0.0062, "step": 4380 }, { "epoch": 1.1074672048435923, "grad_norm": 0.0012392470380291343, "learning_rate": 3.1542213252606794e-05, "loss": 0.0025, "step": 4390 }, { "epoch": 1.1099899091826437, "grad_norm": 0.04316063970327377, "learning_rate": 3.150016818028927e-05, "loss": 0.0015, "step": 4400 }, { "epoch": 1.1099899091826437, "eval_loss": 0.005799129139631987, "eval_runtime": 20.8943, "eval_samples_per_second": 84.329, "eval_steps_per_second": 21.106, "step": 4400 }, { "epoch": 1.1125126135216952, "grad_norm": 0.12705808877944946, "learning_rate": 3.145812310797175e-05, "loss": 0.0038, "step": 4410 }, { "epoch": 1.1150353178607468, "grad_norm": 0.01606024242937565, "learning_rate": 3.1416078035654224e-05, "loss": 0.0035, "step": 4420 }, { "epoch": 1.1175580221997983, "grad_norm": 0.0025201209355145693, "learning_rate": 3.13740329633367e-05, "loss": 0.0053, "step": 4430 }, { "epoch": 1.1200807265388497, "grad_norm": 0.0011012004688382149, "learning_rate": 3.1331987891019174e-05, "loss": 0.0044, "step": 4440 }, { "epoch": 1.1226034308779012, "grad_norm": 0.00029570693732239306, "learning_rate": 3.128994281870165e-05, "loss": 0.0037, "step": 4450 }, { "epoch": 1.1251261352169526, "grad_norm": 0.028565967455506325, "learning_rate": 3.124789774638412e-05, "loss": 0.005, "step": 4460 }, { "epoch": 1.127648839556004, "grad_norm": 0.09666335582733154, "learning_rate": 3.12058526740666e-05, "loss": 0.0025, "step": 4470 }, { "epoch": 1.1301715438950555, "grad_norm": 0.0027206747326999903, "learning_rate": 3.116380760174908e-05, "loss": 0.0011, "step": 4480 }, { "epoch": 1.132694248234107, "grad_norm": 0.012391073629260063, "learning_rate": 3.1121762529431554e-05, "loss": 0.0002, "step": 4490 }, { "epoch": 1.1352169525731584, "grad_norm": 0.05354852229356766, "learning_rate": 3.107971745711403e-05, "loss": 0.0014, "step": 4500 }, { "epoch": 1.1352169525731584, "eval_loss": 0.005535118281841278, "eval_runtime": 20.8683, "eval_samples_per_second": 84.434, "eval_steps_per_second": 21.132, "step": 4500 }, { "epoch": 1.1377396569122098, "grad_norm": 0.03663089498877525, "learning_rate": 3.10376723847965e-05, "loss": 0.0028, "step": 4510 }, { "epoch": 1.1402623612512612, "grad_norm": 0.20328471064567566, "learning_rate": 3.099562731247898e-05, "loss": 0.0014, "step": 4520 }, { "epoch": 1.142785065590313, "grad_norm": 0.19447293877601624, "learning_rate": 3.095358224016145e-05, "loss": 0.0046, "step": 4530 }, { "epoch": 1.1453077699293643, "grad_norm": 0.02381107583642006, "learning_rate": 3.091153716784393e-05, "loss": 0.0049, "step": 4540 }, { "epoch": 1.1478304742684158, "grad_norm": 0.010867373086512089, "learning_rate": 3.08694920955264e-05, "loss": 0.0014, "step": 4550 }, { "epoch": 1.1503531786074672, "grad_norm": 0.09643906354904175, "learning_rate": 3.082744702320888e-05, "loss": 0.0077, "step": 4560 }, { "epoch": 1.1528758829465187, "grad_norm": 0.0748005211353302, "learning_rate": 3.078540195089136e-05, "loss": 0.0006, "step": 4570 }, { "epoch": 1.15539858728557, "grad_norm": 0.007943224161863327, "learning_rate": 3.074335687857383e-05, "loss": 0.0009, "step": 4580 }, { "epoch": 1.1579212916246215, "grad_norm": 0.0026662801392376423, "learning_rate": 3.0701311806256314e-05, "loss": 0.0006, "step": 4590 }, { "epoch": 1.160443995963673, "grad_norm": 0.007000184152275324, "learning_rate": 3.065926673393878e-05, "loss": 0.0029, "step": 4600 }, { "epoch": 1.160443995963673, "eval_loss": 0.005535861011594534, "eval_runtime": 20.911, "eval_samples_per_second": 84.262, "eval_steps_per_second": 21.089, "step": 4600 }, { "epoch": 1.1629667003027246, "grad_norm": 0.010504338890314102, "learning_rate": 3.0617221661621257e-05, "loss": 0.0021, "step": 4610 }, { "epoch": 1.165489404641776, "grad_norm": 0.0042596235871315, "learning_rate": 3.057517658930373e-05, "loss": 0.0025, "step": 4620 }, { "epoch": 1.1680121089808275, "grad_norm": 0.1560107320547104, "learning_rate": 3.053313151698621e-05, "loss": 0.0053, "step": 4630 }, { "epoch": 1.170534813319879, "grad_norm": 0.03363262489438057, "learning_rate": 3.0491086444668687e-05, "loss": 0.0021, "step": 4640 }, { "epoch": 1.1730575176589304, "grad_norm": 0.024774545803666115, "learning_rate": 3.0449041372351162e-05, "loss": 0.0085, "step": 4650 }, { "epoch": 1.1755802219979818, "grad_norm": 0.021040301769971848, "learning_rate": 3.0406996300033637e-05, "loss": 0.0015, "step": 4660 }, { "epoch": 1.1781029263370333, "grad_norm": 0.09655909985303879, "learning_rate": 3.0364951227716115e-05, "loss": 0.0021, "step": 4670 }, { "epoch": 1.1806256306760847, "grad_norm": 0.13116657733917236, "learning_rate": 3.032290615539859e-05, "loss": 0.0035, "step": 4680 }, { "epoch": 1.1831483350151362, "grad_norm": 0.11077822744846344, "learning_rate": 3.0280861083081064e-05, "loss": 0.0019, "step": 4690 }, { "epoch": 1.1856710393541876, "grad_norm": 0.0015139818424358964, "learning_rate": 3.023881601076354e-05, "loss": 0.0019, "step": 4700 }, { "epoch": 1.1856710393541876, "eval_loss": 0.005410597659647465, "eval_runtime": 20.9117, "eval_samples_per_second": 84.259, "eval_steps_per_second": 21.089, "step": 4700 }, { "epoch": 1.188193743693239, "grad_norm": 0.24234654009342194, "learning_rate": 3.0196770938446017e-05, "loss": 0.0029, "step": 4710 }, { "epoch": 1.1907164480322907, "grad_norm": 0.13478586077690125, "learning_rate": 3.015472586612849e-05, "loss": 0.0054, "step": 4720 }, { "epoch": 1.1932391523713421, "grad_norm": 0.00044387555681169033, "learning_rate": 3.0112680793810966e-05, "loss": 0.0024, "step": 4730 }, { "epoch": 1.1957618567103936, "grad_norm": 0.005823497194796801, "learning_rate": 3.007063572149344e-05, "loss": 0.0018, "step": 4740 }, { "epoch": 1.198284561049445, "grad_norm": 0.03638460114598274, "learning_rate": 3.002859064917592e-05, "loss": 0.0029, "step": 4750 }, { "epoch": 1.2008072653884965, "grad_norm": 0.019287308678030968, "learning_rate": 2.9986545576858393e-05, "loss": 0.0021, "step": 4760 }, { "epoch": 1.203329969727548, "grad_norm": 0.10308881103992462, "learning_rate": 2.9944500504540868e-05, "loss": 0.0065, "step": 4770 }, { "epoch": 1.2058526740665994, "grad_norm": 0.061113424599170685, "learning_rate": 2.9902455432223346e-05, "loss": 0.0017, "step": 4780 }, { "epoch": 1.2083753784056508, "grad_norm": 0.07032662630081177, "learning_rate": 2.986041035990582e-05, "loss": 0.0029, "step": 4790 }, { "epoch": 1.2108980827447022, "grad_norm": 0.09232667833566666, "learning_rate": 2.9818365287588296e-05, "loss": 0.0005, "step": 4800 }, { "epoch": 1.2108980827447022, "eval_loss": 0.0057380907237529755, "eval_runtime": 20.8889, "eval_samples_per_second": 84.351, "eval_steps_per_second": 21.112, "step": 4800 }, { "epoch": 1.213420787083754, "grad_norm": 0.0020024082623422146, "learning_rate": 2.977632021527077e-05, "loss": 0.0027, "step": 4810 }, { "epoch": 1.2159434914228053, "grad_norm": 0.02490418404340744, "learning_rate": 2.9734275142953248e-05, "loss": 0.001, "step": 4820 }, { "epoch": 1.2184661957618568, "grad_norm": 0.16153936088085175, "learning_rate": 2.9692230070635723e-05, "loss": 0.0021, "step": 4830 }, { "epoch": 1.2209889001009082, "grad_norm": 0.1121373400092125, "learning_rate": 2.9650184998318198e-05, "loss": 0.002, "step": 4840 }, { "epoch": 1.2235116044399597, "grad_norm": 0.0005092213395982981, "learning_rate": 2.9608139926000672e-05, "loss": 0.0031, "step": 4850 }, { "epoch": 1.226034308779011, "grad_norm": 0.004732844419777393, "learning_rate": 2.956609485368315e-05, "loss": 0.0008, "step": 4860 }, { "epoch": 1.2285570131180625, "grad_norm": 0.21401868760585785, "learning_rate": 2.9524049781365625e-05, "loss": 0.0103, "step": 4870 }, { "epoch": 1.231079717457114, "grad_norm": 0.08474498987197876, "learning_rate": 2.94820047090481e-05, "loss": 0.0012, "step": 4880 }, { "epoch": 1.2336024217961654, "grad_norm": 0.005474930163472891, "learning_rate": 2.9439959636730574e-05, "loss": 0.0018, "step": 4890 }, { "epoch": 1.2361251261352169, "grad_norm": 0.20003733038902283, "learning_rate": 2.9397914564413052e-05, "loss": 0.0038, "step": 4900 }, { "epoch": 1.2361251261352169, "eval_loss": 0.005515508819371462, "eval_runtime": 20.9039, "eval_samples_per_second": 84.29, "eval_steps_per_second": 21.097, "step": 4900 }, { "epoch": 1.2386478304742683, "grad_norm": 0.16607780754566193, "learning_rate": 2.9355869492095527e-05, "loss": 0.0037, "step": 4910 }, { "epoch": 1.24117053481332, "grad_norm": 0.21575786173343658, "learning_rate": 2.9313824419778e-05, "loss": 0.0049, "step": 4920 }, { "epoch": 1.2436932391523714, "grad_norm": 0.1419685035943985, "learning_rate": 2.9271779347460483e-05, "loss": 0.0056, "step": 4930 }, { "epoch": 1.2462159434914228, "grad_norm": 0.0009386079618707299, "learning_rate": 2.9229734275142954e-05, "loss": 0.0015, "step": 4940 }, { "epoch": 1.2487386478304743, "grad_norm": 0.011205712333321571, "learning_rate": 2.918768920282543e-05, "loss": 0.0026, "step": 4950 }, { "epoch": 1.2512613521695257, "grad_norm": 0.21776536107063293, "learning_rate": 2.9145644130507904e-05, "loss": 0.0039, "step": 4960 }, { "epoch": 1.2537840565085772, "grad_norm": 0.09818103164434433, "learning_rate": 2.9103599058190385e-05, "loss": 0.0011, "step": 4970 }, { "epoch": 1.2563067608476286, "grad_norm": 0.16241490840911865, "learning_rate": 2.9061553985872856e-05, "loss": 0.002, "step": 4980 }, { "epoch": 1.25882946518668, "grad_norm": 0.0026818953920155764, "learning_rate": 2.901950891355533e-05, "loss": 0.0021, "step": 4990 }, { "epoch": 1.2613521695257317, "grad_norm": 0.026470551267266273, "learning_rate": 2.8977463841237806e-05, "loss": 0.006, "step": 5000 }, { "epoch": 1.2613521695257317, "eval_loss": 0.00518006319180131, "eval_runtime": 20.9271, "eval_samples_per_second": 84.197, "eval_steps_per_second": 21.073, "step": 5000 }, { "epoch": 1.2638748738647831, "grad_norm": 0.0004209143517073244, "learning_rate": 2.8935418768920287e-05, "loss": 0.0033, "step": 5010 }, { "epoch": 1.2663975782038346, "grad_norm": 0.0030910836067050695, "learning_rate": 2.8893373696602762e-05, "loss": 0.0024, "step": 5020 }, { "epoch": 1.268920282542886, "grad_norm": 0.013859076425433159, "learning_rate": 2.8851328624285233e-05, "loss": 0.0073, "step": 5030 }, { "epoch": 1.2714429868819375, "grad_norm": 0.0023835492320358753, "learning_rate": 2.8809283551967708e-05, "loss": 0.0023, "step": 5040 }, { "epoch": 1.273965691220989, "grad_norm": 0.0017705514328554273, "learning_rate": 2.876723847965019e-05, "loss": 0.0009, "step": 5050 }, { "epoch": 1.2764883955600403, "grad_norm": 0.09929084032773972, "learning_rate": 2.8725193407332664e-05, "loss": 0.0024, "step": 5060 }, { "epoch": 1.2790110998990918, "grad_norm": 0.21266485750675201, "learning_rate": 2.8683148335015135e-05, "loss": 0.0051, "step": 5070 }, { "epoch": 1.2815338042381432, "grad_norm": 0.045401476323604584, "learning_rate": 2.864110326269761e-05, "loss": 0.001, "step": 5080 }, { "epoch": 1.2840565085771947, "grad_norm": 0.010040095075964928, "learning_rate": 2.859905819038009e-05, "loss": 0.0049, "step": 5090 }, { "epoch": 1.286579212916246, "grad_norm": 0.1340843141078949, "learning_rate": 2.8557013118062566e-05, "loss": 0.0038, "step": 5100 }, { "epoch": 1.286579212916246, "eval_loss": 0.0053547462448477745, "eval_runtime": 20.8929, "eval_samples_per_second": 84.335, "eval_steps_per_second": 21.108, "step": 5100 }, { "epoch": 1.2891019172552975, "grad_norm": 0.060051582753658295, "learning_rate": 2.8514968045745037e-05, "loss": 0.0005, "step": 5110 }, { "epoch": 1.2916246215943492, "grad_norm": 0.000854416866786778, "learning_rate": 2.847292297342752e-05, "loss": 0.0023, "step": 5120 }, { "epoch": 1.2941473259334006, "grad_norm": 0.002330298302695155, "learning_rate": 2.8430877901109993e-05, "loss": 0.0031, "step": 5130 }, { "epoch": 1.296670030272452, "grad_norm": 0.08991765975952148, "learning_rate": 2.8388832828792468e-05, "loss": 0.001, "step": 5140 }, { "epoch": 1.2991927346115035, "grad_norm": 0.1184747964143753, "learning_rate": 2.834678775647494e-05, "loss": 0.0023, "step": 5150 }, { "epoch": 1.301715438950555, "grad_norm": 0.023154448717832565, "learning_rate": 2.830474268415742e-05, "loss": 0.0024, "step": 5160 }, { "epoch": 1.3042381432896064, "grad_norm": 0.0035342529881745577, "learning_rate": 2.8262697611839895e-05, "loss": 0.0033, "step": 5170 }, { "epoch": 1.3067608476286579, "grad_norm": 0.09643299877643585, "learning_rate": 2.822065253952237e-05, "loss": 0.0068, "step": 5180 }, { "epoch": 1.3092835519677095, "grad_norm": 0.0010538576170802116, "learning_rate": 2.817860746720484e-05, "loss": 0.0032, "step": 5190 }, { "epoch": 1.311806256306761, "grad_norm": 0.004331338219344616, "learning_rate": 2.8136562394887323e-05, "loss": 0.0065, "step": 5200 }, { "epoch": 1.311806256306761, "eval_loss": 0.005469166673719883, "eval_runtime": 20.887, "eval_samples_per_second": 84.359, "eval_steps_per_second": 21.114, "step": 5200 }, { "epoch": 1.3143289606458124, "grad_norm": 0.11001147329807281, "learning_rate": 2.8094517322569797e-05, "loss": 0.0027, "step": 5210 }, { "epoch": 1.3168516649848638, "grad_norm": 0.006123987026512623, "learning_rate": 2.8052472250252272e-05, "loss": 0.0048, "step": 5220 }, { "epoch": 1.3193743693239153, "grad_norm": 0.018299918621778488, "learning_rate": 2.8010427177934743e-05, "loss": 0.0046, "step": 5230 }, { "epoch": 1.3218970736629667, "grad_norm": 0.04792286828160286, "learning_rate": 2.7968382105617225e-05, "loss": 0.0026, "step": 5240 }, { "epoch": 1.3244197780020182, "grad_norm": 0.0024629354011267424, "learning_rate": 2.79263370332997e-05, "loss": 0.0021, "step": 5250 }, { "epoch": 1.3269424823410696, "grad_norm": 0.00013681373093277216, "learning_rate": 2.7884291960982174e-05, "loss": 0.0044, "step": 5260 }, { "epoch": 1.329465186680121, "grad_norm": 0.10849064588546753, "learning_rate": 2.7842246888664652e-05, "loss": 0.0034, "step": 5270 }, { "epoch": 1.3319878910191725, "grad_norm": 0.0731433853507042, "learning_rate": 2.7800201816347127e-05, "loss": 0.0026, "step": 5280 }, { "epoch": 1.334510595358224, "grad_norm": 0.0010674018412828445, "learning_rate": 2.77581567440296e-05, "loss": 0.0009, "step": 5290 }, { "epoch": 1.3370332996972754, "grad_norm": 0.17949962615966797, "learning_rate": 2.7716111671712076e-05, "loss": 0.0015, "step": 5300 }, { "epoch": 1.3370332996972754, "eval_loss": 0.005221678409725428, "eval_runtime": 20.8945, "eval_samples_per_second": 84.328, "eval_steps_per_second": 21.106, "step": 5300 }, { "epoch": 1.339556004036327, "grad_norm": 0.0010082372464239597, "learning_rate": 2.7674066599394554e-05, "loss": 0.0003, "step": 5310 }, { "epoch": 1.3420787083753785, "grad_norm": 0.00024729970027692616, "learning_rate": 2.763202152707703e-05, "loss": 0.0012, "step": 5320 }, { "epoch": 1.34460141271443, "grad_norm": 0.09433750808238983, "learning_rate": 2.7589976454759504e-05, "loss": 0.0016, "step": 5330 }, { "epoch": 1.3471241170534813, "grad_norm": 0.001336489338427782, "learning_rate": 2.7547931382441978e-05, "loss": 0.0024, "step": 5340 }, { "epoch": 1.3496468213925328, "grad_norm": 0.012806025333702564, "learning_rate": 2.7505886310124456e-05, "loss": 0.0033, "step": 5350 }, { "epoch": 1.3521695257315842, "grad_norm": 0.16509069502353668, "learning_rate": 2.746384123780693e-05, "loss": 0.0086, "step": 5360 }, { "epoch": 1.3546922300706357, "grad_norm": 0.0008099581464193761, "learning_rate": 2.7421796165489406e-05, "loss": 0.0024, "step": 5370 }, { "epoch": 1.357214934409687, "grad_norm": 0.004303140100091696, "learning_rate": 2.737975109317188e-05, "loss": 0.0027, "step": 5380 }, { "epoch": 1.3597376387487388, "grad_norm": 0.00023327719827648252, "learning_rate": 2.733770602085436e-05, "loss": 0.0013, "step": 5390 }, { "epoch": 1.3622603430877902, "grad_norm": 0.003809950314462185, "learning_rate": 2.7295660948536833e-05, "loss": 0.0005, "step": 5400 }, { "epoch": 1.3622603430877902, "eval_loss": 0.005022699944674969, "eval_runtime": 21.0476, "eval_samples_per_second": 83.715, "eval_steps_per_second": 20.952, "step": 5400 }, { "epoch": 1.3647830474268416, "grad_norm": 0.00021514434774871916, "learning_rate": 2.7253615876219308e-05, "loss": 0.0028, "step": 5410 }, { "epoch": 1.367305751765893, "grad_norm": 0.17706815898418427, "learning_rate": 2.7211570803901782e-05, "loss": 0.0014, "step": 5420 }, { "epoch": 1.3698284561049445, "grad_norm": 0.004937909543514252, "learning_rate": 2.716952573158426e-05, "loss": 0.0031, "step": 5430 }, { "epoch": 1.372351160443996, "grad_norm": 0.0958208441734314, "learning_rate": 2.7127480659266735e-05, "loss": 0.0033, "step": 5440 }, { "epoch": 1.3748738647830474, "grad_norm": 0.06263504922389984, "learning_rate": 2.708543558694921e-05, "loss": 0.0001, "step": 5450 }, { "epoch": 1.3773965691220988, "grad_norm": 0.003332935506477952, "learning_rate": 2.7043390514631688e-05, "loss": 0.0002, "step": 5460 }, { "epoch": 1.3799192734611503, "grad_norm": 0.16171465814113617, "learning_rate": 2.7001345442314162e-05, "loss": 0.004, "step": 5470 }, { "epoch": 1.3824419778002017, "grad_norm": 0.04109754040837288, "learning_rate": 2.6959300369996637e-05, "loss": 0.0055, "step": 5480 }, { "epoch": 1.3849646821392532, "grad_norm": 0.0015252727316692472, "learning_rate": 2.6917255297679112e-05, "loss": 0.0038, "step": 5490 }, { "epoch": 1.3874873864783046, "grad_norm": 0.0015239976346492767, "learning_rate": 2.687521022536159e-05, "loss": 0.001, "step": 5500 }, { "epoch": 1.3874873864783046, "eval_loss": 0.005176006816327572, "eval_runtime": 21.011, "eval_samples_per_second": 83.861, "eval_steps_per_second": 20.989, "step": 5500 }, { "epoch": 1.3900100908173563, "grad_norm": 0.08292572945356369, "learning_rate": 2.6833165153044064e-05, "loss": 0.0015, "step": 5510 }, { "epoch": 1.3925327951564077, "grad_norm": 0.011006727814674377, "learning_rate": 2.679112008072654e-05, "loss": 0.0019, "step": 5520 }, { "epoch": 1.3950554994954592, "grad_norm": 0.15567320585250854, "learning_rate": 2.6749075008409014e-05, "loss": 0.0021, "step": 5530 }, { "epoch": 1.3975782038345106, "grad_norm": 0.09949897229671478, "learning_rate": 2.6707029936091492e-05, "loss": 0.002, "step": 5540 }, { "epoch": 1.400100908173562, "grad_norm": 0.07961593568325043, "learning_rate": 2.6664984863773967e-05, "loss": 0.0009, "step": 5550 }, { "epoch": 1.4026236125126135, "grad_norm": 0.15322332084178925, "learning_rate": 2.662293979145644e-05, "loss": 0.0024, "step": 5560 }, { "epoch": 1.405146316851665, "grad_norm": 0.1159447729587555, "learning_rate": 2.6580894719138916e-05, "loss": 0.0019, "step": 5570 }, { "epoch": 1.4076690211907166, "grad_norm": 0.0029101588297635317, "learning_rate": 2.6538849646821394e-05, "loss": 0.0008, "step": 5580 }, { "epoch": 1.410191725529768, "grad_norm": 0.0002611145027913153, "learning_rate": 2.649680457450387e-05, "loss": 0.0055, "step": 5590 }, { "epoch": 1.4127144298688195, "grad_norm": 0.14663146436214447, "learning_rate": 2.6454759502186343e-05, "loss": 0.0039, "step": 5600 }, { "epoch": 1.4127144298688195, "eval_loss": 0.005056018941104412, "eval_runtime": 20.9323, "eval_samples_per_second": 84.176, "eval_steps_per_second": 21.068, "step": 5600 }, { "epoch": 1.415237134207871, "grad_norm": 0.0049458956345915794, "learning_rate": 2.6412714429868825e-05, "loss": 0.001, "step": 5610 }, { "epoch": 1.4177598385469223, "grad_norm": 0.3080619275569916, "learning_rate": 2.6370669357551296e-05, "loss": 0.0071, "step": 5620 }, { "epoch": 1.4202825428859738, "grad_norm": 0.0023910084273666143, "learning_rate": 2.632862428523377e-05, "loss": 0.0017, "step": 5630 }, { "epoch": 1.4228052472250252, "grad_norm": 0.0009933901019394398, "learning_rate": 2.6286579212916245e-05, "loss": 0.0044, "step": 5640 }, { "epoch": 1.4253279515640767, "grad_norm": 0.02665986306965351, "learning_rate": 2.6244534140598727e-05, "loss": 0.0009, "step": 5650 }, { "epoch": 1.427850655903128, "grad_norm": 0.17384563386440277, "learning_rate": 2.6202489068281198e-05, "loss": 0.0065, "step": 5660 }, { "epoch": 1.4303733602421795, "grad_norm": 0.05648142844438553, "learning_rate": 2.6160443995963673e-05, "loss": 0.0016, "step": 5670 }, { "epoch": 1.432896064581231, "grad_norm": 0.004266271833330393, "learning_rate": 2.6118398923646147e-05, "loss": 0.0028, "step": 5680 }, { "epoch": 1.4354187689202824, "grad_norm": 0.020753854885697365, "learning_rate": 2.607635385132863e-05, "loss": 0.002, "step": 5690 }, { "epoch": 1.437941473259334, "grad_norm": 0.08341605216264725, "learning_rate": 2.6034308779011103e-05, "loss": 0.0023, "step": 5700 }, { "epoch": 1.437941473259334, "eval_loss": 0.004923286382108927, "eval_runtime": 20.9235, "eval_samples_per_second": 84.212, "eval_steps_per_second": 21.077, "step": 5700 }, { "epoch": 1.4404641775983855, "grad_norm": 0.007267744280397892, "learning_rate": 2.5992263706693575e-05, "loss": 0.003, "step": 5710 }, { "epoch": 1.442986881937437, "grad_norm": 0.00982646644115448, "learning_rate": 2.595021863437605e-05, "loss": 0.0001, "step": 5720 }, { "epoch": 1.4455095862764884, "grad_norm": 0.0013306884793564677, "learning_rate": 2.590817356205853e-05, "loss": 0.0019, "step": 5730 }, { "epoch": 1.4480322906155398, "grad_norm": 0.037949543446302414, "learning_rate": 2.5866128489741005e-05, "loss": 0.0014, "step": 5740 }, { "epoch": 1.4505549949545913, "grad_norm": 0.0034357199911028147, "learning_rate": 2.5824083417423477e-05, "loss": 0.0025, "step": 5750 }, { "epoch": 1.4530776992936427, "grad_norm": 0.08490198105573654, "learning_rate": 2.578203834510595e-05, "loss": 0.0016, "step": 5760 }, { "epoch": 1.4556004036326944, "grad_norm": 0.09188306331634521, "learning_rate": 2.5739993272788433e-05, "loss": 0.0006, "step": 5770 }, { "epoch": 1.4581231079717458, "grad_norm": 0.3032228350639343, "learning_rate": 2.5697948200470908e-05, "loss": 0.0037, "step": 5780 }, { "epoch": 1.4606458123107973, "grad_norm": 0.0033124187029898167, "learning_rate": 2.565590312815338e-05, "loss": 0.0003, "step": 5790 }, { "epoch": 1.4631685166498487, "grad_norm": 0.18161827325820923, "learning_rate": 2.561385805583586e-05, "loss": 0.0035, "step": 5800 }, { "epoch": 1.4631685166498487, "eval_loss": 0.0052693067118525505, "eval_runtime": 20.897, "eval_samples_per_second": 84.318, "eval_steps_per_second": 21.103, "step": 5800 }, { "epoch": 1.4656912209889001, "grad_norm": 0.005931831430643797, "learning_rate": 2.5571812983518335e-05, "loss": 0.0028, "step": 5810 }, { "epoch": 1.4682139253279516, "grad_norm": 0.0015284974360838532, "learning_rate": 2.552976791120081e-05, "loss": 0.0014, "step": 5820 }, { "epoch": 1.470736629667003, "grad_norm": 0.035359546542167664, "learning_rate": 2.548772283888328e-05, "loss": 0.0017, "step": 5830 }, { "epoch": 1.4732593340060545, "grad_norm": 0.0004528906138148159, "learning_rate": 2.5445677766565762e-05, "loss": 0.003, "step": 5840 }, { "epoch": 1.475782038345106, "grad_norm": 0.0017564162844792008, "learning_rate": 2.5403632694248237e-05, "loss": 0.0016, "step": 5850 }, { "epoch": 1.4783047426841573, "grad_norm": 0.1506708413362503, "learning_rate": 2.536158762193071e-05, "loss": 0.0008, "step": 5860 }, { "epoch": 1.4808274470232088, "grad_norm": 0.0023614871315658092, "learning_rate": 2.5319542549613183e-05, "loss": 0.0022, "step": 5870 }, { "epoch": 1.4833501513622602, "grad_norm": 0.00034669501474127173, "learning_rate": 2.5277497477295664e-05, "loss": 0.0032, "step": 5880 }, { "epoch": 1.4858728557013117, "grad_norm": 0.0016111385775730014, "learning_rate": 2.523545240497814e-05, "loss": 0.0037, "step": 5890 }, { "epoch": 1.4883955600403633, "grad_norm": 0.00014663147157989442, "learning_rate": 2.5193407332660614e-05, "loss": 0.0008, "step": 5900 }, { "epoch": 1.4883955600403633, "eval_loss": 0.004971860907971859, "eval_runtime": 20.9024, "eval_samples_per_second": 84.296, "eval_steps_per_second": 21.098, "step": 5900 }, { "epoch": 1.4909182643794148, "grad_norm": 0.15087057650089264, "learning_rate": 2.5151362260343085e-05, "loss": 0.0027, "step": 5910 }, { "epoch": 1.4934409687184662, "grad_norm": 0.18127664923667908, "learning_rate": 2.5109317188025566e-05, "loss": 0.0042, "step": 5920 }, { "epoch": 1.4959636730575177, "grad_norm": 0.069893017411232, "learning_rate": 2.506727211570804e-05, "loss": 0.0022, "step": 5930 }, { "epoch": 1.498486377396569, "grad_norm": 0.00019991624867543578, "learning_rate": 2.5025227043390516e-05, "loss": 0.0017, "step": 5940 }, { "epoch": 1.5010090817356205, "grad_norm": 0.09269930422306061, "learning_rate": 2.498318197107299e-05, "loss": 0.0028, "step": 5950 }, { "epoch": 1.5035317860746722, "grad_norm": 0.06926806271076202, "learning_rate": 2.494113689875547e-05, "loss": 0.0039, "step": 5960 }, { "epoch": 1.5060544904137236, "grad_norm": 0.03350943699479103, "learning_rate": 2.4899091826437943e-05, "loss": 0.0052, "step": 5970 }, { "epoch": 1.508577194752775, "grad_norm": 0.008175240829586983, "learning_rate": 2.4857046754120418e-05, "loss": 0.0023, "step": 5980 }, { "epoch": 1.5110998990918265, "grad_norm": 0.1838151216506958, "learning_rate": 2.4815001681802892e-05, "loss": 0.0031, "step": 5990 }, { "epoch": 1.513622603430878, "grad_norm": 0.11169478297233582, "learning_rate": 2.477295660948537e-05, "loss": 0.0042, "step": 6000 }, { "epoch": 1.513622603430878, "eval_loss": 0.004873940721154213, "eval_runtime": 20.8974, "eval_samples_per_second": 84.317, "eval_steps_per_second": 21.103, "step": 6000 }, { "epoch": 1.5161453077699294, "grad_norm": 0.0018095527775585651, "learning_rate": 2.4730911537167845e-05, "loss": 0.0023, "step": 6010 }, { "epoch": 1.5186680121089808, "grad_norm": 0.0017755021108314395, "learning_rate": 2.468886646485032e-05, "loss": 0.0007, "step": 6020 }, { "epoch": 1.5211907164480323, "grad_norm": 0.1006636768579483, "learning_rate": 2.4646821392532794e-05, "loss": 0.0054, "step": 6030 }, { "epoch": 1.5237134207870837, "grad_norm": 0.17334707081317902, "learning_rate": 2.4604776320215273e-05, "loss": 0.0031, "step": 6040 }, { "epoch": 1.5262361251261352, "grad_norm": 0.004185323137789965, "learning_rate": 2.4562731247897747e-05, "loss": 0.0018, "step": 6050 }, { "epoch": 1.5287588294651866, "grad_norm": 0.06221470236778259, "learning_rate": 2.4520686175580225e-05, "loss": 0.0032, "step": 6060 }, { "epoch": 1.531281533804238, "grad_norm": 0.10330460220575333, "learning_rate": 2.4478641103262697e-05, "loss": 0.0035, "step": 6070 }, { "epoch": 1.5338042381432895, "grad_norm": 0.13589535653591156, "learning_rate": 2.4436596030945175e-05, "loss": 0.0016, "step": 6080 }, { "epoch": 1.536326942482341, "grad_norm": 0.19307217001914978, "learning_rate": 2.439455095862765e-05, "loss": 0.0039, "step": 6090 }, { "epoch": 1.5388496468213926, "grad_norm": 0.07404123246669769, "learning_rate": 2.4352505886310127e-05, "loss": 0.0008, "step": 6100 }, { "epoch": 1.5388496468213926, "eval_loss": 0.005088960751891136, "eval_runtime": 20.9119, "eval_samples_per_second": 84.258, "eval_steps_per_second": 21.088, "step": 6100 }, { "epoch": 1.541372351160444, "grad_norm": 0.06749244034290314, "learning_rate": 2.43104608139926e-05, "loss": 0.0045, "step": 6110 }, { "epoch": 1.5438950554994955, "grad_norm": 0.00822696927934885, "learning_rate": 2.4268415741675077e-05, "loss": 0.0022, "step": 6120 }, { "epoch": 1.546417759838547, "grad_norm": 0.0362759605050087, "learning_rate": 2.4226370669357555e-05, "loss": 0.0018, "step": 6130 }, { "epoch": 1.5489404641775983, "grad_norm": 0.0018353847553953528, "learning_rate": 2.418432559704003e-05, "loss": 0.0033, "step": 6140 }, { "epoch": 1.55146316851665, "grad_norm": 0.0009886518819257617, "learning_rate": 2.4142280524722504e-05, "loss": 0.0033, "step": 6150 }, { "epoch": 1.5539858728557014, "grad_norm": 0.005221598315984011, "learning_rate": 2.410023545240498e-05, "loss": 0.0042, "step": 6160 }, { "epoch": 1.5565085771947529, "grad_norm": 0.02474922128021717, "learning_rate": 2.4058190380087457e-05, "loss": 0.0059, "step": 6170 }, { "epoch": 1.5590312815338043, "grad_norm": 0.005371175706386566, "learning_rate": 2.401614530776993e-05, "loss": 0.004, "step": 6180 }, { "epoch": 1.5615539858728558, "grad_norm": 0.1124267429113388, "learning_rate": 2.3974100235452406e-05, "loss": 0.0007, "step": 6190 }, { "epoch": 1.5640766902119072, "grad_norm": 0.009999338537454605, "learning_rate": 2.393205516313488e-05, "loss": 0.0018, "step": 6200 }, { "epoch": 1.5640766902119072, "eval_loss": 0.004953174851834774, "eval_runtime": 20.8923, "eval_samples_per_second": 84.337, "eval_steps_per_second": 21.108, "step": 6200 }, { "epoch": 1.5665993945509586, "grad_norm": 0.028597630560398102, "learning_rate": 2.389001009081736e-05, "loss": 0.0044, "step": 6210 }, { "epoch": 1.56912209889001, "grad_norm": 0.0002847505093086511, "learning_rate": 2.3847965018499833e-05, "loss": 0.0013, "step": 6220 }, { "epoch": 1.5716448032290615, "grad_norm": 0.12137818336486816, "learning_rate": 2.3805919946182308e-05, "loss": 0.0025, "step": 6230 }, { "epoch": 1.574167507568113, "grad_norm": 0.05651724711060524, "learning_rate": 2.3763874873864783e-05, "loss": 0.0022, "step": 6240 }, { "epoch": 1.5766902119071644, "grad_norm": 0.225179061293602, "learning_rate": 2.372182980154726e-05, "loss": 0.0012, "step": 6250 }, { "epoch": 1.5792129162462158, "grad_norm": 0.0041250442154705524, "learning_rate": 2.3679784729229735e-05, "loss": 0.0055, "step": 6260 }, { "epoch": 1.5817356205852673, "grad_norm": 0.008712096139788628, "learning_rate": 2.363773965691221e-05, "loss": 0.0014, "step": 6270 }, { "epoch": 1.5842583249243187, "grad_norm": 0.007221329025924206, "learning_rate": 2.3595694584594685e-05, "loss": 0.0022, "step": 6280 }, { "epoch": 1.5867810292633702, "grad_norm": 0.012200511991977692, "learning_rate": 2.3553649512277163e-05, "loss": 0.0012, "step": 6290 }, { "epoch": 1.5893037336024218, "grad_norm": 0.0005704654031433165, "learning_rate": 2.3511604439959638e-05, "loss": 0.0013, "step": 6300 }, { "epoch": 1.5893037336024218, "eval_loss": 0.0048631117679178715, "eval_runtime": 20.8908, "eval_samples_per_second": 84.344, "eval_steps_per_second": 21.11, "step": 6300 }, { "epoch": 1.5918264379414733, "grad_norm": 0.002790976082906127, "learning_rate": 2.3469559367642112e-05, "loss": 0.0013, "step": 6310 }, { "epoch": 1.5943491422805247, "grad_norm": 0.0014387964038178325, "learning_rate": 2.342751429532459e-05, "loss": 0.0007, "step": 6320 }, { "epoch": 1.5968718466195762, "grad_norm": 0.00022175043704919517, "learning_rate": 2.3385469223007065e-05, "loss": 0.0035, "step": 6330 }, { "epoch": 1.5993945509586278, "grad_norm": 0.11953356117010117, "learning_rate": 2.334342415068954e-05, "loss": 0.0048, "step": 6340 }, { "epoch": 1.6019172552976793, "grad_norm": 0.18491606414318085, "learning_rate": 2.3301379078372014e-05, "loss": 0.0038, "step": 6350 }, { "epoch": 1.6044399596367307, "grad_norm": 0.19568416476249695, "learning_rate": 2.3259334006054492e-05, "loss": 0.0027, "step": 6360 }, { "epoch": 1.6069626639757821, "grad_norm": 0.08327057212591171, "learning_rate": 2.3217288933736967e-05, "loss": 0.0012, "step": 6370 }, { "epoch": 1.6094853683148336, "grad_norm": 0.03957786411046982, "learning_rate": 2.3175243861419445e-05, "loss": 0.0037, "step": 6380 }, { "epoch": 1.612008072653885, "grad_norm": 0.003976314328610897, "learning_rate": 2.3133198789101916e-05, "loss": 0.0015, "step": 6390 }, { "epoch": 1.6145307769929365, "grad_norm": 0.05154380202293396, "learning_rate": 2.3091153716784394e-05, "loss": 0.0003, "step": 6400 }, { "epoch": 1.6145307769929365, "eval_loss": 0.0047143567353487015, "eval_runtime": 20.8928, "eval_samples_per_second": 84.335, "eval_steps_per_second": 21.108, "step": 6400 }, { "epoch": 1.617053481331988, "grad_norm": 0.0036455143708735704, "learning_rate": 2.304910864446687e-05, "loss": 0.0023, "step": 6410 }, { "epoch": 1.6195761856710393, "grad_norm": 0.0702298954129219, "learning_rate": 2.3007063572149347e-05, "loss": 0.0019, "step": 6420 }, { "epoch": 1.6220988900100908, "grad_norm": 0.002678563119843602, "learning_rate": 2.296501849983182e-05, "loss": 0.0023, "step": 6430 }, { "epoch": 1.6246215943491422, "grad_norm": 0.002132556401193142, "learning_rate": 2.2922973427514296e-05, "loss": 0.0018, "step": 6440 }, { "epoch": 1.6271442986881937, "grad_norm": 0.2592438757419586, "learning_rate": 2.288092835519677e-05, "loss": 0.0039, "step": 6450 }, { "epoch": 1.629667003027245, "grad_norm": 0.07999824732542038, "learning_rate": 2.283888328287925e-05, "loss": 0.005, "step": 6460 }, { "epoch": 1.6321897073662965, "grad_norm": 0.0010637440718710423, "learning_rate": 2.2796838210561724e-05, "loss": 0.0008, "step": 6470 }, { "epoch": 1.634712411705348, "grad_norm": 0.2309955358505249, "learning_rate": 2.27547931382442e-05, "loss": 0.0033, "step": 6480 }, { "epoch": 1.6372351160443996, "grad_norm": 0.003076382912695408, "learning_rate": 2.2712748065926676e-05, "loss": 0.0004, "step": 6490 }, { "epoch": 1.639757820383451, "grad_norm": 0.13771465420722961, "learning_rate": 2.267070299360915e-05, "loss": 0.0016, "step": 6500 }, { "epoch": 1.639757820383451, "eval_loss": 0.00458995345979929, "eval_runtime": 20.8904, "eval_samples_per_second": 84.345, "eval_steps_per_second": 21.11, "step": 6500 }, { "epoch": 1.6422805247225025, "grad_norm": 0.0014351740246638656, "learning_rate": 2.2628657921291626e-05, "loss": 0.0014, "step": 6510 }, { "epoch": 1.644803229061554, "grad_norm": 0.08296415954828262, "learning_rate": 2.25866128489741e-05, "loss": 0.0012, "step": 6520 }, { "epoch": 1.6473259334006054, "grad_norm": 0.00042011673212982714, "learning_rate": 2.254456777665658e-05, "loss": 0.0087, "step": 6530 }, { "epoch": 1.649848637739657, "grad_norm": 0.01929500512778759, "learning_rate": 2.2502522704339053e-05, "loss": 0.0015, "step": 6540 }, { "epoch": 1.6523713420787085, "grad_norm": 0.14432388544082642, "learning_rate": 2.2460477632021528e-05, "loss": 0.001, "step": 6550 }, { "epoch": 1.65489404641776, "grad_norm": 0.0028279852122068405, "learning_rate": 2.2418432559704003e-05, "loss": 0.0022, "step": 6560 }, { "epoch": 1.6574167507568114, "grad_norm": 0.12562096118927002, "learning_rate": 2.237638748738648e-05, "loss": 0.0023, "step": 6570 }, { "epoch": 1.6599394550958628, "grad_norm": 0.12296324968338013, "learning_rate": 2.2334342415068955e-05, "loss": 0.0031, "step": 6580 }, { "epoch": 1.6624621594349143, "grad_norm": 0.023628313094377518, "learning_rate": 2.229229734275143e-05, "loss": 0.0049, "step": 6590 }, { "epoch": 1.6649848637739657, "grad_norm": 0.06037677451968193, "learning_rate": 2.2250252270433905e-05, "loss": 0.0015, "step": 6600 }, { "epoch": 1.6649848637739657, "eval_loss": 0.004487240687012672, "eval_runtime": 20.9059, "eval_samples_per_second": 84.282, "eval_steps_per_second": 21.094, "step": 6600 }, { "epoch": 1.6675075681130171, "grad_norm": 0.038203973323106766, "learning_rate": 2.2208207198116383e-05, "loss": 0.0024, "step": 6610 }, { "epoch": 1.6700302724520686, "grad_norm": 0.0006242411327548325, "learning_rate": 2.2166162125798857e-05, "loss": 0.0016, "step": 6620 }, { "epoch": 1.67255297679112, "grad_norm": 0.24198785424232483, "learning_rate": 2.2124117053481332e-05, "loss": 0.0042, "step": 6630 }, { "epoch": 1.6750756811301715, "grad_norm": 0.008689168840646744, "learning_rate": 2.208207198116381e-05, "loss": 0.0014, "step": 6640 }, { "epoch": 1.677598385469223, "grad_norm": 0.0001791265094652772, "learning_rate": 2.2040026908846285e-05, "loss": 0.001, "step": 6650 }, { "epoch": 1.6801210898082743, "grad_norm": 0.1187405064702034, "learning_rate": 2.199798183652876e-05, "loss": 0.0012, "step": 6660 }, { "epoch": 1.6826437941473258, "grad_norm": 0.17541439831256866, "learning_rate": 2.1955936764211234e-05, "loss": 0.0015, "step": 6670 }, { "epoch": 1.6851664984863775, "grad_norm": 0.025754814967513084, "learning_rate": 2.1913891691893712e-05, "loss": 0.0024, "step": 6680 }, { "epoch": 1.687689202825429, "grad_norm": 0.16264697909355164, "learning_rate": 2.1871846619576187e-05, "loss": 0.0025, "step": 6690 }, { "epoch": 1.6902119071644803, "grad_norm": 0.0034603734966367483, "learning_rate": 2.1829801547258665e-05, "loss": 0.0005, "step": 6700 }, { "epoch": 1.6902119071644803, "eval_loss": 0.00464710732921958, "eval_runtime": 20.9042, "eval_samples_per_second": 84.289, "eval_steps_per_second": 21.096, "step": 6700 }, { "epoch": 1.6927346115035318, "grad_norm": 0.005144911352545023, "learning_rate": 2.1787756474941136e-05, "loss": 0.0005, "step": 6710 }, { "epoch": 1.6952573158425832, "grad_norm": 0.2386636584997177, "learning_rate": 2.1745711402623614e-05, "loss": 0.0045, "step": 6720 }, { "epoch": 1.6977800201816349, "grad_norm": 0.0006302748224698007, "learning_rate": 2.170366633030609e-05, "loss": 0.0003, "step": 6730 }, { "epoch": 1.7003027245206863, "grad_norm": 0.04393825680017471, "learning_rate": 2.1661621257988567e-05, "loss": 0.0015, "step": 6740 }, { "epoch": 1.7028254288597378, "grad_norm": 0.002832000143826008, "learning_rate": 2.1619576185671038e-05, "loss": 0.0029, "step": 6750 }, { "epoch": 1.7053481331987892, "grad_norm": 0.16164688766002655, "learning_rate": 2.1577531113353516e-05, "loss": 0.0049, "step": 6760 }, { "epoch": 1.7078708375378406, "grad_norm": 0.1447678804397583, "learning_rate": 2.153548604103599e-05, "loss": 0.0026, "step": 6770 }, { "epoch": 1.710393541876892, "grad_norm": 0.005429640877991915, "learning_rate": 2.149344096871847e-05, "loss": 0.003, "step": 6780 }, { "epoch": 1.7129162462159435, "grad_norm": 0.0007169700693339109, "learning_rate": 2.145139589640094e-05, "loss": 0.0012, "step": 6790 }, { "epoch": 1.715438950554995, "grad_norm": 0.0009801093256101012, "learning_rate": 2.1409350824083418e-05, "loss": 0.0023, "step": 6800 }, { "epoch": 1.715438950554995, "eval_loss": 0.0046570939011871815, "eval_runtime": 20.8879, "eval_samples_per_second": 84.355, "eval_steps_per_second": 21.113, "step": 6800 }, { "epoch": 1.7179616548940464, "grad_norm": 0.009192834608256817, "learning_rate": 2.1367305751765896e-05, "loss": 0.004, "step": 6810 }, { "epoch": 1.7204843592330978, "grad_norm": 0.004604650661349297, "learning_rate": 2.132526067944837e-05, "loss": 0.0012, "step": 6820 }, { "epoch": 1.7230070635721493, "grad_norm": 0.0017496418440714478, "learning_rate": 2.1283215607130846e-05, "loss": 0.0027, "step": 6830 }, { "epoch": 1.7255297679112007, "grad_norm": 0.003268537111580372, "learning_rate": 2.124117053481332e-05, "loss": 0.0027, "step": 6840 }, { "epoch": 1.7280524722502522, "grad_norm": 0.00015593957505188882, "learning_rate": 2.11991254624958e-05, "loss": 0.0034, "step": 6850 }, { "epoch": 1.7305751765893036, "grad_norm": 0.0808212012052536, "learning_rate": 2.1157080390178273e-05, "loss": 0.0048, "step": 6860 }, { "epoch": 1.733097880928355, "grad_norm": 0.24864982068538666, "learning_rate": 2.1115035317860748e-05, "loss": 0.0018, "step": 6870 }, { "epoch": 1.7356205852674067, "grad_norm": 0.004026748705655336, "learning_rate": 2.1072990245543222e-05, "loss": 0.0033, "step": 6880 }, { "epoch": 1.7381432896064581, "grad_norm": 0.017659470438957214, "learning_rate": 2.10309451732257e-05, "loss": 0.0016, "step": 6890 }, { "epoch": 1.7406659939455096, "grad_norm": 0.004599269945174456, "learning_rate": 2.0988900100908175e-05, "loss": 0.0011, "step": 6900 }, { "epoch": 1.7406659939455096, "eval_loss": 0.004562552087008953, "eval_runtime": 20.8736, "eval_samples_per_second": 84.413, "eval_steps_per_second": 21.127, "step": 6900 }, { "epoch": 1.743188698284561, "grad_norm": 0.00346059980802238, "learning_rate": 2.094685502859065e-05, "loss": 0.0026, "step": 6910 }, { "epoch": 1.7457114026236125, "grad_norm": 0.03819597512483597, "learning_rate": 2.0904809956273124e-05, "loss": 0.0009, "step": 6920 }, { "epoch": 1.7482341069626641, "grad_norm": 0.2355988621711731, "learning_rate": 2.0862764883955602e-05, "loss": 0.0041, "step": 6930 }, { "epoch": 1.7507568113017156, "grad_norm": 0.0024086080957204103, "learning_rate": 2.0820719811638077e-05, "loss": 0.0029, "step": 6940 }, { "epoch": 1.753279515640767, "grad_norm": 0.25141242146492004, "learning_rate": 2.0778674739320552e-05, "loss": 0.0027, "step": 6950 }, { "epoch": 1.7558022199798184, "grad_norm": 0.08421680331230164, "learning_rate": 2.0736629667003026e-05, "loss": 0.0005, "step": 6960 }, { "epoch": 1.7583249243188699, "grad_norm": 0.19552625715732574, "learning_rate": 2.0694584594685504e-05, "loss": 0.0014, "step": 6970 }, { "epoch": 1.7608476286579213, "grad_norm": 0.0034374226815998554, "learning_rate": 2.065253952236798e-05, "loss": 0.0009, "step": 6980 }, { "epoch": 1.7633703329969728, "grad_norm": 0.0022041252814233303, "learning_rate": 2.0610494450050454e-05, "loss": 0.0007, "step": 6990 }, { "epoch": 1.7658930373360242, "grad_norm": 0.2155584841966629, "learning_rate": 2.0568449377732932e-05, "loss": 0.0025, "step": 7000 }, { "epoch": 1.7658930373360242, "eval_loss": 0.004664108622819185, "eval_runtime": 20.9248, "eval_samples_per_second": 84.206, "eval_steps_per_second": 21.075, "step": 7000 }, { "epoch": 1.7684157416750756, "grad_norm": 0.00033274645102210343, "learning_rate": 2.0526404305415406e-05, "loss": 0.0011, "step": 7010 }, { "epoch": 1.770938446014127, "grad_norm": 0.004436755087226629, "learning_rate": 2.048435923309788e-05, "loss": 0.0023, "step": 7020 }, { "epoch": 1.7734611503531785, "grad_norm": 0.12980465590953827, "learning_rate": 2.0442314160780356e-05, "loss": 0.0012, "step": 7030 }, { "epoch": 1.77598385469223, "grad_norm": 0.0001751563249854371, "learning_rate": 2.0400269088462834e-05, "loss": 0.0009, "step": 7040 }, { "epoch": 1.7785065590312814, "grad_norm": 0.18885697424411774, "learning_rate": 2.035822401614531e-05, "loss": 0.0037, "step": 7050 }, { "epoch": 1.7810292633703328, "grad_norm": 0.10697660595178604, "learning_rate": 2.0316178943827787e-05, "loss": 0.0016, "step": 7060 }, { "epoch": 1.7835519677093845, "grad_norm": 0.02362459897994995, "learning_rate": 2.0274133871510258e-05, "loss": 0.0007, "step": 7070 }, { "epoch": 1.786074672048436, "grad_norm": 8.634651749162003e-05, "learning_rate": 2.0232088799192736e-05, "loss": 0.0015, "step": 7080 }, { "epoch": 1.7885973763874874, "grad_norm": 0.14645344018936157, "learning_rate": 2.019004372687521e-05, "loss": 0.0019, "step": 7090 }, { "epoch": 1.7911200807265388, "grad_norm": 0.0008572082151658833, "learning_rate": 2.014799865455769e-05, "loss": 0.0022, "step": 7100 }, { "epoch": 1.7911200807265388, "eval_loss": 0.0045646862126886845, "eval_runtime": 20.8857, "eval_samples_per_second": 84.364, "eval_steps_per_second": 21.115, "step": 7100 }, { "epoch": 1.7936427850655903, "grad_norm": 0.0013892538845539093, "learning_rate": 2.010595358224016e-05, "loss": 0.0028, "step": 7110 }, { "epoch": 1.796165489404642, "grad_norm": 0.17493274807929993, "learning_rate": 2.0063908509922638e-05, "loss": 0.0026, "step": 7120 }, { "epoch": 1.7986881937436934, "grad_norm": 0.0053392620757222176, "learning_rate": 2.0021863437605113e-05, "loss": 0.0006, "step": 7130 }, { "epoch": 1.8012108980827448, "grad_norm": 0.05420933663845062, "learning_rate": 1.997981836528759e-05, "loss": 0.0003, "step": 7140 }, { "epoch": 1.8037336024217963, "grad_norm": 0.00458677439019084, "learning_rate": 1.9937773292970065e-05, "loss": 0.0014, "step": 7150 }, { "epoch": 1.8062563067608477, "grad_norm": 0.0370076522231102, "learning_rate": 1.989572822065254e-05, "loss": 0.0031, "step": 7160 }, { "epoch": 1.8087790110998991, "grad_norm": 0.007424044422805309, "learning_rate": 1.9853683148335018e-05, "loss": 0.0027, "step": 7170 }, { "epoch": 1.8113017154389506, "grad_norm": 0.03529626876115799, "learning_rate": 1.9811638076017493e-05, "loss": 0.0009, "step": 7180 }, { "epoch": 1.813824419778002, "grad_norm": 0.15175025165081024, "learning_rate": 1.9769593003699967e-05, "loss": 0.0015, "step": 7190 }, { "epoch": 1.8163471241170535, "grad_norm": 0.00071295554516837, "learning_rate": 1.9727547931382442e-05, "loss": 0.0006, "step": 7200 }, { "epoch": 1.8163471241170535, "eval_loss": 0.004439685959368944, "eval_runtime": 20.8976, "eval_samples_per_second": 84.316, "eval_steps_per_second": 21.103, "step": 7200 }, { "epoch": 1.818869828456105, "grad_norm": 0.13802039623260498, "learning_rate": 1.968550285906492e-05, "loss": 0.0029, "step": 7210 }, { "epoch": 1.8213925327951563, "grad_norm": 0.001995902741327882, "learning_rate": 1.9643457786747395e-05, "loss": 0.0021, "step": 7220 }, { "epoch": 1.8239152371342078, "grad_norm": 0.0005990486242808402, "learning_rate": 1.960141271442987e-05, "loss": 0.0006, "step": 7230 }, { "epoch": 1.8264379414732592, "grad_norm": 0.08083165436983109, "learning_rate": 1.9559367642112344e-05, "loss": 0.0025, "step": 7240 }, { "epoch": 1.8289606458123107, "grad_norm": 0.15096262097358704, "learning_rate": 1.9517322569794822e-05, "loss": 0.0024, "step": 7250 }, { "epoch": 1.831483350151362, "grad_norm": 0.0005141481524333358, "learning_rate": 1.9475277497477297e-05, "loss": 0.0047, "step": 7260 }, { "epoch": 1.8340060544904138, "grad_norm": 0.0009411073406226933, "learning_rate": 1.943323242515977e-05, "loss": 0.0033, "step": 7270 }, { "epoch": 1.8365287588294652, "grad_norm": 0.005096075590699911, "learning_rate": 1.9391187352842246e-05, "loss": 0.0023, "step": 7280 }, { "epoch": 1.8390514631685166, "grad_norm": 0.0007901940844021738, "learning_rate": 1.9349142280524724e-05, "loss": 0.0012, "step": 7290 }, { "epoch": 1.841574167507568, "grad_norm": 0.00019426460494287312, "learning_rate": 1.93070972082072e-05, "loss": 0.0007, "step": 7300 }, { "epoch": 1.841574167507568, "eval_loss": 0.004412606358528137, "eval_runtime": 20.9146, "eval_samples_per_second": 84.247, "eval_steps_per_second": 21.086, "step": 7300 }, { "epoch": 1.8440968718466195, "grad_norm": 0.1782449334859848, "learning_rate": 1.9265052135889674e-05, "loss": 0.0031, "step": 7310 }, { "epoch": 1.8466195761856712, "grad_norm": 0.01708620972931385, "learning_rate": 1.922300706357215e-05, "loss": 0.0014, "step": 7320 }, { "epoch": 1.8491422805247226, "grad_norm": 0.06721550226211548, "learning_rate": 1.9180961991254626e-05, "loss": 0.0006, "step": 7330 }, { "epoch": 1.851664984863774, "grad_norm": 0.16179241240024567, "learning_rate": 1.91389169189371e-05, "loss": 0.0031, "step": 7340 }, { "epoch": 1.8541876892028255, "grad_norm": 0.0004393104463815689, "learning_rate": 1.9096871846619576e-05, "loss": 0.0006, "step": 7350 }, { "epoch": 1.856710393541877, "grad_norm": 3.2668671337887645e-05, "learning_rate": 1.9054826774302054e-05, "loss": 0.0034, "step": 7360 }, { "epoch": 1.8592330978809284, "grad_norm": 0.1319074034690857, "learning_rate": 1.901278170198453e-05, "loss": 0.0007, "step": 7370 }, { "epoch": 1.8617558022199798, "grad_norm": 0.18538399040699005, "learning_rate": 1.8970736629667006e-05, "loss": 0.0022, "step": 7380 }, { "epoch": 1.8642785065590313, "grad_norm": 0.05137573927640915, "learning_rate": 1.8928691557349478e-05, "loss": 0.0028, "step": 7390 }, { "epoch": 1.8668012108980827, "grad_norm": 0.12319748103618622, "learning_rate": 1.8886646485031956e-05, "loss": 0.002, "step": 7400 }, { "epoch": 1.8668012108980827, "eval_loss": 0.004506191238760948, "eval_runtime": 20.8931, "eval_samples_per_second": 84.334, "eval_steps_per_second": 21.107, "step": 7400 }, { "epoch": 1.8693239152371341, "grad_norm": 0.0010132059687748551, "learning_rate": 1.884460141271443e-05, "loss": 0.003, "step": 7410 }, { "epoch": 1.8718466195761856, "grad_norm": 0.005542921368032694, "learning_rate": 1.880255634039691e-05, "loss": 0.0031, "step": 7420 }, { "epoch": 1.874369323915237, "grad_norm": 0.06616313755512238, "learning_rate": 1.876051126807938e-05, "loss": 0.003, "step": 7430 }, { "epoch": 1.8768920282542885, "grad_norm": 0.13730089366436005, "learning_rate": 1.8718466195761858e-05, "loss": 0.0035, "step": 7440 }, { "epoch": 1.87941473259334, "grad_norm": 0.00045170748489908874, "learning_rate": 1.8676421123444332e-05, "loss": 0.0006, "step": 7450 }, { "epoch": 1.8819374369323916, "grad_norm": 0.00024056418624240905, "learning_rate": 1.863437605112681e-05, "loss": 0.0032, "step": 7460 }, { "epoch": 1.884460141271443, "grad_norm": 0.11974132061004639, "learning_rate": 1.8592330978809282e-05, "loss": 0.0003, "step": 7470 }, { "epoch": 1.8869828456104945, "grad_norm": 0.17995594441890717, "learning_rate": 1.855028590649176e-05, "loss": 0.0026, "step": 7480 }, { "epoch": 1.889505549949546, "grad_norm": 0.16754589974880219, "learning_rate": 1.8508240834174238e-05, "loss": 0.0024, "step": 7490 }, { "epoch": 1.8920282542885973, "grad_norm": 0.0004354271513875574, "learning_rate": 1.8466195761856713e-05, "loss": 0.0054, "step": 7500 }, { "epoch": 1.8920282542885973, "eval_loss": 0.004418503027409315, "eval_runtime": 20.8963, "eval_samples_per_second": 84.321, "eval_steps_per_second": 21.104, "step": 7500 }, { "epoch": 1.894550958627649, "grad_norm": 0.08341953903436661, "learning_rate": 1.8424150689539187e-05, "loss": 0.0018, "step": 7510 }, { "epoch": 1.8970736629667004, "grad_norm": 0.027765339240431786, "learning_rate": 1.8382105617221662e-05, "loss": 0.0019, "step": 7520 }, { "epoch": 1.8995963673057519, "grad_norm": 0.08241262286901474, "learning_rate": 1.834006054490414e-05, "loss": 0.0033, "step": 7530 }, { "epoch": 1.9021190716448033, "grad_norm": 0.0011641500750556588, "learning_rate": 1.8298015472586615e-05, "loss": 0.0018, "step": 7540 }, { "epoch": 1.9046417759838548, "grad_norm": 0.15536606311798096, "learning_rate": 1.825597040026909e-05, "loss": 0.0029, "step": 7550 }, { "epoch": 1.9071644803229062, "grad_norm": 0.08957267552614212, "learning_rate": 1.8213925327951564e-05, "loss": 0.0015, "step": 7560 }, { "epoch": 1.9096871846619576, "grad_norm": 0.001939677633345127, "learning_rate": 1.8171880255634042e-05, "loss": 0.0023, "step": 7570 }, { "epoch": 1.912209889001009, "grad_norm": 0.0007268782937899232, "learning_rate": 1.8129835183316517e-05, "loss": 0.0009, "step": 7580 }, { "epoch": 1.9147325933400605, "grad_norm": 0.0003874763788189739, "learning_rate": 1.808779011099899e-05, "loss": 0.0016, "step": 7590 }, { "epoch": 1.917255297679112, "grad_norm": 0.077357217669487, "learning_rate": 1.8045745038681466e-05, "loss": 0.0021, "step": 7600 }, { "epoch": 1.917255297679112, "eval_loss": 0.00432681106030941, "eval_runtime": 20.8834, "eval_samples_per_second": 84.373, "eval_steps_per_second": 21.117, "step": 7600 }, { "epoch": 1.9197780020181634, "grad_norm": 0.006311261095106602, "learning_rate": 1.8003699966363944e-05, "loss": 0.0035, "step": 7610 }, { "epoch": 1.9223007063572148, "grad_norm": 0.0030232472345232964, "learning_rate": 1.796165489404642e-05, "loss": 0.0025, "step": 7620 }, { "epoch": 1.9248234106962663, "grad_norm": 0.201024129986763, "learning_rate": 1.7919609821728893e-05, "loss": 0.0015, "step": 7630 }, { "epoch": 1.9273461150353177, "grad_norm": 0.002884042216464877, "learning_rate": 1.7877564749411368e-05, "loss": 0.0018, "step": 7640 }, { "epoch": 1.9298688193743692, "grad_norm": 0.23472699522972107, "learning_rate": 1.7835519677093846e-05, "loss": 0.0029, "step": 7650 }, { "epoch": 1.9323915237134208, "grad_norm": 0.0487983413040638, "learning_rate": 1.779347460477632e-05, "loss": 0.0016, "step": 7660 }, { "epoch": 1.9349142280524723, "grad_norm": 0.1196766048669815, "learning_rate": 1.7751429532458795e-05, "loss": 0.002, "step": 7670 }, { "epoch": 1.9374369323915237, "grad_norm": 0.11802256107330322, "learning_rate": 1.7709384460141273e-05, "loss": 0.0016, "step": 7680 }, { "epoch": 1.9399596367305751, "grad_norm": 0.00048595041153021157, "learning_rate": 1.7667339387823748e-05, "loss": 0.0021, "step": 7690 }, { "epoch": 1.9424823410696268, "grad_norm": 0.0009480112348683178, "learning_rate": 1.7625294315506226e-05, "loss": 0.0025, "step": 7700 }, { "epoch": 1.9424823410696268, "eval_loss": 0.0042783478274941444, "eval_runtime": 20.8947, "eval_samples_per_second": 84.328, "eval_steps_per_second": 21.106, "step": 7700 }, { "epoch": 1.9450050454086782, "grad_norm": 0.0010953915771096945, "learning_rate": 1.7583249243188697e-05, "loss": 0.0017, "step": 7710 }, { "epoch": 1.9475277497477297, "grad_norm": 0.004912779200822115, "learning_rate": 1.7541204170871175e-05, "loss": 0.003, "step": 7720 }, { "epoch": 1.9500504540867811, "grad_norm": 0.05038010701537132, "learning_rate": 1.749915909855365e-05, "loss": 0.0017, "step": 7730 }, { "epoch": 1.9525731584258326, "grad_norm": 0.0019162135431542993, "learning_rate": 1.7457114026236128e-05, "loss": 0.0012, "step": 7740 }, { "epoch": 1.955095862764884, "grad_norm": 0.09494713693857193, "learning_rate": 1.74150689539186e-05, "loss": 0.0008, "step": 7750 }, { "epoch": 1.9576185671039354, "grad_norm": 0.0007395916618406773, "learning_rate": 1.7373023881601078e-05, "loss": 0.0, "step": 7760 }, { "epoch": 1.9601412714429869, "grad_norm": 0.10083262622356415, "learning_rate": 1.7330978809283552e-05, "loss": 0.0012, "step": 7770 }, { "epoch": 1.9626639757820383, "grad_norm": 0.06073877960443497, "learning_rate": 1.728893373696603e-05, "loss": 0.0021, "step": 7780 }, { "epoch": 1.9651866801210898, "grad_norm": 0.0009476240957155824, "learning_rate": 1.72468886646485e-05, "loss": 0.0029, "step": 7790 }, { "epoch": 1.9677093844601412, "grad_norm": 0.11249450594186783, "learning_rate": 1.720484359233098e-05, "loss": 0.0019, "step": 7800 }, { "epoch": 1.9677093844601412, "eval_loss": 0.004121602047234774, "eval_runtime": 20.9133, "eval_samples_per_second": 84.253, "eval_steps_per_second": 21.087, "step": 7800 }, { "epoch": 1.9702320887991926, "grad_norm": 0.10980529338121414, "learning_rate": 1.7162798520013454e-05, "loss": 0.0031, "step": 7810 }, { "epoch": 1.972754793138244, "grad_norm": 0.09414640069007874, "learning_rate": 1.7120753447695932e-05, "loss": 0.0012, "step": 7820 }, { "epoch": 1.9752774974772955, "grad_norm": 0.1049361452460289, "learning_rate": 1.7078708375378407e-05, "loss": 0.0052, "step": 7830 }, { "epoch": 1.977800201816347, "grad_norm": 0.06330663710832596, "learning_rate": 1.703666330306088e-05, "loss": 0.0043, "step": 7840 }, { "epoch": 1.9803229061553986, "grad_norm": 0.000840139458887279, "learning_rate": 1.699461823074336e-05, "loss": 0.0007, "step": 7850 }, { "epoch": 1.98284561049445, "grad_norm": 0.0014659571461379528, "learning_rate": 1.6952573158425834e-05, "loss": 0.0015, "step": 7860 }, { "epoch": 1.9853683148335015, "grad_norm": 0.0011587137123569846, "learning_rate": 1.691052808610831e-05, "loss": 0.0009, "step": 7870 }, { "epoch": 1.987891019172553, "grad_norm": 0.19755807518959045, "learning_rate": 1.6868483013790784e-05, "loss": 0.0028, "step": 7880 }, { "epoch": 1.9904137235116044, "grad_norm": 0.08327340334653854, "learning_rate": 1.6826437941473262e-05, "loss": 0.0012, "step": 7890 }, { "epoch": 1.992936427850656, "grad_norm": 0.0017620738362893462, "learning_rate": 1.6784392869155736e-05, "loss": 0.0013, "step": 7900 }, { "epoch": 1.992936427850656, "eval_loss": 0.004206486977636814, "eval_runtime": 20.8838, "eval_samples_per_second": 84.372, "eval_steps_per_second": 21.117, "step": 7900 }, { "epoch": 1.9954591321897075, "grad_norm": 0.002144803060218692, "learning_rate": 1.674234779683821e-05, "loss": 0.0004, "step": 7910 }, { "epoch": 1.997981836528759, "grad_norm": 0.21179014444351196, "learning_rate": 1.6700302724520686e-05, "loss": 0.0029, "step": 7920 }, { "epoch": 2.0005045408678104, "grad_norm": 0.029519561678171158, "learning_rate": 1.6658257652203164e-05, "loss": 0.0002, "step": 7930 }, { "epoch": 2.003027245206862, "grad_norm": 0.0001509381690993905, "learning_rate": 1.661621257988564e-05, "loss": 0.0008, "step": 7940 }, { "epoch": 2.0055499495459133, "grad_norm": 0.0014306082157418132, "learning_rate": 1.6574167507568113e-05, "loss": 0.0007, "step": 7950 }, { "epoch": 2.0080726538849647, "grad_norm": 0.0010387469083070755, "learning_rate": 1.6532122435250588e-05, "loss": 0.0005, "step": 7960 }, { "epoch": 2.010595358224016, "grad_norm": 0.1410035490989685, "learning_rate": 1.6490077362933066e-05, "loss": 0.0002, "step": 7970 }, { "epoch": 2.0131180625630676, "grad_norm": 0.00042714065057225525, "learning_rate": 1.644803229061554e-05, "loss": 0.0008, "step": 7980 }, { "epoch": 2.015640766902119, "grad_norm": 0.0003534654970280826, "learning_rate": 1.6405987218298015e-05, "loss": 0.0014, "step": 7990 }, { "epoch": 2.0181634712411705, "grad_norm": 0.13387076556682587, "learning_rate": 1.6363942145980493e-05, "loss": 0.0039, "step": 8000 }, { "epoch": 2.0181634712411705, "eval_loss": 0.004306289833039045, "eval_runtime": 20.8954, "eval_samples_per_second": 84.325, "eval_steps_per_second": 21.105, "step": 8000 }, { "epoch": 2.020686175580222, "grad_norm": 0.12466511130332947, "learning_rate": 1.6321897073662968e-05, "loss": 0.001, "step": 8010 }, { "epoch": 2.0232088799192733, "grad_norm": 0.00024663680233061314, "learning_rate": 1.6279852001345443e-05, "loss": 0.0002, "step": 8020 }, { "epoch": 2.025731584258325, "grad_norm": 0.00923093967139721, "learning_rate": 1.6237806929027917e-05, "loss": 0.0006, "step": 8030 }, { "epoch": 2.028254288597376, "grad_norm": 0.00024134966952260584, "learning_rate": 1.6195761856710395e-05, "loss": 0.0013, "step": 8040 }, { "epoch": 2.0307769929364277, "grad_norm": 0.14056503772735596, "learning_rate": 1.615371678439287e-05, "loss": 0.0023, "step": 8050 }, { "epoch": 2.033299697275479, "grad_norm": 0.17755192518234253, "learning_rate": 1.6111671712075348e-05, "loss": 0.0018, "step": 8060 }, { "epoch": 2.035822401614531, "grad_norm": 0.07552599161863327, "learning_rate": 1.606962663975782e-05, "loss": 0.001, "step": 8070 }, { "epoch": 2.0383451059535824, "grad_norm": 0.05230065807700157, "learning_rate": 1.6027581567440297e-05, "loss": 0.0006, "step": 8080 }, { "epoch": 2.040867810292634, "grad_norm": 0.011521569453179836, "learning_rate": 1.5985536495122772e-05, "loss": 0.0005, "step": 8090 }, { "epoch": 2.0433905146316853, "grad_norm": 0.00038396025775000453, "learning_rate": 1.594349142280525e-05, "loss": 0.0008, "step": 8100 }, { "epoch": 2.0433905146316853, "eval_loss": 0.0042536817491054535, "eval_runtime": 20.8933, "eval_samples_per_second": 84.333, "eval_steps_per_second": 21.107, "step": 8100 }, { "epoch": 2.0459132189707367, "grad_norm": 0.0009189638658426702, "learning_rate": 1.590144635048772e-05, "loss": 0.0002, "step": 8110 }, { "epoch": 2.048435923309788, "grad_norm": 0.00038111425237730145, "learning_rate": 1.58594012781702e-05, "loss": 0.0009, "step": 8120 }, { "epoch": 2.0509586276488396, "grad_norm": 0.002369858091697097, "learning_rate": 1.5817356205852674e-05, "loss": 0.0001, "step": 8130 }, { "epoch": 2.053481331987891, "grad_norm": 0.07348914444446564, "learning_rate": 1.5775311133535152e-05, "loss": 0.0018, "step": 8140 }, { "epoch": 2.0560040363269425, "grad_norm": 0.0008833975298330188, "learning_rate": 1.5733266061217623e-05, "loss": 0.0005, "step": 8150 }, { "epoch": 2.058526740665994, "grad_norm": 0.0008616661070846021, "learning_rate": 1.56912209889001e-05, "loss": 0.0002, "step": 8160 }, { "epoch": 2.0610494450050454, "grad_norm": 0.003825935535132885, "learning_rate": 1.564917591658258e-05, "loss": 0.0009, "step": 8170 }, { "epoch": 2.063572149344097, "grad_norm": 0.0037789177149534225, "learning_rate": 1.5607130844265054e-05, "loss": 0.0012, "step": 8180 }, { "epoch": 2.0660948536831483, "grad_norm": 0.002533160848543048, "learning_rate": 1.556508577194753e-05, "loss": 0.0007, "step": 8190 }, { "epoch": 2.0686175580221997, "grad_norm": 0.005100834183394909, "learning_rate": 1.5523040699630003e-05, "loss": 0.0005, "step": 8200 }, { "epoch": 2.0686175580221997, "eval_loss": 0.004342484753578901, "eval_runtime": 20.9079, "eval_samples_per_second": 84.274, "eval_steps_per_second": 21.093, "step": 8200 }, { "epoch": 2.071140262361251, "grad_norm": 0.003723128465935588, "learning_rate": 1.548099562731248e-05, "loss": 0.0005, "step": 8210 }, { "epoch": 2.0736629667003026, "grad_norm": 0.0664861872792244, "learning_rate": 1.5438950554994956e-05, "loss": 0.0007, "step": 8220 }, { "epoch": 2.076185671039354, "grad_norm": 0.003986823838204145, "learning_rate": 1.539690548267743e-05, "loss": 0.0008, "step": 8230 }, { "epoch": 2.0787083753784055, "grad_norm": 0.03816875070333481, "learning_rate": 1.5354860410359905e-05, "loss": 0.0017, "step": 8240 }, { "epoch": 2.081231079717457, "grad_norm": 0.22215162217617035, "learning_rate": 1.5312815338042384e-05, "loss": 0.0027, "step": 8250 }, { "epoch": 2.0837537840565084, "grad_norm": 0.13046610355377197, "learning_rate": 1.5270770265724858e-05, "loss": 0.0031, "step": 8260 }, { "epoch": 2.0862764883955602, "grad_norm": 0.16013352572917938, "learning_rate": 1.5228725193407335e-05, "loss": 0.0013, "step": 8270 }, { "epoch": 2.0887991927346117, "grad_norm": 0.01975584402680397, "learning_rate": 1.5186680121089808e-05, "loss": 0.0002, "step": 8280 }, { "epoch": 2.091321897073663, "grad_norm": 0.02759338729083538, "learning_rate": 1.5144635048772286e-05, "loss": 0.0009, "step": 8290 }, { "epoch": 2.0938446014127146, "grad_norm": 0.0007609634776599705, "learning_rate": 1.5102589976454759e-05, "loss": 0.0003, "step": 8300 }, { "epoch": 2.0938446014127146, "eval_loss": 0.004117065574973822, "eval_runtime": 20.9041, "eval_samples_per_second": 84.29, "eval_steps_per_second": 21.096, "step": 8300 }, { "epoch": 2.096367305751766, "grad_norm": 0.0008400371880270541, "learning_rate": 1.5060544904137237e-05, "loss": 0.0014, "step": 8310 }, { "epoch": 2.0988900100908174, "grad_norm": 0.0029490781016647816, "learning_rate": 1.501849983181971e-05, "loss": 0.0017, "step": 8320 }, { "epoch": 2.101412714429869, "grad_norm": 0.14210307598114014, "learning_rate": 1.4976454759502188e-05, "loss": 0.0012, "step": 8330 }, { "epoch": 2.1039354187689203, "grad_norm": 0.0019058181205764413, "learning_rate": 1.4934409687184664e-05, "loss": 0.0006, "step": 8340 }, { "epoch": 2.1064581231079718, "grad_norm": 0.0012054074322804809, "learning_rate": 1.4892364614867139e-05, "loss": 0.0018, "step": 8350 }, { "epoch": 2.108980827447023, "grad_norm": 0.0005894547794014215, "learning_rate": 1.4850319542549615e-05, "loss": 0.0025, "step": 8360 }, { "epoch": 2.1115035317860746, "grad_norm": 0.0017818346386775374, "learning_rate": 1.480827447023209e-05, "loss": 0.0017, "step": 8370 }, { "epoch": 2.114026236125126, "grad_norm": 0.0991104245185852, "learning_rate": 1.4766229397914566e-05, "loss": 0.001, "step": 8380 }, { "epoch": 2.1165489404641775, "grad_norm": 0.0006472957320511341, "learning_rate": 1.472418432559704e-05, "loss": 0.0004, "step": 8390 }, { "epoch": 2.119071644803229, "grad_norm": 0.03154408931732178, "learning_rate": 1.4682139253279517e-05, "loss": 0.001, "step": 8400 }, { "epoch": 2.119071644803229, "eval_loss": 0.004066385794430971, "eval_runtime": 20.8985, "eval_samples_per_second": 84.312, "eval_steps_per_second": 21.102, "step": 8400 }, { "epoch": 2.1215943491422804, "grad_norm": 0.0002149187057511881, "learning_rate": 1.4640094180961992e-05, "loss": 0.0013, "step": 8410 }, { "epoch": 2.124117053481332, "grad_norm": 0.11510289460420609, "learning_rate": 1.4598049108644468e-05, "loss": 0.0006, "step": 8420 }, { "epoch": 2.1266397578203833, "grad_norm": 0.11964685469865799, "learning_rate": 1.4556004036326943e-05, "loss": 0.0003, "step": 8430 }, { "epoch": 2.1291624621594347, "grad_norm": 0.1316743791103363, "learning_rate": 1.4513958964009419e-05, "loss": 0.0005, "step": 8440 }, { "epoch": 2.131685166498486, "grad_norm": 3.9832641050452366e-05, "learning_rate": 1.4471913891691894e-05, "loss": 0.0, "step": 8450 }, { "epoch": 2.134207870837538, "grad_norm": 0.00014946168812457472, "learning_rate": 1.442986881937437e-05, "loss": 0.0008, "step": 8460 }, { "epoch": 2.1367305751765895, "grad_norm": 0.040535129606723785, "learning_rate": 1.4387823747056845e-05, "loss": 0.0007, "step": 8470 }, { "epoch": 2.139253279515641, "grad_norm": 9.963886986952275e-05, "learning_rate": 1.4345778674739321e-05, "loss": 0.0008, "step": 8480 }, { "epoch": 2.1417759838546924, "grad_norm": 0.00044994213385507464, "learning_rate": 1.4303733602421796e-05, "loss": 0.0004, "step": 8490 }, { "epoch": 2.144298688193744, "grad_norm": 0.09494508057832718, "learning_rate": 1.4261688530104272e-05, "loss": 0.0008, "step": 8500 }, { "epoch": 2.144298688193744, "eval_loss": 0.004108693916350603, "eval_runtime": 20.9, "eval_samples_per_second": 84.306, "eval_steps_per_second": 21.1, "step": 8500 }, { "epoch": 2.1468213925327952, "grad_norm": 0.001810372225008905, "learning_rate": 1.4219643457786749e-05, "loss": 0.0002, "step": 8510 }, { "epoch": 2.1493440968718467, "grad_norm": 0.00013946890248917043, "learning_rate": 1.4177598385469223e-05, "loss": 0.0007, "step": 8520 }, { "epoch": 2.151866801210898, "grad_norm": 0.003362849121913314, "learning_rate": 1.4135553313151701e-05, "loss": 0.0002, "step": 8530 }, { "epoch": 2.1543895055499496, "grad_norm": 0.01551423966884613, "learning_rate": 1.4093508240834174e-05, "loss": 0.0003, "step": 8540 }, { "epoch": 2.156912209889001, "grad_norm": 0.0713684931397438, "learning_rate": 1.4051463168516652e-05, "loss": 0.0005, "step": 8550 }, { "epoch": 2.1594349142280524, "grad_norm": 0.0008997659897431731, "learning_rate": 1.4009418096199125e-05, "loss": 0.0008, "step": 8560 }, { "epoch": 2.161957618567104, "grad_norm": 0.00035184502485208213, "learning_rate": 1.3967373023881603e-05, "loss": 0.0004, "step": 8570 }, { "epoch": 2.1644803229061553, "grad_norm": 0.08870701491832733, "learning_rate": 1.3925327951564076e-05, "loss": 0.002, "step": 8580 }, { "epoch": 2.1670030272452068, "grad_norm": 0.032671812921762466, "learning_rate": 1.3883282879246554e-05, "loss": 0.0001, "step": 8590 }, { "epoch": 2.169525731584258, "grad_norm": 0.01390095055103302, "learning_rate": 1.3841237806929027e-05, "loss": 0.0016, "step": 8600 }, { "epoch": 2.169525731584258, "eval_loss": 0.004325889516621828, "eval_runtime": 20.8969, "eval_samples_per_second": 84.319, "eval_steps_per_second": 21.104, "step": 8600 }, { "epoch": 2.1720484359233097, "grad_norm": 0.04760993644595146, "learning_rate": 1.3799192734611505e-05, "loss": 0.0016, "step": 8610 }, { "epoch": 2.174571140262361, "grad_norm": 0.006453169509768486, "learning_rate": 1.3757147662293978e-05, "loss": 0.0002, "step": 8620 }, { "epoch": 2.1770938446014125, "grad_norm": 0.04346233606338501, "learning_rate": 1.3715102589976456e-05, "loss": 0.0012, "step": 8630 }, { "epoch": 2.179616548940464, "grad_norm": 0.010294480249285698, "learning_rate": 1.367305751765893e-05, "loss": 0.0003, "step": 8640 }, { "epoch": 2.182139253279516, "grad_norm": 0.00014406938862521201, "learning_rate": 1.3631012445341407e-05, "loss": 0.0001, "step": 8650 }, { "epoch": 2.1846619576185673, "grad_norm": 0.10597830265760422, "learning_rate": 1.358896737302388e-05, "loss": 0.0005, "step": 8660 }, { "epoch": 2.1871846619576187, "grad_norm": 0.125259131193161, "learning_rate": 1.3546922300706358e-05, "loss": 0.0004, "step": 8670 }, { "epoch": 2.18970736629667, "grad_norm": 0.00017470364400651306, "learning_rate": 1.3504877228388835e-05, "loss": 0.0005, "step": 8680 }, { "epoch": 2.1922300706357216, "grad_norm": 0.1178673580288887, "learning_rate": 1.346283215607131e-05, "loss": 0.0005, "step": 8690 }, { "epoch": 2.194752774974773, "grad_norm": 0.00032886656117625535, "learning_rate": 1.3420787083753786e-05, "loss": 0.0016, "step": 8700 }, { "epoch": 2.194752774974773, "eval_loss": 0.004205956123769283, "eval_runtime": 20.8455, "eval_samples_per_second": 84.527, "eval_steps_per_second": 21.156, "step": 8700 }, { "epoch": 2.1972754793138245, "grad_norm": 0.00314329843968153, "learning_rate": 1.337874201143626e-05, "loss": 0.0009, "step": 8710 }, { "epoch": 2.199798183652876, "grad_norm": 0.0048809046857059, "learning_rate": 1.3336696939118737e-05, "loss": 0.0013, "step": 8720 }, { "epoch": 2.2023208879919274, "grad_norm": 0.0004060929059050977, "learning_rate": 1.3294651866801211e-05, "loss": 0.0001, "step": 8730 }, { "epoch": 2.204843592330979, "grad_norm": 0.0005921365809626877, "learning_rate": 1.3252606794483688e-05, "loss": 0.0004, "step": 8740 }, { "epoch": 2.2073662966700303, "grad_norm": 0.00011614049435593188, "learning_rate": 1.3210561722166163e-05, "loss": 0.0013, "step": 8750 }, { "epoch": 2.2098890010090817, "grad_norm": 0.010343813337385654, "learning_rate": 1.3168516649848639e-05, "loss": 0.0003, "step": 8760 }, { "epoch": 2.212411705348133, "grad_norm": 0.06460902839899063, "learning_rate": 1.3126471577531114e-05, "loss": 0.0004, "step": 8770 }, { "epoch": 2.2149344096871846, "grad_norm": 0.0001900464267237112, "learning_rate": 1.308442650521359e-05, "loss": 0.0005, "step": 8780 }, { "epoch": 2.217457114026236, "grad_norm": 0.24223244190216064, "learning_rate": 1.3042381432896065e-05, "loss": 0.0009, "step": 8790 }, { "epoch": 2.2199798183652875, "grad_norm": 0.012591979466378689, "learning_rate": 1.3000336360578541e-05, "loss": 0.0015, "step": 8800 }, { "epoch": 2.2199798183652875, "eval_loss": 0.00407541636377573, "eval_runtime": 20.8574, "eval_samples_per_second": 84.478, "eval_steps_per_second": 21.144, "step": 8800 }, { "epoch": 2.222502522704339, "grad_norm": 0.008851522579789162, "learning_rate": 1.2958291288261016e-05, "loss": 0.0001, "step": 8810 }, { "epoch": 2.2250252270433903, "grad_norm": 0.0007781846798025072, "learning_rate": 1.2916246215943492e-05, "loss": 0.0002, "step": 8820 }, { "epoch": 2.227547931382442, "grad_norm": 0.00011876798089360818, "learning_rate": 1.2874201143625967e-05, "loss": 0.0004, "step": 8830 }, { "epoch": 2.2300706357214937, "grad_norm": 7.863504288252443e-05, "learning_rate": 1.2832156071308443e-05, "loss": 0.0006, "step": 8840 }, { "epoch": 2.232593340060545, "grad_norm": 0.03496154770255089, "learning_rate": 1.279011099899092e-05, "loss": 0.0008, "step": 8850 }, { "epoch": 2.2351160443995965, "grad_norm": 0.0036200936883687973, "learning_rate": 1.2748065926673394e-05, "loss": 0.0013, "step": 8860 }, { "epoch": 2.237638748738648, "grad_norm": 0.1222803145647049, "learning_rate": 1.2706020854355872e-05, "loss": 0.0024, "step": 8870 }, { "epoch": 2.2401614530776994, "grad_norm": 0.006895432714372873, "learning_rate": 1.2663975782038345e-05, "loss": 0.0005, "step": 8880 }, { "epoch": 2.242684157416751, "grad_norm": 0.0022829826921224594, "learning_rate": 1.2621930709720823e-05, "loss": 0.0002, "step": 8890 }, { "epoch": 2.2452068617558023, "grad_norm": 0.0012163568753749132, "learning_rate": 1.2579885637403296e-05, "loss": 0.0008, "step": 8900 }, { "epoch": 2.2452068617558023, "eval_loss": 0.004117515403777361, "eval_runtime": 20.8601, "eval_samples_per_second": 84.468, "eval_steps_per_second": 21.141, "step": 8900 }, { "epoch": 2.2477295660948537, "grad_norm": 0.00023967861488927156, "learning_rate": 1.2537840565085774e-05, "loss": 0.0001, "step": 8910 }, { "epoch": 2.250252270433905, "grad_norm": 0.002785157412290573, "learning_rate": 1.2495795492768249e-05, "loss": 0.0008, "step": 8920 }, { "epoch": 2.2527749747729566, "grad_norm": 0.002094635972753167, "learning_rate": 1.2453750420450725e-05, "loss": 0.001, "step": 8930 }, { "epoch": 2.255297679112008, "grad_norm": 0.17214207351207733, "learning_rate": 1.24117053481332e-05, "loss": 0.001, "step": 8940 }, { "epoch": 2.2578203834510595, "grad_norm": 0.14640472829341888, "learning_rate": 1.2369660275815676e-05, "loss": 0.0009, "step": 8950 }, { "epoch": 2.260343087790111, "grad_norm": 0.05824064090847969, "learning_rate": 1.232761520349815e-05, "loss": 0.0004, "step": 8960 }, { "epoch": 2.2628657921291624, "grad_norm": 0.0001570479798829183, "learning_rate": 1.2285570131180627e-05, "loss": 0.0003, "step": 8970 }, { "epoch": 2.265388496468214, "grad_norm": 0.008577616885304451, "learning_rate": 1.2243525058863102e-05, "loss": 0.0001, "step": 8980 }, { "epoch": 2.2679112008072653, "grad_norm": 0.1447947919368744, "learning_rate": 1.2201479986545578e-05, "loss": 0.0011, "step": 8990 }, { "epoch": 2.2704339051463167, "grad_norm": 0.00018846993043553084, "learning_rate": 1.2159434914228053e-05, "loss": 0.0002, "step": 9000 }, { "epoch": 2.2704339051463167, "eval_loss": 0.004093192983418703, "eval_runtime": 20.8777, "eval_samples_per_second": 84.396, "eval_steps_per_second": 21.123, "step": 9000 }, { "epoch": 2.272956609485368, "grad_norm": 0.0007275242242030799, "learning_rate": 1.211738984191053e-05, "loss": 0.0011, "step": 9010 }, { "epoch": 2.2754793138244196, "grad_norm": 0.010818258859217167, "learning_rate": 1.2075344769593004e-05, "loss": 0.0002, "step": 9020 }, { "epoch": 2.2780020181634715, "grad_norm": 0.019404212012887, "learning_rate": 1.203329969727548e-05, "loss": 0.0007, "step": 9030 }, { "epoch": 2.2805247225025225, "grad_norm": 0.07261942327022552, "learning_rate": 1.1991254624957955e-05, "loss": 0.0002, "step": 9040 }, { "epoch": 2.2830474268415744, "grad_norm": 0.0001016618189169094, "learning_rate": 1.1949209552640431e-05, "loss": 0.0019, "step": 9050 }, { "epoch": 2.285570131180626, "grad_norm": 0.00039705351809971035, "learning_rate": 1.1907164480322906e-05, "loss": 0.0002, "step": 9060 }, { "epoch": 2.2880928355196772, "grad_norm": 0.019509321078658104, "learning_rate": 1.1865119408005382e-05, "loss": 0.0007, "step": 9070 }, { "epoch": 2.2906155398587287, "grad_norm": 0.00039067715988494456, "learning_rate": 1.1823074335687857e-05, "loss": 0.0007, "step": 9080 }, { "epoch": 2.29313824419778, "grad_norm": 0.006623209919780493, "learning_rate": 1.1781029263370335e-05, "loss": 0.0017, "step": 9090 }, { "epoch": 2.2956609485368316, "grad_norm": 0.09274378418922424, "learning_rate": 1.173898419105281e-05, "loss": 0.0007, "step": 9100 }, { "epoch": 2.2956609485368316, "eval_loss": 0.004082069266587496, "eval_runtime": 20.8844, "eval_samples_per_second": 84.369, "eval_steps_per_second": 21.116, "step": 9100 }, { "epoch": 2.298183652875883, "grad_norm": 0.00021790213941130787, "learning_rate": 1.1696939118735286e-05, "loss": 0.0006, "step": 9110 }, { "epoch": 2.3007063572149344, "grad_norm": 0.11509310454130173, "learning_rate": 1.165489404641776e-05, "loss": 0.0008, "step": 9120 }, { "epoch": 2.303229061553986, "grad_norm": 0.0007341225282289088, "learning_rate": 1.1612848974100237e-05, "loss": 0.0004, "step": 9130 }, { "epoch": 2.3057517658930373, "grad_norm": 0.14291776716709137, "learning_rate": 1.1570803901782712e-05, "loss": 0.0023, "step": 9140 }, { "epoch": 2.3082744702320888, "grad_norm": 9.565720392856747e-05, "learning_rate": 1.1528758829465188e-05, "loss": 0.0007, "step": 9150 }, { "epoch": 2.31079717457114, "grad_norm": 3.3541025914018974e-05, "learning_rate": 1.1486713757147663e-05, "loss": 0.0013, "step": 9160 }, { "epoch": 2.3133198789101916, "grad_norm": 0.0004649158217944205, "learning_rate": 1.1444668684830139e-05, "loss": 0.0013, "step": 9170 }, { "epoch": 2.315842583249243, "grad_norm": 0.19510401785373688, "learning_rate": 1.1402623612512614e-05, "loss": 0.0009, "step": 9180 }, { "epoch": 2.3183652875882945, "grad_norm": 0.06924453377723694, "learning_rate": 1.136057854019509e-05, "loss": 0.0009, "step": 9190 }, { "epoch": 2.320887991927346, "grad_norm": 0.006778767332434654, "learning_rate": 1.1318533467877565e-05, "loss": 0.0, "step": 9200 }, { "epoch": 2.320887991927346, "eval_loss": 0.004074608441442251, "eval_runtime": 20.8629, "eval_samples_per_second": 84.456, "eval_steps_per_second": 21.138, "step": 9200 }, { "epoch": 2.3234106962663974, "grad_norm": 0.0005497061647474766, "learning_rate": 1.1276488395560041e-05, "loss": 0.0007, "step": 9210 }, { "epoch": 2.3259334006054493, "grad_norm": 0.2050006240606308, "learning_rate": 1.1234443323242516e-05, "loss": 0.0025, "step": 9220 }, { "epoch": 2.3284561049445003, "grad_norm": 0.002956175012513995, "learning_rate": 1.1192398250924992e-05, "loss": 0.0013, "step": 9230 }, { "epoch": 2.330978809283552, "grad_norm": 0.0013219810789451003, "learning_rate": 1.1150353178607467e-05, "loss": 0.0016, "step": 9240 }, { "epoch": 2.3335015136226036, "grad_norm": 0.0003643881937023252, "learning_rate": 1.1108308106289943e-05, "loss": 0.0003, "step": 9250 }, { "epoch": 2.336024217961655, "grad_norm": 0.00021895192912779748, "learning_rate": 1.106626303397242e-05, "loss": 0.0005, "step": 9260 }, { "epoch": 2.3385469223007065, "grad_norm": 0.07140027731657028, "learning_rate": 1.1024217961654896e-05, "loss": 0.0003, "step": 9270 }, { "epoch": 2.341069626639758, "grad_norm": 0.004080440849065781, "learning_rate": 1.098217288933737e-05, "loss": 0.0007, "step": 9280 }, { "epoch": 2.3435923309788094, "grad_norm": 0.00017156251124106348, "learning_rate": 1.0940127817019847e-05, "loss": 0.001, "step": 9290 }, { "epoch": 2.346115035317861, "grad_norm": 0.00018193376308772713, "learning_rate": 1.0898082744702322e-05, "loss": 0.0019, "step": 9300 }, { "epoch": 2.346115035317861, "eval_loss": 0.00402724277228117, "eval_runtime": 20.8513, "eval_samples_per_second": 84.503, "eval_steps_per_second": 21.15, "step": 9300 }, { "epoch": 2.3486377396569122, "grad_norm": 3.1227758881868795e-05, "learning_rate": 1.0856037672384798e-05, "loss": 0.0001, "step": 9310 }, { "epoch": 2.3511604439959637, "grad_norm": 0.0005630968371406198, "learning_rate": 1.0813992600067273e-05, "loss": 0.0013, "step": 9320 }, { "epoch": 2.353683148335015, "grad_norm": 0.011226714588701725, "learning_rate": 1.0771947527749749e-05, "loss": 0.0026, "step": 9330 }, { "epoch": 2.3562058526740666, "grad_norm": 0.0001727965282043442, "learning_rate": 1.0729902455432224e-05, "loss": 0.0007, "step": 9340 }, { "epoch": 2.358728557013118, "grad_norm": 0.00026375442394055426, "learning_rate": 1.06878573831147e-05, "loss": 0.0005, "step": 9350 }, { "epoch": 2.3612512613521695, "grad_norm": 0.058445390313863754, "learning_rate": 1.0645812310797175e-05, "loss": 0.0002, "step": 9360 }, { "epoch": 2.363773965691221, "grad_norm": 0.0032288488000631332, "learning_rate": 1.0603767238479651e-05, "loss": 0.0009, "step": 9370 }, { "epoch": 2.3662966700302723, "grad_norm": 0.0001218500838149339, "learning_rate": 1.0561722166162126e-05, "loss": 0.0003, "step": 9380 }, { "epoch": 2.3688193743693238, "grad_norm": 0.0034417565912008286, "learning_rate": 1.0519677093844602e-05, "loss": 0.0001, "step": 9390 }, { "epoch": 2.371342078708375, "grad_norm": 0.0020040010567754507, "learning_rate": 1.0477632021527077e-05, "loss": 0.0007, "step": 9400 }, { "epoch": 2.371342078708375, "eval_loss": 0.00405073631554842, "eval_runtime": 20.8534, "eval_samples_per_second": 84.495, "eval_steps_per_second": 21.148, "step": 9400 }, { "epoch": 2.3738647830474267, "grad_norm": 0.158976748585701, "learning_rate": 1.0435586949209553e-05, "loss": 0.0011, "step": 9410 }, { "epoch": 2.376387487386478, "grad_norm": 0.0006932442774996161, "learning_rate": 1.0393541876892028e-05, "loss": 0.0004, "step": 9420 }, { "epoch": 2.37891019172553, "grad_norm": 0.000254453276284039, "learning_rate": 1.0351496804574506e-05, "loss": 0.0003, "step": 9430 }, { "epoch": 2.3814328960645814, "grad_norm": 0.001546886982396245, "learning_rate": 1.030945173225698e-05, "loss": 0.0001, "step": 9440 }, { "epoch": 2.383955600403633, "grad_norm": 0.004422788508236408, "learning_rate": 1.0267406659939457e-05, "loss": 0.0001, "step": 9450 }, { "epoch": 2.3864783047426843, "grad_norm": 0.08441481739282608, "learning_rate": 1.0225361587621931e-05, "loss": 0.001, "step": 9460 }, { "epoch": 2.3890010090817357, "grad_norm": 0.0022749004419893026, "learning_rate": 1.0183316515304408e-05, "loss": 0.0002, "step": 9470 }, { "epoch": 2.391523713420787, "grad_norm": 0.0009072842076420784, "learning_rate": 1.0141271442986882e-05, "loss": 0.0007, "step": 9480 }, { "epoch": 2.3940464177598386, "grad_norm": 0.0009312007459811866, "learning_rate": 1.0099226370669359e-05, "loss": 0.0004, "step": 9490 }, { "epoch": 2.39656912209889, "grad_norm": 0.00628532562404871, "learning_rate": 1.0057181298351834e-05, "loss": 0.0004, "step": 9500 }, { "epoch": 2.39656912209889, "eval_loss": 0.004011793062090874, "eval_runtime": 20.8472, "eval_samples_per_second": 84.52, "eval_steps_per_second": 21.154, "step": 9500 }, { "epoch": 2.3990918264379415, "grad_norm": 0.01351741049438715, "learning_rate": 1.001513622603431e-05, "loss": 0.0002, "step": 9510 }, { "epoch": 2.401614530776993, "grad_norm": 0.1305786371231079, "learning_rate": 9.973091153716785e-06, "loss": 0.0007, "step": 9520 }, { "epoch": 2.4041372351160444, "grad_norm": 0.0029213367961347103, "learning_rate": 9.931046081399261e-06, "loss": 0.0003, "step": 9530 }, { "epoch": 2.406659939455096, "grad_norm": 0.004418868105858564, "learning_rate": 9.889001009081736e-06, "loss": 0.0007, "step": 9540 }, { "epoch": 2.4091826437941473, "grad_norm": 0.022457575425505638, "learning_rate": 9.846955936764212e-06, "loss": 0.0005, "step": 9550 }, { "epoch": 2.4117053481331987, "grad_norm": 0.0011523871216922998, "learning_rate": 9.804910864446687e-06, "loss": 0.0008, "step": 9560 }, { "epoch": 2.41422805247225, "grad_norm": 0.00031501849298365414, "learning_rate": 9.762865792129163e-06, "loss": 0.0005, "step": 9570 }, { "epoch": 2.4167507568113016, "grad_norm": 0.0037088736426085234, "learning_rate": 9.720820719811638e-06, "loss": 0.0012, "step": 9580 }, { "epoch": 2.419273461150353, "grad_norm": 6.174742884468287e-05, "learning_rate": 9.678775647494114e-06, "loss": 0.0001, "step": 9590 }, { "epoch": 2.4217961654894045, "grad_norm": 0.05361416935920715, "learning_rate": 9.63673057517659e-06, "loss": 0.0014, "step": 9600 }, { "epoch": 2.4217961654894045, "eval_loss": 0.003927647601813078, "eval_runtime": 20.8667, "eval_samples_per_second": 84.441, "eval_steps_per_second": 21.134, "step": 9600 }, { "epoch": 2.424318869828456, "grad_norm": 0.0005672819679602981, "learning_rate": 9.594685502859067e-06, "loss": 0.0002, "step": 9610 }, { "epoch": 2.426841574167508, "grad_norm": 0.03695020079612732, "learning_rate": 9.552640430541541e-06, "loss": 0.004, "step": 9620 }, { "epoch": 2.429364278506559, "grad_norm": 0.17845889925956726, "learning_rate": 9.510595358224018e-06, "loss": 0.0011, "step": 9630 }, { "epoch": 2.4318869828456107, "grad_norm": 0.0009999609319493175, "learning_rate": 9.468550285906492e-06, "loss": 0.0008, "step": 9640 }, { "epoch": 2.434409687184662, "grad_norm": 0.10458512604236603, "learning_rate": 9.426505213588969e-06, "loss": 0.0007, "step": 9650 }, { "epoch": 2.4369323915237135, "grad_norm": 0.00014382805966306478, "learning_rate": 9.384460141271443e-06, "loss": 0.0006, "step": 9660 }, { "epoch": 2.439455095862765, "grad_norm": 0.2015506774187088, "learning_rate": 9.34241506895392e-06, "loss": 0.0022, "step": 9670 }, { "epoch": 2.4419778002018164, "grad_norm": 0.0007017810712568462, "learning_rate": 9.300369996636394e-06, "loss": 0.0003, "step": 9680 }, { "epoch": 2.444500504540868, "grad_norm": 0.003951243124902248, "learning_rate": 9.25832492431887e-06, "loss": 0.0016, "step": 9690 }, { "epoch": 2.4470232088799193, "grad_norm": 0.0021652476862072945, "learning_rate": 9.216279852001345e-06, "loss": 0.0002, "step": 9700 }, { "epoch": 2.4470232088799193, "eval_loss": 0.004020575433969498, "eval_runtime": 20.8681, "eval_samples_per_second": 84.435, "eval_steps_per_second": 21.133, "step": 9700 }, { "epoch": 2.4495459132189707, "grad_norm": 0.0001469567941967398, "learning_rate": 9.174234779683822e-06, "loss": 0.0004, "step": 9710 }, { "epoch": 2.452068617558022, "grad_norm": 0.06326698511838913, "learning_rate": 9.132189707366296e-06, "loss": 0.0002, "step": 9720 }, { "epoch": 2.4545913218970736, "grad_norm": 0.006055818870663643, "learning_rate": 9.090144635048773e-06, "loss": 0.0001, "step": 9730 }, { "epoch": 2.457114026236125, "grad_norm": 0.006732003763318062, "learning_rate": 9.048099562731247e-06, "loss": 0.0003, "step": 9740 }, { "epoch": 2.4596367305751765, "grad_norm": 0.0013023527571931481, "learning_rate": 9.006054490413724e-06, "loss": 0.0003, "step": 9750 }, { "epoch": 2.462159434914228, "grad_norm": 0.00027046047034673393, "learning_rate": 8.964009418096199e-06, "loss": 0.0005, "step": 9760 }, { "epoch": 2.4646821392532794, "grad_norm": 0.0663028210401535, "learning_rate": 8.921964345778677e-06, "loss": 0.0017, "step": 9770 }, { "epoch": 2.467204843592331, "grad_norm": 0.06608086824417114, "learning_rate": 8.879919273461151e-06, "loss": 0.0016, "step": 9780 }, { "epoch": 2.4697275479313823, "grad_norm": 0.1625903695821762, "learning_rate": 8.837874201143628e-06, "loss": 0.0028, "step": 9790 }, { "epoch": 2.4722502522704337, "grad_norm": 0.0005290044355206192, "learning_rate": 8.795829128826102e-06, "loss": 0.0013, "step": 9800 }, { "epoch": 2.4722502522704337, "eval_loss": 0.00388718512840569, "eval_runtime": 20.8863, "eval_samples_per_second": 84.362, "eval_steps_per_second": 21.114, "step": 9800 }, { "epoch": 2.4747729566094856, "grad_norm": 0.0725766122341156, "learning_rate": 8.753784056508579e-06, "loss": 0.0003, "step": 9810 }, { "epoch": 2.4772956609485366, "grad_norm": 0.00025911492411978543, "learning_rate": 8.711738984191053e-06, "loss": 0.0007, "step": 9820 }, { "epoch": 2.4798183652875885, "grad_norm": 0.005303604993969202, "learning_rate": 8.66969391187353e-06, "loss": 0.0005, "step": 9830 }, { "epoch": 2.48234106962664, "grad_norm": 0.00039688186370767653, "learning_rate": 8.627648839556004e-06, "loss": 0.0006, "step": 9840 }, { "epoch": 2.4848637739656914, "grad_norm": 0.010475796647369862, "learning_rate": 8.58560376723848e-06, "loss": 0.0005, "step": 9850 }, { "epoch": 2.487386478304743, "grad_norm": 0.0014096908271312714, "learning_rate": 8.543558694920955e-06, "loss": 0.0009, "step": 9860 }, { "epoch": 2.4899091826437942, "grad_norm": 0.0016413936391472816, "learning_rate": 8.501513622603432e-06, "loss": 0.0011, "step": 9870 }, { "epoch": 2.4924318869828457, "grad_norm": 0.00043976728920824826, "learning_rate": 8.459468550285906e-06, "loss": 0.0001, "step": 9880 }, { "epoch": 2.494954591321897, "grad_norm": 0.00024035267415456474, "learning_rate": 8.417423477968383e-06, "loss": 0.0023, "step": 9890 }, { "epoch": 2.4974772956609486, "grad_norm": 0.0011502320412546396, "learning_rate": 8.375378405650857e-06, "loss": 0.0006, "step": 9900 }, { "epoch": 2.4974772956609486, "eval_loss": 0.00393605325371027, "eval_runtime": 20.8485, "eval_samples_per_second": 84.515, "eval_steps_per_second": 21.153, "step": 9900 }, { "epoch": 2.5, "grad_norm": 0.00509544787928462, "learning_rate": 8.333333333333334e-06, "loss": 0.0002, "step": 9910 }, { "epoch": 2.5025227043390514, "grad_norm": 5.717107706004754e-05, "learning_rate": 8.291288261015808e-06, "loss": 0.0001, "step": 9920 }, { "epoch": 2.505045408678103, "grad_norm": 0.08509433269500732, "learning_rate": 8.249243188698285e-06, "loss": 0.0003, "step": 9930 }, { "epoch": 2.5075681130171543, "grad_norm": 0.0029674407560378313, "learning_rate": 8.207198116380761e-06, "loss": 0.0003, "step": 9940 }, { "epoch": 2.5100908173562058, "grad_norm": 0.0014874001499265432, "learning_rate": 8.165153044063237e-06, "loss": 0.0012, "step": 9950 }, { "epoch": 2.512613521695257, "grad_norm": 0.0054718488827347755, "learning_rate": 8.123107971745712e-06, "loss": 0.0004, "step": 9960 }, { "epoch": 2.5151362260343086, "grad_norm": 0.021269412711262703, "learning_rate": 8.081062899428188e-06, "loss": 0.0005, "step": 9970 }, { "epoch": 2.51765893037336, "grad_norm": 0.00045099278213456273, "learning_rate": 8.039017827110663e-06, "loss": 0.0003, "step": 9980 }, { "epoch": 2.5201816347124115, "grad_norm": 5.915598012506962e-05, "learning_rate": 7.99697275479314e-06, "loss": 0.0002, "step": 9990 }, { "epoch": 2.5227043390514634, "grad_norm": 9.444829629501328e-05, "learning_rate": 7.954927682475614e-06, "loss": 0.0001, "step": 10000 }, { "epoch": 2.5227043390514634, "eval_loss": 0.0038506174460053444, "eval_runtime": 20.868, "eval_samples_per_second": 84.436, "eval_steps_per_second": 21.133, "step": 10000 }, { "epoch": 2.5252270433905144, "grad_norm": 0.00031684929854236543, "learning_rate": 7.91288261015809e-06, "loss": 0.0, "step": 10010 }, { "epoch": 2.5277497477295663, "grad_norm": 0.00030053374939598143, "learning_rate": 7.870837537840565e-06, "loss": 0.0002, "step": 10020 }, { "epoch": 2.5302724520686173, "grad_norm": 0.07218382507562637, "learning_rate": 7.828792465523042e-06, "loss": 0.0002, "step": 10030 }, { "epoch": 2.532795156407669, "grad_norm": 9.184365626424551e-05, "learning_rate": 7.786747393205516e-06, "loss": 0.0, "step": 10040 }, { "epoch": 2.5353178607467206, "grad_norm": 0.00037872057873755693, "learning_rate": 7.744702320887993e-06, "loss": 0.0028, "step": 10050 }, { "epoch": 2.537840565085772, "grad_norm": 0.00029898545471951365, "learning_rate": 7.702657248570467e-06, "loss": 0.0009, "step": 10060 }, { "epoch": 2.5403632694248235, "grad_norm": 0.0002952404029201716, "learning_rate": 7.660612176252944e-06, "loss": 0.0001, "step": 10070 }, { "epoch": 2.542885973763875, "grad_norm": 0.06380714476108551, "learning_rate": 7.618567103935418e-06, "loss": 0.0005, "step": 10080 }, { "epoch": 2.5454086781029264, "grad_norm": 0.2544499933719635, "learning_rate": 7.576522031617894e-06, "loss": 0.0029, "step": 10090 }, { "epoch": 2.547931382441978, "grad_norm": 0.0004265220195520669, "learning_rate": 7.53447695930037e-06, "loss": 0.0001, "step": 10100 }, { "epoch": 2.547931382441978, "eval_loss": 0.00382298999466002, "eval_runtime": 20.8338, "eval_samples_per_second": 84.574, "eval_steps_per_second": 21.168, "step": 10100 }, { "epoch": 2.5504540867810293, "grad_norm": 0.00022531530703417957, "learning_rate": 7.4924318869828465e-06, "loss": 0.0005, "step": 10110 }, { "epoch": 2.5529767911200807, "grad_norm": 0.0006108383531682193, "learning_rate": 7.450386814665322e-06, "loss": 0.0012, "step": 10120 }, { "epoch": 2.555499495459132, "grad_norm": 0.007859915494918823, "learning_rate": 7.4083417423477975e-06, "loss": 0.0002, "step": 10130 }, { "epoch": 2.5580221997981836, "grad_norm": 0.07120391726493835, "learning_rate": 7.366296670030273e-06, "loss": 0.0018, "step": 10140 }, { "epoch": 2.560544904137235, "grad_norm": 3.911245221388526e-05, "learning_rate": 7.3242515977127486e-06, "loss": 0.0001, "step": 10150 }, { "epoch": 2.5630676084762865, "grad_norm": 0.00011251613614149392, "learning_rate": 7.282206525395224e-06, "loss": 0.0003, "step": 10160 }, { "epoch": 2.565590312815338, "grad_norm": 2.7329329896019772e-05, "learning_rate": 7.2401614530777e-06, "loss": 0.0, "step": 10170 }, { "epoch": 2.5681130171543893, "grad_norm": 0.015592537820339203, "learning_rate": 7.198116380760175e-06, "loss": 0.0001, "step": 10180 }, { "epoch": 2.570635721493441, "grad_norm": 0.004030313808470964, "learning_rate": 7.156071308442651e-06, "loss": 0.0006, "step": 10190 }, { "epoch": 2.573158425832492, "grad_norm": 0.00028404564363881946, "learning_rate": 7.114026236125126e-06, "loss": 0.0011, "step": 10200 }, { "epoch": 2.573158425832492, "eval_loss": 0.0038917113561183214, "eval_runtime": 20.8627, "eval_samples_per_second": 84.457, "eval_steps_per_second": 21.138, "step": 10200 }, { "epoch": 2.575681130171544, "grad_norm": 0.015596185810863972, "learning_rate": 7.071981163807602e-06, "loss": 0.0008, "step": 10210 }, { "epoch": 2.578203834510595, "grad_norm": 0.000240606430452317, "learning_rate": 7.029936091490077e-06, "loss": 0.0003, "step": 10220 }, { "epoch": 2.580726538849647, "grad_norm": 0.03709765151143074, "learning_rate": 6.987891019172553e-06, "loss": 0.0015, "step": 10230 }, { "epoch": 2.5832492431886984, "grad_norm": 0.038254085928201675, "learning_rate": 6.945845946855028e-06, "loss": 0.0004, "step": 10240 }, { "epoch": 2.58577194752775, "grad_norm": 0.0966513380408287, "learning_rate": 6.903800874537504e-06, "loss": 0.0012, "step": 10250 }, { "epoch": 2.5882946518668013, "grad_norm": 0.002003374509513378, "learning_rate": 6.861755802219979e-06, "loss": 0.0015, "step": 10260 }, { "epoch": 2.5908173562058527, "grad_norm": 0.2246997058391571, "learning_rate": 6.8197107299024555e-06, "loss": 0.0013, "step": 10270 }, { "epoch": 2.593340060544904, "grad_norm": 0.046293605118989944, "learning_rate": 6.777665657584932e-06, "loss": 0.0013, "step": 10280 }, { "epoch": 2.5958627648839556, "grad_norm": 0.04472287371754646, "learning_rate": 6.735620585267407e-06, "loss": 0.0024, "step": 10290 }, { "epoch": 2.598385469223007, "grad_norm": 0.013854089193046093, "learning_rate": 6.693575512949883e-06, "loss": 0.0007, "step": 10300 }, { "epoch": 2.598385469223007, "eval_loss": 0.003853140166029334, "eval_runtime": 20.8889, "eval_samples_per_second": 84.351, "eval_steps_per_second": 21.112, "step": 10300 }, { "epoch": 2.6009081735620585, "grad_norm": 6.33889067103155e-05, "learning_rate": 6.6515304406323584e-06, "loss": 0.0, "step": 10310 }, { "epoch": 2.60343087790111, "grad_norm": 0.028591223061084747, "learning_rate": 6.609485368314834e-06, "loss": 0.0007, "step": 10320 }, { "epoch": 2.6059535822401614, "grad_norm": 0.015871459618210793, "learning_rate": 6.5674402959973095e-06, "loss": 0.0008, "step": 10330 }, { "epoch": 2.608476286579213, "grad_norm": 0.0005443996633403003, "learning_rate": 6.525395223679785e-06, "loss": 0.002, "step": 10340 }, { "epoch": 2.6109989909182643, "grad_norm": 0.11164157837629318, "learning_rate": 6.4833501513622605e-06, "loss": 0.0005, "step": 10350 }, { "epoch": 2.6135216952573157, "grad_norm": 0.005891601089388132, "learning_rate": 6.441305079044736e-06, "loss": 0.0002, "step": 10360 }, { "epoch": 2.616044399596367, "grad_norm": 0.00020742563356179744, "learning_rate": 6.3992600067272115e-06, "loss": 0.0014, "step": 10370 }, { "epoch": 2.618567103935419, "grad_norm": 6.0241254686843604e-05, "learning_rate": 6.357214934409687e-06, "loss": 0.0002, "step": 10380 }, { "epoch": 2.62108980827447, "grad_norm": 0.00019894012075383216, "learning_rate": 6.3151698620921625e-06, "loss": 0.0005, "step": 10390 }, { "epoch": 2.623612512613522, "grad_norm": 0.13243666291236877, "learning_rate": 6.273124789774638e-06, "loss": 0.001, "step": 10400 }, { "epoch": 2.623612512613522, "eval_loss": 0.0038242663722485304, "eval_runtime": 20.9018, "eval_samples_per_second": 84.299, "eval_steps_per_second": 21.099, "step": 10400 }, { "epoch": 2.626135216952573, "grad_norm": 0.12121517211198807, "learning_rate": 6.231079717457114e-06, "loss": 0.0008, "step": 10410 }, { "epoch": 2.628657921291625, "grad_norm": 0.008915661834180355, "learning_rate": 6.18903464513959e-06, "loss": 0.0, "step": 10420 }, { "epoch": 2.6311806256306762, "grad_norm": 0.005935797467827797, "learning_rate": 6.1469895728220654e-06, "loss": 0.0002, "step": 10430 }, { "epoch": 2.6337033299697277, "grad_norm": 0.030999109148979187, "learning_rate": 6.104944500504541e-06, "loss": 0.0002, "step": 10440 }, { "epoch": 2.636226034308779, "grad_norm": 0.13484105467796326, "learning_rate": 6.0628994281870165e-06, "loss": 0.0005, "step": 10450 }, { "epoch": 2.6387487386478305, "grad_norm": 0.0004957011551596224, "learning_rate": 6.020854355869492e-06, "loss": 0.0006, "step": 10460 }, { "epoch": 2.641271442986882, "grad_norm": 0.12348122149705887, "learning_rate": 5.9788092835519675e-06, "loss": 0.0005, "step": 10470 }, { "epoch": 2.6437941473259334, "grad_norm": 9.011943620862439e-05, "learning_rate": 5.936764211234443e-06, "loss": 0.0006, "step": 10480 }, { "epoch": 2.646316851664985, "grad_norm": 0.00026155789964832366, "learning_rate": 5.8947191389169185e-06, "loss": 0.0008, "step": 10490 }, { "epoch": 2.6488395560040363, "grad_norm": 0.0001088874414563179, "learning_rate": 5.852674066599395e-06, "loss": 0.0011, "step": 10500 }, { "epoch": 2.6488395560040363, "eval_loss": 0.0038149829488247633, "eval_runtime": 20.9282, "eval_samples_per_second": 84.193, "eval_steps_per_second": 21.072, "step": 10500 }, { "epoch": 2.6513622603430878, "grad_norm": 4.513943349593319e-05, "learning_rate": 5.81062899428187e-06, "loss": 0.0008, "step": 10510 }, { "epoch": 2.653884964682139, "grad_norm": 0.00011352117144269869, "learning_rate": 5.768583921964346e-06, "loss": 0.0004, "step": 10520 }, { "epoch": 2.6564076690211906, "grad_norm": 8.327852265210822e-05, "learning_rate": 5.726538849646821e-06, "loss": 0.0003, "step": 10530 }, { "epoch": 2.658930373360242, "grad_norm": 0.08921755105257034, "learning_rate": 5.684493777329297e-06, "loss": 0.0014, "step": 10540 }, { "epoch": 2.6614530776992935, "grad_norm": 0.00016910290287341923, "learning_rate": 5.642448705011772e-06, "loss": 0.0001, "step": 10550 }, { "epoch": 2.663975782038345, "grad_norm": 7.930315769044682e-05, "learning_rate": 5.600403632694248e-06, "loss": 0.0001, "step": 10560 }, { "epoch": 2.666498486377397, "grad_norm": 9.457457781536505e-05, "learning_rate": 5.5583585603767234e-06, "loss": 0.0, "step": 10570 }, { "epoch": 2.669021190716448, "grad_norm": 0.00021707512496504933, "learning_rate": 5.5163134880592e-06, "loss": 0.0003, "step": 10580 }, { "epoch": 2.6715438950554997, "grad_norm": 0.001966067822650075, "learning_rate": 5.474268415741675e-06, "loss": 0.0007, "step": 10590 }, { "epoch": 2.6740665993945507, "grad_norm": 0.004528351593762636, "learning_rate": 5.432223343424151e-06, "loss": 0.0005, "step": 10600 }, { "epoch": 2.6740665993945507, "eval_loss": 0.003807534696534276, "eval_runtime": 20.87, "eval_samples_per_second": 84.427, "eval_steps_per_second": 21.131, "step": 10600 }, { "epoch": 2.6765893037336026, "grad_norm": 0.00017803607624955475, "learning_rate": 5.390178271106626e-06, "loss": 0.0005, "step": 10610 }, { "epoch": 2.679112008072654, "grad_norm": 3.115162326139398e-05, "learning_rate": 5.348133198789102e-06, "loss": 0.0, "step": 10620 }, { "epoch": 2.6816347124117055, "grad_norm": 0.07494215667247772, "learning_rate": 5.306088126471577e-06, "loss": 0.0008, "step": 10630 }, { "epoch": 2.684157416750757, "grad_norm": 0.07868482917547226, "learning_rate": 5.264043054154053e-06, "loss": 0.0005, "step": 10640 }, { "epoch": 2.6866801210898084, "grad_norm": 0.01250834483653307, "learning_rate": 5.221997981836528e-06, "loss": 0.0001, "step": 10650 }, { "epoch": 2.68920282542886, "grad_norm": 0.10575691610574722, "learning_rate": 5.179952909519004e-06, "loss": 0.001, "step": 10660 }, { "epoch": 2.6917255297679112, "grad_norm": 0.16051237285137177, "learning_rate": 5.13790783720148e-06, "loss": 0.0006, "step": 10670 }, { "epoch": 2.6942482341069627, "grad_norm": 4.039399209432304e-05, "learning_rate": 5.095862764883956e-06, "loss": 0.0001, "step": 10680 }, { "epoch": 2.696770938446014, "grad_norm": 9.023944585351273e-05, "learning_rate": 5.053817692566431e-06, "loss": 0.0005, "step": 10690 }, { "epoch": 2.6992936427850656, "grad_norm": 0.20219027996063232, "learning_rate": 5.011772620248907e-06, "loss": 0.0003, "step": 10700 }, { "epoch": 2.6992936427850656, "eval_loss": 0.0038071214221417904, "eval_runtime": 20.8909, "eval_samples_per_second": 84.343, "eval_steps_per_second": 21.11, "step": 10700 }, { "epoch": 2.701816347124117, "grad_norm": 0.12794020771980286, "learning_rate": 4.969727547931382e-06, "loss": 0.0014, "step": 10710 }, { "epoch": 2.7043390514631684, "grad_norm": 0.000275536032859236, "learning_rate": 4.927682475613858e-06, "loss": 0.0006, "step": 10720 }, { "epoch": 2.70686175580222, "grad_norm": 0.0002630538656376302, "learning_rate": 4.885637403296333e-06, "loss": 0.0016, "step": 10730 }, { "epoch": 2.7093844601412713, "grad_norm": 0.042821742594242096, "learning_rate": 4.843592330978809e-06, "loss": 0.0001, "step": 10740 }, { "epoch": 2.7119071644803228, "grad_norm": 0.10874561965465546, "learning_rate": 4.801547258661285e-06, "loss": 0.0006, "step": 10750 }, { "epoch": 2.714429868819374, "grad_norm": 0.00025562438531778753, "learning_rate": 4.759502186343761e-06, "loss": 0.0001, "step": 10760 }, { "epoch": 2.7169525731584256, "grad_norm": 0.006827104836702347, "learning_rate": 4.717457114026236e-06, "loss": 0.001, "step": 10770 }, { "epoch": 2.7194752774974775, "grad_norm": 0.005648414604365826, "learning_rate": 4.675412041708712e-06, "loss": 0.0004, "step": 10780 }, { "epoch": 2.7219979818365285, "grad_norm": 0.001025490928441286, "learning_rate": 4.633366969391187e-06, "loss": 0.0018, "step": 10790 }, { "epoch": 2.7245206861755804, "grad_norm": 0.0006745181744918227, "learning_rate": 4.591321897073663e-06, "loss": 0.0002, "step": 10800 }, { "epoch": 2.7245206861755804, "eval_loss": 0.0037325455341488123, "eval_runtime": 20.8462, "eval_samples_per_second": 84.524, "eval_steps_per_second": 21.155, "step": 10800 }, { "epoch": 2.727043390514632, "grad_norm": 0.0065965172834694386, "learning_rate": 4.549276824756138e-06, "loss": 0.0001, "step": 10810 }, { "epoch": 2.7295660948536833, "grad_norm": 0.0002903965360019356, "learning_rate": 4.507231752438614e-06, "loss": 0.0003, "step": 10820 }, { "epoch": 2.7320887991927347, "grad_norm": 0.16553114354610443, "learning_rate": 4.465186680121089e-06, "loss": 0.0006, "step": 10830 }, { "epoch": 2.734611503531786, "grad_norm": 0.0074982005171477795, "learning_rate": 4.423141607803566e-06, "loss": 0.0001, "step": 10840 }, { "epoch": 2.7371342078708376, "grad_norm": 0.0002544449525885284, "learning_rate": 4.381096535486041e-06, "loss": 0.0005, "step": 10850 }, { "epoch": 2.739656912209889, "grad_norm": 0.07092459499835968, "learning_rate": 4.339051463168517e-06, "loss": 0.0005, "step": 10860 }, { "epoch": 2.7421796165489405, "grad_norm": 0.03416803479194641, "learning_rate": 4.297006390850992e-06, "loss": 0.0004, "step": 10870 }, { "epoch": 2.744702320887992, "grad_norm": 0.0010244250297546387, "learning_rate": 4.254961318533468e-06, "loss": 0.0003, "step": 10880 }, { "epoch": 2.7472250252270434, "grad_norm": 0.0003882810124196112, "learning_rate": 4.212916246215943e-06, "loss": 0.0005, "step": 10890 }, { "epoch": 2.749747729566095, "grad_norm": 0.0006057489081285894, "learning_rate": 4.170871173898419e-06, "loss": 0.0009, "step": 10900 }, { "epoch": 2.749747729566095, "eval_loss": 0.0038078054785728455, "eval_runtime": 20.8522, "eval_samples_per_second": 84.499, "eval_steps_per_second": 21.149, "step": 10900 }, { "epoch": 2.7522704339051463, "grad_norm": 0.00148275145329535, "learning_rate": 4.128826101580894e-06, "loss": 0.0001, "step": 10910 }, { "epoch": 2.7547931382441977, "grad_norm": 0.062449101358652115, "learning_rate": 4.086781029263371e-06, "loss": 0.0017, "step": 10920 }, { "epoch": 2.757315842583249, "grad_norm": 0.011098313145339489, "learning_rate": 4.044735956945846e-06, "loss": 0.0003, "step": 10930 }, { "epoch": 2.7598385469223006, "grad_norm": 0.0001871915883384645, "learning_rate": 4.002690884628322e-06, "loss": 0.0001, "step": 10940 }, { "epoch": 2.762361251261352, "grad_norm": 0.00037126371171325445, "learning_rate": 3.960645812310797e-06, "loss": 0.0, "step": 10950 }, { "epoch": 2.7648839556004035, "grad_norm": 0.0006047156057320535, "learning_rate": 3.918600739993273e-06, "loss": 0.0012, "step": 10960 }, { "epoch": 2.7674066599394553, "grad_norm": 0.00014337015454657376, "learning_rate": 3.876555667675748e-06, "loss": 0.0004, "step": 10970 }, { "epoch": 2.7699293642785063, "grad_norm": 0.12640614807605743, "learning_rate": 3.834510595358224e-06, "loss": 0.0026, "step": 10980 }, { "epoch": 2.772452068617558, "grad_norm": 0.00037311791675165296, "learning_rate": 3.7924655230406996e-06, "loss": 0.0, "step": 10990 }, { "epoch": 2.774974772956609, "grad_norm": 0.00015324597188737243, "learning_rate": 3.750420450723175e-06, "loss": 0.0009, "step": 11000 }, { "epoch": 2.774974772956609, "eval_loss": 0.0037996473256498575, "eval_runtime": 20.8643, "eval_samples_per_second": 84.451, "eval_steps_per_second": 21.137, "step": 11000 }, { "epoch": 2.777497477295661, "grad_norm": 0.10045702010393143, "learning_rate": 3.7083753784056515e-06, "loss": 0.0002, "step": 11010 }, { "epoch": 2.7800201816347125, "grad_norm": 0.00010195528011536226, "learning_rate": 3.666330306088127e-06, "loss": 0.0012, "step": 11020 }, { "epoch": 2.782542885973764, "grad_norm": 0.00022041058400645852, "learning_rate": 3.6242852337706025e-06, "loss": 0.0002, "step": 11030 }, { "epoch": 2.7850655903128154, "grad_norm": 0.18306997418403625, "learning_rate": 3.582240161453078e-06, "loss": 0.0016, "step": 11040 }, { "epoch": 2.787588294651867, "grad_norm": 6.177197064971551e-05, "learning_rate": 3.5401950891355535e-06, "loss": 0.0, "step": 11050 }, { "epoch": 2.7901109989909183, "grad_norm": 0.0007134904735721648, "learning_rate": 3.498150016818029e-06, "loss": 0.0001, "step": 11060 }, { "epoch": 2.7926337033299697, "grad_norm": 0.026096561923623085, "learning_rate": 3.4561049445005045e-06, "loss": 0.0004, "step": 11070 }, { "epoch": 2.795156407669021, "grad_norm": 0.0005445599090307951, "learning_rate": 3.41405987218298e-06, "loss": 0.0, "step": 11080 }, { "epoch": 2.7976791120080726, "grad_norm": 0.00022800432634539902, "learning_rate": 3.3720147998654564e-06, "loss": 0.0, "step": 11090 }, { "epoch": 2.800201816347124, "grad_norm": 0.00023530615726485848, "learning_rate": 3.329969727547932e-06, "loss": 0.0004, "step": 11100 }, { "epoch": 2.800201816347124, "eval_loss": 0.003767798189073801, "eval_runtime": 20.8684, "eval_samples_per_second": 84.434, "eval_steps_per_second": 21.132, "step": 11100 }, { "epoch": 2.8027245206861755, "grad_norm": 9.487225906923413e-05, "learning_rate": 3.2879246552304074e-06, "loss": 0.0, "step": 11110 }, { "epoch": 2.805247225025227, "grad_norm": 0.07703667134046555, "learning_rate": 3.245879582912883e-06, "loss": 0.0009, "step": 11120 }, { "epoch": 2.8077699293642784, "grad_norm": 0.09232014417648315, "learning_rate": 3.2038345105953585e-06, "loss": 0.0003, "step": 11130 }, { "epoch": 2.81029263370333, "grad_norm": 0.0033814776688814163, "learning_rate": 3.161789438277834e-06, "loss": 0.0022, "step": 11140 }, { "epoch": 2.8128153380423813, "grad_norm": 0.003383078845217824, "learning_rate": 3.11974436596031e-06, "loss": 0.0018, "step": 11150 }, { "epoch": 2.815338042381433, "grad_norm": 0.0075791082344949245, "learning_rate": 3.0776992936427854e-06, "loss": 0.0001, "step": 11160 }, { "epoch": 2.817860746720484, "grad_norm": 0.0005082746502012014, "learning_rate": 3.035654221325261e-06, "loss": 0.0004, "step": 11170 }, { "epoch": 2.820383451059536, "grad_norm": 1.3200211469666101e-05, "learning_rate": 2.9936091490077364e-06, "loss": 0.0003, "step": 11180 }, { "epoch": 2.822906155398587, "grad_norm": 0.0003332770138513297, "learning_rate": 2.951564076690212e-06, "loss": 0.0, "step": 11190 }, { "epoch": 2.825428859737639, "grad_norm": 0.15700918436050415, "learning_rate": 2.909519004372688e-06, "loss": 0.0019, "step": 11200 }, { "epoch": 2.825428859737639, "eval_loss": 0.0037862148601561785, "eval_runtime": 20.8605, "eval_samples_per_second": 84.466, "eval_steps_per_second": 21.14, "step": 11200 }, { "epoch": 2.8279515640766903, "grad_norm": 0.00048214950948022306, "learning_rate": 2.8674739320551634e-06, "loss": 0.0001, "step": 11210 }, { "epoch": 2.830474268415742, "grad_norm": 0.0002856640494428575, "learning_rate": 2.825428859737639e-06, "loss": 0.0009, "step": 11220 }, { "epoch": 2.8329969727547932, "grad_norm": 0.00010505354293854907, "learning_rate": 2.7833837874201144e-06, "loss": 0.0001, "step": 11230 }, { "epoch": 2.8355196770938447, "grad_norm": 0.0005396956112235785, "learning_rate": 2.7413387151025904e-06, "loss": 0.0006, "step": 11240 }, { "epoch": 2.838042381432896, "grad_norm": 0.03256835415959358, "learning_rate": 2.699293642785066e-06, "loss": 0.0006, "step": 11250 }, { "epoch": 2.8405650857719476, "grad_norm": 4.0616308979224414e-05, "learning_rate": 2.6572485704675414e-06, "loss": 0.0005, "step": 11260 }, { "epoch": 2.843087790110999, "grad_norm": 0.09430497884750366, "learning_rate": 2.615203498150017e-06, "loss": 0.0003, "step": 11270 }, { "epoch": 2.8456104944500504, "grad_norm": 0.0003849182394333184, "learning_rate": 2.573158425832493e-06, "loss": 0.0, "step": 11280 }, { "epoch": 2.848133198789102, "grad_norm": 0.0575651191174984, "learning_rate": 2.5311133535149683e-06, "loss": 0.0001, "step": 11290 }, { "epoch": 2.8506559031281533, "grad_norm": 0.0015473919920623302, "learning_rate": 2.489068281197444e-06, "loss": 0.0007, "step": 11300 }, { "epoch": 2.8506559031281533, "eval_loss": 0.0037519715260714293, "eval_runtime": 20.8758, "eval_samples_per_second": 84.404, "eval_steps_per_second": 21.125, "step": 11300 }, { "epoch": 2.8531786074672048, "grad_norm": 0.000955607567448169, "learning_rate": 2.4470232088799194e-06, "loss": 0.0011, "step": 11310 }, { "epoch": 2.855701311806256, "grad_norm": 4.5967324695084244e-05, "learning_rate": 2.4049781365623953e-06, "loss": 0.0003, "step": 11320 }, { "epoch": 2.8582240161453076, "grad_norm": 0.0006287918658927083, "learning_rate": 2.362933064244871e-06, "loss": 0.0008, "step": 11330 }, { "epoch": 2.860746720484359, "grad_norm": 0.0023038501385599375, "learning_rate": 2.3208879919273463e-06, "loss": 0.0002, "step": 11340 }, { "epoch": 2.863269424823411, "grad_norm": 0.00017235818086192012, "learning_rate": 2.278842919609822e-06, "loss": 0.0001, "step": 11350 }, { "epoch": 2.865792129162462, "grad_norm": 0.0003580110496841371, "learning_rate": 2.2367978472922973e-06, "loss": 0.0001, "step": 11360 }, { "epoch": 2.868314833501514, "grad_norm": 0.0013928780099377036, "learning_rate": 2.1947527749747733e-06, "loss": 0.0003, "step": 11370 }, { "epoch": 2.870837537840565, "grad_norm": 0.11936355382204056, "learning_rate": 2.152707702657249e-06, "loss": 0.001, "step": 11380 }, { "epoch": 2.8733602421796167, "grad_norm": 0.0003929549129679799, "learning_rate": 2.1106626303397243e-06, "loss": 0.0, "step": 11390 }, { "epoch": 2.875882946518668, "grad_norm": 0.00011808017734438181, "learning_rate": 2.0686175580222e-06, "loss": 0.001, "step": 11400 }, { "epoch": 2.875882946518668, "eval_loss": 0.0037574958987534046, "eval_runtime": 20.8424, "eval_samples_per_second": 84.539, "eval_steps_per_second": 21.159, "step": 11400 }, { "epoch": 2.8784056508577196, "grad_norm": 0.1294584721326828, "learning_rate": 2.0265724857046758e-06, "loss": 0.0006, "step": 11410 }, { "epoch": 2.880928355196771, "grad_norm": 0.0016598176443949342, "learning_rate": 1.9845274133871513e-06, "loss": 0.0009, "step": 11420 }, { "epoch": 2.8834510595358225, "grad_norm": 1.5323972547776066e-05, "learning_rate": 1.9424823410696268e-06, "loss": 0.0, "step": 11430 }, { "epoch": 2.885973763874874, "grad_norm": 0.000323007203405723, "learning_rate": 1.9004372687521023e-06, "loss": 0.0001, "step": 11440 }, { "epoch": 2.8884964682139254, "grad_norm": 0.0010820828611031175, "learning_rate": 1.858392196434578e-06, "loss": 0.001, "step": 11450 }, { "epoch": 2.891019172552977, "grad_norm": 0.009386632591485977, "learning_rate": 1.8163471241170535e-06, "loss": 0.0, "step": 11460 }, { "epoch": 2.8935418768920282, "grad_norm": 0.005150569602847099, "learning_rate": 1.7743020517995292e-06, "loss": 0.0001, "step": 11470 }, { "epoch": 2.8960645812310797, "grad_norm": 5.517240060726181e-05, "learning_rate": 1.7322569794820048e-06, "loss": 0.0011, "step": 11480 }, { "epoch": 2.898587285570131, "grad_norm": 0.16206330060958862, "learning_rate": 1.6902119071644805e-06, "loss": 0.001, "step": 11490 }, { "epoch": 2.9011099899091826, "grad_norm": 0.00011850109876831993, "learning_rate": 1.648166834846956e-06, "loss": 0.001, "step": 11500 }, { "epoch": 2.9011099899091826, "eval_loss": 0.0037340966518968344, "eval_runtime": 20.8481, "eval_samples_per_second": 84.516, "eval_steps_per_second": 21.153, "step": 11500 }, { "epoch": 2.903632694248234, "grad_norm": 0.0002364334650337696, "learning_rate": 1.6061217625294317e-06, "loss": 0.0, "step": 11510 }, { "epoch": 2.9061553985872854, "grad_norm": 0.005173501092940569, "learning_rate": 1.5640766902119072e-06, "loss": 0.0005, "step": 11520 }, { "epoch": 2.908678102926337, "grad_norm": 0.08454468101263046, "learning_rate": 1.522031617894383e-06, "loss": 0.0001, "step": 11530 }, { "epoch": 2.9112008072653888, "grad_norm": 0.018247609958052635, "learning_rate": 1.4799865455768585e-06, "loss": 0.0004, "step": 11540 }, { "epoch": 2.9137235116044398, "grad_norm": 0.08170945197343826, "learning_rate": 1.4379414732593342e-06, "loss": 0.0005, "step": 11550 }, { "epoch": 2.9162462159434916, "grad_norm": 8.340697240782902e-05, "learning_rate": 1.3958964009418097e-06, "loss": 0.0006, "step": 11560 }, { "epoch": 2.9187689202825426, "grad_norm": 0.0001018949769786559, "learning_rate": 1.3538513286242854e-06, "loss": 0.0009, "step": 11570 }, { "epoch": 2.9212916246215945, "grad_norm": 0.0001912551961140707, "learning_rate": 1.311806256306761e-06, "loss": 0.0002, "step": 11580 }, { "epoch": 2.923814328960646, "grad_norm": 0.0019354906398802996, "learning_rate": 1.2697611839892367e-06, "loss": 0.0, "step": 11590 }, { "epoch": 2.9263370332996974, "grad_norm": 0.05732259526848793, "learning_rate": 1.2277161116717122e-06, "loss": 0.0013, "step": 11600 }, { "epoch": 2.9263370332996974, "eval_loss": 0.0037303089629858732, "eval_runtime": 20.8696, "eval_samples_per_second": 84.429, "eval_steps_per_second": 21.131, "step": 11600 }, { "epoch": 2.928859737638749, "grad_norm": 0.04633721709251404, "learning_rate": 1.185671039354188e-06, "loss": 0.0002, "step": 11610 }, { "epoch": 2.9313824419778003, "grad_norm": 0.000182849689736031, "learning_rate": 1.1436259670366634e-06, "loss": 0.0, "step": 11620 }, { "epoch": 2.9339051463168517, "grad_norm": 9.736415813677013e-05, "learning_rate": 1.1015808947191391e-06, "loss": 0.0002, "step": 11630 }, { "epoch": 2.936427850655903, "grad_norm": 0.00046163774095475674, "learning_rate": 1.0595358224016146e-06, "loss": 0.0011, "step": 11640 }, { "epoch": 2.9389505549949546, "grad_norm": 5.9953119489364326e-05, "learning_rate": 1.0174907500840902e-06, "loss": 0.0003, "step": 11650 }, { "epoch": 2.941473259334006, "grad_norm": 0.00025017280131578445, "learning_rate": 9.754456777665659e-07, "loss": 0.0011, "step": 11660 }, { "epoch": 2.9439959636730575, "grad_norm": 0.142095148563385, "learning_rate": 9.334006054490415e-07, "loss": 0.0005, "step": 11670 }, { "epoch": 2.946518668012109, "grad_norm": 0.00014950388867873698, "learning_rate": 8.91355533131517e-07, "loss": 0.0011, "step": 11680 }, { "epoch": 2.9490413723511604, "grad_norm": 7.325205660890788e-05, "learning_rate": 8.493104608139925e-07, "loss": 0.0014, "step": 11690 }, { "epoch": 2.951564076690212, "grad_norm": 0.00012016925757052377, "learning_rate": 8.072653884964682e-07, "loss": 0.0014, "step": 11700 }, { "epoch": 2.951564076690212, "eval_loss": 0.0037244223058223724, "eval_runtime": 20.8534, "eval_samples_per_second": 84.495, "eval_steps_per_second": 21.148, "step": 11700 }, { "epoch": 2.9540867810292633, "grad_norm": 7.947654376039281e-05, "learning_rate": 7.652203161789439e-07, "loss": 0.0004, "step": 11710 }, { "epoch": 2.9566094853683147, "grad_norm": 0.0002958715776912868, "learning_rate": 7.231752438614195e-07, "loss": 0.0001, "step": 11720 }, { "epoch": 2.959132189707366, "grad_norm": 7.174400525400415e-05, "learning_rate": 6.811301715438951e-07, "loss": 0.0001, "step": 11730 }, { "epoch": 2.9616548940464176, "grad_norm": 0.00031969661358743906, "learning_rate": 6.390850992263707e-07, "loss": 0.0005, "step": 11740 }, { "epoch": 2.9641775983854695, "grad_norm": 0.13676594197750092, "learning_rate": 5.970400269088463e-07, "loss": 0.0006, "step": 11750 }, { "epoch": 2.9667003027245205, "grad_norm": 3.757096783374436e-05, "learning_rate": 5.54994954591322e-07, "loss": 0.0013, "step": 11760 }, { "epoch": 2.9692230070635723, "grad_norm": 4.601416731020436e-05, "learning_rate": 5.129498822737976e-07, "loss": 0.001, "step": 11770 }, { "epoch": 2.9717457114026233, "grad_norm": 0.0027868992183357477, "learning_rate": 4.7090480995627313e-07, "loss": 0.0015, "step": 11780 }, { "epoch": 2.974268415741675, "grad_norm": 0.010421237908303738, "learning_rate": 4.288597376387487e-07, "loss": 0.0002, "step": 11790 }, { "epoch": 2.9767911200807267, "grad_norm": 0.001120994915254414, "learning_rate": 3.8681466532122437e-07, "loss": 0.0006, "step": 11800 }, { "epoch": 2.9767911200807267, "eval_loss": 0.0037176210898905993, "eval_runtime": 20.8732, "eval_samples_per_second": 84.415, "eval_steps_per_second": 21.128, "step": 11800 } ], "logging_steps": 10, "max_steps": 11892, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }