{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0669380370872907, "eval_steps": 800, "global_step": 2360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004522840343735866, "eval_loss": 2.6005213260650635, "eval_runtime": 106.0609, "eval_samples_per_second": 20.79, "eval_steps_per_second": 5.205, "step": 1 }, { "epoch": 0.004522840343735866, "grad_norm": 4.152566909790039, "learning_rate": 2.5e-06, "loss": 2.461, "step": 10 }, { "epoch": 0.009045680687471733, "grad_norm": 3.253302812576294, "learning_rate": 5e-06, "loss": 2.1663, "step": 20 }, { "epoch": 0.013568521031207599, "grad_norm": 3.0376248359680176, "learning_rate": 7.5e-06, "loss": 1.5723, "step": 30 }, { "epoch": 0.018091361374943465, "grad_norm": 1.5493546724319458, "learning_rate": 1e-05, "loss": 1.0123, "step": 40 }, { "epoch": 0.022614201718679332, "grad_norm": 0.7575089335441589, "learning_rate": 1.25e-05, "loss": 0.7936, "step": 50 }, { "epoch": 0.027137042062415198, "grad_norm": 1.0295528173446655, "learning_rate": 1.5e-05, "loss": 0.688, "step": 60 }, { "epoch": 0.031659882406151064, "grad_norm": 0.5441007614135742, "learning_rate": 1.75e-05, "loss": 0.6524, "step": 70 }, { "epoch": 0.03618272274988693, "grad_norm": 0.8951964974403381, "learning_rate": 2e-05, "loss": 0.6575, "step": 80 }, { "epoch": 0.0407055630936228, "grad_norm": 0.7289975881576538, "learning_rate": 2.25e-05, "loss": 0.6334, "step": 90 }, { "epoch": 0.045228403437358664, "grad_norm": 0.8988147377967834, "learning_rate": 2.5e-05, "loss": 0.6591, "step": 100 }, { "epoch": 0.04975124378109453, "grad_norm": 0.4641079902648926, "learning_rate": 2.7500000000000004e-05, "loss": 0.6724, "step": 110 }, { "epoch": 0.054274084124830396, "grad_norm": 0.6655707955360413, "learning_rate": 3e-05, "loss": 0.6467, "step": 120 }, { "epoch": 0.05879692446856626, "grad_norm": 0.7066192030906677, "learning_rate": 3.2500000000000004e-05, "loss": 0.6202, "step": 130 }, { "epoch": 0.06331976481230213, "grad_norm": 0.9096384644508362, "learning_rate": 3.5e-05, "loss": 0.6344, "step": 140 }, { "epoch": 0.06784260515603799, "grad_norm": 0.6074104905128479, "learning_rate": 3.7500000000000003e-05, "loss": 0.5695, "step": 150 }, { "epoch": 0.07236544549977386, "grad_norm": 0.8516980409622192, "learning_rate": 4e-05, "loss": 0.5924, "step": 160 }, { "epoch": 0.07688828584350972, "grad_norm": 0.7963458299636841, "learning_rate": 4.25e-05, "loss": 0.6169, "step": 170 }, { "epoch": 0.0814111261872456, "grad_norm": 0.8972164988517761, "learning_rate": 4.5e-05, "loss": 0.5872, "step": 180 }, { "epoch": 0.08593396653098145, "grad_norm": 4.4737162590026855, "learning_rate": 4.75e-05, "loss": 0.6164, "step": 190 }, { "epoch": 0.09045680687471733, "grad_norm": 0.6346756815910339, "learning_rate": 5e-05, "loss": 0.5744, "step": 200 }, { "epoch": 0.09497964721845319, "grad_norm": 0.8093542456626892, "learning_rate": 5.25e-05, "loss": 0.5741, "step": 210 }, { "epoch": 0.09950248756218906, "grad_norm": 0.8128954768180847, "learning_rate": 5.500000000000001e-05, "loss": 0.5842, "step": 220 }, { "epoch": 0.10402532790592492, "grad_norm": 0.5212199687957764, "learning_rate": 5.7499999999999995e-05, "loss": 0.5368, "step": 230 }, { "epoch": 0.10854816824966079, "grad_norm": 0.7929616570472717, "learning_rate": 6e-05, "loss": 0.5645, "step": 240 }, { "epoch": 0.11307100859339665, "grad_norm": 0.6961472630500793, "learning_rate": 6.25e-05, "loss": 0.5541, "step": 250 }, { "epoch": 0.11759384893713253, "grad_norm": 0.6359185576438904, "learning_rate": 6.500000000000001e-05, "loss": 0.5642, "step": 260 }, { "epoch": 0.12211668928086838, "grad_norm": 0.6574355959892273, "learning_rate": 6.750000000000001e-05, "loss": 0.5556, "step": 270 }, { "epoch": 0.12663952962460426, "grad_norm": 0.5738745331764221, "learning_rate": 7e-05, "loss": 0.5582, "step": 280 }, { "epoch": 0.13116236996834013, "grad_norm": 0.3199772834777832, "learning_rate": 7.25e-05, "loss": 0.5796, "step": 290 }, { "epoch": 0.13568521031207598, "grad_norm": 0.3531240224838257, "learning_rate": 7.500000000000001e-05, "loss": 0.5612, "step": 300 }, { "epoch": 0.14020805065581185, "grad_norm": 0.5375868678092957, "learning_rate": 7.75e-05, "loss": 0.5645, "step": 310 }, { "epoch": 0.14473089099954772, "grad_norm": 0.2688968777656555, "learning_rate": 8e-05, "loss": 0.5485, "step": 320 }, { "epoch": 0.14925373134328357, "grad_norm": 0.6451207995414734, "learning_rate": 8.25e-05, "loss": 0.5242, "step": 330 }, { "epoch": 0.15377657168701944, "grad_norm": 2.1042656898498535, "learning_rate": 8.5e-05, "loss": 0.5639, "step": 340 }, { "epoch": 0.15829941203075532, "grad_norm": 0.38682234287261963, "learning_rate": 8.75e-05, "loss": 0.5417, "step": 350 }, { "epoch": 0.1628222523744912, "grad_norm": 0.5062561631202698, "learning_rate": 9e-05, "loss": 0.5509, "step": 360 }, { "epoch": 0.16734509271822703, "grad_norm": 0.6663705706596375, "learning_rate": 9.250000000000001e-05, "loss": 0.557, "step": 370 }, { "epoch": 0.1718679330619629, "grad_norm": 0.4430944323539734, "learning_rate": 9.5e-05, "loss": 0.5711, "step": 380 }, { "epoch": 0.17639077340569878, "grad_norm": 0.6567730903625488, "learning_rate": 9.75e-05, "loss": 0.5437, "step": 390 }, { "epoch": 0.18091361374943465, "grad_norm": 0.44569894671440125, "learning_rate": 0.0001, "loss": 0.5124, "step": 400 }, { "epoch": 0.1854364540931705, "grad_norm": 3.6767895221710205, "learning_rate": 9.999957281897735e-05, "loss": 0.5489, "step": 410 }, { "epoch": 0.18995929443690637, "grad_norm": 1.6086912155151367, "learning_rate": 9.999829128320874e-05, "loss": 0.5508, "step": 420 }, { "epoch": 0.19448213478064225, "grad_norm": 0.42233482003211975, "learning_rate": 9.999615541459207e-05, "loss": 0.5205, "step": 430 }, { "epoch": 0.19900497512437812, "grad_norm": 0.5907697677612305, "learning_rate": 9.999316524962345e-05, "loss": 0.5201, "step": 440 }, { "epoch": 0.20352781546811397, "grad_norm": 0.5699201226234436, "learning_rate": 9.998932083939656e-05, "loss": 0.5263, "step": 450 }, { "epoch": 0.20805065581184984, "grad_norm": 0.8701225519180298, "learning_rate": 9.998462224960175e-05, "loss": 0.5421, "step": 460 }, { "epoch": 0.2125734961555857, "grad_norm": 0.38067907094955444, "learning_rate": 9.997906956052494e-05, "loss": 0.5388, "step": 470 }, { "epoch": 0.21709633649932158, "grad_norm": 0.3497803509235382, "learning_rate": 9.997266286704631e-05, "loss": 0.5216, "step": 480 }, { "epoch": 0.22161917684305743, "grad_norm": 1.2132240533828735, "learning_rate": 9.996540227863854e-05, "loss": 0.5204, "step": 490 }, { "epoch": 0.2261420171867933, "grad_norm": 0.30633747577667236, "learning_rate": 9.995728791936504e-05, "loss": 0.5494, "step": 500 }, { "epoch": 0.23066485753052918, "grad_norm": 0.6719753742218018, "learning_rate": 9.994831992787788e-05, "loss": 0.5489, "step": 510 }, { "epoch": 0.23518769787426505, "grad_norm": 0.5767594575881958, "learning_rate": 9.993849845741524e-05, "loss": 0.5324, "step": 520 }, { "epoch": 0.2397105382180009, "grad_norm": 0.236255943775177, "learning_rate": 9.992782367579899e-05, "loss": 0.5181, "step": 530 }, { "epoch": 0.24423337856173677, "grad_norm": 0.788173258304596, "learning_rate": 9.991629576543163e-05, "loss": 0.5303, "step": 540 }, { "epoch": 0.24875621890547264, "grad_norm": 0.2152377963066101, "learning_rate": 9.990391492329341e-05, "loss": 0.5284, "step": 550 }, { "epoch": 0.2532790592492085, "grad_norm": 0.7022669911384583, "learning_rate": 9.989068136093873e-05, "loss": 0.5282, "step": 560 }, { "epoch": 0.25780189959294436, "grad_norm": 0.5153821110725403, "learning_rate": 9.987659530449268e-05, "loss": 0.5221, "step": 570 }, { "epoch": 0.26232473993668026, "grad_norm": 0.3221074640750885, "learning_rate": 9.986165699464705e-05, "loss": 0.5202, "step": 580 }, { "epoch": 0.2668475802804161, "grad_norm": 0.22985632717609406, "learning_rate": 9.98458666866564e-05, "loss": 0.5148, "step": 590 }, { "epoch": 0.27137042062415195, "grad_norm": 0.5174903869628906, "learning_rate": 9.98292246503335e-05, "loss": 0.5389, "step": 600 }, { "epoch": 0.27589326096788785, "grad_norm": 0.41488873958587646, "learning_rate": 9.981173117004484e-05, "loss": 0.514, "step": 610 }, { "epoch": 0.2804161013116237, "grad_norm": 0.4851066768169403, "learning_rate": 9.979338654470569e-05, "loss": 0.5268, "step": 620 }, { "epoch": 0.28493894165535955, "grad_norm": 0.6160866618156433, "learning_rate": 9.977419108777514e-05, "loss": 0.5216, "step": 630 }, { "epoch": 0.28946178199909545, "grad_norm": 0.47816914319992065, "learning_rate": 9.975414512725057e-05, "loss": 0.5058, "step": 640 }, { "epoch": 0.2939846223428313, "grad_norm": 0.47290515899658203, "learning_rate": 9.973324900566213e-05, "loss": 0.5233, "step": 650 }, { "epoch": 0.29850746268656714, "grad_norm": 0.5466296076774597, "learning_rate": 9.97115030800669e-05, "loss": 0.5208, "step": 660 }, { "epoch": 0.30303030303030304, "grad_norm": 0.1515071988105774, "learning_rate": 9.968890772204271e-05, "loss": 0.5153, "step": 670 }, { "epoch": 0.3075531433740389, "grad_norm": 0.22212214767932892, "learning_rate": 9.966546331768191e-05, "loss": 0.4956, "step": 680 }, { "epoch": 0.3120759837177748, "grad_norm": 0.5279558300971985, "learning_rate": 9.96411702675847e-05, "loss": 0.5137, "step": 690 }, { "epoch": 0.31659882406151063, "grad_norm": 0.49932244420051575, "learning_rate": 9.961602898685226e-05, "loss": 0.5226, "step": 700 }, { "epoch": 0.3211216644052465, "grad_norm": 0.4041373133659363, "learning_rate": 9.959003990507972e-05, "loss": 0.5063, "step": 710 }, { "epoch": 0.3256445047489824, "grad_norm": 0.22287319600582123, "learning_rate": 9.956320346634876e-05, "loss": 0.4965, "step": 720 }, { "epoch": 0.3301673450927182, "grad_norm": 0.379319429397583, "learning_rate": 9.953552012922012e-05, "loss": 0.4926, "step": 730 }, { "epoch": 0.33469018543645407, "grad_norm": 0.4465825855731964, "learning_rate": 9.950699036672559e-05, "loss": 0.5295, "step": 740 }, { "epoch": 0.33921302578018997, "grad_norm": 0.8027335405349731, "learning_rate": 9.947761466636014e-05, "loss": 0.5249, "step": 750 }, { "epoch": 0.3437358661239258, "grad_norm": 0.4452328383922577, "learning_rate": 9.944739353007344e-05, "loss": 0.5168, "step": 760 }, { "epoch": 0.3482587064676617, "grad_norm": 0.5175443291664124, "learning_rate": 9.941632747426129e-05, "loss": 0.5063, "step": 770 }, { "epoch": 0.35278154681139756, "grad_norm": 0.6492401361465454, "learning_rate": 9.938441702975689e-05, "loss": 0.5028, "step": 780 }, { "epoch": 0.3573043871551334, "grad_norm": 0.28473520278930664, "learning_rate": 9.93516627418217e-05, "loss": 0.502, "step": 790 }, { "epoch": 0.3618272274988693, "grad_norm": 0.29608818888664246, "learning_rate": 9.931806517013612e-05, "loss": 0.503, "step": 800 }, { "epoch": 0.3618272274988693, "eval_loss": 0.5416414737701416, "eval_runtime": 106.1622, "eval_samples_per_second": 20.77, "eval_steps_per_second": 5.2, "step": 800 }, { "epoch": 0.36635006784260515, "grad_norm": 0.13932587206363678, "learning_rate": 9.928362488878996e-05, "loss": 0.5012, "step": 810 }, { "epoch": 0.370872908186341, "grad_norm": 0.5137602686882019, "learning_rate": 9.92483424862726e-05, "loss": 0.5183, "step": 820 }, { "epoch": 0.3753957485300769, "grad_norm": 0.33779120445251465, "learning_rate": 9.921221856546293e-05, "loss": 0.5007, "step": 830 }, { "epoch": 0.37991858887381275, "grad_norm": 0.23543624579906464, "learning_rate": 9.917525374361912e-05, "loss": 0.5078, "step": 840 }, { "epoch": 0.38444142921754865, "grad_norm": 0.4530757665634155, "learning_rate": 9.913744865236798e-05, "loss": 0.519, "step": 850 }, { "epoch": 0.3889642695612845, "grad_norm": 0.29827597737312317, "learning_rate": 9.90988039376942e-05, "loss": 0.5017, "step": 860 }, { "epoch": 0.39348710990502034, "grad_norm": 0.8379443883895874, "learning_rate": 9.905932025992932e-05, "loss": 0.517, "step": 870 }, { "epoch": 0.39800995024875624, "grad_norm": 0.3807092607021332, "learning_rate": 9.901899829374047e-05, "loss": 0.5118, "step": 880 }, { "epoch": 0.4025327905924921, "grad_norm": 0.3721909523010254, "learning_rate": 9.897783872811882e-05, "loss": 0.512, "step": 890 }, { "epoch": 0.40705563093622793, "grad_norm": 0.16293685138225555, "learning_rate": 9.893584226636772e-05, "loss": 0.4883, "step": 900 }, { "epoch": 0.41157847127996383, "grad_norm": 0.2745698094367981, "learning_rate": 9.88930096260909e-05, "loss": 0.4878, "step": 910 }, { "epoch": 0.4161013116236997, "grad_norm": 0.2861226499080658, "learning_rate": 9.884934153917997e-05, "loss": 0.4861, "step": 920 }, { "epoch": 0.4206241519674355, "grad_norm": 0.14675496518611908, "learning_rate": 9.880483875180205e-05, "loss": 0.502, "step": 930 }, { "epoch": 0.4251469923111714, "grad_norm": 0.3355661928653717, "learning_rate": 9.8759502024387e-05, "loss": 0.4769, "step": 940 }, { "epoch": 0.42966983265490727, "grad_norm": 0.3499309718608856, "learning_rate": 9.871333213161438e-05, "loss": 0.5044, "step": 950 }, { "epoch": 0.43419267299864317, "grad_norm": 0.13604167103767395, "learning_rate": 9.86663298624003e-05, "loss": 0.4837, "step": 960 }, { "epoch": 0.438715513342379, "grad_norm": 0.42066818475723267, "learning_rate": 9.861849601988383e-05, "loss": 0.4768, "step": 970 }, { "epoch": 0.44323835368611486, "grad_norm": 0.2138914316892624, "learning_rate": 9.856983142141339e-05, "loss": 0.5059, "step": 980 }, { "epoch": 0.44776119402985076, "grad_norm": 0.3707284927368164, "learning_rate": 9.852033689853267e-05, "loss": 0.5059, "step": 990 }, { "epoch": 0.4522840343735866, "grad_norm": 0.6477210521697998, "learning_rate": 9.847001329696653e-05, "loss": 0.4977, "step": 1000 }, { "epoch": 0.45680687471732245, "grad_norm": 0.14800307154655457, "learning_rate": 9.841886147660645e-05, "loss": 0.4976, "step": 1010 }, { "epoch": 0.46132971506105835, "grad_norm": 0.47785401344299316, "learning_rate": 9.836688231149592e-05, "loss": 0.5012, "step": 1020 }, { "epoch": 0.4658525554047942, "grad_norm": 0.12753424048423767, "learning_rate": 9.831407668981546e-05, "loss": 0.4982, "step": 1030 }, { "epoch": 0.4703753957485301, "grad_norm": 0.2438424527645111, "learning_rate": 9.826044551386744e-05, "loss": 0.5074, "step": 1040 }, { "epoch": 0.47489823609226595, "grad_norm": 1.7088356018066406, "learning_rate": 9.820598970006069e-05, "loss": 0.5032, "step": 1050 }, { "epoch": 0.4794210764360018, "grad_norm": 0.4954736530780792, "learning_rate": 9.815071017889482e-05, "loss": 0.5021, "step": 1060 }, { "epoch": 0.4839439167797377, "grad_norm": 0.2744172513484955, "learning_rate": 9.809460789494432e-05, "loss": 0.5182, "step": 1070 }, { "epoch": 0.48846675712347354, "grad_norm": 0.3839784264564514, "learning_rate": 9.803768380684242e-05, "loss": 0.4917, "step": 1080 }, { "epoch": 0.4929895974672094, "grad_norm": 0.1647842675447464, "learning_rate": 9.797993888726473e-05, "loss": 0.5027, "step": 1090 }, { "epoch": 0.4975124378109453, "grad_norm": 0.3968697190284729, "learning_rate": 9.792137412291265e-05, "loss": 0.4932, "step": 1100 }, { "epoch": 0.5020352781546812, "grad_norm": 0.28644707798957825, "learning_rate": 9.786199051449636e-05, "loss": 0.4935, "step": 1110 }, { "epoch": 0.506558118498417, "grad_norm": 0.4823949933052063, "learning_rate": 9.780178907671789e-05, "loss": 0.4999, "step": 1120 }, { "epoch": 0.5110809588421529, "grad_norm": 0.4848293364048004, "learning_rate": 9.774077083825372e-05, "loss": 0.5054, "step": 1130 }, { "epoch": 0.5156037991858887, "grad_norm": 0.15839064121246338, "learning_rate": 9.767893684173721e-05, "loss": 0.4789, "step": 1140 }, { "epoch": 0.5201266395296246, "grad_norm": 0.34784838557243347, "learning_rate": 9.761628814374073e-05, "loss": 0.4942, "step": 1150 }, { "epoch": 0.5246494798733605, "grad_norm": 0.18030278384685516, "learning_rate": 9.755282581475769e-05, "loss": 0.4813, "step": 1160 }, { "epoch": 0.5291723202170964, "grad_norm": 0.43726080656051636, "learning_rate": 9.748855093918417e-05, "loss": 0.5017, "step": 1170 }, { "epoch": 0.5336951605608322, "grad_norm": 0.22014489769935608, "learning_rate": 9.742346461530048e-05, "loss": 0.5077, "step": 1180 }, { "epoch": 0.5382180009045681, "grad_norm": 0.2954523265361786, "learning_rate": 9.735756795525231e-05, "loss": 0.4936, "step": 1190 }, { "epoch": 0.5427408412483039, "grad_norm": 0.1660991609096527, "learning_rate": 9.729086208503174e-05, "loss": 0.4889, "step": 1200 }, { "epoch": 0.5472636815920398, "grad_norm": 0.4176950454711914, "learning_rate": 9.722334814445809e-05, "loss": 0.5041, "step": 1210 }, { "epoch": 0.5517865219357757, "grad_norm": 0.11009762436151505, "learning_rate": 9.715502728715826e-05, "loss": 0.4917, "step": 1220 }, { "epoch": 0.5563093622795116, "grad_norm": 0.652611255645752, "learning_rate": 9.708590068054728e-05, "loss": 0.4923, "step": 1230 }, { "epoch": 0.5608322026232474, "grad_norm": 0.16778019070625305, "learning_rate": 9.701596950580806e-05, "loss": 0.5064, "step": 1240 }, { "epoch": 0.5653550429669832, "grad_norm": 0.3204459547996521, "learning_rate": 9.694523495787149e-05, "loss": 0.4911, "step": 1250 }, { "epoch": 0.5698778833107191, "grad_norm": 0.4674127399921417, "learning_rate": 9.687369824539577e-05, "loss": 0.4971, "step": 1260 }, { "epoch": 0.574400723654455, "grad_norm": 0.12897251546382904, "learning_rate": 9.680136059074598e-05, "loss": 0.4798, "step": 1270 }, { "epoch": 0.5789235639981909, "grad_norm": 0.15009181201457977, "learning_rate": 9.672822322997305e-05, "loss": 0.4895, "step": 1280 }, { "epoch": 0.5834464043419267, "grad_norm": 0.304299920797348, "learning_rate": 9.665428741279266e-05, "loss": 0.498, "step": 1290 }, { "epoch": 0.5879692446856626, "grad_norm": 0.40142571926116943, "learning_rate": 9.657955440256395e-05, "loss": 0.4799, "step": 1300 }, { "epoch": 0.5924920850293984, "grad_norm": 0.17843011021614075, "learning_rate": 9.650402547626786e-05, "loss": 0.4848, "step": 1310 }, { "epoch": 0.5970149253731343, "grad_norm": 0.579076886177063, "learning_rate": 9.642770192448536e-05, "loss": 0.489, "step": 1320 }, { "epoch": 0.6015377657168702, "grad_norm": 0.4647994637489319, "learning_rate": 9.635058505137536e-05, "loss": 0.4964, "step": 1330 }, { "epoch": 0.6060606060606061, "grad_norm": 0.14807263016700745, "learning_rate": 9.627267617465243e-05, "loss": 0.492, "step": 1340 }, { "epoch": 0.6105834464043419, "grad_norm": 0.14985252916812897, "learning_rate": 9.619397662556435e-05, "loss": 0.498, "step": 1350 }, { "epoch": 0.6151062867480778, "grad_norm": 0.16624276340007782, "learning_rate": 9.611448774886924e-05, "loss": 0.4846, "step": 1360 }, { "epoch": 0.6196291270918136, "grad_norm": 0.20036545395851135, "learning_rate": 9.60342109028127e-05, "loss": 0.4919, "step": 1370 }, { "epoch": 0.6241519674355496, "grad_norm": 0.2734001576900482, "learning_rate": 9.595314745910456e-05, "loss": 0.4885, "step": 1380 }, { "epoch": 0.6286748077792854, "grad_norm": 0.12794898450374603, "learning_rate": 9.587129880289538e-05, "loss": 0.5028, "step": 1390 }, { "epoch": 0.6331976481230213, "grad_norm": 0.40597257018089294, "learning_rate": 9.578866633275288e-05, "loss": 0.5009, "step": 1400 }, { "epoch": 0.6377204884667571, "grad_norm": 0.34428995847702026, "learning_rate": 9.570525146063798e-05, "loss": 0.4905, "step": 1410 }, { "epoch": 0.642243328810493, "grad_norm": 0.33436307311058044, "learning_rate": 9.562105561188069e-05, "loss": 0.4891, "step": 1420 }, { "epoch": 0.6467661691542289, "grad_norm": 0.375051885843277, "learning_rate": 9.553608022515577e-05, "loss": 0.5031, "step": 1430 }, { "epoch": 0.6512890094979648, "grad_norm": 0.1948522925376892, "learning_rate": 9.545032675245813e-05, "loss": 0.4836, "step": 1440 }, { "epoch": 0.6558118498417006, "grad_norm": 0.08413676917552948, "learning_rate": 9.5363796659078e-05, "loss": 0.4831, "step": 1450 }, { "epoch": 0.6603346901854364, "grad_norm": 0.289809912443161, "learning_rate": 9.527649142357596e-05, "loss": 0.4851, "step": 1460 }, { "epoch": 0.6648575305291723, "grad_norm": 0.14166517555713654, "learning_rate": 9.518841253775755e-05, "loss": 0.4786, "step": 1470 }, { "epoch": 0.6693803708729081, "grad_norm": 0.13443373143672943, "learning_rate": 9.509956150664796e-05, "loss": 0.4929, "step": 1480 }, { "epoch": 0.6739032112166441, "grad_norm": 0.1985711306333542, "learning_rate": 9.500993984846614e-05, "loss": 0.481, "step": 1490 }, { "epoch": 0.6784260515603799, "grad_norm": 0.3250739276409149, "learning_rate": 9.491954909459895e-05, "loss": 0.4872, "step": 1500 }, { "epoch": 0.6829488919041158, "grad_norm": 0.32657676935195923, "learning_rate": 9.4828390789575e-05, "loss": 0.49, "step": 1510 }, { "epoch": 0.6874717322478516, "grad_norm": 0.22073891758918762, "learning_rate": 9.473646649103818e-05, "loss": 0.5, "step": 1520 }, { "epoch": 0.6919945725915875, "grad_norm": 0.22980472445487976, "learning_rate": 9.464377776972114e-05, "loss": 0.4867, "step": 1530 }, { "epoch": 0.6965174129353234, "grad_norm": 0.1441211700439453, "learning_rate": 9.45503262094184e-05, "loss": 0.4779, "step": 1540 }, { "epoch": 0.7010402532790593, "grad_norm": 0.19868245720863342, "learning_rate": 9.445611340695926e-05, "loss": 0.4917, "step": 1550 }, { "epoch": 0.7055630936227951, "grad_norm": 0.2807522714138031, "learning_rate": 9.43611409721806e-05, "loss": 0.4955, "step": 1560 }, { "epoch": 0.710085933966531, "grad_norm": 0.4753507375717163, "learning_rate": 9.426541052789925e-05, "loss": 0.4884, "step": 1570 }, { "epoch": 0.7146087743102668, "grad_norm": 0.16993290185928345, "learning_rate": 9.416892370988444e-05, "loss": 0.4816, "step": 1580 }, { "epoch": 0.7191316146540027, "grad_norm": 0.31910213828086853, "learning_rate": 9.407168216682962e-05, "loss": 0.491, "step": 1590 }, { "epoch": 0.7236544549977386, "grad_norm": 0.14142963290214539, "learning_rate": 9.397368756032445e-05, "loss": 0.4923, "step": 1600 }, { "epoch": 0.7236544549977386, "eval_loss": 0.5151379704475403, "eval_runtime": 104.895, "eval_samples_per_second": 21.021, "eval_steps_per_second": 5.262, "step": 1600 }, { "epoch": 0.7281772953414745, "grad_norm": 0.26469337940216064, "learning_rate": 9.387494156482643e-05, "loss": 0.4913, "step": 1610 }, { "epoch": 0.7327001356852103, "grad_norm": 0.261874794960022, "learning_rate": 9.377544586763215e-05, "loss": 0.4778, "step": 1620 }, { "epoch": 0.7372229760289462, "grad_norm": 0.24810470640659332, "learning_rate": 9.367520216884856e-05, "loss": 0.4855, "step": 1630 }, { "epoch": 0.741745816372682, "grad_norm": 0.1436118185520172, "learning_rate": 9.357421218136386e-05, "loss": 0.4847, "step": 1640 }, { "epoch": 0.746268656716418, "grad_norm": 0.16066552698612213, "learning_rate": 9.347247763081835e-05, "loss": 0.4872, "step": 1650 }, { "epoch": 0.7507914970601538, "grad_norm": 0.4255355894565582, "learning_rate": 9.337000025557476e-05, "loss": 0.4928, "step": 1660 }, { "epoch": 0.7553143374038896, "grad_norm": 0.1905021220445633, "learning_rate": 9.326678180668871e-05, "loss": 0.4904, "step": 1670 }, { "epoch": 0.7598371777476255, "grad_norm": 0.2327072024345398, "learning_rate": 9.316282404787871e-05, "loss": 0.4888, "step": 1680 }, { "epoch": 0.7643600180913613, "grad_norm": 0.20930258929729462, "learning_rate": 9.305812875549599e-05, "loss": 0.4858, "step": 1690 }, { "epoch": 0.7688828584350973, "grad_norm": 0.1341727375984192, "learning_rate": 9.295269771849427e-05, "loss": 0.4793, "step": 1700 }, { "epoch": 0.7734056987788331, "grad_norm": 0.17430901527404785, "learning_rate": 9.284653273839905e-05, "loss": 0.4969, "step": 1710 }, { "epoch": 0.777928539122569, "grad_norm": 0.5216471552848816, "learning_rate": 9.273963562927695e-05, "loss": 0.491, "step": 1720 }, { "epoch": 0.7824513794663048, "grad_norm": 0.19990628957748413, "learning_rate": 9.263200821770461e-05, "loss": 0.4651, "step": 1730 }, { "epoch": 0.7869742198100407, "grad_norm": 0.5802574753761292, "learning_rate": 9.252365234273755e-05, "loss": 0.4775, "step": 1740 }, { "epoch": 0.7914970601537765, "grad_norm": 0.18451005220413208, "learning_rate": 9.241456985587868e-05, "loss": 0.4823, "step": 1750 }, { "epoch": 0.7960199004975125, "grad_norm": 0.3577069640159607, "learning_rate": 9.230476262104677e-05, "loss": 0.4772, "step": 1760 }, { "epoch": 0.8005427408412483, "grad_norm": 0.13898539543151855, "learning_rate": 9.219423251454448e-05, "loss": 0.4746, "step": 1770 }, { "epoch": 0.8050655811849842, "grad_norm": 0.23426567018032074, "learning_rate": 9.208298142502636e-05, "loss": 0.4865, "step": 1780 }, { "epoch": 0.80958842152872, "grad_norm": 0.33651697635650635, "learning_rate": 9.197101125346657e-05, "loss": 0.5107, "step": 1790 }, { "epoch": 0.8141112618724559, "grad_norm": 0.41530391573905945, "learning_rate": 9.185832391312644e-05, "loss": 0.4836, "step": 1800 }, { "epoch": 0.8186341022161918, "grad_norm": 0.4000149965286255, "learning_rate": 9.174492132952166e-05, "loss": 0.4799, "step": 1810 }, { "epoch": 0.8231569425599277, "grad_norm": 0.2623349726200104, "learning_rate": 9.163080544038952e-05, "loss": 0.4738, "step": 1820 }, { "epoch": 0.8276797829036635, "grad_norm": 0.2306557595729828, "learning_rate": 9.151597819565571e-05, "loss": 0.4833, "step": 1830 }, { "epoch": 0.8322026232473994, "grad_norm": 0.11869548261165619, "learning_rate": 9.140044155740101e-05, "loss": 0.4836, "step": 1840 }, { "epoch": 0.8367254635911352, "grad_norm": 0.4591820240020752, "learning_rate": 9.12841974998278e-05, "loss": 0.4942, "step": 1850 }, { "epoch": 0.841248303934871, "grad_norm": 0.17287275195121765, "learning_rate": 9.116724800922629e-05, "loss": 0.4734, "step": 1860 }, { "epoch": 0.845771144278607, "grad_norm": 0.15221983194351196, "learning_rate": 9.104959508394061e-05, "loss": 0.4806, "step": 1870 }, { "epoch": 0.8502939846223428, "grad_norm": 0.4994131922721863, "learning_rate": 9.093124073433463e-05, "loss": 0.4837, "step": 1880 }, { "epoch": 0.8548168249660787, "grad_norm": 0.47702082991600037, "learning_rate": 9.081218698275763e-05, "loss": 0.4836, "step": 1890 }, { "epoch": 0.8593396653098145, "grad_norm": 0.18302541971206665, "learning_rate": 9.069243586350975e-05, "loss": 0.4837, "step": 1900 }, { "epoch": 0.8638625056535504, "grad_norm": 0.35281088948249817, "learning_rate": 9.057198942280722e-05, "loss": 0.4799, "step": 1910 }, { "epoch": 0.8683853459972863, "grad_norm": 0.30625414848327637, "learning_rate": 9.045084971874738e-05, "loss": 0.4743, "step": 1920 }, { "epoch": 0.8729081863410222, "grad_norm": 0.4937283396720886, "learning_rate": 9.032901882127354e-05, "loss": 0.5007, "step": 1930 }, { "epoch": 0.877431026684758, "grad_norm": 0.25247976183891296, "learning_rate": 9.020649881213958e-05, "loss": 0.4792, "step": 1940 }, { "epoch": 0.8819538670284939, "grad_norm": 0.43513771891593933, "learning_rate": 9.008329178487442e-05, "loss": 0.486, "step": 1950 }, { "epoch": 0.8864767073722297, "grad_norm": 0.4003300368785858, "learning_rate": 8.995939984474624e-05, "loss": 0.4871, "step": 1960 }, { "epoch": 0.8909995477159657, "grad_norm": 0.14090712368488312, "learning_rate": 8.983482510872645e-05, "loss": 0.4849, "step": 1970 }, { "epoch": 0.8955223880597015, "grad_norm": 0.5139490962028503, "learning_rate": 8.970956970545355e-05, "loss": 0.4954, "step": 1980 }, { "epoch": 0.9000452284034374, "grad_norm": 0.15464723110198975, "learning_rate": 8.958363577519684e-05, "loss": 0.4874, "step": 1990 }, { "epoch": 0.9045680687471732, "grad_norm": 0.6348758339881897, "learning_rate": 8.945702546981969e-05, "loss": 0.4893, "step": 2000 }, { "epoch": 0.9090909090909091, "grad_norm": 0.34100142121315, "learning_rate": 8.932974095274289e-05, "loss": 0.4942, "step": 2010 }, { "epoch": 0.9136137494346449, "grad_norm": 0.1424902230501175, "learning_rate": 8.920178439890765e-05, "loss": 0.4801, "step": 2020 }, { "epoch": 0.9181365897783809, "grad_norm": 0.07340684533119202, "learning_rate": 8.907315799473846e-05, "loss": 0.4734, "step": 2030 }, { "epoch": 0.9226594301221167, "grad_norm": 0.8894658088684082, "learning_rate": 8.894386393810563e-05, "loss": 0.4813, "step": 2040 }, { "epoch": 0.9271822704658526, "grad_norm": 3.0061469078063965, "learning_rate": 8.881390443828787e-05, "loss": 0.4722, "step": 2050 }, { "epoch": 0.9317051108095884, "grad_norm": 0.435508131980896, "learning_rate": 8.868328171593448e-05, "loss": 0.4856, "step": 2060 }, { "epoch": 0.9362279511533242, "grad_norm": 0.30460214614868164, "learning_rate": 8.855199800302736e-05, "loss": 0.473, "step": 2070 }, { "epoch": 0.9407507914970602, "grad_norm": 0.48862236738204956, "learning_rate": 8.842005554284296e-05, "loss": 0.4914, "step": 2080 }, { "epoch": 0.945273631840796, "grad_norm": 0.16484391689300537, "learning_rate": 8.828745658991386e-05, "loss": 0.4872, "step": 2090 }, { "epoch": 0.9497964721845319, "grad_norm": 0.19850239157676697, "learning_rate": 8.815420340999033e-05, "loss": 0.4978, "step": 2100 }, { "epoch": 0.9543193125282677, "grad_norm": 0.15624162554740906, "learning_rate": 8.802029828000156e-05, "loss": 0.4904, "step": 2110 }, { "epoch": 0.9588421528720036, "grad_norm": 0.20471015572547913, "learning_rate": 8.788574348801675e-05, "loss": 0.4818, "step": 2120 }, { "epoch": 0.9633649932157394, "grad_norm": 0.879814088344574, "learning_rate": 8.775054133320604e-05, "loss": 0.4832, "step": 2130 }, { "epoch": 0.9678878335594754, "grad_norm": 0.14262792468070984, "learning_rate": 8.761469412580125e-05, "loss": 0.4893, "step": 2140 }, { "epoch": 0.9724106739032112, "grad_norm": 0.43501853942871094, "learning_rate": 8.74782041870563e-05, "loss": 0.4751, "step": 2150 }, { "epoch": 0.9769335142469471, "grad_norm": 0.10217157751321793, "learning_rate": 8.73410738492077e-05, "loss": 0.4863, "step": 2160 }, { "epoch": 0.9814563545906829, "grad_norm": 0.550026535987854, "learning_rate": 8.720330545543453e-05, "loss": 0.4832, "step": 2170 }, { "epoch": 0.9859791949344188, "grad_norm": 0.283246785402298, "learning_rate": 8.706490135981855e-05, "loss": 0.4895, "step": 2180 }, { "epoch": 0.9905020352781547, "grad_norm": 0.21039070188999176, "learning_rate": 8.692586392730387e-05, "loss": 0.494, "step": 2190 }, { "epoch": 0.9950248756218906, "grad_norm": 0.7897614240646362, "learning_rate": 8.678619553365659e-05, "loss": 0.4871, "step": 2200 }, { "epoch": 0.9995477159656264, "grad_norm": 0.3698125183582306, "learning_rate": 8.66458985654242e-05, "loss": 0.4764, "step": 2210 }, { "epoch": 1.0036182722749887, "grad_norm": 0.529675304889679, "learning_rate": 8.650497541989482e-05, "loss": 0.4834, "step": 2220 }, { "epoch": 1.0081411126187245, "grad_norm": 0.10685670375823975, "learning_rate": 8.636342850505616e-05, "loss": 0.4864, "step": 2230 }, { "epoch": 1.0126639529624604, "grad_norm": 0.2046063244342804, "learning_rate": 8.622126023955446e-05, "loss": 0.4796, "step": 2240 }, { "epoch": 1.0171867933061962, "grad_norm": 0.2893441319465637, "learning_rate": 8.60784730526531e-05, "loss": 0.4882, "step": 2250 }, { "epoch": 1.021709633649932, "grad_norm": 0.12210704386234283, "learning_rate": 8.59350693841912e-05, "loss": 0.4805, "step": 2260 }, { "epoch": 1.0262324739936681, "grad_norm": 4.210540294647217, "learning_rate": 8.579105168454173e-05, "loss": 0.4812, "step": 2270 }, { "epoch": 1.030755314337404, "grad_norm": 0.3887004256248474, "learning_rate": 8.564642241456986e-05, "loss": 0.4804, "step": 2280 }, { "epoch": 1.0352781546811398, "grad_norm": 0.4718762934207916, "learning_rate": 8.550118404559075e-05, "loss": 0.489, "step": 2290 }, { "epoch": 1.0398009950248757, "grad_norm": 0.26404207944869995, "learning_rate": 8.535533905932738e-05, "loss": 0.4829, "step": 2300 }, { "epoch": 1.0443238353686115, "grad_norm": 0.6389791965484619, "learning_rate": 8.52088899478682e-05, "loss": 0.4806, "step": 2310 }, { "epoch": 1.0488466757123474, "grad_norm": 1.5533722639083862, "learning_rate": 8.506183921362443e-05, "loss": 0.4878, "step": 2320 }, { "epoch": 1.0533695160560832, "grad_norm": 0.30983996391296387, "learning_rate": 8.491418936928742e-05, "loss": 0.4808, "step": 2330 }, { "epoch": 1.057892356399819, "grad_norm": 0.16938813030719757, "learning_rate": 8.476594293778561e-05, "loss": 0.4863, "step": 2340 }, { "epoch": 1.062415196743555, "grad_norm": 1.2061636447906494, "learning_rate": 8.461710245224148e-05, "loss": 0.4806, "step": 2350 }, { "epoch": 1.0669380370872907, "grad_norm": 0.0852380022406578, "learning_rate": 8.44676704559283e-05, "loss": 0.4764, "step": 2360 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.09484398947074e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }