{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4023, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007457121551081283, "grad_norm": 1.4923322722027186, "learning_rate": 5e-06, "loss": 0.6314, "step": 10 }, { "epoch": 0.014914243102162566, "grad_norm": 1.4277498493155112, "learning_rate": 5e-06, "loss": 0.5957, "step": 20 }, { "epoch": 0.02237136465324385, "grad_norm": 1.3160638879347244, "learning_rate": 5e-06, "loss": 0.5858, "step": 30 }, { "epoch": 0.02982848620432513, "grad_norm": 1.3286844636967108, "learning_rate": 5e-06, "loss": 0.6083, "step": 40 }, { "epoch": 0.037285607755406416, "grad_norm": 1.3457945081617781, "learning_rate": 5e-06, "loss": 0.5833, "step": 50 }, { "epoch": 0.0447427293064877, "grad_norm": 1.1775640475079898, "learning_rate": 5e-06, "loss": 0.5615, "step": 60 }, { "epoch": 0.05219985085756898, "grad_norm": 0.7851058774790824, "learning_rate": 5e-06, "loss": 0.5599, "step": 70 }, { "epoch": 0.05965697240865026, "grad_norm": 0.6748980341595908, "learning_rate": 5e-06, "loss": 0.5738, "step": 80 }, { "epoch": 0.06711409395973154, "grad_norm": 0.6098300182337207, "learning_rate": 5e-06, "loss": 0.5513, "step": 90 }, { "epoch": 0.07457121551081283, "grad_norm": 0.6353294403272493, "learning_rate": 5e-06, "loss": 0.5723, "step": 100 }, { "epoch": 0.08202833706189411, "grad_norm": 0.5674389209790128, "learning_rate": 5e-06, "loss": 0.5666, "step": 110 }, { "epoch": 0.0894854586129754, "grad_norm": 0.6279447283125179, "learning_rate": 5e-06, "loss": 0.5475, "step": 120 }, { "epoch": 0.09694258016405667, "grad_norm": 0.576962331333649, "learning_rate": 5e-06, "loss": 0.5532, "step": 130 }, { "epoch": 0.10439970171513796, "grad_norm": 0.5745640803688579, "learning_rate": 5e-06, "loss": 0.5561, "step": 140 }, { "epoch": 0.11185682326621924, "grad_norm": 0.568035600435591, "learning_rate": 5e-06, "loss": 0.5542, "step": 150 }, { "epoch": 0.11931394481730052, "grad_norm": 0.5903955926152153, "learning_rate": 5e-06, "loss": 0.55, "step": 160 }, { "epoch": 0.1267710663683818, "grad_norm": 0.6272192755787677, "learning_rate": 5e-06, "loss": 0.5577, "step": 170 }, { "epoch": 0.1342281879194631, "grad_norm": 0.621732319302288, "learning_rate": 5e-06, "loss": 0.5579, "step": 180 }, { "epoch": 0.14168530947054436, "grad_norm": 0.582261312726868, "learning_rate": 5e-06, "loss": 0.5691, "step": 190 }, { "epoch": 0.14914243102162567, "grad_norm": 0.6038562541450465, "learning_rate": 5e-06, "loss": 0.5441, "step": 200 }, { "epoch": 0.15659955257270694, "grad_norm": 0.6342330659293374, "learning_rate": 5e-06, "loss": 0.5438, "step": 210 }, { "epoch": 0.16405667412378822, "grad_norm": 0.5605346796698036, "learning_rate": 5e-06, "loss": 0.5552, "step": 220 }, { "epoch": 0.1715137956748695, "grad_norm": 0.5998445612806728, "learning_rate": 5e-06, "loss": 0.57, "step": 230 }, { "epoch": 0.1789709172259508, "grad_norm": 0.5714415956910529, "learning_rate": 5e-06, "loss": 0.5588, "step": 240 }, { "epoch": 0.18642803877703207, "grad_norm": 0.6368975283895788, "learning_rate": 5e-06, "loss": 0.5435, "step": 250 }, { "epoch": 0.19388516032811334, "grad_norm": 0.5912645424796484, "learning_rate": 5e-06, "loss": 0.5371, "step": 260 }, { "epoch": 0.20134228187919462, "grad_norm": 0.5671791863380606, "learning_rate": 5e-06, "loss": 0.554, "step": 270 }, { "epoch": 0.20879940343027592, "grad_norm": 0.5825115142497552, "learning_rate": 5e-06, "loss": 0.5648, "step": 280 }, { "epoch": 0.2162565249813572, "grad_norm": 0.6148559353437262, "learning_rate": 5e-06, "loss": 0.5626, "step": 290 }, { "epoch": 0.22371364653243847, "grad_norm": 0.5992171256664753, "learning_rate": 5e-06, "loss": 0.5495, "step": 300 }, { "epoch": 0.23117076808351977, "grad_norm": 0.5756923092743548, "learning_rate": 5e-06, "loss": 0.5492, "step": 310 }, { "epoch": 0.23862788963460105, "grad_norm": 0.5571239833558982, "learning_rate": 5e-06, "loss": 0.5382, "step": 320 }, { "epoch": 0.24608501118568232, "grad_norm": 0.6142758618521919, "learning_rate": 5e-06, "loss": 0.5626, "step": 330 }, { "epoch": 0.2535421327367636, "grad_norm": 0.6071098554908272, "learning_rate": 5e-06, "loss": 0.5537, "step": 340 }, { "epoch": 0.2609992542878449, "grad_norm": 0.5503873105742949, "learning_rate": 5e-06, "loss": 0.5577, "step": 350 }, { "epoch": 0.2684563758389262, "grad_norm": 0.5837181817071831, "learning_rate": 5e-06, "loss": 0.5514, "step": 360 }, { "epoch": 0.2759134973900075, "grad_norm": 0.6207096163948669, "learning_rate": 5e-06, "loss": 0.5415, "step": 370 }, { "epoch": 0.2833706189410887, "grad_norm": 0.5430609166419872, "learning_rate": 5e-06, "loss": 0.5641, "step": 380 }, { "epoch": 0.29082774049217003, "grad_norm": 0.6540545927611947, "learning_rate": 5e-06, "loss": 0.5481, "step": 390 }, { "epoch": 0.29828486204325133, "grad_norm": 0.5461956945004067, "learning_rate": 5e-06, "loss": 0.5513, "step": 400 }, { "epoch": 0.3057419835943326, "grad_norm": 0.6695239493079291, "learning_rate": 5e-06, "loss": 0.5638, "step": 410 }, { "epoch": 0.3131991051454139, "grad_norm": 0.586611622356162, "learning_rate": 5e-06, "loss": 0.5492, "step": 420 }, { "epoch": 0.32065622669649513, "grad_norm": 0.610109513110288, "learning_rate": 5e-06, "loss": 0.5607, "step": 430 }, { "epoch": 0.32811334824757643, "grad_norm": 0.6127769350008854, "learning_rate": 5e-06, "loss": 0.5645, "step": 440 }, { "epoch": 0.33557046979865773, "grad_norm": 0.6680654140600404, "learning_rate": 5e-06, "loss": 0.5549, "step": 450 }, { "epoch": 0.343027591349739, "grad_norm": 0.5826310904520773, "learning_rate": 5e-06, "loss": 0.5548, "step": 460 }, { "epoch": 0.3504847129008203, "grad_norm": 0.6239330754324169, "learning_rate": 5e-06, "loss": 0.5583, "step": 470 }, { "epoch": 0.3579418344519016, "grad_norm": 0.5877553356456545, "learning_rate": 5e-06, "loss": 0.545, "step": 480 }, { "epoch": 0.36539895600298283, "grad_norm": 0.5829722537190941, "learning_rate": 5e-06, "loss": 0.5395, "step": 490 }, { "epoch": 0.37285607755406414, "grad_norm": 0.6621755183448562, "learning_rate": 5e-06, "loss": 0.5456, "step": 500 }, { "epoch": 0.38031319910514544, "grad_norm": 0.5591184552346828, "learning_rate": 5e-06, "loss": 0.5367, "step": 510 }, { "epoch": 0.3877703206562267, "grad_norm": 0.565634611705387, "learning_rate": 5e-06, "loss": 0.5716, "step": 520 }, { "epoch": 0.395227442207308, "grad_norm": 0.5782734602622104, "learning_rate": 5e-06, "loss": 0.5545, "step": 530 }, { "epoch": 0.40268456375838924, "grad_norm": 0.5695093375447697, "learning_rate": 5e-06, "loss": 0.5646, "step": 540 }, { "epoch": 0.41014168530947054, "grad_norm": 0.6396895709952651, "learning_rate": 5e-06, "loss": 0.559, "step": 550 }, { "epoch": 0.41759880686055184, "grad_norm": 0.5626775345590961, "learning_rate": 5e-06, "loss": 0.5398, "step": 560 }, { "epoch": 0.4250559284116331, "grad_norm": 0.6561792759757238, "learning_rate": 5e-06, "loss": 0.5506, "step": 570 }, { "epoch": 0.4325130499627144, "grad_norm": 0.6503627380598381, "learning_rate": 5e-06, "loss": 0.5624, "step": 580 }, { "epoch": 0.4399701715137957, "grad_norm": 0.5712768208534026, "learning_rate": 5e-06, "loss": 0.5493, "step": 590 }, { "epoch": 0.44742729306487694, "grad_norm": 0.632567133823459, "learning_rate": 5e-06, "loss": 0.5495, "step": 600 }, { "epoch": 0.45488441461595824, "grad_norm": 0.5891856850275913, "learning_rate": 5e-06, "loss": 0.5422, "step": 610 }, { "epoch": 0.46234153616703955, "grad_norm": 0.5745761726721252, "learning_rate": 5e-06, "loss": 0.5575, "step": 620 }, { "epoch": 0.4697986577181208, "grad_norm": 0.6304716115534362, "learning_rate": 5e-06, "loss": 0.552, "step": 630 }, { "epoch": 0.4772557792692021, "grad_norm": 0.5804599760466073, "learning_rate": 5e-06, "loss": 0.5574, "step": 640 }, { "epoch": 0.48471290082028334, "grad_norm": 0.5920559180390651, "learning_rate": 5e-06, "loss": 0.5515, "step": 650 }, { "epoch": 0.49217002237136465, "grad_norm": 0.5791705205755153, "learning_rate": 5e-06, "loss": 0.5574, "step": 660 }, { "epoch": 0.49962714392244595, "grad_norm": 0.6201405707302831, "learning_rate": 5e-06, "loss": 0.5585, "step": 670 }, { "epoch": 0.5070842654735273, "grad_norm": 0.5753619878991011, "learning_rate": 5e-06, "loss": 0.556, "step": 680 }, { "epoch": 0.5145413870246085, "grad_norm": 0.5594987960581536, "learning_rate": 5e-06, "loss": 0.5437, "step": 690 }, { "epoch": 0.5219985085756897, "grad_norm": 0.5703994092235947, "learning_rate": 5e-06, "loss": 0.5568, "step": 700 }, { "epoch": 0.5294556301267711, "grad_norm": 0.5834034437186236, "learning_rate": 5e-06, "loss": 0.5431, "step": 710 }, { "epoch": 0.5369127516778524, "grad_norm": 0.560106669559214, "learning_rate": 5e-06, "loss": 0.5552, "step": 720 }, { "epoch": 0.5443698732289336, "grad_norm": 0.5679786278378056, "learning_rate": 5e-06, "loss": 0.554, "step": 730 }, { "epoch": 0.551826994780015, "grad_norm": 0.6429988192429542, "learning_rate": 5e-06, "loss": 0.5495, "step": 740 }, { "epoch": 0.5592841163310962, "grad_norm": 0.6267938520430484, "learning_rate": 5e-06, "loss": 0.5561, "step": 750 }, { "epoch": 0.5667412378821775, "grad_norm": 0.6138480606756376, "learning_rate": 5e-06, "loss": 0.5532, "step": 760 }, { "epoch": 0.5741983594332588, "grad_norm": 0.6398337341244397, "learning_rate": 5e-06, "loss": 0.5567, "step": 770 }, { "epoch": 0.5816554809843401, "grad_norm": 0.6201612138500541, "learning_rate": 5e-06, "loss": 0.5411, "step": 780 }, { "epoch": 0.5891126025354213, "grad_norm": 0.5777153613729241, "learning_rate": 5e-06, "loss": 0.5598, "step": 790 }, { "epoch": 0.5965697240865027, "grad_norm": 0.5831925884746647, "learning_rate": 5e-06, "loss": 0.5505, "step": 800 }, { "epoch": 0.6040268456375839, "grad_norm": 0.5811154005536924, "learning_rate": 5e-06, "loss": 0.5301, "step": 810 }, { "epoch": 0.6114839671886652, "grad_norm": 0.5578916945797876, "learning_rate": 5e-06, "loss": 0.5616, "step": 820 }, { "epoch": 0.6189410887397464, "grad_norm": 0.6251996052710133, "learning_rate": 5e-06, "loss": 0.5444, "step": 830 }, { "epoch": 0.6263982102908278, "grad_norm": 0.5440172219846166, "learning_rate": 5e-06, "loss": 0.5596, "step": 840 }, { "epoch": 0.633855331841909, "grad_norm": 0.5904453872308907, "learning_rate": 5e-06, "loss": 0.5469, "step": 850 }, { "epoch": 0.6413124533929903, "grad_norm": 0.5659960741097526, "learning_rate": 5e-06, "loss": 0.562, "step": 860 }, { "epoch": 0.6487695749440716, "grad_norm": 0.5836414884442678, "learning_rate": 5e-06, "loss": 0.5388, "step": 870 }, { "epoch": 0.6562266964951529, "grad_norm": 0.6185085505610105, "learning_rate": 5e-06, "loss": 0.5406, "step": 880 }, { "epoch": 0.6636838180462341, "grad_norm": 0.5986837831143031, "learning_rate": 5e-06, "loss": 0.5549, "step": 890 }, { "epoch": 0.6711409395973155, "grad_norm": 0.5921990630549645, "learning_rate": 5e-06, "loss": 0.5291, "step": 900 }, { "epoch": 0.6785980611483967, "grad_norm": 0.5802544521725337, "learning_rate": 5e-06, "loss": 0.5523, "step": 910 }, { "epoch": 0.686055182699478, "grad_norm": 0.5851196561131752, "learning_rate": 5e-06, "loss": 0.5448, "step": 920 }, { "epoch": 0.6935123042505593, "grad_norm": 0.583877657890954, "learning_rate": 5e-06, "loss": 0.5415, "step": 930 }, { "epoch": 0.7009694258016406, "grad_norm": 0.5598625156605288, "learning_rate": 5e-06, "loss": 0.5712, "step": 940 }, { "epoch": 0.7084265473527218, "grad_norm": 0.6934347616736215, "learning_rate": 5e-06, "loss": 0.5519, "step": 950 }, { "epoch": 0.7158836689038032, "grad_norm": 0.5977390121445122, "learning_rate": 5e-06, "loss": 0.5551, "step": 960 }, { "epoch": 0.7233407904548844, "grad_norm": 0.5908530451006745, "learning_rate": 5e-06, "loss": 0.5561, "step": 970 }, { "epoch": 0.7307979120059657, "grad_norm": 0.5657057055912034, "learning_rate": 5e-06, "loss": 0.553, "step": 980 }, { "epoch": 0.738255033557047, "grad_norm": 0.5866240008585512, "learning_rate": 5e-06, "loss": 0.5403, "step": 990 }, { "epoch": 0.7457121551081283, "grad_norm": 0.5925600343817397, "learning_rate": 5e-06, "loss": 0.548, "step": 1000 }, { "epoch": 0.7531692766592095, "grad_norm": 0.5905026803607015, "learning_rate": 5e-06, "loss": 0.5466, "step": 1010 }, { "epoch": 0.7606263982102909, "grad_norm": 0.5939700140149752, "learning_rate": 5e-06, "loss": 0.5519, "step": 1020 }, { "epoch": 0.7680835197613721, "grad_norm": 0.6358723434503033, "learning_rate": 5e-06, "loss": 0.5465, "step": 1030 }, { "epoch": 0.7755406413124534, "grad_norm": 0.5893886156719156, "learning_rate": 5e-06, "loss": 0.5478, "step": 1040 }, { "epoch": 0.7829977628635347, "grad_norm": 0.6124289839560092, "learning_rate": 5e-06, "loss": 0.5578, "step": 1050 }, { "epoch": 0.790454884414616, "grad_norm": 0.5876929465940476, "learning_rate": 5e-06, "loss": 0.5486, "step": 1060 }, { "epoch": 0.7979120059656972, "grad_norm": 0.5765792583371309, "learning_rate": 5e-06, "loss": 0.5453, "step": 1070 }, { "epoch": 0.8053691275167785, "grad_norm": 0.5763345735177611, "learning_rate": 5e-06, "loss": 0.54, "step": 1080 }, { "epoch": 0.8128262490678598, "grad_norm": 0.6392240553326028, "learning_rate": 5e-06, "loss": 0.5536, "step": 1090 }, { "epoch": 0.8202833706189411, "grad_norm": 0.5783160074020695, "learning_rate": 5e-06, "loss": 0.5645, "step": 1100 }, { "epoch": 0.8277404921700223, "grad_norm": 0.580827750660942, "learning_rate": 5e-06, "loss": 0.5618, "step": 1110 }, { "epoch": 0.8351976137211037, "grad_norm": 0.5805219027688908, "learning_rate": 5e-06, "loss": 0.5378, "step": 1120 }, { "epoch": 0.8426547352721849, "grad_norm": 0.5631938645784198, "learning_rate": 5e-06, "loss": 0.552, "step": 1130 }, { "epoch": 0.8501118568232662, "grad_norm": 0.5460444743065537, "learning_rate": 5e-06, "loss": 0.5474, "step": 1140 }, { "epoch": 0.8575689783743475, "grad_norm": 0.619229281669363, "learning_rate": 5e-06, "loss": 0.5568, "step": 1150 }, { "epoch": 0.8650260999254288, "grad_norm": 0.5892594363626477, "learning_rate": 5e-06, "loss": 0.5343, "step": 1160 }, { "epoch": 0.87248322147651, "grad_norm": 0.6139047400145734, "learning_rate": 5e-06, "loss": 0.5428, "step": 1170 }, { "epoch": 0.8799403430275914, "grad_norm": 0.5966906346273294, "learning_rate": 5e-06, "loss": 0.5356, "step": 1180 }, { "epoch": 0.8873974645786726, "grad_norm": 0.6535776740165109, "learning_rate": 5e-06, "loss": 0.5499, "step": 1190 }, { "epoch": 0.8948545861297539, "grad_norm": 0.5662037475274219, "learning_rate": 5e-06, "loss": 0.5288, "step": 1200 }, { "epoch": 0.9023117076808352, "grad_norm": 0.5995000278562762, "learning_rate": 5e-06, "loss": 0.5563, "step": 1210 }, { "epoch": 0.9097688292319165, "grad_norm": 0.6346956605590084, "learning_rate": 5e-06, "loss": 0.5392, "step": 1220 }, { "epoch": 0.9172259507829977, "grad_norm": 0.5657793485267905, "learning_rate": 5e-06, "loss": 0.551, "step": 1230 }, { "epoch": 0.9246830723340791, "grad_norm": 0.5681455949615055, "learning_rate": 5e-06, "loss": 0.5603, "step": 1240 }, { "epoch": 0.9321401938851603, "grad_norm": 0.6009029524168309, "learning_rate": 5e-06, "loss": 0.5468, "step": 1250 }, { "epoch": 0.9395973154362416, "grad_norm": 0.6145844854677124, "learning_rate": 5e-06, "loss": 0.5609, "step": 1260 }, { "epoch": 0.947054436987323, "grad_norm": 0.5696932141639989, "learning_rate": 5e-06, "loss": 0.5592, "step": 1270 }, { "epoch": 0.9545115585384042, "grad_norm": 0.5890062683374891, "learning_rate": 5e-06, "loss": 0.5554, "step": 1280 }, { "epoch": 0.9619686800894854, "grad_norm": 0.5938117712349276, "learning_rate": 5e-06, "loss": 0.5646, "step": 1290 }, { "epoch": 0.9694258016405667, "grad_norm": 0.6257865475961188, "learning_rate": 5e-06, "loss": 0.5559, "step": 1300 }, { "epoch": 0.976882923191648, "grad_norm": 0.5860984667723387, "learning_rate": 5e-06, "loss": 0.5486, "step": 1310 }, { "epoch": 0.9843400447427293, "grad_norm": 0.5609191548184437, "learning_rate": 5e-06, "loss": 0.5341, "step": 1320 }, { "epoch": 0.9917971662938105, "grad_norm": 0.6025440403637561, "learning_rate": 5e-06, "loss": 0.5446, "step": 1330 }, { "epoch": 0.9992542878448919, "grad_norm": 0.5572472436768862, "learning_rate": 5e-06, "loss": 0.5404, "step": 1340 }, { "epoch": 1.0, "eval_loss": 0.5463822484016418, "eval_runtime": 339.2131, "eval_samples_per_second": 26.626, "eval_steps_per_second": 0.419, "step": 1341 }, { "epoch": 1.0067114093959733, "grad_norm": 0.6160017220578684, "learning_rate": 5e-06, "loss": 0.4824, "step": 1350 }, { "epoch": 1.0141685309470545, "grad_norm": 0.5591512386853066, "learning_rate": 5e-06, "loss": 0.4797, "step": 1360 }, { "epoch": 1.0216256524981358, "grad_norm": 0.6118294509759679, "learning_rate": 5e-06, "loss": 0.4686, "step": 1370 }, { "epoch": 1.029082774049217, "grad_norm": 0.5583913619404066, "learning_rate": 5e-06, "loss": 0.469, "step": 1380 }, { "epoch": 1.0365398956002982, "grad_norm": 0.5818799856636863, "learning_rate": 5e-06, "loss": 0.4903, "step": 1390 }, { "epoch": 1.0439970171513795, "grad_norm": 0.6171364008097712, "learning_rate": 5e-06, "loss": 0.4956, "step": 1400 }, { "epoch": 1.0514541387024607, "grad_norm": 0.6061235511507647, "learning_rate": 5e-06, "loss": 0.4933, "step": 1410 }, { "epoch": 1.0589112602535422, "grad_norm": 0.6107761071918465, "learning_rate": 5e-06, "loss": 0.486, "step": 1420 }, { "epoch": 1.0663683818046235, "grad_norm": 0.5547904203912964, "learning_rate": 5e-06, "loss": 0.4968, "step": 1430 }, { "epoch": 1.0738255033557047, "grad_norm": 0.5709552455416576, "learning_rate": 5e-06, "loss": 0.4902, "step": 1440 }, { "epoch": 1.081282624906786, "grad_norm": 0.6263774656448823, "learning_rate": 5e-06, "loss": 0.4703, "step": 1450 }, { "epoch": 1.0887397464578672, "grad_norm": 0.5856371980460074, "learning_rate": 5e-06, "loss": 0.4816, "step": 1460 }, { "epoch": 1.0961968680089484, "grad_norm": 0.5914165561619208, "learning_rate": 5e-06, "loss": 0.479, "step": 1470 }, { "epoch": 1.10365398956003, "grad_norm": 0.6094767293533612, "learning_rate": 5e-06, "loss": 0.4834, "step": 1480 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5628759979218878, "learning_rate": 5e-06, "loss": 0.4807, "step": 1490 }, { "epoch": 1.1185682326621924, "grad_norm": 0.5604069640766214, "learning_rate": 5e-06, "loss": 0.4945, "step": 1500 }, { "epoch": 1.1260253542132737, "grad_norm": 0.6056154435588564, "learning_rate": 5e-06, "loss": 0.4756, "step": 1510 }, { "epoch": 1.133482475764355, "grad_norm": 0.5813579925633866, "learning_rate": 5e-06, "loss": 0.4753, "step": 1520 }, { "epoch": 1.1409395973154361, "grad_norm": 0.5387240892913252, "learning_rate": 5e-06, "loss": 0.474, "step": 1530 }, { "epoch": 1.1483967188665176, "grad_norm": 0.5334644195163835, "learning_rate": 5e-06, "loss": 0.4786, "step": 1540 }, { "epoch": 1.1558538404175989, "grad_norm": 0.6302845458131542, "learning_rate": 5e-06, "loss": 0.4658, "step": 1550 }, { "epoch": 1.1633109619686801, "grad_norm": 0.5458667914397936, "learning_rate": 5e-06, "loss": 0.4884, "step": 1560 }, { "epoch": 1.1707680835197614, "grad_norm": 0.5839445403317228, "learning_rate": 5e-06, "loss": 0.4907, "step": 1570 }, { "epoch": 1.1782252050708426, "grad_norm": 0.5603070510925319, "learning_rate": 5e-06, "loss": 0.4872, "step": 1580 }, { "epoch": 1.1856823266219239, "grad_norm": 0.6298096241603139, "learning_rate": 5e-06, "loss": 0.4816, "step": 1590 }, { "epoch": 1.1931394481730053, "grad_norm": 0.6031228261187916, "learning_rate": 5e-06, "loss": 0.4676, "step": 1600 }, { "epoch": 1.2005965697240866, "grad_norm": 0.5890973152880289, "learning_rate": 5e-06, "loss": 0.4789, "step": 1610 }, { "epoch": 1.2080536912751678, "grad_norm": 0.5915397514486846, "learning_rate": 5e-06, "loss": 0.4816, "step": 1620 }, { "epoch": 1.215510812826249, "grad_norm": 0.5786222706794956, "learning_rate": 5e-06, "loss": 0.4852, "step": 1630 }, { "epoch": 1.2229679343773303, "grad_norm": 0.5699021496499598, "learning_rate": 5e-06, "loss": 0.4986, "step": 1640 }, { "epoch": 1.2304250559284116, "grad_norm": 0.5949853159150295, "learning_rate": 5e-06, "loss": 0.4811, "step": 1650 }, { "epoch": 1.2378821774794928, "grad_norm": 0.5673027327267307, "learning_rate": 5e-06, "loss": 0.4886, "step": 1660 }, { "epoch": 1.2453392990305743, "grad_norm": 0.560192582433626, "learning_rate": 5e-06, "loss": 0.4656, "step": 1670 }, { "epoch": 1.2527964205816555, "grad_norm": 0.6108254414303492, "learning_rate": 5e-06, "loss": 0.4715, "step": 1680 }, { "epoch": 1.2602535421327368, "grad_norm": 0.5590093744521378, "learning_rate": 5e-06, "loss": 0.471, "step": 1690 }, { "epoch": 1.267710663683818, "grad_norm": 0.5914565694717694, "learning_rate": 5e-06, "loss": 0.4736, "step": 1700 }, { "epoch": 1.2751677852348993, "grad_norm": 0.5674996558281405, "learning_rate": 5e-06, "loss": 0.4798, "step": 1710 }, { "epoch": 1.2826249067859807, "grad_norm": 0.6498966266280284, "learning_rate": 5e-06, "loss": 0.4792, "step": 1720 }, { "epoch": 1.290082028337062, "grad_norm": 0.5750798444633526, "learning_rate": 5e-06, "loss": 0.4843, "step": 1730 }, { "epoch": 1.2975391498881432, "grad_norm": 0.5825772193243091, "learning_rate": 5e-06, "loss": 0.4869, "step": 1740 }, { "epoch": 1.3049962714392245, "grad_norm": 0.613520592437996, "learning_rate": 5e-06, "loss": 0.4838, "step": 1750 }, { "epoch": 1.3124533929903057, "grad_norm": 0.6147541770677961, "learning_rate": 5e-06, "loss": 0.4746, "step": 1760 }, { "epoch": 1.319910514541387, "grad_norm": 0.6241611435750868, "learning_rate": 5e-06, "loss": 0.4931, "step": 1770 }, { "epoch": 1.3273676360924682, "grad_norm": 0.5926826684472789, "learning_rate": 5e-06, "loss": 0.4871, "step": 1780 }, { "epoch": 1.3348247576435495, "grad_norm": 0.6066670454162487, "learning_rate": 5e-06, "loss": 0.4841, "step": 1790 }, { "epoch": 1.342281879194631, "grad_norm": 0.6069960632718007, "learning_rate": 5e-06, "loss": 0.4862, "step": 1800 }, { "epoch": 1.3497390007457122, "grad_norm": 0.5479395919590173, "learning_rate": 5e-06, "loss": 0.4766, "step": 1810 }, { "epoch": 1.3571961222967934, "grad_norm": 0.5639904817681118, "learning_rate": 5e-06, "loss": 0.4812, "step": 1820 }, { "epoch": 1.3646532438478747, "grad_norm": 0.5858843238376426, "learning_rate": 5e-06, "loss": 0.4686, "step": 1830 }, { "epoch": 1.372110365398956, "grad_norm": 0.5734924915807309, "learning_rate": 5e-06, "loss": 0.477, "step": 1840 }, { "epoch": 1.3795674869500374, "grad_norm": 0.5902217700915133, "learning_rate": 5e-06, "loss": 0.4889, "step": 1850 }, { "epoch": 1.3870246085011186, "grad_norm": 0.5854924372795176, "learning_rate": 5e-06, "loss": 0.4792, "step": 1860 }, { "epoch": 1.3944817300521999, "grad_norm": 0.6350410086083018, "learning_rate": 5e-06, "loss": 0.4897, "step": 1870 }, { "epoch": 1.4019388516032811, "grad_norm": 0.5606930586639369, "learning_rate": 5e-06, "loss": 0.496, "step": 1880 }, { "epoch": 1.4093959731543624, "grad_norm": 0.5586778910160224, "learning_rate": 5e-06, "loss": 0.4671, "step": 1890 }, { "epoch": 1.4168530947054436, "grad_norm": 0.5732737315369484, "learning_rate": 5e-06, "loss": 0.4786, "step": 1900 }, { "epoch": 1.4243102162565249, "grad_norm": 4.027507437517355, "learning_rate": 5e-06, "loss": 0.4886, "step": 1910 }, { "epoch": 1.4317673378076063, "grad_norm": 0.5991590420531947, "learning_rate": 5e-06, "loss": 0.497, "step": 1920 }, { "epoch": 1.4392244593586876, "grad_norm": 0.5828725709335938, "learning_rate": 5e-06, "loss": 0.4899, "step": 1930 }, { "epoch": 1.4466815809097688, "grad_norm": 0.5473354356011987, "learning_rate": 5e-06, "loss": 0.4775, "step": 1940 }, { "epoch": 1.45413870246085, "grad_norm": 0.6210979610287483, "learning_rate": 5e-06, "loss": 0.4907, "step": 1950 }, { "epoch": 1.4615958240119313, "grad_norm": 0.5638329844959816, "learning_rate": 5e-06, "loss": 0.4759, "step": 1960 }, { "epoch": 1.4690529455630128, "grad_norm": 0.5723878428151293, "learning_rate": 5e-06, "loss": 0.4897, "step": 1970 }, { "epoch": 1.476510067114094, "grad_norm": 0.5521756950405313, "learning_rate": 5e-06, "loss": 0.4746, "step": 1980 }, { "epoch": 1.4839671886651753, "grad_norm": 0.6691296670250785, "learning_rate": 5e-06, "loss": 0.4885, "step": 1990 }, { "epoch": 1.4914243102162565, "grad_norm": 0.6141910929035425, "learning_rate": 5e-06, "loss": 0.4884, "step": 2000 }, { "epoch": 1.4988814317673378, "grad_norm": 0.5400373664496432, "learning_rate": 5e-06, "loss": 0.4799, "step": 2010 }, { "epoch": 1.506338553318419, "grad_norm": 0.5645402137800272, "learning_rate": 5e-06, "loss": 0.4832, "step": 2020 }, { "epoch": 1.5137956748695003, "grad_norm": 0.59297682178836, "learning_rate": 5e-06, "loss": 0.4765, "step": 2030 }, { "epoch": 1.5212527964205815, "grad_norm": 0.6099720079302091, "learning_rate": 5e-06, "loss": 0.4876, "step": 2040 }, { "epoch": 1.5287099179716628, "grad_norm": 0.5797412589969116, "learning_rate": 5e-06, "loss": 0.4867, "step": 2050 }, { "epoch": 1.5361670395227442, "grad_norm": 0.6160597774931952, "learning_rate": 5e-06, "loss": 0.4952, "step": 2060 }, { "epoch": 1.5436241610738255, "grad_norm": 0.5775983688665581, "learning_rate": 5e-06, "loss": 0.476, "step": 2070 }, { "epoch": 1.5510812826249067, "grad_norm": 0.5846473827271578, "learning_rate": 5e-06, "loss": 0.4937, "step": 2080 }, { "epoch": 1.5585384041759882, "grad_norm": 0.5702728266559334, "learning_rate": 5e-06, "loss": 0.4974, "step": 2090 }, { "epoch": 1.5659955257270695, "grad_norm": 0.5870580247571568, "learning_rate": 5e-06, "loss": 0.4877, "step": 2100 }, { "epoch": 1.5734526472781507, "grad_norm": 0.5817243215012845, "learning_rate": 5e-06, "loss": 0.4873, "step": 2110 }, { "epoch": 1.580909768829232, "grad_norm": 0.5315491273649968, "learning_rate": 5e-06, "loss": 0.4798, "step": 2120 }, { "epoch": 1.5883668903803132, "grad_norm": 0.6200889922410328, "learning_rate": 5e-06, "loss": 0.4699, "step": 2130 }, { "epoch": 1.5958240119313944, "grad_norm": 0.6029121739064351, "learning_rate": 5e-06, "loss": 0.4995, "step": 2140 }, { "epoch": 1.6032811334824757, "grad_norm": 0.6028582417864272, "learning_rate": 5e-06, "loss": 0.4959, "step": 2150 }, { "epoch": 1.610738255033557, "grad_norm": 0.5893432209254877, "learning_rate": 5e-06, "loss": 0.4783, "step": 2160 }, { "epoch": 1.6181953765846382, "grad_norm": 0.5884082032907385, "learning_rate": 5e-06, "loss": 0.4823, "step": 2170 }, { "epoch": 1.6256524981357197, "grad_norm": 0.5921720057393691, "learning_rate": 5e-06, "loss": 0.4761, "step": 2180 }, { "epoch": 1.633109619686801, "grad_norm": 0.5776696223531537, "learning_rate": 5e-06, "loss": 0.4823, "step": 2190 }, { "epoch": 1.6405667412378822, "grad_norm": 0.5645109392805745, "learning_rate": 5e-06, "loss": 0.4827, "step": 2200 }, { "epoch": 1.6480238627889636, "grad_norm": 0.5532331562154945, "learning_rate": 5e-06, "loss": 0.4878, "step": 2210 }, { "epoch": 1.6554809843400449, "grad_norm": 0.5877417623919632, "learning_rate": 5e-06, "loss": 0.4854, "step": 2220 }, { "epoch": 1.6629381058911261, "grad_norm": 0.5528378617144938, "learning_rate": 5e-06, "loss": 0.4876, "step": 2230 }, { "epoch": 1.6703952274422074, "grad_norm": 0.6104249956677287, "learning_rate": 5e-06, "loss": 0.4976, "step": 2240 }, { "epoch": 1.6778523489932886, "grad_norm": 0.5676314169193764, "learning_rate": 5e-06, "loss": 0.473, "step": 2250 }, { "epoch": 1.6853094705443699, "grad_norm": 0.5696949870525951, "learning_rate": 5e-06, "loss": 0.4866, "step": 2260 }, { "epoch": 1.692766592095451, "grad_norm": 0.5702619592671349, "learning_rate": 5e-06, "loss": 0.4909, "step": 2270 }, { "epoch": 1.7002237136465324, "grad_norm": 0.5973494243465649, "learning_rate": 5e-06, "loss": 0.4914, "step": 2280 }, { "epoch": 1.7076808351976136, "grad_norm": 0.5545197259083948, "learning_rate": 5e-06, "loss": 0.4804, "step": 2290 }, { "epoch": 1.7151379567486948, "grad_norm": 0.5905506685708721, "learning_rate": 5e-06, "loss": 0.4925, "step": 2300 }, { "epoch": 1.7225950782997763, "grad_norm": 0.6320966702911732, "learning_rate": 5e-06, "loss": 0.4913, "step": 2310 }, { "epoch": 1.7300521998508576, "grad_norm": 0.5417972662105043, "learning_rate": 5e-06, "loss": 0.4769, "step": 2320 }, { "epoch": 1.7375093214019388, "grad_norm": 0.597965564481012, "learning_rate": 5e-06, "loss": 0.4898, "step": 2330 }, { "epoch": 1.7449664429530203, "grad_norm": 0.6144634291593705, "learning_rate": 5e-06, "loss": 0.4826, "step": 2340 }, { "epoch": 1.7524235645041015, "grad_norm": 0.5987390856089427, "learning_rate": 5e-06, "loss": 0.4777, "step": 2350 }, { "epoch": 1.7598806860551828, "grad_norm": 0.5693070234792923, "learning_rate": 5e-06, "loss": 0.4934, "step": 2360 }, { "epoch": 1.767337807606264, "grad_norm": 0.5879521460157956, "learning_rate": 5e-06, "loss": 0.5053, "step": 2370 }, { "epoch": 1.7747949291573453, "grad_norm": 0.5680219111185565, "learning_rate": 5e-06, "loss": 0.4817, "step": 2380 }, { "epoch": 1.7822520507084265, "grad_norm": 0.6501364801989891, "learning_rate": 5e-06, "loss": 0.4853, "step": 2390 }, { "epoch": 1.7897091722595078, "grad_norm": 0.615030215815147, "learning_rate": 5e-06, "loss": 0.4791, "step": 2400 }, { "epoch": 1.797166293810589, "grad_norm": 0.5785652647623134, "learning_rate": 5e-06, "loss": 0.4809, "step": 2410 }, { "epoch": 1.8046234153616703, "grad_norm": 0.6044924431994139, "learning_rate": 5e-06, "loss": 0.4929, "step": 2420 }, { "epoch": 1.8120805369127517, "grad_norm": 0.5426508657118521, "learning_rate": 5e-06, "loss": 0.4847, "step": 2430 }, { "epoch": 1.819537658463833, "grad_norm": 0.6009223844637949, "learning_rate": 5e-06, "loss": 0.4906, "step": 2440 }, { "epoch": 1.8269947800149142, "grad_norm": 0.5743711413707431, "learning_rate": 5e-06, "loss": 0.4975, "step": 2450 }, { "epoch": 1.8344519015659957, "grad_norm": 0.5983412684564712, "learning_rate": 5e-06, "loss": 0.4814, "step": 2460 }, { "epoch": 1.841909023117077, "grad_norm": 0.5560773314183736, "learning_rate": 5e-06, "loss": 0.4811, "step": 2470 }, { "epoch": 1.8493661446681582, "grad_norm": 0.5992209963884128, "learning_rate": 5e-06, "loss": 0.496, "step": 2480 }, { "epoch": 1.8568232662192394, "grad_norm": 0.5387716311327034, "learning_rate": 5e-06, "loss": 0.4804, "step": 2490 }, { "epoch": 1.8642803877703207, "grad_norm": 0.5990433932479565, "learning_rate": 5e-06, "loss": 0.4863, "step": 2500 }, { "epoch": 1.871737509321402, "grad_norm": 0.5341574484686165, "learning_rate": 5e-06, "loss": 0.4909, "step": 2510 }, { "epoch": 1.8791946308724832, "grad_norm": 0.5857022850771958, "learning_rate": 5e-06, "loss": 0.4831, "step": 2520 }, { "epoch": 1.8866517524235644, "grad_norm": 0.596806200047546, "learning_rate": 5e-06, "loss": 0.5042, "step": 2530 }, { "epoch": 1.8941088739746457, "grad_norm": 0.5831993654340557, "learning_rate": 5e-06, "loss": 0.5004, "step": 2540 }, { "epoch": 1.901565995525727, "grad_norm": 0.5256203832071005, "learning_rate": 5e-06, "loss": 0.4802, "step": 2550 }, { "epoch": 1.9090231170768084, "grad_norm": 0.6256000165719325, "learning_rate": 5e-06, "loss": 0.49, "step": 2560 }, { "epoch": 1.9164802386278896, "grad_norm": 0.6025652662604613, "learning_rate": 5e-06, "loss": 0.4961, "step": 2570 }, { "epoch": 1.9239373601789709, "grad_norm": 0.5851336978699863, "learning_rate": 5e-06, "loss": 0.493, "step": 2580 }, { "epoch": 1.9313944817300523, "grad_norm": 0.5841404426808496, "learning_rate": 5e-06, "loss": 0.4788, "step": 2590 }, { "epoch": 1.9388516032811336, "grad_norm": 0.5553137000045634, "learning_rate": 5e-06, "loss": 0.4944, "step": 2600 }, { "epoch": 1.9463087248322148, "grad_norm": 0.5954485746159562, "learning_rate": 5e-06, "loss": 0.4924, "step": 2610 }, { "epoch": 1.953765846383296, "grad_norm": 0.573701644653239, "learning_rate": 5e-06, "loss": 0.4909, "step": 2620 }, { "epoch": 1.9612229679343773, "grad_norm": 0.6460568307562583, "learning_rate": 5e-06, "loss": 0.4981, "step": 2630 }, { "epoch": 1.9686800894854586, "grad_norm": 0.5881894639448034, "learning_rate": 5e-06, "loss": 0.498, "step": 2640 }, { "epoch": 1.9761372110365398, "grad_norm": 0.5442840215629223, "learning_rate": 5e-06, "loss": 0.4793, "step": 2650 }, { "epoch": 1.983594332587621, "grad_norm": 0.5772375310283397, "learning_rate": 5e-06, "loss": 0.4877, "step": 2660 }, { "epoch": 1.9910514541387023, "grad_norm": 0.5624955248751579, "learning_rate": 5e-06, "loss": 0.4852, "step": 2670 }, { "epoch": 1.9985085756897838, "grad_norm": 0.5618877188539807, "learning_rate": 5e-06, "loss": 0.4804, "step": 2680 }, { "epoch": 2.0, "eval_loss": 0.5512435436248779, "eval_runtime": 342.0704, "eval_samples_per_second": 26.404, "eval_steps_per_second": 0.415, "step": 2682 }, { "epoch": 2.005965697240865, "grad_norm": 0.6558184793140344, "learning_rate": 5e-06, "loss": 0.4215, "step": 2690 }, { "epoch": 2.0134228187919465, "grad_norm": 0.6034125785194365, "learning_rate": 5e-06, "loss": 0.4033, "step": 2700 }, { "epoch": 2.0208799403430278, "grad_norm": 0.5610165111292696, "learning_rate": 5e-06, "loss": 0.3907, "step": 2710 }, { "epoch": 2.028337061894109, "grad_norm": 0.580517844768164, "learning_rate": 5e-06, "loss": 0.3977, "step": 2720 }, { "epoch": 2.0357941834451903, "grad_norm": 0.5514277294221982, "learning_rate": 5e-06, "loss": 0.4112, "step": 2730 }, { "epoch": 2.0432513049962715, "grad_norm": 0.60029875754166, "learning_rate": 5e-06, "loss": 0.4176, "step": 2740 }, { "epoch": 2.0507084265473527, "grad_norm": 0.6089301446363884, "learning_rate": 5e-06, "loss": 0.4026, "step": 2750 }, { "epoch": 2.058165548098434, "grad_norm": 0.5849276057096263, "learning_rate": 5e-06, "loss": 0.409, "step": 2760 }, { "epoch": 2.0656226696495152, "grad_norm": 0.5950355197672823, "learning_rate": 5e-06, "loss": 0.4024, "step": 2770 }, { "epoch": 2.0730797912005965, "grad_norm": 0.5889941012752544, "learning_rate": 5e-06, "loss": 0.4175, "step": 2780 }, { "epoch": 2.0805369127516777, "grad_norm": 0.5611907053787162, "learning_rate": 5e-06, "loss": 0.403, "step": 2790 }, { "epoch": 2.087994034302759, "grad_norm": 0.5934324272584153, "learning_rate": 5e-06, "loss": 0.413, "step": 2800 }, { "epoch": 2.0954511558538402, "grad_norm": 0.5774168779978719, "learning_rate": 5e-06, "loss": 0.4159, "step": 2810 }, { "epoch": 2.1029082774049215, "grad_norm": 0.564340240629499, "learning_rate": 5e-06, "loss": 0.4107, "step": 2820 }, { "epoch": 2.110365398956003, "grad_norm": 0.6144699399372608, "learning_rate": 5e-06, "loss": 0.415, "step": 2830 }, { "epoch": 2.1178225205070844, "grad_norm": 0.623817632576955, "learning_rate": 5e-06, "loss": 0.415, "step": 2840 }, { "epoch": 2.1252796420581657, "grad_norm": 0.5835224903490361, "learning_rate": 5e-06, "loss": 0.4096, "step": 2850 }, { "epoch": 2.132736763609247, "grad_norm": 0.6186331596636564, "learning_rate": 5e-06, "loss": 0.4015, "step": 2860 }, { "epoch": 2.140193885160328, "grad_norm": 0.5790608361059295, "learning_rate": 5e-06, "loss": 0.4068, "step": 2870 }, { "epoch": 2.1476510067114094, "grad_norm": 0.5869215595357605, "learning_rate": 5e-06, "loss": 0.4215, "step": 2880 }, { "epoch": 2.1551081282624907, "grad_norm": 0.5936702509963696, "learning_rate": 5e-06, "loss": 0.4141, "step": 2890 }, { "epoch": 2.162565249813572, "grad_norm": 0.6381922955234081, "learning_rate": 5e-06, "loss": 0.407, "step": 2900 }, { "epoch": 2.170022371364653, "grad_norm": 0.5862737710378301, "learning_rate": 5e-06, "loss": 0.417, "step": 2910 }, { "epoch": 2.1774794929157344, "grad_norm": 0.5822255490097745, "learning_rate": 5e-06, "loss": 0.4225, "step": 2920 }, { "epoch": 2.1849366144668156, "grad_norm": 0.5764505027268744, "learning_rate": 5e-06, "loss": 0.4138, "step": 2930 }, { "epoch": 2.192393736017897, "grad_norm": 0.6073760733719726, "learning_rate": 5e-06, "loss": 0.4112, "step": 2940 }, { "epoch": 2.1998508575689786, "grad_norm": 0.6091808835643772, "learning_rate": 5e-06, "loss": 0.4208, "step": 2950 }, { "epoch": 2.20730797912006, "grad_norm": 0.6239375451432535, "learning_rate": 5e-06, "loss": 0.4167, "step": 2960 }, { "epoch": 2.214765100671141, "grad_norm": 0.6130674669978408, "learning_rate": 5e-06, "loss": 0.413, "step": 2970 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5668324865537339, "learning_rate": 5e-06, "loss": 0.4126, "step": 2980 }, { "epoch": 2.2296793437733036, "grad_norm": 0.5765769707939168, "learning_rate": 5e-06, "loss": 0.4189, "step": 2990 }, { "epoch": 2.237136465324385, "grad_norm": 0.6216905758163017, "learning_rate": 5e-06, "loss": 0.416, "step": 3000 }, { "epoch": 2.244593586875466, "grad_norm": 0.642406145303932, "learning_rate": 5e-06, "loss": 0.4182, "step": 3010 }, { "epoch": 2.2520507084265473, "grad_norm": 0.6153726361933832, "learning_rate": 5e-06, "loss": 0.4401, "step": 3020 }, { "epoch": 2.2595078299776286, "grad_norm": 0.6162592369179899, "learning_rate": 5e-06, "loss": 0.4093, "step": 3030 }, { "epoch": 2.26696495152871, "grad_norm": 0.6062140595912686, "learning_rate": 5e-06, "loss": 0.4201, "step": 3040 }, { "epoch": 2.274422073079791, "grad_norm": 0.6489911104793248, "learning_rate": 5e-06, "loss": 0.4216, "step": 3050 }, { "epoch": 2.2818791946308723, "grad_norm": 0.5973147232668122, "learning_rate": 5e-06, "loss": 0.4225, "step": 3060 }, { "epoch": 2.289336316181954, "grad_norm": 0.6029950937606031, "learning_rate": 5e-06, "loss": 0.4187, "step": 3070 }, { "epoch": 2.2967934377330352, "grad_norm": 0.6231584364415401, "learning_rate": 5e-06, "loss": 0.4238, "step": 3080 }, { "epoch": 2.3042505592841165, "grad_norm": 0.5561184622741537, "learning_rate": 5e-06, "loss": 0.4219, "step": 3090 }, { "epoch": 2.3117076808351977, "grad_norm": 0.5941537102506256, "learning_rate": 5e-06, "loss": 0.4246, "step": 3100 }, { "epoch": 2.319164802386279, "grad_norm": 0.6410414670886565, "learning_rate": 5e-06, "loss": 0.4071, "step": 3110 }, { "epoch": 2.3266219239373602, "grad_norm": 0.5732228808060483, "learning_rate": 5e-06, "loss": 0.4114, "step": 3120 }, { "epoch": 2.3340790454884415, "grad_norm": 0.5736385479610295, "learning_rate": 5e-06, "loss": 0.4154, "step": 3130 }, { "epoch": 2.3415361670395227, "grad_norm": 0.5563198902731237, "learning_rate": 5e-06, "loss": 0.4342, "step": 3140 }, { "epoch": 2.348993288590604, "grad_norm": 0.6010188015263107, "learning_rate": 5e-06, "loss": 0.4115, "step": 3150 }, { "epoch": 2.356450410141685, "grad_norm": 0.6377162414933872, "learning_rate": 5e-06, "loss": 0.4129, "step": 3160 }, { "epoch": 2.3639075316927665, "grad_norm": 0.5983638229139512, "learning_rate": 5e-06, "loss": 0.4116, "step": 3170 }, { "epoch": 2.3713646532438477, "grad_norm": 0.6406800867031649, "learning_rate": 5e-06, "loss": 0.4257, "step": 3180 }, { "epoch": 2.378821774794929, "grad_norm": 0.6165353698446668, "learning_rate": 5e-06, "loss": 0.4332, "step": 3190 }, { "epoch": 2.3862788963460106, "grad_norm": 0.5950518706856058, "learning_rate": 5e-06, "loss": 0.4166, "step": 3200 }, { "epoch": 2.393736017897092, "grad_norm": 0.5601525861175598, "learning_rate": 5e-06, "loss": 0.4214, "step": 3210 }, { "epoch": 2.401193139448173, "grad_norm": 0.6338595567107324, "learning_rate": 5e-06, "loss": 0.4175, "step": 3220 }, { "epoch": 2.4086502609992544, "grad_norm": 0.613546741805029, "learning_rate": 5e-06, "loss": 0.4205, "step": 3230 }, { "epoch": 2.4161073825503356, "grad_norm": 0.6203881329193865, "learning_rate": 5e-06, "loss": 0.4106, "step": 3240 }, { "epoch": 2.423564504101417, "grad_norm": 0.587446398589364, "learning_rate": 5e-06, "loss": 0.435, "step": 3250 }, { "epoch": 2.431021625652498, "grad_norm": 0.6180935141585874, "learning_rate": 5e-06, "loss": 0.4148, "step": 3260 }, { "epoch": 2.4384787472035794, "grad_norm": 0.5966462669608655, "learning_rate": 5e-06, "loss": 0.4071, "step": 3270 }, { "epoch": 2.4459358687546606, "grad_norm": 0.6318483897236675, "learning_rate": 5e-06, "loss": 0.4222, "step": 3280 }, { "epoch": 2.453392990305742, "grad_norm": 0.6445808903137813, "learning_rate": 5e-06, "loss": 0.4152, "step": 3290 }, { "epoch": 2.460850111856823, "grad_norm": 0.6047687718392507, "learning_rate": 5e-06, "loss": 0.4051, "step": 3300 }, { "epoch": 2.4683072334079044, "grad_norm": 0.6113185157541442, "learning_rate": 5e-06, "loss": 0.4221, "step": 3310 }, { "epoch": 2.4757643549589856, "grad_norm": 0.5912448445164308, "learning_rate": 5e-06, "loss": 0.4116, "step": 3320 }, { "epoch": 2.4832214765100673, "grad_norm": 0.6710834935010201, "learning_rate": 5e-06, "loss": 0.4211, "step": 3330 }, { "epoch": 2.4906785980611486, "grad_norm": 0.5855123869668963, "learning_rate": 5e-06, "loss": 0.4176, "step": 3340 }, { "epoch": 2.49813571961223, "grad_norm": 0.5672278638197414, "learning_rate": 5e-06, "loss": 0.4283, "step": 3350 }, { "epoch": 2.505592841163311, "grad_norm": 0.6623637796001947, "learning_rate": 5e-06, "loss": 0.4207, "step": 3360 }, { "epoch": 2.5130499627143923, "grad_norm": 0.5429235298023167, "learning_rate": 5e-06, "loss": 0.4239, "step": 3370 }, { "epoch": 2.5205070842654735, "grad_norm": 0.5553083911368453, "learning_rate": 5e-06, "loss": 0.4266, "step": 3380 }, { "epoch": 2.527964205816555, "grad_norm": 0.5876357465241916, "learning_rate": 5e-06, "loss": 0.4262, "step": 3390 }, { "epoch": 2.535421327367636, "grad_norm": 0.6222644485385007, "learning_rate": 5e-06, "loss": 0.4082, "step": 3400 }, { "epoch": 2.5428784489187173, "grad_norm": 0.6327192739316518, "learning_rate": 5e-06, "loss": 0.4123, "step": 3410 }, { "epoch": 2.5503355704697985, "grad_norm": 0.5833750402335252, "learning_rate": 5e-06, "loss": 0.4177, "step": 3420 }, { "epoch": 2.5577926920208798, "grad_norm": 0.642842699957225, "learning_rate": 5e-06, "loss": 0.4238, "step": 3430 }, { "epoch": 2.5652498135719615, "grad_norm": 0.6050955007783061, "learning_rate": 5e-06, "loss": 0.4333, "step": 3440 }, { "epoch": 2.5727069351230423, "grad_norm": 0.616598274296138, "learning_rate": 5e-06, "loss": 0.4253, "step": 3450 }, { "epoch": 2.580164056674124, "grad_norm": 0.6524589708431916, "learning_rate": 5e-06, "loss": 0.4249, "step": 3460 }, { "epoch": 2.587621178225205, "grad_norm": 0.5751310148688968, "learning_rate": 5e-06, "loss": 0.4177, "step": 3470 }, { "epoch": 2.5950782997762865, "grad_norm": 0.627879388176911, "learning_rate": 5e-06, "loss": 0.4161, "step": 3480 }, { "epoch": 2.6025354213273677, "grad_norm": 0.6201429646996841, "learning_rate": 5e-06, "loss": 0.4163, "step": 3490 }, { "epoch": 2.609992542878449, "grad_norm": 0.5820543874841265, "learning_rate": 5e-06, "loss": 0.4186, "step": 3500 }, { "epoch": 2.61744966442953, "grad_norm": 0.6017708715046648, "learning_rate": 5e-06, "loss": 0.428, "step": 3510 }, { "epoch": 2.6249067859806114, "grad_norm": 0.6272846852335328, "learning_rate": 5e-06, "loss": 0.4171, "step": 3520 }, { "epoch": 2.6323639075316927, "grad_norm": 0.6171944158126793, "learning_rate": 5e-06, "loss": 0.4208, "step": 3530 }, { "epoch": 2.639821029082774, "grad_norm": 0.5757983007676734, "learning_rate": 5e-06, "loss": 0.4232, "step": 3540 }, { "epoch": 2.647278150633855, "grad_norm": 0.615459114965357, "learning_rate": 5e-06, "loss": 0.4145, "step": 3550 }, { "epoch": 2.6547352721849364, "grad_norm": 0.642264448040409, "learning_rate": 5e-06, "loss": 0.428, "step": 3560 }, { "epoch": 2.662192393736018, "grad_norm": 0.5989193246950945, "learning_rate": 5e-06, "loss": 0.4271, "step": 3570 }, { "epoch": 2.669649515287099, "grad_norm": 0.5516635613380466, "learning_rate": 5e-06, "loss": 0.4153, "step": 3580 }, { "epoch": 2.6771066368381806, "grad_norm": 0.6619116578052296, "learning_rate": 5e-06, "loss": 0.4157, "step": 3590 }, { "epoch": 2.684563758389262, "grad_norm": 0.6371720904571588, "learning_rate": 5e-06, "loss": 0.427, "step": 3600 }, { "epoch": 2.692020879940343, "grad_norm": 0.5757772064087978, "learning_rate": 5e-06, "loss": 0.4141, "step": 3610 }, { "epoch": 2.6994780014914244, "grad_norm": 0.6037614796783571, "learning_rate": 5e-06, "loss": 0.4169, "step": 3620 }, { "epoch": 2.7069351230425056, "grad_norm": 0.6349534794022433, "learning_rate": 5e-06, "loss": 0.4363, "step": 3630 }, { "epoch": 2.714392244593587, "grad_norm": 0.5873004454383765, "learning_rate": 5e-06, "loss": 0.4247, "step": 3640 }, { "epoch": 2.721849366144668, "grad_norm": 0.5840849381722206, "learning_rate": 5e-06, "loss": 0.4275, "step": 3650 }, { "epoch": 2.7293064876957494, "grad_norm": 0.5723518390413971, "learning_rate": 5e-06, "loss": 0.4294, "step": 3660 }, { "epoch": 2.7367636092468306, "grad_norm": 0.5792593078543581, "learning_rate": 5e-06, "loss": 0.4274, "step": 3670 }, { "epoch": 2.744220730797912, "grad_norm": 0.5770536517441345, "learning_rate": 5e-06, "loss": 0.4184, "step": 3680 }, { "epoch": 2.751677852348993, "grad_norm": 0.5803564642723817, "learning_rate": 5e-06, "loss": 0.4172, "step": 3690 }, { "epoch": 2.759134973900075, "grad_norm": 0.610053209093683, "learning_rate": 5e-06, "loss": 0.4189, "step": 3700 }, { "epoch": 2.7665920954511556, "grad_norm": 0.6101684327110989, "learning_rate": 5e-06, "loss": 0.4219, "step": 3710 }, { "epoch": 2.7740492170022373, "grad_norm": 0.5904638045362951, "learning_rate": 5e-06, "loss": 0.4199, "step": 3720 }, { "epoch": 2.7815063385533185, "grad_norm": 0.6123335251469906, "learning_rate": 5e-06, "loss": 0.417, "step": 3730 }, { "epoch": 2.7889634601043998, "grad_norm": 0.6433911863159033, "learning_rate": 5e-06, "loss": 0.4247, "step": 3740 }, { "epoch": 2.796420581655481, "grad_norm": 0.6031149493474106, "learning_rate": 5e-06, "loss": 0.4185, "step": 3750 }, { "epoch": 2.8038777032065623, "grad_norm": 0.5895736114737712, "learning_rate": 5e-06, "loss": 0.4289, "step": 3760 }, { "epoch": 2.8113348247576435, "grad_norm": 0.5882027658002406, "learning_rate": 5e-06, "loss": 0.4295, "step": 3770 }, { "epoch": 2.8187919463087248, "grad_norm": 0.6697395749462909, "learning_rate": 5e-06, "loss": 0.4278, "step": 3780 }, { "epoch": 2.826249067859806, "grad_norm": 0.61486633354577, "learning_rate": 5e-06, "loss": 0.4194, "step": 3790 }, { "epoch": 2.8337061894108873, "grad_norm": 0.6062660800570471, "learning_rate": 5e-06, "loss": 0.4191, "step": 3800 }, { "epoch": 2.841163310961969, "grad_norm": 0.5823531490175686, "learning_rate": 5e-06, "loss": 0.4249, "step": 3810 }, { "epoch": 2.8486204325130497, "grad_norm": 0.6594942240638506, "learning_rate": 5e-06, "loss": 0.4141, "step": 3820 }, { "epoch": 2.8560775540641314, "grad_norm": 0.5894447911469776, "learning_rate": 5e-06, "loss": 0.4226, "step": 3830 }, { "epoch": 2.8635346756152127, "grad_norm": 0.6257621624046176, "learning_rate": 5e-06, "loss": 0.4237, "step": 3840 }, { "epoch": 2.870991797166294, "grad_norm": 0.5507526853748216, "learning_rate": 5e-06, "loss": 0.4247, "step": 3850 }, { "epoch": 2.878448918717375, "grad_norm": 0.555588132655637, "learning_rate": 5e-06, "loss": 0.4259, "step": 3860 }, { "epoch": 2.8859060402684564, "grad_norm": 0.617068246539021, "learning_rate": 5e-06, "loss": 0.4307, "step": 3870 }, { "epoch": 2.8933631618195377, "grad_norm": 0.5957184006665759, "learning_rate": 5e-06, "loss": 0.4302, "step": 3880 }, { "epoch": 2.900820283370619, "grad_norm": 0.5821306913096179, "learning_rate": 5e-06, "loss": 0.4197, "step": 3890 }, { "epoch": 2.9082774049217, "grad_norm": 0.5948347831059887, "learning_rate": 5e-06, "loss": 0.4196, "step": 3900 }, { "epoch": 2.9157345264727814, "grad_norm": 0.5748015527924519, "learning_rate": 5e-06, "loss": 0.4234, "step": 3910 }, { "epoch": 2.9231916480238627, "grad_norm": 0.573330338580251, "learning_rate": 5e-06, "loss": 0.4155, "step": 3920 }, { "epoch": 2.930648769574944, "grad_norm": 0.5959038101452754, "learning_rate": 5e-06, "loss": 0.4245, "step": 3930 }, { "epoch": 2.9381058911260256, "grad_norm": 0.6327154198497399, "learning_rate": 5e-06, "loss": 0.4141, "step": 3940 }, { "epoch": 2.9455630126771064, "grad_norm": 0.5879319738064069, "learning_rate": 5e-06, "loss": 0.4111, "step": 3950 }, { "epoch": 2.953020134228188, "grad_norm": 0.5850847757389654, "learning_rate": 5e-06, "loss": 0.413, "step": 3960 }, { "epoch": 2.9604772557792693, "grad_norm": 0.6440418621171623, "learning_rate": 5e-06, "loss": 0.4254, "step": 3970 }, { "epoch": 2.9679343773303506, "grad_norm": 0.6104743322838082, "learning_rate": 5e-06, "loss": 0.4116, "step": 3980 }, { "epoch": 2.975391498881432, "grad_norm": 0.5820609145361088, "learning_rate": 5e-06, "loss": 0.4188, "step": 3990 }, { "epoch": 2.982848620432513, "grad_norm": 0.6266879443777426, "learning_rate": 5e-06, "loss": 0.4393, "step": 4000 }, { "epoch": 2.9903057419835943, "grad_norm": 0.5751703131935023, "learning_rate": 5e-06, "loss": 0.4298, "step": 4010 }, { "epoch": 2.9977628635346756, "grad_norm": 0.6279401240117848, "learning_rate": 5e-06, "loss": 0.4292, "step": 4020 }, { "epoch": 3.0, "eval_loss": 0.5737335681915283, "eval_runtime": 341.8895, "eval_samples_per_second": 26.418, "eval_steps_per_second": 0.415, "step": 4023 }, { "epoch": 3.0, "step": 4023, "total_flos": 2108813507297280.0, "train_loss": 0.4856802497430432, "train_runtime": 54595.2546, "train_samples_per_second": 9.429, "train_steps_per_second": 0.074 } ], "logging_steps": 10, "max_steps": 4023, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2108813507297280.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }