{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9982238010657194, "eval_steps": 500, "global_step": 1266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023682652457075192, "grad_norm": 1.5009291573426993, "learning_rate": 5e-06, "loss": 0.8889, "step": 10 }, { "epoch": 0.047365304914150384, "grad_norm": 1.2603811495508497, "learning_rate": 5e-06, "loss": 0.788, "step": 20 }, { "epoch": 0.07104795737122557, "grad_norm": 0.8337069878140884, "learning_rate": 5e-06, "loss": 0.7543, "step": 30 }, { "epoch": 0.09473060982830077, "grad_norm": 0.7724138071764233, "learning_rate": 5e-06, "loss": 0.7321, "step": 40 }, { "epoch": 0.11841326228537596, "grad_norm": 0.7604181173774881, "learning_rate": 5e-06, "loss": 0.7246, "step": 50 }, { "epoch": 0.14209591474245115, "grad_norm": 0.7070103274073624, "learning_rate": 5e-06, "loss": 0.7112, "step": 60 }, { "epoch": 0.16577856719952636, "grad_norm": 0.926548104967391, "learning_rate": 5e-06, "loss": 0.7088, "step": 70 }, { "epoch": 0.18946121965660154, "grad_norm": 0.5558770280577042, "learning_rate": 5e-06, "loss": 0.6938, "step": 80 }, { "epoch": 0.21314387211367672, "grad_norm": 0.6030257818397093, "learning_rate": 5e-06, "loss": 0.6949, "step": 90 }, { "epoch": 0.23682652457075193, "grad_norm": 0.5336538650662063, "learning_rate": 5e-06, "loss": 0.6836, "step": 100 }, { "epoch": 0.2605091770278271, "grad_norm": 0.8278552253316139, "learning_rate": 5e-06, "loss": 0.6821, "step": 110 }, { "epoch": 0.2841918294849023, "grad_norm": 0.6977545376176977, "learning_rate": 5e-06, "loss": 0.6782, "step": 120 }, { "epoch": 0.30787448194197753, "grad_norm": 0.6610810719616435, "learning_rate": 5e-06, "loss": 0.6889, "step": 130 }, { "epoch": 0.3315571343990527, "grad_norm": 0.5558037806422206, "learning_rate": 5e-06, "loss": 0.6785, "step": 140 }, { "epoch": 0.3552397868561279, "grad_norm": 0.636731427790056, "learning_rate": 5e-06, "loss": 0.6824, "step": 150 }, { "epoch": 0.3789224393132031, "grad_norm": 0.5856745299156039, "learning_rate": 5e-06, "loss": 0.6713, "step": 160 }, { "epoch": 0.40260509177027826, "grad_norm": 0.6346770000919194, "learning_rate": 5e-06, "loss": 0.6752, "step": 170 }, { "epoch": 0.42628774422735344, "grad_norm": 0.5928544013121315, "learning_rate": 5e-06, "loss": 0.6667, "step": 180 }, { "epoch": 0.4499703966844287, "grad_norm": 0.44291132727946475, "learning_rate": 5e-06, "loss": 0.6741, "step": 190 }, { "epoch": 0.47365304914150386, "grad_norm": 0.8183515848783448, "learning_rate": 5e-06, "loss": 0.6657, "step": 200 }, { "epoch": 0.49733570159857904, "grad_norm": 0.4527112724378749, "learning_rate": 5e-06, "loss": 0.6675, "step": 210 }, { "epoch": 0.5210183540556542, "grad_norm": 0.4926733749500316, "learning_rate": 5e-06, "loss": 0.6652, "step": 220 }, { "epoch": 0.5447010065127295, "grad_norm": 0.5595158055497421, "learning_rate": 5e-06, "loss": 0.6691, "step": 230 }, { "epoch": 0.5683836589698046, "grad_norm": 0.6125812031695264, "learning_rate": 5e-06, "loss": 0.6601, "step": 240 }, { "epoch": 0.5920663114268798, "grad_norm": 0.48674298646689895, "learning_rate": 5e-06, "loss": 0.6747, "step": 250 }, { "epoch": 0.6157489638839551, "grad_norm": 0.4613055270129497, "learning_rate": 5e-06, "loss": 0.6634, "step": 260 }, { "epoch": 0.6394316163410302, "grad_norm": 0.47597492722028856, "learning_rate": 5e-06, "loss": 0.6623, "step": 270 }, { "epoch": 0.6631142687981054, "grad_norm": 0.4692966269330733, "learning_rate": 5e-06, "loss": 0.6695, "step": 280 }, { "epoch": 0.6867969212551805, "grad_norm": 0.5086284371034546, "learning_rate": 5e-06, "loss": 0.6584, "step": 290 }, { "epoch": 0.7104795737122558, "grad_norm": 0.4936523688283777, "learning_rate": 5e-06, "loss": 0.6623, "step": 300 }, { "epoch": 0.7341622261693309, "grad_norm": 0.5392570381158606, "learning_rate": 5e-06, "loss": 0.6536, "step": 310 }, { "epoch": 0.7578448786264061, "grad_norm": 0.5646794511515978, "learning_rate": 5e-06, "loss": 0.6531, "step": 320 }, { "epoch": 0.7815275310834814, "grad_norm": 0.4694116213218389, "learning_rate": 5e-06, "loss": 0.657, "step": 330 }, { "epoch": 0.8052101835405565, "grad_norm": 0.46701567257467963, "learning_rate": 5e-06, "loss": 0.6601, "step": 340 }, { "epoch": 0.8288928359976317, "grad_norm": 0.42590717154516355, "learning_rate": 5e-06, "loss": 0.6504, "step": 350 }, { "epoch": 0.8525754884547069, "grad_norm": 0.45312976877971856, "learning_rate": 5e-06, "loss": 0.6512, "step": 360 }, { "epoch": 0.8762581409117821, "grad_norm": 0.8468203203584982, "learning_rate": 5e-06, "loss": 0.6512, "step": 370 }, { "epoch": 0.8999407933688574, "grad_norm": 0.542746765004271, "learning_rate": 5e-06, "loss": 0.652, "step": 380 }, { "epoch": 0.9236234458259325, "grad_norm": 0.5833444889695804, "learning_rate": 5e-06, "loss": 0.6454, "step": 390 }, { "epoch": 0.9473060982830077, "grad_norm": 0.6150273738613266, "learning_rate": 5e-06, "loss": 0.6529, "step": 400 }, { "epoch": 0.9709887507400828, "grad_norm": 0.4798722057884747, "learning_rate": 5e-06, "loss": 0.6503, "step": 410 }, { "epoch": 0.9946714031971581, "grad_norm": 0.6351351683445448, "learning_rate": 5e-06, "loss": 0.6498, "step": 420 }, { "epoch": 0.9994079336885732, "eval_loss": 0.6475206017494202, "eval_runtime": 225.2004, "eval_samples_per_second": 50.511, "eval_steps_per_second": 0.395, "step": 422 }, { "epoch": 1.0183540556542332, "grad_norm": 0.5657496620777399, "learning_rate": 5e-06, "loss": 0.6177, "step": 430 }, { "epoch": 1.0420367081113084, "grad_norm": 0.4892239120730945, "learning_rate": 5e-06, "loss": 0.6162, "step": 440 }, { "epoch": 1.0657193605683837, "grad_norm": 0.5991744290901849, "learning_rate": 5e-06, "loss": 0.6042, "step": 450 }, { "epoch": 1.089402013025459, "grad_norm": 0.44418054047007693, "learning_rate": 5e-06, "loss": 0.6107, "step": 460 }, { "epoch": 1.1130846654825342, "grad_norm": 0.5035800519212378, "learning_rate": 5e-06, "loss": 0.6124, "step": 470 }, { "epoch": 1.1367673179396092, "grad_norm": 0.4337632225925288, "learning_rate": 5e-06, "loss": 0.6113, "step": 480 }, { "epoch": 1.1604499703966844, "grad_norm": 0.5274563247162665, "learning_rate": 5e-06, "loss": 0.6042, "step": 490 }, { "epoch": 1.1841326228537596, "grad_norm": 0.6254329423737273, "learning_rate": 5e-06, "loss": 0.6144, "step": 500 }, { "epoch": 1.2078152753108349, "grad_norm": 0.5562298009085542, "learning_rate": 5e-06, "loss": 0.6182, "step": 510 }, { "epoch": 1.2314979277679101, "grad_norm": 0.6448107581853412, "learning_rate": 5e-06, "loss": 0.6178, "step": 520 }, { "epoch": 1.2551805802249851, "grad_norm": 0.5597759186037163, "learning_rate": 5e-06, "loss": 0.6051, "step": 530 }, { "epoch": 1.2788632326820604, "grad_norm": 0.5094254785710688, "learning_rate": 5e-06, "loss": 0.601, "step": 540 }, { "epoch": 1.3025458851391356, "grad_norm": 0.4694431260792789, "learning_rate": 5e-06, "loss": 0.6106, "step": 550 }, { "epoch": 1.3262285375962108, "grad_norm": 0.47790257844004136, "learning_rate": 5e-06, "loss": 0.6109, "step": 560 }, { "epoch": 1.349911190053286, "grad_norm": 0.5322942141679202, "learning_rate": 5e-06, "loss": 0.6101, "step": 570 }, { "epoch": 1.373593842510361, "grad_norm": 0.45819277751159426, "learning_rate": 5e-06, "loss": 0.6081, "step": 580 }, { "epoch": 1.3972764949674363, "grad_norm": 0.4501719179556634, "learning_rate": 5e-06, "loss": 0.6089, "step": 590 }, { "epoch": 1.4209591474245116, "grad_norm": 0.4542697399112238, "learning_rate": 5e-06, "loss": 0.6152, "step": 600 }, { "epoch": 1.4446417998815868, "grad_norm": 0.4553860474478459, "learning_rate": 5e-06, "loss": 0.6088, "step": 610 }, { "epoch": 1.468324452338662, "grad_norm": 0.4652671956652514, "learning_rate": 5e-06, "loss": 0.6067, "step": 620 }, { "epoch": 1.492007104795737, "grad_norm": 0.529442099891288, "learning_rate": 5e-06, "loss": 0.6102, "step": 630 }, { "epoch": 1.5156897572528123, "grad_norm": 0.5263954654202675, "learning_rate": 5e-06, "loss": 0.606, "step": 640 }, { "epoch": 1.5393724097098875, "grad_norm": 0.45995572644463967, "learning_rate": 5e-06, "loss": 0.6136, "step": 650 }, { "epoch": 1.5630550621669625, "grad_norm": 0.5170488701848859, "learning_rate": 5e-06, "loss": 0.6086, "step": 660 }, { "epoch": 1.586737714624038, "grad_norm": 0.4871128879860199, "learning_rate": 5e-06, "loss": 0.6086, "step": 670 }, { "epoch": 1.610420367081113, "grad_norm": 0.4974631876345873, "learning_rate": 5e-06, "loss": 0.6125, "step": 680 }, { "epoch": 1.6341030195381883, "grad_norm": 0.43324745659224545, "learning_rate": 5e-06, "loss": 0.6145, "step": 690 }, { "epoch": 1.6577856719952635, "grad_norm": 0.5290498388557234, "learning_rate": 5e-06, "loss": 0.6099, "step": 700 }, { "epoch": 1.6814683244523385, "grad_norm": 0.626734166124248, "learning_rate": 5e-06, "loss": 0.6101, "step": 710 }, { "epoch": 1.705150976909414, "grad_norm": 0.5053972210175365, "learning_rate": 5e-06, "loss": 0.6119, "step": 720 }, { "epoch": 1.728833629366489, "grad_norm": 0.4918219678126516, "learning_rate": 5e-06, "loss": 0.6108, "step": 730 }, { "epoch": 1.7525162818235642, "grad_norm": 0.47894871242204995, "learning_rate": 5e-06, "loss": 0.6083, "step": 740 }, { "epoch": 1.7761989342806395, "grad_norm": 0.46473527001016457, "learning_rate": 5e-06, "loss": 0.6052, "step": 750 }, { "epoch": 1.7998815867377145, "grad_norm": 0.5249910352303092, "learning_rate": 5e-06, "loss": 0.6105, "step": 760 }, { "epoch": 1.82356423919479, "grad_norm": 0.4939384702122645, "learning_rate": 5e-06, "loss": 0.6031, "step": 770 }, { "epoch": 1.847246891651865, "grad_norm": 0.53408366438213, "learning_rate": 5e-06, "loss": 0.6121, "step": 780 }, { "epoch": 1.8709295441089402, "grad_norm": 0.4462578146506802, "learning_rate": 5e-06, "loss": 0.6103, "step": 790 }, { "epoch": 1.8946121965660154, "grad_norm": 0.5031337633152204, "learning_rate": 5e-06, "loss": 0.6092, "step": 800 }, { "epoch": 1.9182948490230904, "grad_norm": 0.5118477000405415, "learning_rate": 5e-06, "loss": 0.6053, "step": 810 }, { "epoch": 1.941977501480166, "grad_norm": 0.4778147349903897, "learning_rate": 5e-06, "loss": 0.6063, "step": 820 }, { "epoch": 1.965660153937241, "grad_norm": 0.627775618514373, "learning_rate": 5e-06, "loss": 0.6096, "step": 830 }, { "epoch": 1.9893428063943162, "grad_norm": 0.47474187825291425, "learning_rate": 5e-06, "loss": 0.6075, "step": 840 }, { "epoch": 1.9988158673771461, "eval_loss": 0.6369723081588745, "eval_runtime": 225.3597, "eval_samples_per_second": 50.475, "eval_steps_per_second": 0.395, "step": 844 }, { "epoch": 2.0130254588513914, "grad_norm": 0.6033407582928859, "learning_rate": 5e-06, "loss": 0.5857, "step": 850 }, { "epoch": 2.0367081113084664, "grad_norm": 0.48338661130504035, "learning_rate": 5e-06, "loss": 0.5626, "step": 860 }, { "epoch": 2.060390763765542, "grad_norm": 0.5017850149423926, "learning_rate": 5e-06, "loss": 0.5622, "step": 870 }, { "epoch": 2.084073416222617, "grad_norm": 0.4965256354858608, "learning_rate": 5e-06, "loss": 0.5676, "step": 880 }, { "epoch": 2.1077560686796923, "grad_norm": 0.5332089820071153, "learning_rate": 5e-06, "loss": 0.5673, "step": 890 }, { "epoch": 2.1314387211367674, "grad_norm": 0.48166411012747123, "learning_rate": 5e-06, "loss": 0.5659, "step": 900 }, { "epoch": 2.1551213735938424, "grad_norm": 0.4411082419748644, "learning_rate": 5e-06, "loss": 0.5702, "step": 910 }, { "epoch": 2.178804026050918, "grad_norm": 0.561793122399257, "learning_rate": 5e-06, "loss": 0.5679, "step": 920 }, { "epoch": 2.202486678507993, "grad_norm": 0.5150369649525713, "learning_rate": 5e-06, "loss": 0.5701, "step": 930 }, { "epoch": 2.2261693309650683, "grad_norm": 0.5125519248465377, "learning_rate": 5e-06, "loss": 0.5682, "step": 940 }, { "epoch": 2.2498519834221433, "grad_norm": 0.45758653804353955, "learning_rate": 5e-06, "loss": 0.5656, "step": 950 }, { "epoch": 2.2735346358792183, "grad_norm": 0.6337954299365447, "learning_rate": 5e-06, "loss": 0.5654, "step": 960 }, { "epoch": 2.297217288336294, "grad_norm": 0.5455290125615978, "learning_rate": 5e-06, "loss": 0.5681, "step": 970 }, { "epoch": 2.320899940793369, "grad_norm": 0.5503953364640523, "learning_rate": 5e-06, "loss": 0.5707, "step": 980 }, { "epoch": 2.3445825932504443, "grad_norm": 0.47176571984059984, "learning_rate": 5e-06, "loss": 0.5709, "step": 990 }, { "epoch": 2.3682652457075193, "grad_norm": 0.5002162272632372, "learning_rate": 5e-06, "loss": 0.5677, "step": 1000 }, { "epoch": 2.3919478981645943, "grad_norm": 0.5573828816208966, "learning_rate": 5e-06, "loss": 0.5666, "step": 1010 }, { "epoch": 2.4156305506216698, "grad_norm": 0.5258827021378585, "learning_rate": 5e-06, "loss": 0.5725, "step": 1020 }, { "epoch": 2.4393132030787448, "grad_norm": 0.5589553231310385, "learning_rate": 5e-06, "loss": 0.5735, "step": 1030 }, { "epoch": 2.4629958555358202, "grad_norm": 0.5100602577703085, "learning_rate": 5e-06, "loss": 0.5653, "step": 1040 }, { "epoch": 2.4866785079928952, "grad_norm": 0.4294738146845761, "learning_rate": 5e-06, "loss": 0.5646, "step": 1050 }, { "epoch": 2.5103611604499703, "grad_norm": 0.548029454638703, "learning_rate": 5e-06, "loss": 0.5712, "step": 1060 }, { "epoch": 2.5340438129070457, "grad_norm": 0.5514898162995253, "learning_rate": 5e-06, "loss": 0.5658, "step": 1070 }, { "epoch": 2.5577264653641207, "grad_norm": 0.5518548821515326, "learning_rate": 5e-06, "loss": 0.5646, "step": 1080 }, { "epoch": 2.581409117821196, "grad_norm": 0.5138811036011879, "learning_rate": 5e-06, "loss": 0.5703, "step": 1090 }, { "epoch": 2.605091770278271, "grad_norm": 0.4749259332371036, "learning_rate": 5e-06, "loss": 0.5723, "step": 1100 }, { "epoch": 2.6287744227353462, "grad_norm": 0.6074130482912456, "learning_rate": 5e-06, "loss": 0.5759, "step": 1110 }, { "epoch": 2.6524570751924217, "grad_norm": 0.4662137107433661, "learning_rate": 5e-06, "loss": 0.5693, "step": 1120 }, { "epoch": 2.6761397276494967, "grad_norm": 0.47834715677436196, "learning_rate": 5e-06, "loss": 0.5701, "step": 1130 }, { "epoch": 2.699822380106572, "grad_norm": 0.49830991624964954, "learning_rate": 5e-06, "loss": 0.5656, "step": 1140 }, { "epoch": 2.723505032563647, "grad_norm": 0.6214961532916059, "learning_rate": 5e-06, "loss": 0.5722, "step": 1150 }, { "epoch": 2.747187685020722, "grad_norm": 0.44581858116965695, "learning_rate": 5e-06, "loss": 0.5749, "step": 1160 }, { "epoch": 2.7708703374777977, "grad_norm": 0.5288981698339921, "learning_rate": 5e-06, "loss": 0.5726, "step": 1170 }, { "epoch": 2.7945529899348727, "grad_norm": 0.4343690821526064, "learning_rate": 5e-06, "loss": 0.5759, "step": 1180 }, { "epoch": 2.818235642391948, "grad_norm": 0.5088546842925579, "learning_rate": 5e-06, "loss": 0.5681, "step": 1190 }, { "epoch": 2.841918294849023, "grad_norm": 0.4739475280428968, "learning_rate": 5e-06, "loss": 0.5711, "step": 1200 }, { "epoch": 2.865600947306098, "grad_norm": 0.4587392455090901, "learning_rate": 5e-06, "loss": 0.5662, "step": 1210 }, { "epoch": 2.8892835997631736, "grad_norm": 0.5566523612983056, "learning_rate": 5e-06, "loss": 0.576, "step": 1220 }, { "epoch": 2.9129662522202486, "grad_norm": 0.47828632727428366, "learning_rate": 5e-06, "loss": 0.5698, "step": 1230 }, { "epoch": 2.936648904677324, "grad_norm": 0.4432815139860892, "learning_rate": 5e-06, "loss": 0.5776, "step": 1240 }, { "epoch": 2.960331557134399, "grad_norm": 0.4472318974600453, "learning_rate": 5e-06, "loss": 0.5729, "step": 1250 }, { "epoch": 2.984014209591474, "grad_norm": 0.5067121626717436, "learning_rate": 5e-06, "loss": 0.5626, "step": 1260 }, { "epoch": 2.9982238010657194, "eval_loss": 0.6377888917922974, "eval_runtime": 225.9066, "eval_samples_per_second": 50.353, "eval_steps_per_second": 0.394, "step": 1266 }, { "epoch": 2.9982238010657194, "step": 1266, "total_flos": 2120178393415680.0, "train_loss": 0.6200182046182159, "train_runtime": 38034.9039, "train_samples_per_second": 17.046, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2120178393415680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }