{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9963459196102313, "eval_steps": 500, "global_step": 1230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024360535931790498, "grad_norm": 11.305159308546282, "learning_rate": 5e-06, "loss": 0.8887, "step": 10 }, { "epoch": 0.048721071863580996, "grad_norm": 4.407971880258946, "learning_rate": 5e-06, "loss": 0.7976, "step": 20 }, { "epoch": 0.0730816077953715, "grad_norm": 1.3980174472682592, "learning_rate": 5e-06, "loss": 0.767, "step": 30 }, { "epoch": 0.09744214372716199, "grad_norm": 4.101997301199602, "learning_rate": 5e-06, "loss": 0.7475, "step": 40 }, { "epoch": 0.1218026796589525, "grad_norm": 3.8298876079533914, "learning_rate": 5e-06, "loss": 0.7403, "step": 50 }, { "epoch": 0.146163215590743, "grad_norm": 1.120036674314868, "learning_rate": 5e-06, "loss": 0.7256, "step": 60 }, { "epoch": 0.1705237515225335, "grad_norm": 0.7504651491119281, "learning_rate": 5e-06, "loss": 0.7212, "step": 70 }, { "epoch": 0.19488428745432398, "grad_norm": 0.8705482174733401, "learning_rate": 5e-06, "loss": 0.6967, "step": 80 }, { "epoch": 0.2192448233861145, "grad_norm": 0.8066011359814329, "learning_rate": 5e-06, "loss": 0.6944, "step": 90 }, { "epoch": 0.243605359317905, "grad_norm": 0.7143294307124043, "learning_rate": 5e-06, "loss": 0.6991, "step": 100 }, { "epoch": 0.2679658952496955, "grad_norm": 0.5219278289863366, "learning_rate": 5e-06, "loss": 0.6904, "step": 110 }, { "epoch": 0.292326431181486, "grad_norm": 0.49976792832548467, "learning_rate": 5e-06, "loss": 0.6917, "step": 120 }, { "epoch": 0.3166869671132765, "grad_norm": 0.8671962194472669, "learning_rate": 5e-06, "loss": 0.6923, "step": 130 }, { "epoch": 0.341047503045067, "grad_norm": 0.4958220955019927, "learning_rate": 5e-06, "loss": 0.6886, "step": 140 }, { "epoch": 0.3654080389768575, "grad_norm": 0.5491440067010557, "learning_rate": 5e-06, "loss": 0.6844, "step": 150 }, { "epoch": 0.38976857490864797, "grad_norm": 0.5764231325699036, "learning_rate": 5e-06, "loss": 0.6873, "step": 160 }, { "epoch": 0.41412911084043846, "grad_norm": 0.4866036070242275, "learning_rate": 5e-06, "loss": 0.6773, "step": 170 }, { "epoch": 0.438489646772229, "grad_norm": 0.7229793933095654, "learning_rate": 5e-06, "loss": 0.6801, "step": 180 }, { "epoch": 0.4628501827040195, "grad_norm": 0.5825475907586349, "learning_rate": 5e-06, "loss": 0.6806, "step": 190 }, { "epoch": 0.48721071863581, "grad_norm": 0.641550842935756, "learning_rate": 5e-06, "loss": 0.677, "step": 200 }, { "epoch": 0.5115712545676004, "grad_norm": 0.555875854836963, "learning_rate": 5e-06, "loss": 0.669, "step": 210 }, { "epoch": 0.535931790499391, "grad_norm": 0.4380177981926619, "learning_rate": 5e-06, "loss": 0.6672, "step": 220 }, { "epoch": 0.5602923264311814, "grad_norm": 0.5586357299552903, "learning_rate": 5e-06, "loss": 0.6696, "step": 230 }, { "epoch": 0.584652862362972, "grad_norm": 0.5268423895517483, "learning_rate": 5e-06, "loss": 0.6761, "step": 240 }, { "epoch": 0.6090133982947625, "grad_norm": 0.5068291541548725, "learning_rate": 5e-06, "loss": 0.6672, "step": 250 }, { "epoch": 0.633373934226553, "grad_norm": 0.7203145859800878, "learning_rate": 5e-06, "loss": 0.6758, "step": 260 }, { "epoch": 0.6577344701583435, "grad_norm": 0.4843027545014372, "learning_rate": 5e-06, "loss": 0.6684, "step": 270 }, { "epoch": 0.682095006090134, "grad_norm": 0.4654716032330135, "learning_rate": 5e-06, "loss": 0.6674, "step": 280 }, { "epoch": 0.7064555420219245, "grad_norm": 0.48677469218469316, "learning_rate": 5e-06, "loss": 0.657, "step": 290 }, { "epoch": 0.730816077953715, "grad_norm": 0.501936617406133, "learning_rate": 5e-06, "loss": 0.666, "step": 300 }, { "epoch": 0.7551766138855055, "grad_norm": 0.4189199112711787, "learning_rate": 5e-06, "loss": 0.6672, "step": 310 }, { "epoch": 0.7795371498172959, "grad_norm": 0.525860628294632, "learning_rate": 5e-06, "loss": 0.6625, "step": 320 }, { "epoch": 0.8038976857490865, "grad_norm": 0.5055516889416151, "learning_rate": 5e-06, "loss": 0.6687, "step": 330 }, { "epoch": 0.8282582216808769, "grad_norm": 0.5030088195887705, "learning_rate": 5e-06, "loss": 0.6622, "step": 340 }, { "epoch": 0.8526187576126675, "grad_norm": 0.4409999841350699, "learning_rate": 5e-06, "loss": 0.659, "step": 350 }, { "epoch": 0.876979293544458, "grad_norm": 0.49889143289837934, "learning_rate": 5e-06, "loss": 0.664, "step": 360 }, { "epoch": 0.9013398294762485, "grad_norm": 0.46333426563091684, "learning_rate": 5e-06, "loss": 0.6647, "step": 370 }, { "epoch": 0.925700365408039, "grad_norm": 0.4132898286035426, "learning_rate": 5e-06, "loss": 0.6604, "step": 380 }, { "epoch": 0.9500609013398295, "grad_norm": 0.4602502572358803, "learning_rate": 5e-06, "loss": 0.663, "step": 390 }, { "epoch": 0.97442143727162, "grad_norm": 0.586425378319964, "learning_rate": 5e-06, "loss": 0.6588, "step": 400 }, { "epoch": 0.9987819732034104, "grad_norm": 0.4637558734433708, "learning_rate": 5e-06, "loss": 0.6557, "step": 410 }, { "epoch": 0.9987819732034104, "eval_loss": 0.6518880128860474, "eval_runtime": 221.2706, "eval_samples_per_second": 49.966, "eval_steps_per_second": 0.393, "step": 410 }, { "epoch": 1.0231425091352009, "grad_norm": 0.6010683259164777, "learning_rate": 5e-06, "loss": 0.6207, "step": 420 }, { "epoch": 1.0475030450669915, "grad_norm": 0.6050810738565418, "learning_rate": 5e-06, "loss": 0.61, "step": 430 }, { "epoch": 1.071863580998782, "grad_norm": 0.4799441913834175, "learning_rate": 5e-06, "loss": 0.617, "step": 440 }, { "epoch": 1.0962241169305724, "grad_norm": 0.41533745441354586, "learning_rate": 5e-06, "loss": 0.6233, "step": 450 }, { "epoch": 1.1205846528623629, "grad_norm": 0.42865808124947796, "learning_rate": 5e-06, "loss": 0.616, "step": 460 }, { "epoch": 1.1449451887941535, "grad_norm": 0.5620085827072487, "learning_rate": 5e-06, "loss": 0.6226, "step": 470 }, { "epoch": 1.169305724725944, "grad_norm": 0.47328106114801194, "learning_rate": 5e-06, "loss": 0.609, "step": 480 }, { "epoch": 1.1936662606577344, "grad_norm": 0.4720567281560868, "learning_rate": 5e-06, "loss": 0.6143, "step": 490 }, { "epoch": 1.218026796589525, "grad_norm": 0.44112203366329256, "learning_rate": 5e-06, "loss": 0.614, "step": 500 }, { "epoch": 1.2423873325213155, "grad_norm": 0.5187652730488376, "learning_rate": 5e-06, "loss": 0.6199, "step": 510 }, { "epoch": 1.266747868453106, "grad_norm": 0.5638861172624315, "learning_rate": 5e-06, "loss": 0.619, "step": 520 }, { "epoch": 1.2911084043848966, "grad_norm": 0.5972907620170446, "learning_rate": 5e-06, "loss": 0.6182, "step": 530 }, { "epoch": 1.315468940316687, "grad_norm": 0.5314321040836214, "learning_rate": 5e-06, "loss": 0.619, "step": 540 }, { "epoch": 1.3398294762484775, "grad_norm": 0.5459662859735409, "learning_rate": 5e-06, "loss": 0.6183, "step": 550 }, { "epoch": 1.364190012180268, "grad_norm": 0.5202733547748785, "learning_rate": 5e-06, "loss": 0.618, "step": 560 }, { "epoch": 1.3885505481120584, "grad_norm": 0.4161689870213624, "learning_rate": 5e-06, "loss": 0.6101, "step": 570 }, { "epoch": 1.412911084043849, "grad_norm": 0.46394109509695763, "learning_rate": 5e-06, "loss": 0.6274, "step": 580 }, { "epoch": 1.4372716199756395, "grad_norm": 0.4808851283054136, "learning_rate": 5e-06, "loss": 0.6087, "step": 590 }, { "epoch": 1.46163215590743, "grad_norm": 0.5411540324211217, "learning_rate": 5e-06, "loss": 0.6215, "step": 600 }, { "epoch": 1.4859926918392206, "grad_norm": 0.5416915020329361, "learning_rate": 5e-06, "loss": 0.6167, "step": 610 }, { "epoch": 1.510353227771011, "grad_norm": 0.527607596364707, "learning_rate": 5e-06, "loss": 0.6128, "step": 620 }, { "epoch": 1.5347137637028014, "grad_norm": 0.520963657326471, "learning_rate": 5e-06, "loss": 0.6137, "step": 630 }, { "epoch": 1.559074299634592, "grad_norm": 0.4366228046959017, "learning_rate": 5e-06, "loss": 0.6171, "step": 640 }, { "epoch": 1.5834348355663823, "grad_norm": 0.5504251670894937, "learning_rate": 5e-06, "loss": 0.6143, "step": 650 }, { "epoch": 1.607795371498173, "grad_norm": 0.4715628019229569, "learning_rate": 5e-06, "loss": 0.6202, "step": 660 }, { "epoch": 1.6321559074299634, "grad_norm": 0.5291464708625646, "learning_rate": 5e-06, "loss": 0.6155, "step": 670 }, { "epoch": 1.6565164433617539, "grad_norm": 0.4355159440359265, "learning_rate": 5e-06, "loss": 0.6162, "step": 680 }, { "epoch": 1.6808769792935445, "grad_norm": 0.5112620919843524, "learning_rate": 5e-06, "loss": 0.6279, "step": 690 }, { "epoch": 1.705237515225335, "grad_norm": 0.57875404757705, "learning_rate": 5e-06, "loss": 0.6176, "step": 700 }, { "epoch": 1.7295980511571254, "grad_norm": 0.4410704500201331, "learning_rate": 5e-06, "loss": 0.6195, "step": 710 }, { "epoch": 1.753958587088916, "grad_norm": 0.5587895103691882, "learning_rate": 5e-06, "loss": 0.6194, "step": 720 }, { "epoch": 1.7783191230207065, "grad_norm": 0.4941053548445359, "learning_rate": 5e-06, "loss": 0.6096, "step": 730 }, { "epoch": 1.802679658952497, "grad_norm": 0.5227563230610854, "learning_rate": 5e-06, "loss": 0.6102, "step": 740 }, { "epoch": 1.8270401948842876, "grad_norm": 0.4591897668705156, "learning_rate": 5e-06, "loss": 0.6117, "step": 750 }, { "epoch": 1.8514007308160778, "grad_norm": 0.5103376738813472, "learning_rate": 5e-06, "loss": 0.6134, "step": 760 }, { "epoch": 1.8757612667478685, "grad_norm": 0.532214266722337, "learning_rate": 5e-06, "loss": 0.6102, "step": 770 }, { "epoch": 1.900121802679659, "grad_norm": 0.4632257568024349, "learning_rate": 5e-06, "loss": 0.6218, "step": 780 }, { "epoch": 1.9244823386114494, "grad_norm": 0.5412849420492728, "learning_rate": 5e-06, "loss": 0.6109, "step": 790 }, { "epoch": 1.94884287454324, "grad_norm": 0.48808240750337195, "learning_rate": 5e-06, "loss": 0.6176, "step": 800 }, { "epoch": 1.9732034104750305, "grad_norm": 0.4761455418357999, "learning_rate": 5e-06, "loss": 0.6098, "step": 810 }, { "epoch": 1.997563946406821, "grad_norm": 0.4534197510006015, "learning_rate": 5e-06, "loss": 0.6082, "step": 820 }, { "epoch": 2.0, "eval_loss": 0.6419612765312195, "eval_runtime": 221.435, "eval_samples_per_second": 49.929, "eval_steps_per_second": 0.393, "step": 821 }, { "epoch": 2.0219244823386116, "grad_norm": 0.6074772099873261, "learning_rate": 5e-06, "loss": 0.5769, "step": 830 }, { "epoch": 2.0462850182704018, "grad_norm": 0.5110291152400608, "learning_rate": 5e-06, "loss": 0.564, "step": 840 }, { "epoch": 2.0706455542021924, "grad_norm": 0.740312554525951, "learning_rate": 5e-06, "loss": 0.5717, "step": 850 }, { "epoch": 2.095006090133983, "grad_norm": 0.5821754748157193, "learning_rate": 5e-06, "loss": 0.5726, "step": 860 }, { "epoch": 2.1193666260657733, "grad_norm": 0.53860209415622, "learning_rate": 5e-06, "loss": 0.5742, "step": 870 }, { "epoch": 2.143727161997564, "grad_norm": 0.5215524148222913, "learning_rate": 5e-06, "loss": 0.564, "step": 880 }, { "epoch": 2.1680876979293546, "grad_norm": 0.6458934700822203, "learning_rate": 5e-06, "loss": 0.5724, "step": 890 }, { "epoch": 2.192448233861145, "grad_norm": 0.4435184357785445, "learning_rate": 5e-06, "loss": 0.5684, "step": 900 }, { "epoch": 2.2168087697929355, "grad_norm": 0.5416262844784988, "learning_rate": 5e-06, "loss": 0.5718, "step": 910 }, { "epoch": 2.2411693057247257, "grad_norm": 0.4739984176413269, "learning_rate": 5e-06, "loss": 0.5756, "step": 920 }, { "epoch": 2.2655298416565164, "grad_norm": 0.47994087642094213, "learning_rate": 5e-06, "loss": 0.5742, "step": 930 }, { "epoch": 2.289890377588307, "grad_norm": 0.4742359512444407, "learning_rate": 5e-06, "loss": 0.5731, "step": 940 }, { "epoch": 2.3142509135200973, "grad_norm": 0.5586334439764152, "learning_rate": 5e-06, "loss": 0.576, "step": 950 }, { "epoch": 2.338611449451888, "grad_norm": 0.49880213092932163, "learning_rate": 5e-06, "loss": 0.5799, "step": 960 }, { "epoch": 2.3629719853836786, "grad_norm": 0.49935902866105975, "learning_rate": 5e-06, "loss": 0.5762, "step": 970 }, { "epoch": 2.387332521315469, "grad_norm": 0.5465185670805549, "learning_rate": 5e-06, "loss": 0.5717, "step": 980 }, { "epoch": 2.4116930572472595, "grad_norm": 0.4986248004640357, "learning_rate": 5e-06, "loss": 0.5772, "step": 990 }, { "epoch": 2.43605359317905, "grad_norm": 0.5423471098966955, "learning_rate": 5e-06, "loss": 0.5804, "step": 1000 }, { "epoch": 2.4604141291108403, "grad_norm": 0.5193096800667882, "learning_rate": 5e-06, "loss": 0.5691, "step": 1010 }, { "epoch": 2.484774665042631, "grad_norm": 0.4590023482690989, "learning_rate": 5e-06, "loss": 0.5741, "step": 1020 }, { "epoch": 2.5091352009744217, "grad_norm": 0.4671536002975626, "learning_rate": 5e-06, "loss": 0.5714, "step": 1030 }, { "epoch": 2.533495736906212, "grad_norm": 0.5523685876104364, "learning_rate": 5e-06, "loss": 0.5734, "step": 1040 }, { "epoch": 2.5578562728380025, "grad_norm": 0.6868866709072206, "learning_rate": 5e-06, "loss": 0.5728, "step": 1050 }, { "epoch": 2.582216808769793, "grad_norm": 0.5582819992545279, "learning_rate": 5e-06, "loss": 0.5737, "step": 1060 }, { "epoch": 2.6065773447015834, "grad_norm": 0.4702857244191192, "learning_rate": 5e-06, "loss": 0.566, "step": 1070 }, { "epoch": 2.630937880633374, "grad_norm": 0.6487634608204832, "learning_rate": 5e-06, "loss": 0.5818, "step": 1080 }, { "epoch": 2.6552984165651643, "grad_norm": 0.4736967537062896, "learning_rate": 5e-06, "loss": 0.5753, "step": 1090 }, { "epoch": 2.679658952496955, "grad_norm": 0.5348827813693043, "learning_rate": 5e-06, "loss": 0.5771, "step": 1100 }, { "epoch": 2.704019488428745, "grad_norm": 0.5028960700092897, "learning_rate": 5e-06, "loss": 0.5713, "step": 1110 }, { "epoch": 2.728380024360536, "grad_norm": 0.4780698681645441, "learning_rate": 5e-06, "loss": 0.5746, "step": 1120 }, { "epoch": 2.7527405602923265, "grad_norm": 0.4864478553500122, "learning_rate": 5e-06, "loss": 0.5752, "step": 1130 }, { "epoch": 2.7771010962241167, "grad_norm": 0.4667264912708201, "learning_rate": 5e-06, "loss": 0.5772, "step": 1140 }, { "epoch": 2.8014616321559074, "grad_norm": 0.45394076375291925, "learning_rate": 5e-06, "loss": 0.5823, "step": 1150 }, { "epoch": 2.825822168087698, "grad_norm": 0.5161201565392174, "learning_rate": 5e-06, "loss": 0.5815, "step": 1160 }, { "epoch": 2.8501827040194883, "grad_norm": 0.5076152963599294, "learning_rate": 5e-06, "loss": 0.5784, "step": 1170 }, { "epoch": 2.874543239951279, "grad_norm": 0.4752319372351976, "learning_rate": 5e-06, "loss": 0.5791, "step": 1180 }, { "epoch": 2.8989037758830696, "grad_norm": 0.533679377576446, "learning_rate": 5e-06, "loss": 0.5796, "step": 1190 }, { "epoch": 2.92326431181486, "grad_norm": 0.4952941664544987, "learning_rate": 5e-06, "loss": 0.5735, "step": 1200 }, { "epoch": 2.9476248477466505, "grad_norm": 0.4611730832059269, "learning_rate": 5e-06, "loss": 0.5748, "step": 1210 }, { "epoch": 2.971985383678441, "grad_norm": 0.5882799223730999, "learning_rate": 5e-06, "loss": 0.5781, "step": 1220 }, { "epoch": 2.9963459196102313, "grad_norm": 0.4979608878944041, "learning_rate": 5e-06, "loss": 0.5706, "step": 1230 }, { "epoch": 2.9963459196102313, "eval_loss": 0.6427608132362366, "eval_runtime": 221.8996, "eval_samples_per_second": 49.824, "eval_steps_per_second": 0.392, "step": 1230 }, { "epoch": 2.9963459196102313, "step": 1230, "total_flos": 2059877052579840.0, "train_loss": 0.6269122554034722, "train_runtime": 37089.6991, "train_samples_per_second": 16.991, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1230, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2059877052579840.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }