{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998652991321415, "eval_steps": 500, "global_step": 6495, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015394384898108415, "grad_norm": 1.9140625, "learning_rate": 1.9692070823710548e-05, "loss": 1.1459, "num_input_tokens_seen": 3815960, "step": 100 }, { "epoch": 0.03078876979621683, "grad_norm": 1.5703125, "learning_rate": 1.9384141647421097e-05, "loss": 1.0878, "num_input_tokens_seen": 7588127, "step": 200 }, { "epoch": 0.04618315469432525, "grad_norm": 1.625, "learning_rate": 1.907621247113164e-05, "loss": 1.0942, "num_input_tokens_seen": 11373466, "step": 300 }, { "epoch": 0.06157753959243366, "grad_norm": 1.6328125, "learning_rate": 1.876828329484219e-05, "loss": 1.0788, "num_input_tokens_seen": 15148940, "step": 400 }, { "epoch": 0.07697192449054208, "grad_norm": 1.703125, "learning_rate": 1.8460354118552735e-05, "loss": 1.0807, "num_input_tokens_seen": 18941722, "step": 500 }, { "epoch": 0.0923663093886505, "grad_norm": 1.765625, "learning_rate": 1.815242494226328e-05, "loss": 1.0628, "num_input_tokens_seen": 22691439, "step": 600 }, { "epoch": 0.1077606942867589, "grad_norm": 1.890625, "learning_rate": 1.7844495765973827e-05, "loss": 1.057, "num_input_tokens_seen": 26519584, "step": 700 }, { "epoch": 0.12315507918486732, "grad_norm": 1.59375, "learning_rate": 1.7536566589684373e-05, "loss": 1.0591, "num_input_tokens_seen": 30335003, "step": 800 }, { "epoch": 0.13854946408297575, "grad_norm": 1.5859375, "learning_rate": 1.722863741339492e-05, "loss": 1.0597, "num_input_tokens_seen": 34122220, "step": 900 }, { "epoch": 0.15394384898108415, "grad_norm": 2.140625, "learning_rate": 1.6920708237105468e-05, "loss": 1.0784, "num_input_tokens_seen": 37937918, "step": 1000 }, { "epoch": 0.16933823387919256, "grad_norm": 1.7265625, "learning_rate": 1.6612779060816014e-05, "loss": 1.0598, "num_input_tokens_seen": 41762422, "step": 1100 }, { "epoch": 0.184732618777301, "grad_norm": 1.65625, "learning_rate": 1.630484988452656e-05, "loss": 1.0658, "num_input_tokens_seen": 45541351, "step": 1200 }, { "epoch": 0.2001270036754094, "grad_norm": 1.9453125, "learning_rate": 1.5996920708237106e-05, "loss": 1.0557, "num_input_tokens_seen": 49317373, "step": 1300 }, { "epoch": 0.2155213885735178, "grad_norm": 1.765625, "learning_rate": 1.5688991531947652e-05, "loss": 1.0597, "num_input_tokens_seen": 53117895, "step": 1400 }, { "epoch": 0.23091577347162623, "grad_norm": 1.765625, "learning_rate": 1.53810623556582e-05, "loss": 1.0616, "num_input_tokens_seen": 56839161, "step": 1500 }, { "epoch": 0.24631015836973463, "grad_norm": 1.609375, "learning_rate": 1.5073133179368746e-05, "loss": 1.053, "num_input_tokens_seen": 60623928, "step": 1600 }, { "epoch": 0.26170454326784304, "grad_norm": 1.609375, "learning_rate": 1.4765204003079292e-05, "loss": 1.0483, "num_input_tokens_seen": 64408460, "step": 1700 }, { "epoch": 0.2770989281659515, "grad_norm": 1.6640625, "learning_rate": 1.445727482678984e-05, "loss": 1.049, "num_input_tokens_seen": 68223027, "step": 1800 }, { "epoch": 0.2924933130640599, "grad_norm": 1.953125, "learning_rate": 1.4149345650500385e-05, "loss": 1.0482, "num_input_tokens_seen": 71994158, "step": 1900 }, { "epoch": 0.3078876979621683, "grad_norm": 1.765625, "learning_rate": 1.3841416474210933e-05, "loss": 1.0497, "num_input_tokens_seen": 75865760, "step": 2000 }, { "epoch": 0.3232820828602767, "grad_norm": 2.015625, "learning_rate": 1.3533487297921479e-05, "loss": 1.0409, "num_input_tokens_seen": 79623921, "step": 2100 }, { "epoch": 0.3386764677583851, "grad_norm": 1.7890625, "learning_rate": 1.3225558121632025e-05, "loss": 1.0501, "num_input_tokens_seen": 83411874, "step": 2200 }, { "epoch": 0.3540708526564935, "grad_norm": 1.6328125, "learning_rate": 1.2917628945342572e-05, "loss": 1.0542, "num_input_tokens_seen": 87182489, "step": 2300 }, { "epoch": 0.369465237554602, "grad_norm": 1.8671875, "learning_rate": 1.2609699769053118e-05, "loss": 1.0482, "num_input_tokens_seen": 90989837, "step": 2400 }, { "epoch": 0.3848596224527104, "grad_norm": 1.9375, "learning_rate": 1.2301770592763664e-05, "loss": 1.0404, "num_input_tokens_seen": 94853551, "step": 2500 }, { "epoch": 0.4002540073508188, "grad_norm": 2.015625, "learning_rate": 1.1993841416474212e-05, "loss": 1.0401, "num_input_tokens_seen": 98649900, "step": 2600 }, { "epoch": 0.4156483922489272, "grad_norm": 1.9609375, "learning_rate": 1.1685912240184758e-05, "loss": 1.0453, "num_input_tokens_seen": 102455430, "step": 2700 }, { "epoch": 0.4310427771470356, "grad_norm": 1.75, "learning_rate": 1.1377983063895306e-05, "loss": 1.0335, "num_input_tokens_seen": 106279858, "step": 2800 }, { "epoch": 0.44643716204514405, "grad_norm": 1.6484375, "learning_rate": 1.1070053887605852e-05, "loss": 1.0486, "num_input_tokens_seen": 110042375, "step": 2900 }, { "epoch": 0.46183154694325246, "grad_norm": 1.703125, "learning_rate": 1.0762124711316398e-05, "loss": 1.0347, "num_input_tokens_seen": 113836688, "step": 3000 }, { "epoch": 0.47722593184136086, "grad_norm": 1.7265625, "learning_rate": 1.0454195535026945e-05, "loss": 1.041, "num_input_tokens_seen": 117581458, "step": 3100 }, { "epoch": 0.49262031673946927, "grad_norm": 1.8359375, "learning_rate": 1.0146266358737491e-05, "loss": 1.0357, "num_input_tokens_seen": 121341275, "step": 3200 }, { "epoch": 0.5080147016375777, "grad_norm": 1.546875, "learning_rate": 9.838337182448037e-06, "loss": 1.0374, "num_input_tokens_seen": 125088162, "step": 3300 }, { "epoch": 0.5234090865356861, "grad_norm": 1.4453125, "learning_rate": 9.530408006158585e-06, "loss": 1.0255, "num_input_tokens_seen": 128901749, "step": 3400 }, { "epoch": 0.5388034714337945, "grad_norm": 1.46875, "learning_rate": 9.22247882986913e-06, "loss": 1.0282, "num_input_tokens_seen": 132736866, "step": 3500 }, { "epoch": 0.554197856331903, "grad_norm": 1.6953125, "learning_rate": 8.914549653579677e-06, "loss": 1.0398, "num_input_tokens_seen": 136595429, "step": 3600 }, { "epoch": 0.5695922412300114, "grad_norm": 1.7578125, "learning_rate": 8.606620477290224e-06, "loss": 1.025, "num_input_tokens_seen": 140426462, "step": 3700 }, { "epoch": 0.5849866261281198, "grad_norm": 1.5859375, "learning_rate": 8.29869130100077e-06, "loss": 1.0291, "num_input_tokens_seen": 144234914, "step": 3800 }, { "epoch": 0.6003810110262282, "grad_norm": 2.453125, "learning_rate": 7.990762124711316e-06, "loss": 1.0233, "num_input_tokens_seen": 148032058, "step": 3900 }, { "epoch": 0.6157753959243366, "grad_norm": 1.734375, "learning_rate": 7.682832948421864e-06, "loss": 1.0347, "num_input_tokens_seen": 151814536, "step": 4000 }, { "epoch": 0.631169780822445, "grad_norm": 1.7578125, "learning_rate": 7.37490377213241e-06, "loss": 1.0457, "num_input_tokens_seen": 155598931, "step": 4100 }, { "epoch": 0.6465641657205534, "grad_norm": 1.453125, "learning_rate": 7.066974595842957e-06, "loss": 1.0473, "num_input_tokens_seen": 159326528, "step": 4200 }, { "epoch": 0.6619585506186618, "grad_norm": 1.7109375, "learning_rate": 6.7590454195535035e-06, "loss": 1.0482, "num_input_tokens_seen": 163084806, "step": 4300 }, { "epoch": 0.6773529355167702, "grad_norm": 1.734375, "learning_rate": 6.4511162432640495e-06, "loss": 1.0267, "num_input_tokens_seen": 166891971, "step": 4400 }, { "epoch": 0.6927473204148786, "grad_norm": 1.703125, "learning_rate": 6.143187066974596e-06, "loss": 1.0165, "num_input_tokens_seen": 170700927, "step": 4500 }, { "epoch": 0.708141705312987, "grad_norm": 1.7421875, "learning_rate": 5.835257890685143e-06, "loss": 1.0386, "num_input_tokens_seen": 174509215, "step": 4600 }, { "epoch": 0.7235360902110956, "grad_norm": 1.859375, "learning_rate": 5.52732871439569e-06, "loss": 1.0286, "num_input_tokens_seen": 178355672, "step": 4700 }, { "epoch": 0.738930475109204, "grad_norm": 1.609375, "learning_rate": 5.219399538106236e-06, "loss": 1.0199, "num_input_tokens_seen": 182172598, "step": 4800 }, { "epoch": 0.7543248600073124, "grad_norm": 1.7734375, "learning_rate": 4.911470361816783e-06, "loss": 1.023, "num_input_tokens_seen": 185945174, "step": 4900 }, { "epoch": 0.7697192449054208, "grad_norm": 1.4765625, "learning_rate": 4.6035411855273295e-06, "loss": 1.0317, "num_input_tokens_seen": 189723364, "step": 5000 }, { "epoch": 0.7851136298035292, "grad_norm": 1.53125, "learning_rate": 4.2956120092378755e-06, "loss": 1.0262, "num_input_tokens_seen": 193515488, "step": 5100 }, { "epoch": 0.8005080147016376, "grad_norm": 1.7109375, "learning_rate": 3.987682832948422e-06, "loss": 1.0276, "num_input_tokens_seen": 197294619, "step": 5200 }, { "epoch": 0.815902399599746, "grad_norm": 1.6640625, "learning_rate": 3.6797536566589687e-06, "loss": 1.0141, "num_input_tokens_seen": 201079998, "step": 5300 }, { "epoch": 0.8312967844978544, "grad_norm": 1.875, "learning_rate": 3.3718244803695155e-06, "loss": 1.0406, "num_input_tokens_seen": 204892665, "step": 5400 }, { "epoch": 0.8466911693959628, "grad_norm": 1.75, "learning_rate": 3.063895304080062e-06, "loss": 1.0144, "num_input_tokens_seen": 208692509, "step": 5500 }, { "epoch": 0.8620855542940712, "grad_norm": 1.625, "learning_rate": 2.7559661277906087e-06, "loss": 1.0328, "num_input_tokens_seen": 212482079, "step": 5600 }, { "epoch": 0.8774799391921797, "grad_norm": 1.4453125, "learning_rate": 2.4480369515011547e-06, "loss": 1.0158, "num_input_tokens_seen": 216265581, "step": 5700 }, { "epoch": 0.8928743240902881, "grad_norm": 1.421875, "learning_rate": 2.1401077752117015e-06, "loss": 1.0376, "num_input_tokens_seen": 220058453, "step": 5800 }, { "epoch": 0.9082687089883965, "grad_norm": 1.875, "learning_rate": 1.8321785989222479e-06, "loss": 1.0116, "num_input_tokens_seen": 223849666, "step": 5900 }, { "epoch": 0.9236630938865049, "grad_norm": 1.671875, "learning_rate": 1.5242494226327945e-06, "loss": 1.0319, "num_input_tokens_seen": 227586204, "step": 6000 }, { "epoch": 0.9390574787846133, "grad_norm": 1.578125, "learning_rate": 1.216320246343341e-06, "loss": 1.0248, "num_input_tokens_seen": 231383644, "step": 6100 }, { "epoch": 0.9544518636827217, "grad_norm": 1.71875, "learning_rate": 9.083910700538877e-07, "loss": 1.0252, "num_input_tokens_seen": 235171998, "step": 6200 }, { "epoch": 0.9698462485808301, "grad_norm": 1.59375, "learning_rate": 6.004618937644343e-07, "loss": 1.033, "num_input_tokens_seen": 238903492, "step": 6300 }, { "epoch": 0.9852406334789385, "grad_norm": 1.671875, "learning_rate": 2.9253271747498076e-07, "loss": 1.0206, "num_input_tokens_seen": 242682149, "step": 6400 }, { "epoch": 0.9998652991321415, "num_input_tokens_seen": 246267812, "step": 6495, "total_flos": 1.1089328326722978e+19, "train_loss": 1.0435700872478528, "train_runtime": 38249.836, "train_samples_per_second": 5.434, "train_steps_per_second": 0.17, "train_tokens_per_second": 1606.468 } ], "logging_steps": 100, "max_steps": 6495, "num_input_tokens_seen": 246267812, "num_train_epochs": 1, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1089328326722978e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }