{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04294742008712191, "eval_steps": 20, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012270691453463403, "grad_norm": 0.05003070831298828, "learning_rate": 0.00019981588314717073, "loss": 2.6972, "step": 20 }, { "epoch": 0.0012270691453463403, "eval_loss": 2.2967841625213623, "eval_runtime": 23.2641, "eval_samples_per_second": 4.298, "eval_steps_per_second": 0.559, "step": 20 }, { "epoch": 0.0024541382906926807, "grad_norm": 0.07180789858102798, "learning_rate": 0.00019957039401006504, "loss": 2.2022, "step": 40 }, { "epoch": 0.0024541382906926807, "eval_loss": 2.068006992340088, "eval_runtime": 23.5719, "eval_samples_per_second": 4.242, "eval_steps_per_second": 0.552, "step": 40 }, { "epoch": 0.003681207436039021, "grad_norm": 0.08049603551626205, "learning_rate": 0.00019932490487295938, "loss": 2.0529, "step": 60 }, { "epoch": 0.003681207436039021, "eval_loss": 1.9338455200195312, "eval_runtime": 23.1495, "eval_samples_per_second": 4.32, "eval_steps_per_second": 0.562, "step": 60 }, { "epoch": 0.004908276581385361, "grad_norm": 0.08653070032596588, "learning_rate": 0.00019907941573585368, "loss": 1.9395, "step": 80 }, { "epoch": 0.004908276581385361, "eval_loss": 1.8689138889312744, "eval_runtime": 23.7275, "eval_samples_per_second": 4.215, "eval_steps_per_second": 0.548, "step": 80 }, { "epoch": 0.006135345726731701, "grad_norm": 0.08481493592262268, "learning_rate": 0.00019883392659874802, "loss": 1.8773, "step": 100 }, { "epoch": 0.006135345726731701, "eval_loss": 1.8180441856384277, "eval_runtime": 23.5705, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "step": 100 }, { "epoch": 0.007362414872078042, "grad_norm": 0.11568216979503632, "learning_rate": 0.00019858843746164233, "loss": 1.7827, "step": 120 }, { "epoch": 0.007362414872078042, "eval_loss": 1.774067997932434, "eval_runtime": 23.9722, "eval_samples_per_second": 4.172, "eval_steps_per_second": 0.542, "step": 120 }, { "epoch": 0.008589484017424381, "grad_norm": 0.10869361460208893, "learning_rate": 0.00019834294832453666, "loss": 1.812, "step": 140 }, { "epoch": 0.008589484017424381, "eval_loss": 1.737804889678955, "eval_runtime": 23.519, "eval_samples_per_second": 4.252, "eval_steps_per_second": 0.553, "step": 140 }, { "epoch": 0.009816553162770723, "grad_norm": 0.0976206362247467, "learning_rate": 0.00019809745918743097, "loss": 1.74, "step": 160 }, { "epoch": 0.009816553162770723, "eval_loss": 1.700899600982666, "eval_runtime": 23.2347, "eval_samples_per_second": 4.304, "eval_steps_per_second": 0.56, "step": 160 }, { "epoch": 0.011043622308117063, "grad_norm": 0.1123971939086914, "learning_rate": 0.00019785197005032528, "loss": 1.787, "step": 180 }, { "epoch": 0.011043622308117063, "eval_loss": 1.6765294075012207, "eval_runtime": 23.6403, "eval_samples_per_second": 4.23, "eval_steps_per_second": 0.55, "step": 180 }, { "epoch": 0.012270691453463402, "grad_norm": 0.10320968925952911, "learning_rate": 0.0001976064809132196, "loss": 1.7804, "step": 200 }, { "epoch": 0.012270691453463402, "eval_loss": 1.6563650369644165, "eval_runtime": 23.6381, "eval_samples_per_second": 4.23, "eval_steps_per_second": 0.55, "step": 200 }, { "epoch": 0.013497760598809742, "grad_norm": 0.14491896331310272, "learning_rate": 0.00019736099177611392, "loss": 1.7043, "step": 220 }, { "epoch": 0.013497760598809742, "eval_loss": 1.6346065998077393, "eval_runtime": 23.7121, "eval_samples_per_second": 4.217, "eval_steps_per_second": 0.548, "step": 220 }, { "epoch": 0.014724829744156084, "grad_norm": 0.12502990663051605, "learning_rate": 0.00019711550263900825, "loss": 1.7345, "step": 240 }, { "epoch": 0.014724829744156084, "eval_loss": 1.6147732734680176, "eval_runtime": 23.5936, "eval_samples_per_second": 4.238, "eval_steps_per_second": 0.551, "step": 240 }, { "epoch": 0.015951898889502422, "grad_norm": 0.1230228915810585, "learning_rate": 0.00019687001350190256, "loss": 1.7338, "step": 260 }, { "epoch": 0.015951898889502422, "eval_loss": 1.5957908630371094, "eval_runtime": 23.389, "eval_samples_per_second": 4.276, "eval_steps_per_second": 0.556, "step": 260 }, { "epoch": 0.017178968034848762, "grad_norm": 0.12000931799411774, "learning_rate": 0.00019662452436479687, "loss": 1.7143, "step": 280 }, { "epoch": 0.017178968034848762, "eval_loss": 1.585697889328003, "eval_runtime": 23.566, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "step": 280 }, { "epoch": 0.018406037180195105, "grad_norm": 0.1442350149154663, "learning_rate": 0.00019637903522769118, "loss": 1.6406, "step": 300 }, { "epoch": 0.018406037180195105, "eval_loss": 1.5710804462432861, "eval_runtime": 23.5083, "eval_samples_per_second": 4.254, "eval_steps_per_second": 0.553, "step": 300 }, { "epoch": 0.019633106325541445, "grad_norm": 0.09555982798337936, "learning_rate": 0.00019613354609058549, "loss": 1.6213, "step": 320 }, { "epoch": 0.019633106325541445, "eval_loss": 1.5556869506835938, "eval_runtime": 23.5239, "eval_samples_per_second": 4.251, "eval_steps_per_second": 0.553, "step": 320 }, { "epoch": 0.020860175470887785, "grad_norm": 0.13320715725421906, "learning_rate": 0.00019588805695347982, "loss": 1.6956, "step": 340 }, { "epoch": 0.020860175470887785, "eval_loss": 1.5424914360046387, "eval_runtime": 23.6064, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.551, "step": 340 }, { "epoch": 0.022087244616234125, "grad_norm": 0.12061001360416412, "learning_rate": 0.00019564256781637413, "loss": 1.6589, "step": 360 }, { "epoch": 0.022087244616234125, "eval_loss": 1.528477430343628, "eval_runtime": 23.6796, "eval_samples_per_second": 4.223, "eval_steps_per_second": 0.549, "step": 360 }, { "epoch": 0.023314313761580465, "grad_norm": 0.14327766001224518, "learning_rate": 0.00019539707867926844, "loss": 1.5946, "step": 380 }, { "epoch": 0.023314313761580465, "eval_loss": 1.52202570438385, "eval_runtime": 23.6756, "eval_samples_per_second": 4.224, "eval_steps_per_second": 0.549, "step": 380 }, { "epoch": 0.024541382906926805, "grad_norm": 0.12291988730430603, "learning_rate": 0.00019515158954216277, "loss": 1.5366, "step": 400 }, { "epoch": 0.024541382906926805, "eval_loss": 1.507960319519043, "eval_runtime": 23.6216, "eval_samples_per_second": 4.233, "eval_steps_per_second": 0.55, "step": 400 }, { "epoch": 0.025768452052273145, "grad_norm": 0.15288175642490387, "learning_rate": 0.00019490610040505708, "loss": 1.5829, "step": 420 }, { "epoch": 0.025768452052273145, "eval_loss": 1.4994325637817383, "eval_runtime": 23.6368, "eval_samples_per_second": 4.231, "eval_steps_per_second": 0.55, "step": 420 }, { "epoch": 0.026995521197619485, "grad_norm": 0.13319191336631775, "learning_rate": 0.0001946606112679514, "loss": 1.5523, "step": 440 }, { "epoch": 0.026995521197619485, "eval_loss": 1.4956778287887573, "eval_runtime": 23.6921, "eval_samples_per_second": 4.221, "eval_steps_per_second": 0.549, "step": 440 }, { "epoch": 0.028222590342965825, "grad_norm": 0.14759239554405212, "learning_rate": 0.00019441512213084572, "loss": 1.5735, "step": 460 }, { "epoch": 0.028222590342965825, "eval_loss": 1.486402988433838, "eval_runtime": 23.2911, "eval_samples_per_second": 4.293, "eval_steps_per_second": 0.558, "step": 460 }, { "epoch": 0.029449659488312168, "grad_norm": 0.11428073793649673, "learning_rate": 0.00019416963299374006, "loss": 1.5788, "step": 480 }, { "epoch": 0.029449659488312168, "eval_loss": 1.4712104797363281, "eval_runtime": 23.4851, "eval_samples_per_second": 4.258, "eval_steps_per_second": 0.554, "step": 480 }, { "epoch": 0.030676728633658508, "grad_norm": 0.11649870127439499, "learning_rate": 0.00019392414385663436, "loss": 1.5667, "step": 500 }, { "epoch": 0.030676728633658508, "eval_loss": 1.4620152711868286, "eval_runtime": 23.5455, "eval_samples_per_second": 4.247, "eval_steps_per_second": 0.552, "step": 500 }, { "epoch": 0.031903797779004844, "grad_norm": 0.16019868850708008, "learning_rate": 0.00019367865471952867, "loss": 1.4778, "step": 520 }, { "epoch": 0.031903797779004844, "eval_loss": 1.4597880840301514, "eval_runtime": 23.624, "eval_samples_per_second": 4.233, "eval_steps_per_second": 0.55, "step": 520 }, { "epoch": 0.03313086692435119, "grad_norm": 0.1370091289281845, "learning_rate": 0.00019343316558242298, "loss": 1.5531, "step": 540 }, { "epoch": 0.03313086692435119, "eval_loss": 1.443243384361267, "eval_runtime": 23.5537, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "step": 540 }, { "epoch": 0.034357936069697524, "grad_norm": 0.1211417093873024, "learning_rate": 0.0001931876764453173, "loss": 1.5879, "step": 560 }, { "epoch": 0.034357936069697524, "eval_loss": 1.4466437101364136, "eval_runtime": 23.8508, "eval_samples_per_second": 4.193, "eval_steps_per_second": 0.545, "step": 560 }, { "epoch": 0.03558500521504387, "grad_norm": 0.14397528767585754, "learning_rate": 0.00019294218730821162, "loss": 1.5352, "step": 580 }, { "epoch": 0.03558500521504387, "eval_loss": 1.4339115619659424, "eval_runtime": 23.649, "eval_samples_per_second": 4.229, "eval_steps_per_second": 0.55, "step": 580 }, { "epoch": 0.03681207436039021, "grad_norm": 0.12468410283327103, "learning_rate": 0.00019269669817110593, "loss": 1.5045, "step": 600 }, { "epoch": 0.03681207436039021, "eval_loss": 1.4277862310409546, "eval_runtime": 23.647, "eval_samples_per_second": 4.229, "eval_steps_per_second": 0.55, "step": 600 }, { "epoch": 0.03803914350573655, "grad_norm": 0.1577584445476532, "learning_rate": 0.00019245120903400024, "loss": 1.5497, "step": 620 }, { "epoch": 0.03803914350573655, "eval_loss": 1.4203659296035767, "eval_runtime": 23.8622, "eval_samples_per_second": 4.191, "eval_steps_per_second": 0.545, "step": 620 }, { "epoch": 0.03926621265108289, "grad_norm": 0.12410438805818558, "learning_rate": 0.00019220571989689457, "loss": 1.503, "step": 640 }, { "epoch": 0.03926621265108289, "eval_loss": 1.4154139757156372, "eval_runtime": 23.4706, "eval_samples_per_second": 4.261, "eval_steps_per_second": 0.554, "step": 640 }, { "epoch": 0.04049328179642923, "grad_norm": 0.13563913106918335, "learning_rate": 0.00019196023075978888, "loss": 1.4851, "step": 660 }, { "epoch": 0.04049328179642923, "eval_loss": 1.414802074432373, "eval_runtime": 23.3961, "eval_samples_per_second": 4.274, "eval_steps_per_second": 0.556, "step": 660 }, { "epoch": 0.04172035094177557, "grad_norm": 0.13915061950683594, "learning_rate": 0.00019171474162268321, "loss": 1.4847, "step": 680 }, { "epoch": 0.04172035094177557, "eval_loss": 1.4029760360717773, "eval_runtime": 23.6066, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.551, "step": 680 }, { "epoch": 0.04294742008712191, "grad_norm": 0.14418162405490875, "learning_rate": 0.00019146925248557752, "loss": 1.4724, "step": 700 }, { "epoch": 0.04294742008712191, "eval_loss": 1.4029196500778198, "eval_runtime": 23.5244, "eval_samples_per_second": 4.251, "eval_steps_per_second": 0.553, "step": 700 } ], "logging_steps": 20, "max_steps": 16299, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 5.08603168290816e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }