{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 27.330623626708984, "learning_rate": 1e-05, "loss": 14.9828, "mean_token_accuracy": 0.43762992322444916, "step": 1 }, { "epoch": 0.016, "grad_norm": 25.78533935546875, "learning_rate": 2e-05, "loss": 14.48, "mean_token_accuracy": 0.45947156846523285, "step": 2 }, { "epoch": 0.024, "grad_norm": 23.762821197509766, "learning_rate": 3e-05, "loss": 14.3424, "mean_token_accuracy": 0.4559449180960655, "step": 3 }, { "epoch": 0.032, "grad_norm": 21.071897506713867, "learning_rate": 4e-05, "loss": 13.9143, "mean_token_accuracy": 0.46790433675050735, "step": 4 }, { "epoch": 0.04, "grad_norm": 19.133302688598633, "learning_rate": 5e-05, "loss": 12.2497, "mean_token_accuracy": 0.5162213444709778, "step": 5 }, { "epoch": 0.048, "grad_norm": 17.784639358520508, "learning_rate": 4.9473684210526315e-05, "loss": 11.9364, "mean_token_accuracy": 0.5214760452508926, "step": 6 }, { "epoch": 0.056, "grad_norm": 18.64559555053711, "learning_rate": 4.8947368421052635e-05, "loss": 10.9395, "mean_token_accuracy": 0.5401209890842438, "step": 7 }, { "epoch": 0.064, "grad_norm": 17.828125, "learning_rate": 4.842105263157895e-05, "loss": 10.1876, "mean_token_accuracy": 0.5798913389444351, "step": 8 }, { "epoch": 0.072, "grad_norm": 18.621353149414062, "learning_rate": 4.789473684210526e-05, "loss": 9.51, "mean_token_accuracy": 0.6059905588626862, "step": 9 }, { "epoch": 0.08, "grad_norm": 14.266256332397461, "learning_rate": 4.736842105263158e-05, "loss": 9.324, "mean_token_accuracy": 0.623360276222229, "step": 10 }, { "epoch": 0.088, "grad_norm": 13.611992835998535, "learning_rate": 4.68421052631579e-05, "loss": 8.9613, "mean_token_accuracy": 0.6348667591810226, "step": 11 }, { "epoch": 0.096, "grad_norm": 13.141629219055176, "learning_rate": 4.6315789473684214e-05, "loss": 8.1299, "mean_token_accuracy": 0.6677338033914566, "step": 12 }, { "epoch": 0.104, "grad_norm": 11.582746505737305, "learning_rate": 4.5789473684210527e-05, "loss": 8.1148, "mean_token_accuracy": 0.663286492228508, "step": 13 }, { "epoch": 0.112, "grad_norm": 10.934531211853027, "learning_rate": 4.5263157894736846e-05, "loss": 7.5403, "mean_token_accuracy": 0.673496663570404, "step": 14 }, { "epoch": 0.12, "grad_norm": 9.977241516113281, "learning_rate": 4.473684210526316e-05, "loss": 7.2111, "mean_token_accuracy": 0.688684269785881, "step": 15 }, { "epoch": 0.128, "grad_norm": 10.482184410095215, "learning_rate": 4.421052631578947e-05, "loss": 7.5258, "mean_token_accuracy": 0.671795666217804, "step": 16 }, { "epoch": 0.136, "grad_norm": 10.160177230834961, "learning_rate": 4.368421052631579e-05, "loss": 7.1163, "mean_token_accuracy": 0.6790599226951599, "step": 17 }, { "epoch": 0.144, "grad_norm": 10.689698219299316, "learning_rate": 4.3157894736842105e-05, "loss": 7.1231, "mean_token_accuracy": 0.6911827921867371, "step": 18 }, { "epoch": 0.152, "grad_norm": 9.446402549743652, "learning_rate": 4.2631578947368425e-05, "loss": 7.3457, "mean_token_accuracy": 0.689079686999321, "step": 19 }, { "epoch": 0.16, "grad_norm": 10.626145362854004, "learning_rate": 4.210526315789474e-05, "loss": 6.9955, "mean_token_accuracy": 0.6898764669895172, "step": 20 }, { "epoch": 0.168, "grad_norm": 10.320823669433594, "learning_rate": 4.157894736842106e-05, "loss": 6.9504, "mean_token_accuracy": 0.6948718279600143, "step": 21 }, { "epoch": 0.176, "grad_norm": 10.236137390136719, "learning_rate": 4.105263157894737e-05, "loss": 6.2089, "mean_token_accuracy": 0.7240265011787415, "step": 22 }, { "epoch": 0.184, "grad_norm": 9.453045845031738, "learning_rate": 4.0526315789473684e-05, "loss": 6.7502, "mean_token_accuracy": 0.7005183100700378, "step": 23 }, { "epoch": 0.192, "grad_norm": 9.327564239501953, "learning_rate": 4e-05, "loss": 6.4527, "mean_token_accuracy": 0.7164693623781204, "step": 24 }, { "epoch": 0.2, "grad_norm": 9.546314239501953, "learning_rate": 3.9473684210526316e-05, "loss": 5.8974, "mean_token_accuracy": 0.7328019142150879, "step": 25 }, { "epoch": 0.208, "grad_norm": 8.894572257995605, "learning_rate": 3.894736842105263e-05, "loss": 6.109, "mean_token_accuracy": 0.7263201028108597, "step": 26 }, { "epoch": 0.216, "grad_norm": 9.127656936645508, "learning_rate": 3.842105263157895e-05, "loss": 6.4875, "mean_token_accuracy": 0.7085271328687668, "step": 27 }, { "epoch": 0.224, "grad_norm": 9.237127304077148, "learning_rate": 3.789473684210527e-05, "loss": 6.159, "mean_token_accuracy": 0.7449014335870743, "step": 28 }, { "epoch": 0.232, "grad_norm": 9.572649002075195, "learning_rate": 3.736842105263158e-05, "loss": 6.247, "mean_token_accuracy": 0.739266037940979, "step": 29 }, { "epoch": 0.24, "grad_norm": 9.581724166870117, "learning_rate": 3.6842105263157895e-05, "loss": 6.4162, "mean_token_accuracy": 0.7320354580879211, "step": 30 }, { "epoch": 0.248, "grad_norm": 9.571109771728516, "learning_rate": 3.6315789473684214e-05, "loss": 6.3865, "mean_token_accuracy": 0.7435038238763809, "step": 31 }, { "epoch": 0.256, "grad_norm": 9.66092300415039, "learning_rate": 3.578947368421053e-05, "loss": 5.8215, "mean_token_accuracy": 0.7524790912866592, "step": 32 }, { "epoch": 0.264, "grad_norm": 8.532500267028809, "learning_rate": 3.526315789473684e-05, "loss": 5.8042, "mean_token_accuracy": 0.7616962492465973, "step": 33 }, { "epoch": 0.272, "grad_norm": 8.403843879699707, "learning_rate": 3.473684210526316e-05, "loss": 5.9317, "mean_token_accuracy": 0.7603590935468674, "step": 34 }, { "epoch": 0.28, "grad_norm": 8.805196762084961, "learning_rate": 3.421052631578947e-05, "loss": 6.1436, "mean_token_accuracy": 0.7516646534204483, "step": 35 }, { "epoch": 0.288, "grad_norm": 8.515336036682129, "learning_rate": 3.368421052631579e-05, "loss": 6.0511, "mean_token_accuracy": 0.7515160739421844, "step": 36 }, { "epoch": 0.296, "grad_norm": 7.560244560241699, "learning_rate": 3.3157894736842106e-05, "loss": 5.4362, "mean_token_accuracy": 0.772291824221611, "step": 37 }, { "epoch": 0.304, "grad_norm": 8.334061622619629, "learning_rate": 3.2631578947368426e-05, "loss": 5.5668, "mean_token_accuracy": 0.7626153230667114, "step": 38 }, { "epoch": 0.312, "grad_norm": 8.146302223205566, "learning_rate": 3.210526315789474e-05, "loss": 6.1313, "mean_token_accuracy": 0.7336651831865311, "step": 39 }, { "epoch": 0.32, "grad_norm": 7.829502582550049, "learning_rate": 3.157894736842105e-05, "loss": 6.0167, "mean_token_accuracy": 0.7388159483671188, "step": 40 }, { "epoch": 0.328, "grad_norm": 8.113974571228027, "learning_rate": 3.105263157894737e-05, "loss": 5.2954, "mean_token_accuracy": 0.776334211230278, "step": 41 }, { "epoch": 0.336, "grad_norm": 7.286009311676025, "learning_rate": 3.0526315789473684e-05, "loss": 5.6481, "mean_token_accuracy": 0.7564087808132172, "step": 42 }, { "epoch": 0.344, "grad_norm": 7.154447555541992, "learning_rate": 3e-05, "loss": 5.3633, "mean_token_accuracy": 0.7598460763692856, "step": 43 }, { "epoch": 0.352, "grad_norm": 8.190098762512207, "learning_rate": 2.9473684210526314e-05, "loss": 5.3728, "mean_token_accuracy": 0.7727002501487732, "step": 44 }, { "epoch": 0.36, "grad_norm": 7.3226542472839355, "learning_rate": 2.8947368421052634e-05, "loss": 5.4446, "mean_token_accuracy": 0.7465341240167618, "step": 45 }, { "epoch": 0.368, "grad_norm": 8.403494834899902, "learning_rate": 2.842105263157895e-05, "loss": 4.6045, "mean_token_accuracy": 0.7857130914926529, "step": 46 }, { "epoch": 0.376, "grad_norm": 8.063408851623535, "learning_rate": 2.7894736842105263e-05, "loss": 5.2821, "mean_token_accuracy": 0.762213259935379, "step": 47 }, { "epoch": 0.384, "grad_norm": 7.193646430969238, "learning_rate": 2.7368421052631583e-05, "loss": 5.7485, "mean_token_accuracy": 0.7417114228010178, "step": 48 }, { "epoch": 0.392, "grad_norm": 7.28355073928833, "learning_rate": 2.6842105263157896e-05, "loss": 5.5915, "mean_token_accuracy": 0.7473081052303314, "step": 49 }, { "epoch": 0.4, "grad_norm": 7.384160995483398, "learning_rate": 2.6315789473684212e-05, "loss": 6.1476, "mean_token_accuracy": 0.7356720864772797, "step": 50 }, { "epoch": 0.408, "grad_norm": 7.840450763702393, "learning_rate": 2.578947368421053e-05, "loss": 5.2035, "mean_token_accuracy": 0.7573880255222321, "step": 51 }, { "epoch": 0.416, "grad_norm": 7.19984245300293, "learning_rate": 2.5263157894736845e-05, "loss": 4.6355, "mean_token_accuracy": 0.7865147292613983, "step": 52 }, { "epoch": 0.424, "grad_norm": 7.227206707000732, "learning_rate": 2.4736842105263158e-05, "loss": 5.7802, "mean_token_accuracy": 0.7442787438631058, "step": 53 }, { "epoch": 0.432, "grad_norm": 7.431645393371582, "learning_rate": 2.4210526315789474e-05, "loss": 5.5745, "mean_token_accuracy": 0.7558661848306656, "step": 54 }, { "epoch": 0.44, "grad_norm": 9.079876899719238, "learning_rate": 2.368421052631579e-05, "loss": 5.6616, "mean_token_accuracy": 0.7577391117811203, "step": 55 }, { "epoch": 0.448, "grad_norm": 7.336010456085205, "learning_rate": 2.3157894736842107e-05, "loss": 4.7, "mean_token_accuracy": 0.7845469415187836, "step": 56 }, { "epoch": 0.456, "grad_norm": 6.669713020324707, "learning_rate": 2.2631578947368423e-05, "loss": 5.5094, "mean_token_accuracy": 0.7579665780067444, "step": 57 }, { "epoch": 0.464, "grad_norm": 7.331737995147705, "learning_rate": 2.2105263157894736e-05, "loss": 4.6607, "mean_token_accuracy": 0.7783682346343994, "step": 58 }, { "epoch": 0.472, "grad_norm": 6.724721431732178, "learning_rate": 2.1578947368421053e-05, "loss": 5.0542, "mean_token_accuracy": 0.7743726819753647, "step": 59 }, { "epoch": 0.48, "grad_norm": 6.390171527862549, "learning_rate": 2.105263157894737e-05, "loss": 5.0009, "mean_token_accuracy": 0.780523419380188, "step": 60 }, { "epoch": 0.488, "grad_norm": 7.519730567932129, "learning_rate": 2.0526315789473685e-05, "loss": 5.4899, "mean_token_accuracy": 0.7700912803411484, "step": 61 }, { "epoch": 0.496, "grad_norm": 6.765895366668701, "learning_rate": 2e-05, "loss": 4.8741, "mean_token_accuracy": 0.7866266071796417, "step": 62 }, { "epoch": 0.504, "grad_norm": 6.651461601257324, "learning_rate": 1.9473684210526315e-05, "loss": 5.1589, "mean_token_accuracy": 0.769757404923439, "step": 63 }, { "epoch": 0.512, "grad_norm": 6.601439952850342, "learning_rate": 1.8947368421052634e-05, "loss": 5.1586, "mean_token_accuracy": 0.7651441991329193, "step": 64 }, { "epoch": 0.52, "grad_norm": 7.300909042358398, "learning_rate": 1.8421052631578947e-05, "loss": 5.1605, "mean_token_accuracy": 0.7735024839639664, "step": 65 }, { "epoch": 0.528, "grad_norm": 6.311753749847412, "learning_rate": 1.7894736842105264e-05, "loss": 5.0041, "mean_token_accuracy": 0.7669179141521454, "step": 66 }, { "epoch": 0.536, "grad_norm": 7.332730293273926, "learning_rate": 1.736842105263158e-05, "loss": 5.2616, "mean_token_accuracy": 0.7765114605426788, "step": 67 }, { "epoch": 0.544, "grad_norm": 6.422898292541504, "learning_rate": 1.6842105263157896e-05, "loss": 6.1462, "mean_token_accuracy": 0.728233814239502, "step": 68 }, { "epoch": 0.552, "grad_norm": 6.685412883758545, "learning_rate": 1.6315789473684213e-05, "loss": 5.7309, "mean_token_accuracy": 0.7524611800909042, "step": 69 }, { "epoch": 0.56, "grad_norm": 7.049284934997559, "learning_rate": 1.5789473684210526e-05, "loss": 5.0148, "mean_token_accuracy": 0.77178093791008, "step": 70 }, { "epoch": 0.568, "grad_norm": 7.036037445068359, "learning_rate": 1.5263157894736842e-05, "loss": 5.0748, "mean_token_accuracy": 0.7729067206382751, "step": 71 }, { "epoch": 0.576, "grad_norm": 7.007073402404785, "learning_rate": 1.4736842105263157e-05, "loss": 4.5864, "mean_token_accuracy": 0.7893485873937607, "step": 72 }, { "epoch": 0.584, "grad_norm": 6.996627330780029, "learning_rate": 1.4210526315789475e-05, "loss": 5.8196, "mean_token_accuracy": 0.7412619888782501, "step": 73 }, { "epoch": 0.592, "grad_norm": 6.522240161895752, "learning_rate": 1.3684210526315791e-05, "loss": 4.3882, "mean_token_accuracy": 0.8006375879049301, "step": 74 }, { "epoch": 0.6, "grad_norm": 7.209001541137695, "learning_rate": 1.3157894736842106e-05, "loss": 4.9107, "mean_token_accuracy": 0.7705955505371094, "step": 75 }, { "epoch": 0.608, "grad_norm": 6.461360454559326, "learning_rate": 1.2631578947368422e-05, "loss": 4.9865, "mean_token_accuracy": 0.7676331996917725, "step": 76 }, { "epoch": 0.616, "grad_norm": 6.816041469573975, "learning_rate": 1.2105263157894737e-05, "loss": 4.7486, "mean_token_accuracy": 0.8014501333236694, "step": 77 }, { "epoch": 0.624, "grad_norm": 6.4101433753967285, "learning_rate": 1.1578947368421053e-05, "loss": 4.7705, "mean_token_accuracy": 0.7754499018192291, "step": 78 }, { "epoch": 0.632, "grad_norm": 6.9798970222473145, "learning_rate": 1.1052631578947368e-05, "loss": 4.9164, "mean_token_accuracy": 0.7753257304430008, "step": 79 }, { "epoch": 0.64, "grad_norm": 6.403416633605957, "learning_rate": 1.0526315789473684e-05, "loss": 4.7818, "mean_token_accuracy": 0.7874085158109665, "step": 80 }, { "epoch": 0.648, "grad_norm": 6.675773620605469, "learning_rate": 1e-05, "loss": 4.5508, "mean_token_accuracy": 0.795722022652626, "step": 81 }, { "epoch": 0.656, "grad_norm": 6.712479591369629, "learning_rate": 9.473684210526317e-06, "loss": 4.9168, "mean_token_accuracy": 0.7771104872226715, "step": 82 }, { "epoch": 0.664, "grad_norm": 5.9163665771484375, "learning_rate": 8.947368421052632e-06, "loss": 4.891, "mean_token_accuracy": 0.7830108255147934, "step": 83 }, { "epoch": 0.672, "grad_norm": 7.163206100463867, "learning_rate": 8.421052631578948e-06, "loss": 5.149, "mean_token_accuracy": 0.7670323401689529, "step": 84 }, { "epoch": 0.68, "grad_norm": 6.317421913146973, "learning_rate": 7.894736842105263e-06, "loss": 4.8002, "mean_token_accuracy": 0.7903847545385361, "step": 85 }, { "epoch": 0.688, "grad_norm": 6.364376544952393, "learning_rate": 7.3684210526315784e-06, "loss": 4.7666, "mean_token_accuracy": 0.7809462994337082, "step": 86 }, { "epoch": 0.696, "grad_norm": 6.32914924621582, "learning_rate": 6.842105263157896e-06, "loss": 5.2743, "mean_token_accuracy": 0.7658153772354126, "step": 87 }, { "epoch": 0.704, "grad_norm": 6.604763984680176, "learning_rate": 6.315789473684211e-06, "loss": 5.2035, "mean_token_accuracy": 0.776008740067482, "step": 88 }, { "epoch": 0.712, "grad_norm": 6.310863494873047, "learning_rate": 5.789473684210527e-06, "loss": 5.5161, "mean_token_accuracy": 0.7567505836486816, "step": 89 }, { "epoch": 0.72, "grad_norm": 6.1613945960998535, "learning_rate": 5.263157894736842e-06, "loss": 5.0648, "mean_token_accuracy": 0.7786727696657181, "step": 90 }, { "epoch": 0.728, "grad_norm": 6.521576404571533, "learning_rate": 4.736842105263159e-06, "loss": 4.433, "mean_token_accuracy": 0.7890913188457489, "step": 91 }, { "epoch": 0.736, "grad_norm": 6.6246466636657715, "learning_rate": 4.210526315789474e-06, "loss": 4.6914, "mean_token_accuracy": 0.7863939553499222, "step": 92 }, { "epoch": 0.744, "grad_norm": 7.000185489654541, "learning_rate": 3.6842105263157892e-06, "loss": 5.0143, "mean_token_accuracy": 0.7784310281276703, "step": 93 }, { "epoch": 0.752, "grad_norm": 6.975312232971191, "learning_rate": 3.1578947368421056e-06, "loss": 4.7931, "mean_token_accuracy": 0.7743921875953674, "step": 94 }, { "epoch": 0.76, "grad_norm": 6.039007663726807, "learning_rate": 2.631578947368421e-06, "loss": 4.7647, "mean_token_accuracy": 0.780229240655899, "step": 95 }, { "epoch": 0.768, "grad_norm": 6.673982620239258, "learning_rate": 2.105263157894737e-06, "loss": 4.7419, "mean_token_accuracy": 0.7849721014499664, "step": 96 }, { "epoch": 0.776, "grad_norm": 6.379744529724121, "learning_rate": 1.5789473684210528e-06, "loss": 4.7868, "mean_token_accuracy": 0.7784082293510437, "step": 97 }, { "epoch": 0.784, "grad_norm": 6.387270450592041, "learning_rate": 1.0526315789473685e-06, "loss": 4.6571, "mean_token_accuracy": 0.7906839102506638, "step": 98 }, { "epoch": 0.792, "grad_norm": 6.2963056564331055, "learning_rate": 5.263157894736843e-07, "loss": 5.1662, "mean_token_accuracy": 0.7627293914556503, "step": 99 }, { "epoch": 0.8, "grad_norm": 7.28849983215332, "learning_rate": 0.0, "loss": 5.0461, "mean_token_accuracy": 0.7743319720029831, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 522011226931200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }