|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.8, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 27.330623626708984, |
|
"learning_rate": 1e-05, |
|
"loss": 14.9828, |
|
"mean_token_accuracy": 0.43762992322444916, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 25.78533935546875, |
|
"learning_rate": 2e-05, |
|
"loss": 14.48, |
|
"mean_token_accuracy": 0.45947156846523285, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 23.762821197509766, |
|
"learning_rate": 3e-05, |
|
"loss": 14.3424, |
|
"mean_token_accuracy": 0.4559449180960655, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 21.071897506713867, |
|
"learning_rate": 4e-05, |
|
"loss": 13.9143, |
|
"mean_token_accuracy": 0.46790433675050735, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 19.133302688598633, |
|
"learning_rate": 5e-05, |
|
"loss": 12.2497, |
|
"mean_token_accuracy": 0.5162213444709778, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 17.784639358520508, |
|
"learning_rate": 4.9473684210526315e-05, |
|
"loss": 11.9364, |
|
"mean_token_accuracy": 0.5214760452508926, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 18.64559555053711, |
|
"learning_rate": 4.8947368421052635e-05, |
|
"loss": 10.9395, |
|
"mean_token_accuracy": 0.5401209890842438, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 17.828125, |
|
"learning_rate": 4.842105263157895e-05, |
|
"loss": 10.1876, |
|
"mean_token_accuracy": 0.5798913389444351, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 18.621353149414062, |
|
"learning_rate": 4.789473684210526e-05, |
|
"loss": 9.51, |
|
"mean_token_accuracy": 0.6059905588626862, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 14.266256332397461, |
|
"learning_rate": 4.736842105263158e-05, |
|
"loss": 9.324, |
|
"mean_token_accuracy": 0.623360276222229, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 13.611992835998535, |
|
"learning_rate": 4.68421052631579e-05, |
|
"loss": 8.9613, |
|
"mean_token_accuracy": 0.6348667591810226, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 13.141629219055176, |
|
"learning_rate": 4.6315789473684214e-05, |
|
"loss": 8.1299, |
|
"mean_token_accuracy": 0.6677338033914566, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 11.582746505737305, |
|
"learning_rate": 4.5789473684210527e-05, |
|
"loss": 8.1148, |
|
"mean_token_accuracy": 0.663286492228508, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 10.934531211853027, |
|
"learning_rate": 4.5263157894736846e-05, |
|
"loss": 7.5403, |
|
"mean_token_accuracy": 0.673496663570404, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 9.977241516113281, |
|
"learning_rate": 4.473684210526316e-05, |
|
"loss": 7.2111, |
|
"mean_token_accuracy": 0.688684269785881, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 10.482184410095215, |
|
"learning_rate": 4.421052631578947e-05, |
|
"loss": 7.5258, |
|
"mean_token_accuracy": 0.671795666217804, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 10.160177230834961, |
|
"learning_rate": 4.368421052631579e-05, |
|
"loss": 7.1163, |
|
"mean_token_accuracy": 0.6790599226951599, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 10.689698219299316, |
|
"learning_rate": 4.3157894736842105e-05, |
|
"loss": 7.1231, |
|
"mean_token_accuracy": 0.6911827921867371, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 9.446402549743652, |
|
"learning_rate": 4.2631578947368425e-05, |
|
"loss": 7.3457, |
|
"mean_token_accuracy": 0.689079686999321, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 10.626145362854004, |
|
"learning_rate": 4.210526315789474e-05, |
|
"loss": 6.9955, |
|
"mean_token_accuracy": 0.6898764669895172, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 10.320823669433594, |
|
"learning_rate": 4.157894736842106e-05, |
|
"loss": 6.9504, |
|
"mean_token_accuracy": 0.6948718279600143, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 10.236137390136719, |
|
"learning_rate": 4.105263157894737e-05, |
|
"loss": 6.2089, |
|
"mean_token_accuracy": 0.7240265011787415, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 9.453045845031738, |
|
"learning_rate": 4.0526315789473684e-05, |
|
"loss": 6.7502, |
|
"mean_token_accuracy": 0.7005183100700378, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 9.327564239501953, |
|
"learning_rate": 4e-05, |
|
"loss": 6.4527, |
|
"mean_token_accuracy": 0.7164693623781204, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 9.546314239501953, |
|
"learning_rate": 3.9473684210526316e-05, |
|
"loss": 5.8974, |
|
"mean_token_accuracy": 0.7328019142150879, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 8.894572257995605, |
|
"learning_rate": 3.894736842105263e-05, |
|
"loss": 6.109, |
|
"mean_token_accuracy": 0.7263201028108597, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 9.127656936645508, |
|
"learning_rate": 3.842105263157895e-05, |
|
"loss": 6.4875, |
|
"mean_token_accuracy": 0.7085271328687668, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 9.237127304077148, |
|
"learning_rate": 3.789473684210527e-05, |
|
"loss": 6.159, |
|
"mean_token_accuracy": 0.7449014335870743, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 9.572649002075195, |
|
"learning_rate": 3.736842105263158e-05, |
|
"loss": 6.247, |
|
"mean_token_accuracy": 0.739266037940979, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 9.581724166870117, |
|
"learning_rate": 3.6842105263157895e-05, |
|
"loss": 6.4162, |
|
"mean_token_accuracy": 0.7320354580879211, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 9.571109771728516, |
|
"learning_rate": 3.6315789473684214e-05, |
|
"loss": 6.3865, |
|
"mean_token_accuracy": 0.7435038238763809, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 9.66092300415039, |
|
"learning_rate": 3.578947368421053e-05, |
|
"loss": 5.8215, |
|
"mean_token_accuracy": 0.7524790912866592, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 8.532500267028809, |
|
"learning_rate": 3.526315789473684e-05, |
|
"loss": 5.8042, |
|
"mean_token_accuracy": 0.7616962492465973, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 8.403843879699707, |
|
"learning_rate": 3.473684210526316e-05, |
|
"loss": 5.9317, |
|
"mean_token_accuracy": 0.7603590935468674, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 8.805196762084961, |
|
"learning_rate": 3.421052631578947e-05, |
|
"loss": 6.1436, |
|
"mean_token_accuracy": 0.7516646534204483, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 8.515336036682129, |
|
"learning_rate": 3.368421052631579e-05, |
|
"loss": 6.0511, |
|
"mean_token_accuracy": 0.7515160739421844, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 7.560244560241699, |
|
"learning_rate": 3.3157894736842106e-05, |
|
"loss": 5.4362, |
|
"mean_token_accuracy": 0.772291824221611, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 8.334061622619629, |
|
"learning_rate": 3.2631578947368426e-05, |
|
"loss": 5.5668, |
|
"mean_token_accuracy": 0.7626153230667114, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 8.146302223205566, |
|
"learning_rate": 3.210526315789474e-05, |
|
"loss": 6.1313, |
|
"mean_token_accuracy": 0.7336651831865311, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.829502582550049, |
|
"learning_rate": 3.157894736842105e-05, |
|
"loss": 6.0167, |
|
"mean_token_accuracy": 0.7388159483671188, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 8.113974571228027, |
|
"learning_rate": 3.105263157894737e-05, |
|
"loss": 5.2954, |
|
"mean_token_accuracy": 0.776334211230278, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 7.286009311676025, |
|
"learning_rate": 3.0526315789473684e-05, |
|
"loss": 5.6481, |
|
"mean_token_accuracy": 0.7564087808132172, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 7.154447555541992, |
|
"learning_rate": 3e-05, |
|
"loss": 5.3633, |
|
"mean_token_accuracy": 0.7598460763692856, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 8.190098762512207, |
|
"learning_rate": 2.9473684210526314e-05, |
|
"loss": 5.3728, |
|
"mean_token_accuracy": 0.7727002501487732, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 7.3226542472839355, |
|
"learning_rate": 2.8947368421052634e-05, |
|
"loss": 5.4446, |
|
"mean_token_accuracy": 0.7465341240167618, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 8.403494834899902, |
|
"learning_rate": 2.842105263157895e-05, |
|
"loss": 4.6045, |
|
"mean_token_accuracy": 0.7857130914926529, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 8.063408851623535, |
|
"learning_rate": 2.7894736842105263e-05, |
|
"loss": 5.2821, |
|
"mean_token_accuracy": 0.762213259935379, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 7.193646430969238, |
|
"learning_rate": 2.7368421052631583e-05, |
|
"loss": 5.7485, |
|
"mean_token_accuracy": 0.7417114228010178, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 7.28355073928833, |
|
"learning_rate": 2.6842105263157896e-05, |
|
"loss": 5.5915, |
|
"mean_token_accuracy": 0.7473081052303314, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 7.384160995483398, |
|
"learning_rate": 2.6315789473684212e-05, |
|
"loss": 6.1476, |
|
"mean_token_accuracy": 0.7356720864772797, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 7.840450763702393, |
|
"learning_rate": 2.578947368421053e-05, |
|
"loss": 5.2035, |
|
"mean_token_accuracy": 0.7573880255222321, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 7.19984245300293, |
|
"learning_rate": 2.5263157894736845e-05, |
|
"loss": 4.6355, |
|
"mean_token_accuracy": 0.7865147292613983, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 7.227206707000732, |
|
"learning_rate": 2.4736842105263158e-05, |
|
"loss": 5.7802, |
|
"mean_token_accuracy": 0.7442787438631058, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 7.431645393371582, |
|
"learning_rate": 2.4210526315789474e-05, |
|
"loss": 5.5745, |
|
"mean_token_accuracy": 0.7558661848306656, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 9.079876899719238, |
|
"learning_rate": 2.368421052631579e-05, |
|
"loss": 5.6616, |
|
"mean_token_accuracy": 0.7577391117811203, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 7.336010456085205, |
|
"learning_rate": 2.3157894736842107e-05, |
|
"loss": 4.7, |
|
"mean_token_accuracy": 0.7845469415187836, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 6.669713020324707, |
|
"learning_rate": 2.2631578947368423e-05, |
|
"loss": 5.5094, |
|
"mean_token_accuracy": 0.7579665780067444, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 7.331737995147705, |
|
"learning_rate": 2.2105263157894736e-05, |
|
"loss": 4.6607, |
|
"mean_token_accuracy": 0.7783682346343994, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 6.724721431732178, |
|
"learning_rate": 2.1578947368421053e-05, |
|
"loss": 5.0542, |
|
"mean_token_accuracy": 0.7743726819753647, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.390171527862549, |
|
"learning_rate": 2.105263157894737e-05, |
|
"loss": 5.0009, |
|
"mean_token_accuracy": 0.780523419380188, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 7.519730567932129, |
|
"learning_rate": 2.0526315789473685e-05, |
|
"loss": 5.4899, |
|
"mean_token_accuracy": 0.7700912803411484, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 6.765895366668701, |
|
"learning_rate": 2e-05, |
|
"loss": 4.8741, |
|
"mean_token_accuracy": 0.7866266071796417, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 6.651461601257324, |
|
"learning_rate": 1.9473684210526315e-05, |
|
"loss": 5.1589, |
|
"mean_token_accuracy": 0.769757404923439, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 6.601439952850342, |
|
"learning_rate": 1.8947368421052634e-05, |
|
"loss": 5.1586, |
|
"mean_token_accuracy": 0.7651441991329193, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 7.300909042358398, |
|
"learning_rate": 1.8421052631578947e-05, |
|
"loss": 5.1605, |
|
"mean_token_accuracy": 0.7735024839639664, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 6.311753749847412, |
|
"learning_rate": 1.7894736842105264e-05, |
|
"loss": 5.0041, |
|
"mean_token_accuracy": 0.7669179141521454, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 7.332730293273926, |
|
"learning_rate": 1.736842105263158e-05, |
|
"loss": 5.2616, |
|
"mean_token_accuracy": 0.7765114605426788, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 6.422898292541504, |
|
"learning_rate": 1.6842105263157896e-05, |
|
"loss": 6.1462, |
|
"mean_token_accuracy": 0.728233814239502, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 6.685412883758545, |
|
"learning_rate": 1.6315789473684213e-05, |
|
"loss": 5.7309, |
|
"mean_token_accuracy": 0.7524611800909042, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 7.049284934997559, |
|
"learning_rate": 1.5789473684210526e-05, |
|
"loss": 5.0148, |
|
"mean_token_accuracy": 0.77178093791008, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 7.036037445068359, |
|
"learning_rate": 1.5263157894736842e-05, |
|
"loss": 5.0748, |
|
"mean_token_accuracy": 0.7729067206382751, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 7.007073402404785, |
|
"learning_rate": 1.4736842105263157e-05, |
|
"loss": 4.5864, |
|
"mean_token_accuracy": 0.7893485873937607, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 6.996627330780029, |
|
"learning_rate": 1.4210526315789475e-05, |
|
"loss": 5.8196, |
|
"mean_token_accuracy": 0.7412619888782501, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 6.522240161895752, |
|
"learning_rate": 1.3684210526315791e-05, |
|
"loss": 4.3882, |
|
"mean_token_accuracy": 0.8006375879049301, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 7.209001541137695, |
|
"learning_rate": 1.3157894736842106e-05, |
|
"loss": 4.9107, |
|
"mean_token_accuracy": 0.7705955505371094, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 6.461360454559326, |
|
"learning_rate": 1.2631578947368422e-05, |
|
"loss": 4.9865, |
|
"mean_token_accuracy": 0.7676331996917725, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 6.816041469573975, |
|
"learning_rate": 1.2105263157894737e-05, |
|
"loss": 4.7486, |
|
"mean_token_accuracy": 0.8014501333236694, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 6.4101433753967285, |
|
"learning_rate": 1.1578947368421053e-05, |
|
"loss": 4.7705, |
|
"mean_token_accuracy": 0.7754499018192291, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.632, |
|
"grad_norm": 6.9798970222473145, |
|
"learning_rate": 1.1052631578947368e-05, |
|
"loss": 4.9164, |
|
"mean_token_accuracy": 0.7753257304430008, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 6.403416633605957, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 4.7818, |
|
"mean_token_accuracy": 0.7874085158109665, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 6.675773620605469, |
|
"learning_rate": 1e-05, |
|
"loss": 4.5508, |
|
"mean_token_accuracy": 0.795722022652626, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 6.712479591369629, |
|
"learning_rate": 9.473684210526317e-06, |
|
"loss": 4.9168, |
|
"mean_token_accuracy": 0.7771104872226715, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 5.9163665771484375, |
|
"learning_rate": 8.947368421052632e-06, |
|
"loss": 4.891, |
|
"mean_token_accuracy": 0.7830108255147934, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 7.163206100463867, |
|
"learning_rate": 8.421052631578948e-06, |
|
"loss": 5.149, |
|
"mean_token_accuracy": 0.7670323401689529, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 6.317421913146973, |
|
"learning_rate": 7.894736842105263e-06, |
|
"loss": 4.8002, |
|
"mean_token_accuracy": 0.7903847545385361, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 6.364376544952393, |
|
"learning_rate": 7.3684210526315784e-06, |
|
"loss": 4.7666, |
|
"mean_token_accuracy": 0.7809462994337082, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 6.32914924621582, |
|
"learning_rate": 6.842105263157896e-06, |
|
"loss": 5.2743, |
|
"mean_token_accuracy": 0.7658153772354126, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 6.604763984680176, |
|
"learning_rate": 6.315789473684211e-06, |
|
"loss": 5.2035, |
|
"mean_token_accuracy": 0.776008740067482, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 6.310863494873047, |
|
"learning_rate": 5.789473684210527e-06, |
|
"loss": 5.5161, |
|
"mean_token_accuracy": 0.7567505836486816, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 6.1613945960998535, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 5.0648, |
|
"mean_token_accuracy": 0.7786727696657181, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 6.521576404571533, |
|
"learning_rate": 4.736842105263159e-06, |
|
"loss": 4.433, |
|
"mean_token_accuracy": 0.7890913188457489, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 6.6246466636657715, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 4.6914, |
|
"mean_token_accuracy": 0.7863939553499222, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 7.000185489654541, |
|
"learning_rate": 3.6842105263157892e-06, |
|
"loss": 5.0143, |
|
"mean_token_accuracy": 0.7784310281276703, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 6.975312232971191, |
|
"learning_rate": 3.1578947368421056e-06, |
|
"loss": 4.7931, |
|
"mean_token_accuracy": 0.7743921875953674, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 6.039007663726807, |
|
"learning_rate": 2.631578947368421e-06, |
|
"loss": 4.7647, |
|
"mean_token_accuracy": 0.780229240655899, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 6.673982620239258, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 4.7419, |
|
"mean_token_accuracy": 0.7849721014499664, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 6.379744529724121, |
|
"learning_rate": 1.5789473684210528e-06, |
|
"loss": 4.7868, |
|
"mean_token_accuracy": 0.7784082293510437, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 6.387270450592041, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 4.6571, |
|
"mean_token_accuracy": 0.7906839102506638, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 6.2963056564331055, |
|
"learning_rate": 5.263157894736843e-07, |
|
"loss": 5.1662, |
|
"mean_token_accuracy": 0.7627293914556503, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.28849983215332, |
|
"learning_rate": 0.0, |
|
"loss": 5.0461, |
|
"mean_token_accuracy": 0.7743319720029831, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 522011226931200.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|