{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 300,
  "global_step": 2411,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00414765657403567,
      "grad_norm": 1.9609375,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.3119,
      "num_input_tokens_seen": 1693136,
      "step": 10
    },
    {
      "epoch": 0.00829531314807134,
      "grad_norm": 1.75,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.1145,
      "num_input_tokens_seen": 3345072,
      "step": 20
    },
    {
      "epoch": 0.01244296972210701,
      "grad_norm": 1.4765625,
      "learning_rate": 3.999937641573031e-05,
      "loss": 0.108,
      "num_input_tokens_seen": 4962592,
      "step": 30
    },
    {
      "epoch": 0.01659062629614268,
      "grad_norm": 1.28125,
      "learning_rate": 3.999556576379093e-05,
      "loss": 0.0925,
      "num_input_tokens_seen": 6599360,
      "step": 40
    },
    {
      "epoch": 0.02073828287017835,
      "grad_norm": 2.578125,
      "learning_rate": 3.998829155488428e-05,
      "loss": 0.1203,
      "num_input_tokens_seen": 8134112,
      "step": 50
    },
    {
      "epoch": 0.02488593944421402,
      "grad_norm": 1.1953125,
      "learning_rate": 3.997755504902156e-05,
      "loss": 0.1231,
      "num_input_tokens_seen": 9757824,
      "step": 60
    },
    {
      "epoch": 0.02903359601824969,
      "grad_norm": 1.0234375,
      "learning_rate": 3.9963358105940094e-05,
      "loss": 0.0984,
      "num_input_tokens_seen": 11420880,
      "step": 70
    },
    {
      "epoch": 0.03318125259228536,
      "grad_norm": 1.53125,
      "learning_rate": 3.9945703184781146e-05,
      "loss": 0.0918,
      "num_input_tokens_seen": 13082688,
      "step": 80
    },
    {
      "epoch": 0.03732890916632103,
      "grad_norm": 1.1875,
      "learning_rate": 3.9924593343664e-05,
      "loss": 0.0755,
      "num_input_tokens_seen": 14714672,
      "step": 90
    },
    {
      "epoch": 0.0414765657403567,
      "grad_norm": 1.1328125,
      "learning_rate": 3.990003223915623e-05,
      "loss": 0.0805,
      "num_input_tokens_seen": 16339072,
      "step": 100
    },
    {
      "epoch": 0.045624222314392365,
      "grad_norm": 0.87890625,
      "learning_rate": 3.9872024125640315e-05,
      "loss": 0.0869,
      "num_input_tokens_seen": 17948576,
      "step": 110
    },
    {
      "epoch": 0.04977187888842804,
      "grad_norm": 1.8515625,
      "learning_rate": 3.9840573854576696e-05,
      "loss": 0.081,
      "num_input_tokens_seen": 19523632,
      "step": 120
    },
    {
      "epoch": 0.05391953546246371,
      "grad_norm": 1.25,
      "learning_rate": 3.980568687366349e-05,
      "loss": 0.0843,
      "num_input_tokens_seen": 21150112,
      "step": 130
    },
    {
      "epoch": 0.05806719203649938,
      "grad_norm": 3.21875,
      "learning_rate": 3.976736922589278e-05,
      "loss": 0.0837,
      "num_input_tokens_seen": 22802656,
      "step": 140
    },
    {
      "epoch": 0.06221484861053505,
      "grad_norm": 1.0859375,
      "learning_rate": 3.972562754850392e-05,
      "loss": 0.1104,
      "num_input_tokens_seen": 24427648,
      "step": 150
    },
    {
      "epoch": 0.06636250518457072,
      "grad_norm": 1.0625,
      "learning_rate": 3.968046907183383e-05,
      "loss": 0.102,
      "num_input_tokens_seen": 26034736,
      "step": 160
    },
    {
      "epoch": 0.07051016175860639,
      "grad_norm": 0.97265625,
      "learning_rate": 3.963190161806461e-05,
      "loss": 0.0808,
      "num_input_tokens_seen": 27653024,
      "step": 170
    },
    {
      "epoch": 0.07465781833264205,
      "grad_norm": 1.0078125,
      "learning_rate": 3.9579933599868605e-05,
      "loss": 0.0771,
      "num_input_tokens_seen": 29276352,
      "step": 180
    },
    {
      "epoch": 0.07880547490667773,
      "grad_norm": 1.9921875,
      "learning_rate": 3.952457401895115e-05,
      "loss": 0.0809,
      "num_input_tokens_seen": 30896032,
      "step": 190
    },
    {
      "epoch": 0.0829531314807134,
      "grad_norm": 0.671875,
      "learning_rate": 3.9465832464491406e-05,
      "loss": 0.0724,
      "num_input_tokens_seen": 32445728,
      "step": 200
    },
    {
      "epoch": 0.08710078805474906,
      "grad_norm": 1.0,
      "learning_rate": 3.9403719111481295e-05,
      "loss": 0.0753,
      "num_input_tokens_seen": 34058864,
      "step": 210
    },
    {
      "epoch": 0.09124844462878473,
      "grad_norm": 0.828125,
      "learning_rate": 3.933824471896305e-05,
      "loss": 0.079,
      "num_input_tokens_seen": 35664048,
      "step": 220
    },
    {
      "epoch": 0.09539610120282041,
      "grad_norm": 1.53125,
      "learning_rate": 3.926942062816558e-05,
      "loss": 0.078,
      "num_input_tokens_seen": 37250864,
      "step": 230
    },
    {
      "epoch": 0.09954375777685608,
      "grad_norm": 2.21875,
      "learning_rate": 3.919725876053997e-05,
      "loss": 0.0762,
      "num_input_tokens_seen": 38878368,
      "step": 240
    },
    {
      "epoch": 0.10369141435089174,
      "grad_norm": 0.94921875,
      "learning_rate": 3.9121771615694496e-05,
      "loss": 0.0812,
      "num_input_tokens_seen": 40446704,
      "step": 250
    },
    {
      "epoch": 0.10783907092492742,
      "grad_norm": 1.046875,
      "learning_rate": 3.90429722692295e-05,
      "loss": 0.09,
      "num_input_tokens_seen": 42044128,
      "step": 260
    },
    {
      "epoch": 0.11198672749896309,
      "grad_norm": 1.2578125,
      "learning_rate": 3.8960874370472476e-05,
      "loss": 0.0744,
      "num_input_tokens_seen": 43653376,
      "step": 270
    },
    {
      "epoch": 0.11613438407299875,
      "grad_norm": 0.7578125,
      "learning_rate": 3.887549214011378e-05,
      "loss": 0.0822,
      "num_input_tokens_seen": 45249536,
      "step": 280
    },
    {
      "epoch": 0.12028204064703442,
      "grad_norm": 0.94921875,
      "learning_rate": 3.878684036774337e-05,
      "loss": 0.0673,
      "num_input_tokens_seen": 46861728,
      "step": 290
    },
    {
      "epoch": 0.1244296972210701,
      "grad_norm": 1.390625,
      "learning_rate": 3.869493440928904e-05,
      "loss": 0.0969,
      "num_input_tokens_seen": 48483088,
      "step": 300
    },
    {
      "epoch": 0.1244296972210701,
      "eval_loss": 0.07887571305036545,
      "eval_runtime": 51.9245,
      "eval_samples_per_second": 60.029,
      "eval_steps_per_second": 1.887,
      "num_input_tokens_seen": 48483088,
      "step": 300
    },
    {
      "epoch": 0.12857735379510576,
      "grad_norm": 1.0703125,
      "learning_rate": 3.859979018435648e-05,
      "loss": 0.0703,
      "num_input_tokens_seen": 50105600,
      "step": 310
    },
    {
      "epoch": 0.13272501036914144,
      "grad_norm": 0.74609375,
      "learning_rate": 3.8501424173471795e-05,
      "loss": 0.073,
      "num_input_tokens_seen": 51741392,
      "step": 320
    },
    {
      "epoch": 0.1368726669431771,
      "grad_norm": 0.8828125,
      "learning_rate": 3.839985341522674e-05,
      "loss": 0.089,
      "num_input_tokens_seen": 53360288,
      "step": 330
    },
    {
      "epoch": 0.14102032351721278,
      "grad_norm": 0.84765625,
      "learning_rate": 3.829509550332743e-05,
      "loss": 0.0805,
      "num_input_tokens_seen": 55027488,
      "step": 340
    },
    {
      "epoch": 0.14516798009124846,
      "grad_norm": 0.75,
      "learning_rate": 3.818716858354677e-05,
      "loss": 0.0771,
      "num_input_tokens_seen": 56654080,
      "step": 350
    },
    {
      "epoch": 0.1493156366652841,
      "grad_norm": 0.984375,
      "learning_rate": 3.807609135058135e-05,
      "loss": 0.0877,
      "num_input_tokens_seen": 58311408,
      "step": 360
    },
    {
      "epoch": 0.1534632932393198,
      "grad_norm": 0.78125,
      "learning_rate": 3.796188304481319e-05,
      "loss": 0.0755,
      "num_input_tokens_seen": 59931536,
      "step": 370
    },
    {
      "epoch": 0.15761094981335547,
      "grad_norm": 0.71484375,
      "learning_rate": 3.784456344897702e-05,
      "loss": 0.0894,
      "num_input_tokens_seen": 61618192,
      "step": 380
    },
    {
      "epoch": 0.16175860638739112,
      "grad_norm": 0.9453125,
      "learning_rate": 3.772415288473357e-05,
      "loss": 0.0918,
      "num_input_tokens_seen": 63241440,
      "step": 390
    },
    {
      "epoch": 0.1659062629614268,
      "grad_norm": 1.2265625,
      "learning_rate": 3.7600672209149535e-05,
      "loss": 0.0749,
      "num_input_tokens_seen": 64840016,
      "step": 400
    },
    {
      "epoch": 0.17005391953546245,
      "grad_norm": 1.0078125,
      "learning_rate": 3.747414281108478e-05,
      "loss": 0.0748,
      "num_input_tokens_seen": 66494144,
      "step": 410
    },
    {
      "epoch": 0.17420157610949813,
      "grad_norm": 0.69921875,
      "learning_rate": 3.734458660748748e-05,
      "loss": 0.0775,
      "num_input_tokens_seen": 68099408,
      "step": 420
    },
    {
      "epoch": 0.1783492326835338,
      "grad_norm": 1.6953125,
      "learning_rate": 3.721202603959768e-05,
      "loss": 0.0779,
      "num_input_tokens_seen": 69714944,
      "step": 430
    },
    {
      "epoch": 0.18249688925756946,
      "grad_norm": 0.73828125,
      "learning_rate": 3.7076484069060195e-05,
      "loss": 0.0789,
      "num_input_tokens_seen": 71316448,
      "step": 440
    },
    {
      "epoch": 0.18664454583160514,
      "grad_norm": 0.6171875,
      "learning_rate": 3.69379841739472e-05,
      "loss": 0.0749,
      "num_input_tokens_seen": 72954352,
      "step": 450
    },
    {
      "epoch": 0.19079220240564082,
      "grad_norm": 1.1328125,
      "learning_rate": 3.6796550344691495e-05,
      "loss": 0.0746,
      "num_input_tokens_seen": 74556368,
      "step": 460
    },
    {
      "epoch": 0.19493985897967647,
      "grad_norm": 1.375,
      "learning_rate": 3.6652207079930956e-05,
      "loss": 0.084,
      "num_input_tokens_seen": 76204704,
      "step": 470
    },
    {
      "epoch": 0.19908751555371215,
      "grad_norm": 1.046875,
      "learning_rate": 3.6504979382264966e-05,
      "loss": 0.0705,
      "num_input_tokens_seen": 77820544,
      "step": 480
    },
    {
      "epoch": 0.20323517212774783,
      "grad_norm": 0.58203125,
      "learning_rate": 3.6354892753923574e-05,
      "loss": 0.0709,
      "num_input_tokens_seen": 79449040,
      "step": 490
    },
    {
      "epoch": 0.20738282870178348,
      "grad_norm": 0.59375,
      "learning_rate": 3.620197319235009e-05,
      "loss": 0.0796,
      "num_input_tokens_seen": 81052496,
      "step": 500
    },
    {
      "epoch": 0.21153048527581916,
      "grad_norm": 1.0546875,
      "learning_rate": 3.604624718569789e-05,
      "loss": 0.0854,
      "num_input_tokens_seen": 82677824,
      "step": 510
    },
    {
      "epoch": 0.21567814184985484,
      "grad_norm": 0.78515625,
      "learning_rate": 3.588774170824225e-05,
      "loss": 0.0729,
      "num_input_tokens_seen": 84295712,
      "step": 520
    },
    {
      "epoch": 0.2198257984238905,
      "grad_norm": 1.2890625,
      "learning_rate": 3.572648421570799e-05,
      "loss": 0.0705,
      "num_input_tokens_seen": 85914864,
      "step": 530
    },
    {
      "epoch": 0.22397345499792617,
      "grad_norm": 0.671875,
      "learning_rate": 3.5562502640513625e-05,
      "loss": 0.0723,
      "num_input_tokens_seen": 87477280,
      "step": 540
    },
    {
      "epoch": 0.22812111157196185,
      "grad_norm": 1.0859375,
      "learning_rate": 3.5395825386933086e-05,
      "loss": 0.0719,
      "num_input_tokens_seen": 89076752,
      "step": 550
    },
    {
      "epoch": 0.2322687681459975,
      "grad_norm": 0.66796875,
      "learning_rate": 3.522648132617561e-05,
      "loss": 0.0696,
      "num_input_tokens_seen": 90756528,
      "step": 560
    },
    {
      "epoch": 0.23641642472003319,
      "grad_norm": 1.140625,
      "learning_rate": 3.505449979138478e-05,
      "loss": 0.0567,
      "num_input_tokens_seen": 92354288,
      "step": 570
    },
    {
      "epoch": 0.24056408129406884,
      "grad_norm": 0.9140625,
      "learning_rate": 3.4879910572557544e-05,
      "loss": 0.085,
      "num_input_tokens_seen": 93981696,
      "step": 580
    },
    {
      "epoch": 0.24471173786810452,
      "grad_norm": 0.62109375,
      "learning_rate": 3.4702743911384093e-05,
      "loss": 0.0763,
      "num_input_tokens_seen": 95603856,
      "step": 590
    },
    {
      "epoch": 0.2488593944421402,
      "grad_norm": 0.578125,
      "learning_rate": 3.452303049600954e-05,
      "loss": 0.0617,
      "num_input_tokens_seen": 97194624,
      "step": 600
    },
    {
      "epoch": 0.2488593944421402,
      "eval_loss": 0.07688611000776291,
      "eval_runtime": 52.396,
      "eval_samples_per_second": 59.489,
      "eval_steps_per_second": 1.87,
      "num_input_tokens_seen": 97194624,
      "step": 600
    },
    {
      "epoch": 0.2530070510161759,
      "grad_norm": 1.3359375,
      "learning_rate": 3.434080145571823e-05,
      "loss": 0.0708,
      "num_input_tokens_seen": 98803440,
      "step": 610
    },
    {
      "epoch": 0.25715470759021153,
      "grad_norm": 0.56640625,
      "learning_rate": 3.4156088355541595e-05,
      "loss": 0.0562,
      "num_input_tokens_seen": 100369952,
      "step": 620
    },
    {
      "epoch": 0.2613023641642472,
      "grad_norm": 0.90234375,
      "learning_rate": 3.3968923190790616e-05,
      "loss": 0.0699,
      "num_input_tokens_seen": 102025584,
      "step": 630
    },
    {
      "epoch": 0.2654500207382829,
      "grad_norm": 0.8203125,
      "learning_rate": 3.377933838151374e-05,
      "loss": 0.0618,
      "num_input_tokens_seen": 103615424,
      "step": 640
    },
    {
      "epoch": 0.26959767731231854,
      "grad_norm": 0.5859375,
      "learning_rate": 3.358736676688113e-05,
      "loss": 0.0627,
      "num_input_tokens_seen": 105236720,
      "step": 650
    },
    {
      "epoch": 0.2737453338863542,
      "grad_norm": 0.58203125,
      "learning_rate": 3.339304159949648e-05,
      "loss": 0.0659,
      "num_input_tokens_seen": 106863168,
      "step": 660
    },
    {
      "epoch": 0.2778929904603899,
      "grad_norm": 0.6875,
      "learning_rate": 3.3196396539637015e-05,
      "loss": 0.0619,
      "num_input_tokens_seen": 108445968,
      "step": 670
    },
    {
      "epoch": 0.28204064703442555,
      "grad_norm": 1.09375,
      "learning_rate": 3.299746564942309e-05,
      "loss": 0.0757,
      "num_input_tokens_seen": 110128992,
      "step": 680
    },
    {
      "epoch": 0.2861883036084612,
      "grad_norm": 0.66015625,
      "learning_rate": 3.279628338691798e-05,
      "loss": 0.0548,
      "num_input_tokens_seen": 111745104,
      "step": 690
    },
    {
      "epoch": 0.2903359601824969,
      "grad_norm": 0.6171875,
      "learning_rate": 3.259288460015927e-05,
      "loss": 0.0747,
      "num_input_tokens_seen": 113351136,
      "step": 700
    },
    {
      "epoch": 0.29448361675653256,
      "grad_norm": 0.8125,
      "learning_rate": 3.238730452112256e-05,
      "loss": 0.0761,
      "num_input_tokens_seen": 115000912,
      "step": 710
    },
    {
      "epoch": 0.2986312733305682,
      "grad_norm": 0.78125,
      "learning_rate": 3.217957875961871e-05,
      "loss": 0.067,
      "num_input_tokens_seen": 116617280,
      "step": 720
    },
    {
      "epoch": 0.3027789299046039,
      "grad_norm": 0.7265625,
      "learning_rate": 3.196974329712567e-05,
      "loss": 0.0687,
      "num_input_tokens_seen": 118217968,
      "step": 730
    },
    {
      "epoch": 0.3069265864786396,
      "grad_norm": 0.68359375,
      "learning_rate": 3.1757834480555885e-05,
      "loss": 0.0596,
      "num_input_tokens_seen": 119844304,
      "step": 740
    },
    {
      "epoch": 0.3110742430526752,
      "grad_norm": 0.94140625,
      "learning_rate": 3.1543889015960416e-05,
      "loss": 0.0728,
      "num_input_tokens_seen": 121474288,
      "step": 750
    },
    {
      "epoch": 0.31522189962671093,
      "grad_norm": 0.56640625,
      "learning_rate": 3.1327943962170874e-05,
      "loss": 0.066,
      "num_input_tokens_seen": 123105712,
      "step": 760
    },
    {
      "epoch": 0.3193695562007466,
      "grad_norm": 0.58203125,
      "learning_rate": 3.111003672438022e-05,
      "loss": 0.0709,
      "num_input_tokens_seen": 124692896,
      "step": 770
    },
    {
      "epoch": 0.32351721277478224,
      "grad_norm": 0.71484375,
      "learning_rate": 3.089020504766356e-05,
      "loss": 0.0597,
      "num_input_tokens_seen": 126346784,
      "step": 780
    },
    {
      "epoch": 0.32766486934881794,
      "grad_norm": 0.76171875,
      "learning_rate": 3.0668487010440105e-05,
      "loss": 0.0692,
      "num_input_tokens_seen": 127969872,
      "step": 790
    },
    {
      "epoch": 0.3318125259228536,
      "grad_norm": 0.58203125,
      "learning_rate": 3.0444921017877357e-05,
      "loss": 0.0607,
      "num_input_tokens_seen": 129520000,
      "step": 800
    },
    {
      "epoch": 0.33596018249688925,
      "grad_norm": 0.482421875,
      "learning_rate": 3.021954579523874e-05,
      "loss": 0.0592,
      "num_input_tokens_seen": 131142560,
      "step": 810
    },
    {
      "epoch": 0.3401078390709249,
      "grad_norm": 0.7734375,
      "learning_rate": 2.99924003811757e-05,
      "loss": 0.0719,
      "num_input_tokens_seen": 132757520,
      "step": 820
    },
    {
      "epoch": 0.3442554956449606,
      "grad_norm": 0.78125,
      "learning_rate": 2.976352412096563e-05,
      "loss": 0.0606,
      "num_input_tokens_seen": 134374032,
      "step": 830
    },
    {
      "epoch": 0.34840315221899626,
      "grad_norm": 2.4375,
      "learning_rate": 2.9532956659696594e-05,
      "loss": 0.0519,
      "num_input_tokens_seen": 135996944,
      "step": 840
    },
    {
      "epoch": 0.3525508087930319,
      "grad_norm": 0.87890625,
      "learning_rate": 2.930073793540017e-05,
      "loss": 0.0579,
      "num_input_tokens_seen": 137587472,
      "step": 850
    },
    {
      "epoch": 0.3566984653670676,
      "grad_norm": 2.125,
      "learning_rate": 2.9066908172133544e-05,
      "loss": 0.0648,
      "num_input_tokens_seen": 139190208,
      "step": 860
    },
    {
      "epoch": 0.36084612194110327,
      "grad_norm": 1.3984375,
      "learning_rate": 2.8831507873012006e-05,
      "loss": 0.0677,
      "num_input_tokens_seen": 140807968,
      "step": 870
    },
    {
      "epoch": 0.3649937785151389,
      "grad_norm": 1.1796875,
      "learning_rate": 2.85945778131932e-05,
      "loss": 0.0549,
      "num_input_tokens_seen": 142402656,
      "step": 880
    },
    {
      "epoch": 0.36914143508917463,
      "grad_norm": 0.703125,
      "learning_rate": 2.835615903281418e-05,
      "loss": 0.0543,
      "num_input_tokens_seen": 143979168,
      "step": 890
    },
    {
      "epoch": 0.3732890916632103,
      "grad_norm": 0.72265625,
      "learning_rate": 2.8116292829882606e-05,
      "loss": 0.0779,
      "num_input_tokens_seen": 145588016,
      "step": 900
    },
    {
      "epoch": 0.3732890916632103,
      "eval_loss": 0.06259813904762268,
      "eval_runtime": 52.0703,
      "eval_samples_per_second": 59.861,
      "eval_steps_per_second": 1.882,
      "num_input_tokens_seen": 145588016,
      "step": 900
    },
    {
      "epoch": 0.37743674823724593,
      "grad_norm": 1.0390625,
      "learning_rate": 2.7875020753123222e-05,
      "loss": 0.0628,
      "num_input_tokens_seen": 147239120,
      "step": 910
    },
    {
      "epoch": 0.38158440481128164,
      "grad_norm": 0.6953125,
      "learning_rate": 2.7632384594781006e-05,
      "loss": 0.0701,
      "num_input_tokens_seen": 148837248,
      "step": 920
    },
    {
      "epoch": 0.3857320613853173,
      "grad_norm": 0.640625,
      "learning_rate": 2.7388426383381997e-05,
      "loss": 0.0578,
      "num_input_tokens_seen": 150442016,
      "step": 930
    },
    {
      "epoch": 0.38987971795935294,
      "grad_norm": 0.56640625,
      "learning_rate": 2.7143188376453346e-05,
      "loss": 0.0591,
      "num_input_tokens_seen": 152046160,
      "step": 940
    },
    {
      "epoch": 0.39402737453338865,
      "grad_norm": 0.66015625,
      "learning_rate": 2.6896713053203564e-05,
      "loss": 0.0649,
      "num_input_tokens_seen": 153639888,
      "step": 950
    },
    {
      "epoch": 0.3981750311074243,
      "grad_norm": 0.734375,
      "learning_rate": 2.664904310716446e-05,
      "loss": 0.0567,
      "num_input_tokens_seen": 155299264,
      "step": 960
    },
    {
      "epoch": 0.40232268768145996,
      "grad_norm": 0.53515625,
      "learning_rate": 2.6400221438795938e-05,
      "loss": 0.0659,
      "num_input_tokens_seen": 156933648,
      "step": 970
    },
    {
      "epoch": 0.40647034425549566,
      "grad_norm": 0.59375,
      "learning_rate": 2.6150291148054923e-05,
      "loss": 0.0495,
      "num_input_tokens_seen": 158474752,
      "step": 980
    },
    {
      "epoch": 0.4106180008295313,
      "grad_norm": 0.55859375,
      "learning_rate": 2.5899295526929758e-05,
      "loss": 0.0561,
      "num_input_tokens_seen": 160089840,
      "step": 990
    },
    {
      "epoch": 0.41476565740356697,
      "grad_norm": 1.234375,
      "learning_rate": 2.5647278051941298e-05,
      "loss": 0.069,
      "num_input_tokens_seen": 161694944,
      "step": 1000
    },
    {
      "epoch": 0.4189133139776027,
      "grad_norm": 0.470703125,
      "learning_rate": 2.5394282376612132e-05,
      "loss": 0.056,
      "num_input_tokens_seen": 163300880,
      "step": 1010
    },
    {
      "epoch": 0.4230609705516383,
      "grad_norm": 0.62890625,
      "learning_rate": 2.5140352323905016e-05,
      "loss": 0.0585,
      "num_input_tokens_seen": 164947744,
      "step": 1020
    },
    {
      "epoch": 0.427208627125674,
      "grad_norm": 0.69140625,
      "learning_rate": 2.4885531878632066e-05,
      "loss": 0.0632,
      "num_input_tokens_seen": 166554016,
      "step": 1030
    },
    {
      "epoch": 0.4313562836997097,
      "grad_norm": 0.51953125,
      "learning_rate": 2.4629865179835863e-05,
      "loss": 0.0744,
      "num_input_tokens_seen": 168185136,
      "step": 1040
    },
    {
      "epoch": 0.43550394027374534,
      "grad_norm": 0.5703125,
      "learning_rate": 2.4373396513143844e-05,
      "loss": 0.0514,
      "num_input_tokens_seen": 169806336,
      "step": 1050
    },
    {
      "epoch": 0.439651596847781,
      "grad_norm": 0.79296875,
      "learning_rate": 2.4116170303097336e-05,
      "loss": 0.0591,
      "num_input_tokens_seen": 171421072,
      "step": 1060
    },
    {
      "epoch": 0.4437992534218167,
      "grad_norm": 0.60546875,
      "learning_rate": 2.3858231105456472e-05,
      "loss": 0.0583,
      "num_input_tokens_seen": 173001056,
      "step": 1070
    },
    {
      "epoch": 0.44794690999585235,
      "grad_norm": 0.828125,
      "learning_rate": 2.3599623599482418e-05,
      "loss": 0.0494,
      "num_input_tokens_seen": 174621744,
      "step": 1080
    },
    {
      "epoch": 0.452094566569888,
      "grad_norm": 0.6328125,
      "learning_rate": 2.334039258019823e-05,
      "loss": 0.0557,
      "num_input_tokens_seen": 176178848,
      "step": 1090
    },
    {
      "epoch": 0.4562422231439237,
      "grad_norm": 0.75,
      "learning_rate": 2.3080582950629574e-05,
      "loss": 0.0512,
      "num_input_tokens_seen": 177751392,
      "step": 1100
    },
    {
      "epoch": 0.46038987971795936,
      "grad_norm": 0.69921875,
      "learning_rate": 2.282023971402684e-05,
      "loss": 0.056,
      "num_input_tokens_seen": 179395408,
      "step": 1110
    },
    {
      "epoch": 0.464537536291995,
      "grad_norm": 0.58203125,
      "learning_rate": 2.2559407966069827e-05,
      "loss": 0.0476,
      "num_input_tokens_seen": 180974112,
      "step": 1120
    },
    {
      "epoch": 0.4686851928660307,
      "grad_norm": 0.56640625,
      "learning_rate": 2.2298132887056437e-05,
      "loss": 0.07,
      "num_input_tokens_seen": 182636064,
      "step": 1130
    },
    {
      "epoch": 0.47283284944006637,
      "grad_norm": 0.75390625,
      "learning_rate": 2.2036459734076715e-05,
      "loss": 0.0481,
      "num_input_tokens_seen": 184298640,
      "step": 1140
    },
    {
      "epoch": 0.476980506014102,
      "grad_norm": 0.71875,
      "learning_rate": 2.1774433833173577e-05,
      "loss": 0.0526,
      "num_input_tokens_seen": 185976832,
      "step": 1150
    },
    {
      "epoch": 0.4811281625881377,
      "grad_norm": 0.75,
      "learning_rate": 2.1512100571491636e-05,
      "loss": 0.0605,
      "num_input_tokens_seen": 187586816,
      "step": 1160
    },
    {
      "epoch": 0.4852758191621734,
      "grad_norm": 0.87109375,
      "learning_rate": 2.1249505389415372e-05,
      "loss": 0.0598,
      "num_input_tokens_seen": 189156864,
      "step": 1170
    },
    {
      "epoch": 0.48942347573620903,
      "grad_norm": 0.375,
      "learning_rate": 2.0986693772698133e-05,
      "loss": 0.0426,
      "num_input_tokens_seen": 190762160,
      "step": 1180
    },
    {
      "epoch": 0.4935711323102447,
      "grad_norm": 0.67578125,
      "learning_rate": 2.0723711244583274e-05,
      "loss": 0.0473,
      "num_input_tokens_seen": 192413488,
      "step": 1190
    },
    {
      "epoch": 0.4977187888842804,
      "grad_norm": 0.466796875,
      "learning_rate": 2.046060335791876e-05,
      "loss": 0.0538,
      "num_input_tokens_seen": 194012016,
      "step": 1200
    },
    {
      "epoch": 0.4977187888842804,
      "eval_loss": 0.05474493280053139,
      "eval_runtime": 52.0108,
      "eval_samples_per_second": 59.93,
      "eval_steps_per_second": 1.884,
      "num_input_tokens_seen": 194012016,
      "step": 1200
    },
    {
      "epoch": 0.501866445458316,
      "grad_norm": 0.54296875,
      "learning_rate": 2.0197415687266723e-05,
      "loss": 0.0555,
      "num_input_tokens_seen": 195645952,
      "step": 1210
    },
    {
      "epoch": 0.5060141020323518,
      "grad_norm": 0.625,
      "learning_rate": 1.9934193821009168e-05,
      "loss": 0.0431,
      "num_input_tokens_seen": 197272080,
      "step": 1220
    },
    {
      "epoch": 0.5101617586063873,
      "grad_norm": 0.66015625,
      "learning_rate": 1.967098335345132e-05,
      "loss": 0.0549,
      "num_input_tokens_seen": 198887696,
      "step": 1230
    },
    {
      "epoch": 0.5143094151804231,
      "grad_norm": 0.765625,
      "learning_rate": 1.9407829876923988e-05,
      "loss": 0.07,
      "num_input_tokens_seen": 200471392,
      "step": 1240
    },
    {
      "epoch": 0.5184570717544588,
      "grad_norm": 0.58203125,
      "learning_rate": 1.914477897388619e-05,
      "loss": 0.0491,
      "num_input_tokens_seen": 202067200,
      "step": 1250
    },
    {
      "epoch": 0.5226047283284944,
      "grad_norm": 0.984375,
      "learning_rate": 1.888187620902955e-05,
      "loss": 0.0619,
      "num_input_tokens_seen": 203693008,
      "step": 1260
    },
    {
      "epoch": 0.5267523849025301,
      "grad_norm": 0.55078125,
      "learning_rate": 1.861916712138576e-05,
      "loss": 0.0607,
      "num_input_tokens_seen": 205296720,
      "step": 1270
    },
    {
      "epoch": 0.5309000414765658,
      "grad_norm": 0.90625,
      "learning_rate": 1.8356697216438465e-05,
      "loss": 0.0716,
      "num_input_tokens_seen": 206884160,
      "step": 1280
    },
    {
      "epoch": 0.5350476980506014,
      "grad_norm": 0.53515625,
      "learning_rate": 1.8094511958240985e-05,
      "loss": 0.0465,
      "num_input_tokens_seen": 208524352,
      "step": 1290
    },
    {
      "epoch": 0.5391953546246371,
      "grad_norm": 0.5546875,
      "learning_rate": 1.783265676154119e-05,
      "loss": 0.0511,
      "num_input_tokens_seen": 210124256,
      "step": 1300
    },
    {
      "epoch": 0.5433430111986728,
      "grad_norm": 0.5390625,
      "learning_rate": 1.7571176983914942e-05,
      "loss": 0.0469,
      "num_input_tokens_seen": 211766032,
      "step": 1310
    },
    {
      "epoch": 0.5474906677727084,
      "grad_norm": 0.67578125,
      "learning_rate": 1.7310117917909407e-05,
      "loss": 0.0491,
      "num_input_tokens_seen": 213370896,
      "step": 1320
    },
    {
      "epoch": 0.5516383243467441,
      "grad_norm": 0.57421875,
      "learning_rate": 1.704952478319767e-05,
      "loss": 0.0625,
      "num_input_tokens_seen": 214955008,
      "step": 1330
    },
    {
      "epoch": 0.5557859809207798,
      "grad_norm": 0.58984375,
      "learning_rate": 1.6789442718745956e-05,
      "loss": 0.0457,
      "num_input_tokens_seen": 216503392,
      "step": 1340
    },
    {
      "epoch": 0.5599336374948154,
      "grad_norm": 1.203125,
      "learning_rate": 1.6529916774994766e-05,
      "loss": 0.0547,
      "num_input_tokens_seen": 218057744,
      "step": 1350
    },
    {
      "epoch": 0.5640812940688511,
      "grad_norm": 0.69140625,
      "learning_rate": 1.6270991906055517e-05,
      "loss": 0.0645,
      "num_input_tokens_seen": 219666784,
      "step": 1360
    },
    {
      "epoch": 0.5682289506428868,
      "grad_norm": 0.69921875,
      "learning_rate": 1.6012712961923666e-05,
      "loss": 0.054,
      "num_input_tokens_seen": 221234848,
      "step": 1370
    },
    {
      "epoch": 0.5723766072169224,
      "grad_norm": 0.71875,
      "learning_rate": 1.5755124680710005e-05,
      "loss": 0.0497,
      "num_input_tokens_seen": 222885040,
      "step": 1380
    },
    {
      "epoch": 0.5765242637909581,
      "grad_norm": 0.7109375,
      "learning_rate": 1.5498271680891274e-05,
      "loss": 0.056,
      "num_input_tokens_seen": 224548576,
      "step": 1390
    },
    {
      "epoch": 0.5806719203649938,
      "grad_norm": 0.9921875,
      "learning_rate": 1.5242198453581541e-05,
      "loss": 0.0534,
      "num_input_tokens_seen": 226222464,
      "step": 1400
    },
    {
      "epoch": 0.5848195769390294,
      "grad_norm": 0.419921875,
      "learning_rate": 1.498694935482559e-05,
      "loss": 0.0514,
      "num_input_tokens_seen": 227820832,
      "step": 1410
    },
    {
      "epoch": 0.5889672335130651,
      "grad_norm": 0.5390625,
      "learning_rate": 1.4732568597915749e-05,
      "loss": 0.0504,
      "num_input_tokens_seen": 229520608,
      "step": 1420
    },
    {
      "epoch": 0.5931148900871008,
      "grad_norm": 0.62109375,
      "learning_rate": 1.4479100245733438e-05,
      "loss": 0.0471,
      "num_input_tokens_seen": 231128896,
      "step": 1430
    },
    {
      "epoch": 0.5972625466611364,
      "grad_norm": 0.75390625,
      "learning_rate": 1.4226588203116716e-05,
      "loss": 0.0472,
      "num_input_tokens_seen": 232762928,
      "step": 1440
    },
    {
      "epoch": 0.6014102032351721,
      "grad_norm": 0.7734375,
      "learning_rate": 1.3975076209255321e-05,
      "loss": 0.0772,
      "num_input_tokens_seen": 234398016,
      "step": 1450
    },
    {
      "epoch": 0.6055578598092078,
      "grad_norm": 0.53125,
      "learning_rate": 1.3724607830114265e-05,
      "loss": 0.053,
      "num_input_tokens_seen": 235995488,
      "step": 1460
    },
    {
      "epoch": 0.6097055163832434,
      "grad_norm": 0.60546875,
      "learning_rate": 1.3475226450887546e-05,
      "loss": 0.0605,
      "num_input_tokens_seen": 237566320,
      "step": 1470
    },
    {
      "epoch": 0.6138531729572791,
      "grad_norm": 0.6796875,
      "learning_rate": 1.3226975268483107e-05,
      "loss": 0.0416,
      "num_input_tokens_seen": 239123808,
      "step": 1480
    },
    {
      "epoch": 0.6180008295313149,
      "grad_norm": 0.54296875,
      "learning_rate": 1.2979897284040433e-05,
      "loss": 0.0651,
      "num_input_tokens_seen": 240734672,
      "step": 1490
    },
    {
      "epoch": 0.6221484861053505,
      "grad_norm": 0.55078125,
      "learning_rate": 1.2734035295482044e-05,
      "loss": 0.0648,
      "num_input_tokens_seen": 242396032,
      "step": 1500
    },
    {
      "epoch": 0.6221484861053505,
      "eval_loss": 0.050693172961473465,
      "eval_runtime": 51.5616,
      "eval_samples_per_second": 60.452,
      "eval_steps_per_second": 1.901,
      "num_input_tokens_seen": 242396032,
      "step": 1500
    },
    {
      "epoch": 0.6262961426793862,
      "grad_norm": 0.515625,
      "learning_rate": 1.2489431890100223e-05,
      "loss": 0.0763,
      "num_input_tokens_seen": 244010816,
      "step": 1510
    },
    {
      "epoch": 0.6304437992534219,
      "grad_norm": 0.5078125,
      "learning_rate": 1.2246129437180198e-05,
      "loss": 0.0705,
      "num_input_tokens_seen": 245636640,
      "step": 1520
    },
    {
      "epoch": 0.6345914558274575,
      "grad_norm": 0.89453125,
      "learning_rate": 1.2004170080661054e-05,
      "loss": 0.0417,
      "num_input_tokens_seen": 247264032,
      "step": 1530
    },
    {
      "epoch": 0.6387391124014932,
      "grad_norm": 0.70703125,
      "learning_rate": 1.176359573183579e-05,
      "loss": 0.0563,
      "num_input_tokens_seen": 248895040,
      "step": 1540
    },
    {
      "epoch": 0.6428867689755289,
      "grad_norm": 0.86328125,
      "learning_rate": 1.1524448062091537e-05,
      "loss": 0.0567,
      "num_input_tokens_seen": 250514032,
      "step": 1550
    },
    {
      "epoch": 0.6470344255495645,
      "grad_norm": 0.8359375,
      "learning_rate": 1.1286768495691439e-05,
      "loss": 0.0487,
      "num_input_tokens_seen": 252089552,
      "step": 1560
    },
    {
      "epoch": 0.6511820821236002,
      "grad_norm": 0.47265625,
      "learning_rate": 1.1050598202599265e-05,
      "loss": 0.0518,
      "num_input_tokens_seen": 253733632,
      "step": 1570
    },
    {
      "epoch": 0.6553297386976359,
      "grad_norm": 0.6484375,
      "learning_rate": 1.0815978091348129e-05,
      "loss": 0.0548,
      "num_input_tokens_seen": 255366048,
      "step": 1580
    },
    {
      "epoch": 0.6594773952716715,
      "grad_norm": 0.75,
      "learning_rate": 1.0582948801954458e-05,
      "loss": 0.0653,
      "num_input_tokens_seen": 257013840,
      "step": 1590
    },
    {
      "epoch": 0.6636250518457072,
      "grad_norm": 0.4453125,
      "learning_rate": 1.035155069887846e-05,
      "loss": 0.0442,
      "num_input_tokens_seen": 258569696,
      "step": 1600
    },
    {
      "epoch": 0.6677727084197429,
      "grad_norm": 0.60546875,
      "learning_rate": 1.0121823864032383e-05,
      "loss": 0.0488,
      "num_input_tokens_seen": 260201856,
      "step": 1610
    },
    {
      "epoch": 0.6719203649937785,
      "grad_norm": 0.4765625,
      "learning_rate": 9.893808089837638e-06,
      "loss": 0.0397,
      "num_input_tokens_seen": 261772832,
      "step": 1620
    },
    {
      "epoch": 0.6760680215678142,
      "grad_norm": 0.361328125,
      "learning_rate": 9.66754287233214e-06,
      "loss": 0.0552,
      "num_input_tokens_seen": 263427808,
      "step": 1630
    },
    {
      "epoch": 0.6802156781418498,
      "grad_norm": 0.83203125,
      "learning_rate": 9.443067404328918e-06,
      "loss": 0.0625,
      "num_input_tokens_seen": 265071264,
      "step": 1640
    },
    {
      "epoch": 0.6843633347158855,
      "grad_norm": 0.4609375,
      "learning_rate": 9.220420568627313e-06,
      "loss": 0.0567,
      "num_input_tokens_seen": 266668784,
      "step": 1650
    },
    {
      "epoch": 0.6885109912899212,
      "grad_norm": 0.546875,
      "learning_rate": 8.999640931277825e-06,
      "loss": 0.0477,
      "num_input_tokens_seen": 268367856,
      "step": 1660
    },
    {
      "epoch": 0.6926586478639568,
      "grad_norm": 0.66796875,
      "learning_rate": 8.780766734901851e-06,
      "loss": 0.0408,
      "num_input_tokens_seen": 269965632,
      "step": 1670
    },
    {
      "epoch": 0.6968063044379925,
      "grad_norm": 0.70703125,
      "learning_rate": 8.563835892067446e-06,
      "loss": 0.0677,
      "num_input_tokens_seen": 271631968,
      "step": 1680
    },
    {
      "epoch": 0.7009539610120282,
      "grad_norm": 0.734375,
      "learning_rate": 8.348885978722252e-06,
      "loss": 0.0454,
      "num_input_tokens_seen": 273246624,
      "step": 1690
    },
    {
      "epoch": 0.7051016175860638,
      "grad_norm": 0.5234375,
      "learning_rate": 8.135954227684725e-06,
      "loss": 0.0671,
      "num_input_tokens_seen": 274826768,
      "step": 1700
    },
    {
      "epoch": 0.7092492741600995,
      "grad_norm": 0.451171875,
      "learning_rate": 7.925077522194797e-06,
      "loss": 0.0576,
      "num_input_tokens_seen": 276517456,
      "step": 1710
    },
    {
      "epoch": 0.7133969307341352,
      "grad_norm": 0.515625,
      "learning_rate": 7.716292389525135e-06,
      "loss": 0.0479,
      "num_input_tokens_seen": 278157936,
      "step": 1720
    },
    {
      "epoch": 0.7175445873081708,
      "grad_norm": 0.62109375,
      "learning_rate": 7.5096349946539735e-06,
      "loss": 0.0521,
      "num_input_tokens_seen": 279750640,
      "step": 1730
    },
    {
      "epoch": 0.7216922438822065,
      "grad_norm": 0.5859375,
      "learning_rate": 7.305141134000804e-06,
      "loss": 0.048,
      "num_input_tokens_seen": 281405344,
      "step": 1740
    },
    {
      "epoch": 0.7258399004562422,
      "grad_norm": 0.6875,
      "learning_rate": 7.102846229225813e-06,
      "loss": 0.0452,
      "num_input_tokens_seen": 283103808,
      "step": 1750
    },
    {
      "epoch": 0.7299875570302778,
      "grad_norm": 0.62109375,
      "learning_rate": 6.902785321094301e-06,
      "loss": 0.0391,
      "num_input_tokens_seen": 284686976,
      "step": 1760
    },
    {
      "epoch": 0.7341352136043136,
      "grad_norm": 0.76171875,
      "learning_rate": 6.7049930634070325e-06,
      "loss": 0.0487,
      "num_input_tokens_seen": 286315792,
      "step": 1770
    },
    {
      "epoch": 0.7382828701783493,
      "grad_norm": 0.455078125,
      "learning_rate": 6.509503716997649e-06,
      "loss": 0.049,
      "num_input_tokens_seen": 287958704,
      "step": 1780
    },
    {
      "epoch": 0.7424305267523849,
      "grad_norm": 0.73046875,
      "learning_rate": 6.316351143798141e-06,
      "loss": 0.0669,
      "num_input_tokens_seen": 289607936,
      "step": 1790
    },
    {
      "epoch": 0.7465781833264206,
      "grad_norm": 0.91796875,
      "learning_rate": 6.125568800973372e-06,
      "loss": 0.0554,
      "num_input_tokens_seen": 291205392,
      "step": 1800
    },
    {
      "epoch": 0.7465781833264206,
      "eval_loss": 0.049129463732242584,
      "eval_runtime": 51.7862,
      "eval_samples_per_second": 60.19,
      "eval_steps_per_second": 1.892,
      "num_input_tokens_seen": 291205392,
      "step": 1800
    },
    {
      "epoch": 0.7507258399004563,
      "grad_norm": 0.56640625,
      "learning_rate": 5.9371897351258075e-06,
      "loss": 0.0559,
      "num_input_tokens_seen": 292866352,
      "step": 1810
    },
    {
      "epoch": 0.7548734964744919,
      "grad_norm": 0.68359375,
      "learning_rate": 5.751246576571274e-06,
      "loss": 0.064,
      "num_input_tokens_seen": 294510528,
      "step": 1820
    },
    {
      "epoch": 0.7590211530485276,
      "grad_norm": 0.7890625,
      "learning_rate": 5.567771533686865e-06,
      "loss": 0.0563,
      "num_input_tokens_seen": 296088976,
      "step": 1830
    },
    {
      "epoch": 0.7631688096225633,
      "grad_norm": 0.4921875,
      "learning_rate": 5.386796387331938e-06,
      "loss": 0.0543,
      "num_input_tokens_seen": 297673328,
      "step": 1840
    },
    {
      "epoch": 0.7673164661965989,
      "grad_norm": 0.90234375,
      "learning_rate": 5.208352485343113e-06,
      "loss": 0.0556,
      "num_input_tokens_seen": 299376752,
      "step": 1850
    },
    {
      "epoch": 0.7714641227706346,
      "grad_norm": 0.53125,
      "learning_rate": 5.032470737104378e-06,
      "loss": 0.0568,
      "num_input_tokens_seen": 301050928,
      "step": 1860
    },
    {
      "epoch": 0.7756117793446703,
      "grad_norm": 0.87109375,
      "learning_rate": 4.859181608193031e-06,
      "loss": 0.0478,
      "num_input_tokens_seen": 302644848,
      "step": 1870
    },
    {
      "epoch": 0.7797594359187059,
      "grad_norm": 0.55859375,
      "learning_rate": 4.688515115102579e-06,
      "loss": 0.0438,
      "num_input_tokens_seen": 304286592,
      "step": 1880
    },
    {
      "epoch": 0.7839070924927416,
      "grad_norm": 0.66015625,
      "learning_rate": 4.520500820043374e-06,
      "loss": 0.0524,
      "num_input_tokens_seen": 305937648,
      "step": 1890
    },
    {
      "epoch": 0.7880547490667773,
      "grad_norm": 0.55859375,
      "learning_rate": 4.355167825821957e-06,
      "loss": 0.066,
      "num_input_tokens_seen": 307573472,
      "step": 1900
    },
    {
      "epoch": 0.7922024056408129,
      "grad_norm": 0.73046875,
      "learning_rate": 4.192544770800002e-06,
      "loss": 0.0461,
      "num_input_tokens_seen": 309152720,
      "step": 1910
    },
    {
      "epoch": 0.7963500622148486,
      "grad_norm": 0.9296875,
      "learning_rate": 4.032659823933656e-06,
      "loss": 0.0531,
      "num_input_tokens_seen": 310766416,
      "step": 1920
    },
    {
      "epoch": 0.8004977187888843,
      "grad_norm": 0.318359375,
      "learning_rate": 3.875540679894243e-06,
      "loss": 0.0377,
      "num_input_tokens_seen": 312403088,
      "step": 1930
    },
    {
      "epoch": 0.8046453753629199,
      "grad_norm": 0.578125,
      "learning_rate": 3.721214554271082e-06,
      "loss": 0.057,
      "num_input_tokens_seen": 314029904,
      "step": 1940
    },
    {
      "epoch": 0.8087930319369556,
      "grad_norm": 0.5234375,
      "learning_rate": 3.569708178857312e-06,
      "loss": 0.0463,
      "num_input_tokens_seen": 315638752,
      "step": 1950
    },
    {
      "epoch": 0.8129406885109913,
      "grad_norm": 0.56640625,
      "learning_rate": 3.4210477970194945e-06,
      "loss": 0.058,
      "num_input_tokens_seen": 317249712,
      "step": 1960
    },
    {
      "epoch": 0.8170883450850269,
      "grad_norm": 0.80078125,
      "learning_rate": 3.2752591591518666e-06,
      "loss": 0.0615,
      "num_input_tokens_seen": 318835520,
      "step": 1970
    },
    {
      "epoch": 0.8212360016590626,
      "grad_norm": 0.71484375,
      "learning_rate": 3.1323675182159354e-06,
      "loss": 0.0449,
      "num_input_tokens_seen": 320467712,
      "step": 1980
    },
    {
      "epoch": 0.8253836582330983,
      "grad_norm": 0.5625,
      "learning_rate": 2.992397625366237e-06,
      "loss": 0.0544,
      "num_input_tokens_seen": 322067104,
      "step": 1990
    },
    {
      "epoch": 0.8295313148071339,
      "grad_norm": 0.8984375,
      "learning_rate": 2.8553737256630753e-06,
      "loss": 0.0478,
      "num_input_tokens_seen": 323681856,
      "step": 2000
    },
    {
      "epoch": 0.8336789713811696,
      "grad_norm": 0.458984375,
      "learning_rate": 2.721319553872852e-06,
      "loss": 0.0502,
      "num_input_tokens_seen": 325343216,
      "step": 2010
    },
    {
      "epoch": 0.8378266279552053,
      "grad_norm": 0.53125,
      "learning_rate": 2.590258330356832e-06,
      "loss": 0.0458,
      "num_input_tokens_seen": 326946720,
      "step": 2020
    },
    {
      "epoch": 0.841974284529241,
      "grad_norm": 0.5078125,
      "learning_rate": 2.462212757048992e-06,
      "loss": 0.0589,
      "num_input_tokens_seen": 328553024,
      "step": 2030
    },
    {
      "epoch": 0.8461219411032767,
      "grad_norm": 0.71484375,
      "learning_rate": 2.3372050135236846e-06,
      "loss": 0.0473,
      "num_input_tokens_seen": 330170176,
      "step": 2040
    },
    {
      "epoch": 0.8502695976773124,
      "grad_norm": 0.79296875,
      "learning_rate": 2.2152567531537694e-06,
      "loss": 0.0623,
      "num_input_tokens_seen": 331801536,
      "step": 2050
    },
    {
      "epoch": 0.854417254251348,
      "grad_norm": 0.671875,
      "learning_rate": 2.0963890993599034e-06,
      "loss": 0.0553,
      "num_input_tokens_seen": 333424704,
      "step": 2060
    },
    {
      "epoch": 0.8585649108253837,
      "grad_norm": 0.70703125,
      "learning_rate": 1.9806226419516195e-06,
      "loss": 0.06,
      "num_input_tokens_seen": 335002400,
      "step": 2070
    },
    {
      "epoch": 0.8627125673994194,
      "grad_norm": 0.78125,
      "learning_rate": 1.867977433560817e-06,
      "loss": 0.0413,
      "num_input_tokens_seen": 336586720,
      "step": 2080
    },
    {
      "epoch": 0.866860223973455,
      "grad_norm": 0.64453125,
      "learning_rate": 1.7584729861683603e-06,
      "loss": 0.0441,
      "num_input_tokens_seen": 338226048,
      "step": 2090
    },
    {
      "epoch": 0.8710078805474907,
      "grad_norm": 0.9140625,
      "learning_rate": 1.6521282677242468e-06,
      "loss": 0.0435,
      "num_input_tokens_seen": 339841056,
      "step": 2100
    },
    {
      "epoch": 0.8710078805474907,
      "eval_loss": 0.04848345369100571,
      "eval_runtime": 51.7331,
      "eval_samples_per_second": 60.252,
      "eval_steps_per_second": 1.894,
      "num_input_tokens_seen": 339841056,
      "step": 2100
    },
    {
      "epoch": 0.8751555371215264,
      "grad_norm": 0.447265625,
      "learning_rate": 1.548961698862077e-06,
      "loss": 0.0487,
      "num_input_tokens_seen": 341502688,
      "step": 2110
    },
    {
      "epoch": 0.879303193695562,
      "grad_norm": 0.78515625,
      "learning_rate": 1.4489911497082877e-06,
      "loss": 0.0512,
      "num_input_tokens_seen": 343140656,
      "step": 2120
    },
    {
      "epoch": 0.8834508502695977,
      "grad_norm": 1.1171875,
      "learning_rate": 1.3522339367867553e-06,
      "loss": 0.0539,
      "num_input_tokens_seen": 344755872,
      "step": 2130
    },
    {
      "epoch": 0.8875985068436334,
      "grad_norm": 0.51171875,
      "learning_rate": 1.2587068200193065e-06,
      "loss": 0.0396,
      "num_input_tokens_seen": 346370080,
      "step": 2140
    },
    {
      "epoch": 0.891746163417669,
      "grad_norm": 0.68359375,
      "learning_rate": 1.1684259998226044e-06,
      "loss": 0.0536,
      "num_input_tokens_seen": 348025424,
      "step": 2150
    },
    {
      "epoch": 0.8958938199917047,
      "grad_norm": 0.50390625,
      "learning_rate": 1.081407114302e-06,
      "loss": 0.0444,
      "num_input_tokens_seen": 349610864,
      "step": 2160
    },
    {
      "epoch": 0.9000414765657404,
      "grad_norm": 0.5078125,
      "learning_rate": 9.976652365427242e-07,
      "loss": 0.0437,
      "num_input_tokens_seen": 351226816,
      "step": 2170
    },
    {
      "epoch": 0.904189133139776,
      "grad_norm": 0.439453125,
      "learning_rate": 9.172148719990237e-07,
      "loss": 0.0608,
      "num_input_tokens_seen": 352894160,
      "step": 2180
    },
    {
      "epoch": 0.9083367897138117,
      "grad_norm": 0.67578125,
      "learning_rate": 8.400699559815506e-07,
      "loss": 0.0472,
      "num_input_tokens_seen": 354495920,
      "step": 2190
    },
    {
      "epoch": 0.9124844462878474,
      "grad_norm": 0.828125,
      "learning_rate": 7.662438512435533e-07,
      "loss": 0.065,
      "num_input_tokens_seen": 356095968,
      "step": 2200
    },
    {
      "epoch": 0.916632102861883,
      "grad_norm": 0.796875,
      "learning_rate": 6.957493456662301e-07,
      "loss": 0.0527,
      "num_input_tokens_seen": 357755280,
      "step": 2210
    },
    {
      "epoch": 0.9207797594359187,
      "grad_norm": 0.462890625,
      "learning_rate": 6.285986500436525e-07,
      "loss": 0.0423,
      "num_input_tokens_seen": 359320080,
      "step": 2220
    },
    {
      "epoch": 0.9249274160099544,
      "grad_norm": 0.5,
      "learning_rate": 5.648033959676658e-07,
      "loss": 0.0468,
      "num_input_tokens_seen": 360913328,
      "step": 2230
    },
    {
      "epoch": 0.92907507258399,
      "grad_norm": 0.73828125,
      "learning_rate": 5.043746338131072e-07,
      "loss": 0.0568,
      "num_input_tokens_seen": 362546480,
      "step": 2240
    },
    {
      "epoch": 0.9332227291580257,
      "grad_norm": 0.45703125,
      "learning_rate": 4.4732283082369767e-07,
      "loss": 0.0376,
      "num_input_tokens_seen": 364148224,
      "step": 2250
    },
    {
      "epoch": 0.9373703857320614,
      "grad_norm": 0.6328125,
      "learning_rate": 3.9365786929894455e-07,
      "loss": 0.0513,
      "num_input_tokens_seen": 365695088,
      "step": 2260
    },
    {
      "epoch": 0.941518042306097,
      "grad_norm": 0.546875,
      "learning_rate": 3.43389044882374e-07,
      "loss": 0.057,
      "num_input_tokens_seen": 367280736,
      "step": 2270
    },
    {
      "epoch": 0.9456656988801327,
      "grad_norm": 0.58203125,
      "learning_rate": 2.965250649513629e-07,
      "loss": 0.0376,
      "num_input_tokens_seen": 368861360,
      "step": 2280
    },
    {
      "epoch": 0.9498133554541683,
      "grad_norm": 0.59765625,
      "learning_rate": 2.5307404710889217e-07,
      "loss": 0.0481,
      "num_input_tokens_seen": 370431952,
      "step": 2290
    },
    {
      "epoch": 0.953961012028204,
      "grad_norm": 0.455078125,
      "learning_rate": 2.1304351777743816e-07,
      "loss": 0.0448,
      "num_input_tokens_seen": 372061472,
      "step": 2300
    },
    {
      "epoch": 0.9581086686022398,
      "grad_norm": 0.53515625,
      "learning_rate": 1.7644041089527332e-07,
      "loss": 0.0726,
      "num_input_tokens_seen": 373691376,
      "step": 2310
    },
    {
      "epoch": 0.9622563251762754,
      "grad_norm": 0.640625,
      "learning_rate": 1.4327106671540248e-07,
      "loss": 0.0348,
      "num_input_tokens_seen": 375329184,
      "step": 2320
    },
    {
      "epoch": 0.9664039817503111,
      "grad_norm": 0.4453125,
      "learning_rate": 1.1354123070732137e-07,
      "loss": 0.0642,
      "num_input_tokens_seen": 376895712,
      "step": 2330
    },
    {
      "epoch": 0.9705516383243468,
      "grad_norm": 0.90625,
      "learning_rate": 8.725605256180602e-08,
      "loss": 0.0483,
      "num_input_tokens_seen": 378486272,
      "step": 2340
    },
    {
      "epoch": 0.9746992948983824,
      "grad_norm": 0.4921875,
      "learning_rate": 6.442008529891075e-08,
      "loss": 0.0429,
      "num_input_tokens_seen": 380094304,
      "step": 2350
    },
    {
      "epoch": 0.9788469514724181,
      "grad_norm": 0.498046875,
      "learning_rate": 4.503728447930566e-08,
      "loss": 0.0356,
      "num_input_tokens_seen": 381704560,
      "step": 2360
    },
    {
      "epoch": 0.9829946080464538,
      "grad_norm": 0.404296875,
      "learning_rate": 2.9111007519113665e-08,
      "loss": 0.0359,
      "num_input_tokens_seen": 383324784,
      "step": 2370
    },
    {
      "epoch": 0.9871422646204894,
      "grad_norm": 0.57421875,
      "learning_rate": 1.6644013108342294e-08,
      "loss": 0.0454,
      "num_input_tokens_seen": 384918032,
      "step": 2380
    },
    {
      "epoch": 0.9912899211945251,
      "grad_norm": 0.3828125,
      "learning_rate": 7.638460733043751e-09,
      "loss": 0.0433,
      "num_input_tokens_seen": 386639824,
      "step": 2390
    },
    {
      "epoch": 0.9954375777685608,
      "grad_norm": 0.84375,
      "learning_rate": 2.0959103012518913e-09,
      "loss": 0.067,
      "num_input_tokens_seen": 388287472,
      "step": 2400
    },
    {
      "epoch": 0.9954375777685608,
      "eval_loss": 0.04846753552556038,
      "eval_runtime": 52.7044,
      "eval_samples_per_second": 59.141,
      "eval_steps_per_second": 1.859,
      "num_input_tokens_seen": 388287472,
      "step": 2400
    },
    {
      "epoch": 0.9995852343425964,
      "grad_norm": 0.6953125,
      "learning_rate": 1.732187278280506e-11,
      "loss": 0.0414,
      "num_input_tokens_seen": 389830288,
      "step": 2410
    }
  ],
  "logging_steps": 10,
  "max_steps": 2411,
  "num_input_tokens_seen": 389993424,
  "num_train_epochs": 1,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1949017560795054e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}