|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9963459196102313, |
|
"eval_steps": 500, |
|
"global_step": 1230, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.024360535931790498, |
|
"grad_norm": 11.305159308546282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8887, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.048721071863580996, |
|
"grad_norm": 4.407971880258946, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7976, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0730816077953715, |
|
"grad_norm": 1.3980174472682592, |
|
"learning_rate": 5e-06, |
|
"loss": 0.767, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09744214372716199, |
|
"grad_norm": 4.101997301199602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7475, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1218026796589525, |
|
"grad_norm": 3.8298876079533914, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7403, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.146163215590743, |
|
"grad_norm": 1.120036674314868, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7256, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1705237515225335, |
|
"grad_norm": 0.7504651491119281, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7212, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19488428745432398, |
|
"grad_norm": 0.8705482174733401, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6967, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2192448233861145, |
|
"grad_norm": 0.8066011359814329, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6944, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.243605359317905, |
|
"grad_norm": 0.7143294307124043, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6991, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2679658952496955, |
|
"grad_norm": 0.5219278289863366, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6904, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.292326431181486, |
|
"grad_norm": 0.49976792832548467, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6917, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3166869671132765, |
|
"grad_norm": 0.8671962194472669, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6923, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.341047503045067, |
|
"grad_norm": 0.4958220955019927, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6886, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3654080389768575, |
|
"grad_norm": 0.5491440067010557, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6844, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.38976857490864797, |
|
"grad_norm": 0.5764231325699036, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6873, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.41412911084043846, |
|
"grad_norm": 0.4866036070242275, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6773, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.438489646772229, |
|
"grad_norm": 0.7229793933095654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6801, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4628501827040195, |
|
"grad_norm": 0.5825475907586349, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6806, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.48721071863581, |
|
"grad_norm": 0.641550842935756, |
|
"learning_rate": 5e-06, |
|
"loss": 0.677, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5115712545676004, |
|
"grad_norm": 0.555875854836963, |
|
"learning_rate": 5e-06, |
|
"loss": 0.669, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.535931790499391, |
|
"grad_norm": 0.4380177981926619, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6672, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5602923264311814, |
|
"grad_norm": 0.5586357299552903, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6696, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.584652862362972, |
|
"grad_norm": 0.5268423895517483, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6761, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6090133982947625, |
|
"grad_norm": 0.5068291541548725, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6672, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.633373934226553, |
|
"grad_norm": 0.7203145859800878, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6758, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6577344701583435, |
|
"grad_norm": 0.4843027545014372, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6684, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.682095006090134, |
|
"grad_norm": 0.4654716032330135, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6674, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7064555420219245, |
|
"grad_norm": 0.48677469218469316, |
|
"learning_rate": 5e-06, |
|
"loss": 0.657, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.730816077953715, |
|
"grad_norm": 0.501936617406133, |
|
"learning_rate": 5e-06, |
|
"loss": 0.666, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7551766138855055, |
|
"grad_norm": 0.4189199112711787, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6672, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7795371498172959, |
|
"grad_norm": 0.525860628294632, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6625, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8038976857490865, |
|
"grad_norm": 0.5055516889416151, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6687, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8282582216808769, |
|
"grad_norm": 0.5030088195887705, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6622, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8526187576126675, |
|
"grad_norm": 0.4409999841350699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.659, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.876979293544458, |
|
"grad_norm": 0.49889143289837934, |
|
"learning_rate": 5e-06, |
|
"loss": 0.664, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9013398294762485, |
|
"grad_norm": 0.46333426563091684, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6647, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.925700365408039, |
|
"grad_norm": 0.4132898286035426, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6604, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9500609013398295, |
|
"grad_norm": 0.4602502572358803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.663, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.97442143727162, |
|
"grad_norm": 0.586425378319964, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6588, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9987819732034104, |
|
"grad_norm": 0.4637558734433708, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6557, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9987819732034104, |
|
"eval_loss": 0.6518880128860474, |
|
"eval_runtime": 221.2706, |
|
"eval_samples_per_second": 49.966, |
|
"eval_steps_per_second": 0.393, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0231425091352009, |
|
"grad_norm": 0.6010683259164777, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6207, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0475030450669915, |
|
"grad_norm": 0.6050810738565418, |
|
"learning_rate": 5e-06, |
|
"loss": 0.61, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.071863580998782, |
|
"grad_norm": 0.4799441913834175, |
|
"learning_rate": 5e-06, |
|
"loss": 0.617, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0962241169305724, |
|
"grad_norm": 0.41533745441354586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6233, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.1205846528623629, |
|
"grad_norm": 0.42865808124947796, |
|
"learning_rate": 5e-06, |
|
"loss": 0.616, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1449451887941535, |
|
"grad_norm": 0.5620085827072487, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6226, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.169305724725944, |
|
"grad_norm": 0.47328106114801194, |
|
"learning_rate": 5e-06, |
|
"loss": 0.609, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.1936662606577344, |
|
"grad_norm": 0.4720567281560868, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6143, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.218026796589525, |
|
"grad_norm": 0.44112203366329256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.614, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2423873325213155, |
|
"grad_norm": 0.5187652730488376, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6199, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.266747868453106, |
|
"grad_norm": 0.5638861172624315, |
|
"learning_rate": 5e-06, |
|
"loss": 0.619, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2911084043848966, |
|
"grad_norm": 0.5972907620170446, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6182, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.315468940316687, |
|
"grad_norm": 0.5314321040836214, |
|
"learning_rate": 5e-06, |
|
"loss": 0.619, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3398294762484775, |
|
"grad_norm": 0.5459662859735409, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6183, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.364190012180268, |
|
"grad_norm": 0.5202733547748785, |
|
"learning_rate": 5e-06, |
|
"loss": 0.618, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.3885505481120584, |
|
"grad_norm": 0.4161689870213624, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6101, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.412911084043849, |
|
"grad_norm": 0.46394109509695763, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6274, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.4372716199756395, |
|
"grad_norm": 0.4808851283054136, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6087, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.46163215590743, |
|
"grad_norm": 0.5411540324211217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6215, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4859926918392206, |
|
"grad_norm": 0.5416915020329361, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6167, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.510353227771011, |
|
"grad_norm": 0.527607596364707, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6128, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5347137637028014, |
|
"grad_norm": 0.520963657326471, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6137, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.559074299634592, |
|
"grad_norm": 0.4366228046959017, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6171, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.5834348355663823, |
|
"grad_norm": 0.5504251670894937, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6143, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.607795371498173, |
|
"grad_norm": 0.4715628019229569, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6202, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.6321559074299634, |
|
"grad_norm": 0.5291464708625646, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6155, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.6565164433617539, |
|
"grad_norm": 0.4355159440359265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6162, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.6808769792935445, |
|
"grad_norm": 0.5112620919843524, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6279, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.705237515225335, |
|
"grad_norm": 0.57875404757705, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6176, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7295980511571254, |
|
"grad_norm": 0.4410704500201331, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6195, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.753958587088916, |
|
"grad_norm": 0.5587895103691882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6194, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.7783191230207065, |
|
"grad_norm": 0.4941053548445359, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6096, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.802679658952497, |
|
"grad_norm": 0.5227563230610854, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6102, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.8270401948842876, |
|
"grad_norm": 0.4591897668705156, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6117, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.8514007308160778, |
|
"grad_norm": 0.5103376738813472, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6134, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.8757612667478685, |
|
"grad_norm": 0.532214266722337, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6102, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.900121802679659, |
|
"grad_norm": 0.4632257568024349, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6218, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.9244823386114494, |
|
"grad_norm": 0.5412849420492728, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6109, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.94884287454324, |
|
"grad_norm": 0.48808240750337195, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6176, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.9732034104750305, |
|
"grad_norm": 0.4761455418357999, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6098, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.997563946406821, |
|
"grad_norm": 0.4534197510006015, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6082, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.6419612765312195, |
|
"eval_runtime": 221.435, |
|
"eval_samples_per_second": 49.929, |
|
"eval_steps_per_second": 0.393, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 2.0219244823386116, |
|
"grad_norm": 0.6074772099873261, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5769, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.0462850182704018, |
|
"grad_norm": 0.5110291152400608, |
|
"learning_rate": 5e-06, |
|
"loss": 0.564, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.0706455542021924, |
|
"grad_norm": 0.740312554525951, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5717, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.095006090133983, |
|
"grad_norm": 0.5821754748157193, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5726, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.1193666260657733, |
|
"grad_norm": 0.53860209415622, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5742, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.143727161997564, |
|
"grad_norm": 0.5215524148222913, |
|
"learning_rate": 5e-06, |
|
"loss": 0.564, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.1680876979293546, |
|
"grad_norm": 0.6458934700822203, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5724, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.192448233861145, |
|
"grad_norm": 0.4435184357785445, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5684, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.2168087697929355, |
|
"grad_norm": 0.5416262844784988, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5718, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.2411693057247257, |
|
"grad_norm": 0.4739984176413269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5756, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.2655298416565164, |
|
"grad_norm": 0.47994087642094213, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5742, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.289890377588307, |
|
"grad_norm": 0.4742359512444407, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5731, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.3142509135200973, |
|
"grad_norm": 0.5586334439764152, |
|
"learning_rate": 5e-06, |
|
"loss": 0.576, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.338611449451888, |
|
"grad_norm": 0.49880213092932163, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5799, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.3629719853836786, |
|
"grad_norm": 0.49935902866105975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5762, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.387332521315469, |
|
"grad_norm": 0.5465185670805549, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5717, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.4116930572472595, |
|
"grad_norm": 0.4986248004640357, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5772, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.43605359317905, |
|
"grad_norm": 0.5423471098966955, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5804, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.4604141291108403, |
|
"grad_norm": 0.5193096800667882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5691, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.484774665042631, |
|
"grad_norm": 0.4590023482690989, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5741, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.5091352009744217, |
|
"grad_norm": 0.4671536002975626, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5714, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.533495736906212, |
|
"grad_norm": 0.5523685876104364, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5734, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.5578562728380025, |
|
"grad_norm": 0.6868866709072206, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5728, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.582216808769793, |
|
"grad_norm": 0.5582819992545279, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5737, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.6065773447015834, |
|
"grad_norm": 0.4702857244191192, |
|
"learning_rate": 5e-06, |
|
"loss": 0.566, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.630937880633374, |
|
"grad_norm": 0.6487634608204832, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5818, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.6552984165651643, |
|
"grad_norm": 0.4736967537062896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5753, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.679658952496955, |
|
"grad_norm": 0.5348827813693043, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5771, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.704019488428745, |
|
"grad_norm": 0.5028960700092897, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5713, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.728380024360536, |
|
"grad_norm": 0.4780698681645441, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5746, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.7527405602923265, |
|
"grad_norm": 0.4864478553500122, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5752, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.7771010962241167, |
|
"grad_norm": 0.4667264912708201, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5772, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.8014616321559074, |
|
"grad_norm": 0.45394076375291925, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5823, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.825822168087698, |
|
"grad_norm": 0.5161201565392174, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5815, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.8501827040194883, |
|
"grad_norm": 0.5076152963599294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5784, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.874543239951279, |
|
"grad_norm": 0.4752319372351976, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5791, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.8989037758830696, |
|
"grad_norm": 0.533679377576446, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5796, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.92326431181486, |
|
"grad_norm": 0.4952941664544987, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5735, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.9476248477466505, |
|
"grad_norm": 0.4611730832059269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5748, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.971985383678441, |
|
"grad_norm": 0.5882799223730999, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5781, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.9963459196102313, |
|
"grad_norm": 0.4979608878944041, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5706, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.9963459196102313, |
|
"eval_loss": 0.6427608132362366, |
|
"eval_runtime": 221.8996, |
|
"eval_samples_per_second": 49.824, |
|
"eval_steps_per_second": 0.392, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.9963459196102313, |
|
"step": 1230, |
|
"total_flos": 2059877052579840.0, |
|
"train_loss": 0.6269122554034722, |
|
"train_runtime": 37089.6991, |
|
"train_samples_per_second": 16.991, |
|
"train_steps_per_second": 0.033 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1230, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2059877052579840.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|