|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9898162809361648, |
|
"eval_steps": 300, |
|
"global_step": 5764, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00034704899898054355, |
|
"eval_loss": 3.3464338779449463, |
|
"eval_runtime": 18.8384, |
|
"eval_samples_per_second": 23.357, |
|
"eval_steps_per_second": 23.357, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00867622497451359, |
|
"grad_norm": 11.25, |
|
"learning_rate": 5e-06, |
|
"loss": 2.9137, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01735244994902718, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.2899, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.026028674923540766, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 1.8283, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.03470489989805436, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 2e-05, |
|
"loss": 1.5934, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04338112487256795, |
|
"grad_norm": 5.125, |
|
"learning_rate": 1.9999763673911112e-05, |
|
"loss": 1.4074, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.05205734984708153, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.9999054706814453e-05, |
|
"loss": 1.3029, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.060733574821595124, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.9997873132219502e-05, |
|
"loss": 1.2048, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.06940979979610871, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.9996219005973644e-05, |
|
"loss": 1.1517, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0780860247706223, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.9994092406259516e-05, |
|
"loss": 1.1061, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.0867622497451359, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.9991493433591315e-05, |
|
"loss": 1.0597, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09543847471964947, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.998842221081005e-05, |
|
"loss": 1.0366, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.10411469969416307, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.998487888307774e-05, |
|
"loss": 1.0176, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10411469969416307, |
|
"eval_loss": 1.072221279144287, |
|
"eval_runtime": 19.1563, |
|
"eval_samples_per_second": 22.969, |
|
"eval_steps_per_second": 22.969, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11279092466867666, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.998086361787053e-05, |
|
"loss": 0.9971, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.12146714964319025, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.9976376604970818e-05, |
|
"loss": 0.965, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.13014337461770384, |
|
"grad_norm": 3.875, |
|
"learning_rate": 1.997141805645824e-05, |
|
"loss": 0.941, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.13881959959221743, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.996598820669967e-05, |
|
"loss": 0.9471, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14749582456673102, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.9960087312338138e-05, |
|
"loss": 0.9273, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.1561720495412446, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.995371565228071e-05, |
|
"loss": 0.8989, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1648482745157582, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.994687352768527e-05, |
|
"loss": 0.8921, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.1735244994902718, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.9939561261946343e-05, |
|
"loss": 0.8718, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18220072446478539, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.9931779200679754e-05, |
|
"loss": 0.8735, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.19087694943929895, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.992352771170633e-05, |
|
"loss": 0.8747, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.19955317441381254, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 1.9914807185034483e-05, |
|
"loss": 0.8315, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.20822939938832613, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 1.9905618032841812e-05, |
|
"loss": 0.8365, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.20822939938832613, |
|
"eval_loss": 0.9373729228973389, |
|
"eval_runtime": 18.8365, |
|
"eval_samples_per_second": 23.359, |
|
"eval_steps_per_second": 23.359, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.21690562436283972, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.9895960689455598e-05, |
|
"loss": 0.8469, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.2255818493373533, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 1.9885835611332278e-05, |
|
"loss": 0.8306, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.2342580743118669, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.987524327703587e-05, |
|
"loss": 0.7991, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.2429342992863805, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 1.986418418721537e-05, |
|
"loss": 0.8085, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2516105242608941, |
|
"grad_norm": 3.25, |
|
"learning_rate": 1.9852658864581063e-05, |
|
"loss": 0.7983, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.2602867492354077, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.9840667853879827e-05, |
|
"loss": 0.7847, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.26896297420992127, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1.9828211721869404e-05, |
|
"loss": 0.7744, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.27763919918443486, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.9815291057291583e-05, |
|
"loss": 0.7846, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.28631542415894845, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 1.980190647084438e-05, |
|
"loss": 0.7874, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.29499164913346204, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1.9788058595153202e-05, |
|
"loss": 0.7744, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.30366787410797563, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1.97737480847409e-05, |
|
"loss": 0.7565, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.3123440990824892, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.9758975615996874e-05, |
|
"loss": 0.7477, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3123440990824892, |
|
"eval_loss": 0.8618763089179993, |
|
"eval_runtime": 39.106, |
|
"eval_samples_per_second": 11.251, |
|
"eval_steps_per_second": 11.251, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3210203240570028, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.9743741887145067e-05, |
|
"loss": 0.7589, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.3296965490315164, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.9728047618210995e-05, |
|
"loss": 0.7397, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.33837277400603, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.9711893550987696e-05, |
|
"loss": 0.7504, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.3470489989805436, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.969528044900068e-05, |
|
"loss": 0.7365, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3557252239550572, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.967820909747182e-05, |
|
"loss": 0.7463, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.36440144892957077, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.9660680303282273e-05, |
|
"loss": 0.7175, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3730776739040843, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.964269489493431e-05, |
|
"loss": 0.7475, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.3817538988785979, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9624253722512174e-05, |
|
"loss": 0.7255, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3904301238531115, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9605357657641896e-05, |
|
"loss": 0.7322, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.3991063488276251, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.9586007593450098e-05, |
|
"loss": 0.7329, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.40778257380213867, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.9566204444521776e-05, |
|
"loss": 0.7143, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.41645879877665226, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.954594914685708e-05, |
|
"loss": 0.702, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.41645879877665226, |
|
"eval_loss": 0.8318689465522766, |
|
"eval_runtime": 18.8241, |
|
"eval_samples_per_second": 23.374, |
|
"eval_steps_per_second": 23.374, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.42513502375116585, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.9525242657827063e-05, |
|
"loss": 0.7272, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.43381124872567944, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.9504085956128437e-05, |
|
"loss": 0.7043, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.44248747370019303, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.9482480041737312e-05, |
|
"loss": 0.7123, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.4511636986747066, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.946042593586195e-05, |
|
"loss": 0.693, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4598399236492202, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.9437924680894456e-05, |
|
"loss": 0.7004, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.4685161486237338, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 1.941497734036155e-05, |
|
"loss": 0.6827, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4771923735982474, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.939158499887428e-05, |
|
"loss": 0.6949, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.485868598572761, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1.936774876207676e-05, |
|
"loss": 0.7027, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4945448235472746, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9343469756593915e-05, |
|
"loss": 0.7069, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.5032210485217882, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.9318749129978225e-05, |
|
"loss": 0.6873, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.5118972734963018, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.9293588050655492e-05, |
|
"loss": 0.6733, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.5205734984708154, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.9267987707869605e-05, |
|
"loss": 0.6779, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5205734984708154, |
|
"eval_loss": 0.7879548072814941, |
|
"eval_runtime": 18.8786, |
|
"eval_samples_per_second": 23.307, |
|
"eval_steps_per_second": 23.307, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.529249723445329, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.924194931162635e-05, |
|
"loss": 0.6733, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.5379259484198425, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.9215474092636187e-05, |
|
"loss": 0.6681, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5466021733943561, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.918856330225611e-05, |
|
"loss": 0.6771, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.5552783983688697, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.916121821243049e-05, |
|
"loss": 0.6582, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5639546233433833, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.9133440115630953e-05, |
|
"loss": 0.6551, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.5726308483178969, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.910523032479529e-05, |
|
"loss": 0.6631, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5813070732924105, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.9076590173265406e-05, |
|
"loss": 0.6593, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.5899832982669241, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.9047521014724303e-05, |
|
"loss": 0.6439, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5986595232414377, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.9018024223132096e-05, |
|
"loss": 0.6538, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.6073357482159513, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.8988101192661057e-05, |
|
"loss": 0.6662, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6160119731904649, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.895775333762974e-05, |
|
"loss": 0.6467, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.6246881981649784, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.8926982092436117e-05, |
|
"loss": 0.643, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6246881981649784, |
|
"eval_loss": 0.7786636352539062, |
|
"eval_runtime": 19.0872, |
|
"eval_samples_per_second": 23.052, |
|
"eval_steps_per_second": 23.052, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.633364423139492, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.88957889114898e-05, |
|
"loss": 0.6471, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.6420406481140056, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.8864175269143275e-05, |
|
"loss": 0.6413, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.6507168730885192, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.8832142659622236e-05, |
|
"loss": 0.6424, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.6593930980630328, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.8799692596954947e-05, |
|
"loss": 0.6405, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6680693230375464, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.8766826614900687e-05, |
|
"loss": 0.6307, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.67674554801206, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.8733546266877254e-05, |
|
"loss": 0.6151, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.6854217729865736, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.8699853125887543e-05, |
|
"loss": 0.6442, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.6940979979610872, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.8665748784445206e-05, |
|
"loss": 0.6104, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7027742229356008, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.8631234854499365e-05, |
|
"loss": 0.6213, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.7114504479101144, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.8596312967358436e-05, |
|
"loss": 0.6198, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.720126672884628, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.856098477361302e-05, |
|
"loss": 0.6263, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.7288028978591415, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.8525251943057884e-05, |
|
"loss": 0.6201, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7288028978591415, |
|
"eval_loss": 0.7636004090309143, |
|
"eval_runtime": 18.8162, |
|
"eval_samples_per_second": 23.384, |
|
"eval_steps_per_second": 23.384, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7374791228336551, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.8489116164613053e-05, |
|
"loss": 0.6182, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.7461553478081686, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.845257914624396e-05, |
|
"loss": 0.6252, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.7548315727826822, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.841564261488074e-05, |
|
"loss": 0.6067, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.7635077977571958, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.8378308316336585e-05, |
|
"loss": 0.6172, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.7721840227317094, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.834057801522525e-05, |
|
"loss": 0.6064, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.780860247706223, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.8302453494877635e-05, |
|
"loss": 0.6131, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.7895364726807366, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.8263936557257496e-05, |
|
"loss": 0.6197, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.7982126976552502, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.8225029022876275e-05, |
|
"loss": 0.6128, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8068889226297637, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.818573273070706e-05, |
|
"loss": 0.5884, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.8155651476042773, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.8146049538097662e-05, |
|
"loss": 0.6053, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8242413725787909, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.8105981320682815e-05, |
|
"loss": 0.6103, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.8329175975533045, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.8065529972295545e-05, |
|
"loss": 0.6053, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8329175975533045, |
|
"eval_loss": 0.738046407699585, |
|
"eval_runtime": 18.7861, |
|
"eval_samples_per_second": 23.422, |
|
"eval_steps_per_second": 23.422, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8415938225278181, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.802469740487764e-05, |
|
"loss": 0.5852, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.8502700475023317, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.7983485548389293e-05, |
|
"loss": 0.5995, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.8589462724768453, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.794189635071788e-05, |
|
"loss": 0.5924, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.8676224974513589, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.789993177758588e-05, |
|
"loss": 0.5757, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8762987224258725, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.7857593812457985e-05, |
|
"loss": 0.5869, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.8849749474003861, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.7814884456447337e-05, |
|
"loss": 0.6001, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.8936511723748997, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.7771805728220942e-05, |
|
"loss": 0.5996, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.9023273973494133, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.772835966390428e-05, |
|
"loss": 0.578, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.9110036223239268, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.7684548316985043e-05, |
|
"loss": 0.5959, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.9196798472984404, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.7640373758216075e-05, |
|
"loss": 0.5728, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.928356072272954, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.7595838075517523e-05, |
|
"loss": 0.5762, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.9370322972474676, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.755094337387813e-05, |
|
"loss": 0.5801, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9370322972474676, |
|
"eval_loss": 0.7330707907676697, |
|
"eval_runtime": 18.9063, |
|
"eval_samples_per_second": 23.273, |
|
"eval_steps_per_second": 23.273, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9457085222219812, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.7505691775255744e-05, |
|
"loss": 0.5517, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.9543847471964948, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.7460085418477025e-05, |
|
"loss": 0.5622, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.9630609721710084, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.7414126459136365e-05, |
|
"loss": 0.5664, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.971737197145522, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.736781706949398e-05, |
|
"loss": 0.5676, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.9804134221200356, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.732115943837326e-05, |
|
"loss": 0.5925, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.9890896470945492, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.7274155771057302e-05, |
|
"loss": 0.5673, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.9977658720690628, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.7226808289184673e-05, |
|
"loss": 0.5745, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 1.0064420970435763, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.717911923064442e-05, |
|
"loss": 0.5659, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.0045441728304014, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.713109084947028e-05, |
|
"loss": 0.4966, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 1.013220397804915, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.7082725415734145e-05, |
|
"loss": 0.4426, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.0218966227794286, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.7034025215438776e-05, |
|
"loss": 0.4382, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 1.0305728477539422, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.6984992550409747e-05, |
|
"loss": 0.4414, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.0305728477539422, |
|
"eval_loss": 0.7384564280509949, |
|
"eval_runtime": 18.8395, |
|
"eval_samples_per_second": 23.355, |
|
"eval_steps_per_second": 23.355, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.0392490727284558, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.6935629738186646e-05, |
|
"loss": 0.4454, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 1.0479252977029694, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.6885939111913544e-05, |
|
"loss": 0.4334, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.056601522677483, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.6835923020228714e-05, |
|
"loss": 0.4293, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 1.0652777476519966, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.678558382715362e-05, |
|
"loss": 0.4502, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.0739539726265102, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.6734923911981188e-05, |
|
"loss": 0.437, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 1.0826301976010237, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.668394566916334e-05, |
|
"loss": 0.4442, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.0913064225755373, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.6632651508197827e-05, |
|
"loss": 0.4448, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 1.099982647550051, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.6581043853514335e-05, |
|
"loss": 0.4358, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.1086588725245645, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.6529125144359902e-05, |
|
"loss": 0.4561, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 1.1173350974990781, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.647689783468362e-05, |
|
"loss": 0.4294, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.1260113224735917, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.642436439302066e-05, |
|
"loss": 0.4316, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 1.1346875474481053, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.637152730237558e-05, |
|
"loss": 0.4455, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.1346875474481053, |
|
"eval_loss": 0.7258099913597107, |
|
"eval_runtime": 18.9376, |
|
"eval_samples_per_second": 23.234, |
|
"eval_steps_per_second": 23.234, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.1433637724226189, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.631838906010498e-05, |
|
"loss": 0.4332, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 1.1520399973971325, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.6264952177799446e-05, |
|
"loss": 0.4303, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.160716222371646, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.6211219181164864e-05, |
|
"loss": 0.4498, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 1.1693924473461597, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.6157192609903017e-05, |
|
"loss": 0.445, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.1780686723206732, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.6102875017591566e-05, |
|
"loss": 0.4471, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 1.1867448972951868, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.6048268971563337e-05, |
|
"loss": 0.4449, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.1954211222697004, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.5993377052784988e-05, |
|
"loss": 0.4333, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 1.204097347244214, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.5938201855735017e-05, |
|
"loss": 0.4307, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.2127735722187276, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.588274598828113e-05, |
|
"loss": 0.4251, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 1.2214497971932412, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.582701207155697e-05, |
|
"loss": 0.4227, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.2301260221677548, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.577100273983826e-05, |
|
"loss": 0.4401, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 1.2388022471422684, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.5714720640418252e-05, |
|
"loss": 0.4333, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.2388022471422684, |
|
"eval_loss": 0.713193416595459, |
|
"eval_runtime": 18.8026, |
|
"eval_samples_per_second": 23.401, |
|
"eval_steps_per_second": 23.401, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.247478472116782, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.5658168433482637e-05, |
|
"loss": 0.432, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 1.2561546970912956, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.560134879198379e-05, |
|
"loss": 0.429, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.2648309220658092, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.554426440151444e-05, |
|
"loss": 0.4378, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 1.2735071470403228, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.5486917960180742e-05, |
|
"loss": 0.4278, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.2821833720148363, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.542931217847472e-05, |
|
"loss": 0.429, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 1.29085959698935, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.5371449779146205e-05, |
|
"loss": 0.4289, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.2995358219638635, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.5313333497074094e-05, |
|
"loss": 0.4271, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 1.3082120469383771, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.5254966079137118e-05, |
|
"loss": 0.4239, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.3168882719128907, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.5196350284083999e-05, |
|
"loss": 0.4291, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 1.3255644968874043, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.513748888240305e-05, |
|
"loss": 0.429, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.3342407218619179, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.507838465619125e-05, |
|
"loss": 0.4232, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 1.3429169468364315, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.5019040399022711e-05, |
|
"loss": 0.4237, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.3429169468364315, |
|
"eval_loss": 0.7250744700431824, |
|
"eval_runtime": 18.82, |
|
"eval_samples_per_second": 23.379, |
|
"eval_steps_per_second": 23.379, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.351593171810945, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.4959458915816681e-05, |
|
"loss": 0.4297, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 1.3602693967854587, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.489964302270493e-05, |
|
"loss": 0.4331, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.3689456217599723, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.483959554689868e-05, |
|
"loss": 0.43, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 1.3776218467344858, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.4779319326554953e-05, |
|
"loss": 0.4165, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3862980717089994, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.4718817210642427e-05, |
|
"loss": 0.4325, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 1.394974296683513, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.4658092058806783e-05, |
|
"loss": 0.4225, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.4036505216580266, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.4597146741235554e-05, |
|
"loss": 0.4137, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 1.4123267466325402, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.4535984138522442e-05, |
|
"loss": 0.4075, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.4210029716070538, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.447460714153119e-05, |
|
"loss": 0.4228, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 1.4296791965815674, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.4413018651258922e-05, |
|
"loss": 0.4215, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.438355421556081, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.4351221578699045e-05, |
|
"loss": 0.4203, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 1.4470316465305946, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.4289218844703654e-05, |
|
"loss": 0.4068, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.4470316465305946, |
|
"eval_loss": 0.7109268307685852, |
|
"eval_runtime": 19.0434, |
|
"eval_samples_per_second": 23.105, |
|
"eval_steps_per_second": 23.105, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.4557078715051082, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.4227013379845471e-05, |
|
"loss": 0.4169, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 1.4643840964796218, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.4164608124279337e-05, |
|
"loss": 0.407, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.4730603214541353, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.4102006027603255e-05, |
|
"loss": 0.4349, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 1.481736546428649, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.403921004871895e-05, |
|
"loss": 0.4077, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.4904127714031625, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.3976223155692047e-05, |
|
"loss": 0.4234, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 1.4990889963776761, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.391304832561175e-05, |
|
"loss": 0.4177, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.5077652213521897, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.3849688544450176e-05, |
|
"loss": 0.4027, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 1.5164414463267033, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.3786146806921166e-05, |
|
"loss": 0.4125, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.525117671301217, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.3722426116338792e-05, |
|
"loss": 0.4019, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 1.5337938962757305, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.3658529484475369e-05, |
|
"loss": 0.4175, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.5424701212502439, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.3594459931419112e-05, |
|
"loss": 0.4136, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 1.5511463462247574, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.3530220485431405e-05, |
|
"loss": 0.3997, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.5511463462247574, |
|
"eval_loss": 0.706421434879303, |
|
"eval_runtime": 19.0141, |
|
"eval_samples_per_second": 23.141, |
|
"eval_steps_per_second": 23.141, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.559822571199271, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.3465814182803653e-05, |
|
"loss": 0.422, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 1.5684987961737846, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.340124406771377e-05, |
|
"loss": 0.4171, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.5771750211482982, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.3336513192082316e-05, |
|
"loss": 0.4085, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 1.5858512461228118, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1.3271624615428218e-05, |
|
"loss": 0.4088, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.5945274710973254, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.3206581404724185e-05, |
|
"loss": 0.3976, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 1.603203696071839, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.3141386634251736e-05, |
|
"loss": 0.404, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.6118799210463526, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.3076043385455894e-05, |
|
"loss": 0.4128, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 1.6205561460208662, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.3010554746799544e-05, |
|
"loss": 0.3959, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.6292323709953798, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.2944923813617458e-05, |
|
"loss": 0.3978, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 1.6379085959698934, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.2879153687969984e-05, |
|
"loss": 0.4009, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.646584820944407, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.2813247478496428e-05, |
|
"loss": 0.3974, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 1.6552610459189205, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.274720830026814e-05, |
|
"loss": 0.3967, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.6552610459189205, |
|
"eval_loss": 0.7064741253852844, |
|
"eval_runtime": 19.1066, |
|
"eval_samples_per_second": 23.029, |
|
"eval_steps_per_second": 23.029, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.6639372708934341, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.2681039274641261e-05, |
|
"loss": 0.4103, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 1.6726134958679477, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.261474352910919e-05, |
|
"loss": 0.4044, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.6812897208424613, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.2548324197154788e-05, |
|
"loss": 0.3968, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 1.689965945816975, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.248178441810224e-05, |
|
"loss": 0.3955, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.6986421707914885, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.2415127336968691e-05, |
|
"loss": 0.3903, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 1.707318395766002, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.23483561043156e-05, |
|
"loss": 0.3897, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.7159946207405157, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.2281473876099822e-05, |
|
"loss": 0.3981, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 1.7246708457150293, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.2214483813524429e-05, |
|
"loss": 0.4172, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7333470706895429, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.2147389082889328e-05, |
|
"loss": 0.398, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 1.7420232956640564, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.2080192855441572e-05, |
|
"loss": 0.3901, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.75069952063857, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.2012898307225482e-05, |
|
"loss": 0.3865, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 1.7593757456130836, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.1945508618932537e-05, |
|
"loss": 0.3904, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.7593757456130836, |
|
"eval_loss": 0.707400918006897, |
|
"eval_runtime": 18.7714, |
|
"eval_samples_per_second": 23.44, |
|
"eval_steps_per_second": 23.44, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.7680519705875972, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.1878026975751033e-05, |
|
"loss": 0.3987, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 1.7767281955621108, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.1810456567215525e-05, |
|
"loss": 0.3977, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.7854044205366244, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.1742800587056092e-05, |
|
"loss": 0.3913, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 1.794080645511138, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.1675062233047365e-05, |
|
"loss": 0.3835, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.8027568704856516, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.1607244706857404e-05, |
|
"loss": 0.3856, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 1.8114330954601652, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.1539351213896352e-05, |
|
"loss": 0.3835, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.8201093204346788, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.147138496316494e-05, |
|
"loss": 0.3901, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 1.8287855454091924, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.1403349167102806e-05, |
|
"loss": 0.3953, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.837461770383706, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1.1335247041436674e-05, |
|
"loss": 0.3911, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 1.8461379953582195, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.126708180502834e-05, |
|
"loss": 0.3765, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.8548142203327331, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.1198856679722548e-05, |
|
"loss": 0.3862, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 1.8634904453072467, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.1130574890194706e-05, |
|
"loss": 0.3838, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.8634904453072467, |
|
"eval_loss": 0.7026991248130798, |
|
"eval_runtime": 18.7531, |
|
"eval_samples_per_second": 23.463, |
|
"eval_steps_per_second": 23.463, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.8721666702817603, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.1062239663798466e-05, |
|
"loss": 0.3843, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 1.880842895256274, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.0993854230413183e-05, |
|
"loss": 0.3971, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.8895191202307875, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.092542182229126e-05, |
|
"loss": 0.378, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 1.898195345205301, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.085694567390537e-05, |
|
"loss": 0.3764, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.9068715701798147, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.0788429021795582e-05, |
|
"loss": 0.3705, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 1.9155477951543283, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.0719875104416373e-05, |
|
"loss": 0.3723, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.9242240201288419, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.0651287161983583e-05, |
|
"loss": 0.3778, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 1.9329002451033555, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.0582668436321244e-05, |
|
"loss": 0.3773, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.941576470077869, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.0514022170708374e-05, |
|
"loss": 0.3662, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 1.9502526950523826, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 1.044535160972566e-05, |
|
"loss": 0.3777, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.9589289200268962, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.0376659999102125e-05, |
|
"loss": 0.3775, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 1.9676051450014098, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.0307950585561705e-05, |
|
"loss": 0.3714, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.9676051450014098, |
|
"eval_loss": 0.7053300142288208, |
|
"eval_runtime": 19.1384, |
|
"eval_samples_per_second": 22.99, |
|
"eval_steps_per_second": 22.99, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.9762813699759234, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.0239226616669792e-05, |
|
"loss": 0.375, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 1.984957594950437, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.0170491340679744e-05, |
|
"loss": 0.3704, |
|
"step": 5750 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 11524, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 1441, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.495938899681739e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|