|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999750006249843, |
|
"global_step": 50000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 1.2e-06, |
|
"loss": 16.3268, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0003, |
|
"loss": 3.004, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0006, |
|
"loss": 1.3515, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0005999622383021625, |
|
"loss": 1.2607, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0005998489627149555, |
|
"loss": 1.2164, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0005996602017549024, |
|
"loss": 1.1909, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0005993960029415653, |
|
"loss": 1.1706, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0005990564327855827, |
|
"loss": 1.1549, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0005986415767719254, |
|
"loss": 1.1406, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0005981515393383762, |
|
"loss": 1.1303, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0005975864438492385, |
|
"loss": 1.1204, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 1.0444505214691162, |
|
"eval_runtime": 326.787, |
|
"eval_samples_per_second": 313.354, |
|
"eval_steps_per_second": 4.896, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0005969464325642798, |
|
"loss": 1.1114, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0005962316666029183, |
|
"loss": 1.1027, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0005954423259036624, |
|
"loss": 1.0964, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0005945786091788119, |
|
"loss": 1.0897, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0005936407338644336, |
|
"loss": 1.0835, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0005926289360656221, |
|
"loss": 1.0783, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0005915434704970625, |
|
"loss": 1.0738, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0005903846104189068, |
|
"loss": 1.0692, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0005891526475679825, |
|
"loss": 1.0643, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0005878478920843492, |
|
"loss": 1.0606, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.000662088394165, |
|
"eval_runtime": 326.7924, |
|
"eval_samples_per_second": 313.349, |
|
"eval_steps_per_second": 4.896, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0005864706724332221, |
|
"loss": 1.0559, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0005850213353222835, |
|
"loss": 1.0521, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0005835002456144005, |
|
"loss": 1.049, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0005819077862357724, |
|
"loss": 1.0454, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.000580244358079532, |
|
"loss": 1.042, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0005785103799048218, |
|
"loss": 1.0393, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0005767062882313744, |
|
"loss": 1.036, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0005748325372296208, |
|
"loss": 1.033, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0005728895986063555, |
|
"loss": 1.0308, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0005708779614859863, |
|
"loss": 1.0275, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 0.9750040173530579, |
|
"eval_runtime": 327.0193, |
|
"eval_samples_per_second": 313.131, |
|
"eval_steps_per_second": 4.893, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0005687981322874007, |
|
"loss": 1.0263, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.000566650634596477, |
|
"loss": 1.0236, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0005644360090342746, |
|
"loss": 1.0211, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0005621548131209354, |
|
"loss": 1.0189, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0005598076211353316, |
|
"loss": 1.0171, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.000557395023970493, |
|
"loss": 1.0155, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0005549176289848543, |
|
"loss": 1.0129, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0005523760598493544, |
|
"loss": 1.011, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0005497709563904314, |
|
"loss": 1.0097, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0005471029744289498, |
|
"loss": 1.0076, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.9587759971618652, |
|
"eval_runtime": 327.4691, |
|
"eval_samples_per_second": 312.701, |
|
"eval_steps_per_second": 4.886, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0005443727856151006, |
|
"loss": 1.0062, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0005415810772593175, |
|
"loss": 1.0044, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0005387285521592496, |
|
"loss": 1.0028, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.0005358159284228363, |
|
"loss": 1.0009, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.000532843939287527, |
|
"loss": 1.0001, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.0005298133329356933, |
|
"loss": 0.9994, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.0005267248723062775, |
|
"loss": 0.9973, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 0.0005235793349027264, |
|
"loss": 0.9954, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 0.0005203775125972599, |
|
"loss": 0.9946, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.000517120211431521, |
|
"loss": 0.9934, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.9469436407089233, |
|
"eval_runtime": 326.938, |
|
"eval_samples_per_second": 313.209, |
|
"eval_steps_per_second": 4.894, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.0005138082514136589, |
|
"loss": 0.9922, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.0005104424663118964, |
|
"loss": 0.9914, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.0005070237034446336, |
|
"loss": 0.9905, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.0005035528234671396, |
|
"loss": 0.9877, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.0005000307001548875, |
|
"loss": 0.9873, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.0004964582201835855, |
|
"loss": 0.9867, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.0004928362829059618, |
|
"loss": 0.986, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.0004891658001253567, |
|
"loss": 0.9839, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.00048544769586618153, |
|
"loss": 0.9832, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.00048168290614129995, |
|
"loss": 0.9828, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.9360936880111694, |
|
"eval_runtime": 326.912, |
|
"eval_samples_per_second": 313.234, |
|
"eval_steps_per_second": 4.894, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.00047787237871639213, |
|
"loss": 0.9809, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 0.0004740170728713594, |
|
"loss": 0.9805, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 0.0004701179591588311, |
|
"loss": 0.9797, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.00046617601915983307, |
|
"loss": 0.9785, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.00046219224523667927, |
|
"loss": 0.9776, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.00045816764028315066, |
|
"loss": 0.9773, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.0004541032174720219, |
|
"loss": 0.9767, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.00045, |
|
"loss": 0.976, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.00044585902083014057, |
|
"loss": 0.975, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.0004416813224318048, |
|
"loss": 0.9738, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 0.9288003444671631, |
|
"eval_runtime": 326.8909, |
|
"eval_samples_per_second": 313.254, |
|
"eval_steps_per_second": 4.895, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.00043746795651822306, |
|
"loss": 0.9733, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.0004332199837817322, |
|
"loss": 0.9714, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.0004289384736267515, |
|
"loss": 0.9707, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 0.00042462450390056593, |
|
"loss": 0.9701, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 0.0004202791606219841, |
|
"loss": 0.9699, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.0004159035377079385, |
|
"loss": 0.9687, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.0004114987366980982, |
|
"loss": 0.9684, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.0004070658664775615, |
|
"loss": 0.9674, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.00040260604299770063, |
|
"loss": 0.9675, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.0003981203889952265, |
|
"loss": 0.9669, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.9225976467132568, |
|
"eval_runtime": 326.7616, |
|
"eval_samples_per_second": 313.378, |
|
"eval_steps_per_second": 4.897, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.0003936100337095461, |
|
"loss": 0.9654, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.0003890761125984825, |
|
"loss": 0.9656, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.0003845197670524289, |
|
"loss": 0.9641, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.0003799421441070104, |
|
"loss": 0.9648, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.0003753443961543237, |
|
"loss": 0.9629, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.0003707276806528282, |
|
"loss": 0.9628, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.0003660931598359622, |
|
"loss": 0.9618, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.0003614420004195572, |
|
"loss": 0.9612, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.000356775373308123, |
|
"loss": 0.9608, |
|
"step": 22250 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.0003520944533000791, |
|
"loss": 0.9607, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 0.9160612225532532, |
|
"eval_runtime": 326.8283, |
|
"eval_samples_per_second": 313.314, |
|
"eval_steps_per_second": 4.896, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.00034740041879200497, |
|
"loss": 0.9596, |
|
"step": 22750 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.00034269445148198553, |
|
"loss": 0.9583, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.00033797773607212474, |
|
"loss": 0.9591, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.0003332514599703033, |
|
"loss": 0.9589, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.0003285168129912547, |
|
"loss": 0.9581, |
|
"step": 23750 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.0003237749870570365, |
|
"loss": 0.9573, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.0003190271758969693, |
|
"loss": 0.956, |
|
"step": 24250 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.00031427457474712274, |
|
"loss": 0.9565, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.0003095183800494203, |
|
"loss": 0.9563, |
|
"step": 24750 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.00030475978915044235, |
|
"loss": 0.9556, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.9126129150390625, |
|
"eval_runtime": 326.9927, |
|
"eval_samples_per_second": 313.157, |
|
"eval_steps_per_second": 4.893, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.0003, |
|
"loss": 0.9542, |
|
"step": 25250 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.0002952402108495576, |
|
"loss": 0.9544, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.00029048161995057974, |
|
"loss": 0.9541, |
|
"step": 25750 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 0.0002857254252528773, |
|
"loss": 0.9529, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 0.00028097282410303066, |
|
"loss": 0.9527, |
|
"step": 26250 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 0.0002762250129429634, |
|
"loss": 0.9525, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 0.00027148318700874523, |
|
"loss": 0.9522, |
|
"step": 26750 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.0002667485400296967, |
|
"loss": 0.9509, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.00026202226392787515, |
|
"loss": 0.9513, |
|
"step": 27250 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.0002573055485180145, |
|
"loss": 0.9515, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 0.9075026512145996, |
|
"eval_runtime": 327.309, |
|
"eval_samples_per_second": 312.854, |
|
"eval_steps_per_second": 4.888, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.000252599581207995, |
|
"loss": 0.9514, |
|
"step": 27750 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.0002479055466999209, |
|
"loss": 0.9499, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.00024322462669187702, |
|
"loss": 0.95, |
|
"step": 28250 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 0.0002385579995804428, |
|
"loss": 0.9494, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 0.00023390684016403777, |
|
"loss": 0.9495, |
|
"step": 28750 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.00022927231934717176, |
|
"loss": 0.9488, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.00022465560384567624, |
|
"loss": 0.9483, |
|
"step": 29250 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.00022005785589298952, |
|
"loss": 0.9475, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.00021548023294757105, |
|
"loss": 0.9472, |
|
"step": 29750 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 0.00021092388740151762, |
|
"loss": 0.9471, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.9042757153511047, |
|
"eval_runtime": 326.9107, |
|
"eval_samples_per_second": 313.235, |
|
"eval_steps_per_second": 4.894, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 0.00020638996629045387, |
|
"loss": 0.9469, |
|
"step": 30250 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 0.0002018796110047735, |
|
"loss": 0.946, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 0.00019739395700229937, |
|
"loss": 0.9452, |
|
"step": 30750 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 0.00019293413352243846, |
|
"loss": 0.9462, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 0.00018850126330190176, |
|
"loss": 0.945, |
|
"step": 31250 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 0.00018409646229206137, |
|
"loss": 0.9448, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 0.00017972083937801593, |
|
"loss": 0.9442, |
|
"step": 31750 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.0001753754960994341, |
|
"loss": 0.9443, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.0001710615263732485, |
|
"loss": 0.9446, |
|
"step": 32250 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.00016678001621826772, |
|
"loss": 0.9434, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 0.9014184474945068, |
|
"eval_runtime": 326.8232, |
|
"eval_samples_per_second": 313.319, |
|
"eval_steps_per_second": 4.896, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.00016253204348177686, |
|
"loss": 0.9428, |
|
"step": 32750 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 0.00015831867756819522, |
|
"loss": 0.9431, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 0.00015414097916985944, |
|
"loss": 0.9432, |
|
"step": 33250 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 0.00015000000000000004, |
|
"loss": 0.9424, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 0.00014589678252797817, |
|
"loss": 0.9425, |
|
"step": 33750 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 0.00014183235971684924, |
|
"loss": 0.9415, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 0.00013780775476332082, |
|
"loss": 0.9412, |
|
"step": 34250 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 0.0001338239808401669, |
|
"loss": 0.9425, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 0.0001298820408411688, |
|
"loss": 0.9413, |
|
"step": 34750 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.00012598292712864058, |
|
"loss": 0.9409, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.8982858061790466, |
|
"eval_runtime": 326.9208, |
|
"eval_samples_per_second": 313.226, |
|
"eval_steps_per_second": 4.894, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.0001221276212836079, |
|
"loss": 0.9406, |
|
"step": 35250 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 0.00011831709385870004, |
|
"loss": 0.9407, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 0.0001145523041338184, |
|
"loss": 0.9399, |
|
"step": 35750 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 0.00011083419987464334, |
|
"loss": 0.9394, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 0.00010716371709403818, |
|
"loss": 0.9399, |
|
"step": 36250 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 0.00010354177981641449, |
|
"loss": 0.94, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 9.996929984511254e-05, |
|
"loss": 0.9393, |
|
"step": 36750 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 9.644717653286037e-05, |
|
"loss": 0.9389, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 9.297629655536644e-05, |
|
"loss": 0.939, |
|
"step": 37250 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 8.955753368810358e-05, |
|
"loss": 0.939, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.895902693271637, |
|
"eval_runtime": 326.825, |
|
"eval_samples_per_second": 313.318, |
|
"eval_steps_per_second": 4.896, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 8.619174858634122e-05, |
|
"loss": 0.9385, |
|
"step": 37750 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 8.287978856847894e-05, |
|
"loss": 0.9378, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 7.962248740274003e-05, |
|
"loss": 0.9383, |
|
"step": 38250 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 7.642066509727359e-05, |
|
"loss": 0.9376, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 7.327512769372254e-05, |
|
"loss": 0.9375, |
|
"step": 38750 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 7.018666706430662e-05, |
|
"loss": 0.9378, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 6.715606071247291e-05, |
|
"loss": 0.9377, |
|
"step": 39250 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 6.418407157716381e-05, |
|
"loss": 0.9374, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 6.127144784075033e-05, |
|
"loss": 0.937, |
|
"step": 39750 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 5.841892274068241e-05, |
|
"loss": 0.9368, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.8947132229804993, |
|
"eval_runtime": 326.8211, |
|
"eval_samples_per_second": 313.321, |
|
"eval_steps_per_second": 4.896, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 5.562721438489928e-05, |
|
"loss": 0.937, |
|
"step": 40250 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 5.2897025571050186e-05, |
|
"loss": 0.9365, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 5.022904360956861e-05, |
|
"loss": 0.9355, |
|
"step": 40750 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 4.76239401506456e-05, |
|
"loss": 0.9369, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 4.5082371015145716e-05, |
|
"loss": 0.9363, |
|
"step": 41250 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 4.260497602950688e-05, |
|
"loss": 0.9351, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 4.019237886466838e-05, |
|
"loss": 0.9363, |
|
"step": 41750 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 3.784518687906452e-05, |
|
"loss": 0.9354, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 3.556399096572541e-05, |
|
"loss": 0.936, |
|
"step": 42250 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 3.3349365403522986e-05, |
|
"loss": 0.9358, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 0.8933549523353577, |
|
"eval_runtime": 327.2967, |
|
"eval_samples_per_second": 312.866, |
|
"eval_steps_per_second": 4.889, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 3.120186771259927e-05, |
|
"loss": 0.9356, |
|
"step": 42750 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 2.9122038514013678e-05, |
|
"loss": 0.9364, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 2.7110401393644464e-05, |
|
"loss": 0.9352, |
|
"step": 43250 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 2.516746277037912e-05, |
|
"loss": 0.9353, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 2.329371176862562e-05, |
|
"loss": 0.9352, |
|
"step": 43750 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 2.148962009517823e-05, |
|
"loss": 0.9353, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 1.9755641920468003e-05, |
|
"loss": 0.9356, |
|
"step": 44250 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 1.8092213764227503e-05, |
|
"loss": 0.9354, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 1.6499754385599462e-05, |
|
"loss": 0.9362, |
|
"step": 44750 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 1.4978664677716402e-05, |
|
"loss": 0.9353, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 0.8926950693130493, |
|
"eval_runtime": 326.7059, |
|
"eval_samples_per_second": 313.432, |
|
"eval_steps_per_second": 4.897, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 1.3529327566777836e-05, |
|
"loss": 0.9346, |
|
"step": 45250 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 1.2152107915650821e-05, |
|
"loss": 0.9355, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 1.0847352432017387e-05, |
|
"loss": 0.9345, |
|
"step": 45750 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 9.615389581093124e-06, |
|
"loss": 0.9354, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 8.456529502937504e-06, |
|
"loss": 0.9346, |
|
"step": 46250 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 7.371063934377885e-06, |
|
"loss": 0.9353, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 6.35926613556641e-06, |
|
"loss": 0.9341, |
|
"step": 46750 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 5.421390821187988e-06, |
|
"loss": 0.9343, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 4.557674096337593e-06, |
|
"loss": 0.9346, |
|
"step": 47250 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 3.768333397081713e-06, |
|
"loss": 0.9347, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 0.8924221992492676, |
|
"eval_runtime": 326.8364, |
|
"eval_samples_per_second": 313.307, |
|
"eval_steps_per_second": 4.895, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 3.0535674357201944e-06, |
|
"loss": 0.9347, |
|
"step": 47750 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 2.4135561507613975e-06, |
|
"loss": 0.9349, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 1.848460661623763e-06, |
|
"loss": 0.9349, |
|
"step": 48250 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 1.3584232280746231e-06, |
|
"loss": 0.9343, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 9.435672144173178e-07, |
|
"loss": 0.9343, |
|
"step": 48750 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 6.03997058434702e-07, |
|
"loss": 0.9339, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 3.397982450976111e-07, |
|
"loss": 0.9347, |
|
"step": 49250 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 1.5103728504447522e-07, |
|
"loss": 0.9346, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 3.776169783747951e-08, |
|
"loss": 0.9344, |
|
"step": 49750 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.9338, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.8922701478004456, |
|
"eval_runtime": 326.6115, |
|
"eval_samples_per_second": 313.522, |
|
"eval_steps_per_second": 4.899, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 50000, |
|
"total_flos": 1.147093508736575e+19, |
|
"train_loss": 0.9890500741958618, |
|
"train_runtime": 250778.2737, |
|
"train_samples_per_second": 102.082, |
|
"train_steps_per_second": 0.199 |
|
} |
|
], |
|
"max_steps": 50000, |
|
"num_train_epochs": 1, |
|
"total_flos": 1.147093508736575e+19, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|