{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1730,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07,
      "grad_norm": 5.136181831359863,
      "learning_rate": 2.8901734104046245e-05,
      "loss": 3.3442,
      "step": 25
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.936872959136963,
      "learning_rate": 5.780346820809249e-05,
      "loss": 1.9328,
      "step": 50
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.958111047744751,
      "learning_rate": 8.670520231213874e-05,
      "loss": 1.0726,
      "step": 75
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.6158933639526367,
      "learning_rate": 0.00011560693641618498,
      "loss": 0.843,
      "step": 100
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.173802375793457,
      "learning_rate": 0.00014450867052023122,
      "loss": 0.7975,
      "step": 125
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.179237127304077,
      "learning_rate": 0.00017341040462427748,
      "loss": 0.7632,
      "step": 150
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.171823024749756,
      "learning_rate": 0.00019974309569685292,
      "loss": 0.7023,
      "step": 175
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.7833949327468872,
      "learning_rate": 0.00019653179190751445,
      "loss": 0.7452,
      "step": 200
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.8641424179077148,
      "learning_rate": 0.00019332048811817598,
      "loss": 0.6986,
      "step": 225
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.020118474960327,
      "learning_rate": 0.0001901091843288375,
      "loss": 0.6461,
      "step": 250
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.5515590906143188,
      "learning_rate": 0.00018689788053949903,
      "loss": 0.6287,
      "step": 275
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.8482364416122437,
      "learning_rate": 0.00018368657675016056,
      "loss": 0.5941,
      "step": 300
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.901448130607605,
      "learning_rate": 0.00018047527296082209,
      "loss": 0.5869,
      "step": 325
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.3397067785263062,
      "learning_rate": 0.0001772639691714836,
      "loss": 0.576,
      "step": 350
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.5321860313415527,
      "learning_rate": 0.00017405266538214514,
      "loss": 0.5475,
      "step": 375
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.7586708068847656,
      "learning_rate": 0.0001708413615928067,
      "loss": 0.5372,
      "step": 400
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.6688508987426758,
      "learning_rate": 0.00016763005780346822,
      "loss": 0.5429,
      "step": 425
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.8364487886428833,
      "learning_rate": 0.00016441875401412975,
      "loss": 0.539,
      "step": 450
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.682422399520874,
      "learning_rate": 0.00016120745022479128,
      "loss": 0.4863,
      "step": 475
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.8813568353652954,
      "learning_rate": 0.0001579961464354528,
      "loss": 0.5587,
      "step": 500
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.593272089958191,
      "learning_rate": 0.00015478484264611433,
      "loss": 0.5121,
      "step": 525
    },
    {
      "epoch": 1.59,
      "grad_norm": 1.3057204484939575,
      "learning_rate": 0.00015157353885677586,
      "loss": 0.4958,
      "step": 550
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.764782190322876,
      "learning_rate": 0.00014836223506743738,
      "loss": 0.4538,
      "step": 575
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.9039567708969116,
      "learning_rate": 0.0001451509312780989,
      "loss": 0.4897,
      "step": 600
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.793358325958252,
      "learning_rate": 0.00014193962748876044,
      "loss": 0.4601,
      "step": 625
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.4797160625457764,
      "learning_rate": 0.00013872832369942197,
      "loss": 0.5012,
      "step": 650
    },
    {
      "epoch": 1.95,
      "grad_norm": 1.8706116676330566,
      "learning_rate": 0.0001355170199100835,
      "loss": 0.463,
      "step": 675
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.2917760610580444,
      "learning_rate": 0.00013230571612074502,
      "loss": 0.4608,
      "step": 700
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.67048978805542,
      "learning_rate": 0.00012909441233140655,
      "loss": 0.4104,
      "step": 725
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.5852957963943481,
      "learning_rate": 0.00012588310854206808,
      "loss": 0.3789,
      "step": 750
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.431355595588684,
      "learning_rate": 0.0001226718047527296,
      "loss": 0.3824,
      "step": 775
    },
    {
      "epoch": 2.31,
      "grad_norm": 1.3853150606155396,
      "learning_rate": 0.00011946050096339114,
      "loss": 0.4049,
      "step": 800
    },
    {
      "epoch": 2.38,
      "grad_norm": 1.5978338718414307,
      "learning_rate": 0.00011624919717405267,
      "loss": 0.3867,
      "step": 825
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.5443283319473267,
      "learning_rate": 0.0001130378933847142,
      "loss": 0.3646,
      "step": 850
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.928194284439087,
      "learning_rate": 0.00010982658959537572,
      "loss": 0.3898,
      "step": 875
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.7459748983383179,
      "learning_rate": 0.00010661528580603725,
      "loss": 0.3728,
      "step": 900
    },
    {
      "epoch": 2.67,
      "grad_norm": 1.5855330228805542,
      "learning_rate": 0.00010340398201669879,
      "loss": 0.3829,
      "step": 925
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.3970378637313843,
      "learning_rate": 0.00010019267822736032,
      "loss": 0.361,
      "step": 950
    },
    {
      "epoch": 2.82,
      "grad_norm": 1.7334144115447998,
      "learning_rate": 9.698137443802185e-05,
      "loss": 0.3767,
      "step": 975
    },
    {
      "epoch": 2.89,
      "grad_norm": 1.5465258359909058,
      "learning_rate": 9.377007064868337e-05,
      "loss": 0.3614,
      "step": 1000
    },
    {
      "epoch": 2.96,
      "grad_norm": 1.5643867254257202,
      "learning_rate": 9.05587668593449e-05,
      "loss": 0.3956,
      "step": 1025
    },
    {
      "epoch": 3.03,
      "grad_norm": 1.9123274087905884,
      "learning_rate": 8.734746307000643e-05,
      "loss": 0.3449,
      "step": 1050
    },
    {
      "epoch": 3.11,
      "grad_norm": 2.7245140075683594,
      "learning_rate": 8.413615928066796e-05,
      "loss": 0.2816,
      "step": 1075
    },
    {
      "epoch": 3.18,
      "grad_norm": 1.991822600364685,
      "learning_rate": 8.092485549132948e-05,
      "loss": 0.2756,
      "step": 1100
    },
    {
      "epoch": 3.25,
      "grad_norm": 2.225388765335083,
      "learning_rate": 7.771355170199101e-05,
      "loss": 0.2869,
      "step": 1125
    },
    {
      "epoch": 3.32,
      "grad_norm": 2.127760648727417,
      "learning_rate": 7.450224791265255e-05,
      "loss": 0.2746,
      "step": 1150
    },
    {
      "epoch": 3.4,
      "grad_norm": 2.051787853240967,
      "learning_rate": 7.129094412331408e-05,
      "loss": 0.2723,
      "step": 1175
    },
    {
      "epoch": 3.47,
      "grad_norm": 2.1885359287261963,
      "learning_rate": 6.80796403339756e-05,
      "loss": 0.2849,
      "step": 1200
    },
    {
      "epoch": 3.54,
      "grad_norm": 1.6504746675491333,
      "learning_rate": 6.486833654463712e-05,
      "loss": 0.2578,
      "step": 1225
    },
    {
      "epoch": 3.61,
      "grad_norm": 1.753282904624939,
      "learning_rate": 6.165703275529865e-05,
      "loss": 0.3017,
      "step": 1250
    },
    {
      "epoch": 3.68,
      "grad_norm": 2.5774452686309814,
      "learning_rate": 5.844572896596018e-05,
      "loss": 0.275,
      "step": 1275
    },
    {
      "epoch": 3.76,
      "grad_norm": 1.954413652420044,
      "learning_rate": 5.523442517662171e-05,
      "loss": 0.2934,
      "step": 1300
    },
    {
      "epoch": 3.83,
      "grad_norm": 1.5767195224761963,
      "learning_rate": 5.2023121387283234e-05,
      "loss": 0.2885,
      "step": 1325
    },
    {
      "epoch": 3.9,
      "grad_norm": 1.8420324325561523,
      "learning_rate": 4.881181759794477e-05,
      "loss": 0.2735,
      "step": 1350
    },
    {
      "epoch": 3.97,
      "grad_norm": 2.003744602203369,
      "learning_rate": 4.56005138086063e-05,
      "loss": 0.2692,
      "step": 1375
    },
    {
      "epoch": 4.05,
      "grad_norm": 1.7634247541427612,
      "learning_rate": 4.238921001926782e-05,
      "loss": 0.253,
      "step": 1400
    },
    {
      "epoch": 4.12,
      "grad_norm": 2.338707447052002,
      "learning_rate": 3.917790622992935e-05,
      "loss": 0.1929,
      "step": 1425
    },
    {
      "epoch": 4.19,
      "grad_norm": 2.5318188667297363,
      "learning_rate": 3.596660244059088e-05,
      "loss": 0.2114,
      "step": 1450
    },
    {
      "epoch": 4.26,
      "grad_norm": 1.8917136192321777,
      "learning_rate": 3.275529865125241e-05,
      "loss": 0.2274,
      "step": 1475
    },
    {
      "epoch": 4.34,
      "grad_norm": 1.8130145072937012,
      "learning_rate": 2.9543994861913938e-05,
      "loss": 0.2101,
      "step": 1500
    },
    {
      "epoch": 4.41,
      "grad_norm": 1.923666000366211,
      "learning_rate": 2.6332691072575465e-05,
      "loss": 0.1908,
      "step": 1525
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.6899950504302979,
      "learning_rate": 2.3121387283236996e-05,
      "loss": 0.198,
      "step": 1550
    },
    {
      "epoch": 4.55,
      "grad_norm": 2.0312962532043457,
      "learning_rate": 1.9910083493898523e-05,
      "loss": 0.2109,
      "step": 1575
    },
    {
      "epoch": 4.62,
      "grad_norm": 2.349602460861206,
      "learning_rate": 1.6698779704560053e-05,
      "loss": 0.2136,
      "step": 1600
    },
    {
      "epoch": 4.7,
      "grad_norm": 1.9838098287582397,
      "learning_rate": 1.348747591522158e-05,
      "loss": 0.2251,
      "step": 1625
    },
    {
      "epoch": 4.77,
      "grad_norm": 3.0191967487335205,
      "learning_rate": 1.027617212588311e-05,
      "loss": 0.2107,
      "step": 1650
    },
    {
      "epoch": 4.84,
      "grad_norm": 1.7804679870605469,
      "learning_rate": 7.064868336544637e-06,
      "loss": 0.1865,
      "step": 1675
    },
    {
      "epoch": 4.91,
      "grad_norm": 2.1056630611419678,
      "learning_rate": 3.853564547206165e-06,
      "loss": 0.2099,
      "step": 1700
    },
    {
      "epoch": 4.99,
      "grad_norm": 1.8963791131973267,
      "learning_rate": 6.422607578676943e-07,
      "loss": 0.201,
      "step": 1725
    }
  ],
  "logging_steps": 25,
  "max_steps": 1730,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 7.5248089823232e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}