|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.3431232091690544, |
|
"eval_steps": 2500, |
|
"global_step": 60000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01119269340974212, |
|
"grad_norm": 2.299727201461792, |
|
"learning_rate": 4.981345510983763e-05, |
|
"loss": 1.8848, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.02238538681948424, |
|
"grad_norm": 1.9952893257141113, |
|
"learning_rate": 4.962691021967526e-05, |
|
"loss": 1.7595, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03357808022922636, |
|
"grad_norm": 2.1056811809539795, |
|
"learning_rate": 4.944036532951289e-05, |
|
"loss": 1.6994, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04477077363896848, |
|
"grad_norm": 2.0474352836608887, |
|
"learning_rate": 4.925382043935053e-05, |
|
"loss": 1.6629, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0559634670487106, |
|
"grad_norm": 1.9989269971847534, |
|
"learning_rate": 4.906727554918816e-05, |
|
"loss": 1.6236, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.0559634670487106, |
|
"eval_accuracy": 0.5569847646608951, |
|
"eval_loss": 2.425182342529297, |
|
"eval_runtime": 707.8445, |
|
"eval_samples_per_second": 91.796, |
|
"eval_steps_per_second": 3.826, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06715616045845273, |
|
"grad_norm": 1.879557490348816, |
|
"learning_rate": 4.888073065902579e-05, |
|
"loss": 1.5991, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.07834885386819485, |
|
"grad_norm": 1.9889895915985107, |
|
"learning_rate": 4.869418576886342e-05, |
|
"loss": 1.5751, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.08954154727793696, |
|
"grad_norm": 1.910925269126892, |
|
"learning_rate": 4.8507640878701055e-05, |
|
"loss": 1.5587, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.10073424068767908, |
|
"grad_norm": 1.9268312454223633, |
|
"learning_rate": 4.8321095988538685e-05, |
|
"loss": 1.546, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.1119269340974212, |
|
"grad_norm": 1.8074718713760376, |
|
"learning_rate": 4.8134551098376315e-05, |
|
"loss": 1.5301, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.1119269340974212, |
|
"eval_accuracy": 0.566450867740456, |
|
"eval_loss": 2.3531110286712646, |
|
"eval_runtime": 716.3757, |
|
"eval_samples_per_second": 90.702, |
|
"eval_steps_per_second": 3.78, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.12311962750716332, |
|
"grad_norm": 1.8723756074905396, |
|
"learning_rate": 4.7948006208213945e-05, |
|
"loss": 1.5153, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.13431232091690545, |
|
"grad_norm": 1.8938133716583252, |
|
"learning_rate": 4.7761461318051575e-05, |
|
"loss": 1.5051, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.14550501432664756, |
|
"grad_norm": 1.8093421459197998, |
|
"learning_rate": 4.757491642788921e-05, |
|
"loss": 1.4922, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.1566977077363897, |
|
"grad_norm": 1.8811379671096802, |
|
"learning_rate": 4.738837153772684e-05, |
|
"loss": 1.4841, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.1678904011461318, |
|
"grad_norm": 1.8162873983383179, |
|
"learning_rate": 4.720182664756447e-05, |
|
"loss": 1.4664, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.1678904011461318, |
|
"eval_accuracy": 0.5726688422748262, |
|
"eval_loss": 2.2988929748535156, |
|
"eval_runtime": 706.3059, |
|
"eval_samples_per_second": 91.996, |
|
"eval_steps_per_second": 3.834, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.17908309455587393, |
|
"grad_norm": 1.861790418624878, |
|
"learning_rate": 4.70152817574021e-05, |
|
"loss": 1.4613, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.19027578796561603, |
|
"grad_norm": 1.7351659536361694, |
|
"learning_rate": 4.682873686723974e-05, |
|
"loss": 1.4554, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.20146848137535817, |
|
"grad_norm": 1.796727180480957, |
|
"learning_rate": 4.664219197707737e-05, |
|
"loss": 1.4469, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.2126611747851003, |
|
"grad_norm": 1.751111388206482, |
|
"learning_rate": 4.6455647086915e-05, |
|
"loss": 1.4405, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.2238538681948424, |
|
"grad_norm": 1.793644905090332, |
|
"learning_rate": 4.626910219675263e-05, |
|
"loss": 1.4314, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.2238538681948424, |
|
"eval_accuracy": 0.5781162212828304, |
|
"eval_loss": 2.257195472717285, |
|
"eval_runtime": 709.7465, |
|
"eval_samples_per_second": 91.55, |
|
"eval_steps_per_second": 3.815, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.23504656160458454, |
|
"grad_norm": 1.7030937671661377, |
|
"learning_rate": 4.6082557306590264e-05, |
|
"loss": 1.425, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.24623925501432664, |
|
"grad_norm": 1.7245328426361084, |
|
"learning_rate": 4.5896012416427894e-05, |
|
"loss": 1.4206, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.25743194842406875, |
|
"grad_norm": 1.7355397939682007, |
|
"learning_rate": 4.570946752626552e-05, |
|
"loss": 1.409, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.2686246418338109, |
|
"grad_norm": 1.7283306121826172, |
|
"learning_rate": 4.5522922636103154e-05, |
|
"loss": 1.4086, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.279817335243553, |
|
"grad_norm": 1.7133527994155884, |
|
"learning_rate": 4.5336377745940784e-05, |
|
"loss": 1.4042, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.279817335243553, |
|
"eval_accuracy": 0.5822483255357088, |
|
"eval_loss": 2.2244207859039307, |
|
"eval_runtime": 708.1859, |
|
"eval_samples_per_second": 91.751, |
|
"eval_steps_per_second": 3.824, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.2910100286532951, |
|
"grad_norm": 1.688602328300476, |
|
"learning_rate": 4.514983285577842e-05, |
|
"loss": 1.3952, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.3022027220630373, |
|
"grad_norm": 1.6839321851730347, |
|
"learning_rate": 4.4963287965616043e-05, |
|
"loss": 1.3932, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.3133954154727794, |
|
"grad_norm": 1.7225844860076904, |
|
"learning_rate": 4.477674307545368e-05, |
|
"loss": 1.3839, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.3245881088825215, |
|
"grad_norm": 1.6329905986785889, |
|
"learning_rate": 4.459019818529131e-05, |
|
"loss": 1.3856, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.3357808022922636, |
|
"grad_norm": 1.7012953758239746, |
|
"learning_rate": 4.440365329512895e-05, |
|
"loss": 1.3771, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.3357808022922636, |
|
"eval_accuracy": 0.586269614225024, |
|
"eval_loss": 2.1954798698425293, |
|
"eval_runtime": 718.0126, |
|
"eval_samples_per_second": 90.496, |
|
"eval_steps_per_second": 3.772, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.34697349570200575, |
|
"grad_norm": 1.6593496799468994, |
|
"learning_rate": 4.421710840496657e-05, |
|
"loss": 1.376, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.35816618911174786, |
|
"grad_norm": 1.6412550210952759, |
|
"learning_rate": 4.4030563514804206e-05, |
|
"loss": 1.3712, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.36935888252148996, |
|
"grad_norm": 1.6455302238464355, |
|
"learning_rate": 4.3844018624641836e-05, |
|
"loss": 1.3699, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.38055157593123207, |
|
"grad_norm": 1.6210881471633911, |
|
"learning_rate": 4.3657473734479466e-05, |
|
"loss": 1.3618, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.3917442693409742, |
|
"grad_norm": 1.6821410655975342, |
|
"learning_rate": 4.3470928844317096e-05, |
|
"loss": 1.3563, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.3917442693409742, |
|
"eval_accuracy": 0.589598096204646, |
|
"eval_loss": 2.168947219848633, |
|
"eval_runtime": 707.8628, |
|
"eval_samples_per_second": 91.793, |
|
"eval_steps_per_second": 3.826, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.40293696275071633, |
|
"grad_norm": 1.719738245010376, |
|
"learning_rate": 4.3284383954154726e-05, |
|
"loss": 1.3585, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.41412965616045844, |
|
"grad_norm": 1.660507321357727, |
|
"learning_rate": 4.309783906399236e-05, |
|
"loss": 1.3502, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.4253223495702006, |
|
"grad_norm": 1.7758148908615112, |
|
"learning_rate": 4.291129417382999e-05, |
|
"loss": 1.3459, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.4365150429799427, |
|
"grad_norm": 1.6665699481964111, |
|
"learning_rate": 4.272474928366762e-05, |
|
"loss": 1.3435, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.4477077363896848, |
|
"grad_norm": 1.6364027261734009, |
|
"learning_rate": 4.253820439350525e-05, |
|
"loss": 1.3401, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.4477077363896848, |
|
"eval_accuracy": 0.5922608205511055, |
|
"eval_loss": 2.1485562324523926, |
|
"eval_runtime": 704.3891, |
|
"eval_samples_per_second": 92.246, |
|
"eval_steps_per_second": 3.844, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.4589004297994269, |
|
"grad_norm": 9.470758438110352, |
|
"learning_rate": 4.426374462750716e-05, |
|
"loss": 8.0235, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.4700931232091691, |
|
"grad_norm": 9.91232967376709, |
|
"learning_rate": 4.412383595988539e-05, |
|
"loss": 7.9603, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.4812858166189112, |
|
"grad_norm": 9.734143257141113, |
|
"learning_rate": 4.398392729226361e-05, |
|
"loss": 7.9793, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.4924785100286533, |
|
"grad_norm": 9.574400901794434, |
|
"learning_rate": 4.3844018624641836e-05, |
|
"loss": 7.9731, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.5036712034383954, |
|
"grad_norm": 10.017444610595703, |
|
"learning_rate": 4.370410995702006e-05, |
|
"loss": 7.9335, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.5036712034383954, |
|
"eval_accuracy": 0.5952892349509148, |
|
"eval_loss": 2.1270551681518555, |
|
"eval_runtime": 525.2336, |
|
"eval_samples_per_second": 123.711, |
|
"eval_steps_per_second": 2.578, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.5148638968481375, |
|
"grad_norm": 9.66054916381836, |
|
"learning_rate": 4.356420128939828e-05, |
|
"loss": 7.9224, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.5260565902578797, |
|
"grad_norm": 9.483991622924805, |
|
"learning_rate": 4.342429262177651e-05, |
|
"loss": 7.9197, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.5372492836676218, |
|
"grad_norm": 9.803547859191895, |
|
"learning_rate": 4.3284383954154726e-05, |
|
"loss": 7.8932, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.5484419770773639, |
|
"grad_norm": 9.597293853759766, |
|
"learning_rate": 4.3144475286532955e-05, |
|
"loss": 7.8622, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.559634670487106, |
|
"grad_norm": 9.679096221923828, |
|
"learning_rate": 4.300456661891118e-05, |
|
"loss": 7.8644, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.559634670487106, |
|
"eval_accuracy": 0.5979848217689566, |
|
"eval_loss": 2.106226682662964, |
|
"eval_runtime": 528.4389, |
|
"eval_samples_per_second": 122.96, |
|
"eval_steps_per_second": 2.562, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.5708273638968482, |
|
"grad_norm": 9.878997802734375, |
|
"learning_rate": 4.28646579512894e-05, |
|
"loss": 7.8388, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.5820200573065902, |
|
"grad_norm": 9.320840835571289, |
|
"learning_rate": 4.272474928366762e-05, |
|
"loss": 7.8199, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.5932127507163324, |
|
"grad_norm": 9.581457138061523, |
|
"learning_rate": 4.2584840616045845e-05, |
|
"loss": 7.8194, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.6044054441260746, |
|
"grad_norm": 9.690735816955566, |
|
"learning_rate": 4.2444931948424074e-05, |
|
"loss": 7.8147, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.6155981375358166, |
|
"grad_norm": 9.55455207824707, |
|
"learning_rate": 4.230502328080229e-05, |
|
"loss": 7.7927, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.6155981375358166, |
|
"eval_accuracy": 0.599545230267029, |
|
"eval_loss": 2.09478497505188, |
|
"eval_runtime": 531.729, |
|
"eval_samples_per_second": 122.199, |
|
"eval_steps_per_second": 2.546, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.6267908309455588, |
|
"grad_norm": 9.352036476135254, |
|
"learning_rate": 4.216511461318052e-05, |
|
"loss": 7.7711, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.6379835243553008, |
|
"grad_norm": 9.413168907165527, |
|
"learning_rate": 4.202520594555874e-05, |
|
"loss": 7.7733, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.649176217765043, |
|
"grad_norm": 9.420402526855469, |
|
"learning_rate": 4.1885297277936964e-05, |
|
"loss": 7.74, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.6603689111747851, |
|
"grad_norm": 9.579030990600586, |
|
"learning_rate": 4.1745388610315186e-05, |
|
"loss": 7.7237, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.6715616045845272, |
|
"grad_norm": 12.816407203674316, |
|
"learning_rate": 4.160547994269341e-05, |
|
"loss": 7.7401, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.6715616045845272, |
|
"eval_accuracy": 0.6018761556694041, |
|
"eval_loss": 2.079362630844116, |
|
"eval_runtime": 531.1941, |
|
"eval_samples_per_second": 122.323, |
|
"eval_steps_per_second": 2.549, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.6827542979942693, |
|
"grad_norm": 9.477621078491211, |
|
"learning_rate": 4.146557127507164e-05, |
|
"loss": 7.717, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.6939469914040115, |
|
"grad_norm": 9.8326416015625, |
|
"learning_rate": 4.132566260744986e-05, |
|
"loss": 7.7148, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.7051396848137536, |
|
"grad_norm": 9.668205261230469, |
|
"learning_rate": 4.118575393982808e-05, |
|
"loss": 7.6845, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.7163323782234957, |
|
"grad_norm": 9.344961166381836, |
|
"learning_rate": 4.1045845272206305e-05, |
|
"loss": 7.673, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.7275250716332379, |
|
"grad_norm": 12.754666328430176, |
|
"learning_rate": 4.090593660458453e-05, |
|
"loss": 7.646, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.7275250716332379, |
|
"eval_accuracy": 0.6036969981017439, |
|
"eval_loss": 2.0638949871063232, |
|
"eval_runtime": 534.6839, |
|
"eval_samples_per_second": 121.524, |
|
"eval_steps_per_second": 2.532, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.7387177650429799, |
|
"grad_norm": 9.269234657287598, |
|
"learning_rate": 4.076602793696275e-05, |
|
"loss": 7.6452, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.7499104584527221, |
|
"grad_norm": 9.59334659576416, |
|
"learning_rate": 4.062611926934098e-05, |
|
"loss": 7.6369, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.7611031518624641, |
|
"grad_norm": 9.979016304016113, |
|
"learning_rate": 4.04862106017192e-05, |
|
"loss": 7.6306, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.7722958452722063, |
|
"grad_norm": 9.395634651184082, |
|
"learning_rate": 4.0346301934097424e-05, |
|
"loss": 7.6083, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.7834885386819485, |
|
"grad_norm": 9.377208709716797, |
|
"learning_rate": 4.0206393266475646e-05, |
|
"loss": 7.6113, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.7834885386819485, |
|
"eval_accuracy": 0.6060486530458662, |
|
"eval_loss": 2.046678066253662, |
|
"eval_runtime": 527.2553, |
|
"eval_samples_per_second": 123.236, |
|
"eval_steps_per_second": 2.568, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.7946812320916905, |
|
"grad_norm": 9.33324146270752, |
|
"learning_rate": 4.006648459885387e-05, |
|
"loss": 7.596, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.8058739255014327, |
|
"grad_norm": 10.012749671936035, |
|
"learning_rate": 3.992657593123209e-05, |
|
"loss": 7.5944, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.8170666189111748, |
|
"grad_norm": 9.17791748046875, |
|
"learning_rate": 3.9786667263610314e-05, |
|
"loss": 7.5724, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.8282593123209169, |
|
"grad_norm": 9.714068412780762, |
|
"learning_rate": 3.964675859598854e-05, |
|
"loss": 7.5716, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.839452005730659, |
|
"grad_norm": 9.122146606445312, |
|
"learning_rate": 3.9506849928366765e-05, |
|
"loss": 7.5428, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.839452005730659, |
|
"eval_accuracy": 0.6080310471813272, |
|
"eval_loss": 2.0341005325317383, |
|
"eval_runtime": 534.9624, |
|
"eval_samples_per_second": 121.461, |
|
"eval_steps_per_second": 2.531, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.8506446991404012, |
|
"grad_norm": 8.890284538269043, |
|
"learning_rate": 3.936694126074499e-05, |
|
"loss": 7.5108, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.8618373925501432, |
|
"grad_norm": 9.258638381958008, |
|
"learning_rate": 3.922703259312321e-05, |
|
"loss": 7.5283, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.8730300859598854, |
|
"grad_norm": 9.524474143981934, |
|
"learning_rate": 3.908712392550143e-05, |
|
"loss": 7.5168, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.8842227793696275, |
|
"grad_norm": 9.608149528503418, |
|
"learning_rate": 3.894721525787966e-05, |
|
"loss": 7.5206, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.8954154727793696, |
|
"grad_norm": 9.405288696289062, |
|
"learning_rate": 3.880730659025788e-05, |
|
"loss": 7.5039, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.8954154727793696, |
|
"eval_accuracy": 0.6094788673634718, |
|
"eval_loss": 2.0253567695617676, |
|
"eval_runtime": 535.0948, |
|
"eval_samples_per_second": 121.431, |
|
"eval_steps_per_second": 2.53, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.9066081661891118, |
|
"grad_norm": 8.706295013427734, |
|
"learning_rate": 3.8667397922636107e-05, |
|
"loss": 7.4819, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.9178008595988538, |
|
"grad_norm": 9.542219161987305, |
|
"learning_rate": 3.852748925501433e-05, |
|
"loss": 7.4888, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.928993553008596, |
|
"grad_norm": 9.111319541931152, |
|
"learning_rate": 3.838758058739255e-05, |
|
"loss": 7.4749, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.9401862464183381, |
|
"grad_norm": 9.335123062133789, |
|
"learning_rate": 3.824767191977078e-05, |
|
"loss": 7.4591, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.9513789398280802, |
|
"grad_norm": 9.537328720092773, |
|
"learning_rate": 3.8107763252148996e-05, |
|
"loss": 7.4533, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.9513789398280802, |
|
"eval_accuracy": 0.6107361281876699, |
|
"eval_loss": 2.0133583545684814, |
|
"eval_runtime": 530.2968, |
|
"eval_samples_per_second": 122.529, |
|
"eval_steps_per_second": 2.553, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.9625716332378224, |
|
"grad_norm": 9.227987289428711, |
|
"learning_rate": 3.7967854584527225e-05, |
|
"loss": 7.4506, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.9737643266475645, |
|
"grad_norm": 9.076460838317871, |
|
"learning_rate": 3.782794591690544e-05, |
|
"loss": 7.4557, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.9849570200573066, |
|
"grad_norm": 9.841446876525879, |
|
"learning_rate": 3.768803724928367e-05, |
|
"loss": 7.4319, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.9961497134670487, |
|
"grad_norm": 9.169388771057129, |
|
"learning_rate": 3.754812858166189e-05, |
|
"loss": 7.4453, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.0073424068767909, |
|
"grad_norm": 9.200368881225586, |
|
"learning_rate": 3.7408219914040115e-05, |
|
"loss": 7.4149, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.0073424068767909, |
|
"eval_accuracy": 0.6120696404224526, |
|
"eval_loss": 2.0035457611083984, |
|
"eval_runtime": 532.3092, |
|
"eval_samples_per_second": 122.066, |
|
"eval_steps_per_second": 2.544, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.018535100286533, |
|
"grad_norm": 9.189336776733398, |
|
"learning_rate": 3.7268311246418344e-05, |
|
"loss": 7.3981, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.029727793696275, |
|
"grad_norm": 9.504659652709961, |
|
"learning_rate": 3.712840257879656e-05, |
|
"loss": 7.4031, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.0409204871060171, |
|
"grad_norm": 9.516868591308594, |
|
"learning_rate": 3.698849391117479e-05, |
|
"loss": 7.3822, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.0521131805157593, |
|
"grad_norm": 9.417741775512695, |
|
"learning_rate": 3.6848585243553005e-05, |
|
"loss": 7.3887, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.0633058739255015, |
|
"grad_norm": 9.202630043029785, |
|
"learning_rate": 3.6708676575931234e-05, |
|
"loss": 7.379, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.0633058739255015, |
|
"eval_accuracy": 0.6134326919026999, |
|
"eval_loss": 1.9946683645248413, |
|
"eval_runtime": 529.4253, |
|
"eval_samples_per_second": 122.731, |
|
"eval_steps_per_second": 2.557, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.0744985673352436, |
|
"grad_norm": 9.18812084197998, |
|
"learning_rate": 3.6568767908309456e-05, |
|
"loss": 7.3648, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.0856912607449858, |
|
"grad_norm": 9.317421913146973, |
|
"learning_rate": 3.642885924068768e-05, |
|
"loss": 7.3581, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.0968839541547277, |
|
"grad_norm": 9.30117130279541, |
|
"learning_rate": 3.628895057306591e-05, |
|
"loss": 7.3242, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.1080766475644699, |
|
"grad_norm": 9.295071601867676, |
|
"learning_rate": 3.6149041905444124e-05, |
|
"loss": 7.3343, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.119269340974212, |
|
"grad_norm": 9.372967720031738, |
|
"learning_rate": 3.600913323782235e-05, |
|
"loss": 7.324, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.119269340974212, |
|
"eval_accuracy": 0.6151487162989436, |
|
"eval_loss": 1.9853588342666626, |
|
"eval_runtime": 528.6238, |
|
"eval_samples_per_second": 122.917, |
|
"eval_steps_per_second": 2.561, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.1304620343839542, |
|
"grad_norm": 10.693807601928711, |
|
"learning_rate": 3.5869224570200575e-05, |
|
"loss": 7.3164, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.1416547277936964, |
|
"grad_norm": 9.047393798828125, |
|
"learning_rate": 3.57293159025788e-05, |
|
"loss": 7.3028, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.1528474212034383, |
|
"grad_norm": 9.055428504943848, |
|
"learning_rate": 3.558940723495702e-05, |
|
"loss": 7.316, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.1640401146131805, |
|
"grad_norm": 8.821599960327148, |
|
"learning_rate": 3.544949856733524e-05, |
|
"loss": 7.2759, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.1752328080229226, |
|
"grad_norm": 8.971498489379883, |
|
"learning_rate": 3.530958989971347e-05, |
|
"loss": 7.3041, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.1752328080229226, |
|
"eval_accuracy": 0.6162336395081003, |
|
"eval_loss": 1.9736484289169312, |
|
"eval_runtime": 526.2473, |
|
"eval_samples_per_second": 123.472, |
|
"eval_steps_per_second": 2.573, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.1864255014326648, |
|
"grad_norm": 9.30490779876709, |
|
"learning_rate": 3.5169681232091694e-05, |
|
"loss": 7.2966, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.197618194842407, |
|
"grad_norm": 9.367337226867676, |
|
"learning_rate": 3.5029772564469917e-05, |
|
"loss": 7.2862, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.2088108882521489, |
|
"grad_norm": 9.002731323242188, |
|
"learning_rate": 3.488986389684814e-05, |
|
"loss": 7.2858, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.220003581661891, |
|
"grad_norm": 9.070691108703613, |
|
"learning_rate": 3.474995522922636e-05, |
|
"loss": 7.2692, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.2311962750716332, |
|
"grad_norm": 9.154426574707031, |
|
"learning_rate": 3.4610046561604584e-05, |
|
"loss": 7.262, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.2311962750716332, |
|
"eval_accuracy": 0.6174860367511424, |
|
"eval_loss": 1.9672149419784546, |
|
"eval_runtime": 529.8804, |
|
"eval_samples_per_second": 122.626, |
|
"eval_steps_per_second": 2.555, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.2423889684813754, |
|
"grad_norm": 9.364106178283691, |
|
"learning_rate": 3.447013789398281e-05, |
|
"loss": 7.2489, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 1.2535816618911175, |
|
"grad_norm": 9.267243385314941, |
|
"learning_rate": 3.4330229226361035e-05, |
|
"loss": 7.2664, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.2647743553008595, |
|
"grad_norm": 9.162137031555176, |
|
"learning_rate": 3.419032055873926e-05, |
|
"loss": 7.2475, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 1.2759670487106018, |
|
"grad_norm": 9.292202949523926, |
|
"learning_rate": 3.405041189111748e-05, |
|
"loss": 7.2357, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.2871597421203438, |
|
"grad_norm": 9.280839920043945, |
|
"learning_rate": 3.39105032234957e-05, |
|
"loss": 7.2169, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.2871597421203438, |
|
"eval_accuracy": 0.6184868881174969, |
|
"eval_loss": 1.9653985500335693, |
|
"eval_runtime": 531.1309, |
|
"eval_samples_per_second": 122.337, |
|
"eval_steps_per_second": 2.549, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.298352435530086, |
|
"grad_norm": 8.75936222076416, |
|
"learning_rate": 3.3770594555873925e-05, |
|
"loss": 7.2052, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.309545128939828, |
|
"grad_norm": 8.891804695129395, |
|
"learning_rate": 3.363068588825215e-05, |
|
"loss": 7.2258, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 1.3207378223495703, |
|
"grad_norm": 8.931051254272461, |
|
"learning_rate": 3.349077722063038e-05, |
|
"loss": 7.1899, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.3319305157593124, |
|
"grad_norm": 9.616579055786133, |
|
"learning_rate": 3.33508685530086e-05, |
|
"loss": 7.2068, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.3431232091690544, |
|
"grad_norm": 8.981892585754395, |
|
"learning_rate": 3.321095988538682e-05, |
|
"loss": 7.2084, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.3431232091690544, |
|
"eval_accuracy": 0.6190439375486008, |
|
"eval_loss": 1.9544332027435303, |
|
"eval_runtime": 528.8193, |
|
"eval_samples_per_second": 122.872, |
|
"eval_steps_per_second": 2.56, |
|
"step": 60000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 178688, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 2500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2799139827941786e+18, |
|
"train_batch_size": 24, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|