{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3431232091690544, "eval_steps": 2500, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01119269340974212, "grad_norm": 2.299727201461792, "learning_rate": 4.981345510983763e-05, "loss": 1.8848, "step": 500 }, { "epoch": 0.02238538681948424, "grad_norm": 1.9952893257141113, "learning_rate": 4.962691021967526e-05, "loss": 1.7595, "step": 1000 }, { "epoch": 0.03357808022922636, "grad_norm": 2.1056811809539795, "learning_rate": 4.944036532951289e-05, "loss": 1.6994, "step": 1500 }, { "epoch": 0.04477077363896848, "grad_norm": 2.0474352836608887, "learning_rate": 4.925382043935053e-05, "loss": 1.6629, "step": 2000 }, { "epoch": 0.0559634670487106, "grad_norm": 1.9989269971847534, "learning_rate": 4.906727554918816e-05, "loss": 1.6236, "step": 2500 }, { "epoch": 0.0559634670487106, "eval_accuracy": 0.5569847646608951, "eval_loss": 2.425182342529297, "eval_runtime": 707.8445, "eval_samples_per_second": 91.796, "eval_steps_per_second": 3.826, "step": 2500 }, { "epoch": 0.06715616045845273, "grad_norm": 1.879557490348816, "learning_rate": 4.888073065902579e-05, "loss": 1.5991, "step": 3000 }, { "epoch": 0.07834885386819485, "grad_norm": 1.9889895915985107, "learning_rate": 4.869418576886342e-05, "loss": 1.5751, "step": 3500 }, { "epoch": 0.08954154727793696, "grad_norm": 1.910925269126892, "learning_rate": 4.8507640878701055e-05, "loss": 1.5587, "step": 4000 }, { "epoch": 0.10073424068767908, "grad_norm": 1.9268312454223633, "learning_rate": 4.8321095988538685e-05, "loss": 1.546, "step": 4500 }, { "epoch": 0.1119269340974212, "grad_norm": 1.8074718713760376, "learning_rate": 4.8134551098376315e-05, "loss": 1.5301, "step": 5000 }, { "epoch": 0.1119269340974212, "eval_accuracy": 0.566450867740456, "eval_loss": 2.3531110286712646, "eval_runtime": 716.3757, "eval_samples_per_second": 90.702, "eval_steps_per_second": 3.78, "step": 5000 }, { "epoch": 0.12311962750716332, "grad_norm": 1.8723756074905396, "learning_rate": 4.7948006208213945e-05, "loss": 1.5153, "step": 5500 }, { "epoch": 0.13431232091690545, "grad_norm": 1.8938133716583252, "learning_rate": 4.7761461318051575e-05, "loss": 1.5051, "step": 6000 }, { "epoch": 0.14550501432664756, "grad_norm": 1.8093421459197998, "learning_rate": 4.757491642788921e-05, "loss": 1.4922, "step": 6500 }, { "epoch": 0.1566977077363897, "grad_norm": 1.8811379671096802, "learning_rate": 4.738837153772684e-05, "loss": 1.4841, "step": 7000 }, { "epoch": 0.1678904011461318, "grad_norm": 1.8162873983383179, "learning_rate": 4.720182664756447e-05, "loss": 1.4664, "step": 7500 }, { "epoch": 0.1678904011461318, "eval_accuracy": 0.5726688422748262, "eval_loss": 2.2988929748535156, "eval_runtime": 706.3059, "eval_samples_per_second": 91.996, "eval_steps_per_second": 3.834, "step": 7500 }, { "epoch": 0.17908309455587393, "grad_norm": 1.861790418624878, "learning_rate": 4.70152817574021e-05, "loss": 1.4613, "step": 8000 }, { "epoch": 0.19027578796561603, "grad_norm": 1.7351659536361694, "learning_rate": 4.682873686723974e-05, "loss": 1.4554, "step": 8500 }, { "epoch": 0.20146848137535817, "grad_norm": 1.796727180480957, "learning_rate": 4.664219197707737e-05, "loss": 1.4469, "step": 9000 }, { "epoch": 0.2126611747851003, "grad_norm": 1.751111388206482, "learning_rate": 4.6455647086915e-05, "loss": 1.4405, "step": 9500 }, { "epoch": 0.2238538681948424, "grad_norm": 1.793644905090332, "learning_rate": 4.626910219675263e-05, "loss": 1.4314, "step": 10000 }, { "epoch": 0.2238538681948424, "eval_accuracy": 0.5781162212828304, "eval_loss": 2.257195472717285, "eval_runtime": 709.7465, "eval_samples_per_second": 91.55, "eval_steps_per_second": 3.815, "step": 10000 }, { "epoch": 0.23504656160458454, "grad_norm": 1.7030937671661377, "learning_rate": 4.6082557306590264e-05, "loss": 1.425, "step": 10500 }, { "epoch": 0.24623925501432664, "grad_norm": 1.7245328426361084, "learning_rate": 4.5896012416427894e-05, "loss": 1.4206, "step": 11000 }, { "epoch": 0.25743194842406875, "grad_norm": 1.7355397939682007, "learning_rate": 4.570946752626552e-05, "loss": 1.409, "step": 11500 }, { "epoch": 0.2686246418338109, "grad_norm": 1.7283306121826172, "learning_rate": 4.5522922636103154e-05, "loss": 1.4086, "step": 12000 }, { "epoch": 0.279817335243553, "grad_norm": 1.7133527994155884, "learning_rate": 4.5336377745940784e-05, "loss": 1.4042, "step": 12500 }, { "epoch": 0.279817335243553, "eval_accuracy": 0.5822483255357088, "eval_loss": 2.2244207859039307, "eval_runtime": 708.1859, "eval_samples_per_second": 91.751, "eval_steps_per_second": 3.824, "step": 12500 }, { "epoch": 0.2910100286532951, "grad_norm": 1.688602328300476, "learning_rate": 4.514983285577842e-05, "loss": 1.3952, "step": 13000 }, { "epoch": 0.3022027220630373, "grad_norm": 1.6839321851730347, "learning_rate": 4.4963287965616043e-05, "loss": 1.3932, "step": 13500 }, { "epoch": 0.3133954154727794, "grad_norm": 1.7225844860076904, "learning_rate": 4.477674307545368e-05, "loss": 1.3839, "step": 14000 }, { "epoch": 0.3245881088825215, "grad_norm": 1.6329905986785889, "learning_rate": 4.459019818529131e-05, "loss": 1.3856, "step": 14500 }, { "epoch": 0.3357808022922636, "grad_norm": 1.7012953758239746, "learning_rate": 4.440365329512895e-05, "loss": 1.3771, "step": 15000 }, { "epoch": 0.3357808022922636, "eval_accuracy": 0.586269614225024, "eval_loss": 2.1954798698425293, "eval_runtime": 718.0126, "eval_samples_per_second": 90.496, "eval_steps_per_second": 3.772, "step": 15000 }, { "epoch": 0.34697349570200575, "grad_norm": 1.6593496799468994, "learning_rate": 4.421710840496657e-05, "loss": 1.376, "step": 15500 }, { "epoch": 0.35816618911174786, "grad_norm": 1.6412550210952759, "learning_rate": 4.4030563514804206e-05, "loss": 1.3712, "step": 16000 }, { "epoch": 0.36935888252148996, "grad_norm": 1.6455302238464355, "learning_rate": 4.3844018624641836e-05, "loss": 1.3699, "step": 16500 }, { "epoch": 0.38055157593123207, "grad_norm": 1.6210881471633911, "learning_rate": 4.3657473734479466e-05, "loss": 1.3618, "step": 17000 }, { "epoch": 0.3917442693409742, "grad_norm": 1.6821410655975342, "learning_rate": 4.3470928844317096e-05, "loss": 1.3563, "step": 17500 }, { "epoch": 0.3917442693409742, "eval_accuracy": 0.589598096204646, "eval_loss": 2.168947219848633, "eval_runtime": 707.8628, "eval_samples_per_second": 91.793, "eval_steps_per_second": 3.826, "step": 17500 }, { "epoch": 0.40293696275071633, "grad_norm": 1.719738245010376, "learning_rate": 4.3284383954154726e-05, "loss": 1.3585, "step": 18000 }, { "epoch": 0.41412965616045844, "grad_norm": 1.660507321357727, "learning_rate": 4.309783906399236e-05, "loss": 1.3502, "step": 18500 }, { "epoch": 0.4253223495702006, "grad_norm": 1.7758148908615112, "learning_rate": 4.291129417382999e-05, "loss": 1.3459, "step": 19000 }, { "epoch": 0.4365150429799427, "grad_norm": 1.6665699481964111, "learning_rate": 4.272474928366762e-05, "loss": 1.3435, "step": 19500 }, { "epoch": 0.4477077363896848, "grad_norm": 1.6364027261734009, "learning_rate": 4.253820439350525e-05, "loss": 1.3401, "step": 20000 }, { "epoch": 0.4477077363896848, "eval_accuracy": 0.5922608205511055, "eval_loss": 2.1485562324523926, "eval_runtime": 704.3891, "eval_samples_per_second": 92.246, "eval_steps_per_second": 3.844, "step": 20000 }, { "epoch": 0.4589004297994269, "grad_norm": 9.470758438110352, "learning_rate": 4.426374462750716e-05, "loss": 8.0235, "step": 20500 }, { "epoch": 0.4700931232091691, "grad_norm": 9.91232967376709, "learning_rate": 4.412383595988539e-05, "loss": 7.9603, "step": 21000 }, { "epoch": 0.4812858166189112, "grad_norm": 9.734143257141113, "learning_rate": 4.398392729226361e-05, "loss": 7.9793, "step": 21500 }, { "epoch": 0.4924785100286533, "grad_norm": 9.574400901794434, "learning_rate": 4.3844018624641836e-05, "loss": 7.9731, "step": 22000 }, { "epoch": 0.5036712034383954, "grad_norm": 10.017444610595703, "learning_rate": 4.370410995702006e-05, "loss": 7.9335, "step": 22500 }, { "epoch": 0.5036712034383954, "eval_accuracy": 0.5952892349509148, "eval_loss": 2.1270551681518555, "eval_runtime": 525.2336, "eval_samples_per_second": 123.711, "eval_steps_per_second": 2.578, "step": 22500 }, { "epoch": 0.5148638968481375, "grad_norm": 9.66054916381836, "learning_rate": 4.356420128939828e-05, "loss": 7.9224, "step": 23000 }, { "epoch": 0.5260565902578797, "grad_norm": 9.483991622924805, "learning_rate": 4.342429262177651e-05, "loss": 7.9197, "step": 23500 }, { "epoch": 0.5372492836676218, "grad_norm": 9.803547859191895, "learning_rate": 4.3284383954154726e-05, "loss": 7.8932, "step": 24000 }, { "epoch": 0.5484419770773639, "grad_norm": 9.597293853759766, "learning_rate": 4.3144475286532955e-05, "loss": 7.8622, "step": 24500 }, { "epoch": 0.559634670487106, "grad_norm": 9.679096221923828, "learning_rate": 4.300456661891118e-05, "loss": 7.8644, "step": 25000 }, { "epoch": 0.559634670487106, "eval_accuracy": 0.5979848217689566, "eval_loss": 2.106226682662964, "eval_runtime": 528.4389, "eval_samples_per_second": 122.96, "eval_steps_per_second": 2.562, "step": 25000 }, { "epoch": 0.5708273638968482, "grad_norm": 9.878997802734375, "learning_rate": 4.28646579512894e-05, "loss": 7.8388, "step": 25500 }, { "epoch": 0.5820200573065902, "grad_norm": 9.320840835571289, "learning_rate": 4.272474928366762e-05, "loss": 7.8199, "step": 26000 }, { "epoch": 0.5932127507163324, "grad_norm": 9.581457138061523, "learning_rate": 4.2584840616045845e-05, "loss": 7.8194, "step": 26500 }, { "epoch": 0.6044054441260746, "grad_norm": 9.690735816955566, "learning_rate": 4.2444931948424074e-05, "loss": 7.8147, "step": 27000 }, { "epoch": 0.6155981375358166, "grad_norm": 9.55455207824707, "learning_rate": 4.230502328080229e-05, "loss": 7.7927, "step": 27500 }, { "epoch": 0.6155981375358166, "eval_accuracy": 0.599545230267029, "eval_loss": 2.09478497505188, "eval_runtime": 531.729, "eval_samples_per_second": 122.199, "eval_steps_per_second": 2.546, "step": 27500 }, { "epoch": 0.6267908309455588, "grad_norm": 9.352036476135254, "learning_rate": 4.216511461318052e-05, "loss": 7.7711, "step": 28000 }, { "epoch": 0.6379835243553008, "grad_norm": 9.413168907165527, "learning_rate": 4.202520594555874e-05, "loss": 7.7733, "step": 28500 }, { "epoch": 0.649176217765043, "grad_norm": 9.420402526855469, "learning_rate": 4.1885297277936964e-05, "loss": 7.74, "step": 29000 }, { "epoch": 0.6603689111747851, "grad_norm": 9.579030990600586, "learning_rate": 4.1745388610315186e-05, "loss": 7.7237, "step": 29500 }, { "epoch": 0.6715616045845272, "grad_norm": 12.816407203674316, "learning_rate": 4.160547994269341e-05, "loss": 7.7401, "step": 30000 }, { "epoch": 0.6715616045845272, "eval_accuracy": 0.6018761556694041, "eval_loss": 2.079362630844116, "eval_runtime": 531.1941, "eval_samples_per_second": 122.323, "eval_steps_per_second": 2.549, "step": 30000 }, { "epoch": 0.6827542979942693, "grad_norm": 9.477621078491211, "learning_rate": 4.146557127507164e-05, "loss": 7.717, "step": 30500 }, { "epoch": 0.6939469914040115, "grad_norm": 9.8326416015625, "learning_rate": 4.132566260744986e-05, "loss": 7.7148, "step": 31000 }, { "epoch": 0.7051396848137536, "grad_norm": 9.668205261230469, "learning_rate": 4.118575393982808e-05, "loss": 7.6845, "step": 31500 }, { "epoch": 0.7163323782234957, "grad_norm": 9.344961166381836, "learning_rate": 4.1045845272206305e-05, "loss": 7.673, "step": 32000 }, { "epoch": 0.7275250716332379, "grad_norm": 12.754666328430176, "learning_rate": 4.090593660458453e-05, "loss": 7.646, "step": 32500 }, { "epoch": 0.7275250716332379, "eval_accuracy": 0.6036969981017439, "eval_loss": 2.0638949871063232, "eval_runtime": 534.6839, "eval_samples_per_second": 121.524, "eval_steps_per_second": 2.532, "step": 32500 }, { "epoch": 0.7387177650429799, "grad_norm": 9.269234657287598, "learning_rate": 4.076602793696275e-05, "loss": 7.6452, "step": 33000 }, { "epoch": 0.7499104584527221, "grad_norm": 9.59334659576416, "learning_rate": 4.062611926934098e-05, "loss": 7.6369, "step": 33500 }, { "epoch": 0.7611031518624641, "grad_norm": 9.979016304016113, "learning_rate": 4.04862106017192e-05, "loss": 7.6306, "step": 34000 }, { "epoch": 0.7722958452722063, "grad_norm": 9.395634651184082, "learning_rate": 4.0346301934097424e-05, "loss": 7.6083, "step": 34500 }, { "epoch": 0.7834885386819485, "grad_norm": 9.377208709716797, "learning_rate": 4.0206393266475646e-05, "loss": 7.6113, "step": 35000 }, { "epoch": 0.7834885386819485, "eval_accuracy": 0.6060486530458662, "eval_loss": 2.046678066253662, "eval_runtime": 527.2553, "eval_samples_per_second": 123.236, "eval_steps_per_second": 2.568, "step": 35000 }, { "epoch": 0.7946812320916905, "grad_norm": 9.33324146270752, "learning_rate": 4.006648459885387e-05, "loss": 7.596, "step": 35500 }, { "epoch": 0.8058739255014327, "grad_norm": 10.012749671936035, "learning_rate": 3.992657593123209e-05, "loss": 7.5944, "step": 36000 }, { "epoch": 0.8170666189111748, "grad_norm": 9.17791748046875, "learning_rate": 3.9786667263610314e-05, "loss": 7.5724, "step": 36500 }, { "epoch": 0.8282593123209169, "grad_norm": 9.714068412780762, "learning_rate": 3.964675859598854e-05, "loss": 7.5716, "step": 37000 }, { "epoch": 0.839452005730659, "grad_norm": 9.122146606445312, "learning_rate": 3.9506849928366765e-05, "loss": 7.5428, "step": 37500 }, { "epoch": 0.839452005730659, "eval_accuracy": 0.6080310471813272, "eval_loss": 2.0341005325317383, "eval_runtime": 534.9624, "eval_samples_per_second": 121.461, "eval_steps_per_second": 2.531, "step": 37500 }, { "epoch": 0.8506446991404012, "grad_norm": 8.890284538269043, "learning_rate": 3.936694126074499e-05, "loss": 7.5108, "step": 38000 }, { "epoch": 0.8618373925501432, "grad_norm": 9.258638381958008, "learning_rate": 3.922703259312321e-05, "loss": 7.5283, "step": 38500 }, { "epoch": 0.8730300859598854, "grad_norm": 9.524474143981934, "learning_rate": 3.908712392550143e-05, "loss": 7.5168, "step": 39000 }, { "epoch": 0.8842227793696275, "grad_norm": 9.608149528503418, "learning_rate": 3.894721525787966e-05, "loss": 7.5206, "step": 39500 }, { "epoch": 0.8954154727793696, "grad_norm": 9.405288696289062, "learning_rate": 3.880730659025788e-05, "loss": 7.5039, "step": 40000 }, { "epoch": 0.8954154727793696, "eval_accuracy": 0.6094788673634718, "eval_loss": 2.0253567695617676, "eval_runtime": 535.0948, "eval_samples_per_second": 121.431, "eval_steps_per_second": 2.53, "step": 40000 }, { "epoch": 0.9066081661891118, "grad_norm": 8.706295013427734, "learning_rate": 3.8667397922636107e-05, "loss": 7.4819, "step": 40500 }, { "epoch": 0.9178008595988538, "grad_norm": 9.542219161987305, "learning_rate": 3.852748925501433e-05, "loss": 7.4888, "step": 41000 }, { "epoch": 0.928993553008596, "grad_norm": 9.111319541931152, "learning_rate": 3.838758058739255e-05, "loss": 7.4749, "step": 41500 }, { "epoch": 0.9401862464183381, "grad_norm": 9.335123062133789, "learning_rate": 3.824767191977078e-05, "loss": 7.4591, "step": 42000 }, { "epoch": 0.9513789398280802, "grad_norm": 9.537328720092773, "learning_rate": 3.8107763252148996e-05, "loss": 7.4533, "step": 42500 }, { "epoch": 0.9513789398280802, "eval_accuracy": 0.6107361281876699, "eval_loss": 2.0133583545684814, "eval_runtime": 530.2968, "eval_samples_per_second": 122.529, "eval_steps_per_second": 2.553, "step": 42500 }, { "epoch": 0.9625716332378224, "grad_norm": 9.227987289428711, "learning_rate": 3.7967854584527225e-05, "loss": 7.4506, "step": 43000 }, { "epoch": 0.9737643266475645, "grad_norm": 9.076460838317871, "learning_rate": 3.782794591690544e-05, "loss": 7.4557, "step": 43500 }, { "epoch": 0.9849570200573066, "grad_norm": 9.841446876525879, "learning_rate": 3.768803724928367e-05, "loss": 7.4319, "step": 44000 }, { "epoch": 0.9961497134670487, "grad_norm": 9.169388771057129, "learning_rate": 3.754812858166189e-05, "loss": 7.4453, "step": 44500 }, { "epoch": 1.0073424068767909, "grad_norm": 9.200368881225586, "learning_rate": 3.7408219914040115e-05, "loss": 7.4149, "step": 45000 }, { "epoch": 1.0073424068767909, "eval_accuracy": 0.6120696404224526, "eval_loss": 2.0035457611083984, "eval_runtime": 532.3092, "eval_samples_per_second": 122.066, "eval_steps_per_second": 2.544, "step": 45000 }, { "epoch": 1.018535100286533, "grad_norm": 9.189336776733398, "learning_rate": 3.7268311246418344e-05, "loss": 7.3981, "step": 45500 }, { "epoch": 1.029727793696275, "grad_norm": 9.504659652709961, "learning_rate": 3.712840257879656e-05, "loss": 7.4031, "step": 46000 }, { "epoch": 1.0409204871060171, "grad_norm": 9.516868591308594, "learning_rate": 3.698849391117479e-05, "loss": 7.3822, "step": 46500 }, { "epoch": 1.0521131805157593, "grad_norm": 9.417741775512695, "learning_rate": 3.6848585243553005e-05, "loss": 7.3887, "step": 47000 }, { "epoch": 1.0633058739255015, "grad_norm": 9.202630043029785, "learning_rate": 3.6708676575931234e-05, "loss": 7.379, "step": 47500 }, { "epoch": 1.0633058739255015, "eval_accuracy": 0.6134326919026999, "eval_loss": 1.9946683645248413, "eval_runtime": 529.4253, "eval_samples_per_second": 122.731, "eval_steps_per_second": 2.557, "step": 47500 }, { "epoch": 1.0744985673352436, "grad_norm": 9.18812084197998, "learning_rate": 3.6568767908309456e-05, "loss": 7.3648, "step": 48000 }, { "epoch": 1.0856912607449858, "grad_norm": 9.317421913146973, "learning_rate": 3.642885924068768e-05, "loss": 7.3581, "step": 48500 }, { "epoch": 1.0968839541547277, "grad_norm": 9.30117130279541, "learning_rate": 3.628895057306591e-05, "loss": 7.3242, "step": 49000 }, { "epoch": 1.1080766475644699, "grad_norm": 9.295071601867676, "learning_rate": 3.6149041905444124e-05, "loss": 7.3343, "step": 49500 }, { "epoch": 1.119269340974212, "grad_norm": 9.372967720031738, "learning_rate": 3.600913323782235e-05, "loss": 7.324, "step": 50000 }, { "epoch": 1.119269340974212, "eval_accuracy": 0.6151487162989436, "eval_loss": 1.9853588342666626, "eval_runtime": 528.6238, "eval_samples_per_second": 122.917, "eval_steps_per_second": 2.561, "step": 50000 }, { "epoch": 1.1304620343839542, "grad_norm": 10.693807601928711, "learning_rate": 3.5869224570200575e-05, "loss": 7.3164, "step": 50500 }, { "epoch": 1.1416547277936964, "grad_norm": 9.047393798828125, "learning_rate": 3.57293159025788e-05, "loss": 7.3028, "step": 51000 }, { "epoch": 1.1528474212034383, "grad_norm": 9.055428504943848, "learning_rate": 3.558940723495702e-05, "loss": 7.316, "step": 51500 }, { "epoch": 1.1640401146131805, "grad_norm": 8.821599960327148, "learning_rate": 3.544949856733524e-05, "loss": 7.2759, "step": 52000 }, { "epoch": 1.1752328080229226, "grad_norm": 8.971498489379883, "learning_rate": 3.530958989971347e-05, "loss": 7.3041, "step": 52500 }, { "epoch": 1.1752328080229226, "eval_accuracy": 0.6162336395081003, "eval_loss": 1.9736484289169312, "eval_runtime": 526.2473, "eval_samples_per_second": 123.472, "eval_steps_per_second": 2.573, "step": 52500 }, { "epoch": 1.1864255014326648, "grad_norm": 9.30490779876709, "learning_rate": 3.5169681232091694e-05, "loss": 7.2966, "step": 53000 }, { "epoch": 1.197618194842407, "grad_norm": 9.367337226867676, "learning_rate": 3.5029772564469917e-05, "loss": 7.2862, "step": 53500 }, { "epoch": 1.2088108882521489, "grad_norm": 9.002731323242188, "learning_rate": 3.488986389684814e-05, "loss": 7.2858, "step": 54000 }, { "epoch": 1.220003581661891, "grad_norm": 9.070691108703613, "learning_rate": 3.474995522922636e-05, "loss": 7.2692, "step": 54500 }, { "epoch": 1.2311962750716332, "grad_norm": 9.154426574707031, "learning_rate": 3.4610046561604584e-05, "loss": 7.262, "step": 55000 }, { "epoch": 1.2311962750716332, "eval_accuracy": 0.6174860367511424, "eval_loss": 1.9672149419784546, "eval_runtime": 529.8804, "eval_samples_per_second": 122.626, "eval_steps_per_second": 2.555, "step": 55000 }, { "epoch": 1.2423889684813754, "grad_norm": 9.364106178283691, "learning_rate": 3.447013789398281e-05, "loss": 7.2489, "step": 55500 }, { "epoch": 1.2535816618911175, "grad_norm": 9.267243385314941, "learning_rate": 3.4330229226361035e-05, "loss": 7.2664, "step": 56000 }, { "epoch": 1.2647743553008595, "grad_norm": 9.162137031555176, "learning_rate": 3.419032055873926e-05, "loss": 7.2475, "step": 56500 }, { "epoch": 1.2759670487106018, "grad_norm": 9.292202949523926, "learning_rate": 3.405041189111748e-05, "loss": 7.2357, "step": 57000 }, { "epoch": 1.2871597421203438, "grad_norm": 9.280839920043945, "learning_rate": 3.39105032234957e-05, "loss": 7.2169, "step": 57500 }, { "epoch": 1.2871597421203438, "eval_accuracy": 0.6184868881174969, "eval_loss": 1.9653985500335693, "eval_runtime": 531.1309, "eval_samples_per_second": 122.337, "eval_steps_per_second": 2.549, "step": 57500 }, { "epoch": 1.298352435530086, "grad_norm": 8.75936222076416, "learning_rate": 3.3770594555873925e-05, "loss": 7.2052, "step": 58000 }, { "epoch": 1.309545128939828, "grad_norm": 8.891804695129395, "learning_rate": 3.363068588825215e-05, "loss": 7.2258, "step": 58500 }, { "epoch": 1.3207378223495703, "grad_norm": 8.931051254272461, "learning_rate": 3.349077722063038e-05, "loss": 7.1899, "step": 59000 }, { "epoch": 1.3319305157593124, "grad_norm": 9.616579055786133, "learning_rate": 3.33508685530086e-05, "loss": 7.2068, "step": 59500 }, { "epoch": 1.3431232091690544, "grad_norm": 8.981892585754395, "learning_rate": 3.321095988538682e-05, "loss": 7.2084, "step": 60000 }, { "epoch": 1.3431232091690544, "eval_accuracy": 0.6190439375486008, "eval_loss": 1.9544332027435303, "eval_runtime": 528.8193, "eval_samples_per_second": 122.872, "eval_steps_per_second": 2.56, "step": 60000 } ], "logging_steps": 500, "max_steps": 178688, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2799139827941786e+18, "train_batch_size": 24, "trial_name": null, "trial_params": null }