|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9865410497981157, |
|
"eval_steps": 500, |
|
"global_step": 184, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010767160161507403, |
|
"grad_norm": 0.7117969989776611, |
|
"learning_rate": 1e-05, |
|
"loss": 88.6269, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.021534320323014805, |
|
"grad_norm": 0.6731469035148621, |
|
"learning_rate": 2e-05, |
|
"loss": 88.5874, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03230148048452221, |
|
"grad_norm": 0.684374213218689, |
|
"learning_rate": 3e-05, |
|
"loss": 88.5793, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04306864064602961, |
|
"grad_norm": 0.6637169122695923, |
|
"learning_rate": 4e-05, |
|
"loss": 88.5999, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05383580080753701, |
|
"grad_norm": 0.7389045357704163, |
|
"learning_rate": 5e-05, |
|
"loss": 88.5472, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06460296096904442, |
|
"grad_norm": 0.7229312658309937, |
|
"learning_rate": 6e-05, |
|
"loss": 88.619, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07537012113055182, |
|
"grad_norm": 0.6590586304664612, |
|
"learning_rate": 7e-05, |
|
"loss": 88.558, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08613728129205922, |
|
"grad_norm": 0.7481277585029602, |
|
"learning_rate": 8e-05, |
|
"loss": 88.5616, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09690444145356662, |
|
"grad_norm": 0.7117283940315247, |
|
"learning_rate": 9e-05, |
|
"loss": 88.5806, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.10767160161507403, |
|
"grad_norm": 0.6721596121788025, |
|
"learning_rate": 0.0001, |
|
"loss": 88.5172, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11843876177658143, |
|
"grad_norm": 0.6705605387687683, |
|
"learning_rate": 9.999185052823207e-05, |
|
"loss": 88.6148, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12920592193808883, |
|
"grad_norm": 0.7348136305809021, |
|
"learning_rate": 9.996740476948385e-05, |
|
"loss": 88.4994, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13997308209959622, |
|
"grad_norm": 0.7699621319770813, |
|
"learning_rate": 9.992667069255619e-05, |
|
"loss": 88.5175, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.15074024226110364, |
|
"grad_norm": 0.6699883937835693, |
|
"learning_rate": 9.98696615758975e-05, |
|
"loss": 88.6153, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.16150740242261102, |
|
"grad_norm": 0.708699107170105, |
|
"learning_rate": 9.979639600327522e-05, |
|
"loss": 88.512, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.17227456258411844, |
|
"grad_norm": 0.7570011615753174, |
|
"learning_rate": 9.970689785771798e-05, |
|
"loss": 88.5852, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.18304172274562583, |
|
"grad_norm": 0.7698582410812378, |
|
"learning_rate": 9.960119631373022e-05, |
|
"loss": 88.4942, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.19380888290713325, |
|
"grad_norm": 0.7012806534767151, |
|
"learning_rate": 9.947932582778188e-05, |
|
"loss": 88.5333, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.20457604306864063, |
|
"grad_norm": 0.7035436630249023, |
|
"learning_rate": 9.934132612707632e-05, |
|
"loss": 88.4619, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.21534320323014805, |
|
"grad_norm": 0.7284402847290039, |
|
"learning_rate": 9.918724219660013e-05, |
|
"loss": 88.5454, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.22611036339165544, |
|
"grad_norm": 0.8240225911140442, |
|
"learning_rate": 9.901712426445901e-05, |
|
"loss": 88.485, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.23687752355316286, |
|
"grad_norm": 0.6969294548034668, |
|
"learning_rate": 9.883102778550434e-05, |
|
"loss": 88.4997, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.24764468371467024, |
|
"grad_norm": 0.7305822968482971, |
|
"learning_rate": 9.862901342325617e-05, |
|
"loss": 88.4881, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.25841184387617766, |
|
"grad_norm": 0.64059978723526, |
|
"learning_rate": 9.841114703012817e-05, |
|
"loss": 88.5312, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2691790040376851, |
|
"grad_norm": 0.6767567992210388, |
|
"learning_rate": 9.817749962596115e-05, |
|
"loss": 88.4213, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.27994616419919244, |
|
"grad_norm": 0.6672939658164978, |
|
"learning_rate": 9.792814737487207e-05, |
|
"loss": 88.5844, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.29071332436069985, |
|
"grad_norm": 0.7302522659301758, |
|
"learning_rate": 9.766317156042615e-05, |
|
"loss": 88.4343, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.30148048452220727, |
|
"grad_norm": 0.6843910813331604, |
|
"learning_rate": 9.738265855914013e-05, |
|
"loss": 88.4801, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.3122476446837147, |
|
"grad_norm": 0.6865822076797485, |
|
"learning_rate": 9.708669981232541e-05, |
|
"loss": 88.5386, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.32301480484522205, |
|
"grad_norm": 0.6347126960754395, |
|
"learning_rate": 9.677539179628005e-05, |
|
"loss": 88.5193, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.33378196500672946, |
|
"grad_norm": 0.6892479658126831, |
|
"learning_rate": 9.644883599083958e-05, |
|
"loss": 88.5409, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.3445491251682369, |
|
"grad_norm": 0.7092834115028381, |
|
"learning_rate": 9.610713884629666e-05, |
|
"loss": 88.4388, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3553162853297443, |
|
"grad_norm": 0.6895610690116882, |
|
"learning_rate": 9.57504117487006e-05, |
|
"loss": 88.5119, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.36608344549125166, |
|
"grad_norm": 0.7082483172416687, |
|
"learning_rate": 9.537877098354786e-05, |
|
"loss": 88.5405, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3768506056527591, |
|
"grad_norm": 0.6627039909362793, |
|
"learning_rate": 9.499233769787535e-05, |
|
"loss": 88.4298, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3876177658142665, |
|
"grad_norm": 0.7085216641426086, |
|
"learning_rate": 9.459123786076912e-05, |
|
"loss": 88.4722, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3983849259757739, |
|
"grad_norm": 0.6436282396316528, |
|
"learning_rate": 9.417560222230115e-05, |
|
"loss": 88.4651, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.40915208613728127, |
|
"grad_norm": 0.6832641959190369, |
|
"learning_rate": 9.374556627090749e-05, |
|
"loss": 88.5401, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.4199192462987887, |
|
"grad_norm": 0.6698402762413025, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 88.4947, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4306864064602961, |
|
"grad_norm": 0.753197431564331, |
|
"learning_rate": 9.284285880837946e-05, |
|
"loss": 88.4373, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4414535666218035, |
|
"grad_norm": 0.6734095215797424, |
|
"learning_rate": 9.237048156080432e-05, |
|
"loss": 88.4169, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4522207267833109, |
|
"grad_norm": 0.6761934161186218, |
|
"learning_rate": 9.188429243149824e-05, |
|
"loss": 88.4011, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.4629878869448183, |
|
"grad_norm": 0.7175827622413635, |
|
"learning_rate": 9.138444990784453e-05, |
|
"loss": 88.4714, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4737550471063257, |
|
"grad_norm": 0.7111787796020508, |
|
"learning_rate": 9.087111692794459e-05, |
|
"loss": 88.4071, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4845222072678331, |
|
"grad_norm": 0.6621391773223877, |
|
"learning_rate": 9.034446082750352e-05, |
|
"loss": 88.4675, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4952893674293405, |
|
"grad_norm": 0.6450015306472778, |
|
"learning_rate": 8.980465328528219e-05, |
|
"loss": 88.3995, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.506056527590848, |
|
"grad_norm": 0.6297629475593567, |
|
"learning_rate": 8.925187026713362e-05, |
|
"loss": 88.4825, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.5168236877523553, |
|
"grad_norm": 0.6011308431625366, |
|
"learning_rate": 8.868629196864182e-05, |
|
"loss": 88.4623, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5275908479138627, |
|
"grad_norm": 0.731069803237915, |
|
"learning_rate": 8.810810275638183e-05, |
|
"loss": 88.3527, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5383580080753702, |
|
"grad_norm": 0.6550359129905701, |
|
"learning_rate": 8.751749110782012e-05, |
|
"loss": 88.429, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5491251682368775, |
|
"grad_norm": 0.5967565774917603, |
|
"learning_rate": 8.691464954987493e-05, |
|
"loss": 88.467, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5598923283983849, |
|
"grad_norm": 0.6432732343673706, |
|
"learning_rate": 8.629977459615655e-05, |
|
"loss": 88.4472, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5706594885598923, |
|
"grad_norm": 0.5874126553535461, |
|
"learning_rate": 8.567306668290799e-05, |
|
"loss": 88.4731, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5814266487213997, |
|
"grad_norm": 0.6305550336837769, |
|
"learning_rate": 8.503473010366713e-05, |
|
"loss": 88.4714, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5921938088829072, |
|
"grad_norm": 0.6246164441108704, |
|
"learning_rate": 8.438497294267117e-05, |
|
"loss": 88.5134, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.6029609690444145, |
|
"grad_norm": 0.6057698726654053, |
|
"learning_rate": 8.37240070070257e-05, |
|
"loss": 88.5225, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.6137281292059219, |
|
"grad_norm": 0.6253562569618225, |
|
"learning_rate": 8.305204775766003e-05, |
|
"loss": 88.4513, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6244952893674294, |
|
"grad_norm": 0.6293994188308716, |
|
"learning_rate": 8.236931423909138e-05, |
|
"loss": 88.4613, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.6352624495289367, |
|
"grad_norm": 0.6635474562644958, |
|
"learning_rate": 8.16760290080212e-05, |
|
"loss": 88.4207, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6460296096904441, |
|
"grad_norm": 0.641502320766449, |
|
"learning_rate": 8.097241806078615e-05, |
|
"loss": 88.4173, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6567967698519516, |
|
"grad_norm": 0.6065200567245483, |
|
"learning_rate": 8.025871075968828e-05, |
|
"loss": 88.4221, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6675639300134589, |
|
"grad_norm": 0.7158219218254089, |
|
"learning_rate": 7.953513975822755e-05, |
|
"loss": 88.3222, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6783310901749664, |
|
"grad_norm": 0.6504823565483093, |
|
"learning_rate": 7.880194092526199e-05, |
|
"loss": 88.3827, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6890982503364738, |
|
"grad_norm": 0.624383807182312, |
|
"learning_rate": 7.805935326811912e-05, |
|
"loss": 88.4309, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6998654104979811, |
|
"grad_norm": 0.6950279474258423, |
|
"learning_rate": 7.730761885468485e-05, |
|
"loss": 88.4252, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.7106325706594886, |
|
"grad_norm": 0.7349709868431091, |
|
"learning_rate": 7.654698273449435e-05, |
|
"loss": 88.2957, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.721399730820996, |
|
"grad_norm": 0.6542060971260071, |
|
"learning_rate": 7.577769285885109e-05, |
|
"loss": 88.3716, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7321668909825033, |
|
"grad_norm": 0.6578887104988098, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 88.3229, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.7429340511440108, |
|
"grad_norm": 0.7284409403800964, |
|
"learning_rate": 7.421415766938097e-05, |
|
"loss": 88.3006, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7537012113055181, |
|
"grad_norm": 0.6003475785255432, |
|
"learning_rate": 7.342042203498951e-05, |
|
"loss": 88.3455, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7644683714670256, |
|
"grad_norm": 0.6060341596603394, |
|
"learning_rate": 7.261905183787136e-05, |
|
"loss": 88.3768, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.775235531628533, |
|
"grad_norm": 0.6123959422111511, |
|
"learning_rate": 7.181030830777837e-05, |
|
"loss": 88.3185, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7860026917900403, |
|
"grad_norm": 0.5798596143722534, |
|
"learning_rate": 7.099445507801323e-05, |
|
"loss": 88.4075, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7967698519515478, |
|
"grad_norm": 0.6350551843643188, |
|
"learning_rate": 7.017175809949044e-05, |
|
"loss": 88.3893, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.8075370121130552, |
|
"grad_norm": 0.6014510989189148, |
|
"learning_rate": 6.934248555404198e-05, |
|
"loss": 88.3887, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8183041722745625, |
|
"grad_norm": 0.5696172714233398, |
|
"learning_rate": 6.850690776699573e-05, |
|
"loss": 88.3717, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.82907133243607, |
|
"grad_norm": 0.5863236784934998, |
|
"learning_rate": 6.766529711905513e-05, |
|
"loss": 88.4043, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.8398384925975774, |
|
"grad_norm": 0.6391366720199585, |
|
"learning_rate": 6.681792795750875e-05, |
|
"loss": 88.3141, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8506056527590848, |
|
"grad_norm": 0.639610230922699, |
|
"learning_rate": 6.5965076506799e-05, |
|
"loss": 88.3615, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8613728129205922, |
|
"grad_norm": 0.5825693607330322, |
|
"learning_rate": 6.510702077847863e-05, |
|
"loss": 88.3598, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8721399730820996, |
|
"grad_norm": 0.6000289916992188, |
|
"learning_rate": 6.4244040480585e-05, |
|
"loss": 88.3163, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.882907133243607, |
|
"grad_norm": 0.6992833614349365, |
|
"learning_rate": 6.337641692646106e-05, |
|
"loss": 88.3116, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8936742934051144, |
|
"grad_norm": 0.5917866230010986, |
|
"learning_rate": 6.250443294305315e-05, |
|
"loss": 88.3113, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.9044414535666218, |
|
"grad_norm": 0.6197730898857117, |
|
"learning_rate": 6.162837277871553e-05, |
|
"loss": 88.3008, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.9152086137281292, |
|
"grad_norm": 0.5347660779953003, |
|
"learning_rate": 6.0748522010551215e-05, |
|
"loss": 88.3428, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.9259757738896366, |
|
"grad_norm": 0.625930905342102, |
|
"learning_rate": 5.9865167451320005e-05, |
|
"loss": 88.4048, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.9367429340511441, |
|
"grad_norm": 0.613715648651123, |
|
"learning_rate": 5.897859705594359e-05, |
|
"loss": 88.3623, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.9475100942126514, |
|
"grad_norm": 0.6773898601531982, |
|
"learning_rate": 5.808909982763825e-05, |
|
"loss": 88.305, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9582772543741588, |
|
"grad_norm": 0.6399573087692261, |
|
"learning_rate": 5.719696572370595e-05, |
|
"loss": 88.2482, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9690444145356663, |
|
"grad_norm": 0.6839354038238525, |
|
"learning_rate": 5.6302485561014475e-05, |
|
"loss": 88.281, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9798115746971736, |
|
"grad_norm": 0.6810314655303955, |
|
"learning_rate": 5.540595092119709e-05, |
|
"loss": 88.2934, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.990578734858681, |
|
"grad_norm": 0.6228731870651245, |
|
"learning_rate": 5.4507654055603275e-05, |
|
"loss": 88.3096, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.990578734858681, |
|
"eval_loss": 11.04428768157959, |
|
"eval_runtime": 0.759, |
|
"eval_samples_per_second": 206.84, |
|
"eval_steps_per_second": 52.698, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.0067294751009421, |
|
"grad_norm": 0.5612848997116089, |
|
"learning_rate": 5.360788779003082e-05, |
|
"loss": 88.4213, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.0174966352624495, |
|
"grad_norm": 0.6439865827560425, |
|
"learning_rate": 5.270694542927088e-05, |
|
"loss": 88.3083, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.0282637954239569, |
|
"grad_norm": 0.5426231026649475, |
|
"learning_rate": 5.180512066149682e-05, |
|
"loss": 88.3768, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.0390309555854644, |
|
"grad_norm": 0.6521313190460205, |
|
"learning_rate": 5.090270746252802e-05, |
|
"loss": 88.2851, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0497981157469718, |
|
"grad_norm": 0.601963996887207, |
|
"learning_rate": 5e-05, |
|
"loss": 88.3373, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.0605652759084792, |
|
"grad_norm": 0.6257451176643372, |
|
"learning_rate": 4.909729253747197e-05, |
|
"loss": 88.3497, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.0713324360699865, |
|
"grad_norm": 0.6289570927619934, |
|
"learning_rate": 4.819487933850319e-05, |
|
"loss": 88.3188, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0820995962314939, |
|
"grad_norm": 0.6000854969024658, |
|
"learning_rate": 4.729305457072913e-05, |
|
"loss": 88.3107, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0928667563930015, |
|
"grad_norm": 0.7118406295776367, |
|
"learning_rate": 4.63921122099692e-05, |
|
"loss": 88.2865, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.1036339165545088, |
|
"grad_norm": 0.5987817049026489, |
|
"learning_rate": 4.549234594439674e-05, |
|
"loss": 88.2681, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.1144010767160162, |
|
"grad_norm": 0.5965442657470703, |
|
"learning_rate": 4.4594049078802925e-05, |
|
"loss": 88.3694, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.1251682368775235, |
|
"grad_norm": 0.5947834253311157, |
|
"learning_rate": 4.3697514438985536e-05, |
|
"loss": 88.3057, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.135935397039031, |
|
"grad_norm": 0.6357874870300293, |
|
"learning_rate": 4.280303427629404e-05, |
|
"loss": 88.2947, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.1467025572005383, |
|
"grad_norm": 0.5863412618637085, |
|
"learning_rate": 4.1910900172361764e-05, |
|
"loss": 88.342, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.1574697173620458, |
|
"grad_norm": 0.6012663245201111, |
|
"learning_rate": 4.1021402944056416e-05, |
|
"loss": 88.2721, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.1682368775235532, |
|
"grad_norm": 0.6170640587806702, |
|
"learning_rate": 4.0134832548680006e-05, |
|
"loss": 88.3084, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.1790040376850606, |
|
"grad_norm": 0.6174831390380859, |
|
"learning_rate": 3.92514779894488e-05, |
|
"loss": 88.3112, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.189771197846568, |
|
"grad_norm": 0.5705954432487488, |
|
"learning_rate": 3.8371627221284495e-05, |
|
"loss": 88.3226, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2005383580080753, |
|
"grad_norm": 0.6390698552131653, |
|
"learning_rate": 3.7495567056946855e-05, |
|
"loss": 88.2471, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.2113055181695827, |
|
"grad_norm": 0.643191397190094, |
|
"learning_rate": 3.6623583073538966e-05, |
|
"loss": 88.1814, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.2220726783310902, |
|
"grad_norm": 0.6732293963432312, |
|
"learning_rate": 3.5755959519415005e-05, |
|
"loss": 88.2546, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.2328398384925976, |
|
"grad_norm": 0.6384011507034302, |
|
"learning_rate": 3.489297922152136e-05, |
|
"loss": 88.2384, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.243606998654105, |
|
"grad_norm": 0.5936709046363831, |
|
"learning_rate": 3.403492349320101e-05, |
|
"loss": 88.2589, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.2543741588156123, |
|
"grad_norm": 0.5669568777084351, |
|
"learning_rate": 3.3182072042491244e-05, |
|
"loss": 88.2816, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.2651413189771197, |
|
"grad_norm": 0.5536919236183167, |
|
"learning_rate": 3.2334702880944886e-05, |
|
"loss": 88.3449, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.2759084791386273, |
|
"grad_norm": 0.5760201811790466, |
|
"learning_rate": 3.149309223300428e-05, |
|
"loss": 88.3198, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.2866756393001346, |
|
"grad_norm": 0.62739497423172, |
|
"learning_rate": 3.065751444595805e-05, |
|
"loss": 88.2775, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.297442799461642, |
|
"grad_norm": 0.5816006064414978, |
|
"learning_rate": 2.982824190050958e-05, |
|
"loss": 88.3048, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3082099596231493, |
|
"grad_norm": 0.6178335547447205, |
|
"learning_rate": 2.900554492198677e-05, |
|
"loss": 88.3031, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.3189771197846567, |
|
"grad_norm": 0.6052290797233582, |
|
"learning_rate": 2.8189691692221627e-05, |
|
"loss": 88.2896, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.3297442799461643, |
|
"grad_norm": 0.5566094517707825, |
|
"learning_rate": 2.738094816212866e-05, |
|
"loss": 88.3939, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.3405114401076716, |
|
"grad_norm": 0.6167682409286499, |
|
"learning_rate": 2.65795779650105e-05, |
|
"loss": 88.283, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.351278600269179, |
|
"grad_norm": 0.6485788822174072, |
|
"learning_rate": 2.5785842330619038e-05, |
|
"loss": 88.341, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.3620457604306864, |
|
"grad_norm": 0.6236302256584167, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 88.3239, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.3728129205921937, |
|
"grad_norm": 0.6133517622947693, |
|
"learning_rate": 2.422230714114891e-05, |
|
"loss": 88.257, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.3835800807537013, |
|
"grad_norm": 0.5628191828727722, |
|
"learning_rate": 2.3453017265505673e-05, |
|
"loss": 88.2815, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3943472409152087, |
|
"grad_norm": 0.6054531931877136, |
|
"learning_rate": 2.269238114531515e-05, |
|
"loss": 88.2568, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.405114401076716, |
|
"grad_norm": 0.6800547242164612, |
|
"learning_rate": 2.194064673188089e-05, |
|
"loss": 88.2577, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4158815612382234, |
|
"grad_norm": 0.6266992092132568, |
|
"learning_rate": 2.1198059074738024e-05, |
|
"loss": 88.2155, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.4266487213997308, |
|
"grad_norm": 0.6254451274871826, |
|
"learning_rate": 2.0464860241772455e-05, |
|
"loss": 88.2093, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.4374158815612383, |
|
"grad_norm": 0.6577990055084229, |
|
"learning_rate": 1.9741289240311755e-05, |
|
"loss": 88.1898, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.4481830417227457, |
|
"grad_norm": 0.6361428499221802, |
|
"learning_rate": 1.902758193921385e-05, |
|
"loss": 88.3488, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.458950201884253, |
|
"grad_norm": 0.6057702302932739, |
|
"learning_rate": 1.832397099197882e-05, |
|
"loss": 88.2169, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.4697173620457604, |
|
"grad_norm": 0.5825548768043518, |
|
"learning_rate": 1.7630685760908622e-05, |
|
"loss": 88.2397, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.4804845222072678, |
|
"grad_norm": 0.7038293480873108, |
|
"learning_rate": 1.6947952242339992e-05, |
|
"loss": 88.1477, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.4912516823687754, |
|
"grad_norm": 0.6376841068267822, |
|
"learning_rate": 1.6275992992974308e-05, |
|
"loss": 88.2545, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.5020188425302825, |
|
"grad_norm": 0.653458833694458, |
|
"learning_rate": 1.561502705732883e-05, |
|
"loss": 88.2055, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.51278600269179, |
|
"grad_norm": 0.6237602829933167, |
|
"learning_rate": 1.4965269896332885e-05, |
|
"loss": 88.2087, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.5235531628532974, |
|
"grad_norm": 0.6579408645629883, |
|
"learning_rate": 1.4326933317092e-05, |
|
"loss": 88.2702, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.5343203230148048, |
|
"grad_norm": 0.5924363136291504, |
|
"learning_rate": 1.3700225403843469e-05, |
|
"loss": 88.248, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.5450874831763124, |
|
"grad_norm": 0.6045029759407043, |
|
"learning_rate": 1.3085350450125072e-05, |
|
"loss": 88.3215, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.5558546433378195, |
|
"grad_norm": 0.6474866271018982, |
|
"learning_rate": 1.2482508892179884e-05, |
|
"loss": 88.2769, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.5666218034993271, |
|
"grad_norm": 0.6447649002075195, |
|
"learning_rate": 1.1891897243618182e-05, |
|
"loss": 88.2294, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.5773889636608345, |
|
"grad_norm": 0.6493783593177795, |
|
"learning_rate": 1.1313708031358183e-05, |
|
"loss": 88.2699, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.5881561238223418, |
|
"grad_norm": 0.6726663112640381, |
|
"learning_rate": 1.0748129732866391e-05, |
|
"loss": 88.2096, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.5989232839838494, |
|
"grad_norm": 0.5635113716125488, |
|
"learning_rate": 1.0195346714717813e-05, |
|
"loss": 88.3316, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.6096904441453566, |
|
"grad_norm": 0.6250278949737549, |
|
"learning_rate": 9.65553917249648e-06, |
|
"loss": 88.2534, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.6204576043068641, |
|
"grad_norm": 0.61742103099823, |
|
"learning_rate": 9.12888307205541e-06, |
|
"loss": 88.3115, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.6312247644683715, |
|
"grad_norm": 0.6726629137992859, |
|
"learning_rate": 8.615550092155478e-06, |
|
"loss": 88.1615, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.6419919246298789, |
|
"grad_norm": 0.6055439114570618, |
|
"learning_rate": 8.115707568501768e-06, |
|
"loss": 88.2359, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.6527590847913862, |
|
"grad_norm": 0.646351158618927, |
|
"learning_rate": 7.629518439195671e-06, |
|
"loss": 88.2837, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.6635262449528936, |
|
"grad_norm": 0.6116978526115417, |
|
"learning_rate": 7.157141191620548e-06, |
|
"loss": 88.2912, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.6742934051144012, |
|
"grad_norm": 0.6652332544326782, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 88.2366, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.6850605652759085, |
|
"grad_norm": 0.7036592364311218, |
|
"learning_rate": 6.2544337290925185e-06, |
|
"loss": 88.2283, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.695827725437416, |
|
"grad_norm": 0.6081019043922424, |
|
"learning_rate": 5.824397777698859e-06, |
|
"loss": 88.2434, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.7065948855989233, |
|
"grad_norm": 0.6562557220458984, |
|
"learning_rate": 5.408762139230888e-06, |
|
"loss": 88.1598, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.7173620457604306, |
|
"grad_norm": 0.6159968376159668, |
|
"learning_rate": 5.007662302124672e-06, |
|
"loss": 88.2584, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.7281292059219382, |
|
"grad_norm": 0.6619474291801453, |
|
"learning_rate": 4.621229016452156e-06, |
|
"loss": 88.18, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.7388963660834453, |
|
"grad_norm": 0.6214718222618103, |
|
"learning_rate": 4.249588251299391e-06, |
|
"loss": 88.2279, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.749663526244953, |
|
"grad_norm": 0.6064915657043457, |
|
"learning_rate": 3.892861153703342e-06, |
|
"loss": 88.2215, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.7604306864064603, |
|
"grad_norm": 0.5849805474281311, |
|
"learning_rate": 3.551164009160429e-06, |
|
"loss": 88.2112, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.7711978465679676, |
|
"grad_norm": 0.6270782351493835, |
|
"learning_rate": 3.2246082037199532e-06, |
|
"loss": 88.2335, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.7819650067294752, |
|
"grad_norm": 0.5952561497688293, |
|
"learning_rate": 2.9133001876746004e-06, |
|
"loss": 88.2984, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7927321668909824, |
|
"grad_norm": 0.6238298416137695, |
|
"learning_rate": 2.6173414408598827e-06, |
|
"loss": 88.2918, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.80349932705249, |
|
"grad_norm": 0.6672609448432922, |
|
"learning_rate": 2.3368284395738684e-06, |
|
"loss": 88.2529, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.8142664872139973, |
|
"grad_norm": 0.5070897340774536, |
|
"learning_rate": 2.0718526251279346e-06, |
|
"loss": 88.3376, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.8250336473755047, |
|
"grad_norm": 0.6033185124397278, |
|
"learning_rate": 1.8225003740388547e-06, |
|
"loss": 88.3511, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.8358008075370122, |
|
"grad_norm": 0.5488670468330383, |
|
"learning_rate": 1.5888529698718346e-06, |
|
"loss": 88.3084, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8465679676985194, |
|
"grad_norm": 0.6001074910163879, |
|
"learning_rate": 1.3709865767438435e-06, |
|
"loss": 88.2296, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.857335127860027, |
|
"grad_norm": 0.5917924642562866, |
|
"learning_rate": 1.1689722144956671e-06, |
|
"loss": 88.3425, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.8681022880215343, |
|
"grad_norm": 0.5938112735748291, |
|
"learning_rate": 9.82875735540989e-07, |
|
"loss": 88.2656, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.8788694481830417, |
|
"grad_norm": 0.6183878779411316, |
|
"learning_rate": 8.127578033998662e-07, |
|
"loss": 88.2139, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.8896366083445493, |
|
"grad_norm": 0.6551972031593323, |
|
"learning_rate": 6.58673872923693e-07, |
|
"loss": 88.2028, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.9004037685060564, |
|
"grad_norm": 0.6331955194473267, |
|
"learning_rate": 5.206741722181386e-07, |
|
"loss": 88.2651, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.911170928667564, |
|
"grad_norm": 0.6921940445899963, |
|
"learning_rate": 3.9880368626978304e-07, |
|
"loss": 88.2543, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.9219380888290714, |
|
"grad_norm": 0.6135378479957581, |
|
"learning_rate": 2.9310214228202013e-07, |
|
"loss": 88.269, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.9327052489905787, |
|
"grad_norm": 0.5835920572280884, |
|
"learning_rate": 2.0360399672478824e-07, |
|
"loss": 88.267, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.9434724091520863, |
|
"grad_norm": 0.5504326224327087, |
|
"learning_rate": 1.3033842410251075e-07, |
|
"loss": 88.3445, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9542395693135934, |
|
"grad_norm": 0.6638001799583435, |
|
"learning_rate": 7.332930744380906e-08, |
|
"loss": 88.2083, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.965006729475101, |
|
"grad_norm": 0.69869065284729, |
|
"learning_rate": 3.259523051615254e-08, |
|
"loss": 88.262, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.9757738896366084, |
|
"grad_norm": 0.6452627778053284, |
|
"learning_rate": 8.149471767937567e-09, |
|
"loss": 88.2167, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.9865410497981157, |
|
"grad_norm": 0.6180316805839539, |
|
"learning_rate": 0.0, |
|
"loss": 88.3043, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.9865410497981157, |
|
"eval_loss": 11.03515911102295, |
|
"eval_runtime": 0.3851, |
|
"eval_samples_per_second": 407.704, |
|
"eval_steps_per_second": 103.874, |
|
"step": 184 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 184, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1943745331200.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|