{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9865410497981157, "eval_steps": 500, "global_step": 184, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010767160161507403, "grad_norm": 0.7117969989776611, "learning_rate": 1e-05, "loss": 88.6269, "step": 1 }, { "epoch": 0.021534320323014805, "grad_norm": 0.6731469035148621, "learning_rate": 2e-05, "loss": 88.5874, "step": 2 }, { "epoch": 0.03230148048452221, "grad_norm": 0.684374213218689, "learning_rate": 3e-05, "loss": 88.5793, "step": 3 }, { "epoch": 0.04306864064602961, "grad_norm": 0.6637169122695923, "learning_rate": 4e-05, "loss": 88.5999, "step": 4 }, { "epoch": 0.05383580080753701, "grad_norm": 0.7389045357704163, "learning_rate": 5e-05, "loss": 88.5472, "step": 5 }, { "epoch": 0.06460296096904442, "grad_norm": 0.7229312658309937, "learning_rate": 6e-05, "loss": 88.619, "step": 6 }, { "epoch": 0.07537012113055182, "grad_norm": 0.6590586304664612, "learning_rate": 7e-05, "loss": 88.558, "step": 7 }, { "epoch": 0.08613728129205922, "grad_norm": 0.7481277585029602, "learning_rate": 8e-05, "loss": 88.5616, "step": 8 }, { "epoch": 0.09690444145356662, "grad_norm": 0.7117283940315247, "learning_rate": 9e-05, "loss": 88.5806, "step": 9 }, { "epoch": 0.10767160161507403, "grad_norm": 0.6721596121788025, "learning_rate": 0.0001, "loss": 88.5172, "step": 10 }, { "epoch": 0.11843876177658143, "grad_norm": 0.6705605387687683, "learning_rate": 9.999185052823207e-05, "loss": 88.6148, "step": 11 }, { "epoch": 0.12920592193808883, "grad_norm": 0.7348136305809021, "learning_rate": 9.996740476948385e-05, "loss": 88.4994, "step": 12 }, { "epoch": 0.13997308209959622, "grad_norm": 0.7699621319770813, "learning_rate": 9.992667069255619e-05, "loss": 88.5175, "step": 13 }, { "epoch": 0.15074024226110364, "grad_norm": 0.6699883937835693, "learning_rate": 9.98696615758975e-05, "loss": 88.6153, "step": 14 }, { "epoch": 0.16150740242261102, "grad_norm": 0.708699107170105, "learning_rate": 9.979639600327522e-05, "loss": 88.512, "step": 15 }, { "epoch": 0.17227456258411844, "grad_norm": 0.7570011615753174, "learning_rate": 9.970689785771798e-05, "loss": 88.5852, "step": 16 }, { "epoch": 0.18304172274562583, "grad_norm": 0.7698582410812378, "learning_rate": 9.960119631373022e-05, "loss": 88.4942, "step": 17 }, { "epoch": 0.19380888290713325, "grad_norm": 0.7012806534767151, "learning_rate": 9.947932582778188e-05, "loss": 88.5333, "step": 18 }, { "epoch": 0.20457604306864063, "grad_norm": 0.7035436630249023, "learning_rate": 9.934132612707632e-05, "loss": 88.4619, "step": 19 }, { "epoch": 0.21534320323014805, "grad_norm": 0.7284402847290039, "learning_rate": 9.918724219660013e-05, "loss": 88.5454, "step": 20 }, { "epoch": 0.22611036339165544, "grad_norm": 0.8240225911140442, "learning_rate": 9.901712426445901e-05, "loss": 88.485, "step": 21 }, { "epoch": 0.23687752355316286, "grad_norm": 0.6969294548034668, "learning_rate": 9.883102778550434e-05, "loss": 88.4997, "step": 22 }, { "epoch": 0.24764468371467024, "grad_norm": 0.7305822968482971, "learning_rate": 9.862901342325617e-05, "loss": 88.4881, "step": 23 }, { "epoch": 0.25841184387617766, "grad_norm": 0.64059978723526, "learning_rate": 9.841114703012817e-05, "loss": 88.5312, "step": 24 }, { "epoch": 0.2691790040376851, "grad_norm": 0.6767567992210388, "learning_rate": 9.817749962596115e-05, "loss": 88.4213, "step": 25 }, { "epoch": 0.27994616419919244, "grad_norm": 0.6672939658164978, "learning_rate": 9.792814737487207e-05, "loss": 88.5844, "step": 26 }, { "epoch": 0.29071332436069985, "grad_norm": 0.7302522659301758, "learning_rate": 9.766317156042615e-05, "loss": 88.4343, "step": 27 }, { "epoch": 0.30148048452220727, "grad_norm": 0.6843910813331604, "learning_rate": 9.738265855914013e-05, "loss": 88.4801, "step": 28 }, { "epoch": 0.3122476446837147, "grad_norm": 0.6865822076797485, "learning_rate": 9.708669981232541e-05, "loss": 88.5386, "step": 29 }, { "epoch": 0.32301480484522205, "grad_norm": 0.6347126960754395, "learning_rate": 9.677539179628005e-05, "loss": 88.5193, "step": 30 }, { "epoch": 0.33378196500672946, "grad_norm": 0.6892479658126831, "learning_rate": 9.644883599083958e-05, "loss": 88.5409, "step": 31 }, { "epoch": 0.3445491251682369, "grad_norm": 0.7092834115028381, "learning_rate": 9.610713884629666e-05, "loss": 88.4388, "step": 32 }, { "epoch": 0.3553162853297443, "grad_norm": 0.6895610690116882, "learning_rate": 9.57504117487006e-05, "loss": 88.5119, "step": 33 }, { "epoch": 0.36608344549125166, "grad_norm": 0.7082483172416687, "learning_rate": 9.537877098354786e-05, "loss": 88.5405, "step": 34 }, { "epoch": 0.3768506056527591, "grad_norm": 0.6627039909362793, "learning_rate": 9.499233769787535e-05, "loss": 88.4298, "step": 35 }, { "epoch": 0.3876177658142665, "grad_norm": 0.7085216641426086, "learning_rate": 9.459123786076912e-05, "loss": 88.4722, "step": 36 }, { "epoch": 0.3983849259757739, "grad_norm": 0.6436282396316528, "learning_rate": 9.417560222230115e-05, "loss": 88.4651, "step": 37 }, { "epoch": 0.40915208613728127, "grad_norm": 0.6832641959190369, "learning_rate": 9.374556627090749e-05, "loss": 88.5401, "step": 38 }, { "epoch": 0.4199192462987887, "grad_norm": 0.6698402762413025, "learning_rate": 9.330127018922194e-05, "loss": 88.4947, "step": 39 }, { "epoch": 0.4306864064602961, "grad_norm": 0.753197431564331, "learning_rate": 9.284285880837946e-05, "loss": 88.4373, "step": 40 }, { "epoch": 0.4414535666218035, "grad_norm": 0.6734095215797424, "learning_rate": 9.237048156080432e-05, "loss": 88.4169, "step": 41 }, { "epoch": 0.4522207267833109, "grad_norm": 0.6761934161186218, "learning_rate": 9.188429243149824e-05, "loss": 88.4011, "step": 42 }, { "epoch": 0.4629878869448183, "grad_norm": 0.7175827622413635, "learning_rate": 9.138444990784453e-05, "loss": 88.4714, "step": 43 }, { "epoch": 0.4737550471063257, "grad_norm": 0.7111787796020508, "learning_rate": 9.087111692794459e-05, "loss": 88.4071, "step": 44 }, { "epoch": 0.4845222072678331, "grad_norm": 0.6621391773223877, "learning_rate": 9.034446082750352e-05, "loss": 88.4675, "step": 45 }, { "epoch": 0.4952893674293405, "grad_norm": 0.6450015306472778, "learning_rate": 8.980465328528219e-05, "loss": 88.3995, "step": 46 }, { "epoch": 0.506056527590848, "grad_norm": 0.6297629475593567, "learning_rate": 8.925187026713362e-05, "loss": 88.4825, "step": 47 }, { "epoch": 0.5168236877523553, "grad_norm": 0.6011308431625366, "learning_rate": 8.868629196864182e-05, "loss": 88.4623, "step": 48 }, { "epoch": 0.5275908479138627, "grad_norm": 0.731069803237915, "learning_rate": 8.810810275638183e-05, "loss": 88.3527, "step": 49 }, { "epoch": 0.5383580080753702, "grad_norm": 0.6550359129905701, "learning_rate": 8.751749110782012e-05, "loss": 88.429, "step": 50 }, { "epoch": 0.5491251682368775, "grad_norm": 0.5967565774917603, "learning_rate": 8.691464954987493e-05, "loss": 88.467, "step": 51 }, { "epoch": 0.5598923283983849, "grad_norm": 0.6432732343673706, "learning_rate": 8.629977459615655e-05, "loss": 88.4472, "step": 52 }, { "epoch": 0.5706594885598923, "grad_norm": 0.5874126553535461, "learning_rate": 8.567306668290799e-05, "loss": 88.4731, "step": 53 }, { "epoch": 0.5814266487213997, "grad_norm": 0.6305550336837769, "learning_rate": 8.503473010366713e-05, "loss": 88.4714, "step": 54 }, { "epoch": 0.5921938088829072, "grad_norm": 0.6246164441108704, "learning_rate": 8.438497294267117e-05, "loss": 88.5134, "step": 55 }, { "epoch": 0.6029609690444145, "grad_norm": 0.6057698726654053, "learning_rate": 8.37240070070257e-05, "loss": 88.5225, "step": 56 }, { "epoch": 0.6137281292059219, "grad_norm": 0.6253562569618225, "learning_rate": 8.305204775766003e-05, "loss": 88.4513, "step": 57 }, { "epoch": 0.6244952893674294, "grad_norm": 0.6293994188308716, "learning_rate": 8.236931423909138e-05, "loss": 88.4613, "step": 58 }, { "epoch": 0.6352624495289367, "grad_norm": 0.6635474562644958, "learning_rate": 8.16760290080212e-05, "loss": 88.4207, "step": 59 }, { "epoch": 0.6460296096904441, "grad_norm": 0.641502320766449, "learning_rate": 8.097241806078615e-05, "loss": 88.4173, "step": 60 }, { "epoch": 0.6567967698519516, "grad_norm": 0.6065200567245483, "learning_rate": 8.025871075968828e-05, "loss": 88.4221, "step": 61 }, { "epoch": 0.6675639300134589, "grad_norm": 0.7158219218254089, "learning_rate": 7.953513975822755e-05, "loss": 88.3222, "step": 62 }, { "epoch": 0.6783310901749664, "grad_norm": 0.6504823565483093, "learning_rate": 7.880194092526199e-05, "loss": 88.3827, "step": 63 }, { "epoch": 0.6890982503364738, "grad_norm": 0.624383807182312, "learning_rate": 7.805935326811912e-05, "loss": 88.4309, "step": 64 }, { "epoch": 0.6998654104979811, "grad_norm": 0.6950279474258423, "learning_rate": 7.730761885468485e-05, "loss": 88.4252, "step": 65 }, { "epoch": 0.7106325706594886, "grad_norm": 0.7349709868431091, "learning_rate": 7.654698273449435e-05, "loss": 88.2957, "step": 66 }, { "epoch": 0.721399730820996, "grad_norm": 0.6542060971260071, "learning_rate": 7.577769285885109e-05, "loss": 88.3716, "step": 67 }, { "epoch": 0.7321668909825033, "grad_norm": 0.6578887104988098, "learning_rate": 7.500000000000001e-05, "loss": 88.3229, "step": 68 }, { "epoch": 0.7429340511440108, "grad_norm": 0.7284409403800964, "learning_rate": 7.421415766938097e-05, "loss": 88.3006, "step": 69 }, { "epoch": 0.7537012113055181, "grad_norm": 0.6003475785255432, "learning_rate": 7.342042203498951e-05, "loss": 88.3455, "step": 70 }, { "epoch": 0.7644683714670256, "grad_norm": 0.6060341596603394, "learning_rate": 7.261905183787136e-05, "loss": 88.3768, "step": 71 }, { "epoch": 0.775235531628533, "grad_norm": 0.6123959422111511, "learning_rate": 7.181030830777837e-05, "loss": 88.3185, "step": 72 }, { "epoch": 0.7860026917900403, "grad_norm": 0.5798596143722534, "learning_rate": 7.099445507801323e-05, "loss": 88.4075, "step": 73 }, { "epoch": 0.7967698519515478, "grad_norm": 0.6350551843643188, "learning_rate": 7.017175809949044e-05, "loss": 88.3893, "step": 74 }, { "epoch": 0.8075370121130552, "grad_norm": 0.6014510989189148, "learning_rate": 6.934248555404198e-05, "loss": 88.3887, "step": 75 }, { "epoch": 0.8183041722745625, "grad_norm": 0.5696172714233398, "learning_rate": 6.850690776699573e-05, "loss": 88.3717, "step": 76 }, { "epoch": 0.82907133243607, "grad_norm": 0.5863236784934998, "learning_rate": 6.766529711905513e-05, "loss": 88.4043, "step": 77 }, { "epoch": 0.8398384925975774, "grad_norm": 0.6391366720199585, "learning_rate": 6.681792795750875e-05, "loss": 88.3141, "step": 78 }, { "epoch": 0.8506056527590848, "grad_norm": 0.639610230922699, "learning_rate": 6.5965076506799e-05, "loss": 88.3615, "step": 79 }, { "epoch": 0.8613728129205922, "grad_norm": 0.5825693607330322, "learning_rate": 6.510702077847863e-05, "loss": 88.3598, "step": 80 }, { "epoch": 0.8721399730820996, "grad_norm": 0.6000289916992188, "learning_rate": 6.4244040480585e-05, "loss": 88.3163, "step": 81 }, { "epoch": 0.882907133243607, "grad_norm": 0.6992833614349365, "learning_rate": 6.337641692646106e-05, "loss": 88.3116, "step": 82 }, { "epoch": 0.8936742934051144, "grad_norm": 0.5917866230010986, "learning_rate": 6.250443294305315e-05, "loss": 88.3113, "step": 83 }, { "epoch": 0.9044414535666218, "grad_norm": 0.6197730898857117, "learning_rate": 6.162837277871553e-05, "loss": 88.3008, "step": 84 }, { "epoch": 0.9152086137281292, "grad_norm": 0.5347660779953003, "learning_rate": 6.0748522010551215e-05, "loss": 88.3428, "step": 85 }, { "epoch": 0.9259757738896366, "grad_norm": 0.625930905342102, "learning_rate": 5.9865167451320005e-05, "loss": 88.4048, "step": 86 }, { "epoch": 0.9367429340511441, "grad_norm": 0.613715648651123, "learning_rate": 5.897859705594359e-05, "loss": 88.3623, "step": 87 }, { "epoch": 0.9475100942126514, "grad_norm": 0.6773898601531982, "learning_rate": 5.808909982763825e-05, "loss": 88.305, "step": 88 }, { "epoch": 0.9582772543741588, "grad_norm": 0.6399573087692261, "learning_rate": 5.719696572370595e-05, "loss": 88.2482, "step": 89 }, { "epoch": 0.9690444145356663, "grad_norm": 0.6839354038238525, "learning_rate": 5.6302485561014475e-05, "loss": 88.281, "step": 90 }, { "epoch": 0.9798115746971736, "grad_norm": 0.6810314655303955, "learning_rate": 5.540595092119709e-05, "loss": 88.2934, "step": 91 }, { "epoch": 0.990578734858681, "grad_norm": 0.6228731870651245, "learning_rate": 5.4507654055603275e-05, "loss": 88.3096, "step": 92 }, { "epoch": 0.990578734858681, "eval_loss": 11.04428768157959, "eval_runtime": 0.759, "eval_samples_per_second": 206.84, "eval_steps_per_second": 52.698, "step": 92 }, { "epoch": 1.0067294751009421, "grad_norm": 0.5612848997116089, "learning_rate": 5.360788779003082e-05, "loss": 88.4213, "step": 93 }, { "epoch": 1.0174966352624495, "grad_norm": 0.6439865827560425, "learning_rate": 5.270694542927088e-05, "loss": 88.3083, "step": 94 }, { "epoch": 1.0282637954239569, "grad_norm": 0.5426231026649475, "learning_rate": 5.180512066149682e-05, "loss": 88.3768, "step": 95 }, { "epoch": 1.0390309555854644, "grad_norm": 0.6521313190460205, "learning_rate": 5.090270746252802e-05, "loss": 88.2851, "step": 96 }, { "epoch": 1.0497981157469718, "grad_norm": 0.601963996887207, "learning_rate": 5e-05, "loss": 88.3373, "step": 97 }, { "epoch": 1.0605652759084792, "grad_norm": 0.6257451176643372, "learning_rate": 4.909729253747197e-05, "loss": 88.3497, "step": 98 }, { "epoch": 1.0713324360699865, "grad_norm": 0.6289570927619934, "learning_rate": 4.819487933850319e-05, "loss": 88.3188, "step": 99 }, { "epoch": 1.0820995962314939, "grad_norm": 0.6000854969024658, "learning_rate": 4.729305457072913e-05, "loss": 88.3107, "step": 100 }, { "epoch": 1.0928667563930015, "grad_norm": 0.7118406295776367, "learning_rate": 4.63921122099692e-05, "loss": 88.2865, "step": 101 }, { "epoch": 1.1036339165545088, "grad_norm": 0.5987817049026489, "learning_rate": 4.549234594439674e-05, "loss": 88.2681, "step": 102 }, { "epoch": 1.1144010767160162, "grad_norm": 0.5965442657470703, "learning_rate": 4.4594049078802925e-05, "loss": 88.3694, "step": 103 }, { "epoch": 1.1251682368775235, "grad_norm": 0.5947834253311157, "learning_rate": 4.3697514438985536e-05, "loss": 88.3057, "step": 104 }, { "epoch": 1.135935397039031, "grad_norm": 0.6357874870300293, "learning_rate": 4.280303427629404e-05, "loss": 88.2947, "step": 105 }, { "epoch": 1.1467025572005383, "grad_norm": 0.5863412618637085, "learning_rate": 4.1910900172361764e-05, "loss": 88.342, "step": 106 }, { "epoch": 1.1574697173620458, "grad_norm": 0.6012663245201111, "learning_rate": 4.1021402944056416e-05, "loss": 88.2721, "step": 107 }, { "epoch": 1.1682368775235532, "grad_norm": 0.6170640587806702, "learning_rate": 4.0134832548680006e-05, "loss": 88.3084, "step": 108 }, { "epoch": 1.1790040376850606, "grad_norm": 0.6174831390380859, "learning_rate": 3.92514779894488e-05, "loss": 88.3112, "step": 109 }, { "epoch": 1.189771197846568, "grad_norm": 0.5705954432487488, "learning_rate": 3.8371627221284495e-05, "loss": 88.3226, "step": 110 }, { "epoch": 1.2005383580080753, "grad_norm": 0.6390698552131653, "learning_rate": 3.7495567056946855e-05, "loss": 88.2471, "step": 111 }, { "epoch": 1.2113055181695827, "grad_norm": 0.643191397190094, "learning_rate": 3.6623583073538966e-05, "loss": 88.1814, "step": 112 }, { "epoch": 1.2220726783310902, "grad_norm": 0.6732293963432312, "learning_rate": 3.5755959519415005e-05, "loss": 88.2546, "step": 113 }, { "epoch": 1.2328398384925976, "grad_norm": 0.6384011507034302, "learning_rate": 3.489297922152136e-05, "loss": 88.2384, "step": 114 }, { "epoch": 1.243606998654105, "grad_norm": 0.5936709046363831, "learning_rate": 3.403492349320101e-05, "loss": 88.2589, "step": 115 }, { "epoch": 1.2543741588156123, "grad_norm": 0.5669568777084351, "learning_rate": 3.3182072042491244e-05, "loss": 88.2816, "step": 116 }, { "epoch": 1.2651413189771197, "grad_norm": 0.5536919236183167, "learning_rate": 3.2334702880944886e-05, "loss": 88.3449, "step": 117 }, { "epoch": 1.2759084791386273, "grad_norm": 0.5760201811790466, "learning_rate": 3.149309223300428e-05, "loss": 88.3198, "step": 118 }, { "epoch": 1.2866756393001346, "grad_norm": 0.62739497423172, "learning_rate": 3.065751444595805e-05, "loss": 88.2775, "step": 119 }, { "epoch": 1.297442799461642, "grad_norm": 0.5816006064414978, "learning_rate": 2.982824190050958e-05, "loss": 88.3048, "step": 120 }, { "epoch": 1.3082099596231493, "grad_norm": 0.6178335547447205, "learning_rate": 2.900554492198677e-05, "loss": 88.3031, "step": 121 }, { "epoch": 1.3189771197846567, "grad_norm": 0.6052290797233582, "learning_rate": 2.8189691692221627e-05, "loss": 88.2896, "step": 122 }, { "epoch": 1.3297442799461643, "grad_norm": 0.5566094517707825, "learning_rate": 2.738094816212866e-05, "loss": 88.3939, "step": 123 }, { "epoch": 1.3405114401076716, "grad_norm": 0.6167682409286499, "learning_rate": 2.65795779650105e-05, "loss": 88.283, "step": 124 }, { "epoch": 1.351278600269179, "grad_norm": 0.6485788822174072, "learning_rate": 2.5785842330619038e-05, "loss": 88.341, "step": 125 }, { "epoch": 1.3620457604306864, "grad_norm": 0.6236302256584167, "learning_rate": 2.500000000000001e-05, "loss": 88.3239, "step": 126 }, { "epoch": 1.3728129205921937, "grad_norm": 0.6133517622947693, "learning_rate": 2.422230714114891e-05, "loss": 88.257, "step": 127 }, { "epoch": 1.3835800807537013, "grad_norm": 0.5628191828727722, "learning_rate": 2.3453017265505673e-05, "loss": 88.2815, "step": 128 }, { "epoch": 1.3943472409152087, "grad_norm": 0.6054531931877136, "learning_rate": 2.269238114531515e-05, "loss": 88.2568, "step": 129 }, { "epoch": 1.405114401076716, "grad_norm": 0.6800547242164612, "learning_rate": 2.194064673188089e-05, "loss": 88.2577, "step": 130 }, { "epoch": 1.4158815612382234, "grad_norm": 0.6266992092132568, "learning_rate": 2.1198059074738024e-05, "loss": 88.2155, "step": 131 }, { "epoch": 1.4266487213997308, "grad_norm": 0.6254451274871826, "learning_rate": 2.0464860241772455e-05, "loss": 88.2093, "step": 132 }, { "epoch": 1.4374158815612383, "grad_norm": 0.6577990055084229, "learning_rate": 1.9741289240311755e-05, "loss": 88.1898, "step": 133 }, { "epoch": 1.4481830417227457, "grad_norm": 0.6361428499221802, "learning_rate": 1.902758193921385e-05, "loss": 88.3488, "step": 134 }, { "epoch": 1.458950201884253, "grad_norm": 0.6057702302932739, "learning_rate": 1.832397099197882e-05, "loss": 88.2169, "step": 135 }, { "epoch": 1.4697173620457604, "grad_norm": 0.5825548768043518, "learning_rate": 1.7630685760908622e-05, "loss": 88.2397, "step": 136 }, { "epoch": 1.4804845222072678, "grad_norm": 0.7038293480873108, "learning_rate": 1.6947952242339992e-05, "loss": 88.1477, "step": 137 }, { "epoch": 1.4912516823687754, "grad_norm": 0.6376841068267822, "learning_rate": 1.6275992992974308e-05, "loss": 88.2545, "step": 138 }, { "epoch": 1.5020188425302825, "grad_norm": 0.653458833694458, "learning_rate": 1.561502705732883e-05, "loss": 88.2055, "step": 139 }, { "epoch": 1.51278600269179, "grad_norm": 0.6237602829933167, "learning_rate": 1.4965269896332885e-05, "loss": 88.2087, "step": 140 }, { "epoch": 1.5235531628532974, "grad_norm": 0.6579408645629883, "learning_rate": 1.4326933317092e-05, "loss": 88.2702, "step": 141 }, { "epoch": 1.5343203230148048, "grad_norm": 0.5924363136291504, "learning_rate": 1.3700225403843469e-05, "loss": 88.248, "step": 142 }, { "epoch": 1.5450874831763124, "grad_norm": 0.6045029759407043, "learning_rate": 1.3085350450125072e-05, "loss": 88.3215, "step": 143 }, { "epoch": 1.5558546433378195, "grad_norm": 0.6474866271018982, "learning_rate": 1.2482508892179884e-05, "loss": 88.2769, "step": 144 }, { "epoch": 1.5666218034993271, "grad_norm": 0.6447649002075195, "learning_rate": 1.1891897243618182e-05, "loss": 88.2294, "step": 145 }, { "epoch": 1.5773889636608345, "grad_norm": 0.6493783593177795, "learning_rate": 1.1313708031358183e-05, "loss": 88.2699, "step": 146 }, { "epoch": 1.5881561238223418, "grad_norm": 0.6726663112640381, "learning_rate": 1.0748129732866391e-05, "loss": 88.2096, "step": 147 }, { "epoch": 1.5989232839838494, "grad_norm": 0.5635113716125488, "learning_rate": 1.0195346714717813e-05, "loss": 88.3316, "step": 148 }, { "epoch": 1.6096904441453566, "grad_norm": 0.6250278949737549, "learning_rate": 9.65553917249648e-06, "loss": 88.2534, "step": 149 }, { "epoch": 1.6204576043068641, "grad_norm": 0.61742103099823, "learning_rate": 9.12888307205541e-06, "loss": 88.3115, "step": 150 }, { "epoch": 1.6312247644683715, "grad_norm": 0.6726629137992859, "learning_rate": 8.615550092155478e-06, "loss": 88.1615, "step": 151 }, { "epoch": 1.6419919246298789, "grad_norm": 0.6055439114570618, "learning_rate": 8.115707568501768e-06, "loss": 88.2359, "step": 152 }, { "epoch": 1.6527590847913862, "grad_norm": 0.646351158618927, "learning_rate": 7.629518439195671e-06, "loss": 88.2837, "step": 153 }, { "epoch": 1.6635262449528936, "grad_norm": 0.6116978526115417, "learning_rate": 7.157141191620548e-06, "loss": 88.2912, "step": 154 }, { "epoch": 1.6742934051144012, "grad_norm": 0.6652332544326782, "learning_rate": 6.698729810778065e-06, "loss": 88.2366, "step": 155 }, { "epoch": 1.6850605652759085, "grad_norm": 0.7036592364311218, "learning_rate": 6.2544337290925185e-06, "loss": 88.2283, "step": 156 }, { "epoch": 1.695827725437416, "grad_norm": 0.6081019043922424, "learning_rate": 5.824397777698859e-06, "loss": 88.2434, "step": 157 }, { "epoch": 1.7065948855989233, "grad_norm": 0.6562557220458984, "learning_rate": 5.408762139230888e-06, "loss": 88.1598, "step": 158 }, { "epoch": 1.7173620457604306, "grad_norm": 0.6159968376159668, "learning_rate": 5.007662302124672e-06, "loss": 88.2584, "step": 159 }, { "epoch": 1.7281292059219382, "grad_norm": 0.6619474291801453, "learning_rate": 4.621229016452156e-06, "loss": 88.18, "step": 160 }, { "epoch": 1.7388963660834453, "grad_norm": 0.6214718222618103, "learning_rate": 4.249588251299391e-06, "loss": 88.2279, "step": 161 }, { "epoch": 1.749663526244953, "grad_norm": 0.6064915657043457, "learning_rate": 3.892861153703342e-06, "loss": 88.2215, "step": 162 }, { "epoch": 1.7604306864064603, "grad_norm": 0.5849805474281311, "learning_rate": 3.551164009160429e-06, "loss": 88.2112, "step": 163 }, { "epoch": 1.7711978465679676, "grad_norm": 0.6270782351493835, "learning_rate": 3.2246082037199532e-06, "loss": 88.2335, "step": 164 }, { "epoch": 1.7819650067294752, "grad_norm": 0.5952561497688293, "learning_rate": 2.9133001876746004e-06, "loss": 88.2984, "step": 165 }, { "epoch": 1.7927321668909824, "grad_norm": 0.6238298416137695, "learning_rate": 2.6173414408598827e-06, "loss": 88.2918, "step": 166 }, { "epoch": 1.80349932705249, "grad_norm": 0.6672609448432922, "learning_rate": 2.3368284395738684e-06, "loss": 88.2529, "step": 167 }, { "epoch": 1.8142664872139973, "grad_norm": 0.5070897340774536, "learning_rate": 2.0718526251279346e-06, "loss": 88.3376, "step": 168 }, { "epoch": 1.8250336473755047, "grad_norm": 0.6033185124397278, "learning_rate": 1.8225003740388547e-06, "loss": 88.3511, "step": 169 }, { "epoch": 1.8358008075370122, "grad_norm": 0.5488670468330383, "learning_rate": 1.5888529698718346e-06, "loss": 88.3084, "step": 170 }, { "epoch": 1.8465679676985194, "grad_norm": 0.6001074910163879, "learning_rate": 1.3709865767438435e-06, "loss": 88.2296, "step": 171 }, { "epoch": 1.857335127860027, "grad_norm": 0.5917924642562866, "learning_rate": 1.1689722144956671e-06, "loss": 88.3425, "step": 172 }, { "epoch": 1.8681022880215343, "grad_norm": 0.5938112735748291, "learning_rate": 9.82875735540989e-07, "loss": 88.2656, "step": 173 }, { "epoch": 1.8788694481830417, "grad_norm": 0.6183878779411316, "learning_rate": 8.127578033998662e-07, "loss": 88.2139, "step": 174 }, { "epoch": 1.8896366083445493, "grad_norm": 0.6551972031593323, "learning_rate": 6.58673872923693e-07, "loss": 88.2028, "step": 175 }, { "epoch": 1.9004037685060564, "grad_norm": 0.6331955194473267, "learning_rate": 5.206741722181386e-07, "loss": 88.2651, "step": 176 }, { "epoch": 1.911170928667564, "grad_norm": 0.6921940445899963, "learning_rate": 3.9880368626978304e-07, "loss": 88.2543, "step": 177 }, { "epoch": 1.9219380888290714, "grad_norm": 0.6135378479957581, "learning_rate": 2.9310214228202013e-07, "loss": 88.269, "step": 178 }, { "epoch": 1.9327052489905787, "grad_norm": 0.5835920572280884, "learning_rate": 2.0360399672478824e-07, "loss": 88.267, "step": 179 }, { "epoch": 1.9434724091520863, "grad_norm": 0.5504326224327087, "learning_rate": 1.3033842410251075e-07, "loss": 88.3445, "step": 180 }, { "epoch": 1.9542395693135934, "grad_norm": 0.6638001799583435, "learning_rate": 7.332930744380906e-08, "loss": 88.2083, "step": 181 }, { "epoch": 1.965006729475101, "grad_norm": 0.69869065284729, "learning_rate": 3.259523051615254e-08, "loss": 88.262, "step": 182 }, { "epoch": 1.9757738896366084, "grad_norm": 0.6452627778053284, "learning_rate": 8.149471767937567e-09, "loss": 88.2167, "step": 183 }, { "epoch": 1.9865410497981157, "grad_norm": 0.6180316805839539, "learning_rate": 0.0, "loss": 88.3043, "step": 184 }, { "epoch": 1.9865410497981157, "eval_loss": 11.03515911102295, "eval_runtime": 0.3851, "eval_samples_per_second": 407.704, "eval_steps_per_second": 103.874, "step": 184 } ], "logging_steps": 1, "max_steps": 184, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1943745331200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }