{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9254621347644604, "eval_steps": 50, "global_step": 97, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009540846750149075, "grad_norm": 0.7367311120033264, "learning_rate": 2e-05, "loss": 3.3667, "step": 1 }, { "epoch": 0.009540846750149075, "eval_loss": 3.5421202182769775, "eval_runtime": 46.8738, "eval_samples_per_second": 7.552, "eval_steps_per_second": 1.899, "step": 1 }, { "epoch": 0.01908169350029815, "grad_norm": 0.9371811151504517, "learning_rate": 4e-05, "loss": 2.9695, "step": 2 }, { "epoch": 0.028622540250447227, "grad_norm": 0.614778459072113, "learning_rate": 6e-05, "loss": 3.2482, "step": 3 }, { "epoch": 0.0381633870005963, "grad_norm": 0.7607264518737793, "learning_rate": 8e-05, "loss": 3.3045, "step": 4 }, { "epoch": 0.04770423375074538, "grad_norm": 0.792961597442627, "learning_rate": 0.0001, "loss": 3.299, "step": 5 }, { "epoch": 0.057245080500894455, "grad_norm": 0.9246425032615662, "learning_rate": 0.00012, "loss": 3.2984, "step": 6 }, { "epoch": 0.06678592725104353, "grad_norm": 0.9336756467819214, "learning_rate": 0.00014, "loss": 3.0915, "step": 7 }, { "epoch": 0.0763267740011926, "grad_norm": 1.0689278841018677, "learning_rate": 0.00016, "loss": 3.1456, "step": 8 }, { "epoch": 0.08586762075134168, "grad_norm": 0.6929656267166138, "learning_rate": 0.00018, "loss": 2.8801, "step": 9 }, { "epoch": 0.09540846750149076, "grad_norm": 0.9856171011924744, "learning_rate": 0.0002, "loss": 2.6816, "step": 10 }, { "epoch": 0.10494931425163984, "grad_norm": 0.9085230827331543, "learning_rate": 0.0001999348095389677, "loss": 2.6598, "step": 11 }, { "epoch": 0.11449016100178891, "grad_norm": 0.8404436707496643, "learning_rate": 0.000199739323151795, "loss": 2.6531, "step": 12 }, { "epoch": 0.12403100775193798, "grad_norm": 0.8586527109146118, "learning_rate": 0.00019941379571543596, "loss": 2.6339, "step": 13 }, { "epoch": 0.13357185450208706, "grad_norm": 0.851191520690918, "learning_rate": 0.00019895865165556377, "loss": 2.5046, "step": 14 }, { "epoch": 0.14311270125223613, "grad_norm": 0.6186274886131287, "learning_rate": 0.00019837448439320027, "loss": 2.357, "step": 15 }, { "epoch": 0.1526535480023852, "grad_norm": 0.6838149428367615, "learning_rate": 0.00019766205557100868, "loss": 2.2243, "step": 16 }, { "epoch": 0.16219439475253428, "grad_norm": 1.2345480918884277, "learning_rate": 0.00019682229406025635, "loss": 2.209, "step": 17 }, { "epoch": 0.17173524150268335, "grad_norm": 0.5735088586807251, "learning_rate": 0.00019585629474974415, "loss": 2.5145, "step": 18 }, { "epoch": 0.18127608825283245, "grad_norm": 0.551199734210968, "learning_rate": 0.00019476531711828027, "loss": 2.4064, "step": 19 }, { "epoch": 0.19081693500298152, "grad_norm": 0.6080546379089355, "learning_rate": 0.0001935507835925601, "loss": 2.5871, "step": 20 }, { "epoch": 0.2003577817531306, "grad_norm": 0.6316207051277161, "learning_rate": 0.00019221427769259333, "loss": 2.4882, "step": 21 }, { "epoch": 0.20989862850327967, "grad_norm": 0.5335537195205688, "learning_rate": 0.00019075754196709572, "loss": 2.3722, "step": 22 }, { "epoch": 0.21943947525342875, "grad_norm": 0.516207754611969, "learning_rate": 0.00018918247572153823, "loss": 2.4403, "step": 23 }, { "epoch": 0.22898032200357782, "grad_norm": 0.47695738077163696, "learning_rate": 0.00018749113254181498, "loss": 2.2934, "step": 24 }, { "epoch": 0.2385211687537269, "grad_norm": 0.4737792909145355, 
"learning_rate": 0.00018568571761675893, "loss": 2.1638, "step": 25 }, { "epoch": 0.24806201550387597, "grad_norm": 0.40663591027259827, "learning_rate": 0.00018376858486299647, "loss": 2.0649, "step": 26 }, { "epoch": 0.25760286225402507, "grad_norm": 0.4352891147136688, "learning_rate": 0.00018174223385588917, "loss": 2.2987, "step": 27 }, { "epoch": 0.2671437090041741, "grad_norm": 0.4194694757461548, "learning_rate": 0.00017960930657056438, "loss": 2.272, "step": 28 }, { "epoch": 0.2766845557543232, "grad_norm": 0.5331905484199524, "learning_rate": 0.00017737258393728364, "loss": 2.3588, "step": 29 }, { "epoch": 0.28622540250447226, "grad_norm": 0.4592475891113281, "learning_rate": 0.00017503498221564025, "loss": 2.3871, "step": 30 }, { "epoch": 0.29576624925462136, "grad_norm": 0.4838412404060364, "learning_rate": 0.0001725995491923131, "loss": 2.1406, "step": 31 }, { "epoch": 0.3053070960047704, "grad_norm": 0.4136384427547455, "learning_rate": 0.00017006946020733425, "loss": 2.1532, "step": 32 }, { "epoch": 0.3148479427549195, "grad_norm": 0.41302838921546936, "learning_rate": 0.0001674480140140514, "loss": 2.1363, "step": 33 }, { "epoch": 0.32438878950506855, "grad_norm": 0.44015756249427795, "learning_rate": 0.00016473862847818277, "loss": 2.265, "step": 34 }, { "epoch": 0.33392963625521765, "grad_norm": 0.44776979088783264, "learning_rate": 0.0001619448361215723, "loss": 2.2622, "step": 35 }, { "epoch": 0.3434704830053667, "grad_norm": 0.4290170967578888, "learning_rate": 0.0001590702795164551, "loss": 2.0401, "step": 36 }, { "epoch": 0.3530113297555158, "grad_norm": 0.48702678084373474, "learning_rate": 0.00015611870653623825, "loss": 2.1099, "step": 37 }, { "epoch": 0.3625521765056649, "grad_norm": 0.5152214765548706, "learning_rate": 0.0001530939654689887, "loss": 2.1686, "step": 38 }, { "epoch": 0.37209302325581395, "grad_norm": 0.4602924585342407, "learning_rate": 0.00015000000000000001, "loss": 2.1263, "step": 39 }, { "epoch": 0.38163387000596305, "grad_norm": 0.47519952058792114, "learning_rate": 0.00014684084406997903, "loss": 1.9896, "step": 40 }, { "epoch": 0.3911747167561121, "grad_norm": 0.5167194604873657, "learning_rate": 0.00014362061661555675, "loss": 2.2722, "step": 41 }, { "epoch": 0.4007155635062612, "grad_norm": 0.48113343119621277, "learning_rate": 0.00014034351619898088, "loss": 2.292, "step": 42 }, { "epoch": 0.41025641025641024, "grad_norm": 0.484389066696167, "learning_rate": 0.00013701381553399145, "loss": 2.2551, "step": 43 }, { "epoch": 0.41979725700655934, "grad_norm": 0.4332115650177002, "learning_rate": 0.0001336358559150175, "loss": 1.9839, "step": 44 }, { "epoch": 0.4293381037567084, "grad_norm": 0.4608907401561737, "learning_rate": 0.00013021404155695725, "loss": 1.9983, "step": 45 }, { "epoch": 0.4388789505068575, "grad_norm": 0.49616098403930664, "learning_rate": 0.00012675283385292212, "loss": 2.1903, "step": 46 }, { "epoch": 0.44841979725700654, "grad_norm": 0.573137640953064, "learning_rate": 0.00012325674555743106, "loss": 2.1866, "step": 47 }, { "epoch": 0.45796064400715564, "grad_norm": 0.45796340703964233, "learning_rate": 0.00011973033490264001, "loss": 2.0213, "step": 48 }, { "epoch": 0.46750149075730474, "grad_norm": 0.5226765871047974, "learning_rate": 0.0001161781996552765, "loss": 2.1563, "step": 49 }, { "epoch": 0.4770423375074538, "grad_norm": 0.5040608644485474, "learning_rate": 0.00011260497112202895, "loss": 2.1839, "step": 50 }, { "epoch": 0.4770423375074538, "eval_loss": 2.176058769226074, "eval_runtime": 47.4353, 
"eval_samples_per_second": 7.463, "eval_steps_per_second": 1.876, "step": 50 }, { "epoch": 0.4865831842576029, "grad_norm": 0.4927314519882202, "learning_rate": 0.00010901530811120655, "loss": 2.0997, "step": 51 }, { "epoch": 0.49612403100775193, "grad_norm": 0.5394675731658936, "learning_rate": 0.00010541389085854176, "loss": 2.2713, "step": 52 }, { "epoch": 0.505664877757901, "grad_norm": 0.5610401630401611, "learning_rate": 0.00010180541492505604, "loss": 2.1373, "step": 53 }, { "epoch": 0.5152057245080501, "grad_norm": 0.5036469101905823, "learning_rate": 9.819458507494394e-05, "loss": 2.2499, "step": 54 }, { "epoch": 0.5247465712581991, "grad_norm": 0.46630653738975525, "learning_rate": 9.458610914145826e-05, "loss": 1.9732, "step": 55 }, { "epoch": 0.5342874180083482, "grad_norm": 0.47023531794548035, "learning_rate": 9.098469188879349e-05, "loss": 2.1248, "step": 56 }, { "epoch": 0.5438282647584973, "grad_norm": 0.4970254898071289, "learning_rate": 8.739502887797107e-05, "loss": 2.0411, "step": 57 }, { "epoch": 0.5533691115086464, "grad_norm": 0.5040042400360107, "learning_rate": 8.382180034472353e-05, "loss": 2.2428, "step": 58 }, { "epoch": 0.5629099582587954, "grad_norm": 0.5225110650062561, "learning_rate": 8.026966509736001e-05, "loss": 2.1539, "step": 59 }, { "epoch": 0.5724508050089445, "grad_norm": 0.5296663641929626, "learning_rate": 7.674325444256899e-05, "loss": 2.1092, "step": 60 }, { "epoch": 0.5819916517590936, "grad_norm": 0.4598320424556732, "learning_rate": 7.324716614707793e-05, "loss": 2.061, "step": 61 }, { "epoch": 0.5915324985092427, "grad_norm": 0.5481743216514587, "learning_rate": 6.978595844304271e-05, "loss": 2.0384, "step": 62 }, { "epoch": 0.6010733452593918, "grad_norm": 0.569454550743103, "learning_rate": 6.636414408498249e-05, "loss": 2.2044, "step": 63 }, { "epoch": 0.6106141920095408, "grad_norm": 0.5209865570068359, "learning_rate": 6.298618446600856e-05, "loss": 2.0685, "step": 64 }, { "epoch": 0.6201550387596899, "grad_norm": 0.5345067977905273, "learning_rate": 5.965648380101916e-05, "loss": 2.1992, "step": 65 }, { "epoch": 0.629695885509839, "grad_norm": 0.46630385518074036, "learning_rate": 5.6379383384443255e-05, "loss": 2.2016, "step": 66 }, { "epoch": 0.6392367322599881, "grad_norm": 0.5166553854942322, "learning_rate": 5.3159155930021e-05, "loss": 2.1986, "step": 67 }, { "epoch": 0.6487775790101371, "grad_norm": 0.4024442136287689, "learning_rate": 5.000000000000002e-05, "loss": 1.9141, "step": 68 }, { "epoch": 0.6583184257602862, "grad_norm": 0.5243792533874512, "learning_rate": 4.6906034531011346e-05, "loss": 2.3524, "step": 69 }, { "epoch": 0.6678592725104353, "grad_norm": 0.5335293412208557, "learning_rate": 4.388129346376178e-05, "loss": 2.1225, "step": 70 }, { "epoch": 0.6774001192605844, "grad_norm": 0.5144345164299011, "learning_rate": 4.092972048354491e-05, "loss": 2.1678, "step": 71 }, { "epoch": 0.6869409660107334, "grad_norm": 0.4751684069633484, "learning_rate": 3.80551638784277e-05, "loss": 2.1261, "step": 72 }, { "epoch": 0.6964818127608825, "grad_norm": 0.5534799098968506, "learning_rate": 3.5261371521817244e-05, "loss": 2.3748, "step": 73 }, { "epoch": 0.7060226595110316, "grad_norm": 0.48840370774269104, "learning_rate": 3.2551985985948616e-05, "loss": 2.0223, "step": 74 }, { "epoch": 0.7155635062611807, "grad_norm": 0.45086348056793213, "learning_rate": 2.993053979266577e-05, "loss": 2.0576, "step": 75 }, { "epoch": 0.7251043530113298, "grad_norm": 0.4344973862171173, "learning_rate": 2.7400450807686938e-05, "loss": 
1.9448, "step": 76 }, { "epoch": 0.7346451997614788, "grad_norm": 0.5358251333236694, "learning_rate": 2.496501778435977e-05, "loss": 2.0693, "step": 77 }, { "epoch": 0.7441860465116279, "grad_norm": 0.550864577293396, "learning_rate": 2.2627416062716366e-05, "loss": 2.2065, "step": 78 }, { "epoch": 0.753726893261777, "grad_norm": 0.594879150390625, "learning_rate": 2.0390693429435627e-05, "loss": 2.2258, "step": 79 }, { "epoch": 0.7632677400119261, "grad_norm": 0.5433897376060486, "learning_rate": 1.825776614411082e-05, "loss": 2.0444, "step": 80 }, { "epoch": 0.7728085867620751, "grad_norm": 0.5564923882484436, "learning_rate": 1.6231415137003537e-05, "loss": 2.2308, "step": 81 }, { "epoch": 0.7823494335122242, "grad_norm": 0.5145489573478699, "learning_rate": 1.4314282383241096e-05, "loss": 2.1803, "step": 82 }, { "epoch": 0.7918902802623733, "grad_norm": 0.5326076745986938, "learning_rate": 1.2508867458185037e-05, "loss": 2.1118, "step": 83 }, { "epoch": 0.8014311270125224, "grad_norm": 0.5149711966514587, "learning_rate": 1.0817524278461776e-05, "loss": 2.2356, "step": 84 }, { "epoch": 0.8109719737626714, "grad_norm": 0.545981228351593, "learning_rate": 9.242458032904311e-06, "loss": 2.0592, "step": 85 }, { "epoch": 0.8205128205128205, "grad_norm": 0.48461177945137024, "learning_rate": 7.785722307406684e-06, "loss": 2.0525, "step": 86 }, { "epoch": 0.8300536672629696, "grad_norm": 0.4771486818790436, "learning_rate": 6.4492164074399065e-06, "loss": 2.1279, "step": 87 }, { "epoch": 0.8395945140131187, "grad_norm": 0.5691129565238953, "learning_rate": 5.2346828817197655e-06, "loss": 2.2043, "step": 88 }, { "epoch": 0.8491353607632678, "grad_norm": 0.5064877867698669, "learning_rate": 4.143705250255869e-06, "loss": 2.1349, "step": 89 }, { "epoch": 0.8586762075134168, "grad_norm": 0.4522230923175812, "learning_rate": 3.1777059397436692e-06, "loss": 2.177, "step": 90 }, { "epoch": 0.8682170542635659, "grad_norm": 0.6604287028312683, "learning_rate": 2.3379444289913342e-06, "loss": 2.0478, "step": 91 }, { "epoch": 0.877757901013715, "grad_norm": 0.4626028835773468, "learning_rate": 1.6255156067997323e-06, "loss": 2.331, "step": 92 }, { "epoch": 0.8872987477638641, "grad_norm": 0.5617081522941589, "learning_rate": 1.0413483444362771e-06, "loss": 2.0815, "step": 93 }, { "epoch": 0.8968395945140131, "grad_norm": 0.48092931509017944, "learning_rate": 5.862042845640403e-07, "loss": 1.9864, "step": 94 }, { "epoch": 0.9063804412641622, "grad_norm": 0.4518766701221466, "learning_rate": 2.606768482050215e-07, "loss": 2.0202, "step": 95 }, { "epoch": 0.9159212880143113, "grad_norm": 0.5857305526733398, "learning_rate": 6.519046103230508e-08, "loss": 2.1958, "step": 96 }, { "epoch": 0.9254621347644604, "grad_norm": 0.5706104636192322, "learning_rate": 0.0, "loss": 2.1415, "step": 97 } ], "logging_steps": 1, "max_steps": 97, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.1683682015195955e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }