{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500.0, "global_step": 6237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02405002405002405, "grad_norm": 0.4139963388442993, "learning_rate": 0.00019996828714700116, "loss": 1.5971, "step": 50 }, { "epoch": 0.0481000481000481, "grad_norm": 0.3423018157482147, "learning_rate": 0.00019987316870210547, "loss": 1.274, "step": 100 }, { "epoch": 0.07215007215007214, "grad_norm": 0.3551710247993469, "learning_rate": 0.0001997147049948582, "loss": 1.2519, "step": 150 }, { "epoch": 0.0962000962000962, "grad_norm": 0.32329073548316956, "learning_rate": 0.0001994929965319844, "loss": 1.2382, "step": 200 }, { "epoch": 0.12025012025012025, "grad_norm": 0.48585018515586853, "learning_rate": 0.0001992081839336419, "loss": 1.2293, "step": 250 }, { "epoch": 0.1443001443001443, "grad_norm": 0.40136224031448364, "learning_rate": 0.00019886044784423197, "loss": 1.2214, "step": 300 }, { "epoch": 0.16835016835016836, "grad_norm": 0.574002206325531, "learning_rate": 0.00019845000881782432, "loss": 1.2184, "step": 350 }, { "epoch": 0.1924001924001924, "grad_norm": 0.4179827570915222, "learning_rate": 0.00019797712717826914, "loss": 1.2064, "step": 400 }, { "epoch": 0.21645021645021645, "grad_norm": 0.33033809065818787, "learning_rate": 0.00019744210285408488, "loss": 1.2055, "step": 450 }, { "epoch": 0.2405002405002405, "grad_norm": 0.2719138562679291, "learning_rate": 0.0001968452751882264, "loss": 1.2077, "step": 500 }, { "epoch": 0.26455026455026454, "grad_norm": 0.29797521233558655, "learning_rate": 0.00019618702272285434, "loss": 1.2096, "step": 550 }, { "epoch": 0.2886002886002886, "grad_norm": 0.3336372673511505, "learning_rate": 0.00019546776295924212, "loss": 1.2072, "step": 600 }, { "epoch": 0.3126503126503126, "grad_norm": 0.26755037903785706, "learning_rate": 0.0001946879520929728, "loss": 1.1974, "step": 650 }, { "epoch": 0.3367003367003367, "grad_norm": 0.36268576979637146, "learning_rate": 0.00019384808472459368, "loss": 1.2045, "step": 700 }, { "epoch": 0.36075036075036077, "grad_norm": 0.3121575713157654, "learning_rate": 0.0001929486935459127, "loss": 1.1889, "step": 750 }, { "epoch": 0.3848003848003848, "grad_norm": 0.3159404993057251, "learning_rate": 0.00019199034900213452, "loss": 1.1921, "step": 800 }, { "epoch": 0.40885040885040885, "grad_norm": 0.7236579060554504, "learning_rate": 0.000190973658930052, "loss": 1.194, "step": 850 }, { "epoch": 0.4329004329004329, "grad_norm": 0.24907168745994568, "learning_rate": 0.00018989926817252113, "loss": 1.191, "step": 900 }, { "epoch": 0.45695045695045694, "grad_norm": 0.24481187760829926, "learning_rate": 0.00018876785816946505, "loss": 1.1857, "step": 950 }, { "epoch": 0.481000481000481, "grad_norm": 0.2668200731277466, "learning_rate": 0.00018758014652566597, "loss": 1.1957, "step": 1000 }, { "epoch": 0.5050505050505051, "grad_norm": 0.2687171399593353, "learning_rate": 0.0001863368865556191, "loss": 1.1864, "step": 1050 }, { "epoch": 0.5291005291005291, "grad_norm": 0.23915782570838928, "learning_rate": 0.0001850388668057379, "loss": 1.184, "step": 1100 }, { "epoch": 0.5531505531505532, "grad_norm": 0.37159469723701477, "learning_rate": 0.0001836869105542127, "loss": 1.1849, "step": 1150 }, { "epoch": 0.5772005772005772, "grad_norm": 0.2752649784088135, "learning_rate": 0.0001822818752888408, "loss": 1.1843, "step": 1200 }, { "epoch": 0.6012506012506013, "grad_norm": 0.19733025133609772, "learning_rate": 0.00018082465216315882, "loss": 1.1766, "step": 1250 }, { "epoch": 0.6253006253006252, "grad_norm": 0.2180165797472, "learning_rate": 0.00017931616543122214, "loss": 1.1865, "step": 1300 }, { "epoch": 0.6493506493506493, "grad_norm": 0.25025510787963867, "learning_rate": 0.00017775737186139038, "loss": 1.1723, "step": 1350 }, { "epoch": 0.6734006734006734, "grad_norm": 0.2865007817745209, "learning_rate": 0.00017614926012949028, "loss": 1.172, "step": 1400 }, { "epoch": 0.6974506974506974, "grad_norm": 0.3406023681163788, "learning_rate": 0.00017449285019174098, "loss": 1.1795, "step": 1450 }, { "epoch": 0.7215007215007215, "grad_norm": 0.19766800105571747, "learning_rate": 0.00017278919263783978, "loss": 1.1784, "step": 1500 }, { "epoch": 0.7455507455507455, "grad_norm": 0.1965962052345276, "learning_rate": 0.00017103936802461797, "loss": 1.1754, "step": 1550 }, { "epoch": 0.7696007696007696, "grad_norm": 0.2381555736064911, "learning_rate": 0.00016924448619069023, "loss": 1.1671, "step": 1600 }, { "epoch": 0.7936507936507936, "grad_norm": 0.20156389474868774, "learning_rate": 0.00016740568555253155, "loss": 1.1738, "step": 1650 }, { "epoch": 0.8177008177008177, "grad_norm": 0.18294361233711243, "learning_rate": 0.00016552413238242857, "loss": 1.1727, "step": 1700 }, { "epoch": 0.8417508417508418, "grad_norm": 0.2975623309612274, "learning_rate": 0.00016360102006876317, "loss": 1.1677, "step": 1750 }, { "epoch": 0.8658008658008658, "grad_norm": 0.1871371865272522, "learning_rate": 0.0001616375683590974, "loss": 1.1689, "step": 1800 }, { "epoch": 0.8898508898508899, "grad_norm": 0.21457934379577637, "learning_rate": 0.00015963502258654005, "loss": 1.1605, "step": 1850 }, { "epoch": 0.9139009139009139, "grad_norm": 0.20261706411838531, "learning_rate": 0.0001575946528798853, "loss": 1.1627, "step": 1900 }, { "epoch": 0.937950937950938, "grad_norm": 0.17685186862945557, "learning_rate": 0.0001555177533580245, "loss": 1.1627, "step": 1950 }, { "epoch": 0.962000962000962, "grad_norm": 0.212468221783638, "learning_rate": 0.00015340564130914233, "loss": 1.161, "step": 2000 }, { "epoch": 0.9860509860509861, "grad_norm": 0.175174742937088, "learning_rate": 0.00015125965635521724, "loss": 1.1688, "step": 2050 }, { "epoch": 1.0101010101010102, "grad_norm": 0.19970253109931946, "learning_rate": 0.00014908115960235682, "loss": 1.142, "step": 2100 }, { "epoch": 1.034151034151034, "grad_norm": 0.21254608035087585, "learning_rate": 0.00014687153277750676, "loss": 1.1271, "step": 2150 }, { "epoch": 1.0582010582010581, "grad_norm": 0.1651500016450882, "learning_rate": 0.00014463217735208062, "loss": 1.121, "step": 2200 }, { "epoch": 1.0822510822510822, "grad_norm": 0.2405405044555664, "learning_rate": 0.00014236451365306674, "loss": 1.1313, "step": 2250 }, { "epoch": 1.1063011063011063, "grad_norm": 0.17223596572875977, "learning_rate": 0.00014006997996217593, "loss": 1.1344, "step": 2300 }, { "epoch": 1.1303511303511304, "grad_norm": 0.1969347894191742, "learning_rate": 0.00013775003160360096, "loss": 1.1176, "step": 2350 }, { "epoch": 1.1544011544011543, "grad_norm": 0.187143936753273, "learning_rate": 0.00013540614002096701, "loss": 1.1322, "step": 2400 }, { "epoch": 1.1784511784511784, "grad_norm": 0.1838238537311554, "learning_rate": 0.00013303979184405826, "loss": 1.1293, "step": 2450 }, { "epoch": 1.2025012025012025, "grad_norm": 0.17928341031074524, "learning_rate": 0.00013065248794591223, "loss": 1.1268, "step": 2500 }, { "epoch": 1.2265512265512266, "grad_norm": 0.2683047950267792, "learning_rate": 0.00012824574249088063, "loss": 1.1234, "step": 2550 }, { "epoch": 1.2506012506012505, "grad_norm": 0.18034860491752625, "learning_rate": 0.0001258210819742599, "loss": 1.125, "step": 2600 }, { "epoch": 1.2746512746512746, "grad_norm": 0.26357391476631165, "learning_rate": 0.00012338004425410074, "loss": 1.1217, "step": 2650 }, { "epoch": 1.2987012987012987, "grad_norm": 0.17828579246997833, "learning_rate": 0.00012092417757581085, "loss": 1.1262, "step": 2700 }, { "epoch": 1.3227513227513228, "grad_norm": 0.20247310400009155, "learning_rate": 0.00011845503959016928, "loss": 1.1246, "step": 2750 }, { "epoch": 1.3468013468013469, "grad_norm": 0.17381271719932556, "learning_rate": 0.0001159741963653755, "loss": 1.1181, "step": 2800 }, { "epoch": 1.370851370851371, "grad_norm": 0.19958114624023438, "learning_rate": 0.00011348322139375948, "loss": 1.1307, "step": 2850 }, { "epoch": 1.3949013949013949, "grad_norm": 0.21912401914596558, "learning_rate": 0.00011098369459378328, "loss": 1.1264, "step": 2900 }, { "epoch": 1.418951418951419, "grad_norm": 0.1694297194480896, "learning_rate": 0.00010847720130796631, "loss": 1.1256, "step": 2950 }, { "epoch": 1.443001443001443, "grad_norm": 0.13446395099163055, "learning_rate": 0.00010596533129737092, "loss": 1.1258, "step": 3000 }, { "epoch": 1.467051467051467, "grad_norm": 0.140371173620224, "learning_rate": 0.00010344967773328507, "loss": 1.1191, "step": 3050 }, { "epoch": 1.491101491101491, "grad_norm": 0.18016813695430756, "learning_rate": 0.00010093183618674224, "loss": 1.114, "step": 3100 }, { "epoch": 1.5151515151515151, "grad_norm": 0.17306862771511078, "learning_rate": 9.84134036165192e-05, "loss": 1.1149, "step": 3150 }, { "epoch": 1.5392015392015392, "grad_norm": 0.14116255939006805, "learning_rate": 9.589597735625377e-05, "loss": 1.123, "step": 3200 }, { "epoch": 1.5632515632515633, "grad_norm": 0.16819800436496735, "learning_rate": 9.338115410132441e-05, "loss": 1.1203, "step": 3250 }, { "epoch": 1.5873015873015874, "grad_norm": 0.21958529949188232, "learning_rate": 9.087052889613518e-05, "loss": 1.1226, "step": 3300 }, { "epoch": 1.6113516113516113, "grad_norm": 0.15786272287368774, "learning_rate": 8.836569412244745e-05, "loss": 1.1212, "step": 3350 }, { "epoch": 1.6354016354016354, "grad_norm": 0.17366796731948853, "learning_rate": 8.586823848940047e-05, "loss": 1.1129, "step": 3400 }, { "epoch": 1.6594516594516593, "grad_norm": 0.21448016166687012, "learning_rate": 8.337974602586152e-05, "loss": 1.1216, "step": 3450 }, { "epoch": 1.6835016835016834, "grad_norm": 0.17243099212646484, "learning_rate": 8.090179507574427e-05, "loss": 1.1096, "step": 3500 }, { "epoch": 1.7075517075517075, "grad_norm": 0.1429734081029892, "learning_rate": 7.843595729693316e-05, "loss": 1.1071, "step": 3550 }, { "epoch": 1.7316017316017316, "grad_norm": 0.15200386941432953, "learning_rate": 7.598379666444808e-05, "loss": 1.1158, "step": 3600 }, { "epoch": 1.7556517556517557, "grad_norm": 0.1442406326532364, "learning_rate": 7.354686847848242e-05, "loss": 1.112, "step": 3650 }, { "epoch": 1.7797017797017798, "grad_norm": 0.17678239941596985, "learning_rate": 7.11267183779428e-05, "loss": 1.1118, "step": 3700 }, { "epoch": 1.8037518037518039, "grad_norm": 0.147593155503273, "learning_rate": 6.872488136011667e-05, "loss": 1.1165, "step": 3750 }, { "epoch": 1.8278018278018278, "grad_norm": 0.1334652155637741, "learning_rate": 6.634288080708952e-05, "loss": 1.1135, "step": 3800 }, { "epoch": 1.8518518518518519, "grad_norm": 0.14890378713607788, "learning_rate": 6.398222751952899e-05, "loss": 1.1086, "step": 3850 }, { "epoch": 1.8759018759018757, "grad_norm": 0.1334807574748993, "learning_rate": 6.164441875844882e-05, "loss": 1.1144, "step": 3900 }, { "epoch": 1.8999518999518998, "grad_norm": 0.12897680699825287, "learning_rate": 5.933093729556062e-05, "loss": 1.1116, "step": 3950 }, { "epoch": 1.924001924001924, "grad_norm": 0.17530564963817596, "learning_rate": 5.7043250472815356e-05, "loss": 1.1039, "step": 4000 }, { "epoch": 1.948051948051948, "grad_norm": 0.15966495871543884, "learning_rate": 5.478280927173145e-05, "loss": 1.101, "step": 4050 }, { "epoch": 1.9721019721019721, "grad_norm": 0.18890446424484253, "learning_rate": 5.255104739309924e-05, "loss": 1.1077, "step": 4100 }, { "epoch": 1.9961519961519962, "grad_norm": 0.1547369807958603, "learning_rate": 5.0349380347646494e-05, "loss": 1.103, "step": 4150 }, { "epoch": 2.0202020202020203, "grad_norm": 0.13888758420944214, "learning_rate": 4.8179204558240444e-05, "loss": 1.0826, "step": 4200 }, { "epoch": 2.0442520442520444, "grad_norm": 0.11266086250543594, "learning_rate": 4.6041896474197e-05, "loss": 1.071, "step": 4250 }, { "epoch": 2.068302068302068, "grad_norm": 0.14245671033859253, "learning_rate": 4.393881169825779e-05, "loss": 1.0759, "step": 4300 }, { "epoch": 2.092352092352092, "grad_norm": 0.1226249411702156, "learning_rate": 4.187128412678969e-05, "loss": 1.0742, "step": 4350 }, { "epoch": 2.1164021164021163, "grad_norm": 0.12307476997375488, "learning_rate": 3.984062510375155e-05, "loss": 1.0721, "step": 4400 }, { "epoch": 2.1404521404521404, "grad_norm": 0.12813834846019745, "learning_rate": 3.7848122588965144e-05, "loss": 1.0726, "step": 4450 }, { "epoch": 2.1645021645021645, "grad_norm": 0.13432885706424713, "learning_rate": 3.5895040341217543e-05, "loss": 1.0745, "step": 4500 }, { "epoch": 2.1885521885521886, "grad_norm": 0.11649097502231598, "learning_rate": 3.398261711671309e-05, "loss": 1.079, "step": 4550 }, { "epoch": 2.2126022126022127, "grad_norm": 0.11140163242816925, "learning_rate": 3.211206588338358e-05, "loss": 1.0748, "step": 4600 }, { "epoch": 2.236652236652237, "grad_norm": 0.10978424549102783, "learning_rate": 3.028457305155483e-05, "loss": 1.0726, "step": 4650 }, { "epoch": 2.260702260702261, "grad_norm": 0.11395589262247086, "learning_rate": 2.8501297721457422e-05, "loss": 1.0656, "step": 4700 }, { "epoch": 2.284752284752285, "grad_norm": 0.10599405318498611, "learning_rate": 2.6763370948059353e-05, "loss": 1.0765, "step": 4750 }, { "epoch": 2.3088023088023086, "grad_norm": 0.11157254874706268, "learning_rate": 2.5071895023686442e-05, "loss": 1.0726, "step": 4800 }, { "epoch": 2.3328523328523327, "grad_norm": 0.1390163153409958, "learning_rate": 2.342794277888547e-05, "loss": 1.0731, "step": 4850 }, { "epoch": 2.356902356902357, "grad_norm": 0.1519329994916916, "learning_rate": 2.1832556901973965e-05, "loss": 1.0704, "step": 4900 }, { "epoch": 2.380952380952381, "grad_norm": 0.1278182566165924, "learning_rate": 2.0286749277707782e-05, "loss": 1.0661, "step": 4950 }, { "epoch": 2.405002405002405, "grad_norm": 0.10508263111114502, "learning_rate": 1.879150034548588e-05, "loss": 1.0758, "step": 5000 }, { "epoch": 2.429052429052429, "grad_norm": 0.09690719097852707, "learning_rate": 1.7347758477500044e-05, "loss": 1.0644, "step": 5050 }, { "epoch": 2.4531024531024532, "grad_norm": 0.10174595564603806, "learning_rate": 1.5956439377222798e-05, "loss": 1.0726, "step": 5100 }, { "epoch": 2.4771524771524773, "grad_norm": 0.10294167697429657, "learning_rate": 1.4618425498616162e-05, "loss": 1.0655, "step": 5150 }, { "epoch": 2.501202501202501, "grad_norm": 0.11103129386901855, "learning_rate": 1.3334565486428996e-05, "loss": 1.0651, "step": 5200 }, { "epoch": 2.525252525252525, "grad_norm": 0.10614852607250214, "learning_rate": 1.2105673637938053e-05, "loss": 1.0701, "step": 5250 }, { "epoch": 2.549302549302549, "grad_norm": 0.09437720477581024, "learning_rate": 1.0932529386474188e-05, "loss": 1.0673, "step": 5300 }, { "epoch": 2.5733525733525733, "grad_norm": 0.0965106412768364, "learning_rate": 9.815876807061264e-06, "loss": 1.0769, "step": 5350 }, { "epoch": 2.5974025974025974, "grad_norm": 0.09335634112358093, "learning_rate": 8.756424144481312e-06, "loss": 1.0646, "step": 5400 }, { "epoch": 2.6214526214526215, "grad_norm": 0.09890544414520264, "learning_rate": 7.75484336406529e-06, "loss": 1.0757, "step": 5450 }, { "epoch": 2.6455026455026456, "grad_norm": 0.09670912474393845, "learning_rate": 6.8117697254943106e-06, "loss": 1.0668, "step": 5500 }, { "epoch": 2.6695526695526697, "grad_norm": 0.09898468106985092, "learning_rate": 5.927801379881714e-06, "loss": 1.0745, "step": 5550 }, { "epoch": 2.6936026936026938, "grad_norm": 0.08697386831045151, "learning_rate": 5.103498990391509e-06, "loss": 1.0653, "step": 5600 }, { "epoch": 2.717652717652718, "grad_norm": 0.09457134455442429, "learning_rate": 4.339385376633775e-06, "loss": 1.0678, "step": 5650 }, { "epoch": 2.741702741702742, "grad_norm": 0.09092475473880768, "learning_rate": 3.6359451830626723e-06, "loss": 1.0635, "step": 5700 }, { "epoch": 2.7657527657527656, "grad_norm": 0.08736653625965118, "learning_rate": 2.993624571587239e-06, "loss": 1.0639, "step": 5750 }, { "epoch": 2.7898027898027897, "grad_norm": 0.09138292819261551, "learning_rate": 2.4128309385900717e-06, "loss": 1.065, "step": 5800 }, { "epoch": 2.813852813852814, "grad_norm": 0.08842656016349792, "learning_rate": 1.8939326565333037e-06, "loss": 1.0636, "step": 5850 }, { "epoch": 2.837902837902838, "grad_norm": 0.08870802819728851, "learning_rate": 1.437258840315714e-06, "loss": 1.0706, "step": 5900 }, { "epoch": 2.861952861952862, "grad_norm": 0.08659425377845764, "learning_rate": 1.0430991385293575e-06, "loss": 1.0673, "step": 5950 }, { "epoch": 2.886002886002886, "grad_norm": 0.08142086863517761, "learning_rate": 7.117035497478553e-07, "loss": 1.0697, "step": 6000 }, { "epoch": 2.91005291005291, "grad_norm": 0.080448217689991, "learning_rate": 4.432822639630407e-07, "loss": 1.0655, "step": 6050 }, { "epoch": 2.934102934102934, "grad_norm": 0.08980288356542587, "learning_rate": 2.380055292704575e-07, "loss": 1.0701, "step": 6100 }, { "epoch": 2.958152958152958, "grad_norm": 0.08309097588062286, "learning_rate": 9.600354388833443e-08, "loss": 1.0684, "step": 6150 }, { "epoch": 2.982202982202982, "grad_norm": 0.08456841111183167, "learning_rate": 1.7366373578442397e-08, "loss": 1.0684, "step": 6200 } ], "logging_steps": 50, "max_steps": 6237, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.056700790948663e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }