{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9974768713204374, "eval_steps": 500, "global_step": 1188, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001682085786375105, "grad_norm": 0.584265834445358, "learning_rate": 2.521008403361344e-06, "loss": 1.3366, "step": 1 }, { "epoch": 0.008410428931875526, "grad_norm": 0.4317387157455665, "learning_rate": 1.2605042016806723e-05, "loss": 1.2402, "step": 5 }, { "epoch": 0.01682085786375105, "grad_norm": 0.2866682001722115, "learning_rate": 2.5210084033613446e-05, "loss": 1.2481, "step": 10 }, { "epoch": 0.025231286795626577, "grad_norm": 0.2291349749029046, "learning_rate": 3.7815126050420166e-05, "loss": 1.1829, "step": 15 }, { "epoch": 0.0336417157275021, "grad_norm": 0.26272617568016177, "learning_rate": 5.042016806722689e-05, "loss": 1.0796, "step": 20 }, { "epoch": 0.04205214465937763, "grad_norm": 0.17879426310992677, "learning_rate": 6.302521008403361e-05, "loss": 0.9861, "step": 25 }, { "epoch": 0.050462573591253154, "grad_norm": 0.1372088242876512, "learning_rate": 7.563025210084033e-05, "loss": 0.9688, "step": 30 }, { "epoch": 0.05887300252312868, "grad_norm": 0.11605937851559196, "learning_rate": 8.823529411764705e-05, "loss": 0.9683, "step": 35 }, { "epoch": 0.0672834314550042, "grad_norm": 0.12589057657439554, "learning_rate": 0.00010084033613445378, "loss": 0.948, "step": 40 }, { "epoch": 0.07569386038687972, "grad_norm": 0.1652030960687282, "learning_rate": 0.00011344537815126049, "loss": 0.9155, "step": 45 }, { "epoch": 0.08410428931875526, "grad_norm": 0.15080610702585565, "learning_rate": 0.00012605042016806722, "loss": 0.9278, "step": 50 }, { "epoch": 0.09251471825063078, "grad_norm": 0.13357435857967218, "learning_rate": 0.00013865546218487396, "loss": 0.9023, "step": 55 }, { "epoch": 0.10092514718250631, "grad_norm": 0.19497187993777446, "learning_rate": 0.00015126050420168066, "loss": 0.8778, "step": 60 }, { "epoch": 0.10933557611438183, "grad_norm": 0.1976656903261847, "learning_rate": 0.00016386554621848737, "loss": 0.9507, "step": 65 }, { "epoch": 0.11774600504625736, "grad_norm": 0.1403114707521467, "learning_rate": 0.0001764705882352941, "loss": 0.9164, "step": 70 }, { "epoch": 0.1261564339781329, "grad_norm": 0.14099777956857235, "learning_rate": 0.0001890756302521008, "loss": 0.9534, "step": 75 }, { "epoch": 0.1345668629100084, "grad_norm": 0.15889633248603474, "learning_rate": 0.00020168067226890757, "loss": 0.9315, "step": 80 }, { "epoch": 0.14297729184188393, "grad_norm": 0.13905096593977556, "learning_rate": 0.00021428571428571427, "loss": 0.9019, "step": 85 }, { "epoch": 0.15138772077375945, "grad_norm": 0.1248251895789866, "learning_rate": 0.00022689075630252098, "loss": 1.0007, "step": 90 }, { "epoch": 0.159798149705635, "grad_norm": 0.12856969759106154, "learning_rate": 0.00023949579831932771, "loss": 0.9706, "step": 95 }, { "epoch": 0.16820857863751051, "grad_norm": 0.2451599584066991, "learning_rate": 0.00025210084033613445, "loss": 0.9457, "step": 100 }, { "epoch": 0.17661900756938603, "grad_norm": 0.16131565399243042, "learning_rate": 0.00026470588235294115, "loss": 0.9624, "step": 105 }, { "epoch": 0.18502943650126155, "grad_norm": 0.11974149003499743, "learning_rate": 0.0002773109243697479, "loss": 0.943, "step": 110 }, { "epoch": 0.1934398654331371, "grad_norm": 0.4663750210183581, "learning_rate": 0.0002899159663865546, "loss": 0.9683, "step": 115 }, { "epoch": 0.20185029436501262, "grad_norm": 0.315237934726779, "learning_rate": 0.00029999935225318556, "loss": 0.9765, "step": 120 }, { "epoch": 0.21026072329688814, "grad_norm": 41.12896492268303, "learning_rate": 0.00029997668170208376, "loss": 2.4906, "step": 125 }, { "epoch": 0.21867115222876365, "grad_norm": 86.97194601878952, "learning_rate": 0.0002999216294043922, "loss": 4.2426, "step": 130 }, { "epoch": 0.2270815811606392, "grad_norm": 40.145574165614974, "learning_rate": 0.0002998342072465558, "loss": 5.285, "step": 135 }, { "epoch": 0.23549201009251472, "grad_norm": 36.0169334865568, "learning_rate": 0.0002997144341040567, "loss": 5.7741, "step": 140 }, { "epoch": 0.24390243902439024, "grad_norm": 15.431132077300076, "learning_rate": 0.0002995623358373386, "loss": 7.9614, "step": 145 }, { "epoch": 0.2523128679562658, "grad_norm": 6.581926090085861, "learning_rate": 0.0002993779452862235, "loss": 4.2109, "step": 150 }, { "epoch": 0.2607232968881413, "grad_norm": 7.662219825964168, "learning_rate": 0.0002991613022628211, "loss": 3.4552, "step": 155 }, { "epoch": 0.2691337258200168, "grad_norm": 2.538849805479563, "learning_rate": 0.00029891245354293284, "loss": 2.1775, "step": 160 }, { "epoch": 0.27754415475189237, "grad_norm": 2.0222241832932824, "learning_rate": 0.0002986314528559525, "loss": 2.0086, "step": 165 }, { "epoch": 0.28595458368376786, "grad_norm": 1.6691520526281935, "learning_rate": 0.0002983183608732653, "loss": 1.618, "step": 170 }, { "epoch": 0.2943650126156434, "grad_norm": 0.8472051268719076, "learning_rate": 0.00029797324519514835, "loss": 1.4006, "step": 175 }, { "epoch": 0.3027754415475189, "grad_norm": 1.2656793919009501, "learning_rate": 0.0002975961803361749, "loss": 1.2361, "step": 180 }, { "epoch": 0.31118587047939444, "grad_norm": 16.91854783030072, "learning_rate": 0.00029718724770912575, "loss": 1.3024, "step": 185 }, { "epoch": 0.31959629941127, "grad_norm": 0.4264421791196419, "learning_rate": 0.00029674653560741125, "loss": 1.2247, "step": 190 }, { "epoch": 0.3280067283431455, "grad_norm": 11.819413233995139, "learning_rate": 0.00029627413918600773, "loss": 1.5927, "step": 195 }, { "epoch": 0.33641715727502103, "grad_norm": 0.3872518309676493, "learning_rate": 0.0002957701604409124, "loss": 1.2533, "step": 200 }, { "epoch": 0.3448275862068966, "grad_norm": 2.0631756640177508, "learning_rate": 0.0002952347081871212, "loss": 1.1339, "step": 205 }, { "epoch": 0.35323801513877207, "grad_norm": 0.2331543310806729, "learning_rate": 0.00029466789803513435, "loss": 1.0706, "step": 210 }, { "epoch": 0.3616484440706476, "grad_norm": 0.20061594850021877, "learning_rate": 0.0002940698523659947, "loss": 1.1093, "step": 215 }, { "epoch": 0.3700588730025231, "grad_norm": 0.17861349516130504, "learning_rate": 0.0002934407003048641, "loss": 1.1008, "step": 220 }, { "epoch": 0.37846930193439865, "grad_norm": 0.11836066059443685, "learning_rate": 0.00029278057769314384, "loss": 1.045, "step": 225 }, { "epoch": 0.3868797308662742, "grad_norm": 0.18874539756549927, "learning_rate": 0.00029208962705914505, "loss": 1.0056, "step": 230 }, { "epoch": 0.3952901597981497, "grad_norm": 0.45365739150608037, "learning_rate": 0.00029136799758731473, "loss": 0.9995, "step": 235 }, { "epoch": 0.40370058873002523, "grad_norm": 0.12762727857651687, "learning_rate": 0.00029061584508602545, "loss": 1.006, "step": 240 }, { "epoch": 0.4121110176619008, "grad_norm": 4.885346324632363, "learning_rate": 0.0002898333319539341, "loss": 1.3006, "step": 245 }, { "epoch": 0.42052144659377627, "grad_norm": 0.6018215992100235, "learning_rate": 0.0002890206271449186, "loss": 1.0966, "step": 250 }, { "epoch": 0.4289318755256518, "grad_norm": 0.4059499155938192, "learning_rate": 0.00028817790613159817, "loss": 1.0764, "step": 255 }, { "epoch": 0.4373423044575273, "grad_norm": 7.707406865225783, "learning_rate": 0.0002873053508674471, "loss": 1.1362, "step": 260 }, { "epoch": 0.44575273338940286, "grad_norm": 0.22108881306250477, "learning_rate": 0.00028640314974750884, "loss": 1.0774, "step": 265 }, { "epoch": 0.4541631623212784, "grad_norm": 3.3347201885551807, "learning_rate": 0.00028547149756771894, "loss": 1.1651, "step": 270 }, { "epoch": 0.4625735912531539, "grad_norm": 0.34254879266979626, "learning_rate": 0.00028451059548284665, "loss": 1.1397, "step": 275 }, { "epoch": 0.47098402018502944, "grad_norm": 0.1735673338436184, "learning_rate": 0.00028352065096306307, "loss": 1.0421, "step": 280 }, { "epoch": 0.479394449116905, "grad_norm": 0.16104913537338722, "learning_rate": 0.0002825018777491458, "loss": 1.0461, "step": 285 }, { "epoch": 0.4878048780487805, "grad_norm": 0.1831953506941342, "learning_rate": 0.00028145449580632996, "loss": 0.9887, "step": 290 }, { "epoch": 0.496215306980656, "grad_norm": 0.13125693879950318, "learning_rate": 0.0002803787312768149, "loss": 0.9847, "step": 295 }, { "epoch": 0.5046257359125316, "grad_norm": 0.45943182895269363, "learning_rate": 0.00027927481643093754, "loss": 1.0187, "step": 300 }, { "epoch": 0.5130361648444071, "grad_norm": 1.177847689359211, "learning_rate": 0.0002781429896170223, "loss": 1.0201, "step": 305 }, { "epoch": 0.5214465937762826, "grad_norm": 11.730278476509266, "learning_rate": 0.0002769834952099191, "loss": 1.2084, "step": 310 }, { "epoch": 0.5298570227081582, "grad_norm": 0.26427554458970376, "learning_rate": 0.0002757965835582397, "loss": 1.102, "step": 315 }, { "epoch": 0.5382674516400336, "grad_norm": 0.45790641000811105, "learning_rate": 0.0002745825109303045, "loss": 1.0614, "step": 320 }, { "epoch": 0.5466778805719091, "grad_norm": 0.1621409545630021, "learning_rate": 0.0002733415394588114, "loss": 1.0228, "step": 325 }, { "epoch": 0.5550883095037847, "grad_norm": 0.11351840128403332, "learning_rate": 0.0002720739370842379, "loss": 0.9773, "step": 330 }, { "epoch": 0.5634987384356602, "grad_norm": 0.13847315137184252, "learning_rate": 0.0002707799774969897, "loss": 1.0054, "step": 335 }, { "epoch": 0.5719091673675357, "grad_norm": 0.10777953928312381, "learning_rate": 0.0002694599400783078, "loss": 0.9851, "step": 340 }, { "epoch": 0.5803195962994113, "grad_norm": 0.1256406052654263, "learning_rate": 0.00026811410983994667, "loss": 1.0163, "step": 345 }, { "epoch": 0.5887300252312868, "grad_norm": 0.16866862756321335, "learning_rate": 0.00026674277736263687, "loss": 1.0335, "step": 350 }, { "epoch": 0.5971404541631623, "grad_norm": 0.14265754669110667, "learning_rate": 0.0002653462387333451, "loss": 0.9956, "step": 355 }, { "epoch": 0.6055508830950378, "grad_norm": 0.5819454877222049, "learning_rate": 0.0002639247954813458, "loss": 1.0263, "step": 360 }, { "epoch": 0.6139613120269134, "grad_norm": 0.13535505190898894, "learning_rate": 0.0002624787545131169, "loss": 0.9753, "step": 365 }, { "epoch": 0.6223717409587889, "grad_norm": 1.0748063857926706, "learning_rate": 0.0002610084280460756, "loss": 0.9945, "step": 370 }, { "epoch": 0.6307821698906644, "grad_norm": 0.14560862560154247, "learning_rate": 0.00025951413354116665, "loss": 0.988, "step": 375 }, { "epoch": 0.63919259882254, "grad_norm": 0.1246463854830709, "learning_rate": 0.0002579961936343188, "loss": 0.9658, "step": 380 }, { "epoch": 0.6476030277544155, "grad_norm": 0.10713906349411657, "learning_rate": 0.00025645493606678375, "loss": 0.9366, "step": 385 }, { "epoch": 0.656013456686291, "grad_norm": 0.1113038542351718, "learning_rate": 0.00025489069361437326, "loss": 0.9758, "step": 390 }, { "epoch": 0.6644238856181666, "grad_norm": 1.3831640086044374, "learning_rate": 0.00025330380401560846, "loss": 0.9575, "step": 395 }, { "epoch": 0.6728343145500421, "grad_norm": 0.11166426611159094, "learning_rate": 0.0002516946098987985, "loss": 0.963, "step": 400 }, { "epoch": 0.6812447434819175, "grad_norm": 1.0002894310288515, "learning_rate": 0.0002500634587080628, "loss": 0.9299, "step": 405 }, { "epoch": 0.6896551724137931, "grad_norm": 0.1755490764589915, "learning_rate": 0.0002484107026283137, "loss": 0.9814, "step": 410 }, { "epoch": 0.6980656013456686, "grad_norm": 4.147990806845651, "learning_rate": 0.00024673669850921575, "loss": 1.0252, "step": 415 }, { "epoch": 0.7064760302775441, "grad_norm": 0.14199796426249658, "learning_rate": 0.0002450418077881374, "loss": 0.9578, "step": 420 }, { "epoch": 0.7148864592094197, "grad_norm": 0.10967054022758699, "learning_rate": 0.0002433263964121127, "loss": 0.9543, "step": 425 }, { "epoch": 0.7232968881412952, "grad_norm": 0.10681488534249564, "learning_rate": 0.00024159083475882854, "loss": 0.947, "step": 430 }, { "epoch": 0.7317073170731707, "grad_norm": 0.0836144209353973, "learning_rate": 0.00023983549755665623, "loss": 0.966, "step": 435 }, { "epoch": 0.7401177460050462, "grad_norm": 6.618428479377572, "learning_rate": 0.00023806076380374262, "loss": 0.9755, "step": 440 }, { "epoch": 0.7485281749369218, "grad_norm": 0.3700536075693679, "learning_rate": 0.00023626701668618048, "loss": 0.9439, "step": 445 }, { "epoch": 0.7569386038687973, "grad_norm": 0.1339452802214274, "learning_rate": 0.00023445464349527363, "loss": 0.9393, "step": 450 }, { "epoch": 0.7653490328006728, "grad_norm": 0.15063483951074022, "learning_rate": 0.00023262403554391643, "loss": 0.9561, "step": 455 }, { "epoch": 0.7737594617325484, "grad_norm": 0.12236138289386866, "learning_rate": 0.0002307755880821044, "loss": 0.9757, "step": 460 }, { "epoch": 0.7821698906644239, "grad_norm": 0.09859340116113559, "learning_rate": 0.00022890970021159545, "loss": 0.9699, "step": 465 }, { "epoch": 0.7905803195962994, "grad_norm": 0.3134759911324595, "learning_rate": 0.00022702677479973857, "loss": 0.9387, "step": 470 }, { "epoch": 0.798990748528175, "grad_norm": 0.12183559177618263, "learning_rate": 0.00022512721839249044, "loss": 0.8985, "step": 475 }, { "epoch": 0.8074011774600505, "grad_norm": 0.08229469965958389, "learning_rate": 0.00022321144112663708, "loss": 0.9504, "step": 480 }, { "epoch": 0.815811606391926, "grad_norm": 0.14924593232770486, "learning_rate": 0.00022127985664124048, "loss": 0.9338, "step": 485 }, { "epoch": 0.8242220353238016, "grad_norm": 0.2786737248586552, "learning_rate": 0.0002193328819883292, "loss": 1.0327, "step": 490 }, { "epoch": 0.832632464255677, "grad_norm": 0.1485555431366647, "learning_rate": 0.00021737093754285147, "loss": 0.9499, "step": 495 }, { "epoch": 0.8410428931875525, "grad_norm": 0.10549366930700854, "learning_rate": 0.00021539444691191174, "loss": 0.8961, "step": 500 }, { "epoch": 0.8494533221194281, "grad_norm": 0.2501853130288003, "learning_rate": 0.0002134038368433085, "loss": 0.973, "step": 505 }, { "epoch": 0.8578637510513036, "grad_norm": 0.08142359041013753, "learning_rate": 0.00021139953713339454, "loss": 0.9262, "step": 510 }, { "epoch": 0.8662741799831791, "grad_norm": 0.1101621057008162, "learning_rate": 0.00020938198053427885, "loss": 0.9462, "step": 515 }, { "epoch": 0.8746846089150546, "grad_norm": 0.08929629784060397, "learning_rate": 0.00020735160266039006, "loss": 0.9227, "step": 520 }, { "epoch": 0.8830950378469302, "grad_norm": 0.15490402519303073, "learning_rate": 0.00020530884189442244, "loss": 0.9077, "step": 525 }, { "epoch": 0.8915054667788057, "grad_norm": 0.0899609096113038, "learning_rate": 0.00020325413929268369, "loss": 0.9309, "step": 530 }, { "epoch": 0.8999158957106812, "grad_norm": 0.08860120877271818, "learning_rate": 0.00020118793848986554, "loss": 0.9581, "step": 535 }, { "epoch": 0.9083263246425568, "grad_norm": 0.08667002021610543, "learning_rate": 0.00019911068560325804, "loss": 0.8893, "step": 540 }, { "epoch": 0.9167367535744323, "grad_norm": 0.1020909544402857, "learning_rate": 0.00019702282913642723, "loss": 0.8789, "step": 545 }, { "epoch": 0.9251471825063078, "grad_norm": 0.8881918960589515, "learning_rate": 0.00019492481988237818, "loss": 1.0281, "step": 550 }, { "epoch": 0.9335576114381834, "grad_norm": 1.7309452103787781, "learning_rate": 0.00019281711082622314, "loss": 1.5781, "step": 555 }, { "epoch": 0.9419680403700589, "grad_norm": 0.9824981029734958, "learning_rate": 0.000190700157047377, "loss": 1.9042, "step": 560 }, { "epoch": 0.9503784693019344, "grad_norm": 30.47237068603654, "learning_rate": 0.0001885744156212999, "loss": 2.2642, "step": 565 }, { "epoch": 0.95878889823381, "grad_norm": 13.132455734206328, "learning_rate": 0.0001864403455208094, "loss": 2.0529, "step": 570 }, { "epoch": 0.9671993271656855, "grad_norm": 1.5514837112811664, "learning_rate": 0.00018429840751698284, "loss": 1.882, "step": 575 }, { "epoch": 0.975609756097561, "grad_norm": 0.7978250850775721, "learning_rate": 0.00018214906407967136, "loss": 1.0936, "step": 580 }, { "epoch": 0.9840201850294366, "grad_norm": 0.24502338468242643, "learning_rate": 0.00017999277927764696, "loss": 0.9768, "step": 585 }, { "epoch": 0.992430613961312, "grad_norm": 0.14072495383675704, "learning_rate": 0.00017783001867840488, "loss": 0.991, "step": 590 }, { "epoch": 1.0, "grad_norm": 0.14826034089833667, "learning_rate": 0.00017566124924764176, "loss": 0.9232, "step": 595 }, { "epoch": 1.0084104289318756, "grad_norm": 0.1423011247047076, "learning_rate": 0.00017348693924843238, "loss": 0.7383, "step": 600 }, { "epoch": 1.016820857863751, "grad_norm": 18.945959105143892, "learning_rate": 0.00017130755814012607, "loss": 0.8006, "step": 605 }, { "epoch": 1.0252312867956266, "grad_norm": 0.2063458884840928, "learning_rate": 0.0001691235764769848, "loss": 0.7464, "step": 610 }, { "epoch": 1.0336417157275022, "grad_norm": 0.10576479121191024, "learning_rate": 0.00016693546580658493, "loss": 0.7066, "step": 615 }, { "epoch": 1.0420521446593776, "grad_norm": 0.10366396212312518, "learning_rate": 0.00016474369856800457, "loss": 0.7339, "step": 620 }, { "epoch": 1.0504625735912532, "grad_norm": 0.09658540475190697, "learning_rate": 0.00016254874798981835, "loss": 0.7111, "step": 625 }, { "epoch": 1.0588730025231288, "grad_norm": 0.08877086888965659, "learning_rate": 0.00016035108798792165, "loss": 0.71, "step": 630 }, { "epoch": 1.0672834314550042, "grad_norm": 0.10967646061522097, "learning_rate": 0.00015815119306320657, "loss": 0.7296, "step": 635 }, { "epoch": 1.0756938603868798, "grad_norm": 0.0933812784388927, "learning_rate": 0.0001559495381991117, "loss": 0.7361, "step": 640 }, { "epoch": 1.0841042893187554, "grad_norm": 0.19214296231083186, "learning_rate": 0.00015374659875906752, "loss": 0.7134, "step": 645 }, { "epoch": 1.0925147182506307, "grad_norm": 0.11688119487848063, "learning_rate": 0.00015154285038385937, "loss": 0.6893, "step": 650 }, { "epoch": 1.1009251471825063, "grad_norm": 0.09391812895085212, "learning_rate": 0.00014933876888893164, "loss": 0.6963, "step": 655 }, { "epoch": 1.1093355761143817, "grad_norm": 0.10077136979307887, "learning_rate": 0.0001471348301616531, "loss": 0.7436, "step": 660 }, { "epoch": 1.1177460050462573, "grad_norm": 0.08225000161444689, "learning_rate": 0.00014493151005856724, "loss": 0.7004, "step": 665 }, { "epoch": 1.126156433978133, "grad_norm": 0.08801832379105524, "learning_rate": 0.00014272928430264926, "loss": 0.722, "step": 670 }, { "epoch": 1.1345668629100083, "grad_norm": 0.07018288331194043, "learning_rate": 0.00014052862838059195, "loss": 0.6862, "step": 675 }, { "epoch": 1.142977291841884, "grad_norm": 0.0767198314720291, "learning_rate": 0.00013833001744014212, "loss": 0.7101, "step": 680 }, { "epoch": 1.1513877207737595, "grad_norm": 0.07407300906943415, "learning_rate": 0.00013613392618751086, "loss": 0.7175, "step": 685 }, { "epoch": 1.1597981497056349, "grad_norm": 0.07352937678988827, "learning_rate": 0.00013394082878487884, "loss": 0.7004, "step": 690 }, { "epoch": 1.1682085786375105, "grad_norm": 0.07551179793852605, "learning_rate": 0.00013175119874801874, "loss": 0.7054, "step": 695 }, { "epoch": 1.176619007569386, "grad_norm": 0.07052660785539094, "learning_rate": 0.000129565508844058, "loss": 0.6789, "step": 700 }, { "epoch": 1.1850294365012615, "grad_norm": 0.0807185011796273, "learning_rate": 0.00012738423098940244, "loss": 0.6873, "step": 705 }, { "epoch": 1.193439865433137, "grad_norm": 0.1801375449127257, "learning_rate": 0.0001252078361478441, "loss": 0.7115, "step": 710 }, { "epoch": 1.2018502943650127, "grad_norm": 0.07403831159848352, "learning_rate": 0.00012303679422887457, "loss": 0.6882, "step": 715 }, { "epoch": 1.210260723296888, "grad_norm": 0.08819212786222724, "learning_rate": 0.00012087157398622575, "loss": 0.688, "step": 720 }, { "epoch": 1.2186711522287637, "grad_norm": 0.7006553252478528, "learning_rate": 0.0001187126429166605, "loss": 0.8005, "step": 725 }, { "epoch": 1.2270815811606393, "grad_norm": 0.08128464358584869, "learning_rate": 0.00011656046715903468, "loss": 0.6865, "step": 730 }, { "epoch": 1.2354920100925146, "grad_norm": 0.11015841491813824, "learning_rate": 0.00011441551139365197, "loss": 0.6476, "step": 735 }, { "epoch": 1.2439024390243902, "grad_norm": 0.09681229577155445, "learning_rate": 0.0001122782387419339, "loss": 0.6525, "step": 740 }, { "epoch": 1.2523128679562658, "grad_norm": 0.07074288159266276, "learning_rate": 0.00011014911066642675, "loss": 0.7101, "step": 745 }, { "epoch": 1.2607232968881412, "grad_norm": 0.07038355444461111, "learning_rate": 0.00010802858687116586, "loss": 0.685, "step": 750 }, { "epoch": 1.2691337258200168, "grad_norm": 0.06694809495021711, "learning_rate": 0.00010591712520242033, "loss": 0.6435, "step": 755 }, { "epoch": 1.2775441547518924, "grad_norm": 1.54283051971332, "learning_rate": 0.00010381518154983872, "loss": 0.6707, "step": 760 }, { "epoch": 1.2859545836837678, "grad_norm": 0.14970032839830041, "learning_rate": 0.00010172320974801662, "loss": 0.6541, "step": 765 }, { "epoch": 1.2943650126156434, "grad_norm": 0.07670287277238487, "learning_rate": 9.964166147850868e-05, "loss": 0.6746, "step": 770 }, { "epoch": 1.302775441547519, "grad_norm": 0.0743406403000032, "learning_rate": 9.757098617230529e-05, "loss": 0.6622, "step": 775 }, { "epoch": 1.3111858704793944, "grad_norm": 0.06797759191808586, "learning_rate": 9.551163091279481e-05, "loss": 0.6556, "step": 780 }, { "epoch": 1.31959629941127, "grad_norm": 0.07795583221869067, "learning_rate": 9.346404033923304e-05, "loss": 0.6617, "step": 785 }, { "epoch": 1.3280067283431456, "grad_norm": 0.087821277333218, "learning_rate": 9.14286565507406e-05, "loss": 0.71, "step": 790 }, { "epoch": 1.336417157275021, "grad_norm": 0.0680393967098423, "learning_rate": 8.940591901084799e-05, "loss": 0.666, "step": 795 }, { "epoch": 1.3448275862068966, "grad_norm": 0.20717718965553744, "learning_rate": 8.739626445261064e-05, "loss": 0.657, "step": 800 }, { "epoch": 1.3532380151387722, "grad_norm": 0.07605217404928517, "learning_rate": 8.540012678431284e-05, "loss": 0.6679, "step": 805 }, { "epoch": 1.3616484440706476, "grad_norm": 0.09933554183914269, "learning_rate": 8.341793699578171e-05, "loss": 0.6879, "step": 810 }, { "epoch": 1.3700588730025232, "grad_norm": 0.07269335218428621, "learning_rate": 8.145012306533162e-05, "loss": 0.656, "step": 815 }, { "epoch": 1.3784693019343988, "grad_norm": 0.0938405449110704, "learning_rate": 7.949710986735854e-05, "loss": 0.6542, "step": 820 }, { "epoch": 1.3868797308662741, "grad_norm": 0.06387119139044767, "learning_rate": 7.755931908060427e-05, "loss": 0.6835, "step": 825 }, { "epoch": 1.3952901597981497, "grad_norm": 0.6498998302961293, "learning_rate": 7.563716909711155e-05, "loss": 0.6912, "step": 830 }, { "epoch": 1.4037005887300253, "grad_norm": 0.07172546327436916, "learning_rate": 7.373107493188776e-05, "loss": 0.6397, "step": 835 }, { "epoch": 1.4121110176619007, "grad_norm": 0.22055958275283316, "learning_rate": 7.184144813329845e-05, "loss": 0.665, "step": 840 }, { "epoch": 1.4205214465937763, "grad_norm": 0.08300391135437629, "learning_rate": 6.996869669420934e-05, "loss": 0.6781, "step": 845 }, { "epoch": 1.428931875525652, "grad_norm": 0.06921133210004143, "learning_rate": 6.811322496389547e-05, "loss": 0.6743, "step": 850 }, { "epoch": 1.4373423044575273, "grad_norm": 0.0711963752977775, "learning_rate": 6.627543356073752e-05, "loss": 0.6409, "step": 855 }, { "epoch": 1.445752733389403, "grad_norm": 0.06420927363983264, "learning_rate": 6.445571928572372e-05, "loss": 0.64, "step": 860 }, { "epoch": 1.4541631623212785, "grad_norm": 0.06501526947017142, "learning_rate": 6.265447503677568e-05, "loss": 0.7054, "step": 865 }, { "epoch": 1.462573591253154, "grad_norm": 0.21066994089465826, "learning_rate": 6.087208972391683e-05, "loss": 0.6918, "step": 870 }, { "epoch": 1.4709840201850295, "grad_norm": 0.08759599924052495, "learning_rate": 5.910894818530261e-05, "loss": 0.6709, "step": 875 }, { "epoch": 1.479394449116905, "grad_norm": 1.0585940281103237, "learning_rate": 5.736543110412889e-05, "loss": 0.7003, "step": 880 }, { "epoch": 1.4878048780487805, "grad_norm": 0.10929958796237216, "learning_rate": 5.564191492643813e-05, "loss": 0.6804, "step": 885 }, { "epoch": 1.496215306980656, "grad_norm": 0.07895221603568614, "learning_rate": 5.393877177984039e-05, "loss": 0.6609, "step": 890 }, { "epoch": 1.5046257359125317, "grad_norm": 0.0705592884671557, "learning_rate": 5.225636939316621e-05, "loss": 0.6438, "step": 895 }, { "epoch": 1.513036164844407, "grad_norm": 0.06645146330681664, "learning_rate": 5.059507101706976e-05, "loss": 0.658, "step": 900 }, { "epoch": 1.5214465937762824, "grad_norm": 0.06572346800316721, "learning_rate": 4.8955235345598825e-05, "loss": 0.6409, "step": 905 }, { "epoch": 1.5298570227081583, "grad_norm": 0.06268956414981779, "learning_rate": 4.7337216438748384e-05, "loss": 0.6657, "step": 910 }, { "epoch": 1.5382674516400336, "grad_norm": 0.0705504423985544, "learning_rate": 4.5741363646014696e-05, "loss": 0.6631, "step": 915 }, { "epoch": 1.546677880571909, "grad_norm": 0.06497073556361815, "learning_rate": 4.416802153096696e-05, "loss": 0.6319, "step": 920 }, { "epoch": 1.5550883095037848, "grad_norm": 0.32565910926601543, "learning_rate": 4.261752979685159e-05, "loss": 0.6691, "step": 925 }, { "epoch": 1.5634987384356602, "grad_norm": 0.05949233651666068, "learning_rate": 4.1090223213246404e-05, "loss": 0.6349, "step": 930 }, { "epoch": 1.5719091673675356, "grad_norm": 0.07160645769568097, "learning_rate": 3.958643154378005e-05, "loss": 0.6688, "step": 935 }, { "epoch": 1.5803195962994114, "grad_norm": 0.7327591054848349, "learning_rate": 3.8106479474931795e-05, "loss": 0.645, "step": 940 }, { "epoch": 1.5887300252312868, "grad_norm": 0.062156634516279057, "learning_rate": 3.665068654592806e-05, "loss": 0.6373, "step": 945 }, { "epoch": 1.5971404541631622, "grad_norm": 0.06660764066083266, "learning_rate": 3.5219367079750205e-05, "loss": 0.698, "step": 950 }, { "epoch": 1.6055508830950378, "grad_norm": 0.06327609087248642, "learning_rate": 3.381283011526819e-05, "loss": 0.6231, "step": 955 }, { "epoch": 1.6139613120269134, "grad_norm": 0.4333939216760815, "learning_rate": 3.243137934051569e-05, "loss": 0.6252, "step": 960 }, { "epoch": 1.6223717409587888, "grad_norm": 0.06658129188396517, "learning_rate": 3.1075313027120016e-05, "loss": 0.6726, "step": 965 }, { "epoch": 1.6307821698906644, "grad_norm": 0.26691445324073054, "learning_rate": 2.97449239659018e-05, "loss": 0.6342, "step": 970 }, { "epoch": 1.63919259882254, "grad_norm": 0.0605021899974428, "learning_rate": 2.8440499403658122e-05, "loss": 0.6332, "step": 975 }, { "epoch": 1.6476030277544154, "grad_norm": 0.061846457580604676, "learning_rate": 2.7162320981142316e-05, "loss": 0.651, "step": 980 }, { "epoch": 1.656013456686291, "grad_norm": 0.1452013715559192, "learning_rate": 2.5910664672254428e-05, "loss": 0.6646, "step": 985 }, { "epoch": 1.6644238856181666, "grad_norm": 0.06326743122936844, "learning_rate": 2.4685800724455384e-05, "loss": 0.6433, "step": 990 }, { "epoch": 1.672834314550042, "grad_norm": 0.07852388598087191, "learning_rate": 2.3487993600416967e-05, "loss": 0.6371, "step": 995 }, { "epoch": 1.6812447434819175, "grad_norm": 0.06601596346267856, "learning_rate": 2.2317501920921576e-05, "loss": 0.6471, "step": 1000 }, { "epoch": 1.6896551724137931, "grad_norm": 0.059266994188209014, "learning_rate": 2.1174578409022702e-05, "loss": 0.6699, "step": 1005 }, { "epoch": 1.6980656013456685, "grad_norm": 0.07452888371241441, "learning_rate": 2.0059469835479054e-05, "loss": 0.6473, "step": 1010 }, { "epoch": 1.7064760302775441, "grad_norm": 0.07457779730712975, "learning_rate": 1.8972416965473803e-05, "loss": 0.6378, "step": 1015 }, { "epoch": 1.7148864592094197, "grad_norm": 0.06910347568006753, "learning_rate": 1.7913654506630655e-05, "loss": 0.6274, "step": 1020 }, { "epoch": 1.7232968881412951, "grad_norm": 0.07064622370312863, "learning_rate": 1.6883411058337543e-05, "loss": 0.7109, "step": 1025 }, { "epoch": 1.7317073170731707, "grad_norm": 0.06048167956891708, "learning_rate": 1.5881909062389285e-05, "loss": 0.6333, "step": 1030 }, { "epoch": 1.7401177460050463, "grad_norm": 0.06307286865715722, "learning_rate": 1.4909364754959985e-05, "loss": 0.646, "step": 1035 }, { "epoch": 1.7485281749369217, "grad_norm": 0.05911925581755182, "learning_rate": 1.3965988119914734e-05, "loss": 0.6433, "step": 1040 }, { "epoch": 1.7569386038687973, "grad_norm": 0.06333975944083138, "learning_rate": 1.305198284347191e-05, "loss": 0.6339, "step": 1045 }, { "epoch": 1.765349032800673, "grad_norm": 0.05514146566145015, "learning_rate": 1.2167546270224743e-05, "loss": 0.6264, "step": 1050 }, { "epoch": 1.7737594617325483, "grad_norm": 0.05609433763926122, "learning_rate": 1.1312869360532295e-05, "loss": 0.6337, "step": 1055 }, { "epoch": 1.7821698906644239, "grad_norm": 0.05988904119560809, "learning_rate": 1.0488136649288847e-05, "loss": 0.6215, "step": 1060 }, { "epoch": 1.7905803195962995, "grad_norm": 0.09114152521890548, "learning_rate": 9.693526206080693e-06, "loss": 0.6385, "step": 1065 }, { "epoch": 1.7989907485281749, "grad_norm": 0.09735030405268541, "learning_rate": 8.929209596738706e-06, "loss": 0.6077, "step": 1070 }, { "epoch": 1.8074011774600505, "grad_norm": 0.0635144022515458, "learning_rate": 8.195351846295262e-06, "loss": 0.6647, "step": 1075 }, { "epoch": 1.815811606391926, "grad_norm": 0.08588808547970551, "learning_rate": 7.492111403353462e-06, "loss": 0.6523, "step": 1080 }, { "epoch": 1.8242220353238014, "grad_norm": 0.057867203058913214, "learning_rate": 6.819640105876062e-06, "loss": 0.6347, "step": 1085 }, { "epoch": 1.832632464255677, "grad_norm": 0.07139018525983484, "learning_rate": 6.1780831484019684e-06, "loss": 0.6247, "step": 1090 }, { "epoch": 1.8410428931875527, "grad_norm": 0.058328293614124185, "learning_rate": 5.567579050696957e-06, "loss": 0.6387, "step": 1095 }, { "epoch": 1.849453322119428, "grad_norm": 0.05729437886303193, "learning_rate": 4.9882596278455756e-06, "loss": 0.6443, "step": 1100 }, { "epoch": 1.8578637510513036, "grad_norm": 0.05592472433203098, "learning_rate": 4.440249961790826e-06, "loss": 0.5956, "step": 1105 }, { "epoch": 1.8662741799831792, "grad_norm": 0.06609120678589044, "learning_rate": 3.923668374327338e-06, "loss": 0.6619, "step": 1110 }, { "epoch": 1.8746846089150546, "grad_norm": 0.05566880083508612, "learning_rate": 3.438626401554351e-06, "loss": 0.6077, "step": 1115 }, { "epoch": 1.8830950378469302, "grad_norm": 0.05589315119066426, "learning_rate": 2.9852287697938125e-06, "loss": 0.6408, "step": 1120 }, { "epoch": 1.8915054667788058, "grad_norm": 0.06795657947769265, "learning_rate": 2.563573372978617e-06, "loss": 0.6465, "step": 1125 }, { "epoch": 1.8999158957106812, "grad_norm": 0.05715913746496311, "learning_rate": 2.173751251516209e-06, "loss": 0.6673, "step": 1130 }, { "epoch": 1.9083263246425568, "grad_norm": 0.07516171647382662, "learning_rate": 1.8158465726318294e-06, "loss": 0.6135, "step": 1135 }, { "epoch": 1.9167367535744324, "grad_norm": 0.08828033210229927, "learning_rate": 1.4899366121958634e-06, "loss": 0.6472, "step": 1140 }, { "epoch": 1.9251471825063078, "grad_norm": 0.05375437444470602, "learning_rate": 1.19609173803904e-06, "loss": 0.5937, "step": 1145 }, { "epoch": 1.9335576114381834, "grad_norm": 0.07199465636203523, "learning_rate": 9.343753947591681e-07, "loss": 0.6219, "step": 1150 }, { "epoch": 1.941968040370059, "grad_norm": 0.11526483170723129, "learning_rate": 7.048440900226937e-07, "loss": 0.6342, "step": 1155 }, { "epoch": 1.9503784693019344, "grad_norm": 0.05354167010798482, "learning_rate": 5.075473823640597e-07, "loss": 0.6553, "step": 1160 }, { "epoch": 1.95878889823381, "grad_norm": 0.09214456642437543, "learning_rate": 3.425278704853984e-07, "loss": 0.6636, "step": 1165 }, { "epoch": 1.9671993271656856, "grad_norm": 0.06917853774104925, "learning_rate": 2.0982118405897251e-07, "loss": 0.6319, "step": 1170 }, { "epoch": 1.975609756097561, "grad_norm": 0.05500253350216091, "learning_rate": 1.0945597603431167e-07, "loss": 0.6157, "step": 1175 }, { "epoch": 1.9840201850294366, "grad_norm": 0.056648521587817115, "learning_rate": 4.145391645166141e-08, "loss": 0.654, "step": 1180 }, { "epoch": 1.9924306139613122, "grad_norm": 0.06416190308164577, "learning_rate": 5.829687763259094e-09, "loss": 0.6517, "step": 1185 }, { "epoch": 1.9974768713204374, "step": 1188, "total_flos": 7.94530881371059e+18, "train_loss": 0.9845614412216225, "train_runtime": 14065.2912, "train_samples_per_second": 2.704, "train_steps_per_second": 0.084 } ], "logging_steps": 5, "max_steps": 1188, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.94530881371059e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }