{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9974768713204374,
  "eval_steps": 500,
  "global_step": 1188,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001682085786375105,
      "grad_norm": 0.584265834445358,
      "learning_rate": 2.521008403361344e-06,
      "loss": 1.3366,
      "step": 1
    },
    {
      "epoch": 0.008410428931875526,
      "grad_norm": 0.4317387157455665,
      "learning_rate": 1.2605042016806723e-05,
      "loss": 1.2402,
      "step": 5
    },
    {
      "epoch": 0.01682085786375105,
      "grad_norm": 0.2866682001722115,
      "learning_rate": 2.5210084033613446e-05,
      "loss": 1.2481,
      "step": 10
    },
    {
      "epoch": 0.025231286795626577,
      "grad_norm": 0.2291349749029046,
      "learning_rate": 3.7815126050420166e-05,
      "loss": 1.1829,
      "step": 15
    },
    {
      "epoch": 0.0336417157275021,
      "grad_norm": 0.26272617568016177,
      "learning_rate": 5.042016806722689e-05,
      "loss": 1.0796,
      "step": 20
    },
    {
      "epoch": 0.04205214465937763,
      "grad_norm": 0.17879426310992677,
      "learning_rate": 6.302521008403361e-05,
      "loss": 0.9861,
      "step": 25
    },
    {
      "epoch": 0.050462573591253154,
      "grad_norm": 0.1372088242876512,
      "learning_rate": 7.563025210084033e-05,
      "loss": 0.9688,
      "step": 30
    },
    {
      "epoch": 0.05887300252312868,
      "grad_norm": 0.11605937851559196,
      "learning_rate": 8.823529411764705e-05,
      "loss": 0.9683,
      "step": 35
    },
    {
      "epoch": 0.0672834314550042,
      "grad_norm": 0.12589057657439554,
      "learning_rate": 0.00010084033613445378,
      "loss": 0.948,
      "step": 40
    },
    {
      "epoch": 0.07569386038687972,
      "grad_norm": 0.1652030960687282,
      "learning_rate": 0.00011344537815126049,
      "loss": 0.9155,
      "step": 45
    },
    {
      "epoch": 0.08410428931875526,
      "grad_norm": 0.15080610702585565,
      "learning_rate": 0.00012605042016806722,
      "loss": 0.9278,
      "step": 50
    },
    {
      "epoch": 0.09251471825063078,
      "grad_norm": 0.13357435857967218,
      "learning_rate": 0.00013865546218487396,
      "loss": 0.9023,
      "step": 55
    },
    {
      "epoch": 0.10092514718250631,
      "grad_norm": 0.19497187993777446,
      "learning_rate": 0.00015126050420168066,
      "loss": 0.8778,
      "step": 60
    },
    {
      "epoch": 0.10933557611438183,
      "grad_norm": 0.1976656903261847,
      "learning_rate": 0.00016386554621848737,
      "loss": 0.9507,
      "step": 65
    },
    {
      "epoch": 0.11774600504625736,
      "grad_norm": 0.1403114707521467,
      "learning_rate": 0.0001764705882352941,
      "loss": 0.9164,
      "step": 70
    },
    {
      "epoch": 0.1261564339781329,
      "grad_norm": 0.14099777956857235,
      "learning_rate": 0.0001890756302521008,
      "loss": 0.9534,
      "step": 75
    },
    {
      "epoch": 0.1345668629100084,
      "grad_norm": 0.15889633248603474,
      "learning_rate": 0.00020168067226890757,
      "loss": 0.9315,
      "step": 80
    },
    {
      "epoch": 0.14297729184188393,
      "grad_norm": 0.13905096593977556,
      "learning_rate": 0.00021428571428571427,
      "loss": 0.9019,
      "step": 85
    },
    {
      "epoch": 0.15138772077375945,
      "grad_norm": 0.1248251895789866,
      "learning_rate": 0.00022689075630252098,
      "loss": 1.0007,
      "step": 90
    },
    {
      "epoch": 0.159798149705635,
      "grad_norm": 0.12856969759106154,
      "learning_rate": 0.00023949579831932771,
      "loss": 0.9706,
      "step": 95
    },
    {
      "epoch": 0.16820857863751051,
      "grad_norm": 0.2451599584066991,
      "learning_rate": 0.00025210084033613445,
      "loss": 0.9457,
      "step": 100
    },
    {
      "epoch": 0.17661900756938603,
      "grad_norm": 0.16131565399243042,
      "learning_rate": 0.00026470588235294115,
      "loss": 0.9624,
      "step": 105
    },
    {
      "epoch": 0.18502943650126155,
      "grad_norm": 0.11974149003499743,
      "learning_rate": 0.0002773109243697479,
      "loss": 0.943,
      "step": 110
    },
    {
      "epoch": 0.1934398654331371,
      "grad_norm": 0.4663750210183581,
      "learning_rate": 0.0002899159663865546,
      "loss": 0.9683,
      "step": 115
    },
    {
      "epoch": 0.20185029436501262,
      "grad_norm": 0.315237934726779,
      "learning_rate": 0.00029999935225318556,
      "loss": 0.9765,
      "step": 120
    },
    {
      "epoch": 0.21026072329688814,
      "grad_norm": 41.12896492268303,
      "learning_rate": 0.00029997668170208376,
      "loss": 2.4906,
      "step": 125
    },
    {
      "epoch": 0.21867115222876365,
      "grad_norm": 86.97194601878952,
      "learning_rate": 0.0002999216294043922,
      "loss": 4.2426,
      "step": 130
    },
    {
      "epoch": 0.2270815811606392,
      "grad_norm": 40.145574165614974,
      "learning_rate": 0.0002998342072465558,
      "loss": 5.285,
      "step": 135
    },
    {
      "epoch": 0.23549201009251472,
      "grad_norm": 36.0169334865568,
      "learning_rate": 0.0002997144341040567,
      "loss": 5.7741,
      "step": 140
    },
    {
      "epoch": 0.24390243902439024,
      "grad_norm": 15.431132077300076,
      "learning_rate": 0.0002995623358373386,
      "loss": 7.9614,
      "step": 145
    },
    {
      "epoch": 0.2523128679562658,
      "grad_norm": 6.581926090085861,
      "learning_rate": 0.0002993779452862235,
      "loss": 4.2109,
      "step": 150
    },
    {
      "epoch": 0.2607232968881413,
      "grad_norm": 7.662219825964168,
      "learning_rate": 0.0002991613022628211,
      "loss": 3.4552,
      "step": 155
    },
    {
      "epoch": 0.2691337258200168,
      "grad_norm": 2.538849805479563,
      "learning_rate": 0.00029891245354293284,
      "loss": 2.1775,
      "step": 160
    },
    {
      "epoch": 0.27754415475189237,
      "grad_norm": 2.0222241832932824,
      "learning_rate": 0.0002986314528559525,
      "loss": 2.0086,
      "step": 165
    },
    {
      "epoch": 0.28595458368376786,
      "grad_norm": 1.6691520526281935,
      "learning_rate": 0.0002983183608732653,
      "loss": 1.618,
      "step": 170
    },
    {
      "epoch": 0.2943650126156434,
      "grad_norm": 0.8472051268719076,
      "learning_rate": 0.00029797324519514835,
      "loss": 1.4006,
      "step": 175
    },
    {
      "epoch": 0.3027754415475189,
      "grad_norm": 1.2656793919009501,
      "learning_rate": 0.0002975961803361749,
      "loss": 1.2361,
      "step": 180
    },
    {
      "epoch": 0.31118587047939444,
      "grad_norm": 16.91854783030072,
      "learning_rate": 0.00029718724770912575,
      "loss": 1.3024,
      "step": 185
    },
    {
      "epoch": 0.31959629941127,
      "grad_norm": 0.4264421791196419,
      "learning_rate": 0.00029674653560741125,
      "loss": 1.2247,
      "step": 190
    },
    {
      "epoch": 0.3280067283431455,
      "grad_norm": 11.819413233995139,
      "learning_rate": 0.00029627413918600773,
      "loss": 1.5927,
      "step": 195
    },
    {
      "epoch": 0.33641715727502103,
      "grad_norm": 0.3872518309676493,
      "learning_rate": 0.0002957701604409124,
      "loss": 1.2533,
      "step": 200
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 2.0631756640177508,
      "learning_rate": 0.0002952347081871212,
      "loss": 1.1339,
      "step": 205
    },
    {
      "epoch": 0.35323801513877207,
      "grad_norm": 0.2331543310806729,
      "learning_rate": 0.00029466789803513435,
      "loss": 1.0706,
      "step": 210
    },
    {
      "epoch": 0.3616484440706476,
      "grad_norm": 0.20061594850021877,
      "learning_rate": 0.0002940698523659947,
      "loss": 1.1093,
      "step": 215
    },
    {
      "epoch": 0.3700588730025231,
      "grad_norm": 0.17861349516130504,
      "learning_rate": 0.0002934407003048641,
      "loss": 1.1008,
      "step": 220
    },
    {
      "epoch": 0.37846930193439865,
      "grad_norm": 0.11836066059443685,
      "learning_rate": 0.00029278057769314384,
      "loss": 1.045,
      "step": 225
    },
    {
      "epoch": 0.3868797308662742,
      "grad_norm": 0.18874539756549927,
      "learning_rate": 0.00029208962705914505,
      "loss": 1.0056,
      "step": 230
    },
    {
      "epoch": 0.3952901597981497,
      "grad_norm": 0.45365739150608037,
      "learning_rate": 0.00029136799758731473,
      "loss": 0.9995,
      "step": 235
    },
    {
      "epoch": 0.40370058873002523,
      "grad_norm": 0.12762727857651687,
      "learning_rate": 0.00029061584508602545,
      "loss": 1.006,
      "step": 240
    },
    {
      "epoch": 0.4121110176619008,
      "grad_norm": 4.885346324632363,
      "learning_rate": 0.0002898333319539341,
      "loss": 1.3006,
      "step": 245
    },
    {
      "epoch": 0.42052144659377627,
      "grad_norm": 0.6018215992100235,
      "learning_rate": 0.0002890206271449186,
      "loss": 1.0966,
      "step": 250
    },
    {
      "epoch": 0.4289318755256518,
      "grad_norm": 0.4059499155938192,
      "learning_rate": 0.00028817790613159817,
      "loss": 1.0764,
      "step": 255
    },
    {
      "epoch": 0.4373423044575273,
      "grad_norm": 7.707406865225783,
      "learning_rate": 0.0002873053508674471,
      "loss": 1.1362,
      "step": 260
    },
    {
      "epoch": 0.44575273338940286,
      "grad_norm": 0.22108881306250477,
      "learning_rate": 0.00028640314974750884,
      "loss": 1.0774,
      "step": 265
    },
    {
      "epoch": 0.4541631623212784,
      "grad_norm": 3.3347201885551807,
      "learning_rate": 0.00028547149756771894,
      "loss": 1.1651,
      "step": 270
    },
    {
      "epoch": 0.4625735912531539,
      "grad_norm": 0.34254879266979626,
      "learning_rate": 0.00028451059548284665,
      "loss": 1.1397,
      "step": 275
    },
    {
      "epoch": 0.47098402018502944,
      "grad_norm": 0.1735673338436184,
      "learning_rate": 0.00028352065096306307,
      "loss": 1.0421,
      "step": 280
    },
    {
      "epoch": 0.479394449116905,
      "grad_norm": 0.16104913537338722,
      "learning_rate": 0.0002825018777491458,
      "loss": 1.0461,
      "step": 285
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 0.1831953506941342,
      "learning_rate": 0.00028145449580632996,
      "loss": 0.9887,
      "step": 290
    },
    {
      "epoch": 0.496215306980656,
      "grad_norm": 0.13125693879950318,
      "learning_rate": 0.0002803787312768149,
      "loss": 0.9847,
      "step": 295
    },
    {
      "epoch": 0.5046257359125316,
      "grad_norm": 0.45943182895269363,
      "learning_rate": 0.00027927481643093754,
      "loss": 1.0187,
      "step": 300
    },
    {
      "epoch": 0.5130361648444071,
      "grad_norm": 1.177847689359211,
      "learning_rate": 0.0002781429896170223,
      "loss": 1.0201,
      "step": 305
    },
    {
      "epoch": 0.5214465937762826,
      "grad_norm": 11.730278476509266,
      "learning_rate": 0.0002769834952099191,
      "loss": 1.2084,
      "step": 310
    },
    {
      "epoch": 0.5298570227081582,
      "grad_norm": 0.26427554458970376,
      "learning_rate": 0.0002757965835582397,
      "loss": 1.102,
      "step": 315
    },
    {
      "epoch": 0.5382674516400336,
      "grad_norm": 0.45790641000811105,
      "learning_rate": 0.0002745825109303045,
      "loss": 1.0614,
      "step": 320
    },
    {
      "epoch": 0.5466778805719091,
      "grad_norm": 0.1621409545630021,
      "learning_rate": 0.0002733415394588114,
      "loss": 1.0228,
      "step": 325
    },
    {
      "epoch": 0.5550883095037847,
      "grad_norm": 0.11351840128403332,
      "learning_rate": 0.0002720739370842379,
      "loss": 0.9773,
      "step": 330
    },
    {
      "epoch": 0.5634987384356602,
      "grad_norm": 0.13847315137184252,
      "learning_rate": 0.0002707799774969897,
      "loss": 1.0054,
      "step": 335
    },
    {
      "epoch": 0.5719091673675357,
      "grad_norm": 0.10777953928312381,
      "learning_rate": 0.0002694599400783078,
      "loss": 0.9851,
      "step": 340
    },
    {
      "epoch": 0.5803195962994113,
      "grad_norm": 0.1256406052654263,
      "learning_rate": 0.00026811410983994667,
      "loss": 1.0163,
      "step": 345
    },
    {
      "epoch": 0.5887300252312868,
      "grad_norm": 0.16866862756321335,
      "learning_rate": 0.00026674277736263687,
      "loss": 1.0335,
      "step": 350
    },
    {
      "epoch": 0.5971404541631623,
      "grad_norm": 0.14265754669110667,
      "learning_rate": 0.0002653462387333451,
      "loss": 0.9956,
      "step": 355
    },
    {
      "epoch": 0.6055508830950378,
      "grad_norm": 0.5819454877222049,
      "learning_rate": 0.0002639247954813458,
      "loss": 1.0263,
      "step": 360
    },
    {
      "epoch": 0.6139613120269134,
      "grad_norm": 0.13535505190898894,
      "learning_rate": 0.0002624787545131169,
      "loss": 0.9753,
      "step": 365
    },
    {
      "epoch": 0.6223717409587889,
      "grad_norm": 1.0748063857926706,
      "learning_rate": 0.0002610084280460756,
      "loss": 0.9945,
      "step": 370
    },
    {
      "epoch": 0.6307821698906644,
      "grad_norm": 0.14560862560154247,
      "learning_rate": 0.00025951413354116665,
      "loss": 0.988,
      "step": 375
    },
    {
      "epoch": 0.63919259882254,
      "grad_norm": 0.1246463854830709,
      "learning_rate": 0.0002579961936343188,
      "loss": 0.9658,
      "step": 380
    },
    {
      "epoch": 0.6476030277544155,
      "grad_norm": 0.10713906349411657,
      "learning_rate": 0.00025645493606678375,
      "loss": 0.9366,
      "step": 385
    },
    {
      "epoch": 0.656013456686291,
      "grad_norm": 0.1113038542351718,
      "learning_rate": 0.00025489069361437326,
      "loss": 0.9758,
      "step": 390
    },
    {
      "epoch": 0.6644238856181666,
      "grad_norm": 1.3831640086044374,
      "learning_rate": 0.00025330380401560846,
      "loss": 0.9575,
      "step": 395
    },
    {
      "epoch": 0.6728343145500421,
      "grad_norm": 0.11166426611159094,
      "learning_rate": 0.0002516946098987985,
      "loss": 0.963,
      "step": 400
    },
    {
      "epoch": 0.6812447434819175,
      "grad_norm": 1.0002894310288515,
      "learning_rate": 0.0002500634587080628,
      "loss": 0.9299,
      "step": 405
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.1755490764589915,
      "learning_rate": 0.0002484107026283137,
      "loss": 0.9814,
      "step": 410
    },
    {
      "epoch": 0.6980656013456686,
      "grad_norm": 4.147990806845651,
      "learning_rate": 0.00024673669850921575,
      "loss": 1.0252,
      "step": 415
    },
    {
      "epoch": 0.7064760302775441,
      "grad_norm": 0.14199796426249658,
      "learning_rate": 0.0002450418077881374,
      "loss": 0.9578,
      "step": 420
    },
    {
      "epoch": 0.7148864592094197,
      "grad_norm": 0.10967054022758699,
      "learning_rate": 0.0002433263964121127,
      "loss": 0.9543,
      "step": 425
    },
    {
      "epoch": 0.7232968881412952,
      "grad_norm": 0.10681488534249564,
      "learning_rate": 0.00024159083475882854,
      "loss": 0.947,
      "step": 430
    },
    {
      "epoch": 0.7317073170731707,
      "grad_norm": 0.0836144209353973,
      "learning_rate": 0.00023983549755665623,
      "loss": 0.966,
      "step": 435
    },
    {
      "epoch": 0.7401177460050462,
      "grad_norm": 6.618428479377572,
      "learning_rate": 0.00023806076380374262,
      "loss": 0.9755,
      "step": 440
    },
    {
      "epoch": 0.7485281749369218,
      "grad_norm": 0.3700536075693679,
      "learning_rate": 0.00023626701668618048,
      "loss": 0.9439,
      "step": 445
    },
    {
      "epoch": 0.7569386038687973,
      "grad_norm": 0.1339452802214274,
      "learning_rate": 0.00023445464349527363,
      "loss": 0.9393,
      "step": 450
    },
    {
      "epoch": 0.7653490328006728,
      "grad_norm": 0.15063483951074022,
      "learning_rate": 0.00023262403554391643,
      "loss": 0.9561,
      "step": 455
    },
    {
      "epoch": 0.7737594617325484,
      "grad_norm": 0.12236138289386866,
      "learning_rate": 0.0002307755880821044,
      "loss": 0.9757,
      "step": 460
    },
    {
      "epoch": 0.7821698906644239,
      "grad_norm": 0.09859340116113559,
      "learning_rate": 0.00022890970021159545,
      "loss": 0.9699,
      "step": 465
    },
    {
      "epoch": 0.7905803195962994,
      "grad_norm": 0.3134759911324595,
      "learning_rate": 0.00022702677479973857,
      "loss": 0.9387,
      "step": 470
    },
    {
      "epoch": 0.798990748528175,
      "grad_norm": 0.12183559177618263,
      "learning_rate": 0.00022512721839249044,
      "loss": 0.8985,
      "step": 475
    },
    {
      "epoch": 0.8074011774600505,
      "grad_norm": 0.08229469965958389,
      "learning_rate": 0.00022321144112663708,
      "loss": 0.9504,
      "step": 480
    },
    {
      "epoch": 0.815811606391926,
      "grad_norm": 0.14924593232770486,
      "learning_rate": 0.00022127985664124048,
      "loss": 0.9338,
      "step": 485
    },
    {
      "epoch": 0.8242220353238016,
      "grad_norm": 0.2786737248586552,
      "learning_rate": 0.0002193328819883292,
      "loss": 1.0327,
      "step": 490
    },
    {
      "epoch": 0.832632464255677,
      "grad_norm": 0.1485555431366647,
      "learning_rate": 0.00021737093754285147,
      "loss": 0.9499,
      "step": 495
    },
    {
      "epoch": 0.8410428931875525,
      "grad_norm": 0.10549366930700854,
      "learning_rate": 0.00021539444691191174,
      "loss": 0.8961,
      "step": 500
    },
    {
      "epoch": 0.8494533221194281,
      "grad_norm": 0.2501853130288003,
      "learning_rate": 0.0002134038368433085,
      "loss": 0.973,
      "step": 505
    },
    {
      "epoch": 0.8578637510513036,
      "grad_norm": 0.08142359041013753,
      "learning_rate": 0.00021139953713339454,
      "loss": 0.9262,
      "step": 510
    },
    {
      "epoch": 0.8662741799831791,
      "grad_norm": 0.1101621057008162,
      "learning_rate": 0.00020938198053427885,
      "loss": 0.9462,
      "step": 515
    },
    {
      "epoch": 0.8746846089150546,
      "grad_norm": 0.08929629784060397,
      "learning_rate": 0.00020735160266039006,
      "loss": 0.9227,
      "step": 520
    },
    {
      "epoch": 0.8830950378469302,
      "grad_norm": 0.15490402519303073,
      "learning_rate": 0.00020530884189442244,
      "loss": 0.9077,
      "step": 525
    },
    {
      "epoch": 0.8915054667788057,
      "grad_norm": 0.0899609096113038,
      "learning_rate": 0.00020325413929268369,
      "loss": 0.9309,
      "step": 530
    },
    {
      "epoch": 0.8999158957106812,
      "grad_norm": 0.08860120877271818,
      "learning_rate": 0.00020118793848986554,
      "loss": 0.9581,
      "step": 535
    },
    {
      "epoch": 0.9083263246425568,
      "grad_norm": 0.08667002021610543,
      "learning_rate": 0.00019911068560325804,
      "loss": 0.8893,
      "step": 540
    },
    {
      "epoch": 0.9167367535744323,
      "grad_norm": 0.1020909544402857,
      "learning_rate": 0.00019702282913642723,
      "loss": 0.8789,
      "step": 545
    },
    {
      "epoch": 0.9251471825063078,
      "grad_norm": 0.8881918960589515,
      "learning_rate": 0.00019492481988237818,
      "loss": 1.0281,
      "step": 550
    },
    {
      "epoch": 0.9335576114381834,
      "grad_norm": 1.7309452103787781,
      "learning_rate": 0.00019281711082622314,
      "loss": 1.5781,
      "step": 555
    },
    {
      "epoch": 0.9419680403700589,
      "grad_norm": 0.9824981029734958,
      "learning_rate": 0.000190700157047377,
      "loss": 1.9042,
      "step": 560
    },
    {
      "epoch": 0.9503784693019344,
      "grad_norm": 30.47237068603654,
      "learning_rate": 0.0001885744156212999,
      "loss": 2.2642,
      "step": 565
    },
    {
      "epoch": 0.95878889823381,
      "grad_norm": 13.132455734206328,
      "learning_rate": 0.0001864403455208094,
      "loss": 2.0529,
      "step": 570
    },
    {
      "epoch": 0.9671993271656855,
      "grad_norm": 1.5514837112811664,
      "learning_rate": 0.00018429840751698284,
      "loss": 1.882,
      "step": 575
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.7978250850775721,
      "learning_rate": 0.00018214906407967136,
      "loss": 1.0936,
      "step": 580
    },
    {
      "epoch": 0.9840201850294366,
      "grad_norm": 0.24502338468242643,
      "learning_rate": 0.00017999277927764696,
      "loss": 0.9768,
      "step": 585
    },
    {
      "epoch": 0.992430613961312,
      "grad_norm": 0.14072495383675704,
      "learning_rate": 0.00017783001867840488,
      "loss": 0.991,
      "step": 590
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.14826034089833667,
      "learning_rate": 0.00017566124924764176,
      "loss": 0.9232,
      "step": 595
    },
    {
      "epoch": 1.0084104289318756,
      "grad_norm": 0.1423011247047076,
      "learning_rate": 0.00017348693924843238,
      "loss": 0.7383,
      "step": 600
    },
    {
      "epoch": 1.016820857863751,
      "grad_norm": 18.945959105143892,
      "learning_rate": 0.00017130755814012607,
      "loss": 0.8006,
      "step": 605
    },
    {
      "epoch": 1.0252312867956266,
      "grad_norm": 0.2063458884840928,
      "learning_rate": 0.0001691235764769848,
      "loss": 0.7464,
      "step": 610
    },
    {
      "epoch": 1.0336417157275022,
      "grad_norm": 0.10576479121191024,
      "learning_rate": 0.00016693546580658493,
      "loss": 0.7066,
      "step": 615
    },
    {
      "epoch": 1.0420521446593776,
      "grad_norm": 0.10366396212312518,
      "learning_rate": 0.00016474369856800457,
      "loss": 0.7339,
      "step": 620
    },
    {
      "epoch": 1.0504625735912532,
      "grad_norm": 0.09658540475190697,
      "learning_rate": 0.00016254874798981835,
      "loss": 0.7111,
      "step": 625
    },
    {
      "epoch": 1.0588730025231288,
      "grad_norm": 0.08877086888965659,
      "learning_rate": 0.00016035108798792165,
      "loss": 0.71,
      "step": 630
    },
    {
      "epoch": 1.0672834314550042,
      "grad_norm": 0.10967646061522097,
      "learning_rate": 0.00015815119306320657,
      "loss": 0.7296,
      "step": 635
    },
    {
      "epoch": 1.0756938603868798,
      "grad_norm": 0.0933812784388927,
      "learning_rate": 0.0001559495381991117,
      "loss": 0.7361,
      "step": 640
    },
    {
      "epoch": 1.0841042893187554,
      "grad_norm": 0.19214296231083186,
      "learning_rate": 0.00015374659875906752,
      "loss": 0.7134,
      "step": 645
    },
    {
      "epoch": 1.0925147182506307,
      "grad_norm": 0.11688119487848063,
      "learning_rate": 0.00015154285038385937,
      "loss": 0.6893,
      "step": 650
    },
    {
      "epoch": 1.1009251471825063,
      "grad_norm": 0.09391812895085212,
      "learning_rate": 0.00014933876888893164,
      "loss": 0.6963,
      "step": 655
    },
    {
      "epoch": 1.1093355761143817,
      "grad_norm": 0.10077136979307887,
      "learning_rate": 0.0001471348301616531,
      "loss": 0.7436,
      "step": 660
    },
    {
      "epoch": 1.1177460050462573,
      "grad_norm": 0.08225000161444689,
      "learning_rate": 0.00014493151005856724,
      "loss": 0.7004,
      "step": 665
    },
    {
      "epoch": 1.126156433978133,
      "grad_norm": 0.08801832379105524,
      "learning_rate": 0.00014272928430264926,
      "loss": 0.722,
      "step": 670
    },
    {
      "epoch": 1.1345668629100083,
      "grad_norm": 0.07018288331194043,
      "learning_rate": 0.00014052862838059195,
      "loss": 0.6862,
      "step": 675
    },
    {
      "epoch": 1.142977291841884,
      "grad_norm": 0.0767198314720291,
      "learning_rate": 0.00013833001744014212,
      "loss": 0.7101,
      "step": 680
    },
    {
      "epoch": 1.1513877207737595,
      "grad_norm": 0.07407300906943415,
      "learning_rate": 0.00013613392618751086,
      "loss": 0.7175,
      "step": 685
    },
    {
      "epoch": 1.1597981497056349,
      "grad_norm": 0.07352937678988827,
      "learning_rate": 0.00013394082878487884,
      "loss": 0.7004,
      "step": 690
    },
    {
      "epoch": 1.1682085786375105,
      "grad_norm": 0.07551179793852605,
      "learning_rate": 0.00013175119874801874,
      "loss": 0.7054,
      "step": 695
    },
    {
      "epoch": 1.176619007569386,
      "grad_norm": 0.07052660785539094,
      "learning_rate": 0.000129565508844058,
      "loss": 0.6789,
      "step": 700
    },
    {
      "epoch": 1.1850294365012615,
      "grad_norm": 0.0807185011796273,
      "learning_rate": 0.00012738423098940244,
      "loss": 0.6873,
      "step": 705
    },
    {
      "epoch": 1.193439865433137,
      "grad_norm": 0.1801375449127257,
      "learning_rate": 0.0001252078361478441,
      "loss": 0.7115,
      "step": 710
    },
    {
      "epoch": 1.2018502943650127,
      "grad_norm": 0.07403831159848352,
      "learning_rate": 0.00012303679422887457,
      "loss": 0.6882,
      "step": 715
    },
    {
      "epoch": 1.210260723296888,
      "grad_norm": 0.08819212786222724,
      "learning_rate": 0.00012087157398622575,
      "loss": 0.688,
      "step": 720
    },
    {
      "epoch": 1.2186711522287637,
      "grad_norm": 0.7006553252478528,
      "learning_rate": 0.0001187126429166605,
      "loss": 0.8005,
      "step": 725
    },
    {
      "epoch": 1.2270815811606393,
      "grad_norm": 0.08128464358584869,
      "learning_rate": 0.00011656046715903468,
      "loss": 0.6865,
      "step": 730
    },
    {
      "epoch": 1.2354920100925146,
      "grad_norm": 0.11015841491813824,
      "learning_rate": 0.00011441551139365197,
      "loss": 0.6476,
      "step": 735
    },
    {
      "epoch": 1.2439024390243902,
      "grad_norm": 0.09681229577155445,
      "learning_rate": 0.0001122782387419339,
      "loss": 0.6525,
      "step": 740
    },
    {
      "epoch": 1.2523128679562658,
      "grad_norm": 0.07074288159266276,
      "learning_rate": 0.00011014911066642675,
      "loss": 0.7101,
      "step": 745
    },
    {
      "epoch": 1.2607232968881412,
      "grad_norm": 0.07038355444461111,
      "learning_rate": 0.00010802858687116586,
      "loss": 0.685,
      "step": 750
    },
    {
      "epoch": 1.2691337258200168,
      "grad_norm": 0.06694809495021711,
      "learning_rate": 0.00010591712520242033,
      "loss": 0.6435,
      "step": 755
    },
    {
      "epoch": 1.2775441547518924,
      "grad_norm": 1.54283051971332,
      "learning_rate": 0.00010381518154983872,
      "loss": 0.6707,
      "step": 760
    },
    {
      "epoch": 1.2859545836837678,
      "grad_norm": 0.14970032839830041,
      "learning_rate": 0.00010172320974801662,
      "loss": 0.6541,
      "step": 765
    },
    {
      "epoch": 1.2943650126156434,
      "grad_norm": 0.07670287277238487,
      "learning_rate": 9.964166147850868e-05,
      "loss": 0.6746,
      "step": 770
    },
    {
      "epoch": 1.302775441547519,
      "grad_norm": 0.0743406403000032,
      "learning_rate": 9.757098617230529e-05,
      "loss": 0.6622,
      "step": 775
    },
    {
      "epoch": 1.3111858704793944,
      "grad_norm": 0.06797759191808586,
      "learning_rate": 9.551163091279481e-05,
      "loss": 0.6556,
      "step": 780
    },
    {
      "epoch": 1.31959629941127,
      "grad_norm": 0.07795583221869067,
      "learning_rate": 9.346404033923304e-05,
      "loss": 0.6617,
      "step": 785
    },
    {
      "epoch": 1.3280067283431456,
      "grad_norm": 0.087821277333218,
      "learning_rate": 9.14286565507406e-05,
      "loss": 0.71,
      "step": 790
    },
    {
      "epoch": 1.336417157275021,
      "grad_norm": 0.0680393967098423,
      "learning_rate": 8.940591901084799e-05,
      "loss": 0.666,
      "step": 795
    },
    {
      "epoch": 1.3448275862068966,
      "grad_norm": 0.20717718965553744,
      "learning_rate": 8.739626445261064e-05,
      "loss": 0.657,
      "step": 800
    },
    {
      "epoch": 1.3532380151387722,
      "grad_norm": 0.07605217404928517,
      "learning_rate": 8.540012678431284e-05,
      "loss": 0.6679,
      "step": 805
    },
    {
      "epoch": 1.3616484440706476,
      "grad_norm": 0.09933554183914269,
      "learning_rate": 8.341793699578171e-05,
      "loss": 0.6879,
      "step": 810
    },
    {
      "epoch": 1.3700588730025232,
      "grad_norm": 0.07269335218428621,
      "learning_rate": 8.145012306533162e-05,
      "loss": 0.656,
      "step": 815
    },
    {
      "epoch": 1.3784693019343988,
      "grad_norm": 0.0938405449110704,
      "learning_rate": 7.949710986735854e-05,
      "loss": 0.6542,
      "step": 820
    },
    {
      "epoch": 1.3868797308662741,
      "grad_norm": 0.06387119139044767,
      "learning_rate": 7.755931908060427e-05,
      "loss": 0.6835,
      "step": 825
    },
    {
      "epoch": 1.3952901597981497,
      "grad_norm": 0.6498998302961293,
      "learning_rate": 7.563716909711155e-05,
      "loss": 0.6912,
      "step": 830
    },
    {
      "epoch": 1.4037005887300253,
      "grad_norm": 0.07172546327436916,
      "learning_rate": 7.373107493188776e-05,
      "loss": 0.6397,
      "step": 835
    },
    {
      "epoch": 1.4121110176619007,
      "grad_norm": 0.22055958275283316,
      "learning_rate": 7.184144813329845e-05,
      "loss": 0.665,
      "step": 840
    },
    {
      "epoch": 1.4205214465937763,
      "grad_norm": 0.08300391135437629,
      "learning_rate": 6.996869669420934e-05,
      "loss": 0.6781,
      "step": 845
    },
    {
      "epoch": 1.428931875525652,
      "grad_norm": 0.06921133210004143,
      "learning_rate": 6.811322496389547e-05,
      "loss": 0.6743,
      "step": 850
    },
    {
      "epoch": 1.4373423044575273,
      "grad_norm": 0.0711963752977775,
      "learning_rate": 6.627543356073752e-05,
      "loss": 0.6409,
      "step": 855
    },
    {
      "epoch": 1.445752733389403,
      "grad_norm": 0.06420927363983264,
      "learning_rate": 6.445571928572372e-05,
      "loss": 0.64,
      "step": 860
    },
    {
      "epoch": 1.4541631623212785,
      "grad_norm": 0.06501526947017142,
      "learning_rate": 6.265447503677568e-05,
      "loss": 0.7054,
      "step": 865
    },
    {
      "epoch": 1.462573591253154,
      "grad_norm": 0.21066994089465826,
      "learning_rate": 6.087208972391683e-05,
      "loss": 0.6918,
      "step": 870
    },
    {
      "epoch": 1.4709840201850295,
      "grad_norm": 0.08759599924052495,
      "learning_rate": 5.910894818530261e-05,
      "loss": 0.6709,
      "step": 875
    },
    {
      "epoch": 1.479394449116905,
      "grad_norm": 1.0585940281103237,
      "learning_rate": 5.736543110412889e-05,
      "loss": 0.7003,
      "step": 880
    },
    {
      "epoch": 1.4878048780487805,
      "grad_norm": 0.10929958796237216,
      "learning_rate": 5.564191492643813e-05,
      "loss": 0.6804,
      "step": 885
    },
    {
      "epoch": 1.496215306980656,
      "grad_norm": 0.07895221603568614,
      "learning_rate": 5.393877177984039e-05,
      "loss": 0.6609,
      "step": 890
    },
    {
      "epoch": 1.5046257359125317,
      "grad_norm": 0.0705592884671557,
      "learning_rate": 5.225636939316621e-05,
      "loss": 0.6438,
      "step": 895
    },
    {
      "epoch": 1.513036164844407,
      "grad_norm": 0.06645146330681664,
      "learning_rate": 5.059507101706976e-05,
      "loss": 0.658,
      "step": 900
    },
    {
      "epoch": 1.5214465937762824,
      "grad_norm": 0.06572346800316721,
      "learning_rate": 4.8955235345598825e-05,
      "loss": 0.6409,
      "step": 905
    },
    {
      "epoch": 1.5298570227081583,
      "grad_norm": 0.06268956414981779,
      "learning_rate": 4.7337216438748384e-05,
      "loss": 0.6657,
      "step": 910
    },
    {
      "epoch": 1.5382674516400336,
      "grad_norm": 0.0705504423985544,
      "learning_rate": 4.5741363646014696e-05,
      "loss": 0.6631,
      "step": 915
    },
    {
      "epoch": 1.546677880571909,
      "grad_norm": 0.06497073556361815,
      "learning_rate": 4.416802153096696e-05,
      "loss": 0.6319,
      "step": 920
    },
    {
      "epoch": 1.5550883095037848,
      "grad_norm": 0.32565910926601543,
      "learning_rate": 4.261752979685159e-05,
      "loss": 0.6691,
      "step": 925
    },
    {
      "epoch": 1.5634987384356602,
      "grad_norm": 0.05949233651666068,
      "learning_rate": 4.1090223213246404e-05,
      "loss": 0.6349,
      "step": 930
    },
    {
      "epoch": 1.5719091673675356,
      "grad_norm": 0.07160645769568097,
      "learning_rate": 3.958643154378005e-05,
      "loss": 0.6688,
      "step": 935
    },
    {
      "epoch": 1.5803195962994114,
      "grad_norm": 0.7327591054848349,
      "learning_rate": 3.8106479474931795e-05,
      "loss": 0.645,
      "step": 940
    },
    {
      "epoch": 1.5887300252312868,
      "grad_norm": 0.062156634516279057,
      "learning_rate": 3.665068654592806e-05,
      "loss": 0.6373,
      "step": 945
    },
    {
      "epoch": 1.5971404541631622,
      "grad_norm": 0.06660764066083266,
      "learning_rate": 3.5219367079750205e-05,
      "loss": 0.698,
      "step": 950
    },
    {
      "epoch": 1.6055508830950378,
      "grad_norm": 0.06327609087248642,
      "learning_rate": 3.381283011526819e-05,
      "loss": 0.6231,
      "step": 955
    },
    {
      "epoch": 1.6139613120269134,
      "grad_norm": 0.4333939216760815,
      "learning_rate": 3.243137934051569e-05,
      "loss": 0.6252,
      "step": 960
    },
    {
      "epoch": 1.6223717409587888,
      "grad_norm": 0.06658129188396517,
      "learning_rate": 3.1075313027120016e-05,
      "loss": 0.6726,
      "step": 965
    },
    {
      "epoch": 1.6307821698906644,
      "grad_norm": 0.26691445324073054,
      "learning_rate": 2.97449239659018e-05,
      "loss": 0.6342,
      "step": 970
    },
    {
      "epoch": 1.63919259882254,
      "grad_norm": 0.0605021899974428,
      "learning_rate": 2.8440499403658122e-05,
      "loss": 0.6332,
      "step": 975
    },
    {
      "epoch": 1.6476030277544154,
      "grad_norm": 0.061846457580604676,
      "learning_rate": 2.7162320981142316e-05,
      "loss": 0.651,
      "step": 980
    },
    {
      "epoch": 1.656013456686291,
      "grad_norm": 0.1452013715559192,
      "learning_rate": 2.5910664672254428e-05,
      "loss": 0.6646,
      "step": 985
    },
    {
      "epoch": 1.6644238856181666,
      "grad_norm": 0.06326743122936844,
      "learning_rate": 2.4685800724455384e-05,
      "loss": 0.6433,
      "step": 990
    },
    {
      "epoch": 1.672834314550042,
      "grad_norm": 0.07852388598087191,
      "learning_rate": 2.3487993600416967e-05,
      "loss": 0.6371,
      "step": 995
    },
    {
      "epoch": 1.6812447434819175,
      "grad_norm": 0.06601596346267856,
      "learning_rate": 2.2317501920921576e-05,
      "loss": 0.6471,
      "step": 1000
    },
    {
      "epoch": 1.6896551724137931,
      "grad_norm": 0.059266994188209014,
      "learning_rate": 2.1174578409022702e-05,
      "loss": 0.6699,
      "step": 1005
    },
    {
      "epoch": 1.6980656013456685,
      "grad_norm": 0.07452888371241441,
      "learning_rate": 2.0059469835479054e-05,
      "loss": 0.6473,
      "step": 1010
    },
    {
      "epoch": 1.7064760302775441,
      "grad_norm": 0.07457779730712975,
      "learning_rate": 1.8972416965473803e-05,
      "loss": 0.6378,
      "step": 1015
    },
    {
      "epoch": 1.7148864592094197,
      "grad_norm": 0.06910347568006753,
      "learning_rate": 1.7913654506630655e-05,
      "loss": 0.6274,
      "step": 1020
    },
    {
      "epoch": 1.7232968881412951,
      "grad_norm": 0.07064622370312863,
      "learning_rate": 1.6883411058337543e-05,
      "loss": 0.7109,
      "step": 1025
    },
    {
      "epoch": 1.7317073170731707,
      "grad_norm": 0.06048167956891708,
      "learning_rate": 1.5881909062389285e-05,
      "loss": 0.6333,
      "step": 1030
    },
    {
      "epoch": 1.7401177460050463,
      "grad_norm": 0.06307286865715722,
      "learning_rate": 1.4909364754959985e-05,
      "loss": 0.646,
      "step": 1035
    },
    {
      "epoch": 1.7485281749369217,
      "grad_norm": 0.05911925581755182,
      "learning_rate": 1.3965988119914734e-05,
      "loss": 0.6433,
      "step": 1040
    },
    {
      "epoch": 1.7569386038687973,
      "grad_norm": 0.06333975944083138,
      "learning_rate": 1.305198284347191e-05,
      "loss": 0.6339,
      "step": 1045
    },
    {
      "epoch": 1.765349032800673,
      "grad_norm": 0.05514146566145015,
      "learning_rate": 1.2167546270224743e-05,
      "loss": 0.6264,
      "step": 1050
    },
    {
      "epoch": 1.7737594617325483,
      "grad_norm": 0.05609433763926122,
      "learning_rate": 1.1312869360532295e-05,
      "loss": 0.6337,
      "step": 1055
    },
    {
      "epoch": 1.7821698906644239,
      "grad_norm": 0.05988904119560809,
      "learning_rate": 1.0488136649288847e-05,
      "loss": 0.6215,
      "step": 1060
    },
    {
      "epoch": 1.7905803195962995,
      "grad_norm": 0.09114152521890548,
      "learning_rate": 9.693526206080693e-06,
      "loss": 0.6385,
      "step": 1065
    },
    {
      "epoch": 1.7989907485281749,
      "grad_norm": 0.09735030405268541,
      "learning_rate": 8.929209596738706e-06,
      "loss": 0.6077,
      "step": 1070
    },
    {
      "epoch": 1.8074011774600505,
      "grad_norm": 0.0635144022515458,
      "learning_rate": 8.195351846295262e-06,
      "loss": 0.6647,
      "step": 1075
    },
    {
      "epoch": 1.815811606391926,
      "grad_norm": 0.08588808547970551,
      "learning_rate": 7.492111403353462e-06,
      "loss": 0.6523,
      "step": 1080
    },
    {
      "epoch": 1.8242220353238014,
      "grad_norm": 0.057867203058913214,
      "learning_rate": 6.819640105876062e-06,
      "loss": 0.6347,
      "step": 1085
    },
    {
      "epoch": 1.832632464255677,
      "grad_norm": 0.07139018525983484,
      "learning_rate": 6.1780831484019684e-06,
      "loss": 0.6247,
      "step": 1090
    },
    {
      "epoch": 1.8410428931875527,
      "grad_norm": 0.058328293614124185,
      "learning_rate": 5.567579050696957e-06,
      "loss": 0.6387,
      "step": 1095
    },
    {
      "epoch": 1.849453322119428,
      "grad_norm": 0.05729437886303193,
      "learning_rate": 4.9882596278455756e-06,
      "loss": 0.6443,
      "step": 1100
    },
    {
      "epoch": 1.8578637510513036,
      "grad_norm": 0.05592472433203098,
      "learning_rate": 4.440249961790826e-06,
      "loss": 0.5956,
      "step": 1105
    },
    {
      "epoch": 1.8662741799831792,
      "grad_norm": 0.06609120678589044,
      "learning_rate": 3.923668374327338e-06,
      "loss": 0.6619,
      "step": 1110
    },
    {
      "epoch": 1.8746846089150546,
      "grad_norm": 0.05566880083508612,
      "learning_rate": 3.438626401554351e-06,
      "loss": 0.6077,
      "step": 1115
    },
    {
      "epoch": 1.8830950378469302,
      "grad_norm": 0.05589315119066426,
      "learning_rate": 2.9852287697938125e-06,
      "loss": 0.6408,
      "step": 1120
    },
    {
      "epoch": 1.8915054667788058,
      "grad_norm": 0.06795657947769265,
      "learning_rate": 2.563573372978617e-06,
      "loss": 0.6465,
      "step": 1125
    },
    {
      "epoch": 1.8999158957106812,
      "grad_norm": 0.05715913746496311,
      "learning_rate": 2.173751251516209e-06,
      "loss": 0.6673,
      "step": 1130
    },
    {
      "epoch": 1.9083263246425568,
      "grad_norm": 0.07516171647382662,
      "learning_rate": 1.8158465726318294e-06,
      "loss": 0.6135,
      "step": 1135
    },
    {
      "epoch": 1.9167367535744324,
      "grad_norm": 0.08828033210229927,
      "learning_rate": 1.4899366121958634e-06,
      "loss": 0.6472,
      "step": 1140
    },
    {
      "epoch": 1.9251471825063078,
      "grad_norm": 0.05375437444470602,
      "learning_rate": 1.19609173803904e-06,
      "loss": 0.5937,
      "step": 1145
    },
    {
      "epoch": 1.9335576114381834,
      "grad_norm": 0.07199465636203523,
      "learning_rate": 9.343753947591681e-07,
      "loss": 0.6219,
      "step": 1150
    },
    {
      "epoch": 1.941968040370059,
      "grad_norm": 0.11526483170723129,
      "learning_rate": 7.048440900226937e-07,
      "loss": 0.6342,
      "step": 1155
    },
    {
      "epoch": 1.9503784693019344,
      "grad_norm": 0.05354167010798482,
      "learning_rate": 5.075473823640597e-07,
      "loss": 0.6553,
      "step": 1160
    },
    {
      "epoch": 1.95878889823381,
      "grad_norm": 0.09214456642437543,
      "learning_rate": 3.425278704853984e-07,
      "loss": 0.6636,
      "step": 1165
    },
    {
      "epoch": 1.9671993271656856,
      "grad_norm": 0.06917853774104925,
      "learning_rate": 2.0982118405897251e-07,
      "loss": 0.6319,
      "step": 1170
    },
    {
      "epoch": 1.975609756097561,
      "grad_norm": 0.05500253350216091,
      "learning_rate": 1.0945597603431167e-07,
      "loss": 0.6157,
      "step": 1175
    },
    {
      "epoch": 1.9840201850294366,
      "grad_norm": 0.056648521587817115,
      "learning_rate": 4.145391645166141e-08,
      "loss": 0.654,
      "step": 1180
    },
    {
      "epoch": 1.9924306139613122,
      "grad_norm": 0.06416190308164577,
      "learning_rate": 5.829687763259094e-09,
      "loss": 0.6517,
      "step": 1185
    },
    {
      "epoch": 1.9974768713204374,
      "step": 1188,
      "total_flos": 7.94530881371059e+18,
      "train_loss": 0.9845614412216225,
      "train_runtime": 14065.2912,
      "train_samples_per_second": 2.704,
      "train_steps_per_second": 0.084
    }
  ],
  "logging_steps": 5,
  "max_steps": 1188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.94530881371059e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}