{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9974768713204374,
"eval_steps": 500,
"global_step": 1188,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001682085786375105,
"grad_norm": 0.584265834445358,
"learning_rate": 2.521008403361344e-06,
"loss": 1.3366,
"step": 1
},
{
"epoch": 0.008410428931875526,
"grad_norm": 0.4317387157455665,
"learning_rate": 1.2605042016806723e-05,
"loss": 1.2402,
"step": 5
},
{
"epoch": 0.01682085786375105,
"grad_norm": 0.2866682001722115,
"learning_rate": 2.5210084033613446e-05,
"loss": 1.2481,
"step": 10
},
{
"epoch": 0.025231286795626577,
"grad_norm": 0.2291349749029046,
"learning_rate": 3.7815126050420166e-05,
"loss": 1.1829,
"step": 15
},
{
"epoch": 0.0336417157275021,
"grad_norm": 0.26272617568016177,
"learning_rate": 5.042016806722689e-05,
"loss": 1.0796,
"step": 20
},
{
"epoch": 0.04205214465937763,
"grad_norm": 0.17879426310992677,
"learning_rate": 6.302521008403361e-05,
"loss": 0.9861,
"step": 25
},
{
"epoch": 0.050462573591253154,
"grad_norm": 0.1372088242876512,
"learning_rate": 7.563025210084033e-05,
"loss": 0.9688,
"step": 30
},
{
"epoch": 0.05887300252312868,
"grad_norm": 0.11605937851559196,
"learning_rate": 8.823529411764705e-05,
"loss": 0.9683,
"step": 35
},
{
"epoch": 0.0672834314550042,
"grad_norm": 0.12589057657439554,
"learning_rate": 0.00010084033613445378,
"loss": 0.948,
"step": 40
},
{
"epoch": 0.07569386038687972,
"grad_norm": 0.1652030960687282,
"learning_rate": 0.00011344537815126049,
"loss": 0.9155,
"step": 45
},
{
"epoch": 0.08410428931875526,
"grad_norm": 0.15080610702585565,
"learning_rate": 0.00012605042016806722,
"loss": 0.9278,
"step": 50
},
{
"epoch": 0.09251471825063078,
"grad_norm": 0.13357435857967218,
"learning_rate": 0.00013865546218487396,
"loss": 0.9023,
"step": 55
},
{
"epoch": 0.10092514718250631,
"grad_norm": 0.19497187993777446,
"learning_rate": 0.00015126050420168066,
"loss": 0.8778,
"step": 60
},
{
"epoch": 0.10933557611438183,
"grad_norm": 0.1976656903261847,
"learning_rate": 0.00016386554621848737,
"loss": 0.9507,
"step": 65
},
{
"epoch": 0.11774600504625736,
"grad_norm": 0.1403114707521467,
"learning_rate": 0.0001764705882352941,
"loss": 0.9164,
"step": 70
},
{
"epoch": 0.1261564339781329,
"grad_norm": 0.14099777956857235,
"learning_rate": 0.0001890756302521008,
"loss": 0.9534,
"step": 75
},
{
"epoch": 0.1345668629100084,
"grad_norm": 0.15889633248603474,
"learning_rate": 0.00020168067226890757,
"loss": 0.9315,
"step": 80
},
{
"epoch": 0.14297729184188393,
"grad_norm": 0.13905096593977556,
"learning_rate": 0.00021428571428571427,
"loss": 0.9019,
"step": 85
},
{
"epoch": 0.15138772077375945,
"grad_norm": 0.1248251895789866,
"learning_rate": 0.00022689075630252098,
"loss": 1.0007,
"step": 90
},
{
"epoch": 0.159798149705635,
"grad_norm": 0.12856969759106154,
"learning_rate": 0.00023949579831932771,
"loss": 0.9706,
"step": 95
},
{
"epoch": 0.16820857863751051,
"grad_norm": 0.2451599584066991,
"learning_rate": 0.00025210084033613445,
"loss": 0.9457,
"step": 100
},
{
"epoch": 0.17661900756938603,
"grad_norm": 0.16131565399243042,
"learning_rate": 0.00026470588235294115,
"loss": 0.9624,
"step": 105
},
{
"epoch": 0.18502943650126155,
"grad_norm": 0.11974149003499743,
"learning_rate": 0.0002773109243697479,
"loss": 0.943,
"step": 110
},
{
"epoch": 0.1934398654331371,
"grad_norm": 0.4663750210183581,
"learning_rate": 0.0002899159663865546,
"loss": 0.9683,
"step": 115
},
{
"epoch": 0.20185029436501262,
"grad_norm": 0.315237934726779,
"learning_rate": 0.00029999935225318556,
"loss": 0.9765,
"step": 120
},
{
"epoch": 0.21026072329688814,
"grad_norm": 41.12896492268303,
"learning_rate": 0.00029997668170208376,
"loss": 2.4906,
"step": 125
},
{
"epoch": 0.21867115222876365,
"grad_norm": 86.97194601878952,
"learning_rate": 0.0002999216294043922,
"loss": 4.2426,
"step": 130
},
{
"epoch": 0.2270815811606392,
"grad_norm": 40.145574165614974,
"learning_rate": 0.0002998342072465558,
"loss": 5.285,
"step": 135
},
{
"epoch": 0.23549201009251472,
"grad_norm": 36.0169334865568,
"learning_rate": 0.0002997144341040567,
"loss": 5.7741,
"step": 140
},
{
"epoch": 0.24390243902439024,
"grad_norm": 15.431132077300076,
"learning_rate": 0.0002995623358373386,
"loss": 7.9614,
"step": 145
},
{
"epoch": 0.2523128679562658,
"grad_norm": 6.581926090085861,
"learning_rate": 0.0002993779452862235,
"loss": 4.2109,
"step": 150
},
{
"epoch": 0.2607232968881413,
"grad_norm": 7.662219825964168,
"learning_rate": 0.0002991613022628211,
"loss": 3.4552,
"step": 155
},
{
"epoch": 0.2691337258200168,
"grad_norm": 2.538849805479563,
"learning_rate": 0.00029891245354293284,
"loss": 2.1775,
"step": 160
},
{
"epoch": 0.27754415475189237,
"grad_norm": 2.0222241832932824,
"learning_rate": 0.0002986314528559525,
"loss": 2.0086,
"step": 165
},
{
"epoch": 0.28595458368376786,
"grad_norm": 1.6691520526281935,
"learning_rate": 0.0002983183608732653,
"loss": 1.618,
"step": 170
},
{
"epoch": 0.2943650126156434,
"grad_norm": 0.8472051268719076,
"learning_rate": 0.00029797324519514835,
"loss": 1.4006,
"step": 175
},
{
"epoch": 0.3027754415475189,
"grad_norm": 1.2656793919009501,
"learning_rate": 0.0002975961803361749,
"loss": 1.2361,
"step": 180
},
{
"epoch": 0.31118587047939444,
"grad_norm": 16.91854783030072,
"learning_rate": 0.00029718724770912575,
"loss": 1.3024,
"step": 185
},
{
"epoch": 0.31959629941127,
"grad_norm": 0.4264421791196419,
"learning_rate": 0.00029674653560741125,
"loss": 1.2247,
"step": 190
},
{
"epoch": 0.3280067283431455,
"grad_norm": 11.819413233995139,
"learning_rate": 0.00029627413918600773,
"loss": 1.5927,
"step": 195
},
{
"epoch": 0.33641715727502103,
"grad_norm": 0.3872518309676493,
"learning_rate": 0.0002957701604409124,
"loss": 1.2533,
"step": 200
},
{
"epoch": 0.3448275862068966,
"grad_norm": 2.0631756640177508,
"learning_rate": 0.0002952347081871212,
"loss": 1.1339,
"step": 205
},
{
"epoch": 0.35323801513877207,
"grad_norm": 0.2331543310806729,
"learning_rate": 0.00029466789803513435,
"loss": 1.0706,
"step": 210
},
{
"epoch": 0.3616484440706476,
"grad_norm": 0.20061594850021877,
"learning_rate": 0.0002940698523659947,
"loss": 1.1093,
"step": 215
},
{
"epoch": 0.3700588730025231,
"grad_norm": 0.17861349516130504,
"learning_rate": 0.0002934407003048641,
"loss": 1.1008,
"step": 220
},
{
"epoch": 0.37846930193439865,
"grad_norm": 0.11836066059443685,
"learning_rate": 0.00029278057769314384,
"loss": 1.045,
"step": 225
},
{
"epoch": 0.3868797308662742,
"grad_norm": 0.18874539756549927,
"learning_rate": 0.00029208962705914505,
"loss": 1.0056,
"step": 230
},
{
"epoch": 0.3952901597981497,
"grad_norm": 0.45365739150608037,
"learning_rate": 0.00029136799758731473,
"loss": 0.9995,
"step": 235
},
{
"epoch": 0.40370058873002523,
"grad_norm": 0.12762727857651687,
"learning_rate": 0.00029061584508602545,
"loss": 1.006,
"step": 240
},
{
"epoch": 0.4121110176619008,
"grad_norm": 4.885346324632363,
"learning_rate": 0.0002898333319539341,
"loss": 1.3006,
"step": 245
},
{
"epoch": 0.42052144659377627,
"grad_norm": 0.6018215992100235,
"learning_rate": 0.0002890206271449186,
"loss": 1.0966,
"step": 250
},
{
"epoch": 0.4289318755256518,
"grad_norm": 0.4059499155938192,
"learning_rate": 0.00028817790613159817,
"loss": 1.0764,
"step": 255
},
{
"epoch": 0.4373423044575273,
"grad_norm": 7.707406865225783,
"learning_rate": 0.0002873053508674471,
"loss": 1.1362,
"step": 260
},
{
"epoch": 0.44575273338940286,
"grad_norm": 0.22108881306250477,
"learning_rate": 0.00028640314974750884,
"loss": 1.0774,
"step": 265
},
{
"epoch": 0.4541631623212784,
"grad_norm": 3.3347201885551807,
"learning_rate": 0.00028547149756771894,
"loss": 1.1651,
"step": 270
},
{
"epoch": 0.4625735912531539,
"grad_norm": 0.34254879266979626,
"learning_rate": 0.00028451059548284665,
"loss": 1.1397,
"step": 275
},
{
"epoch": 0.47098402018502944,
"grad_norm": 0.1735673338436184,
"learning_rate": 0.00028352065096306307,
"loss": 1.0421,
"step": 280
},
{
"epoch": 0.479394449116905,
"grad_norm": 0.16104913537338722,
"learning_rate": 0.0002825018777491458,
"loss": 1.0461,
"step": 285
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.1831953506941342,
"learning_rate": 0.00028145449580632996,
"loss": 0.9887,
"step": 290
},
{
"epoch": 0.496215306980656,
"grad_norm": 0.13125693879950318,
"learning_rate": 0.0002803787312768149,
"loss": 0.9847,
"step": 295
},
{
"epoch": 0.5046257359125316,
"grad_norm": 0.45943182895269363,
"learning_rate": 0.00027927481643093754,
"loss": 1.0187,
"step": 300
},
{
"epoch": 0.5130361648444071,
"grad_norm": 1.177847689359211,
"learning_rate": 0.0002781429896170223,
"loss": 1.0201,
"step": 305
},
{
"epoch": 0.5214465937762826,
"grad_norm": 11.730278476509266,
"learning_rate": 0.0002769834952099191,
"loss": 1.2084,
"step": 310
},
{
"epoch": 0.5298570227081582,
"grad_norm": 0.26427554458970376,
"learning_rate": 0.0002757965835582397,
"loss": 1.102,
"step": 315
},
{
"epoch": 0.5382674516400336,
"grad_norm": 0.45790641000811105,
"learning_rate": 0.0002745825109303045,
"loss": 1.0614,
"step": 320
},
{
"epoch": 0.5466778805719091,
"grad_norm": 0.1621409545630021,
"learning_rate": 0.0002733415394588114,
"loss": 1.0228,
"step": 325
},
{
"epoch": 0.5550883095037847,
"grad_norm": 0.11351840128403332,
"learning_rate": 0.0002720739370842379,
"loss": 0.9773,
"step": 330
},
{
"epoch": 0.5634987384356602,
"grad_norm": 0.13847315137184252,
"learning_rate": 0.0002707799774969897,
"loss": 1.0054,
"step": 335
},
{
"epoch": 0.5719091673675357,
"grad_norm": 0.10777953928312381,
"learning_rate": 0.0002694599400783078,
"loss": 0.9851,
"step": 340
},
{
"epoch": 0.5803195962994113,
"grad_norm": 0.1256406052654263,
"learning_rate": 0.00026811410983994667,
"loss": 1.0163,
"step": 345
},
{
"epoch": 0.5887300252312868,
"grad_norm": 0.16866862756321335,
"learning_rate": 0.00026674277736263687,
"loss": 1.0335,
"step": 350
},
{
"epoch": 0.5971404541631623,
"grad_norm": 0.14265754669110667,
"learning_rate": 0.0002653462387333451,
"loss": 0.9956,
"step": 355
},
{
"epoch": 0.6055508830950378,
"grad_norm": 0.5819454877222049,
"learning_rate": 0.0002639247954813458,
"loss": 1.0263,
"step": 360
},
{
"epoch": 0.6139613120269134,
"grad_norm": 0.13535505190898894,
"learning_rate": 0.0002624787545131169,
"loss": 0.9753,
"step": 365
},
{
"epoch": 0.6223717409587889,
"grad_norm": 1.0748063857926706,
"learning_rate": 0.0002610084280460756,
"loss": 0.9945,
"step": 370
},
{
"epoch": 0.6307821698906644,
"grad_norm": 0.14560862560154247,
"learning_rate": 0.00025951413354116665,
"loss": 0.988,
"step": 375
},
{
"epoch": 0.63919259882254,
"grad_norm": 0.1246463854830709,
"learning_rate": 0.0002579961936343188,
"loss": 0.9658,
"step": 380
},
{
"epoch": 0.6476030277544155,
"grad_norm": 0.10713906349411657,
"learning_rate": 0.00025645493606678375,
"loss": 0.9366,
"step": 385
},
{
"epoch": 0.656013456686291,
"grad_norm": 0.1113038542351718,
"learning_rate": 0.00025489069361437326,
"loss": 0.9758,
"step": 390
},
{
"epoch": 0.6644238856181666,
"grad_norm": 1.3831640086044374,
"learning_rate": 0.00025330380401560846,
"loss": 0.9575,
"step": 395
},
{
"epoch": 0.6728343145500421,
"grad_norm": 0.11166426611159094,
"learning_rate": 0.0002516946098987985,
"loss": 0.963,
"step": 400
},
{
"epoch": 0.6812447434819175,
"grad_norm": 1.0002894310288515,
"learning_rate": 0.0002500634587080628,
"loss": 0.9299,
"step": 405
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.1755490764589915,
"learning_rate": 0.0002484107026283137,
"loss": 0.9814,
"step": 410
},
{
"epoch": 0.6980656013456686,
"grad_norm": 4.147990806845651,
"learning_rate": 0.00024673669850921575,
"loss": 1.0252,
"step": 415
},
{
"epoch": 0.7064760302775441,
"grad_norm": 0.14199796426249658,
"learning_rate": 0.0002450418077881374,
"loss": 0.9578,
"step": 420
},
{
"epoch": 0.7148864592094197,
"grad_norm": 0.10967054022758699,
"learning_rate": 0.0002433263964121127,
"loss": 0.9543,
"step": 425
},
{
"epoch": 0.7232968881412952,
"grad_norm": 0.10681488534249564,
"learning_rate": 0.00024159083475882854,
"loss": 0.947,
"step": 430
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.0836144209353973,
"learning_rate": 0.00023983549755665623,
"loss": 0.966,
"step": 435
},
{
"epoch": 0.7401177460050462,
"grad_norm": 6.618428479377572,
"learning_rate": 0.00023806076380374262,
"loss": 0.9755,
"step": 440
},
{
"epoch": 0.7485281749369218,
"grad_norm": 0.3700536075693679,
"learning_rate": 0.00023626701668618048,
"loss": 0.9439,
"step": 445
},
{
"epoch": 0.7569386038687973,
"grad_norm": 0.1339452802214274,
"learning_rate": 0.00023445464349527363,
"loss": 0.9393,
"step": 450
},
{
"epoch": 0.7653490328006728,
"grad_norm": 0.15063483951074022,
"learning_rate": 0.00023262403554391643,
"loss": 0.9561,
"step": 455
},
{
"epoch": 0.7737594617325484,
"grad_norm": 0.12236138289386866,
"learning_rate": 0.0002307755880821044,
"loss": 0.9757,
"step": 460
},
{
"epoch": 0.7821698906644239,
"grad_norm": 0.09859340116113559,
"learning_rate": 0.00022890970021159545,
"loss": 0.9699,
"step": 465
},
{
"epoch": 0.7905803195962994,
"grad_norm": 0.3134759911324595,
"learning_rate": 0.00022702677479973857,
"loss": 0.9387,
"step": 470
},
{
"epoch": 0.798990748528175,
"grad_norm": 0.12183559177618263,
"learning_rate": 0.00022512721839249044,
"loss": 0.8985,
"step": 475
},
{
"epoch": 0.8074011774600505,
"grad_norm": 0.08229469965958389,
"learning_rate": 0.00022321144112663708,
"loss": 0.9504,
"step": 480
},
{
"epoch": 0.815811606391926,
"grad_norm": 0.14924593232770486,
"learning_rate": 0.00022127985664124048,
"loss": 0.9338,
"step": 485
},
{
"epoch": 0.8242220353238016,
"grad_norm": 0.2786737248586552,
"learning_rate": 0.0002193328819883292,
"loss": 1.0327,
"step": 490
},
{
"epoch": 0.832632464255677,
"grad_norm": 0.1485555431366647,
"learning_rate": 0.00021737093754285147,
"loss": 0.9499,
"step": 495
},
{
"epoch": 0.8410428931875525,
"grad_norm": 0.10549366930700854,
"learning_rate": 0.00021539444691191174,
"loss": 0.8961,
"step": 500
},
{
"epoch": 0.8494533221194281,
"grad_norm": 0.2501853130288003,
"learning_rate": 0.0002134038368433085,
"loss": 0.973,
"step": 505
},
{
"epoch": 0.8578637510513036,
"grad_norm": 0.08142359041013753,
"learning_rate": 0.00021139953713339454,
"loss": 0.9262,
"step": 510
},
{
"epoch": 0.8662741799831791,
"grad_norm": 0.1101621057008162,
"learning_rate": 0.00020938198053427885,
"loss": 0.9462,
"step": 515
},
{
"epoch": 0.8746846089150546,
"grad_norm": 0.08929629784060397,
"learning_rate": 0.00020735160266039006,
"loss": 0.9227,
"step": 520
},
{
"epoch": 0.8830950378469302,
"grad_norm": 0.15490402519303073,
"learning_rate": 0.00020530884189442244,
"loss": 0.9077,
"step": 525
},
{
"epoch": 0.8915054667788057,
"grad_norm": 0.0899609096113038,
"learning_rate": 0.00020325413929268369,
"loss": 0.9309,
"step": 530
},
{
"epoch": 0.8999158957106812,
"grad_norm": 0.08860120877271818,
"learning_rate": 0.00020118793848986554,
"loss": 0.9581,
"step": 535
},
{
"epoch": 0.9083263246425568,
"grad_norm": 0.08667002021610543,
"learning_rate": 0.00019911068560325804,
"loss": 0.8893,
"step": 540
},
{
"epoch": 0.9167367535744323,
"grad_norm": 0.1020909544402857,
"learning_rate": 0.00019702282913642723,
"loss": 0.8789,
"step": 545
},
{
"epoch": 0.9251471825063078,
"grad_norm": 0.8881918960589515,
"learning_rate": 0.00019492481988237818,
"loss": 1.0281,
"step": 550
},
{
"epoch": 0.9335576114381834,
"grad_norm": 1.7309452103787781,
"learning_rate": 0.00019281711082622314,
"loss": 1.5781,
"step": 555
},
{
"epoch": 0.9419680403700589,
"grad_norm": 0.9824981029734958,
"learning_rate": 0.000190700157047377,
"loss": 1.9042,
"step": 560
},
{
"epoch": 0.9503784693019344,
"grad_norm": 30.47237068603654,
"learning_rate": 0.0001885744156212999,
"loss": 2.2642,
"step": 565
},
{
"epoch": 0.95878889823381,
"grad_norm": 13.132455734206328,
"learning_rate": 0.0001864403455208094,
"loss": 2.0529,
"step": 570
},
{
"epoch": 0.9671993271656855,
"grad_norm": 1.5514837112811664,
"learning_rate": 0.00018429840751698284,
"loss": 1.882,
"step": 575
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.7978250850775721,
"learning_rate": 0.00018214906407967136,
"loss": 1.0936,
"step": 580
},
{
"epoch": 0.9840201850294366,
"grad_norm": 0.24502338468242643,
"learning_rate": 0.00017999277927764696,
"loss": 0.9768,
"step": 585
},
{
"epoch": 0.992430613961312,
"grad_norm": 0.14072495383675704,
"learning_rate": 0.00017783001867840488,
"loss": 0.991,
"step": 590
},
{
"epoch": 1.0,
"grad_norm": 0.14826034089833667,
"learning_rate": 0.00017566124924764176,
"loss": 0.9232,
"step": 595
},
{
"epoch": 1.0084104289318756,
"grad_norm": 0.1423011247047076,
"learning_rate": 0.00017348693924843238,
"loss": 0.7383,
"step": 600
},
{
"epoch": 1.016820857863751,
"grad_norm": 18.945959105143892,
"learning_rate": 0.00017130755814012607,
"loss": 0.8006,
"step": 605
},
{
"epoch": 1.0252312867956266,
"grad_norm": 0.2063458884840928,
"learning_rate": 0.0001691235764769848,
"loss": 0.7464,
"step": 610
},
{
"epoch": 1.0336417157275022,
"grad_norm": 0.10576479121191024,
"learning_rate": 0.00016693546580658493,
"loss": 0.7066,
"step": 615
},
{
"epoch": 1.0420521446593776,
"grad_norm": 0.10366396212312518,
"learning_rate": 0.00016474369856800457,
"loss": 0.7339,
"step": 620
},
{
"epoch": 1.0504625735912532,
"grad_norm": 0.09658540475190697,
"learning_rate": 0.00016254874798981835,
"loss": 0.7111,
"step": 625
},
{
"epoch": 1.0588730025231288,
"grad_norm": 0.08877086888965659,
"learning_rate": 0.00016035108798792165,
"loss": 0.71,
"step": 630
},
{
"epoch": 1.0672834314550042,
"grad_norm": 0.10967646061522097,
"learning_rate": 0.00015815119306320657,
"loss": 0.7296,
"step": 635
},
{
"epoch": 1.0756938603868798,
"grad_norm": 0.0933812784388927,
"learning_rate": 0.0001559495381991117,
"loss": 0.7361,
"step": 640
},
{
"epoch": 1.0841042893187554,
"grad_norm": 0.19214296231083186,
"learning_rate": 0.00015374659875906752,
"loss": 0.7134,
"step": 645
},
{
"epoch": 1.0925147182506307,
"grad_norm": 0.11688119487848063,
"learning_rate": 0.00015154285038385937,
"loss": 0.6893,
"step": 650
},
{
"epoch": 1.1009251471825063,
"grad_norm": 0.09391812895085212,
"learning_rate": 0.00014933876888893164,
"loss": 0.6963,
"step": 655
},
{
"epoch": 1.1093355761143817,
"grad_norm": 0.10077136979307887,
"learning_rate": 0.0001471348301616531,
"loss": 0.7436,
"step": 660
},
{
"epoch": 1.1177460050462573,
"grad_norm": 0.08225000161444689,
"learning_rate": 0.00014493151005856724,
"loss": 0.7004,
"step": 665
},
{
"epoch": 1.126156433978133,
"grad_norm": 0.08801832379105524,
"learning_rate": 0.00014272928430264926,
"loss": 0.722,
"step": 670
},
{
"epoch": 1.1345668629100083,
"grad_norm": 0.07018288331194043,
"learning_rate": 0.00014052862838059195,
"loss": 0.6862,
"step": 675
},
{
"epoch": 1.142977291841884,
"grad_norm": 0.0767198314720291,
"learning_rate": 0.00013833001744014212,
"loss": 0.7101,
"step": 680
},
{
"epoch": 1.1513877207737595,
"grad_norm": 0.07407300906943415,
"learning_rate": 0.00013613392618751086,
"loss": 0.7175,
"step": 685
},
{
"epoch": 1.1597981497056349,
"grad_norm": 0.07352937678988827,
"learning_rate": 0.00013394082878487884,
"loss": 0.7004,
"step": 690
},
{
"epoch": 1.1682085786375105,
"grad_norm": 0.07551179793852605,
"learning_rate": 0.00013175119874801874,
"loss": 0.7054,
"step": 695
},
{
"epoch": 1.176619007569386,
"grad_norm": 0.07052660785539094,
"learning_rate": 0.000129565508844058,
"loss": 0.6789,
"step": 700
},
{
"epoch": 1.1850294365012615,
"grad_norm": 0.0807185011796273,
"learning_rate": 0.00012738423098940244,
"loss": 0.6873,
"step": 705
},
{
"epoch": 1.193439865433137,
"grad_norm": 0.1801375449127257,
"learning_rate": 0.0001252078361478441,
"loss": 0.7115,
"step": 710
},
{
"epoch": 1.2018502943650127,
"grad_norm": 0.07403831159848352,
"learning_rate": 0.00012303679422887457,
"loss": 0.6882,
"step": 715
},
{
"epoch": 1.210260723296888,
"grad_norm": 0.08819212786222724,
"learning_rate": 0.00012087157398622575,
"loss": 0.688,
"step": 720
},
{
"epoch": 1.2186711522287637,
"grad_norm": 0.7006553252478528,
"learning_rate": 0.0001187126429166605,
"loss": 0.8005,
"step": 725
},
{
"epoch": 1.2270815811606393,
"grad_norm": 0.08128464358584869,
"learning_rate": 0.00011656046715903468,
"loss": 0.6865,
"step": 730
},
{
"epoch": 1.2354920100925146,
"grad_norm": 0.11015841491813824,
"learning_rate": 0.00011441551139365197,
"loss": 0.6476,
"step": 735
},
{
"epoch": 1.2439024390243902,
"grad_norm": 0.09681229577155445,
"learning_rate": 0.0001122782387419339,
"loss": 0.6525,
"step": 740
},
{
"epoch": 1.2523128679562658,
"grad_norm": 0.07074288159266276,
"learning_rate": 0.00011014911066642675,
"loss": 0.7101,
"step": 745
},
{
"epoch": 1.2607232968881412,
"grad_norm": 0.07038355444461111,
"learning_rate": 0.00010802858687116586,
"loss": 0.685,
"step": 750
},
{
"epoch": 1.2691337258200168,
"grad_norm": 0.06694809495021711,
"learning_rate": 0.00010591712520242033,
"loss": 0.6435,
"step": 755
},
{
"epoch": 1.2775441547518924,
"grad_norm": 1.54283051971332,
"learning_rate": 0.00010381518154983872,
"loss": 0.6707,
"step": 760
},
{
"epoch": 1.2859545836837678,
"grad_norm": 0.14970032839830041,
"learning_rate": 0.00010172320974801662,
"loss": 0.6541,
"step": 765
},
{
"epoch": 1.2943650126156434,
"grad_norm": 0.07670287277238487,
"learning_rate": 9.964166147850868e-05,
"loss": 0.6746,
"step": 770
},
{
"epoch": 1.302775441547519,
"grad_norm": 0.0743406403000032,
"learning_rate": 9.757098617230529e-05,
"loss": 0.6622,
"step": 775
},
{
"epoch": 1.3111858704793944,
"grad_norm": 0.06797759191808586,
"learning_rate": 9.551163091279481e-05,
"loss": 0.6556,
"step": 780
},
{
"epoch": 1.31959629941127,
"grad_norm": 0.07795583221869067,
"learning_rate": 9.346404033923304e-05,
"loss": 0.6617,
"step": 785
},
{
"epoch": 1.3280067283431456,
"grad_norm": 0.087821277333218,
"learning_rate": 9.14286565507406e-05,
"loss": 0.71,
"step": 790
},
{
"epoch": 1.336417157275021,
"grad_norm": 0.0680393967098423,
"learning_rate": 8.940591901084799e-05,
"loss": 0.666,
"step": 795
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.20717718965553744,
"learning_rate": 8.739626445261064e-05,
"loss": 0.657,
"step": 800
},
{
"epoch": 1.3532380151387722,
"grad_norm": 0.07605217404928517,
"learning_rate": 8.540012678431284e-05,
"loss": 0.6679,
"step": 805
},
{
"epoch": 1.3616484440706476,
"grad_norm": 0.09933554183914269,
"learning_rate": 8.341793699578171e-05,
"loss": 0.6879,
"step": 810
},
{
"epoch": 1.3700588730025232,
"grad_norm": 0.07269335218428621,
"learning_rate": 8.145012306533162e-05,
"loss": 0.656,
"step": 815
},
{
"epoch": 1.3784693019343988,
"grad_norm": 0.0938405449110704,
"learning_rate": 7.949710986735854e-05,
"loss": 0.6542,
"step": 820
},
{
"epoch": 1.3868797308662741,
"grad_norm": 0.06387119139044767,
"learning_rate": 7.755931908060427e-05,
"loss": 0.6835,
"step": 825
},
{
"epoch": 1.3952901597981497,
"grad_norm": 0.6498998302961293,
"learning_rate": 7.563716909711155e-05,
"loss": 0.6912,
"step": 830
},
{
"epoch": 1.4037005887300253,
"grad_norm": 0.07172546327436916,
"learning_rate": 7.373107493188776e-05,
"loss": 0.6397,
"step": 835
},
{
"epoch": 1.4121110176619007,
"grad_norm": 0.22055958275283316,
"learning_rate": 7.184144813329845e-05,
"loss": 0.665,
"step": 840
},
{
"epoch": 1.4205214465937763,
"grad_norm": 0.08300391135437629,
"learning_rate": 6.996869669420934e-05,
"loss": 0.6781,
"step": 845
},
{
"epoch": 1.428931875525652,
"grad_norm": 0.06921133210004143,
"learning_rate": 6.811322496389547e-05,
"loss": 0.6743,
"step": 850
},
{
"epoch": 1.4373423044575273,
"grad_norm": 0.0711963752977775,
"learning_rate": 6.627543356073752e-05,
"loss": 0.6409,
"step": 855
},
{
"epoch": 1.445752733389403,
"grad_norm": 0.06420927363983264,
"learning_rate": 6.445571928572372e-05,
"loss": 0.64,
"step": 860
},
{
"epoch": 1.4541631623212785,
"grad_norm": 0.06501526947017142,
"learning_rate": 6.265447503677568e-05,
"loss": 0.7054,
"step": 865
},
{
"epoch": 1.462573591253154,
"grad_norm": 0.21066994089465826,
"learning_rate": 6.087208972391683e-05,
"loss": 0.6918,
"step": 870
},
{
"epoch": 1.4709840201850295,
"grad_norm": 0.08759599924052495,
"learning_rate": 5.910894818530261e-05,
"loss": 0.6709,
"step": 875
},
{
"epoch": 1.479394449116905,
"grad_norm": 1.0585940281103237,
"learning_rate": 5.736543110412889e-05,
"loss": 0.7003,
"step": 880
},
{
"epoch": 1.4878048780487805,
"grad_norm": 0.10929958796237216,
"learning_rate": 5.564191492643813e-05,
"loss": 0.6804,
"step": 885
},
{
"epoch": 1.496215306980656,
"grad_norm": 0.07895221603568614,
"learning_rate": 5.393877177984039e-05,
"loss": 0.6609,
"step": 890
},
{
"epoch": 1.5046257359125317,
"grad_norm": 0.0705592884671557,
"learning_rate": 5.225636939316621e-05,
"loss": 0.6438,
"step": 895
},
{
"epoch": 1.513036164844407,
"grad_norm": 0.06645146330681664,
"learning_rate": 5.059507101706976e-05,
"loss": 0.658,
"step": 900
},
{
"epoch": 1.5214465937762824,
"grad_norm": 0.06572346800316721,
"learning_rate": 4.8955235345598825e-05,
"loss": 0.6409,
"step": 905
},
{
"epoch": 1.5298570227081583,
"grad_norm": 0.06268956414981779,
"learning_rate": 4.7337216438748384e-05,
"loss": 0.6657,
"step": 910
},
{
"epoch": 1.5382674516400336,
"grad_norm": 0.0705504423985544,
"learning_rate": 4.5741363646014696e-05,
"loss": 0.6631,
"step": 915
},
{
"epoch": 1.546677880571909,
"grad_norm": 0.06497073556361815,
"learning_rate": 4.416802153096696e-05,
"loss": 0.6319,
"step": 920
},
{
"epoch": 1.5550883095037848,
"grad_norm": 0.32565910926601543,
"learning_rate": 4.261752979685159e-05,
"loss": 0.6691,
"step": 925
},
{
"epoch": 1.5634987384356602,
"grad_norm": 0.05949233651666068,
"learning_rate": 4.1090223213246404e-05,
"loss": 0.6349,
"step": 930
},
{
"epoch": 1.5719091673675356,
"grad_norm": 0.07160645769568097,
"learning_rate": 3.958643154378005e-05,
"loss": 0.6688,
"step": 935
},
{
"epoch": 1.5803195962994114,
"grad_norm": 0.7327591054848349,
"learning_rate": 3.8106479474931795e-05,
"loss": 0.645,
"step": 940
},
{
"epoch": 1.5887300252312868,
"grad_norm": 0.062156634516279057,
"learning_rate": 3.665068654592806e-05,
"loss": 0.6373,
"step": 945
},
{
"epoch": 1.5971404541631622,
"grad_norm": 0.06660764066083266,
"learning_rate": 3.5219367079750205e-05,
"loss": 0.698,
"step": 950
},
{
"epoch": 1.6055508830950378,
"grad_norm": 0.06327609087248642,
"learning_rate": 3.381283011526819e-05,
"loss": 0.6231,
"step": 955
},
{
"epoch": 1.6139613120269134,
"grad_norm": 0.4333939216760815,
"learning_rate": 3.243137934051569e-05,
"loss": 0.6252,
"step": 960
},
{
"epoch": 1.6223717409587888,
"grad_norm": 0.06658129188396517,
"learning_rate": 3.1075313027120016e-05,
"loss": 0.6726,
"step": 965
},
{
"epoch": 1.6307821698906644,
"grad_norm": 0.26691445324073054,
"learning_rate": 2.97449239659018e-05,
"loss": 0.6342,
"step": 970
},
{
"epoch": 1.63919259882254,
"grad_norm": 0.0605021899974428,
"learning_rate": 2.8440499403658122e-05,
"loss": 0.6332,
"step": 975
},
{
"epoch": 1.6476030277544154,
"grad_norm": 0.061846457580604676,
"learning_rate": 2.7162320981142316e-05,
"loss": 0.651,
"step": 980
},
{
"epoch": 1.656013456686291,
"grad_norm": 0.1452013715559192,
"learning_rate": 2.5910664672254428e-05,
"loss": 0.6646,
"step": 985
},
{
"epoch": 1.6644238856181666,
"grad_norm": 0.06326743122936844,
"learning_rate": 2.4685800724455384e-05,
"loss": 0.6433,
"step": 990
},
{
"epoch": 1.672834314550042,
"grad_norm": 0.07852388598087191,
"learning_rate": 2.3487993600416967e-05,
"loss": 0.6371,
"step": 995
},
{
"epoch": 1.6812447434819175,
"grad_norm": 0.06601596346267856,
"learning_rate": 2.2317501920921576e-05,
"loss": 0.6471,
"step": 1000
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.059266994188209014,
"learning_rate": 2.1174578409022702e-05,
"loss": 0.6699,
"step": 1005
},
{
"epoch": 1.6980656013456685,
"grad_norm": 0.07452888371241441,
"learning_rate": 2.0059469835479054e-05,
"loss": 0.6473,
"step": 1010
},
{
"epoch": 1.7064760302775441,
"grad_norm": 0.07457779730712975,
"learning_rate": 1.8972416965473803e-05,
"loss": 0.6378,
"step": 1015
},
{
"epoch": 1.7148864592094197,
"grad_norm": 0.06910347568006753,
"learning_rate": 1.7913654506630655e-05,
"loss": 0.6274,
"step": 1020
},
{
"epoch": 1.7232968881412951,
"grad_norm": 0.07064622370312863,
"learning_rate": 1.6883411058337543e-05,
"loss": 0.7109,
"step": 1025
},
{
"epoch": 1.7317073170731707,
"grad_norm": 0.06048167956891708,
"learning_rate": 1.5881909062389285e-05,
"loss": 0.6333,
"step": 1030
},
{
"epoch": 1.7401177460050463,
"grad_norm": 0.06307286865715722,
"learning_rate": 1.4909364754959985e-05,
"loss": 0.646,
"step": 1035
},
{
"epoch": 1.7485281749369217,
"grad_norm": 0.05911925581755182,
"learning_rate": 1.3965988119914734e-05,
"loss": 0.6433,
"step": 1040
},
{
"epoch": 1.7569386038687973,
"grad_norm": 0.06333975944083138,
"learning_rate": 1.305198284347191e-05,
"loss": 0.6339,
"step": 1045
},
{
"epoch": 1.765349032800673,
"grad_norm": 0.05514146566145015,
"learning_rate": 1.2167546270224743e-05,
"loss": 0.6264,
"step": 1050
},
{
"epoch": 1.7737594617325483,
"grad_norm": 0.05609433763926122,
"learning_rate": 1.1312869360532295e-05,
"loss": 0.6337,
"step": 1055
},
{
"epoch": 1.7821698906644239,
"grad_norm": 0.05988904119560809,
"learning_rate": 1.0488136649288847e-05,
"loss": 0.6215,
"step": 1060
},
{
"epoch": 1.7905803195962995,
"grad_norm": 0.09114152521890548,
"learning_rate": 9.693526206080693e-06,
"loss": 0.6385,
"step": 1065
},
{
"epoch": 1.7989907485281749,
"grad_norm": 0.09735030405268541,
"learning_rate": 8.929209596738706e-06,
"loss": 0.6077,
"step": 1070
},
{
"epoch": 1.8074011774600505,
"grad_norm": 0.0635144022515458,
"learning_rate": 8.195351846295262e-06,
"loss": 0.6647,
"step": 1075
},
{
"epoch": 1.815811606391926,
"grad_norm": 0.08588808547970551,
"learning_rate": 7.492111403353462e-06,
"loss": 0.6523,
"step": 1080
},
{
"epoch": 1.8242220353238014,
"grad_norm": 0.057867203058913214,
"learning_rate": 6.819640105876062e-06,
"loss": 0.6347,
"step": 1085
},
{
"epoch": 1.832632464255677,
"grad_norm": 0.07139018525983484,
"learning_rate": 6.1780831484019684e-06,
"loss": 0.6247,
"step": 1090
},
{
"epoch": 1.8410428931875527,
"grad_norm": 0.058328293614124185,
"learning_rate": 5.567579050696957e-06,
"loss": 0.6387,
"step": 1095
},
{
"epoch": 1.849453322119428,
"grad_norm": 0.05729437886303193,
"learning_rate": 4.9882596278455756e-06,
"loss": 0.6443,
"step": 1100
},
{
"epoch": 1.8578637510513036,
"grad_norm": 0.05592472433203098,
"learning_rate": 4.440249961790826e-06,
"loss": 0.5956,
"step": 1105
},
{
"epoch": 1.8662741799831792,
"grad_norm": 0.06609120678589044,
"learning_rate": 3.923668374327338e-06,
"loss": 0.6619,
"step": 1110
},
{
"epoch": 1.8746846089150546,
"grad_norm": 0.05566880083508612,
"learning_rate": 3.438626401554351e-06,
"loss": 0.6077,
"step": 1115
},
{
"epoch": 1.8830950378469302,
"grad_norm": 0.05589315119066426,
"learning_rate": 2.9852287697938125e-06,
"loss": 0.6408,
"step": 1120
},
{
"epoch": 1.8915054667788058,
"grad_norm": 0.06795657947769265,
"learning_rate": 2.563573372978617e-06,
"loss": 0.6465,
"step": 1125
},
{
"epoch": 1.8999158957106812,
"grad_norm": 0.05715913746496311,
"learning_rate": 2.173751251516209e-06,
"loss": 0.6673,
"step": 1130
},
{
"epoch": 1.9083263246425568,
"grad_norm": 0.07516171647382662,
"learning_rate": 1.8158465726318294e-06,
"loss": 0.6135,
"step": 1135
},
{
"epoch": 1.9167367535744324,
"grad_norm": 0.08828033210229927,
"learning_rate": 1.4899366121958634e-06,
"loss": 0.6472,
"step": 1140
},
{
"epoch": 1.9251471825063078,
"grad_norm": 0.05375437444470602,
"learning_rate": 1.19609173803904e-06,
"loss": 0.5937,
"step": 1145
},
{
"epoch": 1.9335576114381834,
"grad_norm": 0.07199465636203523,
"learning_rate": 9.343753947591681e-07,
"loss": 0.6219,
"step": 1150
},
{
"epoch": 1.941968040370059,
"grad_norm": 0.11526483170723129,
"learning_rate": 7.048440900226937e-07,
"loss": 0.6342,
"step": 1155
},
{
"epoch": 1.9503784693019344,
"grad_norm": 0.05354167010798482,
"learning_rate": 5.075473823640597e-07,
"loss": 0.6553,
"step": 1160
},
{
"epoch": 1.95878889823381,
"grad_norm": 0.09214456642437543,
"learning_rate": 3.425278704853984e-07,
"loss": 0.6636,
"step": 1165
},
{
"epoch": 1.9671993271656856,
"grad_norm": 0.06917853774104925,
"learning_rate": 2.0982118405897251e-07,
"loss": 0.6319,
"step": 1170
},
{
"epoch": 1.975609756097561,
"grad_norm": 0.05500253350216091,
"learning_rate": 1.0945597603431167e-07,
"loss": 0.6157,
"step": 1175
},
{
"epoch": 1.9840201850294366,
"grad_norm": 0.056648521587817115,
"learning_rate": 4.145391645166141e-08,
"loss": 0.654,
"step": 1180
},
{
"epoch": 1.9924306139613122,
"grad_norm": 0.06416190308164577,
"learning_rate": 5.829687763259094e-09,
"loss": 0.6517,
"step": 1185
},
{
"epoch": 1.9974768713204374,
"step": 1188,
"total_flos": 7.94530881371059e+18,
"train_loss": 0.9845614412216225,
"train_runtime": 14065.2912,
"train_samples_per_second": 2.704,
"train_steps_per_second": 0.084
}
],
"logging_steps": 5,
"max_steps": 1188,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.94530881371059e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}