{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1257,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02386634844868735,
"grad_norm": 2.8815478378979726,
"learning_rate": 5e-06,
"loss": 0.8847,
"step": 10
},
{
"epoch": 0.0477326968973747,
"grad_norm": 2.1111681660055823,
"learning_rate": 5e-06,
"loss": 0.7795,
"step": 20
},
{
"epoch": 0.07159904534606205,
"grad_norm": 0.7661989040642819,
"learning_rate": 5e-06,
"loss": 0.7503,
"step": 30
},
{
"epoch": 0.0954653937947494,
"grad_norm": 0.9394058763103804,
"learning_rate": 5e-06,
"loss": 0.7339,
"step": 40
},
{
"epoch": 0.11933174224343675,
"grad_norm": 0.8225402898620612,
"learning_rate": 5e-06,
"loss": 0.7135,
"step": 50
},
{
"epoch": 0.1431980906921241,
"grad_norm": 0.7799933213085141,
"learning_rate": 5e-06,
"loss": 0.7122,
"step": 60
},
{
"epoch": 0.16706443914081145,
"grad_norm": 0.5800160564456466,
"learning_rate": 5e-06,
"loss": 0.6973,
"step": 70
},
{
"epoch": 0.1909307875894988,
"grad_norm": 0.597063104491002,
"learning_rate": 5e-06,
"loss": 0.6903,
"step": 80
},
{
"epoch": 0.21479713603818615,
"grad_norm": 0.5997913835782436,
"learning_rate": 5e-06,
"loss": 0.6808,
"step": 90
},
{
"epoch": 0.2386634844868735,
"grad_norm": 0.9361461454403766,
"learning_rate": 5e-06,
"loss": 0.6882,
"step": 100
},
{
"epoch": 0.26252983293556087,
"grad_norm": 0.6783993996639943,
"learning_rate": 5e-06,
"loss": 0.6836,
"step": 110
},
{
"epoch": 0.2863961813842482,
"grad_norm": 0.5795802549448508,
"learning_rate": 5e-06,
"loss": 0.6806,
"step": 120
},
{
"epoch": 0.31026252983293556,
"grad_norm": 0.5386116555684645,
"learning_rate": 5e-06,
"loss": 0.6786,
"step": 130
},
{
"epoch": 0.3341288782816229,
"grad_norm": 1.1955667749232783,
"learning_rate": 5e-06,
"loss": 0.673,
"step": 140
},
{
"epoch": 0.35799522673031026,
"grad_norm": 0.963473662374355,
"learning_rate": 5e-06,
"loss": 0.6791,
"step": 150
},
{
"epoch": 0.3818615751789976,
"grad_norm": 0.5375818632492324,
"learning_rate": 5e-06,
"loss": 0.6715,
"step": 160
},
{
"epoch": 0.40572792362768495,
"grad_norm": 0.5122467752567826,
"learning_rate": 5e-06,
"loss": 0.6672,
"step": 170
},
{
"epoch": 0.4295942720763723,
"grad_norm": 0.5438018622021655,
"learning_rate": 5e-06,
"loss": 0.6696,
"step": 180
},
{
"epoch": 0.45346062052505964,
"grad_norm": 0.5443450875717797,
"learning_rate": 5e-06,
"loss": 0.6707,
"step": 190
},
{
"epoch": 0.477326968973747,
"grad_norm": 0.5591666330075007,
"learning_rate": 5e-06,
"loss": 0.6629,
"step": 200
},
{
"epoch": 0.5011933174224343,
"grad_norm": 0.7316386056094906,
"learning_rate": 5e-06,
"loss": 0.6601,
"step": 210
},
{
"epoch": 0.5250596658711217,
"grad_norm": 0.8169771743047101,
"learning_rate": 5e-06,
"loss": 0.666,
"step": 220
},
{
"epoch": 0.548926014319809,
"grad_norm": 0.5023436573258486,
"learning_rate": 5e-06,
"loss": 0.6655,
"step": 230
},
{
"epoch": 0.5727923627684964,
"grad_norm": 0.5715922888425466,
"learning_rate": 5e-06,
"loss": 0.6621,
"step": 240
},
{
"epoch": 0.5966587112171837,
"grad_norm": 0.5978492051245125,
"learning_rate": 5e-06,
"loss": 0.6606,
"step": 250
},
{
"epoch": 0.6205250596658711,
"grad_norm": 0.5562863722589444,
"learning_rate": 5e-06,
"loss": 0.6646,
"step": 260
},
{
"epoch": 0.6443914081145584,
"grad_norm": 0.5933691995834427,
"learning_rate": 5e-06,
"loss": 0.6583,
"step": 270
},
{
"epoch": 0.6682577565632458,
"grad_norm": 0.5981641076306046,
"learning_rate": 5e-06,
"loss": 0.6557,
"step": 280
},
{
"epoch": 0.6921241050119332,
"grad_norm": 0.8109530838139422,
"learning_rate": 5e-06,
"loss": 0.6582,
"step": 290
},
{
"epoch": 0.7159904534606205,
"grad_norm": 0.5965206875329182,
"learning_rate": 5e-06,
"loss": 0.6621,
"step": 300
},
{
"epoch": 0.7398568019093079,
"grad_norm": 0.486222749934066,
"learning_rate": 5e-06,
"loss": 0.6549,
"step": 310
},
{
"epoch": 0.7637231503579952,
"grad_norm": 0.5522832083975265,
"learning_rate": 5e-06,
"loss": 0.6496,
"step": 320
},
{
"epoch": 0.7875894988066826,
"grad_norm": 0.5396338578678825,
"learning_rate": 5e-06,
"loss": 0.6431,
"step": 330
},
{
"epoch": 0.8114558472553699,
"grad_norm": 0.509360104131435,
"learning_rate": 5e-06,
"loss": 0.6458,
"step": 340
},
{
"epoch": 0.8353221957040573,
"grad_norm": 0.5620324892726529,
"learning_rate": 5e-06,
"loss": 0.6564,
"step": 350
},
{
"epoch": 0.8591885441527446,
"grad_norm": 0.6119146933240237,
"learning_rate": 5e-06,
"loss": 0.6534,
"step": 360
},
{
"epoch": 0.883054892601432,
"grad_norm": 0.5839021797409776,
"learning_rate": 5e-06,
"loss": 0.6565,
"step": 370
},
{
"epoch": 0.9069212410501193,
"grad_norm": 0.43401159052073285,
"learning_rate": 5e-06,
"loss": 0.6535,
"step": 380
},
{
"epoch": 0.9307875894988067,
"grad_norm": 0.5668890229094246,
"learning_rate": 5e-06,
"loss": 0.6462,
"step": 390
},
{
"epoch": 0.954653937947494,
"grad_norm": 0.5782226624956547,
"learning_rate": 5e-06,
"loss": 0.6546,
"step": 400
},
{
"epoch": 0.9785202863961814,
"grad_norm": 0.6579659643852935,
"learning_rate": 5e-06,
"loss": 0.641,
"step": 410
},
{
"epoch": 1.0,
"eval_loss": 0.6449207663536072,
"eval_runtime": 41.1778,
"eval_samples_per_second": 273.813,
"eval_steps_per_second": 1.093,
"step": 419
},
{
"epoch": 1.0023866348448687,
"grad_norm": 0.7827774952972171,
"learning_rate": 5e-06,
"loss": 0.6431,
"step": 420
},
{
"epoch": 1.026252983293556,
"grad_norm": 0.6329747383963555,
"learning_rate": 5e-06,
"loss": 0.6134,
"step": 430
},
{
"epoch": 1.0501193317422435,
"grad_norm": 0.5281801985221292,
"learning_rate": 5e-06,
"loss": 0.6083,
"step": 440
},
{
"epoch": 1.0739856801909309,
"grad_norm": 0.7577758408944637,
"learning_rate": 5e-06,
"loss": 0.6061,
"step": 450
},
{
"epoch": 1.097852028639618,
"grad_norm": 0.5351677292156073,
"learning_rate": 5e-06,
"loss": 0.6031,
"step": 460
},
{
"epoch": 1.1217183770883055,
"grad_norm": 0.5015989972497082,
"learning_rate": 5e-06,
"loss": 0.6059,
"step": 470
},
{
"epoch": 1.1455847255369929,
"grad_norm": 0.5967761959033508,
"learning_rate": 5e-06,
"loss": 0.6144,
"step": 480
},
{
"epoch": 1.1694510739856803,
"grad_norm": 0.6199769517789647,
"learning_rate": 5e-06,
"loss": 0.6089,
"step": 490
},
{
"epoch": 1.1933174224343674,
"grad_norm": 0.5989234321280023,
"learning_rate": 5e-06,
"loss": 0.6079,
"step": 500
},
{
"epoch": 1.2171837708830548,
"grad_norm": 0.48214122296698664,
"learning_rate": 5e-06,
"loss": 0.6106,
"step": 510
},
{
"epoch": 1.2410501193317423,
"grad_norm": 0.500906885639557,
"learning_rate": 5e-06,
"loss": 0.6114,
"step": 520
},
{
"epoch": 1.2649164677804297,
"grad_norm": 0.5055182485221988,
"learning_rate": 5e-06,
"loss": 0.6073,
"step": 530
},
{
"epoch": 1.288782816229117,
"grad_norm": 0.5890740590556416,
"learning_rate": 5e-06,
"loss": 0.6074,
"step": 540
},
{
"epoch": 1.3126491646778042,
"grad_norm": 0.48236044063151085,
"learning_rate": 5e-06,
"loss": 0.6128,
"step": 550
},
{
"epoch": 1.3365155131264916,
"grad_norm": 0.5202514925506149,
"learning_rate": 5e-06,
"loss": 0.6058,
"step": 560
},
{
"epoch": 1.360381861575179,
"grad_norm": 0.5228316664959745,
"learning_rate": 5e-06,
"loss": 0.6091,
"step": 570
},
{
"epoch": 1.3842482100238662,
"grad_norm": 0.51530770994292,
"learning_rate": 5e-06,
"loss": 0.6087,
"step": 580
},
{
"epoch": 1.4081145584725536,
"grad_norm": 0.6386559379894787,
"learning_rate": 5e-06,
"loss": 0.6116,
"step": 590
},
{
"epoch": 1.431980906921241,
"grad_norm": 0.4779744600855222,
"learning_rate": 5e-06,
"loss": 0.6033,
"step": 600
},
{
"epoch": 1.4558472553699284,
"grad_norm": 0.4819600928038827,
"learning_rate": 5e-06,
"loss": 0.6077,
"step": 610
},
{
"epoch": 1.4797136038186158,
"grad_norm": 0.45917275315096606,
"learning_rate": 5e-06,
"loss": 0.6094,
"step": 620
},
{
"epoch": 1.503579952267303,
"grad_norm": 0.5010113270578477,
"learning_rate": 5e-06,
"loss": 0.6055,
"step": 630
},
{
"epoch": 1.5274463007159904,
"grad_norm": 0.7579117243752399,
"learning_rate": 5e-06,
"loss": 0.6159,
"step": 640
},
{
"epoch": 1.5513126491646778,
"grad_norm": 0.6420792800924288,
"learning_rate": 5e-06,
"loss": 0.6058,
"step": 650
},
{
"epoch": 1.575178997613365,
"grad_norm": 0.55940882824889,
"learning_rate": 5e-06,
"loss": 0.5961,
"step": 660
},
{
"epoch": 1.5990453460620526,
"grad_norm": 0.4983792149426302,
"learning_rate": 5e-06,
"loss": 0.6083,
"step": 670
},
{
"epoch": 1.6229116945107398,
"grad_norm": 0.5263299595036224,
"learning_rate": 5e-06,
"loss": 0.6053,
"step": 680
},
{
"epoch": 1.6467780429594272,
"grad_norm": 0.5149484039474402,
"learning_rate": 5e-06,
"loss": 0.6079,
"step": 690
},
{
"epoch": 1.6706443914081146,
"grad_norm": 0.4685802940879146,
"learning_rate": 5e-06,
"loss": 0.6041,
"step": 700
},
{
"epoch": 1.6945107398568018,
"grad_norm": 0.4639317334767733,
"learning_rate": 5e-06,
"loss": 0.6012,
"step": 710
},
{
"epoch": 1.7183770883054894,
"grad_norm": 0.48774835965978913,
"learning_rate": 5e-06,
"loss": 0.6117,
"step": 720
},
{
"epoch": 1.7422434367541766,
"grad_norm": 0.4895883070209168,
"learning_rate": 5e-06,
"loss": 0.6066,
"step": 730
},
{
"epoch": 1.766109785202864,
"grad_norm": 0.45180400146140737,
"learning_rate": 5e-06,
"loss": 0.6031,
"step": 740
},
{
"epoch": 1.7899761336515514,
"grad_norm": 0.5197512123710193,
"learning_rate": 5e-06,
"loss": 0.6018,
"step": 750
},
{
"epoch": 1.8138424821002386,
"grad_norm": 0.45816712660411146,
"learning_rate": 5e-06,
"loss": 0.6024,
"step": 760
},
{
"epoch": 1.837708830548926,
"grad_norm": 0.45318755220959944,
"learning_rate": 5e-06,
"loss": 0.6101,
"step": 770
},
{
"epoch": 1.8615751789976134,
"grad_norm": 0.48227944198410183,
"learning_rate": 5e-06,
"loss": 0.6084,
"step": 780
},
{
"epoch": 1.8854415274463006,
"grad_norm": 0.5498211459609608,
"learning_rate": 5e-06,
"loss": 0.6014,
"step": 790
},
{
"epoch": 1.9093078758949882,
"grad_norm": 0.5272792947894827,
"learning_rate": 5e-06,
"loss": 0.6035,
"step": 800
},
{
"epoch": 1.9331742243436754,
"grad_norm": 0.4520775749202011,
"learning_rate": 5e-06,
"loss": 0.6023,
"step": 810
},
{
"epoch": 1.9570405727923628,
"grad_norm": 0.48624457536560756,
"learning_rate": 5e-06,
"loss": 0.6022,
"step": 820
},
{
"epoch": 1.9809069212410502,
"grad_norm": 0.5488566572359053,
"learning_rate": 5e-06,
"loss": 0.6068,
"step": 830
},
{
"epoch": 2.0,
"eval_loss": 0.6355295777320862,
"eval_runtime": 40.6265,
"eval_samples_per_second": 277.529,
"eval_steps_per_second": 1.108,
"step": 838
},
{
"epoch": 2.0047732696897373,
"grad_norm": 0.6658281603460572,
"learning_rate": 5e-06,
"loss": 0.5955,
"step": 840
},
{
"epoch": 2.028639618138425,
"grad_norm": 0.7098614483793992,
"learning_rate": 5e-06,
"loss": 0.5701,
"step": 850
},
{
"epoch": 2.052505966587112,
"grad_norm": 0.6533616500153973,
"learning_rate": 5e-06,
"loss": 0.5634,
"step": 860
},
{
"epoch": 2.0763723150357993,
"grad_norm": 0.6126417325021997,
"learning_rate": 5e-06,
"loss": 0.5639,
"step": 870
},
{
"epoch": 2.100238663484487,
"grad_norm": 0.44612672152357774,
"learning_rate": 5e-06,
"loss": 0.5671,
"step": 880
},
{
"epoch": 2.124105011933174,
"grad_norm": 0.5379973514690706,
"learning_rate": 5e-06,
"loss": 0.562,
"step": 890
},
{
"epoch": 2.1479713603818618,
"grad_norm": 0.442404391317877,
"learning_rate": 5e-06,
"loss": 0.5728,
"step": 900
},
{
"epoch": 2.171837708830549,
"grad_norm": 0.5754078775127957,
"learning_rate": 5e-06,
"loss": 0.5613,
"step": 910
},
{
"epoch": 2.195704057279236,
"grad_norm": 0.6586173018673331,
"learning_rate": 5e-06,
"loss": 0.5654,
"step": 920
},
{
"epoch": 2.2195704057279237,
"grad_norm": 0.5439510862576353,
"learning_rate": 5e-06,
"loss": 0.5614,
"step": 930
},
{
"epoch": 2.243436754176611,
"grad_norm": 0.5740630674331443,
"learning_rate": 5e-06,
"loss": 0.565,
"step": 940
},
{
"epoch": 2.2673031026252985,
"grad_norm": 0.48777976586303107,
"learning_rate": 5e-06,
"loss": 0.561,
"step": 950
},
{
"epoch": 2.2911694510739857,
"grad_norm": 0.4461443532364416,
"learning_rate": 5e-06,
"loss": 0.5582,
"step": 960
},
{
"epoch": 2.315035799522673,
"grad_norm": 0.487505257972905,
"learning_rate": 5e-06,
"loss": 0.5692,
"step": 970
},
{
"epoch": 2.3389021479713605,
"grad_norm": 0.48305958679617367,
"learning_rate": 5e-06,
"loss": 0.5694,
"step": 980
},
{
"epoch": 2.3627684964200477,
"grad_norm": 0.503359166301003,
"learning_rate": 5e-06,
"loss": 0.5609,
"step": 990
},
{
"epoch": 2.386634844868735,
"grad_norm": 0.5256679194745039,
"learning_rate": 5e-06,
"loss": 0.5632,
"step": 1000
},
{
"epoch": 2.4105011933174225,
"grad_norm": 0.5732669619119689,
"learning_rate": 5e-06,
"loss": 0.5666,
"step": 1010
},
{
"epoch": 2.4343675417661097,
"grad_norm": 0.4506151186524362,
"learning_rate": 5e-06,
"loss": 0.5762,
"step": 1020
},
{
"epoch": 2.4582338902147973,
"grad_norm": 0.4746241123968773,
"learning_rate": 5e-06,
"loss": 0.5652,
"step": 1030
},
{
"epoch": 2.4821002386634845,
"grad_norm": 0.46349902513638275,
"learning_rate": 5e-06,
"loss": 0.5644,
"step": 1040
},
{
"epoch": 2.5059665871121717,
"grad_norm": 0.45976906873116374,
"learning_rate": 5e-06,
"loss": 0.5686,
"step": 1050
},
{
"epoch": 2.5298329355608593,
"grad_norm": 0.4662220401853357,
"learning_rate": 5e-06,
"loss": 0.5686,
"step": 1060
},
{
"epoch": 2.5536992840095465,
"grad_norm": 0.4951436398512421,
"learning_rate": 5e-06,
"loss": 0.5702,
"step": 1070
},
{
"epoch": 2.577565632458234,
"grad_norm": 0.4502698747379483,
"learning_rate": 5e-06,
"loss": 0.5719,
"step": 1080
},
{
"epoch": 2.6014319809069213,
"grad_norm": 0.5079705723721918,
"learning_rate": 5e-06,
"loss": 0.5671,
"step": 1090
},
{
"epoch": 2.6252983293556085,
"grad_norm": 0.5140346872907439,
"learning_rate": 5e-06,
"loss": 0.5671,
"step": 1100
},
{
"epoch": 2.649164677804296,
"grad_norm": 0.5358331120197253,
"learning_rate": 5e-06,
"loss": 0.5633,
"step": 1110
},
{
"epoch": 2.6730310262529833,
"grad_norm": 0.4907309631164768,
"learning_rate": 5e-06,
"loss": 0.5644,
"step": 1120
},
{
"epoch": 2.6968973747016705,
"grad_norm": 0.5049970936550133,
"learning_rate": 5e-06,
"loss": 0.5692,
"step": 1130
},
{
"epoch": 2.720763723150358,
"grad_norm": 0.5553567091902175,
"learning_rate": 5e-06,
"loss": 0.5658,
"step": 1140
},
{
"epoch": 2.7446300715990453,
"grad_norm": 0.49261752142359677,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 1150
},
{
"epoch": 2.7684964200477324,
"grad_norm": 0.5018759977401656,
"learning_rate": 5e-06,
"loss": 0.5645,
"step": 1160
},
{
"epoch": 2.79236276849642,
"grad_norm": 0.45826106825699625,
"learning_rate": 5e-06,
"loss": 0.5641,
"step": 1170
},
{
"epoch": 2.8162291169451072,
"grad_norm": 0.5072976091618316,
"learning_rate": 5e-06,
"loss": 0.5676,
"step": 1180
},
{
"epoch": 2.840095465393795,
"grad_norm": 0.651235919618626,
"learning_rate": 5e-06,
"loss": 0.5681,
"step": 1190
},
{
"epoch": 2.863961813842482,
"grad_norm": 0.5113781250941779,
"learning_rate": 5e-06,
"loss": 0.5669,
"step": 1200
},
{
"epoch": 2.8878281622911697,
"grad_norm": 0.4949601222660754,
"learning_rate": 5e-06,
"loss": 0.5581,
"step": 1210
},
{
"epoch": 2.911694510739857,
"grad_norm": 0.5748654481884351,
"learning_rate": 5e-06,
"loss": 0.5678,
"step": 1220
},
{
"epoch": 2.935560859188544,
"grad_norm": 0.5793617800123868,
"learning_rate": 5e-06,
"loss": 0.5649,
"step": 1230
},
{
"epoch": 2.9594272076372317,
"grad_norm": 0.4514242707013011,
"learning_rate": 5e-06,
"loss": 0.5652,
"step": 1240
},
{
"epoch": 2.983293556085919,
"grad_norm": 0.4744906861925991,
"learning_rate": 5e-06,
"loss": 0.5638,
"step": 1250
},
{
"epoch": 3.0,
"eval_loss": 0.6389971375465393,
"eval_runtime": 41.4704,
"eval_samples_per_second": 271.881,
"eval_steps_per_second": 1.085,
"step": 1257
},
{
"epoch": 3.0,
"step": 1257,
"total_flos": 2105521817518080.0,
"train_loss": 0.6169646524112189,
"train_runtime": 7810.9265,
"train_samples_per_second": 82.273,
"train_steps_per_second": 0.161
}
],
"logging_steps": 10,
"max_steps": 1257,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2105521817518080.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}