{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9963459196102313,
"eval_steps": 500,
"global_step": 1230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024360535931790498,
"grad_norm": 11.305159308546282,
"learning_rate": 5e-06,
"loss": 0.8887,
"step": 10
},
{
"epoch": 0.048721071863580996,
"grad_norm": 4.407971880258946,
"learning_rate": 5e-06,
"loss": 0.7976,
"step": 20
},
{
"epoch": 0.0730816077953715,
"grad_norm": 1.3980174472682592,
"learning_rate": 5e-06,
"loss": 0.767,
"step": 30
},
{
"epoch": 0.09744214372716199,
"grad_norm": 4.101997301199602,
"learning_rate": 5e-06,
"loss": 0.7475,
"step": 40
},
{
"epoch": 0.1218026796589525,
"grad_norm": 3.8298876079533914,
"learning_rate": 5e-06,
"loss": 0.7403,
"step": 50
},
{
"epoch": 0.146163215590743,
"grad_norm": 1.120036674314868,
"learning_rate": 5e-06,
"loss": 0.7256,
"step": 60
},
{
"epoch": 0.1705237515225335,
"grad_norm": 0.7504651491119281,
"learning_rate": 5e-06,
"loss": 0.7212,
"step": 70
},
{
"epoch": 0.19488428745432398,
"grad_norm": 0.8705482174733401,
"learning_rate": 5e-06,
"loss": 0.6967,
"step": 80
},
{
"epoch": 0.2192448233861145,
"grad_norm": 0.8066011359814329,
"learning_rate": 5e-06,
"loss": 0.6944,
"step": 90
},
{
"epoch": 0.243605359317905,
"grad_norm": 0.7143294307124043,
"learning_rate": 5e-06,
"loss": 0.6991,
"step": 100
},
{
"epoch": 0.2679658952496955,
"grad_norm": 0.5219278289863366,
"learning_rate": 5e-06,
"loss": 0.6904,
"step": 110
},
{
"epoch": 0.292326431181486,
"grad_norm": 0.49976792832548467,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 120
},
{
"epoch": 0.3166869671132765,
"grad_norm": 0.8671962194472669,
"learning_rate": 5e-06,
"loss": 0.6923,
"step": 130
},
{
"epoch": 0.341047503045067,
"grad_norm": 0.4958220955019927,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 140
},
{
"epoch": 0.3654080389768575,
"grad_norm": 0.5491440067010557,
"learning_rate": 5e-06,
"loss": 0.6844,
"step": 150
},
{
"epoch": 0.38976857490864797,
"grad_norm": 0.5764231325699036,
"learning_rate": 5e-06,
"loss": 0.6873,
"step": 160
},
{
"epoch": 0.41412911084043846,
"grad_norm": 0.4866036070242275,
"learning_rate": 5e-06,
"loss": 0.6773,
"step": 170
},
{
"epoch": 0.438489646772229,
"grad_norm": 0.7229793933095654,
"learning_rate": 5e-06,
"loss": 0.6801,
"step": 180
},
{
"epoch": 0.4628501827040195,
"grad_norm": 0.5825475907586349,
"learning_rate": 5e-06,
"loss": 0.6806,
"step": 190
},
{
"epoch": 0.48721071863581,
"grad_norm": 0.641550842935756,
"learning_rate": 5e-06,
"loss": 0.677,
"step": 200
},
{
"epoch": 0.5115712545676004,
"grad_norm": 0.555875854836963,
"learning_rate": 5e-06,
"loss": 0.669,
"step": 210
},
{
"epoch": 0.535931790499391,
"grad_norm": 0.4380177981926619,
"learning_rate": 5e-06,
"loss": 0.6672,
"step": 220
},
{
"epoch": 0.5602923264311814,
"grad_norm": 0.5586357299552903,
"learning_rate": 5e-06,
"loss": 0.6696,
"step": 230
},
{
"epoch": 0.584652862362972,
"grad_norm": 0.5268423895517483,
"learning_rate": 5e-06,
"loss": 0.6761,
"step": 240
},
{
"epoch": 0.6090133982947625,
"grad_norm": 0.5068291541548725,
"learning_rate": 5e-06,
"loss": 0.6672,
"step": 250
},
{
"epoch": 0.633373934226553,
"grad_norm": 0.7203145859800878,
"learning_rate": 5e-06,
"loss": 0.6758,
"step": 260
},
{
"epoch": 0.6577344701583435,
"grad_norm": 0.4843027545014372,
"learning_rate": 5e-06,
"loss": 0.6684,
"step": 270
},
{
"epoch": 0.682095006090134,
"grad_norm": 0.4654716032330135,
"learning_rate": 5e-06,
"loss": 0.6674,
"step": 280
},
{
"epoch": 0.7064555420219245,
"grad_norm": 0.48677469218469316,
"learning_rate": 5e-06,
"loss": 0.657,
"step": 290
},
{
"epoch": 0.730816077953715,
"grad_norm": 0.501936617406133,
"learning_rate": 5e-06,
"loss": 0.666,
"step": 300
},
{
"epoch": 0.7551766138855055,
"grad_norm": 0.4189199112711787,
"learning_rate": 5e-06,
"loss": 0.6672,
"step": 310
},
{
"epoch": 0.7795371498172959,
"grad_norm": 0.525860628294632,
"learning_rate": 5e-06,
"loss": 0.6625,
"step": 320
},
{
"epoch": 0.8038976857490865,
"grad_norm": 0.5055516889416151,
"learning_rate": 5e-06,
"loss": 0.6687,
"step": 330
},
{
"epoch": 0.8282582216808769,
"grad_norm": 0.5030088195887705,
"learning_rate": 5e-06,
"loss": 0.6622,
"step": 340
},
{
"epoch": 0.8526187576126675,
"grad_norm": 0.4409999841350699,
"learning_rate": 5e-06,
"loss": 0.659,
"step": 350
},
{
"epoch": 0.876979293544458,
"grad_norm": 0.49889143289837934,
"learning_rate": 5e-06,
"loss": 0.664,
"step": 360
},
{
"epoch": 0.9013398294762485,
"grad_norm": 0.46333426563091684,
"learning_rate": 5e-06,
"loss": 0.6647,
"step": 370
},
{
"epoch": 0.925700365408039,
"grad_norm": 0.4132898286035426,
"learning_rate": 5e-06,
"loss": 0.6604,
"step": 380
},
{
"epoch": 0.9500609013398295,
"grad_norm": 0.4602502572358803,
"learning_rate": 5e-06,
"loss": 0.663,
"step": 390
},
{
"epoch": 0.97442143727162,
"grad_norm": 0.586425378319964,
"learning_rate": 5e-06,
"loss": 0.6588,
"step": 400
},
{
"epoch": 0.9987819732034104,
"grad_norm": 0.4637558734433708,
"learning_rate": 5e-06,
"loss": 0.6557,
"step": 410
},
{
"epoch": 0.9987819732034104,
"eval_loss": 0.6518880128860474,
"eval_runtime": 221.2706,
"eval_samples_per_second": 49.966,
"eval_steps_per_second": 0.393,
"step": 410
},
{
"epoch": 1.0231425091352009,
"grad_norm": 0.6010683259164777,
"learning_rate": 5e-06,
"loss": 0.6207,
"step": 420
},
{
"epoch": 1.0475030450669915,
"grad_norm": 0.6050810738565418,
"learning_rate": 5e-06,
"loss": 0.61,
"step": 430
},
{
"epoch": 1.071863580998782,
"grad_norm": 0.4799441913834175,
"learning_rate": 5e-06,
"loss": 0.617,
"step": 440
},
{
"epoch": 1.0962241169305724,
"grad_norm": 0.41533745441354586,
"learning_rate": 5e-06,
"loss": 0.6233,
"step": 450
},
{
"epoch": 1.1205846528623629,
"grad_norm": 0.42865808124947796,
"learning_rate": 5e-06,
"loss": 0.616,
"step": 460
},
{
"epoch": 1.1449451887941535,
"grad_norm": 0.5620085827072487,
"learning_rate": 5e-06,
"loss": 0.6226,
"step": 470
},
{
"epoch": 1.169305724725944,
"grad_norm": 0.47328106114801194,
"learning_rate": 5e-06,
"loss": 0.609,
"step": 480
},
{
"epoch": 1.1936662606577344,
"grad_norm": 0.4720567281560868,
"learning_rate": 5e-06,
"loss": 0.6143,
"step": 490
},
{
"epoch": 1.218026796589525,
"grad_norm": 0.44112203366329256,
"learning_rate": 5e-06,
"loss": 0.614,
"step": 500
},
{
"epoch": 1.2423873325213155,
"grad_norm": 0.5187652730488376,
"learning_rate": 5e-06,
"loss": 0.6199,
"step": 510
},
{
"epoch": 1.266747868453106,
"grad_norm": 0.5638861172624315,
"learning_rate": 5e-06,
"loss": 0.619,
"step": 520
},
{
"epoch": 1.2911084043848966,
"grad_norm": 0.5972907620170446,
"learning_rate": 5e-06,
"loss": 0.6182,
"step": 530
},
{
"epoch": 1.315468940316687,
"grad_norm": 0.5314321040836214,
"learning_rate": 5e-06,
"loss": 0.619,
"step": 540
},
{
"epoch": 1.3398294762484775,
"grad_norm": 0.5459662859735409,
"learning_rate": 5e-06,
"loss": 0.6183,
"step": 550
},
{
"epoch": 1.364190012180268,
"grad_norm": 0.5202733547748785,
"learning_rate": 5e-06,
"loss": 0.618,
"step": 560
},
{
"epoch": 1.3885505481120584,
"grad_norm": 0.4161689870213624,
"learning_rate": 5e-06,
"loss": 0.6101,
"step": 570
},
{
"epoch": 1.412911084043849,
"grad_norm": 0.46394109509695763,
"learning_rate": 5e-06,
"loss": 0.6274,
"step": 580
},
{
"epoch": 1.4372716199756395,
"grad_norm": 0.4808851283054136,
"learning_rate": 5e-06,
"loss": 0.6087,
"step": 590
},
{
"epoch": 1.46163215590743,
"grad_norm": 0.5411540324211217,
"learning_rate": 5e-06,
"loss": 0.6215,
"step": 600
},
{
"epoch": 1.4859926918392206,
"grad_norm": 0.5416915020329361,
"learning_rate": 5e-06,
"loss": 0.6167,
"step": 610
},
{
"epoch": 1.510353227771011,
"grad_norm": 0.527607596364707,
"learning_rate": 5e-06,
"loss": 0.6128,
"step": 620
},
{
"epoch": 1.5347137637028014,
"grad_norm": 0.520963657326471,
"learning_rate": 5e-06,
"loss": 0.6137,
"step": 630
},
{
"epoch": 1.559074299634592,
"grad_norm": 0.4366228046959017,
"learning_rate": 5e-06,
"loss": 0.6171,
"step": 640
},
{
"epoch": 1.5834348355663823,
"grad_norm": 0.5504251670894937,
"learning_rate": 5e-06,
"loss": 0.6143,
"step": 650
},
{
"epoch": 1.607795371498173,
"grad_norm": 0.4715628019229569,
"learning_rate": 5e-06,
"loss": 0.6202,
"step": 660
},
{
"epoch": 1.6321559074299634,
"grad_norm": 0.5291464708625646,
"learning_rate": 5e-06,
"loss": 0.6155,
"step": 670
},
{
"epoch": 1.6565164433617539,
"grad_norm": 0.4355159440359265,
"learning_rate": 5e-06,
"loss": 0.6162,
"step": 680
},
{
"epoch": 1.6808769792935445,
"grad_norm": 0.5112620919843524,
"learning_rate": 5e-06,
"loss": 0.6279,
"step": 690
},
{
"epoch": 1.705237515225335,
"grad_norm": 0.57875404757705,
"learning_rate": 5e-06,
"loss": 0.6176,
"step": 700
},
{
"epoch": 1.7295980511571254,
"grad_norm": 0.4410704500201331,
"learning_rate": 5e-06,
"loss": 0.6195,
"step": 710
},
{
"epoch": 1.753958587088916,
"grad_norm": 0.5587895103691882,
"learning_rate": 5e-06,
"loss": 0.6194,
"step": 720
},
{
"epoch": 1.7783191230207065,
"grad_norm": 0.4941053548445359,
"learning_rate": 5e-06,
"loss": 0.6096,
"step": 730
},
{
"epoch": 1.802679658952497,
"grad_norm": 0.5227563230610854,
"learning_rate": 5e-06,
"loss": 0.6102,
"step": 740
},
{
"epoch": 1.8270401948842876,
"grad_norm": 0.4591897668705156,
"learning_rate": 5e-06,
"loss": 0.6117,
"step": 750
},
{
"epoch": 1.8514007308160778,
"grad_norm": 0.5103376738813472,
"learning_rate": 5e-06,
"loss": 0.6134,
"step": 760
},
{
"epoch": 1.8757612667478685,
"grad_norm": 0.532214266722337,
"learning_rate": 5e-06,
"loss": 0.6102,
"step": 770
},
{
"epoch": 1.900121802679659,
"grad_norm": 0.4632257568024349,
"learning_rate": 5e-06,
"loss": 0.6218,
"step": 780
},
{
"epoch": 1.9244823386114494,
"grad_norm": 0.5412849420492728,
"learning_rate": 5e-06,
"loss": 0.6109,
"step": 790
},
{
"epoch": 1.94884287454324,
"grad_norm": 0.48808240750337195,
"learning_rate": 5e-06,
"loss": 0.6176,
"step": 800
},
{
"epoch": 1.9732034104750305,
"grad_norm": 0.4761455418357999,
"learning_rate": 5e-06,
"loss": 0.6098,
"step": 810
},
{
"epoch": 1.997563946406821,
"grad_norm": 0.4534197510006015,
"learning_rate": 5e-06,
"loss": 0.6082,
"step": 820
},
{
"epoch": 2.0,
"eval_loss": 0.6419612765312195,
"eval_runtime": 221.435,
"eval_samples_per_second": 49.929,
"eval_steps_per_second": 0.393,
"step": 821
},
{
"epoch": 2.0219244823386116,
"grad_norm": 0.6074772099873261,
"learning_rate": 5e-06,
"loss": 0.5769,
"step": 830
},
{
"epoch": 2.0462850182704018,
"grad_norm": 0.5110291152400608,
"learning_rate": 5e-06,
"loss": 0.564,
"step": 840
},
{
"epoch": 2.0706455542021924,
"grad_norm": 0.740312554525951,
"learning_rate": 5e-06,
"loss": 0.5717,
"step": 850
},
{
"epoch": 2.095006090133983,
"grad_norm": 0.5821754748157193,
"learning_rate": 5e-06,
"loss": 0.5726,
"step": 860
},
{
"epoch": 2.1193666260657733,
"grad_norm": 0.53860209415622,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 870
},
{
"epoch": 2.143727161997564,
"grad_norm": 0.5215524148222913,
"learning_rate": 5e-06,
"loss": 0.564,
"step": 880
},
{
"epoch": 2.1680876979293546,
"grad_norm": 0.6458934700822203,
"learning_rate": 5e-06,
"loss": 0.5724,
"step": 890
},
{
"epoch": 2.192448233861145,
"grad_norm": 0.4435184357785445,
"learning_rate": 5e-06,
"loss": 0.5684,
"step": 900
},
{
"epoch": 2.2168087697929355,
"grad_norm": 0.5416262844784988,
"learning_rate": 5e-06,
"loss": 0.5718,
"step": 910
},
{
"epoch": 2.2411693057247257,
"grad_norm": 0.4739984176413269,
"learning_rate": 5e-06,
"loss": 0.5756,
"step": 920
},
{
"epoch": 2.2655298416565164,
"grad_norm": 0.47994087642094213,
"learning_rate": 5e-06,
"loss": 0.5742,
"step": 930
},
{
"epoch": 2.289890377588307,
"grad_norm": 0.4742359512444407,
"learning_rate": 5e-06,
"loss": 0.5731,
"step": 940
},
{
"epoch": 2.3142509135200973,
"grad_norm": 0.5586334439764152,
"learning_rate": 5e-06,
"loss": 0.576,
"step": 950
},
{
"epoch": 2.338611449451888,
"grad_norm": 0.49880213092932163,
"learning_rate": 5e-06,
"loss": 0.5799,
"step": 960
},
{
"epoch": 2.3629719853836786,
"grad_norm": 0.49935902866105975,
"learning_rate": 5e-06,
"loss": 0.5762,
"step": 970
},
{
"epoch": 2.387332521315469,
"grad_norm": 0.5465185670805549,
"learning_rate": 5e-06,
"loss": 0.5717,
"step": 980
},
{
"epoch": 2.4116930572472595,
"grad_norm": 0.4986248004640357,
"learning_rate": 5e-06,
"loss": 0.5772,
"step": 990
},
{
"epoch": 2.43605359317905,
"grad_norm": 0.5423471098966955,
"learning_rate": 5e-06,
"loss": 0.5804,
"step": 1000
},
{
"epoch": 2.4604141291108403,
"grad_norm": 0.5193096800667882,
"learning_rate": 5e-06,
"loss": 0.5691,
"step": 1010
},
{
"epoch": 2.484774665042631,
"grad_norm": 0.4590023482690989,
"learning_rate": 5e-06,
"loss": 0.5741,
"step": 1020
},
{
"epoch": 2.5091352009744217,
"grad_norm": 0.4671536002975626,
"learning_rate": 5e-06,
"loss": 0.5714,
"step": 1030
},
{
"epoch": 2.533495736906212,
"grad_norm": 0.5523685876104364,
"learning_rate": 5e-06,
"loss": 0.5734,
"step": 1040
},
{
"epoch": 2.5578562728380025,
"grad_norm": 0.6868866709072206,
"learning_rate": 5e-06,
"loss": 0.5728,
"step": 1050
},
{
"epoch": 2.582216808769793,
"grad_norm": 0.5582819992545279,
"learning_rate": 5e-06,
"loss": 0.5737,
"step": 1060
},
{
"epoch": 2.6065773447015834,
"grad_norm": 0.4702857244191192,
"learning_rate": 5e-06,
"loss": 0.566,
"step": 1070
},
{
"epoch": 2.630937880633374,
"grad_norm": 0.6487634608204832,
"learning_rate": 5e-06,
"loss": 0.5818,
"step": 1080
},
{
"epoch": 2.6552984165651643,
"grad_norm": 0.4736967537062896,
"learning_rate": 5e-06,
"loss": 0.5753,
"step": 1090
},
{
"epoch": 2.679658952496955,
"grad_norm": 0.5348827813693043,
"learning_rate": 5e-06,
"loss": 0.5771,
"step": 1100
},
{
"epoch": 2.704019488428745,
"grad_norm": 0.5028960700092897,
"learning_rate": 5e-06,
"loss": 0.5713,
"step": 1110
},
{
"epoch": 2.728380024360536,
"grad_norm": 0.4780698681645441,
"learning_rate": 5e-06,
"loss": 0.5746,
"step": 1120
},
{
"epoch": 2.7527405602923265,
"grad_norm": 0.4864478553500122,
"learning_rate": 5e-06,
"loss": 0.5752,
"step": 1130
},
{
"epoch": 2.7771010962241167,
"grad_norm": 0.4667264912708201,
"learning_rate": 5e-06,
"loss": 0.5772,
"step": 1140
},
{
"epoch": 2.8014616321559074,
"grad_norm": 0.45394076375291925,
"learning_rate": 5e-06,
"loss": 0.5823,
"step": 1150
},
{
"epoch": 2.825822168087698,
"grad_norm": 0.5161201565392174,
"learning_rate": 5e-06,
"loss": 0.5815,
"step": 1160
},
{
"epoch": 2.8501827040194883,
"grad_norm": 0.5076152963599294,
"learning_rate": 5e-06,
"loss": 0.5784,
"step": 1170
},
{
"epoch": 2.874543239951279,
"grad_norm": 0.4752319372351976,
"learning_rate": 5e-06,
"loss": 0.5791,
"step": 1180
},
{
"epoch": 2.8989037758830696,
"grad_norm": 0.533679377576446,
"learning_rate": 5e-06,
"loss": 0.5796,
"step": 1190
},
{
"epoch": 2.92326431181486,
"grad_norm": 0.4952941664544987,
"learning_rate": 5e-06,
"loss": 0.5735,
"step": 1200
},
{
"epoch": 2.9476248477466505,
"grad_norm": 0.4611730832059269,
"learning_rate": 5e-06,
"loss": 0.5748,
"step": 1210
},
{
"epoch": 2.971985383678441,
"grad_norm": 0.5882799223730999,
"learning_rate": 5e-06,
"loss": 0.5781,
"step": 1220
},
{
"epoch": 2.9963459196102313,
"grad_norm": 0.4979608878944041,
"learning_rate": 5e-06,
"loss": 0.5706,
"step": 1230
},
{
"epoch": 2.9963459196102313,
"eval_loss": 0.6427608132362366,
"eval_runtime": 221.8996,
"eval_samples_per_second": 49.824,
"eval_steps_per_second": 0.392,
"step": 1230
},
{
"epoch": 2.9963459196102313,
"step": 1230,
"total_flos": 2059877052579840.0,
"train_loss": 0.6269122554034722,
"train_runtime": 37089.6991,
"train_samples_per_second": 16.991,
"train_steps_per_second": 0.033
}
],
"logging_steps": 10,
"max_steps": 1230,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2059877052579840.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}