{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025,
"grad_norm": 6.2283793562421,
"learning_rate": 5e-06,
"loss": 0.9226,
"step": 10
},
{
"epoch": 0.05,
"grad_norm": 1.073730252333159,
"learning_rate": 5e-06,
"loss": 0.7903,
"step": 20
},
{
"epoch": 0.075,
"grad_norm": 1.0084608728414968,
"learning_rate": 5e-06,
"loss": 0.7361,
"step": 30
},
{
"epoch": 0.1,
"grad_norm": 0.9477824152833885,
"learning_rate": 5e-06,
"loss": 0.7108,
"step": 40
},
{
"epoch": 0.125,
"grad_norm": 0.8625762666467,
"learning_rate": 5e-06,
"loss": 0.686,
"step": 50
},
{
"epoch": 0.15,
"grad_norm": 0.761040541226219,
"learning_rate": 5e-06,
"loss": 0.6788,
"step": 60
},
{
"epoch": 0.175,
"grad_norm": 0.7440724149702675,
"learning_rate": 5e-06,
"loss": 0.665,
"step": 70
},
{
"epoch": 0.2,
"grad_norm": 0.6683283661638493,
"learning_rate": 5e-06,
"loss": 0.6647,
"step": 80
},
{
"epoch": 0.225,
"grad_norm": 0.6752395142547716,
"learning_rate": 5e-06,
"loss": 0.6486,
"step": 90
},
{
"epoch": 0.25,
"grad_norm": 0.6932666577646985,
"learning_rate": 5e-06,
"loss": 0.6394,
"step": 100
},
{
"epoch": 0.275,
"grad_norm": 0.5563073495123759,
"learning_rate": 5e-06,
"loss": 0.6382,
"step": 110
},
{
"epoch": 0.3,
"grad_norm": 0.5732770966128271,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 120
},
{
"epoch": 0.325,
"grad_norm": 0.6075711947587346,
"learning_rate": 5e-06,
"loss": 0.6318,
"step": 130
},
{
"epoch": 0.35,
"grad_norm": 0.755814081800888,
"learning_rate": 5e-06,
"loss": 0.6267,
"step": 140
},
{
"epoch": 0.375,
"grad_norm": 0.67881977859985,
"learning_rate": 5e-06,
"loss": 0.6292,
"step": 150
},
{
"epoch": 0.4,
"grad_norm": 0.9073276911461459,
"learning_rate": 5e-06,
"loss": 0.6223,
"step": 160
},
{
"epoch": 0.425,
"grad_norm": 0.6556934680429933,
"learning_rate": 5e-06,
"loss": 0.6181,
"step": 170
},
{
"epoch": 0.45,
"grad_norm": 0.5268985559340588,
"learning_rate": 5e-06,
"loss": 0.6189,
"step": 180
},
{
"epoch": 0.475,
"grad_norm": 0.500078943711909,
"learning_rate": 5e-06,
"loss": 0.612,
"step": 190
},
{
"epoch": 0.5,
"grad_norm": 0.5364000910049038,
"learning_rate": 5e-06,
"loss": 0.6113,
"step": 200
},
{
"epoch": 0.525,
"grad_norm": 0.5290306126723202,
"learning_rate": 5e-06,
"loss": 0.6153,
"step": 210
},
{
"epoch": 0.55,
"grad_norm": 0.4770992636352269,
"learning_rate": 5e-06,
"loss": 0.6104,
"step": 220
},
{
"epoch": 0.575,
"grad_norm": 0.6289416522292515,
"learning_rate": 5e-06,
"loss": 0.6103,
"step": 230
},
{
"epoch": 0.6,
"grad_norm": 0.5398977888752696,
"learning_rate": 5e-06,
"loss": 0.6076,
"step": 240
},
{
"epoch": 0.625,
"grad_norm": 0.5310488552193566,
"learning_rate": 5e-06,
"loss": 0.6096,
"step": 250
},
{
"epoch": 0.65,
"grad_norm": 0.6110323317115457,
"learning_rate": 5e-06,
"loss": 0.605,
"step": 260
},
{
"epoch": 0.675,
"grad_norm": 0.6068134600036437,
"learning_rate": 5e-06,
"loss": 0.6071,
"step": 270
},
{
"epoch": 0.7,
"grad_norm": 0.5634137177645002,
"learning_rate": 5e-06,
"loss": 0.6,
"step": 280
},
{
"epoch": 0.725,
"grad_norm": 0.7693981650465631,
"learning_rate": 5e-06,
"loss": 0.6053,
"step": 290
},
{
"epoch": 0.75,
"grad_norm": 0.594391360805154,
"learning_rate": 5e-06,
"loss": 0.6032,
"step": 300
},
{
"epoch": 0.775,
"grad_norm": 0.6029344366979934,
"learning_rate": 5e-06,
"loss": 0.6006,
"step": 310
},
{
"epoch": 0.8,
"grad_norm": 0.5077693887980811,
"learning_rate": 5e-06,
"loss": 0.5978,
"step": 320
},
{
"epoch": 0.825,
"grad_norm": 0.5013009473527608,
"learning_rate": 5e-06,
"loss": 0.5939,
"step": 330
},
{
"epoch": 0.85,
"grad_norm": 0.6898923986358316,
"learning_rate": 5e-06,
"loss": 0.5941,
"step": 340
},
{
"epoch": 0.875,
"grad_norm": 0.6455192038734223,
"learning_rate": 5e-06,
"loss": 0.5948,
"step": 350
},
{
"epoch": 0.9,
"grad_norm": 0.5846403091528135,
"learning_rate": 5e-06,
"loss": 0.5925,
"step": 360
},
{
"epoch": 0.925,
"grad_norm": 0.603873049442878,
"learning_rate": 5e-06,
"loss": 0.5885,
"step": 370
},
{
"epoch": 0.95,
"grad_norm": 0.5089869229340043,
"learning_rate": 5e-06,
"loss": 0.587,
"step": 380
},
{
"epoch": 0.975,
"grad_norm": 0.46202950175215546,
"learning_rate": 5e-06,
"loss": 0.5866,
"step": 390
},
{
"epoch": 1.0,
"grad_norm": 0.5546489111159046,
"learning_rate": 5e-06,
"loss": 0.5865,
"step": 400
},
{
"epoch": 1.0,
"eval_loss": 0.5859289765357971,
"eval_runtime": 214.5478,
"eval_samples_per_second": 50.231,
"eval_steps_per_second": 0.396,
"step": 400
},
{
"epoch": 1.025,
"grad_norm": 0.6920192241789218,
"learning_rate": 5e-06,
"loss": 0.5496,
"step": 410
},
{
"epoch": 1.05,
"grad_norm": 0.6541721391004665,
"learning_rate": 5e-06,
"loss": 0.5509,
"step": 420
},
{
"epoch": 1.075,
"grad_norm": 0.614260313358043,
"learning_rate": 5e-06,
"loss": 0.558,
"step": 430
},
{
"epoch": 1.1,
"grad_norm": 0.5447908907434397,
"learning_rate": 5e-06,
"loss": 0.5524,
"step": 440
},
{
"epoch": 1.125,
"grad_norm": 0.5958987826147251,
"learning_rate": 5e-06,
"loss": 0.5501,
"step": 450
},
{
"epoch": 1.15,
"grad_norm": 0.5389526586794362,
"learning_rate": 5e-06,
"loss": 0.5497,
"step": 460
},
{
"epoch": 1.175,
"grad_norm": 0.6252863504479338,
"learning_rate": 5e-06,
"loss": 0.548,
"step": 470
},
{
"epoch": 1.2,
"grad_norm": 0.47836275145239177,
"learning_rate": 5e-06,
"loss": 0.5516,
"step": 480
},
{
"epoch": 1.225,
"grad_norm": 0.4969967566952363,
"learning_rate": 5e-06,
"loss": 0.5456,
"step": 490
},
{
"epoch": 1.25,
"grad_norm": 0.5401134954649838,
"learning_rate": 5e-06,
"loss": 0.5559,
"step": 500
},
{
"epoch": 1.275,
"grad_norm": 0.5052946204256015,
"learning_rate": 5e-06,
"loss": 0.5463,
"step": 510
},
{
"epoch": 1.3,
"grad_norm": 0.5464783361936272,
"learning_rate": 5e-06,
"loss": 0.5508,
"step": 520
},
{
"epoch": 1.325,
"grad_norm": 0.7005683495135656,
"learning_rate": 5e-06,
"loss": 0.551,
"step": 530
},
{
"epoch": 1.35,
"grad_norm": 0.8637480119643226,
"learning_rate": 5e-06,
"loss": 0.5447,
"step": 540
},
{
"epoch": 1.375,
"grad_norm": 0.6198566675351206,
"learning_rate": 5e-06,
"loss": 0.5414,
"step": 550
},
{
"epoch": 1.4,
"grad_norm": 0.5676940292369252,
"learning_rate": 5e-06,
"loss": 0.5461,
"step": 560
},
{
"epoch": 1.425,
"grad_norm": 0.46615761940342565,
"learning_rate": 5e-06,
"loss": 0.5391,
"step": 570
},
{
"epoch": 1.45,
"grad_norm": 0.5299833857615388,
"learning_rate": 5e-06,
"loss": 0.544,
"step": 580
},
{
"epoch": 1.475,
"grad_norm": 0.5350523878143963,
"learning_rate": 5e-06,
"loss": 0.5458,
"step": 590
},
{
"epoch": 1.5,
"grad_norm": 0.5367249564898715,
"learning_rate": 5e-06,
"loss": 0.5462,
"step": 600
},
{
"epoch": 1.525,
"grad_norm": 0.7827798795419177,
"learning_rate": 5e-06,
"loss": 0.5463,
"step": 610
},
{
"epoch": 1.55,
"grad_norm": 0.6823097374119125,
"learning_rate": 5e-06,
"loss": 0.5501,
"step": 620
},
{
"epoch": 1.575,
"grad_norm": 0.6572832849543329,
"learning_rate": 5e-06,
"loss": 0.5437,
"step": 630
},
{
"epoch": 1.6,
"grad_norm": 0.5914218329646164,
"learning_rate": 5e-06,
"loss": 0.5345,
"step": 640
},
{
"epoch": 1.625,
"grad_norm": 0.5060878083849677,
"learning_rate": 5e-06,
"loss": 0.5336,
"step": 650
},
{
"epoch": 1.65,
"grad_norm": 0.5427787163346001,
"learning_rate": 5e-06,
"loss": 0.5361,
"step": 660
},
{
"epoch": 1.675,
"grad_norm": 0.6015169228068791,
"learning_rate": 5e-06,
"loss": 0.5395,
"step": 670
},
{
"epoch": 1.7,
"grad_norm": 0.522909609070953,
"learning_rate": 5e-06,
"loss": 0.5407,
"step": 680
},
{
"epoch": 1.725,
"grad_norm": 0.5648959450008263,
"learning_rate": 5e-06,
"loss": 0.5384,
"step": 690
},
{
"epoch": 1.75,
"grad_norm": 0.5073930020348113,
"learning_rate": 5e-06,
"loss": 0.5385,
"step": 700
},
{
"epoch": 1.775,
"grad_norm": 0.5344270884192877,
"learning_rate": 5e-06,
"loss": 0.5389,
"step": 710
},
{
"epoch": 1.8,
"grad_norm": 0.5387282660221612,
"learning_rate": 5e-06,
"loss": 0.5384,
"step": 720
},
{
"epoch": 1.825,
"grad_norm": 0.630517812852184,
"learning_rate": 5e-06,
"loss": 0.5378,
"step": 730
},
{
"epoch": 1.85,
"grad_norm": 0.528770579788001,
"learning_rate": 5e-06,
"loss": 0.5373,
"step": 740
},
{
"epoch": 1.875,
"grad_norm": 0.46867857755871645,
"learning_rate": 5e-06,
"loss": 0.5302,
"step": 750
},
{
"epoch": 1.9,
"grad_norm": 0.5407357139497844,
"learning_rate": 5e-06,
"loss": 0.5327,
"step": 760
},
{
"epoch": 1.925,
"grad_norm": 0.5955017346639638,
"learning_rate": 5e-06,
"loss": 0.5357,
"step": 770
},
{
"epoch": 1.95,
"grad_norm": 0.4917679827974974,
"learning_rate": 5e-06,
"loss": 0.5313,
"step": 780
},
{
"epoch": 1.975,
"grad_norm": 0.5426327036736968,
"learning_rate": 5e-06,
"loss": 0.5362,
"step": 790
},
{
"epoch": 2.0,
"grad_norm": 0.5570351037553893,
"learning_rate": 5e-06,
"loss": 0.5346,
"step": 800
},
{
"epoch": 2.0,
"eval_loss": 0.5597097873687744,
"eval_runtime": 215.4672,
"eval_samples_per_second": 50.017,
"eval_steps_per_second": 0.394,
"step": 800
},
{
"epoch": 2.025,
"grad_norm": 0.6612509870643555,
"learning_rate": 5e-06,
"loss": 0.4939,
"step": 810
},
{
"epoch": 2.05,
"grad_norm": 0.5438913454843807,
"learning_rate": 5e-06,
"loss": 0.4901,
"step": 820
},
{
"epoch": 2.075,
"grad_norm": 0.7056566708123541,
"learning_rate": 5e-06,
"loss": 0.4961,
"step": 830
},
{
"epoch": 2.1,
"grad_norm": 0.5098170694120924,
"learning_rate": 5e-06,
"loss": 0.4971,
"step": 840
},
{
"epoch": 2.125,
"grad_norm": 0.6070617882857331,
"learning_rate": 5e-06,
"loss": 0.4981,
"step": 850
},
{
"epoch": 2.15,
"grad_norm": 0.5998341137122876,
"learning_rate": 5e-06,
"loss": 0.4977,
"step": 860
},
{
"epoch": 2.175,
"grad_norm": 0.58734738292625,
"learning_rate": 5e-06,
"loss": 0.5015,
"step": 870
},
{
"epoch": 2.2,
"grad_norm": 0.6197078930251222,
"learning_rate": 5e-06,
"loss": 0.4979,
"step": 880
},
{
"epoch": 2.225,
"grad_norm": 0.6086748900409549,
"learning_rate": 5e-06,
"loss": 0.4969,
"step": 890
},
{
"epoch": 2.25,
"grad_norm": 0.546234904601564,
"learning_rate": 5e-06,
"loss": 0.4975,
"step": 900
},
{
"epoch": 2.275,
"grad_norm": 0.5160078517376208,
"learning_rate": 5e-06,
"loss": 0.498,
"step": 910
},
{
"epoch": 2.3,
"grad_norm": 0.5415276924291007,
"learning_rate": 5e-06,
"loss": 0.4943,
"step": 920
},
{
"epoch": 2.325,
"grad_norm": 0.5686966271920224,
"learning_rate": 5e-06,
"loss": 0.4934,
"step": 930
},
{
"epoch": 2.35,
"grad_norm": 0.5936539945494198,
"learning_rate": 5e-06,
"loss": 0.4992,
"step": 940
},
{
"epoch": 2.375,
"grad_norm": 0.5548802299834517,
"learning_rate": 5e-06,
"loss": 0.5051,
"step": 950
},
{
"epoch": 2.4,
"grad_norm": 0.5103322725061038,
"learning_rate": 5e-06,
"loss": 0.4955,
"step": 960
},
{
"epoch": 2.425,
"grad_norm": 0.5211482899619925,
"learning_rate": 5e-06,
"loss": 0.4988,
"step": 970
},
{
"epoch": 2.45,
"grad_norm": 0.4896499548762498,
"learning_rate": 5e-06,
"loss": 0.5012,
"step": 980
},
{
"epoch": 2.475,
"grad_norm": 0.6075465454296445,
"learning_rate": 5e-06,
"loss": 0.4921,
"step": 990
},
{
"epoch": 2.5,
"grad_norm": 0.588232935912865,
"learning_rate": 5e-06,
"loss": 0.4967,
"step": 1000
},
{
"epoch": 2.525,
"grad_norm": 0.533857697833421,
"learning_rate": 5e-06,
"loss": 0.4981,
"step": 1010
},
{
"epoch": 2.55,
"grad_norm": 0.5102421831778537,
"learning_rate": 5e-06,
"loss": 0.4963,
"step": 1020
},
{
"epoch": 2.575,
"grad_norm": 0.571515094485817,
"learning_rate": 5e-06,
"loss": 0.4994,
"step": 1030
},
{
"epoch": 2.6,
"grad_norm": 0.5274028357185288,
"learning_rate": 5e-06,
"loss": 0.4998,
"step": 1040
},
{
"epoch": 2.625,
"grad_norm": 0.5263461614707381,
"learning_rate": 5e-06,
"loss": 0.4935,
"step": 1050
},
{
"epoch": 2.65,
"grad_norm": 0.5484636738493971,
"learning_rate": 5e-06,
"loss": 0.495,
"step": 1060
},
{
"epoch": 2.675,
"grad_norm": 0.48284125009839746,
"learning_rate": 5e-06,
"loss": 0.5026,
"step": 1070
},
{
"epoch": 2.7,
"grad_norm": 0.5049035715654736,
"learning_rate": 5e-06,
"loss": 0.4953,
"step": 1080
},
{
"epoch": 2.725,
"grad_norm": 0.5451746081470605,
"learning_rate": 5e-06,
"loss": 0.4917,
"step": 1090
},
{
"epoch": 2.75,
"grad_norm": 0.4946736397645321,
"learning_rate": 5e-06,
"loss": 0.4982,
"step": 1100
},
{
"epoch": 2.775,
"grad_norm": 0.5804259517812362,
"learning_rate": 5e-06,
"loss": 0.4939,
"step": 1110
},
{
"epoch": 2.8,
"grad_norm": 0.5489030189752196,
"learning_rate": 5e-06,
"loss": 0.4957,
"step": 1120
},
{
"epoch": 2.825,
"grad_norm": 0.5457133379941178,
"learning_rate": 5e-06,
"loss": 0.4948,
"step": 1130
},
{
"epoch": 2.85,
"grad_norm": 0.5151610258671091,
"learning_rate": 5e-06,
"loss": 0.4965,
"step": 1140
},
{
"epoch": 2.875,
"grad_norm": 0.5480931688710529,
"learning_rate": 5e-06,
"loss": 0.5025,
"step": 1150
},
{
"epoch": 2.9,
"grad_norm": 0.5250233587635805,
"learning_rate": 5e-06,
"loss": 0.5,
"step": 1160
},
{
"epoch": 2.925,
"grad_norm": 0.5611546648048623,
"learning_rate": 5e-06,
"loss": 0.4959,
"step": 1170
},
{
"epoch": 2.95,
"grad_norm": 0.5168606076772253,
"learning_rate": 5e-06,
"loss": 0.4976,
"step": 1180
},
{
"epoch": 2.975,
"grad_norm": 0.5089740614604118,
"learning_rate": 5e-06,
"loss": 0.4977,
"step": 1190
},
{
"epoch": 3.0,
"grad_norm": 0.501278157123975,
"learning_rate": 5e-06,
"loss": 0.4927,
"step": 1200
},
{
"epoch": 3.0,
"eval_loss": 0.5540264248847961,
"eval_runtime": 214.8763,
"eval_samples_per_second": 50.154,
"eval_steps_per_second": 0.396,
"step": 1200
},
{
"epoch": 3.0,
"step": 1200,
"total_flos": 2009625935216640.0,
"train_loss": 0.5581841540336608,
"train_runtime": 35639.5638,
"train_samples_per_second": 17.235,
"train_steps_per_second": 0.034
}
],
"logging_steps": 10,
"max_steps": 1200,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2009625935216640.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}