nttx's picture
Training in progress, epoch 0, checkpoint
e37915a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7152436298614215,
"eval_steps": 100,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001788109074653554,
"eval_loss": 2.058881998062134,
"eval_runtime": 19.2419,
"eval_samples_per_second": 12.265,
"eval_steps_per_second": 6.132,
"step": 1
},
{
"epoch": 0.008940545373267769,
"grad_norm": 11.78614616394043,
"learning_rate": 1.6666666666666667e-05,
"loss": 7.2335,
"step": 5
},
{
"epoch": 0.017881090746535537,
"grad_norm": 16.716123580932617,
"learning_rate": 3.3333333333333335e-05,
"loss": 6.8512,
"step": 10
},
{
"epoch": 0.02682163611980331,
"grad_norm": 30.8808650970459,
"learning_rate": 5e-05,
"loss": 4.9807,
"step": 15
},
{
"epoch": 0.035762181493071074,
"grad_norm": 22.647905349731445,
"learning_rate": 6.666666666666667e-05,
"loss": 5.8287,
"step": 20
},
{
"epoch": 0.044702726866338846,
"grad_norm": 18.210037231445312,
"learning_rate": 8.333333333333334e-05,
"loss": 4.6728,
"step": 25
},
{
"epoch": 0.05364327223960662,
"grad_norm": 24.852115631103516,
"learning_rate": 0.0001,
"loss": 4.8721,
"step": 30
},
{
"epoch": 0.06258381761287439,
"grad_norm": 21.848241806030273,
"learning_rate": 9.995494831023409e-05,
"loss": 4.8087,
"step": 35
},
{
"epoch": 0.07152436298614215,
"grad_norm": 38.46466827392578,
"learning_rate": 9.981987442712633e-05,
"loss": 6.3369,
"step": 40
},
{
"epoch": 0.08046490835940992,
"grad_norm": 40.56308364868164,
"learning_rate": 9.959502176294383e-05,
"loss": 5.1838,
"step": 45
},
{
"epoch": 0.08940545373267769,
"grad_norm": 50.512901306152344,
"learning_rate": 9.928079551738543e-05,
"loss": 5.9317,
"step": 50
},
{
"epoch": 0.09834599910594546,
"grad_norm": 10.443129539489746,
"learning_rate": 9.887776194738432e-05,
"loss": 6.75,
"step": 55
},
{
"epoch": 0.10728654447921324,
"grad_norm": 11.418550491333008,
"learning_rate": 9.838664734667495e-05,
"loss": 6.4133,
"step": 60
},
{
"epoch": 0.11622708985248101,
"grad_norm": 12.176839828491211,
"learning_rate": 9.780833673696254e-05,
"loss": 4.5906,
"step": 65
},
{
"epoch": 0.12516763522574878,
"grad_norm": 12.221039772033691,
"learning_rate": 9.714387227305422e-05,
"loss": 4.6752,
"step": 70
},
{
"epoch": 0.13410818059901655,
"grad_norm": 14.278007507324219,
"learning_rate": 9.639445136482548e-05,
"loss": 4.6537,
"step": 75
},
{
"epoch": 0.1430487259722843,
"grad_norm": 17.677927017211914,
"learning_rate": 9.55614245194068e-05,
"loss": 4.1464,
"step": 80
},
{
"epoch": 0.15198927134555207,
"grad_norm": 18.3587589263916,
"learning_rate": 9.464629290747842e-05,
"loss": 4.9328,
"step": 85
},
{
"epoch": 0.16092981671881984,
"grad_norm": 30.522045135498047,
"learning_rate": 9.365070565805941e-05,
"loss": 5.6559,
"step": 90
},
{
"epoch": 0.1698703620920876,
"grad_norm": 38.24018859863281,
"learning_rate": 9.257645688666556e-05,
"loss": 4.9604,
"step": 95
},
{
"epoch": 0.17881090746535538,
"grad_norm": 39.934608459472656,
"learning_rate": 9.142548246219212e-05,
"loss": 5.3934,
"step": 100
},
{
"epoch": 0.17881090746535538,
"eval_loss": 1.4117575883865356,
"eval_runtime": 19.5949,
"eval_samples_per_second": 12.044,
"eval_steps_per_second": 6.022,
"step": 100
},
{
"epoch": 0.18775145283862316,
"grad_norm": 9.125673294067383,
"learning_rate": 9.019985651834703e-05,
"loss": 6.622,
"step": 105
},
{
"epoch": 0.19669199821189093,
"grad_norm": 10.328652381896973,
"learning_rate": 8.890178771592199e-05,
"loss": 5.5454,
"step": 110
},
{
"epoch": 0.2056325435851587,
"grad_norm": 10.82226276397705,
"learning_rate": 8.753361526263621e-05,
"loss": 4.1617,
"step": 115
},
{
"epoch": 0.21457308895842647,
"grad_norm": 10.679474830627441,
"learning_rate": 8.609780469772623e-05,
"loss": 3.8837,
"step": 120
},
{
"epoch": 0.22351363433169424,
"grad_norm": 13.074336051940918,
"learning_rate": 8.459694344887732e-05,
"loss": 4.5478,
"step": 125
},
{
"epoch": 0.23245417970496202,
"grad_norm": 19.095172882080078,
"learning_rate": 8.303373616950408e-05,
"loss": 4.6366,
"step": 130
},
{
"epoch": 0.24139472507822976,
"grad_norm": 15.73512077331543,
"learning_rate": 8.141099986478212e-05,
"loss": 5.1201,
"step": 135
},
{
"epoch": 0.25033527045149756,
"grad_norm": 25.20210838317871,
"learning_rate": 7.973165881521434e-05,
"loss": 4.8318,
"step": 140
},
{
"epoch": 0.25927581582476533,
"grad_norm": 43.75430679321289,
"learning_rate": 7.799873930687978e-05,
"loss": 5.2405,
"step": 145
},
{
"epoch": 0.2682163611980331,
"grad_norm": 48.83418273925781,
"learning_rate": 7.621536417786159e-05,
"loss": 5.5238,
"step": 150
},
{
"epoch": 0.2771569065713009,
"grad_norm": 8.100919723510742,
"learning_rate": 7.438474719068173e-05,
"loss": 6.6697,
"step": 155
},
{
"epoch": 0.2860974519445686,
"grad_norm": 11.477263450622559,
"learning_rate": 7.251018724088367e-05,
"loss": 5.9319,
"step": 160
},
{
"epoch": 0.29503799731783636,
"grad_norm": 10.063141822814941,
"learning_rate": 7.059506241219965e-05,
"loss": 5.7131,
"step": 165
},
{
"epoch": 0.30397854269110414,
"grad_norm": 12.019450187683105,
"learning_rate": 6.864282388901544e-05,
"loss": 4.1435,
"step": 170
},
{
"epoch": 0.3129190880643719,
"grad_norm": 11.132643699645996,
"learning_rate": 6.665698973710288e-05,
"loss": 4.311,
"step": 175
},
{
"epoch": 0.3218596334376397,
"grad_norm": 16.12037467956543,
"learning_rate": 6.464113856382752e-05,
"loss": 4.0409,
"step": 180
},
{
"epoch": 0.33080017881090745,
"grad_norm": 14.883207321166992,
"learning_rate": 6.259890306925627e-05,
"loss": 4.0187,
"step": 185
},
{
"epoch": 0.3397407241841752,
"grad_norm": 19.90201187133789,
"learning_rate": 6.0533963499786314e-05,
"loss": 4.8555,
"step": 190
},
{
"epoch": 0.348681269557443,
"grad_norm": 18.977224349975586,
"learning_rate": 5.8450041016092464e-05,
"loss": 3.8155,
"step": 195
},
{
"epoch": 0.35762181493071077,
"grad_norm": 93.56307220458984,
"learning_rate": 5.6350890987343944e-05,
"loss": 6.7862,
"step": 200
},
{
"epoch": 0.35762181493071077,
"eval_loss": 1.2747399806976318,
"eval_runtime": 19.5936,
"eval_samples_per_second": 12.045,
"eval_steps_per_second": 6.022,
"step": 200
},
{
"epoch": 0.36656236030397854,
"grad_norm": 10.30494213104248,
"learning_rate": 5.4240296223775465e-05,
"loss": 6.2965,
"step": 205
},
{
"epoch": 0.3755029056772463,
"grad_norm": 9.586296081542969,
"learning_rate": 5.212206015980742e-05,
"loss": 5.2106,
"step": 210
},
{
"epoch": 0.3844434510505141,
"grad_norm": 11.460832595825195,
"learning_rate": 5e-05,
"loss": 5.0099,
"step": 215
},
{
"epoch": 0.39338399642378186,
"grad_norm": 15.051645278930664,
"learning_rate": 4.78779398401926e-05,
"loss": 4.0896,
"step": 220
},
{
"epoch": 0.40232454179704963,
"grad_norm": 15.79628849029541,
"learning_rate": 4.575970377622456e-05,
"loss": 3.9889,
"step": 225
},
{
"epoch": 0.4112650871703174,
"grad_norm": 9.98121166229248,
"learning_rate": 4.364910901265606e-05,
"loss": 4.068,
"step": 230
},
{
"epoch": 0.4202056325435852,
"grad_norm": 13.449023246765137,
"learning_rate": 4.1549958983907555e-05,
"loss": 4.0041,
"step": 235
},
{
"epoch": 0.42914617791685294,
"grad_norm": 17.6621150970459,
"learning_rate": 3.94660365002137e-05,
"loss": 4.5437,
"step": 240
},
{
"epoch": 0.4380867232901207,
"grad_norm": 42.47966384887695,
"learning_rate": 3.740109693074375e-05,
"loss": 4.8385,
"step": 245
},
{
"epoch": 0.4470272686633885,
"grad_norm": 54.10409164428711,
"learning_rate": 3.5358861436172485e-05,
"loss": 5.6786,
"step": 250
},
{
"epoch": 0.45596781403665626,
"grad_norm": 8.634622573852539,
"learning_rate": 3.334301026289712e-05,
"loss": 6.4977,
"step": 255
},
{
"epoch": 0.46490835940992403,
"grad_norm": 11.310672760009766,
"learning_rate": 3.135717611098458e-05,
"loss": 4.4356,
"step": 260
},
{
"epoch": 0.47384890478319175,
"grad_norm": 11.240716934204102,
"learning_rate": 2.9404937587800375e-05,
"loss": 4.3816,
"step": 265
},
{
"epoch": 0.4827894501564595,
"grad_norm": 10.130899429321289,
"learning_rate": 2.748981275911633e-05,
"loss": 4.2432,
"step": 270
},
{
"epoch": 0.4917299955297273,
"grad_norm": 11.680331230163574,
"learning_rate": 2.5615252809318284e-05,
"loss": 3.5449,
"step": 275
},
{
"epoch": 0.5006705409029951,
"grad_norm": 14.900591850280762,
"learning_rate": 2.3784635822138424e-05,
"loss": 4.2528,
"step": 280
},
{
"epoch": 0.5096110862762628,
"grad_norm": 17.08873748779297,
"learning_rate": 2.2001260693120233e-05,
"loss": 4.8141,
"step": 285
},
{
"epoch": 0.5185516316495307,
"grad_norm": 24.609628677368164,
"learning_rate": 2.026834118478567e-05,
"loss": 3.6354,
"step": 290
},
{
"epoch": 0.5274921770227984,
"grad_norm": 29.8326358795166,
"learning_rate": 1.858900013521788e-05,
"loss": 4.305,
"step": 295
},
{
"epoch": 0.5364327223960662,
"grad_norm": 40.15150833129883,
"learning_rate": 1.6966263830495936e-05,
"loss": 4.9746,
"step": 300
},
{
"epoch": 0.5364327223960662,
"eval_loss": 1.2247991561889648,
"eval_runtime": 19.5982,
"eval_samples_per_second": 12.042,
"eval_steps_per_second": 6.021,
"step": 300
},
{
"epoch": 0.5453732677693339,
"grad_norm": 6.730564117431641,
"learning_rate": 1.5403056551122697e-05,
"loss": 5.6797,
"step": 305
},
{
"epoch": 0.5543138131426018,
"grad_norm": 12.260231971740723,
"learning_rate": 1.3902195302273779e-05,
"loss": 4.9479,
"step": 310
},
{
"epoch": 0.5632543585158695,
"grad_norm": 12.601056098937988,
"learning_rate": 1.246638473736378e-05,
"loss": 4.2043,
"step": 315
},
{
"epoch": 0.5721949038891372,
"grad_norm": 12.597325325012207,
"learning_rate": 1.1098212284078036e-05,
"loss": 4.047,
"step": 320
},
{
"epoch": 0.581135449262405,
"grad_norm": 12.159523010253906,
"learning_rate": 9.800143481652979e-06,
"loss": 4.2291,
"step": 325
},
{
"epoch": 0.5900759946356727,
"grad_norm": 12.575312614440918,
"learning_rate": 8.574517537807897e-06,
"loss": 3.4932,
"step": 330
},
{
"epoch": 0.5990165400089406,
"grad_norm": 25.68616485595703,
"learning_rate": 7.423543113334436e-06,
"loss": 4.4752,
"step": 335
},
{
"epoch": 0.6079570853822083,
"grad_norm": 19.73110008239746,
"learning_rate": 6.349294341940593e-06,
"loss": 4.4499,
"step": 340
},
{
"epoch": 0.6168976307554761,
"grad_norm": 33.66725158691406,
"learning_rate": 5.353707092521582e-06,
"loss": 3.9091,
"step": 345
},
{
"epoch": 0.6258381761287438,
"grad_norm": 37.051727294921875,
"learning_rate": 4.43857548059321e-06,
"loss": 5.7529,
"step": 350
},
{
"epoch": 0.6347787215020116,
"grad_norm": 11.532958030700684,
"learning_rate": 3.605548635174533e-06,
"loss": 5.8252,
"step": 355
},
{
"epoch": 0.6437192668752794,
"grad_norm": 10.616311073303223,
"learning_rate": 2.85612772694579e-06,
"loss": 4.048,
"step": 360
},
{
"epoch": 0.6526598122485472,
"grad_norm": 9.950262069702148,
"learning_rate": 2.191663263037458e-06,
"loss": 4.3658,
"step": 365
},
{
"epoch": 0.6616003576218149,
"grad_norm": 9.809283256530762,
"learning_rate": 1.6133526533250565e-06,
"loss": 3.7123,
"step": 370
},
{
"epoch": 0.6705409029950827,
"grad_norm": 8.985926628112793,
"learning_rate": 1.1222380526156928e-06,
"loss": 3.5311,
"step": 375
},
{
"epoch": 0.6794814483683504,
"grad_norm": 15.09157657623291,
"learning_rate": 7.192044826145771e-07,
"loss": 3.9273,
"step": 380
},
{
"epoch": 0.6884219937416183,
"grad_norm": 19.525718688964844,
"learning_rate": 4.049782370561583e-07,
"loss": 4.0169,
"step": 385
},
{
"epoch": 0.697362539114886,
"grad_norm": 22.638031005859375,
"learning_rate": 1.8012557287367392e-07,
"loss": 4.7521,
"step": 390
},
{
"epoch": 0.7063030844881538,
"grad_norm": 29.394636154174805,
"learning_rate": 4.5051689765929214e-08,
"loss": 4.773,
"step": 395
},
{
"epoch": 0.7152436298614215,
"grad_norm": 39.56925964355469,
"learning_rate": 0.0,
"loss": 4.2009,
"step": 400
},
{
"epoch": 0.7152436298614215,
"eval_loss": 1.1803392171859741,
"eval_runtime": 19.5987,
"eval_samples_per_second": 12.042,
"eval_steps_per_second": 6.021,
"step": 400
}
],
"logging_steps": 5,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4091489359010202e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}