|
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.204869681629921,
|
|
"eval_steps": 900,
|
|
"global_step": 9000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0013387440906999122,
|
|
"grad_norm": 11798.958984375,
|
|
"learning_rate": 1.1111111111111112e-07,
|
|
"loss": 10.2879,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0026774881813998244,
|
|
"grad_norm": 2084.604736328125,
|
|
"learning_rate": 2.2222222222222224e-07,
|
|
"loss": 7.3304,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.004016232272099737,
|
|
"grad_norm": 511.07745361328125,
|
|
"learning_rate": 3.3333333333333335e-07,
|
|
"loss": 7.1891,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.005354976362799649,
|
|
"grad_norm": 1105.57421875,
|
|
"learning_rate": 4.444444444444445e-07,
|
|
"loss": 6.8518,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.006693720453499561,
|
|
"grad_norm": 1405.3367919921875,
|
|
"learning_rate": 5.555555555555555e-07,
|
|
"loss": 6.3241,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.008032464544199473,
|
|
"grad_norm": 9173.2060546875,
|
|
"learning_rate": 6.666666666666667e-07,
|
|
"loss": 5.3569,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.009371208634899385,
|
|
"grad_norm": 8319.126953125,
|
|
"learning_rate": 7.777777777777779e-07,
|
|
"loss": 4.759,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.010709952725599298,
|
|
"grad_norm": 159.01446533203125,
|
|
"learning_rate": 8.88888888888889e-07,
|
|
"loss": 2.9084,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.01204869681629921,
|
|
"grad_norm": 131.8560028076172,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": 2.3938,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.013387440906999122,
|
|
"grad_norm": 77.1502914428711,
|
|
"learning_rate": 1.111111111111111e-06,
|
|
"loss": 2.0226,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.014726184997699034,
|
|
"grad_norm": 42.61675262451172,
|
|
"learning_rate": 1.2222222222222223e-06,
|
|
"loss": 1.798,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.016064929088398947,
|
|
"grad_norm": 36.571746826171875,
|
|
"learning_rate": 1.3333333333333334e-06,
|
|
"loss": 1.5957,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.01740367317909886,
|
|
"grad_norm": 25.031883239746094,
|
|
"learning_rate": 1.4444444444444445e-06,
|
|
"loss": 1.42,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.01874241726979877,
|
|
"grad_norm": 35.95310974121094,
|
|
"learning_rate": 1.5555555555555558e-06,
|
|
"loss": 1.2519,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.020081161360498683,
|
|
"grad_norm": 15.751395225524902,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 1.1448,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.021419905451198595,
|
|
"grad_norm": 19.347064971923828,
|
|
"learning_rate": 1.777777777777778e-06,
|
|
"loss": 1.1018,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.022758649541898508,
|
|
"grad_norm": 9.843700408935547,
|
|
"learning_rate": 1.888888888888889e-06,
|
|
"loss": 1.037,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.02409739363259842,
|
|
"grad_norm": 12.104240417480469,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 1.0081,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.025436137723298332,
|
|
"grad_norm": 14.820916175842285,
|
|
"learning_rate": 2.1111111111111114e-06,
|
|
"loss": 0.9778,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.026774881813998244,
|
|
"grad_norm": 16.050945281982422,
|
|
"learning_rate": 2.222222222222222e-06,
|
|
"loss": 0.9773,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.028113625904698156,
|
|
"grad_norm": 7.794721603393555,
|
|
"learning_rate": 2.3333333333333336e-06,
|
|
"loss": 0.954,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.02945236999539807,
|
|
"grad_norm": 6.588793754577637,
|
|
"learning_rate": 2.4444444444444447e-06,
|
|
"loss": 0.9249,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.03079111408609798,
|
|
"grad_norm": 15.669656753540039,
|
|
"learning_rate": 2.5555555555555557e-06,
|
|
"loss": 0.9315,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.03212985817679789,
|
|
"grad_norm": 8.00128173828125,
|
|
"learning_rate": 2.666666666666667e-06,
|
|
"loss": 0.918,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.0334686022674978,
|
|
"grad_norm": 23.05211067199707,
|
|
"learning_rate": 2.7777777777777783e-06,
|
|
"loss": 0.907,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.03480734635819772,
|
|
"grad_norm": 6.808403015136719,
|
|
"learning_rate": 2.888888888888889e-06,
|
|
"loss": 0.9078,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.036146090448897626,
|
|
"grad_norm": 5.905485153198242,
|
|
"learning_rate": 3e-06,
|
|
"loss": 0.9011,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.03748483453959754,
|
|
"grad_norm": 7.63453483581543,
|
|
"learning_rate": 3.1111111111111116e-06,
|
|
"loss": 0.8922,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.03882357863029745,
|
|
"grad_norm": 5.623775959014893,
|
|
"learning_rate": 3.2222222222222227e-06,
|
|
"loss": 0.8855,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.040162322720997366,
|
|
"grad_norm": 6.224774360656738,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 0.8638,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.041501066811697275,
|
|
"grad_norm": 6.711490631103516,
|
|
"learning_rate": 3.444444444444445e-06,
|
|
"loss": 0.8648,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.04283981090239719,
|
|
"grad_norm": 5.856541156768799,
|
|
"learning_rate": 3.555555555555556e-06,
|
|
"loss": 0.8609,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.0441785549930971,
|
|
"grad_norm": 6.695345401763916,
|
|
"learning_rate": 3.6666666666666666e-06,
|
|
"loss": 0.866,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.045517299083797015,
|
|
"grad_norm": 6.749303817749023,
|
|
"learning_rate": 3.777777777777778e-06,
|
|
"loss": 0.8543,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.046856043174496924,
|
|
"grad_norm": 4.342862129211426,
|
|
"learning_rate": 3.88888888888889e-06,
|
|
"loss": 0.8638,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.04819478726519684,
|
|
"grad_norm": 6.042810440063477,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.8638,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.04953353135589675,
|
|
"grad_norm": 5.685999393463135,
|
|
"learning_rate": 4.111111111111111e-06,
|
|
"loss": 0.857,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.050872275446596664,
|
|
"grad_norm": 4.668613910675049,
|
|
"learning_rate": 4.222222222222223e-06,
|
|
"loss": 0.8567,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.05221101953729657,
|
|
"grad_norm": 5.36888313293457,
|
|
"learning_rate": 4.333333333333334e-06,
|
|
"loss": 0.8515,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.05354976362799649,
|
|
"grad_norm": 4.750673770904541,
|
|
"learning_rate": 4.444444444444444e-06,
|
|
"loss": 0.8403,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.0548885077186964,
|
|
"grad_norm": 4.690779685974121,
|
|
"learning_rate": 4.555555555555556e-06,
|
|
"loss": 0.8559,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.05622725180939631,
|
|
"grad_norm": 5.240411758422852,
|
|
"learning_rate": 4.666666666666667e-06,
|
|
"loss": 0.8451,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.05756599590009622,
|
|
"grad_norm": 5.428740501403809,
|
|
"learning_rate": 4.777777777777778e-06,
|
|
"loss": 0.8361,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.05890473999079614,
|
|
"grad_norm": 5.766580104827881,
|
|
"learning_rate": 4.888888888888889e-06,
|
|
"loss": 0.8398,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.060243484081496046,
|
|
"grad_norm": 4.638603210449219,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.8401,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.06158222817219596,
|
|
"grad_norm": 4.195446968078613,
|
|
"learning_rate": 5.1111111111111115e-06,
|
|
"loss": 0.8323,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.06292097226289588,
|
|
"grad_norm": 5.0665812492370605,
|
|
"learning_rate": 5.2222222222222226e-06,
|
|
"loss": 0.8364,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.06425971635359579,
|
|
"grad_norm": 4.643868446350098,
|
|
"learning_rate": 5.333333333333334e-06,
|
|
"loss": 0.8362,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.0655984604442957,
|
|
"grad_norm": 5.377744674682617,
|
|
"learning_rate": 5.444444444444445e-06,
|
|
"loss": 0.8269,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.0669372045349956,
|
|
"grad_norm": 4.733901023864746,
|
|
"learning_rate": 5.555555555555557e-06,
|
|
"loss": 0.8427,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.06827594862569553,
|
|
"grad_norm": 5.304458141326904,
|
|
"learning_rate": 5.666666666666667e-06,
|
|
"loss": 0.823,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.06961469271639543,
|
|
"grad_norm": 4.57764196395874,
|
|
"learning_rate": 5.777777777777778e-06,
|
|
"loss": 0.811,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.07095343680709534,
|
|
"grad_norm": 4.612604141235352,
|
|
"learning_rate": 5.88888888888889e-06,
|
|
"loss": 0.8346,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.07229218089779525,
|
|
"grad_norm": 4.134374141693115,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.82,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.07363092498849516,
|
|
"grad_norm": 4.34883451461792,
|
|
"learning_rate": 6.111111111111112e-06,
|
|
"loss": 0.8284,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.07496966907919508,
|
|
"grad_norm": 4.357181549072266,
|
|
"learning_rate": 6.222222222222223e-06,
|
|
"loss": 0.818,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.07630841316989499,
|
|
"grad_norm": 4.645741939544678,
|
|
"learning_rate": 6.333333333333333e-06,
|
|
"loss": 0.8258,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.0776471572605949,
|
|
"grad_norm": 8.378664016723633,
|
|
"learning_rate": 6.444444444444445e-06,
|
|
"loss": 0.8156,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.07898590135129481,
|
|
"grad_norm": 3.886690855026245,
|
|
"learning_rate": 6.555555555555556e-06,
|
|
"loss": 0.8209,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.08032464544199473,
|
|
"grad_norm": 4.341153621673584,
|
|
"learning_rate": 6.666666666666667e-06,
|
|
"loss": 0.836,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.08166338953269464,
|
|
"grad_norm": 3.9008429050445557,
|
|
"learning_rate": 6.777777777777779e-06,
|
|
"loss": 0.8217,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.08300213362339455,
|
|
"grad_norm": 5.382652282714844,
|
|
"learning_rate": 6.88888888888889e-06,
|
|
"loss": 0.8164,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.08434087771409446,
|
|
"grad_norm": 4.408705711364746,
|
|
"learning_rate": 7e-06,
|
|
"loss": 0.8187,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.08567962180479438,
|
|
"grad_norm": 17.62004280090332,
|
|
"learning_rate": 7.111111111111112e-06,
|
|
"loss": 0.8215,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.08701836589549429,
|
|
"grad_norm": 5.151593208312988,
|
|
"learning_rate": 7.222222222222223e-06,
|
|
"loss": 0.8166,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.0883571099861942,
|
|
"grad_norm": 4.942852020263672,
|
|
"learning_rate": 7.333333333333333e-06,
|
|
"loss": 0.8233,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.08969585407689411,
|
|
"grad_norm": 3.7978098392486572,
|
|
"learning_rate": 7.444444444444445e-06,
|
|
"loss": 0.8022,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.09103459816759403,
|
|
"grad_norm": 4.018903732299805,
|
|
"learning_rate": 7.555555555555556e-06,
|
|
"loss": 0.801,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.09237334225829394,
|
|
"grad_norm": 3.3000519275665283,
|
|
"learning_rate": 7.666666666666667e-06,
|
|
"loss": 0.8258,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.09371208634899385,
|
|
"grad_norm": 4.254425048828125,
|
|
"learning_rate": 7.77777777777778e-06,
|
|
"loss": 0.8062,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.09505083043969376,
|
|
"grad_norm": 5.094308376312256,
|
|
"learning_rate": 7.88888888888889e-06,
|
|
"loss": 0.8148,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.09638957453039368,
|
|
"grad_norm": 3.5040857791900635,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 0.8103,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.09772831862109359,
|
|
"grad_norm": 4.521397590637207,
|
|
"learning_rate": 8.111111111111112e-06,
|
|
"loss": 0.8192,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.0990670627117935,
|
|
"grad_norm": 4.252678871154785,
|
|
"learning_rate": 8.222222222222222e-06,
|
|
"loss": 0.8278,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.1004058068024934,
|
|
"grad_norm": 4.225308418273926,
|
|
"learning_rate": 8.333333333333334e-06,
|
|
"loss": 0.8085,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.10174455089319333,
|
|
"grad_norm": 4.590817451477051,
|
|
"learning_rate": 8.444444444444446e-06,
|
|
"loss": 0.8074,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.10308329498389324,
|
|
"grad_norm": 4.091726303100586,
|
|
"learning_rate": 8.555555555555556e-06,
|
|
"loss": 0.8102,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.10442203907459315,
|
|
"grad_norm": 5.2528557777404785,
|
|
"learning_rate": 8.666666666666668e-06,
|
|
"loss": 0.8002,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.10576078316529305,
|
|
"grad_norm": 4.67716646194458,
|
|
"learning_rate": 8.777777777777778e-06,
|
|
"loss": 0.8037,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.10709952725599298,
|
|
"grad_norm": 4.421415328979492,
|
|
"learning_rate": 8.888888888888888e-06,
|
|
"loss": 0.8043,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.10843827134669289,
|
|
"grad_norm": 5.166499614715576,
|
|
"learning_rate": 9e-06,
|
|
"loss": 0.8139,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.1097770154373928,
|
|
"grad_norm": 3.4419240951538086,
|
|
"learning_rate": 9.111111111111112e-06,
|
|
"loss": 0.8043,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.1111157595280927,
|
|
"grad_norm": 4.395360946655273,
|
|
"learning_rate": 9.222222222222224e-06,
|
|
"loss": 0.8063,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.11245450361879263,
|
|
"grad_norm": 4.6604390144348145,
|
|
"learning_rate": 9.333333333333334e-06,
|
|
"loss": 0.7877,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.11379324770949253,
|
|
"grad_norm": 3.9943435192108154,
|
|
"learning_rate": 9.444444444444445e-06,
|
|
"loss": 0.7982,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.11513199180019244,
|
|
"grad_norm": 3.9260923862457275,
|
|
"learning_rate": 9.555555555555556e-06,
|
|
"loss": 0.7937,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.11647073589089235,
|
|
"grad_norm": 4.23286771774292,
|
|
"learning_rate": 9.666666666666667e-06,
|
|
"loss": 0.8051,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.11780947998159227,
|
|
"grad_norm": 4.055145263671875,
|
|
"learning_rate": 9.777777777777779e-06,
|
|
"loss": 0.791,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.11914822407229218,
|
|
"grad_norm": 3.6109678745269775,
|
|
"learning_rate": 9.88888888888889e-06,
|
|
"loss": 0.8003,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.12048696816299209,
|
|
"grad_norm": 4.552112102508545,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.7877,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.12048696816299209,
|
|
"eval_loss": 0.49451154470443726,
|
|
"eval_runtime": 143.5523,
|
|
"eval_samples_per_second": 76.627,
|
|
"eval_steps_per_second": 9.578,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.121825712253692,
|
|
"grad_norm": 3.366373062133789,
|
|
"learning_rate": 9.999962392958281e-06,
|
|
"loss": 0.7957,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.12316445634439192,
|
|
"grad_norm": 3.7465860843658447,
|
|
"learning_rate": 9.99984957239884e-06,
|
|
"loss": 0.7917,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.12450320043509183,
|
|
"grad_norm": 4.393531322479248,
|
|
"learning_rate": 9.999661540018812e-06,
|
|
"loss": 0.8008,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.12584194452579175,
|
|
"grad_norm": 4.885051727294922,
|
|
"learning_rate": 9.999398298646738e-06,
|
|
"loss": 0.7991,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.12718068861649165,
|
|
"grad_norm": 4.613903999328613,
|
|
"learning_rate": 9.999059852242508e-06,
|
|
"loss": 0.8,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.12851943270719157,
|
|
"grad_norm": 3.7830846309661865,
|
|
"learning_rate": 9.99864620589731e-06,
|
|
"loss": 0.7892,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.12985817679789147,
|
|
"grad_norm": 4.733177661895752,
|
|
"learning_rate": 9.998157365833548e-06,
|
|
"loss": 0.7938,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.1311969208885914,
|
|
"grad_norm": 4.23670768737793,
|
|
"learning_rate": 9.997593339404757e-06,
|
|
"loss": 0.8031,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.1325356649792913,
|
|
"grad_norm": 4.839778423309326,
|
|
"learning_rate": 9.99695413509548e-06,
|
|
"loss": 0.7791,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.1338744090699912,
|
|
"grad_norm": 4.334091663360596,
|
|
"learning_rate": 9.996239762521152e-06,
|
|
"loss": 0.7993,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.13521315316069113,
|
|
"grad_norm": 3.916949987411499,
|
|
"learning_rate": 9.995450232427947e-06,
|
|
"loss": 0.8048,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.13655189725139105,
|
|
"grad_norm": 3.399409532546997,
|
|
"learning_rate": 9.994585556692624e-06,
|
|
"loss": 0.7863,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.13789064134209095,
|
|
"grad_norm": 4.329835414886475,
|
|
"learning_rate": 9.99364574832234e-06,
|
|
"loss": 0.7937,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.13922938543279087,
|
|
"grad_norm": 6.405045509338379,
|
|
"learning_rate": 9.992630821454458e-06,
|
|
"loss": 0.8128,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.14056812952349076,
|
|
"grad_norm": 10.752445220947266,
|
|
"learning_rate": 9.991540791356342e-06,
|
|
"loss": 0.801,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.1419068736141907,
|
|
"grad_norm": 50.446380615234375,
|
|
"learning_rate": 9.99037567442511e-06,
|
|
"loss": 1.0436,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.1432456177048906,
|
|
"grad_norm": 6.3563103675842285,
|
|
"learning_rate": 9.989135488187407e-06,
|
|
"loss": 0.8673,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.1445843617955905,
|
|
"grad_norm": 4.534543514251709,
|
|
"learning_rate": 9.987820251299121e-06,
|
|
"loss": 0.808,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.14592310588629043,
|
|
"grad_norm": 3.6836161613464355,
|
|
"learning_rate": 9.986429983545127e-06,
|
|
"loss": 0.7939,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.14726184997699032,
|
|
"grad_norm": 3.721494197845459,
|
|
"learning_rate": 9.98496470583896e-06,
|
|
"loss": 0.792,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.14860059406769024,
|
|
"grad_norm": 4.224364280700684,
|
|
"learning_rate": 9.98342444022253e-06,
|
|
"loss": 0.7995,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.14993933815839017,
|
|
"grad_norm": 4.185133934020996,
|
|
"learning_rate": 9.98180920986577e-06,
|
|
"loss": 0.7935,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.15127808224909006,
|
|
"grad_norm": 4.013835906982422,
|
|
"learning_rate": 9.98011903906629e-06,
|
|
"loss": 0.7937,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.15261682633978998,
|
|
"grad_norm": 4.451872825622559,
|
|
"learning_rate": 9.978353953249023e-06,
|
|
"loss": 0.7769,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.1539555704304899,
|
|
"grad_norm": 3.9698851108551025,
|
|
"learning_rate": 9.976513978965829e-06,
|
|
"loss": 0.7874,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.1552943145211898,
|
|
"grad_norm": 3.6052067279815674,
|
|
"learning_rate": 9.974599143895107e-06,
|
|
"loss": 0.7767,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.15663305861188973,
|
|
"grad_norm": 4.276534557342529,
|
|
"learning_rate": 9.972609476841368e-06,
|
|
"loss": 0.7854,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.15797180270258962,
|
|
"grad_norm": 4.056195259094238,
|
|
"learning_rate": 9.970545007734807e-06,
|
|
"loss": 0.7733,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.15931054679328954,
|
|
"grad_norm": 4.227043151855469,
|
|
"learning_rate": 9.968405767630857e-06,
|
|
"loss": 0.7749,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.16064929088398947,
|
|
"grad_norm": 3.6854279041290283,
|
|
"learning_rate": 9.966191788709716e-06,
|
|
"loss": 0.771,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.16198803497468936,
|
|
"grad_norm": 4.51245641708374,
|
|
"learning_rate": 9.963903104275859e-06,
|
|
"loss": 0.7873,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.16332677906538928,
|
|
"grad_norm": 4.075981616973877,
|
|
"learning_rate": 9.96153974875755e-06,
|
|
"loss": 0.788,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.1646655231560892,
|
|
"grad_norm": 3.9029247760772705,
|
|
"learning_rate": 9.959101757706308e-06,
|
|
"loss": 0.7739,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.1660042672467891,
|
|
"grad_norm": 4.428092956542969,
|
|
"learning_rate": 9.956589167796392e-06,
|
|
"loss": 0.7741,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.16734301133748902,
|
|
"grad_norm": 4.016323089599609,
|
|
"learning_rate": 9.954002016824226e-06,
|
|
"loss": 0.7896,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.16868175542818892,
|
|
"grad_norm": 3.254408597946167,
|
|
"learning_rate": 9.951340343707852e-06,
|
|
"loss": 0.7642,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.17002049951888884,
|
|
"grad_norm": 3.8163158893585205,
|
|
"learning_rate": 9.948604188486328e-06,
|
|
"loss": 0.7768,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.17135924360958876,
|
|
"grad_norm": 3.607434034347534,
|
|
"learning_rate": 9.945793592319137e-06,
|
|
"loss": 0.7894,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.17269798770028866,
|
|
"grad_norm": 3.229252576828003,
|
|
"learning_rate": 9.942908597485558e-06,
|
|
"loss": 0.7802,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.17403673179098858,
|
|
"grad_norm": 3.9906787872314453,
|
|
"learning_rate": 9.939949247384046e-06,
|
|
"loss": 0.7741,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.1753754758816885,
|
|
"grad_norm": 3.6085970401763916,
|
|
"learning_rate": 9.936915586531556e-06,
|
|
"loss": 0.7805,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.1767142199723884,
|
|
"grad_norm": 4.8091912269592285,
|
|
"learning_rate": 9.933807660562898e-06,
|
|
"loss": 0.7743,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.17805296406308832,
|
|
"grad_norm": 3.7678418159484863,
|
|
"learning_rate": 9.930625516230026e-06,
|
|
"loss": 0.7926,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.17939170815378822,
|
|
"grad_norm": 4.080018043518066,
|
|
"learning_rate": 9.927369201401358e-06,
|
|
"loss": 0.7601,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.18073045224448814,
|
|
"grad_norm": 6.115504264831543,
|
|
"learning_rate": 9.924038765061042e-06,
|
|
"loss": 0.7668,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.18206919633518806,
|
|
"grad_norm": 4.7918381690979,
|
|
"learning_rate": 9.920634257308217e-06,
|
|
"loss": 0.7741,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.18340794042588796,
|
|
"grad_norm": 4.837031841278076,
|
|
"learning_rate": 9.917155729356273e-06,
|
|
"loss": 0.7643,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.18474668451658788,
|
|
"grad_norm": 4.369593143463135,
|
|
"learning_rate": 9.913603233532067e-06,
|
|
"loss": 0.7692,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.1860854286072878,
|
|
"grad_norm": 3.901932954788208,
|
|
"learning_rate": 9.909976823275143e-06,
|
|
"loss": 0.7769,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.1874241726979877,
|
|
"grad_norm": 4.239790439605713,
|
|
"learning_rate": 9.906276553136924e-06,
|
|
"loss": 0.7607,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.18876291678868762,
|
|
"grad_norm": 4.206404685974121,
|
|
"learning_rate": 9.902502478779897e-06,
|
|
"loss": 0.7693,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.1901016608793875,
|
|
"grad_norm": 3.9536921977996826,
|
|
"learning_rate": 9.89865465697677e-06,
|
|
"loss": 0.7619,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.19144040497008744,
|
|
"grad_norm": 4.39210319519043,
|
|
"learning_rate": 9.894733145609623e-06,
|
|
"loss": 0.7595,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.19277914906078736,
|
|
"grad_norm": 3.726269006729126,
|
|
"learning_rate": 9.890738003669029e-06,
|
|
"loss": 0.7683,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.19411789315148725,
|
|
"grad_norm": 3.802138090133667,
|
|
"learning_rate": 9.886669291253178e-06,
|
|
"loss": 0.7721,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.19545663724218718,
|
|
"grad_norm": 3.9344289302825928,
|
|
"learning_rate": 9.882527069566965e-06,
|
|
"loss": 0.7572,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.1967953813328871,
|
|
"grad_norm": 4.495893955230713,
|
|
"learning_rate": 9.878311400921072e-06,
|
|
"loss": 0.7597,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.198134125423587,
|
|
"grad_norm": 3.827364921569824,
|
|
"learning_rate": 9.87402234873103e-06,
|
|
"loss": 0.7626,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.19947286951428692,
|
|
"grad_norm": 4.016800880432129,
|
|
"learning_rate": 9.869659977516261e-06,
|
|
"loss": 0.7706,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.2008116136049868,
|
|
"grad_norm": 3.641141414642334,
|
|
"learning_rate": 9.86522435289912e-06,
|
|
"loss": 0.7556,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.20215035769568673,
|
|
"grad_norm": 3.6428914070129395,
|
|
"learning_rate": 9.860715541603893e-06,
|
|
"loss": 0.7564,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.20348910178638666,
|
|
"grad_norm": 3.3614089488983154,
|
|
"learning_rate": 9.856133611455802e-06,
|
|
"loss": 0.7618,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.20482784587708655,
|
|
"grad_norm": 3.7163243293762207,
|
|
"learning_rate": 9.851478631379982e-06,
|
|
"loss": 0.7601,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.20616658996778647,
|
|
"grad_norm": 4.534898281097412,
|
|
"learning_rate": 9.846750671400447e-06,
|
|
"loss": 0.7499,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.2075053340584864,
|
|
"grad_norm": 4.304228782653809,
|
|
"learning_rate": 9.841949802639031e-06,
|
|
"loss": 0.783,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.2088440781491863,
|
|
"grad_norm": 3.5698232650756836,
|
|
"learning_rate": 9.83707609731432e-06,
|
|
"loss": 0.7646,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.2101828222398862,
|
|
"grad_norm": 4.564169883728027,
|
|
"learning_rate": 9.832129628740574e-06,
|
|
"loss": 0.7508,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.2115215663305861,
|
|
"grad_norm": 3.549129009246826,
|
|
"learning_rate": 9.827110471326612e-06,
|
|
"loss": 0.7581,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.21286031042128603,
|
|
"grad_norm": 4.5762481689453125,
|
|
"learning_rate": 9.822018700574696e-06,
|
|
"loss": 0.7523,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.21419905451198595,
|
|
"grad_norm": 3.662191390991211,
|
|
"learning_rate": 9.816854393079402e-06,
|
|
"loss": 0.7627,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.21553779860268585,
|
|
"grad_norm": 5.298248767852783,
|
|
"learning_rate": 9.811617626526462e-06,
|
|
"loss": 0.7616,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.21687654269338577,
|
|
"grad_norm": 4.655685901641846,
|
|
"learning_rate": 9.806308479691595e-06,
|
|
"loss": 0.7432,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.21821528678408567,
|
|
"grad_norm": 4.099658012390137,
|
|
"learning_rate": 9.800927032439322e-06,
|
|
"loss": 0.7581,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.2195540308747856,
|
|
"grad_norm": 4.044067859649658,
|
|
"learning_rate": 9.79547336572177e-06,
|
|
"loss": 0.753,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.2208927749654855,
|
|
"grad_norm": 3.636643171310425,
|
|
"learning_rate": 9.789947561577445e-06,
|
|
"loss": 0.7675,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.2222315190561854,
|
|
"grad_norm": 3.625516414642334,
|
|
"learning_rate": 9.784349703130008e-06,
|
|
"loss": 0.7397,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.22357026314688533,
|
|
"grad_norm": 3.8451921939849854,
|
|
"learning_rate": 9.778679874587016e-06,
|
|
"loss": 0.7597,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.22490900723758525,
|
|
"grad_norm": 5.806266784667969,
|
|
"learning_rate": 9.77293816123866e-06,
|
|
"loss": 0.7508,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.22624775132828515,
|
|
"grad_norm": 4.2292633056640625,
|
|
"learning_rate": 9.767124649456484e-06,
|
|
"loss": 0.7587,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.22758649541898507,
|
|
"grad_norm": 4.004525184631348,
|
|
"learning_rate": 9.761239426692077e-06,
|
|
"loss": 0.7325,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.22892523950968496,
|
|
"grad_norm": 4.519168853759766,
|
|
"learning_rate": 9.755282581475769e-06,
|
|
"loss": 0.757,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.23026398360038489,
|
|
"grad_norm": 4.462296962738037,
|
|
"learning_rate": 9.749254203415288e-06,
|
|
"loss": 0.7538,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.2316027276910848,
|
|
"grad_norm": 4.382823944091797,
|
|
"learning_rate": 9.743154383194422e-06,
|
|
"loss": 0.7489,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.2329414717817847,
|
|
"grad_norm": 5.393520355224609,
|
|
"learning_rate": 9.736983212571646e-06,
|
|
"loss": 0.7662,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.23428021587248463,
|
|
"grad_norm": 4.451732635498047,
|
|
"learning_rate": 9.730740784378755e-06,
|
|
"loss": 0.7403,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.23561895996318455,
|
|
"grad_norm": 4.634720325469971,
|
|
"learning_rate": 9.72442719251944e-06,
|
|
"loss": 0.7541,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.23695770405388444,
|
|
"grad_norm": 4.420860290527344,
|
|
"learning_rate": 9.718042531967918e-06,
|
|
"loss": 0.7468,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.23829644814458437,
|
|
"grad_norm": 3.7284321784973145,
|
|
"learning_rate": 9.711586898767462e-06,
|
|
"loss": 0.7577,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.23963519223528426,
|
|
"grad_norm": 5.270982265472412,
|
|
"learning_rate": 9.705060390028979e-06,
|
|
"loss": 0.7638,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.24097393632598418,
|
|
"grad_norm": 3.324500799179077,
|
|
"learning_rate": 9.698463103929542e-06,
|
|
"loss": 0.7449,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.24097393632598418,
|
|
"eval_loss": 0.4764781594276428,
|
|
"eval_runtime": 143.1418,
|
|
"eval_samples_per_second": 76.847,
|
|
"eval_steps_per_second": 9.606,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.2423126804166841,
|
|
"grad_norm": 3.796365976333618,
|
|
"learning_rate": 9.69179513971092e-06,
|
|
"loss": 0.7366,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.243651424507384,
|
|
"grad_norm": 3.6172876358032227,
|
|
"learning_rate": 9.685056597678075e-06,
|
|
"loss": 0.7636,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.24499016859808392,
|
|
"grad_norm": 3.682147979736328,
|
|
"learning_rate": 9.678247579197658e-06,
|
|
"loss": 0.7559,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.24632891268878385,
|
|
"grad_norm": 3.6523611545562744,
|
|
"learning_rate": 9.671368186696488e-06,
|
|
"loss": 0.7388,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.24766765677948374,
|
|
"grad_norm": 4.529437065124512,
|
|
"learning_rate": 9.664418523660004e-06,
|
|
"loss": 0.7505,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.24900640087018366,
|
|
"grad_norm": 3.691631317138672,
|
|
"learning_rate": 9.657398694630713e-06,
|
|
"loss": 0.7455,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.25034514496088356,
|
|
"grad_norm": 2.8902854919433594,
|
|
"learning_rate": 9.650308805206616e-06,
|
|
"loss": 0.7427,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.2516838890515835,
|
|
"grad_norm": 3.4010236263275146,
|
|
"learning_rate": 9.643148962039622e-06,
|
|
"loss": 0.746,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.2530226331422834,
|
|
"grad_norm": 3.878700017929077,
|
|
"learning_rate": 9.635919272833938e-06,
|
|
"loss": 0.745,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.2543613772329833,
|
|
"grad_norm": 3.759983777999878,
|
|
"learning_rate": 9.628619846344453e-06,
|
|
"loss": 0.7416,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.2557001213236832,
|
|
"grad_norm": 3.109137535095215,
|
|
"learning_rate": 9.6212507923751e-06,
|
|
"loss": 0.766,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.25703886541438314,
|
|
"grad_norm": 3.3560118675231934,
|
|
"learning_rate": 9.613812221777212e-06,
|
|
"loss": 0.7473,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.25837760950508304,
|
|
"grad_norm": 3.428746461868286,
|
|
"learning_rate": 9.60630424644784e-06,
|
|
"loss": 0.7482,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.25971635359578293,
|
|
"grad_norm": 3.476893901824951,
|
|
"learning_rate": 9.598726979328079e-06,
|
|
"loss": 0.7313,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.2610550976864829,
|
|
"grad_norm": 3.4949777126312256,
|
|
"learning_rate": 9.591080534401371e-06,
|
|
"loss": 0.7356,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.2623938417771828,
|
|
"grad_norm": 3.558475971221924,
|
|
"learning_rate": 9.583365026691785e-06,
|
|
"loss": 0.7382,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.2637325858678827,
|
|
"grad_norm": 4.724122047424316,
|
|
"learning_rate": 9.57558057226229e-06,
|
|
"loss": 0.7429,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.2650713299585826,
|
|
"grad_norm": 3.158379554748535,
|
|
"learning_rate": 9.567727288213005e-06,
|
|
"loss": 0.7358,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.2664100740492825,
|
|
"grad_norm": 3.3250861167907715,
|
|
"learning_rate": 9.559805292679445e-06,
|
|
"loss": 0.7629,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.2677488181399824,
|
|
"grad_norm": 3.297356367111206,
|
|
"learning_rate": 9.551814704830734e-06,
|
|
"loss": 0.7581,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.26908756223068236,
|
|
"grad_norm": 3.5801124572753906,
|
|
"learning_rate": 9.543755644867823e-06,
|
|
"loss": 0.7325,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.27042630632138226,
|
|
"grad_norm": 3.4547929763793945,
|
|
"learning_rate": 9.53562823402167e-06,
|
|
"loss": 0.741,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.27176505041208215,
|
|
"grad_norm": 4.123391151428223,
|
|
"learning_rate": 9.52743259455143e-06,
|
|
"loss": 0.7277,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.2731037945027821,
|
|
"grad_norm": 2.9932847023010254,
|
|
"learning_rate": 9.519168849742603e-06,
|
|
"loss": 0.7449,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.274442538593482,
|
|
"grad_norm": 3.8116295337677,
|
|
"learning_rate": 9.51083712390519e-06,
|
|
"loss": 0.7269,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.2757812826841819,
|
|
"grad_norm": 3.965240955352783,
|
|
"learning_rate": 9.502437542371812e-06,
|
|
"loss": 0.7322,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.2771200267748818,
|
|
"grad_norm": 4.292726993560791,
|
|
"learning_rate": 9.493970231495836e-06,
|
|
"loss": 0.7484,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.27845877086558174,
|
|
"grad_norm": 3.7208399772644043,
|
|
"learning_rate": 9.485435318649468e-06,
|
|
"loss": 0.7362,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.27979751495628163,
|
|
"grad_norm": 3.802112102508545,
|
|
"learning_rate": 9.476832932221835e-06,
|
|
"loss": 0.7478,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.28113625904698153,
|
|
"grad_norm": 3.4210598468780518,
|
|
"learning_rate": 9.468163201617063e-06,
|
|
"loss": 0.7506,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.2824750031376815,
|
|
"grad_norm": 3.579206943511963,
|
|
"learning_rate": 9.459426257252316e-06,
|
|
"loss": 0.7299,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.2838137472283814,
|
|
"grad_norm": 3.501737594604492,
|
|
"learning_rate": 9.450622230555849e-06,
|
|
"loss": 0.7392,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.28515249131908127,
|
|
"grad_norm": 4.062873363494873,
|
|
"learning_rate": 9.441751253965022e-06,
|
|
"loss": 0.724,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.2864912354097812,
|
|
"grad_norm": 4.042972087860107,
|
|
"learning_rate": 9.432813460924308e-06,
|
|
"loss": 0.7361,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.2878299795004811,
|
|
"grad_norm": 3.6642308235168457,
|
|
"learning_rate": 9.423808985883289e-06,
|
|
"loss": 0.7327,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.289168723591181,
|
|
"grad_norm": 5.152829170227051,
|
|
"learning_rate": 9.414737964294636e-06,
|
|
"loss": 0.7436,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.29050746768188096,
|
|
"grad_norm": 3.8207755088806152,
|
|
"learning_rate": 9.405600532612061e-06,
|
|
"loss": 0.745,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.29184621177258085,
|
|
"grad_norm": 4.000129699707031,
|
|
"learning_rate": 9.396396828288272e-06,
|
|
"loss": 0.7381,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.29318495586328075,
|
|
"grad_norm": 3.5032620429992676,
|
|
"learning_rate": 9.38712698977291e-06,
|
|
"loss": 0.753,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.29452369995398064,
|
|
"grad_norm": 3.847727060317993,
|
|
"learning_rate": 9.377791156510456e-06,
|
|
"loss": 0.7439,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.2958624440446806,
|
|
"grad_norm": 3.7372615337371826,
|
|
"learning_rate": 9.368389468938134e-06,
|
|
"loss": 0.7318,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.2972011881353805,
|
|
"grad_norm": 4.033111572265625,
|
|
"learning_rate": 9.358922068483813e-06,
|
|
"loss": 0.724,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.2985399322260804,
|
|
"grad_norm": 3.427645683288574,
|
|
"learning_rate": 9.349389097563858e-06,
|
|
"loss": 0.7375,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.29987867631678033,
|
|
"grad_norm": 3.5624077320098877,
|
|
"learning_rate": 9.339790699581004e-06,
|
|
"loss": 0.729,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.30121742040748023,
|
|
"grad_norm": 3.7358171939849854,
|
|
"learning_rate": 9.330127018922195e-06,
|
|
"loss": 0.7372,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.3025561644981801,
|
|
"grad_norm": 4.112128734588623,
|
|
"learning_rate": 9.320398200956403e-06,
|
|
"loss": 0.7504,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.3038949085888801,
|
|
"grad_norm": 3.6609609127044678,
|
|
"learning_rate": 9.310604392032457e-06,
|
|
"loss": 0.7352,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.30523365267957997,
|
|
"grad_norm": 3.046917200088501,
|
|
"learning_rate": 9.30074573947683e-06,
|
|
"loss": 0.7466,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.30657239677027986,
|
|
"grad_norm": 3.5731890201568604,
|
|
"learning_rate": 9.290822391591418e-06,
|
|
"loss": 0.7455,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.3079111408609798,
|
|
"grad_norm": 4.293703556060791,
|
|
"learning_rate": 9.280834497651334e-06,
|
|
"loss": 0.7235,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.3092498849516797,
|
|
"grad_norm": 4.142804145812988,
|
|
"learning_rate": 9.27078220790263e-06,
|
|
"loss": 0.7353,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.3105886290423796,
|
|
"grad_norm": 3.4866645336151123,
|
|
"learning_rate": 9.260665673560058e-06,
|
|
"loss": 0.725,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.31192737313307956,
|
|
"grad_norm": 3.9773788452148438,
|
|
"learning_rate": 9.25048504680479e-06,
|
|
"loss": 0.7259,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.31326611722377945,
|
|
"grad_norm": 3.6128592491149902,
|
|
"learning_rate": 9.24024048078213e-06,
|
|
"loss": 0.72,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.31460486131447934,
|
|
"grad_norm": 14.768600463867188,
|
|
"learning_rate": 9.229932129599206e-06,
|
|
"loss": 0.742,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.31594360540517924,
|
|
"grad_norm": 3.8117377758026123,
|
|
"learning_rate": 9.219560148322655e-06,
|
|
"loss": 0.7273,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.3172823494958792,
|
|
"grad_norm": 3.699889659881592,
|
|
"learning_rate": 9.209124692976287e-06,
|
|
"loss": 0.7249,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.3186210935865791,
|
|
"grad_norm": 4.186975479125977,
|
|
"learning_rate": 9.19862592053875e-06,
|
|
"loss": 0.7386,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.319959837677279,
|
|
"grad_norm": 3.8177313804626465,
|
|
"learning_rate": 9.188063988941147e-06,
|
|
"loss": 0.7251,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.32129858176797893,
|
|
"grad_norm": 3.5190351009368896,
|
|
"learning_rate": 9.177439057064684e-06,
|
|
"loss": 0.743,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.3226373258586788,
|
|
"grad_norm": 3.8348472118377686,
|
|
"learning_rate": 9.166751284738258e-06,
|
|
"loss": 0.7379,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.3239760699493787,
|
|
"grad_norm": 3.561465263366699,
|
|
"learning_rate": 9.156000832736073e-06,
|
|
"loss": 0.7426,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.32531481404007867,
|
|
"grad_norm": 3.0603857040405273,
|
|
"learning_rate": 9.145187862775208e-06,
|
|
"loss": 0.732,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.32665355813077857,
|
|
"grad_norm": 4.3121209144592285,
|
|
"learning_rate": 9.134312537513188e-06,
|
|
"loss": 0.7237,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.32799230222147846,
|
|
"grad_norm": 4.397933483123779,
|
|
"learning_rate": 9.123375020545534e-06,
|
|
"loss": 0.7347,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.3293310463121784,
|
|
"grad_norm": 4.270680904388428,
|
|
"learning_rate": 9.112375476403313e-06,
|
|
"loss": 0.725,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.3306697904028783,
|
|
"grad_norm": 3.8548951148986816,
|
|
"learning_rate": 9.101314070550647e-06,
|
|
"loss": 0.723,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.3320085344935782,
|
|
"grad_norm": 4.533010959625244,
|
|
"learning_rate": 9.09019096938224e-06,
|
|
"loss": 0.7385,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.33334727858427815,
|
|
"grad_norm": 4.1144938468933105,
|
|
"learning_rate": 9.079006340220862e-06,
|
|
"loss": 0.727,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.33468602267497805,
|
|
"grad_norm": 4.0266876220703125,
|
|
"learning_rate": 9.067760351314838e-06,
|
|
"loss": 0.7209,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.33602476676567794,
|
|
"grad_norm": 4.5649871826171875,
|
|
"learning_rate": 9.056453171835523e-06,
|
|
"loss": 0.7245,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.33736351085637784,
|
|
"grad_norm": 3.916438102722168,
|
|
"learning_rate": 9.045084971874738e-06,
|
|
"loss": 0.7277,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.3387022549470778,
|
|
"grad_norm": 3.820436477661133,
|
|
"learning_rate": 9.033655922442235e-06,
|
|
"loss": 0.7141,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.3400409990377777,
|
|
"grad_norm": 3.6624293327331543,
|
|
"learning_rate": 9.022166195463112e-06,
|
|
"loss": 0.7307,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.3413797431284776,
|
|
"grad_norm": 3.745054006576538,
|
|
"learning_rate": 9.01061596377522e-06,
|
|
"loss": 0.7162,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.3427184872191775,
|
|
"grad_norm": 4.034268856048584,
|
|
"learning_rate": 8.99900540112658e-06,
|
|
"loss": 0.7359,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.3440572313098774,
|
|
"grad_norm": 3.763793706893921,
|
|
"learning_rate": 8.987334682172759e-06,
|
|
"loss": 0.7106,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.3453959754005773,
|
|
"grad_norm": 4.49244499206543,
|
|
"learning_rate": 8.97560398247424e-06,
|
|
"loss": 0.7349,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.34673471949127727,
|
|
"grad_norm": 4.118375301361084,
|
|
"learning_rate": 8.963813478493788e-06,
|
|
"loss": 0.7167,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.34807346358197716,
|
|
"grad_norm": 3.4959819316864014,
|
|
"learning_rate": 8.951963347593797e-06,
|
|
"loss": 0.7303,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.34941220767267706,
|
|
"grad_norm": 3.091417074203491,
|
|
"learning_rate": 8.94005376803361e-06,
|
|
"loss": 0.726,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.350750951763377,
|
|
"grad_norm": 3.908771276473999,
|
|
"learning_rate": 8.92808491896685e-06,
|
|
"loss": 0.7269,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.3520896958540769,
|
|
"grad_norm": 3.536282777786255,
|
|
"learning_rate": 8.916056980438723e-06,
|
|
"loss": 0.7301,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.3534284399447768,
|
|
"grad_norm": 3.997955799102783,
|
|
"learning_rate": 8.903970133383297e-06,
|
|
"loss": 0.7197,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.35476718403547675,
|
|
"grad_norm": 3.2468626499176025,
|
|
"learning_rate": 8.891824559620801e-06,
|
|
"loss": 0.7265,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.35610592812617664,
|
|
"grad_norm": 3.5392544269561768,
|
|
"learning_rate": 8.879620441854873e-06,
|
|
"loss": 0.7156,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.35744467221687654,
|
|
"grad_norm": 4.095037460327148,
|
|
"learning_rate": 8.867357963669821e-06,
|
|
"loss": 0.7314,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.35878341630757643,
|
|
"grad_norm": 4.159110069274902,
|
|
"learning_rate": 8.855037309527854e-06,
|
|
"loss": 0.736,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.3601221603982764,
|
|
"grad_norm": 4.023700714111328,
|
|
"learning_rate": 8.842658664766317e-06,
|
|
"loss": 0.7305,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.3614609044889763,
|
|
"grad_norm": 5.768006801605225,
|
|
"learning_rate": 8.83022221559489e-06,
|
|
"loss": 0.7446,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.3614609044889763,
|
|
"eval_loss": 0.4720214903354645,
|
|
"eval_runtime": 143.3304,
|
|
"eval_samples_per_second": 76.746,
|
|
"eval_steps_per_second": 9.593,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.36279964857967617,
|
|
"grad_norm": 3.865347146987915,
|
|
"learning_rate": 8.817728149092803e-06,
|
|
"loss": 0.7324,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.3641383926703761,
|
|
"grad_norm": 3.804232120513916,
|
|
"learning_rate": 8.805176653206004e-06,
|
|
"loss": 0.7216,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.365477136761076,
|
|
"grad_norm": 3.813936233520508,
|
|
"learning_rate": 8.792567916744346e-06,
|
|
"loss": 0.7352,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.3668158808517759,
|
|
"grad_norm": 3.221403121948242,
|
|
"learning_rate": 8.77990212937874e-06,
|
|
"loss": 0.7288,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.36815462494247586,
|
|
"grad_norm": 4.729770660400391,
|
|
"learning_rate": 8.767179481638303e-06,
|
|
"loss": 0.7198,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.36949336903317576,
|
|
"grad_norm": 4.238366603851318,
|
|
"learning_rate": 8.754400164907496e-06,
|
|
"loss": 0.7265,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.37083211312387565,
|
|
"grad_norm": 3.8911354541778564,
|
|
"learning_rate": 8.741564371423235e-06,
|
|
"loss": 0.7191,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.3721708572145756,
|
|
"grad_norm": 5.561570644378662,
|
|
"learning_rate": 8.728672294272009e-06,
|
|
"loss": 0.7288,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.3735096013052755,
|
|
"grad_norm": 3.7610480785369873,
|
|
"learning_rate": 8.715724127386971e-06,
|
|
"loss": 0.7229,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.3748483453959754,
|
|
"grad_norm": 3.495746612548828,
|
|
"learning_rate": 8.702720065545024e-06,
|
|
"loss": 0.7201,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.3761870894866753,
|
|
"grad_norm": 3.7154171466827393,
|
|
"learning_rate": 8.689660304363883e-06,
|
|
"loss": 0.7294,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.37752583357737524,
|
|
"grad_norm": 3.9383347034454346,
|
|
"learning_rate": 8.676545040299145e-06,
|
|
"loss": 0.7287,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.37886457766807513,
|
|
"grad_norm": 3.6349284648895264,
|
|
"learning_rate": 8.663374470641319e-06,
|
|
"loss": 0.7321,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.380203321758775,
|
|
"grad_norm": 4.659618854522705,
|
|
"learning_rate": 8.650148793512874e-06,
|
|
"loss": 0.7173,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.381542065849475,
|
|
"grad_norm": 5.062843322753906,
|
|
"learning_rate": 8.636868207865244e-06,
|
|
"loss": 0.7302,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.38288080994017487,
|
|
"grad_norm": 3.964306354522705,
|
|
"learning_rate": 8.623532913475847e-06,
|
|
"loss": 0.7344,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.38421955403087477,
|
|
"grad_norm": 5.844438552856445,
|
|
"learning_rate": 8.610143110945068e-06,
|
|
"loss": 0.7385,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.3855582981215747,
|
|
"grad_norm": 6.338469505310059,
|
|
"learning_rate": 8.596699001693257e-06,
|
|
"loss": 0.7172,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.3868970422122746,
|
|
"grad_norm": 3.85361647605896,
|
|
"learning_rate": 8.58320078795768e-06,
|
|
"loss": 0.714,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.3882357863029745,
|
|
"grad_norm": 3.434246063232422,
|
|
"learning_rate": 8.569648672789496e-06,
|
|
"loss": 0.7352,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.38957453039367446,
|
|
"grad_norm": 3.7169744968414307,
|
|
"learning_rate": 8.556042860050686e-06,
|
|
"loss": 0.7197,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.39091327448437435,
|
|
"grad_norm": 3.8895809650421143,
|
|
"learning_rate": 8.542383554411e-06,
|
|
"loss": 0.723,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.39225201857507425,
|
|
"grad_norm": 3.972959041595459,
|
|
"learning_rate": 8.528670961344866e-06,
|
|
"loss": 0.7352,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.3935907626657742,
|
|
"grad_norm": 3.263845682144165,
|
|
"learning_rate": 8.51490528712831e-06,
|
|
"loss": 0.7153,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.3949295067564741,
|
|
"grad_norm": 3.272479772567749,
|
|
"learning_rate": 8.501086738835843e-06,
|
|
"loss": 0.7168,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.396268250847174,
|
|
"grad_norm": 3.9211878776550293,
|
|
"learning_rate": 8.487215524337357e-06,
|
|
"loss": 0.7212,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.3976069949378739,
|
|
"grad_norm": 3.7852578163146973,
|
|
"learning_rate": 8.473291852294986e-06,
|
|
"loss": 0.7262,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.39894573902857383,
|
|
"grad_norm": 4.1221208572387695,
|
|
"learning_rate": 8.45931593215998e-06,
|
|
"loss": 0.7254,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.4002844831192737,
|
|
"grad_norm": 3.474747896194458,
|
|
"learning_rate": 8.44528797416954e-06,
|
|
"loss": 0.7091,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.4016232272099736,
|
|
"grad_norm": 3.9018914699554443,
|
|
"learning_rate": 8.43120818934367e-06,
|
|
"loss": 0.7216,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.40296197130067357,
|
|
"grad_norm": 2.807328462600708,
|
|
"learning_rate": 8.417076789481985e-06,
|
|
"loss": 0.7136,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.40430071539137347,
|
|
"grad_norm": 3.5140132904052734,
|
|
"learning_rate": 8.402893987160553e-06,
|
|
"loss": 0.7259,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.40563945948207336,
|
|
"grad_norm": 3.6685054302215576,
|
|
"learning_rate": 8.388659995728662e-06,
|
|
"loss": 0.7264,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.4069782035727733,
|
|
"grad_norm": 3.668884515762329,
|
|
"learning_rate": 8.37437502930564e-06,
|
|
"loss": 0.7338,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.4083169476634732,
|
|
"grad_norm": 4.457537651062012,
|
|
"learning_rate": 8.360039302777614e-06,
|
|
"loss": 0.7187,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.4096556917541731,
|
|
"grad_norm": 3.7618181705474854,
|
|
"learning_rate": 8.345653031794292e-06,
|
|
"loss": 0.725,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.41099443584487305,
|
|
"grad_norm": 3.5488483905792236,
|
|
"learning_rate": 8.331216432765714e-06,
|
|
"loss": 0.6975,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.41233317993557295,
|
|
"grad_norm": 3.7740087509155273,
|
|
"learning_rate": 8.316729722858987e-06,
|
|
"loss": 0.7213,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.41367192402627284,
|
|
"grad_norm": 3.781684637069702,
|
|
"learning_rate": 8.302193119995038e-06,
|
|
"loss": 0.7162,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.4150106681169728,
|
|
"grad_norm": 3.610675573348999,
|
|
"learning_rate": 8.28760684284532e-06,
|
|
"loss": 0.7179,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.4163494122076727,
|
|
"grad_norm": 3.895317316055298,
|
|
"learning_rate": 8.272971110828521e-06,
|
|
"loss": 0.7326,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.4176881562983726,
|
|
"grad_norm": 3.395089626312256,
|
|
"learning_rate": 8.258286144107277e-06,
|
|
"loss": 0.7044,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.4190269003890725,
|
|
"grad_norm": 3.109924793243408,
|
|
"learning_rate": 8.243552163584851e-06,
|
|
"loss": 0.7089,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.4203656444797724,
|
|
"grad_norm": 3.788464069366455,
|
|
"learning_rate": 8.228769390901812e-06,
|
|
"loss": 0.7089,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.4217043885704723,
|
|
"grad_norm": 4.55291223526001,
|
|
"learning_rate": 8.213938048432697e-06,
|
|
"loss": 0.708,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.4230431326611722,
|
|
"grad_norm": 3.502070903778076,
|
|
"learning_rate": 8.199058359282675e-06,
|
|
"loss": 0.7044,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.42438187675187217,
|
|
"grad_norm": 4.0649003982543945,
|
|
"learning_rate": 8.18413054728418e-06,
|
|
"loss": 0.7232,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.42572062084257206,
|
|
"grad_norm": 3.9198923110961914,
|
|
"learning_rate": 8.16915483699355e-06,
|
|
"loss": 0.726,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.42705936493327196,
|
|
"grad_norm": 4.080833911895752,
|
|
"learning_rate": 8.154131453687657e-06,
|
|
"loss": 0.7159,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.4283981090239719,
|
|
"grad_norm": 3.5025393962860107,
|
|
"learning_rate": 8.139060623360494e-06,
|
|
"loss": 0.7153,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.4297368531146718,
|
|
"grad_norm": 5.061036586761475,
|
|
"learning_rate": 8.123942572719801e-06,
|
|
"loss": 0.7234,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.4310755972053717,
|
|
"grad_norm": 3.5028905868530273,
|
|
"learning_rate": 8.108777529183644e-06,
|
|
"loss": 0.7117,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.43241434129607165,
|
|
"grad_norm": 3.6330597400665283,
|
|
"learning_rate": 8.093565720876994e-06,
|
|
"loss": 0.7297,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.43375308538677154,
|
|
"grad_norm": 3.717942237854004,
|
|
"learning_rate": 8.078307376628292e-06,
|
|
"loss": 0.7092,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.43509182947747144,
|
|
"grad_norm": 4.931020259857178,
|
|
"learning_rate": 8.063002725966014e-06,
|
|
"loss": 0.7264,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.43643057356817133,
|
|
"grad_norm": 3.883100748062134,
|
|
"learning_rate": 8.047651999115216e-06,
|
|
"loss": 0.7324,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.4377693176588713,
|
|
"grad_norm": 4.687985420227051,
|
|
"learning_rate": 8.032255426994069e-06,
|
|
"loss": 0.7184,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.4391080617495712,
|
|
"grad_norm": 3.9309306144714355,
|
|
"learning_rate": 8.01681324121038e-06,
|
|
"loss": 0.7316,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.44044680584027107,
|
|
"grad_norm": 3.553938865661621,
|
|
"learning_rate": 8.001325674058124e-06,
|
|
"loss": 0.724,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.441785549930971,
|
|
"grad_norm": 4.068758487701416,
|
|
"learning_rate": 7.985792958513932e-06,
|
|
"loss": 0.706,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.4431242940216709,
|
|
"grad_norm": 3.4960126876831055,
|
|
"learning_rate": 7.970215328233597e-06,
|
|
"loss": 0.7126,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.4444630381123708,
|
|
"grad_norm": 5.269049167633057,
|
|
"learning_rate": 7.954593017548557e-06,
|
|
"loss": 0.7107,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.44580178220307076,
|
|
"grad_norm": 4.093947887420654,
|
|
"learning_rate": 7.938926261462366e-06,
|
|
"loss": 0.7271,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.44714052629377066,
|
|
"grad_norm": 3.2673420906066895,
|
|
"learning_rate": 7.923215295647167e-06,
|
|
"loss": 0.7239,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.44847927038447055,
|
|
"grad_norm": 3.5432372093200684,
|
|
"learning_rate": 7.907460356440133e-06,
|
|
"loss": 0.7212,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.4498180144751705,
|
|
"grad_norm": 4.167123794555664,
|
|
"learning_rate": 7.891661680839932e-06,
|
|
"loss": 0.7129,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.4511567585658704,
|
|
"grad_norm": 3.7853057384490967,
|
|
"learning_rate": 7.875819506503145e-06,
|
|
"loss": 0.7089,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.4524955026565703,
|
|
"grad_norm": 4.287417411804199,
|
|
"learning_rate": 7.859934071740693e-06,
|
|
"loss": 0.7201,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.45383424674727024,
|
|
"grad_norm": 4.353424549102783,
|
|
"learning_rate": 7.84400561551426e-06,
|
|
"loss": 0.7125,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.45517299083797014,
|
|
"grad_norm": 3.55268931388855,
|
|
"learning_rate": 7.828034377432694e-06,
|
|
"loss": 0.7108,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.45651173492867003,
|
|
"grad_norm": 3.5031793117523193,
|
|
"learning_rate": 7.8120205977484e-06,
|
|
"loss": 0.7267,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.4578504790193699,
|
|
"grad_norm": 4.873944282531738,
|
|
"learning_rate": 7.795964517353734e-06,
|
|
"loss": 0.7284,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.4591892231100699,
|
|
"grad_norm": 4.078751087188721,
|
|
"learning_rate": 7.779866377777367e-06,
|
|
"loss": 0.7025,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.46052796720076977,
|
|
"grad_norm": 3.1843979358673096,
|
|
"learning_rate": 7.763726421180664e-06,
|
|
"loss": 0.6866,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.46186671129146967,
|
|
"grad_norm": 4.79196834564209,
|
|
"learning_rate": 7.747544890354031e-06,
|
|
"loss": 0.7342,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.4632054553821696,
|
|
"grad_norm": 3.4191689491271973,
|
|
"learning_rate": 7.73132202871327e-06,
|
|
"loss": 0.7136,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.4645441994728695,
|
|
"grad_norm": 3.4719226360321045,
|
|
"learning_rate": 7.715058080295918e-06,
|
|
"loss": 0.7175,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.4658829435635694,
|
|
"grad_norm": 3.585686683654785,
|
|
"learning_rate": 7.698753289757565e-06,
|
|
"loss": 0.7234,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.46722168765426936,
|
|
"grad_norm": 3.6738579273223877,
|
|
"learning_rate": 7.68240790236819e-06,
|
|
"loss": 0.7205,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.46856043174496925,
|
|
"grad_norm": 3.361675977706909,
|
|
"learning_rate": 7.666022164008458e-06,
|
|
"loss": 0.6995,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.46989917583566915,
|
|
"grad_norm": 4.34644889831543,
|
|
"learning_rate": 7.649596321166024e-06,
|
|
"loss": 0.7278,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.4712379199263691,
|
|
"grad_norm": 4.128347873687744,
|
|
"learning_rate": 7.633130620931837e-06,
|
|
"loss": 0.7103,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.472576664017069,
|
|
"grad_norm": 4.6785173416137695,
|
|
"learning_rate": 7.616625310996405e-06,
|
|
"loss": 0.6994,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.4739154081077689,
|
|
"grad_norm": 4.666531085968018,
|
|
"learning_rate": 7.600080639646077e-06,
|
|
"loss": 0.7196,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.47525415219846884,
|
|
"grad_norm": 3.695772171020508,
|
|
"learning_rate": 7.5834968557593155e-06,
|
|
"loss": 0.7196,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.47659289628916873,
|
|
"grad_norm": 3.8881545066833496,
|
|
"learning_rate": 7.566874208802939e-06,
|
|
"loss": 0.7122,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.4779316403798686,
|
|
"grad_norm": 21.495349884033203,
|
|
"learning_rate": 7.550212948828377e-06,
|
|
"loss": 0.7193,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.4792703844705685,
|
|
"grad_norm": 3.7887818813323975,
|
|
"learning_rate": 7.533513326467911e-06,
|
|
"loss": 0.7171,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.4806091285612685,
|
|
"grad_norm": 3.5149006843566895,
|
|
"learning_rate": 7.5167755929309e-06,
|
|
"loss": 0.7035,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.48194787265196837,
|
|
"grad_norm": 3.4718422889709473,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 0.7136,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.48194787265196837,
|
|
"eval_loss": 0.463682621717453,
|
|
"eval_runtime": 143.2025,
|
|
"eval_samples_per_second": 76.814,
|
|
"eval_steps_per_second": 9.602,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.48328661674266826,
|
|
"grad_norm": 4.267759799957275,
|
|
"learning_rate": 7.483186800027381e-06,
|
|
"loss": 0.7143,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.4846253608333682,
|
|
"grad_norm": 3.220227003097534,
|
|
"learning_rate": 7.466336245930927e-06,
|
|
"loss": 0.7196,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.4859641049240681,
|
|
"grad_norm": 3.646042823791504,
|
|
"learning_rate": 7.449448591190436e-06,
|
|
"loss": 0.7209,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.487302849014768,
|
|
"grad_norm": 3.972449779510498,
|
|
"learning_rate": 7.4325240898438e-06,
|
|
"loss": 0.7045,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.48864159310546795,
|
|
"grad_norm": 4.244483470916748,
|
|
"learning_rate": 7.415562996483193e-06,
|
|
"loss": 0.7162,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.48998033719616785,
|
|
"grad_norm": 5.198873519897461,
|
|
"learning_rate": 7.398565566251232e-06,
|
|
"loss": 0.7339,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.49131908128686774,
|
|
"grad_norm": 3.7421281337738037,
|
|
"learning_rate": 7.381532054837145e-06,
|
|
"loss": 0.6877,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.4926578253775677,
|
|
"grad_norm": 3.299971103668213,
|
|
"learning_rate": 7.364462718472919e-06,
|
|
"loss": 0.7077,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.4939965694682676,
|
|
"grad_norm": 3.0541770458221436,
|
|
"learning_rate": 7.347357813929455e-06,
|
|
"loss": 0.7368,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.4953353135589675,
|
|
"grad_norm": 4.77625036239624,
|
|
"learning_rate": 7.330217598512696e-06,
|
|
"loss": 0.7062,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.49667405764966743,
|
|
"grad_norm": 3.224484920501709,
|
|
"learning_rate": 7.3130423300597575e-06,
|
|
"loss": 0.7159,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.49801280174036733,
|
|
"grad_norm": 11.443235397338867,
|
|
"learning_rate": 7.295832266935059e-06,
|
|
"loss": 0.7393,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.4993515458310672,
|
|
"grad_norm": 3.5304412841796875,
|
|
"learning_rate": 7.278587668026422e-06,
|
|
"loss": 0.7124,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.5006902899217671,
|
|
"grad_norm": 3.908724546432495,
|
|
"learning_rate": 7.2613087927411885e-06,
|
|
"loss": 0.7181,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.502029034012467,
|
|
"grad_norm": 2.7556068897247314,
|
|
"learning_rate": 7.243995901002312e-06,
|
|
"loss": 0.721,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.503367778103167,
|
|
"grad_norm": 3.7599422931671143,
|
|
"learning_rate": 7.226649253244448e-06,
|
|
"loss": 0.7311,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.5047065221938669,
|
|
"grad_norm": 3.5529394149780273,
|
|
"learning_rate": 7.20926911041004e-06,
|
|
"loss": 0.7262,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.5060452662845668,
|
|
"grad_norm": 4.2093281745910645,
|
|
"learning_rate": 7.191855733945388e-06,
|
|
"loss": 0.699,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.5073840103752667,
|
|
"grad_norm": 3.220139980316162,
|
|
"learning_rate": 7.174409385796726e-06,
|
|
"loss": 0.695,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.5087227544659666,
|
|
"grad_norm": 3.54583477973938,
|
|
"learning_rate": 7.156930328406268e-06,
|
|
"loss": 0.7183,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.5100614985566665,
|
|
"grad_norm": 3.4133760929107666,
|
|
"learning_rate": 7.1394188247082715e-06,
|
|
"loss": 0.7145,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.5114002426473664,
|
|
"grad_norm": 3.8031890392303467,
|
|
"learning_rate": 7.121875138125077e-06,
|
|
"loss": 0.7197,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.5127389867380664,
|
|
"grad_norm": 4.096649646759033,
|
|
"learning_rate": 7.104299532563146e-06,
|
|
"loss": 0.7192,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.5140777308287663,
|
|
"grad_norm": 4.508788585662842,
|
|
"learning_rate": 7.08669227240909e-06,
|
|
"loss": 0.7312,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.5154164749194662,
|
|
"grad_norm": 4.228316307067871,
|
|
"learning_rate": 7.069053622525697e-06,
|
|
"loss": 0.718,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.5167552190101661,
|
|
"grad_norm": 3.8785269260406494,
|
|
"learning_rate": 7.0513838482479424e-06,
|
|
"loss": 0.714,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.518093963100866,
|
|
"grad_norm": 3.532994270324707,
|
|
"learning_rate": 7.033683215379002e-06,
|
|
"loss": 0.7132,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.5194327071915659,
|
|
"grad_norm": 2.9710559844970703,
|
|
"learning_rate": 7.0159519901862515e-06,
|
|
"loss": 0.6966,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.5207714512822659,
|
|
"grad_norm": 3.068615436553955,
|
|
"learning_rate": 6.998190439397262e-06,
|
|
"loss": 0.6989,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.5221101953729658,
|
|
"grad_norm": 4.222218990325928,
|
|
"learning_rate": 6.980398830195785e-06,
|
|
"loss": 0.7249,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.5234489394636657,
|
|
"grad_norm": 7.726329803466797,
|
|
"learning_rate": 6.962577430217736e-06,
|
|
"loss": 0.7347,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.5247876835543656,
|
|
"grad_norm": 3.2588653564453125,
|
|
"learning_rate": 6.944726507547169e-06,
|
|
"loss": 0.6975,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.5261264276450655,
|
|
"grad_norm": 3.771385669708252,
|
|
"learning_rate": 6.9268463307122425e-06,
|
|
"loss": 0.6987,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.5274651717357653,
|
|
"grad_norm": 4.149131774902344,
|
|
"learning_rate": 6.908937168681176e-06,
|
|
"loss": 0.7108,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.5288039158264652,
|
|
"grad_norm": 4.3986287117004395,
|
|
"learning_rate": 6.890999290858213e-06,
|
|
"loss": 0.714,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.5301426599171652,
|
|
"grad_norm": 4.018884181976318,
|
|
"learning_rate": 6.873032967079562e-06,
|
|
"loss": 0.7044,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.5314814040078651,
|
|
"grad_norm": 4.179384231567383,
|
|
"learning_rate": 6.8550384676093355e-06,
|
|
"loss": 0.7079,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.532820148098565,
|
|
"grad_norm": 4.152440547943115,
|
|
"learning_rate": 6.837016063135491e-06,
|
|
"loss": 0.7188,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.5341588921892649,
|
|
"grad_norm": 5.560712814331055,
|
|
"learning_rate": 6.818966024765758e-06,
|
|
"loss": 0.6946,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.5354976362799648,
|
|
"grad_norm": 3.4238390922546387,
|
|
"learning_rate": 6.800888624023552e-06,
|
|
"loss": 0.7041,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.5368363803706647,
|
|
"grad_norm": 4.128967761993408,
|
|
"learning_rate": 6.782784132843901e-06,
|
|
"loss": 0.7158,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.5381751244613647,
|
|
"grad_norm": 3.747835636138916,
|
|
"learning_rate": 6.7646528235693445e-06,
|
|
"loss": 0.6969,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.5395138685520646,
|
|
"grad_norm": 4.874392509460449,
|
|
"learning_rate": 6.746494968945847e-06,
|
|
"loss": 0.6815,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.5408526126427645,
|
|
"grad_norm": 4.240722179412842,
|
|
"learning_rate": 6.7283108421186835e-06,
|
|
"loss": 0.7016,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.5421913567334644,
|
|
"grad_norm": 4.665043354034424,
|
|
"learning_rate": 6.710100716628345e-06,
|
|
"loss": 0.7223,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.5435301008241643,
|
|
"grad_norm": 3.7810027599334717,
|
|
"learning_rate": 6.691864866406407e-06,
|
|
"loss": 0.7239,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.5448688449148642,
|
|
"grad_norm": 3.5646510124206543,
|
|
"learning_rate": 6.6736035657714235e-06,
|
|
"loss": 0.7052,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.5462075890055642,
|
|
"grad_norm": 3.773944139480591,
|
|
"learning_rate": 6.655317089424791e-06,
|
|
"loss": 0.7147,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.5475463330962641,
|
|
"grad_norm": 3.6827080249786377,
|
|
"learning_rate": 6.637005712446622e-06,
|
|
"loss": 0.7093,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.548885077186964,
|
|
"grad_norm": 4.169469833374023,
|
|
"learning_rate": 6.618669710291607e-06,
|
|
"loss": 0.7068,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.5502238212776639,
|
|
"grad_norm": 3.9292285442352295,
|
|
"learning_rate": 6.600309358784858e-06,
|
|
"loss": 0.7267,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.5515625653683638,
|
|
"grad_norm": 3.827451467514038,
|
|
"learning_rate": 6.581924934117783e-06,
|
|
"loss": 0.7212,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.5529013094590637,
|
|
"grad_norm": 4.22733736038208,
|
|
"learning_rate": 6.56351671284391e-06,
|
|
"loss": 0.7178,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.5542400535497636,
|
|
"grad_norm": 4.720992088317871,
|
|
"learning_rate": 6.545084971874738e-06,
|
|
"loss": 0.7173,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.5555787976404636,
|
|
"grad_norm": 6.291136264801025,
|
|
"learning_rate": 6.526629988475567e-06,
|
|
"loss": 0.7151,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.5569175417311635,
|
|
"grad_norm": 4.103168964385986,
|
|
"learning_rate": 6.508152040261329e-06,
|
|
"loss": 0.6945,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.5582562858218634,
|
|
"grad_norm": 7.984721660614014,
|
|
"learning_rate": 6.48965140519241e-06,
|
|
"loss": 0.6906,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.5595950299125633,
|
|
"grad_norm": 4.0510993003845215,
|
|
"learning_rate": 6.4711283615704755e-06,
|
|
"loss": 0.7175,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.5609337740032632,
|
|
"grad_norm": 3.76582932472229,
|
|
"learning_rate": 6.452583188034275e-06,
|
|
"loss": 0.7095,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.5622725180939631,
|
|
"grad_norm": 3.8622148036956787,
|
|
"learning_rate": 6.434016163555452e-06,
|
|
"loss": 0.6823,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.5636112621846631,
|
|
"grad_norm": 3.941279411315918,
|
|
"learning_rate": 6.415427567434353e-06,
|
|
"loss": 0.6995,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.564950006275363,
|
|
"grad_norm": 6.000927925109863,
|
|
"learning_rate": 6.396817679295823e-06,
|
|
"loss": 0.7041,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.5662887503660629,
|
|
"grad_norm": 3.4409406185150146,
|
|
"learning_rate": 6.378186779084996e-06,
|
|
"loss": 0.7052,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.5676274944567627,
|
|
"grad_norm": 3.7581064701080322,
|
|
"learning_rate": 6.359535147063092e-06,
|
|
"loss": 0.7169,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.5689662385474626,
|
|
"grad_norm": 5.163430690765381,
|
|
"learning_rate": 6.340863063803187e-06,
|
|
"loss": 0.715,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.5703049826381625,
|
|
"grad_norm": 4.024590492248535,
|
|
"learning_rate": 6.322170810186013e-06,
|
|
"loss": 0.7025,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.5716437267288624,
|
|
"grad_norm": 3.8056271076202393,
|
|
"learning_rate": 6.3034586673957075e-06,
|
|
"loss": 0.6964,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.5729824708195624,
|
|
"grad_norm": 4.615815162658691,
|
|
"learning_rate": 6.284726916915611e-06,
|
|
"loss": 0.7254,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.5743212149102623,
|
|
"grad_norm": 4.339734077453613,
|
|
"learning_rate": 6.26597584052401e-06,
|
|
"loss": 0.704,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.5756599590009622,
|
|
"grad_norm": 4.46190881729126,
|
|
"learning_rate": 6.247205720289907e-06,
|
|
"loss": 0.7151,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.5769987030916621,
|
|
"grad_norm": 4.107763290405273,
|
|
"learning_rate": 6.228416838568782e-06,
|
|
"loss": 0.7285,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.578337447182362,
|
|
"grad_norm": 7.636148452758789,
|
|
"learning_rate": 6.209609477998339e-06,
|
|
"loss": 0.7185,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.5796761912730619,
|
|
"grad_norm": 3.7601823806762695,
|
|
"learning_rate": 6.190783921494255e-06,
|
|
"loss": 0.7106,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.5810149353637619,
|
|
"grad_norm": 3.318302869796753,
|
|
"learning_rate": 6.171940452245923e-06,
|
|
"loss": 0.7127,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.5823536794544618,
|
|
"grad_norm": 4.105878829956055,
|
|
"learning_rate": 6.153079353712201e-06,
|
|
"loss": 0.6965,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.5836924235451617,
|
|
"grad_norm": 4.308773517608643,
|
|
"learning_rate": 6.134200909617135e-06,
|
|
"loss": 0.7116,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.5850311676358616,
|
|
"grad_norm": 6.789102077484131,
|
|
"learning_rate": 6.115305403945697e-06,
|
|
"loss": 0.7124,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.5863699117265615,
|
|
"grad_norm": 3.4282071590423584,
|
|
"learning_rate": 6.0963931209395165e-06,
|
|
"loss": 0.7076,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.5877086558172614,
|
|
"grad_norm": 4.03810977935791,
|
|
"learning_rate": 6.077464345092601e-06,
|
|
"loss": 0.7036,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.5890473999079613,
|
|
"grad_norm": 4.613780498504639,
|
|
"learning_rate": 6.058519361147055e-06,
|
|
"loss": 0.7102,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.5903861439986613,
|
|
"grad_norm": 3.8465452194213867,
|
|
"learning_rate": 6.039558454088796e-06,
|
|
"loss": 0.7164,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.5917248880893612,
|
|
"grad_norm": 4.950937271118164,
|
|
"learning_rate": 6.020581909143279e-06,
|
|
"loss": 0.7177,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.5930636321800611,
|
|
"grad_norm": 4.751613616943359,
|
|
"learning_rate": 6.001590011771188e-06,
|
|
"loss": 0.7318,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.594402376270761,
|
|
"grad_norm": 5.5535478591918945,
|
|
"learning_rate": 5.982583047664151e-06,
|
|
"loss": 0.6897,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.5957411203614609,
|
|
"grad_norm": 3.8853037357330322,
|
|
"learning_rate": 5.9635613027404495e-06,
|
|
"loss": 0.7189,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.5970798644521608,
|
|
"grad_norm": 3.9044294357299805,
|
|
"learning_rate": 5.944525063140703e-06,
|
|
"loss": 0.7257,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.5984186085428608,
|
|
"grad_norm": 3.528970241546631,
|
|
"learning_rate": 5.925474615223573e-06,
|
|
"loss": 0.7144,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.5997573526335607,
|
|
"grad_norm": 3.5127477645874023,
|
|
"learning_rate": 5.906410245561459e-06,
|
|
"loss": 0.7066,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.6010960967242606,
|
|
"grad_norm": 3.450453758239746,
|
|
"learning_rate": 5.887332240936177e-06,
|
|
"loss": 0.6993,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.6024348408149605,
|
|
"grad_norm": 3.299100160598755,
|
|
"learning_rate": 5.8682408883346535e-06,
|
|
"loss": 0.6981,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.6024348408149605,
|
|
"eval_loss": 0.4652141332626343,
|
|
"eval_runtime": 143.1011,
|
|
"eval_samples_per_second": 76.869,
|
|
"eval_steps_per_second": 9.609,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.6037735849056604,
|
|
"grad_norm": 3.695812225341797,
|
|
"learning_rate": 5.849136474944603e-06,
|
|
"loss": 0.7126,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.6051123289963602,
|
|
"grad_norm": 3.5994956493377686,
|
|
"learning_rate": 5.830019288150222e-06,
|
|
"loss": 0.7177,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.6064510730870603,
|
|
"grad_norm": 3.543468475341797,
|
|
"learning_rate": 5.810889615527839e-06,
|
|
"loss": 0.7203,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.6077898171777601,
|
|
"grad_norm": 3.8126320838928223,
|
|
"learning_rate": 5.791747744841615e-06,
|
|
"loss": 0.7117,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.60912856126846,
|
|
"grad_norm": 4.627198696136475,
|
|
"learning_rate": 5.772593964039203e-06,
|
|
"loss": 0.7264,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.6104673053591599,
|
|
"grad_norm": 3.896590232849121,
|
|
"learning_rate": 5.753428561247416e-06,
|
|
"loss": 0.7021,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.6118060494498598,
|
|
"grad_norm": 3.8609979152679443,
|
|
"learning_rate": 5.734251824767895e-06,
|
|
"loss": 0.7111,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.6131447935405597,
|
|
"grad_norm": 3.8481388092041016,
|
|
"learning_rate": 5.715064043072771e-06,
|
|
"loss": 0.7053,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.6144835376312596,
|
|
"grad_norm": 3.859123706817627,
|
|
"learning_rate": 5.695865504800328e-06,
|
|
"loss": 0.7112,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.6158222817219596,
|
|
"grad_norm": 4.59066915512085,
|
|
"learning_rate": 5.6766564987506564e-06,
|
|
"loss": 0.7121,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.6171610258126595,
|
|
"grad_norm": 3.9722397327423096,
|
|
"learning_rate": 5.657437313881314e-06,
|
|
"loss": 0.7085,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.6184997699033594,
|
|
"grad_norm": 24.66577911376953,
|
|
"learning_rate": 5.638208239302975e-06,
|
|
"loss": 0.7063,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.6198385139940593,
|
|
"grad_norm": 6.302036762237549,
|
|
"learning_rate": 5.618969564275083e-06,
|
|
"loss": 0.7148,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.6211772580847592,
|
|
"grad_norm": 4.718502998352051,
|
|
"learning_rate": 5.599721578201499e-06,
|
|
"loss": 0.7064,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.6225160021754591,
|
|
"grad_norm": 4.457043170928955,
|
|
"learning_rate": 5.5804645706261515e-06,
|
|
"loss": 0.7052,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.6238547462661591,
|
|
"grad_norm": 4.146284103393555,
|
|
"learning_rate": 5.561198831228676e-06,
|
|
"loss": 0.7333,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.625193490356859,
|
|
"grad_norm": 5.190493583679199,
|
|
"learning_rate": 5.541924649820054e-06,
|
|
"loss": 0.7029,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.6265322344475589,
|
|
"grad_norm": 5.723586082458496,
|
|
"learning_rate": 5.522642316338268e-06,
|
|
"loss": 0.687,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.6278709785382588,
|
|
"grad_norm": 4.871129035949707,
|
|
"learning_rate": 5.503352120843923e-06,
|
|
"loss": 0.6889,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.6292097226289587,
|
|
"grad_norm": 4.206845760345459,
|
|
"learning_rate": 5.484054353515896e-06,
|
|
"loss": 0.7094,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.6305484667196586,
|
|
"grad_norm": 4.026944160461426,
|
|
"learning_rate": 5.464749304646963e-06,
|
|
"loss": 0.7069,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 0.6318872108103585,
|
|
"grad_norm": 3.8320913314819336,
|
|
"learning_rate": 5.445437264639433e-06,
|
|
"loss": 0.6943,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.6332259549010585,
|
|
"grad_norm": 3.6914632320404053,
|
|
"learning_rate": 5.426118524000784e-06,
|
|
"loss": 0.7174,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 0.6345646989917584,
|
|
"grad_norm": 3.848788261413574,
|
|
"learning_rate": 5.406793373339292e-06,
|
|
"loss": 0.7033,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 0.6359034430824583,
|
|
"grad_norm": 3.719350576400757,
|
|
"learning_rate": 5.387462103359655e-06,
|
|
"loss": 0.7064,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 0.6372421871731582,
|
|
"grad_norm": 5.226090908050537,
|
|
"learning_rate": 5.3681250048586246e-06,
|
|
"loss": 0.7113,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 0.6385809312638581,
|
|
"grad_norm": 4.163788795471191,
|
|
"learning_rate": 5.348782368720627e-06,
|
|
"loss": 0.7042,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 0.639919675354558,
|
|
"grad_norm": 4.043521881103516,
|
|
"learning_rate": 5.329434485913393e-06,
|
|
"loss": 0.727,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 0.641258419445258,
|
|
"grad_norm": 4.040348052978516,
|
|
"learning_rate": 5.310081647483577e-06,
|
|
"loss": 0.712,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 0.6425971635359579,
|
|
"grad_norm": 3.3194639682769775,
|
|
"learning_rate": 5.290724144552379e-06,
|
|
"loss": 0.6845,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.6439359076266578,
|
|
"grad_norm": 3.4912378787994385,
|
|
"learning_rate": 5.27136226831117e-06,
|
|
"loss": 0.694,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 0.6452746517173577,
|
|
"grad_norm": 4.353266716003418,
|
|
"learning_rate": 5.251996310017101e-06,
|
|
"loss": 0.7121,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 0.6466133958080575,
|
|
"grad_norm": 4.003024578094482,
|
|
"learning_rate": 5.232626560988735e-06,
|
|
"loss": 0.7021,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 0.6479521398987574,
|
|
"grad_norm": 3.7546870708465576,
|
|
"learning_rate": 5.213253312601654e-06,
|
|
"loss": 0.7141,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 0.6492908839894574,
|
|
"grad_norm": 4.198794841766357,
|
|
"learning_rate": 5.193876856284085e-06,
|
|
"loss": 0.7213,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 0.6506296280801573,
|
|
"grad_norm": 4.22196626663208,
|
|
"learning_rate": 5.174497483512506e-06,
|
|
"loss": 0.7093,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 0.6519683721708572,
|
|
"grad_norm": 3.9178671836853027,
|
|
"learning_rate": 5.155115485807269e-06,
|
|
"loss": 0.7196,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 0.6533071162615571,
|
|
"grad_norm": 3.8929224014282227,
|
|
"learning_rate": 5.135731154728215e-06,
|
|
"loss": 0.7044,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 0.654645860352257,
|
|
"grad_norm": 3.574014663696289,
|
|
"learning_rate": 5.116344781870282e-06,
|
|
"loss": 0.6894,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 0.6559846044429569,
|
|
"grad_norm": 4.0745849609375,
|
|
"learning_rate": 5.096956658859122e-06,
|
|
"loss": 0.7007,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.6573233485336568,
|
|
"grad_norm": 3.5146987438201904,
|
|
"learning_rate": 5.077567077346717e-06,
|
|
"loss": 0.7162,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 0.6586620926243568,
|
|
"grad_norm": 5.374062538146973,
|
|
"learning_rate": 5.0581763290069865e-06,
|
|
"loss": 0.7089,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 0.6600008367150567,
|
|
"grad_norm": 4.218367099761963,
|
|
"learning_rate": 5.038784705531402e-06,
|
|
"loss": 0.6856,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 0.6613395808057566,
|
|
"grad_norm": 3.816288709640503,
|
|
"learning_rate": 5.019392498624602e-06,
|
|
"loss": 0.7001,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 0.6626783248964565,
|
|
"grad_norm": 3.78233003616333,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.6996,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 0.6640170689871564,
|
|
"grad_norm": 4.569467067718506,
|
|
"learning_rate": 4.980607501375399e-06,
|
|
"loss": 0.7204,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 0.6653558130778563,
|
|
"grad_norm": 5.393465995788574,
|
|
"learning_rate": 4.9612152944686e-06,
|
|
"loss": 0.6985,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 0.6666945571685563,
|
|
"grad_norm": 4.383655071258545,
|
|
"learning_rate": 4.941823670993016e-06,
|
|
"loss": 0.7036,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 0.6680333012592562,
|
|
"grad_norm": 4.336970806121826,
|
|
"learning_rate": 4.922432922653284e-06,
|
|
"loss": 0.7062,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 0.6693720453499561,
|
|
"grad_norm": 3.576835870742798,
|
|
"learning_rate": 4.903043341140879e-06,
|
|
"loss": 0.7054,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.670710789440656,
|
|
"grad_norm": 5.564067840576172,
|
|
"learning_rate": 4.883655218129719e-06,
|
|
"loss": 0.7041,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 0.6720495335313559,
|
|
"grad_norm": 4.789515972137451,
|
|
"learning_rate": 4.864268845271786e-06,
|
|
"loss": 0.7156,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 0.6733882776220558,
|
|
"grad_norm": 4.035548210144043,
|
|
"learning_rate": 4.844884514192732e-06,
|
|
"loss": 0.7162,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 0.6747270217127557,
|
|
"grad_norm": 5.7640838623046875,
|
|
"learning_rate": 4.825502516487497e-06,
|
|
"loss": 0.6912,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 0.6760657658034557,
|
|
"grad_norm": 5.64564323425293,
|
|
"learning_rate": 4.806123143715916e-06,
|
|
"loss": 0.7066,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 0.6774045098941556,
|
|
"grad_norm": 5.186812877655029,
|
|
"learning_rate": 4.786746687398347e-06,
|
|
"loss": 0.7036,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 0.6787432539848555,
|
|
"grad_norm": 4.624047756195068,
|
|
"learning_rate": 4.767373439011267e-06,
|
|
"loss": 0.7065,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 0.6800819980755554,
|
|
"grad_norm": 4.410228252410889,
|
|
"learning_rate": 4.748003689982901e-06,
|
|
"loss": 0.7142,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 0.6814207421662553,
|
|
"grad_norm": 4.419641017913818,
|
|
"learning_rate": 4.728637731688832e-06,
|
|
"loss": 0.7034,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 0.6827594862569552,
|
|
"grad_norm": 3.682264804840088,
|
|
"learning_rate": 4.7092758554476215e-06,
|
|
"loss": 0.7049,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.6840982303476552,
|
|
"grad_norm": 5.0407867431640625,
|
|
"learning_rate": 4.689918352516424e-06,
|
|
"loss": 0.7003,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 0.685436974438355,
|
|
"grad_norm": 4.022500991821289,
|
|
"learning_rate": 4.670565514086607e-06,
|
|
"loss": 0.7149,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 0.686775718529055,
|
|
"grad_norm": 4.016271591186523,
|
|
"learning_rate": 4.651217631279374e-06,
|
|
"loss": 0.718,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 0.6881144626197548,
|
|
"grad_norm": 4.572041034698486,
|
|
"learning_rate": 4.631874995141376e-06,
|
|
"loss": 0.6918,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 0.6894532067104547,
|
|
"grad_norm": 4.2524824142456055,
|
|
"learning_rate": 4.6125378966403465e-06,
|
|
"loss": 0.6951,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 0.6907919508011546,
|
|
"grad_norm": 3.3855910301208496,
|
|
"learning_rate": 4.59320662666071e-06,
|
|
"loss": 0.6985,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 0.6921306948918545,
|
|
"grad_norm": 3.9998672008514404,
|
|
"learning_rate": 4.573881475999218e-06,
|
|
"loss": 0.697,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 0.6934694389825545,
|
|
"grad_norm": 3.5795857906341553,
|
|
"learning_rate": 4.5545627353605705e-06,
|
|
"loss": 0.6974,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 0.6948081830732544,
|
|
"grad_norm": 3.8423166275024414,
|
|
"learning_rate": 4.53525069535304e-06,
|
|
"loss": 0.6812,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 0.6961469271639543,
|
|
"grad_norm": 3.8361778259277344,
|
|
"learning_rate": 4.515945646484105e-06,
|
|
"loss": 0.6765,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.6974856712546542,
|
|
"grad_norm": 4.976791858673096,
|
|
"learning_rate": 4.496647879156078e-06,
|
|
"loss": 0.7239,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 0.6988244153453541,
|
|
"grad_norm": 4.1735615730285645,
|
|
"learning_rate": 4.477357683661734e-06,
|
|
"loss": 0.7153,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 0.700163159436054,
|
|
"grad_norm": 3.8258252143859863,
|
|
"learning_rate": 4.458075350179948e-06,
|
|
"loss": 0.7169,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 0.701501903526754,
|
|
"grad_norm": 4.212690830230713,
|
|
"learning_rate": 4.4388011687713274e-06,
|
|
"loss": 0.6938,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 0.7028406476174539,
|
|
"grad_norm": 3.6461665630340576,
|
|
"learning_rate": 4.4195354293738484e-06,
|
|
"loss": 0.6997,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 0.7041793917081538,
|
|
"grad_norm": 3.5046069622039795,
|
|
"learning_rate": 4.400278421798501e-06,
|
|
"loss": 0.7038,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 0.7055181357988537,
|
|
"grad_norm": 4.3335161209106445,
|
|
"learning_rate": 4.381030435724919e-06,
|
|
"loss": 0.7073,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 0.7068568798895536,
|
|
"grad_norm": 4.536037445068359,
|
|
"learning_rate": 4.361791760697027e-06,
|
|
"loss": 0.7089,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 0.7081956239802535,
|
|
"grad_norm": 3.559147596359253,
|
|
"learning_rate": 4.342562686118687e-06,
|
|
"loss": 0.7122,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 0.7095343680709535,
|
|
"grad_norm": 4.661660194396973,
|
|
"learning_rate": 4.323343501249346e-06,
|
|
"loss": 0.7093,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.7108731121616534,
|
|
"grad_norm": 4.578961372375488,
|
|
"learning_rate": 4.304134495199675e-06,
|
|
"loss": 0.7085,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.7122118562523533,
|
|
"grad_norm": 3.8165831565856934,
|
|
"learning_rate": 4.284935956927229e-06,
|
|
"loss": 0.7025,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 0.7135506003430532,
|
|
"grad_norm": 4.166040420532227,
|
|
"learning_rate": 4.265748175232105e-06,
|
|
"loss": 0.7159,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 0.7148893444337531,
|
|
"grad_norm": 4.469944477081299,
|
|
"learning_rate": 4.246571438752585e-06,
|
|
"loss": 0.7215,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 0.716228088524453,
|
|
"grad_norm": 4.120419979095459,
|
|
"learning_rate": 4.227406035960798e-06,
|
|
"loss": 0.7005,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 0.7175668326151529,
|
|
"grad_norm": 4.142100811004639,
|
|
"learning_rate": 4.208252255158387e-06,
|
|
"loss": 0.7177,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 0.7189055767058529,
|
|
"grad_norm": 5.1433424949646,
|
|
"learning_rate": 4.189110384472164e-06,
|
|
"loss": 0.6869,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 0.7202443207965528,
|
|
"grad_norm": 4.202688217163086,
|
|
"learning_rate": 4.1699807118497815e-06,
|
|
"loss": 0.7148,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 0.7215830648872527,
|
|
"grad_norm": 4.3545026779174805,
|
|
"learning_rate": 4.150863525055397e-06,
|
|
"loss": 0.7187,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 0.7229218089779526,
|
|
"grad_norm": 3.6167187690734863,
|
|
"learning_rate": 4.131759111665349e-06,
|
|
"loss": 0.6913,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.7229218089779526,
|
|
"eval_loss": 0.4638102948665619,
|
|
"eval_runtime": 143.2977,
|
|
"eval_samples_per_second": 76.763,
|
|
"eval_steps_per_second": 9.595,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.7242605530686524,
|
|
"grad_norm": 5.738918781280518,
|
|
"learning_rate": 4.112667759063825e-06,
|
|
"loss": 0.6917,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 0.7255992971593523,
|
|
"grad_norm": 3.9583187103271484,
|
|
"learning_rate": 4.093589754438543e-06,
|
|
"loss": 0.6885,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 0.7269380412500523,
|
|
"grad_norm": 4.710034370422363,
|
|
"learning_rate": 4.074525384776428e-06,
|
|
"loss": 0.7007,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 0.7282767853407522,
|
|
"grad_norm": 4.086686134338379,
|
|
"learning_rate": 4.0554749368593e-06,
|
|
"loss": 0.7005,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 0.7296155294314521,
|
|
"grad_norm": 4.158773422241211,
|
|
"learning_rate": 4.036438697259551e-06,
|
|
"loss": 0.6979,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 0.730954273522152,
|
|
"grad_norm": 3.8934173583984375,
|
|
"learning_rate": 4.017416952335849e-06,
|
|
"loss": 0.7074,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 0.7322930176128519,
|
|
"grad_norm": 3.831171751022339,
|
|
"learning_rate": 3.998409988228813e-06,
|
|
"loss": 0.7099,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 0.7336317617035518,
|
|
"grad_norm": 4.825276851654053,
|
|
"learning_rate": 3.979418090856723e-06,
|
|
"loss": 0.6995,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 0.7349705057942517,
|
|
"grad_norm": 4.362029552459717,
|
|
"learning_rate": 3.960441545911205e-06,
|
|
"loss": 0.7097,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 0.7363092498849517,
|
|
"grad_norm": 4.977943420410156,
|
|
"learning_rate": 3.941480638852948e-06,
|
|
"loss": 0.6929,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.7376479939756516,
|
|
"grad_norm": 4.431875705718994,
|
|
"learning_rate": 3.922535654907401e-06,
|
|
"loss": 0.6894,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 0.7389867380663515,
|
|
"grad_norm": 4.7662248611450195,
|
|
"learning_rate": 3.903606879060483e-06,
|
|
"loss": 0.7173,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 0.7403254821570514,
|
|
"grad_norm": 9.614615440368652,
|
|
"learning_rate": 3.884694596054304e-06,
|
|
"loss": 0.7038,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 0.7416642262477513,
|
|
"grad_norm": 3.7639272212982178,
|
|
"learning_rate": 3.865799090382866e-06,
|
|
"loss": 0.6826,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 0.7430029703384512,
|
|
"grad_norm": 4.703065872192383,
|
|
"learning_rate": 3.8469206462878e-06,
|
|
"loss": 0.7061,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 0.7443417144291512,
|
|
"grad_norm": 4.418508052825928,
|
|
"learning_rate": 3.828059547754078e-06,
|
|
"loss": 0.6962,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 0.7456804585198511,
|
|
"grad_norm": 3.396287202835083,
|
|
"learning_rate": 3.809216078505747e-06,
|
|
"loss": 0.6967,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 0.747019202610551,
|
|
"grad_norm": 4.526957988739014,
|
|
"learning_rate": 3.790390522001662e-06,
|
|
"loss": 0.7159,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 0.7483579467012509,
|
|
"grad_norm": 4.654516696929932,
|
|
"learning_rate": 3.7715831614312184e-06,
|
|
"loss": 0.7032,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 0.7496966907919508,
|
|
"grad_norm": 5.0296311378479,
|
|
"learning_rate": 3.752794279710094e-06,
|
|
"loss": 0.7128,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.7510354348826507,
|
|
"grad_norm": 4.693541526794434,
|
|
"learning_rate": 3.7340241594759917e-06,
|
|
"loss": 0.6973,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 0.7523741789733506,
|
|
"grad_norm": 3.954364776611328,
|
|
"learning_rate": 3.7152730830843904e-06,
|
|
"loss": 0.6826,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 0.7537129230640506,
|
|
"grad_norm": 4.376611232757568,
|
|
"learning_rate": 3.6965413326042933e-06,
|
|
"loss": 0.7047,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 0.7550516671547505,
|
|
"grad_norm": 4.75441837310791,
|
|
"learning_rate": 3.6778291898139907e-06,
|
|
"loss": 0.7001,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 0.7563904112454504,
|
|
"grad_norm": 5.179128646850586,
|
|
"learning_rate": 3.6591369361968127e-06,
|
|
"loss": 0.6932,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 0.7577291553361503,
|
|
"grad_norm": 6.205267429351807,
|
|
"learning_rate": 3.640464852936909e-06,
|
|
"loss": 0.7012,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 0.7590678994268502,
|
|
"grad_norm": 5.691217422485352,
|
|
"learning_rate": 3.6218132209150047e-06,
|
|
"loss": 0.7101,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 0.76040664351755,
|
|
"grad_norm": 4.000324726104736,
|
|
"learning_rate": 3.603182320704179e-06,
|
|
"loss": 0.7173,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 0.7617453876082501,
|
|
"grad_norm": 4.616678714752197,
|
|
"learning_rate": 3.5845724325656485e-06,
|
|
"loss": 0.6875,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 0.76308413169895,
|
|
"grad_norm": 4.166356086730957,
|
|
"learning_rate": 3.5659838364445505e-06,
|
|
"loss": 0.7092,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.7644228757896498,
|
|
"grad_norm": 3.632735013961792,
|
|
"learning_rate": 3.5474168119657275e-06,
|
|
"loss": 0.7026,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 0.7657616198803497,
|
|
"grad_norm": 4.168743133544922,
|
|
"learning_rate": 3.528871638429524e-06,
|
|
"loss": 0.6944,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 0.7671003639710496,
|
|
"grad_norm": 3.6505751609802246,
|
|
"learning_rate": 3.51034859480759e-06,
|
|
"loss": 0.7108,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 0.7684391080617495,
|
|
"grad_norm": 5.440558433532715,
|
|
"learning_rate": 3.491847959738673e-06,
|
|
"loss": 0.6986,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 0.7697778521524495,
|
|
"grad_norm": 4.468270301818848,
|
|
"learning_rate": 3.473370011524435e-06,
|
|
"loss": 0.6941,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 0.7711165962431494,
|
|
"grad_norm": 4.159365653991699,
|
|
"learning_rate": 3.4549150281252635e-06,
|
|
"loss": 0.7165,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 0.7724553403338493,
|
|
"grad_norm": 3.886552333831787,
|
|
"learning_rate": 3.436483287156091e-06,
|
|
"loss": 0.7141,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 0.7737940844245492,
|
|
"grad_norm": 4.091336250305176,
|
|
"learning_rate": 3.418075065882217e-06,
|
|
"loss": 0.7012,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 0.7751328285152491,
|
|
"grad_norm": 4.196002960205078,
|
|
"learning_rate": 3.399690641215142e-06,
|
|
"loss": 0.7138,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 0.776471572605949,
|
|
"grad_norm": 4.068109512329102,
|
|
"learning_rate": 3.3813302897083955e-06,
|
|
"loss": 0.6996,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.7778103166966489,
|
|
"grad_norm": 5.012916088104248,
|
|
"learning_rate": 3.3629942875533784e-06,
|
|
"loss": 0.7106,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 0.7791490607873489,
|
|
"grad_norm": 4.585369110107422,
|
|
"learning_rate": 3.3446829105752103e-06,
|
|
"loss": 0.6859,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 0.7804878048780488,
|
|
"grad_norm": 5.0565266609191895,
|
|
"learning_rate": 3.3263964342285795e-06,
|
|
"loss": 0.7017,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 0.7818265489687487,
|
|
"grad_norm": 6.133769989013672,
|
|
"learning_rate": 3.308135133593595e-06,
|
|
"loss": 0.6924,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 0.7831652930594486,
|
|
"grad_norm": 4.701889514923096,
|
|
"learning_rate": 3.289899283371657e-06,
|
|
"loss": 0.6939,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 0.7845040371501485,
|
|
"grad_norm": 3.684704065322876,
|
|
"learning_rate": 3.271689157881317e-06,
|
|
"loss": 0.7011,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 0.7858427812408484,
|
|
"grad_norm": 5.377622604370117,
|
|
"learning_rate": 3.253505031054155e-06,
|
|
"loss": 0.698,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 0.7871815253315484,
|
|
"grad_norm": 4.7843499183654785,
|
|
"learning_rate": 3.2353471764306567e-06,
|
|
"loss": 0.6936,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 0.7885202694222483,
|
|
"grad_norm": 4.845401287078857,
|
|
"learning_rate": 3.2172158671561005e-06,
|
|
"loss": 0.7006,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 0.7898590135129482,
|
|
"grad_norm": 5.628458499908447,
|
|
"learning_rate": 3.1991113759764493e-06,
|
|
"loss": 0.6981,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.7911977576036481,
|
|
"grad_norm": 3.9687514305114746,
|
|
"learning_rate": 3.1810339752342446e-06,
|
|
"loss": 0.7186,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 0.792536501694348,
|
|
"grad_norm": 5.1330885887146,
|
|
"learning_rate": 3.1629839368645087e-06,
|
|
"loss": 0.7031,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 0.7938752457850479,
|
|
"grad_norm": 4.01475191116333,
|
|
"learning_rate": 3.1449615323906657e-06,
|
|
"loss": 0.6959,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 0.7952139898757478,
|
|
"grad_norm": 4.5600361824035645,
|
|
"learning_rate": 3.12696703292044e-06,
|
|
"loss": 0.7141,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 0.7965527339664478,
|
|
"grad_norm": 4.481199741363525,
|
|
"learning_rate": 3.1090007091417884e-06,
|
|
"loss": 0.7125,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 0.7978914780571477,
|
|
"grad_norm": 4.744899272918701,
|
|
"learning_rate": 3.091062831318825e-06,
|
|
"loss": 0.7064,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 0.7992302221478476,
|
|
"grad_norm": 5.471341133117676,
|
|
"learning_rate": 3.0731536692877596e-06,
|
|
"loss": 0.6961,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 0.8005689662385475,
|
|
"grad_norm": 4.031320095062256,
|
|
"learning_rate": 3.0552734924528304e-06,
|
|
"loss": 0.6897,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 0.8019077103292473,
|
|
"grad_norm": 4.580793380737305,
|
|
"learning_rate": 3.0374225697822645e-06,
|
|
"loss": 0.6993,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 0.8032464544199472,
|
|
"grad_norm": 4.880797386169434,
|
|
"learning_rate": 3.019601169804216e-06,
|
|
"loss": 0.6907,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.8045851985106472,
|
|
"grad_norm": 4.268701076507568,
|
|
"learning_rate": 3.00180956060274e-06,
|
|
"loss": 0.7202,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"epoch": 0.8059239426013471,
|
|
"grad_norm": 4.482174873352051,
|
|
"learning_rate": 2.9840480098137498e-06,
|
|
"loss": 0.6948,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 0.807262686692047,
|
|
"grad_norm": 4.308942794799805,
|
|
"learning_rate": 2.966316784621e-06,
|
|
"loss": 0.6878,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"epoch": 0.8086014307827469,
|
|
"grad_norm": 4.806860446929932,
|
|
"learning_rate": 2.94861615175206e-06,
|
|
"loss": 0.7085,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 0.8099401748734468,
|
|
"grad_norm": 4.6116719245910645,
|
|
"learning_rate": 2.9309463774743047e-06,
|
|
"loss": 0.7161,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"epoch": 0.8112789189641467,
|
|
"grad_norm": 6.33508825302124,
|
|
"learning_rate": 2.9133077275909112e-06,
|
|
"loss": 0.7003,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 0.8126176630548466,
|
|
"grad_norm": 5.388082504272461,
|
|
"learning_rate": 2.895700467436855e-06,
|
|
"loss": 0.691,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"epoch": 0.8139564071455466,
|
|
"grad_norm": 4.028987884521484,
|
|
"learning_rate": 2.8781248618749235e-06,
|
|
"loss": 0.6898,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 0.8152951512362465,
|
|
"grad_norm": 3.9158191680908203,
|
|
"learning_rate": 2.86058117529173e-06,
|
|
"loss": 0.7011,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"epoch": 0.8166338953269464,
|
|
"grad_norm": 4.687577724456787,
|
|
"learning_rate": 2.843069671593734e-06,
|
|
"loss": 0.6897,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 0.8179726394176463,
|
|
"grad_norm": 5.06867790222168,
|
|
"learning_rate": 2.825590614203277e-06,
|
|
"loss": 0.6837,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"epoch": 0.8193113835083462,
|
|
"grad_norm": 4.052064895629883,
|
|
"learning_rate": 2.8081442660546126e-06,
|
|
"loss": 0.7024,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 0.8206501275990461,
|
|
"grad_norm": 4.204895973205566,
|
|
"learning_rate": 2.790730889589962e-06,
|
|
"loss": 0.7081,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"epoch": 0.8219888716897461,
|
|
"grad_norm": 4.370186805725098,
|
|
"learning_rate": 2.7733507467555532e-06,
|
|
"loss": 0.702,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 0.823327615780446,
|
|
"grad_norm": 4.563244819641113,
|
|
"learning_rate": 2.7560040989976894e-06,
|
|
"loss": 0.6985,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"epoch": 0.8246663598711459,
|
|
"grad_norm": 4.537478923797607,
|
|
"learning_rate": 2.7386912072588123e-06,
|
|
"loss": 0.6951,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 0.8260051039618458,
|
|
"grad_norm": 4.305166721343994,
|
|
"learning_rate": 2.7214123319735787e-06,
|
|
"loss": 0.7097,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"epoch": 0.8273438480525457,
|
|
"grad_norm": 5.007378578186035,
|
|
"learning_rate": 2.7041677330649408e-06,
|
|
"loss": 0.6849,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 0.8286825921432456,
|
|
"grad_norm": 4.699695110321045,
|
|
"learning_rate": 2.686957669940242e-06,
|
|
"loss": 0.7065,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"epoch": 0.8300213362339456,
|
|
"grad_norm": 4.996771812438965,
|
|
"learning_rate": 2.6697824014873076e-06,
|
|
"loss": 0.7052,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 0.8313600803246455,
|
|
"grad_norm": 4.331625461578369,
|
|
"learning_rate": 2.6526421860705474e-06,
|
|
"loss": 0.6973,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"epoch": 0.8326988244153454,
|
|
"grad_norm": 4.313735485076904,
|
|
"learning_rate": 2.6355372815270837e-06,
|
|
"loss": 0.707,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 0.8340375685060453,
|
|
"grad_norm": 3.9984254837036133,
|
|
"learning_rate": 2.6184679451628587e-06,
|
|
"loss": 0.6914,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"epoch": 0.8353763125967452,
|
|
"grad_norm": 3.741671323776245,
|
|
"learning_rate": 2.601434433748771e-06,
|
|
"loss": 0.7104,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 0.8367150566874451,
|
|
"grad_norm": 5.043244361877441,
|
|
"learning_rate": 2.5844370035168077e-06,
|
|
"loss": 0.7077,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"epoch": 0.838053800778145,
|
|
"grad_norm": 4.056079387664795,
|
|
"learning_rate": 2.567475910156201e-06,
|
|
"loss": 0.7141,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 0.839392544868845,
|
|
"grad_norm": 4.613669395446777,
|
|
"learning_rate": 2.550551408809566e-06,
|
|
"loss": 0.6938,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 0.8407312889595449,
|
|
"grad_norm": 4.054388523101807,
|
|
"learning_rate": 2.533663754069074e-06,
|
|
"loss": 0.7012,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 0.8420700330502447,
|
|
"grad_norm": 4.854631423950195,
|
|
"learning_rate": 2.5168131999726203e-06,
|
|
"loss": 0.683,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"epoch": 0.8434087771409446,
|
|
"grad_norm": 4.717468738555908,
|
|
"learning_rate": 2.5000000000000015e-06,
|
|
"loss": 0.7251,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 0.8434087771409446,
|
|
"eval_loss": 0.4627279043197632,
|
|
"eval_runtime": 142.8413,
|
|
"eval_samples_per_second": 77.009,
|
|
"eval_steps_per_second": 9.626,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 0.8447475212316445,
|
|
"grad_norm": 4.872498989105225,
|
|
"learning_rate": 2.4832244070691013e-06,
|
|
"loss": 0.6976,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"epoch": 0.8460862653223444,
|
|
"grad_norm": 4.940881252288818,
|
|
"learning_rate": 2.4664866735320886e-06,
|
|
"loss": 0.7098,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 0.8474250094130444,
|
|
"grad_norm": 3.984968423843384,
|
|
"learning_rate": 2.4497870511716237e-06,
|
|
"loss": 0.6927,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"epoch": 0.8487637535037443,
|
|
"grad_norm": 4.75971794128418,
|
|
"learning_rate": 2.4331257911970628e-06,
|
|
"loss": 0.7116,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 0.8501024975944442,
|
|
"grad_norm": 4.419068813323975,
|
|
"learning_rate": 2.4165031442406857e-06,
|
|
"loss": 0.6916,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"epoch": 0.8514412416851441,
|
|
"grad_norm": 4.9437994956970215,
|
|
"learning_rate": 2.3999193603539234e-06,
|
|
"loss": 0.688,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 0.852779985775844,
|
|
"grad_norm": 4.8349409103393555,
|
|
"learning_rate": 2.3833746890035964e-06,
|
|
"loss": 0.6865,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"epoch": 0.8541187298665439,
|
|
"grad_norm": 4.248473167419434,
|
|
"learning_rate": 2.3668693790681634e-06,
|
|
"loss": 0.7153,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 0.8554574739572438,
|
|
"grad_norm": 3.518911600112915,
|
|
"learning_rate": 2.3504036788339763e-06,
|
|
"loss": 0.6955,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"epoch": 0.8567962180479438,
|
|
"grad_norm": 4.7744598388671875,
|
|
"learning_rate": 2.333977835991545e-06,
|
|
"loss": 0.6981,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 0.8581349621386437,
|
|
"grad_norm": 5.812184810638428,
|
|
"learning_rate": 2.317592097631812e-06,
|
|
"loss": 0.7033,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"epoch": 0.8594737062293436,
|
|
"grad_norm": 6.024710655212402,
|
|
"learning_rate": 2.3012467102424373e-06,
|
|
"loss": 0.7113,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 0.8608124503200435,
|
|
"grad_norm": 4.483139514923096,
|
|
"learning_rate": 2.284941919704085e-06,
|
|
"loss": 0.6978,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"epoch": 0.8621511944107434,
|
|
"grad_norm": 4.008630275726318,
|
|
"learning_rate": 2.268677971286732e-06,
|
|
"loss": 0.6925,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 0.8634899385014433,
|
|
"grad_norm": 4.584245204925537,
|
|
"learning_rate": 2.2524551096459703e-06,
|
|
"loss": 0.6964,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"epoch": 0.8648286825921433,
|
|
"grad_norm": 4.4748616218566895,
|
|
"learning_rate": 2.236273578819337e-06,
|
|
"loss": 0.6967,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 0.8661674266828432,
|
|
"grad_norm": 4.157332897186279,
|
|
"learning_rate": 2.2201336222226332e-06,
|
|
"loss": 0.6799,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"epoch": 0.8675061707735431,
|
|
"grad_norm": 4.799479007720947,
|
|
"learning_rate": 2.204035482646267e-06,
|
|
"loss": 0.6994,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 0.868844914864243,
|
|
"grad_norm": 5.124458312988281,
|
|
"learning_rate": 2.1879794022516006e-06,
|
|
"loss": 0.6927,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 0.8701836589549429,
|
|
"grad_norm": 5.162511348724365,
|
|
"learning_rate": 2.171965622567308e-06,
|
|
"loss": 0.7,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 0.8715224030456428,
|
|
"grad_norm": 5.139285087585449,
|
|
"learning_rate": 2.155994384485742e-06,
|
|
"loss": 0.7089,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"epoch": 0.8728611471363427,
|
|
"grad_norm": 4.853121757507324,
|
|
"learning_rate": 2.1400659282593083e-06,
|
|
"loss": 0.6909,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 0.8741998912270427,
|
|
"grad_norm": 4.565732002258301,
|
|
"learning_rate": 2.1241804934968558e-06,
|
|
"loss": 0.7007,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"epoch": 0.8755386353177426,
|
|
"grad_norm": 4.962949275970459,
|
|
"learning_rate": 2.1083383191600676e-06,
|
|
"loss": 0.697,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 0.8768773794084425,
|
|
"grad_norm": 4.6151204109191895,
|
|
"learning_rate": 2.0925396435598665e-06,
|
|
"loss": 0.6897,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"epoch": 0.8782161234991424,
|
|
"grad_norm": 3.5798747539520264,
|
|
"learning_rate": 2.076784704352835e-06,
|
|
"loss": 0.7105,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 0.8795548675898422,
|
|
"grad_norm": 5.569591999053955,
|
|
"learning_rate": 2.061073738537635e-06,
|
|
"loss": 0.7099,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"epoch": 0.8808936116805421,
|
|
"grad_norm": 3.553903579711914,
|
|
"learning_rate": 2.0454069824514445e-06,
|
|
"loss": 0.6999,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 0.8822323557712421,
|
|
"grad_norm": 6.162130832672119,
|
|
"learning_rate": 2.0297846717664043e-06,
|
|
"loss": 0.708,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"epoch": 0.883571099861942,
|
|
"grad_norm": 3.948383092880249,
|
|
"learning_rate": 2.0142070414860704e-06,
|
|
"loss": 0.6967,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 0.8849098439526419,
|
|
"grad_norm": 3.9884955883026123,
|
|
"learning_rate": 1.9986743259418786e-06,
|
|
"loss": 0.7163,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"epoch": 0.8862485880433418,
|
|
"grad_norm": 4.441530704498291,
|
|
"learning_rate": 1.983186758789622e-06,
|
|
"loss": 0.711,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 0.8875873321340417,
|
|
"grad_norm": 6.796314716339111,
|
|
"learning_rate": 1.9677445730059348e-06,
|
|
"loss": 0.7095,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"epoch": 0.8889260762247416,
|
|
"grad_norm": 6.264246940612793,
|
|
"learning_rate": 1.9523480008847856e-06,
|
|
"loss": 0.6978,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 0.8902648203154416,
|
|
"grad_norm": 5.112490653991699,
|
|
"learning_rate": 1.936997274033986e-06,
|
|
"loss": 0.7033,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"epoch": 0.8916035644061415,
|
|
"grad_norm": 5.50083589553833,
|
|
"learning_rate": 1.9216926233717087e-06,
|
|
"loss": 0.7061,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 0.8929423084968414,
|
|
"grad_norm": 4.575523853302002,
|
|
"learning_rate": 1.9064342791230072e-06,
|
|
"loss": 0.709,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"epoch": 0.8942810525875413,
|
|
"grad_norm": 4.6602396965026855,
|
|
"learning_rate": 1.8912224708163561e-06,
|
|
"loss": 0.6877,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 0.8956197966782412,
|
|
"grad_norm": 5.696986675262451,
|
|
"learning_rate": 1.8760574272802002e-06,
|
|
"loss": 0.702,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"epoch": 0.8969585407689411,
|
|
"grad_norm": 5.556809902191162,
|
|
"learning_rate": 1.8609393766395083e-06,
|
|
"loss": 0.727,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 0.898297284859641,
|
|
"grad_norm": 5.636332035064697,
|
|
"learning_rate": 1.8458685463123438e-06,
|
|
"loss": 0.6882,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"epoch": 0.899636028950341,
|
|
"grad_norm": 8.411212921142578,
|
|
"learning_rate": 1.8308451630064484e-06,
|
|
"loss": 0.7036,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 0.9009747730410409,
|
|
"grad_norm": 4.418994903564453,
|
|
"learning_rate": 1.8158694527158205e-06,
|
|
"loss": 0.6952,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"epoch": 0.9023135171317408,
|
|
"grad_norm": 4.99680757522583,
|
|
"learning_rate": 1.8009416407173258e-06,
|
|
"loss": 0.6973,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"epoch": 0.9036522612224407,
|
|
"grad_norm": 5.274899959564209,
|
|
"learning_rate": 1.7860619515673034e-06,
|
|
"loss": 0.7227,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"epoch": 0.9049910053131406,
|
|
"grad_norm": 4.882939338684082,
|
|
"learning_rate": 1.7712306090981896e-06,
|
|
"loss": 0.6962,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"epoch": 0.9063297494038405,
|
|
"grad_norm": 4.529172420501709,
|
|
"learning_rate": 1.75644783641515e-06,
|
|
"loss": 0.6956,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"epoch": 0.9076684934945405,
|
|
"grad_norm": 4.752635955810547,
|
|
"learning_rate": 1.7417138558927244e-06,
|
|
"loss": 0.6959,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"epoch": 0.9090072375852404,
|
|
"grad_norm": 4.324943542480469,
|
|
"learning_rate": 1.7270288891714814e-06,
|
|
"loss": 0.7182,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"epoch": 0.9103459816759403,
|
|
"grad_norm": 4.066229343414307,
|
|
"learning_rate": 1.7123931571546826e-06,
|
|
"loss": 0.6905,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 0.9116847257666402,
|
|
"grad_norm": 3.654094934463501,
|
|
"learning_rate": 1.6978068800049624e-06,
|
|
"loss": 0.6851,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"epoch": 0.9130234698573401,
|
|
"grad_norm": 4.736865520477295,
|
|
"learning_rate": 1.6832702771410142e-06,
|
|
"loss": 0.6943,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"epoch": 0.91436221394804,
|
|
"grad_norm": 5.089278221130371,
|
|
"learning_rate": 1.6687835672342895e-06,
|
|
"loss": 0.6825,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"epoch": 0.9157009580387399,
|
|
"grad_norm": 4.635818004608154,
|
|
"learning_rate": 1.6543469682057105e-06,
|
|
"loss": 0.7053,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"epoch": 0.9170397021294399,
|
|
"grad_norm": 3.9297430515289307,
|
|
"learning_rate": 1.639960697222388e-06,
|
|
"loss": 0.6977,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"epoch": 0.9183784462201398,
|
|
"grad_norm": 5.232566833496094,
|
|
"learning_rate": 1.6256249706943628e-06,
|
|
"loss": 0.6943,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"epoch": 0.9197171903108396,
|
|
"grad_norm": 5.266841888427734,
|
|
"learning_rate": 1.611340004271339e-06,
|
|
"loss": 0.7084,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"epoch": 0.9210559344015395,
|
|
"grad_norm": 4.687314033508301,
|
|
"learning_rate": 1.5971060128394483e-06,
|
|
"loss": 0.7,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"epoch": 0.9223946784922394,
|
|
"grad_norm": 5.770331382751465,
|
|
"learning_rate": 1.5829232105180143e-06,
|
|
"loss": 0.7257,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"epoch": 0.9237334225829393,
|
|
"grad_norm": 4.606459140777588,
|
|
"learning_rate": 1.5687918106563326e-06,
|
|
"loss": 0.7109,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 0.9250721666736393,
|
|
"grad_norm": 5.567401885986328,
|
|
"learning_rate": 1.55471202583046e-06,
|
|
"loss": 0.6965,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"epoch": 0.9264109107643392,
|
|
"grad_norm": 4.404622554779053,
|
|
"learning_rate": 1.5406840678400204e-06,
|
|
"loss": 0.6887,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"epoch": 0.9277496548550391,
|
|
"grad_norm": 4.736256122589111,
|
|
"learning_rate": 1.5267081477050132e-06,
|
|
"loss": 0.7156,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"epoch": 0.929088398945739,
|
|
"grad_norm": 4.642551422119141,
|
|
"learning_rate": 1.5127844756626437e-06,
|
|
"loss": 0.7038,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"epoch": 0.9304271430364389,
|
|
"grad_norm": 4.822713375091553,
|
|
"learning_rate": 1.4989132611641576e-06,
|
|
"loss": 0.702,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"epoch": 0.9317658871271388,
|
|
"grad_norm": 5.617921829223633,
|
|
"learning_rate": 1.4850947128716914e-06,
|
|
"loss": 0.7078,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"epoch": 0.9331046312178388,
|
|
"grad_norm": 4.814059257507324,
|
|
"learning_rate": 1.471329038655135e-06,
|
|
"loss": 0.7006,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"epoch": 0.9344433753085387,
|
|
"grad_norm": 5.496111869812012,
|
|
"learning_rate": 1.4576164455890014e-06,
|
|
"loss": 0.6991,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"epoch": 0.9357821193992386,
|
|
"grad_norm": 5.050607681274414,
|
|
"learning_rate": 1.4439571399493146e-06,
|
|
"loss": 0.6882,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"epoch": 0.9371208634899385,
|
|
"grad_norm": 5.040009498596191,
|
|
"learning_rate": 1.4303513272105057e-06,
|
|
"loss": 0.6913,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.9384596075806384,
|
|
"grad_norm": 3.6622118949890137,
|
|
"learning_rate": 1.4167992120423212e-06,
|
|
"loss": 0.7138,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"epoch": 0.9397983516713383,
|
|
"grad_norm": 4.046322822570801,
|
|
"learning_rate": 1.4033009983067454e-06,
|
|
"loss": 0.6997,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"epoch": 0.9411370957620382,
|
|
"grad_norm": 5.057531356811523,
|
|
"learning_rate": 1.3898568890549335e-06,
|
|
"loss": 0.703,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"epoch": 0.9424758398527382,
|
|
"grad_norm": 4.67234468460083,
|
|
"learning_rate": 1.3764670865241557e-06,
|
|
"loss": 0.6942,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"epoch": 0.9438145839434381,
|
|
"grad_norm": 5.179855823516846,
|
|
"learning_rate": 1.3631317921347564e-06,
|
|
"loss": 0.7107,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"epoch": 0.945153328034138,
|
|
"grad_norm": 4.793069839477539,
|
|
"learning_rate": 1.3498512064871272e-06,
|
|
"loss": 0.6949,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"epoch": 0.9464920721248379,
|
|
"grad_norm": 3.9901440143585205,
|
|
"learning_rate": 1.3366255293586822e-06,
|
|
"loss": 0.6861,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"epoch": 0.9478308162155378,
|
|
"grad_norm": 4.42042875289917,
|
|
"learning_rate": 1.3234549597008572e-06,
|
|
"loss": 0.6947,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 0.9491695603062377,
|
|
"grad_norm": 4.580082416534424,
|
|
"learning_rate": 1.310339695636118e-06,
|
|
"loss": 0.7061,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"epoch": 0.9505083043969377,
|
|
"grad_norm": 4.93763542175293,
|
|
"learning_rate": 1.297279934454978e-06,
|
|
"loss": 0.6875,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 0.9518470484876376,
|
|
"grad_norm": 3.6759912967681885,
|
|
"learning_rate": 1.2842758726130283e-06,
|
|
"loss": 0.7016,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"epoch": 0.9531857925783375,
|
|
"grad_norm": 4.491069793701172,
|
|
"learning_rate": 1.271327705727991e-06,
|
|
"loss": 0.7147,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"epoch": 0.9545245366690374,
|
|
"grad_norm": 4.478505611419678,
|
|
"learning_rate": 1.2584356285767652e-06,
|
|
"loss": 0.697,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"epoch": 0.9558632807597373,
|
|
"grad_norm": 4.314511299133301,
|
|
"learning_rate": 1.2455998350925042e-06,
|
|
"loss": 0.7133,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"epoch": 0.9572020248504371,
|
|
"grad_norm": 6.654889106750488,
|
|
"learning_rate": 1.2328205183616964e-06,
|
|
"loss": 0.7066,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"epoch": 0.958540768941137,
|
|
"grad_norm": 4.827569007873535,
|
|
"learning_rate": 1.2200978706212606e-06,
|
|
"loss": 0.6877,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"epoch": 0.959879513031837,
|
|
"grad_norm": 3.8334462642669678,
|
|
"learning_rate": 1.2074320832556558e-06,
|
|
"loss": 0.6983,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"epoch": 0.961218257122537,
|
|
"grad_norm": 4.697995185852051,
|
|
"learning_rate": 1.1948233467939978e-06,
|
|
"loss": 0.7199,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"epoch": 0.9625570012132368,
|
|
"grad_norm": 4.082467079162598,
|
|
"learning_rate": 1.182271850907199e-06,
|
|
"loss": 0.7069,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"epoch": 0.9638957453039367,
|
|
"grad_norm": 4.2034149169921875,
|
|
"learning_rate": 1.1697777844051105e-06,
|
|
"loss": 0.7007,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 0.9638957453039367,
|
|
"eval_loss": 0.46230047941207886,
|
|
"eval_runtime": 142.9613,
|
|
"eval_samples_per_second": 76.944,
|
|
"eval_steps_per_second": 9.618,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 0.9652344893946366,
|
|
"grad_norm": 4.125426769256592,
|
|
"learning_rate": 1.1573413352336848e-06,
|
|
"loss": 0.6979,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"epoch": 0.9665732334853365,
|
|
"grad_norm": 4.660792350769043,
|
|
"learning_rate": 1.1449626904721472e-06,
|
|
"loss": 0.7034,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"epoch": 0.9679119775760365,
|
|
"grad_norm": 4.620912551879883,
|
|
"learning_rate": 1.132642036330181e-06,
|
|
"loss": 0.7129,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"epoch": 0.9692507216667364,
|
|
"grad_norm": 6.727054119110107,
|
|
"learning_rate": 1.1203795581451288e-06,
|
|
"loss": 0.7109,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"epoch": 0.9705894657574363,
|
|
"grad_norm": 4.550580024719238,
|
|
"learning_rate": 1.1081754403792e-06,
|
|
"loss": 0.707,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"epoch": 0.9719282098481362,
|
|
"grad_norm": 4.484792232513428,
|
|
"learning_rate": 1.096029866616704e-06,
|
|
"loss": 0.6901,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"epoch": 0.9732669539388361,
|
|
"grad_norm": 5.661564826965332,
|
|
"learning_rate": 1.0839430195612794e-06,
|
|
"loss": 0.6867,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"epoch": 0.974605698029536,
|
|
"grad_norm": 4.4849534034729,
|
|
"learning_rate": 1.0719150810331497e-06,
|
|
"loss": 0.7053,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"epoch": 0.9759444421202359,
|
|
"grad_norm": 4.8529181480407715,
|
|
"learning_rate": 1.0599462319663906e-06,
|
|
"loss": 0.7143,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"epoch": 0.9772831862109359,
|
|
"grad_norm": 4.462001800537109,
|
|
"learning_rate": 1.0480366524062041e-06,
|
|
"loss": 0.6704,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 0.9786219303016358,
|
|
"grad_norm": 4.933704853057861,
|
|
"learning_rate": 1.036186521506211e-06,
|
|
"loss": 0.7034,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"epoch": 0.9799606743923357,
|
|
"grad_norm": 4.467795372009277,
|
|
"learning_rate": 1.0243960175257605e-06,
|
|
"loss": 0.6931,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"epoch": 0.9812994184830356,
|
|
"grad_norm": 4.279376983642578,
|
|
"learning_rate": 1.0126653178272422e-06,
|
|
"loss": 0.7018,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"epoch": 0.9826381625737355,
|
|
"grad_norm": 4.526325225830078,
|
|
"learning_rate": 1.0009945988734205e-06,
|
|
"loss": 0.6888,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"epoch": 0.9839769066644354,
|
|
"grad_norm": 4.8763346672058105,
|
|
"learning_rate": 9.893840362247809e-07,
|
|
"loss": 0.7086,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"epoch": 0.9853156507551354,
|
|
"grad_norm": 4.0086493492126465,
|
|
"learning_rate": 9.778338045368901e-07,
|
|
"loss": 0.7012,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"epoch": 0.9866543948458353,
|
|
"grad_norm": 4.861421585083008,
|
|
"learning_rate": 9.663440775577653e-07,
|
|
"loss": 0.7028,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"epoch": 0.9879931389365352,
|
|
"grad_norm": 3.9301681518554688,
|
|
"learning_rate": 9.549150281252633e-07,
|
|
"loss": 0.6781,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"epoch": 0.9893318830272351,
|
|
"grad_norm": 4.991429805755615,
|
|
"learning_rate": 9.435468281644799e-07,
|
|
"loss": 0.6855,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"epoch": 0.990670627117935,
|
|
"grad_norm": 4.527165412902832,
|
|
"learning_rate": 9.322396486851626e-07,
|
|
"loss": 0.6999,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 0.9920093712086349,
|
|
"grad_norm": 4.4443864822387695,
|
|
"learning_rate": 9.209936597791407e-07,
|
|
"loss": 0.7023,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"epoch": 0.9933481152993349,
|
|
"grad_norm": 5.1099958419799805,
|
|
"learning_rate": 9.098090306177626e-07,
|
|
"loss": 0.6996,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"epoch": 0.9946868593900348,
|
|
"grad_norm": 4.7418742179870605,
|
|
"learning_rate": 8.98685929449355e-07,
|
|
"loss": 0.701,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"epoch": 0.9960256034807347,
|
|
"grad_norm": 4.561365604400635,
|
|
"learning_rate": 8.876245235966884e-07,
|
|
"loss": 0.6985,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"epoch": 0.9973643475714346,
|
|
"grad_norm": 5.237748146057129,
|
|
"learning_rate": 8.766249794544662e-07,
|
|
"loss": 0.699,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"epoch": 0.9987030916621344,
|
|
"grad_norm": 4.272115707397461,
|
|
"learning_rate": 8.656874624868133e-07,
|
|
"loss": 0.6974,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"epoch": 1.0000418357528345,
|
|
"grad_norm": 4.616804599761963,
|
|
"learning_rate": 8.54812137224792e-07,
|
|
"loss": 0.6979,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"epoch": 1.0013805798435342,
|
|
"grad_norm": 5.1510515213012695,
|
|
"learning_rate": 8.439991672639264e-07,
|
|
"loss": 0.6831,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"epoch": 1.0027193239342342,
|
|
"grad_norm": 4.642550945281982,
|
|
"learning_rate": 8.332487152617424e-07,
|
|
"loss": 0.6921,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"epoch": 1.004058068024934,
|
|
"grad_norm": 5.189831733703613,
|
|
"learning_rate": 8.225609429353187e-07,
|
|
"loss": 0.6935,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 1.005396812115634,
|
|
"grad_norm": 5.009095668792725,
|
|
"learning_rate": 8.119360110588531e-07,
|
|
"loss": 0.691,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"epoch": 1.006735556206334,
|
|
"grad_norm": 3.6014087200164795,
|
|
"learning_rate": 8.013740794612512e-07,
|
|
"loss": 0.6953,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"epoch": 1.0080743002970338,
|
|
"grad_norm": 5.109480857849121,
|
|
"learning_rate": 7.908753070237124e-07,
|
|
"loss": 0.6953,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"epoch": 1.0094130443877338,
|
|
"grad_norm": 5.630359649658203,
|
|
"learning_rate": 7.804398516773465e-07,
|
|
"loss": 0.6879,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"epoch": 1.0107517884784336,
|
|
"grad_norm": 5.846060276031494,
|
|
"learning_rate": 7.700678704007947e-07,
|
|
"loss": 0.6672,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"epoch": 1.0120905325691336,
|
|
"grad_norm": 6.451261520385742,
|
|
"learning_rate": 7.597595192178702e-07,
|
|
"loss": 0.6892,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"epoch": 1.0134292766598334,
|
|
"grad_norm": 4.009243011474609,
|
|
"learning_rate": 7.495149531952101e-07,
|
|
"loss": 0.6739,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"epoch": 1.0147680207505334,
|
|
"grad_norm": 5.029900550842285,
|
|
"learning_rate": 7.393343264399439e-07,
|
|
"loss": 0.6808,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"epoch": 1.0161067648412334,
|
|
"grad_norm": 4.475840091705322,
|
|
"learning_rate": 7.292177920973726e-07,
|
|
"loss": 0.6747,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"epoch": 1.0174455089319332,
|
|
"grad_norm": 5.075997352600098,
|
|
"learning_rate": 7.191655023486682e-07,
|
|
"loss": 0.6885,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 1.0187842530226332,
|
|
"grad_norm": 4.483147144317627,
|
|
"learning_rate": 7.091776084085828e-07,
|
|
"loss": 0.6775,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"epoch": 1.020122997113333,
|
|
"grad_norm": 4.035081386566162,
|
|
"learning_rate": 6.992542605231739e-07,
|
|
"loss": 0.6752,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"epoch": 1.021461741204033,
|
|
"grad_norm": 4.678783893585205,
|
|
"learning_rate": 6.893956079675452e-07,
|
|
"loss": 0.6753,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"epoch": 1.0228004852947328,
|
|
"grad_norm": 4.273000717163086,
|
|
"learning_rate": 6.796017990435977e-07,
|
|
"loss": 0.6763,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"epoch": 1.0241392293854328,
|
|
"grad_norm": 4.835209369659424,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.6892,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"epoch": 1.0254779734761328,
|
|
"grad_norm": 5.218572616577148,
|
|
"learning_rate": 6.602093004189963e-07,
|
|
"loss": 0.6821,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"epoch": 1.0268167175668326,
|
|
"grad_norm": 4.082930564880371,
|
|
"learning_rate": 6.506109024361429e-07,
|
|
"loss": 0.6736,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"epoch": 1.0281554616575326,
|
|
"grad_norm": 4.463101863861084,
|
|
"learning_rate": 6.410779315161885e-07,
|
|
"loss": 0.6691,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"epoch": 1.0294942057482324,
|
|
"grad_norm": 4.414255142211914,
|
|
"learning_rate": 6.316105310618664e-07,
|
|
"loss": 0.6807,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"epoch": 1.0308329498389324,
|
|
"grad_norm": 4.1366658210754395,
|
|
"learning_rate": 6.222088434895462e-07,
|
|
"loss": 0.6902,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"epoch": 1.0321716939296324,
|
|
"grad_norm": 5.217019557952881,
|
|
"learning_rate": 6.128730102270897e-07,
|
|
"loss": 0.6991,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"epoch": 1.0335104380203322,
|
|
"grad_norm": 3.9112184047698975,
|
|
"learning_rate": 6.03603171711728e-07,
|
|
"loss": 0.6664,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"epoch": 1.0348491821110322,
|
|
"grad_norm": 5.043931007385254,
|
|
"learning_rate": 5.943994673879405e-07,
|
|
"loss": 0.6803,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"epoch": 1.036187926201732,
|
|
"grad_norm": 5.630739688873291,
|
|
"learning_rate": 5.852620357053651e-07,
|
|
"loss": 0.6854,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"epoch": 1.037526670292432,
|
|
"grad_norm": 4.230434417724609,
|
|
"learning_rate": 5.76191014116711e-07,
|
|
"loss": 0.6949,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"epoch": 1.0388654143831317,
|
|
"grad_norm": 4.125826835632324,
|
|
"learning_rate": 5.671865390756948e-07,
|
|
"loss": 0.7017,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"epoch": 1.0402041584738317,
|
|
"grad_norm": 5.3223958015441895,
|
|
"learning_rate": 5.582487460349806e-07,
|
|
"loss": 0.6742,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"epoch": 1.0415429025645317,
|
|
"grad_norm": 5.887722969055176,
|
|
"learning_rate": 5.493777694441521e-07,
|
|
"loss": 0.6929,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"epoch": 1.0428816466552315,
|
|
"grad_norm": 4.237276077270508,
|
|
"learning_rate": 5.405737427476854e-07,
|
|
"loss": 0.6798,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"epoch": 1.0442203907459315,
|
|
"grad_norm": 4.491860389709473,
|
|
"learning_rate": 5.318367983829393e-07,
|
|
"loss": 0.6949,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"epoch": 1.0455591348366313,
|
|
"grad_norm": 4.849346160888672,
|
|
"learning_rate": 5.231670677781659e-07,
|
|
"loss": 0.6905,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"epoch": 1.0468978789273313,
|
|
"grad_norm": 4.150999069213867,
|
|
"learning_rate": 5.145646813505339e-07,
|
|
"loss": 0.676,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"epoch": 1.048236623018031,
|
|
"grad_norm": 5.256472110748291,
|
|
"learning_rate": 5.06029768504166e-07,
|
|
"loss": 0.6866,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"epoch": 1.0495753671087311,
|
|
"grad_norm": 4.420201778411865,
|
|
"learning_rate": 4.97562457628189e-07,
|
|
"loss": 0.6698,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"epoch": 1.0509141111994311,
|
|
"grad_norm": 6.762264728546143,
|
|
"learning_rate": 4.891628760948114e-07,
|
|
"loss": 0.6706,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"epoch": 1.052252855290131,
|
|
"grad_norm": 5.927251815795898,
|
|
"learning_rate": 4.808311502573976e-07,
|
|
"loss": 0.6891,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"epoch": 1.053591599380831,
|
|
"grad_norm": 4.7239298820495605,
|
|
"learning_rate": 4.7256740544857124e-07,
|
|
"loss": 0.6799,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"epoch": 1.0549303434715307,
|
|
"grad_norm": 3.824331045150757,
|
|
"learning_rate": 4.643717659783309e-07,
|
|
"loss": 0.6892,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"epoch": 1.0562690875622307,
|
|
"grad_norm": 3.8704497814178467,
|
|
"learning_rate": 4.562443551321788e-07,
|
|
"loss": 0.6852,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"epoch": 1.0576078316529305,
|
|
"grad_norm": 4.863569259643555,
|
|
"learning_rate": 4.481852951692672e-07,
|
|
"loss": 0.6753,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"epoch": 1.0589465757436305,
|
|
"grad_norm": 4.810764312744141,
|
|
"learning_rate": 4.401947073205559e-07,
|
|
"loss": 0.6939,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"epoch": 1.0602853198343305,
|
|
"grad_norm": 4.7015156745910645,
|
|
"learning_rate": 4.322727117869951e-07,
|
|
"loss": 0.6922,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"epoch": 1.0616240639250303,
|
|
"grad_norm": 4.202758312225342,
|
|
"learning_rate": 4.2441942773771114e-07,
|
|
"loss": 0.6885,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"epoch": 1.0629628080157303,
|
|
"grad_norm": 4.461851119995117,
|
|
"learning_rate": 4.1663497330821536e-07,
|
|
"loss": 0.691,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"epoch": 1.06430155210643,
|
|
"grad_norm": 4.451940536499023,
|
|
"learning_rate": 4.089194655986306e-07,
|
|
"loss": 0.6706,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"epoch": 1.06564029619713,
|
|
"grad_norm": 5.303378582000732,
|
|
"learning_rate": 4.0127302067192285e-07,
|
|
"loss": 0.6763,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"epoch": 1.06697904028783,
|
|
"grad_norm": 5.285184383392334,
|
|
"learning_rate": 3.936957535521624e-07,
|
|
"loss": 0.6784,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"epoch": 1.0683177843785299,
|
|
"grad_norm": 9.022964477539062,
|
|
"learning_rate": 3.8618777822278854e-07,
|
|
"loss": 0.6741,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"epoch": 1.0696565284692299,
|
|
"grad_norm": 4.669501781463623,
|
|
"learning_rate": 3.787492076248994e-07,
|
|
"loss": 0.6861,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"epoch": 1.0709952725599297,
|
|
"grad_norm": 5.3932013511657715,
|
|
"learning_rate": 3.7138015365554834e-07,
|
|
"loss": 0.6891,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 1.0723340166506297,
|
|
"grad_norm": 4.442852020263672,
|
|
"learning_rate": 3.6408072716606346e-07,
|
|
"loss": 0.6772,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"epoch": 1.0736727607413294,
|
|
"grad_norm": 4.340718746185303,
|
|
"learning_rate": 3.56851037960379e-07,
|
|
"loss": 0.6874,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"epoch": 1.0750115048320295,
|
|
"grad_norm": 4.806090831756592,
|
|
"learning_rate": 3.496911947933845e-07,
|
|
"loss": 0.6914,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"epoch": 1.0763502489227295,
|
|
"grad_norm": 5.4313740730285645,
|
|
"learning_rate": 3.426013053692878e-07,
|
|
"loss": 0.6795,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"epoch": 1.0776889930134292,
|
|
"grad_norm": 4.40963888168335,
|
|
"learning_rate": 3.355814763399973e-07,
|
|
"loss": 0.6921,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"epoch": 1.0790277371041292,
|
|
"grad_norm": 4.35569429397583,
|
|
"learning_rate": 3.2863181330351325e-07,
|
|
"loss": 0.6793,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"epoch": 1.080366481194829,
|
|
"grad_norm": 5.422229766845703,
|
|
"learning_rate": 3.2175242080234314e-07,
|
|
"loss": 0.6787,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"epoch": 1.081705225285529,
|
|
"grad_norm": 4.800388336181641,
|
|
"learning_rate": 3.1494340232192667e-07,
|
|
"loss": 0.6814,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"epoch": 1.083043969376229,
|
|
"grad_norm": 4.792459487915039,
|
|
"learning_rate": 3.082048602890808e-07,
|
|
"loss": 0.6625,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"epoch": 1.0843827134669288,
|
|
"grad_norm": 6.758553981781006,
|
|
"learning_rate": 3.015368960704584e-07,
|
|
"loss": 0.6693,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 1.0843827134669288,
|
|
"eval_loss": 0.4596273899078369,
|
|
"eval_runtime": 142.9572,
|
|
"eval_samples_per_second": 76.946,
|
|
"eval_steps_per_second": 9.618,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 1.0857214575576288,
|
|
"grad_norm": 3.577047824859619,
|
|
"learning_rate": 2.9493960997102224e-07,
|
|
"loss": 0.6961,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"epoch": 1.0870602016483286,
|
|
"grad_norm": 4.298323154449463,
|
|
"learning_rate": 2.8841310123253865e-07,
|
|
"loss": 0.6738,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"epoch": 1.0883989457390286,
|
|
"grad_norm": 4.555909633636475,
|
|
"learning_rate": 2.819574680320825e-07,
|
|
"loss": 0.6832,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"epoch": 1.0897376898297284,
|
|
"grad_norm": 5.363863468170166,
|
|
"learning_rate": 2.755728074805597e-07,
|
|
"loss": 0.6836,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"epoch": 1.0910764339204284,
|
|
"grad_norm": 4.5835747718811035,
|
|
"learning_rate": 2.6925921562124867e-07,
|
|
"loss": 0.6947,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"epoch": 1.0924151780111284,
|
|
"grad_norm": 4.60358190536499,
|
|
"learning_rate": 2.63016787428354e-07,
|
|
"loss": 0.684,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"epoch": 1.0937539221018282,
|
|
"grad_norm": 5.916420936584473,
|
|
"learning_rate": 2.5684561680557995e-07,
|
|
"loss": 0.6744,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"epoch": 1.0950926661925282,
|
|
"grad_norm": 5.852171897888184,
|
|
"learning_rate": 2.5074579658471266e-07,
|
|
"loss": 0.6827,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"epoch": 1.096431410283228,
|
|
"grad_norm": 4.364650726318359,
|
|
"learning_rate": 2.447174185242324e-07,
|
|
"loss": 0.6809,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"epoch": 1.097770154373928,
|
|
"grad_norm": 4.734222412109375,
|
|
"learning_rate": 2.3876057330792344e-07,
|
|
"loss": 0.6785,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"epoch": 1.0991088984646278,
|
|
"grad_norm": 4.415522575378418,
|
|
"learning_rate": 2.3287535054351716e-07,
|
|
"loss": 0.6998,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"epoch": 1.1004476425553278,
|
|
"grad_norm": 20.79244613647461,
|
|
"learning_rate": 2.2706183876134047e-07,
|
|
"loss": 0.683,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"epoch": 1.1017863866460278,
|
|
"grad_norm": 4.6959757804870605,
|
|
"learning_rate": 2.2132012541298542e-07,
|
|
"loss": 0.6907,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"epoch": 1.1031251307367276,
|
|
"grad_norm": 4.675933361053467,
|
|
"learning_rate": 2.1565029686999306e-07,
|
|
"loss": 0.7026,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"epoch": 1.1044638748274276,
|
|
"grad_norm": 4.374818325042725,
|
|
"learning_rate": 2.1005243842255552e-07,
|
|
"loss": 0.6824,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"epoch": 1.1058026189181274,
|
|
"grad_norm": 5.764030456542969,
|
|
"learning_rate": 2.0452663427823093e-07,
|
|
"loss": 0.6823,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"epoch": 1.1071413630088274,
|
|
"grad_norm": 4.563624858856201,
|
|
"learning_rate": 1.990729675606784e-07,
|
|
"loss": 0.673,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"epoch": 1.1084801070995272,
|
|
"grad_norm": 4.211341381072998,
|
|
"learning_rate": 1.9369152030840553e-07,
|
|
"loss": 0.6821,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"epoch": 1.1098188511902272,
|
|
"grad_norm": 4.860802173614502,
|
|
"learning_rate": 1.8838237347353848e-07,
|
|
"loss": 0.6887,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"epoch": 1.1111575952809272,
|
|
"grad_norm": 5.675398826599121,
|
|
"learning_rate": 1.8314560692059836e-07,
|
|
"loss": 0.674,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"epoch": 1.112496339371627,
|
|
"grad_norm": 4.875771522521973,
|
|
"learning_rate": 1.779812994253055e-07,
|
|
"loss": 0.6933,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"epoch": 1.113835083462327,
|
|
"grad_norm": 4.6181840896606445,
|
|
"learning_rate": 1.728895286733906e-07,
|
|
"loss": 0.6838,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"epoch": 1.1151738275530267,
|
|
"grad_norm": 5.557806968688965,
|
|
"learning_rate": 1.6787037125942706e-07,
|
|
"loss": 0.6803,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"epoch": 1.1165125716437267,
|
|
"grad_norm": 5.17434549331665,
|
|
"learning_rate": 1.6292390268568103e-07,
|
|
"loss": 0.6644,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"epoch": 1.1178513157344265,
|
|
"grad_norm": 3.3707222938537598,
|
|
"learning_rate": 1.5805019736097105e-07,
|
|
"loss": 0.6828,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"epoch": 1.1191900598251265,
|
|
"grad_norm": 4.812671184539795,
|
|
"learning_rate": 1.53249328599554e-07,
|
|
"loss": 0.6826,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"epoch": 1.1205288039158265,
|
|
"grad_norm": 4.712214946746826,
|
|
"learning_rate": 1.4852136862001766e-07,
|
|
"loss": 0.7035,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"epoch": 1.1218675480065263,
|
|
"grad_norm": 20.46932029724121,
|
|
"learning_rate": 1.438663885441982e-07,
|
|
"loss": 0.6732,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"epoch": 1.1232062920972263,
|
|
"grad_norm": 4.6453776359558105,
|
|
"learning_rate": 1.3928445839610782e-07,
|
|
"loss": 0.6889,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"epoch": 1.1245450361879261,
|
|
"grad_norm": 4.666750431060791,
|
|
"learning_rate": 1.3477564710088097e-07,
|
|
"loss": 0.678,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"epoch": 1.1258837802786261,
|
|
"grad_norm": 4.522703647613525,
|
|
"learning_rate": 1.303400224837398e-07,
|
|
"loss": 0.6784,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"epoch": 1.127222524369326,
|
|
"grad_norm": 4.415043354034424,
|
|
"learning_rate": 1.25977651268972e-07,
|
|
"loss": 0.6718,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"epoch": 1.128561268460026,
|
|
"grad_norm": 5.458097457885742,
|
|
"learning_rate": 1.2168859907892904e-07,
|
|
"loss": 0.681,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"epoch": 1.129900012550726,
|
|
"grad_norm": 4.814393997192383,
|
|
"learning_rate": 1.174729304330352e-07,
|
|
"loss": 0.6687,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"epoch": 1.1312387566414257,
|
|
"grad_norm": 4.654358863830566,
|
|
"learning_rate": 1.1333070874682217e-07,
|
|
"loss": 0.6848,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"epoch": 1.1325775007321257,
|
|
"grad_norm": 4.3893537521362305,
|
|
"learning_rate": 1.0926199633097156e-07,
|
|
"loss": 0.6742,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"epoch": 1.1339162448228255,
|
|
"grad_norm": 4.906573295593262,
|
|
"learning_rate": 1.0526685439037843e-07,
|
|
"loss": 0.6822,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"epoch": 1.1352549889135255,
|
|
"grad_norm": 4.322135925292969,
|
|
"learning_rate": 1.0134534302323029e-07,
|
|
"loss": 0.6649,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"epoch": 1.1365937330042255,
|
|
"grad_norm": 4.914889812469482,
|
|
"learning_rate": 9.749752122010347e-08,
|
|
"loss": 0.691,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"epoch": 1.1379324770949253,
|
|
"grad_norm": 5.3613762855529785,
|
|
"learning_rate": 9.372344686307655e-08,
|
|
"loss": 0.684,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 1.1392712211856253,
|
|
"grad_norm": 5.514501571655273,
|
|
"learning_rate": 9.002317672485828e-08,
|
|
"loss": 0.6847,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"epoch": 1.140609965276325,
|
|
"grad_norm": 6.157057762145996,
|
|
"learning_rate": 8.639676646793382e-08,
|
|
"loss": 0.6753,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"epoch": 1.141948709367025,
|
|
"grad_norm": 5.376561164855957,
|
|
"learning_rate": 8.284427064372769e-08,
|
|
"loss": 0.6833,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"epoch": 1.143287453457725,
|
|
"grad_norm": 5.199676036834717,
|
|
"learning_rate": 7.936574269178376e-08,
|
|
"loss": 0.6777,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"epoch": 1.1446261975484249,
|
|
"grad_norm": 4.218641757965088,
|
|
"learning_rate": 7.59612349389599e-08,
|
|
"loss": 0.6811,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"epoch": 1.1459649416391249,
|
|
"grad_norm": 4.764126300811768,
|
|
"learning_rate": 7.263079859864298e-08,
|
|
"loss": 0.6905,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"epoch": 1.1473036857298247,
|
|
"grad_norm": 4.879941463470459,
|
|
"learning_rate": 6.937448376997503e-08,
|
|
"loss": 0.6592,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"epoch": 1.1486424298205247,
|
|
"grad_norm": 5.09310245513916,
|
|
"learning_rate": 6.61923394371039e-08,
|
|
"loss": 0.6711,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"epoch": 1.1499811739112245,
|
|
"grad_norm": 4.643774032592773,
|
|
"learning_rate": 6.308441346844386e-08,
|
|
"loss": 0.6757,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"epoch": 1.1513199180019245,
|
|
"grad_norm": 7.005927085876465,
|
|
"learning_rate": 6.005075261595495e-08,
|
|
"loss": 0.6706,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"epoch": 1.1526586620926245,
|
|
"grad_norm": 4.500913143157959,
|
|
"learning_rate": 5.709140251444201e-08,
|
|
"loss": 0.684,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"epoch": 1.1539974061833242,
|
|
"grad_norm": 5.638104438781738,
|
|
"learning_rate": 5.42064076808646e-08,
|
|
"loss": 0.682,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"epoch": 1.1553361502740243,
|
|
"grad_norm": 5.185242176055908,
|
|
"learning_rate": 5.139581151367312e-08,
|
|
"loss": 0.672,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"epoch": 1.156674894364724,
|
|
"grad_norm": 5.097900390625,
|
|
"learning_rate": 4.865965629214819e-08,
|
|
"loss": 0.6824,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"epoch": 1.158013638455424,
|
|
"grad_norm": 5.027424335479736,
|
|
"learning_rate": 4.599798317577342e-08,
|
|
"loss": 0.7095,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"epoch": 1.1593523825461238,
|
|
"grad_norm": 6.344463348388672,
|
|
"learning_rate": 4.3410832203608645e-08,
|
|
"loss": 0.6706,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"epoch": 1.1606911266368238,
|
|
"grad_norm": 4.783915042877197,
|
|
"learning_rate": 4.0898242293691546e-08,
|
|
"loss": 0.6862,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"epoch": 1.1620298707275238,
|
|
"grad_norm": 5.185044765472412,
|
|
"learning_rate": 3.8460251242451454e-08,
|
|
"loss": 0.6765,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"epoch": 1.1633686148182236,
|
|
"grad_norm": 5.172491550445557,
|
|
"learning_rate": 3.6096895724141435e-08,
|
|
"loss": 0.6944,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"epoch": 1.1647073589089236,
|
|
"grad_norm": 5.616584777832031,
|
|
"learning_rate": 3.3808211290284886e-08,
|
|
"loss": 0.6854,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"epoch": 1.1660461029996234,
|
|
"grad_norm": 5.103434085845947,
|
|
"learning_rate": 3.159423236914261e-08,
|
|
"loss": 0.6846,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"epoch": 1.1673848470903234,
|
|
"grad_norm": 6.16726541519165,
|
|
"learning_rate": 2.9454992265193216e-08,
|
|
"loss": 0.6699,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"epoch": 1.1687235911810232,
|
|
"grad_norm": 3.9044101238250732,
|
|
"learning_rate": 2.7390523158633552e-08,
|
|
"loss": 0.6838,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"epoch": 1.1700623352717232,
|
|
"grad_norm": 6.0656819343566895,
|
|
"learning_rate": 2.5400856104894066e-08,
|
|
"loss": 0.6774,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"epoch": 1.1714010793624232,
|
|
"grad_norm": 6.067554473876953,
|
|
"learning_rate": 2.3486021034170857e-08,
|
|
"loss": 0.6803,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"epoch": 1.172739823453123,
|
|
"grad_norm": 5.079629421234131,
|
|
"learning_rate": 2.1646046750978255e-08,
|
|
"loss": 0.6892,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"epoch": 1.174078567543823,
|
|
"grad_norm": 4.549910545349121,
|
|
"learning_rate": 1.9880960933710836e-08,
|
|
"loss": 0.6925,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"epoch": 1.1754173116345228,
|
|
"grad_norm": 4.135347366333008,
|
|
"learning_rate": 1.8190790134231528e-08,
|
|
"loss": 0.672,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"epoch": 1.1767560557252228,
|
|
"grad_norm": 6.052491188049316,
|
|
"learning_rate": 1.657555977746972e-08,
|
|
"loss": 0.6878,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"epoch": 1.1780947998159226,
|
|
"grad_norm": 5.652344703674316,
|
|
"learning_rate": 1.5035294161039882e-08,
|
|
"loss": 0.6842,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"epoch": 1.1794335439066226,
|
|
"grad_norm": 6.649960041046143,
|
|
"learning_rate": 1.3570016454874658e-08,
|
|
"loss": 0.6935,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"epoch": 1.1807722879973226,
|
|
"grad_norm": 4.435061454772949,
|
|
"learning_rate": 1.2179748700879013e-08,
|
|
"loss": 0.6829,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"epoch": 1.1821110320880224,
|
|
"grad_norm": 5.93247652053833,
|
|
"learning_rate": 1.0864511812594958e-08,
|
|
"loss": 0.6888,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"epoch": 1.1834497761787224,
|
|
"grad_norm": 6.168659687042236,
|
|
"learning_rate": 9.624325574890125e-09,
|
|
"loss": 0.6868,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"epoch": 1.1847885202694222,
|
|
"grad_norm": 5.3353705406188965,
|
|
"learning_rate": 8.459208643659122e-09,
|
|
"loss": 0.6915,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"epoch": 1.1861272643601222,
|
|
"grad_norm": 4.853032112121582,
|
|
"learning_rate": 7.369178545542088e-09,
|
|
"loss": 0.6777,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"epoch": 1.187466008450822,
|
|
"grad_norm": 5.913071632385254,
|
|
"learning_rate": 6.354251677661572e-09,
|
|
"loss": 0.6862,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"epoch": 1.188804752541522,
|
|
"grad_norm": 5.52236270904541,
|
|
"learning_rate": 5.414443307377171e-09,
|
|
"loss": 0.6771,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"epoch": 1.190143496632222,
|
|
"grad_norm": 4.868106365203857,
|
|
"learning_rate": 4.5497675720540535e-09,
|
|
"loss": 0.6686,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"epoch": 1.1914822407229217,
|
|
"grad_norm": 5.625421047210693,
|
|
"learning_rate": 3.760237478849793e-09,
|
|
"loss": 0.6906,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"epoch": 1.1928209848136218,
|
|
"grad_norm": 4.456866264343262,
|
|
"learning_rate": 3.0458649045211897e-09,
|
|
"loss": 0.674,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"epoch": 1.1941597289043215,
|
|
"grad_norm": 6.232851028442383,
|
|
"learning_rate": 2.4066605952444144e-09,
|
|
"loss": 0.6887,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"epoch": 1.1954984729950215,
|
|
"grad_norm": 5.5432586669921875,
|
|
"learning_rate": 1.8426341664529168e-09,
|
|
"loss": 0.6807,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"epoch": 1.1968372170857215,
|
|
"grad_norm": 4.979255199432373,
|
|
"learning_rate": 1.3537941026914302e-09,
|
|
"loss": 0.6847,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"epoch": 1.1981759611764213,
|
|
"grad_norm": 4.797965049743652,
|
|
"learning_rate": 9.401477574932927e-10,
|
|
"loss": 0.6844,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"epoch": 1.1995147052671213,
|
|
"grad_norm": 4.181077003479004,
|
|
"learning_rate": 6.017013532627625e-10,
|
|
"loss": 0.6814,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"epoch": 1.2008534493578211,
|
|
"grad_norm": 5.089372634887695,
|
|
"learning_rate": 3.384599811889766e-10,
|
|
"loss": 0.684,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"epoch": 1.2021921934485211,
|
|
"grad_norm": 5.798556804656982,
|
|
"learning_rate": 1.504276011621286e-10,
|
|
"loss": 0.6819,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"epoch": 1.2035309375392211,
|
|
"grad_norm": 4.701151371002197,
|
|
"learning_rate": 3.760704171962282e-11,
|
|
"loss": 0.6563,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"epoch": 1.204869681629921,
|
|
"grad_norm": 4.564361572265625,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.6951,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 1.204869681629921,
|
|
"eval_loss": 0.4588835537433624,
|
|
"eval_runtime": 143.0034,
|
|
"eval_samples_per_second": 76.921,
|
|
"eval_steps_per_second": 9.615,
|
|
"step": 9000
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 9000,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 1,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 9.590046814432238e+19,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|