latest_10 / trainer_state.json
krishnakalyan3's picture
Upload folder using huggingface_hub (#1)
12e4fbc verified
{
"best_metric": 0.12010584022747946,
"best_model_checkpoint": "/workspace/disk2/krishna/checkpoints/checkpoint-940",
"epoch": 0.097,
"eval_steps": 10,
"global_step": 970,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001,
"grad_norm": 0.0356324203312397,
"learning_rate": 1e-05,
"loss": 0.1207,
"step": 10
},
{
"epoch": 0.001,
"eval_cos_sim": 0.8792359232902527,
"eval_loss": 0.12173220255124045,
"eval_runtime": 171.7728,
"eval_samples_per_second": 23.287,
"eval_steps_per_second": 0.367,
"step": 10
},
{
"epoch": 0.002,
"grad_norm": 0.030419372022151947,
"learning_rate": 2e-05,
"loss": 0.1213,
"step": 20
},
{
"epoch": 0.002,
"eval_cos_sim": 0.8789854049682617,
"eval_loss": 0.12198310520398092,
"eval_runtime": 159.1521,
"eval_samples_per_second": 25.133,
"eval_steps_per_second": 0.396,
"step": 20
},
{
"epoch": 0.003,
"grad_norm": 0.033041320741176605,
"learning_rate": 3e-05,
"loss": 0.1204,
"step": 30
},
{
"epoch": 0.003,
"eval_cos_sim": 0.8790653347969055,
"eval_loss": 0.12189955379712057,
"eval_runtime": 161.0741,
"eval_samples_per_second": 24.833,
"eval_steps_per_second": 0.391,
"step": 30
},
{
"epoch": 0.004,
"grad_norm": 0.04209210351109505,
"learning_rate": 4e-05,
"loss": 0.1213,
"step": 40
},
{
"epoch": 0.004,
"eval_cos_sim": 0.8792155385017395,
"eval_loss": 0.12175503399121237,
"eval_runtime": 159.4228,
"eval_samples_per_second": 25.091,
"eval_steps_per_second": 0.395,
"step": 40
},
{
"epoch": 0.005,
"grad_norm": 0.03182140365242958,
"learning_rate": 5e-05,
"loss": 0.1203,
"step": 50
},
{
"epoch": 0.005,
"eval_cos_sim": 0.8791962265968323,
"eval_loss": 0.12176946785199118,
"eval_runtime": 160.3207,
"eval_samples_per_second": 24.95,
"eval_steps_per_second": 0.393,
"step": 50
},
{
"epoch": 0.006,
"grad_norm": 0.05823719501495361,
"learning_rate": 2.4802665827257164e-05,
"loss": 0.1213,
"step": 60
},
{
"epoch": 0.006,
"eval_cos_sim": 0.8791635036468506,
"eval_loss": 0.12179789688336325,
"eval_runtime": 164.7575,
"eval_samples_per_second": 24.278,
"eval_steps_per_second": 0.382,
"step": 60
},
{
"epoch": 0.007,
"grad_norm": 0.02305755950510502,
"learning_rate": 4.999688473794144e-05,
"loss": 0.1211,
"step": 70
},
{
"epoch": 0.007,
"eval_cos_sim": 0.8792662024497986,
"eval_loss": 0.12169700815426779,
"eval_runtime": 159.8158,
"eval_samples_per_second": 25.029,
"eval_steps_per_second": 0.394,
"step": 70
},
{
"epoch": 0.008,
"grad_norm": 0.03906348720192909,
"learning_rate": 2.4408046661584414e-05,
"loss": 0.1201,
"step": 80
},
{
"epoch": 0.008,
"eval_cos_sim": 0.879157543182373,
"eval_loss": 0.12181253530728293,
"eval_runtime": 162.3324,
"eval_samples_per_second": 24.641,
"eval_steps_per_second": 0.388,
"step": 80
},
{
"epoch": 0.009,
"grad_norm": 0.0275803804397583,
"learning_rate": 4.998753972815434e-05,
"loss": 0.1208,
"step": 90
},
{
"epoch": 0.009,
"eval_cos_sim": 0.8792516589164734,
"eval_loss": 0.121718086264009,
"eval_runtime": 158.5035,
"eval_samples_per_second": 25.236,
"eval_steps_per_second": 0.397,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 0.03248042240738869,
"learning_rate": 2.4013575023093667e-05,
"loss": 0.1224,
"step": 100
},
{
"epoch": 0.01,
"eval_cos_sim": 0.879462718963623,
"eval_loss": 0.1215014844154067,
"eval_runtime": 159.5892,
"eval_samples_per_second": 25.064,
"eval_steps_per_second": 0.395,
"step": 100
},
{
"epoch": 0.011,
"grad_norm": 0.03436814621090889,
"learning_rate": 4.9971967299611097e-05,
"loss": 0.1205,
"step": 110
},
{
"epoch": 0.011,
"eval_cos_sim": 0.8795427680015564,
"eval_loss": 0.1214234667037673,
"eval_runtime": 168.2892,
"eval_samples_per_second": 23.769,
"eval_steps_per_second": 0.374,
"step": 110
},
{
"epoch": 0.012,
"grad_norm": 0.03663235530257225,
"learning_rate": 2.3619349222387287e-05,
"loss": 0.1209,
"step": 120
},
{
"epoch": 0.012,
"eval_cos_sim": 0.8793898224830627,
"eval_loss": 0.1215821056579299,
"eval_runtime": 170.7269,
"eval_samples_per_second": 23.429,
"eval_steps_per_second": 0.369,
"step": 120
},
{
"epoch": 0.013,
"grad_norm": 0.03549114614725113,
"learning_rate": 4.9950171333287335e-05,
"loss": 0.1218,
"step": 130
},
{
"epoch": 0.013,
"eval_cos_sim": 0.8795300722122192,
"eval_loss": 0.12144066002118063,
"eval_runtime": 162.7257,
"eval_samples_per_second": 24.581,
"eval_steps_per_second": 0.387,
"step": 130
},
{
"epoch": 0.014,
"grad_norm": 0.03164505586028099,
"learning_rate": 2.3225467508799633e-05,
"loss": 0.1208,
"step": 140
},
{
"epoch": 0.014,
"eval_cos_sim": 0.8797659873962402,
"eval_loss": 0.12119961311566306,
"eval_runtime": 163.5115,
"eval_samples_per_second": 24.463,
"eval_steps_per_second": 0.385,
"step": 140
},
{
"epoch": 0.015,
"grad_norm": 0.031108180060982704,
"learning_rate": 4.992215726119483e-05,
"loss": 0.1213,
"step": 150
},
{
"epoch": 0.015,
"eval_cos_sim": 0.8797353506088257,
"eval_loss": 0.1212306742881484,
"eval_runtime": 165.866,
"eval_samples_per_second": 24.116,
"eval_steps_per_second": 0.38,
"step": 150
},
{
"epoch": 0.016,
"grad_norm": 0.030103642493486404,
"learning_rate": 2.2832028045911203e-05,
"loss": 0.1209,
"step": 160
},
{
"epoch": 0.016,
"eval_cos_sim": 0.8793777823448181,
"eval_loss": 0.12159336998211813,
"eval_runtime": 171.7298,
"eval_samples_per_second": 23.292,
"eval_steps_per_second": 0.367,
"step": 160
},
{
"epoch": 0.017,
"grad_norm": 0.05055614188313484,
"learning_rate": 4.9887932065027656e-05,
"loss": 0.1204,
"step": 170
},
{
"epoch": 0.017,
"eval_cos_sim": 0.8795183300971985,
"eval_loss": 0.1214520344947524,
"eval_runtime": 162.2461,
"eval_samples_per_second": 24.654,
"eval_steps_per_second": 0.388,
"step": 170
},
{
"epoch": 0.018,
"grad_norm": 0.03837039694190025,
"learning_rate": 2.2439128887084646e-05,
"loss": 0.1202,
"step": 180
},
{
"epoch": 0.018,
"eval_cos_sim": 0.8797397017478943,
"eval_loss": 0.12122445728527975,
"eval_runtime": 161.451,
"eval_samples_per_second": 24.775,
"eval_steps_per_second": 0.39,
"step": 180
},
{
"epoch": 0.019,
"grad_norm": 0.03563898801803589,
"learning_rate": 4.98475042744222e-05,
"loss": 0.1221,
"step": 190
},
{
"epoch": 0.019,
"eval_cos_sim": 0.8797785639762878,
"eval_loss": 0.12118578769909812,
"eval_runtime": 157.9064,
"eval_samples_per_second": 25.331,
"eval_steps_per_second": 0.399,
"step": 190
},
{
"epoch": 0.02,
"grad_norm": 0.0392858162522316,
"learning_rate": 2.204686795102736e-05,
"loss": 0.1204,
"step": 200
},
{
"epoch": 0.02,
"eval_cos_sim": 0.8796395063400269,
"eval_loss": 0.12133027555691672,
"eval_runtime": 163.7884,
"eval_samples_per_second": 24.422,
"eval_steps_per_second": 0.385,
"step": 200
},
{
"epoch": 0.021,
"grad_norm": 0.04556349664926529,
"learning_rate": 4.980088396483144e-05,
"loss": 0.1205,
"step": 210
},
{
"epoch": 0.021,
"eval_cos_sim": 0.8796445727348328,
"eval_loss": 0.12132433941113424,
"eval_runtime": 164.6589,
"eval_samples_per_second": 24.293,
"eval_steps_per_second": 0.383,
"step": 210
},
{
"epoch": 0.022,
"grad_norm": 0.030130930244922638,
"learning_rate": 2.1655342997387947e-05,
"loss": 0.1201,
"step": 220
},
{
"epoch": 0.022,
"eval_cos_sim": 0.8796879649162292,
"eval_loss": 0.12127337600933981,
"eval_runtime": 163.052,
"eval_samples_per_second": 24.532,
"eval_steps_per_second": 0.386,
"step": 220
},
{
"epoch": 0.023,
"grad_norm": 0.027453621849417686,
"learning_rate": 4.9748082755013934e-05,
"loss": 0.1205,
"step": 230
},
{
"epoch": 0.023,
"eval_cos_sim": 0.8797481060028076,
"eval_loss": 0.12121248771893454,
"eval_runtime": 159.6988,
"eval_samples_per_second": 25.047,
"eval_steps_per_second": 0.394,
"step": 230
},
{
"epoch": 0.024,
"grad_norm": 0.029768602922558784,
"learning_rate": 2.126465160239341e-05,
"loss": 0.1206,
"step": 240
},
{
"epoch": 0.024,
"eval_cos_sim": 0.8797679543495178,
"eval_loss": 0.12119435026394797,
"eval_runtime": 170.5172,
"eval_samples_per_second": 23.458,
"eval_steps_per_second": 0.369,
"step": 240
},
{
"epoch": 0.025,
"grad_norm": 0.025975426658988,
"learning_rate": 4.968911380413809e-05,
"loss": 0.1206,
"step": 250
},
{
"epoch": 0.025,
"eval_cos_sim": 0.8798050284385681,
"eval_loss": 0.12115855839001609,
"eval_runtime": 162.5635,
"eval_samples_per_second": 24.606,
"eval_steps_per_second": 0.388,
"step": 250
},
{
"epoch": 0.026,
"grad_norm": 0.032136961817741394,
"learning_rate": 2.0874891134530094e-05,
"loss": 0.1207,
"step": 260
},
{
"epoch": 0.026,
"eval_cos_sim": 0.8799233436584473,
"eval_loss": 0.12104250910031271,
"eval_runtime": 171.0657,
"eval_samples_per_second": 23.383,
"eval_steps_per_second": 0.368,
"step": 260
},
{
"epoch": 0.027,
"grad_norm": 0.035989198833703995,
"learning_rate": 4.962399180850275e-05,
"loss": 0.12,
"step": 270
},
{
"epoch": 0.027,
"eval_cos_sim": 0.8800029754638672,
"eval_loss": 0.12096367742764426,
"eval_runtime": 162.5704,
"eval_samples_per_second": 24.605,
"eval_steps_per_second": 0.388,
"step": 270
},
{
"epoch": 0.028,
"grad_norm": 0.02917526848614216,
"learning_rate": 2.0486158730277393e-05,
"loss": 0.1205,
"step": 280
},
{
"epoch": 0.028,
"eval_cos_sim": 0.8800209164619446,
"eval_loss": 0.12094438698040914,
"eval_runtime": 163.135,
"eval_samples_per_second": 24.52,
"eval_steps_per_second": 0.386,
"step": 280
},
{
"epoch": 0.029,
"grad_norm": 0.040587518364191055,
"learning_rate": 4.955273299787453e-05,
"loss": 0.1204,
"step": 290
},
{
"epoch": 0.029,
"eval_cos_sim": 0.8800665140151978,
"eval_loss": 0.12090009071576072,
"eval_runtime": 160.8422,
"eval_samples_per_second": 24.869,
"eval_steps_per_second": 0.392,
"step": 290
},
{
"epoch": 0.03,
"grad_norm": 0.02535935305058956,
"learning_rate": 2.00985512699005e-05,
"loss": 0.121,
"step": 300
},
{
"epoch": 0.03,
"eval_cos_sim": 0.8799148201942444,
"eval_loss": 0.12105432750927878,
"eval_runtime": 162.6443,
"eval_samples_per_second": 24.594,
"eval_steps_per_second": 0.387,
"step": 300
},
{
"epoch": 0.031,
"grad_norm": 0.027923179790377617,
"learning_rate": 4.947535513144286e-05,
"loss": 0.1197,
"step": 310
},
{
"epoch": 0.031,
"eval_cos_sim": 0.8799843788146973,
"eval_loss": 0.120985016367311,
"eval_runtime": 165.6185,
"eval_samples_per_second": 24.152,
"eval_steps_per_second": 0.38,
"step": 310
},
{
"epoch": 0.032,
"grad_norm": 0.025140805169939995,
"learning_rate": 1.9712165353304617e-05,
"loss": 0.1199,
"step": 320
},
{
"epoch": 0.032,
"eval_cos_sim": 0.8800230622291565,
"eval_loss": 0.1209463920806594,
"eval_runtime": 161.2564,
"eval_samples_per_second": 24.805,
"eval_steps_per_second": 0.391,
"step": 320
},
{
"epoch": 0.033,
"grad_norm": 0.03448393940925598,
"learning_rate": 4.9391877493394335e-05,
"loss": 0.1205,
"step": 330
},
{
"epoch": 0.033,
"eval_cos_sim": 0.8800433278083801,
"eval_loss": 0.12092636968838645,
"eval_runtime": 165.583,
"eval_samples_per_second": 24.157,
"eval_steps_per_second": 0.38,
"step": 330
},
{
"epoch": 0.034,
"grad_norm": 0.027445893734693527,
"learning_rate": 1.9327097275960212e-05,
"loss": 0.1208,
"step": 340
},
{
"epoch": 0.034,
"eval_cos_sim": 0.8797867894172668,
"eval_loss": 0.12118107464062644,
"eval_runtime": 169.9936,
"eval_samples_per_second": 23.53,
"eval_steps_per_second": 0.371,
"step": 340
},
{
"epoch": 0.035,
"grad_norm": 0.032430149614810944,
"learning_rate": 4.9302320888106454e-05,
"loss": 0.1192,
"step": 350
},
{
"epoch": 0.035,
"eval_cos_sim": 0.8799253106117249,
"eval_loss": 0.12104199745404196,
"eval_runtime": 162.0388,
"eval_samples_per_second": 24.685,
"eval_steps_per_second": 0.389,
"step": 350
},
{
"epoch": 0.036,
"grad_norm": 0.03066575713455677,
"learning_rate": 1.894344300490539e-05,
"loss": 0.1207,
"step": 360
},
{
"epoch": 0.036,
"eval_cos_sim": 0.8800117373466492,
"eval_loss": 0.12095709469067527,
"eval_runtime": 168.2521,
"eval_samples_per_second": 23.774,
"eval_steps_per_second": 0.374,
"step": 360
},
{
"epoch": 0.037,
"grad_norm": 0.04023744910955429,
"learning_rate": 4.920670763496264e-05,
"loss": 0.1206,
"step": 370
},
{
"epoch": 0.037,
"eval_cos_sim": 0.8800341486930847,
"eval_loss": 0.12093350794064474,
"eval_runtime": 164.2446,
"eval_samples_per_second": 24.354,
"eval_steps_per_second": 0.384,
"step": 370
},
{
"epoch": 0.038,
"grad_norm": 0.03345053270459175,
"learning_rate": 1.8561298154827563e-05,
"loss": 0.1207,
"step": 380
},
{
"epoch": 0.038,
"eval_cos_sim": 0.8800336122512817,
"eval_loss": 0.12093256904828024,
"eval_runtime": 158.5429,
"eval_samples_per_second": 25.23,
"eval_steps_per_second": 0.397,
"step": 380
},
{
"epoch": 0.039,
"grad_norm": 0.02383916825056076,
"learning_rate": 4.910506156279026e-05,
"loss": 0.1213,
"step": 390
},
{
"epoch": 0.039,
"eval_cos_sim": 0.8800181150436401,
"eval_loss": 0.12094816543805074,
"eval_runtime": 164.6828,
"eval_samples_per_second": 24.289,
"eval_steps_per_second": 0.383,
"step": 390
},
{
"epoch": 0.04,
"grad_norm": 0.03217790648341179,
"learning_rate": 1.8180757964234907e-05,
"loss": 0.1213,
"step": 400
},
{
"epoch": 0.04,
"eval_cos_sim": 0.8800681829452515,
"eval_loss": 0.12089718677746726,
"eval_runtime": 162.9873,
"eval_samples_per_second": 24.542,
"eval_steps_per_second": 0.387,
"step": 400
},
{
"epoch": 0.041,
"grad_norm": 0.03514571115374565,
"learning_rate": 4.8997408003921466e-05,
"loss": 0.1208,
"step": 410
},
{
"epoch": 0.041,
"eval_cos_sim": 0.8801241517066956,
"eval_loss": 0.12084018089520407,
"eval_runtime": 167.298,
"eval_samples_per_second": 23.909,
"eval_steps_per_second": 0.377,
"step": 410
},
{
"epoch": 0.042,
"grad_norm": 0.03063860908150673,
"learning_rate": 1.780191727172083e-05,
"loss": 0.1207,
"step": 420
},
{
"epoch": 0.042,
"eval_cos_sim": 0.8799417018890381,
"eval_loss": 0.12102477314221335,
"eval_runtime": 165.8735,
"eval_samples_per_second": 24.115,
"eval_steps_per_second": 0.38,
"step": 420
},
{
"epoch": 0.043,
"grad_norm": 0.0319770872592926,
"learning_rate": 4.8883773787879826e-05,
"loss": 0.1205,
"step": 430
},
{
"epoch": 0.043,
"eval_cos_sim": 0.8800466060638428,
"eval_loss": 0.12091960479962302,
"eval_runtime": 163.3913,
"eval_samples_per_second": 24.481,
"eval_steps_per_second": 0.386,
"step": 430
},
{
"epoch": 0.044,
"grad_norm": 0.02543482929468155,
"learning_rate": 1.742487049232818e-05,
"loss": 0.1202,
"step": 440
},
{
"epoch": 0.044,
"eval_cos_sim": 0.8802583813667297,
"eval_loss": 0.12070597412335349,
"eval_runtime": 160.473,
"eval_samples_per_second": 24.926,
"eval_steps_per_second": 0.393,
"step": 440
},
{
"epoch": 0.045,
"grad_norm": 0.024107394739985466,
"learning_rate": 4.876418723469453e-05,
"loss": 0.1196,
"step": 450
},
{
"epoch": 0.045,
"eval_cos_sim": 0.8802623748779297,
"eval_loss": 0.12070202877270651,
"eval_runtime": 168.2567,
"eval_samples_per_second": 23.773,
"eval_steps_per_second": 0.374,
"step": 450
},
{
"epoch": 0.046,
"grad_norm": 0.04505016654729843,
"learning_rate": 1.7049711594019046e-05,
"loss": 0.1197,
"step": 460
},
{
"epoch": 0.046,
"eval_cos_sim": 0.8800992965698242,
"eval_loss": 0.12086664869534446,
"eval_runtime": 168.3415,
"eval_samples_per_second": 23.761,
"eval_steps_per_second": 0.374,
"step": 460
},
{
"epoch": 0.047,
"grad_norm": 0.026298915967345238,
"learning_rate": 4.8638678147841726e-05,
"loss": 0.1207,
"step": 470
},
{
"epoch": 0.047,
"eval_cos_sim": 0.8801872134208679,
"eval_loss": 0.12077882673489523,
"eval_runtime": 171.1708,
"eval_samples_per_second": 23.368,
"eval_steps_per_second": 0.368,
"step": 470
},
{
"epoch": 0.048,
"grad_norm": 0.04072026535868645,
"learning_rate": 1.667653407425599e-05,
"loss": 0.12,
"step": 480
},
{
"epoch": 0.048,
"eval_cos_sim": 0.8804126977920532,
"eval_loss": 0.12055267622219992,
"eval_runtime": 160.8906,
"eval_samples_per_second": 24.862,
"eval_steps_per_second": 0.392,
"step": 480
},
{
"epoch": 0.049,
"grad_norm": 0.02353891357779503,
"learning_rate": 4.850727780681685e-05,
"loss": 0.121,
"step": 490
},
{
"epoch": 0.049,
"eval_cos_sim": 0.8802867531776428,
"eval_loss": 0.12067857982861471,
"eval_runtime": 162.0814,
"eval_samples_per_second": 24.679,
"eval_steps_per_second": 0.389,
"step": 490
},
{
"epoch": 0.05,
"grad_norm": 0.03163010999560356,
"learning_rate": 1.6305430936700462e-05,
"loss": 0.1206,
"step": 500
},
{
"epoch": 0.05,
"eval_cos_sim": 0.8799078464508057,
"eval_loss": 0.12105564882504416,
"eval_runtime": 159.7041,
"eval_samples_per_second": 25.046,
"eval_steps_per_second": 0.394,
"step": 500
},
{
"epoch": 0.051,
"grad_norm": 0.03480914607644081,
"learning_rate": 4.8370018959339916e-05,
"loss": 0.1193,
"step": 510
},
{
"epoch": 0.051,
"eval_cos_sim": 0.8801398873329163,
"eval_loss": 0.12082168438183737,
"eval_runtime": 168.1565,
"eval_samples_per_second": 23.787,
"eval_steps_per_second": 0.375,
"step": 510
},
{
"epoch": 0.052,
"grad_norm": 0.031566403806209564,
"learning_rate": 1.5936494668034417e-05,
"loss": 0.1207,
"step": 520
},
{
"epoch": 0.052,
"eval_cos_sim": 0.8804723024368286,
"eval_loss": 0.12048741771923971,
"eval_runtime": 162.0779,
"eval_samples_per_second": 24.679,
"eval_steps_per_second": 0.389,
"step": 520
},
{
"epoch": 0.053,
"grad_norm": 0.02134857140481472,
"learning_rate": 4.822693581319333e-05,
"loss": 0.1207,
"step": 530
},
{
"epoch": 0.053,
"eval_cos_sim": 0.8804084062576294,
"eval_loss": 0.12055278637158347,
"eval_runtime": 165.0027,
"eval_samples_per_second": 24.242,
"eval_steps_per_second": 0.382,
"step": 530
},
{
"epoch": 0.054,
"grad_norm": 0.02998766116797924,
"learning_rate": 1.5569817214910634e-05,
"loss": 0.1206,
"step": 540
},
{
"epoch": 0.054,
"eval_cos_sim": 0.879996657371521,
"eval_loss": 0.12096652509915305,
"eval_runtime": 269.6523,
"eval_samples_per_second": 14.834,
"eval_steps_per_second": 0.234,
"step": 540
},
{
"epoch": 0.055,
"grad_norm": 0.023394938558340073,
"learning_rate": 4.807806402769648e-05,
"loss": 0.1204,
"step": 550
},
{
"epoch": 0.055,
"eval_cos_sim": 0.8802942037582397,
"eval_loss": 0.12066655971753074,
"eval_runtime": 241.2043,
"eval_samples_per_second": 16.583,
"eval_steps_per_second": 0.261,
"step": 550
},
{
"epoch": 0.056,
"grad_norm": 0.04035342484712601,
"learning_rate": 1.520548996103771e-05,
"loss": 0.1208,
"step": 560
},
{
"epoch": 0.056,
"eval_cos_sim": 0.8805367946624756,
"eval_loss": 0.12042092802273703,
"eval_runtime": 217.8859,
"eval_samples_per_second": 18.358,
"eval_steps_per_second": 0.289,
"step": 560
},
{
"epoch": 0.057,
"grad_norm": 0.02704194188117981,
"learning_rate": 4.7923440704819685e-05,
"loss": 0.1205,
"step": 570
},
{
"epoch": 0.057,
"eval_cos_sim": 0.8805016875267029,
"eval_loss": 0.12045616819607688,
"eval_runtime": 163.5639,
"eval_samples_per_second": 24.455,
"eval_steps_per_second": 0.385,
"step": 570
},
{
"epoch": 0.058,
"grad_norm": 0.041525471955537796,
"learning_rate": 1.4843603704405321e-05,
"loss": 0.1209,
"step": 580
},
{
"epoch": 0.058,
"eval_cos_sim": 0.8803950548171997,
"eval_loss": 0.12056205751645041,
"eval_runtime": 163.8454,
"eval_samples_per_second": 24.413,
"eval_steps_per_second": 0.385,
"step": 580
},
{
"epoch": 0.059,
"grad_norm": 0.02588295191526413,
"learning_rate": 4.7763104379936636e-05,
"loss": 0.12,
"step": 590
},
{
"epoch": 0.059,
"eval_cos_sim": 0.8804982304573059,
"eval_loss": 0.12045794346081687,
"eval_runtime": 175.2999,
"eval_samples_per_second": 22.818,
"eval_steps_per_second": 0.359,
"step": 590
},
{
"epoch": 0.06,
"grad_norm": 0.030644405633211136,
"learning_rate": 1.4484248634655188e-05,
"loss": 0.1211,
"step": 600
},
{
"epoch": 0.06,
"eval_cos_sim": 0.8804518580436707,
"eval_loss": 0.12050184681164694,
"eval_runtime": 165.1748,
"eval_samples_per_second": 24.217,
"eval_steps_per_second": 0.381,
"step": 600
},
{
"epoch": 0.061,
"grad_norm": 0.03162102401256561,
"learning_rate": 4.7597095012220556e-05,
"loss": 0.1194,
"step": 610
},
{
"epoch": 0.061,
"eval_cos_sim": 0.8805278539657593,
"eval_loss": 0.12042546226727438,
"eval_runtime": 169.0168,
"eval_samples_per_second": 23.666,
"eval_steps_per_second": 0.373,
"step": 610
},
{
"epoch": 0.062,
"grad_norm": 0.030952898785471916,
"learning_rate": 1.4127514310605238e-05,
"loss": 0.1202,
"step": 620
},
{
"epoch": 0.062,
"eval_cos_sim": 0.880526602268219,
"eval_loss": 0.12043014385449362,
"eval_runtime": 161.73,
"eval_samples_per_second": 24.733,
"eval_steps_per_second": 0.39,
"step": 620
},
{
"epoch": 0.063,
"grad_norm": 0.025900695472955704,
"learning_rate": 4.742545397468656e-05,
"loss": 0.1205,
"step": 630
},
{
"epoch": 0.063,
"eval_cos_sim": 0.8804138898849487,
"eval_loss": 0.12054368068921043,
"eval_runtime": 169.1165,
"eval_samples_per_second": 23.652,
"eval_steps_per_second": 0.373,
"step": 630
},
{
"epoch": 0.064,
"grad_norm": 0.02121679112315178,
"learning_rate": 1.3773489637927061e-05,
"loss": 0.1208,
"step": 640
},
{
"epoch": 0.064,
"eval_cos_sim": 0.8801329731941223,
"eval_loss": 0.120824953577394,
"eval_runtime": 161.444,
"eval_samples_per_second": 24.776,
"eval_steps_per_second": 0.39,
"step": 640
},
{
"epoch": 0.065,
"grad_norm": 0.02153482660651207,
"learning_rate": 4.7248224043879605e-05,
"loss": 0.1211,
"step": 650
},
{
"epoch": 0.065,
"eval_cos_sim": 0.8802359104156494,
"eval_loss": 0.12072229530560447,
"eval_runtime": 161.7761,
"eval_samples_per_second": 24.726,
"eval_steps_per_second": 0.389,
"step": 650
},
{
"epoch": 0.066,
"grad_norm": 0.030305592343211174,
"learning_rate": 1.342226284699112e-05,
"loss": 0.1202,
"step": 660
},
{
"epoch": 0.066,
"eval_cos_sim": 0.8804138898849487,
"eval_loss": 0.12054470445858909,
"eval_runtime": 166.8157,
"eval_samples_per_second": 23.979,
"eval_steps_per_second": 0.378,
"step": 660
},
{
"epoch": 0.067,
"grad_norm": 0.03320358693599701,
"learning_rate": 4.7065449389213644e-05,
"loss": 0.1216,
"step": 670
},
{
"epoch": 0.067,
"eval_cos_sim": 0.8803730607032776,
"eval_loss": 0.12058561565625144,
"eval_runtime": 168.9555,
"eval_samples_per_second": 23.675,
"eval_steps_per_second": 0.373,
"step": 670
},
{
"epoch": 0.068,
"grad_norm": 0.03419356420636177,
"learning_rate": 1.3073921470878081e-05,
"loss": 0.1197,
"step": 680
},
{
"epoch": 0.068,
"eval_cos_sim": 0.8803737163543701,
"eval_loss": 0.12058660843121481,
"eval_runtime": 162.073,
"eval_samples_per_second": 24.68,
"eval_steps_per_second": 0.389,
"step": 680
},
{
"epoch": 0.069,
"grad_norm": 0.022392097860574722,
"learning_rate": 4.6877175561964684e-05,
"loss": 0.12,
"step": 690
},
{
"epoch": 0.069,
"eval_cos_sim": 0.8804360628128052,
"eval_loss": 0.12052560474621725,
"eval_runtime": 184.9748,
"eval_samples_per_second": 21.625,
"eval_steps_per_second": 0.341,
"step": 690
},
{
"epoch": 0.07,
"grad_norm": 0.03007390908896923,
"learning_rate": 1.272855232356e-05,
"loss": 0.1204,
"step": 700
},
{
"epoch": 0.07,
"eval_cos_sim": 0.8805059790611267,
"eval_loss": 0.12045389034497214,
"eval_runtime": 165.5078,
"eval_samples_per_second": 24.168,
"eval_steps_per_second": 0.381,
"step": 700
},
{
"epoch": 0.071,
"grad_norm": 0.020742209628224373,
"learning_rate": 4.6683449483917846e-05,
"loss": 0.12,
"step": 710
},
{
"epoch": 0.071,
"eval_cos_sim": 0.8806586861610413,
"eval_loss": 0.12030088522183371,
"eval_runtime": 172.1855,
"eval_samples_per_second": 23.231,
"eval_steps_per_second": 0.366,
"step": 710
},
{
"epoch": 0.072,
"grad_norm": 0.023063719272613525,
"learning_rate": 1.2386241478270652e-05,
"loss": 0.1198,
"step": 720
},
{
"epoch": 0.072,
"eval_cos_sim": 0.8805540204048157,
"eval_loss": 0.12040612079846334,
"eval_runtime": 166.7801,
"eval_samples_per_second": 23.984,
"eval_steps_per_second": 0.378,
"step": 720
},
{
"epoch": 0.073,
"grad_norm": 0.027647124603390694,
"learning_rate": 4.648431943567264e-05,
"loss": 0.1205,
"step": 730
},
{
"epoch": 0.073,
"eval_cos_sim": 0.8805664777755737,
"eval_loss": 0.12039038088070822,
"eval_runtime": 164.5025,
"eval_samples_per_second": 24.316,
"eval_steps_per_second": 0.383,
"step": 730
},
{
"epoch": 0.074,
"grad_norm": 0.02208826318383217,
"learning_rate": 1.204707424604792e-05,
"loss": 0.1203,
"step": 740
},
{
"epoch": 0.074,
"eval_cos_sim": 0.8805152177810669,
"eval_loss": 0.12043928005444479,
"eval_runtime": 178.0321,
"eval_samples_per_second": 22.468,
"eval_steps_per_second": 0.354,
"step": 740
},
{
"epoch": 0.075,
"grad_norm": 0.021549325436353683,
"learning_rate": 4.627983504461235e-05,
"loss": 0.1196,
"step": 750
},
{
"epoch": 0.075,
"eval_cos_sim": 0.8806087374687195,
"eval_loss": 0.12034820940243673,
"eval_runtime": 179.3479,
"eval_samples_per_second": 22.303,
"eval_steps_per_second": 0.351,
"step": 750
},
{
"epoch": 0.076,
"grad_norm": 0.028022369369864464,
"learning_rate": 1.1711135154477562e-05,
"loss": 0.1207,
"step": 760
},
{
"epoch": 0.076,
"eval_cos_sim": 0.8805749416351318,
"eval_loss": 0.12038306286084127,
"eval_runtime": 176.8723,
"eval_samples_per_second": 22.615,
"eval_steps_per_second": 0.356,
"step": 760
},
{
"epoch": 0.077,
"grad_norm": 0.021017303690314293,
"learning_rate": 4.607004727253391e-05,
"loss": 0.12,
"step": 770
},
{
"epoch": 0.077,
"eval_cos_sim": 0.8806374669075012,
"eval_loss": 0.12032015850293112,
"eval_runtime": 163.1866,
"eval_samples_per_second": 24.512,
"eval_steps_per_second": 0.386,
"step": 770
},
{
"epoch": 0.078,
"grad_norm": 0.02246786840260029,
"learning_rate": 1.1378507926623572e-05,
"loss": 0.1199,
"step": 780
},
{
"epoch": 0.078,
"eval_cos_sim": 0.8806332349777222,
"eval_loss": 0.12032350780713034,
"eval_runtime": 163.0562,
"eval_samples_per_second": 24.531,
"eval_steps_per_second": 0.386,
"step": 780
},
{
"epoch": 0.079,
"grad_norm": 0.021708086133003235,
"learning_rate": 4.585500840294793e-05,
"loss": 0.1201,
"step": 790
},
{
"epoch": 0.079,
"eval_cos_sim": 0.8807349801063538,
"eval_loss": 0.12022322513806295,
"eval_runtime": 173.5217,
"eval_samples_per_second": 23.052,
"eval_steps_per_second": 0.363,
"step": 790
},
{
"epoch": 0.08,
"grad_norm": 0.034823887050151825,
"learning_rate": 1.1049275460164102e-05,
"loss": 0.1204,
"step": 800
},
{
"epoch": 0.08,
"eval_cos_sim": 0.8806157112121582,
"eval_loss": 0.12034289600598289,
"eval_runtime": 163.4003,
"eval_samples_per_second": 24.48,
"eval_steps_per_second": 0.386,
"step": 800
},
{
"epoch": 0.081,
"grad_norm": 0.02099907584488392,
"learning_rate": 4.563477202804924e-05,
"loss": 0.1203,
"step": 810
},
{
"epoch": 0.081,
"eval_cos_sim": 0.8805558681488037,
"eval_loss": 0.1204009156440444,
"eval_runtime": 171.4073,
"eval_samples_per_second": 23.336,
"eval_steps_per_second": 0.368,
"step": 810
},
{
"epoch": 0.082,
"grad_norm": 0.027718910947442055,
"learning_rate": 1.0723519806732512e-05,
"loss": 0.1206,
"step": 820
},
{
"epoch": 0.082,
"eval_cos_sim": 0.8804323673248291,
"eval_loss": 0.12052262928235007,
"eval_runtime": 166.5171,
"eval_samples_per_second": 24.022,
"eval_steps_per_second": 0.378,
"step": 820
},
{
"epoch": 0.083,
"grad_norm": 0.030117569491267204,
"learning_rate": 4.540939303535997e-05,
"loss": 0.1208,
"step": 830
},
{
"epoch": 0.083,
"eval_cos_sim": 0.8806024193763733,
"eval_loss": 0.12035309459912252,
"eval_runtime": 172.9809,
"eval_samples_per_second": 23.124,
"eval_steps_per_second": 0.364,
"step": 830
},
{
"epoch": 0.084,
"grad_norm": 0.025621019303798676,
"learning_rate": 1.0401322151467458e-05,
"loss": 0.1207,
"step": 840
},
{
"epoch": 0.084,
"eval_cos_sim": 0.8805838823318481,
"eval_loss": 0.12037316419827414,
"eval_runtime": 161.0298,
"eval_samples_per_second": 24.84,
"eval_steps_per_second": 0.391,
"step": 840
},
{
"epoch": 0.085,
"grad_norm": 0.037043701857328415,
"learning_rate": 4.517892759404947e-05,
"loss": 0.1192,
"step": 850
},
{
"epoch": 0.085,
"eval_cos_sim": 0.8807790279388428,
"eval_loss": 0.12017762710797263,
"eval_runtime": 165.404,
"eval_samples_per_second": 24.183,
"eval_steps_per_second": 0.381,
"step": 850
},
{
"epoch": 0.086,
"grad_norm": 0.024647973477840424,
"learning_rate": 1.0082762792778497e-05,
"loss": 0.12,
"step": 860
},
{
"epoch": 0.086,
"eval_cos_sim": 0.8808472156524658,
"eval_loss": 0.12010916282879783,
"eval_runtime": 159.4806,
"eval_samples_per_second": 25.081,
"eval_steps_per_second": 0.395,
"step": 860
},
{
"epoch": 0.087,
"grad_norm": 0.02787039987742901,
"learning_rate": 4.494343314093799e-05,
"loss": 0.1192,
"step": 870
},
{
"epoch": 0.087,
"eval_cos_sim": 0.8806514143943787,
"eval_loss": 0.12030471136319112,
"eval_runtime": 167.7185,
"eval_samples_per_second": 23.849,
"eval_steps_per_second": 0.376,
"step": 870
},
{
"epoch": 0.088,
"grad_norm": 0.027198661118745804,
"learning_rate": 9.767921122337203e-06,
"loss": 0.12,
"step": 880
},
{
"epoch": 0.088,
"eval_cos_sim": 0.8806140422821045,
"eval_loss": 0.12034183456646871,
"eval_runtime": 164.5125,
"eval_samples_per_second": 24.314,
"eval_steps_per_second": 0.383,
"step": 880
},
{
"epoch": 0.089,
"grad_norm": 0.020295780152082443,
"learning_rate": 4.4702968366179995e-05,
"loss": 0.121,
"step": 890
},
{
"epoch": 0.089,
"eval_cos_sim": 0.8807177543640137,
"eval_loss": 0.12023961307751609,
"eval_runtime": 178.9283,
"eval_samples_per_second": 22.355,
"eval_steps_per_second": 0.352,
"step": 890
},
{
"epoch": 0.09,
"grad_norm": 0.025682412087917328,
"learning_rate": 9.456875605287963e-06,
"loss": 0.1197,
"step": 900
},
{
"epoch": 0.09,
"eval_cos_sim": 0.8808146715164185,
"eval_loss": 0.12014213421093893,
"eval_runtime": 162.1503,
"eval_samples_per_second": 24.668,
"eval_steps_per_second": 0.389,
"step": 900
},
{
"epoch": 0.091,
"grad_norm": 0.0241321362555027,
"learning_rate": 4.4457593198638266e-05,
"loss": 0.1204,
"step": 910
},
{
"epoch": 0.091,
"eval_cos_sim": 0.8806710243225098,
"eval_loss": 0.12028472664105368,
"eval_runtime": 170.9864,
"eval_samples_per_second": 23.394,
"eval_steps_per_second": 0.368,
"step": 910
},
{
"epoch": 0.092,
"grad_norm": 0.03511843457818031,
"learning_rate": 9.149703760693733e-06,
"loss": 0.1204,
"step": 920
},
{
"epoch": 0.092,
"eval_cos_sim": 0.8806130886077881,
"eval_loss": 0.12034211971508932,
"eval_runtime": 163.6776,
"eval_samples_per_second": 24.438,
"eval_steps_per_second": 0.385,
"step": 920
},
{
"epoch": 0.093,
"grad_norm": 0.03159726411104202,
"learning_rate": 4.420736879094911e-05,
"loss": 0.1208,
"step": 930
},
{
"epoch": 0.093,
"eval_cos_sim": 0.8807392716407776,
"eval_loss": 0.12021884395825339,
"eval_runtime": 172.0497,
"eval_samples_per_second": 23.249,
"eval_steps_per_second": 0.366,
"step": 930
},
{
"epoch": 0.094,
"grad_norm": 0.02288082055747509,
"learning_rate": 8.846482142219678e-06,
"loss": 0.1206,
"step": 940
},
{
"epoch": 0.094,
"eval_cos_sim": 0.8808532953262329,
"eval_loss": 0.12010584022747946,
"eval_runtime": 160.6917,
"eval_samples_per_second": 24.892,
"eval_steps_per_second": 0.392,
"step": 940
},
{
"epoch": 0.095,
"grad_norm": 0.022692304104566574,
"learning_rate": 4.395235750428116e-05,
"loss": 0.1193,
"step": 950
},
{
"epoch": 0.095,
"eval_cos_sim": 0.8807929158210754,
"eval_loss": 0.1201639943336196,
"eval_runtime": 165.9011,
"eval_samples_per_second": 24.111,
"eval_steps_per_second": 0.38,
"step": 950
},
{
"epoch": 0.096,
"grad_norm": 0.02069064788520336,
"learning_rate": 8.547286319049193e-06,
"loss": 0.1204,
"step": 960
},
{
"epoch": 0.096,
"eval_cos_sim": 0.8806983232498169,
"eval_loss": 0.12025791980969382,
"eval_runtime": 163.2536,
"eval_samples_per_second": 24.502,
"eval_steps_per_second": 0.386,
"step": 960
},
{
"epoch": 0.097,
"grad_norm": 0.024405937641859055,
"learning_rate": 4.369262289279257e-05,
"loss": 0.12,
"step": 970
},
{
"epoch": 0.097,
"eval_cos_sim": 0.8808521628379822,
"eval_loss": 0.1201059584830947,
"eval_runtime": 161.1408,
"eval_samples_per_second": 24.823,
"eval_steps_per_second": 0.391,
"step": 970
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 440,
"trial_name": null,
"trial_params": null
}