{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 981,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03058103975535168,
      "grad_norm": 4.613314070863576,
      "learning_rate": 5e-06,
      "loss": 0.8051,
      "step": 10
    },
    {
      "epoch": 0.06116207951070336,
      "grad_norm": 1.8900295063815196,
      "learning_rate": 5e-06,
      "loss": 0.7288,
      "step": 20
    },
    {
      "epoch": 0.09174311926605505,
      "grad_norm": 2.828490629262087,
      "learning_rate": 5e-06,
      "loss": 0.709,
      "step": 30
    },
    {
      "epoch": 0.12232415902140673,
      "grad_norm": 1.3871149266999225,
      "learning_rate": 5e-06,
      "loss": 0.6966,
      "step": 40
    },
    {
      "epoch": 0.1529051987767584,
      "grad_norm": 1.5967435207002258,
      "learning_rate": 5e-06,
      "loss": 0.6812,
      "step": 50
    },
    {
      "epoch": 0.1834862385321101,
      "grad_norm": 1.882517891139545,
      "learning_rate": 5e-06,
      "loss": 0.6728,
      "step": 60
    },
    {
      "epoch": 0.21406727828746178,
      "grad_norm": 0.6460792834326635,
      "learning_rate": 5e-06,
      "loss": 0.661,
      "step": 70
    },
    {
      "epoch": 0.24464831804281345,
      "grad_norm": 0.8345961449379203,
      "learning_rate": 5e-06,
      "loss": 0.6527,
      "step": 80
    },
    {
      "epoch": 0.27522935779816515,
      "grad_norm": 0.6632736237849782,
      "learning_rate": 5e-06,
      "loss": 0.65,
      "step": 90
    },
    {
      "epoch": 0.3058103975535168,
      "grad_norm": 0.6414397366786443,
      "learning_rate": 5e-06,
      "loss": 0.6416,
      "step": 100
    },
    {
      "epoch": 0.3363914373088685,
      "grad_norm": 0.6410573252326648,
      "learning_rate": 5e-06,
      "loss": 0.6423,
      "step": 110
    },
    {
      "epoch": 0.3669724770642202,
      "grad_norm": 0.6301173144669027,
      "learning_rate": 5e-06,
      "loss": 0.6431,
      "step": 120
    },
    {
      "epoch": 0.39755351681957185,
      "grad_norm": 0.6700047532457443,
      "learning_rate": 5e-06,
      "loss": 0.6328,
      "step": 130
    },
    {
      "epoch": 0.42813455657492355,
      "grad_norm": 0.7253886574556052,
      "learning_rate": 5e-06,
      "loss": 0.6379,
      "step": 140
    },
    {
      "epoch": 0.45871559633027525,
      "grad_norm": 0.6468641147049369,
      "learning_rate": 5e-06,
      "loss": 0.6226,
      "step": 150
    },
    {
      "epoch": 0.4892966360856269,
      "grad_norm": 0.5346182726002575,
      "learning_rate": 5e-06,
      "loss": 0.6297,
      "step": 160
    },
    {
      "epoch": 0.5198776758409785,
      "grad_norm": 0.5313857610798393,
      "learning_rate": 5e-06,
      "loss": 0.6315,
      "step": 170
    },
    {
      "epoch": 0.5504587155963303,
      "grad_norm": 1.055453437070951,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 180
    },
    {
      "epoch": 0.581039755351682,
      "grad_norm": 1.0346764494649112,
      "learning_rate": 5e-06,
      "loss": 0.6336,
      "step": 190
    },
    {
      "epoch": 0.6116207951070336,
      "grad_norm": 0.5618281272496094,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 200
    },
    {
      "epoch": 0.6422018348623854,
      "grad_norm": 1.3519206618987067,
      "learning_rate": 5e-06,
      "loss": 0.6231,
      "step": 210
    },
    {
      "epoch": 0.672782874617737,
      "grad_norm": 0.7749478594608031,
      "learning_rate": 5e-06,
      "loss": 0.6242,
      "step": 220
    },
    {
      "epoch": 0.7033639143730887,
      "grad_norm": 0.5893825196950665,
      "learning_rate": 5e-06,
      "loss": 0.6241,
      "step": 230
    },
    {
      "epoch": 0.7339449541284404,
      "grad_norm": 0.6773691040863971,
      "learning_rate": 5e-06,
      "loss": 0.6222,
      "step": 240
    },
    {
      "epoch": 0.764525993883792,
      "grad_norm": 0.4827169638012845,
      "learning_rate": 5e-06,
      "loss": 0.619,
      "step": 250
    },
    {
      "epoch": 0.7951070336391437,
      "grad_norm": 0.6490833986094754,
      "learning_rate": 5e-06,
      "loss": 0.6214,
      "step": 260
    },
    {
      "epoch": 0.8256880733944955,
      "grad_norm": 0.4630327930179835,
      "learning_rate": 5e-06,
      "loss": 0.6234,
      "step": 270
    },
    {
      "epoch": 0.8562691131498471,
      "grad_norm": 0.5519953643760132,
      "learning_rate": 5e-06,
      "loss": 0.6238,
      "step": 280
    },
    {
      "epoch": 0.8868501529051988,
      "grad_norm": 0.6014046385653471,
      "learning_rate": 5e-06,
      "loss": 0.6205,
      "step": 290
    },
    {
      "epoch": 0.9174311926605505,
      "grad_norm": 0.9905714959978613,
      "learning_rate": 5e-06,
      "loss": 0.614,
      "step": 300
    },
    {
      "epoch": 0.9480122324159022,
      "grad_norm": 0.5722101718286907,
      "learning_rate": 5e-06,
      "loss": 0.6174,
      "step": 310
    },
    {
      "epoch": 0.9785932721712538,
      "grad_norm": 0.49394030115956855,
      "learning_rate": 5e-06,
      "loss": 0.6318,
      "step": 320
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6179984211921692,
      "eval_runtime": 175.4761,
      "eval_samples_per_second": 50.218,
      "eval_steps_per_second": 0.393,
      "step": 327
    },
    {
      "epoch": 1.0091743119266054,
      "grad_norm": 0.7689781158724909,
      "learning_rate": 5e-06,
      "loss": 0.6099,
      "step": 330
    },
    {
      "epoch": 1.039755351681957,
      "grad_norm": 0.6771992725638727,
      "learning_rate": 5e-06,
      "loss": 0.5636,
      "step": 340
    },
    {
      "epoch": 1.070336391437309,
      "grad_norm": 0.8428533582721723,
      "learning_rate": 5e-06,
      "loss": 0.5702,
      "step": 350
    },
    {
      "epoch": 1.1009174311926606,
      "grad_norm": 0.6417949273070751,
      "learning_rate": 5e-06,
      "loss": 0.5724,
      "step": 360
    },
    {
      "epoch": 1.1314984709480123,
      "grad_norm": 0.4739060770421859,
      "learning_rate": 5e-06,
      "loss": 0.5667,
      "step": 370
    },
    {
      "epoch": 1.162079510703364,
      "grad_norm": 0.8290714956133294,
      "learning_rate": 5e-06,
      "loss": 0.5723,
      "step": 380
    },
    {
      "epoch": 1.1926605504587156,
      "grad_norm": 0.4621451043452794,
      "learning_rate": 5e-06,
      "loss": 0.5712,
      "step": 390
    },
    {
      "epoch": 1.2232415902140672,
      "grad_norm": 0.5143870327303872,
      "learning_rate": 5e-06,
      "loss": 0.5665,
      "step": 400
    },
    {
      "epoch": 1.2538226299694188,
      "grad_norm": 0.5164679975473679,
      "learning_rate": 5e-06,
      "loss": 0.5738,
      "step": 410
    },
    {
      "epoch": 1.2844036697247707,
      "grad_norm": 0.8444328502616898,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 420
    },
    {
      "epoch": 1.3149847094801224,
      "grad_norm": 0.5068551444160476,
      "learning_rate": 5e-06,
      "loss": 0.5685,
      "step": 430
    },
    {
      "epoch": 1.345565749235474,
      "grad_norm": 0.5272187758950088,
      "learning_rate": 5e-06,
      "loss": 0.5719,
      "step": 440
    },
    {
      "epoch": 1.3761467889908257,
      "grad_norm": 0.6560982973750454,
      "learning_rate": 5e-06,
      "loss": 0.5631,
      "step": 450
    },
    {
      "epoch": 1.4067278287461773,
      "grad_norm": 0.4484289597201794,
      "learning_rate": 5e-06,
      "loss": 0.5662,
      "step": 460
    },
    {
      "epoch": 1.4373088685015292,
      "grad_norm": 0.5845602559047488,
      "learning_rate": 5e-06,
      "loss": 0.5555,
      "step": 470
    },
    {
      "epoch": 1.4678899082568808,
      "grad_norm": 0.5019202622500104,
      "learning_rate": 5e-06,
      "loss": 0.572,
      "step": 480
    },
    {
      "epoch": 1.4984709480122325,
      "grad_norm": 0.5453352197296611,
      "learning_rate": 5e-06,
      "loss": 0.5678,
      "step": 490
    },
    {
      "epoch": 1.529051987767584,
      "grad_norm": 0.5096577153134583,
      "learning_rate": 5e-06,
      "loss": 0.5747,
      "step": 500
    },
    {
      "epoch": 1.5596330275229358,
      "grad_norm": 0.6175776252130769,
      "learning_rate": 5e-06,
      "loss": 0.5664,
      "step": 510
    },
    {
      "epoch": 1.5902140672782874,
      "grad_norm": 0.5104602945006693,
      "learning_rate": 5e-06,
      "loss": 0.5634,
      "step": 520
    },
    {
      "epoch": 1.620795107033639,
      "grad_norm": 0.596086026271991,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 530
    },
    {
      "epoch": 1.6513761467889907,
      "grad_norm": 0.5308994737717756,
      "learning_rate": 5e-06,
      "loss": 0.5733,
      "step": 540
    },
    {
      "epoch": 1.6819571865443423,
      "grad_norm": 0.4845901344757882,
      "learning_rate": 5e-06,
      "loss": 0.5717,
      "step": 550
    },
    {
      "epoch": 1.7125382262996942,
      "grad_norm": 0.5671326796569592,
      "learning_rate": 5e-06,
      "loss": 0.5724,
      "step": 560
    },
    {
      "epoch": 1.7431192660550459,
      "grad_norm": 0.4598174188757565,
      "learning_rate": 5e-06,
      "loss": 0.5683,
      "step": 570
    },
    {
      "epoch": 1.7737003058103975,
      "grad_norm": 0.5112157139948377,
      "learning_rate": 5e-06,
      "loss": 0.5681,
      "step": 580
    },
    {
      "epoch": 1.8042813455657494,
      "grad_norm": 0.5586531166082738,
      "learning_rate": 5e-06,
      "loss": 0.5684,
      "step": 590
    },
    {
      "epoch": 1.834862385321101,
      "grad_norm": 0.4688565551712795,
      "learning_rate": 5e-06,
      "loss": 0.5644,
      "step": 600
    },
    {
      "epoch": 1.8654434250764527,
      "grad_norm": 0.5068561602224454,
      "learning_rate": 5e-06,
      "loss": 0.5634,
      "step": 610
    },
    {
      "epoch": 1.8960244648318043,
      "grad_norm": 0.5177020231748777,
      "learning_rate": 5e-06,
      "loss": 0.5693,
      "step": 620
    },
    {
      "epoch": 1.926605504587156,
      "grad_norm": 0.5480741145181502,
      "learning_rate": 5e-06,
      "loss": 0.5613,
      "step": 630
    },
    {
      "epoch": 1.9571865443425076,
      "grad_norm": 0.4783567812818659,
      "learning_rate": 5e-06,
      "loss": 0.5585,
      "step": 640
    },
    {
      "epoch": 1.9877675840978593,
      "grad_norm": 0.5551088225058829,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 650
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6093349456787109,
      "eval_runtime": 174.9969,
      "eval_samples_per_second": 50.355,
      "eval_steps_per_second": 0.394,
      "step": 654
    },
    {
      "epoch": 2.018348623853211,
      "grad_norm": 0.6215337411502556,
      "learning_rate": 5e-06,
      "loss": 0.5384,
      "step": 660
    },
    {
      "epoch": 2.0489296636085625,
      "grad_norm": 0.6523651145276358,
      "learning_rate": 5e-06,
      "loss": 0.5183,
      "step": 670
    },
    {
      "epoch": 2.079510703363914,
      "grad_norm": 0.4938873169688457,
      "learning_rate": 5e-06,
      "loss": 0.5154,
      "step": 680
    },
    {
      "epoch": 2.1100917431192663,
      "grad_norm": 0.5700577847392374,
      "learning_rate": 5e-06,
      "loss": 0.513,
      "step": 690
    },
    {
      "epoch": 2.140672782874618,
      "grad_norm": 0.6865444356940279,
      "learning_rate": 5e-06,
      "loss": 0.5107,
      "step": 700
    },
    {
      "epoch": 2.1712538226299696,
      "grad_norm": 0.49230256627015,
      "learning_rate": 5e-06,
      "loss": 0.5109,
      "step": 710
    },
    {
      "epoch": 2.2018348623853212,
      "grad_norm": 0.8680096117870334,
      "learning_rate": 5e-06,
      "loss": 0.5201,
      "step": 720
    },
    {
      "epoch": 2.232415902140673,
      "grad_norm": 0.5282977989250981,
      "learning_rate": 5e-06,
      "loss": 0.5176,
      "step": 730
    },
    {
      "epoch": 2.2629969418960245,
      "grad_norm": 0.5641604277626704,
      "learning_rate": 5e-06,
      "loss": 0.5175,
      "step": 740
    },
    {
      "epoch": 2.293577981651376,
      "grad_norm": 0.5627994676639944,
      "learning_rate": 5e-06,
      "loss": 0.5233,
      "step": 750
    },
    {
      "epoch": 2.324159021406728,
      "grad_norm": 0.5351783170372003,
      "learning_rate": 5e-06,
      "loss": 0.5193,
      "step": 760
    },
    {
      "epoch": 2.3547400611620795,
      "grad_norm": 0.5159357728539045,
      "learning_rate": 5e-06,
      "loss": 0.5145,
      "step": 770
    },
    {
      "epoch": 2.385321100917431,
      "grad_norm": 0.6104068286820499,
      "learning_rate": 5e-06,
      "loss": 0.5164,
      "step": 780
    },
    {
      "epoch": 2.4159021406727827,
      "grad_norm": 0.677498087908613,
      "learning_rate": 5e-06,
      "loss": 0.5262,
      "step": 790
    },
    {
      "epoch": 2.4464831804281344,
      "grad_norm": 0.5283675690505885,
      "learning_rate": 5e-06,
      "loss": 0.5149,
      "step": 800
    },
    {
      "epoch": 2.477064220183486,
      "grad_norm": 0.5044083108738047,
      "learning_rate": 5e-06,
      "loss": 0.5176,
      "step": 810
    },
    {
      "epoch": 2.5076452599388377,
      "grad_norm": 0.49494102207897933,
      "learning_rate": 5e-06,
      "loss": 0.5204,
      "step": 820
    },
    {
      "epoch": 2.5382262996941893,
      "grad_norm": 0.4861352677652072,
      "learning_rate": 5e-06,
      "loss": 0.5337,
      "step": 830
    },
    {
      "epoch": 2.5688073394495414,
      "grad_norm": 0.48851988662021156,
      "learning_rate": 5e-06,
      "loss": 0.5238,
      "step": 840
    },
    {
      "epoch": 2.599388379204893,
      "grad_norm": 0.5226833608668234,
      "learning_rate": 5e-06,
      "loss": 0.5205,
      "step": 850
    },
    {
      "epoch": 2.6299694189602447,
      "grad_norm": 0.5465654006210326,
      "learning_rate": 5e-06,
      "loss": 0.5221,
      "step": 860
    },
    {
      "epoch": 2.6605504587155964,
      "grad_norm": 0.5039029213538379,
      "learning_rate": 5e-06,
      "loss": 0.5196,
      "step": 870
    },
    {
      "epoch": 2.691131498470948,
      "grad_norm": 0.5371018828919037,
      "learning_rate": 5e-06,
      "loss": 0.5237,
      "step": 880
    },
    {
      "epoch": 2.7217125382262997,
      "grad_norm": 0.5383889127160468,
      "learning_rate": 5e-06,
      "loss": 0.5169,
      "step": 890
    },
    {
      "epoch": 2.7522935779816513,
      "grad_norm": 0.5406840391563221,
      "learning_rate": 5e-06,
      "loss": 0.5116,
      "step": 900
    },
    {
      "epoch": 2.782874617737003,
      "grad_norm": 0.5182378062317926,
      "learning_rate": 5e-06,
      "loss": 0.5147,
      "step": 910
    },
    {
      "epoch": 2.8134556574923546,
      "grad_norm": 0.483759876679847,
      "learning_rate": 5e-06,
      "loss": 0.514,
      "step": 920
    },
    {
      "epoch": 2.8440366972477067,
      "grad_norm": 0.5715170886676529,
      "learning_rate": 5e-06,
      "loss": 0.5189,
      "step": 930
    },
    {
      "epoch": 2.8746177370030583,
      "grad_norm": 0.4882822153954844,
      "learning_rate": 5e-06,
      "loss": 0.5131,
      "step": 940
    },
    {
      "epoch": 2.90519877675841,
      "grad_norm": 0.518462889966442,
      "learning_rate": 5e-06,
      "loss": 0.5228,
      "step": 950
    },
    {
      "epoch": 2.9357798165137616,
      "grad_norm": 0.500511829330958,
      "learning_rate": 5e-06,
      "loss": 0.5177,
      "step": 960
    },
    {
      "epoch": 2.9663608562691133,
      "grad_norm": 0.4799796524761594,
      "learning_rate": 5e-06,
      "loss": 0.5283,
      "step": 970
    },
    {
      "epoch": 2.996941896024465,
      "grad_norm": 0.6756287967133653,
      "learning_rate": 5e-06,
      "loss": 0.5228,
      "step": 980
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6134681701660156,
      "eval_runtime": 174.9719,
      "eval_samples_per_second": 50.362,
      "eval_steps_per_second": 0.394,
      "step": 981
    },
    {
      "epoch": 3.0,
      "step": 981,
      "total_flos": 1642792778465280.0,
      "train_loss": 0.5776342995062272,
      "train_runtime": 29440.9564,
      "train_samples_per_second": 17.06,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 981,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1642792778465280.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}