{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9943289224952743,
  "eval_steps": 500,
  "global_step": 1188,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02520478890989288,
      "grad_norm": 9.633888402845015,
      "learning_rate": 5e-06,
      "loss": 0.9444,
      "step": 10
    },
    {
      "epoch": 0.05040957781978576,
      "grad_norm": 1.6254228664238897,
      "learning_rate": 5e-06,
      "loss": 0.8095,
      "step": 20
    },
    {
      "epoch": 0.07561436672967864,
      "grad_norm": 0.8936028314003451,
      "learning_rate": 5e-06,
      "loss": 0.7596,
      "step": 30
    },
    {
      "epoch": 0.10081915563957151,
      "grad_norm": 0.9708205238462188,
      "learning_rate": 5e-06,
      "loss": 0.7334,
      "step": 40
    },
    {
      "epoch": 0.1260239445494644,
      "grad_norm": 0.8882056031134592,
      "learning_rate": 5e-06,
      "loss": 0.7166,
      "step": 50
    },
    {
      "epoch": 0.15122873345935728,
      "grad_norm": 0.8542020955227222,
      "learning_rate": 5e-06,
      "loss": 0.7004,
      "step": 60
    },
    {
      "epoch": 0.17643352236925017,
      "grad_norm": 0.719450185402131,
      "learning_rate": 5e-06,
      "loss": 0.6877,
      "step": 70
    },
    {
      "epoch": 0.20163831127914303,
      "grad_norm": 0.6838086991132964,
      "learning_rate": 5e-06,
      "loss": 0.676,
      "step": 80
    },
    {
      "epoch": 0.22684310018903592,
      "grad_norm": 0.6199036833773371,
      "learning_rate": 5e-06,
      "loss": 0.6754,
      "step": 90
    },
    {
      "epoch": 0.2520478890989288,
      "grad_norm": 0.5787703367375647,
      "learning_rate": 5e-06,
      "loss": 0.6562,
      "step": 100
    },
    {
      "epoch": 0.2772526780088217,
      "grad_norm": 0.633249705700733,
      "learning_rate": 5e-06,
      "loss": 0.665,
      "step": 110
    },
    {
      "epoch": 0.30245746691871456,
      "grad_norm": 0.6304627866586173,
      "learning_rate": 5e-06,
      "loss": 0.6634,
      "step": 120
    },
    {
      "epoch": 0.3276622558286074,
      "grad_norm": 0.5506728721157861,
      "learning_rate": 5e-06,
      "loss": 0.6549,
      "step": 130
    },
    {
      "epoch": 0.35286704473850034,
      "grad_norm": 0.5151002955744762,
      "learning_rate": 5e-06,
      "loss": 0.6522,
      "step": 140
    },
    {
      "epoch": 0.3780718336483932,
      "grad_norm": 0.5992082289679086,
      "learning_rate": 5e-06,
      "loss": 0.6528,
      "step": 150
    },
    {
      "epoch": 0.40327662255828606,
      "grad_norm": 0.4663777253829188,
      "learning_rate": 5e-06,
      "loss": 0.642,
      "step": 160
    },
    {
      "epoch": 0.428481411468179,
      "grad_norm": 0.5785034175448459,
      "learning_rate": 5e-06,
      "loss": 0.6469,
      "step": 170
    },
    {
      "epoch": 0.45368620037807184,
      "grad_norm": 0.6585860693219876,
      "learning_rate": 5e-06,
      "loss": 0.646,
      "step": 180
    },
    {
      "epoch": 0.4788909892879647,
      "grad_norm": 0.6722487645620258,
      "learning_rate": 5e-06,
      "loss": 0.6403,
      "step": 190
    },
    {
      "epoch": 0.5040957781978576,
      "grad_norm": 0.5254154176523609,
      "learning_rate": 5e-06,
      "loss": 0.6387,
      "step": 200
    },
    {
      "epoch": 0.5293005671077504,
      "grad_norm": 0.5128942747923588,
      "learning_rate": 5e-06,
      "loss": 0.6379,
      "step": 210
    },
    {
      "epoch": 0.5545053560176434,
      "grad_norm": 0.5642956436475494,
      "learning_rate": 5e-06,
      "loss": 0.6393,
      "step": 220
    },
    {
      "epoch": 0.5797101449275363,
      "grad_norm": 0.6837039476741191,
      "learning_rate": 5e-06,
      "loss": 0.6359,
      "step": 230
    },
    {
      "epoch": 0.6049149338374291,
      "grad_norm": 0.6667359902635842,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 240
    },
    {
      "epoch": 0.630119722747322,
      "grad_norm": 0.6627848973706361,
      "learning_rate": 5e-06,
      "loss": 0.6343,
      "step": 250
    },
    {
      "epoch": 0.6553245116572148,
      "grad_norm": 0.5241036221432155,
      "learning_rate": 5e-06,
      "loss": 0.6279,
      "step": 260
    },
    {
      "epoch": 0.6805293005671077,
      "grad_norm": 0.576211583526909,
      "learning_rate": 5e-06,
      "loss": 0.6258,
      "step": 270
    },
    {
      "epoch": 0.7057340894770007,
      "grad_norm": 0.5136516604177849,
      "learning_rate": 5e-06,
      "loss": 0.6322,
      "step": 280
    },
    {
      "epoch": 0.7309388783868935,
      "grad_norm": 0.5270426246043673,
      "learning_rate": 5e-06,
      "loss": 0.6243,
      "step": 290
    },
    {
      "epoch": 0.7561436672967864,
      "grad_norm": 0.5453025715363531,
      "learning_rate": 5e-06,
      "loss": 0.6224,
      "step": 300
    },
    {
      "epoch": 0.7813484562066793,
      "grad_norm": 0.5116409906154713,
      "learning_rate": 5e-06,
      "loss": 0.6155,
      "step": 310
    },
    {
      "epoch": 0.8065532451165721,
      "grad_norm": 0.5849562966179552,
      "learning_rate": 5e-06,
      "loss": 0.6212,
      "step": 320
    },
    {
      "epoch": 0.831758034026465,
      "grad_norm": 0.6135737091827808,
      "learning_rate": 5e-06,
      "loss": 0.6258,
      "step": 330
    },
    {
      "epoch": 0.856962822936358,
      "grad_norm": 0.5181177544801595,
      "learning_rate": 5e-06,
      "loss": 0.6238,
      "step": 340
    },
    {
      "epoch": 0.8821676118462508,
      "grad_norm": 0.627205460376999,
      "learning_rate": 5e-06,
      "loss": 0.6167,
      "step": 350
    },
    {
      "epoch": 0.9073724007561437,
      "grad_norm": 0.5261678739160346,
      "learning_rate": 5e-06,
      "loss": 0.6117,
      "step": 360
    },
    {
      "epoch": 0.9325771896660365,
      "grad_norm": 0.6244352371075516,
      "learning_rate": 5e-06,
      "loss": 0.6153,
      "step": 370
    },
    {
      "epoch": 0.9577819785759294,
      "grad_norm": 0.48330689120915904,
      "learning_rate": 5e-06,
      "loss": 0.6157,
      "step": 380
    },
    {
      "epoch": 0.9829867674858223,
      "grad_norm": 0.5423735222942331,
      "learning_rate": 5e-06,
      "loss": 0.6132,
      "step": 390
    },
    {
      "epoch": 0.998109640831758,
      "eval_loss": 0.6147013306617737,
      "eval_runtime": 212.3997,
      "eval_samples_per_second": 50.334,
      "eval_steps_per_second": 0.395,
      "step": 396
    },
    {
      "epoch": 1.0081915563957151,
      "grad_norm": 0.5070359483563761,
      "learning_rate": 5e-06,
      "loss": 0.6024,
      "step": 400
    },
    {
      "epoch": 1.033396345305608,
      "grad_norm": 0.5765247476192352,
      "learning_rate": 5e-06,
      "loss": 0.5805,
      "step": 410
    },
    {
      "epoch": 1.0586011342155008,
      "grad_norm": 0.6110502474250971,
      "learning_rate": 5e-06,
      "loss": 0.5715,
      "step": 420
    },
    {
      "epoch": 1.0838059231253938,
      "grad_norm": 0.7545087966794868,
      "learning_rate": 5e-06,
      "loss": 0.5797,
      "step": 430
    },
    {
      "epoch": 1.1090107120352868,
      "grad_norm": 0.6365159456598485,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 440
    },
    {
      "epoch": 1.1342155009451795,
      "grad_norm": 0.6056198671024684,
      "learning_rate": 5e-06,
      "loss": 0.5742,
      "step": 450
    },
    {
      "epoch": 1.1594202898550725,
      "grad_norm": 0.5291507949595291,
      "learning_rate": 5e-06,
      "loss": 0.5761,
      "step": 460
    },
    {
      "epoch": 1.1846250787649653,
      "grad_norm": 0.6153628132427752,
      "learning_rate": 5e-06,
      "loss": 0.5779,
      "step": 470
    },
    {
      "epoch": 1.2098298676748582,
      "grad_norm": 0.5060871095129219,
      "learning_rate": 5e-06,
      "loss": 0.574,
      "step": 480
    },
    {
      "epoch": 1.2350346565847512,
      "grad_norm": 0.6456813781855114,
      "learning_rate": 5e-06,
      "loss": 0.5759,
      "step": 490
    },
    {
      "epoch": 1.260239445494644,
      "grad_norm": 0.5052335738280048,
      "learning_rate": 5e-06,
      "loss": 0.5756,
      "step": 500
    },
    {
      "epoch": 1.285444234404537,
      "grad_norm": 0.5653893528313595,
      "learning_rate": 5e-06,
      "loss": 0.5754,
      "step": 510
    },
    {
      "epoch": 1.3106490233144297,
      "grad_norm": 0.5278634789290781,
      "learning_rate": 5e-06,
      "loss": 0.5733,
      "step": 520
    },
    {
      "epoch": 1.3358538122243226,
      "grad_norm": 0.5245168937373562,
      "learning_rate": 5e-06,
      "loss": 0.5721,
      "step": 530
    },
    {
      "epoch": 1.3610586011342156,
      "grad_norm": 0.5215086066445794,
      "learning_rate": 5e-06,
      "loss": 0.5733,
      "step": 540
    },
    {
      "epoch": 1.3862633900441084,
      "grad_norm": 0.5370456877329636,
      "learning_rate": 5e-06,
      "loss": 0.571,
      "step": 550
    },
    {
      "epoch": 1.4114681789540013,
      "grad_norm": 0.5521665579415627,
      "learning_rate": 5e-06,
      "loss": 0.5698,
      "step": 560
    },
    {
      "epoch": 1.436672967863894,
      "grad_norm": 0.7002009297394419,
      "learning_rate": 5e-06,
      "loss": 0.5672,
      "step": 570
    },
    {
      "epoch": 1.461877756773787,
      "grad_norm": 0.5738916744555749,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 580
    },
    {
      "epoch": 1.48708254568368,
      "grad_norm": 0.47370527901117226,
      "learning_rate": 5e-06,
      "loss": 0.5717,
      "step": 590
    },
    {
      "epoch": 1.5122873345935728,
      "grad_norm": 0.554049330965348,
      "learning_rate": 5e-06,
      "loss": 0.567,
      "step": 600
    },
    {
      "epoch": 1.5374921235034655,
      "grad_norm": 0.5960178872278222,
      "learning_rate": 5e-06,
      "loss": 0.5667,
      "step": 610
    },
    {
      "epoch": 1.5626969124133585,
      "grad_norm": 0.5085168346574576,
      "learning_rate": 5e-06,
      "loss": 0.5682,
      "step": 620
    },
    {
      "epoch": 1.5879017013232515,
      "grad_norm": 0.6537267193437978,
      "learning_rate": 5e-06,
      "loss": 0.5629,
      "step": 630
    },
    {
      "epoch": 1.6131064902331445,
      "grad_norm": 0.5874046443880916,
      "learning_rate": 5e-06,
      "loss": 0.5686,
      "step": 640
    },
    {
      "epoch": 1.6383112791430372,
      "grad_norm": 0.5727374180932888,
      "learning_rate": 5e-06,
      "loss": 0.5634,
      "step": 650
    },
    {
      "epoch": 1.66351606805293,
      "grad_norm": 0.4739882267214101,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 660
    },
    {
      "epoch": 1.688720856962823,
      "grad_norm": 0.5225576157310349,
      "learning_rate": 5e-06,
      "loss": 0.5585,
      "step": 670
    },
    {
      "epoch": 1.713925645872716,
      "grad_norm": 0.5180601277471911,
      "learning_rate": 5e-06,
      "loss": 0.5656,
      "step": 680
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 0.4742139859545316,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 690
    },
    {
      "epoch": 1.7643352236925016,
      "grad_norm": 0.569511350039058,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 700
    },
    {
      "epoch": 1.7895400126023944,
      "grad_norm": 0.5632608053135445,
      "learning_rate": 5e-06,
      "loss": 0.5622,
      "step": 710
    },
    {
      "epoch": 1.8147448015122873,
      "grad_norm": 0.7901163025671061,
      "learning_rate": 5e-06,
      "loss": 0.5617,
      "step": 720
    },
    {
      "epoch": 1.8399495904221803,
      "grad_norm": 0.7242318592229516,
      "learning_rate": 5e-06,
      "loss": 0.5681,
      "step": 730
    },
    {
      "epoch": 1.865154379332073,
      "grad_norm": 0.5903083920098006,
      "learning_rate": 5e-06,
      "loss": 0.561,
      "step": 740
    },
    {
      "epoch": 1.8903591682419658,
      "grad_norm": 0.7074496398717507,
      "learning_rate": 5e-06,
      "loss": 0.5618,
      "step": 750
    },
    {
      "epoch": 1.9155639571518588,
      "grad_norm": 0.6211817411184037,
      "learning_rate": 5e-06,
      "loss": 0.5602,
      "step": 760
    },
    {
      "epoch": 1.9407687460617518,
      "grad_norm": 0.7491153337871396,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 770
    },
    {
      "epoch": 1.9659735349716447,
      "grad_norm": 0.4814370775695788,
      "learning_rate": 5e-06,
      "loss": 0.5575,
      "step": 780
    },
    {
      "epoch": 1.9911783238815375,
      "grad_norm": 0.4983004502393923,
      "learning_rate": 5e-06,
      "loss": 0.5614,
      "step": 790
    },
    {
      "epoch": 1.9987397605545052,
      "eval_loss": 0.5889107584953308,
      "eval_runtime": 212.4952,
      "eval_samples_per_second": 50.312,
      "eval_steps_per_second": 0.395,
      "step": 793
    },
    {
      "epoch": 2.0163831127914302,
      "grad_norm": 0.6245989305076182,
      "learning_rate": 5e-06,
      "loss": 0.5329,
      "step": 800
    },
    {
      "epoch": 2.041587901701323,
      "grad_norm": 0.5453952668705114,
      "learning_rate": 5e-06,
      "loss": 0.5187,
      "step": 810
    },
    {
      "epoch": 2.066792690611216,
      "grad_norm": 0.4731173438501679,
      "learning_rate": 5e-06,
      "loss": 0.5157,
      "step": 820
    },
    {
      "epoch": 2.091997479521109,
      "grad_norm": 0.5441667804629673,
      "learning_rate": 5e-06,
      "loss": 0.5227,
      "step": 830
    },
    {
      "epoch": 2.1172022684310017,
      "grad_norm": 0.5708819169258152,
      "learning_rate": 5e-06,
      "loss": 0.5229,
      "step": 840
    },
    {
      "epoch": 2.1424070573408946,
      "grad_norm": 0.523183963336559,
      "learning_rate": 5e-06,
      "loss": 0.5262,
      "step": 850
    },
    {
      "epoch": 2.1676118462507876,
      "grad_norm": 0.5229800543372238,
      "learning_rate": 5e-06,
      "loss": 0.5235,
      "step": 860
    },
    {
      "epoch": 2.1928166351606806,
      "grad_norm": 0.5691473122380714,
      "learning_rate": 5e-06,
      "loss": 0.5177,
      "step": 870
    },
    {
      "epoch": 2.2180214240705736,
      "grad_norm": 0.5400039287033931,
      "learning_rate": 5e-06,
      "loss": 0.5273,
      "step": 880
    },
    {
      "epoch": 2.243226212980466,
      "grad_norm": 0.5107824302626609,
      "learning_rate": 5e-06,
      "loss": 0.5323,
      "step": 890
    },
    {
      "epoch": 2.268431001890359,
      "grad_norm": 0.5059906821559053,
      "learning_rate": 5e-06,
      "loss": 0.5202,
      "step": 900
    },
    {
      "epoch": 2.293635790800252,
      "grad_norm": 0.5379755140149425,
      "learning_rate": 5e-06,
      "loss": 0.5245,
      "step": 910
    },
    {
      "epoch": 2.318840579710145,
      "grad_norm": 0.5628132546435494,
      "learning_rate": 5e-06,
      "loss": 0.5227,
      "step": 920
    },
    {
      "epoch": 2.344045368620038,
      "grad_norm": 0.5940730599429787,
      "learning_rate": 5e-06,
      "loss": 0.5244,
      "step": 930
    },
    {
      "epoch": 2.3692501575299305,
      "grad_norm": 0.4876405401032709,
      "learning_rate": 5e-06,
      "loss": 0.5231,
      "step": 940
    },
    {
      "epoch": 2.3944549464398235,
      "grad_norm": 0.5287351322876084,
      "learning_rate": 5e-06,
      "loss": 0.5282,
      "step": 950
    },
    {
      "epoch": 2.4196597353497165,
      "grad_norm": 0.5497856784965347,
      "learning_rate": 5e-06,
      "loss": 0.5224,
      "step": 960
    },
    {
      "epoch": 2.4448645242596094,
      "grad_norm": 0.5169812352131126,
      "learning_rate": 5e-06,
      "loss": 0.5186,
      "step": 970
    },
    {
      "epoch": 2.4700693131695024,
      "grad_norm": 0.5065988236822105,
      "learning_rate": 5e-06,
      "loss": 0.5175,
      "step": 980
    },
    {
      "epoch": 2.495274102079395,
      "grad_norm": 0.5210233733825254,
      "learning_rate": 5e-06,
      "loss": 0.5167,
      "step": 990
    },
    {
      "epoch": 2.520478890989288,
      "grad_norm": 0.5188140313597245,
      "learning_rate": 5e-06,
      "loss": 0.518,
      "step": 1000
    },
    {
      "epoch": 2.545683679899181,
      "grad_norm": 0.5405876217592858,
      "learning_rate": 5e-06,
      "loss": 0.5193,
      "step": 1010
    },
    {
      "epoch": 2.570888468809074,
      "grad_norm": 0.46633858042210613,
      "learning_rate": 5e-06,
      "loss": 0.5248,
      "step": 1020
    },
    {
      "epoch": 2.596093257718967,
      "grad_norm": 0.5030938635404251,
      "learning_rate": 5e-06,
      "loss": 0.5171,
      "step": 1030
    },
    {
      "epoch": 2.6212980466288593,
      "grad_norm": 0.7552264831047579,
      "learning_rate": 5e-06,
      "loss": 0.5208,
      "step": 1040
    },
    {
      "epoch": 2.6465028355387523,
      "grad_norm": 0.6446011609995526,
      "learning_rate": 5e-06,
      "loss": 0.5226,
      "step": 1050
    },
    {
      "epoch": 2.6717076244486453,
      "grad_norm": 0.598720313170294,
      "learning_rate": 5e-06,
      "loss": 0.5201,
      "step": 1060
    },
    {
      "epoch": 2.6969124133585383,
      "grad_norm": 0.5646013331239497,
      "learning_rate": 5e-06,
      "loss": 0.5247,
      "step": 1070
    },
    {
      "epoch": 2.7221172022684312,
      "grad_norm": 0.5205190466062173,
      "learning_rate": 5e-06,
      "loss": 0.5228,
      "step": 1080
    },
    {
      "epoch": 2.7473219911783238,
      "grad_norm": 0.6675587237327031,
      "learning_rate": 5e-06,
      "loss": 0.5238,
      "step": 1090
    },
    {
      "epoch": 2.7725267800882167,
      "grad_norm": 0.5888446017638219,
      "learning_rate": 5e-06,
      "loss": 0.5246,
      "step": 1100
    },
    {
      "epoch": 2.7977315689981097,
      "grad_norm": 0.5291240035154432,
      "learning_rate": 5e-06,
      "loss": 0.5207,
      "step": 1110
    },
    {
      "epoch": 2.8229363579080027,
      "grad_norm": 0.5322435909276529,
      "learning_rate": 5e-06,
      "loss": 0.5167,
      "step": 1120
    },
    {
      "epoch": 2.8481411468178957,
      "grad_norm": 0.4603004988767882,
      "learning_rate": 5e-06,
      "loss": 0.5236,
      "step": 1130
    },
    {
      "epoch": 2.873345935727788,
      "grad_norm": 0.48682290640941545,
      "learning_rate": 5e-06,
      "loss": 0.5252,
      "step": 1140
    },
    {
      "epoch": 2.898550724637681,
      "grad_norm": 0.5361316970255996,
      "learning_rate": 5e-06,
      "loss": 0.5264,
      "step": 1150
    },
    {
      "epoch": 2.923755513547574,
      "grad_norm": 0.6157284870441493,
      "learning_rate": 5e-06,
      "loss": 0.5188,
      "step": 1160
    },
    {
      "epoch": 2.9489603024574667,
      "grad_norm": 0.5584802223169939,
      "learning_rate": 5e-06,
      "loss": 0.5189,
      "step": 1170
    },
    {
      "epoch": 2.97416509136736,
      "grad_norm": 0.5261737585816265,
      "learning_rate": 5e-06,
      "loss": 0.5205,
      "step": 1180
    },
    {
      "epoch": 2.9943289224952743,
      "eval_loss": 0.58283931016922,
      "eval_runtime": 213.9304,
      "eval_samples_per_second": 49.974,
      "eval_steps_per_second": 0.393,
      "step": 1188
    },
    {
      "epoch": 2.9943289224952743,
      "step": 1188,
      "total_flos": 1989525488271360.0,
      "train_loss": 0.5829599999418162,
      "train_runtime": 35401.3525,
      "train_samples_per_second": 17.213,
      "train_steps_per_second": 0.034
    }
  ],
  "logging_steps": 10,
  "max_steps": 1188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1989525488271360.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}