|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 5238, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0057273768613974796, |
|
"grad_norm": 69.81166336806189, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7165, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.011454753722794959, |
|
"grad_norm": 1.0213242744191677, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6046, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01718213058419244, |
|
"grad_norm": 0.9449746812829907, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5629, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.022909507445589918, |
|
"grad_norm": 0.8701146894026842, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5346, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0286368843069874, |
|
"grad_norm": 0.9523022991510063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5263, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03436426116838488, |
|
"grad_norm": 0.8669421913033644, |
|
"learning_rate": 5e-06, |
|
"loss": 0.508, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04009163802978236, |
|
"grad_norm": 0.6655121535600234, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5057, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.045819014891179836, |
|
"grad_norm": 0.5907864190703314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5001, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05154639175257732, |
|
"grad_norm": 0.614544161319113, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5063, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0572737686139748, |
|
"grad_norm": 0.5397294149815204, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4995, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06300114547537228, |
|
"grad_norm": 0.5645343469851593, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5017, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06872852233676977, |
|
"grad_norm": 0.5690627599199714, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4896, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07445589919816724, |
|
"grad_norm": 0.5699342648205336, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4921, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08018327605956473, |
|
"grad_norm": 0.5710376129366055, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4872, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0859106529209622, |
|
"grad_norm": 0.5227080703520162, |
|
"learning_rate": 5e-06, |
|
"loss": 0.477, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09163802978235967, |
|
"grad_norm": 0.5746214024823622, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4923, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09736540664375716, |
|
"grad_norm": 0.593169674819385, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4889, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10309278350515463, |
|
"grad_norm": 0.5370925645974212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4905, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10882016036655212, |
|
"grad_norm": 0.5343324536740028, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4792, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1145475372279496, |
|
"grad_norm": 0.5161651942536712, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4748, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12027491408934708, |
|
"grad_norm": 0.5441222202912794, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4759, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12600229095074456, |
|
"grad_norm": 0.5327482809839887, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4794, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13172966781214204, |
|
"grad_norm": 0.5225815889111504, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4766, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13745704467353953, |
|
"grad_norm": 0.5070284078877997, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4709, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.143184421534937, |
|
"grad_norm": 0.517606520993401, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4712, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14891179839633448, |
|
"grad_norm": 0.5320984686529744, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4695, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15463917525773196, |
|
"grad_norm": 0.529639868170382, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4655, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16036655211912945, |
|
"grad_norm": 0.575168826972108, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4746, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1660939289805269, |
|
"grad_norm": 0.5001765588738531, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4647, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1718213058419244, |
|
"grad_norm": 0.5396012723001057, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4659, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1775486827033219, |
|
"grad_norm": 0.5095325332822108, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4579, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.18327605956471935, |
|
"grad_norm": 0.5122121536362334, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4576, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.18900343642611683, |
|
"grad_norm": 0.5329548850391604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4629, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19473081328751432, |
|
"grad_norm": 0.5042536803872275, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4605, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2004581901489118, |
|
"grad_norm": 0.5300288779122528, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4721, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.5190574562315794, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4609, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.21191294387170675, |
|
"grad_norm": 0.5241199686918058, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4526, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.21764032073310424, |
|
"grad_norm": 0.5233221069855917, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4618, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.22336769759450173, |
|
"grad_norm": 0.5210038372014036, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4539, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2290950744558992, |
|
"grad_norm": 0.553052125634231, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4655, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23482245131729668, |
|
"grad_norm": 0.5402131075813968, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4592, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24054982817869416, |
|
"grad_norm": 0.547123856227283, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4661, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.24627720504009165, |
|
"grad_norm": 0.5398652887149104, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4682, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2520045819014891, |
|
"grad_norm": 0.546075973932548, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4628, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.25773195876288657, |
|
"grad_norm": 0.5049452018718911, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4608, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2634593356242841, |
|
"grad_norm": 0.5290358743639263, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4455, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.26918671248568155, |
|
"grad_norm": 0.5765812894544023, |
|
"learning_rate": 5e-06, |
|
"loss": 0.463, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.27491408934707906, |
|
"grad_norm": 0.5447055932241692, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4433, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2806414662084765, |
|
"grad_norm": 0.5289823549926729, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4431, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.286368843069874, |
|
"grad_norm": 0.4980084292593178, |
|
"learning_rate": 5e-06, |
|
"loss": 0.456, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2920962199312715, |
|
"grad_norm": 0.5522447921128621, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4652, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.29782359679266895, |
|
"grad_norm": 0.5296189595956464, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4297, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3035509736540664, |
|
"grad_norm": 0.5271978881360857, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4629, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.30927835051546393, |
|
"grad_norm": 0.5071706509511955, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4533, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3150057273768614, |
|
"grad_norm": 0.558135676822065, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4513, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3207331042382589, |
|
"grad_norm": 0.5784740135003561, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4604, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.32646048109965636, |
|
"grad_norm": 0.5335495547019294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4605, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3321878579610538, |
|
"grad_norm": 0.5489863296663943, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4506, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.33791523482245134, |
|
"grad_norm": 0.535282941674749, |
|
"learning_rate": 5e-06, |
|
"loss": 0.444, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.3436426116838488, |
|
"grad_norm": 0.5147273315285874, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4516, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.34936998854524626, |
|
"grad_norm": 0.50077282871092, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4581, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3550973654066438, |
|
"grad_norm": 0.5415059448469354, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4602, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.36082474226804123, |
|
"grad_norm": 0.5362372324428211, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4513, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3665521191294387, |
|
"grad_norm": 0.5417339363013141, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4536, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3722794959908362, |
|
"grad_norm": 0.5368661394039848, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4625, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.37800687285223367, |
|
"grad_norm": 0.48954972250964995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4437, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3837342497136312, |
|
"grad_norm": 0.5655597381472477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4559, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.38946162657502864, |
|
"grad_norm": 0.5042748966559669, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4444, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3951890034364261, |
|
"grad_norm": 0.502096112493101, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4437, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4009163802978236, |
|
"grad_norm": 0.5154503882668929, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4517, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4066437571592211, |
|
"grad_norm": 0.5010738869784289, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4443, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 0.5262171503893435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4416, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.41809851088201605, |
|
"grad_norm": 0.5413323628955329, |
|
"learning_rate": 5e-06, |
|
"loss": 0.453, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.4238258877434135, |
|
"grad_norm": 0.5117489213181463, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4506, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.42955326460481097, |
|
"grad_norm": 0.5423772098810782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4394, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4352806414662085, |
|
"grad_norm": 0.5142331037751853, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4391, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.44100801832760594, |
|
"grad_norm": 0.5185745909865896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4569, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.44673539518900346, |
|
"grad_norm": 0.5250033250072343, |
|
"learning_rate": 5e-06, |
|
"loss": 0.453, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4524627720504009, |
|
"grad_norm": 0.5273208537854123, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4532, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4581901489117984, |
|
"grad_norm": 0.5219200094883771, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4534, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4639175257731959, |
|
"grad_norm": 0.5265307700538928, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4467, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.46964490263459335, |
|
"grad_norm": 0.5138527259324056, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4427, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4753722794959908, |
|
"grad_norm": 0.530301258016885, |
|
"learning_rate": 5e-06, |
|
"loss": 0.45, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.48109965635738833, |
|
"grad_norm": 0.523291306479459, |
|
"learning_rate": 5e-06, |
|
"loss": 0.448, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.4868270332187858, |
|
"grad_norm": 0.5425078145535479, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4444, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4925544100801833, |
|
"grad_norm": 0.5486979687540795, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4506, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.49828178694158076, |
|
"grad_norm": 0.5073997763272261, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4498, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5040091638029782, |
|
"grad_norm": 0.5025459977891622, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4347, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5097365406643757, |
|
"grad_norm": 0.5133648493490751, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4428, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5154639175257731, |
|
"grad_norm": 0.4885863471136841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4478, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5211912943871707, |
|
"grad_norm": 0.5425525752294759, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4439, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5269186712485682, |
|
"grad_norm": 0.5506692407890452, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4491, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5326460481099656, |
|
"grad_norm": 0.5478953525966641, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4539, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5383734249713631, |
|
"grad_norm": 0.4860248929907917, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4436, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5441008018327605, |
|
"grad_norm": 0.5381987297241178, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4378, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5498281786941581, |
|
"grad_norm": 0.5164585071291472, |
|
"learning_rate": 5e-06, |
|
"loss": 0.438, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.5478408716601747, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4429, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.561282932416953, |
|
"grad_norm": 0.5319168815859161, |
|
"learning_rate": 5e-06, |
|
"loss": 0.454, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5670103092783505, |
|
"grad_norm": 0.5049587251570472, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4469, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.572737686139748, |
|
"grad_norm": 0.5319440597432152, |
|
"learning_rate": 5e-06, |
|
"loss": 0.437, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5784650630011455, |
|
"grad_norm": 0.511281381398782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4382, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.584192439862543, |
|
"grad_norm": 0.5194710291745682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4559, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5899198167239404, |
|
"grad_norm": 0.5074370643679889, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4588, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5956471935853379, |
|
"grad_norm": 0.5191559040020144, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4377, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.6013745704467354, |
|
"grad_norm": 0.5476079411315063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4449, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6071019473081328, |
|
"grad_norm": 0.5673160064722429, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4352, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.6128293241695304, |
|
"grad_norm": 0.5065625800440946, |
|
"learning_rate": 5e-06, |
|
"loss": 0.441, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"grad_norm": 0.5512625856761683, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4429, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.6242840778923253, |
|
"grad_norm": 0.552355840658565, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4434, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.6300114547537228, |
|
"grad_norm": 0.487614534934624, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4292, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6357388316151202, |
|
"grad_norm": 0.512636901651117, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4441, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6414662084765178, |
|
"grad_norm": 0.5128771226016339, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4386, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6471935853379153, |
|
"grad_norm": 0.501226424633001, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4277, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6529209621993127, |
|
"grad_norm": 0.5370012590547919, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4444, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.6586483390607102, |
|
"grad_norm": 0.5136199788898852, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4441, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6643757159221076, |
|
"grad_norm": 0.4964737141886483, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4488, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6701030927835051, |
|
"grad_norm": 0.5516171001785463, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4425, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6758304696449027, |
|
"grad_norm": 0.5495702116230217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4408, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.6815578465063001, |
|
"grad_norm": 0.5459174578575696, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4407, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.6872852233676976, |
|
"grad_norm": 0.5345898636386911, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4537, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.693012600229095, |
|
"grad_norm": 0.5243198575640295, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4406, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.6987399770904925, |
|
"grad_norm": 0.5129227421288873, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4317, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.7044673539518901, |
|
"grad_norm": 0.5260957634672794, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4273, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.7101947308132875, |
|
"grad_norm": 0.5437290419224033, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4259, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.715922107674685, |
|
"grad_norm": 0.5347492902940403, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4431, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.7216494845360825, |
|
"grad_norm": 0.5263261030228884, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4345, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.7273768613974799, |
|
"grad_norm": 0.5084395646880466, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4402, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.7331042382588774, |
|
"grad_norm": 0.5069357119145561, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4193, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.738831615120275, |
|
"grad_norm": 0.5400403941956701, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4398, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.7445589919816724, |
|
"grad_norm": 0.5233065624416613, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4415, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7502863688430699, |
|
"grad_norm": 0.5423286351815177, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4455, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.7560137457044673, |
|
"grad_norm": 0.5109684016877989, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4424, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7617411225658648, |
|
"grad_norm": 0.527654171024067, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4379, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7674684994272624, |
|
"grad_norm": 0.4948196366852471, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4409, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.7731958762886598, |
|
"grad_norm": 0.5572218648877072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.442, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7789232531500573, |
|
"grad_norm": 0.4669728936660895, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4194, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7846506300114547, |
|
"grad_norm": 0.5428733395212945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.425, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.7903780068728522, |
|
"grad_norm": 0.4985083493758901, |
|
"learning_rate": 5e-06, |
|
"loss": 0.428, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.7961053837342497, |
|
"grad_norm": 0.5127694485606286, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4318, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.8018327605956472, |
|
"grad_norm": 0.5429586838215733, |
|
"learning_rate": 5e-06, |
|
"loss": 0.429, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8075601374570447, |
|
"grad_norm": 0.5220082918214546, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4359, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.8132875143184422, |
|
"grad_norm": 0.4999143755354932, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4344, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.8190148911798396, |
|
"grad_norm": 0.5066162466504854, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4341, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 0.5175621795775882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4383, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.8304696449026346, |
|
"grad_norm": 0.46825138868237176, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4186, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.8361970217640321, |
|
"grad_norm": 0.49668507800036343, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4352, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.8419243986254296, |
|
"grad_norm": 0.5076029260875138, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4418, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.847651775486827, |
|
"grad_norm": 0.5255466837310667, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4318, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.8533791523482245, |
|
"grad_norm": 0.524332739040499, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4348, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.8591065292096219, |
|
"grad_norm": 0.49048587533812715, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4335, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8648339060710195, |
|
"grad_norm": 0.5274633464517493, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4302, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.870561282932417, |
|
"grad_norm": 0.5216418463132072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.429, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.8762886597938144, |
|
"grad_norm": 0.51978258763271, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4348, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.8820160366552119, |
|
"grad_norm": 0.5438623460564584, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4454, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.8877434135166093, |
|
"grad_norm": 0.4993963585199609, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4245, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.8934707903780069, |
|
"grad_norm": 0.5430194121475598, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4456, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.8991981672394044, |
|
"grad_norm": 0.5336061227789548, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4225, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.9049255441008018, |
|
"grad_norm": 0.4974230293054768, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4248, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.9106529209621993, |
|
"grad_norm": 0.5128982229920478, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4338, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.9163802978235968, |
|
"grad_norm": 0.5231256959340072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.435, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9221076746849943, |
|
"grad_norm": 0.4918096317536555, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4438, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.9278350515463918, |
|
"grad_norm": 0.5278974812779116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4313, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.9335624284077892, |
|
"grad_norm": 0.4921635557471188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4208, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.9392898052691867, |
|
"grad_norm": 0.5251247147790307, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4254, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.9450171821305842, |
|
"grad_norm": 0.4710846491151496, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4357, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.9507445589919816, |
|
"grad_norm": 0.5545034160352649, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4296, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.9564719358533792, |
|
"grad_norm": 0.5200683652411496, |
|
"learning_rate": 5e-06, |
|
"loss": 0.432, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9621993127147767, |
|
"grad_norm": 0.5229813016973693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4374, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.9679266895761741, |
|
"grad_norm": 0.4819952616802862, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4137, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.9736540664375716, |
|
"grad_norm": 0.5337445133162833, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4347, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.979381443298969, |
|
"grad_norm": 0.5207169222218365, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4234, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.9851088201603666, |
|
"grad_norm": 0.5235455233723544, |
|
"learning_rate": 5e-06, |
|
"loss": 0.433, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.9908361970217641, |
|
"grad_norm": 0.5297773672827779, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4322, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.9965635738831615, |
|
"grad_norm": 0.5100360279338871, |
|
"learning_rate": 5e-06, |
|
"loss": 0.429, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.4306947588920593, |
|
"eval_runtime": 295.9271, |
|
"eval_samples_per_second": 39.746, |
|
"eval_steps_per_second": 0.622, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 1.002290950744559, |
|
"grad_norm": 0.5164564401791184, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4117, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.0080183276059564, |
|
"grad_norm": 0.5373967670552326, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3742, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.013745704467354, |
|
"grad_norm": 0.5049268545352126, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3779, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.0194730813287514, |
|
"grad_norm": 0.5061218550685067, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3702, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.0252004581901488, |
|
"grad_norm": 0.4986641460518763, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3711, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.0309278350515463, |
|
"grad_norm": 0.5053371876016091, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3663, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.036655211912944, |
|
"grad_norm": 0.5638955186469768, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3785, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.0423825887743414, |
|
"grad_norm": 0.5203865221430506, |
|
"learning_rate": 5e-06, |
|
"loss": 0.37, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.0481099656357389, |
|
"grad_norm": 0.5084938620530141, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3773, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.0538373424971363, |
|
"grad_norm": 0.49981188453102293, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3795, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.0595647193585338, |
|
"grad_norm": 0.4982217537194922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.368, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.0652920962199313, |
|
"grad_norm": 0.5082225744208482, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3726, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.0710194730813287, |
|
"grad_norm": 0.5263843366712018, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3806, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.0767468499427262, |
|
"grad_norm": 0.48898589739857184, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3719, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.0824742268041236, |
|
"grad_norm": 0.5096088003209625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.373, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.088201603665521, |
|
"grad_norm": 0.48705936220014545, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3844, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.0939289805269188, |
|
"grad_norm": 0.49298990949849025, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3734, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.0996563573883162, |
|
"grad_norm": 0.5169227323207669, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3794, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.1053837342497137, |
|
"grad_norm": 0.502144264943404, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3811, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.47522199328541975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3673, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.1168384879725086, |
|
"grad_norm": 0.49289512210078507, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3723, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.122565864833906, |
|
"grad_norm": 0.5179394316744345, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3716, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.1282932416953035, |
|
"grad_norm": 0.4991350954847643, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3757, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.134020618556701, |
|
"grad_norm": 0.5215898643906023, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3811, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.1397479954180985, |
|
"grad_norm": 0.5248312587556514, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3834, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.145475372279496, |
|
"grad_norm": 0.5112191261184691, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3731, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.1512027491408934, |
|
"grad_norm": 0.49244426705823124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3782, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.1569301260022908, |
|
"grad_norm": 0.5524056534176885, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3751, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.1626575028636885, |
|
"grad_norm": 0.522166484229405, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3761, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.168384879725086, |
|
"grad_norm": 0.4995069364287706, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3741, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.1741122565864834, |
|
"grad_norm": 0.510902706338532, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3743, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.179839633447881, |
|
"grad_norm": 0.5065453487736599, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3722, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.1855670103092784, |
|
"grad_norm": 0.5151805060650854, |
|
"learning_rate": 5e-06, |
|
"loss": 0.375, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.1912943871706758, |
|
"grad_norm": 0.49831260628359125, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3808, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.1970217640320733, |
|
"grad_norm": 0.5121714439577272, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3698, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.2027491408934707, |
|
"grad_norm": 0.4912963388060686, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3699, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.2084765177548682, |
|
"grad_norm": 0.48571233935935587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.378, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.2142038946162657, |
|
"grad_norm": 0.4821158492894926, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3742, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.2199312714776633, |
|
"grad_norm": 0.4873415658805343, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3757, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.2256586483390608, |
|
"grad_norm": 0.535296961321234, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3803, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.2313860252004583, |
|
"grad_norm": 0.5158442516232742, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3799, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.2371134020618557, |
|
"grad_norm": 0.5188127574002888, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3799, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.2428407789232532, |
|
"grad_norm": 0.5197089286594105, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3807, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.2485681557846506, |
|
"grad_norm": 0.503291581955108, |
|
"learning_rate": 5e-06, |
|
"loss": 0.376, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.254295532646048, |
|
"grad_norm": 0.5077499589009502, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3723, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.2600229095074456, |
|
"grad_norm": 0.5332500891173507, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3789, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.265750286368843, |
|
"grad_norm": 0.5060162257976599, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3801, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.2714776632302405, |
|
"grad_norm": 0.4963592770753574, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3714, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.277205040091638, |
|
"grad_norm": 0.4813664865465453, |
|
"learning_rate": 5e-06, |
|
"loss": 0.373, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.2829324169530354, |
|
"grad_norm": 0.5399021996788194, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3847, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.2886597938144329, |
|
"grad_norm": 0.5213132074977475, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3707, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.2943871706758305, |
|
"grad_norm": 0.5096095310985154, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3735, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.300114547537228, |
|
"grad_norm": 0.5046230839401389, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3777, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.3058419243986255, |
|
"grad_norm": 0.49606318588063886, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3802, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.311569301260023, |
|
"grad_norm": 0.48618155936697477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3642, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.3172966781214204, |
|
"grad_norm": 0.5131608728495509, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3752, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.3230240549828178, |
|
"grad_norm": 0.5398006873457784, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3771, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.3287514318442153, |
|
"grad_norm": 0.5147920738496441, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3701, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.3344788087056128, |
|
"grad_norm": 0.5034049600590188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3661, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.3402061855670104, |
|
"grad_norm": 0.49524063841373434, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3659, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.345933562428408, |
|
"grad_norm": 0.5167698042786277, |
|
"learning_rate": 5e-06, |
|
"loss": 0.369, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.3516609392898054, |
|
"grad_norm": 0.49269430039048795, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3773, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.3573883161512028, |
|
"grad_norm": 0.5168560438384152, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3743, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.3631156930126003, |
|
"grad_norm": 0.4932882563190922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3816, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.3688430698739977, |
|
"grad_norm": 0.5282260861884785, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3844, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.3745704467353952, |
|
"grad_norm": 0.5019717962696827, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3672, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.3802978235967927, |
|
"grad_norm": 0.4895383626409957, |
|
"learning_rate": 5e-06, |
|
"loss": 0.373, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.38602520045819, |
|
"grad_norm": 0.4802171696456841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3674, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.3917525773195876, |
|
"grad_norm": 0.47543500312582, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3725, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.397479954180985, |
|
"grad_norm": 0.5058875019724552, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3721, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.4032073310423825, |
|
"grad_norm": 0.4993814896944319, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3779, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.40893470790378, |
|
"grad_norm": 0.5109971377219812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3787, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.4146620847651776, |
|
"grad_norm": 0.5139034920537475, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3728, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.420389461626575, |
|
"grad_norm": 0.5093772065662143, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3723, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.4261168384879725, |
|
"grad_norm": 0.4971098961659199, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3707, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.43184421534937, |
|
"grad_norm": 0.4955169015948491, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3815, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.4375715922107675, |
|
"grad_norm": 0.5057004758414246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.369, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.443298969072165, |
|
"grad_norm": 0.5371969106259902, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3725, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.4490263459335624, |
|
"grad_norm": 0.5043182854139414, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3625, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.4547537227949598, |
|
"grad_norm": 0.5039396765453658, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3735, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.4604810996563573, |
|
"grad_norm": 0.4643231312606545, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3714, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.466208476517755, |
|
"grad_norm": 0.5122652558271147, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3737, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.4719358533791524, |
|
"grad_norm": 0.4599727947671919, |
|
"learning_rate": 5e-06, |
|
"loss": 0.381, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.47766323024055, |
|
"grad_norm": 0.47595731930660123, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3671, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.4833906071019474, |
|
"grad_norm": 0.49427201740882865, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3746, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.4891179839633448, |
|
"grad_norm": 0.5160559476566308, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3732, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.4948453608247423, |
|
"grad_norm": 0.5175281975939867, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3757, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.5005727376861397, |
|
"grad_norm": 0.4918105583749727, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3804, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.5063001145475372, |
|
"grad_norm": 0.5233131734973828, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3808, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.5120274914089347, |
|
"grad_norm": 0.5032462746632174, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3786, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.5177548682703321, |
|
"grad_norm": 0.5238794388448361, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3787, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.5234822451317296, |
|
"grad_norm": 0.5075967240987918, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3813, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.529209621993127, |
|
"grad_norm": 0.5094033608152486, |
|
"learning_rate": 5e-06, |
|
"loss": 0.388, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.5349369988545245, |
|
"grad_norm": 0.4937138678412315, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3774, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.540664375715922, |
|
"grad_norm": 0.5074829225323386, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3804, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.5463917525773194, |
|
"grad_norm": 0.5311200275972803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3715, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.552119129438717, |
|
"grad_norm": 0.47102047541148007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3726, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.5578465063001146, |
|
"grad_norm": 0.5071401413917036, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3708, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.563573883161512, |
|
"grad_norm": 0.529109086962967, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3802, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.5693012600229095, |
|
"grad_norm": 0.4921183498105907, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3709, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.575028636884307, |
|
"grad_norm": 0.5098013107410593, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3807, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.5807560137457046, |
|
"grad_norm": 0.5153311664282273, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3778, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.586483390607102, |
|
"grad_norm": 0.4972397902001517, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3694, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.5922107674684995, |
|
"grad_norm": 0.5119592554713294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3664, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.597938144329897, |
|
"grad_norm": 0.5267592970470695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3782, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.6036655211912945, |
|
"grad_norm": 0.5009393372942625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3809, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.609392898052692, |
|
"grad_norm": 0.5199048188264086, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3839, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.6151202749140894, |
|
"grad_norm": 0.5236842623961477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3749, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.6208476517754868, |
|
"grad_norm": 0.5159942734966763, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3842, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.6265750286368843, |
|
"grad_norm": 0.4903857233492226, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3762, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.6323024054982818, |
|
"grad_norm": 0.4881518724661408, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3813, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.6380297823596792, |
|
"grad_norm": 0.5231546664688862, |
|
"learning_rate": 5e-06, |
|
"loss": 0.373, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.6437571592210767, |
|
"grad_norm": 0.49484922058601294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3715, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.6494845360824741, |
|
"grad_norm": 0.5254331745353574, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3724, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.6552119129438716, |
|
"grad_norm": 0.5095482545887045, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3795, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.660939289805269, |
|
"grad_norm": 0.4741172756115684, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3795, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.49234693166281435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.373, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.6723940435280642, |
|
"grad_norm": 0.49696468106227953, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3758, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.6781214203894617, |
|
"grad_norm": 0.5077092844151242, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3731, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.6838487972508591, |
|
"grad_norm": 0.5147506102086381, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3757, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.6895761741122566, |
|
"grad_norm": 0.4962203469789613, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3807, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.695303550973654, |
|
"grad_norm": 0.5215405834502828, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3817, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.7010309278350515, |
|
"grad_norm": 0.5291514188360492, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3733, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.7067583046964492, |
|
"grad_norm": 0.5152001522839965, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3713, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.7124856815578466, |
|
"grad_norm": 0.4943426681496219, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3691, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.718213058419244, |
|
"grad_norm": 0.5036953905099308, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3732, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.7239404352806416, |
|
"grad_norm": 0.48232912691583646, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3712, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.729667812142039, |
|
"grad_norm": 0.49927832455696153, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3795, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.7353951890034365, |
|
"grad_norm": 0.5113424281726278, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3702, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.741122565864834, |
|
"grad_norm": 0.4990266194355831, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3685, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.7468499427262314, |
|
"grad_norm": 0.4950858312514841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3708, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.7525773195876289, |
|
"grad_norm": 0.505648882916022, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3669, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.7583046964490263, |
|
"grad_norm": 0.5033432169804157, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3752, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.7640320733104238, |
|
"grad_norm": 0.4941530503640251, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3774, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.7697594501718212, |
|
"grad_norm": 0.47374323765927717, |
|
"learning_rate": 5e-06, |
|
"loss": 0.369, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.7754868270332187, |
|
"grad_norm": 0.5114072137457846, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3744, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.7812142038946162, |
|
"grad_norm": 0.5714812909761224, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3728, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.7869415807560136, |
|
"grad_norm": 0.5031460002692415, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3586, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.792668957617411, |
|
"grad_norm": 0.5508643603790662, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3775, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.7983963344788088, |
|
"grad_norm": 0.5128236251176556, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3761, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.8041237113402062, |
|
"grad_norm": 0.4976776485027551, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3711, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.8098510882016037, |
|
"grad_norm": 0.517111964713008, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3761, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.8155784650630011, |
|
"grad_norm": 0.49528104941202294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3805, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.8213058419243986, |
|
"grad_norm": 0.49140254078298945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3765, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.827033218785796, |
|
"grad_norm": 0.507350935835637, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3658, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.8327605956471937, |
|
"grad_norm": 0.514856233646019, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3791, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.8384879725085912, |
|
"grad_norm": 0.48761431113920733, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3719, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.8442153493699887, |
|
"grad_norm": 0.4992391499981669, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3787, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.8499427262313861, |
|
"grad_norm": 0.49038499742537395, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3738, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.8556701030927836, |
|
"grad_norm": 0.48881972116025446, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3848, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.861397479954181, |
|
"grad_norm": 0.5075760745210575, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3706, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.8671248568155785, |
|
"grad_norm": 0.5261001403985256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3695, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.872852233676976, |
|
"grad_norm": 0.5143555207892127, |
|
"learning_rate": 5e-06, |
|
"loss": 0.371, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.8785796105383734, |
|
"grad_norm": 0.516416280940184, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3724, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.8843069873997709, |
|
"grad_norm": 0.5089108000531376, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3722, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.8900343642611683, |
|
"grad_norm": 0.4931941440250671, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3677, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.8957617411225658, |
|
"grad_norm": 0.5066849777964215, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3751, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.9014891179839633, |
|
"grad_norm": 0.4924900296297563, |
|
"learning_rate": 5e-06, |
|
"loss": 0.374, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.9072164948453607, |
|
"grad_norm": 0.48837230848342283, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3633, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.9129438717067582, |
|
"grad_norm": 0.5068824941492155, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3786, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.9186712485681556, |
|
"grad_norm": 0.47874082053083117, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3716, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.9243986254295533, |
|
"grad_norm": 0.5084759163319922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3661, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.9301260022909508, |
|
"grad_norm": 0.495142196557906, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3681, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.9358533791523482, |
|
"grad_norm": 0.4894249690522435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3718, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.9415807560137457, |
|
"grad_norm": 0.5400470537238534, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3788, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.9473081328751431, |
|
"grad_norm": 0.49882507022906875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3685, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.9530355097365406, |
|
"grad_norm": 0.4898539601683119, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3737, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.9587628865979383, |
|
"grad_norm": 0.5065048271839747, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3698, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.9644902634593358, |
|
"grad_norm": 0.493065120502946, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3764, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.9702176403207332, |
|
"grad_norm": 0.4855908493410204, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3701, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.9759450171821307, |
|
"grad_norm": 0.5073814840391613, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3686, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.9816723940435281, |
|
"grad_norm": 0.5150533996226417, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3728, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.9873997709049256, |
|
"grad_norm": 0.5168240656639432, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3725, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.993127147766323, |
|
"grad_norm": 0.5067726637599477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.378, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.9988545246277205, |
|
"grad_norm": 0.5310084366302013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3755, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.4207070767879486, |
|
"eval_runtime": 290.7303, |
|
"eval_samples_per_second": 40.457, |
|
"eval_steps_per_second": 0.633, |
|
"step": 3492 |
|
}, |
|
{ |
|
"epoch": 2.004581901489118, |
|
"grad_norm": 0.5582678235559352, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3263, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.0103092783505154, |
|
"grad_norm": 0.4945423992218601, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3121, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.016036655211913, |
|
"grad_norm": 0.503862666771499, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3064, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.0217640320733103, |
|
"grad_norm": 0.5070969658417839, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3114, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.027491408934708, |
|
"grad_norm": 0.4962584920317651, |
|
"learning_rate": 5e-06, |
|
"loss": 0.309, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.0332187857961053, |
|
"grad_norm": 0.5302688037121702, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3152, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.0389461626575027, |
|
"grad_norm": 0.5169447575631415, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3137, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.0446735395189, |
|
"grad_norm": 0.5127605730394688, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3142, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.0504009163802976, |
|
"grad_norm": 0.49179890084778705, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3138, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.056128293241695, |
|
"grad_norm": 0.5228610196241354, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3272, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.0618556701030926, |
|
"grad_norm": 0.5330298953216434, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3217, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.0675830469644905, |
|
"grad_norm": 0.515621478476054, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3126, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.073310423825888, |
|
"grad_norm": 0.48330918450825416, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3128, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.0790378006872854, |
|
"grad_norm": 0.5133555477218072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3103, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.084765177548683, |
|
"grad_norm": 0.4770491329819935, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3122, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.0904925544100803, |
|
"grad_norm": 0.4964810429371462, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3241, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.0962199312714778, |
|
"grad_norm": 0.490969087167152, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3183, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.1019473081328752, |
|
"grad_norm": 0.5043492115565404, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3142, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.1076746849942727, |
|
"grad_norm": 0.5249832358572287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3164, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.11340206185567, |
|
"grad_norm": 0.47196104358453034, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3121, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.1191294387170676, |
|
"grad_norm": 0.5025207771066358, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3134, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.124856815578465, |
|
"grad_norm": 0.512306750789012, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3101, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.1305841924398625, |
|
"grad_norm": 0.48884001780131237, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3188, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.13631156930126, |
|
"grad_norm": 0.4926488293937944, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3153, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.1420389461626574, |
|
"grad_norm": 0.47726922330881655, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3123, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.147766323024055, |
|
"grad_norm": 0.5330351768407757, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3205, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.1534936998854524, |
|
"grad_norm": 0.4985016475823844, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3139, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.15922107674685, |
|
"grad_norm": 0.5358542806513212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3182, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.1649484536082473, |
|
"grad_norm": 0.5021134280557785, |
|
"learning_rate": 5e-06, |
|
"loss": 0.317, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.1706758304696447, |
|
"grad_norm": 0.48209366607788223, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3232, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.176403207331042, |
|
"grad_norm": 0.5074471012148579, |
|
"learning_rate": 5e-06, |
|
"loss": 0.315, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.1821305841924397, |
|
"grad_norm": 0.511716111734798, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3157, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.1878579610538376, |
|
"grad_norm": 0.48837509446626515, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3126, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.193585337915235, |
|
"grad_norm": 0.5257347682730206, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3128, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.1993127147766325, |
|
"grad_norm": 0.5076959903128635, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3197, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.20504009163803, |
|
"grad_norm": 0.4979642794443462, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3167, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.2107674684994274, |
|
"grad_norm": 0.535375199212662, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3221, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.216494845360825, |
|
"grad_norm": 0.515160036407322, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3209, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.4951254251771504, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3201, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.22794959908362, |
|
"grad_norm": 0.521169428795531, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3239, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.2336769759450172, |
|
"grad_norm": 0.49666401639454866, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3137, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.2394043528064147, |
|
"grad_norm": 0.5283758992360169, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3157, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.245131729667812, |
|
"grad_norm": 0.5283131673253361, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3228, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.2508591065292096, |
|
"grad_norm": 0.4927867619871094, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3123, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.256586483390607, |
|
"grad_norm": 0.4748812879634188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3135, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.2623138602520045, |
|
"grad_norm": 0.4893718894473708, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3166, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.268041237113402, |
|
"grad_norm": 0.5104857240717058, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3202, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.2737686139747995, |
|
"grad_norm": 0.5013495951931499, |
|
"learning_rate": 5e-06, |
|
"loss": 0.323, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.279495990836197, |
|
"grad_norm": 0.501822158580599, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3156, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.2852233676975944, |
|
"grad_norm": 0.5283371610492251, |
|
"learning_rate": 5e-06, |
|
"loss": 0.319, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.290950744558992, |
|
"grad_norm": 0.49904707487464617, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3208, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.2966781214203893, |
|
"grad_norm": 0.4853428332577101, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3151, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.3024054982817868, |
|
"grad_norm": 0.49871404661053265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3198, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.308132875143184, |
|
"grad_norm": 0.48059451244658824, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3104, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.3138602520045817, |
|
"grad_norm": 0.4751556595576501, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3223, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.319587628865979, |
|
"grad_norm": 0.5264389154058611, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3183, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.325315005727377, |
|
"grad_norm": 0.5024196062279519, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3125, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.3310423825887745, |
|
"grad_norm": 0.5111784195864709, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3166, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.336769759450172, |
|
"grad_norm": 0.4887735479921481, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3241, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.3424971363115694, |
|
"grad_norm": 0.5330627179549918, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3267, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.348224513172967, |
|
"grad_norm": 0.47798882316915986, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3152, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.3539518900343643, |
|
"grad_norm": 0.5191521422457772, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3293, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 2.359679266895762, |
|
"grad_norm": 0.5125909905451724, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3134, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 2.3654066437571593, |
|
"grad_norm": 0.4886207764110825, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3217, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 2.3711340206185567, |
|
"grad_norm": 0.5044443936607491, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3179, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 2.376861397479954, |
|
"grad_norm": 0.5120039487905359, |
|
"learning_rate": 5e-06, |
|
"loss": 0.321, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.3825887743413516, |
|
"grad_norm": 0.489165925771695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3108, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 2.388316151202749, |
|
"grad_norm": 0.5058038370694165, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3217, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.3940435280641466, |
|
"grad_norm": 0.5098394942280727, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3162, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.399770904925544, |
|
"grad_norm": 0.5143275541095652, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3206, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.4054982817869415, |
|
"grad_norm": 0.48723979179724397, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3227, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.411225658648339, |
|
"grad_norm": 0.49153880143502765, |
|
"learning_rate": 5e-06, |
|
"loss": 0.327, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.4169530355097364, |
|
"grad_norm": 0.4990389931982443, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3145, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.422680412371134, |
|
"grad_norm": 0.5007707010716869, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3192, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.4284077892325313, |
|
"grad_norm": 0.4842447385057361, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3187, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.434135166093929, |
|
"grad_norm": 0.5282173056004758, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3248, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.4398625429553267, |
|
"grad_norm": 0.5242920164717728, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3276, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 2.445589919816724, |
|
"grad_norm": 0.5256084243387356, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3076, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 2.4513172966781216, |
|
"grad_norm": 0.49717599592896866, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3183, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 2.457044673539519, |
|
"grad_norm": 0.4917289115217294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3169, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.4627720504009165, |
|
"grad_norm": 0.48696947359252235, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3181, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.468499427262314, |
|
"grad_norm": 0.5066247814081654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3214, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.4742268041237114, |
|
"grad_norm": 0.5116828561516821, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3232, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.479954180985109, |
|
"grad_norm": 0.4721726863796144, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3203, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.4856815578465064, |
|
"grad_norm": 0.5140261292884363, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3205, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.491408934707904, |
|
"grad_norm": 0.4995754907401082, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3258, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.4971363115693013, |
|
"grad_norm": 0.5041823551282999, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3194, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.5028636884306987, |
|
"grad_norm": 0.49384248849165524, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3225, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.508591065292096, |
|
"grad_norm": 0.4889201995538099, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3205, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.5143184421534936, |
|
"grad_norm": 0.47694834539706893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3175, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.520045819014891, |
|
"grad_norm": 0.49461329366934864, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3223, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.5257731958762886, |
|
"grad_norm": 0.5125961099921013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.323, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.531500572737686, |
|
"grad_norm": 0.48971922047461597, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3201, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.5372279495990835, |
|
"grad_norm": 0.47489099276070756, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3227, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.542955326460481, |
|
"grad_norm": 0.4907691648649352, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3201, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.5486827033218784, |
|
"grad_norm": 0.5026363282224567, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3196, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.554410080183276, |
|
"grad_norm": 0.5176119238693507, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3243, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.5601374570446733, |
|
"grad_norm": 0.5085068635390583, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3183, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.565864833906071, |
|
"grad_norm": 0.472068765168504, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3172, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.5715922107674682, |
|
"grad_norm": 0.5107727781006867, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3276, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.5773195876288657, |
|
"grad_norm": 0.49576533772806247, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3197, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.5830469644902636, |
|
"grad_norm": 0.5147069600733026, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3303, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 2.588774341351661, |
|
"grad_norm": 0.5410222797339587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3213, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 2.5945017182130585, |
|
"grad_norm": 0.5129756441436896, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3159, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 2.600229095074456, |
|
"grad_norm": 0.5090769089905053, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3198, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 2.6059564719358534, |
|
"grad_norm": 0.5312483213444277, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3239, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 2.611683848797251, |
|
"grad_norm": 0.4990401487796137, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3165, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 2.6174112256586484, |
|
"grad_norm": 0.4906139970113933, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3162, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 2.623138602520046, |
|
"grad_norm": 0.49275236064069183, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3162, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 2.6288659793814433, |
|
"grad_norm": 0.505876180763233, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3173, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.6345933562428407, |
|
"grad_norm": 0.5034109210608266, |
|
"learning_rate": 5e-06, |
|
"loss": 0.327, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.640320733104238, |
|
"grad_norm": 0.5125105668846632, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3201, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 2.6460481099656357, |
|
"grad_norm": 0.4986777563152259, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3243, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 2.651775486827033, |
|
"grad_norm": 0.5028807375636888, |
|
"learning_rate": 5e-06, |
|
"loss": 0.313, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 2.6575028636884306, |
|
"grad_norm": 0.5027638307219331, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3251, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 2.663230240549828, |
|
"grad_norm": 0.5118793036289471, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3171, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 2.6689576174112255, |
|
"grad_norm": 0.5079584260143071, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3196, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 2.6746849942726234, |
|
"grad_norm": 0.5098308127405584, |
|
"learning_rate": 5e-06, |
|
"loss": 0.33, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 2.680412371134021, |
|
"grad_norm": 0.5028460889046886, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3233, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 2.6861397479954183, |
|
"grad_norm": 0.4950860630453671, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3218, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 2.691867124856816, |
|
"grad_norm": 0.5363041499133996, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3115, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.6975945017182132, |
|
"grad_norm": 0.47848642716067413, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3229, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 2.7033218785796107, |
|
"grad_norm": 0.5132570731802132, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3176, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 2.709049255441008, |
|
"grad_norm": 0.5123910169044057, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3216, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 2.7147766323024056, |
|
"grad_norm": 0.49140644629093294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3255, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 2.720504009163803, |
|
"grad_norm": 0.4903514464262286, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3221, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 2.7262313860252005, |
|
"grad_norm": 0.5202137394656217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3188, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.731958762886598, |
|
"grad_norm": 0.5189552209211002, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3268, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 2.7376861397479955, |
|
"grad_norm": 0.4863895418886132, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3146, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 2.743413516609393, |
|
"grad_norm": 0.4963628119686479, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3306, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 2.7491408934707904, |
|
"grad_norm": 0.4780341556691321, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3183, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.754868270332188, |
|
"grad_norm": 0.47632144883855376, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3102, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 2.7605956471935853, |
|
"grad_norm": 0.5229861996836561, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3296, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 2.7663230240549828, |
|
"grad_norm": 0.5062883215395301, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3233, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 2.77205040091638, |
|
"grad_norm": 0.4943950877672053, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3273, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 0.5098452348979725, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3169, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 2.783505154639175, |
|
"grad_norm": 0.5014610096004497, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3232, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 2.7892325315005726, |
|
"grad_norm": 0.5013602455951724, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3183, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 2.79495990836197, |
|
"grad_norm": 0.53869757575354, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3251, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 2.8006872852233675, |
|
"grad_norm": 0.4898326008152037, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3231, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 2.806414662084765, |
|
"grad_norm": 0.4810606140091934, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3174, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.8121420389461624, |
|
"grad_norm": 0.5170925060870262, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3206, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 2.81786941580756, |
|
"grad_norm": 0.4980004979531469, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3271, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 2.8235967926689574, |
|
"grad_norm": 0.4751276576064784, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3255, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 2.8293241695303553, |
|
"grad_norm": 0.4858328987324905, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3199, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 2.8350515463917527, |
|
"grad_norm": 0.5147620527991269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3287, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.84077892325315, |
|
"grad_norm": 0.5149232879792602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3274, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 2.8465063001145476, |
|
"grad_norm": 0.5008258037850903, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3166, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 2.852233676975945, |
|
"grad_norm": 0.5022192392797802, |
|
"learning_rate": 5e-06, |
|
"loss": 0.322, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 2.8579610538373426, |
|
"grad_norm": 0.47652790071973783, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3189, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 2.86368843069874, |
|
"grad_norm": 0.4759345790313986, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3215, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.8694158075601375, |
|
"grad_norm": 0.5072028525553413, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3293, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 2.875143184421535, |
|
"grad_norm": 0.5143722957913531, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3237, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 2.8808705612829324, |
|
"grad_norm": 0.5041664264008966, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3171, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 2.88659793814433, |
|
"grad_norm": 0.49156126691930446, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3153, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 2.8923253150057273, |
|
"grad_norm": 0.49867336387580574, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3263, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.8980526918671248, |
|
"grad_norm": 0.46719115878858214, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3214, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 2.9037800687285222, |
|
"grad_norm": 0.48603049910129803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3192, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 2.9095074455899197, |
|
"grad_norm": 0.5044239151316462, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3218, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 2.915234822451317, |
|
"grad_norm": 0.4870384358690395, |
|
"learning_rate": 5e-06, |
|
"loss": 0.318, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 2.9209621993127146, |
|
"grad_norm": 0.512106578999376, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3194, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.9266895761741125, |
|
"grad_norm": 0.5023395866834555, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3297, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 2.93241695303551, |
|
"grad_norm": 0.5257957417842357, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3246, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 2.9381443298969074, |
|
"grad_norm": 0.48614775617490547, |
|
"learning_rate": 5e-06, |
|
"loss": 0.318, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 2.943871706758305, |
|
"grad_norm": 0.49942615792365136, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3196, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 2.9495990836197024, |
|
"grad_norm": 0.49083897626357387, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3129, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.9553264604811, |
|
"grad_norm": 0.49254139484612786, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3247, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 2.9610538373424973, |
|
"grad_norm": 0.5090239638844181, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3198, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 2.9667812142038947, |
|
"grad_norm": 0.504113575432436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3184, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 2.972508591065292, |
|
"grad_norm": 0.500020414647489, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3167, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 2.9782359679266897, |
|
"grad_norm": 0.5217079453004775, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3263, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.983963344788087, |
|
"grad_norm": 0.47426209559251875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3185, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 2.9896907216494846, |
|
"grad_norm": 0.5033657036471195, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3159, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 2.995418098510882, |
|
"grad_norm": 0.49409722412965856, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3266, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.42773160338401794, |
|
"eval_runtime": 278.3004, |
|
"eval_samples_per_second": 42.264, |
|
"eval_steps_per_second": 0.661, |
|
"step": 5238 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 5238, |
|
"total_flos": 2745782658662400.0, |
|
"train_loss": 0.38240701185468956, |
|
"train_runtime": 39622.8086, |
|
"train_samples_per_second": 16.92, |
|
"train_steps_per_second": 0.132 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5238, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2745782658662400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|