{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0669380370872907, |
|
"eval_steps": 800, |
|
"global_step": 2360, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004522840343735866, |
|
"eval_loss": 2.6005213260650635, |
|
"eval_runtime": 106.0609, |
|
"eval_samples_per_second": 20.79, |
|
"eval_steps_per_second": 5.205, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004522840343735866, |
|
"grad_norm": 4.152566909790039, |
|
"learning_rate": 2.5e-06, |
|
"loss": 2.461, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009045680687471733, |
|
"grad_norm": 3.253302812576294, |
|
"learning_rate": 5e-06, |
|
"loss": 2.1663, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013568521031207599, |
|
"grad_norm": 3.0376248359680176, |
|
"learning_rate": 7.5e-06, |
|
"loss": 1.5723, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018091361374943465, |
|
"grad_norm": 1.5493546724319458, |
|
"learning_rate": 1e-05, |
|
"loss": 1.0123, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.022614201718679332, |
|
"grad_norm": 0.7575089335441589, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.7936, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.027137042062415198, |
|
"grad_norm": 1.0295528173446655, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.688, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.031659882406151064, |
|
"grad_norm": 0.5441007614135742, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.6524, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03618272274988693, |
|
"grad_norm": 0.8951964974403381, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6575, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0407055630936228, |
|
"grad_norm": 0.7289975881576538, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.6334, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.045228403437358664, |
|
"grad_norm": 0.8988147377967834, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.6591, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04975124378109453, |
|
"grad_norm": 0.4641079902648926, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.6724, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.054274084124830396, |
|
"grad_norm": 0.6655707955360413, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6467, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05879692446856626, |
|
"grad_norm": 0.7066192030906677, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.6202, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06331976481230213, |
|
"grad_norm": 0.9096384644508362, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.6344, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06784260515603799, |
|
"grad_norm": 0.6074104905128479, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.5695, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07236544549977386, |
|
"grad_norm": 0.8516980409622192, |
|
"learning_rate": 4e-05, |
|
"loss": 0.5924, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07688828584350972, |
|
"grad_norm": 0.7963458299636841, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.6169, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0814111261872456, |
|
"grad_norm": 0.8972164988517761, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.5872, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08593396653098145, |
|
"grad_norm": 4.4737162590026855, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.6164, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09045680687471733, |
|
"grad_norm": 0.6346756815910339, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5744, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09497964721845319, |
|
"grad_norm": 0.8093542456626892, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.5741, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09950248756218906, |
|
"grad_norm": 0.8128954768180847, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.5842, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10402532790592492, |
|
"grad_norm": 0.5212199687957764, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.5368, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.10854816824966079, |
|
"grad_norm": 0.7929616570472717, |
|
"learning_rate": 6e-05, |
|
"loss": 0.5645, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11307100859339665, |
|
"grad_norm": 0.6961472630500793, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.5541, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11759384893713253, |
|
"grad_norm": 0.6359185576438904, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.5642, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12211668928086838, |
|
"grad_norm": 0.6574355959892273, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.5556, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.12663952962460426, |
|
"grad_norm": 0.5738745331764221, |
|
"learning_rate": 7e-05, |
|
"loss": 0.5582, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13116236996834013, |
|
"grad_norm": 0.3199772834777832, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.5796, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.13568521031207598, |
|
"grad_norm": 0.3531240224838257, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.5612, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14020805065581185, |
|
"grad_norm": 0.5375868678092957, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.5645, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.14473089099954772, |
|
"grad_norm": 0.2688968777656555, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5485, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.14925373134328357, |
|
"grad_norm": 0.6451207995414734, |
|
"learning_rate": 8.25e-05, |
|
"loss": 0.5242, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.15377657168701944, |
|
"grad_norm": 2.1042656898498535, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.5639, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.15829941203075532, |
|
"grad_norm": 0.38682234287261963, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.5417, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1628222523744912, |
|
"grad_norm": 0.5062561631202698, |
|
"learning_rate": 9e-05, |
|
"loss": 0.5509, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.16734509271822703, |
|
"grad_norm": 0.6663705706596375, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 0.557, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1718679330619629, |
|
"grad_norm": 0.4430944323539734, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.5711, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.17639077340569878, |
|
"grad_norm": 0.6567730903625488, |
|
"learning_rate": 9.75e-05, |
|
"loss": 0.5437, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.18091361374943465, |
|
"grad_norm": 0.44569894671440125, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5124, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1854364540931705, |
|
"grad_norm": 3.6767895221710205, |
|
"learning_rate": 9.999957281897735e-05, |
|
"loss": 0.5489, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.18995929443690637, |
|
"grad_norm": 1.6086912155151367, |
|
"learning_rate": 9.999829128320874e-05, |
|
"loss": 0.5508, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.19448213478064225, |
|
"grad_norm": 0.42233482003211975, |
|
"learning_rate": 9.999615541459207e-05, |
|
"loss": 0.5205, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.19900497512437812, |
|
"grad_norm": 0.5907697677612305, |
|
"learning_rate": 9.999316524962345e-05, |
|
"loss": 0.5201, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.20352781546811397, |
|
"grad_norm": 0.5699201226234436, |
|
"learning_rate": 9.998932083939656e-05, |
|
"loss": 0.5263, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.20805065581184984, |
|
"grad_norm": 0.8701225519180298, |
|
"learning_rate": 9.998462224960175e-05, |
|
"loss": 0.5421, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2125734961555857, |
|
"grad_norm": 0.38067907094955444, |
|
"learning_rate": 9.997906956052494e-05, |
|
"loss": 0.5388, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.21709633649932158, |
|
"grad_norm": 0.3497803509235382, |
|
"learning_rate": 9.997266286704631e-05, |
|
"loss": 0.5216, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.22161917684305743, |
|
"grad_norm": 1.2132240533828735, |
|
"learning_rate": 9.996540227863854e-05, |
|
"loss": 0.5204, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2261420171867933, |
|
"grad_norm": 0.30633747577667236, |
|
"learning_rate": 9.995728791936504e-05, |
|
"loss": 0.5494, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23066485753052918, |
|
"grad_norm": 0.6719753742218018, |
|
"learning_rate": 9.994831992787788e-05, |
|
"loss": 0.5489, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.23518769787426505, |
|
"grad_norm": 0.5767594575881958, |
|
"learning_rate": 9.993849845741524e-05, |
|
"loss": 0.5324, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2397105382180009, |
|
"grad_norm": 0.236255943775177, |
|
"learning_rate": 9.992782367579899e-05, |
|
"loss": 0.5181, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.24423337856173677, |
|
"grad_norm": 0.788173258304596, |
|
"learning_rate": 9.991629576543163e-05, |
|
"loss": 0.5303, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.24875621890547264, |
|
"grad_norm": 0.2152377963066101, |
|
"learning_rate": 9.990391492329341e-05, |
|
"loss": 0.5284, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2532790592492085, |
|
"grad_norm": 0.7022669911384583, |
|
"learning_rate": 9.989068136093873e-05, |
|
"loss": 0.5282, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.25780189959294436, |
|
"grad_norm": 0.5153821110725403, |
|
"learning_rate": 9.987659530449268e-05, |
|
"loss": 0.5221, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.26232473993668026, |
|
"grad_norm": 0.3221074640750885, |
|
"learning_rate": 9.986165699464705e-05, |
|
"loss": 0.5202, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2668475802804161, |
|
"grad_norm": 0.22985632717609406, |
|
"learning_rate": 9.98458666866564e-05, |
|
"loss": 0.5148, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.27137042062415195, |
|
"grad_norm": 0.5174903869628906, |
|
"learning_rate": 9.98292246503335e-05, |
|
"loss": 0.5389, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.27589326096788785, |
|
"grad_norm": 0.41488873958587646, |
|
"learning_rate": 9.981173117004484e-05, |
|
"loss": 0.514, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.2804161013116237, |
|
"grad_norm": 0.4851066768169403, |
|
"learning_rate": 9.979338654470569e-05, |
|
"loss": 0.5268, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.28493894165535955, |
|
"grad_norm": 0.6160866618156433, |
|
"learning_rate": 9.977419108777514e-05, |
|
"loss": 0.5216, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.28946178199909545, |
|
"grad_norm": 0.47816914319992065, |
|
"learning_rate": 9.975414512725057e-05, |
|
"loss": 0.5058, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2939846223428313, |
|
"grad_norm": 0.47290515899658203, |
|
"learning_rate": 9.973324900566213e-05, |
|
"loss": 0.5233, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.5466296076774597, |
|
"learning_rate": 9.97115030800669e-05, |
|
"loss": 0.5208, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 0.1515071988105774, |
|
"learning_rate": 9.968890772204271e-05, |
|
"loss": 0.5153, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.3075531433740389, |
|
"grad_norm": 0.22212214767932892, |
|
"learning_rate": 9.966546331768191e-05, |
|
"loss": 0.4956, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3120759837177748, |
|
"grad_norm": 0.5279558300971985, |
|
"learning_rate": 9.96411702675847e-05, |
|
"loss": 0.5137, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.31659882406151063, |
|
"grad_norm": 0.49932244420051575, |
|
"learning_rate": 9.961602898685226e-05, |
|
"loss": 0.5226, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3211216644052465, |
|
"grad_norm": 0.4041373133659363, |
|
"learning_rate": 9.959003990507972e-05, |
|
"loss": 0.5063, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.3256445047489824, |
|
"grad_norm": 0.22287319600582123, |
|
"learning_rate": 9.956320346634876e-05, |
|
"loss": 0.4965, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3301673450927182, |
|
"grad_norm": 0.379319429397583, |
|
"learning_rate": 9.953552012922012e-05, |
|
"loss": 0.4926, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.33469018543645407, |
|
"grad_norm": 0.4465825855731964, |
|
"learning_rate": 9.950699036672559e-05, |
|
"loss": 0.5295, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.33921302578018997, |
|
"grad_norm": 0.8027335405349731, |
|
"learning_rate": 9.947761466636014e-05, |
|
"loss": 0.5249, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3437358661239258, |
|
"grad_norm": 0.4452328383922577, |
|
"learning_rate": 9.944739353007344e-05, |
|
"loss": 0.5168, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3482587064676617, |
|
"grad_norm": 0.5175443291664124, |
|
"learning_rate": 9.941632747426129e-05, |
|
"loss": 0.5063, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.35278154681139756, |
|
"grad_norm": 0.6492401361465454, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 0.5028, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.3573043871551334, |
|
"grad_norm": 0.28473520278930664, |
|
"learning_rate": 9.93516627418217e-05, |
|
"loss": 0.502, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.3618272274988693, |
|
"grad_norm": 0.29608818888664246, |
|
"learning_rate": 9.931806517013612e-05, |
|
"loss": 0.503, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3618272274988693, |
|
"eval_loss": 0.5416414737701416, |
|
"eval_runtime": 106.1622, |
|
"eval_samples_per_second": 20.77, |
|
"eval_steps_per_second": 5.2, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.36635006784260515, |
|
"grad_norm": 0.13932587206363678, |
|
"learning_rate": 9.928362488878996e-05, |
|
"loss": 0.5012, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.370872908186341, |
|
"grad_norm": 0.5137602686882019, |
|
"learning_rate": 9.92483424862726e-05, |
|
"loss": 0.5183, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3753957485300769, |
|
"grad_norm": 0.33779120445251465, |
|
"learning_rate": 9.921221856546293e-05, |
|
"loss": 0.5007, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.37991858887381275, |
|
"grad_norm": 0.23543624579906464, |
|
"learning_rate": 9.917525374361912e-05, |
|
"loss": 0.5078, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.38444142921754865, |
|
"grad_norm": 0.4530757665634155, |
|
"learning_rate": 9.913744865236798e-05, |
|
"loss": 0.519, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3889642695612845, |
|
"grad_norm": 0.29827597737312317, |
|
"learning_rate": 9.90988039376942e-05, |
|
"loss": 0.5017, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.39348710990502034, |
|
"grad_norm": 0.8379443883895874, |
|
"learning_rate": 9.905932025992932e-05, |
|
"loss": 0.517, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.39800995024875624, |
|
"grad_norm": 0.3807092607021332, |
|
"learning_rate": 9.901899829374047e-05, |
|
"loss": 0.5118, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.4025327905924921, |
|
"grad_norm": 0.3721909523010254, |
|
"learning_rate": 9.897783872811882e-05, |
|
"loss": 0.512, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.40705563093622793, |
|
"grad_norm": 0.16293685138225555, |
|
"learning_rate": 9.893584226636772e-05, |
|
"loss": 0.4883, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.41157847127996383, |
|
"grad_norm": 0.2745698094367981, |
|
"learning_rate": 9.88930096260909e-05, |
|
"loss": 0.4878, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.4161013116236997, |
|
"grad_norm": 0.2861226499080658, |
|
"learning_rate": 9.884934153917997e-05, |
|
"loss": 0.4861, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4206241519674355, |
|
"grad_norm": 0.14675496518611908, |
|
"learning_rate": 9.880483875180205e-05, |
|
"loss": 0.502, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.4251469923111714, |
|
"grad_norm": 0.3355661928653717, |
|
"learning_rate": 9.8759502024387e-05, |
|
"loss": 0.4769, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.42966983265490727, |
|
"grad_norm": 0.3499309718608856, |
|
"learning_rate": 9.871333213161438e-05, |
|
"loss": 0.5044, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.43419267299864317, |
|
"grad_norm": 0.13604167103767395, |
|
"learning_rate": 9.86663298624003e-05, |
|
"loss": 0.4837, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.438715513342379, |
|
"grad_norm": 0.42066818475723267, |
|
"learning_rate": 9.861849601988383e-05, |
|
"loss": 0.4768, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.44323835368611486, |
|
"grad_norm": 0.2138914316892624, |
|
"learning_rate": 9.856983142141339e-05, |
|
"loss": 0.5059, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.44776119402985076, |
|
"grad_norm": 0.3707284927368164, |
|
"learning_rate": 9.852033689853267e-05, |
|
"loss": 0.5059, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.4522840343735866, |
|
"grad_norm": 0.6477210521697998, |
|
"learning_rate": 9.847001329696653e-05, |
|
"loss": 0.4977, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.45680687471732245, |
|
"grad_norm": 0.14800307154655457, |
|
"learning_rate": 9.841886147660645e-05, |
|
"loss": 0.4976, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.46132971506105835, |
|
"grad_norm": 0.47785401344299316, |
|
"learning_rate": 9.836688231149592e-05, |
|
"loss": 0.5012, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.4658525554047942, |
|
"grad_norm": 0.12753424048423767, |
|
"learning_rate": 9.831407668981546e-05, |
|
"loss": 0.4982, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.4703753957485301, |
|
"grad_norm": 0.2438424527645111, |
|
"learning_rate": 9.826044551386744e-05, |
|
"loss": 0.5074, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.47489823609226595, |
|
"grad_norm": 1.7088356018066406, |
|
"learning_rate": 9.820598970006069e-05, |
|
"loss": 0.5032, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.4794210764360018, |
|
"grad_norm": 0.4954736530780792, |
|
"learning_rate": 9.815071017889482e-05, |
|
"loss": 0.5021, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4839439167797377, |
|
"grad_norm": 0.2744172513484955, |
|
"learning_rate": 9.809460789494432e-05, |
|
"loss": 0.5182, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.48846675712347354, |
|
"grad_norm": 0.3839784264564514, |
|
"learning_rate": 9.803768380684242e-05, |
|
"loss": 0.4917, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.4929895974672094, |
|
"grad_norm": 0.1647842675447464, |
|
"learning_rate": 9.797993888726473e-05, |
|
"loss": 0.5027, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.4975124378109453, |
|
"grad_norm": 0.3968697190284729, |
|
"learning_rate": 9.792137412291265e-05, |
|
"loss": 0.4932, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5020352781546812, |
|
"grad_norm": 0.28644707798957825, |
|
"learning_rate": 9.786199051449636e-05, |
|
"loss": 0.4935, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.506558118498417, |
|
"grad_norm": 0.4823949933052063, |
|
"learning_rate": 9.780178907671789e-05, |
|
"loss": 0.4999, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5110809588421529, |
|
"grad_norm": 0.4848293364048004, |
|
"learning_rate": 9.774077083825372e-05, |
|
"loss": 0.5054, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5156037991858887, |
|
"grad_norm": 0.15839064121246338, |
|
"learning_rate": 9.767893684173721e-05, |
|
"loss": 0.4789, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5201266395296246, |
|
"grad_norm": 0.34784838557243347, |
|
"learning_rate": 9.761628814374073e-05, |
|
"loss": 0.4942, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5246494798733605, |
|
"grad_norm": 0.18030278384685516, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.4813, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5291723202170964, |
|
"grad_norm": 0.43726080656051636, |
|
"learning_rate": 9.748855093918417e-05, |
|
"loss": 0.5017, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5336951605608322, |
|
"grad_norm": 0.22014489769935608, |
|
"learning_rate": 9.742346461530048e-05, |
|
"loss": 0.5077, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5382180009045681, |
|
"grad_norm": 0.2954523265361786, |
|
"learning_rate": 9.735756795525231e-05, |
|
"loss": 0.4936, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5427408412483039, |
|
"grad_norm": 0.1660991609096527, |
|
"learning_rate": 9.729086208503174e-05, |
|
"loss": 0.4889, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5472636815920398, |
|
"grad_norm": 0.4176950454711914, |
|
"learning_rate": 9.722334814445809e-05, |
|
"loss": 0.5041, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5517865219357757, |
|
"grad_norm": 0.11009762436151505, |
|
"learning_rate": 9.715502728715826e-05, |
|
"loss": 0.4917, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5563093622795116, |
|
"grad_norm": 0.652611255645752, |
|
"learning_rate": 9.708590068054728e-05, |
|
"loss": 0.4923, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.5608322026232474, |
|
"grad_norm": 0.16778019070625305, |
|
"learning_rate": 9.701596950580806e-05, |
|
"loss": 0.5064, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5653550429669832, |
|
"grad_norm": 0.3204459547996521, |
|
"learning_rate": 9.694523495787149e-05, |
|
"loss": 0.4911, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5698778833107191, |
|
"grad_norm": 0.4674127399921417, |
|
"learning_rate": 9.687369824539577e-05, |
|
"loss": 0.4971, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.574400723654455, |
|
"grad_norm": 0.12897251546382904, |
|
"learning_rate": 9.680136059074598e-05, |
|
"loss": 0.4798, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.5789235639981909, |
|
"grad_norm": 0.15009181201457977, |
|
"learning_rate": 9.672822322997305e-05, |
|
"loss": 0.4895, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.5834464043419267, |
|
"grad_norm": 0.304299920797348, |
|
"learning_rate": 9.665428741279266e-05, |
|
"loss": 0.498, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.5879692446856626, |
|
"grad_norm": 0.40142571926116943, |
|
"learning_rate": 9.657955440256395e-05, |
|
"loss": 0.4799, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5924920850293984, |
|
"grad_norm": 0.17843011021614075, |
|
"learning_rate": 9.650402547626786e-05, |
|
"loss": 0.4848, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.579076886177063, |
|
"learning_rate": 9.642770192448536e-05, |
|
"loss": 0.489, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6015377657168702, |
|
"grad_norm": 0.4647994637489319, |
|
"learning_rate": 9.635058505137536e-05, |
|
"loss": 0.4964, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 0.14807263016700745, |
|
"learning_rate": 9.627267617465243e-05, |
|
"loss": 0.492, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6105834464043419, |
|
"grad_norm": 0.14985252916812897, |
|
"learning_rate": 9.619397662556435e-05, |
|
"loss": 0.498, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6151062867480778, |
|
"grad_norm": 0.16624276340007782, |
|
"learning_rate": 9.611448774886924e-05, |
|
"loss": 0.4846, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6196291270918136, |
|
"grad_norm": 0.20036545395851135, |
|
"learning_rate": 9.60342109028127e-05, |
|
"loss": 0.4919, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.6241519674355496, |
|
"grad_norm": 0.2734001576900482, |
|
"learning_rate": 9.595314745910456e-05, |
|
"loss": 0.4885, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.6286748077792854, |
|
"grad_norm": 0.12794898450374603, |
|
"learning_rate": 9.587129880289538e-05, |
|
"loss": 0.5028, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6331976481230213, |
|
"grad_norm": 0.40597257018089294, |
|
"learning_rate": 9.578866633275288e-05, |
|
"loss": 0.5009, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6377204884667571, |
|
"grad_norm": 0.34428995847702026, |
|
"learning_rate": 9.570525146063798e-05, |
|
"loss": 0.4905, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.642243328810493, |
|
"grad_norm": 0.33436307311058044, |
|
"learning_rate": 9.562105561188069e-05, |
|
"loss": 0.4891, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.6467661691542289, |
|
"grad_norm": 0.375051885843277, |
|
"learning_rate": 9.553608022515577e-05, |
|
"loss": 0.5031, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.6512890094979648, |
|
"grad_norm": 0.1948522925376892, |
|
"learning_rate": 9.545032675245813e-05, |
|
"loss": 0.4836, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6558118498417006, |
|
"grad_norm": 0.08413676917552948, |
|
"learning_rate": 9.5363796659078e-05, |
|
"loss": 0.4831, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6603346901854364, |
|
"grad_norm": 0.289809912443161, |
|
"learning_rate": 9.527649142357596e-05, |
|
"loss": 0.4851, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6648575305291723, |
|
"grad_norm": 0.14166517555713654, |
|
"learning_rate": 9.518841253775755e-05, |
|
"loss": 0.4786, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.6693803708729081, |
|
"grad_norm": 0.13443373143672943, |
|
"learning_rate": 9.509956150664796e-05, |
|
"loss": 0.4929, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6739032112166441, |
|
"grad_norm": 0.1985711306333542, |
|
"learning_rate": 9.500993984846614e-05, |
|
"loss": 0.481, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.6784260515603799, |
|
"grad_norm": 0.3250739276409149, |
|
"learning_rate": 9.491954909459895e-05, |
|
"loss": 0.4872, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6829488919041158, |
|
"grad_norm": 0.32657676935195923, |
|
"learning_rate": 9.4828390789575e-05, |
|
"loss": 0.49, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.6874717322478516, |
|
"grad_norm": 0.22073891758918762, |
|
"learning_rate": 9.473646649103818e-05, |
|
"loss": 0.5, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.6919945725915875, |
|
"grad_norm": 0.22980472445487976, |
|
"learning_rate": 9.464377776972114e-05, |
|
"loss": 0.4867, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.6965174129353234, |
|
"grad_norm": 0.1441211700439453, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 0.4779, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.7010402532790593, |
|
"grad_norm": 0.19868245720863342, |
|
"learning_rate": 9.445611340695926e-05, |
|
"loss": 0.4917, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7055630936227951, |
|
"grad_norm": 0.2807522714138031, |
|
"learning_rate": 9.43611409721806e-05, |
|
"loss": 0.4955, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.710085933966531, |
|
"grad_norm": 0.4753507375717163, |
|
"learning_rate": 9.426541052789925e-05, |
|
"loss": 0.4884, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.7146087743102668, |
|
"grad_norm": 0.16993290185928345, |
|
"learning_rate": 9.416892370988444e-05, |
|
"loss": 0.4816, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.7191316146540027, |
|
"grad_norm": 0.31910213828086853, |
|
"learning_rate": 9.407168216682962e-05, |
|
"loss": 0.491, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.7236544549977386, |
|
"grad_norm": 0.14142963290214539, |
|
"learning_rate": 9.397368756032445e-05, |
|
"loss": 0.4923, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7236544549977386, |
|
"eval_loss": 0.5151379704475403, |
|
"eval_runtime": 104.895, |
|
"eval_samples_per_second": 21.021, |
|
"eval_steps_per_second": 5.262, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7281772953414745, |
|
"grad_norm": 0.26469337940216064, |
|
"learning_rate": 9.387494156482643e-05, |
|
"loss": 0.4913, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.7327001356852103, |
|
"grad_norm": 0.261874794960022, |
|
"learning_rate": 9.377544586763215e-05, |
|
"loss": 0.4778, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.7372229760289462, |
|
"grad_norm": 0.24810470640659332, |
|
"learning_rate": 9.367520216884856e-05, |
|
"loss": 0.4855, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.741745816372682, |
|
"grad_norm": 0.1436118185520172, |
|
"learning_rate": 9.357421218136386e-05, |
|
"loss": 0.4847, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 0.16066552698612213, |
|
"learning_rate": 9.347247763081835e-05, |
|
"loss": 0.4872, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.7507914970601538, |
|
"grad_norm": 0.4255355894565582, |
|
"learning_rate": 9.337000025557476e-05, |
|
"loss": 0.4928, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.7553143374038896, |
|
"grad_norm": 0.1905021220445633, |
|
"learning_rate": 9.326678180668871e-05, |
|
"loss": 0.4904, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.7598371777476255, |
|
"grad_norm": 0.2327072024345398, |
|
"learning_rate": 9.316282404787871e-05, |
|
"loss": 0.4888, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.7643600180913613, |
|
"grad_norm": 0.20930258929729462, |
|
"learning_rate": 9.305812875549599e-05, |
|
"loss": 0.4858, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.7688828584350973, |
|
"grad_norm": 0.1341727375984192, |
|
"learning_rate": 9.295269771849427e-05, |
|
"loss": 0.4793, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7734056987788331, |
|
"grad_norm": 0.17430901527404785, |
|
"learning_rate": 9.284653273839905e-05, |
|
"loss": 0.4969, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.777928539122569, |
|
"grad_norm": 0.5216471552848816, |
|
"learning_rate": 9.273963562927695e-05, |
|
"loss": 0.491, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.7824513794663048, |
|
"grad_norm": 0.19990628957748413, |
|
"learning_rate": 9.263200821770461e-05, |
|
"loss": 0.4651, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.7869742198100407, |
|
"grad_norm": 0.5802574753761292, |
|
"learning_rate": 9.252365234273755e-05, |
|
"loss": 0.4775, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.7914970601537765, |
|
"grad_norm": 0.18451005220413208, |
|
"learning_rate": 9.241456985587868e-05, |
|
"loss": 0.4823, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7960199004975125, |
|
"grad_norm": 0.3577069640159607, |
|
"learning_rate": 9.230476262104677e-05, |
|
"loss": 0.4772, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.8005427408412483, |
|
"grad_norm": 0.13898539543151855, |
|
"learning_rate": 9.219423251454448e-05, |
|
"loss": 0.4746, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.8050655811849842, |
|
"grad_norm": 0.23426567018032074, |
|
"learning_rate": 9.208298142502636e-05, |
|
"loss": 0.4865, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.80958842152872, |
|
"grad_norm": 0.33651697635650635, |
|
"learning_rate": 9.197101125346657e-05, |
|
"loss": 0.5107, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8141112618724559, |
|
"grad_norm": 0.41530391573905945, |
|
"learning_rate": 9.185832391312644e-05, |
|
"loss": 0.4836, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8186341022161918, |
|
"grad_norm": 0.4000149965286255, |
|
"learning_rate": 9.174492132952166e-05, |
|
"loss": 0.4799, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.8231569425599277, |
|
"grad_norm": 0.2623349726200104, |
|
"learning_rate": 9.163080544038952e-05, |
|
"loss": 0.4738, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.8276797829036635, |
|
"grad_norm": 0.2306557595729828, |
|
"learning_rate": 9.151597819565571e-05, |
|
"loss": 0.4833, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.8322026232473994, |
|
"grad_norm": 0.11869548261165619, |
|
"learning_rate": 9.140044155740101e-05, |
|
"loss": 0.4836, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.8367254635911352, |
|
"grad_norm": 0.4591820240020752, |
|
"learning_rate": 9.12841974998278e-05, |
|
"loss": 0.4942, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.841248303934871, |
|
"grad_norm": 0.17287275195121765, |
|
"learning_rate": 9.116724800922629e-05, |
|
"loss": 0.4734, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.845771144278607, |
|
"grad_norm": 0.15221983194351196, |
|
"learning_rate": 9.104959508394061e-05, |
|
"loss": 0.4806, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.8502939846223428, |
|
"grad_norm": 0.4994131922721863, |
|
"learning_rate": 9.093124073433463e-05, |
|
"loss": 0.4837, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.8548168249660787, |
|
"grad_norm": 0.47702082991600037, |
|
"learning_rate": 9.081218698275763e-05, |
|
"loss": 0.4836, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.8593396653098145, |
|
"grad_norm": 0.18302541971206665, |
|
"learning_rate": 9.069243586350975e-05, |
|
"loss": 0.4837, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8638625056535504, |
|
"grad_norm": 0.35281088948249817, |
|
"learning_rate": 9.057198942280722e-05, |
|
"loss": 0.4799, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.8683853459972863, |
|
"grad_norm": 0.30625414848327637, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.4743, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.8729081863410222, |
|
"grad_norm": 0.4937283396720886, |
|
"learning_rate": 9.032901882127354e-05, |
|
"loss": 0.5007, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.877431026684758, |
|
"grad_norm": 0.25247976183891296, |
|
"learning_rate": 9.020649881213958e-05, |
|
"loss": 0.4792, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.8819538670284939, |
|
"grad_norm": 0.43513771891593933, |
|
"learning_rate": 9.008329178487442e-05, |
|
"loss": 0.486, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8864767073722297, |
|
"grad_norm": 0.4003300368785858, |
|
"learning_rate": 8.995939984474624e-05, |
|
"loss": 0.4871, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.8909995477159657, |
|
"grad_norm": 0.14090712368488312, |
|
"learning_rate": 8.983482510872645e-05, |
|
"loss": 0.4849, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.5139490962028503, |
|
"learning_rate": 8.970956970545355e-05, |
|
"loss": 0.4954, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9000452284034374, |
|
"grad_norm": 0.15464723110198975, |
|
"learning_rate": 8.958363577519684e-05, |
|
"loss": 0.4874, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9045680687471732, |
|
"grad_norm": 0.6348758339881897, |
|
"learning_rate": 8.945702546981969e-05, |
|
"loss": 0.4893, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.34100142121315, |
|
"learning_rate": 8.932974095274289e-05, |
|
"loss": 0.4942, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.9136137494346449, |
|
"grad_norm": 0.1424902230501175, |
|
"learning_rate": 8.920178439890765e-05, |
|
"loss": 0.4801, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9181365897783809, |
|
"grad_norm": 0.07340684533119202, |
|
"learning_rate": 8.907315799473846e-05, |
|
"loss": 0.4734, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.9226594301221167, |
|
"grad_norm": 0.8894658088684082, |
|
"learning_rate": 8.894386393810563e-05, |
|
"loss": 0.4813, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.9271822704658526, |
|
"grad_norm": 3.0061469078063965, |
|
"learning_rate": 8.881390443828787e-05, |
|
"loss": 0.4722, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.9317051108095884, |
|
"grad_norm": 0.435508131980896, |
|
"learning_rate": 8.868328171593448e-05, |
|
"loss": 0.4856, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.9362279511533242, |
|
"grad_norm": 0.30460214614868164, |
|
"learning_rate": 8.855199800302736e-05, |
|
"loss": 0.473, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.9407507914970602, |
|
"grad_norm": 0.48862236738204956, |
|
"learning_rate": 8.842005554284296e-05, |
|
"loss": 0.4914, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.945273631840796, |
|
"grad_norm": 0.16484391689300537, |
|
"learning_rate": 8.828745658991386e-05, |
|
"loss": 0.4872, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.9497964721845319, |
|
"grad_norm": 0.19850239157676697, |
|
"learning_rate": 8.815420340999033e-05, |
|
"loss": 0.4978, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9543193125282677, |
|
"grad_norm": 0.15624162554740906, |
|
"learning_rate": 8.802029828000156e-05, |
|
"loss": 0.4904, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.9588421528720036, |
|
"grad_norm": 0.20471015572547913, |
|
"learning_rate": 8.788574348801675e-05, |
|
"loss": 0.4818, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.9633649932157394, |
|
"grad_norm": 0.879814088344574, |
|
"learning_rate": 8.775054133320604e-05, |
|
"loss": 0.4832, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.9678878335594754, |
|
"grad_norm": 0.14262792468070984, |
|
"learning_rate": 8.761469412580125e-05, |
|
"loss": 0.4893, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.9724106739032112, |
|
"grad_norm": 0.43501853942871094, |
|
"learning_rate": 8.74782041870563e-05, |
|
"loss": 0.4751, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.9769335142469471, |
|
"grad_norm": 0.10217157751321793, |
|
"learning_rate": 8.73410738492077e-05, |
|
"loss": 0.4863, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.9814563545906829, |
|
"grad_norm": 0.550026535987854, |
|
"learning_rate": 8.720330545543453e-05, |
|
"loss": 0.4832, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.9859791949344188, |
|
"grad_norm": 0.283246785402298, |
|
"learning_rate": 8.706490135981855e-05, |
|
"loss": 0.4895, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.9905020352781547, |
|
"grad_norm": 0.21039070188999176, |
|
"learning_rate": 8.692586392730387e-05, |
|
"loss": 0.494, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.9950248756218906, |
|
"grad_norm": 0.7897614240646362, |
|
"learning_rate": 8.678619553365659e-05, |
|
"loss": 0.4871, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9995477159656264, |
|
"grad_norm": 0.3698125183582306, |
|
"learning_rate": 8.66458985654242e-05, |
|
"loss": 0.4764, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.0036182722749887, |
|
"grad_norm": 0.529675304889679, |
|
"learning_rate": 8.650497541989482e-05, |
|
"loss": 0.4834, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.0081411126187245, |
|
"grad_norm": 0.10685670375823975, |
|
"learning_rate": 8.636342850505616e-05, |
|
"loss": 0.4864, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.0126639529624604, |
|
"grad_norm": 0.2046063244342804, |
|
"learning_rate": 8.622126023955446e-05, |
|
"loss": 0.4796, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.0171867933061962, |
|
"grad_norm": 0.2893441319465637, |
|
"learning_rate": 8.60784730526531e-05, |
|
"loss": 0.4882, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.021709633649932, |
|
"grad_norm": 0.12210704386234283, |
|
"learning_rate": 8.59350693841912e-05, |
|
"loss": 0.4805, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.0262324739936681, |
|
"grad_norm": 4.210540294647217, |
|
"learning_rate": 8.579105168454173e-05, |
|
"loss": 0.4812, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.030755314337404, |
|
"grad_norm": 0.3887004256248474, |
|
"learning_rate": 8.564642241456986e-05, |
|
"loss": 0.4804, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.0352781546811398, |
|
"grad_norm": 0.4718762934207916, |
|
"learning_rate": 8.550118404559075e-05, |
|
"loss": 0.489, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.0398009950248757, |
|
"grad_norm": 0.26404207944869995, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 0.4829, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.0443238353686115, |
|
"grad_norm": 0.6389791965484619, |
|
"learning_rate": 8.52088899478682e-05, |
|
"loss": 0.4806, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.0488466757123474, |
|
"grad_norm": 1.5533722639083862, |
|
"learning_rate": 8.506183921362443e-05, |
|
"loss": 0.4878, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.0533695160560832, |
|
"grad_norm": 0.30983996391296387, |
|
"learning_rate": 8.491418936928742e-05, |
|
"loss": 0.4808, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.057892356399819, |
|
"grad_norm": 0.16938813030719757, |
|
"learning_rate": 8.476594293778561e-05, |
|
"loss": 0.4863, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.062415196743555, |
|
"grad_norm": 1.2061636447906494, |
|
"learning_rate": 8.461710245224148e-05, |
|
"loss": 0.4806, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.0669380370872907, |
|
"grad_norm": 0.0852380022406578, |
|
"learning_rate": 8.44676704559283e-05, |
|
"loss": 0.4764, |
|
"step": 2360 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 40, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.09484398947074e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |