|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 3462, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002021952628538417, |
|
"grad_norm": 9.595681190490723, |
|
"learning_rate": 2.017291066282421e-06, |
|
"loss": 7.4786, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.004043905257076834, |
|
"grad_norm": 7.749298095703125, |
|
"learning_rate": 4.034582132564842e-06, |
|
"loss": 7.261, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.006065857885615251, |
|
"grad_norm": 9.839412689208984, |
|
"learning_rate": 6.0518731988472625e-06, |
|
"loss": 6.189, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.008087810514153668, |
|
"grad_norm": 4.33282470703125, |
|
"learning_rate": 8.069164265129683e-06, |
|
"loss": 6.0148, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.010109763142692086, |
|
"grad_norm": 4.110238552093506, |
|
"learning_rate": 1.0086455331412104e-05, |
|
"loss": 5.4958, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.012131715771230503, |
|
"grad_norm": 3.388286828994751, |
|
"learning_rate": 1.2103746397694525e-05, |
|
"loss": 5.3635, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.01415366839976892, |
|
"grad_norm": 2.6382975578308105, |
|
"learning_rate": 1.4121037463976946e-05, |
|
"loss": 5.1959, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.016175621028307337, |
|
"grad_norm": 3.417970895767212, |
|
"learning_rate": 1.6138328530259367e-05, |
|
"loss": 5.1294, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.018197573656845753, |
|
"grad_norm": 2.9546399116516113, |
|
"learning_rate": 1.8155619596541786e-05, |
|
"loss": 4.8528, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.020219526285384173, |
|
"grad_norm": 2.8038337230682373, |
|
"learning_rate": 2.017291066282421e-05, |
|
"loss": 4.6832, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02224147891392259, |
|
"grad_norm": 2.5608632564544678, |
|
"learning_rate": 2.219020172910663e-05, |
|
"loss": 4.6793, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.024263431542461005, |
|
"grad_norm": 2.646967649459839, |
|
"learning_rate": 2.420749279538905e-05, |
|
"loss": 4.4008, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.02628538417099942, |
|
"grad_norm": 2.6493546962738037, |
|
"learning_rate": 2.622478386167147e-05, |
|
"loss": 4.4752, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.02830733679953784, |
|
"grad_norm": 2.450045585632324, |
|
"learning_rate": 2.824207492795389e-05, |
|
"loss": 4.441, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.030329289428076257, |
|
"grad_norm": 2.8270249366760254, |
|
"learning_rate": 3.025936599423631e-05, |
|
"loss": 4.2923, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.032351242056614674, |
|
"grad_norm": 2.512385368347168, |
|
"learning_rate": 3.227665706051873e-05, |
|
"loss": 4.2352, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.03437319468515309, |
|
"grad_norm": 2.7446236610412598, |
|
"learning_rate": 3.4293948126801156e-05, |
|
"loss": 4.3132, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.036395147313691506, |
|
"grad_norm": 2.433756113052368, |
|
"learning_rate": 3.631123919308357e-05, |
|
"loss": 4.1372, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.03841709994222992, |
|
"grad_norm": 2.985642433166504, |
|
"learning_rate": 3.8328530259365994e-05, |
|
"loss": 4.1249, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.040439052570768345, |
|
"grad_norm": 2.4674811363220215, |
|
"learning_rate": 4.034582132564842e-05, |
|
"loss": 4.0113, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04246100519930676, |
|
"grad_norm": 2.5470809936523438, |
|
"learning_rate": 4.236311239193084e-05, |
|
"loss": 4.0147, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.04448295782784518, |
|
"grad_norm": 2.6432108879089355, |
|
"learning_rate": 4.438040345821326e-05, |
|
"loss": 3.9437, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.046504910456383594, |
|
"grad_norm": 2.5995123386383057, |
|
"learning_rate": 4.639769452449568e-05, |
|
"loss": 3.8956, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.04852686308492201, |
|
"grad_norm": 2.4285008907318115, |
|
"learning_rate": 4.84149855907781e-05, |
|
"loss": 4.002, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.05054881571346043, |
|
"grad_norm": 4.111863613128662, |
|
"learning_rate": 5.0432276657060516e-05, |
|
"loss": 3.9789, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.05257076834199884, |
|
"grad_norm": 2.493316173553467, |
|
"learning_rate": 5.244956772334294e-05, |
|
"loss": 3.7673, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.05459272097053726, |
|
"grad_norm": 2.6891896724700928, |
|
"learning_rate": 5.446685878962536e-05, |
|
"loss": 3.8416, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.05661467359907568, |
|
"grad_norm": 2.5575196743011475, |
|
"learning_rate": 5.648414985590778e-05, |
|
"loss": 3.7602, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.0586366262276141, |
|
"grad_norm": 2.6597752571105957, |
|
"learning_rate": 5.850144092219021e-05, |
|
"loss": 3.8453, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.060658578856152515, |
|
"grad_norm": 2.5099666118621826, |
|
"learning_rate": 6.051873198847262e-05, |
|
"loss": 3.7611, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06268053148469092, |
|
"grad_norm": 2.335249185562134, |
|
"learning_rate": 6.253602305475504e-05, |
|
"loss": 3.7128, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.06470248411322935, |
|
"grad_norm": 2.349083423614502, |
|
"learning_rate": 6.455331412103747e-05, |
|
"loss": 3.7442, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.06672443674176777, |
|
"grad_norm": 2.6178603172302246, |
|
"learning_rate": 6.657060518731989e-05, |
|
"loss": 3.7445, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.06874638937030618, |
|
"grad_norm": 2.6607229709625244, |
|
"learning_rate": 6.858789625360231e-05, |
|
"loss": 3.7259, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.0707683419988446, |
|
"grad_norm": 2.389965295791626, |
|
"learning_rate": 7.060518731988472e-05, |
|
"loss": 3.7101, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.07279029462738301, |
|
"grad_norm": 2.3201496601104736, |
|
"learning_rate": 7.262247838616714e-05, |
|
"loss": 3.5793, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.07481224725592144, |
|
"grad_norm": 2.3502607345581055, |
|
"learning_rate": 7.463976945244957e-05, |
|
"loss": 3.6208, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.07683419988445984, |
|
"grad_norm": 2.284522294998169, |
|
"learning_rate": 7.665706051873199e-05, |
|
"loss": 3.6786, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.07885615251299827, |
|
"grad_norm": 2.3930606842041016, |
|
"learning_rate": 7.867435158501441e-05, |
|
"loss": 3.6442, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.08087810514153669, |
|
"grad_norm": 2.294808864593506, |
|
"learning_rate": 8.069164265129683e-05, |
|
"loss": 3.6099, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0829000577700751, |
|
"grad_norm": 2.481281280517578, |
|
"learning_rate": 8.270893371757926e-05, |
|
"loss": 3.534, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.08492201039861352, |
|
"grad_norm": 2.197254180908203, |
|
"learning_rate": 8.472622478386168e-05, |
|
"loss": 3.6141, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.08694396302715193, |
|
"grad_norm": 2.4509365558624268, |
|
"learning_rate": 8.67435158501441e-05, |
|
"loss": 3.4953, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.08896591565569036, |
|
"grad_norm": 2.4953505992889404, |
|
"learning_rate": 8.876080691642652e-05, |
|
"loss": 3.457, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.09098786828422877, |
|
"grad_norm": 2.2001383304595947, |
|
"learning_rate": 9.077809798270895e-05, |
|
"loss": 3.4942, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.09300982091276719, |
|
"grad_norm": 2.1801021099090576, |
|
"learning_rate": 9.279538904899135e-05, |
|
"loss": 3.3884, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.0950317735413056, |
|
"grad_norm": 2.2749996185302734, |
|
"learning_rate": 9.481268011527378e-05, |
|
"loss": 3.495, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.09705372616984402, |
|
"grad_norm": 2.1388254165649414, |
|
"learning_rate": 9.68299711815562e-05, |
|
"loss": 3.5859, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.09907567879838244, |
|
"grad_norm": 2.1429038047790527, |
|
"learning_rate": 9.884726224783862e-05, |
|
"loss": 3.4566, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.10109763142692085, |
|
"grad_norm": 2.1946280002593994, |
|
"learning_rate": 9.999994866347054e-05, |
|
"loss": 3.4641, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.10311958405545928, |
|
"grad_norm": 3.0027740001678467, |
|
"learning_rate": 9.999942959510397e-05, |
|
"loss": 3.3817, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.10514153668399769, |
|
"grad_norm": 2.3069963455200195, |
|
"learning_rate": 9.999835153577435e-05, |
|
"loss": 3.3968, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.10716348931253611, |
|
"grad_norm": 2.2940797805786133, |
|
"learning_rate": 9.999671449753431e-05, |
|
"loss": 3.5512, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.10918544194107452, |
|
"grad_norm": 2.4967830181121826, |
|
"learning_rate": 9.999451849868585e-05, |
|
"loss": 3.4169, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.11120739456961294, |
|
"grad_norm": 2.4844703674316406, |
|
"learning_rate": 9.99917635637801e-05, |
|
"loss": 3.5323, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.11322934719815136, |
|
"grad_norm": 2.1567542552948, |
|
"learning_rate": 9.998844972361712e-05, |
|
"loss": 3.4126, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.11525129982668977, |
|
"grad_norm": 2.143780469894409, |
|
"learning_rate": 9.998457701524546e-05, |
|
"loss": 3.3793, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.1172732524552282, |
|
"grad_norm": 2.182628631591797, |
|
"learning_rate": 9.998014548196178e-05, |
|
"loss": 3.2996, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.1192952050837666, |
|
"grad_norm": 2.220407247543335, |
|
"learning_rate": 9.99751551733104e-05, |
|
"loss": 3.4636, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.12131715771230503, |
|
"grad_norm": 2.0776526927948, |
|
"learning_rate": 9.996960614508271e-05, |
|
"loss": 3.348, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.12333911034084344, |
|
"grad_norm": 2.2219507694244385, |
|
"learning_rate": 9.996349845931651e-05, |
|
"loss": 3.3982, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.12536106296938185, |
|
"grad_norm": 2.48421311378479, |
|
"learning_rate": 9.995683218429546e-05, |
|
"loss": 3.3587, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.12738301559792028, |
|
"grad_norm": 2.1980128288269043, |
|
"learning_rate": 9.994960739454811e-05, |
|
"loss": 3.3461, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.1294049682264587, |
|
"grad_norm": 2.3740694522857666, |
|
"learning_rate": 9.994182417084725e-05, |
|
"loss": 3.3966, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.1314269208549971, |
|
"grad_norm": 2.3175392150878906, |
|
"learning_rate": 9.993348260020892e-05, |
|
"loss": 3.2726, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.13344887348353554, |
|
"grad_norm": 2.2129814624786377, |
|
"learning_rate": 9.99245827758914e-05, |
|
"loss": 3.3214, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.13547082611207395, |
|
"grad_norm": 2.125340461730957, |
|
"learning_rate": 9.991512479739428e-05, |
|
"loss": 3.3272, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.13749277874061236, |
|
"grad_norm": 2.3547654151916504, |
|
"learning_rate": 9.990510877045724e-05, |
|
"loss": 3.2926, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.13951473136915077, |
|
"grad_norm": 3.001073122024536, |
|
"learning_rate": 9.989453480705895e-05, |
|
"loss": 3.3037, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.1415366839976892, |
|
"grad_norm": 2.411472797393799, |
|
"learning_rate": 9.988340302541574e-05, |
|
"loss": 3.1022, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.14355863662622761, |
|
"grad_norm": 2.169168472290039, |
|
"learning_rate": 9.987171354998033e-05, |
|
"loss": 3.2779, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.14558058925476602, |
|
"grad_norm": 2.482876777648926, |
|
"learning_rate": 9.985946651144046e-05, |
|
"loss": 3.3117, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.14760254188330446, |
|
"grad_norm": 2.099360704421997, |
|
"learning_rate": 9.984666204671735e-05, |
|
"loss": 3.293, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.14962449451184287, |
|
"grad_norm": 1.954262137413025, |
|
"learning_rate": 9.983330029896423e-05, |
|
"loss": 3.2429, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.15164644714038128, |
|
"grad_norm": 1.9902952909469604, |
|
"learning_rate": 9.981938141756476e-05, |
|
"loss": 3.227, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.1536683997689197, |
|
"grad_norm": 2.3210673332214355, |
|
"learning_rate": 9.980490555813124e-05, |
|
"loss": 3.1948, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.15569035239745813, |
|
"grad_norm": 2.481900453567505, |
|
"learning_rate": 9.978987288250307e-05, |
|
"loss": 3.2088, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.15771230502599654, |
|
"grad_norm": 2.0122852325439453, |
|
"learning_rate": 9.977428355874472e-05, |
|
"loss": 3.1562, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.15973425765453494, |
|
"grad_norm": 2.022824287414551, |
|
"learning_rate": 9.975813776114401e-05, |
|
"loss": 3.1278, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.16175621028307338, |
|
"grad_norm": 2.0773727893829346, |
|
"learning_rate": 9.97414356702101e-05, |
|
"loss": 3.1692, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1637781629116118, |
|
"grad_norm": 2.01796817779541, |
|
"learning_rate": 9.97241774726715e-05, |
|
"loss": 3.1741, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.1658001155401502, |
|
"grad_norm": 1.8899658918380737, |
|
"learning_rate": 9.970636336147391e-05, |
|
"loss": 3.1241, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.1678220681686886, |
|
"grad_norm": 2.1450061798095703, |
|
"learning_rate": 9.968799353577815e-05, |
|
"loss": 3.131, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.16984402079722705, |
|
"grad_norm": 2.2159628868103027, |
|
"learning_rate": 9.96690682009579e-05, |
|
"loss": 3.262, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.17186597342576546, |
|
"grad_norm": 2.3034591674804688, |
|
"learning_rate": 9.964958756859741e-05, |
|
"loss": 3.1114, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.17388792605430387, |
|
"grad_norm": 1.9303778409957886, |
|
"learning_rate": 9.962955185648909e-05, |
|
"loss": 3.1515, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.17590987868284227, |
|
"grad_norm": 1.83698308467865, |
|
"learning_rate": 9.960896128863115e-05, |
|
"loss": 3.2162, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.1779318313113807, |
|
"grad_norm": 1.970585823059082, |
|
"learning_rate": 9.958781609522504e-05, |
|
"loss": 3.1861, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.17995378393991912, |
|
"grad_norm": 2.279804229736328, |
|
"learning_rate": 9.95661165126729e-05, |
|
"loss": 3.0684, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.18197573656845753, |
|
"grad_norm": 1.8226346969604492, |
|
"learning_rate": 9.95438627835749e-05, |
|
"loss": 3.1671, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.18399768919699597, |
|
"grad_norm": 1.8989958763122559, |
|
"learning_rate": 9.952105515672654e-05, |
|
"loss": 3.2091, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.18601964182553438, |
|
"grad_norm": 1.846562147140503, |
|
"learning_rate": 9.949769388711591e-05, |
|
"loss": 3.1451, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.18804159445407279, |
|
"grad_norm": 1.8738723993301392, |
|
"learning_rate": 9.947377923592073e-05, |
|
"loss": 3.0587, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.1900635470826112, |
|
"grad_norm": 1.9004042148590088, |
|
"learning_rate": 9.944931147050553e-05, |
|
"loss": 3.0329, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.19208549971114963, |
|
"grad_norm": 1.9021046161651611, |
|
"learning_rate": 9.942429086441864e-05, |
|
"loss": 3.0379, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.19410745233968804, |
|
"grad_norm": 2.065567970275879, |
|
"learning_rate": 9.93987176973891e-05, |
|
"loss": 3.2037, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.19612940496822645, |
|
"grad_norm": 1.881011724472046, |
|
"learning_rate": 9.937259225532356e-05, |
|
"loss": 3.0945, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.1981513575967649, |
|
"grad_norm": 1.9205586910247803, |
|
"learning_rate": 9.934591483030306e-05, |
|
"loss": 3.0818, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.2001733102253033, |
|
"grad_norm": 1.8970929384231567, |
|
"learning_rate": 9.931868572057979e-05, |
|
"loss": 3.0714, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.2021952628538417, |
|
"grad_norm": 2.0119457244873047, |
|
"learning_rate": 9.929090523057377e-05, |
|
"loss": 3.0832, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.20421721548238012, |
|
"grad_norm": 1.9726409912109375, |
|
"learning_rate": 9.926257367086939e-05, |
|
"loss": 3.0904, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.20623916811091855, |
|
"grad_norm": 1.8767212629318237, |
|
"learning_rate": 9.923369135821198e-05, |
|
"loss": 3.0606, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.20826112073945696, |
|
"grad_norm": 1.899286150932312, |
|
"learning_rate": 9.920425861550425e-05, |
|
"loss": 3.0126, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.21028307336799537, |
|
"grad_norm": 1.8871937990188599, |
|
"learning_rate": 9.917427577180271e-05, |
|
"loss": 3.1418, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.2123050259965338, |
|
"grad_norm": 2.15138578414917, |
|
"learning_rate": 9.914374316231396e-05, |
|
"loss": 3.12, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.21432697862507222, |
|
"grad_norm": 1.8373454809188843, |
|
"learning_rate": 9.911266112839093e-05, |
|
"loss": 3.0967, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.21634893125361063, |
|
"grad_norm": 1.924065113067627, |
|
"learning_rate": 9.908103001752913e-05, |
|
"loss": 2.9944, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.21837088388214904, |
|
"grad_norm": 1.8721510171890259, |
|
"learning_rate": 9.90488501833627e-05, |
|
"loss": 3.1021, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.22039283651068747, |
|
"grad_norm": 1.8645668029785156, |
|
"learning_rate": 9.901612198566044e-05, |
|
"loss": 3.0134, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.22241478913922588, |
|
"grad_norm": 1.8228344917297363, |
|
"learning_rate": 9.898284579032188e-05, |
|
"loss": 3.0468, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.2244367417677643, |
|
"grad_norm": 1.7747055292129517, |
|
"learning_rate": 9.894902196937312e-05, |
|
"loss": 3.0553, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.22645869439630273, |
|
"grad_norm": 1.8610613346099854, |
|
"learning_rate": 9.891465090096265e-05, |
|
"loss": 3.0393, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.22848064702484114, |
|
"grad_norm": 1.8580009937286377, |
|
"learning_rate": 9.887973296935725e-05, |
|
"loss": 3.0419, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.23050259965337955, |
|
"grad_norm": 1.7875131368637085, |
|
"learning_rate": 9.884426856493746e-05, |
|
"loss": 2.9642, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.23252455228191796, |
|
"grad_norm": 1.8746123313903809, |
|
"learning_rate": 9.880825808419348e-05, |
|
"loss": 3.0594, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.2345465049104564, |
|
"grad_norm": 1.8220926523208618, |
|
"learning_rate": 9.877170192972056e-05, |
|
"loss": 3.0327, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.2365684575389948, |
|
"grad_norm": 1.9343609809875488, |
|
"learning_rate": 9.873460051021457e-05, |
|
"loss": 3.0395, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.2385904101675332, |
|
"grad_norm": 1.9146010875701904, |
|
"learning_rate": 9.86969542404674e-05, |
|
"loss": 3.0361, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.24061236279607162, |
|
"grad_norm": 1.8916493654251099, |
|
"learning_rate": 9.865876354136234e-05, |
|
"loss": 3.0112, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.24263431542461006, |
|
"grad_norm": 1.8126282691955566, |
|
"learning_rate": 9.862002883986938e-05, |
|
"loss": 3.0325, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.24465626805314847, |
|
"grad_norm": 1.9282585382461548, |
|
"learning_rate": 9.85807505690404e-05, |
|
"loss": 3.0337, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.24667822068168688, |
|
"grad_norm": 1.7758145332336426, |
|
"learning_rate": 9.854092916800442e-05, |
|
"loss": 3.0062, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.24870017331022531, |
|
"grad_norm": 1.8073774576187134, |
|
"learning_rate": 9.850056508196255e-05, |
|
"loss": 2.9754, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.2507221259387637, |
|
"grad_norm": 1.9235025644302368, |
|
"learning_rate": 9.845965876218312e-05, |
|
"loss": 2.9062, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.25274407856730213, |
|
"grad_norm": 1.7717721462249756, |
|
"learning_rate": 9.841821066599666e-05, |
|
"loss": 3.0064, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.25476603119584057, |
|
"grad_norm": 1.7075670957565308, |
|
"learning_rate": 9.837622125679062e-05, |
|
"loss": 3.0164, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.25678798382437895, |
|
"grad_norm": 1.824540138244629, |
|
"learning_rate": 9.83336910040044e-05, |
|
"loss": 3.0539, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.2588099364529174, |
|
"grad_norm": 1.7004854679107666, |
|
"learning_rate": 9.829062038312394e-05, |
|
"loss": 3.0165, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.2608318890814558, |
|
"grad_norm": 1.7976192235946655, |
|
"learning_rate": 9.824700987567653e-05, |
|
"loss": 2.9922, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.2628538417099942, |
|
"grad_norm": 1.8717296123504639, |
|
"learning_rate": 9.820285996922526e-05, |
|
"loss": 3.0127, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.26487579433853264, |
|
"grad_norm": 2.026646137237549, |
|
"learning_rate": 9.815817115736379e-05, |
|
"loss": 3.0056, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.2668977469670711, |
|
"grad_norm": 1.7542977333068848, |
|
"learning_rate": 9.811294393971063e-05, |
|
"loss": 2.9186, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.26891969959560946, |
|
"grad_norm": 1.832772135734558, |
|
"learning_rate": 9.806717882190368e-05, |
|
"loss": 2.9559, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.2709416522241479, |
|
"grad_norm": 1.7612087726593018, |
|
"learning_rate": 9.802087631559451e-05, |
|
"loss": 2.9334, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.2729636048526863, |
|
"grad_norm": 1.7945784330368042, |
|
"learning_rate": 9.797403693844271e-05, |
|
"loss": 3.0131, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.2749855574812247, |
|
"grad_norm": 1.840769648551941, |
|
"learning_rate": 9.792666121410998e-05, |
|
"loss": 2.9896, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.27700751010976316, |
|
"grad_norm": 1.9374017715454102, |
|
"learning_rate": 9.787874967225444e-05, |
|
"loss": 2.9585, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.27902946273830154, |
|
"grad_norm": 1.8398560285568237, |
|
"learning_rate": 9.783030284852454e-05, |
|
"loss": 3.0073, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.28105141536684, |
|
"grad_norm": 1.7769570350646973, |
|
"learning_rate": 9.778132128455322e-05, |
|
"loss": 2.9579, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.2830733679953784, |
|
"grad_norm": 1.9076484441757202, |
|
"learning_rate": 9.773180552795173e-05, |
|
"loss": 3.0034, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2850953206239168, |
|
"grad_norm": 1.818621039390564, |
|
"learning_rate": 9.768175613230365e-05, |
|
"loss": 2.9642, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.28711727325245523, |
|
"grad_norm": 1.9143321514129639, |
|
"learning_rate": 9.76311736571585e-05, |
|
"loss": 2.9598, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.28913922588099367, |
|
"grad_norm": 1.8136980533599854, |
|
"learning_rate": 9.75800586680257e-05, |
|
"loss": 3.0053, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.29116117850953205, |
|
"grad_norm": 1.9737019538879395, |
|
"learning_rate": 9.752841173636808e-05, |
|
"loss": 2.9401, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.2931831311380705, |
|
"grad_norm": 1.8300338983535767, |
|
"learning_rate": 9.747623343959563e-05, |
|
"loss": 2.9416, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.2952050837666089, |
|
"grad_norm": 1.9094542264938354, |
|
"learning_rate": 9.74235243610589e-05, |
|
"loss": 2.8963, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.2972270363951473, |
|
"grad_norm": 2.2808616161346436, |
|
"learning_rate": 9.737028509004258e-05, |
|
"loss": 2.9543, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.29924898902368574, |
|
"grad_norm": 1.786083698272705, |
|
"learning_rate": 9.73165162217589e-05, |
|
"loss": 2.9339, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.3012709416522241, |
|
"grad_norm": 1.8315051794052124, |
|
"learning_rate": 9.726221835734096e-05, |
|
"loss": 2.9229, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.30329289428076256, |
|
"grad_norm": 1.701285719871521, |
|
"learning_rate": 9.720739210383598e-05, |
|
"loss": 2.8461, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.305314846909301, |
|
"grad_norm": 1.8492621183395386, |
|
"learning_rate": 9.715203807419855e-05, |
|
"loss": 3.0137, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.3073367995378394, |
|
"grad_norm": 1.8336840867996216, |
|
"learning_rate": 9.70961568872838e-05, |
|
"loss": 2.8932, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.3093587521663778, |
|
"grad_norm": 1.8716745376586914, |
|
"learning_rate": 9.70397491678404e-05, |
|
"loss": 2.955, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.31138070479491625, |
|
"grad_norm": 1.7288552522659302, |
|
"learning_rate": 9.698281554650366e-05, |
|
"loss": 2.8435, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.31340265742345463, |
|
"grad_norm": 1.6746890544891357, |
|
"learning_rate": 9.692535665978845e-05, |
|
"loss": 2.9312, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.31542461005199307, |
|
"grad_norm": 1.7941049337387085, |
|
"learning_rate": 9.686737315008207e-05, |
|
"loss": 2.8434, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.3174465626805315, |
|
"grad_norm": 1.8552913665771484, |
|
"learning_rate": 9.680886566563705e-05, |
|
"loss": 3.04, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 0.3194685153090699, |
|
"grad_norm": 1.8141459226608276, |
|
"learning_rate": 9.674983486056399e-05, |
|
"loss": 2.9887, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.3214904679376083, |
|
"grad_norm": 1.7158820629119873, |
|
"learning_rate": 9.66902813948241e-05, |
|
"loss": 2.9345, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 0.32351242056614676, |
|
"grad_norm": 2.014641761779785, |
|
"learning_rate": 9.6630205934222e-05, |
|
"loss": 2.9396, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.32553437319468514, |
|
"grad_norm": 1.8080192804336548, |
|
"learning_rate": 9.656960915039815e-05, |
|
"loss": 2.9015, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 0.3275563258232236, |
|
"grad_norm": 1.941286563873291, |
|
"learning_rate": 9.650849172082132e-05, |
|
"loss": 2.8438, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.32957827845176196, |
|
"grad_norm": 1.7322505712509155, |
|
"learning_rate": 9.644685432878117e-05, |
|
"loss": 2.9456, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 0.3316002310803004, |
|
"grad_norm": 1.6413743495941162, |
|
"learning_rate": 9.638469766338045e-05, |
|
"loss": 2.8915, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.33362218370883884, |
|
"grad_norm": 1.8302034139633179, |
|
"learning_rate": 9.632202241952737e-05, |
|
"loss": 2.9105, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.3356441363373772, |
|
"grad_norm": 1.7431873083114624, |
|
"learning_rate": 9.625882929792781e-05, |
|
"loss": 2.9286, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 0.33766608896591566, |
|
"grad_norm": 1.8077412843704224, |
|
"learning_rate": 9.619511900507753e-05, |
|
"loss": 2.9155, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 0.3396880415944541, |
|
"grad_norm": 1.8113936185836792, |
|
"learning_rate": 9.613089225325421e-05, |
|
"loss": 2.9043, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.3417099942229925, |
|
"grad_norm": 1.6953517198562622, |
|
"learning_rate": 9.606614976050949e-05, |
|
"loss": 2.936, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 0.3437319468515309, |
|
"grad_norm": 1.7508031129837036, |
|
"learning_rate": 9.600089225066103e-05, |
|
"loss": 2.7969, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.34575389948006935, |
|
"grad_norm": 1.6380009651184082, |
|
"learning_rate": 9.59351204532843e-05, |
|
"loss": 2.8198, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 0.34777585210860773, |
|
"grad_norm": 1.7703418731689453, |
|
"learning_rate": 9.58688351037045e-05, |
|
"loss": 2.836, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 0.34979780473714617, |
|
"grad_norm": 1.90382719039917, |
|
"learning_rate": 9.580203694298833e-05, |
|
"loss": 2.9212, |
|
"step": 1211 |
|
}, |
|
{ |
|
"epoch": 0.35181975736568455, |
|
"grad_norm": 1.857691764831543, |
|
"learning_rate": 9.573472671793564e-05, |
|
"loss": 2.908, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 0.353841709994223, |
|
"grad_norm": 1.7361656427383423, |
|
"learning_rate": 9.56669051810712e-05, |
|
"loss": 2.8179, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.3558636626227614, |
|
"grad_norm": 1.681665062904358, |
|
"learning_rate": 9.559857309063616e-05, |
|
"loss": 2.9504, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.3578856152512998, |
|
"grad_norm": 1.6940059661865234, |
|
"learning_rate": 9.552973121057966e-05, |
|
"loss": 2.8335, |
|
"step": 1239 |
|
}, |
|
{ |
|
"epoch": 0.35990756787983824, |
|
"grad_norm": 1.7692625522613525, |
|
"learning_rate": 9.546038031055026e-05, |
|
"loss": 2.8118, |
|
"step": 1246 |
|
}, |
|
{ |
|
"epoch": 0.3619295205083767, |
|
"grad_norm": 1.7423547506332397, |
|
"learning_rate": 9.539052116588734e-05, |
|
"loss": 2.8345, |
|
"step": 1253 |
|
}, |
|
{ |
|
"epoch": 0.36395147313691506, |
|
"grad_norm": 1.7952203750610352, |
|
"learning_rate": 9.532015455761241e-05, |
|
"loss": 2.8732, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3659734257654535, |
|
"grad_norm": 1.6462956666946411, |
|
"learning_rate": 9.52492812724204e-05, |
|
"loss": 2.8128, |
|
"step": 1267 |
|
}, |
|
{ |
|
"epoch": 0.36799537839399193, |
|
"grad_norm": 1.7551960945129395, |
|
"learning_rate": 9.51779021026709e-05, |
|
"loss": 2.8367, |
|
"step": 1274 |
|
}, |
|
{ |
|
"epoch": 0.3700173310225303, |
|
"grad_norm": 1.922524094581604, |
|
"learning_rate": 9.510601784637921e-05, |
|
"loss": 2.8275, |
|
"step": 1281 |
|
}, |
|
{ |
|
"epoch": 0.37203928365106875, |
|
"grad_norm": 1.74599027633667, |
|
"learning_rate": 9.503362930720747e-05, |
|
"loss": 2.9025, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 0.3740612362796072, |
|
"grad_norm": 1.7603254318237305, |
|
"learning_rate": 9.496073729445573e-05, |
|
"loss": 2.9233, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.37608318890814557, |
|
"grad_norm": 1.7627036571502686, |
|
"learning_rate": 9.488734262305281e-05, |
|
"loss": 2.8665, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 0.378105141536684, |
|
"grad_norm": 1.642699956893921, |
|
"learning_rate": 9.481344611354721e-05, |
|
"loss": 2.8467, |
|
"step": 1309 |
|
}, |
|
{ |
|
"epoch": 0.3801270941652224, |
|
"grad_norm": 1.629868507385254, |
|
"learning_rate": 9.473904859209801e-05, |
|
"loss": 2.8086, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 0.3821490467937608, |
|
"grad_norm": 1.7129733562469482, |
|
"learning_rate": 9.466415089046551e-05, |
|
"loss": 2.7833, |
|
"step": 1323 |
|
}, |
|
{ |
|
"epoch": 0.38417099942229926, |
|
"grad_norm": 1.6511763334274292, |
|
"learning_rate": 9.458875384600206e-05, |
|
"loss": 2.9041, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.38619295205083765, |
|
"grad_norm": 1.7601238489151, |
|
"learning_rate": 9.451285830164256e-05, |
|
"loss": 2.8706, |
|
"step": 1337 |
|
}, |
|
{ |
|
"epoch": 0.3882149046793761, |
|
"grad_norm": 1.7456876039505005, |
|
"learning_rate": 9.44364651058952e-05, |
|
"loss": 2.8847, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 0.3902368573079145, |
|
"grad_norm": 1.7066991329193115, |
|
"learning_rate": 9.435957511283184e-05, |
|
"loss": 2.8346, |
|
"step": 1351 |
|
}, |
|
{ |
|
"epoch": 0.3922588099364529, |
|
"grad_norm": 1.6667215824127197, |
|
"learning_rate": 9.42821891820785e-05, |
|
"loss": 2.7794, |
|
"step": 1358 |
|
}, |
|
{ |
|
"epoch": 0.39428076256499134, |
|
"grad_norm": 1.7139601707458496, |
|
"learning_rate": 9.420430817880578e-05, |
|
"loss": 2.8378, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.3963027151935298, |
|
"grad_norm": 1.7201801538467407, |
|
"learning_rate": 9.412593297371914e-05, |
|
"loss": 2.8381, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 0.39832466782206816, |
|
"grad_norm": 1.7901239395141602, |
|
"learning_rate": 9.404706444304921e-05, |
|
"loss": 2.7901, |
|
"step": 1379 |
|
}, |
|
{ |
|
"epoch": 0.4003466204506066, |
|
"grad_norm": 1.7698922157287598, |
|
"learning_rate": 9.396770346854197e-05, |
|
"loss": 2.9107, |
|
"step": 1386 |
|
}, |
|
{ |
|
"epoch": 0.402368573079145, |
|
"grad_norm": 1.6929407119750977, |
|
"learning_rate": 9.38878509374489e-05, |
|
"loss": 2.8586, |
|
"step": 1393 |
|
}, |
|
{ |
|
"epoch": 0.4043905257076834, |
|
"grad_norm": 1.8319777250289917, |
|
"learning_rate": 9.380750774251702e-05, |
|
"loss": 2.8823, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.40641247833622185, |
|
"grad_norm": 1.7365424633026123, |
|
"learning_rate": 9.372667478197901e-05, |
|
"loss": 2.8193, |
|
"step": 1407 |
|
}, |
|
{ |
|
"epoch": 0.40843443096476023, |
|
"grad_norm": 1.6431219577789307, |
|
"learning_rate": 9.364535295954304e-05, |
|
"loss": 2.7777, |
|
"step": 1414 |
|
}, |
|
{ |
|
"epoch": 0.41045638359329867, |
|
"grad_norm": 1.824762225151062, |
|
"learning_rate": 9.356354318438279e-05, |
|
"loss": 2.8279, |
|
"step": 1421 |
|
}, |
|
{ |
|
"epoch": 0.4124783362218371, |
|
"grad_norm": 1.7178157567977905, |
|
"learning_rate": 9.348124637112719e-05, |
|
"loss": 2.9241, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 0.4145002888503755, |
|
"grad_norm": 1.7042038440704346, |
|
"learning_rate": 9.339846343985019e-05, |
|
"loss": 2.826, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.4165222414789139, |
|
"grad_norm": 1.691094160079956, |
|
"learning_rate": 9.331519531606062e-05, |
|
"loss": 2.7895, |
|
"step": 1442 |
|
}, |
|
{ |
|
"epoch": 0.41854419410745236, |
|
"grad_norm": 1.7455573081970215, |
|
"learning_rate": 9.323144293069164e-05, |
|
"loss": 2.8141, |
|
"step": 1449 |
|
}, |
|
{ |
|
"epoch": 0.42056614673599074, |
|
"grad_norm": 1.6580560207366943, |
|
"learning_rate": 9.314720722009045e-05, |
|
"loss": 2.7663, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 0.4225880993645292, |
|
"grad_norm": 1.6528170108795166, |
|
"learning_rate": 9.306248912600783e-05, |
|
"loss": 2.7459, |
|
"step": 1463 |
|
}, |
|
{ |
|
"epoch": 0.4246100519930676, |
|
"grad_norm": 1.768887996673584, |
|
"learning_rate": 9.29772895955876e-05, |
|
"loss": 2.8386, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.426632004621606, |
|
"grad_norm": 1.9209166765213013, |
|
"learning_rate": 9.289160958135591e-05, |
|
"loss": 2.7621, |
|
"step": 1477 |
|
}, |
|
{ |
|
"epoch": 0.42865395725014444, |
|
"grad_norm": 1.6556992530822754, |
|
"learning_rate": 9.28054500412108e-05, |
|
"loss": 2.8435, |
|
"step": 1484 |
|
}, |
|
{ |
|
"epoch": 0.4306759098786828, |
|
"grad_norm": 1.7342525720596313, |
|
"learning_rate": 9.271881193841135e-05, |
|
"loss": 2.7762, |
|
"step": 1491 |
|
}, |
|
{ |
|
"epoch": 0.43269786250722125, |
|
"grad_norm": 1.7639968395233154, |
|
"learning_rate": 9.263169624156694e-05, |
|
"loss": 2.8624, |
|
"step": 1498 |
|
}, |
|
{ |
|
"epoch": 0.4347198151357597, |
|
"grad_norm": 1.6903929710388184, |
|
"learning_rate": 9.25441039246264e-05, |
|
"loss": 2.8375, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.43674176776429807, |
|
"grad_norm": 2.2824156284332275, |
|
"learning_rate": 9.245603596686719e-05, |
|
"loss": 2.8951, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 0.4387637203928365, |
|
"grad_norm": 1.6158207654953003, |
|
"learning_rate": 9.236749335288442e-05, |
|
"loss": 2.7868, |
|
"step": 1519 |
|
}, |
|
{ |
|
"epoch": 0.44078567302137495, |
|
"grad_norm": 1.6459358930587769, |
|
"learning_rate": 9.227847707257975e-05, |
|
"loss": 2.7288, |
|
"step": 1526 |
|
}, |
|
{ |
|
"epoch": 0.44280762564991333, |
|
"grad_norm": 1.6403940916061401, |
|
"learning_rate": 9.218898812115049e-05, |
|
"loss": 2.7801, |
|
"step": 1533 |
|
}, |
|
{ |
|
"epoch": 0.44482957827845176, |
|
"grad_norm": 1.6620136499404907, |
|
"learning_rate": 9.209902749907836e-05, |
|
"loss": 2.7598, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.4468515309069902, |
|
"grad_norm": 1.7667573690414429, |
|
"learning_rate": 9.200859621211832e-05, |
|
"loss": 2.7368, |
|
"step": 1547 |
|
}, |
|
{ |
|
"epoch": 0.4488734835355286, |
|
"grad_norm": 1.568248987197876, |
|
"learning_rate": 9.191769527128736e-05, |
|
"loss": 2.7518, |
|
"step": 1554 |
|
}, |
|
{ |
|
"epoch": 0.450895436164067, |
|
"grad_norm": 1.6497979164123535, |
|
"learning_rate": 9.182632569285314e-05, |
|
"loss": 2.8241, |
|
"step": 1561 |
|
}, |
|
{ |
|
"epoch": 0.45291738879260546, |
|
"grad_norm": 1.7366968393325806, |
|
"learning_rate": 9.17344884983227e-05, |
|
"loss": 2.8069, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 0.45493934142114384, |
|
"grad_norm": 1.7804850339889526, |
|
"learning_rate": 9.1642184714431e-05, |
|
"loss": 2.7597, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.4569612940496823, |
|
"grad_norm": 10.3452730178833, |
|
"learning_rate": 9.15494153731294e-05, |
|
"loss": 2.7507, |
|
"step": 1582 |
|
}, |
|
{ |
|
"epoch": 0.45898324667822066, |
|
"grad_norm": 1.733091950416565, |
|
"learning_rate": 9.145618151157424e-05, |
|
"loss": 2.7015, |
|
"step": 1589 |
|
}, |
|
{ |
|
"epoch": 0.4610051993067591, |
|
"grad_norm": 1.769957184791565, |
|
"learning_rate": 9.136248417211512e-05, |
|
"loss": 2.7582, |
|
"step": 1596 |
|
}, |
|
{ |
|
"epoch": 0.46302715193529753, |
|
"grad_norm": 1.633809208869934, |
|
"learning_rate": 9.12683244022833e-05, |
|
"loss": 2.7988, |
|
"step": 1603 |
|
}, |
|
{ |
|
"epoch": 0.4650491045638359, |
|
"grad_norm": 1.6593530178070068, |
|
"learning_rate": 9.117370325478e-05, |
|
"loss": 2.8712, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.46707105719237435, |
|
"grad_norm": 1.607200026512146, |
|
"learning_rate": 9.107862178746463e-05, |
|
"loss": 2.7589, |
|
"step": 1617 |
|
}, |
|
{ |
|
"epoch": 0.4690930098209128, |
|
"grad_norm": 1.6576822996139526, |
|
"learning_rate": 9.098308106334291e-05, |
|
"loss": 2.7401, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 0.47111496244945117, |
|
"grad_norm": 1.7137107849121094, |
|
"learning_rate": 9.088708215055508e-05, |
|
"loss": 2.7639, |
|
"step": 1631 |
|
}, |
|
{ |
|
"epoch": 0.4731369150779896, |
|
"grad_norm": 1.6344847679138184, |
|
"learning_rate": 9.079062612236387e-05, |
|
"loss": 2.8319, |
|
"step": 1638 |
|
}, |
|
{ |
|
"epoch": 0.47515886770652804, |
|
"grad_norm": 1.5891085863113403, |
|
"learning_rate": 9.069371405714252e-05, |
|
"loss": 2.6934, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.4771808203350664, |
|
"grad_norm": 4.501755237579346, |
|
"learning_rate": 9.05963470383628e-05, |
|
"loss": 2.6837, |
|
"step": 1652 |
|
}, |
|
{ |
|
"epoch": 0.47920277296360486, |
|
"grad_norm": 1.567033290863037, |
|
"learning_rate": 9.049852615458278e-05, |
|
"loss": 2.7968, |
|
"step": 1659 |
|
}, |
|
{ |
|
"epoch": 0.48122472559214324, |
|
"grad_norm": 1.6929982900619507, |
|
"learning_rate": 9.040025249943476e-05, |
|
"loss": 2.6938, |
|
"step": 1666 |
|
}, |
|
{ |
|
"epoch": 0.4832466782206817, |
|
"grad_norm": 1.7686516046524048, |
|
"learning_rate": 9.030152717161294e-05, |
|
"loss": 2.8016, |
|
"step": 1673 |
|
}, |
|
{ |
|
"epoch": 0.4852686308492201, |
|
"grad_norm": 1.6347525119781494, |
|
"learning_rate": 9.020235127486125e-05, |
|
"loss": 2.7131, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4872905834777585, |
|
"grad_norm": 1.5744967460632324, |
|
"learning_rate": 9.010272591796097e-05, |
|
"loss": 2.7364, |
|
"step": 1687 |
|
}, |
|
{ |
|
"epoch": 0.48931253610629694, |
|
"grad_norm": 1.6553685665130615, |
|
"learning_rate": 9.000265221471822e-05, |
|
"loss": 2.7467, |
|
"step": 1694 |
|
}, |
|
{ |
|
"epoch": 0.4913344887348354, |
|
"grad_norm": 1.6058526039123535, |
|
"learning_rate": 8.990213128395175e-05, |
|
"loss": 2.7837, |
|
"step": 1701 |
|
}, |
|
{ |
|
"epoch": 0.49335644136337375, |
|
"grad_norm": 1.6541842222213745, |
|
"learning_rate": 8.980116424948019e-05, |
|
"loss": 2.7639, |
|
"step": 1708 |
|
}, |
|
{ |
|
"epoch": 0.4953783939919122, |
|
"grad_norm": 1.800726056098938, |
|
"learning_rate": 8.969975224010961e-05, |
|
"loss": 2.7972, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.49740034662045063, |
|
"grad_norm": 1.7600321769714355, |
|
"learning_rate": 8.959789638962089e-05, |
|
"loss": 2.714, |
|
"step": 1722 |
|
}, |
|
{ |
|
"epoch": 0.499422299248989, |
|
"grad_norm": 1.6169345378875732, |
|
"learning_rate": 8.9495597836757e-05, |
|
"loss": 2.78, |
|
"step": 1729 |
|
}, |
|
{ |
|
"epoch": 0.5014442518775274, |
|
"grad_norm": 1.6731981039047241, |
|
"learning_rate": 8.939285772521033e-05, |
|
"loss": 2.7272, |
|
"step": 1736 |
|
}, |
|
{ |
|
"epoch": 0.5034662045060658, |
|
"grad_norm": 1.7356120347976685, |
|
"learning_rate": 8.928967720360987e-05, |
|
"loss": 2.7888, |
|
"step": 1743 |
|
}, |
|
{ |
|
"epoch": 0.5054881571346043, |
|
"grad_norm": 1.6834124326705933, |
|
"learning_rate": 8.918605742550837e-05, |
|
"loss": 2.7427, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5075101097631427, |
|
"grad_norm": 1.5424860715866089, |
|
"learning_rate": 8.908199954936942e-05, |
|
"loss": 2.7217, |
|
"step": 1757 |
|
}, |
|
{ |
|
"epoch": 0.5095320623916811, |
|
"grad_norm": 1.7886710166931152, |
|
"learning_rate": 8.897750473855453e-05, |
|
"loss": 2.7078, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 0.5115540150202196, |
|
"grad_norm": 1.64043390750885, |
|
"learning_rate": 8.887257416131016e-05, |
|
"loss": 2.7356, |
|
"step": 1771 |
|
}, |
|
{ |
|
"epoch": 0.5135759676487579, |
|
"grad_norm": 1.6376231908798218, |
|
"learning_rate": 8.876720899075455e-05, |
|
"loss": 2.7536, |
|
"step": 1778 |
|
}, |
|
{ |
|
"epoch": 0.5155979202772963, |
|
"grad_norm": 1.877820611000061, |
|
"learning_rate": 8.866141040486471e-05, |
|
"loss": 2.7294, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.5176198729058348, |
|
"grad_norm": 1.784562587738037, |
|
"learning_rate": 8.85551795864632e-05, |
|
"loss": 2.7104, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 0.5196418255343732, |
|
"grad_norm": 1.5521408319473267, |
|
"learning_rate": 8.844851772320494e-05, |
|
"loss": 2.5452, |
|
"step": 1799 |
|
}, |
|
{ |
|
"epoch": 0.5216637781629117, |
|
"grad_norm": 1.6310935020446777, |
|
"learning_rate": 8.834142600756386e-05, |
|
"loss": 2.7134, |
|
"step": 1806 |
|
}, |
|
{ |
|
"epoch": 0.52368573079145, |
|
"grad_norm": 1.6490379571914673, |
|
"learning_rate": 8.823390563681965e-05, |
|
"loss": 2.7386, |
|
"step": 1813 |
|
}, |
|
{ |
|
"epoch": 0.5257076834199884, |
|
"grad_norm": 1.6033873558044434, |
|
"learning_rate": 8.812595781304436e-05, |
|
"loss": 2.7031, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5277296360485269, |
|
"grad_norm": 1.5489314794540405, |
|
"learning_rate": 8.801758374308888e-05, |
|
"loss": 2.6165, |
|
"step": 1827 |
|
}, |
|
{ |
|
"epoch": 0.5297515886770653, |
|
"grad_norm": 1.7704176902770996, |
|
"learning_rate": 8.790878463856958e-05, |
|
"loss": 2.6288, |
|
"step": 1834 |
|
}, |
|
{ |
|
"epoch": 0.5317735413056037, |
|
"grad_norm": 3.7646679878234863, |
|
"learning_rate": 8.779956171585463e-05, |
|
"loss": 2.718, |
|
"step": 1841 |
|
}, |
|
{ |
|
"epoch": 0.5337954939341422, |
|
"grad_norm": 1.51352858543396, |
|
"learning_rate": 8.768991619605054e-05, |
|
"loss": 2.7983, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 0.5358174465626805, |
|
"grad_norm": 1.5823729038238525, |
|
"learning_rate": 8.757984930498833e-05, |
|
"loss": 2.6646, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.5378393991912189, |
|
"grad_norm": 1.6353611946105957, |
|
"learning_rate": 8.746936227321003e-05, |
|
"loss": 2.7042, |
|
"step": 1862 |
|
}, |
|
{ |
|
"epoch": 0.5398613518197574, |
|
"grad_norm": 1.7938235998153687, |
|
"learning_rate": 8.735845633595477e-05, |
|
"loss": 2.7986, |
|
"step": 1869 |
|
}, |
|
{ |
|
"epoch": 0.5418833044482958, |
|
"grad_norm": 1.6479319334030151, |
|
"learning_rate": 8.7247132733145e-05, |
|
"loss": 2.6975, |
|
"step": 1876 |
|
}, |
|
{ |
|
"epoch": 0.5439052570768342, |
|
"grad_norm": 1.5863536596298218, |
|
"learning_rate": 8.713539270937271e-05, |
|
"loss": 2.788, |
|
"step": 1883 |
|
}, |
|
{ |
|
"epoch": 0.5459272097053726, |
|
"grad_norm": 1.684091567993164, |
|
"learning_rate": 8.702323751388541e-05, |
|
"loss": 2.7282, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.547949162333911, |
|
"grad_norm": 1.654599666595459, |
|
"learning_rate": 8.691066840057223e-05, |
|
"loss": 2.6806, |
|
"step": 1897 |
|
}, |
|
{ |
|
"epoch": 0.5499711149624494, |
|
"grad_norm": 1.6643625497817993, |
|
"learning_rate": 8.679768662794985e-05, |
|
"loss": 2.7077, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 0.5519930675909879, |
|
"grad_norm": 1.6734774112701416, |
|
"learning_rate": 8.66842934591485e-05, |
|
"loss": 2.7808, |
|
"step": 1911 |
|
}, |
|
{ |
|
"epoch": 0.5540150202195263, |
|
"grad_norm": 1.5683385133743286, |
|
"learning_rate": 8.657049016189776e-05, |
|
"loss": 2.75, |
|
"step": 1918 |
|
}, |
|
{ |
|
"epoch": 0.5560369728480647, |
|
"grad_norm": 1.7231022119522095, |
|
"learning_rate": 8.645627800851244e-05, |
|
"loss": 2.7396, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.5580589254766031, |
|
"grad_norm": 1.6285254955291748, |
|
"learning_rate": 8.634165827587834e-05, |
|
"loss": 2.7408, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 0.5600808781051415, |
|
"grad_norm": 1.6170200109481812, |
|
"learning_rate": 8.622663224543797e-05, |
|
"loss": 2.6624, |
|
"step": 1939 |
|
}, |
|
{ |
|
"epoch": 0.56210283073368, |
|
"grad_norm": 1.5798912048339844, |
|
"learning_rate": 8.611120120317623e-05, |
|
"loss": 2.6589, |
|
"step": 1946 |
|
}, |
|
{ |
|
"epoch": 0.5641247833622184, |
|
"grad_norm": 1.4929171800613403, |
|
"learning_rate": 8.599536643960605e-05, |
|
"loss": 2.6562, |
|
"step": 1953 |
|
}, |
|
{ |
|
"epoch": 0.5661467359907568, |
|
"grad_norm": 1.6575974225997925, |
|
"learning_rate": 8.587912924975391e-05, |
|
"loss": 2.6959, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5681686886192953, |
|
"grad_norm": 1.6966285705566406, |
|
"learning_rate": 8.576249093314541e-05, |
|
"loss": 2.6809, |
|
"step": 1967 |
|
}, |
|
{ |
|
"epoch": 0.5701906412478336, |
|
"grad_norm": 1.5209206342697144, |
|
"learning_rate": 8.564545279379073e-05, |
|
"loss": 2.6698, |
|
"step": 1974 |
|
}, |
|
{ |
|
"epoch": 0.572212593876372, |
|
"grad_norm": 1.6631150245666504, |
|
"learning_rate": 8.552801614017004e-05, |
|
"loss": 2.7044, |
|
"step": 1981 |
|
}, |
|
{ |
|
"epoch": 0.5742345465049105, |
|
"grad_norm": 1.6683377027511597, |
|
"learning_rate": 8.541018228521886e-05, |
|
"loss": 2.7116, |
|
"step": 1988 |
|
}, |
|
{ |
|
"epoch": 0.5762564991334489, |
|
"grad_norm": 1.5585025548934937, |
|
"learning_rate": 8.529195254631345e-05, |
|
"loss": 2.7277, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.5782784517619873, |
|
"grad_norm": 1.5222512483596802, |
|
"learning_rate": 8.517332824525599e-05, |
|
"loss": 2.6625, |
|
"step": 2002 |
|
}, |
|
{ |
|
"epoch": 0.5803004043905257, |
|
"grad_norm": 1.634189486503601, |
|
"learning_rate": 8.505431070825985e-05, |
|
"loss": 2.7999, |
|
"step": 2009 |
|
}, |
|
{ |
|
"epoch": 0.5823223570190641, |
|
"grad_norm": 1.697341799736023, |
|
"learning_rate": 8.493490126593479e-05, |
|
"loss": 2.6942, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 0.5843443096476025, |
|
"grad_norm": 1.6360241174697876, |
|
"learning_rate": 8.481510125327198e-05, |
|
"loss": 2.7039, |
|
"step": 2023 |
|
}, |
|
{ |
|
"epoch": 0.586366262276141, |
|
"grad_norm": 1.7048941850662231, |
|
"learning_rate": 8.46949120096292e-05, |
|
"loss": 2.5961, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5883882149046794, |
|
"grad_norm": 1.4944428205490112, |
|
"learning_rate": 8.457433487871582e-05, |
|
"loss": 2.692, |
|
"step": 2037 |
|
}, |
|
{ |
|
"epoch": 0.5904101675332178, |
|
"grad_norm": 1.6939690113067627, |
|
"learning_rate": 8.445337120857775e-05, |
|
"loss": 2.6814, |
|
"step": 2044 |
|
}, |
|
{ |
|
"epoch": 0.5924321201617562, |
|
"grad_norm": 1.5280512571334839, |
|
"learning_rate": 8.433202235158237e-05, |
|
"loss": 2.6694, |
|
"step": 2051 |
|
}, |
|
{ |
|
"epoch": 0.5944540727902946, |
|
"grad_norm": 1.6327159404754639, |
|
"learning_rate": 8.421028966440345e-05, |
|
"loss": 2.6265, |
|
"step": 2058 |
|
}, |
|
{ |
|
"epoch": 0.596476025418833, |
|
"grad_norm": 1.5076837539672852, |
|
"learning_rate": 8.408817450800594e-05, |
|
"loss": 2.6193, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.5984979780473715, |
|
"grad_norm": 1.753414511680603, |
|
"learning_rate": 8.396567824763084e-05, |
|
"loss": 2.7144, |
|
"step": 2072 |
|
}, |
|
{ |
|
"epoch": 0.6005199306759099, |
|
"grad_norm": 1.5676695108413696, |
|
"learning_rate": 8.384280225277978e-05, |
|
"loss": 2.6933, |
|
"step": 2079 |
|
}, |
|
{ |
|
"epoch": 0.6025418833044482, |
|
"grad_norm": 1.5314265489578247, |
|
"learning_rate": 8.371954789719986e-05, |
|
"loss": 2.6253, |
|
"step": 2086 |
|
}, |
|
{ |
|
"epoch": 0.6045638359329867, |
|
"grad_norm": 1.5249189138412476, |
|
"learning_rate": 8.359591655886822e-05, |
|
"loss": 2.5989, |
|
"step": 2093 |
|
}, |
|
{ |
|
"epoch": 0.6065857885615251, |
|
"grad_norm": 1.6141725778579712, |
|
"learning_rate": 8.347190961997666e-05, |
|
"loss": 2.6513, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6086077411900636, |
|
"grad_norm": 2.041733980178833, |
|
"learning_rate": 8.334752846691614e-05, |
|
"loss": 2.7081, |
|
"step": 2107 |
|
}, |
|
{ |
|
"epoch": 0.610629693818602, |
|
"grad_norm": 1.6285977363586426, |
|
"learning_rate": 8.322277449026135e-05, |
|
"loss": 2.6404, |
|
"step": 2114 |
|
}, |
|
{ |
|
"epoch": 0.6126516464471404, |
|
"grad_norm": 1.5082623958587646, |
|
"learning_rate": 8.309764908475508e-05, |
|
"loss": 2.5684, |
|
"step": 2121 |
|
}, |
|
{ |
|
"epoch": 0.6146735990756788, |
|
"grad_norm": 1.5108692646026611, |
|
"learning_rate": 8.297215364929274e-05, |
|
"loss": 2.6821, |
|
"step": 2128 |
|
}, |
|
{ |
|
"epoch": 0.6166955517042172, |
|
"grad_norm": 1.6972891092300415, |
|
"learning_rate": 8.28462895869066e-05, |
|
"loss": 2.6732, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.6187175043327556, |
|
"grad_norm": 69.6792984008789, |
|
"learning_rate": 8.272005830475022e-05, |
|
"loss": 2.6544, |
|
"step": 2142 |
|
}, |
|
{ |
|
"epoch": 0.6207394569612941, |
|
"grad_norm": 1.5854318141937256, |
|
"learning_rate": 8.259346121408258e-05, |
|
"loss": 2.7488, |
|
"step": 2149 |
|
}, |
|
{ |
|
"epoch": 0.6227614095898325, |
|
"grad_norm": 1.8268544673919678, |
|
"learning_rate": 8.246649973025244e-05, |
|
"loss": 2.5605, |
|
"step": 2156 |
|
}, |
|
{ |
|
"epoch": 0.6247833622183708, |
|
"grad_norm": 1.5942052602767944, |
|
"learning_rate": 8.233917527268242e-05, |
|
"loss": 2.661, |
|
"step": 2163 |
|
}, |
|
{ |
|
"epoch": 0.6268053148469093, |
|
"grad_norm": 1.622071385383606, |
|
"learning_rate": 8.22114892648532e-05, |
|
"loss": 2.7426, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.6288272674754477, |
|
"grad_norm": 1.594794511795044, |
|
"learning_rate": 8.208344313428753e-05, |
|
"loss": 2.704, |
|
"step": 2177 |
|
}, |
|
{ |
|
"epoch": 0.6308492201039861, |
|
"grad_norm": 1.5333237648010254, |
|
"learning_rate": 8.195503831253438e-05, |
|
"loss": 2.6622, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 0.6328711727325246, |
|
"grad_norm": 1.5856921672821045, |
|
"learning_rate": 8.182627623515278e-05, |
|
"loss": 2.6087, |
|
"step": 2191 |
|
}, |
|
{ |
|
"epoch": 0.634893125361063, |
|
"grad_norm": 1.5291564464569092, |
|
"learning_rate": 8.169715834169593e-05, |
|
"loss": 2.6231, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 0.6369150779896013, |
|
"grad_norm": 1.6826353073120117, |
|
"learning_rate": 8.156768607569501e-05, |
|
"loss": 2.693, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.6389370306181398, |
|
"grad_norm": 1.5820770263671875, |
|
"learning_rate": 8.143786088464306e-05, |
|
"loss": 2.5229, |
|
"step": 2212 |
|
}, |
|
{ |
|
"epoch": 0.6409589832466782, |
|
"grad_norm": 1.591989517211914, |
|
"learning_rate": 8.130768421997881e-05, |
|
"loss": 2.6624, |
|
"step": 2219 |
|
}, |
|
{ |
|
"epoch": 0.6429809358752167, |
|
"grad_norm": 1.6424845457077026, |
|
"learning_rate": 8.117715753707045e-05, |
|
"loss": 2.6297, |
|
"step": 2226 |
|
}, |
|
{ |
|
"epoch": 0.6450028885037551, |
|
"grad_norm": 1.6022992134094238, |
|
"learning_rate": 8.104628229519935e-05, |
|
"loss": 2.6345, |
|
"step": 2233 |
|
}, |
|
{ |
|
"epoch": 0.6470248411322935, |
|
"grad_norm": 1.543290138244629, |
|
"learning_rate": 8.091505995754375e-05, |
|
"loss": 2.5814, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.6490467937608319, |
|
"grad_norm": 1.5112725496292114, |
|
"learning_rate": 8.078349199116241e-05, |
|
"loss": 2.637, |
|
"step": 2247 |
|
}, |
|
{ |
|
"epoch": 0.6510687463893703, |
|
"grad_norm": 1.6379178762435913, |
|
"learning_rate": 8.065157986697819e-05, |
|
"loss": 2.6515, |
|
"step": 2254 |
|
}, |
|
{ |
|
"epoch": 0.6530906990179087, |
|
"grad_norm": 1.5617547035217285, |
|
"learning_rate": 8.051932505976161e-05, |
|
"loss": 2.6124, |
|
"step": 2261 |
|
}, |
|
{ |
|
"epoch": 0.6551126516464472, |
|
"grad_norm": 1.7973439693450928, |
|
"learning_rate": 8.03867290481144e-05, |
|
"loss": 2.7264, |
|
"step": 2268 |
|
}, |
|
{ |
|
"epoch": 0.6571346042749856, |
|
"grad_norm": 1.5679056644439697, |
|
"learning_rate": 8.025379331445291e-05, |
|
"loss": 2.6324, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.6591565569035239, |
|
"grad_norm": 1.546196460723877, |
|
"learning_rate": 8.012051934499156e-05, |
|
"loss": 2.6175, |
|
"step": 2282 |
|
}, |
|
{ |
|
"epoch": 0.6611785095320624, |
|
"grad_norm": 1.556950330734253, |
|
"learning_rate": 7.998690862972626e-05, |
|
"loss": 2.64, |
|
"step": 2289 |
|
}, |
|
{ |
|
"epoch": 0.6632004621606008, |
|
"grad_norm": 1.6919306516647339, |
|
"learning_rate": 7.985296266241768e-05, |
|
"loss": 2.5713, |
|
"step": 2296 |
|
}, |
|
{ |
|
"epoch": 0.6652224147891392, |
|
"grad_norm": 1.5550673007965088, |
|
"learning_rate": 7.971868294057461e-05, |
|
"loss": 2.6087, |
|
"step": 2303 |
|
}, |
|
{ |
|
"epoch": 0.6672443674176777, |
|
"grad_norm": 1.4847160577774048, |
|
"learning_rate": 7.958407096543721e-05, |
|
"loss": 2.6296, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6692663200462161, |
|
"grad_norm": 1.5030337572097778, |
|
"learning_rate": 7.944912824196019e-05, |
|
"loss": 2.5972, |
|
"step": 2317 |
|
}, |
|
{ |
|
"epoch": 0.6712882726747544, |
|
"grad_norm": 1.541245698928833, |
|
"learning_rate": 7.931385627879603e-05, |
|
"loss": 2.6291, |
|
"step": 2324 |
|
}, |
|
{ |
|
"epoch": 0.6733102253032929, |
|
"grad_norm": 1.5703462362289429, |
|
"learning_rate": 7.917825658827807e-05, |
|
"loss": 2.6247, |
|
"step": 2331 |
|
}, |
|
{ |
|
"epoch": 0.6753321779318313, |
|
"grad_norm": 1.6782206296920776, |
|
"learning_rate": 7.904233068640364e-05, |
|
"loss": 2.6725, |
|
"step": 2338 |
|
}, |
|
{ |
|
"epoch": 0.6773541305603697, |
|
"grad_norm": 1.490038275718689, |
|
"learning_rate": 7.89060800928171e-05, |
|
"loss": 2.6327, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.6793760831889082, |
|
"grad_norm": 1.5980485677719116, |
|
"learning_rate": 7.876950633079281e-05, |
|
"loss": 2.5769, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 0.6813980358174465, |
|
"grad_norm": 1.5050002336502075, |
|
"learning_rate": 7.863261092721821e-05, |
|
"loss": 2.4925, |
|
"step": 2359 |
|
}, |
|
{ |
|
"epoch": 0.683419988445985, |
|
"grad_norm": 1.6050291061401367, |
|
"learning_rate": 7.84953954125766e-05, |
|
"loss": 2.5922, |
|
"step": 2366 |
|
}, |
|
{ |
|
"epoch": 0.6854419410745234, |
|
"grad_norm": 1.6114401817321777, |
|
"learning_rate": 7.835786132093014e-05, |
|
"loss": 2.696, |
|
"step": 2373 |
|
}, |
|
{ |
|
"epoch": 0.6874638937030618, |
|
"grad_norm": 1.5678999423980713, |
|
"learning_rate": 7.822001018990265e-05, |
|
"loss": 2.6002, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6894858463316003, |
|
"grad_norm": 1.6375820636749268, |
|
"learning_rate": 7.808184356066247e-05, |
|
"loss": 2.6161, |
|
"step": 2387 |
|
}, |
|
{ |
|
"epoch": 0.6915077989601387, |
|
"grad_norm": 1.5844779014587402, |
|
"learning_rate": 7.794336297790513e-05, |
|
"loss": 2.6841, |
|
"step": 2394 |
|
}, |
|
{ |
|
"epoch": 0.693529751588677, |
|
"grad_norm": 1.6627087593078613, |
|
"learning_rate": 7.780456998983619e-05, |
|
"loss": 2.5834, |
|
"step": 2401 |
|
}, |
|
{ |
|
"epoch": 0.6955517042172155, |
|
"grad_norm": 1.545854091644287, |
|
"learning_rate": 7.766546614815389e-05, |
|
"loss": 2.5658, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 0.6975736568457539, |
|
"grad_norm": 1.5244427919387817, |
|
"learning_rate": 7.752605300803176e-05, |
|
"loss": 2.6742, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.6995956094742923, |
|
"grad_norm": 1.6053026914596558, |
|
"learning_rate": 7.738633212810129e-05, |
|
"loss": 2.5592, |
|
"step": 2422 |
|
}, |
|
{ |
|
"epoch": 0.7016175621028308, |
|
"grad_norm": 1.5977808237075806, |
|
"learning_rate": 7.724630507043452e-05, |
|
"loss": 2.6546, |
|
"step": 2429 |
|
}, |
|
{ |
|
"epoch": 0.7036395147313691, |
|
"grad_norm": 1.5113341808319092, |
|
"learning_rate": 7.710597340052646e-05, |
|
"loss": 2.5986, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 0.7056614673599075, |
|
"grad_norm": 1.5165156126022339, |
|
"learning_rate": 7.696533868727772e-05, |
|
"loss": 2.6337, |
|
"step": 2443 |
|
}, |
|
{ |
|
"epoch": 0.707683419988446, |
|
"grad_norm": 1.6537933349609375, |
|
"learning_rate": 7.682440250297693e-05, |
|
"loss": 2.6414, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.7097053726169844, |
|
"grad_norm": 1.6465896368026733, |
|
"learning_rate": 7.668316642328312e-05, |
|
"loss": 2.5988, |
|
"step": 2457 |
|
}, |
|
{ |
|
"epoch": 0.7117273252455228, |
|
"grad_norm": 1.5026781558990479, |
|
"learning_rate": 7.654163202720818e-05, |
|
"loss": 2.5555, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 0.7137492778740613, |
|
"grad_norm": 1.5290915966033936, |
|
"learning_rate": 7.63998008970991e-05, |
|
"loss": 2.5399, |
|
"step": 2471 |
|
}, |
|
{ |
|
"epoch": 0.7157712305025996, |
|
"grad_norm": 1.5298501253128052, |
|
"learning_rate": 7.625767461862036e-05, |
|
"loss": 2.5872, |
|
"step": 2478 |
|
}, |
|
{ |
|
"epoch": 0.717793183131138, |
|
"grad_norm": 1.4506494998931885, |
|
"learning_rate": 7.611525478073622e-05, |
|
"loss": 2.581, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.7198151357596765, |
|
"grad_norm": 1.572906255722046, |
|
"learning_rate": 7.597254297569287e-05, |
|
"loss": 2.6715, |
|
"step": 2492 |
|
}, |
|
{ |
|
"epoch": 0.7218370883882149, |
|
"grad_norm": 1.5634372234344482, |
|
"learning_rate": 7.582954079900071e-05, |
|
"loss": 2.4712, |
|
"step": 2499 |
|
}, |
|
{ |
|
"epoch": 0.7238590410167534, |
|
"grad_norm": 1.4667270183563232, |
|
"learning_rate": 7.568624984941647e-05, |
|
"loss": 2.6222, |
|
"step": 2506 |
|
}, |
|
{ |
|
"epoch": 0.7258809936452918, |
|
"grad_norm": 1.5092616081237793, |
|
"learning_rate": 7.554267172892533e-05, |
|
"loss": 2.6669, |
|
"step": 2513 |
|
}, |
|
{ |
|
"epoch": 0.7279029462738301, |
|
"grad_norm": 1.562249779701233, |
|
"learning_rate": 7.539880804272306e-05, |
|
"loss": 2.6305, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7299248989023686, |
|
"grad_norm": 1.6032488346099854, |
|
"learning_rate": 7.5254660399198e-05, |
|
"loss": 2.7047, |
|
"step": 2527 |
|
}, |
|
{ |
|
"epoch": 0.731946851530907, |
|
"grad_norm": 1.5360654592514038, |
|
"learning_rate": 7.511023040991314e-05, |
|
"loss": 2.6315, |
|
"step": 2534 |
|
}, |
|
{ |
|
"epoch": 0.7339688041594454, |
|
"grad_norm": 1.5899386405944824, |
|
"learning_rate": 7.496551968958807e-05, |
|
"loss": 2.6078, |
|
"step": 2541 |
|
}, |
|
{ |
|
"epoch": 0.7359907567879839, |
|
"grad_norm": 1.521978497505188, |
|
"learning_rate": 7.482052985608097e-05, |
|
"loss": 2.5834, |
|
"step": 2548 |
|
}, |
|
{ |
|
"epoch": 0.7380127094165222, |
|
"grad_norm": 1.43201744556427, |
|
"learning_rate": 7.467526253037045e-05, |
|
"loss": 2.6157, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.7400346620450606, |
|
"grad_norm": 1.5489884614944458, |
|
"learning_rate": 7.452971933653748e-05, |
|
"loss": 2.6691, |
|
"step": 2562 |
|
}, |
|
{ |
|
"epoch": 0.7420566146735991, |
|
"grad_norm": 1.496980905532837, |
|
"learning_rate": 7.438390190174724e-05, |
|
"loss": 2.612, |
|
"step": 2569 |
|
}, |
|
{ |
|
"epoch": 0.7440785673021375, |
|
"grad_norm": 1.6220296621322632, |
|
"learning_rate": 7.423781185623087e-05, |
|
"loss": 2.5776, |
|
"step": 2576 |
|
}, |
|
{ |
|
"epoch": 0.7461005199306759, |
|
"grad_norm": 1.5279968976974487, |
|
"learning_rate": 7.409145083326733e-05, |
|
"loss": 2.5984, |
|
"step": 2583 |
|
}, |
|
{ |
|
"epoch": 0.7481224725592144, |
|
"grad_norm": 1.5506973266601562, |
|
"learning_rate": 7.394482046916504e-05, |
|
"loss": 2.5105, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.7501444251877527, |
|
"grad_norm": 1.597609281539917, |
|
"learning_rate": 7.379792240324372e-05, |
|
"loss": 2.552, |
|
"step": 2597 |
|
}, |
|
{ |
|
"epoch": 0.7521663778162911, |
|
"grad_norm": 1.946540355682373, |
|
"learning_rate": 7.365075827781589e-05, |
|
"loss": 2.5808, |
|
"step": 2604 |
|
}, |
|
{ |
|
"epoch": 0.7541883304448296, |
|
"grad_norm": 1.557524561882019, |
|
"learning_rate": 7.350332973816867e-05, |
|
"loss": 2.5806, |
|
"step": 2611 |
|
}, |
|
{ |
|
"epoch": 0.756210283073368, |
|
"grad_norm": 1.6246957778930664, |
|
"learning_rate": 7.335563843254527e-05, |
|
"loss": 2.5267, |
|
"step": 2618 |
|
}, |
|
{ |
|
"epoch": 0.7582322357019065, |
|
"grad_norm": 1.5411909818649292, |
|
"learning_rate": 7.320768601212663e-05, |
|
"loss": 2.6094, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.7602541883304448, |
|
"grad_norm": 1.5607600212097168, |
|
"learning_rate": 7.305947413101295e-05, |
|
"loss": 2.579, |
|
"step": 2632 |
|
}, |
|
{ |
|
"epoch": 0.7622761409589832, |
|
"grad_norm": 1.5495412349700928, |
|
"learning_rate": 7.291100444620518e-05, |
|
"loss": 2.6576, |
|
"step": 2639 |
|
}, |
|
{ |
|
"epoch": 0.7642980935875217, |
|
"grad_norm": 1.6648463010787964, |
|
"learning_rate": 7.27622786175865e-05, |
|
"loss": 2.5696, |
|
"step": 2646 |
|
}, |
|
{ |
|
"epoch": 0.7663200462160601, |
|
"grad_norm": 1.562121868133545, |
|
"learning_rate": 7.261329830790376e-05, |
|
"loss": 2.6125, |
|
"step": 2653 |
|
}, |
|
{ |
|
"epoch": 0.7683419988445985, |
|
"grad_norm": 1.5453815460205078, |
|
"learning_rate": 7.246406518274886e-05, |
|
"loss": 2.4759, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.770363951473137, |
|
"grad_norm": 1.4709010124206543, |
|
"learning_rate": 7.231458091054026e-05, |
|
"loss": 2.5199, |
|
"step": 2667 |
|
}, |
|
{ |
|
"epoch": 0.7723859041016753, |
|
"grad_norm": 1.6227165460586548, |
|
"learning_rate": 7.216484716250414e-05, |
|
"loss": 2.6122, |
|
"step": 2674 |
|
}, |
|
{ |
|
"epoch": 0.7744078567302137, |
|
"grad_norm": 1.4720866680145264, |
|
"learning_rate": 7.201486561265582e-05, |
|
"loss": 2.5468, |
|
"step": 2681 |
|
}, |
|
{ |
|
"epoch": 0.7764298093587522, |
|
"grad_norm": 1.4698817729949951, |
|
"learning_rate": 7.18646379377811e-05, |
|
"loss": 2.6578, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 0.7784517619872906, |
|
"grad_norm": 1.6236904859542847, |
|
"learning_rate": 7.171416581741734e-05, |
|
"loss": 2.5009, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.780473714615829, |
|
"grad_norm": 1.4604315757751465, |
|
"learning_rate": 7.156345093383489e-05, |
|
"loss": 2.4697, |
|
"step": 2702 |
|
}, |
|
{ |
|
"epoch": 0.7824956672443674, |
|
"grad_norm": 1.632841944694519, |
|
"learning_rate": 7.14124949720181e-05, |
|
"loss": 2.5719, |
|
"step": 2709 |
|
}, |
|
{ |
|
"epoch": 0.7845176198729058, |
|
"grad_norm": 1.5395116806030273, |
|
"learning_rate": 7.126129961964658e-05, |
|
"loss": 2.5451, |
|
"step": 2716 |
|
}, |
|
{ |
|
"epoch": 0.7865395725014442, |
|
"grad_norm": 1.4505419731140137, |
|
"learning_rate": 7.110986656707634e-05, |
|
"loss": 2.5052, |
|
"step": 2723 |
|
}, |
|
{ |
|
"epoch": 0.7885615251299827, |
|
"grad_norm": 1.4465105533599854, |
|
"learning_rate": 7.095819750732089e-05, |
|
"loss": 2.6008, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.7905834777585211, |
|
"grad_norm": 1.467899203300476, |
|
"learning_rate": 7.08062941360322e-05, |
|
"loss": 2.5364, |
|
"step": 2737 |
|
}, |
|
{ |
|
"epoch": 0.7926054303870596, |
|
"grad_norm": 1.5132942199707031, |
|
"learning_rate": 7.065415815148189e-05, |
|
"loss": 2.5405, |
|
"step": 2744 |
|
}, |
|
{ |
|
"epoch": 0.7946273830155979, |
|
"grad_norm": 1.4897785186767578, |
|
"learning_rate": 7.050179125454217e-05, |
|
"loss": 2.5331, |
|
"step": 2751 |
|
}, |
|
{ |
|
"epoch": 0.7966493356441363, |
|
"grad_norm": 1.5240031480789185, |
|
"learning_rate": 7.034919514866688e-05, |
|
"loss": 2.4342, |
|
"step": 2758 |
|
}, |
|
{ |
|
"epoch": 0.7986712882726748, |
|
"grad_norm": 1.5082119703292847, |
|
"learning_rate": 7.019637153987232e-05, |
|
"loss": 2.589, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.8006932409012132, |
|
"grad_norm": 1.413856863975525, |
|
"learning_rate": 7.004332213671832e-05, |
|
"loss": 2.5479, |
|
"step": 2772 |
|
}, |
|
{ |
|
"epoch": 0.8027151935297516, |
|
"grad_norm": 1.492169737815857, |
|
"learning_rate": 6.98900486502891e-05, |
|
"loss": 2.5689, |
|
"step": 2779 |
|
}, |
|
{ |
|
"epoch": 0.80473714615829, |
|
"grad_norm": 1.478893756866455, |
|
"learning_rate": 6.973655279417404e-05, |
|
"loss": 2.5185, |
|
"step": 2786 |
|
}, |
|
{ |
|
"epoch": 0.8067590987868284, |
|
"grad_norm": 1.4768402576446533, |
|
"learning_rate": 6.958283628444866e-05, |
|
"loss": 2.5312, |
|
"step": 2793 |
|
}, |
|
{ |
|
"epoch": 0.8087810514153668, |
|
"grad_norm": 1.569712519645691, |
|
"learning_rate": 6.942890083965538e-05, |
|
"loss": 2.6235, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8108030040439053, |
|
"grad_norm": 1.5381864309310913, |
|
"learning_rate": 6.927474818078424e-05, |
|
"loss": 2.5677, |
|
"step": 2807 |
|
}, |
|
{ |
|
"epoch": 0.8128249566724437, |
|
"grad_norm": 2.156064987182617, |
|
"learning_rate": 6.912038003125381e-05, |
|
"loss": 2.5038, |
|
"step": 2814 |
|
}, |
|
{ |
|
"epoch": 0.8148469093009821, |
|
"grad_norm": 1.4678055047988892, |
|
"learning_rate": 6.896579811689176e-05, |
|
"loss": 2.5175, |
|
"step": 2821 |
|
}, |
|
{ |
|
"epoch": 0.8168688619295205, |
|
"grad_norm": 1.4652937650680542, |
|
"learning_rate": 6.881100416591569e-05, |
|
"loss": 2.5186, |
|
"step": 2828 |
|
}, |
|
{ |
|
"epoch": 0.8188908145580589, |
|
"grad_norm": 1.5202971696853638, |
|
"learning_rate": 6.865599990891374e-05, |
|
"loss": 2.5939, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.8209127671865973, |
|
"grad_norm": 1.546286702156067, |
|
"learning_rate": 6.850078707882524e-05, |
|
"loss": 2.4651, |
|
"step": 2842 |
|
}, |
|
{ |
|
"epoch": 0.8229347198151358, |
|
"grad_norm": 1.5782599449157715, |
|
"learning_rate": 6.834536741092137e-05, |
|
"loss": 2.5271, |
|
"step": 2849 |
|
}, |
|
{ |
|
"epoch": 0.8249566724436742, |
|
"grad_norm": 1.4335618019104004, |
|
"learning_rate": 6.818974264278578e-05, |
|
"loss": 2.5743, |
|
"step": 2856 |
|
}, |
|
{ |
|
"epoch": 0.8269786250722126, |
|
"grad_norm": 1.5671149492263794, |
|
"learning_rate": 6.803391451429505e-05, |
|
"loss": 2.548, |
|
"step": 2863 |
|
}, |
|
{ |
|
"epoch": 0.829000577700751, |
|
"grad_norm": 1.423937439918518, |
|
"learning_rate": 6.787788476759942e-05, |
|
"loss": 2.5809, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.8310225303292894, |
|
"grad_norm": 1.490062952041626, |
|
"learning_rate": 6.772165514710314e-05, |
|
"loss": 2.5932, |
|
"step": 2877 |
|
}, |
|
{ |
|
"epoch": 0.8330444829578278, |
|
"grad_norm": 1.6760882139205933, |
|
"learning_rate": 6.756522739944503e-05, |
|
"loss": 2.4622, |
|
"step": 2884 |
|
}, |
|
{ |
|
"epoch": 0.8350664355863663, |
|
"grad_norm": 1.506843090057373, |
|
"learning_rate": 6.740860327347903e-05, |
|
"loss": 2.4669, |
|
"step": 2891 |
|
}, |
|
{ |
|
"epoch": 0.8370883882149047, |
|
"grad_norm": 1.5637836456298828, |
|
"learning_rate": 6.725178452025448e-05, |
|
"loss": 2.5673, |
|
"step": 2898 |
|
}, |
|
{ |
|
"epoch": 0.839110340843443, |
|
"grad_norm": 1.503013253211975, |
|
"learning_rate": 6.709477289299676e-05, |
|
"loss": 2.5664, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.8411322934719815, |
|
"grad_norm": 1.470767855644226, |
|
"learning_rate": 6.693757014708747e-05, |
|
"loss": 2.4871, |
|
"step": 2912 |
|
}, |
|
{ |
|
"epoch": 0.8431542461005199, |
|
"grad_norm": 1.5470950603485107, |
|
"learning_rate": 6.678017804004495e-05, |
|
"loss": 2.5527, |
|
"step": 2919 |
|
}, |
|
{ |
|
"epoch": 0.8451761987290584, |
|
"grad_norm": 1.5984236001968384, |
|
"learning_rate": 6.662259833150462e-05, |
|
"loss": 2.5518, |
|
"step": 2926 |
|
}, |
|
{ |
|
"epoch": 0.8471981513575968, |
|
"grad_norm": 1.5305312871932983, |
|
"learning_rate": 6.646483278319919e-05, |
|
"loss": 2.522, |
|
"step": 2933 |
|
}, |
|
{ |
|
"epoch": 0.8492201039861352, |
|
"grad_norm": 1.5044381618499756, |
|
"learning_rate": 6.630688315893914e-05, |
|
"loss": 2.5722, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.8512420566146736, |
|
"grad_norm": 1.4263911247253418, |
|
"learning_rate": 6.614875122459284e-05, |
|
"loss": 2.5258, |
|
"step": 2947 |
|
}, |
|
{ |
|
"epoch": 0.853264009243212, |
|
"grad_norm": 1.5409590005874634, |
|
"learning_rate": 6.59904387480669e-05, |
|
"loss": 2.5226, |
|
"step": 2954 |
|
}, |
|
{ |
|
"epoch": 0.8552859618717504, |
|
"grad_norm": 1.4655554294586182, |
|
"learning_rate": 6.58319474992864e-05, |
|
"loss": 2.5071, |
|
"step": 2961 |
|
}, |
|
{ |
|
"epoch": 0.8573079145002889, |
|
"grad_norm": 1.4593043327331543, |
|
"learning_rate": 6.567327925017507e-05, |
|
"loss": 2.5224, |
|
"step": 2968 |
|
}, |
|
{ |
|
"epoch": 0.8593298671288273, |
|
"grad_norm": 1.6059690713882446, |
|
"learning_rate": 6.551443577463549e-05, |
|
"loss": 2.4767, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.8613518197573656, |
|
"grad_norm": 3.554884433746338, |
|
"learning_rate": 6.535541884852927e-05, |
|
"loss": 2.5063, |
|
"step": 2982 |
|
}, |
|
{ |
|
"epoch": 0.8633737723859041, |
|
"grad_norm": 2.0433740615844727, |
|
"learning_rate": 6.519623024965718e-05, |
|
"loss": 2.5728, |
|
"step": 2989 |
|
}, |
|
{ |
|
"epoch": 0.8653957250144425, |
|
"grad_norm": 3.690753698348999, |
|
"learning_rate": 6.503687175773928e-05, |
|
"loss": 2.5452, |
|
"step": 2996 |
|
}, |
|
{ |
|
"epoch": 0.8674176776429809, |
|
"grad_norm": 1.5708411931991577, |
|
"learning_rate": 6.487734515439505e-05, |
|
"loss": 2.4689, |
|
"step": 3003 |
|
}, |
|
{ |
|
"epoch": 0.8694396302715194, |
|
"grad_norm": 1.6525423526763916, |
|
"learning_rate": 6.471765222312342e-05, |
|
"loss": 2.5138, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.8714615829000578, |
|
"grad_norm": 1.5194200277328491, |
|
"learning_rate": 6.455779474928286e-05, |
|
"loss": 2.6165, |
|
"step": 3017 |
|
}, |
|
{ |
|
"epoch": 0.8734835355285961, |
|
"grad_norm": 1.4519085884094238, |
|
"learning_rate": 6.439777452007144e-05, |
|
"loss": 2.564, |
|
"step": 3024 |
|
}, |
|
{ |
|
"epoch": 0.8755054881571346, |
|
"grad_norm": 1.615209698677063, |
|
"learning_rate": 6.423759332450681e-05, |
|
"loss": 2.5355, |
|
"step": 3031 |
|
}, |
|
{ |
|
"epoch": 0.877527440785673, |
|
"grad_norm": 4.623706340789795, |
|
"learning_rate": 6.407725295340619e-05, |
|
"loss": 2.5307, |
|
"step": 3038 |
|
}, |
|
{ |
|
"epoch": 0.8795493934142115, |
|
"grad_norm": 1.492876648902893, |
|
"learning_rate": 6.391675519936642e-05, |
|
"loss": 2.5213, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.8815713460427499, |
|
"grad_norm": 1.6647744178771973, |
|
"learning_rate": 6.375610185674383e-05, |
|
"loss": 2.5198, |
|
"step": 3052 |
|
}, |
|
{ |
|
"epoch": 0.8835932986712882, |
|
"grad_norm": 1.4380768537521362, |
|
"learning_rate": 6.35952947216343e-05, |
|
"loss": 2.531, |
|
"step": 3059 |
|
}, |
|
{ |
|
"epoch": 0.8856152512998267, |
|
"grad_norm": 1.5382163524627686, |
|
"learning_rate": 6.343433559185296e-05, |
|
"loss": 2.5012, |
|
"step": 3066 |
|
}, |
|
{ |
|
"epoch": 0.8876372039283651, |
|
"grad_norm": 1.5382838249206543, |
|
"learning_rate": 6.327322626691441e-05, |
|
"loss": 2.4709, |
|
"step": 3073 |
|
}, |
|
{ |
|
"epoch": 0.8896591565569035, |
|
"grad_norm": 1.5894644260406494, |
|
"learning_rate": 6.311196854801227e-05, |
|
"loss": 2.496, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.891681109185442, |
|
"grad_norm": 1.4344818592071533, |
|
"learning_rate": 6.295056423799927e-05, |
|
"loss": 2.5521, |
|
"step": 3087 |
|
}, |
|
{ |
|
"epoch": 0.8937030618139804, |
|
"grad_norm": 1.4660284519195557, |
|
"learning_rate": 6.278901514136704e-05, |
|
"loss": 2.5297, |
|
"step": 3094 |
|
}, |
|
{ |
|
"epoch": 0.8957250144425187, |
|
"grad_norm": 1.585867166519165, |
|
"learning_rate": 6.262732306422582e-05, |
|
"loss": 2.5552, |
|
"step": 3101 |
|
}, |
|
{ |
|
"epoch": 0.8977469670710572, |
|
"grad_norm": 1.4793505668640137, |
|
"learning_rate": 6.246548981428453e-05, |
|
"loss": 2.5055, |
|
"step": 3108 |
|
}, |
|
{ |
|
"epoch": 0.8997689196995956, |
|
"grad_norm": 1.5395692586898804, |
|
"learning_rate": 6.230351720083021e-05, |
|
"loss": 2.5287, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.901790872328134, |
|
"grad_norm": 1.640716314315796, |
|
"learning_rate": 6.21414070347081e-05, |
|
"loss": 2.3934, |
|
"step": 3122 |
|
}, |
|
{ |
|
"epoch": 0.9038128249566725, |
|
"grad_norm": 1.448921799659729, |
|
"learning_rate": 6.197916112830122e-05, |
|
"loss": 2.5192, |
|
"step": 3129 |
|
}, |
|
{ |
|
"epoch": 0.9058347775852109, |
|
"grad_norm": 1.545032024383545, |
|
"learning_rate": 6.181678129551017e-05, |
|
"loss": 2.4952, |
|
"step": 3136 |
|
}, |
|
{ |
|
"epoch": 0.9078567302137492, |
|
"grad_norm": 1.4347082376480103, |
|
"learning_rate": 6.165426935173287e-05, |
|
"loss": 2.4568, |
|
"step": 3143 |
|
}, |
|
{ |
|
"epoch": 0.9098786828422877, |
|
"grad_norm": 1.5237294435501099, |
|
"learning_rate": 6.149162711384417e-05, |
|
"loss": 2.477, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9119006354708261, |
|
"grad_norm": 1.49074125289917, |
|
"learning_rate": 6.132885640017566e-05, |
|
"loss": 2.4879, |
|
"step": 3157 |
|
}, |
|
{ |
|
"epoch": 0.9139225880993646, |
|
"grad_norm": 1.4706346988677979, |
|
"learning_rate": 6.116595903049526e-05, |
|
"loss": 2.5777, |
|
"step": 3164 |
|
}, |
|
{ |
|
"epoch": 0.915944540727903, |
|
"grad_norm": 1.555942177772522, |
|
"learning_rate": 6.100293682598689e-05, |
|
"loss": 2.5135, |
|
"step": 3171 |
|
}, |
|
{ |
|
"epoch": 0.9179664933564413, |
|
"grad_norm": 1.5904544591903687, |
|
"learning_rate": 6.083979160923012e-05, |
|
"loss": 2.6248, |
|
"step": 3178 |
|
}, |
|
{ |
|
"epoch": 0.9199884459849798, |
|
"grad_norm": 1.4976989030838013, |
|
"learning_rate": 6.0676525204179815e-05, |
|
"loss": 2.4641, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.9220103986135182, |
|
"grad_norm": 1.5183124542236328, |
|
"learning_rate": 6.051313943614566e-05, |
|
"loss": 2.501, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 0.9240323512420566, |
|
"grad_norm": 1.5499184131622314, |
|
"learning_rate": 6.034963613177189e-05, |
|
"loss": 2.4758, |
|
"step": 3199 |
|
}, |
|
{ |
|
"epoch": 0.9260543038705951, |
|
"grad_norm": 1.4297561645507812, |
|
"learning_rate": 6.0186017119016744e-05, |
|
"loss": 2.5047, |
|
"step": 3206 |
|
}, |
|
{ |
|
"epoch": 0.9280762564991335, |
|
"grad_norm": 1.6014001369476318, |
|
"learning_rate": 6.002228422713205e-05, |
|
"loss": 2.5153, |
|
"step": 3213 |
|
}, |
|
{ |
|
"epoch": 0.9300982091276718, |
|
"grad_norm": 2.9846770763397217, |
|
"learning_rate": 5.9858439286642864e-05, |
|
"loss": 2.4438, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.9321201617562103, |
|
"grad_norm": 1.633302927017212, |
|
"learning_rate": 5.969448412932688e-05, |
|
"loss": 2.5063, |
|
"step": 3227 |
|
}, |
|
{ |
|
"epoch": 0.9341421143847487, |
|
"grad_norm": 1.5165661573410034, |
|
"learning_rate": 5.953042058819405e-05, |
|
"loss": 2.5131, |
|
"step": 3234 |
|
}, |
|
{ |
|
"epoch": 0.9361640670132871, |
|
"grad_norm": 1.5822478532791138, |
|
"learning_rate": 5.9366250497466025e-05, |
|
"loss": 2.4719, |
|
"step": 3241 |
|
}, |
|
{ |
|
"epoch": 0.9381860196418256, |
|
"grad_norm": 1.5682454109191895, |
|
"learning_rate": 5.92019756925557e-05, |
|
"loss": 2.4422, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 0.9402079722703639, |
|
"grad_norm": 1.4735851287841797, |
|
"learning_rate": 5.9037598010046644e-05, |
|
"loss": 2.4137, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.9422299248989023, |
|
"grad_norm": 1.435520887374878, |
|
"learning_rate": 5.887311928767263e-05, |
|
"loss": 2.5265, |
|
"step": 3262 |
|
}, |
|
{ |
|
"epoch": 0.9442518775274408, |
|
"grad_norm": 1.5351203680038452, |
|
"learning_rate": 5.8708541364296966e-05, |
|
"loss": 2.4811, |
|
"step": 3269 |
|
}, |
|
{ |
|
"epoch": 0.9462738301559792, |
|
"grad_norm": 1.3827540874481201, |
|
"learning_rate": 5.854386607989214e-05, |
|
"loss": 2.4361, |
|
"step": 3276 |
|
}, |
|
{ |
|
"epoch": 0.9482957827845176, |
|
"grad_norm": 1.46942937374115, |
|
"learning_rate": 5.837909527551901e-05, |
|
"loss": 2.5334, |
|
"step": 3283 |
|
}, |
|
{ |
|
"epoch": 0.9503177354130561, |
|
"grad_norm": 1.498850703239441, |
|
"learning_rate": 5.821423079330648e-05, |
|
"loss": 2.5236, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.9523396880415944, |
|
"grad_norm": 1.5063371658325195, |
|
"learning_rate": 5.804927447643065e-05, |
|
"loss": 2.4448, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 0.9543616406701328, |
|
"grad_norm": 1.611903429031372, |
|
"learning_rate": 5.7884228169094346e-05, |
|
"loss": 2.4297, |
|
"step": 3304 |
|
}, |
|
{ |
|
"epoch": 0.9563835932986713, |
|
"grad_norm": 1.4947395324707031, |
|
"learning_rate": 5.771909371650655e-05, |
|
"loss": 2.5439, |
|
"step": 3311 |
|
}, |
|
{ |
|
"epoch": 0.9584055459272097, |
|
"grad_norm": 1.4461942911148071, |
|
"learning_rate": 5.755387296486161e-05, |
|
"loss": 2.5017, |
|
"step": 3318 |
|
}, |
|
{ |
|
"epoch": 0.9604274985557482, |
|
"grad_norm": 1.4871538877487183, |
|
"learning_rate": 5.738856776131878e-05, |
|
"loss": 2.4582, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.9624494511842865, |
|
"grad_norm": 1.6216068267822266, |
|
"learning_rate": 5.722317995398142e-05, |
|
"loss": 2.5564, |
|
"step": 3332 |
|
}, |
|
{ |
|
"epoch": 0.9644714038128249, |
|
"grad_norm": 1.5217235088348389, |
|
"learning_rate": 5.705771139187642e-05, |
|
"loss": 2.4783, |
|
"step": 3339 |
|
}, |
|
{ |
|
"epoch": 0.9664933564413634, |
|
"grad_norm": 1.5554358959197998, |
|
"learning_rate": 5.689216392493352e-05, |
|
"loss": 2.4965, |
|
"step": 3346 |
|
}, |
|
{ |
|
"epoch": 0.9685153090699018, |
|
"grad_norm": 1.455360770225525, |
|
"learning_rate": 5.672653940396459e-05, |
|
"loss": 2.4727, |
|
"step": 3353 |
|
}, |
|
{ |
|
"epoch": 0.9705372616984402, |
|
"grad_norm": 1.4881621599197388, |
|
"learning_rate": 5.6560839680642916e-05, |
|
"loss": 2.522, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.9725592143269787, |
|
"grad_norm": 1.4497352838516235, |
|
"learning_rate": 5.6395066607482663e-05, |
|
"loss": 2.4652, |
|
"step": 3367 |
|
}, |
|
{ |
|
"epoch": 0.974581166955517, |
|
"grad_norm": 1.4684548377990723, |
|
"learning_rate": 5.622922203781792e-05, |
|
"loss": 2.3909, |
|
"step": 3374 |
|
}, |
|
{ |
|
"epoch": 0.9766031195840554, |
|
"grad_norm": 1.5175479650497437, |
|
"learning_rate": 5.6063307825782166e-05, |
|
"loss": 2.5645, |
|
"step": 3381 |
|
}, |
|
{ |
|
"epoch": 0.9786250722125939, |
|
"grad_norm": 1.4038801193237305, |
|
"learning_rate": 5.589732582628747e-05, |
|
"loss": 2.3975, |
|
"step": 3388 |
|
}, |
|
{ |
|
"epoch": 0.9806470248411323, |
|
"grad_norm": 1.4544683694839478, |
|
"learning_rate": 5.5731277895003754e-05, |
|
"loss": 2.4395, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.9826689774696707, |
|
"grad_norm": 1.468361496925354, |
|
"learning_rate": 5.556516588833807e-05, |
|
"loss": 2.4758, |
|
"step": 3402 |
|
}, |
|
{ |
|
"epoch": 0.9846909300982092, |
|
"grad_norm": 1.4997249841690063, |
|
"learning_rate": 5.539899166341378e-05, |
|
"loss": 2.4686, |
|
"step": 3409 |
|
}, |
|
{ |
|
"epoch": 0.9867128827267475, |
|
"grad_norm": 1.4386577606201172, |
|
"learning_rate": 5.5232757078049925e-05, |
|
"loss": 2.567, |
|
"step": 3416 |
|
}, |
|
{ |
|
"epoch": 0.988734835355286, |
|
"grad_norm": 1.4930377006530762, |
|
"learning_rate": 5.506646399074029e-05, |
|
"loss": 2.381, |
|
"step": 3423 |
|
}, |
|
{ |
|
"epoch": 0.9907567879838244, |
|
"grad_norm": 1.5108468532562256, |
|
"learning_rate": 5.4900114260632754e-05, |
|
"loss": 2.59, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.9927787406123628, |
|
"grad_norm": 1.8218153715133667, |
|
"learning_rate": 5.4733709747508465e-05, |
|
"loss": 2.4011, |
|
"step": 3437 |
|
}, |
|
{ |
|
"epoch": 0.9948006932409013, |
|
"grad_norm": 1.5193471908569336, |
|
"learning_rate": 5.456725231176102e-05, |
|
"loss": 2.4279, |
|
"step": 3444 |
|
}, |
|
{ |
|
"epoch": 0.9968226458694396, |
|
"grad_norm": 1.4844223260879517, |
|
"learning_rate": 5.440074381437569e-05, |
|
"loss": 2.4751, |
|
"step": 3451 |
|
}, |
|
{ |
|
"epoch": 0.998844598497978, |
|
"grad_norm": 1.5008902549743652, |
|
"learning_rate": 5.423418611690862e-05, |
|
"loss": 2.4627, |
|
"step": 3458 |
|
} |
|
], |
|
"logging_steps": 7, |
|
"max_steps": 6924, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 3462, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.85999467371561e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|