{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999775734469612,
"eval_steps": 500,
"global_step": 2229,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008970621215519175,
"grad_norm": 0.9008947014808655,
"learning_rate": 5.970149253731343e-06,
"loss": 3.6325,
"step": 20
},
{
"epoch": 0.01794124243103835,
"grad_norm": 0.5066778063774109,
"learning_rate": 1.1940298507462686e-05,
"loss": 2.6461,
"step": 40
},
{
"epoch": 0.026911863646557524,
"grad_norm": 0.5029802322387695,
"learning_rate": 1.791044776119403e-05,
"loss": 2.6127,
"step": 60
},
{
"epoch": 0.0358824848620767,
"grad_norm": 0.3778040111064911,
"learning_rate": 1.999821584672887e-05,
"loss": 2.5859,
"step": 80
},
{
"epoch": 0.04485310607759587,
"grad_norm": 0.3385876715183258,
"learning_rate": 1.998850515736159e-05,
"loss": 2.6276,
"step": 100
},
{
"epoch": 0.05382372729311505,
"grad_norm": 0.2863208055496216,
"learning_rate": 1.9970358823117534e-05,
"loss": 2.6081,
"step": 120
},
{
"epoch": 0.06279434850863422,
"grad_norm": 0.31687623262405396,
"learning_rate": 1.994379216921594e-05,
"loss": 2.5151,
"step": 140
},
{
"epoch": 0.0717649697241534,
"grad_norm": 0.27848467230796814,
"learning_rate": 1.990882763213298e-05,
"loss": 2.5367,
"step": 160
},
{
"epoch": 0.08073559093967257,
"grad_norm": 0.2719942629337311,
"learning_rate": 1.986549474065333e-05,
"loss": 2.6009,
"step": 180
},
{
"epoch": 0.08970621215519174,
"grad_norm": 0.28745323419570923,
"learning_rate": 1.98138300909321e-05,
"loss": 2.6414,
"step": 200
},
{
"epoch": 0.09867683337071093,
"grad_norm": 0.27338650822639465,
"learning_rate": 1.9753877315588072e-05,
"loss": 2.3677,
"step": 220
},
{
"epoch": 0.1076474545862301,
"grad_norm": 0.3499230742454529,
"learning_rate": 1.9685687046854415e-05,
"loss": 2.4714,
"step": 240
},
{
"epoch": 0.11661807580174927,
"grad_norm": 0.2322498857975006,
"learning_rate": 1.9609316873817992e-05,
"loss": 2.5646,
"step": 260
},
{
"epoch": 0.12558869701726844,
"grad_norm": 0.26533788442611694,
"learning_rate": 1.952483129378333e-05,
"loss": 2.511,
"step": 280
},
{
"epoch": 0.13455931823278763,
"grad_norm": 0.2915363013744354,
"learning_rate": 1.9432301657802378e-05,
"loss": 2.5829,
"step": 300
},
{
"epoch": 0.1435299394483068,
"grad_norm": 0.23644109070301056,
"learning_rate": 1.9331806110416027e-05,
"loss": 2.5513,
"step": 320
},
{
"epoch": 0.15250056066382597,
"grad_norm": 0.23692189157009125,
"learning_rate": 1.922342952365829e-05,
"loss": 2.4833,
"step": 340
},
{
"epoch": 0.16147118187934514,
"grad_norm": 0.22117172181606293,
"learning_rate": 1.9107263425378873e-05,
"loss": 2.5499,
"step": 360
},
{
"epoch": 0.1704418030948643,
"grad_norm": 0.2725989818572998,
"learning_rate": 1.8983405921944686e-05,
"loss": 2.4439,
"step": 380
},
{
"epoch": 0.17941242431038348,
"grad_norm": 0.22964678704738617,
"learning_rate": 1.8851961615385542e-05,
"loss": 2.5341,
"step": 400
},
{
"epoch": 0.18838304552590268,
"grad_norm": 0.22048306465148926,
"learning_rate": 1.8713041515054065e-05,
"loss": 2.5151,
"step": 420
},
{
"epoch": 0.19735366674142185,
"grad_norm": 0.2410048246383667,
"learning_rate": 1.8566762943874376e-05,
"loss": 2.4619,
"step": 440
},
{
"epoch": 0.20632428795694102,
"grad_norm": 0.2247011512517929,
"learning_rate": 1.8413249439258743e-05,
"loss": 2.5112,
"step": 460
},
{
"epoch": 0.2152949091724602,
"grad_norm": 0.25305867195129395,
"learning_rate": 1.8252630648775874e-05,
"loss": 2.5259,
"step": 480
},
{
"epoch": 0.22426553038797936,
"grad_norm": 0.23119617998600006,
"learning_rate": 1.8085042220658993e-05,
"loss": 2.488,
"step": 500
},
{
"epoch": 0.23323615160349853,
"grad_norm": 0.2174287885427475,
"learning_rate": 1.791062568924609e-05,
"loss": 2.491,
"step": 520
},
{
"epoch": 0.24220677281901773,
"grad_norm": 0.22464613616466522,
"learning_rate": 1.7729528355449214e-05,
"loss": 2.4441,
"step": 540
},
{
"epoch": 0.25117739403453687,
"grad_norm": 0.2646411657333374,
"learning_rate": 1.7541903162353638e-05,
"loss": 2.4999,
"step": 560
},
{
"epoch": 0.26014801525005604,
"grad_norm": 0.24847449362277985,
"learning_rate": 1.734790856605204e-05,
"loss": 2.4666,
"step": 580
},
{
"epoch": 0.26911863646557527,
"grad_norm": 0.20716962218284607,
"learning_rate": 1.714770840182273e-05,
"loss": 2.4222,
"step": 600
},
{
"epoch": 0.27808925768109444,
"grad_norm": 0.24155037105083466,
"learning_rate": 1.6941471745764996e-05,
"loss": 2.4417,
"step": 620
},
{
"epoch": 0.2870598788966136,
"grad_norm": 0.2298847883939743,
"learning_rate": 1.672937277200837e-05,
"loss": 2.5199,
"step": 640
},
{
"epoch": 0.2960305001121328,
"grad_norm": 0.22792883217334747,
"learning_rate": 1.6511590605616423e-05,
"loss": 2.4298,
"step": 660
},
{
"epoch": 0.30500112132765195,
"grad_norm": 0.2478325068950653,
"learning_rate": 1.628830917130935e-05,
"loss": 2.494,
"step": 680
},
{
"epoch": 0.3139717425431711,
"grad_norm": 0.22993171215057373,
"learning_rate": 1.6059717038133038e-05,
"loss": 2.5366,
"step": 700
},
{
"epoch": 0.3229423637586903,
"grad_norm": 0.23914852738380432,
"learning_rate": 1.5826007260205868e-05,
"loss": 2.4151,
"step": 720
},
{
"epoch": 0.33191298497420946,
"grad_norm": 0.2367142289876938,
"learning_rate": 1.5587377213677705e-05,
"loss": 2.3964,
"step": 740
},
{
"epoch": 0.3408836061897286,
"grad_norm": 0.25746768712997437,
"learning_rate": 1.5344028430038764e-05,
"loss": 2.4184,
"step": 760
},
{
"epoch": 0.3498542274052478,
"grad_norm": 0.23596778512001038,
"learning_rate": 1.5096166425919176e-05,
"loss": 2.4126,
"step": 780
},
{
"epoch": 0.35882484862076697,
"grad_norm": 0.22444961965084076,
"learning_rate": 1.4844000529522942e-05,
"loss": 2.4106,
"step": 800
},
{
"epoch": 0.36779546983628614,
"grad_norm": 0.2117769718170166,
"learning_rate": 1.458774370384287e-05,
"loss": 2.4786,
"step": 820
},
{
"epoch": 0.37676609105180536,
"grad_norm": 0.19232399761676788,
"learning_rate": 1.4327612366805832e-05,
"loss": 2.436,
"step": 840
},
{
"epoch": 0.38573671226732453,
"grad_norm": 0.2247142344713211,
"learning_rate": 1.4063826208500182e-05,
"loss": 2.5193,
"step": 860
},
{
"epoch": 0.3947073334828437,
"grad_norm": 0.24078157544136047,
"learning_rate": 1.3796608005639738e-05,
"loss": 2.5,
"step": 880
},
{
"epoch": 0.4036779546983629,
"grad_norm": 0.2095336616039276,
"learning_rate": 1.352618343342098e-05,
"loss": 2.4365,
"step": 900
},
{
"epoch": 0.41264857591388204,
"grad_norm": 0.2053331881761551,
"learning_rate": 1.3252780874932395e-05,
"loss": 2.4161,
"step": 920
},
{
"epoch": 0.4216191971294012,
"grad_norm": 0.20598524808883667,
"learning_rate": 1.2976631228276894e-05,
"loss": 2.4314,
"step": 940
},
{
"epoch": 0.4305898183449204,
"grad_norm": 0.2612752318382263,
"learning_rate": 1.2697967711570243e-05,
"loss": 2.3568,
"step": 960
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.18916811048984528,
"learning_rate": 1.2417025665980114e-05,
"loss": 2.4058,
"step": 980
},
{
"epoch": 0.4485310607759587,
"grad_norm": 0.23678423464298248,
"learning_rate": 1.2134042356972175e-05,
"loss": 2.4794,
"step": 1000
},
{
"epoch": 0.4575016819914779,
"grad_norm": 0.204436257481575,
"learning_rate": 1.1849256773931058e-05,
"loss": 2.3,
"step": 1020
},
{
"epoch": 0.46647230320699706,
"grad_norm": 0.20532362163066864,
"learning_rate": 1.156290942832536e-05,
"loss": 2.3845,
"step": 1040
},
{
"epoch": 0.47544292442251623,
"grad_norm": 0.2221594899892807,
"learning_rate": 1.1275242150587254e-05,
"loss": 2.4282,
"step": 1060
},
{
"epoch": 0.48441354563803546,
"grad_norm": 0.23072290420532227,
"learning_rate": 1.0986497885878145e-05,
"loss": 2.3869,
"step": 1080
},
{
"epoch": 0.49338416685355463,
"grad_norm": 0.20840080082416534,
"learning_rate": 1.0696920488912923e-05,
"loss": 2.4322,
"step": 1100
},
{
"epoch": 0.5023547880690737,
"grad_norm": 0.23384802043437958,
"learning_rate": 1.0406754518016047e-05,
"loss": 2.506,
"step": 1120
},
{
"epoch": 0.5113254092845929,
"grad_norm": 0.22102180123329163,
"learning_rate": 1.0116245028583418e-05,
"loss": 2.4869,
"step": 1140
},
{
"epoch": 0.5202960305001121,
"grad_norm": 0.18945130705833435,
"learning_rate": 9.825637366124458e-06,
"loss": 2.3671,
"step": 1160
},
{
"epoch": 0.5292666517156313,
"grad_norm": 0.22617211937904358,
"learning_rate": 9.535176959059171e-06,
"loss": 2.423,
"step": 1180
},
{
"epoch": 0.5382372729311505,
"grad_norm": 0.2126481682062149,
"learning_rate": 9.245109111445189e-06,
"loss": 2.3887,
"step": 1200
},
{
"epoch": 0.5472078941466697,
"grad_norm": 0.22861531376838684,
"learning_rate": 8.95567879580984e-06,
"loss": 2.36,
"step": 1220
},
{
"epoch": 0.5561785153621889,
"grad_norm": 0.2701588273048401,
"learning_rate": 8.667130446262214e-06,
"loss": 2.401,
"step": 1240
},
{
"epoch": 0.565149136577708,
"grad_norm": 0.2689853608608246,
"learning_rate": 8.379707752059932e-06,
"loss": 2.3753,
"step": 1260
},
{
"epoch": 0.5741197577932272,
"grad_norm": 0.22886186838150024,
"learning_rate": 8.093653451804987e-06,
"loss": 2.4807,
"step": 1280
},
{
"epoch": 0.5830903790087464,
"grad_norm": 0.24667732417583466,
"learning_rate": 7.809209128442408e-06,
"loss": 2.4269,
"step": 1300
},
{
"epoch": 0.5920610002242656,
"grad_norm": 0.22338470816612244,
"learning_rate": 7.52661500523497e-06,
"loss": 2.4133,
"step": 1320
},
{
"epoch": 0.6010316214397847,
"grad_norm": 0.21941307187080383,
"learning_rate": 7.246109742886156e-06,
"loss": 2.4606,
"step": 1340
},
{
"epoch": 0.6100022426553039,
"grad_norm": 0.22378675639629364,
"learning_rate": 6.967930237982793e-06,
"loss": 2.3498,
"step": 1360
},
{
"epoch": 0.6189728638708231,
"grad_norm": 0.23019564151763916,
"learning_rate": 6.692311422927515e-06,
"loss": 2.3927,
"step": 1380
},
{
"epoch": 0.6279434850863422,
"grad_norm": 0.22530066967010498,
"learning_rate": 6.4194860675300695e-06,
"loss": 2.4463,
"step": 1400
},
{
"epoch": 0.6369141063018614,
"grad_norm": 0.2268647849559784,
"learning_rate": 6.149684582425013e-06,
"loss": 2.4025,
"step": 1420
},
{
"epoch": 0.6458847275173806,
"grad_norm": 0.2626585066318512,
"learning_rate": 5.883134824481786e-06,
"loss": 2.3956,
"step": 1440
},
{
"epoch": 0.6548553487328997,
"grad_norm": 0.24002288281917572,
"learning_rate": 5.620061904371565e-06,
"loss": 2.3784,
"step": 1460
},
{
"epoch": 0.6638259699484189,
"grad_norm": 0.2304755002260208,
"learning_rate": 5.360687996453348e-06,
"loss": 2.4067,
"step": 1480
},
{
"epoch": 0.6727965911639381,
"grad_norm": 0.22266767919063568,
"learning_rate": 5.105232151139895e-06,
"loss": 2.4311,
"step": 1500
},
{
"epoch": 0.6817672123794573,
"grad_norm": 0.25173887610435486,
"learning_rate": 4.853910109901901e-06,
"loss": 2.3631,
"step": 1520
},
{
"epoch": 0.6907378335949764,
"grad_norm": 0.2748667299747467,
"learning_rate": 4.606934123066739e-06,
"loss": 2.3062,
"step": 1540
},
{
"epoch": 0.6997084548104956,
"grad_norm": 0.3214375972747803,
"learning_rate": 4.3645127705655654e-06,
"loss": 2.4436,
"step": 1560
},
{
"epoch": 0.7086790760260148,
"grad_norm": 0.25126707553863525,
"learning_rate": 4.126850785780199e-06,
"loss": 2.5224,
"step": 1580
},
{
"epoch": 0.7176496972415339,
"grad_norm": 0.24009720981121063,
"learning_rate": 3.8941488826385855e-06,
"loss": 2.3984,
"step": 1600
},
{
"epoch": 0.7266203184570531,
"grad_norm": 0.269382506608963,
"learning_rate": 3.6666035861047744e-06,
"loss": 2.4344,
"step": 1620
},
{
"epoch": 0.7355909396725723,
"grad_norm": 0.2938186526298523,
"learning_rate": 3.444407066206692e-06,
"loss": 2.3371,
"step": 1640
},
{
"epoch": 0.7445615608880914,
"grad_norm": 0.23332324624061584,
"learning_rate": 3.2277469757417403e-06,
"loss": 2.3741,
"step": 1660
},
{
"epoch": 0.7535321821036107,
"grad_norm": 0.24623927474021912,
"learning_rate": 3.0168062917974173e-06,
"loss": 2.3467,
"step": 1680
},
{
"epoch": 0.7625028033191299,
"grad_norm": 0.2714971601963043,
"learning_rate": 2.8117631612207084e-06,
"loss": 2.3712,
"step": 1700
},
{
"epoch": 0.7714734245346491,
"grad_norm": 0.24978309869766235,
"learning_rate": 2.6127907501667726e-06,
"loss": 2.4389,
"step": 1720
},
{
"epoch": 0.7804440457501682,
"grad_norm": 0.25042879581451416,
"learning_rate": 2.420057097854046e-06,
"loss": 2.3793,
"step": 1740
},
{
"epoch": 0.7894146669656874,
"grad_norm": 0.2673550248146057,
"learning_rate": 2.2337249746491695e-06,
"loss": 2.3452,
"step": 1760
},
{
"epoch": 0.7983852881812066,
"grad_norm": 0.2639400362968445,
"learning_rate": 2.0539517446016975e-06,
"loss": 2.3364,
"step": 1780
},
{
"epoch": 0.8073559093967257,
"grad_norm": 0.2685152590274811,
"learning_rate": 1.880889232544585e-06,
"loss": 2.3915,
"step": 1800
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.2522066533565521,
"learning_rate": 1.714683595872777e-06,
"loss": 2.3616,
"step": 1820
},
{
"epoch": 0.8252971518277641,
"grad_norm": 0.25205305218696594,
"learning_rate": 1.5554752011081332e-06,
"loss": 2.3692,
"step": 1840
},
{
"epoch": 0.8342677730432833,
"grad_norm": 0.30145809054374695,
"learning_rate": 1.4033985053549425e-06,
"loss": 2.3174,
"step": 1860
},
{
"epoch": 0.8432383942588024,
"grad_norm": 0.2629565894603729,
"learning_rate": 1.2585819427461564e-06,
"loss": 2.3526,
"step": 1880
},
{
"epoch": 0.8522090154743216,
"grad_norm": 0.2707832455635071,
"learning_rate": 1.121147815976248e-06,
"loss": 2.3042,
"step": 1900
},
{
"epoch": 0.8611796366898408,
"grad_norm": 0.23665176331996918,
"learning_rate": 9.912121930122542e-07,
"loss": 2.3199,
"step": 1920
},
{
"epoch": 0.8701502579053599,
"grad_norm": 0.2816649377346039,
"learning_rate": 8.688848090702928e-07,
"loss": 2.331,
"step": 1940
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.27171188592910767,
"learning_rate": 7.542689739403097e-07,
"loss": 2.3775,
"step": 1960
},
{
"epoch": 0.8880915003363983,
"grad_norm": 0.250848650932312,
"learning_rate": 6.474614847373051e-07,
"loss": 2.3671,
"step": 1980
},
{
"epoch": 0.8970621215519174,
"grad_norm": 0.2723957598209381,
"learning_rate": 5.485525441527651e-07,
"loss": 2.2999,
"step": 2000
},
{
"epoch": 0.9060327427674366,
"grad_norm": 0.2722890079021454,
"learning_rate": 4.5762568427529795e-07,
"loss": 2.3807,
"step": 2020
},
{
"epoch": 0.9150033639829558,
"grad_norm": 0.2557665705680847,
"learning_rate": 3.747576960448551e-07,
"loss": 2.3589,
"step": 2040
},
{
"epoch": 0.923973985198475,
"grad_norm": 0.3588675856590271,
"learning_rate": 3.0001856440005307e-07,
"loss": 2.3025,
"step": 2060
},
{
"epoch": 0.9329446064139941,
"grad_norm": 0.2574796676635742,
"learning_rate": 2.3347140917344579e-07,
"loss": 2.4445,
"step": 2080
},
{
"epoch": 0.9419152276295133,
"grad_norm": 0.2603015899658203,
"learning_rate": 1.7517243178458486e-07,
"loss": 2.3884,
"step": 2100
},
{
"epoch": 0.9508858488450325,
"grad_norm": 0.2759827673435211,
"learning_rate": 1.2517086777594112e-07,
"loss": 2.3706,
"step": 2120
},
{
"epoch": 0.9598564700605517,
"grad_norm": 0.2819114327430725,
"learning_rate": 8.35089452317639e-08,
"loss": 2.2965,
"step": 2140
},
{
"epoch": 0.9688270912760709,
"grad_norm": 0.2688797414302826,
"learning_rate": 5.022184911495864e-08,
"loss": 2.3142,
"step": 2160
},
{
"epoch": 0.9777977124915901,
"grad_norm": 0.2871028780937195,
"learning_rate": 2.5337691552156372e-08,
"loss": 2.3665,
"step": 2180
},
{
"epoch": 0.9867683337071093,
"grad_norm": 0.25957873463630676,
"learning_rate": 8.877488092022823e-09,
"loss": 2.3602,
"step": 2200
},
{
"epoch": 0.9957389549226284,
"grad_norm": 0.2717770040035248,
"learning_rate": 8.551399568945684e-10,
"loss": 2.3519,
"step": 2220
},
{
"epoch": 0.999775734469612,
"step": 2229,
"total_flos": 3.1724140856564777e+18,
"train_loss": 2.4421788879230237,
"train_runtime": 8917.1095,
"train_samples_per_second": 15.998,
"train_steps_per_second": 0.25
}
],
"logging_steps": 20,
"max_steps": 2229,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.1724140856564777e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}