{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.688, "eval_steps": 500, "global_step": 1240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 16.156723022460938, "learning_rate": 8.064516129032258e-07, "loss": 1.7136, "step": 10 }, { "epoch": 0.32, "grad_norm": 8.073517799377441, "learning_rate": 2.4193548387096776e-06, "loss": 1.2809, "step": 20 }, { "epoch": 0.48, "grad_norm": 2.825227737426758, "learning_rate": 4.032258064516129e-06, "loss": 0.8516, "step": 30 }, { "epoch": 0.64, "grad_norm": 1.599808692932129, "learning_rate": 5.645161290322582e-06, "loss": 0.8144, "step": 40 }, { "epoch": 0.8, "grad_norm": 1.7444440126419067, "learning_rate": 7.258064516129033e-06, "loss": 0.6716, "step": 50 }, { "epoch": 0.96, "grad_norm": 1.365881323814392, "learning_rate": 8.870967741935484e-06, "loss": 0.6604, "step": 60 }, { "epoch": 1.112, "grad_norm": 1.2223279476165771, "learning_rate": 1.0483870967741936e-05, "loss": 0.5284, "step": 70 }, { "epoch": 1.272, "grad_norm": 1.4933074712753296, "learning_rate": 1.1935483870967743e-05, "loss": 0.5844, "step": 80 }, { "epoch": 1.432, "grad_norm": 1.5303218364715576, "learning_rate": 1.3548387096774194e-05, "loss": 0.604, "step": 90 }, { "epoch": 1.592, "grad_norm": 1.146931529045105, "learning_rate": 1.5161290322580646e-05, "loss": 0.6135, "step": 100 }, { "epoch": 1.752, "grad_norm": 1.2514358758926392, "learning_rate": 1.6774193548387098e-05, "loss": 0.5823, "step": 110 }, { "epoch": 1.912, "grad_norm": 1.1219581365585327, "learning_rate": 1.838709677419355e-05, "loss": 0.5869, "step": 120 }, { "epoch": 2.064, "grad_norm": 1.2538785934448242, "learning_rate": 2e-05, "loss": 0.529, "step": 130 }, { "epoch": 2.224, "grad_norm": 1.4412481784820557, "learning_rate": 1.9996038016334953e-05, "loss": 0.4977, "step": 140 }, { "epoch": 2.384, "grad_norm": 1.280132532119751, "learning_rate": 1.9984155204802715e-05, "loss": 0.5153, "step": 150 }, { "epoch": 2.544, "grad_norm": 1.388814091682434, "learning_rate": 1.996436098130433e-05, "loss": 0.5185, "step": 160 }, { "epoch": 2.7039999999999997, "grad_norm": 1.2465498447418213, "learning_rate": 1.9936671030717832e-05, "loss": 0.5093, "step": 170 }, { "epoch": 2.864, "grad_norm": 6.404733657836914, "learning_rate": 1.9901107294469595e-05, "loss": 0.5256, "step": 180 }, { "epoch": 3.016, "grad_norm": 2.126950263977051, "learning_rate": 1.985769795314804e-05, "loss": 0.4664, "step": 190 }, { "epoch": 3.176, "grad_norm": 1.7593132257461548, "learning_rate": 1.9806477404173414e-05, "loss": 0.4251, "step": 200 }, { "epoch": 3.336, "grad_norm": 1.0036180019378662, "learning_rate": 1.9747486234541382e-05, "loss": 0.4481, "step": 210 }, { "epoch": 3.496, "grad_norm": 1.5059189796447754, "learning_rate": 1.9680771188662044e-05, "loss": 0.4185, "step": 220 }, { "epoch": 3.656, "grad_norm": 2.5380663871765137, "learning_rate": 1.9606385131319795e-05, "loss": 0.4336, "step": 230 }, { "epoch": 3.816, "grad_norm": 1.0066611766815186, "learning_rate": 1.9524387005783453e-05, "loss": 0.4488, "step": 240 }, { "epoch": 3.976, "grad_norm": 2.824939489364624, "learning_rate": 1.9434841787099804e-05, "loss": 0.4068, "step": 250 }, { "epoch": 4.128, "grad_norm": 1.2720246315002441, "learning_rate": 1.9337820430607594e-05, "loss": 0.3575, "step": 260 }, { "epoch": 4.288, "grad_norm": 3.2268316745758057, "learning_rate": 1.9233399815712737e-05, "loss": 0.3638, "step": 270 }, { "epoch": 4.448, "grad_norm": 1.3898473978042603, "learning_rate": 1.9121662684969337e-05, "loss": 0.3891, "step": 280 }, { "epoch": 4.608, "grad_norm": 2.0828278064727783, "learning_rate": 1.9002697578514747e-05, "loss": 0.3652, "step": 290 }, { "epoch": 4.768, "grad_norm": 1.149625539779663, "learning_rate": 1.8876598763910666e-05, "loss": 0.4091, "step": 300 }, { "epoch": 4.928, "grad_norm": 1.3654099702835083, "learning_rate": 1.8743466161445823e-05, "loss": 0.3508, "step": 310 }, { "epoch": 5.08, "grad_norm": 1.3550388813018799, "learning_rate": 1.8603405264959467e-05, "loss": 0.3232, "step": 320 }, { "epoch": 5.24, "grad_norm": 1.4081593751907349, "learning_rate": 1.8456527058248398e-05, "loss": 0.2735, "step": 330 }, { "epoch": 5.4, "grad_norm": 2.1102137565612793, "learning_rate": 1.8302947927123767e-05, "loss": 0.3039, "step": 340 }, { "epoch": 5.5600000000000005, "grad_norm": 2.0371267795562744, "learning_rate": 1.8142789567187327e-05, "loss": 0.3298, "step": 350 }, { "epoch": 5.72, "grad_norm": 1.2168771028518677, "learning_rate": 1.7976178887400263e-05, "loss": 0.3153, "step": 360 }, { "epoch": 5.88, "grad_norm": 1.5159831047058105, "learning_rate": 1.780324790952092e-05, "loss": 0.3051, "step": 370 }, { "epoch": 6.032, "grad_norm": 1.4497977495193481, "learning_rate": 1.7624133663491204e-05, "loss": 0.2808, "step": 380 }, { "epoch": 6.192, "grad_norm": 1.5660399198532104, "learning_rate": 1.7438978078854512e-05, "loss": 0.2225, "step": 390 }, { "epoch": 6.352, "grad_norm": 2.0142195224761963, "learning_rate": 1.72479278722912e-05, "loss": 0.2526, "step": 400 }, { "epoch": 6.5120000000000005, "grad_norm": 1.3689491748809814, "learning_rate": 1.7051134431360795e-05, "loss": 0.2372, "step": 410 }, { "epoch": 6.672, "grad_norm": 2.383648633956909, "learning_rate": 1.6848753694542966e-05, "loss": 0.2611, "step": 420 }, { "epoch": 6.832, "grad_norm": 2.9119460582733154, "learning_rate": 1.6640946027672395e-05, "loss": 0.2292, "step": 430 }, { "epoch": 6.992, "grad_norm": 1.594862461090088, "learning_rate": 1.6427876096865394e-05, "loss": 0.2484, "step": 440 }, { "epoch": 7.144, "grad_norm": 2.313305616378784, "learning_rate": 1.620971273803905e-05, "loss": 0.1605, "step": 450 }, { "epoch": 7.304, "grad_norm": 1.8747406005859375, "learning_rate": 1.598662882312615e-05, "loss": 0.1881, "step": 460 }, { "epoch": 7.464, "grad_norm": 2.306143045425415, "learning_rate": 1.5758801123092066e-05, "loss": 0.1805, "step": 470 }, { "epoch": 7.624, "grad_norm": 2.0819358825683594, "learning_rate": 1.552641016786199e-05, "loss": 0.1365, "step": 480 }, { "epoch": 7.784, "grad_norm": 1.9174208641052246, "learning_rate": 1.5289640103269626e-05, "loss": 0.1521, "step": 490 }, { "epoch": 7.944, "grad_norm": 1.3912276029586792, "learning_rate": 1.5048678545140634e-05, "loss": 0.1691, "step": 500 }, { "epoch": 8.096, "grad_norm": 1.8673287630081177, "learning_rate": 1.4803716430626455e-05, "loss": 0.1194, "step": 510 }, { "epoch": 8.256, "grad_norm": 4.073314189910889, "learning_rate": 1.455494786690634e-05, "loss": 0.1136, "step": 520 }, { "epoch": 8.416, "grad_norm": 1.3980969190597534, "learning_rate": 1.4302569977377462e-05, "loss": 0.0987, "step": 530 }, { "epoch": 8.576, "grad_norm": 1.0765067338943481, "learning_rate": 1.404678274545496e-05, "loss": 0.125, "step": 540 }, { "epoch": 8.736, "grad_norm": 1.532691240310669, "learning_rate": 1.3787788856105762e-05, "loss": 0.0986, "step": 550 }, { "epoch": 8.896, "grad_norm": 1.394974708557129, "learning_rate": 1.3525793535241654e-05, "loss": 0.1131, "step": 560 }, { "epoch": 9.048, "grad_norm": 1.7207937240600586, "learning_rate": 1.3261004387098951e-05, "loss": 0.0854, "step": 570 }, { "epoch": 9.208, "grad_norm": 1.8617466688156128, "learning_rate": 1.2993631229733584e-05, "loss": 0.0808, "step": 580 }, { "epoch": 9.368, "grad_norm": 1.5372612476348877, "learning_rate": 1.2723885928761934e-05, "loss": 0.0694, "step": 590 }, { "epoch": 9.528, "grad_norm": 0.963295042514801, "learning_rate": 1.2451982229479245e-05, "loss": 0.0792, "step": 600 }, { "epoch": 9.688, "grad_norm": 1.102954387664795, "learning_rate": 1.2178135587488515e-05, "loss": 0.0704, "step": 610 }, { "epoch": 9.848, "grad_norm": 0.988452672958374, "learning_rate": 1.1902562997974211e-05, "loss": 0.0904, "step": 620 }, { "epoch": 10.0, "grad_norm": 0.3999573886394501, "learning_rate": 1.1625482823755965e-05, "loss": 0.0537, "step": 630 }, { "epoch": 10.16, "grad_norm": 1.1528918743133545, "learning_rate": 1.1347114622258613e-05, "loss": 0.0433, "step": 640 }, { "epoch": 10.32, "grad_norm": 0.7834718823432922, "learning_rate": 1.106767897153559e-05, "loss": 0.0415, "step": 650 }, { "epoch": 10.48, "grad_norm": 1.2862722873687744, "learning_rate": 1.078739729548362e-05, "loss": 0.0537, "step": 660 }, { "epoch": 10.64, "grad_norm": 1.1977423429489136, "learning_rate": 1.0506491688387128e-05, "loss": 0.0561, "step": 670 }, { "epoch": 10.8, "grad_norm": 2.4095022678375244, "learning_rate": 1.0225184738931461e-05, "loss": 0.0434, "step": 680 }, { "epoch": 10.96, "grad_norm": 0.9543105959892273, "learning_rate": 9.943699353824344e-06, "loss": 0.0554, "step": 690 }, { "epoch": 11.112, "grad_norm": 0.7812919616699219, "learning_rate": 9.66225858116532e-06, "loss": 0.0356, "step": 700 }, { "epoch": 11.272, "grad_norm": 0.16588281095027924, "learning_rate": 9.381085433703183e-06, "loss": 0.0304, "step": 710 }, { "epoch": 11.432, "grad_norm": 0.9616058468818665, "learning_rate": 9.10040271212139e-06, "loss": 0.0326, "step": 720 }, { "epoch": 11.592, "grad_norm": 0.7438204288482666, "learning_rate": 8.820432828491542e-06, "loss": 0.0387, "step": 730 }, { "epoch": 11.752, "grad_norm": 0.6004329919815063, "learning_rate": 8.541397630034757e-06, "loss": 0.0344, "step": 740 }, { "epoch": 11.912, "grad_norm": 2.2135913372039795, "learning_rate": 8.263518223330698e-06, "loss": 0.0192, "step": 750 }, { "epoch": 12.064, "grad_norm": 0.8558197021484375, "learning_rate": 7.987014799113398e-06, "loss": 0.0203, "step": 760 }, { "epoch": 12.224, "grad_norm": 2.0103511810302734, "learning_rate": 7.712106457792883e-06, "loss": 0.0152, "step": 770 }, { "epoch": 12.384, "grad_norm": 1.6075392961502075, "learning_rate": 7.439011035840684e-06, "loss": 0.0168, "step": 780 }, { "epoch": 12.544, "grad_norm": 2.277149200439453, "learning_rate": 7.16794493317696e-06, "loss": 0.0174, "step": 790 }, { "epoch": 12.704, "grad_norm": 0.9537448287010193, "learning_rate": 6.899122941695894e-06, "loss": 0.0146, "step": 800 }, { "epoch": 12.864, "grad_norm": 0.9526250958442688, "learning_rate": 6.632758075065288e-06, "loss": 0.0262, "step": 810 }, { "epoch": 13.016, "grad_norm": 0.5227965712547302, "learning_rate": 6.369061399935255e-06, "loss": 0.0205, "step": 820 }, { "epoch": 13.176, "grad_norm": 0.3879546821117401, "learning_rate": 6.108241868689675e-06, "loss": 0.0075, "step": 830 }, { "epoch": 13.336, "grad_norm": 0.11712963879108429, "learning_rate": 5.8505061538730105e-06, "loss": 0.0089, "step": 840 }, { "epoch": 13.496, "grad_norm": 0.2489776313304901, "learning_rate": 5.5960584844236565e-06, "loss": 0.0093, "step": 850 }, { "epoch": 13.656, "grad_norm": 0.24504923820495605, "learning_rate": 5.345100483843617e-06, "loss": 0.0086, "step": 860 }, { "epoch": 13.816, "grad_norm": 0.9373164176940918, "learning_rate": 5.097831010432666e-06, "loss": 0.0056, "step": 870 }, { "epoch": 13.975999999999999, "grad_norm": 0.16996854543685913, "learning_rate": 4.854445999713715e-06, "loss": 0.0044, "step": 880 }, { "epoch": 14.128, "grad_norm": 0.40567436814308167, "learning_rate": 4.615138309174112e-06, "loss": 0.003, "step": 890 }, { "epoch": 14.288, "grad_norm": 0.05296005308628082, "learning_rate": 4.38009756544603e-06, "loss": 0.002, "step": 900 }, { "epoch": 14.448, "grad_norm": 0.034155845642089844, "learning_rate": 4.149510014046922e-06, "loss": 0.0051, "step": 910 }, { "epoch": 14.608, "grad_norm": 0.5557677149772644, "learning_rate": 3.923558371799194e-06, "loss": 0.0035, "step": 920 }, { "epoch": 14.768, "grad_norm": 0.2064237892627716, "learning_rate": 3.7024216820459757e-06, "loss": 0.0016, "step": 930 }, { "epoch": 14.928, "grad_norm": 0.2826680541038513, "learning_rate": 3.48627517277778e-06, "loss": 0.0014, "step": 940 }, { "epoch": 15.08, "grad_norm": 0.06162280961871147, "learning_rate": 3.275290117782397e-06, "loss": 0.0017, "step": 950 }, { "epoch": 15.24, "grad_norm": 0.08669278025627136, "learning_rate": 3.0696337009281263e-06, "loss": 0.0009, "step": 960 }, { "epoch": 15.4, "grad_norm": 0.04744670167565346, "learning_rate": 2.869468883687798e-06, "loss": 0.0017, "step": 970 }, { "epoch": 15.56, "grad_norm": 0.04455321654677391, "learning_rate": 2.6749542760086613e-06, "loss": 0.0012, "step": 980 }, { "epoch": 15.72, "grad_norm": 0.03492380306124687, "learning_rate": 2.4862440106303664e-06, "loss": 0.0009, "step": 990 }, { "epoch": 15.88, "grad_norm": 0.04152894765138626, "learning_rate": 2.303487620950677e-06, "loss": 0.0015, "step": 1000 }, { "epoch": 16.032, "grad_norm": 0.058656930923461914, "learning_rate": 2.126829922535718e-06, "loss": 0.0011, "step": 1010 }, { "epoch": 16.192, "grad_norm": 0.023427117615938187, "learning_rate": 1.9564108983685758e-06, "loss": 0.0008, "step": 1020 }, { "epoch": 16.352, "grad_norm": 0.02621988020837307, "learning_rate": 1.7923655879272395e-06, "loss": 0.0011, "step": 1030 }, { "epoch": 16.512, "grad_norm": 0.02891433797776699, "learning_rate": 1.634823980179766e-06, "loss": 0.001, "step": 1040 }, { "epoch": 16.672, "grad_norm": 0.044328656047582626, "learning_rate": 1.483910910581452e-06, "loss": 0.0012, "step": 1050 }, { "epoch": 16.832, "grad_norm": 0.05566035583615303, "learning_rate": 1.339745962155613e-06, "loss": 0.0008, "step": 1060 }, { "epoch": 16.992, "grad_norm": 0.030019577592611313, "learning_rate": 1.2024433707364002e-06, "loss": 0.0013, "step": 1070 }, { "epoch": 17.144, "grad_norm": 0.07299891859292984, "learning_rate": 1.0721119344486841e-06, "loss": 0.0013, "step": 1080 }, { "epoch": 17.304, "grad_norm": 0.0282142274081707, "learning_rate": 9.488549274967873e-07, "loss": 0.0008, "step": 1090 }, { "epoch": 17.464, "grad_norm": 0.028321148827672005, "learning_rate": 8.327700183303433e-07, "loss": 0.0009, "step": 1100 }, { "epoch": 17.624, "grad_norm": 0.03647984564304352, "learning_rate": 7.239491922521247e-07, "loss": 0.0011, "step": 1110 }, { "epoch": 17.784, "grad_norm": 0.029092855751514435, "learning_rate": 6.22478678529197e-07, "loss": 0.0008, "step": 1120 }, { "epoch": 17.944, "grad_norm": 0.028766291216015816, "learning_rate": 5.284388820651331e-07, "loss": 0.001, "step": 1130 }, { "epoch": 18.096, "grad_norm": 0.03702507168054581, "learning_rate": 4.41904319687424e-07, "loss": 0.0011, "step": 1140 }, { "epoch": 18.256, "grad_norm": 0.025985565036535263, "learning_rate": 3.629435611005916e-07, "loss": 0.0008, "step": 1150 }, { "epoch": 18.416, "grad_norm": 0.0245125163346529, "learning_rate": 2.916191745517749e-07, "loss": 0.001, "step": 1160 }, { "epoch": 18.576, "grad_norm": 0.01987411454319954, "learning_rate": 2.2798767725185856e-07, "loss": 0.0007, "step": 1170 }, { "epoch": 18.736, "grad_norm": 0.06840561330318451, "learning_rate": 1.7209949059142084e-07, "loss": 0.001, "step": 1180 }, { "epoch": 18.896, "grad_norm": 0.018267642706632614, "learning_rate": 1.2399890018698345e-07, "loss": 0.001, "step": 1190 }, { "epoch": 19.048, "grad_norm": 0.02801842987537384, "learning_rate": 8.372402078924092e-08, "loss": 0.0008, "step": 1200 }, { "epoch": 19.208, "grad_norm": 0.030942991375923157, "learning_rate": 5.1306766081048456e-08, "loss": 0.0008, "step": 1210 }, { "epoch": 19.368, "grad_norm": 0.030538473278284073, "learning_rate": 2.6772823389131787e-08, "loss": 0.0009, "step": 1220 }, { "epoch": 19.528, "grad_norm": 0.0364975668489933, "learning_rate": 1.0141633329525669e-08, "loss": 0.0012, "step": 1230 }, { "epoch": 19.688, "grad_norm": 0.029744159430265427, "learning_rate": 1.4263744029019422e-09, "loss": 0.0008, "step": 1240 } ], "logging_steps": 10, "max_steps": 1240, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.817147240873984e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }