{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9944232462577047, "eval_steps": 500, "global_step": 1275, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023481068388611682, "grad_norm": 17.85382376485096, "learning_rate": 5e-06, "loss": 0.9488, "step": 10 }, { "epoch": 0.046962136777223364, "grad_norm": 3.177756281658111, "learning_rate": 5e-06, "loss": 0.8343, "step": 20 }, { "epoch": 0.07044320516583505, "grad_norm": 2.950019754054781, "learning_rate": 5e-06, "loss": 0.7651, "step": 30 }, { "epoch": 0.09392427355444673, "grad_norm": 1.450856517854765, "learning_rate": 5e-06, "loss": 0.7374, "step": 40 }, { "epoch": 0.1174053419430584, "grad_norm": 0.8091667434976095, "learning_rate": 5e-06, "loss": 0.7161, "step": 50 }, { "epoch": 0.1408864103316701, "grad_norm": 0.8253122020905036, "learning_rate": 5e-06, "loss": 0.6935, "step": 60 }, { "epoch": 0.16436747872028176, "grad_norm": 0.6338190868455624, "learning_rate": 5e-06, "loss": 0.6718, "step": 70 }, { "epoch": 0.18784854710889345, "grad_norm": 0.8143184435680945, "learning_rate": 5e-06, "loss": 0.6635, "step": 80 }, { "epoch": 0.21132961549750515, "grad_norm": 0.7202907234528798, "learning_rate": 5e-06, "loss": 0.6581, "step": 90 }, { "epoch": 0.2348106838861168, "grad_norm": 0.9812703695297975, "learning_rate": 5e-06, "loss": 0.6595, "step": 100 }, { "epoch": 0.2582917522747285, "grad_norm": 0.6403918144312425, "learning_rate": 5e-06, "loss": 0.6511, "step": 110 }, { "epoch": 0.2817728206633402, "grad_norm": 0.6355110618738598, "learning_rate": 5e-06, "loss": 0.6443, "step": 120 }, { "epoch": 0.3052538890519519, "grad_norm": 0.8767568074879388, "learning_rate": 5e-06, "loss": 0.6457, "step": 130 }, { "epoch": 0.3287349574405635, "grad_norm": 0.7912634151639979, "learning_rate": 5e-06, "loss": 0.6352, "step": 140 }, { "epoch": 0.3522160258291752, "grad_norm": 0.6346769190072642, "learning_rate": 5e-06, "loss": 0.642, "step": 150 }, { "epoch": 0.3756970942177869, "grad_norm": 0.6538804073841475, "learning_rate": 5e-06, "loss": 0.6453, "step": 160 }, { "epoch": 0.3991781626063986, "grad_norm": 0.524677402567732, "learning_rate": 5e-06, "loss": 0.6375, "step": 170 }, { "epoch": 0.4226592309950103, "grad_norm": 0.6983299832658841, "learning_rate": 5e-06, "loss": 0.6404, "step": 180 }, { "epoch": 0.44614029938362193, "grad_norm": 0.5730458718423643, "learning_rate": 5e-06, "loss": 0.6282, "step": 190 }, { "epoch": 0.4696213677722336, "grad_norm": 0.5795935762469521, "learning_rate": 5e-06, "loss": 0.6258, "step": 200 }, { "epoch": 0.4931024361608453, "grad_norm": 0.6568013132353124, "learning_rate": 5e-06, "loss": 0.633, "step": 210 }, { "epoch": 0.516583504549457, "grad_norm": 0.625002257246399, "learning_rate": 5e-06, "loss": 0.6215, "step": 220 }, { "epoch": 0.5400645729380686, "grad_norm": 0.5866557972006909, "learning_rate": 5e-06, "loss": 0.6192, "step": 230 }, { "epoch": 0.5635456413266804, "grad_norm": 0.5523814224922182, "learning_rate": 5e-06, "loss": 0.6225, "step": 240 }, { "epoch": 0.587026709715292, "grad_norm": 0.621565965520602, "learning_rate": 5e-06, "loss": 0.6176, "step": 250 }, { "epoch": 0.6105077781039038, "grad_norm": 0.5699863716830831, "learning_rate": 5e-06, "loss": 0.6184, "step": 260 }, { "epoch": 0.6339888464925154, "grad_norm": 0.5867908049315175, "learning_rate": 5e-06, "loss": 0.6214, "step": 270 }, { "epoch": 0.657469914881127, "grad_norm": 1.0031834057307554, "learning_rate": 5e-06, "loss": 0.6204, "step": 280 }, 
{ "epoch": 0.6809509832697388, "grad_norm": 0.5739527115683025, "learning_rate": 5e-06, "loss": 0.6166, "step": 290 }, { "epoch": 0.7044320516583504, "grad_norm": 0.6047949935557967, "learning_rate": 5e-06, "loss": 0.6096, "step": 300 }, { "epoch": 0.7279131200469622, "grad_norm": 0.5141588768585372, "learning_rate": 5e-06, "loss": 0.6076, "step": 310 }, { "epoch": 0.7513941884355738, "grad_norm": 0.689840031854037, "learning_rate": 5e-06, "loss": 0.6115, "step": 320 }, { "epoch": 0.7748752568241855, "grad_norm": 0.7551629396423358, "learning_rate": 5e-06, "loss": 0.6138, "step": 330 }, { "epoch": 0.7983563252127972, "grad_norm": 0.5356794354442618, "learning_rate": 5e-06, "loss": 0.611, "step": 340 }, { "epoch": 0.8218373936014088, "grad_norm": 0.6083117103787592, "learning_rate": 5e-06, "loss": 0.611, "step": 350 }, { "epoch": 0.8453184619900206, "grad_norm": 0.7034174178117553, "learning_rate": 5e-06, "loss": 0.6104, "step": 360 }, { "epoch": 0.8687995303786322, "grad_norm": 0.5650614816783488, "learning_rate": 5e-06, "loss": 0.6047, "step": 370 }, { "epoch": 0.8922805987672439, "grad_norm": 0.6449034585856688, "learning_rate": 5e-06, "loss": 0.613, "step": 380 }, { "epoch": 0.9157616671558556, "grad_norm": 0.6460327839123797, "learning_rate": 5e-06, "loss": 0.6104, "step": 390 }, { "epoch": 0.9392427355444672, "grad_norm": 0.6977465772968938, "learning_rate": 5e-06, "loss": 0.6124, "step": 400 }, { "epoch": 0.962723803933079, "grad_norm": 0.6646492018550946, "learning_rate": 5e-06, "loss": 0.5957, "step": 410 }, { "epoch": 0.9862048723216906, "grad_norm": 0.48972085408134464, "learning_rate": 5e-06, "loss": 0.5979, "step": 420 }, { "epoch": 0.9979454065159965, "eval_loss": 0.6003177165985107, "eval_runtime": 300.5123, "eval_samples_per_second": 38.181, "eval_steps_per_second": 0.599, "step": 425 }, { "epoch": 1.0099794540651599, "grad_norm": 0.8929971915551214, "learning_rate": 5e-06, "loss": 0.6046, "step": 430 }, { "epoch": 1.0334605224537716, "grad_norm": 0.6204797689157823, "learning_rate": 5e-06, "loss": 0.5525, "step": 440 }, { "epoch": 1.0569415908423834, "grad_norm": 0.629777186945133, "learning_rate": 5e-06, "loss": 0.5507, "step": 450 }, { "epoch": 1.0804226592309951, "grad_norm": 0.6177917289848706, "learning_rate": 5e-06, "loss": 0.5488, "step": 460 }, { "epoch": 1.1039037276196066, "grad_norm": 0.616801854556511, "learning_rate": 5e-06, "loss": 0.5553, "step": 470 }, { "epoch": 1.1273847960082184, "grad_norm": 0.6202115496799256, "learning_rate": 5e-06, "loss": 0.5517, "step": 480 }, { "epoch": 1.1508658643968301, "grad_norm": 0.6869810211842367, "learning_rate": 5e-06, "loss": 0.5513, "step": 490 }, { "epoch": 1.1743469327854417, "grad_norm": 0.6134525997635181, "learning_rate": 5e-06, "loss": 0.5565, "step": 500 }, { "epoch": 1.1978280011740534, "grad_norm": 0.6533976679541116, "learning_rate": 5e-06, "loss": 0.5508, "step": 510 }, { "epoch": 1.2213090695626652, "grad_norm": 0.7384172080803894, "learning_rate": 5e-06, "loss": 0.5513, "step": 520 }, { "epoch": 1.2447901379512767, "grad_norm": 0.6446266722666449, "learning_rate": 5e-06, "loss": 0.5538, "step": 530 }, { "epoch": 1.2682712063398884, "grad_norm": 0.6920463607543151, "learning_rate": 5e-06, "loss": 0.5442, "step": 540 }, { "epoch": 1.2917522747285002, "grad_norm": 0.7090350057008448, "learning_rate": 5e-06, "loss": 0.5566, "step": 550 }, { "epoch": 1.3152333431171117, "grad_norm": 0.7226606579176621, "learning_rate": 5e-06, "loss": 0.5494, "step": 560 }, { "epoch": 1.3387144115057235, "grad_norm": 
0.5484819340890661, "learning_rate": 5e-06, "loss": 0.5471, "step": 570 }, { "epoch": 1.3621954798943352, "grad_norm": 0.6982064642691793, "learning_rate": 5e-06, "loss": 0.552, "step": 580 }, { "epoch": 1.385676548282947, "grad_norm": 0.6147097813447182, "learning_rate": 5e-06, "loss": 0.5505, "step": 590 }, { "epoch": 1.4091576166715585, "grad_norm": 0.6598577111107449, "learning_rate": 5e-06, "loss": 0.5498, "step": 600 }, { "epoch": 1.4326386850601702, "grad_norm": 0.5603317744558156, "learning_rate": 5e-06, "loss": 0.5562, "step": 610 }, { "epoch": 1.456119753448782, "grad_norm": 0.6373670868271739, "learning_rate": 5e-06, "loss": 0.5555, "step": 620 }, { "epoch": 1.4796008218373937, "grad_norm": 0.5932324800613761, "learning_rate": 5e-06, "loss": 0.544, "step": 630 }, { "epoch": 1.5030818902260052, "grad_norm": 0.5227721794518003, "learning_rate": 5e-06, "loss": 0.5451, "step": 640 }, { "epoch": 1.526562958614617, "grad_norm": 0.6093734754379218, "learning_rate": 5e-06, "loss": 0.5587, "step": 650 }, { "epoch": 1.5500440270032287, "grad_norm": 0.6287567431322201, "learning_rate": 5e-06, "loss": 0.5518, "step": 660 }, { "epoch": 1.5735250953918403, "grad_norm": 0.6541153306269628, "learning_rate": 5e-06, "loss": 0.5526, "step": 670 }, { "epoch": 1.597006163780452, "grad_norm": 0.6150418746443832, "learning_rate": 5e-06, "loss": 0.5478, "step": 680 }, { "epoch": 1.6204872321690638, "grad_norm": 0.5475396144251581, "learning_rate": 5e-06, "loss": 0.5481, "step": 690 }, { "epoch": 1.6439683005576753, "grad_norm": 0.6160254516028266, "learning_rate": 5e-06, "loss": 0.5515, "step": 700 }, { "epoch": 1.667449368946287, "grad_norm": 0.843962487015559, "learning_rate": 5e-06, "loss": 0.5505, "step": 710 }, { "epoch": 1.6909304373348988, "grad_norm": 0.5347960520186497, "learning_rate": 5e-06, "loss": 0.5496, "step": 720 }, { "epoch": 1.7144115057235103, "grad_norm": 0.7323294593212619, "learning_rate": 5e-06, "loss": 0.5484, "step": 730 }, { "epoch": 1.737892574112122, "grad_norm": 0.5476336095439653, "learning_rate": 5e-06, "loss": 0.5483, "step": 740 }, { "epoch": 1.7613736425007338, "grad_norm": 0.6011568410465314, "learning_rate": 5e-06, "loss": 0.5531, "step": 750 }, { "epoch": 1.7848547108893453, "grad_norm": 0.6797212856687878, "learning_rate": 5e-06, "loss": 0.5487, "step": 760 }, { "epoch": 1.8083357792779573, "grad_norm": 0.5819194256052239, "learning_rate": 5e-06, "loss": 0.5485, "step": 770 }, { "epoch": 1.8318168476665688, "grad_norm": 0.5292485271089171, "learning_rate": 5e-06, "loss": 0.5518, "step": 780 }, { "epoch": 1.8552979160551804, "grad_norm": 0.5799939542823142, "learning_rate": 5e-06, "loss": 0.5467, "step": 790 }, { "epoch": 1.8787789844437923, "grad_norm": 0.5570487573376319, "learning_rate": 5e-06, "loss": 0.5468, "step": 800 }, { "epoch": 1.9022600528324038, "grad_norm": 0.6174505205210491, "learning_rate": 5e-06, "loss": 0.5488, "step": 810 }, { "epoch": 1.9257411212210156, "grad_norm": 0.5693760728585585, "learning_rate": 5e-06, "loss": 0.5458, "step": 820 }, { "epoch": 1.9492221896096273, "grad_norm": 0.5189329244004007, "learning_rate": 5e-06, "loss": 0.5447, "step": 830 }, { "epoch": 1.9727032579982389, "grad_norm": 0.6238194647054416, "learning_rate": 5e-06, "loss": 0.5398, "step": 840 }, { "epoch": 1.9961843263868506, "grad_norm": 0.5651058414473042, "learning_rate": 5e-06, "loss": 0.547, "step": 850 }, { "epoch": 1.9985324332257117, "eval_loss": 0.5846751928329468, "eval_runtime": 295.576, "eval_samples_per_second": 38.819, "eval_steps_per_second": 0.609, 
"step": 851 }, { "epoch": 2.0199589081303198, "grad_norm": 0.6946392563773179, "learning_rate": 5e-06, "loss": 0.523, "step": 860 }, { "epoch": 2.0434399765189317, "grad_norm": 0.8362364731068118, "learning_rate": 5e-06, "loss": 0.4924, "step": 870 }, { "epoch": 2.0669210449075432, "grad_norm": 0.7772390407263537, "learning_rate": 5e-06, "loss": 0.4902, "step": 880 }, { "epoch": 2.0904021132961548, "grad_norm": 0.5219899057565048, "learning_rate": 5e-06, "loss": 0.4955, "step": 890 }, { "epoch": 2.1138831816847667, "grad_norm": 0.6541702370902746, "learning_rate": 5e-06, "loss": 0.4969, "step": 900 }, { "epoch": 2.1373642500733783, "grad_norm": 0.5786919167953266, "learning_rate": 5e-06, "loss": 0.4865, "step": 910 }, { "epoch": 2.1608453184619902, "grad_norm": 0.6257190526717714, "learning_rate": 5e-06, "loss": 0.4966, "step": 920 }, { "epoch": 2.1843263868506018, "grad_norm": 0.6070724867911851, "learning_rate": 5e-06, "loss": 0.497, "step": 930 }, { "epoch": 2.2078074552392133, "grad_norm": 0.7626194518667021, "learning_rate": 5e-06, "loss": 0.498, "step": 940 }, { "epoch": 2.2312885236278253, "grad_norm": 0.66208678281747, "learning_rate": 5e-06, "loss": 0.4978, "step": 950 }, { "epoch": 2.254769592016437, "grad_norm": 0.597091889958803, "learning_rate": 5e-06, "loss": 0.4954, "step": 960 }, { "epoch": 2.2782506604050483, "grad_norm": 0.648113572619813, "learning_rate": 5e-06, "loss": 0.5024, "step": 970 }, { "epoch": 2.3017317287936603, "grad_norm": 0.5800444078314209, "learning_rate": 5e-06, "loss": 0.4982, "step": 980 }, { "epoch": 2.325212797182272, "grad_norm": 0.5914333960587344, "learning_rate": 5e-06, "loss": 0.4994, "step": 990 }, { "epoch": 2.3486938655708833, "grad_norm": 0.7924399848654731, "learning_rate": 5e-06, "loss": 0.5002, "step": 1000 }, { "epoch": 2.3721749339594953, "grad_norm": 0.624856284950558, "learning_rate": 5e-06, "loss": 0.4994, "step": 1010 }, { "epoch": 2.395656002348107, "grad_norm": 0.6655834278053183, "learning_rate": 5e-06, "loss": 0.4906, "step": 1020 }, { "epoch": 2.4191370707367184, "grad_norm": 0.5518645145037842, "learning_rate": 5e-06, "loss": 0.4975, "step": 1030 }, { "epoch": 2.4426181391253303, "grad_norm": 0.5785599308848968, "learning_rate": 5e-06, "loss": 0.4976, "step": 1040 }, { "epoch": 2.466099207513942, "grad_norm": 0.5834274439157257, "learning_rate": 5e-06, "loss": 0.5052, "step": 1050 }, { "epoch": 2.4895802759025534, "grad_norm": 0.777173362933167, "learning_rate": 5e-06, "loss": 0.4989, "step": 1060 }, { "epoch": 2.5130613442911653, "grad_norm": 0.929125527693667, "learning_rate": 5e-06, "loss": 0.5059, "step": 1070 }, { "epoch": 2.536542412679777, "grad_norm": 0.6950708275354937, "learning_rate": 5e-06, "loss": 0.4946, "step": 1080 }, { "epoch": 2.560023481068389, "grad_norm": 0.5542934300294031, "learning_rate": 5e-06, "loss": 0.5086, "step": 1090 }, { "epoch": 2.5835045494570004, "grad_norm": 0.7961833486998505, "learning_rate": 5e-06, "loss": 0.498, "step": 1100 }, { "epoch": 2.606985617845612, "grad_norm": 0.6237039054057065, "learning_rate": 5e-06, "loss": 0.4998, "step": 1110 }, { "epoch": 2.6304666862342234, "grad_norm": 0.5833433757118394, "learning_rate": 5e-06, "loss": 0.4944, "step": 1120 }, { "epoch": 2.6539477546228354, "grad_norm": 0.5868493372745182, "learning_rate": 5e-06, "loss": 0.5028, "step": 1130 }, { "epoch": 2.677428823011447, "grad_norm": 0.6399452879323836, "learning_rate": 5e-06, "loss": 0.4982, "step": 1140 }, { "epoch": 2.700909891400059, "grad_norm": 0.6458178037380802, "learning_rate": 5e-06, 
"loss": 0.5009, "step": 1150 }, { "epoch": 2.7243909597886704, "grad_norm": 0.5439040913583559, "learning_rate": 5e-06, "loss": 0.5012, "step": 1160 }, { "epoch": 2.747872028177282, "grad_norm": 0.735794966069211, "learning_rate": 5e-06, "loss": 0.5032, "step": 1170 }, { "epoch": 2.771353096565894, "grad_norm": 0.601851566145536, "learning_rate": 5e-06, "loss": 0.5033, "step": 1180 }, { "epoch": 2.7948341649545054, "grad_norm": 0.5929336883777777, "learning_rate": 5e-06, "loss": 0.5037, "step": 1190 }, { "epoch": 2.818315233343117, "grad_norm": 0.5778177618943612, "learning_rate": 5e-06, "loss": 0.508, "step": 1200 }, { "epoch": 2.841796301731729, "grad_norm": 0.5990753577226628, "learning_rate": 5e-06, "loss": 0.4957, "step": 1210 }, { "epoch": 2.8652773701203404, "grad_norm": 0.7825195485986434, "learning_rate": 5e-06, "loss": 0.499, "step": 1220 }, { "epoch": 2.888758438508952, "grad_norm": 0.5698945641530911, "learning_rate": 5e-06, "loss": 0.4982, "step": 1230 }, { "epoch": 2.912239506897564, "grad_norm": 0.5941997062897303, "learning_rate": 5e-06, "loss": 0.5006, "step": 1240 }, { "epoch": 2.9357205752861755, "grad_norm": 0.5494008699329855, "learning_rate": 5e-06, "loss": 0.494, "step": 1250 }, { "epoch": 2.9592016436747874, "grad_norm": 0.6295305801304111, "learning_rate": 5e-06, "loss": 0.5001, "step": 1260 }, { "epoch": 2.982682712063399, "grad_norm": 0.6627594327431557, "learning_rate": 5e-06, "loss": 0.5003, "step": 1270 }, { "epoch": 2.9944232462577047, "eval_loss": 0.5840933322906494, "eval_runtime": 292.1509, "eval_samples_per_second": 39.274, "eval_steps_per_second": 0.616, "step": 1275 }, { "epoch": 2.9944232462577047, "step": 1275, "total_flos": 2135463108280320.0, "train_loss": 0.5660238438026578, "train_runtime": 41769.8593, "train_samples_per_second": 15.658, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 1275, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2135463108280320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }