{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.902439024390244, "eval_steps": 500, "global_step": 1020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3902439024390244, "grad_norm": 4.256475448608398, "learning_rate": 9.80392156862745e-05, "loss": 1.8243, "step": 20 }, { "epoch": 0.7804878048780488, "grad_norm": 3.3582167625427246, "learning_rate": 0.000196078431372549, "loss": 1.358, "step": 40 }, { "epoch": 1.170731707317073, "grad_norm": 2.848921775817871, "learning_rate": 0.00029411764705882356, "loss": 1.0892, "step": 60 }, { "epoch": 1.5609756097560976, "grad_norm": 3.141003370285034, "learning_rate": 0.000392156862745098, "loss": 0.9606, "step": 80 }, { "epoch": 1.951219512195122, "grad_norm": 2.106764078140259, "learning_rate": 0.0004901960784313725, "loss": 0.9963, "step": 100 }, { "epoch": 2.341463414634146, "grad_norm": 1.790688157081604, "learning_rate": 0.0004995258321842611, "loss": 0.7703, "step": 120 }, { "epoch": 2.7317073170731705, "grad_norm": 1.8451107740402222, "learning_rate": 0.0004978890426632721, "loss": 0.7723, "step": 140 }, { "epoch": 3.1219512195121952, "grad_norm": 1.552288293838501, "learning_rate": 0.0004950914404289423, "loss": 0.7005, "step": 160 }, { "epoch": 3.5121951219512195, "grad_norm": 1.4375431537628174, "learning_rate": 0.0004911461260693639, "loss": 0.5082, "step": 180 }, { "epoch": 3.902439024390244, "grad_norm": 1.4421968460083008, "learning_rate": 0.0004860715746692661, "loss": 0.563, "step": 200 }, { "epoch": 4.2926829268292686, "grad_norm": 1.249590516090393, "learning_rate": 0.0004798915492950456, "loss": 0.3945, "step": 220 }, { "epoch": 4.682926829268292, "grad_norm": 1.2297637462615967, "learning_rate": 0.0004726349897172791, "loss": 0.3679, "step": 240 }, { "epoch": 5.073170731707317, "grad_norm": 0.9739590883255005, "learning_rate": 0.00046433587689181054, "loss": 0.3572, "step": 260 }, { "epoch": 5.463414634146342, "grad_norm": 1.0546987056732178, "learning_rate": 0.00045503307383401896, "loss": 0.2272, "step": 280 }, { "epoch": 5.853658536585366, "grad_norm": 1.131352424621582, "learning_rate": 0.0004447701436314176, "loss": 0.2635, "step": 300 }, { "epoch": 6.2439024390243905, "grad_norm": 0.8285700082778931, "learning_rate": 0.00043359514544679713, "loss": 0.1999, "step": 320 }, { "epoch": 6.634146341463414, "grad_norm": 0.8312835097312927, "learning_rate": 0.00042156040946718344, "loss": 0.1749, "step": 340 }, { "epoch": 7.024390243902439, "grad_norm": 0.6575689911842346, "learning_rate": 0.0004087222918524807, "loss": 0.1888, "step": 360 }, { "epoch": 7.414634146341464, "grad_norm": 0.8747310042381287, "learning_rate": 0.0003951409108313223, "loss": 0.1192, "step": 380 }, { "epoch": 7.804878048780488, "grad_norm": 0.688596248626709, "learning_rate": 0.00038087986517993766, "loss": 0.1361, "step": 400 }, { "epoch": 8.195121951219512, "grad_norm": 0.6887462139129639, "learning_rate": 0.00036600593640234084, "loss": 0.1098, "step": 420 }, { "epoch": 8.585365853658537, "grad_norm": 0.7194873094558716, "learning_rate": 0.0003505887760064681, "loss": 0.0964, "step": 440 }, { "epoch": 8.975609756097562, "grad_norm": 0.681694746017456, "learning_rate": 0.0003347005793406853, "loss": 0.1006, "step": 460 }, { "epoch": 9.365853658536585, "grad_norm": 0.5795145630836487, "learning_rate": 0.0003184157475180208, "loss": 0.0672, "step": 480 }, { "epoch": 9.75609756097561, "grad_norm": 0.5477526783943176, "learning_rate": 0.00030181053901126245, 
"loss": 0.0741, "step": 500 }, { "epoch": 10.146341463414634, "grad_norm": 0.46498867869377136, "learning_rate": 0.00028496271255042615, "loss": 0.0625, "step": 520 }, { "epoch": 10.536585365853659, "grad_norm": 0.5062224268913269, "learning_rate": 0.00026795116299483193, "loss": 0.0516, "step": 540 }, { "epoch": 10.926829268292684, "grad_norm": 0.4446791410446167, "learning_rate": 0.0002508555518849238, "loss": 0.0574, "step": 560 }, { "epoch": 11.317073170731707, "grad_norm": 0.4164929986000061, "learning_rate": 0.0002337559344038817, "loss": 0.0448, "step": 580 }, { "epoch": 11.707317073170731, "grad_norm": 0.48985204100608826, "learning_rate": 0.00021673238449588668, "loss": 0.0402, "step": 600 }, { "epoch": 12.097560975609756, "grad_norm": 0.37333598732948303, "learning_rate": 0.0001998646198965312, "loss": 0.0393, "step": 620 }, { "epoch": 12.487804878048781, "grad_norm": 0.34097352623939514, "learning_rate": 0.0001832316288312821, "loss": 0.0285, "step": 640 }, { "epoch": 12.878048780487806, "grad_norm": 0.3413819670677185, "learning_rate": 0.00016691130013008512, "loss": 0.0307, "step": 660 }, { "epoch": 13.268292682926829, "grad_norm": 0.27664974331855774, "learning_rate": 0.0001509800584902108, "loss": 0.0272, "step": 680 }, { "epoch": 13.658536585365853, "grad_norm": 0.2621878981590271, "learning_rate": 0.00013551250659532853, "loss": 0.0213, "step": 700 }, { "epoch": 14.048780487804878, "grad_norm": 0.22704452276229858, "learning_rate": 0.0001205810757666894, "loss": 0.0212, "step": 720 }, { "epoch": 14.439024390243903, "grad_norm": 0.17278432846069336, "learning_rate": 0.00010625568678234838, "loss": 0.0161, "step": 740 }, { "epoch": 14.829268292682928, "grad_norm": 0.1445775330066681, "learning_rate": 9.260342245273506e-05, "loss": 0.0159, "step": 760 }, { "epoch": 15.21951219512195, "grad_norm": 0.14733894169330597, "learning_rate": 7.968821348583643e-05, "loss": 0.0137, "step": 780 }, { "epoch": 15.609756097560975, "grad_norm": 0.15792453289031982, "learning_rate": 6.75705391130183e-05, "loss": 0.0112, "step": 800 }, { "epoch": 16.0, "grad_norm": 0.21093232929706573, "learning_rate": 5.6307143877391305e-05, "loss": 0.0129, "step": 820 }, { "epoch": 16.390243902439025, "grad_norm": 0.15143983066082, "learning_rate": 4.59507719109446e-05, "loss": 0.0092, "step": 840 }, { "epoch": 16.78048780487805, "grad_norm": 0.16431018710136414, "learning_rate": 3.654991994477039e-05, "loss": 0.0088, "step": 860 }, { "epoch": 17.170731707317074, "grad_norm": 0.10315347462892532, "learning_rate": 2.8148610208981464e-05, "loss": 0.0093, "step": 880 }, { "epoch": 17.5609756097561, "grad_norm": 0.10833552479743958, "learning_rate": 2.07861842857843e-05, "loss": 0.0075, "step": 900 }, { "epoch": 17.951219512195124, "grad_norm": 0.1411917805671692, "learning_rate": 1.4497118881050458e-05, "loss": 0.0084, "step": 920 }, { "epoch": 18.341463414634145, "grad_norm": 0.10689696669578552, "learning_rate": 9.310864377089696e-06, "loss": 0.0067, "step": 940 }, { "epoch": 18.73170731707317, "grad_norm": 0.10839959979057312, "learning_rate": 5.251706922648869e-06, "loss": 0.0071, "step": 960 }, { "epoch": 19.121951219512194, "grad_norm": 0.13111558556556702, "learning_rate": 2.3386547059396633e-06, "loss": 0.0074, "step": 980 }, { "epoch": 19.51219512195122, "grad_norm": 0.14715228974819183, "learning_rate": 5.853489432556536e-07, "loss": 0.0066, "step": 1000 }, { "epoch": 19.902439024390244, "grad_norm": 0.13206566870212555, "learning_rate": 0.0, "loss": 0.007, "step": 1020 }, { "epoch": 
19.902439024390244, "step": 1020, "total_flos": 1.7419395455334912e+17, "train_loss": 0.2547569075197566, "train_runtime": 35555.2849, "train_samples_per_second": 0.922, "train_steps_per_second": 0.029 } ], "logging_steps": 20, "max_steps": 1020, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7419395455334912e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }
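
This appears to be the trainer_state.json that transformers.Trainer writes alongside a checkpoint: the run logs every 20 steps, the loss falls from about 1.82 to 0.007 over 20 epochs, and the learning rate ramps up to roughly 5e-4 before decaying to 0 by step 1020 (consistent with a warm-up followed by a cosine-style decay). The log_history entries can be inspected directly with standard tooling. Below is a minimal sketch, assuming the file is saved locally as trainer_state.json and that matplotlib is installed; the output filename training_curves.png is an arbitrary choice.

import json
import matplotlib.pyplot as plt

# Load the trainer state (the path is an assumption for this example).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step logging records; the final summary entry has
# "train_loss" instead of "loss", so it is filtered out here.
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

# Plot training loss and learning-rate schedule side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(steps, losses)
ax1.set_xlabel("step")
ax1.set_ylabel("training loss")
ax2.plot(steps, lrs)
ax2.set_xlabel("step")
ax2.set_ylabel("learning rate")
fig.tight_layout()
fig.savefig("training_curves.png")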