{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9108910891089108, "eval_steps": 13, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019801980198019802, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.0919, "step": 1 }, { "epoch": 0.019801980198019802, "eval_loss": 2.079954147338867, "eval_runtime": 13.8908, "eval_samples_per_second": 8.999, "eval_steps_per_second": 4.535, "step": 1 }, { "epoch": 0.039603960396039604, "grad_norm": 1.203125, "learning_rate": 4e-05, "loss": 2.0814, "step": 2 }, { "epoch": 0.0594059405940594, "grad_norm": 1.1953125, "learning_rate": 6e-05, "loss": 2.0499, "step": 3 }, { "epoch": 0.07920792079207921, "grad_norm": 1.0859375, "learning_rate": 8e-05, "loss": 2.0153, "step": 4 }, { "epoch": 0.09900990099009901, "grad_norm": 1.0390625, "learning_rate": 0.0001, "loss": 1.9548, "step": 5 }, { "epoch": 0.1188118811881188, "grad_norm": 0.89453125, "learning_rate": 0.00012, "loss": 1.8982, "step": 6 }, { "epoch": 0.13861386138613863, "grad_norm": 0.67578125, "learning_rate": 0.00014, "loss": 1.8226, "step": 7 }, { "epoch": 0.15841584158415842, "grad_norm": 0.66796875, "learning_rate": 0.00016, "loss": 1.7572, "step": 8 }, { "epoch": 0.1782178217821782, "grad_norm": 0.78515625, "learning_rate": 0.00018, "loss": 1.7074, "step": 9 }, { "epoch": 0.19801980198019803, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.6317, "step": 10 }, { "epoch": 0.21782178217821782, "grad_norm": 0.484375, "learning_rate": 0.0001999863304992469, "loss": 1.5801, "step": 11 }, { "epoch": 0.2376237623762376, "grad_norm": 0.53125, "learning_rate": 0.00019994532573409262, "loss": 1.5721, "step": 12 }, { "epoch": 0.25742574257425743, "grad_norm": 0.6953125, "learning_rate": 0.00019987699691483048, "loss": 1.5479, "step": 13 }, { "epoch": 0.25742574257425743, "eval_loss": 1.5341482162475586, "eval_runtime": 13.8795, "eval_samples_per_second": 9.006, "eval_steps_per_second": 4.539, "step": 13 }, { "epoch": 0.27722772277227725, "grad_norm": 0.65234375, "learning_rate": 0.00019978136272187747, "loss": 1.534, "step": 14 }, { "epoch": 0.297029702970297, "grad_norm": 0.515625, "learning_rate": 0.000199658449300667, "loss": 1.4804, "step": 15 }, { "epoch": 0.31683168316831684, "grad_norm": 0.439453125, "learning_rate": 0.00019950829025450114, "loss": 1.4805, "step": 16 }, { "epoch": 0.33663366336633666, "grad_norm": 0.361328125, "learning_rate": 0.00019933092663536382, "loss": 1.3809, "step": 17 }, { "epoch": 0.3564356435643564, "grad_norm": 0.3125, "learning_rate": 0.00019912640693269752, "loss": 1.3837, "step": 18 }, { "epoch": 0.37623762376237624, "grad_norm": 0.337890625, "learning_rate": 0.00019889478706014687, "loss": 1.3673, "step": 19 }, { "epoch": 0.39603960396039606, "grad_norm": 0.298828125, "learning_rate": 0.00019863613034027224, "loss": 1.366, "step": 20 }, { "epoch": 0.4158415841584158, "grad_norm": 0.34375, "learning_rate": 0.00019835050748723824, "loss": 1.3318, "step": 21 }, { "epoch": 0.43564356435643564, "grad_norm": 0.341796875, "learning_rate": 0.00019803799658748094, "loss": 1.2741, "step": 22 }, { "epoch": 0.45544554455445546, "grad_norm": 0.326171875, "learning_rate": 0.00019769868307835994, "loss": 1.2978, "step": 23 }, { "epoch": 0.4752475247524752, "grad_norm": 0.291015625, "learning_rate": 0.0001973326597248006, "loss": 1.2733, "step": 24 }, { "epoch": 0.49504950495049505, "grad_norm": 0.306640625, "learning_rate": 0.00019694002659393305, "loss": 
1.2302, "step": 25 }, { "epoch": 0.5148514851485149, "grad_norm": 0.318359375, "learning_rate": 0.00019652089102773488, "loss": 1.2083, "step": 26 }, { "epoch": 0.5148514851485149, "eval_loss": 1.224540114402771, "eval_runtime": 13.8695, "eval_samples_per_second": 9.013, "eval_steps_per_second": 4.542, "step": 26 }, { "epoch": 0.5346534653465347, "grad_norm": 0.26953125, "learning_rate": 0.00019607536761368484, "loss": 1.1761, "step": 27 }, { "epoch": 0.5544554455445545, "grad_norm": 0.296875, "learning_rate": 0.00019560357815343577, "loss": 1.1751, "step": 28 }, { "epoch": 0.5742574257425742, "grad_norm": 0.310546875, "learning_rate": 0.00019510565162951537, "loss": 1.2002, "step": 29 }, { "epoch": 0.594059405940594, "grad_norm": 0.287109375, "learning_rate": 0.00019458172417006347, "loss": 1.1544, "step": 30 }, { "epoch": 0.6138613861386139, "grad_norm": 0.365234375, "learning_rate": 0.00019403193901161613, "loss": 1.1384, "step": 31 }, { "epoch": 0.6336633663366337, "grad_norm": 0.236328125, "learning_rate": 0.0001934564464599461, "loss": 1.0999, "step": 32 }, { "epoch": 0.6534653465346535, "grad_norm": 0.326171875, "learning_rate": 0.00019285540384897073, "loss": 1.1576, "step": 33 }, { "epoch": 0.6732673267326733, "grad_norm": 0.310546875, "learning_rate": 0.00019222897549773848, "loss": 1.091, "step": 34 }, { "epoch": 0.693069306930693, "grad_norm": 0.2578125, "learning_rate": 0.00019157733266550575, "loss": 1.056, "step": 35 }, { "epoch": 0.7128712871287128, "grad_norm": 0.267578125, "learning_rate": 0.00019090065350491626, "loss": 1.1068, "step": 36 }, { "epoch": 0.7326732673267327, "grad_norm": 0.2490234375, "learning_rate": 0.00019019912301329592, "loss": 1.0583, "step": 37 }, { "epoch": 0.7524752475247525, "grad_norm": 0.2734375, "learning_rate": 0.00018947293298207635, "loss": 1.0671, "step": 38 }, { "epoch": 0.7722772277227723, "grad_norm": 0.2490234375, "learning_rate": 0.0001887222819443612, "loss": 1.0851, "step": 39 }, { "epoch": 0.7722772277227723, "eval_loss": 1.060703158378601, "eval_runtime": 13.878, "eval_samples_per_second": 9.007, "eval_steps_per_second": 4.54, "step": 39 }, { "epoch": 0.7920792079207921, "grad_norm": 0.22265625, "learning_rate": 0.0001879473751206489, "loss": 1.0343, "step": 40 }, { "epoch": 0.8118811881188119, "grad_norm": 0.1796875, "learning_rate": 0.00018714842436272773, "loss": 0.9789, "step": 41 }, { "epoch": 0.8316831683168316, "grad_norm": 0.248046875, "learning_rate": 0.00018632564809575742, "loss": 1.0174, "step": 42 }, { "epoch": 0.8514851485148515, "grad_norm": 0.2294921875, "learning_rate": 0.0001854792712585539, "loss": 1.0004, "step": 43 }, { "epoch": 0.8712871287128713, "grad_norm": 0.228515625, "learning_rate": 0.00018460952524209355, "loss": 1.0281, "step": 44 }, { "epoch": 0.8910891089108911, "grad_norm": 0.220703125, "learning_rate": 0.00018371664782625287, "loss": 0.9992, "step": 45 }, { "epoch": 0.9108910891089109, "grad_norm": 0.2138671875, "learning_rate": 0.00018280088311480201, "loss": 0.9635, "step": 46 }, { "epoch": 0.9306930693069307, "grad_norm": 0.265625, "learning_rate": 0.00018186248146866927, "loss": 1.006, "step": 47 }, { "epoch": 0.9504950495049505, "grad_norm": 0.2451171875, "learning_rate": 0.00018090169943749476, "loss": 0.9891, "step": 48 }, { "epoch": 0.9702970297029703, "grad_norm": 0.28515625, "learning_rate": 0.0001799187996894925, "loss": 0.9809, "step": 49 }, { "epoch": 0.9900990099009901, "grad_norm": 0.212890625, "learning_rate": 0.00017891405093963938, "loss": 0.9646, "step": 50 }, { "epoch": 
1.00990099009901, "grad_norm": 0.2451171875, "learning_rate": 0.00017788772787621126, "loss": 0.9553, "step": 51 }, { "epoch": 1.0297029702970297, "grad_norm": 0.2578125, "learning_rate": 0.00017684011108568592, "loss": 0.9432, "step": 52 }, { "epoch": 1.0297029702970297, "eval_loss": 0.9755253195762634, "eval_runtime": 13.879, "eval_samples_per_second": 9.006, "eval_steps_per_second": 4.539, "step": 52 }, { "epoch": 1.0495049504950495, "grad_norm": 0.2021484375, "learning_rate": 0.0001757714869760335, "loss": 0.9631, "step": 53 }, { "epoch": 1.0693069306930694, "grad_norm": 0.3046875, "learning_rate": 0.0001746821476984154, "loss": 0.9539, "step": 54 }, { "epoch": 1.0198019801980198, "grad_norm": 0.232421875, "learning_rate": 0.00017357239106731317, "loss": 0.9559, "step": 55 }, { "epoch": 1.0396039603960396, "grad_norm": 0.283203125, "learning_rate": 0.00017244252047910892, "loss": 0.9111, "step": 56 }, { "epoch": 1.0594059405940595, "grad_norm": 0.30859375, "learning_rate": 0.00017129284482913972, "loss": 0.9503, "step": 57 }, { "epoch": 1.0792079207920793, "grad_norm": 0.2265625, "learning_rate": 0.00017012367842724887, "loss": 0.911, "step": 58 }, { "epoch": 1.099009900990099, "grad_norm": 0.3515625, "learning_rate": 0.0001689353409118566, "loss": 0.9041, "step": 59 }, { "epoch": 1.118811881188119, "grad_norm": 0.26171875, "learning_rate": 0.00016772815716257412, "loss": 0.9117, "step": 60 }, { "epoch": 1.1386138613861387, "grad_norm": 0.2890625, "learning_rate": 0.0001665024572113848, "loss": 0.9351, "step": 61 }, { "epoch": 1.1584158415841583, "grad_norm": 0.251953125, "learning_rate": 0.00016525857615241687, "loss": 0.9438, "step": 62 }, { "epoch": 1.1782178217821782, "grad_norm": 0.2138671875, "learning_rate": 0.00016399685405033167, "loss": 0.9075, "step": 63 }, { "epoch": 1.198019801980198, "grad_norm": 0.2490234375, "learning_rate": 0.0001627176358473537, "loss": 0.8983, "step": 64 }, { "epoch": 1.2178217821782178, "grad_norm": 0.2021484375, "learning_rate": 0.0001614212712689668, "loss": 0.9007, "step": 65 }, { "epoch": 1.2178217821782178, "eval_loss": 0.9333999156951904, "eval_runtime": 13.8668, "eval_samples_per_second": 9.014, "eval_steps_per_second": 4.543, "step": 65 }, { "epoch": 1.2376237623762376, "grad_norm": 0.2431640625, "learning_rate": 0.00016010811472830252, "loss": 0.9108, "step": 66 }, { "epoch": 1.2574257425742574, "grad_norm": 0.232421875, "learning_rate": 0.00015877852522924732, "loss": 0.9177, "step": 67 }, { "epoch": 1.2772277227722773, "grad_norm": 0.271484375, "learning_rate": 0.00015743286626829437, "loss": 0.9, "step": 68 }, { "epoch": 1.297029702970297, "grad_norm": 0.2431640625, "learning_rate": 0.0001560715057351673, "loss": 0.9096, "step": 69 }, { "epoch": 1.316831683168317, "grad_norm": 0.22265625, "learning_rate": 0.00015469481581224272, "loss": 0.8946, "step": 70 }, { "epoch": 1.3366336633663367, "grad_norm": 0.31640625, "learning_rate": 0.0001533031728727994, "loss": 0.8995, "step": 71 }, { "epoch": 1.3564356435643563, "grad_norm": 0.2197265625, "learning_rate": 0.00015189695737812152, "loss": 0.922, "step": 72 }, { "epoch": 1.3762376237623761, "grad_norm": 0.22265625, "learning_rate": 0.0001504765537734844, "loss": 0.885, "step": 73 }, { "epoch": 1.396039603960396, "grad_norm": 0.248046875, "learning_rate": 0.00014904235038305083, "loss": 0.895, "step": 74 }, { "epoch": 1.4158415841584158, "grad_norm": 0.2431640625, "learning_rate": 0.00014759473930370736, "loss": 0.892, "step": 75 }, { "epoch": 1.4356435643564356, "grad_norm": 0.216796875, 
"learning_rate": 0.0001461341162978688, "loss": 0.8277, "step": 76 }, { "epoch": 1.4554455445544554, "grad_norm": 0.23828125, "learning_rate": 0.00014466088068528068, "loss": 0.8687, "step": 77 }, { "epoch": 1.4752475247524752, "grad_norm": 0.228515625, "learning_rate": 0.00014317543523384928, "loss": 0.8765, "step": 78 }, { "epoch": 1.4752475247524752, "eval_loss": 0.9083698391914368, "eval_runtime": 13.8834, "eval_samples_per_second": 9.004, "eval_steps_per_second": 4.538, "step": 78 }, { "epoch": 1.495049504950495, "grad_norm": 0.228515625, "learning_rate": 0.00014167818604952906, "loss": 0.8797, "step": 79 }, { "epoch": 1.5148514851485149, "grad_norm": 0.1982421875, "learning_rate": 0.00014016954246529696, "loss": 0.905, "step": 80 }, { "epoch": 1.5346534653465347, "grad_norm": 0.25390625, "learning_rate": 0.00013864991692924523, "loss": 0.8575, "step": 81 }, { "epoch": 1.5544554455445545, "grad_norm": 0.2451171875, "learning_rate": 0.00013711972489182208, "loss": 0.8957, "step": 82 }, { "epoch": 1.5742574257425743, "grad_norm": 0.2216796875, "learning_rate": 0.00013557938469225167, "loss": 0.8792, "step": 83 }, { "epoch": 1.5940594059405941, "grad_norm": 0.21484375, "learning_rate": 0.00013402931744416433, "loss": 0.889, "step": 84 }, { "epoch": 1.613861386138614, "grad_norm": 0.228515625, "learning_rate": 0.00013246994692046836, "loss": 0.8657, "step": 85 }, { "epoch": 1.6336633663366338, "grad_norm": 0.20703125, "learning_rate": 0.00013090169943749476, "loss": 0.8784, "step": 86 }, { "epoch": 1.6534653465346536, "grad_norm": 0.265625, "learning_rate": 0.0001293250037384465, "loss": 0.8822, "step": 87 }, { "epoch": 1.6732673267326734, "grad_norm": 0.2197265625, "learning_rate": 0.00012774029087618446, "loss": 0.9092, "step": 88 }, { "epoch": 1.693069306930693, "grad_norm": 0.234375, "learning_rate": 0.00012614799409538198, "loss": 0.8813, "step": 89 }, { "epoch": 1.7128712871287128, "grad_norm": 0.2294921875, "learning_rate": 0.00012454854871407994, "loss": 0.8975, "step": 90 }, { "epoch": 1.7326732673267327, "grad_norm": 0.259765625, "learning_rate": 0.00012294239200467516, "loss": 0.8789, "step": 91 }, { "epoch": 1.7326732673267327, "eval_loss": 0.8891416788101196, "eval_runtime": 13.872, "eval_samples_per_second": 9.011, "eval_steps_per_second": 4.542, "step": 91 }, { "epoch": 1.7524752475247525, "grad_norm": 0.26171875, "learning_rate": 0.0001213299630743747, "loss": 0.9184, "step": 92 }, { "epoch": 1.7722772277227723, "grad_norm": 0.337890625, "learning_rate": 0.00011971170274514802, "loss": 0.8854, "step": 93 }, { "epoch": 1.7920792079207921, "grad_norm": 0.2890625, "learning_rate": 0.000118088053433211, "loss": 0.8688, "step": 94 }, { "epoch": 1.811881188118812, "grad_norm": 0.3515625, "learning_rate": 0.00011645945902807341, "loss": 0.8281, "step": 95 }, { "epoch": 1.8316831683168315, "grad_norm": 0.26953125, "learning_rate": 0.0001148263647711842, "loss": 0.8488, "step": 96 }, { "epoch": 1.8514851485148514, "grad_norm": 0.2490234375, "learning_rate": 0.00011318921713420691, "loss": 0.8742, "step": 97 }, { "epoch": 1.8712871287128712, "grad_norm": 0.265625, "learning_rate": 0.00011154846369695863, "loss": 0.8586, "step": 98 }, { "epoch": 1.891089108910891, "grad_norm": 0.265625, "learning_rate": 0.0001099045530250463, "loss": 0.8776, "step": 99 }, { "epoch": 1.9108910891089108, "grad_norm": 0.259765625, "learning_rate": 0.00010825793454723325, "loss": 0.8563, "step": 100 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 4, 
"save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.08354098020352e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }