{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8316831683168315, "eval_steps": 13, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019801980198019802, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.0919, "step": 1 }, { "epoch": 0.019801980198019802, "eval_loss": 2.079954147338867, "eval_runtime": 13.8908, "eval_samples_per_second": 8.999, "eval_steps_per_second": 4.535, "step": 1 }, { "epoch": 0.039603960396039604, "grad_norm": 1.203125, "learning_rate": 4e-05, "loss": 2.0814, "step": 2 }, { "epoch": 0.0594059405940594, "grad_norm": 1.1953125, "learning_rate": 6e-05, "loss": 2.0499, "step": 3 }, { "epoch": 0.07920792079207921, "grad_norm": 1.0859375, "learning_rate": 8e-05, "loss": 2.0153, "step": 4 }, { "epoch": 0.09900990099009901, "grad_norm": 1.0390625, "learning_rate": 0.0001, "loss": 1.9548, "step": 5 }, { "epoch": 0.1188118811881188, "grad_norm": 0.89453125, "learning_rate": 0.00012, "loss": 1.8982, "step": 6 }, { "epoch": 0.13861386138613863, "grad_norm": 0.67578125, "learning_rate": 0.00014, "loss": 1.8226, "step": 7 }, { "epoch": 0.15841584158415842, "grad_norm": 0.66796875, "learning_rate": 0.00016, "loss": 1.7572, "step": 8 }, { "epoch": 0.1782178217821782, "grad_norm": 0.78515625, "learning_rate": 0.00018, "loss": 1.7074, "step": 9 }, { "epoch": 0.19801980198019803, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.6317, "step": 10 }, { "epoch": 0.21782178217821782, "grad_norm": 0.484375, "learning_rate": 0.0001999863304992469, "loss": 1.5801, "step": 11 }, { "epoch": 0.2376237623762376, "grad_norm": 0.53125, "learning_rate": 0.00019994532573409262, "loss": 1.5721, "step": 12 }, { "epoch": 0.25742574257425743, "grad_norm": 0.6953125, "learning_rate": 0.00019987699691483048, "loss": 1.5479, "step": 13 }, { "epoch": 0.25742574257425743, "eval_loss": 1.5341482162475586, "eval_runtime": 13.8795, "eval_samples_per_second": 9.006, "eval_steps_per_second": 4.539, "step": 13 }, { "epoch": 0.27722772277227725, "grad_norm": 0.65234375, "learning_rate": 0.00019978136272187747, "loss": 1.534, "step": 14 }, { "epoch": 0.297029702970297, "grad_norm": 0.515625, "learning_rate": 0.000199658449300667, "loss": 1.4804, "step": 15 }, { "epoch": 0.31683168316831684, "grad_norm": 0.439453125, "learning_rate": 0.00019950829025450114, "loss": 1.4805, "step": 16 }, { "epoch": 0.33663366336633666, "grad_norm": 0.361328125, "learning_rate": 0.00019933092663536382, "loss": 1.3809, "step": 17 }, { "epoch": 0.3564356435643564, "grad_norm": 0.3125, "learning_rate": 0.00019912640693269752, "loss": 1.3837, "step": 18 }, { "epoch": 0.37623762376237624, "grad_norm": 0.337890625, "learning_rate": 0.00019889478706014687, "loss": 1.3673, "step": 19 }, { "epoch": 0.39603960396039606, "grad_norm": 0.298828125, "learning_rate": 0.00019863613034027224, "loss": 1.366, "step": 20 }, { "epoch": 0.4158415841584158, "grad_norm": 0.34375, "learning_rate": 0.00019835050748723824, "loss": 1.3318, "step": 21 }, { "epoch": 0.43564356435643564, "grad_norm": 0.341796875, "learning_rate": 0.00019803799658748094, "loss": 1.2741, "step": 22 }, { "epoch": 0.45544554455445546, "grad_norm": 0.326171875, "learning_rate": 0.00019769868307835994, "loss": 1.2978, "step": 23 }, { "epoch": 0.4752475247524752, "grad_norm": 0.291015625, "learning_rate": 0.0001973326597248006, "loss": 1.2733, "step": 24 }, { "epoch": 0.49504950495049505, "grad_norm": 0.306640625, "learning_rate": 0.00019694002659393305, "loss": 1.2302, "step": 25 }, { "epoch": 0.5148514851485149, "grad_norm": 0.318359375, "learning_rate": 0.00019652089102773488, "loss": 1.2083, "step": 26 }, { "epoch": 0.5148514851485149, "eval_loss": 1.224540114402771, "eval_runtime": 13.8695, "eval_samples_per_second": 9.013, "eval_steps_per_second": 4.542, "step": 26 }, { "epoch": 0.5346534653465347, "grad_norm": 0.26953125, "learning_rate": 0.00019607536761368484, "loss": 1.1761, "step": 27 }, { "epoch": 0.5544554455445545, "grad_norm": 0.296875, "learning_rate": 0.00019560357815343577, "loss": 1.1751, "step": 28 }, { "epoch": 0.5742574257425742, "grad_norm": 0.310546875, "learning_rate": 0.00019510565162951537, "loss": 1.2002, "step": 29 }, { "epoch": 0.594059405940594, "grad_norm": 0.287109375, "learning_rate": 0.00019458172417006347, "loss": 1.1544, "step": 30 }, { "epoch": 0.6138613861386139, "grad_norm": 0.365234375, "learning_rate": 0.00019403193901161613, "loss": 1.1384, "step": 31 }, { "epoch": 0.6336633663366337, "grad_norm": 0.236328125, "learning_rate": 0.0001934564464599461, "loss": 1.0999, "step": 32 }, { "epoch": 0.6534653465346535, "grad_norm": 0.326171875, "learning_rate": 0.00019285540384897073, "loss": 1.1576, "step": 33 }, { "epoch": 0.6732673267326733, "grad_norm": 0.310546875, "learning_rate": 0.00019222897549773848, "loss": 1.091, "step": 34 }, { "epoch": 0.693069306930693, "grad_norm": 0.2578125, "learning_rate": 0.00019157733266550575, "loss": 1.056, "step": 35 }, { "epoch": 0.7128712871287128, "grad_norm": 0.267578125, "learning_rate": 0.00019090065350491626, "loss": 1.1068, "step": 36 }, { "epoch": 0.7326732673267327, "grad_norm": 0.2490234375, "learning_rate": 0.00019019912301329592, "loss": 1.0583, "step": 37 }, { "epoch": 0.7524752475247525, "grad_norm": 0.2734375, "learning_rate": 0.00018947293298207635, "loss": 1.0671, "step": 38 }, { "epoch": 0.7722772277227723, "grad_norm": 0.2490234375, "learning_rate": 0.0001887222819443612, "loss": 1.0851, "step": 39 }, { "epoch": 0.7722772277227723, "eval_loss": 1.060703158378601, "eval_runtime": 13.878, "eval_samples_per_second": 9.007, "eval_steps_per_second": 4.54, "step": 39 }, { "epoch": 0.7920792079207921, "grad_norm": 0.22265625, "learning_rate": 0.0001879473751206489, "loss": 1.0343, "step": 40 }, { "epoch": 0.8118811881188119, "grad_norm": 0.1796875, "learning_rate": 0.00018714842436272773, "loss": 0.9789, "step": 41 }, { "epoch": 0.8316831683168316, "grad_norm": 0.248046875, "learning_rate": 0.00018632564809575742, "loss": 1.0174, "step": 42 }, { "epoch": 0.8514851485148515, "grad_norm": 0.2294921875, "learning_rate": 0.0001854792712585539, "loss": 1.0004, "step": 43 }, { "epoch": 0.8712871287128713, "grad_norm": 0.228515625, "learning_rate": 0.00018460952524209355, "loss": 1.0281, "step": 44 }, { "epoch": 0.8910891089108911, "grad_norm": 0.220703125, "learning_rate": 0.00018371664782625287, "loss": 0.9992, "step": 45 }, { "epoch": 0.9108910891089109, "grad_norm": 0.2138671875, "learning_rate": 0.00018280088311480201, "loss": 0.9635, "step": 46 }, { "epoch": 0.9306930693069307, "grad_norm": 0.265625, "learning_rate": 0.00018186248146866927, "loss": 1.006, "step": 47 }, { "epoch": 0.9504950495049505, "grad_norm": 0.2451171875, "learning_rate": 0.00018090169943749476, "loss": 0.9891, "step": 48 }, { "epoch": 0.9702970297029703, "grad_norm": 0.28515625, "learning_rate": 0.0001799187996894925, "loss": 0.9809, "step": 49 }, { "epoch": 0.9900990099009901, "grad_norm": 0.212890625, "learning_rate": 0.00017891405093963938, "loss": 0.9646, "step": 50 }, { "epoch": 1.00990099009901, "grad_norm": 0.2451171875, "learning_rate": 0.00017788772787621126, "loss": 0.9553, "step": 51 }, { "epoch": 1.0297029702970297, "grad_norm": 0.2578125, "learning_rate": 0.00017684011108568592, "loss": 0.9432, "step": 52 }, { "epoch": 1.0297029702970297, "eval_loss": 0.9755253195762634, "eval_runtime": 13.879, "eval_samples_per_second": 9.006, "eval_steps_per_second": 4.539, "step": 52 }, { "epoch": 1.0495049504950495, "grad_norm": 0.2021484375, "learning_rate": 0.0001757714869760335, "loss": 0.9631, "step": 53 }, { "epoch": 1.0693069306930694, "grad_norm": 0.3046875, "learning_rate": 0.0001746821476984154, "loss": 0.9539, "step": 54 }, { "epoch": 1.0198019801980198, "grad_norm": 0.232421875, "learning_rate": 0.00017357239106731317, "loss": 0.9559, "step": 55 }, { "epoch": 1.0396039603960396, "grad_norm": 0.283203125, "learning_rate": 0.00017244252047910892, "loss": 0.9111, "step": 56 }, { "epoch": 1.0594059405940595, "grad_norm": 0.30859375, "learning_rate": 0.00017129284482913972, "loss": 0.9503, "step": 57 }, { "epoch": 1.0792079207920793, "grad_norm": 0.2265625, "learning_rate": 0.00017012367842724887, "loss": 0.911, "step": 58 }, { "epoch": 1.099009900990099, "grad_norm": 0.3515625, "learning_rate": 0.0001689353409118566, "loss": 0.9041, "step": 59 }, { "epoch": 1.118811881188119, "grad_norm": 0.26171875, "learning_rate": 0.00016772815716257412, "loss": 0.9117, "step": 60 }, { "epoch": 1.1386138613861387, "grad_norm": 0.2890625, "learning_rate": 0.0001665024572113848, "loss": 0.9351, "step": 61 }, { "epoch": 1.1584158415841583, "grad_norm": 0.251953125, "learning_rate": 0.00016525857615241687, "loss": 0.9438, "step": 62 }, { "epoch": 1.1782178217821782, "grad_norm": 0.2138671875, "learning_rate": 0.00016399685405033167, "loss": 0.9075, "step": 63 }, { "epoch": 1.198019801980198, "grad_norm": 0.2490234375, "learning_rate": 0.0001627176358473537, "loss": 0.8983, "step": 64 }, { "epoch": 1.2178217821782178, "grad_norm": 0.2021484375, "learning_rate": 0.0001614212712689668, "loss": 0.9007, "step": 65 }, { "epoch": 1.2178217821782178, "eval_loss": 0.9333999156951904, "eval_runtime": 13.8668, "eval_samples_per_second": 9.014, "eval_steps_per_second": 4.543, "step": 65 }, { "epoch": 1.2376237623762376, "grad_norm": 0.2431640625, "learning_rate": 0.00016010811472830252, "loss": 0.9108, "step": 66 }, { "epoch": 1.2574257425742574, "grad_norm": 0.232421875, "learning_rate": 0.00015877852522924732, "loss": 0.9177, "step": 67 }, { "epoch": 1.2772277227722773, "grad_norm": 0.271484375, "learning_rate": 0.00015743286626829437, "loss": 0.9, "step": 68 }, { "epoch": 1.297029702970297, "grad_norm": 0.2431640625, "learning_rate": 0.0001560715057351673, "loss": 0.9096, "step": 69 }, { "epoch": 1.316831683168317, "grad_norm": 0.22265625, "learning_rate": 0.00015469481581224272, "loss": 0.8946, "step": 70 }, { "epoch": 1.3366336633663367, "grad_norm": 0.31640625, "learning_rate": 0.0001533031728727994, "loss": 0.8995, "step": 71 }, { "epoch": 1.3564356435643563, "grad_norm": 0.2197265625, "learning_rate": 0.00015189695737812152, "loss": 0.922, "step": 72 }, { "epoch": 1.3762376237623761, "grad_norm": 0.22265625, "learning_rate": 0.0001504765537734844, "loss": 0.885, "step": 73 }, { "epoch": 1.396039603960396, "grad_norm": 0.248046875, "learning_rate": 0.00014904235038305083, "loss": 0.895, "step": 74 }, { "epoch": 1.4158415841584158, "grad_norm": 0.2431640625, "learning_rate": 0.00014759473930370736, "loss": 0.892, "step": 75 }, { "epoch": 1.4356435643564356, "grad_norm": 0.216796875, "learning_rate": 0.0001461341162978688, "loss": 0.8277, "step": 76 }, { "epoch": 1.4554455445544554, "grad_norm": 0.23828125, "learning_rate": 0.00014466088068528068, "loss": 0.8687, "step": 77 }, { "epoch": 1.4752475247524752, "grad_norm": 0.228515625, "learning_rate": 0.00014317543523384928, "loss": 0.8765, "step": 78 }, { "epoch": 1.4752475247524752, "eval_loss": 0.9083698391914368, "eval_runtime": 13.8834, "eval_samples_per_second": 9.004, "eval_steps_per_second": 4.538, "step": 78 }, { "epoch": 1.495049504950495, "grad_norm": 0.228515625, "learning_rate": 0.00014167818604952906, "loss": 0.8797, "step": 79 }, { "epoch": 1.5148514851485149, "grad_norm": 0.1982421875, "learning_rate": 0.00014016954246529696, "loss": 0.905, "step": 80 }, { "epoch": 1.5346534653465347, "grad_norm": 0.25390625, "learning_rate": 0.00013864991692924523, "loss": 0.8575, "step": 81 }, { "epoch": 1.5544554455445545, "grad_norm": 0.2451171875, "learning_rate": 0.00013711972489182208, "loss": 0.8957, "step": 82 }, { "epoch": 1.5742574257425743, "grad_norm": 0.2216796875, "learning_rate": 0.00013557938469225167, "loss": 0.8792, "step": 83 }, { "epoch": 1.5940594059405941, "grad_norm": 0.21484375, "learning_rate": 0.00013402931744416433, "loss": 0.889, "step": 84 }, { "epoch": 1.613861386138614, "grad_norm": 0.228515625, "learning_rate": 0.00013246994692046836, "loss": 0.8657, "step": 85 }, { "epoch": 1.6336633663366338, "grad_norm": 0.20703125, "learning_rate": 0.00013090169943749476, "loss": 0.8784, "step": 86 }, { "epoch": 1.6534653465346536, "grad_norm": 0.265625, "learning_rate": 0.0001293250037384465, "loss": 0.8822, "step": 87 }, { "epoch": 1.6732673267326734, "grad_norm": 0.2197265625, "learning_rate": 0.00012774029087618446, "loss": 0.9092, "step": 88 }, { "epoch": 1.693069306930693, "grad_norm": 0.234375, "learning_rate": 0.00012614799409538198, "loss": 0.8813, "step": 89 }, { "epoch": 1.7128712871287128, "grad_norm": 0.2294921875, "learning_rate": 0.00012454854871407994, "loss": 0.8975, "step": 90 }, { "epoch": 1.7326732673267327, "grad_norm": 0.259765625, "learning_rate": 0.00012294239200467516, "loss": 0.8789, "step": 91 }, { "epoch": 1.7326732673267327, "eval_loss": 0.8891416788101196, "eval_runtime": 13.872, "eval_samples_per_second": 9.011, "eval_steps_per_second": 4.542, "step": 91 }, { "epoch": 1.7524752475247525, "grad_norm": 0.26171875, "learning_rate": 0.0001213299630743747, "loss": 0.9184, "step": 92 }, { "epoch": 1.7722772277227723, "grad_norm": 0.337890625, "learning_rate": 0.00011971170274514802, "loss": 0.8854, "step": 93 }, { "epoch": 1.7920792079207921, "grad_norm": 0.2890625, "learning_rate": 0.000118088053433211, "loss": 0.8688, "step": 94 }, { "epoch": 1.811881188118812, "grad_norm": 0.3515625, "learning_rate": 0.00011645945902807341, "loss": 0.8281, "step": 95 }, { "epoch": 1.8316831683168315, "grad_norm": 0.26953125, "learning_rate": 0.0001148263647711842, "loss": 0.8488, "step": 96 }, { "epoch": 1.8514851485148514, "grad_norm": 0.2490234375, "learning_rate": 0.00011318921713420691, "loss": 0.8742, "step": 97 }, { "epoch": 1.8712871287128712, "grad_norm": 0.265625, "learning_rate": 0.00011154846369695863, "loss": 0.8586, "step": 98 }, { "epoch": 1.891089108910891, "grad_norm": 0.265625, "learning_rate": 0.0001099045530250463, "loss": 0.8776, "step": 99 }, { "epoch": 1.9108910891089108, "grad_norm": 0.259765625, "learning_rate": 0.00010825793454723325, "loss": 0.8563, "step": 100 }, { "epoch": 1.9306930693069306, "grad_norm": 0.283203125, "learning_rate": 0.00010660905843256994, "loss": 0.8381, "step": 101 }, { "epoch": 1.9504950495049505, "grad_norm": 0.201171875, "learning_rate": 0.00010495837546732224, "loss": 0.847, "step": 102 }, { "epoch": 1.9702970297029703, "grad_norm": 0.23828125, "learning_rate": 0.00010330633693173082, "loss": 0.8512, "step": 103 }, { "epoch": 1.99009900990099, "grad_norm": 0.283203125, "learning_rate": 0.00010165339447663587, "loss": 0.8304, "step": 104 }, { "epoch": 1.99009900990099, "eval_loss": 0.8779018521308899, "eval_runtime": 13.8827, "eval_samples_per_second": 9.004, "eval_steps_per_second": 4.538, "step": 104 }, { "epoch": 2.00990099009901, "grad_norm": 0.283203125, "learning_rate": 0.0001, "loss": 0.8523, "step": 105 }, { "epoch": 2.0297029702970297, "grad_norm": 0.2392578125, "learning_rate": 9.834660552336415e-05, "loss": 0.8109, "step": 106 }, { "epoch": 2.0495049504950495, "grad_norm": 0.224609375, "learning_rate": 9.669366306826919e-05, "loss": 0.8394, "step": 107 }, { "epoch": 2.0693069306930694, "grad_norm": 0.283203125, "learning_rate": 9.504162453267777e-05, "loss": 0.8524, "step": 108 }, { "epoch": 2.01980198019802, "grad_norm": 0.22265625, "learning_rate": 9.339094156743007e-05, "loss": 0.8391, "step": 109 }, { "epoch": 2.0396039603960396, "grad_norm": 0.2001953125, "learning_rate": 9.174206545276677e-05, "loss": 0.8317, "step": 110 }, { "epoch": 2.0594059405940595, "grad_norm": 0.22265625, "learning_rate": 9.009544697495374e-05, "loss": 0.833, "step": 111 }, { "epoch": 2.0792079207920793, "grad_norm": 0.2041015625, "learning_rate": 8.845153630304139e-05, "loss": 0.8408, "step": 112 }, { "epoch": 2.099009900990099, "grad_norm": 0.2080078125, "learning_rate": 8.681078286579311e-05, "loss": 0.8459, "step": 113 }, { "epoch": 2.118811881188119, "grad_norm": 0.2021484375, "learning_rate": 8.517363522881579e-05, "loss": 0.8177, "step": 114 }, { "epoch": 2.1386138613861387, "grad_norm": 0.2265625, "learning_rate": 8.35405409719266e-05, "loss": 0.8451, "step": 115 }, { "epoch": 2.1584158415841586, "grad_norm": 0.2294921875, "learning_rate": 8.191194656678904e-05, "loss": 0.8543, "step": 116 }, { "epoch": 2.1782178217821784, "grad_norm": 0.22265625, "learning_rate": 8.028829725485199e-05, "loss": 0.8194, "step": 117 }, { "epoch": 2.1782178217821784, "eval_loss": 0.8713971972465515, "eval_runtime": 13.8976, "eval_samples_per_second": 8.994, "eval_steps_per_second": 4.533, "step": 117 }, { "epoch": 2.198019801980198, "grad_norm": 0.2333984375, "learning_rate": 7.867003692562534e-05, "loss": 0.808, "step": 118 }, { "epoch": 2.217821782178218, "grad_norm": 0.2470703125, "learning_rate": 7.705760799532485e-05, "loss": 0.8073, "step": 119 }, { "epoch": 2.237623762376238, "grad_norm": 0.201171875, "learning_rate": 7.54514512859201e-05, "loss": 0.8392, "step": 120 }, { "epoch": 2.2574257425742577, "grad_norm": 0.25, "learning_rate": 7.385200590461803e-05, "loss": 0.8574, "step": 121 }, { "epoch": 2.2772277227722775, "grad_norm": 0.271484375, "learning_rate": 7.225970912381556e-05, "loss": 0.8338, "step": 122 }, { "epoch": 2.297029702970297, "grad_norm": 0.294921875, "learning_rate": 7.067499626155354e-05, "loss": 0.8788, "step": 123 }, { "epoch": 2.3168316831683167, "grad_norm": 0.2265625, "learning_rate": 6.909830056250527e-05, "loss": 0.8297, "step": 124 }, { "epoch": 2.3366336633663365, "grad_norm": 0.267578125, "learning_rate": 6.753005307953167e-05, "loss": 0.8125, "step": 125 }, { "epoch": 2.3564356435643563, "grad_norm": 0.2431640625, "learning_rate": 6.59706825558357e-05, "loss": 0.814, "step": 126 }, { "epoch": 2.376237623762376, "grad_norm": 0.27734375, "learning_rate": 6.442061530774834e-05, "loss": 0.8335, "step": 127 }, { "epoch": 2.396039603960396, "grad_norm": 0.2216796875, "learning_rate": 6.28802751081779e-05, "loss": 0.8512, "step": 128 }, { "epoch": 2.4158415841584158, "grad_norm": 0.224609375, "learning_rate": 6.135008307075481e-05, "loss": 0.8297, "step": 129 }, { "epoch": 2.4356435643564356, "grad_norm": 0.2412109375, "learning_rate": 5.983045753470308e-05, "loss": 0.848, "step": 130 }, { "epoch": 2.4356435643564356, "eval_loss": 0.8665071129798889, "eval_runtime": 13.8735, "eval_samples_per_second": 9.01, "eval_steps_per_second": 4.541, "step": 130 }, { "epoch": 2.4554455445544554, "grad_norm": 0.2265625, "learning_rate": 5.832181395047098e-05, "loss": 0.8203, "step": 131 }, { "epoch": 2.4752475247524752, "grad_norm": 0.287109375, "learning_rate": 5.6824564766150726e-05, "loss": 0.8519, "step": 132 }, { "epoch": 2.495049504950495, "grad_norm": 0.21484375, "learning_rate": 5.533911931471936e-05, "loss": 0.83, "step": 133 }, { "epoch": 2.514851485148515, "grad_norm": 0.2109375, "learning_rate": 5.386588370213124e-05, "loss": 0.842, "step": 134 }, { "epoch": 2.5346534653465347, "grad_norm": 0.2412109375, "learning_rate": 5.240526069629265e-05, "loss": 0.8419, "step": 135 }, { "epoch": 2.5544554455445545, "grad_norm": 0.267578125, "learning_rate": 5.095764961694922e-05, "loss": 0.8458, "step": 136 }, { "epoch": 2.5742574257425743, "grad_norm": 0.203125, "learning_rate": 4.952344622651566e-05, "loss": 0.8133, "step": 137 }, { "epoch": 2.594059405940594, "grad_norm": 0.2060546875, "learning_rate": 4.810304262187852e-05, "loss": 0.8103, "step": 138 }, { "epoch": 2.613861386138614, "grad_norm": 0.20703125, "learning_rate": 4.669682712720065e-05, "loss": 0.8105, "step": 139 }, { "epoch": 2.633663366336634, "grad_norm": 0.2060546875, "learning_rate": 4.530518418775733e-05, "loss": 0.8305, "step": 140 }, { "epoch": 2.6534653465346536, "grad_norm": 0.2080078125, "learning_rate": 4.392849426483274e-05, "loss": 0.7881, "step": 141 }, { "epoch": 2.6732673267326734, "grad_norm": 0.2216796875, "learning_rate": 4.256713373170564e-05, "loss": 0.8204, "step": 142 }, { "epoch": 2.693069306930693, "grad_norm": 0.263671875, "learning_rate": 4.12214747707527e-05, "loss": 0.8354, "step": 143 }, { "epoch": 2.693069306930693, "eval_loss": 0.8626759648323059, "eval_runtime": 13.8585, "eval_samples_per_second": 9.02, "eval_steps_per_second": 4.546, "step": 143 }, { "epoch": 2.7128712871287126, "grad_norm": 0.2138671875, "learning_rate": 3.9891885271697496e-05, "loss": 0.8441, "step": 144 }, { "epoch": 2.7326732673267324, "grad_norm": 0.2197265625, "learning_rate": 3.857872873103322e-05, "loss": 0.8084, "step": 145 }, { "epoch": 2.7524752475247523, "grad_norm": 0.18359375, "learning_rate": 3.7282364152646297e-05, "loss": 0.8184, "step": 146 }, { "epoch": 2.772277227722772, "grad_norm": 0.1904296875, "learning_rate": 3.600314594966834e-05, "loss": 0.8302, "step": 147 }, { "epoch": 2.792079207920792, "grad_norm": 0.2041015625, "learning_rate": 3.4741423847583134e-05, "loss": 0.8503, "step": 148 }, { "epoch": 2.8118811881188117, "grad_norm": 0.2265625, "learning_rate": 3.349754278861517e-05, "loss": 0.8273, "step": 149 }, { "epoch": 2.8316831683168315, "grad_norm": 0.1943359375, "learning_rate": 3.227184283742591e-05, "loss": 0.8332, "step": 150 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.1227070440800256e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }