{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.732620320855615, "eval_steps": 12, "global_step": 138, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0213903743315508, "grad_norm": 23.385425567626953, "learning_rate": 2e-05, "loss": 2.2375, "step": 1 }, { "epoch": 0.0213903743315508, "eval_loss": 2.295367479324341, "eval_runtime": 16.9622, "eval_samples_per_second": 17.686, "eval_steps_per_second": 8.843, "step": 1 }, { "epoch": 0.0427807486631016, "grad_norm": 31.645936965942383, "learning_rate": 4e-05, "loss": 2.3841, "step": 2 }, { "epoch": 0.06417112299465241, "grad_norm": 34.55805206298828, "learning_rate": 6e-05, "loss": 2.6477, "step": 3 }, { "epoch": 0.0855614973262032, "grad_norm": 32.25712966918945, "learning_rate": 8e-05, "loss": 1.9082, "step": 4 }, { "epoch": 0.10695187165775401, "grad_norm": 25.384239196777344, "learning_rate": 0.0001, "loss": 1.5055, "step": 5 }, { "epoch": 0.12834224598930483, "grad_norm": 22.698801040649414, "learning_rate": 0.00012, "loss": 1.237, "step": 6 }, { "epoch": 0.1497326203208556, "grad_norm": 25.194936752319336, "learning_rate": 0.00014, "loss": 1.1552, "step": 7 }, { "epoch": 0.1711229946524064, "grad_norm": 17.266931533813477, "learning_rate": 0.00016, "loss": 0.9119, "step": 8 }, { "epoch": 0.1925133689839572, "grad_norm": 15.346185684204102, "learning_rate": 0.00018, "loss": 0.834, "step": 9 }, { "epoch": 0.21390374331550802, "grad_norm": 26.989032745361328, "learning_rate": 0.0002, "loss": 0.7533, "step": 10 }, { "epoch": 0.23529411764705882, "grad_norm": 11.042716979980469, "learning_rate": 0.00019998370105646414, "loss": 0.6231, "step": 11 }, { "epoch": 0.25668449197860965, "grad_norm": 8.389060020446777, "learning_rate": 0.0001999348095389677, "loss": 0.4795, "step": 12 }, { "epoch": 0.25668449197860965, "eval_loss": 0.612250566482544, "eval_runtime": 17.2275, "eval_samples_per_second": 17.414, "eval_steps_per_second": 8.707, "step": 12 }, { "epoch": 0.27807486631016043, "grad_norm": 32.6131477355957, "learning_rate": 0.00019985334138511237, "loss": 0.6842, "step": 13 }, { "epoch": 0.2994652406417112, "grad_norm": 35.244876861572266, "learning_rate": 0.000199739323151795, "loss": 1.0486, "step": 14 }, { "epoch": 0.32085561497326204, "grad_norm": 14.032343864440918, "learning_rate": 0.00019959279200655044, "loss": 0.9261, "step": 15 }, { "epoch": 0.3422459893048128, "grad_norm": 14.313758850097656, "learning_rate": 0.00019941379571543596, "loss": 0.811, "step": 16 }, { "epoch": 0.36363636363636365, "grad_norm": 14.679015159606934, "learning_rate": 0.00019920239262746043, "loss": 0.7831, "step": 17 }, { "epoch": 0.3850267379679144, "grad_norm": 14.768975257873535, "learning_rate": 0.00019895865165556377, "loss": 0.5663, "step": 18 }, { "epoch": 0.40641711229946526, "grad_norm": 16.57910919189453, "learning_rate": 0.00019868265225415265, "loss": 0.7238, "step": 19 }, { "epoch": 0.42780748663101603, "grad_norm": 11.922738075256348, "learning_rate": 0.00019837448439320027, "loss": 0.6445, "step": 20 }, { "epoch": 0.44919786096256686, "grad_norm": 17.637170791625977, "learning_rate": 0.00019803424852891802, "loss": 1.2175, "step": 21 }, { "epoch": 0.47058823529411764, "grad_norm": 9.833541870117188, "learning_rate": 0.00019766205557100868, "loss": 0.8675, "step": 22 }, { "epoch": 0.4919786096256685, "grad_norm": 11.809141159057617, "learning_rate": 0.00019725802684651233, "loss": 1.1414, "step": 23 }, { "epoch": 0.5133689839572193, "grad_norm": 18.123613357543945, "learning_rate": 0.00019682229406025635, "loss": 0.9939, "step": 24 }, { "epoch": 0.5133689839572193, "eval_loss": 0.6862085461616516, "eval_runtime": 17.2196, "eval_samples_per_second": 17.422, "eval_steps_per_second": 8.711, "step": 24 }, { "epoch": 0.5347593582887701, "grad_norm": 11.453226089477539, "learning_rate": 0.0001963549992519223, "loss": 0.8239, "step": 25 }, { "epoch": 0.5561497326203209, "grad_norm": 7.978896617889404, "learning_rate": 0.00019585629474974415, "loss": 0.5823, "step": 26 }, { "epoch": 0.5775401069518716, "grad_norm": 7.980404376983643, "learning_rate": 0.0001953263431208523, "loss": 0.6231, "step": 27 }, { "epoch": 0.5989304812834224, "grad_norm": 12.718688011169434, "learning_rate": 0.00019476531711828027, "loss": 0.6814, "step": 28 }, { "epoch": 0.6203208556149733, "grad_norm": 11.378684997558594, "learning_rate": 0.00019417339962465082, "loss": 0.8471, "step": 29 }, { "epoch": 0.6417112299465241, "grad_norm": 10.157166481018066, "learning_rate": 0.0001935507835925601, "loss": 0.6058, "step": 30 }, { "epoch": 0.6631016042780749, "grad_norm": 10.623138427734375, "learning_rate": 0.00019289767198167916, "loss": 0.81, "step": 31 }, { "epoch": 0.6844919786096256, "grad_norm": 16.875181198120117, "learning_rate": 0.00019221427769259333, "loss": 0.8037, "step": 32 }, { "epoch": 0.7058823529411765, "grad_norm": 11.729046821594238, "learning_rate": 0.0001915008234974012, "loss": 0.5548, "step": 33 }, { "epoch": 0.7272727272727273, "grad_norm": 10.128629684448242, "learning_rate": 0.00019075754196709572, "loss": 0.6979, "step": 34 }, { "epoch": 0.7486631016042781, "grad_norm": 10.893034934997559, "learning_rate": 0.0001899846753957507, "loss": 0.6173, "step": 35 }, { "epoch": 0.7700534759358288, "grad_norm": 11.063718795776367, "learning_rate": 0.00018918247572153823, "loss": 0.6885, "step": 36 }, { "epoch": 0.7700534759358288, "eval_loss": 0.6894535422325134, "eval_runtime": 16.9719, "eval_samples_per_second": 17.676, "eval_steps_per_second": 8.838, "step": 36 }, { "epoch": 0.7914438502673797, "grad_norm": 13.273902893066406, "learning_rate": 0.0001883512044446023, "loss": 0.8057, "step": 37 }, { "epoch": 0.8128342245989305, "grad_norm": 17.475479125976562, "learning_rate": 0.00018749113254181498, "loss": 0.8596, "step": 38 }, { "epoch": 0.8342245989304813, "grad_norm": 13.216853141784668, "learning_rate": 0.00018660254037844388, "loss": 0.7885, "step": 39 }, { "epoch": 0.8556149732620321, "grad_norm": 11.209785461425781, "learning_rate": 0.00018568571761675893, "loss": 0.7028, "step": 40 }, { "epoch": 0.8770053475935828, "grad_norm": 8.61646556854248, "learning_rate": 0.00018474096312160864, "loss": 0.5775, "step": 41 }, { "epoch": 0.8983957219251337, "grad_norm": 10.86780071258545, "learning_rate": 0.00018376858486299647, "loss": 0.8106, "step": 42 }, { "epoch": 0.9197860962566845, "grad_norm": 6.3408427238464355, "learning_rate": 0.00018276889981568906, "loss": 0.4089, "step": 43 }, { "epoch": 0.9411764705882353, "grad_norm": 19.630786895751953, "learning_rate": 0.00018174223385588917, "loss": 1.1041, "step": 44 }, { "epoch": 0.9625668449197861, "grad_norm": 10.352812767028809, "learning_rate": 0.00018068892165500704, "loss": 0.749, "step": 45 }, { "epoch": 0.983957219251337, "grad_norm": 8.671784400939941, "learning_rate": 0.00017960930657056438, "loss": 0.6975, "step": 46 }, { "epoch": 1.0053475935828877, "grad_norm": 10.529078483581543, "learning_rate": 0.00017850374053426723, "loss": 0.5922, "step": 47 }, { "epoch": 1.0267379679144386, "grad_norm": 9.373391151428223, "learning_rate": 0.00017737258393728364, "loss": 0.729, "step": 48 }, { "epoch": 1.0267379679144386, "eval_loss": 0.6926581859588623, "eval_runtime": 17.8687, "eval_samples_per_second": 16.789, "eval_steps_per_second": 8.395, "step": 48 }, { "epoch": 1.0481283422459893, "grad_norm": 18.76338005065918, "learning_rate": 0.00017621620551276366, "loss": 0.9321, "step": 49 }, { "epoch": 1.0695187165775402, "grad_norm": 7.443489074707031, "learning_rate": 0.00017503498221564025, "loss": 0.5063, "step": 50 }, { "epoch": 1.0909090909090908, "grad_norm": 8.333600044250488, "learning_rate": 0.00017382929909974987, "loss": 0.4377, "step": 51 }, { "epoch": 1.1122994652406417, "grad_norm": 14.056548118591309, "learning_rate": 0.0001725995491923131, "loss": 1.2233, "step": 52 }, { "epoch": 1.0213903743315509, "grad_norm": 13.999143600463867, "learning_rate": 0.00017134613336581599, "loss": 0.3174, "step": 53 }, { "epoch": 1.0427807486631016, "grad_norm": 10.154765129089355, "learning_rate": 0.00017006946020733425, "loss": 0.162, "step": 54 }, { "epoch": 1.0641711229946524, "grad_norm": 4.229214191436768, "learning_rate": 0.00016876994588534234, "loss": 0.1357, "step": 55 }, { "epoch": 1.085561497326203, "grad_norm": 4.834641456604004, "learning_rate": 0.0001674480140140514, "loss": 0.1686, "step": 56 }, { "epoch": 1.106951871657754, "grad_norm": 8.32540225982666, "learning_rate": 0.00016610409551532005, "loss": 0.2455, "step": 57 }, { "epoch": 1.1283422459893049, "grad_norm": 5.407528400421143, "learning_rate": 0.00016473862847818277, "loss": 0.1116, "step": 58 }, { "epoch": 1.1497326203208555, "grad_norm": 5.010147571563721, "learning_rate": 0.0001633520580160424, "loss": 0.1156, "step": 59 }, { "epoch": 1.1711229946524064, "grad_norm": 6.022050380706787, "learning_rate": 0.0001619448361215723, "loss": 0.225, "step": 60 }, { "epoch": 1.1711229946524064, "eval_loss": 0.7360826134681702, "eval_runtime": 17.9845, "eval_samples_per_second": 16.681, "eval_steps_per_second": 8.34, "step": 60 }, { "epoch": 1.192513368983957, "grad_norm": 5.873435974121094, "learning_rate": 0.00016051742151937655, "loss": 0.0572, "step": 61 }, { "epoch": 1.213903743315508, "grad_norm": 10.556258201599121, "learning_rate": 0.0001590702795164551, "loss": 0.1261, "step": 62 }, { "epoch": 1.2352941176470589, "grad_norm": 17.438634872436523, "learning_rate": 0.00015760388185052398, "loss": 0.382, "step": 63 }, { "epoch": 1.2566844919786098, "grad_norm": 8.371882438659668, "learning_rate": 0.00015611870653623825, "loss": 0.228, "step": 64 }, { "epoch": 1.2780748663101604, "grad_norm": 9.435052871704102, "learning_rate": 0.0001546152377093697, "loss": 0.1978, "step": 65 }, { "epoch": 1.299465240641711, "grad_norm": 8.652084350585938, "learning_rate": 0.0001530939654689887, "loss": 0.1587, "step": 66 }, { "epoch": 1.320855614973262, "grad_norm": 8.71427059173584, "learning_rate": 0.00015155538571770218, "loss": 0.0637, "step": 67 }, { "epoch": 1.3422459893048129, "grad_norm": 5.761900901794434, "learning_rate": 0.00015000000000000001, "loss": 0.2027, "step": 68 }, { "epoch": 1.3636363636363638, "grad_norm": 8.064814567565918, "learning_rate": 0.00014842831533876195, "loss": 0.1595, "step": 69 }, { "epoch": 1.3850267379679144, "grad_norm": 7.962220668792725, "learning_rate": 0.00014684084406997903, "loss": 0.3658, "step": 70 }, { "epoch": 1.4064171122994653, "grad_norm": 4.9641242027282715, "learning_rate": 0.00014523810367574272, "loss": 0.1368, "step": 71 }, { "epoch": 1.427807486631016, "grad_norm": 7.027231216430664, "learning_rate": 0.00014362061661555675, "loss": 0.1688, "step": 72 }, { "epoch": 1.427807486631016, "eval_loss": 0.8078603148460388, "eval_runtime": 17.7522, "eval_samples_per_second": 16.899, "eval_steps_per_second": 8.45, "step": 72 }, { "epoch": 1.4491978609625669, "grad_norm": 3.6481807231903076, "learning_rate": 0.00014198891015602646, "loss": 0.0844, "step": 73 }, { "epoch": 1.4705882352941178, "grad_norm": 7.879701614379883, "learning_rate": 0.00014034351619898088, "loss": 0.3332, "step": 74 }, { "epoch": 1.4919786096256684, "grad_norm": 5.169126987457275, "learning_rate": 0.00013868497110808395, "loss": 0.1301, "step": 75 }, { "epoch": 1.5133689839572193, "grad_norm": 5.538336753845215, "learning_rate": 0.00013701381553399145, "loss": 0.145, "step": 76 }, { "epoch": 1.53475935828877, "grad_norm": 25.778339385986328, "learning_rate": 0.00013533059423811026, "loss": 0.2704, "step": 77 }, { "epoch": 1.5561497326203209, "grad_norm": 5.043937683105469, "learning_rate": 0.0001336358559150175, "loss": 0.1458, "step": 78 }, { "epoch": 1.5775401069518717, "grad_norm": 7.204008102416992, "learning_rate": 0.000131930153013598, "loss": 0.1012, "step": 79 }, { "epoch": 1.5989304812834224, "grad_norm": 8.367019653320312, "learning_rate": 0.00013021404155695725, "loss": 0.1889, "step": 80 }, { "epoch": 1.6203208556149733, "grad_norm": 5.196023464202881, "learning_rate": 0.00012848808096117, "loss": 0.0881, "step": 81 }, { "epoch": 1.641711229946524, "grad_norm": 8.067776679992676, "learning_rate": 0.00012675283385292212, "loss": 0.2948, "step": 82 }, { "epoch": 1.6631016042780749, "grad_norm": 8.772676467895508, "learning_rate": 0.0001250088658861063, "loss": 0.154, "step": 83 }, { "epoch": 1.6844919786096257, "grad_norm": 16.28201675415039, "learning_rate": 0.00012325674555743106, "loss": 0.3085, "step": 84 }, { "epoch": 1.6844919786096257, "eval_loss": 0.780529797077179, "eval_runtime": 17.4078, "eval_samples_per_second": 17.234, "eval_steps_per_second": 8.617, "step": 84 }, { "epoch": 1.7058823529411766, "grad_norm": 8.888002395629883, "learning_rate": 0.00012149704402110243, "loss": 0.2488, "step": 85 }, { "epoch": 1.7272727272727273, "grad_norm": 6.574310779571533, "learning_rate": 0.00011973033490264001, "loss": 0.1698, "step": 86 }, { "epoch": 1.748663101604278, "grad_norm": 10.671408653259277, "learning_rate": 0.00011795719411188718, "loss": 0.1778, "step": 87 }, { "epoch": 1.7700534759358288, "grad_norm": 6.570982933044434, "learning_rate": 0.0001161781996552765, "loss": 0.115, "step": 88 }, { "epoch": 1.7914438502673797, "grad_norm": 11.171520233154297, "learning_rate": 0.0001143939314474119, "loss": 0.2032, "step": 89 }, { "epoch": 1.8128342245989306, "grad_norm": 4.231898307800293, "learning_rate": 0.00011260497112202895, "loss": 0.0596, "step": 90 }, { "epoch": 1.8342245989304813, "grad_norm": 5.200542449951172, "learning_rate": 0.00011081190184239419, "loss": 0.1294, "step": 91 }, { "epoch": 1.855614973262032, "grad_norm": 6.871387958526611, "learning_rate": 0.00010901530811120655, "loss": 0.283, "step": 92 }, { "epoch": 1.8770053475935828, "grad_norm": 4.964606285095215, "learning_rate": 0.00010721577558006164, "loss": 0.2673, "step": 93 }, { "epoch": 1.8983957219251337, "grad_norm": 8.798148155212402, "learning_rate": 0.00010541389085854176, "loss": 0.2055, "step": 94 }, { "epoch": 1.9197860962566846, "grad_norm": 5.59722375869751, "learning_rate": 0.00010361024132299364, "loss": 0.209, "step": 95 }, { "epoch": 1.9411764705882353, "grad_norm": 8.256745338439941, "learning_rate": 0.00010180541492505604, "loss": 0.1079, "step": 96 }, { "epoch": 1.9411764705882353, "eval_loss": 0.7684900760650635, "eval_runtime": 17.746, "eval_samples_per_second": 16.905, "eval_steps_per_second": 8.453, "step": 96 }, { "epoch": 1.962566844919786, "grad_norm": 4.481995582580566, "learning_rate": 0.0001, "loss": 0.1824, "step": 97 }, { "epoch": 1.9839572192513368, "grad_norm": 3.852792263031006, "learning_rate": 9.819458507494394e-05, "loss": 0.124, "step": 98 }, { "epoch": 2.0053475935828877, "grad_norm": 29.360998153686523, "learning_rate": 9.638975867700638e-05, "loss": 0.1844, "step": 99 }, { "epoch": 2.0267379679144386, "grad_norm": 4.80811071395874, "learning_rate": 9.458610914145826e-05, "loss": 0.1347, "step": 100 }, { "epoch": 2.048128342245989, "grad_norm": 17.53403091430664, "learning_rate": 9.27842244199384e-05, "loss": 0.3378, "step": 101 }, { "epoch": 2.06951871657754, "grad_norm": 7.34214973449707, "learning_rate": 9.098469188879349e-05, "loss": 0.2045, "step": 102 }, { "epoch": 2.090909090909091, "grad_norm": 4.968944072723389, "learning_rate": 8.918809815760585e-05, "loss": 0.2284, "step": 103 }, { "epoch": 2.0053475935828877, "grad_norm": 4.202042579650879, "learning_rate": 8.739502887797107e-05, "loss": 0.1655, "step": 104 }, { "epoch": 2.0267379679144386, "grad_norm": 3.561790704727173, "learning_rate": 8.560606855258808e-05, "loss": 0.0442, "step": 105 }, { "epoch": 2.0481283422459895, "grad_norm": 3.8292624950408936, "learning_rate": 8.382180034472353e-05, "loss": 0.0821, "step": 106 }, { "epoch": 2.06951871657754, "grad_norm": 2.1650640964508057, "learning_rate": 8.204280588811283e-05, "loss": 0.0384, "step": 107 }, { "epoch": 2.090909090909091, "grad_norm": 1.6922334432601929, "learning_rate": 8.026966509736001e-05, "loss": 0.0342, "step": 108 }, { "epoch": 2.090909090909091, "eval_loss": 0.7716657519340515, "eval_runtime": 17.9156, "eval_samples_per_second": 16.745, "eval_steps_per_second": 8.373, "step": 108 }, { "epoch": 2.1122994652406417, "grad_norm": 1.0654356479644775, "learning_rate": 7.85029559788976e-05, "loss": 0.0184, "step": 109 }, { "epoch": 2.1336898395721926, "grad_norm": 3.1057019233703613, "learning_rate": 7.674325444256899e-05, "loss": 0.0417, "step": 110 }, { "epoch": 2.1550802139037435, "grad_norm": 0.19042205810546875, "learning_rate": 7.499113411389371e-05, "loss": 0.0026, "step": 111 }, { "epoch": 2.176470588235294, "grad_norm": 1.5116851329803467, "learning_rate": 7.324716614707793e-05, "loss": 0.0089, "step": 112 }, { "epoch": 2.197860962566845, "grad_norm": 2.5151679515838623, "learning_rate": 7.151191903883001e-05, "loss": 0.0357, "step": 113 }, { "epoch": 2.2192513368983957, "grad_norm": 2.838503837585449, "learning_rate": 6.978595844304271e-05, "loss": 0.0366, "step": 114 }, { "epoch": 2.2406417112299466, "grad_norm": 3.835000514984131, "learning_rate": 6.806984698640202e-05, "loss": 0.1412, "step": 115 }, { "epoch": 2.2620320855614975, "grad_norm": 3.4443538188934326, "learning_rate": 6.636414408498249e-05, "loss": 0.0707, "step": 116 }, { "epoch": 2.283422459893048, "grad_norm": 2.701524496078491, "learning_rate": 6.466940576188977e-05, "loss": 0.0497, "step": 117 }, { "epoch": 2.304812834224599, "grad_norm": 2.612593412399292, "learning_rate": 6.298618446600856e-05, "loss": 0.052, "step": 118 }, { "epoch": 2.3262032085561497, "grad_norm": 4.986962795257568, "learning_rate": 6.13150288919161e-05, "loss": 0.1255, "step": 119 }, { "epoch": 2.3475935828877006, "grad_norm": 1.8598374128341675, "learning_rate": 5.965648380101916e-05, "loss": 0.0309, "step": 120 }, { "epoch": 2.3475935828877006, "eval_loss": 0.785007119178772, "eval_runtime": 17.7923, "eval_samples_per_second": 16.861, "eval_steps_per_second": 8.431, "step": 120 }, { "epoch": 2.3689839572192515, "grad_norm": 1.5813214778900146, "learning_rate": 5.801108984397354e-05, "loss": 0.0201, "step": 121 }, { "epoch": 2.3903743315508024, "grad_norm": 0.13843385875225067, "learning_rate": 5.6379383384443255e-05, "loss": 0.0018, "step": 122 }, { "epoch": 2.411764705882353, "grad_norm": 4.4155707359313965, "learning_rate": 5.476189632425732e-05, "loss": 0.0326, "step": 123 }, { "epoch": 2.4331550802139037, "grad_norm": 3.5101325511932373, "learning_rate": 5.3159155930021e-05, "loss": 0.0259, "step": 124 }, { "epoch": 2.4545454545454546, "grad_norm": 5.201532363891602, "learning_rate": 5.1571684661238075e-05, "loss": 0.0761, "step": 125 }, { "epoch": 2.4759358288770055, "grad_norm": 2.48543119430542, "learning_rate": 5.000000000000002e-05, "loss": 0.0587, "step": 126 }, { "epoch": 2.497326203208556, "grad_norm": 7.39755916595459, "learning_rate": 4.844461428229782e-05, "loss": 0.0391, "step": 127 }, { "epoch": 2.518716577540107, "grad_norm": 4.151485443115234, "learning_rate": 4.6906034531011346e-05, "loss": 0.0982, "step": 128 }, { "epoch": 2.5401069518716577, "grad_norm": 4.144845485687256, "learning_rate": 4.53847622906303e-05, "loss": 0.0707, "step": 129 }, { "epoch": 2.5614973262032086, "grad_norm": 7.3682732582092285, "learning_rate": 4.388129346376178e-05, "loss": 0.0455, "step": 130 }, { "epoch": 2.5828877005347595, "grad_norm": 4.947929382324219, "learning_rate": 4.239611814947605e-05, "loss": 0.033, "step": 131 }, { "epoch": 2.6042780748663104, "grad_norm": 3.0208606719970703, "learning_rate": 4.092972048354491e-05, "loss": 0.0373, "step": 132 }, { "epoch": 2.6042780748663104, "eval_loss": 0.776565432548523, "eval_runtime": 17.3019, "eval_samples_per_second": 17.339, "eval_steps_per_second": 8.67, "step": 132 }, { "epoch": 2.625668449197861, "grad_norm": 7.514610290527344, "learning_rate": 3.948257848062351e-05, "loss": 0.0323, "step": 133 }, { "epoch": 2.6470588235294117, "grad_norm": 1.8352607488632202, "learning_rate": 3.80551638784277e-05, "loss": 0.043, "step": 134 }, { "epoch": 2.6684491978609626, "grad_norm": 3.525506019592285, "learning_rate": 3.664794198395764e-05, "loss": 0.0643, "step": 135 }, { "epoch": 2.6898395721925135, "grad_norm": 5.074891567230225, "learning_rate": 3.5261371521817244e-05, "loss": 0.0658, "step": 136 }, { "epoch": 2.711229946524064, "grad_norm": 3.6220922470092773, "learning_rate": 3.3895904484679984e-05, "loss": 0.1535, "step": 137 }, { "epoch": 2.732620320855615, "grad_norm": 3.9044840335845947, "learning_rate": 3.2551985985948616e-05, "loss": 0.0572, "step": 138 } ], "logging_steps": 1, "max_steps": 184, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 46, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3759516646572032e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }