{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020964360587002098, "grad_norm": 3.998500347137451, "learning_rate": 9.958071278825997e-05, "loss": 0.1821, "step": 10 }, { "epoch": 0.041928721174004195, "grad_norm": 4.23317813873291, "learning_rate": 9.916142557651992e-05, "loss": 0.1617, "step": 20 }, { "epoch": 0.06289308176100629, "grad_norm": 4.348649024963379, "learning_rate": 9.874213836477988e-05, "loss": 0.1799, "step": 30 }, { "epoch": 0.08385744234800839, "grad_norm": 4.8091020584106445, "learning_rate": 9.832285115303984e-05, "loss": 0.2127, "step": 40 }, { "epoch": 0.10482180293501048, "grad_norm": 4.523597240447998, "learning_rate": 9.790356394129979e-05, "loss": 0.2195, "step": 50 }, { "epoch": 0.12578616352201258, "grad_norm": 6.083479404449463, "learning_rate": 9.748427672955975e-05, "loss": 0.2142, "step": 60 }, { "epoch": 0.14675052410901468, "grad_norm": 5.268312454223633, "learning_rate": 9.706498951781971e-05, "loss": 0.2293, "step": 70 }, { "epoch": 0.16771488469601678, "grad_norm": 5.908782482147217, "learning_rate": 9.664570230607967e-05, "loss": 0.2391, "step": 80 }, { "epoch": 0.18867924528301888, "grad_norm": 5.333123207092285, "learning_rate": 9.622641509433963e-05, "loss": 0.2519, "step": 90 }, { "epoch": 0.20964360587002095, "grad_norm": 4.541295051574707, "learning_rate": 9.58071278825996e-05, "loss": 0.241, "step": 100 }, { "epoch": 0.23060796645702306, "grad_norm": 5.367836952209473, "learning_rate": 9.538784067085954e-05, "loss": 0.2516, "step": 110 }, { "epoch": 0.25157232704402516, "grad_norm": 4.476315975189209, "learning_rate": 9.496855345911951e-05, "loss": 0.2581, "step": 120 }, { "epoch": 0.27253668763102723, "grad_norm": 4.104060173034668, "learning_rate": 9.454926624737947e-05, "loss": 0.2612, "step": 130 }, { "epoch": 0.29350104821802936, "grad_norm": 5.152798652648926, "learning_rate": 9.412997903563942e-05, "loss": 0.2736, "step": 140 }, { "epoch": 0.31446540880503143, "grad_norm": 4.785223960876465, "learning_rate": 9.371069182389938e-05, "loss": 0.267, "step": 150 }, { "epoch": 0.33542976939203356, "grad_norm": 4.769683837890625, "learning_rate": 9.329140461215934e-05, "loss": 0.2766, "step": 160 }, { "epoch": 0.35639412997903563, "grad_norm": 4.8373942375183105, "learning_rate": 9.287211740041929e-05, "loss": 0.2878, "step": 170 }, { "epoch": 0.37735849056603776, "grad_norm": 4.6672515869140625, "learning_rate": 9.245283018867925e-05, "loss": 0.2748, "step": 180 }, { "epoch": 0.39832285115303984, "grad_norm": 7.549396514892578, "learning_rate": 9.203354297693921e-05, "loss": 0.264, "step": 190 }, { "epoch": 0.4192872117400419, "grad_norm": 6.70684289932251, "learning_rate": 9.161425576519916e-05, "loss": 0.2843, "step": 200 }, { "epoch": 0.44025157232704404, "grad_norm": 5.79678201675415, "learning_rate": 9.119496855345912e-05, "loss": 0.2907, "step": 210 }, { "epoch": 0.4612159329140461, "grad_norm": 6.699160575866699, "learning_rate": 9.077568134171907e-05, "loss": 0.2678, "step": 220 }, { "epoch": 0.48218029350104824, "grad_norm": 4.357872486114502, "learning_rate": 9.035639412997903e-05, "loss": 0.2965, "step": 230 }, { "epoch": 0.5031446540880503, "grad_norm": 5.521259307861328, "learning_rate": 8.9937106918239e-05, "loss": 0.2876, "step": 240 }, { "epoch": 0.5241090146750524, "grad_norm": 6.627674579620361, "learning_rate": 8.951781970649896e-05, "loss": 0.2825, "step": 250 }, { "epoch": 0.5450733752620545, "grad_norm": 5.5945634841918945, "learning_rate": 8.909853249475892e-05, "loss": 0.2855, "step": 260 }, { "epoch": 0.5660377358490566, "grad_norm": 4.596319198608398, "learning_rate": 8.867924528301888e-05, "loss": 0.2753, "step": 270 }, { "epoch": 0.5870020964360587, "grad_norm": 5.92698860168457, "learning_rate": 8.825995807127883e-05, "loss": 0.3076, "step": 280 }, { "epoch": 0.6079664570230608, "grad_norm": 4.971231937408447, "learning_rate": 8.784067085953879e-05, "loss": 0.2945, "step": 290 }, { "epoch": 0.6289308176100629, "grad_norm": 5.803814888000488, "learning_rate": 8.742138364779875e-05, "loss": 0.3121, "step": 300 }, { "epoch": 0.649895178197065, "grad_norm": 4.640927314758301, "learning_rate": 8.70020964360587e-05, "loss": 0.2965, "step": 310 }, { "epoch": 0.6708595387840671, "grad_norm": 4.943049430847168, "learning_rate": 8.662473794549267e-05, "loss": 0.2814, "step": 320 }, { "epoch": 0.6918238993710691, "grad_norm": 18.59223175048828, "learning_rate": 8.620545073375263e-05, "loss": 0.2998, "step": 330 }, { "epoch": 0.7127882599580713, "grad_norm": 4.966686725616455, "learning_rate": 8.578616352201259e-05, "loss": 0.282, "step": 340 }, { "epoch": 0.7337526205450734, "grad_norm": 6.274560451507568, "learning_rate": 8.536687631027254e-05, "loss": 0.3123, "step": 350 }, { "epoch": 0.7547169811320755, "grad_norm": 5.680440902709961, "learning_rate": 8.49475890985325e-05, "loss": 0.2915, "step": 360 }, { "epoch": 0.7756813417190775, "grad_norm": 5.426858425140381, "learning_rate": 8.452830188679246e-05, "loss": 0.3192, "step": 370 }, { "epoch": 0.7966457023060797, "grad_norm": 5.293428421020508, "learning_rate": 8.410901467505241e-05, "loss": 0.3207, "step": 380 }, { "epoch": 0.8176100628930818, "grad_norm": 5.778203964233398, "learning_rate": 8.368972746331237e-05, "loss": 0.3202, "step": 390 }, { "epoch": 0.8385744234800838, "grad_norm": 5.326625347137451, "learning_rate": 8.327044025157233e-05, "loss": 0.3088, "step": 400 }, { "epoch": 0.859538784067086, "grad_norm": 5.26389217376709, "learning_rate": 8.285115303983228e-05, "loss": 0.3084, "step": 410 }, { "epoch": 0.8805031446540881, "grad_norm": 5.999328136444092, "learning_rate": 8.243186582809224e-05, "loss": 0.2822, "step": 420 }, { "epoch": 0.9014675052410901, "grad_norm": 5.0098958015441895, "learning_rate": 8.20125786163522e-05, "loss": 0.307, "step": 430 }, { "epoch": 0.9224318658280922, "grad_norm": 6.384605407714844, "learning_rate": 8.159329140461215e-05, "loss": 0.2976, "step": 440 }, { "epoch": 0.9433962264150944, "grad_norm": 9.32125186920166, "learning_rate": 8.117400419287212e-05, "loss": 0.3198, "step": 450 }, { "epoch": 0.9643605870020965, "grad_norm": 4.630229949951172, "learning_rate": 8.075471698113208e-05, "loss": 0.3031, "step": 460 }, { "epoch": 0.9853249475890985, "grad_norm": 4.775624752044678, "learning_rate": 8.033542976939204e-05, "loss": 0.2676, "step": 470 }, { "epoch": 1.0, "eval_loss": 0.6731473207473755, "eval_runtime": 20.7096, "eval_samples_per_second": 72.43, "eval_steps_per_second": 9.078, "step": 477 }, { "epoch": 1.0062893081761006, "grad_norm": 1.9550973176956177, "learning_rate": 7.9916142557652e-05, "loss": 0.2451, "step": 480 }, { "epoch": 1.0272536687631026, "grad_norm": 3.6781623363494873, "learning_rate": 7.949685534591196e-05, "loss": 0.1127, "step": 490 }, { "epoch": 1.0482180293501049, "grad_norm": 3.6007821559906006, "learning_rate": 7.907756813417191e-05, "loss": 0.1089, "step": 500 }, { "epoch": 1.069182389937107, "grad_norm": 3.0092623233795166, "learning_rate": 7.865828092243187e-05, "loss": 0.1203, "step": 510 }, { "epoch": 1.090146750524109, "grad_norm": 2.852006196975708, "learning_rate": 7.823899371069184e-05, "loss": 0.1126, "step": 520 }, { "epoch": 1.1111111111111112, "grad_norm": 3.5420737266540527, "learning_rate": 7.781970649895178e-05, "loss": 0.1254, "step": 530 }, { "epoch": 1.1320754716981132, "grad_norm": 3.2206077575683594, "learning_rate": 7.740041928721175e-05, "loss": 0.1328, "step": 540 }, { "epoch": 1.1530398322851152, "grad_norm": 3.895273447036743, "learning_rate": 7.698113207547171e-05, "loss": 0.125, "step": 550 }, { "epoch": 1.1740041928721174, "grad_norm": 3.2703685760498047, "learning_rate": 7.656184486373166e-05, "loss": 0.1306, "step": 560 }, { "epoch": 1.1949685534591195, "grad_norm": 2.977438449859619, "learning_rate": 7.614255765199162e-05, "loss": 0.1261, "step": 570 }, { "epoch": 1.2159329140461215, "grad_norm": 3.917140007019043, "learning_rate": 7.572327044025158e-05, "loss": 0.1232, "step": 580 }, { "epoch": 1.2368972746331237, "grad_norm": 4.090723514556885, "learning_rate": 7.530398322851153e-05, "loss": 0.1374, "step": 590 }, { "epoch": 1.2578616352201257, "grad_norm": 3.666203737258911, "learning_rate": 7.488469601677149e-05, "loss": 0.1317, "step": 600 }, { "epoch": 1.2788259958071277, "grad_norm": 4.610381126403809, "learning_rate": 7.446540880503144e-05, "loss": 0.1276, "step": 610 }, { "epoch": 1.29979035639413, "grad_norm": 5.928828716278076, "learning_rate": 7.40461215932914e-05, "loss": 0.1258, "step": 620 }, { "epoch": 1.320754716981132, "grad_norm": 4.312713623046875, "learning_rate": 7.362683438155136e-05, "loss": 0.1318, "step": 630 }, { "epoch": 1.3417190775681342, "grad_norm": 3.831658363342285, "learning_rate": 7.320754716981132e-05, "loss": 0.1217, "step": 640 }, { "epoch": 1.3626834381551363, "grad_norm": 2.7465438842773438, "learning_rate": 7.278825995807129e-05, "loss": 0.1157, "step": 650 }, { "epoch": 1.3836477987421385, "grad_norm": 2.915738582611084, "learning_rate": 7.236897274633125e-05, "loss": 0.1414, "step": 660 }, { "epoch": 1.4046121593291405, "grad_norm": 3.2107832431793213, "learning_rate": 7.19496855345912e-05, "loss": 0.1253, "step": 670 }, { "epoch": 1.4255765199161425, "grad_norm": 3.7944188117980957, "learning_rate": 7.153039832285116e-05, "loss": 0.1249, "step": 680 }, { "epoch": 1.4465408805031448, "grad_norm": 3.0400636196136475, "learning_rate": 7.111111111111112e-05, "loss": 0.1323, "step": 690 }, { "epoch": 1.4675052410901468, "grad_norm": 3.911773443222046, "learning_rate": 7.069182389937107e-05, "loss": 0.1265, "step": 700 }, { "epoch": 1.4884696016771488, "grad_norm": 4.3404765129089355, "learning_rate": 7.027253668763103e-05, "loss": 0.1328, "step": 710 }, { "epoch": 1.509433962264151, "grad_norm": 3.5862231254577637, "learning_rate": 6.985324947589099e-05, "loss": 0.1336, "step": 720 }, { "epoch": 1.530398322851153, "grad_norm": 4.099681377410889, "learning_rate": 6.943396226415094e-05, "loss": 0.1345, "step": 730 }, { "epoch": 1.551362683438155, "grad_norm": 4.485490322113037, "learning_rate": 6.90146750524109e-05, "loss": 0.1368, "step": 740 }, { "epoch": 1.5723270440251573, "grad_norm": 3.4771294593811035, "learning_rate": 6.859538784067086e-05, "loss": 0.1197, "step": 750 }, { "epoch": 1.5932914046121593, "grad_norm": 3.602762460708618, "learning_rate": 6.817610062893081e-05, "loss": 0.125, "step": 760 }, { "epoch": 1.6142557651991614, "grad_norm": 3.6131410598754883, "learning_rate": 6.775681341719077e-05, "loss": 0.1363, "step": 770 }, { "epoch": 1.6352201257861636, "grad_norm": 4.117647647857666, "learning_rate": 6.733752620545074e-05, "loss": 0.1289, "step": 780 }, { "epoch": 1.6561844863731656, "grad_norm": 3.723130941390991, "learning_rate": 6.691823899371068e-05, "loss": 0.1319, "step": 790 }, { "epoch": 1.6771488469601676, "grad_norm": 3.42946195602417, "learning_rate": 6.649895178197065e-05, "loss": 0.1449, "step": 800 }, { "epoch": 1.6981132075471699, "grad_norm": 3.705895185470581, "learning_rate": 6.607966457023061e-05, "loss": 0.1446, "step": 810 }, { "epoch": 1.719077568134172, "grad_norm": 4.407285213470459, "learning_rate": 6.566037735849057e-05, "loss": 0.1417, "step": 820 }, { "epoch": 1.740041928721174, "grad_norm": 3.557760000228882, "learning_rate": 6.524109014675053e-05, "loss": 0.1338, "step": 830 }, { "epoch": 1.7610062893081762, "grad_norm": 3.342160224914551, "learning_rate": 6.48218029350105e-05, "loss": 0.1439, "step": 840 }, { "epoch": 1.7819706498951782, "grad_norm": 3.4098939895629883, "learning_rate": 6.440251572327044e-05, "loss": 0.1444, "step": 850 }, { "epoch": 1.8029350104821802, "grad_norm": 3.757795572280884, "learning_rate": 6.39832285115304e-05, "loss": 0.1327, "step": 860 }, { "epoch": 1.8238993710691824, "grad_norm": 3.3094663619995117, "learning_rate": 6.356394129979037e-05, "loss": 0.1338, "step": 870 }, { "epoch": 1.8448637316561844, "grad_norm": 3.925997495651245, "learning_rate": 6.314465408805031e-05, "loss": 0.1391, "step": 880 }, { "epoch": 1.8658280922431865, "grad_norm": 3.38191819190979, "learning_rate": 6.272536687631028e-05, "loss": 0.1396, "step": 890 }, { "epoch": 1.8867924528301887, "grad_norm": 3.194735288619995, "learning_rate": 6.230607966457024e-05, "loss": 0.1304, "step": 900 }, { "epoch": 1.9077568134171907, "grad_norm": 3.6266653537750244, "learning_rate": 6.188679245283019e-05, "loss": 0.1259, "step": 910 }, { "epoch": 1.9287211740041927, "grad_norm": 4.087035655975342, "learning_rate": 6.146750524109015e-05, "loss": 0.145, "step": 920 }, { "epoch": 1.949685534591195, "grad_norm": 3.725505828857422, "learning_rate": 6.104821802935011e-05, "loss": 0.1291, "step": 930 }, { "epoch": 1.9706498951781972, "grad_norm": 3.440652847290039, "learning_rate": 6.0628930817610065e-05, "loss": 0.1348, "step": 940 }, { "epoch": 1.991614255765199, "grad_norm": 3.5756237506866455, "learning_rate": 6.020964360587003e-05, "loss": 0.1535, "step": 950 }, { "epoch": 2.0, "eval_loss": 0.6702887415885925, "eval_runtime": 20.8625, "eval_samples_per_second": 71.9, "eval_steps_per_second": 9.011, "step": 954 }, { "epoch": 2.0125786163522013, "grad_norm": 1.9692338705062866, "learning_rate": 5.979035639412999e-05, "loss": 0.0812, "step": 960 }, { "epoch": 2.0335429769392035, "grad_norm": 2.1989524364471436, "learning_rate": 5.937106918238994e-05, "loss": 0.0489, "step": 970 }, { "epoch": 2.0545073375262053, "grad_norm": 1.9158607721328735, "learning_rate": 5.89517819706499e-05, "loss": 0.0465, "step": 980 }, { "epoch": 2.0754716981132075, "grad_norm": 1.7822378873825073, "learning_rate": 5.853249475890986e-05, "loss": 0.049, "step": 990 }, { "epoch": 2.0964360587002098, "grad_norm": 2.644869089126587, "learning_rate": 5.811320754716981e-05, "loss": 0.0545, "step": 1000 }, { "epoch": 2.1174004192872116, "grad_norm": 2.888030529022217, "learning_rate": 5.769392033542977e-05, "loss": 0.0563, "step": 1010 }, { "epoch": 2.138364779874214, "grad_norm": 2.034641742706299, "learning_rate": 5.727463312368973e-05, "loss": 0.0527, "step": 1020 }, { "epoch": 2.159329140461216, "grad_norm": 1.8326114416122437, "learning_rate": 5.685534591194969e-05, "loss": 0.0509, "step": 1030 }, { "epoch": 2.180293501048218, "grad_norm": 2.1253228187561035, "learning_rate": 5.643605870020965e-05, "loss": 0.0533, "step": 1040 }, { "epoch": 2.20125786163522, "grad_norm": 2.369814872741699, "learning_rate": 5.60167714884696e-05, "loss": 0.0535, "step": 1050 }, { "epoch": 2.2222222222222223, "grad_norm": 2.15043568611145, "learning_rate": 5.559748427672956e-05, "loss": 0.0525, "step": 1060 }, { "epoch": 2.243186582809224, "grad_norm": 2.138932943344116, "learning_rate": 5.517819706498952e-05, "loss": 0.052, "step": 1070 }, { "epoch": 2.2641509433962264, "grad_norm": 2.2840254306793213, "learning_rate": 5.475890985324947e-05, "loss": 0.0582, "step": 1080 }, { "epoch": 2.2851153039832286, "grad_norm": 1.516945481300354, "learning_rate": 5.433962264150943e-05, "loss": 0.0557, "step": 1090 }, { "epoch": 2.3060796645702304, "grad_norm": 2.2309811115264893, "learning_rate": 5.3920335429769395e-05, "loss": 0.0563, "step": 1100 }, { "epoch": 2.3270440251572326, "grad_norm": 2.9257094860076904, "learning_rate": 5.350104821802935e-05, "loss": 0.056, "step": 1110 }, { "epoch": 2.348008385744235, "grad_norm": 2.112194538116455, "learning_rate": 5.308176100628931e-05, "loss": 0.0567, "step": 1120 }, { "epoch": 2.368972746331237, "grad_norm": 2.411298990249634, "learning_rate": 5.2662473794549274e-05, "loss": 0.0558, "step": 1130 }, { "epoch": 2.389937106918239, "grad_norm": 2.410545825958252, "learning_rate": 5.224318658280922e-05, "loss": 0.0557, "step": 1140 }, { "epoch": 2.410901467505241, "grad_norm": 2.1554999351501465, "learning_rate": 5.1823899371069184e-05, "loss": 0.0589, "step": 1150 }, { "epoch": 2.431865828092243, "grad_norm": 2.1737091541290283, "learning_rate": 5.1404612159329146e-05, "loss": 0.0673, "step": 1160 }, { "epoch": 2.452830188679245, "grad_norm": 3.798069477081299, "learning_rate": 5.0985324947589094e-05, "loss": 0.0546, "step": 1170 }, { "epoch": 2.4737945492662474, "grad_norm": 2.5850603580474854, "learning_rate": 5.0566037735849056e-05, "loss": 0.0546, "step": 1180 }, { "epoch": 2.4947589098532497, "grad_norm": 2.899369239807129, "learning_rate": 5.014675052410902e-05, "loss": 0.0576, "step": 1190 }, { "epoch": 2.5157232704402515, "grad_norm": 2.534893035888672, "learning_rate": 4.972746331236898e-05, "loss": 0.0529, "step": 1200 }, { "epoch": 2.5366876310272537, "grad_norm": 4.3690924644470215, "learning_rate": 4.9308176100628935e-05, "loss": 0.0494, "step": 1210 }, { "epoch": 2.5576519916142555, "grad_norm": 1.8138058185577393, "learning_rate": 4.888888888888889e-05, "loss": 0.0559, "step": 1220 }, { "epoch": 2.5786163522012577, "grad_norm": 1.9707310199737549, "learning_rate": 4.846960167714885e-05, "loss": 0.0544, "step": 1230 }, { "epoch": 2.59958071278826, "grad_norm": 2.480036973953247, "learning_rate": 4.805031446540881e-05, "loss": 0.0591, "step": 1240 }, { "epoch": 2.620545073375262, "grad_norm": 3.3198599815368652, "learning_rate": 4.763102725366876e-05, "loss": 0.0519, "step": 1250 }, { "epoch": 2.641509433962264, "grad_norm": 4.302845478057861, "learning_rate": 4.7211740041928724e-05, "loss": 0.0588, "step": 1260 }, { "epoch": 2.6624737945492662, "grad_norm": 2.769380807876587, "learning_rate": 4.679245283018868e-05, "loss": 0.0561, "step": 1270 }, { "epoch": 2.6834381551362685, "grad_norm": 2.0273640155792236, "learning_rate": 4.637316561844864e-05, "loss": 0.0531, "step": 1280 }, { "epoch": 2.7044025157232703, "grad_norm": 3.1066033840179443, "learning_rate": 4.59538784067086e-05, "loss": 0.0547, "step": 1290 }, { "epoch": 2.7253668763102725, "grad_norm": 3.7857868671417236, "learning_rate": 4.553459119496856e-05, "loss": 0.0552, "step": 1300 }, { "epoch": 2.7463312368972748, "grad_norm": 2.8125133514404297, "learning_rate": 4.511530398322851e-05, "loss": 0.0592, "step": 1310 }, { "epoch": 2.767295597484277, "grad_norm": 1.8335483074188232, "learning_rate": 4.469601677148847e-05, "loss": 0.0576, "step": 1320 }, { "epoch": 2.788259958071279, "grad_norm": 2.8290367126464844, "learning_rate": 4.427672955974843e-05, "loss": 0.0566, "step": 1330 }, { "epoch": 2.809224318658281, "grad_norm": 2.31840443611145, "learning_rate": 4.3857442348008385e-05, "loss": 0.0504, "step": 1340 }, { "epoch": 2.830188679245283, "grad_norm": 2.372063159942627, "learning_rate": 4.343815513626835e-05, "loss": 0.0591, "step": 1350 }, { "epoch": 2.851153039832285, "grad_norm": 2.2945821285247803, "learning_rate": 4.301886792452831e-05, "loss": 0.0571, "step": 1360 }, { "epoch": 2.8721174004192873, "grad_norm": 1.8437625169754028, "learning_rate": 4.2599580712788264e-05, "loss": 0.0544, "step": 1370 }, { "epoch": 2.8930817610062896, "grad_norm": 2.612316608428955, "learning_rate": 4.218029350104822e-05, "loss": 0.0517, "step": 1380 }, { "epoch": 2.9140461215932913, "grad_norm": 2.7713685035705566, "learning_rate": 4.176100628930818e-05, "loss": 0.0565, "step": 1390 }, { "epoch": 2.9350104821802936, "grad_norm": 2.0892138481140137, "learning_rate": 4.1341719077568136e-05, "loss": 0.0591, "step": 1400 }, { "epoch": 2.9559748427672954, "grad_norm": 1.9455903768539429, "learning_rate": 4.092243186582809e-05, "loss": 0.0494, "step": 1410 }, { "epoch": 2.9769392033542976, "grad_norm": 3.191281318664551, "learning_rate": 4.050314465408805e-05, "loss": 0.0562, "step": 1420 }, { "epoch": 2.9979035639413, "grad_norm": 2.252624034881592, "learning_rate": 4.008385744234801e-05, "loss": 0.053, "step": 1430 }, { "epoch": 3.0, "eval_loss": 0.667631983757019, "eval_runtime": 22.0522, "eval_samples_per_second": 68.02, "eval_steps_per_second": 8.525, "step": 1431 }, { "epoch": 3.018867924528302, "grad_norm": 1.2633923292160034, "learning_rate": 3.966457023060797e-05, "loss": 0.025, "step": 1440 }, { "epoch": 3.039832285115304, "grad_norm": 1.2608442306518555, "learning_rate": 3.924528301886793e-05, "loss": 0.0191, "step": 1450 }, { "epoch": 3.060796645702306, "grad_norm": 3.670048475265503, "learning_rate": 3.882599580712789e-05, "loss": 0.0231, "step": 1460 }, { "epoch": 3.0817610062893084, "grad_norm": 3.2211225032806396, "learning_rate": 3.840670859538784e-05, "loss": 0.0225, "step": 1470 }, { "epoch": 3.10272536687631, "grad_norm": 1.0529983043670654, "learning_rate": 3.7987421383647804e-05, "loss": 0.0242, "step": 1480 }, { "epoch": 3.1236897274633124, "grad_norm": 1.5843929052352905, "learning_rate": 3.756813417190776e-05, "loss": 0.0199, "step": 1490 }, { "epoch": 3.1446540880503147, "grad_norm": 1.28718900680542, "learning_rate": 3.7148846960167714e-05, "loss": 0.022, "step": 1500 }, { "epoch": 3.1656184486373165, "grad_norm": 1.1010375022888184, "learning_rate": 3.672955974842767e-05, "loss": 0.0185, "step": 1510 }, { "epoch": 3.1865828092243187, "grad_norm": 1.0108301639556885, "learning_rate": 3.631027253668763e-05, "loss": 0.0203, "step": 1520 }, { "epoch": 3.207547169811321, "grad_norm": 1.856557846069336, "learning_rate": 3.589098532494759e-05, "loss": 0.0188, "step": 1530 }, { "epoch": 3.2285115303983227, "grad_norm": 1.2554458379745483, "learning_rate": 3.547169811320755e-05, "loss": 0.0217, "step": 1540 }, { "epoch": 3.249475890985325, "grad_norm": 1.5458911657333374, "learning_rate": 3.505241090146751e-05, "loss": 0.0217, "step": 1550 }, { "epoch": 3.270440251572327, "grad_norm": 1.8719966411590576, "learning_rate": 3.4633123689727465e-05, "loss": 0.0239, "step": 1560 }, { "epoch": 3.291404612159329, "grad_norm": 0.8294363617897034, "learning_rate": 3.421383647798742e-05, "loss": 0.0174, "step": 1570 }, { "epoch": 3.3123689727463312, "grad_norm": 1.013922095298767, "learning_rate": 3.379454926624738e-05, "loss": 0.0191, "step": 1580 }, { "epoch": 3.3333333333333335, "grad_norm": 2.7998366355895996, "learning_rate": 3.337526205450734e-05, "loss": 0.0211, "step": 1590 }, { "epoch": 3.3542976939203353, "grad_norm": 1.3313286304473877, "learning_rate": 3.295597484276729e-05, "loss": 0.0214, "step": 1600 }, { "epoch": 3.3752620545073375, "grad_norm": 1.3300907611846924, "learning_rate": 3.2536687631027254e-05, "loss": 0.0229, "step": 1610 }, { "epoch": 3.3962264150943398, "grad_norm": 1.3926533460617065, "learning_rate": 3.2117400419287216e-05, "loss": 0.0192, "step": 1620 }, { "epoch": 3.4171907756813416, "grad_norm": 1.8201309442520142, "learning_rate": 3.169811320754717e-05, "loss": 0.0194, "step": 1630 }, { "epoch": 3.438155136268344, "grad_norm": 1.1363880634307861, "learning_rate": 3.127882599580713e-05, "loss": 0.0196, "step": 1640 }, { "epoch": 3.459119496855346, "grad_norm": 1.1997497081756592, "learning_rate": 3.085953878406709e-05, "loss": 0.0192, "step": 1650 }, { "epoch": 3.480083857442348, "grad_norm": 1.1632133722305298, "learning_rate": 3.0440251572327043e-05, "loss": 0.0165, "step": 1660 }, { "epoch": 3.50104821802935, "grad_norm": 1.141566514968872, "learning_rate": 3.0020964360587005e-05, "loss": 0.0182, "step": 1670 }, { "epoch": 3.5220125786163523, "grad_norm": 1.4071044921875, "learning_rate": 2.9601677148846964e-05, "loss": 0.0177, "step": 1680 }, { "epoch": 3.5429769392033545, "grad_norm": 0.6424669623374939, "learning_rate": 2.918238993710692e-05, "loss": 0.018, "step": 1690 }, { "epoch": 3.5639412997903563, "grad_norm": 1.240339756011963, "learning_rate": 2.8763102725366874e-05, "loss": 0.0181, "step": 1700 }, { "epoch": 3.5849056603773586, "grad_norm": 1.8148901462554932, "learning_rate": 2.8343815513626836e-05, "loss": 0.0203, "step": 1710 }, { "epoch": 3.6058700209643604, "grad_norm": 2.1308140754699707, "learning_rate": 2.7924528301886794e-05, "loss": 0.0208, "step": 1720 }, { "epoch": 3.6268343815513626, "grad_norm": 1.0319384336471558, "learning_rate": 2.750524109014675e-05, "loss": 0.0171, "step": 1730 }, { "epoch": 3.647798742138365, "grad_norm": 1.056301236152649, "learning_rate": 2.708595387840671e-05, "loss": 0.0193, "step": 1740 }, { "epoch": 3.668763102725367, "grad_norm": 1.0502632856369019, "learning_rate": 2.6666666666666667e-05, "loss": 0.0181, "step": 1750 }, { "epoch": 3.689727463312369, "grad_norm": 1.5024515390396118, "learning_rate": 2.6247379454926625e-05, "loss": 0.0194, "step": 1760 }, { "epoch": 3.710691823899371, "grad_norm": 1.1332629919052124, "learning_rate": 2.5828092243186587e-05, "loss": 0.0154, "step": 1770 }, { "epoch": 3.731656184486373, "grad_norm": 1.3538895845413208, "learning_rate": 2.5408805031446542e-05, "loss": 0.0174, "step": 1780 }, { "epoch": 3.752620545073375, "grad_norm": 3.9446728229522705, "learning_rate": 2.49895178197065e-05, "loss": 0.0174, "step": 1790 }, { "epoch": 3.7735849056603774, "grad_norm": 1.6350576877593994, "learning_rate": 2.4570230607966456e-05, "loss": 0.0211, "step": 1800 }, { "epoch": 3.7945492662473796, "grad_norm": 0.998974621295929, "learning_rate": 2.4150943396226418e-05, "loss": 0.0185, "step": 1810 }, { "epoch": 3.8155136268343814, "grad_norm": 1.1701918840408325, "learning_rate": 2.3731656184486376e-05, "loss": 0.019, "step": 1820 }, { "epoch": 3.8364779874213837, "grad_norm": 1.005288004875183, "learning_rate": 2.331236897274633e-05, "loss": 0.0169, "step": 1830 }, { "epoch": 3.8574423480083855, "grad_norm": 1.4519301652908325, "learning_rate": 2.289308176100629e-05, "loss": 0.0191, "step": 1840 }, { "epoch": 3.8784067085953877, "grad_norm": 0.8834218382835388, "learning_rate": 2.2473794549266248e-05, "loss": 0.0171, "step": 1850 }, { "epoch": 3.89937106918239, "grad_norm": 1.3449293375015259, "learning_rate": 2.2054507337526207e-05, "loss": 0.0149, "step": 1860 }, { "epoch": 3.920335429769392, "grad_norm": 1.1032202243804932, "learning_rate": 2.1635220125786165e-05, "loss": 0.0175, "step": 1870 }, { "epoch": 3.941299790356394, "grad_norm": 0.9004182815551758, "learning_rate": 2.121593291404612e-05, "loss": 0.0152, "step": 1880 }, { "epoch": 3.9622641509433962, "grad_norm": 1.1716896295547485, "learning_rate": 2.0796645702306082e-05, "loss": 0.0189, "step": 1890 }, { "epoch": 3.9832285115303985, "grad_norm": 1.6147270202636719, "learning_rate": 2.037735849056604e-05, "loss": 0.0183, "step": 1900 }, { "epoch": 4.0, "eval_loss": 0.6632634401321411, "eval_runtime": 22.7298, "eval_samples_per_second": 65.993, "eval_steps_per_second": 8.271, "step": 1908 }, { "epoch": 4.0041928721174, "grad_norm": 0.37951168417930603, "learning_rate": 1.9958071278825996e-05, "loss": 0.0163, "step": 1910 }, { "epoch": 4.0251572327044025, "grad_norm": 0.3050394654273987, "learning_rate": 1.9538784067085954e-05, "loss": 0.007, "step": 1920 }, { "epoch": 4.046121593291405, "grad_norm": 0.34913668036460876, "learning_rate": 1.9119496855345913e-05, "loss": 0.0064, "step": 1930 }, { "epoch": 4.067085953878407, "grad_norm": 0.36414623260498047, "learning_rate": 1.870020964360587e-05, "loss": 0.0067, "step": 1940 }, { "epoch": 4.088050314465409, "grad_norm": 0.5772935152053833, "learning_rate": 1.828092243186583e-05, "loss": 0.0061, "step": 1950 }, { "epoch": 4.109014675052411, "grad_norm": 0.2230810970067978, "learning_rate": 1.7861635220125785e-05, "loss": 0.0059, "step": 1960 }, { "epoch": 4.129979035639413, "grad_norm": 0.46709367632865906, "learning_rate": 1.7442348008385743e-05, "loss": 0.0063, "step": 1970 }, { "epoch": 4.150943396226415, "grad_norm": 0.40340206027030945, "learning_rate": 1.7023060796645705e-05, "loss": 0.006, "step": 1980 }, { "epoch": 4.171907756813417, "grad_norm": 0.22066733241081238, "learning_rate": 1.660377358490566e-05, "loss": 0.0055, "step": 1990 }, { "epoch": 4.1928721174004195, "grad_norm": 0.22712306678295135, "learning_rate": 1.618448637316562e-05, "loss": 0.0075, "step": 2000 }, { "epoch": 4.213836477987422, "grad_norm": 0.3246201276779175, "learning_rate": 1.5765199161425577e-05, "loss": 0.0059, "step": 2010 }, { "epoch": 4.234800838574423, "grad_norm": 0.6034521460533142, "learning_rate": 1.5345911949685536e-05, "loss": 0.0058, "step": 2020 }, { "epoch": 4.255765199161425, "grad_norm": 0.3003758490085602, "learning_rate": 1.4926624737945494e-05, "loss": 0.0059, "step": 2030 }, { "epoch": 4.276729559748428, "grad_norm": 0.38291364908218384, "learning_rate": 1.450733752620545e-05, "loss": 0.0053, "step": 2040 }, { "epoch": 4.29769392033543, "grad_norm": 0.24918225407600403, "learning_rate": 1.408805031446541e-05, "loss": 0.0055, "step": 2050 }, { "epoch": 4.318658280922432, "grad_norm": 0.29115813970565796, "learning_rate": 1.3668763102725368e-05, "loss": 0.0057, "step": 2060 }, { "epoch": 4.339622641509434, "grad_norm": 0.6018015146255493, "learning_rate": 1.3249475890985325e-05, "loss": 0.0056, "step": 2070 }, { "epoch": 4.360587002096436, "grad_norm": 0.4561573565006256, "learning_rate": 1.2830188679245283e-05, "loss": 0.0053, "step": 2080 }, { "epoch": 4.381551362683438, "grad_norm": 0.2961859703063965, "learning_rate": 1.2410901467505242e-05, "loss": 0.0058, "step": 2090 }, { "epoch": 4.40251572327044, "grad_norm": 0.7806200385093689, "learning_rate": 1.19916142557652e-05, "loss": 0.0051, "step": 2100 }, { "epoch": 4.423480083857442, "grad_norm": 0.2288259118795395, "learning_rate": 1.1572327044025157e-05, "loss": 0.0062, "step": 2110 }, { "epoch": 4.444444444444445, "grad_norm": 0.5480152368545532, "learning_rate": 1.1153039832285116e-05, "loss": 0.0054, "step": 2120 }, { "epoch": 4.465408805031447, "grad_norm": 0.3903510272502899, "learning_rate": 1.0733752620545073e-05, "loss": 0.0068, "step": 2130 }, { "epoch": 4.486373165618448, "grad_norm": 0.26396581530570984, "learning_rate": 1.0314465408805033e-05, "loss": 0.0051, "step": 2140 }, { "epoch": 4.5073375262054505, "grad_norm": 0.3533041477203369, "learning_rate": 9.89517819706499e-06, "loss": 0.0062, "step": 2150 }, { "epoch": 4.528301886792453, "grad_norm": 0.23736946284770966, "learning_rate": 9.475890985324948e-06, "loss": 0.0056, "step": 2160 }, { "epoch": 4.549266247379455, "grad_norm": 0.7073323726654053, "learning_rate": 9.056603773584905e-06, "loss": 0.0054, "step": 2170 }, { "epoch": 4.570230607966457, "grad_norm": 0.22164012491703033, "learning_rate": 8.637316561844865e-06, "loss": 0.0055, "step": 2180 }, { "epoch": 4.591194968553459, "grad_norm": 0.27965155243873596, "learning_rate": 8.218029350104822e-06, "loss": 0.0048, "step": 2190 }, { "epoch": 4.612159329140461, "grad_norm": 0.7329670786857605, "learning_rate": 7.79874213836478e-06, "loss": 0.0057, "step": 2200 }, { "epoch": 4.633123689727463, "grad_norm": 0.310923308134079, "learning_rate": 7.379454926624739e-06, "loss": 0.0054, "step": 2210 }, { "epoch": 4.654088050314465, "grad_norm": 0.19455011188983917, "learning_rate": 6.9601677148846965e-06, "loss": 0.0072, "step": 2220 }, { "epoch": 4.6750524109014675, "grad_norm": 0.257545530796051, "learning_rate": 6.540880503144654e-06, "loss": 0.0057, "step": 2230 }, { "epoch": 4.69601677148847, "grad_norm": 1.0093029737472534, "learning_rate": 6.121593291404613e-06, "loss": 0.006, "step": 2240 }, { "epoch": 4.716981132075472, "grad_norm": 0.4301735758781433, "learning_rate": 5.70230607966457e-06, "loss": 0.0056, "step": 2250 }, { "epoch": 4.737945492662474, "grad_norm": 0.18497976660728455, "learning_rate": 5.283018867924529e-06, "loss": 0.0052, "step": 2260 }, { "epoch": 4.758909853249476, "grad_norm": 0.21521350741386414, "learning_rate": 4.8637316561844865e-06, "loss": 0.0055, "step": 2270 }, { "epoch": 4.779874213836478, "grad_norm": 0.20319782197475433, "learning_rate": 4.444444444444445e-06, "loss": 0.0058, "step": 2280 }, { "epoch": 4.80083857442348, "grad_norm": 0.2288380116224289, "learning_rate": 4.025157232704403e-06, "loss": 0.005, "step": 2290 }, { "epoch": 4.821802935010482, "grad_norm": 1.407477855682373, "learning_rate": 3.6058700209643607e-06, "loss": 0.0073, "step": 2300 }, { "epoch": 4.8427672955974845, "grad_norm": 0.26325657963752747, "learning_rate": 3.1865828092243184e-06, "loss": 0.0057, "step": 2310 }, { "epoch": 4.863731656184486, "grad_norm": 0.4354378283023834, "learning_rate": 2.767295597484277e-06, "loss": 0.006, "step": 2320 }, { "epoch": 4.884696016771488, "grad_norm": 0.7161938548088074, "learning_rate": 2.348008385744235e-06, "loss": 0.0055, "step": 2330 }, { "epoch": 4.90566037735849, "grad_norm": 1.1639270782470703, "learning_rate": 1.928721174004193e-06, "loss": 0.0056, "step": 2340 }, { "epoch": 4.926624737945493, "grad_norm": 0.3333396017551422, "learning_rate": 1.509433962264151e-06, "loss": 0.0061, "step": 2350 }, { "epoch": 4.947589098532495, "grad_norm": 0.3744851052761078, "learning_rate": 1.090146750524109e-06, "loss": 0.0056, "step": 2360 }, { "epoch": 4.968553459119497, "grad_norm": 0.1569022685289383, "learning_rate": 6.70859538784067e-07, "loss": 0.0052, "step": 2370 }, { "epoch": 4.989517819706499, "grad_norm": 0.30756279826164246, "learning_rate": 2.5157232704402517e-07, "loss": 0.0056, "step": 2380 }, { "epoch": 5.0, "eval_loss": 0.6631835699081421, "eval_runtime": 22.601, "eval_samples_per_second": 66.369, "eval_steps_per_second": 8.318, "step": 2385 } ], "logging_steps": 10, "max_steps": 2385, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8785440290816e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }