diff --git "a/checkpoint-1130/trainer_state.json" "b/checkpoint-1130/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1130/trainer_state.json" @@ -0,0 +1,7995 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9991154356479433, + "eval_steps": 142, + "global_step": 1130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.6921162605285645, + "learning_rate": 5e-05, + "loss": 3.3182, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.3362529277801514, + "eval_runtime": 14.4412, + "eval_samples_per_second": 33.031, + "eval_steps_per_second": 8.31, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.620742321014404, + "learning_rate": 0.0001, + "loss": 3.2788, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 4.650161266326904, + "learning_rate": 0.00015, + "loss": 3.2271, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 4.024933815002441, + "learning_rate": 0.0002, + "loss": 2.402, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 2.751981496810913, + "learning_rate": 0.00025, + "loss": 1.0544, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 1.4766970872879028, + "learning_rate": 0.0003, + "loss": 0.3549, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.8064658641815186, + "learning_rate": 0.00035, + "loss": 0.1533, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 2.232205390930176, + "learning_rate": 0.0004, + "loss": 0.31, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 1.1898847818374634, + "learning_rate": 0.00045000000000000004, + "loss": 0.1818, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.7394833564758301, + "learning_rate": 0.0005, + "loss": 0.1751, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.16317571699619293, + "learning_rate": 0.0004999995654799487, + "loss": 0.1411, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.10235322266817093, + "learning_rate": 0.0004999982619213052, + "loss": 0.1363, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.19907887279987335, + "learning_rate": 0.0004999960893286008, + "loss": 0.128, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.6823816299438477, + "learning_rate": 0.0004999930477093878, + "loss": 0.143, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.10187644511461258, + "learning_rate": 0.0004999891370742394, + "loss": 0.1322, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.09401004016399384, + "learning_rate": 0.0004999843574367498, + "loss": 0.1361, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.17946797609329224, + "learning_rate": 0.0004999787088135334, + "loss": 0.1412, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 0.890545666217804, + "learning_rate": 0.0004999721912242259, + "loss": 0.1593, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.434042751789093, + "learning_rate": 0.0004999648046914836, + "loss": 0.1548, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.28103551268577576, + "learning_rate": 0.0004999565492409831, + "loss": 0.1459, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.2690610885620117, + "learning_rate": 0.0004999474249014217, + "loss": 0.1248, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.37668731808662415, + "learning_rate": 0.0004999374317045172, + "loss": 0.1481, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.23762015998363495, + "learning_rate": 0.0004999265696850074, + "loss": 0.1407, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.1988176554441452, + "learning_rate": 0.0004999148388806504, + "loss": 0.1398, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.3805619776248932, + "learning_rate": 0.0004999022393322246, + "loss": 0.1474, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 0.5069771409034729, + "learning_rate": 0.0004998887710835278, + "loss": 0.1509, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.42066043615341187, + "learning_rate": 0.0004998744341813779, + "loss": 0.1341, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.0944904088973999, + "learning_rate": 0.0004998592286756123, + "loss": 0.1233, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 0.849244236946106, + "learning_rate": 0.0004998431546190875, + "loss": 0.1999, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.09785456210374832, + "learning_rate": 0.00049982621206768, + "loss": 0.1272, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.38225457072257996, + "learning_rate": 0.0004998084010802845, + "loss": 0.1634, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 0.08135183900594711, + "learning_rate": 0.0004997897217188149, + "loss": 0.1383, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.17299437522888184, + "learning_rate": 0.0004997701740482036, + "loss": 0.1427, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.11125747114419937, + "learning_rate": 0.0004997497581364015, + "loss": 0.1379, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.08914893865585327, + "learning_rate": 0.0004997284740543776, + "loss": 0.1388, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.034590039402246475, + "learning_rate": 0.0004997063218761188, + "loss": 0.1387, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.08675777167081833, + "learning_rate": 0.0004996833016786296, + "loss": 0.1384, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.4440009295940399, + "learning_rate": 0.0004996594135419318, + "loss": 0.152, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 0.0814109519124031, + "learning_rate": 0.0004996346575490646, + "loss": 0.1373, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 0.37724560499191284, + "learning_rate": 0.0004996090337860836, + "loss": 0.1362, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 0.21177273988723755, + "learning_rate": 0.0004995825423420613, + "loss": 0.1423, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 0.12168041616678238, + "learning_rate": 0.000499555183309086, + "loss": 0.1381, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.21096466481685638, + "learning_rate": 0.0004995269567822623, + "loss": 0.139, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 0.49395841360092163, + "learning_rate": 0.0004994978628597099, + "loss": 0.1016, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.1108216792345047, + "learning_rate": 0.0004994679016425642, + "loss": 0.1334, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.5518127679824829, + "learning_rate": 0.000499437073234975, + "loss": 0.1568, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 0.4762812852859497, + "learning_rate": 0.0004994053777441069, + "loss": 0.1543, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.3477722108364105, + "learning_rate": 0.0004993728152801384, + "loss": 0.1464, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 0.4996407628059387, + "learning_rate": 0.0004993393859562621, + "loss": 0.154, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.20425601303577423, + "learning_rate": 0.0004993050898886833, + "loss": 0.1372, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.034631408751010895, + "learning_rate": 0.000499269927196621, + "loss": 0.137, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.24027873575687408, + "learning_rate": 0.0004992338980023062, + "loss": 0.1468, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.22242723405361176, + "learning_rate": 0.000499197002430982, + "loss": 0.1418, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.6540514826774597, + "learning_rate": 0.0004991592406109036, + "loss": 0.1564, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.030118577182292938, + "learning_rate": 0.000499120612673337, + "loss": 0.1365, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.07544097304344177, + "learning_rate": 0.0004990811187525592, + "loss": 0.1334, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.37415480613708496, + "learning_rate": 0.0004990407589858572, + "loss": 0.155, + "step": 57 + }, + { + "epoch": 0.1, + "grad_norm": 0.557809054851532, + "learning_rate": 0.0004989995335135282, + "loss": 0.1603, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.14802873134613037, + "learning_rate": 0.0004989574424788787, + "loss": 0.1387, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.3581993281841278, + "learning_rate": 0.0004989144860282236, + "loss": 0.1475, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 0.04818522185087204, + "learning_rate": 0.0004988706643108864, + "loss": 0.1362, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.21908174455165863, + "learning_rate": 0.0004988259774791987, + "loss": 0.1386, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.1852695643901825, + "learning_rate": 0.0004987804256884988, + "loss": 0.1387, + "step": 63 + }, + { + "epoch": 0.11, + "grad_norm": 0.025747304782271385, + "learning_rate": 0.0004987340090971323, + "loss": 0.1393, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 0.045346710830926895, + "learning_rate": 0.0004986867278664504, + "loss": 0.1354, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.34946465492248535, + "learning_rate": 0.0004986385821608106, + "loss": 0.152, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.2552882432937622, + "learning_rate": 0.0004985895721475748, + "loss": 0.1463, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.0560542456805706, + "learning_rate": 0.0004985396979971099, + "loss": 0.1391, + "step": 68 + }, + { + "epoch": 0.12, + "grad_norm": 0.14347511529922485, + "learning_rate": 0.0004984889598827863, + "loss": 0.1353, + "step": 69 + }, + { + "epoch": 0.12, + "grad_norm": 0.12386342883110046, + "learning_rate": 0.0004984373579809778, + "loss": 0.1343, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 0.03070697747170925, + "learning_rate": 0.000498384892471061, + "loss": 0.1356, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.0531514473259449, + "learning_rate": 0.0004983315635354144, + "loss": 0.1346, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 0.24197503924369812, + "learning_rate": 0.0004982773713594178, + "loss": 0.1217, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.08417380601167679, + "learning_rate": 0.0004982223161314522, + "loss": 0.1223, + "step": 74 + }, + { + "epoch": 0.13, + "grad_norm": 0.40045711398124695, + "learning_rate": 0.000498166398042898, + "loss": 0.1513, + "step": 75 + }, + { + "epoch": 0.13, + "grad_norm": 0.12452740222215652, + "learning_rate": 0.0004981096172881358, + "loss": 0.1296, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 0.21590262651443481, + "learning_rate": 0.0004980519740645444, + "loss": 0.1375, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 0.07704459875822067, + "learning_rate": 0.0004979934685725011, + "loss": 0.1299, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.28334081172943115, + "learning_rate": 0.0004979341010153801, + "loss": 0.1387, + "step": 79 + }, + { + "epoch": 0.14, + "grad_norm": 0.12374007701873779, + "learning_rate": 0.0004978738715995526, + "loss": 0.1383, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 0.040613267570734024, + "learning_rate": 0.000497812780534386, + "loss": 0.1367, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.09974126517772675, + "learning_rate": 0.0004977508280322423, + "loss": 0.1248, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 0.2616259753704071, + "learning_rate": 0.0004976880143084786, + "loss": 0.1311, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.15635579824447632, + "learning_rate": 0.0004976243395814452, + "loss": 0.1189, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 0.259250670671463, + "learning_rate": 0.000497559804072486, + "loss": 0.1099, + "step": 85 + }, + { + "epoch": 0.15, + "grad_norm": 1.2778699398040771, + "learning_rate": 0.0004974944080059365, + "loss": 0.1416, + "step": 86 + }, + { + "epoch": 0.15, + "grad_norm": 0.2155281901359558, + "learning_rate": 0.000497428151609124, + "loss": 0.1253, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.17533721029758453, + "learning_rate": 0.0004973610351123664, + "loss": 0.1446, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 0.07620590180158615, + "learning_rate": 0.0004972930587489715, + "loss": 0.1309, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 0.2370779663324356, + "learning_rate": 0.0004972242227552358, + "loss": 0.149, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 0.06374065577983856, + "learning_rate": 0.0004971545273704444, + "loss": 0.1307, + "step": 91 + }, + { + "epoch": 0.16, + "grad_norm": 0.22728750109672546, + "learning_rate": 0.0004970839728368697, + "loss": 0.1438, + "step": 92 + }, + { + "epoch": 0.16, + "grad_norm": 0.16872233152389526, + "learning_rate": 0.0004970125593997706, + "loss": 0.1364, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 0.18773947656154633, + "learning_rate": 0.0004969402873073914, + "loss": 0.146, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.1468167006969452, + "learning_rate": 0.0004968671568109616, + "loss": 0.1401, + "step": 95 + }, + { + "epoch": 0.17, + "grad_norm": 0.09030504524707794, + "learning_rate": 0.0004967931681646948, + "loss": 0.1318, + "step": 96 + }, + { + "epoch": 0.17, + "grad_norm": 0.061796192079782486, + "learning_rate": 0.000496718321625787, + "loss": 0.1244, + "step": 97 + }, + { + "epoch": 0.17, + "grad_norm": 0.045495226979255676, + "learning_rate": 0.0004966426174544171, + "loss": 0.1265, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 0.08449025452136993, + "learning_rate": 0.0004965660559137448, + "loss": 0.1276, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 0.09982559829950333, + "learning_rate": 0.0004964886372699102, + "loss": 0.1253, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.05831208825111389, + "learning_rate": 0.0004964103617920332, + "loss": 0.1271, + "step": 101 + }, + { + "epoch": 0.18, + "grad_norm": 0.20548835396766663, + "learning_rate": 0.0004963312297522116, + "loss": 0.1415, + "step": 102 + }, + { + "epoch": 0.18, + "grad_norm": 0.09664470702409744, + "learning_rate": 0.0004962512414255214, + "loss": 0.1083, + "step": 103 + }, + { + "epoch": 0.18, + "grad_norm": 0.16931602358818054, + "learning_rate": 0.0004961703970900145, + "loss": 0.1431, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 0.10859667509794235, + "learning_rate": 0.000496088697026719, + "loss": 0.12, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 0.21958191692829132, + "learning_rate": 0.0004960061415196374, + "loss": 0.1492, + "step": 106 + }, + { + "epoch": 0.19, + "grad_norm": 0.06437578052282333, + "learning_rate": 0.0004959227308557459, + "loss": 0.1083, + "step": 107 + }, + { + "epoch": 0.19, + "grad_norm": 0.14975550770759583, + "learning_rate": 0.0004958384653249932, + "loss": 0.1155, + "step": 108 + }, + { + "epoch": 0.19, + "grad_norm": 0.11868852376937866, + "learning_rate": 0.0004957533452203, + "loss": 0.1237, + "step": 109 + }, + { + "epoch": 0.19, + "grad_norm": 0.2610260546207428, + "learning_rate": 0.0004956673708375574, + "loss": 0.1264, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.378467321395874, + "learning_rate": 0.000495580542475626, + "loss": 0.1222, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.0926096960902214, + "learning_rate": 0.0004954928604363352, + "loss": 0.1096, + "step": 112 + }, + { + "epoch": 0.2, + "grad_norm": 0.06858692318201065, + "learning_rate": 0.0004954043250244819, + "loss": 0.1144, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.3068992495536804, + "learning_rate": 0.0004953149365478293, + "loss": 0.1563, + "step": 114 + }, + { + "epoch": 0.2, + "grad_norm": 0.15458936989307404, + "learning_rate": 0.0004952246953171061, + "loss": 0.1216, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.10287577658891678, + "learning_rate": 0.0004951336016460053, + "loss": 0.0893, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.1215134710073471, + "learning_rate": 0.0004950416558511833, + "loss": 0.1016, + "step": 117 + }, + { + "epoch": 0.21, + "grad_norm": 0.1392650604248047, + "learning_rate": 0.000494948858252258, + "loss": 0.1111, + "step": 118 + }, + { + "epoch": 0.21, + "grad_norm": 0.4350431263446808, + "learning_rate": 0.0004948552091718092, + "loss": 0.1192, + "step": 119 + }, + { + "epoch": 0.21, + "grad_norm": 0.21448662877082825, + "learning_rate": 0.0004947607089353758, + "loss": 0.07, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 1.6086686849594116, + "learning_rate": 0.0004946653578714559, + "loss": 0.1352, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.25963085889816284, + "learning_rate": 0.0004945691563115051, + "loss": 0.1447, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.11575956642627716, + "learning_rate": 0.0004944721045899356, + "loss": 0.1055, + "step": 123 + }, + { + "epoch": 0.22, + "grad_norm": 0.11230157315731049, + "learning_rate": 0.0004943742030441145, + "loss": 0.0917, + "step": 124 + }, + { + "epoch": 0.22, + "grad_norm": 0.3376341760158539, + "learning_rate": 0.0004942754520143634, + "loss": 0.1364, + "step": 125 + }, + { + "epoch": 0.22, + "grad_norm": 0.2757412791252136, + "learning_rate": 0.0004941758518439566, + "loss": 0.1418, + "step": 126 + }, + { + "epoch": 0.22, + "grad_norm": 0.1438644975423813, + "learning_rate": 0.0004940754028791205, + "loss": 0.1162, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.14210884273052216, + "learning_rate": 0.0004939741054690316, + "loss": 0.1312, + "step": 128 + }, + { + "epoch": 0.23, + "grad_norm": 0.1861649751663208, + "learning_rate": 0.0004938719599658162, + "loss": 0.1447, + "step": 129 + }, + { + "epoch": 0.23, + "grad_norm": 0.19665485620498657, + "learning_rate": 0.0004937689667245481, + "loss": 0.1439, + "step": 130 + }, + { + "epoch": 0.23, + "grad_norm": 0.22447055578231812, + "learning_rate": 0.0004936651261032486, + "loss": 0.1568, + "step": 131 + }, + { + "epoch": 0.23, + "grad_norm": 0.10008269548416138, + "learning_rate": 0.0004935604384628843, + "loss": 0.1081, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.0549234002828598, + "learning_rate": 0.0004934549041673661, + "loss": 0.1216, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.11616212874650955, + "learning_rate": 0.0004933485235835483, + "loss": 0.1108, + "step": 134 + }, + { + "epoch": 0.24, + "grad_norm": 0.08554813265800476, + "learning_rate": 0.0004932412970812269, + "loss": 0.135, + "step": 135 + }, + { + "epoch": 0.24, + "grad_norm": 0.08642842620611191, + "learning_rate": 0.0004931332250331382, + "loss": 0.1205, + "step": 136 + }, + { + "epoch": 0.24, + "grad_norm": 0.20417262613773346, + "learning_rate": 0.0004930243078149582, + "loss": 0.1169, + "step": 137 + }, + { + "epoch": 0.24, + "grad_norm": 0.11088764667510986, + "learning_rate": 0.0004929145458053005, + "loss": 0.1014, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.3510516881942749, + "learning_rate": 0.0004928039393857155, + "loss": 0.0967, + "step": 139 + }, + { + "epoch": 0.25, + "grad_norm": 0.2401883453130722, + "learning_rate": 0.0004926924889406888, + "loss": 0.106, + "step": 140 + }, + { + "epoch": 0.25, + "grad_norm": 0.28403300046920776, + "learning_rate": 0.0004925801948576402, + "loss": 0.079, + "step": 141 + }, + { + "epoch": 0.25, + "grad_norm": 0.46027252078056335, + "learning_rate": 0.0004924670575269217, + "loss": 0.0899, + "step": 142 + }, + { + "epoch": 0.25, + "eval_loss": 0.09421269595623016, + "eval_runtime": 14.7696, + "eval_samples_per_second": 32.296, + "eval_steps_per_second": 8.125, + "step": 142 + }, + { + "epoch": 0.25, + "grad_norm": 0.29767730832099915, + "learning_rate": 0.0004923530773418169, + "loss": 0.1265, + "step": 143 + }, + { + "epoch": 0.25, + "grad_norm": 0.37391072511672974, + "learning_rate": 0.0004922382546985394, + "loss": 0.1244, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.8874172568321228, + "learning_rate": 0.0004921225899962308, + "loss": 0.1796, + "step": 145 + }, + { + "epoch": 0.26, + "grad_norm": 0.2554258704185486, + "learning_rate": 0.0004920060836369603, + "loss": 0.0528, + "step": 146 + }, + { + "epoch": 0.26, + "grad_norm": 0.1981816440820694, + "learning_rate": 0.0004918887360257228, + "loss": 0.1159, + "step": 147 + }, + { + "epoch": 0.26, + "grad_norm": 0.14500874280929565, + "learning_rate": 0.0004917705475704373, + "loss": 0.0992, + "step": 148 + }, + { + "epoch": 0.26, + "grad_norm": 0.1315220594406128, + "learning_rate": 0.000491651518681946, + "loss": 0.1248, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.0798826813697815, + "learning_rate": 0.0004915316497740121, + "loss": 0.1151, + "step": 150 + }, + { + "epoch": 0.27, + "grad_norm": 0.10213784873485565, + "learning_rate": 0.0004914109412633194, + "loss": 0.1098, + "step": 151 + }, + { + "epoch": 0.27, + "grad_norm": 0.23167072236537933, + "learning_rate": 0.00049128939356947, + "loss": 0.1236, + "step": 152 + }, + { + "epoch": 0.27, + "grad_norm": 0.173340305685997, + "learning_rate": 0.0004911670071149831, + "loss": 0.1098, + "step": 153 + }, + { + "epoch": 0.27, + "grad_norm": 0.1079009547829628, + "learning_rate": 0.0004910437823252937, + "loss": 0.1014, + "step": 154 + }, + { + "epoch": 0.27, + "grad_norm": 0.320765882730484, + "learning_rate": 0.0004909197196287509, + "loss": 0.1285, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.40041017532348633, + "learning_rate": 0.0004907948194566166, + "loss": 0.1421, + "step": 156 + }, + { + "epoch": 0.28, + "grad_norm": 0.4091287851333618, + "learning_rate": 0.0004906690822430638, + "loss": 0.1451, + "step": 157 + }, + { + "epoch": 0.28, + "grad_norm": 0.39893922209739685, + "learning_rate": 0.0004905425084251753, + "loss": 0.1289, + "step": 158 + }, + { + "epoch": 0.28, + "grad_norm": 0.14173893630504608, + "learning_rate": 0.0004904150984429419, + "loss": 0.0712, + "step": 159 + }, + { + "epoch": 0.28, + "grad_norm": 0.4723054766654968, + "learning_rate": 0.0004902868527392611, + "loss": 0.2141, + "step": 160 + }, + { + "epoch": 0.28, + "grad_norm": 0.13493523001670837, + "learning_rate": 0.0004901577717599355, + "loss": 0.0881, + "step": 161 + }, + { + "epoch": 0.29, + "grad_norm": 0.10770414024591446, + "learning_rate": 0.0004900278559536716, + "loss": 0.0746, + "step": 162 + }, + { + "epoch": 0.29, + "grad_norm": 0.5121994614601135, + "learning_rate": 0.0004898971057720773, + "loss": 0.1705, + "step": 163 + }, + { + "epoch": 0.29, + "grad_norm": 0.09419309347867966, + "learning_rate": 0.0004897655216696612, + "loss": 0.1085, + "step": 164 + }, + { + "epoch": 0.29, + "grad_norm": 0.3557867407798767, + "learning_rate": 0.0004896331041038309, + "loss": 0.1027, + "step": 165 + }, + { + "epoch": 0.29, + "grad_norm": 0.082126185297966, + "learning_rate": 0.000489499853534891, + "loss": 0.1113, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.8520584106445312, + "learning_rate": 0.0004893657704260419, + "loss": 0.1291, + "step": 167 + }, + { + "epoch": 0.3, + "grad_norm": 0.4607222080230713, + "learning_rate": 0.000489230855243378, + "loss": 0.1241, + "step": 168 + }, + { + "epoch": 0.3, + "grad_norm": 0.5181136727333069, + "learning_rate": 0.0004890951084558859, + "loss": 0.0957, + "step": 169 + }, + { + "epoch": 0.3, + "grad_norm": 0.42894089221954346, + "learning_rate": 0.0004889585305354435, + "loss": 0.0895, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.14509521424770355, + "learning_rate": 0.0004888211219568175, + "loss": 0.0732, + "step": 171 + }, + { + "epoch": 0.3, + "grad_norm": 0.24262909591197968, + "learning_rate": 0.0004886828831976621, + "loss": 0.0917, + "step": 172 + }, + { + "epoch": 0.31, + "grad_norm": 0.44387635588645935, + "learning_rate": 0.0004885438147385175, + "loss": 0.0636, + "step": 173 + }, + { + "epoch": 0.31, + "grad_norm": 0.1804012507200241, + "learning_rate": 0.0004884039170628077, + "loss": 0.0295, + "step": 174 + }, + { + "epoch": 0.31, + "grad_norm": 0.5566735863685608, + "learning_rate": 0.0004882631906568398, + "loss": 0.1104, + "step": 175 + }, + { + "epoch": 0.31, + "grad_norm": 0.9653083682060242, + "learning_rate": 0.0004881216360098012, + "loss": 0.2236, + "step": 176 + }, + { + "epoch": 0.31, + "grad_norm": 0.27046507596969604, + "learning_rate": 0.0004879792536137585, + "loss": 0.1082, + "step": 177 + }, + { + "epoch": 0.31, + "grad_norm": 0.47974228858947754, + "learning_rate": 0.00048783604396365586, + "loss": 0.0884, + "step": 178 + }, + { + "epoch": 0.32, + "grad_norm": 0.23638087511062622, + "learning_rate": 0.0004876920075573129, + "loss": 0.0968, + "step": 179 + }, + { + "epoch": 0.32, + "grad_norm": 0.12476328015327454, + "learning_rate": 0.0004875471448954234, + "loss": 0.1078, + "step": 180 + }, + { + "epoch": 0.32, + "grad_norm": 0.2455732375383377, + "learning_rate": 0.00048740145648155307, + "loss": 0.1124, + "step": 181 + }, + { + "epoch": 0.32, + "grad_norm": 0.2744804620742798, + "learning_rate": 0.0004872549428221384, + "loss": 0.0797, + "step": 182 + }, + { + "epoch": 0.32, + "grad_norm": 0.19536937773227692, + "learning_rate": 0.00048710760442648415, + "loss": 0.1091, + "step": 183 + }, + { + "epoch": 0.33, + "grad_norm": 0.5277348160743713, + "learning_rate": 0.0004869594418067623, + "loss": 0.1261, + "step": 184 + }, + { + "epoch": 0.33, + "grad_norm": 0.13960392773151398, + "learning_rate": 0.00048681045547801003, + "loss": 0.0879, + "step": 185 + }, + { + "epoch": 0.33, + "grad_norm": 0.2567049562931061, + "learning_rate": 0.00048666064595812746, + "loss": 0.083, + "step": 186 + }, + { + "epoch": 0.33, + "grad_norm": 0.3075740337371826, + "learning_rate": 0.00048651001376787676, + "loss": 0.1167, + "step": 187 + }, + { + "epoch": 0.33, + "grad_norm": 0.5257586240768433, + "learning_rate": 0.0004863585594308794, + "loss": 0.1019, + "step": 188 + }, + { + "epoch": 0.33, + "grad_norm": 0.41611766815185547, + "learning_rate": 0.00048620628347361496, + "loss": 0.1392, + "step": 189 + }, + { + "epoch": 0.34, + "grad_norm": 0.30399614572525024, + "learning_rate": 0.00048605318642541917, + "loss": 0.1339, + "step": 190 + }, + { + "epoch": 0.34, + "grad_norm": 0.41276878118515015, + "learning_rate": 0.00048589926881848194, + "loss": 0.1028, + "step": 191 + }, + { + "epoch": 0.34, + "grad_norm": 0.19717253744602203, + "learning_rate": 0.0004857445311878456, + "loss": 0.1032, + "step": 192 + }, + { + "epoch": 0.34, + "grad_norm": 0.3766873776912689, + "learning_rate": 0.0004855889740714028, + "loss": 0.1486, + "step": 193 + }, + { + "epoch": 0.34, + "grad_norm": 0.17443525791168213, + "learning_rate": 0.0004854325980098951, + "loss": 0.096, + "step": 194 + }, + { + "epoch": 0.34, + "grad_norm": 0.1278471201658249, + "learning_rate": 0.0004852754035469109, + "loss": 0.0746, + "step": 195 + }, + { + "epoch": 0.35, + "grad_norm": 0.14356929063796997, + "learning_rate": 0.0004851173912288833, + "loss": 0.0857, + "step": 196 + }, + { + "epoch": 0.35, + "grad_norm": 0.20514866709709167, + "learning_rate": 0.0004849585616050884, + "loss": 0.0833, + "step": 197 + }, + { + "epoch": 0.35, + "grad_norm": 0.4683605134487152, + "learning_rate": 0.0004847989152276435, + "loss": 0.1538, + "step": 198 + }, + { + "epoch": 0.35, + "grad_norm": 0.29194721579551697, + "learning_rate": 0.00048463845265150495, + "loss": 0.1035, + "step": 199 + }, + { + "epoch": 0.35, + "grad_norm": 0.22838515043258667, + "learning_rate": 0.0004844771744344666, + "loss": 0.0762, + "step": 200 + }, + { + "epoch": 0.36, + "grad_norm": 0.3635599911212921, + "learning_rate": 0.0004843150811371572, + "loss": 0.1165, + "step": 201 + }, + { + "epoch": 0.36, + "grad_norm": 0.2508073151111603, + "learning_rate": 0.0004841521733230391, + "loss": 0.0736, + "step": 202 + }, + { + "epoch": 0.36, + "grad_norm": 0.24161550402641296, + "learning_rate": 0.000483988451558406, + "loss": 0.1309, + "step": 203 + }, + { + "epoch": 0.36, + "grad_norm": 0.4697308838367462, + "learning_rate": 0.0004838239164123811, + "loss": 0.1731, + "step": 204 + }, + { + "epoch": 0.36, + "grad_norm": 0.17773008346557617, + "learning_rate": 0.0004836585684569148, + "loss": 0.1158, + "step": 205 + }, + { + "epoch": 0.36, + "grad_norm": 0.21285519003868103, + "learning_rate": 0.0004834924082667833, + "loss": 0.0949, + "step": 206 + }, + { + "epoch": 0.37, + "grad_norm": 0.2403111308813095, + "learning_rate": 0.0004833254364195859, + "loss": 0.0801, + "step": 207 + }, + { + "epoch": 0.37, + "grad_norm": 0.2033465951681137, + "learning_rate": 0.0004831576534957437, + "loss": 0.069, + "step": 208 + }, + { + "epoch": 0.37, + "grad_norm": 0.5510303378105164, + "learning_rate": 0.000482989060078497, + "loss": 0.1766, + "step": 209 + }, + { + "epoch": 0.37, + "grad_norm": 0.32342344522476196, + "learning_rate": 0.0004828196567539034, + "loss": 0.1229, + "step": 210 + }, + { + "epoch": 0.37, + "grad_norm": 0.3102104663848877, + "learning_rate": 0.00048264944411083625, + "loss": 0.1297, + "step": 211 + }, + { + "epoch": 0.38, + "grad_norm": 0.32639122009277344, + "learning_rate": 0.00048247842274098187, + "loss": 0.1011, + "step": 212 + }, + { + "epoch": 0.38, + "grad_norm": 0.4720034897327423, + "learning_rate": 0.00048230659323883804, + "loss": 0.1282, + "step": 213 + }, + { + "epoch": 0.38, + "grad_norm": 0.5249712467193604, + "learning_rate": 0.00048213395620171166, + "loss": 0.1376, + "step": 214 + }, + { + "epoch": 0.38, + "grad_norm": 0.3953443467617035, + "learning_rate": 0.00048196051222971673, + "loss": 0.1186, + "step": 215 + }, + { + "epoch": 0.38, + "grad_norm": 0.15697738528251648, + "learning_rate": 0.0004817862619257723, + "loss": 0.1079, + "step": 216 + }, + { + "epoch": 0.38, + "grad_norm": 0.32511651515960693, + "learning_rate": 0.0004816112058956005, + "loss": 0.1052, + "step": 217 + }, + { + "epoch": 0.39, + "grad_norm": 0.1850031018257141, + "learning_rate": 0.00048143534474772397, + "loss": 0.1236, + "step": 218 + }, + { + "epoch": 0.39, + "grad_norm": 0.10901057720184326, + "learning_rate": 0.0004812586790934645, + "loss": 0.1094, + "step": 219 + }, + { + "epoch": 0.39, + "grad_norm": 0.23395784199237823, + "learning_rate": 0.00048108120954694014, + "loss": 0.0556, + "step": 220 + }, + { + "epoch": 0.39, + "grad_norm": 0.21469372510910034, + "learning_rate": 0.00048090293672506347, + "loss": 0.0594, + "step": 221 + }, + { + "epoch": 0.39, + "grad_norm": 0.17289988696575165, + "learning_rate": 0.00048072386124753944, + "loss": 0.0219, + "step": 222 + }, + { + "epoch": 0.39, + "grad_norm": 0.21490757167339325, + "learning_rate": 0.0004805439837368631, + "loss": 0.0203, + "step": 223 + }, + { + "epoch": 0.4, + "grad_norm": 1.1259506940841675, + "learning_rate": 0.0004803633048183176, + "loss": 0.1576, + "step": 224 + }, + { + "epoch": 0.4, + "grad_norm": 1.2934038639068604, + "learning_rate": 0.00048018182511997185, + "loss": 0.1233, + "step": 225 + }, + { + "epoch": 0.4, + "grad_norm": 0.4250846207141876, + "learning_rate": 0.0004799995452726783, + "loss": 0.1023, + "step": 226 + }, + { + "epoch": 0.4, + "grad_norm": 1.4675579071044922, + "learning_rate": 0.000479816465910071, + "loss": 0.1242, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.7030429840087891, + "learning_rate": 0.0004796325876685632, + "loss": 0.0514, + "step": 228 + }, + { + "epoch": 0.41, + "grad_norm": 0.5683910846710205, + "learning_rate": 0.00047944791118734517, + "loss": 0.0923, + "step": 229 + }, + { + "epoch": 0.41, + "grad_norm": 0.8425244092941284, + "learning_rate": 0.0004792624371083819, + "loss": 0.0976, + "step": 230 + }, + { + "epoch": 0.41, + "grad_norm": 0.21189981698989868, + "learning_rate": 0.00047907616607641113, + "loss": 0.1016, + "step": 231 + }, + { + "epoch": 0.41, + "grad_norm": 0.36100390553474426, + "learning_rate": 0.0004788890987389408, + "loss": 0.1015, + "step": 232 + }, + { + "epoch": 0.41, + "grad_norm": 0.42600420117378235, + "learning_rate": 0.000478701235746247, + "loss": 0.1401, + "step": 233 + }, + { + "epoch": 0.41, + "grad_norm": 0.649318516254425, + "learning_rate": 0.0004785125777513716, + "loss": 0.1012, + "step": 234 + }, + { + "epoch": 0.42, + "grad_norm": 0.3490477204322815, + "learning_rate": 0.00047832312541012007, + "loss": 0.1015, + "step": 235 + }, + { + "epoch": 0.42, + "grad_norm": 0.6937799453735352, + "learning_rate": 0.0004781328793810592, + "loss": 0.1188, + "step": 236 + }, + { + "epoch": 0.42, + "grad_norm": 1.0924077033996582, + "learning_rate": 0.0004779418403255146, + "loss": 0.1093, + "step": 237 + }, + { + "epoch": 0.42, + "grad_norm": 0.36075183749198914, + "learning_rate": 0.0004777500089075687, + "loss": 0.0971, + "step": 238 + }, + { + "epoch": 0.42, + "grad_norm": 0.41673243045806885, + "learning_rate": 0.00047755738579405836, + "loss": 0.0953, + "step": 239 + }, + { + "epoch": 0.42, + "grad_norm": 0.13159583508968353, + "learning_rate": 0.0004773639716545723, + "loss": 0.0571, + "step": 240 + }, + { + "epoch": 0.43, + "grad_norm": 0.9338862895965576, + "learning_rate": 0.00047716976716144917, + "loss": 0.202, + "step": 241 + }, + { + "epoch": 0.43, + "grad_norm": 0.3190581798553467, + "learning_rate": 0.0004769747729897749, + "loss": 0.1071, + "step": 242 + }, + { + "epoch": 0.43, + "grad_norm": 0.23796042799949646, + "learning_rate": 0.0004767789898173806, + "loss": 0.0659, + "step": 243 + }, + { + "epoch": 0.43, + "grad_norm": 0.19194231927394867, + "learning_rate": 0.0004765824183248399, + "loss": 0.0611, + "step": 244 + }, + { + "epoch": 0.43, + "grad_norm": 0.16703608632087708, + "learning_rate": 0.0004763850591954668, + "loss": 0.0855, + "step": 245 + }, + { + "epoch": 0.44, + "grad_norm": 0.3395439684391022, + "learning_rate": 0.0004761869131153135, + "loss": 0.0926, + "step": 246 + }, + { + "epoch": 0.44, + "grad_norm": 0.2820179760456085, + "learning_rate": 0.0004759879807731673, + "loss": 0.0508, + "step": 247 + }, + { + "epoch": 0.44, + "grad_norm": 0.20656561851501465, + "learning_rate": 0.00047578826286054897, + "loss": 0.068, + "step": 248 + }, + { + "epoch": 0.44, + "grad_norm": 0.4477837383747101, + "learning_rate": 0.00047558776007171024, + "loss": 0.0918, + "step": 249 + }, + { + "epoch": 0.44, + "grad_norm": 0.18997950851917267, + "learning_rate": 0.0004753864731036307, + "loss": 0.0734, + "step": 250 + }, + { + "epoch": 0.44, + "grad_norm": 0.2841518521308899, + "learning_rate": 0.0004751844026560163, + "loss": 0.1194, + "step": 251 + }, + { + "epoch": 0.45, + "grad_norm": 0.29770052433013916, + "learning_rate": 0.0004749815494312963, + "loss": 0.0996, + "step": 252 + }, + { + "epoch": 0.45, + "grad_norm": 0.2982254922389984, + "learning_rate": 0.00047477791413462104, + "loss": 0.0945, + "step": 253 + }, + { + "epoch": 0.45, + "grad_norm": 0.4625980854034424, + "learning_rate": 0.00047457349747385936, + "loss": 0.131, + "step": 254 + }, + { + "epoch": 0.45, + "grad_norm": 0.29756709933280945, + "learning_rate": 0.00047436830015959653, + "loss": 0.1057, + "step": 255 + }, + { + "epoch": 0.45, + "grad_norm": 0.19971434772014618, + "learning_rate": 0.00047416232290513127, + "loss": 0.0794, + "step": 256 + }, + { + "epoch": 0.45, + "grad_norm": 0.12171836197376251, + "learning_rate": 0.0004739555664264736, + "loss": 0.0527, + "step": 257 + }, + { + "epoch": 0.46, + "grad_norm": 0.23848529160022736, + "learning_rate": 0.00047374803144234213, + "loss": 0.134, + "step": 258 + }, + { + "epoch": 0.46, + "grad_norm": 0.12673752009868622, + "learning_rate": 0.0004735397186741618, + "loss": 0.0774, + "step": 259 + }, + { + "epoch": 0.46, + "grad_norm": 0.11961629241704941, + "learning_rate": 0.00047333062884606116, + "loss": 0.0661, + "step": 260 + }, + { + "epoch": 0.46, + "grad_norm": 0.18004140257835388, + "learning_rate": 0.00047312076268487, + "loss": 0.1132, + "step": 261 + }, + { + "epoch": 0.46, + "grad_norm": 0.1698005348443985, + "learning_rate": 0.00047291012092011685, + "loss": 0.057, + "step": 262 + }, + { + "epoch": 0.47, + "grad_norm": 0.1949334442615509, + "learning_rate": 0.0004726987042840263, + "loss": 0.0703, + "step": 263 + }, + { + "epoch": 0.47, + "grad_norm": 0.4016534686088562, + "learning_rate": 0.0004724865135115163, + "loss": 0.1178, + "step": 264 + }, + { + "epoch": 0.47, + "grad_norm": 0.36885496973991394, + "learning_rate": 0.00047227354934019605, + "loss": 0.1303, + "step": 265 + }, + { + "epoch": 0.47, + "grad_norm": 0.3214585483074188, + "learning_rate": 0.00047205981251036334, + "loss": 0.1019, + "step": 266 + }, + { + "epoch": 0.47, + "grad_norm": 0.15313082933425903, + "learning_rate": 0.0004718453037650016, + "loss": 0.0581, + "step": 267 + }, + { + "epoch": 0.47, + "grad_norm": 0.3251878321170807, + "learning_rate": 0.0004716300238497775, + "loss": 0.099, + "step": 268 + }, + { + "epoch": 0.48, + "grad_norm": 0.20356950163841248, + "learning_rate": 0.0004714139735130388, + "loss": 0.0767, + "step": 269 + }, + { + "epoch": 0.48, + "grad_norm": 0.2644464373588562, + "learning_rate": 0.00047119715350581095, + "loss": 0.1003, + "step": 270 + }, + { + "epoch": 0.48, + "grad_norm": 0.22035302221775055, + "learning_rate": 0.000470979564581795, + "loss": 0.0722, + "step": 271 + }, + { + "epoch": 0.48, + "grad_norm": 0.5284466743469238, + "learning_rate": 0.0004707612074973653, + "loss": 0.1282, + "step": 272 + }, + { + "epoch": 0.48, + "grad_norm": 0.34032565355300903, + "learning_rate": 0.0004705420830115658, + "loss": 0.099, + "step": 273 + }, + { + "epoch": 0.48, + "grad_norm": 0.26527565717697144, + "learning_rate": 0.00047032219188610836, + "loss": 0.0911, + "step": 274 + }, + { + "epoch": 0.49, + "grad_norm": 0.2254990190267563, + "learning_rate": 0.0004701015348853699, + "loss": 0.0667, + "step": 275 + }, + { + "epoch": 0.49, + "grad_norm": 0.21334387362003326, + "learning_rate": 0.0004698801127763895, + "loss": 0.0659, + "step": 276 + }, + { + "epoch": 0.49, + "grad_norm": 0.2917044758796692, + "learning_rate": 0.0004696579263288661, + "loss": 0.1159, + "step": 277 + }, + { + "epoch": 0.49, + "grad_norm": 0.14027804136276245, + "learning_rate": 0.00046943497631515526, + "loss": 0.0323, + "step": 278 + }, + { + "epoch": 0.49, + "grad_norm": 0.3988366425037384, + "learning_rate": 0.00046921126351026697, + "loss": 0.0887, + "step": 279 + }, + { + "epoch": 0.5, + "grad_norm": 0.36629319190979004, + "learning_rate": 0.00046898678869186297, + "loss": 0.1079, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 0.35548141598701477, + "learning_rate": 0.0004687615526402536, + "loss": 0.1056, + "step": 281 + }, + { + "epoch": 0.5, + "grad_norm": 0.21030637621879578, + "learning_rate": 0.0004685355561383956, + "loss": 0.0717, + "step": 282 + }, + { + "epoch": 0.5, + "grad_norm": 0.24192889034748077, + "learning_rate": 0.000468308799971889, + "loss": 0.1047, + "step": 283 + }, + { + "epoch": 0.5, + "grad_norm": 0.16289295256137848, + "learning_rate": 0.00046808128492897464, + "loss": 0.0519, + "step": 284 + }, + { + "epoch": 0.5, + "eval_loss": 0.08938124030828476, + "eval_runtime": 14.7518, + "eval_samples_per_second": 32.335, + "eval_steps_per_second": 8.135, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.23021583259105682, + "learning_rate": 0.00046785301180053126, + "loss": 0.1161, + "step": 285 + }, + { + "epoch": 0.51, + "grad_norm": 0.3577558398246765, + "learning_rate": 0.0004676239813800729, + "loss": 0.1239, + "step": 286 + }, + { + "epoch": 0.51, + "grad_norm": 0.15293735265731812, + "learning_rate": 0.0004673941944637461, + "loss": 0.0401, + "step": 287 + }, + { + "epoch": 0.51, + "grad_norm": 0.342631459236145, + "learning_rate": 0.00046716365185032696, + "loss": 0.1358, + "step": 288 + }, + { + "epoch": 0.51, + "grad_norm": 0.4987104833126068, + "learning_rate": 0.0004669323543412186, + "loss": 0.1312, + "step": 289 + }, + { + "epoch": 0.51, + "grad_norm": 0.21678434312343597, + "learning_rate": 0.0004667003027404483, + "loss": 0.0791, + "step": 290 + }, + { + "epoch": 0.51, + "grad_norm": 0.2781723141670227, + "learning_rate": 0.00046646749785466464, + "loss": 0.0809, + "step": 291 + }, + { + "epoch": 0.52, + "grad_norm": 0.3997693359851837, + "learning_rate": 0.00046623394049313474, + "loss": 0.0938, + "step": 292 + }, + { + "epoch": 0.52, + "grad_norm": 0.2478984147310257, + "learning_rate": 0.00046599963146774136, + "loss": 0.0671, + "step": 293 + }, + { + "epoch": 0.52, + "grad_norm": 0.35655421018600464, + "learning_rate": 0.0004657645715929805, + "loss": 0.107, + "step": 294 + }, + { + "epoch": 0.52, + "grad_norm": 0.41986069083213806, + "learning_rate": 0.0004655287616859577, + "loss": 0.1381, + "step": 295 + }, + { + "epoch": 0.52, + "grad_norm": 0.2831580340862274, + "learning_rate": 0.00046529220256638626, + "loss": 0.1012, + "step": 296 + }, + { + "epoch": 0.53, + "grad_norm": 0.2183172106742859, + "learning_rate": 0.0004650548950565835, + "loss": 0.0883, + "step": 297 + }, + { + "epoch": 0.53, + "grad_norm": 0.1485687792301178, + "learning_rate": 0.0004648168399814684, + "loss": 0.094, + "step": 298 + }, + { + "epoch": 0.53, + "grad_norm": 0.3192533552646637, + "learning_rate": 0.0004645780381685586, + "loss": 0.1144, + "step": 299 + }, + { + "epoch": 0.53, + "grad_norm": 0.20768460631370544, + "learning_rate": 0.0004643384904479675, + "loss": 0.1119, + "step": 300 + }, + { + "epoch": 0.53, + "grad_norm": 0.16704390943050385, + "learning_rate": 0.00046409819765240147, + "loss": 0.0852, + "step": 301 + }, + { + "epoch": 0.53, + "grad_norm": 0.33123648166656494, + "learning_rate": 0.0004638571606171567, + "loss": 0.1608, + "step": 302 + }, + { + "epoch": 0.54, + "grad_norm": 0.408978134393692, + "learning_rate": 0.0004636153801801167, + "loss": 0.0906, + "step": 303 + }, + { + "epoch": 0.54, + "grad_norm": 0.29201096296310425, + "learning_rate": 0.00046337285718174896, + "loss": 0.1237, + "step": 304 + }, + { + "epoch": 0.54, + "grad_norm": 0.45836058259010315, + "learning_rate": 0.00046312959246510237, + "loss": 0.0926, + "step": 305 + }, + { + "epoch": 0.54, + "grad_norm": 0.5405777096748352, + "learning_rate": 0.0004628855868758041, + "loss": 0.0727, + "step": 306 + }, + { + "epoch": 0.54, + "grad_norm": 0.3068138062953949, + "learning_rate": 0.00046264084126205676, + "loss": 0.1006, + "step": 307 + }, + { + "epoch": 0.54, + "grad_norm": 0.2990975081920624, + "learning_rate": 0.00046239535647463534, + "loss": 0.1033, + "step": 308 + }, + { + "epoch": 0.55, + "grad_norm": 0.2938540279865265, + "learning_rate": 0.00046214913336688424, + "loss": 0.1084, + "step": 309 + }, + { + "epoch": 0.55, + "grad_norm": 0.49840983748435974, + "learning_rate": 0.00046190217279471466, + "loss": 0.1066, + "step": 310 + }, + { + "epoch": 0.55, + "grad_norm": 0.4558626711368561, + "learning_rate": 0.000461654475616601, + "loss": 0.121, + "step": 311 + }, + { + "epoch": 0.55, + "grad_norm": 0.20964759588241577, + "learning_rate": 0.0004614060426935786, + "loss": 0.0843, + "step": 312 + }, + { + "epoch": 0.55, + "grad_norm": 0.2254151701927185, + "learning_rate": 0.00046115687488923983, + "loss": 0.0781, + "step": 313 + }, + { + "epoch": 0.56, + "grad_norm": 0.6117066740989685, + "learning_rate": 0.0004609069730697322, + "loss": 0.1208, + "step": 314 + }, + { + "epoch": 0.56, + "grad_norm": 0.5352897644042969, + "learning_rate": 0.0004606563381037544, + "loss": 0.1056, + "step": 315 + }, + { + "epoch": 0.56, + "grad_norm": 0.31001269817352295, + "learning_rate": 0.00046040497086255385, + "loss": 0.1213, + "step": 316 + }, + { + "epoch": 0.56, + "grad_norm": 0.22637054324150085, + "learning_rate": 0.0004601528722199234, + "loss": 0.105, + "step": 317 + }, + { + "epoch": 0.56, + "grad_norm": 0.20077432692050934, + "learning_rate": 0.0004599000430521984, + "loss": 0.0837, + "step": 318 + }, + { + "epoch": 0.56, + "grad_norm": 0.24702684581279755, + "learning_rate": 0.0004596464842382534, + "loss": 0.0695, + "step": 319 + }, + { + "epoch": 0.57, + "grad_norm": 0.253387987613678, + "learning_rate": 0.0004593921966594997, + "loss": 0.1184, + "step": 320 + }, + { + "epoch": 0.57, + "grad_norm": 0.2703799605369568, + "learning_rate": 0.0004591371811998817, + "loss": 0.117, + "step": 321 + }, + { + "epoch": 0.57, + "grad_norm": 0.23513701558113098, + "learning_rate": 0.00045888143874587396, + "loss": 0.1359, + "step": 322 + }, + { + "epoch": 0.57, + "grad_norm": 0.25604313611984253, + "learning_rate": 0.00045862497018647833, + "loss": 0.1018, + "step": 323 + }, + { + "epoch": 0.57, + "grad_norm": 0.16947636008262634, + "learning_rate": 0.0004583677764132207, + "loss": 0.0958, + "step": 324 + }, + { + "epoch": 0.57, + "grad_norm": 0.20054908096790314, + "learning_rate": 0.0004581098583201478, + "loss": 0.0803, + "step": 325 + }, + { + "epoch": 0.58, + "grad_norm": 0.12656472623348236, + "learning_rate": 0.00045785121680382436, + "loss": 0.0679, + "step": 326 + }, + { + "epoch": 0.58, + "grad_norm": 0.1423499882221222, + "learning_rate": 0.0004575918527633297, + "loss": 0.0959, + "step": 327 + }, + { + "epoch": 0.58, + "grad_norm": 0.36370569467544556, + "learning_rate": 0.0004573317671002549, + "loss": 0.1088, + "step": 328 + }, + { + "epoch": 0.58, + "grad_norm": 0.18775340914726257, + "learning_rate": 0.0004570709607186995, + "loss": 0.0905, + "step": 329 + }, + { + "epoch": 0.58, + "grad_norm": 0.14833571016788483, + "learning_rate": 0.0004568094345252681, + "loss": 0.0661, + "step": 330 + }, + { + "epoch": 0.59, + "grad_norm": 0.2987270653247833, + "learning_rate": 0.00045654718942906794, + "loss": 0.0872, + "step": 331 + }, + { + "epoch": 0.59, + "grad_norm": 0.21985827386379242, + "learning_rate": 0.000456284226341705, + "loss": 0.0882, + "step": 332 + }, + { + "epoch": 0.59, + "grad_norm": 0.2726268470287323, + "learning_rate": 0.00045602054617728093, + "loss": 0.0864, + "step": 333 + }, + { + "epoch": 0.59, + "grad_norm": 0.2882244884967804, + "learning_rate": 0.00045575614985239057, + "loss": 0.1032, + "step": 334 + }, + { + "epoch": 0.59, + "grad_norm": 0.427500456571579, + "learning_rate": 0.0004554910382861178, + "loss": 0.1309, + "step": 335 + }, + { + "epoch": 0.59, + "grad_norm": 0.43029338121414185, + "learning_rate": 0.000455225212400033, + "loss": 0.1071, + "step": 336 + }, + { + "epoch": 0.6, + "grad_norm": 0.2297673523426056, + "learning_rate": 0.0004549586731181896, + "loss": 0.0526, + "step": 337 + }, + { + "epoch": 0.6, + "grad_norm": 0.4533613920211792, + "learning_rate": 0.0004546914213671209, + "loss": 0.1154, + "step": 338 + }, + { + "epoch": 0.6, + "grad_norm": 0.3973630666732788, + "learning_rate": 0.0004544234580758367, + "loss": 0.0707, + "step": 339 + }, + { + "epoch": 0.6, + "grad_norm": 0.40036290884017944, + "learning_rate": 0.0004541547841758207, + "loss": 0.0932, + "step": 340 + }, + { + "epoch": 0.6, + "grad_norm": 0.4273395240306854, + "learning_rate": 0.0004538854006010262, + "loss": 0.1112, + "step": 341 + }, + { + "epoch": 0.61, + "grad_norm": 0.28109779953956604, + "learning_rate": 0.0004536153082878738, + "loss": 0.1003, + "step": 342 + }, + { + "epoch": 0.61, + "grad_norm": 0.21950216591358185, + "learning_rate": 0.00045334450817524776, + "loss": 0.0538, + "step": 343 + }, + { + "epoch": 0.61, + "grad_norm": 0.2968471646308899, + "learning_rate": 0.00045307300120449263, + "loss": 0.0775, + "step": 344 + }, + { + "epoch": 0.61, + "grad_norm": 0.1488364040851593, + "learning_rate": 0.00045280078831941024, + "loss": 0.0513, + "step": 345 + }, + { + "epoch": 0.61, + "grad_norm": 0.22750218212604523, + "learning_rate": 0.00045252787046625624, + "loss": 0.0943, + "step": 346 + }, + { + "epoch": 0.61, + "grad_norm": 0.3048767149448395, + "learning_rate": 0.0004522542485937369, + "loss": 0.079, + "step": 347 + }, + { + "epoch": 0.62, + "grad_norm": 0.33520030975341797, + "learning_rate": 0.0004519799236530057, + "loss": 0.1584, + "step": 348 + }, + { + "epoch": 0.62, + "grad_norm": 0.20777581632137299, + "learning_rate": 0.00045170489659766003, + "loss": 0.0903, + "step": 349 + }, + { + "epoch": 0.62, + "grad_norm": 0.1602245271205902, + "learning_rate": 0.00045142916838373826, + "loss": 0.0446, + "step": 350 + }, + { + "epoch": 0.62, + "grad_norm": 0.2512218952178955, + "learning_rate": 0.0004511527399697158, + "loss": 0.069, + "step": 351 + }, + { + "epoch": 0.62, + "grad_norm": 0.17349962890148163, + "learning_rate": 0.0004508756123165021, + "loss": 0.0765, + "step": 352 + }, + { + "epoch": 0.62, + "grad_norm": 0.26563215255737305, + "learning_rate": 0.00045059778638743744, + "loss": 0.0966, + "step": 353 + }, + { + "epoch": 0.63, + "grad_norm": 0.23987066745758057, + "learning_rate": 0.00045031926314828926, + "loss": 0.0702, + "step": 354 + }, + { + "epoch": 0.63, + "grad_norm": 0.21901372075080872, + "learning_rate": 0.000450040043567249, + "loss": 0.0457, + "step": 355 + }, + { + "epoch": 0.63, + "grad_norm": 0.24179872870445251, + "learning_rate": 0.00044976012861492877, + "loss": 0.0651, + "step": 356 + }, + { + "epoch": 0.63, + "grad_norm": 0.3544818162918091, + "learning_rate": 0.0004494795192643578, + "loss": 0.0622, + "step": 357 + }, + { + "epoch": 0.63, + "grad_norm": 0.4363332986831665, + "learning_rate": 0.00044919821649097916, + "loss": 0.0972, + "step": 358 + }, + { + "epoch": 0.64, + "grad_norm": 0.43788430094718933, + "learning_rate": 0.0004489162212726465, + "loss": 0.0843, + "step": 359 + }, + { + "epoch": 0.64, + "grad_norm": 0.5084832906723022, + "learning_rate": 0.00044863353458962044, + "loss": 0.0888, + "step": 360 + }, + { + "epoch": 0.64, + "grad_norm": 0.44660842418670654, + "learning_rate": 0.0004483501574245652, + "loss": 0.113, + "step": 361 + }, + { + "epoch": 0.64, + "grad_norm": 0.7528813481330872, + "learning_rate": 0.0004480660907625452, + "loss": 0.0512, + "step": 362 + }, + { + "epoch": 0.64, + "grad_norm": 0.9723535776138306, + "learning_rate": 0.0004477813355910219, + "loss": 0.1154, + "step": 363 + }, + { + "epoch": 0.64, + "grad_norm": 0.2641480565071106, + "learning_rate": 0.0004474958928998498, + "loss": 0.0575, + "step": 364 + }, + { + "epoch": 0.65, + "grad_norm": 0.12234170734882355, + "learning_rate": 0.00044720976368127355, + "loss": 0.0441, + "step": 365 + }, + { + "epoch": 0.65, + "grad_norm": 0.26976636052131653, + "learning_rate": 0.00044692294892992416, + "loss": 0.0676, + "step": 366 + }, + { + "epoch": 0.65, + "grad_norm": 0.22729526460170746, + "learning_rate": 0.00044663544964281573, + "loss": 0.098, + "step": 367 + }, + { + "epoch": 0.65, + "grad_norm": 0.2270442545413971, + "learning_rate": 0.0004463472668193419, + "loss": 0.0842, + "step": 368 + }, + { + "epoch": 0.65, + "grad_norm": 0.19249562919139862, + "learning_rate": 0.0004460584014612724, + "loss": 0.0537, + "step": 369 + }, + { + "epoch": 0.65, + "grad_norm": 0.22312623262405396, + "learning_rate": 0.0004457688545727496, + "loss": 0.0547, + "step": 370 + }, + { + "epoch": 0.66, + "grad_norm": 0.281658411026001, + "learning_rate": 0.0004454786271602849, + "loss": 0.089, + "step": 371 + }, + { + "epoch": 0.66, + "grad_norm": 0.49952250719070435, + "learning_rate": 0.00044518772023275526, + "loss": 0.1298, + "step": 372 + }, + { + "epoch": 0.66, + "grad_norm": 0.186232328414917, + "learning_rate": 0.0004448961348013999, + "loss": 0.0628, + "step": 373 + }, + { + "epoch": 0.66, + "grad_norm": 0.2980823814868927, + "learning_rate": 0.0004446038718798166, + "loss": 0.0828, + "step": 374 + }, + { + "epoch": 0.66, + "grad_norm": 0.3794187605381012, + "learning_rate": 0.00044431093248395806, + "loss": 0.0776, + "step": 375 + }, + { + "epoch": 0.67, + "grad_norm": 0.29262277483940125, + "learning_rate": 0.0004440173176321287, + "loss": 0.0924, + "step": 376 + }, + { + "epoch": 0.67, + "grad_norm": 0.30543988943099976, + "learning_rate": 0.0004437230283449808, + "loss": 0.1264, + "step": 377 + }, + { + "epoch": 0.67, + "grad_norm": 0.3436485826969147, + "learning_rate": 0.0004434280656455111, + "loss": 0.1066, + "step": 378 + }, + { + "epoch": 0.67, + "grad_norm": 0.23679965734481812, + "learning_rate": 0.0004431324305590572, + "loss": 0.075, + "step": 379 + }, + { + "epoch": 0.67, + "grad_norm": 0.4399561882019043, + "learning_rate": 0.0004428361241132943, + "loss": 0.1445, + "step": 380 + }, + { + "epoch": 0.67, + "grad_norm": 0.39203163981437683, + "learning_rate": 0.0004425391473382309, + "loss": 0.0995, + "step": 381 + }, + { + "epoch": 0.68, + "grad_norm": 0.4687665104866028, + "learning_rate": 0.0004422415012662061, + "loss": 0.1489, + "step": 382 + }, + { + "epoch": 0.68, + "grad_norm": 0.2634904086589813, + "learning_rate": 0.00044194318693188526, + "loss": 0.1164, + "step": 383 + }, + { + "epoch": 0.68, + "grad_norm": 0.23031170666217804, + "learning_rate": 0.0004416442053722569, + "loss": 0.0742, + "step": 384 + }, + { + "epoch": 0.68, + "grad_norm": 0.30467960238456726, + "learning_rate": 0.00044134455762662894, + "loss": 0.0984, + "step": 385 + }, + { + "epoch": 0.68, + "grad_norm": 0.16692829132080078, + "learning_rate": 0.0004410442447366249, + "loss": 0.0732, + "step": 386 + }, + { + "epoch": 0.68, + "grad_norm": 0.20100833475589752, + "learning_rate": 0.00044074326774618065, + "loss": 0.1082, + "step": 387 + }, + { + "epoch": 0.69, + "grad_norm": 0.29799607396125793, + "learning_rate": 0.0004404416277015404, + "loss": 0.0761, + "step": 388 + }, + { + "epoch": 0.69, + "grad_norm": 0.647639274597168, + "learning_rate": 0.0004401393256512534, + "loss": 0.1218, + "step": 389 + }, + { + "epoch": 0.69, + "grad_norm": 0.2610540986061096, + "learning_rate": 0.00043983636264617013, + "loss": 0.0923, + "step": 390 + }, + { + "epoch": 0.69, + "grad_norm": 0.4049086570739746, + "learning_rate": 0.0004395327397394384, + "loss": 0.1091, + "step": 391 + }, + { + "epoch": 0.69, + "grad_norm": 0.36092105507850647, + "learning_rate": 0.00043922845798650034, + "loss": 0.0927, + "step": 392 + }, + { + "epoch": 0.7, + "grad_norm": 0.542421281337738, + "learning_rate": 0.00043892351844508805, + "loss": 0.1014, + "step": 393 + }, + { + "epoch": 0.7, + "grad_norm": 0.291595995426178, + "learning_rate": 0.0004386179221752202, + "loss": 0.0902, + "step": 394 + }, + { + "epoch": 0.7, + "grad_norm": 0.17152707278728485, + "learning_rate": 0.0004383116702391987, + "loss": 0.0651, + "step": 395 + }, + { + "epoch": 0.7, + "grad_norm": 0.16654878854751587, + "learning_rate": 0.00043800476370160416, + "loss": 0.0824, + "step": 396 + }, + { + "epoch": 0.7, + "grad_norm": 0.18530108034610748, + "learning_rate": 0.000437697203629293, + "loss": 0.0549, + "step": 397 + }, + { + "epoch": 0.7, + "grad_norm": 0.5760988593101501, + "learning_rate": 0.0004373889910913934, + "loss": 0.0803, + "step": 398 + }, + { + "epoch": 0.71, + "grad_norm": 0.4253963232040405, + "learning_rate": 0.00043708012715930154, + "loss": 0.0728, + "step": 399 + }, + { + "epoch": 0.71, + "grad_norm": 0.7932385206222534, + "learning_rate": 0.00043677061290667805, + "loss": 0.1442, + "step": 400 + }, + { + "epoch": 0.71, + "grad_norm": 0.5904709696769714, + "learning_rate": 0.00043646044940944407, + "loss": 0.0999, + "step": 401 + }, + { + "epoch": 0.71, + "grad_norm": 0.9570127129554749, + "learning_rate": 0.0004361496377457777, + "loss": 0.1298, + "step": 402 + }, + { + "epoch": 0.71, + "grad_norm": 0.5049470663070679, + "learning_rate": 0.00043583817899611017, + "loss": 0.0263, + "step": 403 + }, + { + "epoch": 0.71, + "grad_norm": 0.589408814907074, + "learning_rate": 0.00043552607424312195, + "loss": 0.1051, + "step": 404 + }, + { + "epoch": 0.72, + "grad_norm": 0.43722283840179443, + "learning_rate": 0.0004352133245717393, + "loss": 0.0715, + "step": 405 + }, + { + "epoch": 0.72, + "grad_norm": 1.3537758588790894, + "learning_rate": 0.00043489993106913036, + "loss": 0.0322, + "step": 406 + }, + { + "epoch": 0.72, + "grad_norm": 0.3382836580276489, + "learning_rate": 0.000434585894824701, + "loss": 0.0818, + "step": 407 + }, + { + "epoch": 0.72, + "grad_norm": 0.9946733713150024, + "learning_rate": 0.00043427121693009164, + "loss": 0.1536, + "step": 408 + }, + { + "epoch": 0.72, + "grad_norm": 0.9138526320457458, + "learning_rate": 0.0004339558984791732, + "loss": 0.1299, + "step": 409 + }, + { + "epoch": 0.73, + "grad_norm": 0.35993850231170654, + "learning_rate": 0.0004336399405680432, + "loss": 0.0654, + "step": 410 + }, + { + "epoch": 0.73, + "grad_norm": 0.30418309569358826, + "learning_rate": 0.0004333233442950219, + "loss": 0.1026, + "step": 411 + }, + { + "epoch": 0.73, + "grad_norm": 0.256728857755661, + "learning_rate": 0.00043300611076064886, + "loss": 0.083, + "step": 412 + }, + { + "epoch": 0.73, + "grad_norm": 0.3472314774990082, + "learning_rate": 0.00043268824106767865, + "loss": 0.0637, + "step": 413 + }, + { + "epoch": 0.73, + "grad_norm": 0.5615578293800354, + "learning_rate": 0.00043236973632107735, + "loss": 0.1028, + "step": 414 + }, + { + "epoch": 0.73, + "grad_norm": 0.35775747895240784, + "learning_rate": 0.00043205059762801854, + "loss": 0.0829, + "step": 415 + }, + { + "epoch": 0.74, + "grad_norm": 0.287270724773407, + "learning_rate": 0.0004317308260978795, + "loss": 0.0718, + "step": 416 + }, + { + "epoch": 0.74, + "grad_norm": 0.3237059414386749, + "learning_rate": 0.00043141042284223737, + "loss": 0.0797, + "step": 417 + }, + { + "epoch": 0.74, + "grad_norm": 0.2153153419494629, + "learning_rate": 0.0004310893889748653, + "loss": 0.0778, + "step": 418 + }, + { + "epoch": 0.74, + "grad_norm": 0.33600860834121704, + "learning_rate": 0.00043076772561172845, + "loss": 0.0594, + "step": 419 + }, + { + "epoch": 0.74, + "grad_norm": 0.8778895139694214, + "learning_rate": 0.00043044543387098027, + "loss": 0.1722, + "step": 420 + }, + { + "epoch": 0.74, + "grad_norm": 0.503434419631958, + "learning_rate": 0.0004301225148729586, + "loss": 0.1228, + "step": 421 + }, + { + "epoch": 0.75, + "grad_norm": 0.3694842457771301, + "learning_rate": 0.00042979896974018166, + "loss": 0.1033, + "step": 422 + }, + { + "epoch": 0.75, + "grad_norm": 0.329818457365036, + "learning_rate": 0.00042947479959734423, + "loss": 0.0471, + "step": 423 + }, + { + "epoch": 0.75, + "grad_norm": 0.19436267018318176, + "learning_rate": 0.0004291500055713138, + "loss": 0.0409, + "step": 424 + }, + { + "epoch": 0.75, + "grad_norm": 0.3269858658313751, + "learning_rate": 0.0004288245887911263, + "loss": 0.096, + "step": 425 + }, + { + "epoch": 0.75, + "grad_norm": 0.3542991578578949, + "learning_rate": 0.00042849855038798283, + "loss": 0.0986, + "step": 426 + }, + { + "epoch": 0.75, + "eval_loss": 0.07792978733778, + "eval_runtime": 14.8115, + "eval_samples_per_second": 32.205, + "eval_steps_per_second": 8.102, + "step": 426 + }, + { + "epoch": 0.76, + "grad_norm": 0.29685521125793457, + "learning_rate": 0.00042817189149524517, + "loss": 0.11, + "step": 427 + }, + { + "epoch": 0.76, + "grad_norm": 0.2116887867450714, + "learning_rate": 0.00042784461324843194, + "loss": 0.0686, + "step": 428 + }, + { + "epoch": 0.76, + "grad_norm": 0.4866883456707001, + "learning_rate": 0.00042751671678521486, + "loss": 0.0824, + "step": 429 + }, + { + "epoch": 0.76, + "grad_norm": 0.1293468475341797, + "learning_rate": 0.00042718820324541475, + "loss": 0.0464, + "step": 430 + }, + { + "epoch": 0.76, + "grad_norm": 0.3253125250339508, + "learning_rate": 0.0004268590737709972, + "loss": 0.0996, + "step": 431 + }, + { + "epoch": 0.76, + "grad_norm": 0.25559771060943604, + "learning_rate": 0.00042652932950606917, + "loss": 0.0545, + "step": 432 + }, + { + "epoch": 0.77, + "grad_norm": 0.2788093686103821, + "learning_rate": 0.0004261989715968746, + "loss": 0.0502, + "step": 433 + }, + { + "epoch": 0.77, + "grad_norm": 0.6902124285697937, + "learning_rate": 0.00042586800119179046, + "loss": 0.1598, + "step": 434 + }, + { + "epoch": 0.77, + "grad_norm": 0.4788605570793152, + "learning_rate": 0.00042553641944132316, + "loss": 0.1552, + "step": 435 + }, + { + "epoch": 0.77, + "grad_norm": 0.43495067954063416, + "learning_rate": 0.00042520422749810395, + "loss": 0.0907, + "step": 436 + }, + { + "epoch": 0.77, + "grad_norm": 0.3549440801143646, + "learning_rate": 0.0004248714265168853, + "loss": 0.1152, + "step": 437 + }, + { + "epoch": 0.77, + "grad_norm": 0.7210204601287842, + "learning_rate": 0.00042453801765453687, + "loss": 0.1891, + "step": 438 + }, + { + "epoch": 0.78, + "grad_norm": 0.4578750729560852, + "learning_rate": 0.00042420400207004126, + "loss": 0.1383, + "step": 439 + }, + { + "epoch": 0.78, + "grad_norm": 0.3323976993560791, + "learning_rate": 0.00042386938092449036, + "loss": 0.0936, + "step": 440 + }, + { + "epoch": 0.78, + "grad_norm": 0.15145371854305267, + "learning_rate": 0.00042353415538108076, + "loss": 0.0608, + "step": 441 + }, + { + "epoch": 0.78, + "grad_norm": 0.10744435340166092, + "learning_rate": 0.00042319832660511037, + "loss": 0.0865, + "step": 442 + }, + { + "epoch": 0.78, + "grad_norm": 0.13599476218223572, + "learning_rate": 0.0004228618957639738, + "loss": 0.0763, + "step": 443 + }, + { + "epoch": 0.79, + "grad_norm": 0.18250028789043427, + "learning_rate": 0.00042252486402715865, + "loss": 0.0813, + "step": 444 + }, + { + "epoch": 0.79, + "grad_norm": 0.5180188417434692, + "learning_rate": 0.00042218723256624136, + "loss": 0.1603, + "step": 445 + }, + { + "epoch": 0.79, + "grad_norm": 0.2943187355995178, + "learning_rate": 0.000421849002554883, + "loss": 0.1031, + "step": 446 + }, + { + "epoch": 0.79, + "grad_norm": 0.14898087084293365, + "learning_rate": 0.0004215101751688253, + "loss": 0.071, + "step": 447 + }, + { + "epoch": 0.79, + "grad_norm": 0.2951905131340027, + "learning_rate": 0.00042117075158588663, + "loss": 0.0772, + "step": 448 + }, + { + "epoch": 0.79, + "grad_norm": 0.39807453751564026, + "learning_rate": 0.00042083073298595787, + "loss": 0.0561, + "step": 449 + }, + { + "epoch": 0.8, + "grad_norm": 0.45217999815940857, + "learning_rate": 0.0004204901205509981, + "loss": 0.1076, + "step": 450 + }, + { + "epoch": 0.8, + "grad_norm": 0.24114732444286346, + "learning_rate": 0.000420148915465031, + "loss": 0.1169, + "step": 451 + }, + { + "epoch": 0.8, + "grad_norm": 0.6120204329490662, + "learning_rate": 0.00041980711891413994, + "loss": 0.1144, + "step": 452 + }, + { + "epoch": 0.8, + "grad_norm": 0.3900619447231293, + "learning_rate": 0.0004194647320864646, + "loss": 0.0806, + "step": 453 + }, + { + "epoch": 0.8, + "grad_norm": 0.3331635296344757, + "learning_rate": 0.0004191217561721967, + "loss": 0.0655, + "step": 454 + }, + { + "epoch": 0.8, + "grad_norm": 0.29893186688423157, + "learning_rate": 0.0004187781923635753, + "loss": 0.0482, + "step": 455 + }, + { + "epoch": 0.81, + "grad_norm": 0.20024164021015167, + "learning_rate": 0.00041843404185488346, + "loss": 0.0773, + "step": 456 + }, + { + "epoch": 0.81, + "grad_norm": 0.3644329905509949, + "learning_rate": 0.0004180893058424435, + "loss": 0.1062, + "step": 457 + }, + { + "epoch": 0.81, + "grad_norm": 0.5457159280776978, + "learning_rate": 0.0004177439855246132, + "loss": 0.1901, + "step": 458 + }, + { + "epoch": 0.81, + "grad_norm": 0.282032310962677, + "learning_rate": 0.0004173980821017812, + "loss": 0.0656, + "step": 459 + }, + { + "epoch": 0.81, + "grad_norm": 0.1957680881023407, + "learning_rate": 0.00041705159677636334, + "loss": 0.0725, + "step": 460 + }, + { + "epoch": 0.82, + "grad_norm": 0.2736223042011261, + "learning_rate": 0.00041670453075279827, + "loss": 0.0897, + "step": 461 + }, + { + "epoch": 0.82, + "grad_norm": 0.2145017832517624, + "learning_rate": 0.0004163568852375431, + "loss": 0.046, + "step": 462 + }, + { + "epoch": 0.82, + "grad_norm": 0.1434750258922577, + "learning_rate": 0.00041600866143906947, + "loss": 0.0483, + "step": 463 + }, + { + "epoch": 0.82, + "grad_norm": 0.2438279092311859, + "learning_rate": 0.000415659860567859, + "loss": 0.0935, + "step": 464 + }, + { + "epoch": 0.82, + "grad_norm": 0.24830487370491028, + "learning_rate": 0.00041531048383639966, + "loss": 0.1061, + "step": 465 + }, + { + "epoch": 0.82, + "grad_norm": 0.25185227394104004, + "learning_rate": 0.000414960532459181, + "loss": 0.082, + "step": 466 + }, + { + "epoch": 0.83, + "grad_norm": 0.391631156206131, + "learning_rate": 0.00041461000765269, + "loss": 0.1274, + "step": 467 + }, + { + "epoch": 0.83, + "grad_norm": 0.30484774708747864, + "learning_rate": 0.0004142589106354071, + "loss": 0.0672, + "step": 468 + }, + { + "epoch": 0.83, + "grad_norm": 0.2584599554538727, + "learning_rate": 0.0004139072426278021, + "loss": 0.0863, + "step": 469 + }, + { + "epoch": 0.83, + "grad_norm": 0.27182772755622864, + "learning_rate": 0.0004135550048523292, + "loss": 0.0996, + "step": 470 + }, + { + "epoch": 0.83, + "grad_norm": 0.2670001685619354, + "learning_rate": 0.00041320219853342347, + "loss": 0.0592, + "step": 471 + }, + { + "epoch": 0.84, + "grad_norm": 0.19571639597415924, + "learning_rate": 0.0004128488248974962, + "loss": 0.0618, + "step": 472 + }, + { + "epoch": 0.84, + "grad_norm": 0.436814546585083, + "learning_rate": 0.00041249488517293095, + "loss": 0.1131, + "step": 473 + }, + { + "epoch": 0.84, + "grad_norm": 0.21684250235557556, + "learning_rate": 0.0004121403805900789, + "loss": 0.0759, + "step": 474 + }, + { + "epoch": 0.84, + "grad_norm": 0.39313605427742004, + "learning_rate": 0.0004117853123812549, + "loss": 0.0992, + "step": 475 + }, + { + "epoch": 0.84, + "grad_norm": 0.3653202950954437, + "learning_rate": 0.00041142968178073294, + "loss": 0.099, + "step": 476 + }, + { + "epoch": 0.84, + "grad_norm": 0.36615628004074097, + "learning_rate": 0.00041107349002474206, + "loss": 0.06, + "step": 477 + }, + { + "epoch": 0.85, + "grad_norm": 0.2431243658065796, + "learning_rate": 0.00041071673835146194, + "loss": 0.0689, + "step": 478 + }, + { + "epoch": 0.85, + "grad_norm": 0.7869367599487305, + "learning_rate": 0.00041035942800101864, + "loss": 0.1308, + "step": 479 + }, + { + "epoch": 0.85, + "grad_norm": 0.2831230163574219, + "learning_rate": 0.0004100015602154802, + "loss": 0.087, + "step": 480 + }, + { + "epoch": 0.85, + "grad_norm": 0.3709629774093628, + "learning_rate": 0.0004096431362388525, + "loss": 0.0822, + "step": 481 + }, + { + "epoch": 0.85, + "grad_norm": 0.4082586467266083, + "learning_rate": 0.0004092841573170748, + "loss": 0.1114, + "step": 482 + }, + { + "epoch": 0.85, + "grad_norm": 0.2919554114341736, + "learning_rate": 0.0004089246246980154, + "loss": 0.1059, + "step": 483 + }, + { + "epoch": 0.86, + "grad_norm": 0.3750731945037842, + "learning_rate": 0.0004085645396314673, + "loss": 0.082, + "step": 484 + }, + { + "epoch": 0.86, + "grad_norm": 0.21013659238815308, + "learning_rate": 0.000408203903369144, + "loss": 0.0819, + "step": 485 + }, + { + "epoch": 0.86, + "grad_norm": 0.20771674811840057, + "learning_rate": 0.00040784271716467503, + "loss": 0.0687, + "step": 486 + }, + { + "epoch": 0.86, + "grad_norm": 0.157434344291687, + "learning_rate": 0.00040748098227360154, + "loss": 0.0826, + "step": 487 + }, + { + "epoch": 0.86, + "grad_norm": 0.40467727184295654, + "learning_rate": 0.000407118699953372, + "loss": 0.1131, + "step": 488 + }, + { + "epoch": 0.87, + "grad_norm": 0.17521728575229645, + "learning_rate": 0.0004067558714633378, + "loss": 0.116, + "step": 489 + }, + { + "epoch": 0.87, + "grad_norm": 0.2975709140300751, + "learning_rate": 0.0004063924980647492, + "loss": 0.0787, + "step": 490 + }, + { + "epoch": 0.87, + "grad_norm": 0.22513332962989807, + "learning_rate": 0.0004060285810207503, + "loss": 0.0754, + "step": 491 + }, + { + "epoch": 0.87, + "grad_norm": 0.2939409613609314, + "learning_rate": 0.00040566412159637514, + "loss": 0.0505, + "step": 492 + }, + { + "epoch": 0.87, + "grad_norm": 0.21415212750434875, + "learning_rate": 0.000405299121058543, + "loss": 0.0486, + "step": 493 + }, + { + "epoch": 0.87, + "grad_norm": 0.24846945703029633, + "learning_rate": 0.00040493358067605445, + "loss": 0.0645, + "step": 494 + }, + { + "epoch": 0.88, + "grad_norm": 0.42928287386894226, + "learning_rate": 0.00040456750171958655, + "loss": 0.1455, + "step": 495 + }, + { + "epoch": 0.88, + "grad_norm": 0.30920714139938354, + "learning_rate": 0.0004042008854616883, + "loss": 0.0743, + "step": 496 + }, + { + "epoch": 0.88, + "grad_norm": 0.43211719393730164, + "learning_rate": 0.00040383373317677687, + "loss": 0.1037, + "step": 497 + }, + { + "epoch": 0.88, + "grad_norm": 0.49942275881767273, + "learning_rate": 0.00040346604614113215, + "loss": 0.123, + "step": 498 + }, + { + "epoch": 0.88, + "grad_norm": 0.18615621328353882, + "learning_rate": 0.00040309782563289353, + "loss": 0.0783, + "step": 499 + }, + { + "epoch": 0.88, + "grad_norm": 0.22238926589488983, + "learning_rate": 0.0004027290729320545, + "loss": 0.0698, + "step": 500 + }, + { + "epoch": 0.89, + "grad_norm": 0.31746548414230347, + "learning_rate": 0.0004023597893204586, + "loss": 0.1682, + "step": 501 + }, + { + "epoch": 0.89, + "grad_norm": 0.19328100979328156, + "learning_rate": 0.00040198997608179477, + "loss": 0.1028, + "step": 502 + }, + { + "epoch": 0.89, + "grad_norm": 0.15466806292533875, + "learning_rate": 0.00040161963450159333, + "loss": 0.065, + "step": 503 + }, + { + "epoch": 0.89, + "grad_norm": 0.3000398874282837, + "learning_rate": 0.00040124876586722103, + "loss": 0.1071, + "step": 504 + }, + { + "epoch": 0.89, + "grad_norm": 0.16753748059272766, + "learning_rate": 0.00040087737146787654, + "loss": 0.056, + "step": 505 + }, + { + "epoch": 0.9, + "grad_norm": 0.17570586502552032, + "learning_rate": 0.00040050545259458654, + "loss": 0.0732, + "step": 506 + }, + { + "epoch": 0.9, + "grad_norm": 0.19240190088748932, + "learning_rate": 0.00040013301054020055, + "loss": 0.0444, + "step": 507 + }, + { + "epoch": 0.9, + "grad_norm": 0.23935984075069427, + "learning_rate": 0.00039976004659938714, + "loss": 0.0583, + "step": 508 + }, + { + "epoch": 0.9, + "grad_norm": 0.22633028030395508, + "learning_rate": 0.00039938656206862857, + "loss": 0.065, + "step": 509 + }, + { + "epoch": 0.9, + "grad_norm": 0.18621531128883362, + "learning_rate": 0.000399012558246217, + "loss": 0.0489, + "step": 510 + }, + { + "epoch": 0.9, + "grad_norm": 0.37711310386657715, + "learning_rate": 0.0003986380364322498, + "loss": 0.1367, + "step": 511 + }, + { + "epoch": 0.91, + "grad_norm": 0.26448771357536316, + "learning_rate": 0.00039826299792862475, + "loss": 0.076, + "step": 512 + }, + { + "epoch": 0.91, + "grad_norm": 0.22461633384227753, + "learning_rate": 0.00039788744403903604, + "loss": 0.0734, + "step": 513 + }, + { + "epoch": 0.91, + "grad_norm": 0.23908165097236633, + "learning_rate": 0.00039751137606896907, + "loss": 0.0718, + "step": 514 + }, + { + "epoch": 0.91, + "grad_norm": 0.37807080149650574, + "learning_rate": 0.00039713479532569646, + "loss": 0.1495, + "step": 515 + }, + { + "epoch": 0.91, + "grad_norm": 0.16840259730815887, + "learning_rate": 0.00039675770311827337, + "loss": 0.0491, + "step": 516 + }, + { + "epoch": 0.91, + "grad_norm": 0.35179728269577026, + "learning_rate": 0.00039638010075753274, + "loss": 0.0839, + "step": 517 + }, + { + "epoch": 0.92, + "grad_norm": 0.3631207048892975, + "learning_rate": 0.00039600198955608084, + "loss": 0.1348, + "step": 518 + }, + { + "epoch": 0.92, + "grad_norm": 0.38650691509246826, + "learning_rate": 0.00039562337082829304, + "loss": 0.15, + "step": 519 + }, + { + "epoch": 0.92, + "grad_norm": 0.2523843050003052, + "learning_rate": 0.00039524424589030866, + "loss": 0.1172, + "step": 520 + }, + { + "epoch": 0.92, + "grad_norm": 0.2690166234970093, + "learning_rate": 0.00039486461606002686, + "loss": 0.0619, + "step": 521 + }, + { + "epoch": 0.92, + "grad_norm": 0.31193405389785767, + "learning_rate": 0.0003944844826571018, + "loss": 0.0834, + "step": 522 + }, + { + "epoch": 0.93, + "grad_norm": 0.21751855313777924, + "learning_rate": 0.00039410384700293814, + "loss": 0.068, + "step": 523 + }, + { + "epoch": 0.93, + "grad_norm": 0.34191232919692993, + "learning_rate": 0.0003937227104206865, + "loss": 0.1337, + "step": 524 + }, + { + "epoch": 0.93, + "grad_norm": 0.34457269310951233, + "learning_rate": 0.0003933410742352388, + "loss": 0.0929, + "step": 525 + }, + { + "epoch": 0.93, + "grad_norm": 0.22599942982196808, + "learning_rate": 0.0003929589397732236, + "loss": 0.0899, + "step": 526 + }, + { + "epoch": 0.93, + "grad_norm": 0.23162932693958282, + "learning_rate": 0.0003925763083630017, + "loss": 0.0869, + "step": 527 + }, + { + "epoch": 0.93, + "grad_norm": 0.19502510130405426, + "learning_rate": 0.00039219318133466104, + "loss": 0.0834, + "step": 528 + }, + { + "epoch": 0.94, + "grad_norm": 0.2539670169353485, + "learning_rate": 0.0003918095600200128, + "loss": 0.0589, + "step": 529 + }, + { + "epoch": 0.94, + "grad_norm": 0.15578749775886536, + "learning_rate": 0.00039142544575258614, + "loss": 0.0471, + "step": 530 + }, + { + "epoch": 0.94, + "grad_norm": 0.41006144881248474, + "learning_rate": 0.00039104083986762396, + "loss": 0.1215, + "step": 531 + }, + { + "epoch": 0.94, + "grad_norm": 0.3161672055721283, + "learning_rate": 0.00039065574370207785, + "loss": 0.0599, + "step": 532 + }, + { + "epoch": 0.94, + "grad_norm": 0.2556127607822418, + "learning_rate": 0.00039027015859460394, + "loss": 0.0882, + "step": 533 + }, + { + "epoch": 0.94, + "grad_norm": 0.5484500527381897, + "learning_rate": 0.000389884085885558, + "loss": 0.1342, + "step": 534 + }, + { + "epoch": 0.95, + "grad_norm": 0.3688224256038666, + "learning_rate": 0.0003894975269169906, + "loss": 0.062, + "step": 535 + }, + { + "epoch": 0.95, + "grad_norm": 0.6328185796737671, + "learning_rate": 0.0003891104830326427, + "loss": 0.1068, + "step": 536 + }, + { + "epoch": 0.95, + "grad_norm": 0.5094593167304993, + "learning_rate": 0.00038872295557794103, + "loss": 0.0593, + "step": 537 + }, + { + "epoch": 0.95, + "grad_norm": 0.44920942187309265, + "learning_rate": 0.0003883349458999931, + "loss": 0.1134, + "step": 538 + }, + { + "epoch": 0.95, + "grad_norm": 0.25559201836586, + "learning_rate": 0.0003879464553475828, + "loss": 0.0842, + "step": 539 + }, + { + "epoch": 0.96, + "grad_norm": 0.24992522597312927, + "learning_rate": 0.0003875574852711656, + "loss": 0.0684, + "step": 540 + }, + { + "epoch": 0.96, + "grad_norm": 0.7482407093048096, + "learning_rate": 0.0003871680370228639, + "loss": 0.1698, + "step": 541 + }, + { + "epoch": 0.96, + "grad_norm": 0.42716777324676514, + "learning_rate": 0.00038677811195646233, + "loss": 0.1335, + "step": 542 + }, + { + "epoch": 0.96, + "grad_norm": 0.5867021083831787, + "learning_rate": 0.0003863877114274029, + "loss": 0.153, + "step": 543 + }, + { + "epoch": 0.96, + "grad_norm": 0.14882822334766388, + "learning_rate": 0.0003859968367927805, + "loss": 0.0548, + "step": 544 + }, + { + "epoch": 0.96, + "grad_norm": 0.16213174164295197, + "learning_rate": 0.0003856054894113381, + "loss": 0.0859, + "step": 545 + }, + { + "epoch": 0.97, + "grad_norm": 0.13216906785964966, + "learning_rate": 0.0003852136706434619, + "loss": 0.0837, + "step": 546 + }, + { + "epoch": 0.97, + "grad_norm": 0.28230682015419006, + "learning_rate": 0.00038482138185117685, + "loss": 0.0746, + "step": 547 + }, + { + "epoch": 0.97, + "grad_norm": 0.15776745975017548, + "learning_rate": 0.0003844286243981417, + "loss": 0.0758, + "step": 548 + }, + { + "epoch": 0.97, + "grad_norm": 0.38748612999916077, + "learning_rate": 0.0003840353996496444, + "loss": 0.0946, + "step": 549 + }, + { + "epoch": 0.97, + "grad_norm": 0.4377779960632324, + "learning_rate": 0.0003836417089725971, + "loss": 0.078, + "step": 550 + }, + { + "epoch": 0.97, + "grad_norm": 0.4776962101459503, + "learning_rate": 0.0003832475537355319, + "loss": 0.0996, + "step": 551 + }, + { + "epoch": 0.98, + "grad_norm": 0.16078083217144012, + "learning_rate": 0.00038285293530859553, + "loss": 0.0813, + "step": 552 + }, + { + "epoch": 0.98, + "grad_norm": 0.19620949029922485, + "learning_rate": 0.00038245785506354514, + "loss": 0.0716, + "step": 553 + }, + { + "epoch": 0.98, + "grad_norm": 0.23539945483207703, + "learning_rate": 0.0003820623143737427, + "loss": 0.0727, + "step": 554 + }, + { + "epoch": 0.98, + "grad_norm": 0.2797366678714752, + "learning_rate": 0.0003816663146141514, + "loss": 0.0307, + "step": 555 + }, + { + "epoch": 0.98, + "grad_norm": 0.31704849004745483, + "learning_rate": 0.00038126985716132976, + "loss": 0.0522, + "step": 556 + }, + { + "epoch": 0.99, + "grad_norm": 1.038294792175293, + "learning_rate": 0.00038087294339342765, + "loss": 0.1602, + "step": 557 + }, + { + "epoch": 0.99, + "grad_norm": 0.39535316824913025, + "learning_rate": 0.00038047557469018077, + "loss": 0.0672, + "step": 558 + }, + { + "epoch": 0.99, + "grad_norm": 0.5337291359901428, + "learning_rate": 0.00038007775243290666, + "loss": 0.238, + "step": 559 + }, + { + "epoch": 0.99, + "grad_norm": 0.7618711590766907, + "learning_rate": 0.0003796794780044992, + "loss": 0.0741, + "step": 560 + }, + { + "epoch": 0.99, + "grad_norm": 0.3507292568683624, + "learning_rate": 0.0003792807527894242, + "loss": 0.1035, + "step": 561 + }, + { + "epoch": 0.99, + "grad_norm": 0.29699352383613586, + "learning_rate": 0.00037888157817371455, + "loss": 0.0732, + "step": 562 + }, + { + "epoch": 1.0, + "grad_norm": 0.1690889596939087, + "learning_rate": 0.0003784819555449651, + "loss": 0.0625, + "step": 563 + }, + { + "epoch": 1.0, + "grad_norm": 0.28516581654548645, + "learning_rate": 0.0003780818862923284, + "loss": 0.0705, + "step": 564 + }, + { + "epoch": 1.0, + "grad_norm": 0.3408360481262207, + "learning_rate": 0.00037768137180650913, + "loss": 0.1025, + "step": 565 + }, + { + "epoch": 1.0, + "grad_norm": 0.28147757053375244, + "learning_rate": 0.00037728041347976005, + "loss": 0.0495, + "step": 566 + }, + { + "epoch": 1.0, + "grad_norm": 0.31090235710144043, + "learning_rate": 0.00037687901270587655, + "loss": 0.0874, + "step": 567 + }, + { + "epoch": 1.0, + "grad_norm": 0.29111558198928833, + "learning_rate": 0.00037647717088019217, + "loss": 0.0589, + "step": 568 + }, + { + "epoch": 1.0, + "eval_loss": 0.08437130600214005, + "eval_runtime": 14.722, + "eval_samples_per_second": 32.4, + "eval_steps_per_second": 8.151, + "step": 568 + }, + { + "epoch": 1.01, + "grad_norm": 0.14942067861557007, + "learning_rate": 0.0003760748893995736, + "loss": 0.0391, + "step": 569 + }, + { + "epoch": 1.01, + "grad_norm": 0.28915029764175415, + "learning_rate": 0.0003756721696624156, + "loss": 0.0522, + "step": 570 + }, + { + "epoch": 1.01, + "grad_norm": 0.1540856510400772, + "learning_rate": 0.0003752690130686367, + "loss": 0.0473, + "step": 571 + }, + { + "epoch": 1.01, + "grad_norm": 0.31572067737579346, + "learning_rate": 0.0003748654210196739, + "loss": 0.058, + "step": 572 + }, + { + "epoch": 1.01, + "grad_norm": 0.4497004449367523, + "learning_rate": 0.0003744613949184779, + "loss": 0.0937, + "step": 573 + }, + { + "epoch": 1.02, + "grad_norm": 0.48680734634399414, + "learning_rate": 0.0003740569361695082, + "loss": 0.0925, + "step": 574 + }, + { + "epoch": 1.02, + "grad_norm": 0.3604874610900879, + "learning_rate": 0.00037365204617872836, + "loss": 0.0273, + "step": 575 + }, + { + "epoch": 1.02, + "grad_norm": 0.31378456950187683, + "learning_rate": 0.0003732467263536008, + "loss": 0.048, + "step": 576 + }, + { + "epoch": 1.02, + "grad_norm": 0.37197205424308777, + "learning_rate": 0.0003728409781030824, + "loss": 0.0445, + "step": 577 + }, + { + "epoch": 1.02, + "grad_norm": 0.09396038949489594, + "learning_rate": 0.00037243480283761913, + "loss": 0.0102, + "step": 578 + }, + { + "epoch": 1.02, + "grad_norm": 0.4994851052761078, + "learning_rate": 0.00037202820196914133, + "loss": 0.074, + "step": 579 + }, + { + "epoch": 1.03, + "grad_norm": 0.20099425315856934, + "learning_rate": 0.0003716211769110589, + "loss": 0.0239, + "step": 580 + }, + { + "epoch": 1.03, + "grad_norm": 0.33086463809013367, + "learning_rate": 0.0003712137290782561, + "loss": 0.0305, + "step": 581 + }, + { + "epoch": 1.03, + "grad_norm": 0.3871704041957855, + "learning_rate": 0.0003708058598870871, + "loss": 0.0309, + "step": 582 + }, + { + "epoch": 1.03, + "grad_norm": 0.514127790927887, + "learning_rate": 0.0003703975707553706, + "loss": 0.0639, + "step": 583 + }, + { + "epoch": 1.03, + "grad_norm": 0.29386666417121887, + "learning_rate": 0.000369988863102385, + "loss": 0.0778, + "step": 584 + }, + { + "epoch": 1.03, + "grad_norm": 0.4717571437358856, + "learning_rate": 0.0003695797383488638, + "loss": 0.0414, + "step": 585 + }, + { + "epoch": 1.04, + "grad_norm": 0.61000657081604, + "learning_rate": 0.0003691701979169903, + "loss": 0.0687, + "step": 586 + }, + { + "epoch": 1.04, + "grad_norm": 0.3639252483844757, + "learning_rate": 0.0003687602432303926, + "loss": 0.0337, + "step": 587 + }, + { + "epoch": 1.04, + "grad_norm": 0.20936115086078644, + "learning_rate": 0.0003683498757141391, + "loss": 0.0232, + "step": 588 + }, + { + "epoch": 1.04, + "grad_norm": 0.5917474031448364, + "learning_rate": 0.00036793909679473294, + "loss": 0.0564, + "step": 589 + }, + { + "epoch": 1.04, + "grad_norm": 0.23065000772476196, + "learning_rate": 0.00036752790790010767, + "loss": 0.0246, + "step": 590 + }, + { + "epoch": 1.05, + "grad_norm": 0.1876888871192932, + "learning_rate": 0.00036711631045962173, + "loss": 0.0351, + "step": 591 + }, + { + "epoch": 1.05, + "grad_norm": 0.6483283042907715, + "learning_rate": 0.000366704305904054, + "loss": 0.0628, + "step": 592 + }, + { + "epoch": 1.05, + "grad_norm": 0.8450531363487244, + "learning_rate": 0.0003662918956655983, + "loss": 0.0922, + "step": 593 + }, + { + "epoch": 1.05, + "grad_norm": 0.5190649628639221, + "learning_rate": 0.00036587908117785887, + "loss": 0.0715, + "step": 594 + }, + { + "epoch": 1.05, + "grad_norm": 0.597562849521637, + "learning_rate": 0.000365465863875845, + "loss": 0.0728, + "step": 595 + }, + { + "epoch": 1.05, + "grad_norm": 0.5079246759414673, + "learning_rate": 0.0003650522451959663, + "loss": 0.1145, + "step": 596 + }, + { + "epoch": 1.06, + "grad_norm": 0.4016817808151245, + "learning_rate": 0.0003646382265760276, + "loss": 0.1373, + "step": 597 + }, + { + "epoch": 1.06, + "grad_norm": 0.2420119345188141, + "learning_rate": 0.00036422380945522426, + "loss": 0.0428, + "step": 598 + }, + { + "epoch": 1.06, + "grad_norm": 0.2923775017261505, + "learning_rate": 0.00036380899527413646, + "loss": 0.0407, + "step": 599 + }, + { + "epoch": 1.06, + "grad_norm": 0.2968994379043579, + "learning_rate": 0.00036339378547472497, + "loss": 0.039, + "step": 600 + }, + { + "epoch": 1.06, + "grad_norm": 0.3694530129432678, + "learning_rate": 0.0003629781815003256, + "loss": 0.0765, + "step": 601 + }, + { + "epoch": 1.07, + "grad_norm": 0.19854502379894257, + "learning_rate": 0.0003625621847956443, + "loss": 0.0283, + "step": 602 + }, + { + "epoch": 1.07, + "grad_norm": 0.16821172833442688, + "learning_rate": 0.0003621457968067526, + "loss": 0.0551, + "step": 603 + }, + { + "epoch": 1.07, + "grad_norm": 0.5689147114753723, + "learning_rate": 0.00036172901898108177, + "loss": 0.0818, + "step": 604 + }, + { + "epoch": 1.07, + "grad_norm": 0.16156058013439178, + "learning_rate": 0.0003613118527674185, + "loss": 0.0368, + "step": 605 + }, + { + "epoch": 1.07, + "grad_norm": 0.21022377908229828, + "learning_rate": 0.00036089429961589926, + "loss": 0.0614, + "step": 606 + }, + { + "epoch": 1.07, + "grad_norm": 0.3432522714138031, + "learning_rate": 0.00036047636097800593, + "loss": 0.0393, + "step": 607 + }, + { + "epoch": 1.08, + "grad_norm": 0.2537219524383545, + "learning_rate": 0.00036005803830656036, + "loss": 0.0852, + "step": 608 + }, + { + "epoch": 1.08, + "grad_norm": 0.209491565823555, + "learning_rate": 0.00035963933305571916, + "loss": 0.0476, + "step": 609 + }, + { + "epoch": 1.08, + "grad_norm": 0.2286662459373474, + "learning_rate": 0.00035922024668096883, + "loss": 0.0614, + "step": 610 + }, + { + "epoch": 1.08, + "grad_norm": 0.5772972106933594, + "learning_rate": 0.00035880078063912105, + "loss": 0.0546, + "step": 611 + }, + { + "epoch": 1.08, + "grad_norm": 0.37829965353012085, + "learning_rate": 0.0003583809363883069, + "loss": 0.0526, + "step": 612 + }, + { + "epoch": 1.08, + "grad_norm": 0.1876819133758545, + "learning_rate": 0.0003579607153879724, + "loss": 0.0339, + "step": 613 + }, + { + "epoch": 1.09, + "grad_norm": 0.42904049158096313, + "learning_rate": 0.0003575401190988732, + "loss": 0.0705, + "step": 614 + }, + { + "epoch": 1.09, + "grad_norm": 0.2780819833278656, + "learning_rate": 0.0003571191489830693, + "loss": 0.0425, + "step": 615 + }, + { + "epoch": 1.09, + "grad_norm": 0.3338189721107483, + "learning_rate": 0.00035669780650392056, + "loss": 0.0713, + "step": 616 + }, + { + "epoch": 1.09, + "grad_norm": 0.2791332006454468, + "learning_rate": 0.000356276093126081, + "loss": 0.0392, + "step": 617 + }, + { + "epoch": 1.09, + "grad_norm": 0.4691467881202698, + "learning_rate": 0.0003558540103154939, + "loss": 0.0756, + "step": 618 + }, + { + "epoch": 1.1, + "grad_norm": 0.34194234013557434, + "learning_rate": 0.00035543155953938674, + "loss": 0.057, + "step": 619 + }, + { + "epoch": 1.1, + "grad_norm": 0.43898969888687134, + "learning_rate": 0.00035500874226626633, + "loss": 0.1484, + "step": 620 + }, + { + "epoch": 1.1, + "grad_norm": 0.3562189042568207, + "learning_rate": 0.00035458555996591325, + "loss": 0.0801, + "step": 621 + }, + { + "epoch": 1.1, + "grad_norm": 0.2978869080543518, + "learning_rate": 0.0003541620141093771, + "loss": 0.0422, + "step": 622 + }, + { + "epoch": 1.1, + "grad_norm": 0.415714293718338, + "learning_rate": 0.00035373810616897116, + "loss": 0.042, + "step": 623 + }, + { + "epoch": 1.1, + "grad_norm": 0.28547269105911255, + "learning_rate": 0.00035331383761826756, + "loss": 0.0722, + "step": 624 + }, + { + "epoch": 1.11, + "grad_norm": 0.2831112742424011, + "learning_rate": 0.00035288920993209173, + "loss": 0.0339, + "step": 625 + }, + { + "epoch": 1.11, + "grad_norm": 0.372010201215744, + "learning_rate": 0.00035246422458651766, + "loss": 0.0573, + "step": 626 + }, + { + "epoch": 1.11, + "grad_norm": 0.07014724612236023, + "learning_rate": 0.0003520388830588625, + "loss": 0.0108, + "step": 627 + }, + { + "epoch": 1.11, + "grad_norm": 0.5464847087860107, + "learning_rate": 0.0003516131868276817, + "loss": 0.0871, + "step": 628 + }, + { + "epoch": 1.11, + "grad_norm": 0.118097685277462, + "learning_rate": 0.00035118713737276376, + "loss": 0.0176, + "step": 629 + }, + { + "epoch": 1.11, + "grad_norm": 0.43580326437950134, + "learning_rate": 0.00035076073617512475, + "loss": 0.0817, + "step": 630 + }, + { + "epoch": 1.12, + "grad_norm": 0.42866209149360657, + "learning_rate": 0.00035033398471700367, + "loss": 0.1195, + "step": 631 + }, + { + "epoch": 1.12, + "grad_norm": 0.42996305227279663, + "learning_rate": 0.0003499068844818571, + "loss": 0.12, + "step": 632 + }, + { + "epoch": 1.12, + "grad_norm": 0.4283413290977478, + "learning_rate": 0.0003494794369543539, + "loss": 0.085, + "step": 633 + }, + { + "epoch": 1.12, + "grad_norm": 0.693706214427948, + "learning_rate": 0.0003490516436203703, + "loss": 0.126, + "step": 634 + }, + { + "epoch": 1.12, + "grad_norm": 0.264961838722229, + "learning_rate": 0.00034862350596698456, + "loss": 0.0556, + "step": 635 + }, + { + "epoch": 1.13, + "grad_norm": 0.2530398368835449, + "learning_rate": 0.00034819502548247175, + "loss": 0.0514, + "step": 636 + }, + { + "epoch": 1.13, + "grad_norm": 0.18521098792552948, + "learning_rate": 0.0003477662036562989, + "loss": 0.0387, + "step": 637 + }, + { + "epoch": 1.13, + "grad_norm": 0.34398314356803894, + "learning_rate": 0.00034733704197911937, + "loss": 0.1047, + "step": 638 + }, + { + "epoch": 1.13, + "grad_norm": 0.16019423305988312, + "learning_rate": 0.000346907541942768, + "loss": 0.0299, + "step": 639 + }, + { + "epoch": 1.13, + "grad_norm": 0.3269899785518646, + "learning_rate": 0.00034647770504025587, + "loss": 0.0405, + "step": 640 + }, + { + "epoch": 1.13, + "grad_norm": 0.46410876512527466, + "learning_rate": 0.00034604753276576487, + "loss": 0.0855, + "step": 641 + }, + { + "epoch": 1.14, + "grad_norm": 0.33000048995018005, + "learning_rate": 0.000345617026614643, + "loss": 0.0759, + "step": 642 + }, + { + "epoch": 1.14, + "grad_norm": 0.31162315607070923, + "learning_rate": 0.0003451861880833986, + "loss": 0.0558, + "step": 643 + }, + { + "epoch": 1.14, + "grad_norm": 0.42918407917022705, + "learning_rate": 0.0003447550186696956, + "loss": 0.0365, + "step": 644 + }, + { + "epoch": 1.14, + "grad_norm": 0.2732023298740387, + "learning_rate": 0.00034432351987234786, + "loss": 0.0616, + "step": 645 + }, + { + "epoch": 1.14, + "grad_norm": 0.1899593621492386, + "learning_rate": 0.00034389169319131476, + "loss": 0.0286, + "step": 646 + }, + { + "epoch": 1.14, + "grad_norm": 0.447968065738678, + "learning_rate": 0.0003434595401276947, + "loss": 0.0701, + "step": 647 + }, + { + "epoch": 1.15, + "grad_norm": 0.15938018262386322, + "learning_rate": 0.0003430270621837213, + "loss": 0.026, + "step": 648 + }, + { + "epoch": 1.15, + "grad_norm": 0.17606832087039948, + "learning_rate": 0.0003425942608627572, + "loss": 0.0245, + "step": 649 + }, + { + "epoch": 1.15, + "grad_norm": 0.49266988039016724, + "learning_rate": 0.0003421611376692892, + "loss": 0.0823, + "step": 650 + }, + { + "epoch": 1.15, + "grad_norm": 0.3935730755329132, + "learning_rate": 0.0003417276941089232, + "loss": 0.0426, + "step": 651 + }, + { + "epoch": 1.15, + "grad_norm": 0.5984533429145813, + "learning_rate": 0.0003412939316883782, + "loss": 0.0833, + "step": 652 + }, + { + "epoch": 1.16, + "grad_norm": 0.3196690082550049, + "learning_rate": 0.00034085985191548217, + "loss": 0.0337, + "step": 653 + }, + { + "epoch": 1.16, + "grad_norm": 0.39022788405418396, + "learning_rate": 0.000340425456299166, + "loss": 0.0235, + "step": 654 + }, + { + "epoch": 1.16, + "grad_norm": 0.29681891202926636, + "learning_rate": 0.00033999074634945856, + "loss": 0.0155, + "step": 655 + }, + { + "epoch": 1.16, + "grad_norm": 0.5547076463699341, + "learning_rate": 0.0003395557235774813, + "loss": 0.0942, + "step": 656 + }, + { + "epoch": 1.16, + "grad_norm": 0.9071078300476074, + "learning_rate": 0.00033912038949544316, + "loss": 0.1004, + "step": 657 + }, + { + "epoch": 1.16, + "grad_norm": 0.5410562753677368, + "learning_rate": 0.00033868474561663534, + "loss": 0.0743, + "step": 658 + }, + { + "epoch": 1.17, + "grad_norm": 0.5785720348358154, + "learning_rate": 0.0003382487934554257, + "loss": 0.1017, + "step": 659 + }, + { + "epoch": 1.17, + "grad_norm": 0.60345858335495, + "learning_rate": 0.0003378125345272539, + "loss": 0.117, + "step": 660 + }, + { + "epoch": 1.17, + "grad_norm": 0.23607215285301208, + "learning_rate": 0.0003373759703486262, + "loss": 0.0149, + "step": 661 + }, + { + "epoch": 1.17, + "grad_norm": 0.3551620543003082, + "learning_rate": 0.0003369391024371093, + "loss": 0.0435, + "step": 662 + }, + { + "epoch": 1.17, + "grad_norm": 0.4280162453651428, + "learning_rate": 0.00033650193231132657, + "loss": 0.1019, + "step": 663 + }, + { + "epoch": 1.17, + "grad_norm": 0.12040708214044571, + "learning_rate": 0.0003360644614909512, + "loss": 0.0165, + "step": 664 + }, + { + "epoch": 1.18, + "grad_norm": 0.6838027238845825, + "learning_rate": 0.00033562669149670213, + "loss": 0.0909, + "step": 665 + }, + { + "epoch": 1.18, + "grad_norm": 0.2861780524253845, + "learning_rate": 0.00033518862385033786, + "loss": 0.0719, + "step": 666 + }, + { + "epoch": 1.18, + "grad_norm": 0.24380998313426971, + "learning_rate": 0.00033475026007465184, + "loss": 0.0388, + "step": 667 + }, + { + "epoch": 1.18, + "grad_norm": 0.41201332211494446, + "learning_rate": 0.00033431160169346714, + "loss": 0.0442, + "step": 668 + }, + { + "epoch": 1.18, + "grad_norm": 0.3734363615512848, + "learning_rate": 0.0003338726502316304, + "loss": 0.0687, + "step": 669 + }, + { + "epoch": 1.19, + "grad_norm": 0.21814176440238953, + "learning_rate": 0.00033343340721500743, + "loss": 0.0743, + "step": 670 + }, + { + "epoch": 1.19, + "grad_norm": 0.17123498022556305, + "learning_rate": 0.00033299387417047723, + "loss": 0.0446, + "step": 671 + }, + { + "epoch": 1.19, + "grad_norm": 0.3857256770133972, + "learning_rate": 0.0003325540526259275, + "loss": 0.0524, + "step": 672 + }, + { + "epoch": 1.19, + "grad_norm": 0.7980711460113525, + "learning_rate": 0.00033211394411024813, + "loss": 0.0786, + "step": 673 + }, + { + "epoch": 1.19, + "grad_norm": 0.31176111102104187, + "learning_rate": 0.00033167355015332713, + "loss": 0.0499, + "step": 674 + }, + { + "epoch": 1.19, + "grad_norm": 0.6255938410758972, + "learning_rate": 0.0003312328722860445, + "loss": 0.0664, + "step": 675 + }, + { + "epoch": 1.2, + "grad_norm": 0.3685753047466278, + "learning_rate": 0.00033079191204026713, + "loss": 0.0495, + "step": 676 + }, + { + "epoch": 1.2, + "grad_norm": 0.4045291841030121, + "learning_rate": 0.00033035067094884366, + "loss": 0.0697, + "step": 677 + }, + { + "epoch": 1.2, + "grad_norm": 0.6035248637199402, + "learning_rate": 0.0003299091505455989, + "loss": 0.1206, + "step": 678 + }, + { + "epoch": 1.2, + "grad_norm": 0.3399547338485718, + "learning_rate": 0.00032946735236532855, + "loss": 0.035, + "step": 679 + }, + { + "epoch": 1.2, + "grad_norm": 0.3604506552219391, + "learning_rate": 0.0003290252779437939, + "loss": 0.1087, + "step": 680 + }, + { + "epoch": 1.2, + "grad_norm": 0.28665006160736084, + "learning_rate": 0.0003285829288177167, + "loss": 0.0858, + "step": 681 + }, + { + "epoch": 1.21, + "grad_norm": 0.41967740654945374, + "learning_rate": 0.0003281403065247733, + "loss": 0.0851, + "step": 682 + }, + { + "epoch": 1.21, + "grad_norm": 0.43989259004592896, + "learning_rate": 0.00032769741260358997, + "loss": 0.0793, + "step": 683 + }, + { + "epoch": 1.21, + "grad_norm": 0.29274123907089233, + "learning_rate": 0.00032725424859373687, + "loss": 0.0538, + "step": 684 + }, + { + "epoch": 1.21, + "grad_norm": 0.27231287956237793, + "learning_rate": 0.0003268108160357233, + "loss": 0.0692, + "step": 685 + }, + { + "epoch": 1.21, + "grad_norm": 0.3030160963535309, + "learning_rate": 0.0003263671164709918, + "loss": 0.0786, + "step": 686 + }, + { + "epoch": 1.22, + "grad_norm": 0.19824832677841187, + "learning_rate": 0.0003259231514419135, + "loss": 0.0699, + "step": 687 + }, + { + "epoch": 1.22, + "grad_norm": 0.23121508955955505, + "learning_rate": 0.0003254789224917818, + "loss": 0.0499, + "step": 688 + }, + { + "epoch": 1.22, + "grad_norm": 0.15240328013896942, + "learning_rate": 0.0003250344311648079, + "loss": 0.0431, + "step": 689 + }, + { + "epoch": 1.22, + "grad_norm": 0.16165931522846222, + "learning_rate": 0.000324589679006115, + "loss": 0.0554, + "step": 690 + }, + { + "epoch": 1.22, + "grad_norm": 0.29693150520324707, + "learning_rate": 0.0003241446675617329, + "loss": 0.0554, + "step": 691 + }, + { + "epoch": 1.22, + "grad_norm": 0.7295424938201904, + "learning_rate": 0.00032369939837859275, + "loss": 0.1232, + "step": 692 + }, + { + "epoch": 1.23, + "grad_norm": 0.43246909976005554, + "learning_rate": 0.0003232538730045215, + "loss": 0.0598, + "step": 693 + }, + { + "epoch": 1.23, + "grad_norm": 0.18855467438697815, + "learning_rate": 0.00032280809298823723, + "loss": 0.0252, + "step": 694 + }, + { + "epoch": 1.23, + "grad_norm": 0.26345816254615784, + "learning_rate": 0.00032236205987934234, + "loss": 0.0809, + "step": 695 + }, + { + "epoch": 1.23, + "grad_norm": 0.497403085231781, + "learning_rate": 0.00032191577522831984, + "loss": 0.0482, + "step": 696 + }, + { + "epoch": 1.23, + "grad_norm": 0.2640454173088074, + "learning_rate": 0.0003214692405865264, + "loss": 0.0538, + "step": 697 + }, + { + "epoch": 1.23, + "grad_norm": 0.335443377494812, + "learning_rate": 0.00032102245750618833, + "loss": 0.1, + "step": 698 + }, + { + "epoch": 1.24, + "grad_norm": 0.37383145093917847, + "learning_rate": 0.00032057542754039526, + "loss": 0.0767, + "step": 699 + }, + { + "epoch": 1.24, + "grad_norm": 0.3101638853549957, + "learning_rate": 0.00032012815224309496, + "loss": 0.0499, + "step": 700 + }, + { + "epoch": 1.24, + "grad_norm": 0.2282581478357315, + "learning_rate": 0.00031968063316908815, + "loss": 0.0424, + "step": 701 + }, + { + "epoch": 1.24, + "grad_norm": 0.1553642451763153, + "learning_rate": 0.00031923287187402287, + "loss": 0.0446, + "step": 702 + }, + { + "epoch": 1.24, + "grad_norm": 0.1549568623304367, + "learning_rate": 0.0003187848699143894, + "loss": 0.0252, + "step": 703 + }, + { + "epoch": 1.25, + "grad_norm": 0.4515601694583893, + "learning_rate": 0.00031833662884751416, + "loss": 0.0852, + "step": 704 + }, + { + "epoch": 1.25, + "grad_norm": 0.23497383296489716, + "learning_rate": 0.0003178881502315552, + "loss": 0.0291, + "step": 705 + }, + { + "epoch": 1.25, + "grad_norm": 0.5249956846237183, + "learning_rate": 0.000317439435625496, + "loss": 0.0478, + "step": 706 + }, + { + "epoch": 1.25, + "grad_norm": 0.7284122705459595, + "learning_rate": 0.0003169904865891405, + "loss": 0.0584, + "step": 707 + }, + { + "epoch": 1.25, + "grad_norm": 0.34631770849227905, + "learning_rate": 0.00031654130468310784, + "loss": 0.092, + "step": 708 + }, + { + "epoch": 1.25, + "grad_norm": 0.43634921312332153, + "learning_rate": 0.000316091891468826, + "loss": 0.0758, + "step": 709 + }, + { + "epoch": 1.26, + "grad_norm": 0.2862977683544159, + "learning_rate": 0.00031564224850852754, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 1.26, + "eval_loss": 0.08585863560438156, + "eval_runtime": 14.7073, + "eval_samples_per_second": 32.433, + "eval_steps_per_second": 8.159, + "step": 710 + }, + { + "epoch": 1.26, + "grad_norm": 0.4469147324562073, + "learning_rate": 0.0003151923773652436, + "loss": 0.0807, + "step": 711 + }, + { + "epoch": 1.26, + "grad_norm": 0.25653207302093506, + "learning_rate": 0.00031474227960279834, + "loss": 0.0618, + "step": 712 + }, + { + "epoch": 1.26, + "grad_norm": 0.4219815731048584, + "learning_rate": 0.0003142919567858039, + "loss": 0.0592, + "step": 713 + }, + { + "epoch": 1.26, + "grad_norm": 0.8190480470657349, + "learning_rate": 0.0003138414104796545, + "loss": 0.103, + "step": 714 + }, + { + "epoch": 1.26, + "grad_norm": 0.3400764465332031, + "learning_rate": 0.0003133906422505215, + "loss": 0.0998, + "step": 715 + }, + { + "epoch": 1.27, + "grad_norm": 0.21949267387390137, + "learning_rate": 0.0003129396536653474, + "loss": 0.0395, + "step": 716 + }, + { + "epoch": 1.27, + "grad_norm": 0.30582305788993835, + "learning_rate": 0.0003124884462918411, + "loss": 0.0835, + "step": 717 + }, + { + "epoch": 1.27, + "grad_norm": 0.11762493848800659, + "learning_rate": 0.0003120370216984716, + "loss": 0.026, + "step": 718 + }, + { + "epoch": 1.27, + "grad_norm": 0.1867324858903885, + "learning_rate": 0.00031158538145446314, + "loss": 0.0544, + "step": 719 + }, + { + "epoch": 1.27, + "grad_norm": 0.15806153416633606, + "learning_rate": 0.00031113352712978996, + "loss": 0.0406, + "step": 720 + }, + { + "epoch": 1.28, + "grad_norm": 0.2605026662349701, + "learning_rate": 0.00031068146029516997, + "loss": 0.0431, + "step": 721 + }, + { + "epoch": 1.28, + "grad_norm": 0.27978816628456116, + "learning_rate": 0.00031022918252206005, + "loss": 0.0948, + "step": 722 + }, + { + "epoch": 1.28, + "grad_norm": 0.19412288069725037, + "learning_rate": 0.00030977669538265017, + "loss": 0.0305, + "step": 723 + }, + { + "epoch": 1.28, + "grad_norm": 0.42198699712753296, + "learning_rate": 0.0003093240004498585, + "loss": 0.1205, + "step": 724 + }, + { + "epoch": 1.28, + "grad_norm": 0.22501109540462494, + "learning_rate": 0.0003088710992973249, + "loss": 0.0368, + "step": 725 + }, + { + "epoch": 1.28, + "grad_norm": 0.4651114046573639, + "learning_rate": 0.00030841799349940667, + "loss": 0.1044, + "step": 726 + }, + { + "epoch": 1.29, + "grad_norm": 0.4760609567165375, + "learning_rate": 0.00030796468463117216, + "loss": 0.0829, + "step": 727 + }, + { + "epoch": 1.29, + "grad_norm": 0.308748722076416, + "learning_rate": 0.0003075111742683957, + "loss": 0.0382, + "step": 728 + }, + { + "epoch": 1.29, + "grad_norm": 0.21451212465763092, + "learning_rate": 0.0003070574639875521, + "loss": 0.0441, + "step": 729 + }, + { + "epoch": 1.29, + "grad_norm": 0.1944577842950821, + "learning_rate": 0.00030660355536581103, + "loss": 0.0326, + "step": 730 + }, + { + "epoch": 1.29, + "grad_norm": 0.5249868035316467, + "learning_rate": 0.0003061494499810317, + "loss": 0.0857, + "step": 731 + }, + { + "epoch": 1.3, + "grad_norm": 0.2004554718732834, + "learning_rate": 0.00030569514941175725, + "loss": 0.0533, + "step": 732 + }, + { + "epoch": 1.3, + "grad_norm": 0.21708889305591583, + "learning_rate": 0.00030524065523720935, + "loss": 0.0562, + "step": 733 + }, + { + "epoch": 1.3, + "grad_norm": 0.36287248134613037, + "learning_rate": 0.00030478596903728267, + "loss": 0.1, + "step": 734 + }, + { + "epoch": 1.3, + "grad_norm": 0.4405117928981781, + "learning_rate": 0.0003043310923925394, + "loss": 0.0929, + "step": 735 + }, + { + "epoch": 1.3, + "grad_norm": 0.3343874514102936, + "learning_rate": 0.0003038760268842036, + "loss": 0.0549, + "step": 736 + }, + { + "epoch": 1.3, + "grad_norm": 0.2254178822040558, + "learning_rate": 0.00030342077409415606, + "loss": 0.0495, + "step": 737 + }, + { + "epoch": 1.31, + "grad_norm": 0.19972631335258484, + "learning_rate": 0.00030296533560492854, + "loss": 0.0301, + "step": 738 + }, + { + "epoch": 1.31, + "grad_norm": 0.19470427930355072, + "learning_rate": 0.0003025097129996983, + "loss": 0.0485, + "step": 739 + }, + { + "epoch": 1.31, + "grad_norm": 0.34024572372436523, + "learning_rate": 0.0003020539078622824, + "loss": 0.0509, + "step": 740 + }, + { + "epoch": 1.31, + "grad_norm": 0.26424598693847656, + "learning_rate": 0.00030159792177713294, + "loss": 0.0293, + "step": 741 + }, + { + "epoch": 1.31, + "grad_norm": 0.3307158946990967, + "learning_rate": 0.00030114175632933043, + "loss": 0.0302, + "step": 742 + }, + { + "epoch": 1.31, + "grad_norm": 0.448448121547699, + "learning_rate": 0.0003006854131045793, + "loss": 0.0683, + "step": 743 + }, + { + "epoch": 1.32, + "grad_norm": 0.4010024070739746, + "learning_rate": 0.0003002288936892017, + "loss": 0.078, + "step": 744 + }, + { + "epoch": 1.32, + "grad_norm": 0.22475454211235046, + "learning_rate": 0.0002997721996701324, + "loss": 0.0303, + "step": 745 + }, + { + "epoch": 1.32, + "grad_norm": 0.31867673993110657, + "learning_rate": 0.000299315332634913, + "loss": 0.0521, + "step": 746 + }, + { + "epoch": 1.32, + "grad_norm": 0.2040076106786728, + "learning_rate": 0.0002988582941716867, + "loss": 0.0225, + "step": 747 + }, + { + "epoch": 1.32, + "grad_norm": 1.1286782026290894, + "learning_rate": 0.00029840108586919246, + "loss": 0.0833, + "step": 748 + }, + { + "epoch": 1.33, + "grad_norm": 0.6526787877082825, + "learning_rate": 0.00029794370931675963, + "loss": 0.1085, + "step": 749 + }, + { + "epoch": 1.33, + "grad_norm": 0.3272201418876648, + "learning_rate": 0.00029748616610430264, + "loss": 0.0213, + "step": 750 + }, + { + "epoch": 1.33, + "grad_norm": 0.5573351383209229, + "learning_rate": 0.0002970284578223149, + "loss": 0.0478, + "step": 751 + }, + { + "epoch": 1.33, + "grad_norm": 0.31984782218933105, + "learning_rate": 0.00029657058606186393, + "loss": 0.0353, + "step": 752 + }, + { + "epoch": 1.33, + "grad_norm": 1.712653398513794, + "learning_rate": 0.00029611255241458533, + "loss": 0.0787, + "step": 753 + }, + { + "epoch": 1.33, + "grad_norm": 0.5753667950630188, + "learning_rate": 0.00029565435847267766, + "loss": 0.1024, + "step": 754 + }, + { + "epoch": 1.34, + "grad_norm": 0.44150909781455994, + "learning_rate": 0.00029519600582889657, + "loss": 0.0261, + "step": 755 + }, + { + "epoch": 1.34, + "grad_norm": 0.2625367343425751, + "learning_rate": 0.00029473749607654914, + "loss": 0.0685, + "step": 756 + }, + { + "epoch": 1.34, + "grad_norm": 0.48232585191726685, + "learning_rate": 0.00029427883080948905, + "loss": 0.0299, + "step": 757 + }, + { + "epoch": 1.34, + "grad_norm": 0.2749970853328705, + "learning_rate": 0.00029382001162211026, + "loss": 0.022, + "step": 758 + }, + { + "epoch": 1.34, + "grad_norm": 0.2759183645248413, + "learning_rate": 0.00029336104010934186, + "loss": 0.0417, + "step": 759 + }, + { + "epoch": 1.34, + "grad_norm": 0.11506503075361252, + "learning_rate": 0.0002929019178666425, + "loss": 0.0114, + "step": 760 + }, + { + "epoch": 1.35, + "grad_norm": 0.10981517285108566, + "learning_rate": 0.0002924426464899947, + "loss": 0.0132, + "step": 761 + }, + { + "epoch": 1.35, + "grad_norm": 0.2663099467754364, + "learning_rate": 0.0002919832275758994, + "loss": 0.0628, + "step": 762 + }, + { + "epoch": 1.35, + "grad_norm": 0.6475871205329895, + "learning_rate": 0.0002915236627213705, + "loss": 0.0819, + "step": 763 + }, + { + "epoch": 1.35, + "grad_norm": 0.3743927478790283, + "learning_rate": 0.00029106395352392913, + "loss": 0.0526, + "step": 764 + }, + { + "epoch": 1.35, + "grad_norm": 0.2879592776298523, + "learning_rate": 0.0002906041015815983, + "loss": 0.0441, + "step": 765 + }, + { + "epoch": 1.36, + "grad_norm": 0.5118088126182556, + "learning_rate": 0.0002901441084928969, + "loss": 0.0304, + "step": 766 + }, + { + "epoch": 1.36, + "grad_norm": 0.8219065070152283, + "learning_rate": 0.000289683975856835, + "loss": 0.0682, + "step": 767 + }, + { + "epoch": 1.36, + "grad_norm": 0.45978665351867676, + "learning_rate": 0.00028922370527290715, + "loss": 0.0385, + "step": 768 + }, + { + "epoch": 1.36, + "grad_norm": 0.3295181095600128, + "learning_rate": 0.000288763298341088, + "loss": 0.0219, + "step": 769 + }, + { + "epoch": 1.36, + "grad_norm": 0.9101231694221497, + "learning_rate": 0.00028830275666182564, + "loss": 0.1396, + "step": 770 + }, + { + "epoch": 1.36, + "grad_norm": 0.4265996217727661, + "learning_rate": 0.000287842081836037, + "loss": 0.0514, + "step": 771 + }, + { + "epoch": 1.37, + "grad_norm": 0.6179928183555603, + "learning_rate": 0.00028738127546510165, + "loss": 0.0615, + "step": 772 + }, + { + "epoch": 1.37, + "grad_norm": 0.39278754591941833, + "learning_rate": 0.00028692033915085635, + "loss": 0.0422, + "step": 773 + }, + { + "epoch": 1.37, + "grad_norm": 0.4660128653049469, + "learning_rate": 0.00028645927449558986, + "loss": 0.1055, + "step": 774 + }, + { + "epoch": 1.37, + "grad_norm": 0.6555572152137756, + "learning_rate": 0.0002859980831020366, + "loss": 0.0934, + "step": 775 + }, + { + "epoch": 1.37, + "grad_norm": 0.4676145613193512, + "learning_rate": 0.0002855367665733722, + "loss": 0.0624, + "step": 776 + }, + { + "epoch": 1.37, + "grad_norm": 0.6434176564216614, + "learning_rate": 0.0002850753265132066, + "loss": 0.0415, + "step": 777 + }, + { + "epoch": 1.38, + "grad_norm": 0.2195662558078766, + "learning_rate": 0.0002846137645255796, + "loss": 0.0502, + "step": 778 + }, + { + "epoch": 1.38, + "grad_norm": 0.33307918906211853, + "learning_rate": 0.00028415208221495465, + "loss": 0.0692, + "step": 779 + }, + { + "epoch": 1.38, + "grad_norm": 0.16945867240428925, + "learning_rate": 0.0002836902811862136, + "loss": 0.0296, + "step": 780 + }, + { + "epoch": 1.38, + "grad_norm": 0.3181239366531372, + "learning_rate": 0.00028322836304465093, + "loss": 0.056, + "step": 781 + }, + { + "epoch": 1.38, + "grad_norm": 0.2984309494495392, + "learning_rate": 0.000282766329395968, + "loss": 0.0603, + "step": 782 + }, + { + "epoch": 1.39, + "grad_norm": 0.19178460538387299, + "learning_rate": 0.0002823041818462681, + "loss": 0.0315, + "step": 783 + }, + { + "epoch": 1.39, + "grad_norm": 0.3132456839084625, + "learning_rate": 0.0002818419220020502, + "loss": 0.0421, + "step": 784 + }, + { + "epoch": 1.39, + "grad_norm": 0.4850837290287018, + "learning_rate": 0.00028137955147020355, + "loss": 0.0835, + "step": 785 + }, + { + "epoch": 1.39, + "grad_norm": 0.2325926274061203, + "learning_rate": 0.00028091707185800245, + "loss": 0.033, + "step": 786 + }, + { + "epoch": 1.39, + "grad_norm": 0.5203066468238831, + "learning_rate": 0.0002804544847731001, + "loss": 0.0766, + "step": 787 + }, + { + "epoch": 1.39, + "grad_norm": 0.2967028021812439, + "learning_rate": 0.00027999179182352347, + "loss": 0.0287, + "step": 788 + }, + { + "epoch": 1.4, + "grad_norm": 0.42808797955513, + "learning_rate": 0.0002795289946176674, + "loss": 0.039, + "step": 789 + }, + { + "epoch": 1.4, + "grad_norm": 0.36871954798698425, + "learning_rate": 0.00027906609476428937, + "loss": 0.0388, + "step": 790 + }, + { + "epoch": 1.4, + "grad_norm": 0.31911173462867737, + "learning_rate": 0.0002786030938725034, + "loss": 0.0575, + "step": 791 + }, + { + "epoch": 1.4, + "grad_norm": 0.2864239513874054, + "learning_rate": 0.00027813999355177476, + "loss": 0.0711, + "step": 792 + }, + { + "epoch": 1.4, + "grad_norm": 0.11181977391242981, + "learning_rate": 0.0002776767954119147, + "loss": 0.0126, + "step": 793 + }, + { + "epoch": 1.4, + "grad_norm": 0.32950931787490845, + "learning_rate": 0.0002772135010630741, + "loss": 0.025, + "step": 794 + }, + { + "epoch": 1.41, + "grad_norm": 0.3144387900829315, + "learning_rate": 0.0002767501121157386, + "loss": 0.0244, + "step": 795 + }, + { + "epoch": 1.41, + "grad_norm": 0.7986451387405396, + "learning_rate": 0.0002762866301807222, + "loss": 0.013, + "step": 796 + }, + { + "epoch": 1.41, + "grad_norm": 0.11357055604457855, + "learning_rate": 0.0002758230568691627, + "loss": 0.0105, + "step": 797 + }, + { + "epoch": 1.41, + "grad_norm": 0.8880018591880798, + "learning_rate": 0.00027535939379251523, + "loss": 0.1036, + "step": 798 + }, + { + "epoch": 1.41, + "grad_norm": 0.5905174016952515, + "learning_rate": 0.000274895642562547, + "loss": 0.1287, + "step": 799 + }, + { + "epoch": 1.42, + "grad_norm": 0.1973883956670761, + "learning_rate": 0.0002744318047913318, + "loss": 0.0161, + "step": 800 + }, + { + "epoch": 1.42, + "grad_norm": 0.38918519020080566, + "learning_rate": 0.00027396788209124387, + "loss": 0.0428, + "step": 801 + }, + { + "epoch": 1.42, + "grad_norm": 0.2719264328479767, + "learning_rate": 0.0002735038760749531, + "loss": 0.0496, + "step": 802 + }, + { + "epoch": 1.42, + "grad_norm": 0.3014307916164398, + "learning_rate": 0.0002730397883554189, + "loss": 0.0241, + "step": 803 + }, + { + "epoch": 1.42, + "grad_norm": 0.25686872005462646, + "learning_rate": 0.00027257562054588453, + "loss": 0.0672, + "step": 804 + }, + { + "epoch": 1.42, + "grad_norm": 0.30520740151405334, + "learning_rate": 0.00027211137425987175, + "loss": 0.0376, + "step": 805 + }, + { + "epoch": 1.43, + "grad_norm": 0.3014354407787323, + "learning_rate": 0.00027164705111117516, + "loss": 0.0201, + "step": 806 + }, + { + "epoch": 1.43, + "grad_norm": 0.1687713861465454, + "learning_rate": 0.0002711826527138565, + "loss": 0.0328, + "step": 807 + }, + { + "epoch": 1.43, + "grad_norm": 0.31807759404182434, + "learning_rate": 0.00027071818068223906, + "loss": 0.121, + "step": 808 + }, + { + "epoch": 1.43, + "grad_norm": 0.6311066150665283, + "learning_rate": 0.00027025363663090216, + "loss": 0.0745, + "step": 809 + }, + { + "epoch": 1.43, + "grad_norm": 0.12313732504844666, + "learning_rate": 0.0002697890221746754, + "loss": 0.0103, + "step": 810 + }, + { + "epoch": 1.43, + "grad_norm": 0.7472290396690369, + "learning_rate": 0.00026932433892863324, + "loss": 0.0935, + "step": 811 + }, + { + "epoch": 1.44, + "grad_norm": 0.3236583173274994, + "learning_rate": 0.00026885958850808914, + "loss": 0.0592, + "step": 812 + }, + { + "epoch": 1.44, + "grad_norm": 0.6342505216598511, + "learning_rate": 0.00026839477252859007, + "loss": 0.0919, + "step": 813 + }, + { + "epoch": 1.44, + "grad_norm": 0.37357425689697266, + "learning_rate": 0.0002679298926059109, + "loss": 0.0426, + "step": 814 + }, + { + "epoch": 1.44, + "grad_norm": 0.6023468375205994, + "learning_rate": 0.000267464950356049, + "loss": 0.0756, + "step": 815 + }, + { + "epoch": 1.44, + "grad_norm": 0.35651543736457825, + "learning_rate": 0.0002669999473952181, + "loss": 0.0323, + "step": 816 + }, + { + "epoch": 1.45, + "grad_norm": 0.4318545162677765, + "learning_rate": 0.00026653488533984307, + "loss": 0.1178, + "step": 817 + }, + { + "epoch": 1.45, + "grad_norm": 0.31720227003097534, + "learning_rate": 0.00026606976580655415, + "loss": 0.0974, + "step": 818 + }, + { + "epoch": 1.45, + "grad_norm": 0.5617119669914246, + "learning_rate": 0.00026560459041218156, + "loss": 0.1098, + "step": 819 + }, + { + "epoch": 1.45, + "grad_norm": 0.5732232332229614, + "learning_rate": 0.00026513936077374954, + "loss": 0.0949, + "step": 820 + }, + { + "epoch": 1.45, + "grad_norm": 0.33141466975212097, + "learning_rate": 0.00026467407850847105, + "loss": 0.0417, + "step": 821 + }, + { + "epoch": 1.45, + "grad_norm": 0.41724076867103577, + "learning_rate": 0.00026420874523374173, + "loss": 0.0466, + "step": 822 + }, + { + "epoch": 1.46, + "grad_norm": 0.24849863350391388, + "learning_rate": 0.0002637433625671347, + "loss": 0.0536, + "step": 823 + }, + { + "epoch": 1.46, + "grad_norm": 0.3150210380554199, + "learning_rate": 0.00026327793212639486, + "loss": 0.0806, + "step": 824 + }, + { + "epoch": 1.46, + "grad_norm": 0.2865166664123535, + "learning_rate": 0.00026281245552943293, + "loss": 0.0533, + "step": 825 + }, + { + "epoch": 1.46, + "grad_norm": 0.2756612300872803, + "learning_rate": 0.00026234693439432043, + "loss": 0.0504, + "step": 826 + }, + { + "epoch": 1.46, + "grad_norm": 0.9047459363937378, + "learning_rate": 0.0002618813703392833, + "loss": 0.109, + "step": 827 + }, + { + "epoch": 1.46, + "grad_norm": 0.10576523840427399, + "learning_rate": 0.00026141576498269706, + "loss": 0.0231, + "step": 828 + }, + { + "epoch": 1.47, + "grad_norm": 0.4203824996948242, + "learning_rate": 0.00026095011994308056, + "loss": 0.0727, + "step": 829 + }, + { + "epoch": 1.47, + "grad_norm": 0.4648183584213257, + "learning_rate": 0.0002604844368390905, + "loss": 0.1066, + "step": 830 + }, + { + "epoch": 1.47, + "grad_norm": 0.2482835054397583, + "learning_rate": 0.00026001871728951624, + "loss": 0.0237, + "step": 831 + }, + { + "epoch": 1.47, + "grad_norm": 0.4374096393585205, + "learning_rate": 0.00025955296291327356, + "loss": 0.0934, + "step": 832 + }, + { + "epoch": 1.47, + "grad_norm": 0.3488870859146118, + "learning_rate": 0.00025908717532939946, + "loss": 0.0638, + "step": 833 + }, + { + "epoch": 1.48, + "grad_norm": 0.5614967942237854, + "learning_rate": 0.00025862135615704613, + "loss": 0.0827, + "step": 834 + }, + { + "epoch": 1.48, + "grad_norm": 0.30991917848587036, + "learning_rate": 0.0002581555070154759, + "loss": 0.0438, + "step": 835 + }, + { + "epoch": 1.48, + "grad_norm": 0.44601985812187195, + "learning_rate": 0.00025768962952405503, + "loss": 0.0797, + "step": 836 + }, + { + "epoch": 1.48, + "grad_norm": 0.3628086745738983, + "learning_rate": 0.00025722372530224844, + "loss": 0.0366, + "step": 837 + }, + { + "epoch": 1.48, + "grad_norm": 0.2644861936569214, + "learning_rate": 0.000256757795969614, + "loss": 0.0331, + "step": 838 + }, + { + "epoch": 1.48, + "grad_norm": 0.4585146903991699, + "learning_rate": 0.0002562918431457967, + "loss": 0.0635, + "step": 839 + }, + { + "epoch": 1.49, + "grad_norm": 0.4738370478153229, + "learning_rate": 0.0002558258684505233, + "loss": 0.0599, + "step": 840 + }, + { + "epoch": 1.49, + "grad_norm": 0.6536511182785034, + "learning_rate": 0.00025535987350359664, + "loss": 0.077, + "step": 841 + }, + { + "epoch": 1.49, + "grad_norm": 0.43449538946151733, + "learning_rate": 0.00025489385992489, + "loss": 0.0432, + "step": 842 + }, + { + "epoch": 1.49, + "grad_norm": 0.5419031977653503, + "learning_rate": 0.0002544278293343411, + "loss": 0.093, + "step": 843 + }, + { + "epoch": 1.49, + "grad_norm": 0.30555063486099243, + "learning_rate": 0.0002539617833519472, + "loss": 0.0572, + "step": 844 + }, + { + "epoch": 1.49, + "grad_norm": 0.18094651401042938, + "learning_rate": 0.0002534957235977589, + "loss": 0.0353, + "step": 845 + }, + { + "epoch": 1.5, + "grad_norm": 0.34586626291275024, + "learning_rate": 0.00025302965169187467, + "loss": 0.0554, + "step": 846 + }, + { + "epoch": 1.5, + "grad_norm": 0.38212889432907104, + "learning_rate": 0.00025256356925443507, + "loss": 0.0624, + "step": 847 + }, + { + "epoch": 1.5, + "grad_norm": 0.5566253066062927, + "learning_rate": 0.00025209747790561754, + "loss": 0.0603, + "step": 848 + }, + { + "epoch": 1.5, + "grad_norm": 0.2991026043891907, + "learning_rate": 0.0002516313792656304, + "loss": 0.0374, + "step": 849 + }, + { + "epoch": 1.5, + "grad_norm": 0.6026126146316528, + "learning_rate": 0.0002511652749547072, + "loss": 0.1283, + "step": 850 + }, + { + "epoch": 1.51, + "grad_norm": 0.28952938318252563, + "learning_rate": 0.0002506991665931013, + "loss": 0.0708, + "step": 851 + }, + { + "epoch": 1.51, + "grad_norm": 0.3491526246070862, + "learning_rate": 0.00025023305580108027, + "loss": 0.0536, + "step": 852 + }, + { + "epoch": 1.51, + "eval_loss": 0.08201431483030319, + "eval_runtime": 14.6754, + "eval_samples_per_second": 32.503, + "eval_steps_per_second": 8.177, + "step": 852 + }, + { + "epoch": 1.51, + "grad_norm": 0.7052826881408691, + "learning_rate": 0.00024976694419891974, + "loss": 0.059, + "step": 853 + }, + { + "epoch": 1.51, + "grad_norm": 0.43753165006637573, + "learning_rate": 0.0002493008334068987, + "loss": 0.0751, + "step": 854 + }, + { + "epoch": 1.51, + "grad_norm": 0.1675650030374527, + "learning_rate": 0.00024883472504529287, + "loss": 0.0224, + "step": 855 + }, + { + "epoch": 1.51, + "grad_norm": 0.3790821433067322, + "learning_rate": 0.00024836862073436967, + "loss": 0.0707, + "step": 856 + }, + { + "epoch": 1.52, + "grad_norm": 0.3162197470664978, + "learning_rate": 0.0002479025220943825, + "loss": 0.0375, + "step": 857 + }, + { + "epoch": 1.52, + "grad_norm": 0.5708606243133545, + "learning_rate": 0.00024743643074556494, + "loss": 0.0632, + "step": 858 + }, + { + "epoch": 1.52, + "grad_norm": 0.24075205624103546, + "learning_rate": 0.00024697034830812535, + "loss": 0.0452, + "step": 859 + }, + { + "epoch": 1.52, + "grad_norm": 0.371979683637619, + "learning_rate": 0.00024650427640224114, + "loss": 0.0676, + "step": 860 + }, + { + "epoch": 1.52, + "grad_norm": 0.2892770767211914, + "learning_rate": 0.00024603821664805276, + "loss": 0.0592, + "step": 861 + }, + { + "epoch": 1.52, + "grad_norm": 0.14655162394046783, + "learning_rate": 0.00024557217066565896, + "loss": 0.0161, + "step": 862 + }, + { + "epoch": 1.53, + "grad_norm": 0.22819265723228455, + "learning_rate": 0.0002451061400751101, + "loss": 0.0418, + "step": 863 + }, + { + "epoch": 1.53, + "grad_norm": 0.4637010097503662, + "learning_rate": 0.0002446401264964034, + "loss": 0.0555, + "step": 864 + }, + { + "epoch": 1.53, + "grad_norm": 0.30727583169937134, + "learning_rate": 0.00024417413154947677, + "loss": 0.0258, + "step": 865 + }, + { + "epoch": 1.53, + "grad_norm": 0.26563793420791626, + "learning_rate": 0.00024370815685420338, + "loss": 0.0528, + "step": 866 + }, + { + "epoch": 1.53, + "grad_norm": 0.18691249191761017, + "learning_rate": 0.00024324220403038613, + "loss": 0.0432, + "step": 867 + }, + { + "epoch": 1.54, + "grad_norm": 0.3845004439353943, + "learning_rate": 0.00024277627469775163, + "loss": 0.08, + "step": 868 + }, + { + "epoch": 1.54, + "grad_norm": 0.36308273673057556, + "learning_rate": 0.00024231037047594495, + "loss": 0.0569, + "step": 869 + }, + { + "epoch": 1.54, + "grad_norm": 0.43758854269981384, + "learning_rate": 0.00024184449298452414, + "loss": 0.0451, + "step": 870 + }, + { + "epoch": 1.54, + "grad_norm": 0.30163124203681946, + "learning_rate": 0.00024137864384295388, + "loss": 0.0219, + "step": 871 + }, + { + "epoch": 1.54, + "grad_norm": 0.20218737423419952, + "learning_rate": 0.00024091282467060055, + "loss": 0.0277, + "step": 872 + }, + { + "epoch": 1.54, + "grad_norm": 0.2397354692220688, + "learning_rate": 0.00024044703708672648, + "loss": 0.0307, + "step": 873 + }, + { + "epoch": 1.55, + "grad_norm": 0.31938987970352173, + "learning_rate": 0.00023998128271048374, + "loss": 0.0453, + "step": 874 + }, + { + "epoch": 1.55, + "grad_norm": 0.5195713043212891, + "learning_rate": 0.00023951556316090952, + "loss": 0.0852, + "step": 875 + }, + { + "epoch": 1.55, + "grad_norm": 0.5428398251533508, + "learning_rate": 0.00023904988005691953, + "loss": 0.0941, + "step": 876 + }, + { + "epoch": 1.55, + "grad_norm": 0.9265273213386536, + "learning_rate": 0.00023858423501730295, + "loss": 0.0489, + "step": 877 + }, + { + "epoch": 1.55, + "grad_norm": 0.5896188020706177, + "learning_rate": 0.00023811862966071674, + "loss": 0.0601, + "step": 878 + }, + { + "epoch": 1.56, + "grad_norm": 0.6030554175376892, + "learning_rate": 0.0002376530656056796, + "loss": 0.0958, + "step": 879 + }, + { + "epoch": 1.56, + "grad_norm": 0.3220241665840149, + "learning_rate": 0.00023718754447056708, + "loss": 0.0487, + "step": 880 + }, + { + "epoch": 1.56, + "grad_norm": 0.49902886152267456, + "learning_rate": 0.00023672206787360523, + "loss": 0.0457, + "step": 881 + }, + { + "epoch": 1.56, + "grad_norm": 0.3744886517524719, + "learning_rate": 0.00023625663743286534, + "loss": 0.0771, + "step": 882 + }, + { + "epoch": 1.56, + "grad_norm": 0.39943140745162964, + "learning_rate": 0.0002357912547662584, + "loss": 0.0389, + "step": 883 + }, + { + "epoch": 1.56, + "grad_norm": 0.34382057189941406, + "learning_rate": 0.00023532592149152898, + "loss": 0.0405, + "step": 884 + }, + { + "epoch": 1.57, + "grad_norm": 0.22058314085006714, + "learning_rate": 0.00023486063922625042, + "loss": 0.032, + "step": 885 + }, + { + "epoch": 1.57, + "grad_norm": 0.32711130380630493, + "learning_rate": 0.00023439540958781848, + "loss": 0.0589, + "step": 886 + }, + { + "epoch": 1.57, + "grad_norm": 0.42972657084465027, + "learning_rate": 0.0002339302341934459, + "loss": 0.0601, + "step": 887 + }, + { + "epoch": 1.57, + "grad_norm": 0.31368395686149597, + "learning_rate": 0.00023346511466015708, + "loss": 0.0345, + "step": 888 + }, + { + "epoch": 1.57, + "grad_norm": 0.28611400723457336, + "learning_rate": 0.00023300005260478194, + "loss": 0.0432, + "step": 889 + }, + { + "epoch": 1.57, + "grad_norm": 0.5313751101493835, + "learning_rate": 0.00023253504964395097, + "loss": 0.0427, + "step": 890 + }, + { + "epoch": 1.58, + "grad_norm": 0.2192300707101822, + "learning_rate": 0.00023207010739408908, + "loss": 0.0392, + "step": 891 + }, + { + "epoch": 1.58, + "grad_norm": 0.7893845438957214, + "learning_rate": 0.00023160522747141, + "loss": 0.1338, + "step": 892 + }, + { + "epoch": 1.58, + "grad_norm": 0.5475191473960876, + "learning_rate": 0.00023114041149191098, + "loss": 0.1458, + "step": 893 + }, + { + "epoch": 1.58, + "grad_norm": 0.4575919806957245, + "learning_rate": 0.00023067566107136685, + "loss": 0.0593, + "step": 894 + }, + { + "epoch": 1.58, + "grad_norm": 0.47522222995758057, + "learning_rate": 0.00023021097782532457, + "loss": 0.0744, + "step": 895 + }, + { + "epoch": 1.59, + "grad_norm": 0.3471393883228302, + "learning_rate": 0.0002297463633690979, + "loss": 0.0795, + "step": 896 + }, + { + "epoch": 1.59, + "grad_norm": 0.6821273565292358, + "learning_rate": 0.00022928181931776098, + "loss": 0.0692, + "step": 897 + }, + { + "epoch": 1.59, + "grad_norm": 0.4375980496406555, + "learning_rate": 0.00022881734728614347, + "loss": 0.0704, + "step": 898 + }, + { + "epoch": 1.59, + "grad_norm": 0.596495509147644, + "learning_rate": 0.0002283529488888249, + "loss": 0.0744, + "step": 899 + }, + { + "epoch": 1.59, + "grad_norm": 0.36588358879089355, + "learning_rate": 0.00022788862574012824, + "loss": 0.0576, + "step": 900 + }, + { + "epoch": 1.59, + "grad_norm": 0.49629420042037964, + "learning_rate": 0.0002274243794541155, + "loss": 0.0847, + "step": 901 + }, + { + "epoch": 1.6, + "grad_norm": 0.8968174457550049, + "learning_rate": 0.0002269602116445811, + "loss": 0.0521, + "step": 902 + }, + { + "epoch": 1.6, + "grad_norm": 0.260775089263916, + "learning_rate": 0.00022649612392504687, + "loss": 0.0345, + "step": 903 + }, + { + "epoch": 1.6, + "grad_norm": 0.33473265171051025, + "learning_rate": 0.00022603211790875622, + "loss": 0.0483, + "step": 904 + }, + { + "epoch": 1.6, + "grad_norm": 0.451668918132782, + "learning_rate": 0.0002255681952086683, + "loss": 0.0862, + "step": 905 + }, + { + "epoch": 1.6, + "grad_norm": 0.6166467070579529, + "learning_rate": 0.00022510435743745304, + "loss": 0.1038, + "step": 906 + }, + { + "epoch": 1.6, + "grad_norm": 0.361858069896698, + "learning_rate": 0.0002246406062074848, + "loss": 0.073, + "step": 907 + }, + { + "epoch": 1.61, + "grad_norm": 0.42583227157592773, + "learning_rate": 0.00022417694313083735, + "loss": 0.0923, + "step": 908 + }, + { + "epoch": 1.61, + "grad_norm": 0.2566489279270172, + "learning_rate": 0.00022371336981927788, + "loss": 0.0358, + "step": 909 + }, + { + "epoch": 1.61, + "grad_norm": 0.3075582683086395, + "learning_rate": 0.0002232498878842615, + "loss": 0.0912, + "step": 910 + }, + { + "epoch": 1.61, + "grad_norm": 0.1581839770078659, + "learning_rate": 0.00022278649893692584, + "loss": 0.0309, + "step": 911 + }, + { + "epoch": 1.61, + "grad_norm": 0.37088707089424133, + "learning_rate": 0.00022232320458808532, + "loss": 0.074, + "step": 912 + }, + { + "epoch": 1.62, + "grad_norm": 0.40888333320617676, + "learning_rate": 0.00022186000644822522, + "loss": 0.0618, + "step": 913 + }, + { + "epoch": 1.62, + "grad_norm": 0.2161816507577896, + "learning_rate": 0.00022139690612749672, + "loss": 0.0355, + "step": 914 + }, + { + "epoch": 1.62, + "grad_norm": 0.3577941954135895, + "learning_rate": 0.00022093390523571067, + "loss": 0.0499, + "step": 915 + }, + { + "epoch": 1.62, + "grad_norm": 0.3455984592437744, + "learning_rate": 0.0002204710053823326, + "loss": 0.0712, + "step": 916 + }, + { + "epoch": 1.62, + "grad_norm": 0.2223758101463318, + "learning_rate": 0.0002200082081764766, + "loss": 0.0362, + "step": 917 + }, + { + "epoch": 1.62, + "grad_norm": 0.3027271032333374, + "learning_rate": 0.00021954551522689993, + "loss": 0.0579, + "step": 918 + }, + { + "epoch": 1.63, + "grad_norm": 0.12046008557081223, + "learning_rate": 0.00021908292814199764, + "loss": 0.0227, + "step": 919 + }, + { + "epoch": 1.63, + "grad_norm": 0.3685935437679291, + "learning_rate": 0.00021862044852979652, + "loss": 0.0797, + "step": 920 + }, + { + "epoch": 1.63, + "grad_norm": 0.26438644528388977, + "learning_rate": 0.00021815807799794982, + "loss": 0.0567, + "step": 921 + }, + { + "epoch": 1.63, + "grad_norm": 0.44811537861824036, + "learning_rate": 0.00021769581815373192, + "loss": 0.0623, + "step": 922 + }, + { + "epoch": 1.63, + "grad_norm": 0.36297371983528137, + "learning_rate": 0.000217233670604032, + "loss": 0.0512, + "step": 923 + }, + { + "epoch": 1.63, + "grad_norm": 0.30395954847335815, + "learning_rate": 0.00021677163695534913, + "loss": 0.0423, + "step": 924 + }, + { + "epoch": 1.64, + "grad_norm": 0.26092368364334106, + "learning_rate": 0.00021630971881378644, + "loss": 0.0463, + "step": 925 + }, + { + "epoch": 1.64, + "grad_norm": 0.639478325843811, + "learning_rate": 0.0002158479177850453, + "loss": 0.0564, + "step": 926 + }, + { + "epoch": 1.64, + "grad_norm": 0.25447505712509155, + "learning_rate": 0.00021538623547442045, + "loss": 0.0317, + "step": 927 + }, + { + "epoch": 1.64, + "grad_norm": 0.24460607767105103, + "learning_rate": 0.00021492467348679345, + "loss": 0.0375, + "step": 928 + }, + { + "epoch": 1.64, + "grad_norm": 0.43972596526145935, + "learning_rate": 0.00021446323342662785, + "loss": 0.0587, + "step": 929 + }, + { + "epoch": 1.65, + "grad_norm": 0.08744898438453674, + "learning_rate": 0.00021400191689796338, + "loss": 0.0074, + "step": 930 + }, + { + "epoch": 1.65, + "grad_norm": 0.29619458317756653, + "learning_rate": 0.00021354072550441018, + "loss": 0.0422, + "step": 931 + }, + { + "epoch": 1.65, + "grad_norm": 0.3273064196109772, + "learning_rate": 0.00021307966084914372, + "loss": 0.0362, + "step": 932 + }, + { + "epoch": 1.65, + "grad_norm": 0.2981872260570526, + "learning_rate": 0.00021261872453489842, + "loss": 0.0246, + "step": 933 + }, + { + "epoch": 1.65, + "grad_norm": 0.7154226899147034, + "learning_rate": 0.00021215791816396303, + "loss": 0.0856, + "step": 934 + }, + { + "epoch": 1.65, + "grad_norm": 0.4782339930534363, + "learning_rate": 0.00021169724333817443, + "loss": 0.0482, + "step": 935 + }, + { + "epoch": 1.66, + "grad_norm": 0.5048168897628784, + "learning_rate": 0.00021123670165891208, + "loss": 0.0405, + "step": 936 + }, + { + "epoch": 1.66, + "grad_norm": 0.22893092036247253, + "learning_rate": 0.0002107762947270928, + "loss": 0.0181, + "step": 937 + }, + { + "epoch": 1.66, + "grad_norm": 0.6863519549369812, + "learning_rate": 0.00021031602414316506, + "loss": 0.0643, + "step": 938 + }, + { + "epoch": 1.66, + "grad_norm": 0.5500178337097168, + "learning_rate": 0.0002098558915071031, + "loss": 0.0599, + "step": 939 + }, + { + "epoch": 1.66, + "grad_norm": 0.7170897126197815, + "learning_rate": 0.0002093958984184018, + "loss": 0.1167, + "step": 940 + }, + { + "epoch": 1.66, + "grad_norm": 0.48540323972702026, + "learning_rate": 0.00020893604647607088, + "loss": 0.0443, + "step": 941 + }, + { + "epoch": 1.67, + "grad_norm": 0.09318219870328903, + "learning_rate": 0.0002084763372786295, + "loss": 0.0092, + "step": 942 + }, + { + "epoch": 1.67, + "grad_norm": 0.7748222351074219, + "learning_rate": 0.00020801677242410067, + "loss": 0.0912, + "step": 943 + }, + { + "epoch": 1.67, + "grad_norm": 0.2857852280139923, + "learning_rate": 0.00020755735351000537, + "loss": 0.0313, + "step": 944 + }, + { + "epoch": 1.67, + "grad_norm": 0.483059287071228, + "learning_rate": 0.00020709808213335758, + "loss": 0.0768, + "step": 945 + }, + { + "epoch": 1.67, + "grad_norm": 0.2753046751022339, + "learning_rate": 0.0002066389598906582, + "loss": 0.0339, + "step": 946 + }, + { + "epoch": 1.68, + "grad_norm": 0.29203662276268005, + "learning_rate": 0.00020617998837788975, + "loss": 0.0185, + "step": 947 + }, + { + "epoch": 1.68, + "grad_norm": 0.3008381426334381, + "learning_rate": 0.00020572116919051098, + "loss": 0.0574, + "step": 948 + }, + { + "epoch": 1.68, + "grad_norm": 0.6928682923316956, + "learning_rate": 0.0002052625039234509, + "loss": 0.051, + "step": 949 + }, + { + "epoch": 1.68, + "grad_norm": 0.27521032094955444, + "learning_rate": 0.00020480399417110352, + "loss": 0.05, + "step": 950 + }, + { + "epoch": 1.68, + "grad_norm": 0.1467350274324417, + "learning_rate": 0.00020434564152732238, + "loss": 0.0254, + "step": 951 + }, + { + "epoch": 1.68, + "grad_norm": 0.2940123379230499, + "learning_rate": 0.00020388744758541462, + "loss": 0.0521, + "step": 952 + }, + { + "epoch": 1.69, + "grad_norm": 0.2931191325187683, + "learning_rate": 0.00020342941393813613, + "loss": 0.0394, + "step": 953 + }, + { + "epoch": 1.69, + "grad_norm": 0.2518831491470337, + "learning_rate": 0.00020297154217768513, + "loss": 0.041, + "step": 954 + }, + { + "epoch": 1.69, + "grad_norm": 0.48200809955596924, + "learning_rate": 0.00020251383389569743, + "loss": 0.0693, + "step": 955 + }, + { + "epoch": 1.69, + "grad_norm": 0.4188168942928314, + "learning_rate": 0.0002020562906832404, + "loss": 0.0594, + "step": 956 + }, + { + "epoch": 1.69, + "grad_norm": 0.4201320707798004, + "learning_rate": 0.00020159891413080755, + "loss": 0.0737, + "step": 957 + }, + { + "epoch": 1.69, + "grad_norm": 0.4236721396446228, + "learning_rate": 0.00020114170582831342, + "loss": 0.0443, + "step": 958 + }, + { + "epoch": 1.7, + "grad_norm": 0.39196375012397766, + "learning_rate": 0.00020068466736508704, + "loss": 0.0728, + "step": 959 + }, + { + "epoch": 1.7, + "grad_norm": 0.5320992469787598, + "learning_rate": 0.00020022780032986765, + "loss": 0.0416, + "step": 960 + }, + { + "epoch": 1.7, + "grad_norm": 0.12672173976898193, + "learning_rate": 0.00019977110631079836, + "loss": 0.0191, + "step": 961 + }, + { + "epoch": 1.7, + "grad_norm": 0.6431661248207092, + "learning_rate": 0.0001993145868954207, + "loss": 0.038, + "step": 962 + }, + { + "epoch": 1.7, + "grad_norm": 0.29868853092193604, + "learning_rate": 0.00019885824367066955, + "loss": 0.0245, + "step": 963 + }, + { + "epoch": 1.71, + "grad_norm": 0.5283737778663635, + "learning_rate": 0.0001984020782228671, + "loss": 0.0722, + "step": 964 + }, + { + "epoch": 1.71, + "grad_norm": 0.5567461252212524, + "learning_rate": 0.00019794609213771755, + "loss": 0.1026, + "step": 965 + }, + { + "epoch": 1.71, + "grad_norm": 0.9617827534675598, + "learning_rate": 0.00019749028700030181, + "loss": 0.078, + "step": 966 + }, + { + "epoch": 1.71, + "grad_norm": 0.5466052889823914, + "learning_rate": 0.0001970346643950715, + "loss": 0.075, + "step": 967 + }, + { + "epoch": 1.71, + "grad_norm": 0.18108782172203064, + "learning_rate": 0.00019657922590584392, + "loss": 0.024, + "step": 968 + }, + { + "epoch": 1.71, + "grad_norm": 0.4150354862213135, + "learning_rate": 0.00019612397311579647, + "loss": 0.0656, + "step": 969 + }, + { + "epoch": 1.72, + "grad_norm": 0.21237897872924805, + "learning_rate": 0.0001956689076074607, + "loss": 0.0378, + "step": 970 + }, + { + "epoch": 1.72, + "grad_norm": 0.21254923939704895, + "learning_rate": 0.0001952140309627174, + "loss": 0.0173, + "step": 971 + }, + { + "epoch": 1.72, + "grad_norm": 0.2641647756099701, + "learning_rate": 0.0001947593447627907, + "loss": 0.049, + "step": 972 + }, + { + "epoch": 1.72, + "grad_norm": 0.3682314455509186, + "learning_rate": 0.00019430485058824276, + "loss": 0.0485, + "step": 973 + }, + { + "epoch": 1.72, + "grad_norm": 0.2566399574279785, + "learning_rate": 0.00019385055001896835, + "loss": 0.0388, + "step": 974 + }, + { + "epoch": 1.72, + "grad_norm": 0.20328454673290253, + "learning_rate": 0.000193396444634189, + "loss": 0.0259, + "step": 975 + }, + { + "epoch": 1.73, + "grad_norm": 0.5327407717704773, + "learning_rate": 0.00019294253601244792, + "loss": 0.085, + "step": 976 + }, + { + "epoch": 1.73, + "grad_norm": 0.6960484385490417, + "learning_rate": 0.00019248882573160437, + "loss": 0.1077, + "step": 977 + }, + { + "epoch": 1.73, + "grad_norm": 0.5338547229766846, + "learning_rate": 0.00019203531536882785, + "loss": 0.0421, + "step": 978 + }, + { + "epoch": 1.73, + "grad_norm": 0.1970924586057663, + "learning_rate": 0.00019158200650059337, + "loss": 0.024, + "step": 979 + }, + { + "epoch": 1.73, + "grad_norm": 0.4665428698062897, + "learning_rate": 0.0001911289007026751, + "loss": 0.0549, + "step": 980 + }, + { + "epoch": 1.74, + "grad_norm": 0.4011171758174896, + "learning_rate": 0.00019067599955014156, + "loss": 0.0482, + "step": 981 + }, + { + "epoch": 1.74, + "grad_norm": 0.25179675221443176, + "learning_rate": 0.00019022330461734982, + "loss": 0.0327, + "step": 982 + }, + { + "epoch": 1.74, + "grad_norm": 0.2733090817928314, + "learning_rate": 0.00018977081747794, + "loss": 0.0271, + "step": 983 + }, + { + "epoch": 1.74, + "grad_norm": 0.5745018124580383, + "learning_rate": 0.00018931853970483012, + "loss": 0.0344, + "step": 984 + }, + { + "epoch": 1.74, + "grad_norm": 0.12801390886306763, + "learning_rate": 0.00018886647287021007, + "loss": 0.0144, + "step": 985 + }, + { + "epoch": 1.74, + "grad_norm": 0.1290263682603836, + "learning_rate": 0.00018841461854553681, + "loss": 0.0132, + "step": 986 + }, + { + "epoch": 1.75, + "grad_norm": 0.863158643245697, + "learning_rate": 0.00018796297830152853, + "loss": 0.1274, + "step": 987 + }, + { + "epoch": 1.75, + "grad_norm": 0.3602030277252197, + "learning_rate": 0.00018751155370815895, + "loss": 0.0549, + "step": 988 + }, + { + "epoch": 1.75, + "grad_norm": 0.379295289516449, + "learning_rate": 0.00018706034633465257, + "loss": 0.0266, + "step": 989 + }, + { + "epoch": 1.75, + "grad_norm": 0.43907806277275085, + "learning_rate": 0.00018660935774947858, + "loss": 0.0499, + "step": 990 + }, + { + "epoch": 1.75, + "grad_norm": 0.822163462638855, + "learning_rate": 0.00018615858952034548, + "loss": 0.1464, + "step": 991 + }, + { + "epoch": 1.75, + "grad_norm": 0.3563006520271301, + "learning_rate": 0.00018570804321419614, + "loss": 0.0499, + "step": 992 + }, + { + "epoch": 1.76, + "grad_norm": 0.082757368683815, + "learning_rate": 0.00018525772039720167, + "loss": 0.0088, + "step": 993 + }, + { + "epoch": 1.76, + "grad_norm": 0.349202424287796, + "learning_rate": 0.00018480762263475638, + "loss": 0.0325, + "step": 994 + }, + { + "epoch": 1.76, + "eval_loss": 0.0834248885512352, + "eval_runtime": 14.6855, + "eval_samples_per_second": 32.481, + "eval_steps_per_second": 8.171, + "step": 994 + }, + { + "epoch": 1.76, + "grad_norm": 0.2513701319694519, + "learning_rate": 0.0001843577514914725, + "loss": 0.0186, + "step": 995 + }, + { + "epoch": 1.76, + "grad_norm": 0.4057576656341553, + "learning_rate": 0.00018390810853117408, + "loss": 0.0348, + "step": 996 + }, + { + "epoch": 1.76, + "grad_norm": 0.46003130078315735, + "learning_rate": 0.0001834586953168923, + "loss": 0.0689, + "step": 997 + }, + { + "epoch": 1.77, + "grad_norm": 0.41909146308898926, + "learning_rate": 0.00018300951341085946, + "loss": 0.0298, + "step": 998 + }, + { + "epoch": 1.77, + "grad_norm": 0.9829010367393494, + "learning_rate": 0.00018256056437450399, + "loss": 0.2026, + "step": 999 + }, + { + "epoch": 1.77, + "grad_norm": 0.31356698274612427, + "learning_rate": 0.00018211184976844487, + "loss": 0.0263, + "step": 1000 + }, + { + "epoch": 1.77, + "grad_norm": 0.4269973337650299, + "learning_rate": 0.00018166337115248585, + "loss": 0.1063, + "step": 1001 + }, + { + "epoch": 1.77, + "grad_norm": 0.3558803200721741, + "learning_rate": 0.00018121513008561064, + "loss": 0.0389, + "step": 1002 + }, + { + "epoch": 1.77, + "grad_norm": 0.5086562633514404, + "learning_rate": 0.0001807671281259771, + "loss": 0.0593, + "step": 1003 + }, + { + "epoch": 1.78, + "grad_norm": 0.1954115778207779, + "learning_rate": 0.00018031936683091186, + "loss": 0.0327, + "step": 1004 + }, + { + "epoch": 1.78, + "grad_norm": 0.1789095103740692, + "learning_rate": 0.0001798718477569051, + "loss": 0.024, + "step": 1005 + }, + { + "epoch": 1.78, + "grad_norm": 0.38798651099205017, + "learning_rate": 0.0001794245724596048, + "loss": 0.0791, + "step": 1006 + }, + { + "epoch": 1.78, + "grad_norm": 0.34710198640823364, + "learning_rate": 0.00017897754249381165, + "loss": 0.0571, + "step": 1007 + }, + { + "epoch": 1.78, + "grad_norm": 0.2781204283237457, + "learning_rate": 0.00017853075941347363, + "loss": 0.0521, + "step": 1008 + }, + { + "epoch": 1.79, + "grad_norm": 0.2825307548046112, + "learning_rate": 0.00017808422477168023, + "loss": 0.0474, + "step": 1009 + }, + { + "epoch": 1.79, + "grad_norm": 0.23594889044761658, + "learning_rate": 0.0001776379401206577, + "loss": 0.0295, + "step": 1010 + }, + { + "epoch": 1.79, + "grad_norm": 0.37222880125045776, + "learning_rate": 0.00017719190701176286, + "loss": 0.0411, + "step": 1011 + }, + { + "epoch": 1.79, + "grad_norm": 0.25766775012016296, + "learning_rate": 0.00017674612699547846, + "loss": 0.0419, + "step": 1012 + }, + { + "epoch": 1.79, + "grad_norm": 0.27667155861854553, + "learning_rate": 0.00017630060162140737, + "loss": 0.0325, + "step": 1013 + }, + { + "epoch": 1.79, + "grad_norm": 0.49651435017585754, + "learning_rate": 0.00017585533243826712, + "loss": 0.0435, + "step": 1014 + }, + { + "epoch": 1.8, + "grad_norm": 0.7008858323097229, + "learning_rate": 0.00017541032099388499, + "loss": 0.1405, + "step": 1015 + }, + { + "epoch": 1.8, + "grad_norm": 0.17448720335960388, + "learning_rate": 0.0001749655688351921, + "loss": 0.0269, + "step": 1016 + }, + { + "epoch": 1.8, + "grad_norm": 0.2893378734588623, + "learning_rate": 0.0001745210775082182, + "loss": 0.0417, + "step": 1017 + }, + { + "epoch": 1.8, + "grad_norm": 0.18504270911216736, + "learning_rate": 0.0001740768485580866, + "loss": 0.0302, + "step": 1018 + }, + { + "epoch": 1.8, + "grad_norm": 0.2060771882534027, + "learning_rate": 0.00017363288352900818, + "loss": 0.047, + "step": 1019 + }, + { + "epoch": 1.8, + "grad_norm": 0.8185610771179199, + "learning_rate": 0.00017318918396427675, + "loss": 0.1398, + "step": 1020 + }, + { + "epoch": 1.81, + "grad_norm": 0.46132713556289673, + "learning_rate": 0.00017274575140626317, + "loss": 0.0776, + "step": 1021 + }, + { + "epoch": 1.81, + "grad_norm": 0.16016420722007751, + "learning_rate": 0.0001723025873964101, + "loss": 0.0161, + "step": 1022 + }, + { + "epoch": 1.81, + "grad_norm": 0.6459915041923523, + "learning_rate": 0.00017185969347522674, + "loss": 0.0711, + "step": 1023 + }, + { + "epoch": 1.81, + "grad_norm": 0.40434324741363525, + "learning_rate": 0.0001714170711822834, + "loss": 0.0571, + "step": 1024 + }, + { + "epoch": 1.81, + "grad_norm": 0.5824777483940125, + "learning_rate": 0.00017097472205620607, + "loss": 0.1141, + "step": 1025 + }, + { + "epoch": 1.82, + "grad_norm": 0.4143454134464264, + "learning_rate": 0.00017053264763467152, + "loss": 0.0558, + "step": 1026 + }, + { + "epoch": 1.82, + "grad_norm": 0.36720553040504456, + "learning_rate": 0.00017009084945440113, + "loss": 0.0376, + "step": 1027 + }, + { + "epoch": 1.82, + "grad_norm": 0.27180641889572144, + "learning_rate": 0.00016964932905115632, + "loss": 0.054, + "step": 1028 + }, + { + "epoch": 1.82, + "grad_norm": 0.43961653113365173, + "learning_rate": 0.0001692080879597329, + "loss": 0.0773, + "step": 1029 + }, + { + "epoch": 1.82, + "grad_norm": 0.2728005647659302, + "learning_rate": 0.00016876712771395552, + "loss": 0.0142, + "step": 1030 + }, + { + "epoch": 1.82, + "grad_norm": 0.5099291205406189, + "learning_rate": 0.0001683264498466729, + "loss": 0.0404, + "step": 1031 + }, + { + "epoch": 1.83, + "grad_norm": 0.3162379562854767, + "learning_rate": 0.00016788605588975193, + "loss": 0.0332, + "step": 1032 + }, + { + "epoch": 1.83, + "grad_norm": 0.4152194857597351, + "learning_rate": 0.0001674459473740726, + "loss": 0.0352, + "step": 1033 + }, + { + "epoch": 1.83, + "grad_norm": 0.3174980878829956, + "learning_rate": 0.00016700612582952278, + "loss": 0.0777, + "step": 1034 + }, + { + "epoch": 1.83, + "grad_norm": 0.6996863484382629, + "learning_rate": 0.0001665665927849926, + "loss": 0.1145, + "step": 1035 + }, + { + "epoch": 1.83, + "grad_norm": 0.2766638398170471, + "learning_rate": 0.0001661273497683697, + "loss": 0.0179, + "step": 1036 + }, + { + "epoch": 1.83, + "grad_norm": 0.45079368352890015, + "learning_rate": 0.00016568839830653287, + "loss": 0.1081, + "step": 1037 + }, + { + "epoch": 1.84, + "grad_norm": 0.44944706559181213, + "learning_rate": 0.0001652497399253481, + "loss": 0.0964, + "step": 1038 + }, + { + "epoch": 1.84, + "grad_norm": 0.5892651081085205, + "learning_rate": 0.00016481137614966223, + "loss": 0.1138, + "step": 1039 + }, + { + "epoch": 1.84, + "grad_norm": 0.29900479316711426, + "learning_rate": 0.00016437330850329793, + "loss": 0.0429, + "step": 1040 + }, + { + "epoch": 1.84, + "grad_norm": 0.3094378411769867, + "learning_rate": 0.00016393553850904878, + "loss": 0.0577, + "step": 1041 + }, + { + "epoch": 1.84, + "grad_norm": 0.23039738833904266, + "learning_rate": 0.00016349806768867345, + "loss": 0.026, + "step": 1042 + }, + { + "epoch": 1.85, + "grad_norm": 0.3328697979450226, + "learning_rate": 0.00016306089756289063, + "loss": 0.0542, + "step": 1043 + }, + { + "epoch": 1.85, + "grad_norm": 0.3017619252204895, + "learning_rate": 0.0001626240296513739, + "loss": 0.0363, + "step": 1044 + }, + { + "epoch": 1.85, + "grad_norm": 0.15930373966693878, + "learning_rate": 0.0001621874654727461, + "loss": 0.02, + "step": 1045 + }, + { + "epoch": 1.85, + "grad_norm": 0.40952980518341064, + "learning_rate": 0.00016175120654457432, + "loss": 0.0523, + "step": 1046 + }, + { + "epoch": 1.85, + "grad_norm": 0.6540464162826538, + "learning_rate": 0.00016131525438336475, + "loss": 0.0744, + "step": 1047 + }, + { + "epoch": 1.85, + "grad_norm": 0.3518769443035126, + "learning_rate": 0.00016087961050455685, + "loss": 0.05, + "step": 1048 + }, + { + "epoch": 1.86, + "grad_norm": 0.4166756570339203, + "learning_rate": 0.0001604442764225188, + "loss": 0.0681, + "step": 1049 + }, + { + "epoch": 1.86, + "grad_norm": 0.39616283774375916, + "learning_rate": 0.00016000925365054154, + "loss": 0.0416, + "step": 1050 + }, + { + "epoch": 1.86, + "grad_norm": 0.20040427148342133, + "learning_rate": 0.00015957454370083398, + "loss": 0.0284, + "step": 1051 + }, + { + "epoch": 1.86, + "grad_norm": 0.7230433821678162, + "learning_rate": 0.00015914014808451784, + "loss": 0.1035, + "step": 1052 + }, + { + "epoch": 1.86, + "grad_norm": 0.1968737691640854, + "learning_rate": 0.00015870606831162182, + "loss": 0.0281, + "step": 1053 + }, + { + "epoch": 1.86, + "grad_norm": 0.2677242159843445, + "learning_rate": 0.0001582723058910769, + "loss": 0.0566, + "step": 1054 + }, + { + "epoch": 1.87, + "grad_norm": 0.12256369739770889, + "learning_rate": 0.00015783886233071076, + "loss": 0.0192, + "step": 1055 + }, + { + "epoch": 1.87, + "grad_norm": 0.311192125082016, + "learning_rate": 0.00015740573913724276, + "loss": 0.035, + "step": 1056 + }, + { + "epoch": 1.87, + "grad_norm": 0.36169809103012085, + "learning_rate": 0.00015697293781627878, + "loss": 0.0755, + "step": 1057 + }, + { + "epoch": 1.87, + "grad_norm": 0.8104953765869141, + "learning_rate": 0.00015654045987230532, + "loss": 0.0418, + "step": 1058 + }, + { + "epoch": 1.87, + "grad_norm": 0.35273879766464233, + "learning_rate": 0.00015610830680868533, + "loss": 0.0266, + "step": 1059 + }, + { + "epoch": 1.88, + "grad_norm": 0.18313364684581757, + "learning_rate": 0.00015567648012765212, + "loss": 0.0538, + "step": 1060 + }, + { + "epoch": 1.88, + "grad_norm": 0.40563294291496277, + "learning_rate": 0.0001552449813303044, + "loss": 0.046, + "step": 1061 + }, + { + "epoch": 1.88, + "grad_norm": 0.44426023960113525, + "learning_rate": 0.00015481381191660143, + "loss": 0.0938, + "step": 1062 + }, + { + "epoch": 1.88, + "grad_norm": 0.4189196228981018, + "learning_rate": 0.00015438297338535702, + "loss": 0.0344, + "step": 1063 + }, + { + "epoch": 1.88, + "grad_norm": 0.6641749143600464, + "learning_rate": 0.0001539524672342351, + "loss": 0.0729, + "step": 1064 + }, + { + "epoch": 1.88, + "grad_norm": 0.2397107034921646, + "learning_rate": 0.00015352229495974422, + "loss": 0.0493, + "step": 1065 + }, + { + "epoch": 1.89, + "grad_norm": 0.17326873540878296, + "learning_rate": 0.00015309245805723205, + "loss": 0.0131, + "step": 1066 + }, + { + "epoch": 1.89, + "grad_norm": 0.69275963306427, + "learning_rate": 0.00015266295802088064, + "loss": 0.1512, + "step": 1067 + }, + { + "epoch": 1.89, + "grad_norm": 0.3260841369628906, + "learning_rate": 0.00015223379634370115, + "loss": 0.0602, + "step": 1068 + }, + { + "epoch": 1.89, + "grad_norm": 0.45368266105651855, + "learning_rate": 0.00015180497451752826, + "loss": 0.0593, + "step": 1069 + }, + { + "epoch": 1.89, + "grad_norm": 0.5664640069007874, + "learning_rate": 0.0001513764940330155, + "loss": 0.0651, + "step": 1070 + }, + { + "epoch": 1.89, + "grad_norm": 0.21212846040725708, + "learning_rate": 0.00015094835637962975, + "loss": 0.0232, + "step": 1071 + }, + { + "epoch": 1.9, + "grad_norm": 0.364945650100708, + "learning_rate": 0.0001505205630456461, + "loss": 0.0436, + "step": 1072 + }, + { + "epoch": 1.9, + "grad_norm": 0.48835766315460205, + "learning_rate": 0.00015009311551814297, + "loss": 0.0885, + "step": 1073 + }, + { + "epoch": 1.9, + "grad_norm": 0.22198058664798737, + "learning_rate": 0.00014966601528299637, + "loss": 0.026, + "step": 1074 + }, + { + "epoch": 1.9, + "grad_norm": 0.2598209083080292, + "learning_rate": 0.00014923926382487534, + "loss": 0.0306, + "step": 1075 + }, + { + "epoch": 1.9, + "grad_norm": 0.22863651812076569, + "learning_rate": 0.0001488128626272363, + "loss": 0.0476, + "step": 1076 + }, + { + "epoch": 1.91, + "grad_norm": 0.4222748875617981, + "learning_rate": 0.00014838681317231822, + "loss": 0.0837, + "step": 1077 + }, + { + "epoch": 1.91, + "grad_norm": 0.5555634498596191, + "learning_rate": 0.00014796111694113752, + "loss": 0.0747, + "step": 1078 + }, + { + "epoch": 1.91, + "grad_norm": 0.28704702854156494, + "learning_rate": 0.0001475357754134824, + "loss": 0.0388, + "step": 1079 + }, + { + "epoch": 1.91, + "grad_norm": 0.3526531457901001, + "learning_rate": 0.00014711079006790828, + "loss": 0.0396, + "step": 1080 + }, + { + "epoch": 1.91, + "grad_norm": 0.41639888286590576, + "learning_rate": 0.0001466861623817325, + "loss": 0.0954, + "step": 1081 + }, + { + "epoch": 1.91, + "grad_norm": 0.288824200630188, + "learning_rate": 0.0001462618938310288, + "loss": 0.0355, + "step": 1082 + }, + { + "epoch": 1.92, + "grad_norm": 0.257003515958786, + "learning_rate": 0.00014583798589062292, + "loss": 0.0257, + "step": 1083 + }, + { + "epoch": 1.92, + "grad_norm": 0.23509138822555542, + "learning_rate": 0.00014541444003408682, + "loss": 0.0548, + "step": 1084 + }, + { + "epoch": 1.92, + "grad_norm": 0.3425995707511902, + "learning_rate": 0.0001449912577337337, + "loss": 0.0691, + "step": 1085 + }, + { + "epoch": 1.92, + "grad_norm": 0.2606826722621918, + "learning_rate": 0.00014456844046061332, + "loss": 0.029, + "step": 1086 + }, + { + "epoch": 1.92, + "grad_norm": 0.10555114597082138, + "learning_rate": 0.00014414598968450615, + "loss": 0.0166, + "step": 1087 + }, + { + "epoch": 1.92, + "grad_norm": 0.47334909439086914, + "learning_rate": 0.00014372390687391906, + "loss": 0.0438, + "step": 1088 + }, + { + "epoch": 1.93, + "grad_norm": 0.36925116181373596, + "learning_rate": 0.00014330219349607947, + "loss": 0.0163, + "step": 1089 + }, + { + "epoch": 1.93, + "grad_norm": 0.2707056999206543, + "learning_rate": 0.0001428808510169307, + "loss": 0.0709, + "step": 1090 + }, + { + "epoch": 1.93, + "grad_norm": 0.20645679533481598, + "learning_rate": 0.00014245988090112694, + "loss": 0.0351, + "step": 1091 + }, + { + "epoch": 1.93, + "grad_norm": 0.1839297115802765, + "learning_rate": 0.00014203928461202763, + "loss": 0.025, + "step": 1092 + }, + { + "epoch": 1.93, + "grad_norm": 0.6433751583099365, + "learning_rate": 0.0001416190636116932, + "loss": 0.0693, + "step": 1093 + }, + { + "epoch": 1.94, + "grad_norm": 0.44755876064300537, + "learning_rate": 0.00014119921936087907, + "loss": 0.0788, + "step": 1094 + }, + { + "epoch": 1.94, + "grad_norm": 0.3716716766357422, + "learning_rate": 0.00014077975331903118, + "loss": 0.0429, + "step": 1095 + }, + { + "epoch": 1.94, + "grad_norm": 0.7285858392715454, + "learning_rate": 0.00014036066694428096, + "loss": 0.035, + "step": 1096 + }, + { + "epoch": 1.94, + "grad_norm": 0.22279293835163116, + "learning_rate": 0.00013994196169343963, + "loss": 0.012, + "step": 1097 + }, + { + "epoch": 1.94, + "grad_norm": 0.17774836719036102, + "learning_rate": 0.00013952363902199405, + "loss": 0.0238, + "step": 1098 + }, + { + "epoch": 1.94, + "grad_norm": 0.2312237024307251, + "learning_rate": 0.0001391057003841008, + "loss": 0.0546, + "step": 1099 + }, + { + "epoch": 1.95, + "grad_norm": 0.6004791855812073, + "learning_rate": 0.0001386881472325816, + "loss": 0.0625, + "step": 1100 + }, + { + "epoch": 1.95, + "grad_norm": 0.21409562230110168, + "learning_rate": 0.0001382709810189183, + "loss": 0.034, + "step": 1101 + }, + { + "epoch": 1.95, + "grad_norm": 0.295493483543396, + "learning_rate": 0.00013785420319324744, + "loss": 0.0332, + "step": 1102 + }, + { + "epoch": 1.95, + "grad_norm": 0.5428887009620667, + "learning_rate": 0.00013743781520435573, + "loss": 0.0649, + "step": 1103 + }, + { + "epoch": 1.95, + "grad_norm": 0.331990122795105, + "learning_rate": 0.00013702181849967453, + "loss": 0.046, + "step": 1104 + }, + { + "epoch": 1.95, + "grad_norm": 0.4544171392917633, + "learning_rate": 0.00013660621452527504, + "loss": 0.0563, + "step": 1105 + }, + { + "epoch": 1.96, + "grad_norm": 0.35486406087875366, + "learning_rate": 0.0001361910047258635, + "loss": 0.0583, + "step": 1106 + }, + { + "epoch": 1.96, + "grad_norm": 0.24665361642837524, + "learning_rate": 0.00013577619054477575, + "loss": 0.0267, + "step": 1107 + }, + { + "epoch": 1.96, + "grad_norm": 0.07276459783315659, + "learning_rate": 0.00013536177342397243, + "loss": 0.0064, + "step": 1108 + }, + { + "epoch": 1.96, + "grad_norm": 0.4690609872341156, + "learning_rate": 0.00013494775480403384, + "loss": 0.0553, + "step": 1109 + }, + { + "epoch": 1.96, + "grad_norm": 0.4010032117366791, + "learning_rate": 0.00013453413612415512, + "loss": 0.0514, + "step": 1110 + }, + { + "epoch": 1.97, + "grad_norm": 0.3563205301761627, + "learning_rate": 0.00013412091882214112, + "loss": 0.0553, + "step": 1111 + }, + { + "epoch": 1.97, + "grad_norm": 0.6027369499206543, + "learning_rate": 0.00013370810433440167, + "loss": 0.0677, + "step": 1112 + }, + { + "epoch": 1.97, + "grad_norm": 0.5082702040672302, + "learning_rate": 0.00013329569409594605, + "loss": 0.1265, + "step": 1113 + }, + { + "epoch": 1.97, + "grad_norm": 0.5313373804092407, + "learning_rate": 0.00013288368954037834, + "loss": 0.0234, + "step": 1114 + }, + { + "epoch": 1.97, + "grad_norm": 0.33385488390922546, + "learning_rate": 0.00013247209209989242, + "loss": 0.0252, + "step": 1115 + }, + { + "epoch": 1.97, + "grad_norm": 0.36459195613861084, + "learning_rate": 0.00013206090320526704, + "loss": 0.0211, + "step": 1116 + }, + { + "epoch": 1.98, + "grad_norm": 0.5477709770202637, + "learning_rate": 0.00013165012428586096, + "loss": 0.0416, + "step": 1117 + }, + { + "epoch": 1.98, + "grad_norm": 0.3133089542388916, + "learning_rate": 0.0001312397567696074, + "loss": 0.036, + "step": 1118 + }, + { + "epoch": 1.98, + "grad_norm": 0.284045934677124, + "learning_rate": 0.00013082980208300971, + "loss": 0.0249, + "step": 1119 + }, + { + "epoch": 1.98, + "grad_norm": 0.3401576578617096, + "learning_rate": 0.00013042026165113618, + "loss": 0.0281, + "step": 1120 + }, + { + "epoch": 1.98, + "grad_norm": 0.21280981600284576, + "learning_rate": 0.00013001113689761496, + "loss": 0.0186, + "step": 1121 + }, + { + "epoch": 1.98, + "grad_norm": 0.1294207125902176, + "learning_rate": 0.00012960242924462957, + "loss": 0.0156, + "step": 1122 + }, + { + "epoch": 1.99, + "grad_norm": 0.11151756346225739, + "learning_rate": 0.00012919414011291298, + "loss": 0.0111, + "step": 1123 + }, + { + "epoch": 1.99, + "grad_norm": 0.7448397874832153, + "learning_rate": 0.0001287862709217439, + "loss": 0.0898, + "step": 1124 + }, + { + "epoch": 1.99, + "grad_norm": 0.4080904424190521, + "learning_rate": 0.00012837882308894117, + "loss": 0.0323, + "step": 1125 + }, + { + "epoch": 1.99, + "grad_norm": 0.33506283164024353, + "learning_rate": 0.00012797179803085862, + "loss": 0.0309, + "step": 1126 + }, + { + "epoch": 1.99, + "grad_norm": 0.32063427567481995, + "learning_rate": 0.00012756519716238096, + "loss": 0.0978, + "step": 1127 + }, + { + "epoch": 2.0, + "grad_norm": 0.7773024439811707, + "learning_rate": 0.0001271590218969176, + "loss": 0.1017, + "step": 1128 + }, + { + "epoch": 2.0, + "grad_norm": 0.2934909164905548, + "learning_rate": 0.00012675327364639917, + "loss": 0.0192, + "step": 1129 + }, + { + "epoch": 2.0, + "grad_norm": 0.6137431859970093, + "learning_rate": 0.0001263479538212717, + "loss": 0.1013, + "step": 1130 + } + ], + "logging_steps": 1, + "max_steps": 1695, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 565, + "total_flos": 1.0339891388035891e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}