diff --git "a/checkpoint-1695/trainer_state.json" "b/checkpoint-1695/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1695/trainer_state.json" @@ -0,0 +1,11982 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9986731534719153, + "eval_steps": 142, + "global_step": 1695, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.6921162605285645, + "learning_rate": 5e-05, + "loss": 3.3182, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.3362529277801514, + "eval_runtime": 14.4412, + "eval_samples_per_second": 33.031, + "eval_steps_per_second": 8.31, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.620742321014404, + "learning_rate": 0.0001, + "loss": 3.2788, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 4.650161266326904, + "learning_rate": 0.00015, + "loss": 3.2271, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 4.024933815002441, + "learning_rate": 0.0002, + "loss": 2.402, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 2.751981496810913, + "learning_rate": 0.00025, + "loss": 1.0544, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 1.4766970872879028, + "learning_rate": 0.0003, + "loss": 0.3549, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.8064658641815186, + "learning_rate": 0.00035, + "loss": 0.1533, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 2.232205390930176, + "learning_rate": 0.0004, + "loss": 0.31, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 1.1898847818374634, + "learning_rate": 0.00045000000000000004, + "loss": 0.1818, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.7394833564758301, + "learning_rate": 0.0005, + "loss": 0.1751, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.16317571699619293, + "learning_rate": 0.0004999995654799487, + "loss": 0.1411, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.10235322266817093, + "learning_rate": 0.0004999982619213052, + "loss": 0.1363, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.19907887279987335, + "learning_rate": 0.0004999960893286008, + "loss": 0.128, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.6823816299438477, + "learning_rate": 0.0004999930477093878, + "loss": 0.143, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.10187644511461258, + "learning_rate": 0.0004999891370742394, + "loss": 0.1322, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.09401004016399384, + "learning_rate": 0.0004999843574367498, + "loss": 0.1361, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.17946797609329224, + "learning_rate": 0.0004999787088135334, + "loss": 0.1412, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 0.890545666217804, + "learning_rate": 0.0004999721912242259, + "loss": 0.1593, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.434042751789093, + "learning_rate": 0.0004999648046914836, + "loss": 0.1548, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.28103551268577576, + "learning_rate": 0.0004999565492409831, + "loss": 0.1459, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.2690610885620117, + "learning_rate": 0.0004999474249014217, + "loss": 0.1248, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.37668731808662415, + "learning_rate": 0.0004999374317045172, + "loss": 0.1481, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.23762015998363495, + "learning_rate": 0.0004999265696850074, + "loss": 0.1407, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.1988176554441452, + "learning_rate": 0.0004999148388806504, + "loss": 0.1398, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.3805619776248932, + "learning_rate": 0.0004999022393322246, + "loss": 0.1474, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 0.5069771409034729, + "learning_rate": 0.0004998887710835278, + "loss": 0.1509, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.42066043615341187, + "learning_rate": 0.0004998744341813779, + "loss": 0.1341, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.0944904088973999, + "learning_rate": 0.0004998592286756123, + "loss": 0.1233, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 0.849244236946106, + "learning_rate": 0.0004998431546190875, + "loss": 0.1999, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.09785456210374832, + "learning_rate": 0.00049982621206768, + "loss": 0.1272, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.38225457072257996, + "learning_rate": 0.0004998084010802845, + "loss": 0.1634, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 0.08135183900594711, + "learning_rate": 0.0004997897217188149, + "loss": 0.1383, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.17299437522888184, + "learning_rate": 0.0004997701740482036, + "loss": 0.1427, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.11125747114419937, + "learning_rate": 0.0004997497581364015, + "loss": 0.1379, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.08914893865585327, + "learning_rate": 0.0004997284740543776, + "loss": 0.1388, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.034590039402246475, + "learning_rate": 0.0004997063218761188, + "loss": 0.1387, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.08675777167081833, + "learning_rate": 0.0004996833016786296, + "loss": 0.1384, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.4440009295940399, + "learning_rate": 0.0004996594135419318, + "loss": 0.152, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 0.0814109519124031, + "learning_rate": 0.0004996346575490646, + "loss": 0.1373, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 0.37724560499191284, + "learning_rate": 0.0004996090337860836, + "loss": 0.1362, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 0.21177273988723755, + "learning_rate": 0.0004995825423420613, + "loss": 0.1423, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 0.12168041616678238, + "learning_rate": 0.000499555183309086, + "loss": 0.1381, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.21096466481685638, + "learning_rate": 0.0004995269567822623, + "loss": 0.139, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 0.49395841360092163, + "learning_rate": 0.0004994978628597099, + "loss": 0.1016, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.1108216792345047, + "learning_rate": 0.0004994679016425642, + "loss": 0.1334, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.5518127679824829, + "learning_rate": 0.000499437073234975, + "loss": 0.1568, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 0.4762812852859497, + "learning_rate": 0.0004994053777441069, + "loss": 0.1543, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.3477722108364105, + "learning_rate": 0.0004993728152801384, + "loss": 0.1464, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 0.4996407628059387, + "learning_rate": 0.0004993393859562621, + "loss": 0.154, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.20425601303577423, + "learning_rate": 0.0004993050898886833, + "loss": 0.1372, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.034631408751010895, + "learning_rate": 0.000499269927196621, + "loss": 0.137, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.24027873575687408, + "learning_rate": 0.0004992338980023062, + "loss": 0.1468, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.22242723405361176, + "learning_rate": 0.000499197002430982, + "loss": 0.1418, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.6540514826774597, + "learning_rate": 0.0004991592406109036, + "loss": 0.1564, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.030118577182292938, + "learning_rate": 0.000499120612673337, + "loss": 0.1365, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.07544097304344177, + "learning_rate": 0.0004990811187525592, + "loss": 0.1334, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.37415480613708496, + "learning_rate": 0.0004990407589858572, + "loss": 0.155, + "step": 57 + }, + { + "epoch": 0.1, + "grad_norm": 0.557809054851532, + "learning_rate": 0.0004989995335135282, + "loss": 0.1603, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.14802873134613037, + "learning_rate": 0.0004989574424788787, + "loss": 0.1387, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.3581993281841278, + "learning_rate": 0.0004989144860282236, + "loss": 0.1475, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 0.04818522185087204, + "learning_rate": 0.0004988706643108864, + "loss": 0.1362, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.21908174455165863, + "learning_rate": 0.0004988259774791987, + "loss": 0.1386, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.1852695643901825, + "learning_rate": 0.0004987804256884988, + "loss": 0.1387, + "step": 63 + }, + { + "epoch": 0.11, + "grad_norm": 0.025747304782271385, + "learning_rate": 0.0004987340090971323, + "loss": 0.1393, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 0.045346710830926895, + "learning_rate": 0.0004986867278664504, + "loss": 0.1354, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.34946465492248535, + "learning_rate": 0.0004986385821608106, + "loss": 0.152, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.2552882432937622, + "learning_rate": 0.0004985895721475748, + "loss": 0.1463, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.0560542456805706, + "learning_rate": 0.0004985396979971099, + "loss": 0.1391, + "step": 68 + }, + { + "epoch": 0.12, + "grad_norm": 0.14347511529922485, + "learning_rate": 0.0004984889598827863, + "loss": 0.1353, + "step": 69 + }, + { + "epoch": 0.12, + "grad_norm": 0.12386342883110046, + "learning_rate": 0.0004984373579809778, + "loss": 0.1343, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 0.03070697747170925, + "learning_rate": 0.000498384892471061, + "loss": 0.1356, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.0531514473259449, + "learning_rate": 0.0004983315635354144, + "loss": 0.1346, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 0.24197503924369812, + "learning_rate": 0.0004982773713594178, + "loss": 0.1217, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.08417380601167679, + "learning_rate": 0.0004982223161314522, + "loss": 0.1223, + "step": 74 + }, + { + "epoch": 0.13, + "grad_norm": 0.40045711398124695, + "learning_rate": 0.000498166398042898, + "loss": 0.1513, + "step": 75 + }, + { + "epoch": 0.13, + "grad_norm": 0.12452740222215652, + "learning_rate": 0.0004981096172881358, + "loss": 0.1296, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 0.21590262651443481, + "learning_rate": 0.0004980519740645444, + "loss": 0.1375, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 0.07704459875822067, + "learning_rate": 0.0004979934685725011, + "loss": 0.1299, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.28334081172943115, + "learning_rate": 0.0004979341010153801, + "loss": 0.1387, + "step": 79 + }, + { + "epoch": 0.14, + "grad_norm": 0.12374007701873779, + "learning_rate": 0.0004978738715995526, + "loss": 0.1383, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 0.040613267570734024, + "learning_rate": 0.000497812780534386, + "loss": 0.1367, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.09974126517772675, + "learning_rate": 0.0004977508280322423, + "loss": 0.1248, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 0.2616259753704071, + "learning_rate": 0.0004976880143084786, + "loss": 0.1311, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.15635579824447632, + "learning_rate": 0.0004976243395814452, + "loss": 0.1189, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 0.259250670671463, + "learning_rate": 0.000497559804072486, + "loss": 0.1099, + "step": 85 + }, + { + "epoch": 0.15, + "grad_norm": 1.2778699398040771, + "learning_rate": 0.0004974944080059365, + "loss": 0.1416, + "step": 86 + }, + { + "epoch": 0.15, + "grad_norm": 0.2155281901359558, + "learning_rate": 0.000497428151609124, + "loss": 0.1253, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.17533721029758453, + "learning_rate": 0.0004973610351123664, + "loss": 0.1446, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 0.07620590180158615, + "learning_rate": 0.0004972930587489715, + "loss": 0.1309, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 0.2370779663324356, + "learning_rate": 0.0004972242227552358, + "loss": 0.149, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 0.06374065577983856, + "learning_rate": 0.0004971545273704444, + "loss": 0.1307, + "step": 91 + }, + { + "epoch": 0.16, + "grad_norm": 0.22728750109672546, + "learning_rate": 0.0004970839728368697, + "loss": 0.1438, + "step": 92 + }, + { + "epoch": 0.16, + "grad_norm": 0.16872233152389526, + "learning_rate": 0.0004970125593997706, + "loss": 0.1364, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 0.18773947656154633, + "learning_rate": 0.0004969402873073914, + "loss": 0.146, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.1468167006969452, + "learning_rate": 0.0004968671568109616, + "loss": 0.1401, + "step": 95 + }, + { + "epoch": 0.17, + "grad_norm": 0.09030504524707794, + "learning_rate": 0.0004967931681646948, + "loss": 0.1318, + "step": 96 + }, + { + "epoch": 0.17, + "grad_norm": 0.061796192079782486, + "learning_rate": 0.000496718321625787, + "loss": 0.1244, + "step": 97 + }, + { + "epoch": 0.17, + "grad_norm": 0.045495226979255676, + "learning_rate": 0.0004966426174544171, + "loss": 0.1265, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 0.08449025452136993, + "learning_rate": 0.0004965660559137448, + "loss": 0.1276, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 0.09982559829950333, + "learning_rate": 0.0004964886372699102, + "loss": 0.1253, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.05831208825111389, + "learning_rate": 0.0004964103617920332, + "loss": 0.1271, + "step": 101 + }, + { + "epoch": 0.18, + "grad_norm": 0.20548835396766663, + "learning_rate": 0.0004963312297522116, + "loss": 0.1415, + "step": 102 + }, + { + "epoch": 0.18, + "grad_norm": 0.09664470702409744, + "learning_rate": 0.0004962512414255214, + "loss": 0.1083, + "step": 103 + }, + { + "epoch": 0.18, + "grad_norm": 0.16931602358818054, + "learning_rate": 0.0004961703970900145, + "loss": 0.1431, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 0.10859667509794235, + "learning_rate": 0.000496088697026719, + "loss": 0.12, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 0.21958191692829132, + "learning_rate": 0.0004960061415196374, + "loss": 0.1492, + "step": 106 + }, + { + "epoch": 0.19, + "grad_norm": 0.06437578052282333, + "learning_rate": 0.0004959227308557459, + "loss": 0.1083, + "step": 107 + }, + { + "epoch": 0.19, + "grad_norm": 0.14975550770759583, + "learning_rate": 0.0004958384653249932, + "loss": 0.1155, + "step": 108 + }, + { + "epoch": 0.19, + "grad_norm": 0.11868852376937866, + "learning_rate": 0.0004957533452203, + "loss": 0.1237, + "step": 109 + }, + { + "epoch": 0.19, + "grad_norm": 0.2610260546207428, + "learning_rate": 0.0004956673708375574, + "loss": 0.1264, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.378467321395874, + "learning_rate": 0.000495580542475626, + "loss": 0.1222, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.0926096960902214, + "learning_rate": 0.0004954928604363352, + "loss": 0.1096, + "step": 112 + }, + { + "epoch": 0.2, + "grad_norm": 0.06858692318201065, + "learning_rate": 0.0004954043250244819, + "loss": 0.1144, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.3068992495536804, + "learning_rate": 0.0004953149365478293, + "loss": 0.1563, + "step": 114 + }, + { + "epoch": 0.2, + "grad_norm": 0.15458936989307404, + "learning_rate": 0.0004952246953171061, + "loss": 0.1216, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.10287577658891678, + "learning_rate": 0.0004951336016460053, + "loss": 0.0893, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.1215134710073471, + "learning_rate": 0.0004950416558511833, + "loss": 0.1016, + "step": 117 + }, + { + "epoch": 0.21, + "grad_norm": 0.1392650604248047, + "learning_rate": 0.000494948858252258, + "loss": 0.1111, + "step": 118 + }, + { + "epoch": 0.21, + "grad_norm": 0.4350431263446808, + "learning_rate": 0.0004948552091718092, + "loss": 0.1192, + "step": 119 + }, + { + "epoch": 0.21, + "grad_norm": 0.21448662877082825, + "learning_rate": 0.0004947607089353758, + "loss": 0.07, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 1.6086686849594116, + "learning_rate": 0.0004946653578714559, + "loss": 0.1352, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.25963085889816284, + "learning_rate": 0.0004945691563115051, + "loss": 0.1447, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.11575956642627716, + "learning_rate": 0.0004944721045899356, + "loss": 0.1055, + "step": 123 + }, + { + "epoch": 0.22, + "grad_norm": 0.11230157315731049, + "learning_rate": 0.0004943742030441145, + "loss": 0.0917, + "step": 124 + }, + { + "epoch": 0.22, + "grad_norm": 0.3376341760158539, + "learning_rate": 0.0004942754520143634, + "loss": 0.1364, + "step": 125 + }, + { + "epoch": 0.22, + "grad_norm": 0.2757412791252136, + "learning_rate": 0.0004941758518439566, + "loss": 0.1418, + "step": 126 + }, + { + "epoch": 0.22, + "grad_norm": 0.1438644975423813, + "learning_rate": 0.0004940754028791205, + "loss": 0.1162, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.14210884273052216, + "learning_rate": 0.0004939741054690316, + "loss": 0.1312, + "step": 128 + }, + { + "epoch": 0.23, + "grad_norm": 0.1861649751663208, + "learning_rate": 0.0004938719599658162, + "loss": 0.1447, + "step": 129 + }, + { + "epoch": 0.23, + "grad_norm": 0.19665485620498657, + "learning_rate": 0.0004937689667245481, + "loss": 0.1439, + "step": 130 + }, + { + "epoch": 0.23, + "grad_norm": 0.22447055578231812, + "learning_rate": 0.0004936651261032486, + "loss": 0.1568, + "step": 131 + }, + { + "epoch": 0.23, + "grad_norm": 0.10008269548416138, + "learning_rate": 0.0004935604384628843, + "loss": 0.1081, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.0549234002828598, + "learning_rate": 0.0004934549041673661, + "loss": 0.1216, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.11616212874650955, + "learning_rate": 0.0004933485235835483, + "loss": 0.1108, + "step": 134 + }, + { + "epoch": 0.24, + "grad_norm": 0.08554813265800476, + "learning_rate": 0.0004932412970812269, + "loss": 0.135, + "step": 135 + }, + { + "epoch": 0.24, + "grad_norm": 0.08642842620611191, + "learning_rate": 0.0004931332250331382, + "loss": 0.1205, + "step": 136 + }, + { + "epoch": 0.24, + "grad_norm": 0.20417262613773346, + "learning_rate": 0.0004930243078149582, + "loss": 0.1169, + "step": 137 + }, + { + "epoch": 0.24, + "grad_norm": 0.11088764667510986, + "learning_rate": 0.0004929145458053005, + "loss": 0.1014, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.3510516881942749, + "learning_rate": 0.0004928039393857155, + "loss": 0.0967, + "step": 139 + }, + { + "epoch": 0.25, + "grad_norm": 0.2401883453130722, + "learning_rate": 0.0004926924889406888, + "loss": 0.106, + "step": 140 + }, + { + "epoch": 0.25, + "grad_norm": 0.28403300046920776, + "learning_rate": 0.0004925801948576402, + "loss": 0.079, + "step": 141 + }, + { + "epoch": 0.25, + "grad_norm": 0.46027252078056335, + "learning_rate": 0.0004924670575269217, + "loss": 0.0899, + "step": 142 + }, + { + "epoch": 0.25, + "eval_loss": 0.09421269595623016, + "eval_runtime": 14.7696, + "eval_samples_per_second": 32.296, + "eval_steps_per_second": 8.125, + "step": 142 + }, + { + "epoch": 0.25, + "grad_norm": 0.29767730832099915, + "learning_rate": 0.0004923530773418169, + "loss": 0.1265, + "step": 143 + }, + { + "epoch": 0.25, + "grad_norm": 0.37391072511672974, + "learning_rate": 0.0004922382546985394, + "loss": 0.1244, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.8874172568321228, + "learning_rate": 0.0004921225899962308, + "loss": 0.1796, + "step": 145 + }, + { + "epoch": 0.26, + "grad_norm": 0.2554258704185486, + "learning_rate": 0.0004920060836369603, + "loss": 0.0528, + "step": 146 + }, + { + "epoch": 0.26, + "grad_norm": 0.1981816440820694, + "learning_rate": 0.0004918887360257228, + "loss": 0.1159, + "step": 147 + }, + { + "epoch": 0.26, + "grad_norm": 0.14500874280929565, + "learning_rate": 0.0004917705475704373, + "loss": 0.0992, + "step": 148 + }, + { + "epoch": 0.26, + "grad_norm": 0.1315220594406128, + "learning_rate": 0.000491651518681946, + "loss": 0.1248, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.0798826813697815, + "learning_rate": 0.0004915316497740121, + "loss": 0.1151, + "step": 150 + }, + { + "epoch": 0.27, + "grad_norm": 0.10213784873485565, + "learning_rate": 0.0004914109412633194, + "loss": 0.1098, + "step": 151 + }, + { + "epoch": 0.27, + "grad_norm": 0.23167072236537933, + "learning_rate": 0.00049128939356947, + "loss": 0.1236, + "step": 152 + }, + { + "epoch": 0.27, + "grad_norm": 0.173340305685997, + "learning_rate": 0.0004911670071149831, + "loss": 0.1098, + "step": 153 + }, + { + "epoch": 0.27, + "grad_norm": 0.1079009547829628, + "learning_rate": 0.0004910437823252937, + "loss": 0.1014, + "step": 154 + }, + { + "epoch": 0.27, + "grad_norm": 0.320765882730484, + "learning_rate": 0.0004909197196287509, + "loss": 0.1285, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.40041017532348633, + "learning_rate": 0.0004907948194566166, + "loss": 0.1421, + "step": 156 + }, + { + "epoch": 0.28, + "grad_norm": 0.4091287851333618, + "learning_rate": 0.0004906690822430638, + "loss": 0.1451, + "step": 157 + }, + { + "epoch": 0.28, + "grad_norm": 0.39893922209739685, + "learning_rate": 0.0004905425084251753, + "loss": 0.1289, + "step": 158 + }, + { + "epoch": 0.28, + "grad_norm": 0.14173893630504608, + "learning_rate": 0.0004904150984429419, + "loss": 0.0712, + "step": 159 + }, + { + "epoch": 0.28, + "grad_norm": 0.4723054766654968, + "learning_rate": 0.0004902868527392611, + "loss": 0.2141, + "step": 160 + }, + { + "epoch": 0.28, + "grad_norm": 0.13493523001670837, + "learning_rate": 0.0004901577717599355, + "loss": 0.0881, + "step": 161 + }, + { + "epoch": 0.29, + "grad_norm": 0.10770414024591446, + "learning_rate": 0.0004900278559536716, + "loss": 0.0746, + "step": 162 + }, + { + "epoch": 0.29, + "grad_norm": 0.5121994614601135, + "learning_rate": 0.0004898971057720773, + "loss": 0.1705, + "step": 163 + }, + { + "epoch": 0.29, + "grad_norm": 0.09419309347867966, + "learning_rate": 0.0004897655216696612, + "loss": 0.1085, + "step": 164 + }, + { + "epoch": 0.29, + "grad_norm": 0.3557867407798767, + "learning_rate": 0.0004896331041038309, + "loss": 0.1027, + "step": 165 + }, + { + "epoch": 0.29, + "grad_norm": 0.082126185297966, + "learning_rate": 0.000489499853534891, + "loss": 0.1113, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.8520584106445312, + "learning_rate": 0.0004893657704260419, + "loss": 0.1291, + "step": 167 + }, + { + "epoch": 0.3, + "grad_norm": 0.4607222080230713, + "learning_rate": 0.000489230855243378, + "loss": 0.1241, + "step": 168 + }, + { + "epoch": 0.3, + "grad_norm": 0.5181136727333069, + "learning_rate": 0.0004890951084558859, + "loss": 0.0957, + "step": 169 + }, + { + "epoch": 0.3, + "grad_norm": 0.42894089221954346, + "learning_rate": 0.0004889585305354435, + "loss": 0.0895, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.14509521424770355, + "learning_rate": 0.0004888211219568175, + "loss": 0.0732, + "step": 171 + }, + { + "epoch": 0.3, + "grad_norm": 0.24262909591197968, + "learning_rate": 0.0004886828831976621, + "loss": 0.0917, + "step": 172 + }, + { + "epoch": 0.31, + "grad_norm": 0.44387635588645935, + "learning_rate": 0.0004885438147385175, + "loss": 0.0636, + "step": 173 + }, + { + "epoch": 0.31, + "grad_norm": 0.1804012507200241, + "learning_rate": 0.0004884039170628077, + "loss": 0.0295, + "step": 174 + }, + { + "epoch": 0.31, + "grad_norm": 0.5566735863685608, + "learning_rate": 0.0004882631906568398, + "loss": 0.1104, + "step": 175 + }, + { + "epoch": 0.31, + "grad_norm": 0.9653083682060242, + "learning_rate": 0.0004881216360098012, + "loss": 0.2236, + "step": 176 + }, + { + "epoch": 0.31, + "grad_norm": 0.27046507596969604, + "learning_rate": 0.0004879792536137585, + "loss": 0.1082, + "step": 177 + }, + { + "epoch": 0.31, + "grad_norm": 0.47974228858947754, + "learning_rate": 0.00048783604396365586, + "loss": 0.0884, + "step": 178 + }, + { + "epoch": 0.32, + "grad_norm": 0.23638087511062622, + "learning_rate": 0.0004876920075573129, + "loss": 0.0968, + "step": 179 + }, + { + "epoch": 0.32, + "grad_norm": 0.12476328015327454, + "learning_rate": 0.0004875471448954234, + "loss": 0.1078, + "step": 180 + }, + { + "epoch": 0.32, + "grad_norm": 0.2455732375383377, + "learning_rate": 0.00048740145648155307, + "loss": 0.1124, + "step": 181 + }, + { + "epoch": 0.32, + "grad_norm": 0.2744804620742798, + "learning_rate": 0.0004872549428221384, + "loss": 0.0797, + "step": 182 + }, + { + "epoch": 0.32, + "grad_norm": 0.19536937773227692, + "learning_rate": 0.00048710760442648415, + "loss": 0.1091, + "step": 183 + }, + { + "epoch": 0.33, + "grad_norm": 0.5277348160743713, + "learning_rate": 0.0004869594418067623, + "loss": 0.1261, + "step": 184 + }, + { + "epoch": 0.33, + "grad_norm": 0.13960392773151398, + "learning_rate": 0.00048681045547801003, + "loss": 0.0879, + "step": 185 + }, + { + "epoch": 0.33, + "grad_norm": 0.2567049562931061, + "learning_rate": 0.00048666064595812746, + "loss": 0.083, + "step": 186 + }, + { + "epoch": 0.33, + "grad_norm": 0.3075740337371826, + "learning_rate": 0.00048651001376787676, + "loss": 0.1167, + "step": 187 + }, + { + "epoch": 0.33, + "grad_norm": 0.5257586240768433, + "learning_rate": 0.0004863585594308794, + "loss": 0.1019, + "step": 188 + }, + { + "epoch": 0.33, + "grad_norm": 0.41611766815185547, + "learning_rate": 0.00048620628347361496, + "loss": 0.1392, + "step": 189 + }, + { + "epoch": 0.34, + "grad_norm": 0.30399614572525024, + "learning_rate": 0.00048605318642541917, + "loss": 0.1339, + "step": 190 + }, + { + "epoch": 0.34, + "grad_norm": 0.41276878118515015, + "learning_rate": 0.00048589926881848194, + "loss": 0.1028, + "step": 191 + }, + { + "epoch": 0.34, + "grad_norm": 0.19717253744602203, + "learning_rate": 0.0004857445311878456, + "loss": 0.1032, + "step": 192 + }, + { + "epoch": 0.34, + "grad_norm": 0.3766873776912689, + "learning_rate": 0.0004855889740714028, + "loss": 0.1486, + "step": 193 + }, + { + "epoch": 0.34, + "grad_norm": 0.17443525791168213, + "learning_rate": 0.0004854325980098951, + "loss": 0.096, + "step": 194 + }, + { + "epoch": 0.34, + "grad_norm": 0.1278471201658249, + "learning_rate": 0.0004852754035469109, + "loss": 0.0746, + "step": 195 + }, + { + "epoch": 0.35, + "grad_norm": 0.14356929063796997, + "learning_rate": 0.0004851173912288833, + "loss": 0.0857, + "step": 196 + }, + { + "epoch": 0.35, + "grad_norm": 0.20514866709709167, + "learning_rate": 0.0004849585616050884, + "loss": 0.0833, + "step": 197 + }, + { + "epoch": 0.35, + "grad_norm": 0.4683605134487152, + "learning_rate": 0.0004847989152276435, + "loss": 0.1538, + "step": 198 + }, + { + "epoch": 0.35, + "grad_norm": 0.29194721579551697, + "learning_rate": 0.00048463845265150495, + "loss": 0.1035, + "step": 199 + }, + { + "epoch": 0.35, + "grad_norm": 0.22838515043258667, + "learning_rate": 0.0004844771744344666, + "loss": 0.0762, + "step": 200 + }, + { + "epoch": 0.36, + "grad_norm": 0.3635599911212921, + "learning_rate": 0.0004843150811371572, + "loss": 0.1165, + "step": 201 + }, + { + "epoch": 0.36, + "grad_norm": 0.2508073151111603, + "learning_rate": 0.0004841521733230391, + "loss": 0.0736, + "step": 202 + }, + { + "epoch": 0.36, + "grad_norm": 0.24161550402641296, + "learning_rate": 0.000483988451558406, + "loss": 0.1309, + "step": 203 + }, + { + "epoch": 0.36, + "grad_norm": 0.4697308838367462, + "learning_rate": 0.0004838239164123811, + "loss": 0.1731, + "step": 204 + }, + { + "epoch": 0.36, + "grad_norm": 0.17773008346557617, + "learning_rate": 0.0004836585684569148, + "loss": 0.1158, + "step": 205 + }, + { + "epoch": 0.36, + "grad_norm": 0.21285519003868103, + "learning_rate": 0.0004834924082667833, + "loss": 0.0949, + "step": 206 + }, + { + "epoch": 0.37, + "grad_norm": 0.2403111308813095, + "learning_rate": 0.0004833254364195859, + "loss": 0.0801, + "step": 207 + }, + { + "epoch": 0.37, + "grad_norm": 0.2033465951681137, + "learning_rate": 0.0004831576534957437, + "loss": 0.069, + "step": 208 + }, + { + "epoch": 0.37, + "grad_norm": 0.5510303378105164, + "learning_rate": 0.000482989060078497, + "loss": 0.1766, + "step": 209 + }, + { + "epoch": 0.37, + "grad_norm": 0.32342344522476196, + "learning_rate": 0.0004828196567539034, + "loss": 0.1229, + "step": 210 + }, + { + "epoch": 0.37, + "grad_norm": 0.3102104663848877, + "learning_rate": 0.00048264944411083625, + "loss": 0.1297, + "step": 211 + }, + { + "epoch": 0.38, + "grad_norm": 0.32639122009277344, + "learning_rate": 0.00048247842274098187, + "loss": 0.1011, + "step": 212 + }, + { + "epoch": 0.38, + "grad_norm": 0.4720034897327423, + "learning_rate": 0.00048230659323883804, + "loss": 0.1282, + "step": 213 + }, + { + "epoch": 0.38, + "grad_norm": 0.5249712467193604, + "learning_rate": 0.00048213395620171166, + "loss": 0.1376, + "step": 214 + }, + { + "epoch": 0.38, + "grad_norm": 0.3953443467617035, + "learning_rate": 0.00048196051222971673, + "loss": 0.1186, + "step": 215 + }, + { + "epoch": 0.38, + "grad_norm": 0.15697738528251648, + "learning_rate": 0.0004817862619257723, + "loss": 0.1079, + "step": 216 + }, + { + "epoch": 0.38, + "grad_norm": 0.32511651515960693, + "learning_rate": 0.0004816112058956005, + "loss": 0.1052, + "step": 217 + }, + { + "epoch": 0.39, + "grad_norm": 0.1850031018257141, + "learning_rate": 0.00048143534474772397, + "loss": 0.1236, + "step": 218 + }, + { + "epoch": 0.39, + "grad_norm": 0.10901057720184326, + "learning_rate": 0.0004812586790934645, + "loss": 0.1094, + "step": 219 + }, + { + "epoch": 0.39, + "grad_norm": 0.23395784199237823, + "learning_rate": 0.00048108120954694014, + "loss": 0.0556, + "step": 220 + }, + { + "epoch": 0.39, + "grad_norm": 0.21469372510910034, + "learning_rate": 0.00048090293672506347, + "loss": 0.0594, + "step": 221 + }, + { + "epoch": 0.39, + "grad_norm": 0.17289988696575165, + "learning_rate": 0.00048072386124753944, + "loss": 0.0219, + "step": 222 + }, + { + "epoch": 0.39, + "grad_norm": 0.21490757167339325, + "learning_rate": 0.0004805439837368631, + "loss": 0.0203, + "step": 223 + }, + { + "epoch": 0.4, + "grad_norm": 1.1259506940841675, + "learning_rate": 0.0004803633048183176, + "loss": 0.1576, + "step": 224 + }, + { + "epoch": 0.4, + "grad_norm": 1.2934038639068604, + "learning_rate": 0.00048018182511997185, + "loss": 0.1233, + "step": 225 + }, + { + "epoch": 0.4, + "grad_norm": 0.4250846207141876, + "learning_rate": 0.0004799995452726783, + "loss": 0.1023, + "step": 226 + }, + { + "epoch": 0.4, + "grad_norm": 1.4675579071044922, + "learning_rate": 0.000479816465910071, + "loss": 0.1242, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.7030429840087891, + "learning_rate": 0.0004796325876685632, + "loss": 0.0514, + "step": 228 + }, + { + "epoch": 0.41, + "grad_norm": 0.5683910846710205, + "learning_rate": 0.00047944791118734517, + "loss": 0.0923, + "step": 229 + }, + { + "epoch": 0.41, + "grad_norm": 0.8425244092941284, + "learning_rate": 0.0004792624371083819, + "loss": 0.0976, + "step": 230 + }, + { + "epoch": 0.41, + "grad_norm": 0.21189981698989868, + "learning_rate": 0.00047907616607641113, + "loss": 0.1016, + "step": 231 + }, + { + "epoch": 0.41, + "grad_norm": 0.36100390553474426, + "learning_rate": 0.0004788890987389408, + "loss": 0.1015, + "step": 232 + }, + { + "epoch": 0.41, + "grad_norm": 0.42600420117378235, + "learning_rate": 0.000478701235746247, + "loss": 0.1401, + "step": 233 + }, + { + "epoch": 0.41, + "grad_norm": 0.649318516254425, + "learning_rate": 0.0004785125777513716, + "loss": 0.1012, + "step": 234 + }, + { + "epoch": 0.42, + "grad_norm": 0.3490477204322815, + "learning_rate": 0.00047832312541012007, + "loss": 0.1015, + "step": 235 + }, + { + "epoch": 0.42, + "grad_norm": 0.6937799453735352, + "learning_rate": 0.0004781328793810592, + "loss": 0.1188, + "step": 236 + }, + { + "epoch": 0.42, + "grad_norm": 1.0924077033996582, + "learning_rate": 0.0004779418403255146, + "loss": 0.1093, + "step": 237 + }, + { + "epoch": 0.42, + "grad_norm": 0.36075183749198914, + "learning_rate": 0.0004777500089075687, + "loss": 0.0971, + "step": 238 + }, + { + "epoch": 0.42, + "grad_norm": 0.41673243045806885, + "learning_rate": 0.00047755738579405836, + "loss": 0.0953, + "step": 239 + }, + { + "epoch": 0.42, + "grad_norm": 0.13159583508968353, + "learning_rate": 0.0004773639716545723, + "loss": 0.0571, + "step": 240 + }, + { + "epoch": 0.43, + "grad_norm": 0.9338862895965576, + "learning_rate": 0.00047716976716144917, + "loss": 0.202, + "step": 241 + }, + { + "epoch": 0.43, + "grad_norm": 0.3190581798553467, + "learning_rate": 0.0004769747729897749, + "loss": 0.1071, + "step": 242 + }, + { + "epoch": 0.43, + "grad_norm": 0.23796042799949646, + "learning_rate": 0.0004767789898173806, + "loss": 0.0659, + "step": 243 + }, + { + "epoch": 0.43, + "grad_norm": 0.19194231927394867, + "learning_rate": 0.0004765824183248399, + "loss": 0.0611, + "step": 244 + }, + { + "epoch": 0.43, + "grad_norm": 0.16703608632087708, + "learning_rate": 0.0004763850591954668, + "loss": 0.0855, + "step": 245 + }, + { + "epoch": 0.44, + "grad_norm": 0.3395439684391022, + "learning_rate": 0.0004761869131153135, + "loss": 0.0926, + "step": 246 + }, + { + "epoch": 0.44, + "grad_norm": 0.2820179760456085, + "learning_rate": 0.0004759879807731673, + "loss": 0.0508, + "step": 247 + }, + { + "epoch": 0.44, + "grad_norm": 0.20656561851501465, + "learning_rate": 0.00047578826286054897, + "loss": 0.068, + "step": 248 + }, + { + "epoch": 0.44, + "grad_norm": 0.4477837383747101, + "learning_rate": 0.00047558776007171024, + "loss": 0.0918, + "step": 249 + }, + { + "epoch": 0.44, + "grad_norm": 0.18997950851917267, + "learning_rate": 0.0004753864731036307, + "loss": 0.0734, + "step": 250 + }, + { + "epoch": 0.44, + "grad_norm": 0.2841518521308899, + "learning_rate": 0.0004751844026560163, + "loss": 0.1194, + "step": 251 + }, + { + "epoch": 0.45, + "grad_norm": 0.29770052433013916, + "learning_rate": 0.0004749815494312963, + "loss": 0.0996, + "step": 252 + }, + { + "epoch": 0.45, + "grad_norm": 0.2982254922389984, + "learning_rate": 0.00047477791413462104, + "loss": 0.0945, + "step": 253 + }, + { + "epoch": 0.45, + "grad_norm": 0.4625980854034424, + "learning_rate": 0.00047457349747385936, + "loss": 0.131, + "step": 254 + }, + { + "epoch": 0.45, + "grad_norm": 0.29756709933280945, + "learning_rate": 0.00047436830015959653, + "loss": 0.1057, + "step": 255 + }, + { + "epoch": 0.45, + "grad_norm": 0.19971434772014618, + "learning_rate": 0.00047416232290513127, + "loss": 0.0794, + "step": 256 + }, + { + "epoch": 0.45, + "grad_norm": 0.12171836197376251, + "learning_rate": 0.0004739555664264736, + "loss": 0.0527, + "step": 257 + }, + { + "epoch": 0.46, + "grad_norm": 0.23848529160022736, + "learning_rate": 0.00047374803144234213, + "loss": 0.134, + "step": 258 + }, + { + "epoch": 0.46, + "grad_norm": 0.12673752009868622, + "learning_rate": 0.0004735397186741618, + "loss": 0.0774, + "step": 259 + }, + { + "epoch": 0.46, + "grad_norm": 0.11961629241704941, + "learning_rate": 0.00047333062884606116, + "loss": 0.0661, + "step": 260 + }, + { + "epoch": 0.46, + "grad_norm": 0.18004140257835388, + "learning_rate": 0.00047312076268487, + "loss": 0.1132, + "step": 261 + }, + { + "epoch": 0.46, + "grad_norm": 0.1698005348443985, + "learning_rate": 0.00047291012092011685, + "loss": 0.057, + "step": 262 + }, + { + "epoch": 0.47, + "grad_norm": 0.1949334442615509, + "learning_rate": 0.0004726987042840263, + "loss": 0.0703, + "step": 263 + }, + { + "epoch": 0.47, + "grad_norm": 0.4016534686088562, + "learning_rate": 0.0004724865135115163, + "loss": 0.1178, + "step": 264 + }, + { + "epoch": 0.47, + "grad_norm": 0.36885496973991394, + "learning_rate": 0.00047227354934019605, + "loss": 0.1303, + "step": 265 + }, + { + "epoch": 0.47, + "grad_norm": 0.3214585483074188, + "learning_rate": 0.00047205981251036334, + "loss": 0.1019, + "step": 266 + }, + { + "epoch": 0.47, + "grad_norm": 0.15313082933425903, + "learning_rate": 0.0004718453037650016, + "loss": 0.0581, + "step": 267 + }, + { + "epoch": 0.47, + "grad_norm": 0.3251878321170807, + "learning_rate": 0.0004716300238497775, + "loss": 0.099, + "step": 268 + }, + { + "epoch": 0.48, + "grad_norm": 0.20356950163841248, + "learning_rate": 0.0004714139735130388, + "loss": 0.0767, + "step": 269 + }, + { + "epoch": 0.48, + "grad_norm": 0.2644464373588562, + "learning_rate": 0.00047119715350581095, + "loss": 0.1003, + "step": 270 + }, + { + "epoch": 0.48, + "grad_norm": 0.22035302221775055, + "learning_rate": 0.000470979564581795, + "loss": 0.0722, + "step": 271 + }, + { + "epoch": 0.48, + "grad_norm": 0.5284466743469238, + "learning_rate": 0.0004707612074973653, + "loss": 0.1282, + "step": 272 + }, + { + "epoch": 0.48, + "grad_norm": 0.34032565355300903, + "learning_rate": 0.0004705420830115658, + "loss": 0.099, + "step": 273 + }, + { + "epoch": 0.48, + "grad_norm": 0.26527565717697144, + "learning_rate": 0.00047032219188610836, + "loss": 0.0911, + "step": 274 + }, + { + "epoch": 0.49, + "grad_norm": 0.2254990190267563, + "learning_rate": 0.0004701015348853699, + "loss": 0.0667, + "step": 275 + }, + { + "epoch": 0.49, + "grad_norm": 0.21334387362003326, + "learning_rate": 0.0004698801127763895, + "loss": 0.0659, + "step": 276 + }, + { + "epoch": 0.49, + "grad_norm": 0.2917044758796692, + "learning_rate": 0.0004696579263288661, + "loss": 0.1159, + "step": 277 + }, + { + "epoch": 0.49, + "grad_norm": 0.14027804136276245, + "learning_rate": 0.00046943497631515526, + "loss": 0.0323, + "step": 278 + }, + { + "epoch": 0.49, + "grad_norm": 0.3988366425037384, + "learning_rate": 0.00046921126351026697, + "loss": 0.0887, + "step": 279 + }, + { + "epoch": 0.5, + "grad_norm": 0.36629319190979004, + "learning_rate": 0.00046898678869186297, + "loss": 0.1079, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 0.35548141598701477, + "learning_rate": 0.0004687615526402536, + "loss": 0.1056, + "step": 281 + }, + { + "epoch": 0.5, + "grad_norm": 0.21030637621879578, + "learning_rate": 0.0004685355561383956, + "loss": 0.0717, + "step": 282 + }, + { + "epoch": 0.5, + "grad_norm": 0.24192889034748077, + "learning_rate": 0.000468308799971889, + "loss": 0.1047, + "step": 283 + }, + { + "epoch": 0.5, + "grad_norm": 0.16289295256137848, + "learning_rate": 0.00046808128492897464, + "loss": 0.0519, + "step": 284 + }, + { + "epoch": 0.5, + "eval_loss": 0.08938124030828476, + "eval_runtime": 14.7518, + "eval_samples_per_second": 32.335, + "eval_steps_per_second": 8.135, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.23021583259105682, + "learning_rate": 0.00046785301180053126, + "loss": 0.1161, + "step": 285 + }, + { + "epoch": 0.51, + "grad_norm": 0.3577558398246765, + "learning_rate": 0.0004676239813800729, + "loss": 0.1239, + "step": 286 + }, + { + "epoch": 0.51, + "grad_norm": 0.15293735265731812, + "learning_rate": 0.0004673941944637461, + "loss": 0.0401, + "step": 287 + }, + { + "epoch": 0.51, + "grad_norm": 0.342631459236145, + "learning_rate": 0.00046716365185032696, + "loss": 0.1358, + "step": 288 + }, + { + "epoch": 0.51, + "grad_norm": 0.4987104833126068, + "learning_rate": 0.0004669323543412186, + "loss": 0.1312, + "step": 289 + }, + { + "epoch": 0.51, + "grad_norm": 0.21678434312343597, + "learning_rate": 0.0004667003027404483, + "loss": 0.0791, + "step": 290 + }, + { + "epoch": 0.51, + "grad_norm": 0.2781723141670227, + "learning_rate": 0.00046646749785466464, + "loss": 0.0809, + "step": 291 + }, + { + "epoch": 0.52, + "grad_norm": 0.3997693359851837, + "learning_rate": 0.00046623394049313474, + "loss": 0.0938, + "step": 292 + }, + { + "epoch": 0.52, + "grad_norm": 0.2478984147310257, + "learning_rate": 0.00046599963146774136, + "loss": 0.0671, + "step": 293 + }, + { + "epoch": 0.52, + "grad_norm": 0.35655421018600464, + "learning_rate": 0.0004657645715929805, + "loss": 0.107, + "step": 294 + }, + { + "epoch": 0.52, + "grad_norm": 0.41986069083213806, + "learning_rate": 0.0004655287616859577, + "loss": 0.1381, + "step": 295 + }, + { + "epoch": 0.52, + "grad_norm": 0.2831580340862274, + "learning_rate": 0.00046529220256638626, + "loss": 0.1012, + "step": 296 + }, + { + "epoch": 0.53, + "grad_norm": 0.2183172106742859, + "learning_rate": 0.0004650548950565835, + "loss": 0.0883, + "step": 297 + }, + { + "epoch": 0.53, + "grad_norm": 0.1485687792301178, + "learning_rate": 0.0004648168399814684, + "loss": 0.094, + "step": 298 + }, + { + "epoch": 0.53, + "grad_norm": 0.3192533552646637, + "learning_rate": 0.0004645780381685586, + "loss": 0.1144, + "step": 299 + }, + { + "epoch": 0.53, + "grad_norm": 0.20768460631370544, + "learning_rate": 0.0004643384904479675, + "loss": 0.1119, + "step": 300 + }, + { + "epoch": 0.53, + "grad_norm": 0.16704390943050385, + "learning_rate": 0.00046409819765240147, + "loss": 0.0852, + "step": 301 + }, + { + "epoch": 0.53, + "grad_norm": 0.33123648166656494, + "learning_rate": 0.0004638571606171567, + "loss": 0.1608, + "step": 302 + }, + { + "epoch": 0.54, + "grad_norm": 0.408978134393692, + "learning_rate": 0.0004636153801801167, + "loss": 0.0906, + "step": 303 + }, + { + "epoch": 0.54, + "grad_norm": 0.29201096296310425, + "learning_rate": 0.00046337285718174896, + "loss": 0.1237, + "step": 304 + }, + { + "epoch": 0.54, + "grad_norm": 0.45836058259010315, + "learning_rate": 0.00046312959246510237, + "loss": 0.0926, + "step": 305 + }, + { + "epoch": 0.54, + "grad_norm": 0.5405777096748352, + "learning_rate": 0.0004628855868758041, + "loss": 0.0727, + "step": 306 + }, + { + "epoch": 0.54, + "grad_norm": 0.3068138062953949, + "learning_rate": 0.00046264084126205676, + "loss": 0.1006, + "step": 307 + }, + { + "epoch": 0.54, + "grad_norm": 0.2990975081920624, + "learning_rate": 0.00046239535647463534, + "loss": 0.1033, + "step": 308 + }, + { + "epoch": 0.55, + "grad_norm": 0.2938540279865265, + "learning_rate": 0.00046214913336688424, + "loss": 0.1084, + "step": 309 + }, + { + "epoch": 0.55, + "grad_norm": 0.49840983748435974, + "learning_rate": 0.00046190217279471466, + "loss": 0.1066, + "step": 310 + }, + { + "epoch": 0.55, + "grad_norm": 0.4558626711368561, + "learning_rate": 0.000461654475616601, + "loss": 0.121, + "step": 311 + }, + { + "epoch": 0.55, + "grad_norm": 0.20964759588241577, + "learning_rate": 0.0004614060426935786, + "loss": 0.0843, + "step": 312 + }, + { + "epoch": 0.55, + "grad_norm": 0.2254151701927185, + "learning_rate": 0.00046115687488923983, + "loss": 0.0781, + "step": 313 + }, + { + "epoch": 0.56, + "grad_norm": 0.6117066740989685, + "learning_rate": 0.0004609069730697322, + "loss": 0.1208, + "step": 314 + }, + { + "epoch": 0.56, + "grad_norm": 0.5352897644042969, + "learning_rate": 0.0004606563381037544, + "loss": 0.1056, + "step": 315 + }, + { + "epoch": 0.56, + "grad_norm": 0.31001269817352295, + "learning_rate": 0.00046040497086255385, + "loss": 0.1213, + "step": 316 + }, + { + "epoch": 0.56, + "grad_norm": 0.22637054324150085, + "learning_rate": 0.0004601528722199234, + "loss": 0.105, + "step": 317 + }, + { + "epoch": 0.56, + "grad_norm": 0.20077432692050934, + "learning_rate": 0.0004599000430521984, + "loss": 0.0837, + "step": 318 + }, + { + "epoch": 0.56, + "grad_norm": 0.24702684581279755, + "learning_rate": 0.0004596464842382534, + "loss": 0.0695, + "step": 319 + }, + { + "epoch": 0.57, + "grad_norm": 0.253387987613678, + "learning_rate": 0.0004593921966594997, + "loss": 0.1184, + "step": 320 + }, + { + "epoch": 0.57, + "grad_norm": 0.2703799605369568, + "learning_rate": 0.0004591371811998817, + "loss": 0.117, + "step": 321 + }, + { + "epoch": 0.57, + "grad_norm": 0.23513701558113098, + "learning_rate": 0.00045888143874587396, + "loss": 0.1359, + "step": 322 + }, + { + "epoch": 0.57, + "grad_norm": 0.25604313611984253, + "learning_rate": 0.00045862497018647833, + "loss": 0.1018, + "step": 323 + }, + { + "epoch": 0.57, + "grad_norm": 0.16947636008262634, + "learning_rate": 0.0004583677764132207, + "loss": 0.0958, + "step": 324 + }, + { + "epoch": 0.57, + "grad_norm": 0.20054908096790314, + "learning_rate": 0.0004581098583201478, + "loss": 0.0803, + "step": 325 + }, + { + "epoch": 0.58, + "grad_norm": 0.12656472623348236, + "learning_rate": 0.00045785121680382436, + "loss": 0.0679, + "step": 326 + }, + { + "epoch": 0.58, + "grad_norm": 0.1423499882221222, + "learning_rate": 0.0004575918527633297, + "loss": 0.0959, + "step": 327 + }, + { + "epoch": 0.58, + "grad_norm": 0.36370569467544556, + "learning_rate": 0.0004573317671002549, + "loss": 0.1088, + "step": 328 + }, + { + "epoch": 0.58, + "grad_norm": 0.18775340914726257, + "learning_rate": 0.0004570709607186995, + "loss": 0.0905, + "step": 329 + }, + { + "epoch": 0.58, + "grad_norm": 0.14833571016788483, + "learning_rate": 0.0004568094345252681, + "loss": 0.0661, + "step": 330 + }, + { + "epoch": 0.59, + "grad_norm": 0.2987270653247833, + "learning_rate": 0.00045654718942906794, + "loss": 0.0872, + "step": 331 + }, + { + "epoch": 0.59, + "grad_norm": 0.21985827386379242, + "learning_rate": 0.000456284226341705, + "loss": 0.0882, + "step": 332 + }, + { + "epoch": 0.59, + "grad_norm": 0.2726268470287323, + "learning_rate": 0.00045602054617728093, + "loss": 0.0864, + "step": 333 + }, + { + "epoch": 0.59, + "grad_norm": 0.2882244884967804, + "learning_rate": 0.00045575614985239057, + "loss": 0.1032, + "step": 334 + }, + { + "epoch": 0.59, + "grad_norm": 0.427500456571579, + "learning_rate": 0.0004554910382861178, + "loss": 0.1309, + "step": 335 + }, + { + "epoch": 0.59, + "grad_norm": 0.43029338121414185, + "learning_rate": 0.000455225212400033, + "loss": 0.1071, + "step": 336 + }, + { + "epoch": 0.6, + "grad_norm": 0.2297673523426056, + "learning_rate": 0.0004549586731181896, + "loss": 0.0526, + "step": 337 + }, + { + "epoch": 0.6, + "grad_norm": 0.4533613920211792, + "learning_rate": 0.0004546914213671209, + "loss": 0.1154, + "step": 338 + }, + { + "epoch": 0.6, + "grad_norm": 0.3973630666732788, + "learning_rate": 0.0004544234580758367, + "loss": 0.0707, + "step": 339 + }, + { + "epoch": 0.6, + "grad_norm": 0.40036290884017944, + "learning_rate": 0.0004541547841758207, + "loss": 0.0932, + "step": 340 + }, + { + "epoch": 0.6, + "grad_norm": 0.4273395240306854, + "learning_rate": 0.0004538854006010262, + "loss": 0.1112, + "step": 341 + }, + { + "epoch": 0.61, + "grad_norm": 0.28109779953956604, + "learning_rate": 0.0004536153082878738, + "loss": 0.1003, + "step": 342 + }, + { + "epoch": 0.61, + "grad_norm": 0.21950216591358185, + "learning_rate": 0.00045334450817524776, + "loss": 0.0538, + "step": 343 + }, + { + "epoch": 0.61, + "grad_norm": 0.2968471646308899, + "learning_rate": 0.00045307300120449263, + "loss": 0.0775, + "step": 344 + }, + { + "epoch": 0.61, + "grad_norm": 0.1488364040851593, + "learning_rate": 0.00045280078831941024, + "loss": 0.0513, + "step": 345 + }, + { + "epoch": 0.61, + "grad_norm": 0.22750218212604523, + "learning_rate": 0.00045252787046625624, + "loss": 0.0943, + "step": 346 + }, + { + "epoch": 0.61, + "grad_norm": 0.3048767149448395, + "learning_rate": 0.0004522542485937369, + "loss": 0.079, + "step": 347 + }, + { + "epoch": 0.62, + "grad_norm": 0.33520030975341797, + "learning_rate": 0.0004519799236530057, + "loss": 0.1584, + "step": 348 + }, + { + "epoch": 0.62, + "grad_norm": 0.20777581632137299, + "learning_rate": 0.00045170489659766003, + "loss": 0.0903, + "step": 349 + }, + { + "epoch": 0.62, + "grad_norm": 0.1602245271205902, + "learning_rate": 0.00045142916838373826, + "loss": 0.0446, + "step": 350 + }, + { + "epoch": 0.62, + "grad_norm": 0.2512218952178955, + "learning_rate": 0.0004511527399697158, + "loss": 0.069, + "step": 351 + }, + { + "epoch": 0.62, + "grad_norm": 0.17349962890148163, + "learning_rate": 0.0004508756123165021, + "loss": 0.0765, + "step": 352 + }, + { + "epoch": 0.62, + "grad_norm": 0.26563215255737305, + "learning_rate": 0.00045059778638743744, + "loss": 0.0966, + "step": 353 + }, + { + "epoch": 0.63, + "grad_norm": 0.23987066745758057, + "learning_rate": 0.00045031926314828926, + "loss": 0.0702, + "step": 354 + }, + { + "epoch": 0.63, + "grad_norm": 0.21901372075080872, + "learning_rate": 0.000450040043567249, + "loss": 0.0457, + "step": 355 + }, + { + "epoch": 0.63, + "grad_norm": 0.24179872870445251, + "learning_rate": 0.00044976012861492877, + "loss": 0.0651, + "step": 356 + }, + { + "epoch": 0.63, + "grad_norm": 0.3544818162918091, + "learning_rate": 0.0004494795192643578, + "loss": 0.0622, + "step": 357 + }, + { + "epoch": 0.63, + "grad_norm": 0.4363332986831665, + "learning_rate": 0.00044919821649097916, + "loss": 0.0972, + "step": 358 + }, + { + "epoch": 0.64, + "grad_norm": 0.43788430094718933, + "learning_rate": 0.0004489162212726465, + "loss": 0.0843, + "step": 359 + }, + { + "epoch": 0.64, + "grad_norm": 0.5084832906723022, + "learning_rate": 0.00044863353458962044, + "loss": 0.0888, + "step": 360 + }, + { + "epoch": 0.64, + "grad_norm": 0.44660842418670654, + "learning_rate": 0.0004483501574245652, + "loss": 0.113, + "step": 361 + }, + { + "epoch": 0.64, + "grad_norm": 0.7528813481330872, + "learning_rate": 0.0004480660907625452, + "loss": 0.0512, + "step": 362 + }, + { + "epoch": 0.64, + "grad_norm": 0.9723535776138306, + "learning_rate": 0.0004477813355910219, + "loss": 0.1154, + "step": 363 + }, + { + "epoch": 0.64, + "grad_norm": 0.2641480565071106, + "learning_rate": 0.0004474958928998498, + "loss": 0.0575, + "step": 364 + }, + { + "epoch": 0.65, + "grad_norm": 0.12234170734882355, + "learning_rate": 0.00044720976368127355, + "loss": 0.0441, + "step": 365 + }, + { + "epoch": 0.65, + "grad_norm": 0.26976636052131653, + "learning_rate": 0.00044692294892992416, + "loss": 0.0676, + "step": 366 + }, + { + "epoch": 0.65, + "grad_norm": 0.22729526460170746, + "learning_rate": 0.00044663544964281573, + "loss": 0.098, + "step": 367 + }, + { + "epoch": 0.65, + "grad_norm": 0.2270442545413971, + "learning_rate": 0.0004463472668193419, + "loss": 0.0842, + "step": 368 + }, + { + "epoch": 0.65, + "grad_norm": 0.19249562919139862, + "learning_rate": 0.0004460584014612724, + "loss": 0.0537, + "step": 369 + }, + { + "epoch": 0.65, + "grad_norm": 0.22312623262405396, + "learning_rate": 0.0004457688545727496, + "loss": 0.0547, + "step": 370 + }, + { + "epoch": 0.66, + "grad_norm": 0.281658411026001, + "learning_rate": 0.0004454786271602849, + "loss": 0.089, + "step": 371 + }, + { + "epoch": 0.66, + "grad_norm": 0.49952250719070435, + "learning_rate": 0.00044518772023275526, + "loss": 0.1298, + "step": 372 + }, + { + "epoch": 0.66, + "grad_norm": 0.186232328414917, + "learning_rate": 0.0004448961348013999, + "loss": 0.0628, + "step": 373 + }, + { + "epoch": 0.66, + "grad_norm": 0.2980823814868927, + "learning_rate": 0.0004446038718798166, + "loss": 0.0828, + "step": 374 + }, + { + "epoch": 0.66, + "grad_norm": 0.3794187605381012, + "learning_rate": 0.00044431093248395806, + "loss": 0.0776, + "step": 375 + }, + { + "epoch": 0.67, + "grad_norm": 0.29262277483940125, + "learning_rate": 0.0004440173176321287, + "loss": 0.0924, + "step": 376 + }, + { + "epoch": 0.67, + "grad_norm": 0.30543988943099976, + "learning_rate": 0.0004437230283449808, + "loss": 0.1264, + "step": 377 + }, + { + "epoch": 0.67, + "grad_norm": 0.3436485826969147, + "learning_rate": 0.0004434280656455111, + "loss": 0.1066, + "step": 378 + }, + { + "epoch": 0.67, + "grad_norm": 0.23679965734481812, + "learning_rate": 0.0004431324305590572, + "loss": 0.075, + "step": 379 + }, + { + "epoch": 0.67, + "grad_norm": 0.4399561882019043, + "learning_rate": 0.0004428361241132943, + "loss": 0.1445, + "step": 380 + }, + { + "epoch": 0.67, + "grad_norm": 0.39203163981437683, + "learning_rate": 0.0004425391473382309, + "loss": 0.0995, + "step": 381 + }, + { + "epoch": 0.68, + "grad_norm": 0.4687665104866028, + "learning_rate": 0.0004422415012662061, + "loss": 0.1489, + "step": 382 + }, + { + "epoch": 0.68, + "grad_norm": 0.2634904086589813, + "learning_rate": 0.00044194318693188526, + "loss": 0.1164, + "step": 383 + }, + { + "epoch": 0.68, + "grad_norm": 0.23031170666217804, + "learning_rate": 0.0004416442053722569, + "loss": 0.0742, + "step": 384 + }, + { + "epoch": 0.68, + "grad_norm": 0.30467960238456726, + "learning_rate": 0.00044134455762662894, + "loss": 0.0984, + "step": 385 + }, + { + "epoch": 0.68, + "grad_norm": 0.16692829132080078, + "learning_rate": 0.0004410442447366249, + "loss": 0.0732, + "step": 386 + }, + { + "epoch": 0.68, + "grad_norm": 0.20100833475589752, + "learning_rate": 0.00044074326774618065, + "loss": 0.1082, + "step": 387 + }, + { + "epoch": 0.69, + "grad_norm": 0.29799607396125793, + "learning_rate": 0.0004404416277015404, + "loss": 0.0761, + "step": 388 + }, + { + "epoch": 0.69, + "grad_norm": 0.647639274597168, + "learning_rate": 0.0004401393256512534, + "loss": 0.1218, + "step": 389 + }, + { + "epoch": 0.69, + "grad_norm": 0.2610540986061096, + "learning_rate": 0.00043983636264617013, + "loss": 0.0923, + "step": 390 + }, + { + "epoch": 0.69, + "grad_norm": 0.4049086570739746, + "learning_rate": 0.0004395327397394384, + "loss": 0.1091, + "step": 391 + }, + { + "epoch": 0.69, + "grad_norm": 0.36092105507850647, + "learning_rate": 0.00043922845798650034, + "loss": 0.0927, + "step": 392 + }, + { + "epoch": 0.7, + "grad_norm": 0.542421281337738, + "learning_rate": 0.00043892351844508805, + "loss": 0.1014, + "step": 393 + }, + { + "epoch": 0.7, + "grad_norm": 0.291595995426178, + "learning_rate": 0.0004386179221752202, + "loss": 0.0902, + "step": 394 + }, + { + "epoch": 0.7, + "grad_norm": 0.17152707278728485, + "learning_rate": 0.0004383116702391987, + "loss": 0.0651, + "step": 395 + }, + { + "epoch": 0.7, + "grad_norm": 0.16654878854751587, + "learning_rate": 0.00043800476370160416, + "loss": 0.0824, + "step": 396 + }, + { + "epoch": 0.7, + "grad_norm": 0.18530108034610748, + "learning_rate": 0.000437697203629293, + "loss": 0.0549, + "step": 397 + }, + { + "epoch": 0.7, + "grad_norm": 0.5760988593101501, + "learning_rate": 0.0004373889910913934, + "loss": 0.0803, + "step": 398 + }, + { + "epoch": 0.71, + "grad_norm": 0.4253963232040405, + "learning_rate": 0.00043708012715930154, + "loss": 0.0728, + "step": 399 + }, + { + "epoch": 0.71, + "grad_norm": 0.7932385206222534, + "learning_rate": 0.00043677061290667805, + "loss": 0.1442, + "step": 400 + }, + { + "epoch": 0.71, + "grad_norm": 0.5904709696769714, + "learning_rate": 0.00043646044940944407, + "loss": 0.0999, + "step": 401 + }, + { + "epoch": 0.71, + "grad_norm": 0.9570127129554749, + "learning_rate": 0.0004361496377457777, + "loss": 0.1298, + "step": 402 + }, + { + "epoch": 0.71, + "grad_norm": 0.5049470663070679, + "learning_rate": 0.00043583817899611017, + "loss": 0.0263, + "step": 403 + }, + { + "epoch": 0.71, + "grad_norm": 0.589408814907074, + "learning_rate": 0.00043552607424312195, + "loss": 0.1051, + "step": 404 + }, + { + "epoch": 0.72, + "grad_norm": 0.43722283840179443, + "learning_rate": 0.0004352133245717393, + "loss": 0.0715, + "step": 405 + }, + { + "epoch": 0.72, + "grad_norm": 1.3537758588790894, + "learning_rate": 0.00043489993106913036, + "loss": 0.0322, + "step": 406 + }, + { + "epoch": 0.72, + "grad_norm": 0.3382836580276489, + "learning_rate": 0.000434585894824701, + "loss": 0.0818, + "step": 407 + }, + { + "epoch": 0.72, + "grad_norm": 0.9946733713150024, + "learning_rate": 0.00043427121693009164, + "loss": 0.1536, + "step": 408 + }, + { + "epoch": 0.72, + "grad_norm": 0.9138526320457458, + "learning_rate": 0.0004339558984791732, + "loss": 0.1299, + "step": 409 + }, + { + "epoch": 0.73, + "grad_norm": 0.35993850231170654, + "learning_rate": 0.0004336399405680432, + "loss": 0.0654, + "step": 410 + }, + { + "epoch": 0.73, + "grad_norm": 0.30418309569358826, + "learning_rate": 0.0004333233442950219, + "loss": 0.1026, + "step": 411 + }, + { + "epoch": 0.73, + "grad_norm": 0.256728857755661, + "learning_rate": 0.00043300611076064886, + "loss": 0.083, + "step": 412 + }, + { + "epoch": 0.73, + "grad_norm": 0.3472314774990082, + "learning_rate": 0.00043268824106767865, + "loss": 0.0637, + "step": 413 + }, + { + "epoch": 0.73, + "grad_norm": 0.5615578293800354, + "learning_rate": 0.00043236973632107735, + "loss": 0.1028, + "step": 414 + }, + { + "epoch": 0.73, + "grad_norm": 0.35775747895240784, + "learning_rate": 0.00043205059762801854, + "loss": 0.0829, + "step": 415 + }, + { + "epoch": 0.74, + "grad_norm": 0.287270724773407, + "learning_rate": 0.0004317308260978795, + "loss": 0.0718, + "step": 416 + }, + { + "epoch": 0.74, + "grad_norm": 0.3237059414386749, + "learning_rate": 0.00043141042284223737, + "loss": 0.0797, + "step": 417 + }, + { + "epoch": 0.74, + "grad_norm": 0.2153153419494629, + "learning_rate": 0.0004310893889748653, + "loss": 0.0778, + "step": 418 + }, + { + "epoch": 0.74, + "grad_norm": 0.33600860834121704, + "learning_rate": 0.00043076772561172845, + "loss": 0.0594, + "step": 419 + }, + { + "epoch": 0.74, + "grad_norm": 0.8778895139694214, + "learning_rate": 0.00043044543387098027, + "loss": 0.1722, + "step": 420 + }, + { + "epoch": 0.74, + "grad_norm": 0.503434419631958, + "learning_rate": 0.0004301225148729586, + "loss": 0.1228, + "step": 421 + }, + { + "epoch": 0.75, + "grad_norm": 0.3694842457771301, + "learning_rate": 0.00042979896974018166, + "loss": 0.1033, + "step": 422 + }, + { + "epoch": 0.75, + "grad_norm": 0.329818457365036, + "learning_rate": 0.00042947479959734423, + "loss": 0.0471, + "step": 423 + }, + { + "epoch": 0.75, + "grad_norm": 0.19436267018318176, + "learning_rate": 0.0004291500055713138, + "loss": 0.0409, + "step": 424 + }, + { + "epoch": 0.75, + "grad_norm": 0.3269858658313751, + "learning_rate": 0.0004288245887911263, + "loss": 0.096, + "step": 425 + }, + { + "epoch": 0.75, + "grad_norm": 0.3542991578578949, + "learning_rate": 0.00042849855038798283, + "loss": 0.0986, + "step": 426 + }, + { + "epoch": 0.75, + "eval_loss": 0.07792978733778, + "eval_runtime": 14.8115, + "eval_samples_per_second": 32.205, + "eval_steps_per_second": 8.102, + "step": 426 + }, + { + "epoch": 0.76, + "grad_norm": 0.29685521125793457, + "learning_rate": 0.00042817189149524517, + "loss": 0.11, + "step": 427 + }, + { + "epoch": 0.76, + "grad_norm": 0.2116887867450714, + "learning_rate": 0.00042784461324843194, + "loss": 0.0686, + "step": 428 + }, + { + "epoch": 0.76, + "grad_norm": 0.4866883456707001, + "learning_rate": 0.00042751671678521486, + "loss": 0.0824, + "step": 429 + }, + { + "epoch": 0.76, + "grad_norm": 0.1293468475341797, + "learning_rate": 0.00042718820324541475, + "loss": 0.0464, + "step": 430 + }, + { + "epoch": 0.76, + "grad_norm": 0.3253125250339508, + "learning_rate": 0.0004268590737709972, + "loss": 0.0996, + "step": 431 + }, + { + "epoch": 0.76, + "grad_norm": 0.25559771060943604, + "learning_rate": 0.00042652932950606917, + "loss": 0.0545, + "step": 432 + }, + { + "epoch": 0.77, + "grad_norm": 0.2788093686103821, + "learning_rate": 0.0004261989715968746, + "loss": 0.0502, + "step": 433 + }, + { + "epoch": 0.77, + "grad_norm": 0.6902124285697937, + "learning_rate": 0.00042586800119179046, + "loss": 0.1598, + "step": 434 + }, + { + "epoch": 0.77, + "grad_norm": 0.4788605570793152, + "learning_rate": 0.00042553641944132316, + "loss": 0.1552, + "step": 435 + }, + { + "epoch": 0.77, + "grad_norm": 0.43495067954063416, + "learning_rate": 0.00042520422749810395, + "loss": 0.0907, + "step": 436 + }, + { + "epoch": 0.77, + "grad_norm": 0.3549440801143646, + "learning_rate": 0.0004248714265168853, + "loss": 0.1152, + "step": 437 + }, + { + "epoch": 0.77, + "grad_norm": 0.7210204601287842, + "learning_rate": 0.00042453801765453687, + "loss": 0.1891, + "step": 438 + }, + { + "epoch": 0.78, + "grad_norm": 0.4578750729560852, + "learning_rate": 0.00042420400207004126, + "loss": 0.1383, + "step": 439 + }, + { + "epoch": 0.78, + "grad_norm": 0.3323976993560791, + "learning_rate": 0.00042386938092449036, + "loss": 0.0936, + "step": 440 + }, + { + "epoch": 0.78, + "grad_norm": 0.15145371854305267, + "learning_rate": 0.00042353415538108076, + "loss": 0.0608, + "step": 441 + }, + { + "epoch": 0.78, + "grad_norm": 0.10744435340166092, + "learning_rate": 0.00042319832660511037, + "loss": 0.0865, + "step": 442 + }, + { + "epoch": 0.78, + "grad_norm": 0.13599476218223572, + "learning_rate": 0.0004228618957639738, + "loss": 0.0763, + "step": 443 + }, + { + "epoch": 0.79, + "grad_norm": 0.18250028789043427, + "learning_rate": 0.00042252486402715865, + "loss": 0.0813, + "step": 444 + }, + { + "epoch": 0.79, + "grad_norm": 0.5180188417434692, + "learning_rate": 0.00042218723256624136, + "loss": 0.1603, + "step": 445 + }, + { + "epoch": 0.79, + "grad_norm": 0.2943187355995178, + "learning_rate": 0.000421849002554883, + "loss": 0.1031, + "step": 446 + }, + { + "epoch": 0.79, + "grad_norm": 0.14898087084293365, + "learning_rate": 0.0004215101751688253, + "loss": 0.071, + "step": 447 + }, + { + "epoch": 0.79, + "grad_norm": 0.2951905131340027, + "learning_rate": 0.00042117075158588663, + "loss": 0.0772, + "step": 448 + }, + { + "epoch": 0.79, + "grad_norm": 0.39807453751564026, + "learning_rate": 0.00042083073298595787, + "loss": 0.0561, + "step": 449 + }, + { + "epoch": 0.8, + "grad_norm": 0.45217999815940857, + "learning_rate": 0.0004204901205509981, + "loss": 0.1076, + "step": 450 + }, + { + "epoch": 0.8, + "grad_norm": 0.24114732444286346, + "learning_rate": 0.000420148915465031, + "loss": 0.1169, + "step": 451 + }, + { + "epoch": 0.8, + "grad_norm": 0.6120204329490662, + "learning_rate": 0.00041980711891413994, + "loss": 0.1144, + "step": 452 + }, + { + "epoch": 0.8, + "grad_norm": 0.3900619447231293, + "learning_rate": 0.0004194647320864646, + "loss": 0.0806, + "step": 453 + }, + { + "epoch": 0.8, + "grad_norm": 0.3331635296344757, + "learning_rate": 0.0004191217561721967, + "loss": 0.0655, + "step": 454 + }, + { + "epoch": 0.8, + "grad_norm": 0.29893186688423157, + "learning_rate": 0.0004187781923635753, + "loss": 0.0482, + "step": 455 + }, + { + "epoch": 0.81, + "grad_norm": 0.20024164021015167, + "learning_rate": 0.00041843404185488346, + "loss": 0.0773, + "step": 456 + }, + { + "epoch": 0.81, + "grad_norm": 0.3644329905509949, + "learning_rate": 0.0004180893058424435, + "loss": 0.1062, + "step": 457 + }, + { + "epoch": 0.81, + "grad_norm": 0.5457159280776978, + "learning_rate": 0.0004177439855246132, + "loss": 0.1901, + "step": 458 + }, + { + "epoch": 0.81, + "grad_norm": 0.282032310962677, + "learning_rate": 0.0004173980821017812, + "loss": 0.0656, + "step": 459 + }, + { + "epoch": 0.81, + "grad_norm": 0.1957680881023407, + "learning_rate": 0.00041705159677636334, + "loss": 0.0725, + "step": 460 + }, + { + "epoch": 0.82, + "grad_norm": 0.2736223042011261, + "learning_rate": 0.00041670453075279827, + "loss": 0.0897, + "step": 461 + }, + { + "epoch": 0.82, + "grad_norm": 0.2145017832517624, + "learning_rate": 0.0004163568852375431, + "loss": 0.046, + "step": 462 + }, + { + "epoch": 0.82, + "grad_norm": 0.1434750258922577, + "learning_rate": 0.00041600866143906947, + "loss": 0.0483, + "step": 463 + }, + { + "epoch": 0.82, + "grad_norm": 0.2438279092311859, + "learning_rate": 0.000415659860567859, + "loss": 0.0935, + "step": 464 + }, + { + "epoch": 0.82, + "grad_norm": 0.24830487370491028, + "learning_rate": 0.00041531048383639966, + "loss": 0.1061, + "step": 465 + }, + { + "epoch": 0.82, + "grad_norm": 0.25185227394104004, + "learning_rate": 0.000414960532459181, + "loss": 0.082, + "step": 466 + }, + { + "epoch": 0.83, + "grad_norm": 0.391631156206131, + "learning_rate": 0.00041461000765269, + "loss": 0.1274, + "step": 467 + }, + { + "epoch": 0.83, + "grad_norm": 0.30484774708747864, + "learning_rate": 0.0004142589106354071, + "loss": 0.0672, + "step": 468 + }, + { + "epoch": 0.83, + "grad_norm": 0.2584599554538727, + "learning_rate": 0.0004139072426278021, + "loss": 0.0863, + "step": 469 + }, + { + "epoch": 0.83, + "grad_norm": 0.27182772755622864, + "learning_rate": 0.0004135550048523292, + "loss": 0.0996, + "step": 470 + }, + { + "epoch": 0.83, + "grad_norm": 0.2670001685619354, + "learning_rate": 0.00041320219853342347, + "loss": 0.0592, + "step": 471 + }, + { + "epoch": 0.84, + "grad_norm": 0.19571639597415924, + "learning_rate": 0.0004128488248974962, + "loss": 0.0618, + "step": 472 + }, + { + "epoch": 0.84, + "grad_norm": 0.436814546585083, + "learning_rate": 0.00041249488517293095, + "loss": 0.1131, + "step": 473 + }, + { + "epoch": 0.84, + "grad_norm": 0.21684250235557556, + "learning_rate": 0.0004121403805900789, + "loss": 0.0759, + "step": 474 + }, + { + "epoch": 0.84, + "grad_norm": 0.39313605427742004, + "learning_rate": 0.0004117853123812549, + "loss": 0.0992, + "step": 475 + }, + { + "epoch": 0.84, + "grad_norm": 0.3653202950954437, + "learning_rate": 0.00041142968178073294, + "loss": 0.099, + "step": 476 + }, + { + "epoch": 0.84, + "grad_norm": 0.36615628004074097, + "learning_rate": 0.00041107349002474206, + "loss": 0.06, + "step": 477 + }, + { + "epoch": 0.85, + "grad_norm": 0.2431243658065796, + "learning_rate": 0.00041071673835146194, + "loss": 0.0689, + "step": 478 + }, + { + "epoch": 0.85, + "grad_norm": 0.7869367599487305, + "learning_rate": 0.00041035942800101864, + "loss": 0.1308, + "step": 479 + }, + { + "epoch": 0.85, + "grad_norm": 0.2831230163574219, + "learning_rate": 0.0004100015602154802, + "loss": 0.087, + "step": 480 + }, + { + "epoch": 0.85, + "grad_norm": 0.3709629774093628, + "learning_rate": 0.0004096431362388525, + "loss": 0.0822, + "step": 481 + }, + { + "epoch": 0.85, + "grad_norm": 0.4082586467266083, + "learning_rate": 0.0004092841573170748, + "loss": 0.1114, + "step": 482 + }, + { + "epoch": 0.85, + "grad_norm": 0.2919554114341736, + "learning_rate": 0.0004089246246980154, + "loss": 0.1059, + "step": 483 + }, + { + "epoch": 0.86, + "grad_norm": 0.3750731945037842, + "learning_rate": 0.0004085645396314673, + "loss": 0.082, + "step": 484 + }, + { + "epoch": 0.86, + "grad_norm": 0.21013659238815308, + "learning_rate": 0.000408203903369144, + "loss": 0.0819, + "step": 485 + }, + { + "epoch": 0.86, + "grad_norm": 0.20771674811840057, + "learning_rate": 0.00040784271716467503, + "loss": 0.0687, + "step": 486 + }, + { + "epoch": 0.86, + "grad_norm": 0.157434344291687, + "learning_rate": 0.00040748098227360154, + "loss": 0.0826, + "step": 487 + }, + { + "epoch": 0.86, + "grad_norm": 0.40467727184295654, + "learning_rate": 0.000407118699953372, + "loss": 0.1131, + "step": 488 + }, + { + "epoch": 0.87, + "grad_norm": 0.17521728575229645, + "learning_rate": 0.0004067558714633378, + "loss": 0.116, + "step": 489 + }, + { + "epoch": 0.87, + "grad_norm": 0.2975709140300751, + "learning_rate": 0.0004063924980647492, + "loss": 0.0787, + "step": 490 + }, + { + "epoch": 0.87, + "grad_norm": 0.22513332962989807, + "learning_rate": 0.0004060285810207503, + "loss": 0.0754, + "step": 491 + }, + { + "epoch": 0.87, + "grad_norm": 0.2939409613609314, + "learning_rate": 0.00040566412159637514, + "loss": 0.0505, + "step": 492 + }, + { + "epoch": 0.87, + "grad_norm": 0.21415212750434875, + "learning_rate": 0.000405299121058543, + "loss": 0.0486, + "step": 493 + }, + { + "epoch": 0.87, + "grad_norm": 0.24846945703029633, + "learning_rate": 0.00040493358067605445, + "loss": 0.0645, + "step": 494 + }, + { + "epoch": 0.88, + "grad_norm": 0.42928287386894226, + "learning_rate": 0.00040456750171958655, + "loss": 0.1455, + "step": 495 + }, + { + "epoch": 0.88, + "grad_norm": 0.30920714139938354, + "learning_rate": 0.0004042008854616883, + "loss": 0.0743, + "step": 496 + }, + { + "epoch": 0.88, + "grad_norm": 0.43211719393730164, + "learning_rate": 0.00040383373317677687, + "loss": 0.1037, + "step": 497 + }, + { + "epoch": 0.88, + "grad_norm": 0.49942275881767273, + "learning_rate": 0.00040346604614113215, + "loss": 0.123, + "step": 498 + }, + { + "epoch": 0.88, + "grad_norm": 0.18615621328353882, + "learning_rate": 0.00040309782563289353, + "loss": 0.0783, + "step": 499 + }, + { + "epoch": 0.88, + "grad_norm": 0.22238926589488983, + "learning_rate": 0.0004027290729320545, + "loss": 0.0698, + "step": 500 + }, + { + "epoch": 0.89, + "grad_norm": 0.31746548414230347, + "learning_rate": 0.0004023597893204586, + "loss": 0.1682, + "step": 501 + }, + { + "epoch": 0.89, + "grad_norm": 0.19328100979328156, + "learning_rate": 0.00040198997608179477, + "loss": 0.1028, + "step": 502 + }, + { + "epoch": 0.89, + "grad_norm": 0.15466806292533875, + "learning_rate": 0.00040161963450159333, + "loss": 0.065, + "step": 503 + }, + { + "epoch": 0.89, + "grad_norm": 0.3000398874282837, + "learning_rate": 0.00040124876586722103, + "loss": 0.1071, + "step": 504 + }, + { + "epoch": 0.89, + "grad_norm": 0.16753748059272766, + "learning_rate": 0.00040087737146787654, + "loss": 0.056, + "step": 505 + }, + { + "epoch": 0.9, + "grad_norm": 0.17570586502552032, + "learning_rate": 0.00040050545259458654, + "loss": 0.0732, + "step": 506 + }, + { + "epoch": 0.9, + "grad_norm": 0.19240190088748932, + "learning_rate": 0.00040013301054020055, + "loss": 0.0444, + "step": 507 + }, + { + "epoch": 0.9, + "grad_norm": 0.23935984075069427, + "learning_rate": 0.00039976004659938714, + "loss": 0.0583, + "step": 508 + }, + { + "epoch": 0.9, + "grad_norm": 0.22633028030395508, + "learning_rate": 0.00039938656206862857, + "loss": 0.065, + "step": 509 + }, + { + "epoch": 0.9, + "grad_norm": 0.18621531128883362, + "learning_rate": 0.000399012558246217, + "loss": 0.0489, + "step": 510 + }, + { + "epoch": 0.9, + "grad_norm": 0.37711310386657715, + "learning_rate": 0.0003986380364322498, + "loss": 0.1367, + "step": 511 + }, + { + "epoch": 0.91, + "grad_norm": 0.26448771357536316, + "learning_rate": 0.00039826299792862475, + "loss": 0.076, + "step": 512 + }, + { + "epoch": 0.91, + "grad_norm": 0.22461633384227753, + "learning_rate": 0.00039788744403903604, + "loss": 0.0734, + "step": 513 + }, + { + "epoch": 0.91, + "grad_norm": 0.23908165097236633, + "learning_rate": 0.00039751137606896907, + "loss": 0.0718, + "step": 514 + }, + { + "epoch": 0.91, + "grad_norm": 0.37807080149650574, + "learning_rate": 0.00039713479532569646, + "loss": 0.1495, + "step": 515 + }, + { + "epoch": 0.91, + "grad_norm": 0.16840259730815887, + "learning_rate": 0.00039675770311827337, + "loss": 0.0491, + "step": 516 + }, + { + "epoch": 0.91, + "grad_norm": 0.35179728269577026, + "learning_rate": 0.00039638010075753274, + "loss": 0.0839, + "step": 517 + }, + { + "epoch": 0.92, + "grad_norm": 0.3631207048892975, + "learning_rate": 0.00039600198955608084, + "loss": 0.1348, + "step": 518 + }, + { + "epoch": 0.92, + "grad_norm": 0.38650691509246826, + "learning_rate": 0.00039562337082829304, + "loss": 0.15, + "step": 519 + }, + { + "epoch": 0.92, + "grad_norm": 0.2523843050003052, + "learning_rate": 0.00039524424589030866, + "loss": 0.1172, + "step": 520 + }, + { + "epoch": 0.92, + "grad_norm": 0.2690166234970093, + "learning_rate": 0.00039486461606002686, + "loss": 0.0619, + "step": 521 + }, + { + "epoch": 0.92, + "grad_norm": 0.31193405389785767, + "learning_rate": 0.0003944844826571018, + "loss": 0.0834, + "step": 522 + }, + { + "epoch": 0.93, + "grad_norm": 0.21751855313777924, + "learning_rate": 0.00039410384700293814, + "loss": 0.068, + "step": 523 + }, + { + "epoch": 0.93, + "grad_norm": 0.34191232919692993, + "learning_rate": 0.0003937227104206865, + "loss": 0.1337, + "step": 524 + }, + { + "epoch": 0.93, + "grad_norm": 0.34457269310951233, + "learning_rate": 0.0003933410742352388, + "loss": 0.0929, + "step": 525 + }, + { + "epoch": 0.93, + "grad_norm": 0.22599942982196808, + "learning_rate": 0.0003929589397732236, + "loss": 0.0899, + "step": 526 + }, + { + "epoch": 0.93, + "grad_norm": 0.23162932693958282, + "learning_rate": 0.0003925763083630017, + "loss": 0.0869, + "step": 527 + }, + { + "epoch": 0.93, + "grad_norm": 0.19502510130405426, + "learning_rate": 0.00039219318133466104, + "loss": 0.0834, + "step": 528 + }, + { + "epoch": 0.94, + "grad_norm": 0.2539670169353485, + "learning_rate": 0.0003918095600200128, + "loss": 0.0589, + "step": 529 + }, + { + "epoch": 0.94, + "grad_norm": 0.15578749775886536, + "learning_rate": 0.00039142544575258614, + "loss": 0.0471, + "step": 530 + }, + { + "epoch": 0.94, + "grad_norm": 0.41006144881248474, + "learning_rate": 0.00039104083986762396, + "loss": 0.1215, + "step": 531 + }, + { + "epoch": 0.94, + "grad_norm": 0.3161672055721283, + "learning_rate": 0.00039065574370207785, + "loss": 0.0599, + "step": 532 + }, + { + "epoch": 0.94, + "grad_norm": 0.2556127607822418, + "learning_rate": 0.00039027015859460394, + "loss": 0.0882, + "step": 533 + }, + { + "epoch": 0.94, + "grad_norm": 0.5484500527381897, + "learning_rate": 0.000389884085885558, + "loss": 0.1342, + "step": 534 + }, + { + "epoch": 0.95, + "grad_norm": 0.3688224256038666, + "learning_rate": 0.0003894975269169906, + "loss": 0.062, + "step": 535 + }, + { + "epoch": 0.95, + "grad_norm": 0.6328185796737671, + "learning_rate": 0.0003891104830326427, + "loss": 0.1068, + "step": 536 + }, + { + "epoch": 0.95, + "grad_norm": 0.5094593167304993, + "learning_rate": 0.00038872295557794103, + "loss": 0.0593, + "step": 537 + }, + { + "epoch": 0.95, + "grad_norm": 0.44920942187309265, + "learning_rate": 0.0003883349458999931, + "loss": 0.1134, + "step": 538 + }, + { + "epoch": 0.95, + "grad_norm": 0.25559201836586, + "learning_rate": 0.0003879464553475828, + "loss": 0.0842, + "step": 539 + }, + { + "epoch": 0.96, + "grad_norm": 0.24992522597312927, + "learning_rate": 0.0003875574852711656, + "loss": 0.0684, + "step": 540 + }, + { + "epoch": 0.96, + "grad_norm": 0.7482407093048096, + "learning_rate": 0.0003871680370228639, + "loss": 0.1698, + "step": 541 + }, + { + "epoch": 0.96, + "grad_norm": 0.42716777324676514, + "learning_rate": 0.00038677811195646233, + "loss": 0.1335, + "step": 542 + }, + { + "epoch": 0.96, + "grad_norm": 0.5867021083831787, + "learning_rate": 0.0003863877114274029, + "loss": 0.153, + "step": 543 + }, + { + "epoch": 0.96, + "grad_norm": 0.14882822334766388, + "learning_rate": 0.0003859968367927805, + "loss": 0.0548, + "step": 544 + }, + { + "epoch": 0.96, + "grad_norm": 0.16213174164295197, + "learning_rate": 0.0003856054894113381, + "loss": 0.0859, + "step": 545 + }, + { + "epoch": 0.97, + "grad_norm": 0.13216906785964966, + "learning_rate": 0.0003852136706434619, + "loss": 0.0837, + "step": 546 + }, + { + "epoch": 0.97, + "grad_norm": 0.28230682015419006, + "learning_rate": 0.00038482138185117685, + "loss": 0.0746, + "step": 547 + }, + { + "epoch": 0.97, + "grad_norm": 0.15776745975017548, + "learning_rate": 0.0003844286243981417, + "loss": 0.0758, + "step": 548 + }, + { + "epoch": 0.97, + "grad_norm": 0.38748612999916077, + "learning_rate": 0.0003840353996496444, + "loss": 0.0946, + "step": 549 + }, + { + "epoch": 0.97, + "grad_norm": 0.4377779960632324, + "learning_rate": 0.0003836417089725971, + "loss": 0.078, + "step": 550 + }, + { + "epoch": 0.97, + "grad_norm": 0.4776962101459503, + "learning_rate": 0.0003832475537355319, + "loss": 0.0996, + "step": 551 + }, + { + "epoch": 0.98, + "grad_norm": 0.16078083217144012, + "learning_rate": 0.00038285293530859553, + "loss": 0.0813, + "step": 552 + }, + { + "epoch": 0.98, + "grad_norm": 0.19620949029922485, + "learning_rate": 0.00038245785506354514, + "loss": 0.0716, + "step": 553 + }, + { + "epoch": 0.98, + "grad_norm": 0.23539945483207703, + "learning_rate": 0.0003820623143737427, + "loss": 0.0727, + "step": 554 + }, + { + "epoch": 0.98, + "grad_norm": 0.2797366678714752, + "learning_rate": 0.0003816663146141514, + "loss": 0.0307, + "step": 555 + }, + { + "epoch": 0.98, + "grad_norm": 0.31704849004745483, + "learning_rate": 0.00038126985716132976, + "loss": 0.0522, + "step": 556 + }, + { + "epoch": 0.99, + "grad_norm": 1.038294792175293, + "learning_rate": 0.00038087294339342765, + "loss": 0.1602, + "step": 557 + }, + { + "epoch": 0.99, + "grad_norm": 0.39535316824913025, + "learning_rate": 0.00038047557469018077, + "loss": 0.0672, + "step": 558 + }, + { + "epoch": 0.99, + "grad_norm": 0.5337291359901428, + "learning_rate": 0.00038007775243290666, + "loss": 0.238, + "step": 559 + }, + { + "epoch": 0.99, + "grad_norm": 0.7618711590766907, + "learning_rate": 0.0003796794780044992, + "loss": 0.0741, + "step": 560 + }, + { + "epoch": 0.99, + "grad_norm": 0.3507292568683624, + "learning_rate": 0.0003792807527894242, + "loss": 0.1035, + "step": 561 + }, + { + "epoch": 0.99, + "grad_norm": 0.29699352383613586, + "learning_rate": 0.00037888157817371455, + "loss": 0.0732, + "step": 562 + }, + { + "epoch": 1.0, + "grad_norm": 0.1690889596939087, + "learning_rate": 0.0003784819555449651, + "loss": 0.0625, + "step": 563 + }, + { + "epoch": 1.0, + "grad_norm": 0.28516581654548645, + "learning_rate": 0.0003780818862923284, + "loss": 0.0705, + "step": 564 + }, + { + "epoch": 1.0, + "grad_norm": 0.3408360481262207, + "learning_rate": 0.00037768137180650913, + "loss": 0.1025, + "step": 565 + }, + { + "epoch": 1.0, + "grad_norm": 0.28147757053375244, + "learning_rate": 0.00037728041347976005, + "loss": 0.0495, + "step": 566 + }, + { + "epoch": 1.0, + "grad_norm": 0.31090235710144043, + "learning_rate": 0.00037687901270587655, + "loss": 0.0874, + "step": 567 + }, + { + "epoch": 1.0, + "grad_norm": 0.29111558198928833, + "learning_rate": 0.00037647717088019217, + "loss": 0.0589, + "step": 568 + }, + { + "epoch": 1.0, + "eval_loss": 0.08437130600214005, + "eval_runtime": 14.722, + "eval_samples_per_second": 32.4, + "eval_steps_per_second": 8.151, + "step": 568 + }, + { + "epoch": 1.01, + "grad_norm": 0.14942067861557007, + "learning_rate": 0.0003760748893995736, + "loss": 0.0391, + "step": 569 + }, + { + "epoch": 1.01, + "grad_norm": 0.28915029764175415, + "learning_rate": 0.0003756721696624156, + "loss": 0.0522, + "step": 570 + }, + { + "epoch": 1.01, + "grad_norm": 0.1540856510400772, + "learning_rate": 0.0003752690130686367, + "loss": 0.0473, + "step": 571 + }, + { + "epoch": 1.01, + "grad_norm": 0.31572067737579346, + "learning_rate": 0.0003748654210196739, + "loss": 0.058, + "step": 572 + }, + { + "epoch": 1.01, + "grad_norm": 0.4497004449367523, + "learning_rate": 0.0003744613949184779, + "loss": 0.0937, + "step": 573 + }, + { + "epoch": 1.02, + "grad_norm": 0.48680734634399414, + "learning_rate": 0.0003740569361695082, + "loss": 0.0925, + "step": 574 + }, + { + "epoch": 1.02, + "grad_norm": 0.3604874610900879, + "learning_rate": 0.00037365204617872836, + "loss": 0.0273, + "step": 575 + }, + { + "epoch": 1.02, + "grad_norm": 0.31378456950187683, + "learning_rate": 0.0003732467263536008, + "loss": 0.048, + "step": 576 + }, + { + "epoch": 1.02, + "grad_norm": 0.37197205424308777, + "learning_rate": 0.0003728409781030824, + "loss": 0.0445, + "step": 577 + }, + { + "epoch": 1.02, + "grad_norm": 0.09396038949489594, + "learning_rate": 0.00037243480283761913, + "loss": 0.0102, + "step": 578 + }, + { + "epoch": 1.02, + "grad_norm": 0.4994851052761078, + "learning_rate": 0.00037202820196914133, + "loss": 0.074, + "step": 579 + }, + { + "epoch": 1.03, + "grad_norm": 0.20099425315856934, + "learning_rate": 0.0003716211769110589, + "loss": 0.0239, + "step": 580 + }, + { + "epoch": 1.03, + "grad_norm": 0.33086463809013367, + "learning_rate": 0.0003712137290782561, + "loss": 0.0305, + "step": 581 + }, + { + "epoch": 1.03, + "grad_norm": 0.3871704041957855, + "learning_rate": 0.0003708058598870871, + "loss": 0.0309, + "step": 582 + }, + { + "epoch": 1.03, + "grad_norm": 0.514127790927887, + "learning_rate": 0.0003703975707553706, + "loss": 0.0639, + "step": 583 + }, + { + "epoch": 1.03, + "grad_norm": 0.29386666417121887, + "learning_rate": 0.000369988863102385, + "loss": 0.0778, + "step": 584 + }, + { + "epoch": 1.03, + "grad_norm": 0.4717571437358856, + "learning_rate": 0.0003695797383488638, + "loss": 0.0414, + "step": 585 + }, + { + "epoch": 1.04, + "grad_norm": 0.61000657081604, + "learning_rate": 0.0003691701979169903, + "loss": 0.0687, + "step": 586 + }, + { + "epoch": 1.04, + "grad_norm": 0.3639252483844757, + "learning_rate": 0.0003687602432303926, + "loss": 0.0337, + "step": 587 + }, + { + "epoch": 1.04, + "grad_norm": 0.20936115086078644, + "learning_rate": 0.0003683498757141391, + "loss": 0.0232, + "step": 588 + }, + { + "epoch": 1.04, + "grad_norm": 0.5917474031448364, + "learning_rate": 0.00036793909679473294, + "loss": 0.0564, + "step": 589 + }, + { + "epoch": 1.04, + "grad_norm": 0.23065000772476196, + "learning_rate": 0.00036752790790010767, + "loss": 0.0246, + "step": 590 + }, + { + "epoch": 1.05, + "grad_norm": 0.1876888871192932, + "learning_rate": 0.00036711631045962173, + "loss": 0.0351, + "step": 591 + }, + { + "epoch": 1.05, + "grad_norm": 0.6483283042907715, + "learning_rate": 0.000366704305904054, + "loss": 0.0628, + "step": 592 + }, + { + "epoch": 1.05, + "grad_norm": 0.8450531363487244, + "learning_rate": 0.0003662918956655983, + "loss": 0.0922, + "step": 593 + }, + { + "epoch": 1.05, + "grad_norm": 0.5190649628639221, + "learning_rate": 0.00036587908117785887, + "loss": 0.0715, + "step": 594 + }, + { + "epoch": 1.05, + "grad_norm": 0.597562849521637, + "learning_rate": 0.000365465863875845, + "loss": 0.0728, + "step": 595 + }, + { + "epoch": 1.05, + "grad_norm": 0.5079246759414673, + "learning_rate": 0.0003650522451959663, + "loss": 0.1145, + "step": 596 + }, + { + "epoch": 1.06, + "grad_norm": 0.4016817808151245, + "learning_rate": 0.0003646382265760276, + "loss": 0.1373, + "step": 597 + }, + { + "epoch": 1.06, + "grad_norm": 0.2420119345188141, + "learning_rate": 0.00036422380945522426, + "loss": 0.0428, + "step": 598 + }, + { + "epoch": 1.06, + "grad_norm": 0.2923775017261505, + "learning_rate": 0.00036380899527413646, + "loss": 0.0407, + "step": 599 + }, + { + "epoch": 1.06, + "grad_norm": 0.2968994379043579, + "learning_rate": 0.00036339378547472497, + "loss": 0.039, + "step": 600 + }, + { + "epoch": 1.06, + "grad_norm": 0.3694530129432678, + "learning_rate": 0.0003629781815003256, + "loss": 0.0765, + "step": 601 + }, + { + "epoch": 1.07, + "grad_norm": 0.19854502379894257, + "learning_rate": 0.0003625621847956443, + "loss": 0.0283, + "step": 602 + }, + { + "epoch": 1.07, + "grad_norm": 0.16821172833442688, + "learning_rate": 0.0003621457968067526, + "loss": 0.0551, + "step": 603 + }, + { + "epoch": 1.07, + "grad_norm": 0.5689147114753723, + "learning_rate": 0.00036172901898108177, + "loss": 0.0818, + "step": 604 + }, + { + "epoch": 1.07, + "grad_norm": 0.16156058013439178, + "learning_rate": 0.0003613118527674185, + "loss": 0.0368, + "step": 605 + }, + { + "epoch": 1.07, + "grad_norm": 0.21022377908229828, + "learning_rate": 0.00036089429961589926, + "loss": 0.0614, + "step": 606 + }, + { + "epoch": 1.07, + "grad_norm": 0.3432522714138031, + "learning_rate": 0.00036047636097800593, + "loss": 0.0393, + "step": 607 + }, + { + "epoch": 1.08, + "grad_norm": 0.2537219524383545, + "learning_rate": 0.00036005803830656036, + "loss": 0.0852, + "step": 608 + }, + { + "epoch": 1.08, + "grad_norm": 0.209491565823555, + "learning_rate": 0.00035963933305571916, + "loss": 0.0476, + "step": 609 + }, + { + "epoch": 1.08, + "grad_norm": 0.2286662459373474, + "learning_rate": 0.00035922024668096883, + "loss": 0.0614, + "step": 610 + }, + { + "epoch": 1.08, + "grad_norm": 0.5772972106933594, + "learning_rate": 0.00035880078063912105, + "loss": 0.0546, + "step": 611 + }, + { + "epoch": 1.08, + "grad_norm": 0.37829965353012085, + "learning_rate": 0.0003583809363883069, + "loss": 0.0526, + "step": 612 + }, + { + "epoch": 1.08, + "grad_norm": 0.1876819133758545, + "learning_rate": 0.0003579607153879724, + "loss": 0.0339, + "step": 613 + }, + { + "epoch": 1.09, + "grad_norm": 0.42904049158096313, + "learning_rate": 0.0003575401190988732, + "loss": 0.0705, + "step": 614 + }, + { + "epoch": 1.09, + "grad_norm": 0.2780819833278656, + "learning_rate": 0.0003571191489830693, + "loss": 0.0425, + "step": 615 + }, + { + "epoch": 1.09, + "grad_norm": 0.3338189721107483, + "learning_rate": 0.00035669780650392056, + "loss": 0.0713, + "step": 616 + }, + { + "epoch": 1.09, + "grad_norm": 0.2791332006454468, + "learning_rate": 0.000356276093126081, + "loss": 0.0392, + "step": 617 + }, + { + "epoch": 1.09, + "grad_norm": 0.4691467881202698, + "learning_rate": 0.0003558540103154939, + "loss": 0.0756, + "step": 618 + }, + { + "epoch": 1.1, + "grad_norm": 0.34194234013557434, + "learning_rate": 0.00035543155953938674, + "loss": 0.057, + "step": 619 + }, + { + "epoch": 1.1, + "grad_norm": 0.43898969888687134, + "learning_rate": 0.00035500874226626633, + "loss": 0.1484, + "step": 620 + }, + { + "epoch": 1.1, + "grad_norm": 0.3562189042568207, + "learning_rate": 0.00035458555996591325, + "loss": 0.0801, + "step": 621 + }, + { + "epoch": 1.1, + "grad_norm": 0.2978869080543518, + "learning_rate": 0.0003541620141093771, + "loss": 0.0422, + "step": 622 + }, + { + "epoch": 1.1, + "grad_norm": 0.415714293718338, + "learning_rate": 0.00035373810616897116, + "loss": 0.042, + "step": 623 + }, + { + "epoch": 1.1, + "grad_norm": 0.28547269105911255, + "learning_rate": 0.00035331383761826756, + "loss": 0.0722, + "step": 624 + }, + { + "epoch": 1.11, + "grad_norm": 0.2831112742424011, + "learning_rate": 0.00035288920993209173, + "loss": 0.0339, + "step": 625 + }, + { + "epoch": 1.11, + "grad_norm": 0.372010201215744, + "learning_rate": 0.00035246422458651766, + "loss": 0.0573, + "step": 626 + }, + { + "epoch": 1.11, + "grad_norm": 0.07014724612236023, + "learning_rate": 0.0003520388830588625, + "loss": 0.0108, + "step": 627 + }, + { + "epoch": 1.11, + "grad_norm": 0.5464847087860107, + "learning_rate": 0.0003516131868276817, + "loss": 0.0871, + "step": 628 + }, + { + "epoch": 1.11, + "grad_norm": 0.118097685277462, + "learning_rate": 0.00035118713737276376, + "loss": 0.0176, + "step": 629 + }, + { + "epoch": 1.11, + "grad_norm": 0.43580326437950134, + "learning_rate": 0.00035076073617512475, + "loss": 0.0817, + "step": 630 + }, + { + "epoch": 1.12, + "grad_norm": 0.42866209149360657, + "learning_rate": 0.00035033398471700367, + "loss": 0.1195, + "step": 631 + }, + { + "epoch": 1.12, + "grad_norm": 0.42996305227279663, + "learning_rate": 0.0003499068844818571, + "loss": 0.12, + "step": 632 + }, + { + "epoch": 1.12, + "grad_norm": 0.4283413290977478, + "learning_rate": 0.0003494794369543539, + "loss": 0.085, + "step": 633 + }, + { + "epoch": 1.12, + "grad_norm": 0.693706214427948, + "learning_rate": 0.0003490516436203703, + "loss": 0.126, + "step": 634 + }, + { + "epoch": 1.12, + "grad_norm": 0.264961838722229, + "learning_rate": 0.00034862350596698456, + "loss": 0.0556, + "step": 635 + }, + { + "epoch": 1.13, + "grad_norm": 0.2530398368835449, + "learning_rate": 0.00034819502548247175, + "loss": 0.0514, + "step": 636 + }, + { + "epoch": 1.13, + "grad_norm": 0.18521098792552948, + "learning_rate": 0.0003477662036562989, + "loss": 0.0387, + "step": 637 + }, + { + "epoch": 1.13, + "grad_norm": 0.34398314356803894, + "learning_rate": 0.00034733704197911937, + "loss": 0.1047, + "step": 638 + }, + { + "epoch": 1.13, + "grad_norm": 0.16019423305988312, + "learning_rate": 0.000346907541942768, + "loss": 0.0299, + "step": 639 + }, + { + "epoch": 1.13, + "grad_norm": 0.3269899785518646, + "learning_rate": 0.00034647770504025587, + "loss": 0.0405, + "step": 640 + }, + { + "epoch": 1.13, + "grad_norm": 0.46410876512527466, + "learning_rate": 0.00034604753276576487, + "loss": 0.0855, + "step": 641 + }, + { + "epoch": 1.14, + "grad_norm": 0.33000048995018005, + "learning_rate": 0.000345617026614643, + "loss": 0.0759, + "step": 642 + }, + { + "epoch": 1.14, + "grad_norm": 0.31162315607070923, + "learning_rate": 0.0003451861880833986, + "loss": 0.0558, + "step": 643 + }, + { + "epoch": 1.14, + "grad_norm": 0.42918407917022705, + "learning_rate": 0.0003447550186696956, + "loss": 0.0365, + "step": 644 + }, + { + "epoch": 1.14, + "grad_norm": 0.2732023298740387, + "learning_rate": 0.00034432351987234786, + "loss": 0.0616, + "step": 645 + }, + { + "epoch": 1.14, + "grad_norm": 0.1899593621492386, + "learning_rate": 0.00034389169319131476, + "loss": 0.0286, + "step": 646 + }, + { + "epoch": 1.14, + "grad_norm": 0.447968065738678, + "learning_rate": 0.0003434595401276947, + "loss": 0.0701, + "step": 647 + }, + { + "epoch": 1.15, + "grad_norm": 0.15938018262386322, + "learning_rate": 0.0003430270621837213, + "loss": 0.026, + "step": 648 + }, + { + "epoch": 1.15, + "grad_norm": 0.17606832087039948, + "learning_rate": 0.0003425942608627572, + "loss": 0.0245, + "step": 649 + }, + { + "epoch": 1.15, + "grad_norm": 0.49266988039016724, + "learning_rate": 0.0003421611376692892, + "loss": 0.0823, + "step": 650 + }, + { + "epoch": 1.15, + "grad_norm": 0.3935730755329132, + "learning_rate": 0.0003417276941089232, + "loss": 0.0426, + "step": 651 + }, + { + "epoch": 1.15, + "grad_norm": 0.5984533429145813, + "learning_rate": 0.0003412939316883782, + "loss": 0.0833, + "step": 652 + }, + { + "epoch": 1.16, + "grad_norm": 0.3196690082550049, + "learning_rate": 0.00034085985191548217, + "loss": 0.0337, + "step": 653 + }, + { + "epoch": 1.16, + "grad_norm": 0.39022788405418396, + "learning_rate": 0.000340425456299166, + "loss": 0.0235, + "step": 654 + }, + { + "epoch": 1.16, + "grad_norm": 0.29681891202926636, + "learning_rate": 0.00033999074634945856, + "loss": 0.0155, + "step": 655 + }, + { + "epoch": 1.16, + "grad_norm": 0.5547076463699341, + "learning_rate": 0.0003395557235774813, + "loss": 0.0942, + "step": 656 + }, + { + "epoch": 1.16, + "grad_norm": 0.9071078300476074, + "learning_rate": 0.00033912038949544316, + "loss": 0.1004, + "step": 657 + }, + { + "epoch": 1.16, + "grad_norm": 0.5410562753677368, + "learning_rate": 0.00033868474561663534, + "loss": 0.0743, + "step": 658 + }, + { + "epoch": 1.17, + "grad_norm": 0.5785720348358154, + "learning_rate": 0.0003382487934554257, + "loss": 0.1017, + "step": 659 + }, + { + "epoch": 1.17, + "grad_norm": 0.60345858335495, + "learning_rate": 0.0003378125345272539, + "loss": 0.117, + "step": 660 + }, + { + "epoch": 1.17, + "grad_norm": 0.23607215285301208, + "learning_rate": 0.0003373759703486262, + "loss": 0.0149, + "step": 661 + }, + { + "epoch": 1.17, + "grad_norm": 0.3551620543003082, + "learning_rate": 0.0003369391024371093, + "loss": 0.0435, + "step": 662 + }, + { + "epoch": 1.17, + "grad_norm": 0.4280162453651428, + "learning_rate": 0.00033650193231132657, + "loss": 0.1019, + "step": 663 + }, + { + "epoch": 1.17, + "grad_norm": 0.12040708214044571, + "learning_rate": 0.0003360644614909512, + "loss": 0.0165, + "step": 664 + }, + { + "epoch": 1.18, + "grad_norm": 0.6838027238845825, + "learning_rate": 0.00033562669149670213, + "loss": 0.0909, + "step": 665 + }, + { + "epoch": 1.18, + "grad_norm": 0.2861780524253845, + "learning_rate": 0.00033518862385033786, + "loss": 0.0719, + "step": 666 + }, + { + "epoch": 1.18, + "grad_norm": 0.24380998313426971, + "learning_rate": 0.00033475026007465184, + "loss": 0.0388, + "step": 667 + }, + { + "epoch": 1.18, + "grad_norm": 0.41201332211494446, + "learning_rate": 0.00033431160169346714, + "loss": 0.0442, + "step": 668 + }, + { + "epoch": 1.18, + "grad_norm": 0.3734363615512848, + "learning_rate": 0.0003338726502316304, + "loss": 0.0687, + "step": 669 + }, + { + "epoch": 1.19, + "grad_norm": 0.21814176440238953, + "learning_rate": 0.00033343340721500743, + "loss": 0.0743, + "step": 670 + }, + { + "epoch": 1.19, + "grad_norm": 0.17123498022556305, + "learning_rate": 0.00033299387417047723, + "loss": 0.0446, + "step": 671 + }, + { + "epoch": 1.19, + "grad_norm": 0.3857256770133972, + "learning_rate": 0.0003325540526259275, + "loss": 0.0524, + "step": 672 + }, + { + "epoch": 1.19, + "grad_norm": 0.7980711460113525, + "learning_rate": 0.00033211394411024813, + "loss": 0.0786, + "step": 673 + }, + { + "epoch": 1.19, + "grad_norm": 0.31176111102104187, + "learning_rate": 0.00033167355015332713, + "loss": 0.0499, + "step": 674 + }, + { + "epoch": 1.19, + "grad_norm": 0.6255938410758972, + "learning_rate": 0.0003312328722860445, + "loss": 0.0664, + "step": 675 + }, + { + "epoch": 1.2, + "grad_norm": 0.3685753047466278, + "learning_rate": 0.00033079191204026713, + "loss": 0.0495, + "step": 676 + }, + { + "epoch": 1.2, + "grad_norm": 0.4045291841030121, + "learning_rate": 0.00033035067094884366, + "loss": 0.0697, + "step": 677 + }, + { + "epoch": 1.2, + "grad_norm": 0.6035248637199402, + "learning_rate": 0.0003299091505455989, + "loss": 0.1206, + "step": 678 + }, + { + "epoch": 1.2, + "grad_norm": 0.3399547338485718, + "learning_rate": 0.00032946735236532855, + "loss": 0.035, + "step": 679 + }, + { + "epoch": 1.2, + "grad_norm": 0.3604506552219391, + "learning_rate": 0.0003290252779437939, + "loss": 0.1087, + "step": 680 + }, + { + "epoch": 1.2, + "grad_norm": 0.28665006160736084, + "learning_rate": 0.0003285829288177167, + "loss": 0.0858, + "step": 681 + }, + { + "epoch": 1.21, + "grad_norm": 0.41967740654945374, + "learning_rate": 0.0003281403065247733, + "loss": 0.0851, + "step": 682 + }, + { + "epoch": 1.21, + "grad_norm": 0.43989259004592896, + "learning_rate": 0.00032769741260358997, + "loss": 0.0793, + "step": 683 + }, + { + "epoch": 1.21, + "grad_norm": 0.29274123907089233, + "learning_rate": 0.00032725424859373687, + "loss": 0.0538, + "step": 684 + }, + { + "epoch": 1.21, + "grad_norm": 0.27231287956237793, + "learning_rate": 0.0003268108160357233, + "loss": 0.0692, + "step": 685 + }, + { + "epoch": 1.21, + "grad_norm": 0.3030160963535309, + "learning_rate": 0.0003263671164709918, + "loss": 0.0786, + "step": 686 + }, + { + "epoch": 1.22, + "grad_norm": 0.19824832677841187, + "learning_rate": 0.0003259231514419135, + "loss": 0.0699, + "step": 687 + }, + { + "epoch": 1.22, + "grad_norm": 0.23121508955955505, + "learning_rate": 0.0003254789224917818, + "loss": 0.0499, + "step": 688 + }, + { + "epoch": 1.22, + "grad_norm": 0.15240328013896942, + "learning_rate": 0.0003250344311648079, + "loss": 0.0431, + "step": 689 + }, + { + "epoch": 1.22, + "grad_norm": 0.16165931522846222, + "learning_rate": 0.000324589679006115, + "loss": 0.0554, + "step": 690 + }, + { + "epoch": 1.22, + "grad_norm": 0.29693150520324707, + "learning_rate": 0.0003241446675617329, + "loss": 0.0554, + "step": 691 + }, + { + "epoch": 1.22, + "grad_norm": 0.7295424938201904, + "learning_rate": 0.00032369939837859275, + "loss": 0.1232, + "step": 692 + }, + { + "epoch": 1.23, + "grad_norm": 0.43246909976005554, + "learning_rate": 0.0003232538730045215, + "loss": 0.0598, + "step": 693 + }, + { + "epoch": 1.23, + "grad_norm": 0.18855467438697815, + "learning_rate": 0.00032280809298823723, + "loss": 0.0252, + "step": 694 + }, + { + "epoch": 1.23, + "grad_norm": 0.26345816254615784, + "learning_rate": 0.00032236205987934234, + "loss": 0.0809, + "step": 695 + }, + { + "epoch": 1.23, + "grad_norm": 0.497403085231781, + "learning_rate": 0.00032191577522831984, + "loss": 0.0482, + "step": 696 + }, + { + "epoch": 1.23, + "grad_norm": 0.2640454173088074, + "learning_rate": 0.0003214692405865264, + "loss": 0.0538, + "step": 697 + }, + { + "epoch": 1.23, + "grad_norm": 0.335443377494812, + "learning_rate": 0.00032102245750618833, + "loss": 0.1, + "step": 698 + }, + { + "epoch": 1.24, + "grad_norm": 0.37383145093917847, + "learning_rate": 0.00032057542754039526, + "loss": 0.0767, + "step": 699 + }, + { + "epoch": 1.24, + "grad_norm": 0.3101638853549957, + "learning_rate": 0.00032012815224309496, + "loss": 0.0499, + "step": 700 + }, + { + "epoch": 1.24, + "grad_norm": 0.2282581478357315, + "learning_rate": 0.00031968063316908815, + "loss": 0.0424, + "step": 701 + }, + { + "epoch": 1.24, + "grad_norm": 0.1553642451763153, + "learning_rate": 0.00031923287187402287, + "loss": 0.0446, + "step": 702 + }, + { + "epoch": 1.24, + "grad_norm": 0.1549568623304367, + "learning_rate": 0.0003187848699143894, + "loss": 0.0252, + "step": 703 + }, + { + "epoch": 1.25, + "grad_norm": 0.4515601694583893, + "learning_rate": 0.00031833662884751416, + "loss": 0.0852, + "step": 704 + }, + { + "epoch": 1.25, + "grad_norm": 0.23497383296489716, + "learning_rate": 0.0003178881502315552, + "loss": 0.0291, + "step": 705 + }, + { + "epoch": 1.25, + "grad_norm": 0.5249956846237183, + "learning_rate": 0.000317439435625496, + "loss": 0.0478, + "step": 706 + }, + { + "epoch": 1.25, + "grad_norm": 0.7284122705459595, + "learning_rate": 0.0003169904865891405, + "loss": 0.0584, + "step": 707 + }, + { + "epoch": 1.25, + "grad_norm": 0.34631770849227905, + "learning_rate": 0.00031654130468310784, + "loss": 0.092, + "step": 708 + }, + { + "epoch": 1.25, + "grad_norm": 0.43634921312332153, + "learning_rate": 0.000316091891468826, + "loss": 0.0758, + "step": 709 + }, + { + "epoch": 1.26, + "grad_norm": 0.2862977683544159, + "learning_rate": 0.00031564224850852754, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 1.26, + "eval_loss": 0.08585863560438156, + "eval_runtime": 14.7073, + "eval_samples_per_second": 32.433, + "eval_steps_per_second": 8.159, + "step": 710 + }, + { + "epoch": 1.26, + "grad_norm": 0.4469147324562073, + "learning_rate": 0.0003151923773652436, + "loss": 0.0807, + "step": 711 + }, + { + "epoch": 1.26, + "grad_norm": 0.25653207302093506, + "learning_rate": 0.00031474227960279834, + "loss": 0.0618, + "step": 712 + }, + { + "epoch": 1.26, + "grad_norm": 0.4219815731048584, + "learning_rate": 0.0003142919567858039, + "loss": 0.0592, + "step": 713 + }, + { + "epoch": 1.26, + "grad_norm": 0.8190480470657349, + "learning_rate": 0.0003138414104796545, + "loss": 0.103, + "step": 714 + }, + { + "epoch": 1.26, + "grad_norm": 0.3400764465332031, + "learning_rate": 0.0003133906422505215, + "loss": 0.0998, + "step": 715 + }, + { + "epoch": 1.27, + "grad_norm": 0.21949267387390137, + "learning_rate": 0.0003129396536653474, + "loss": 0.0395, + "step": 716 + }, + { + "epoch": 1.27, + "grad_norm": 0.30582305788993835, + "learning_rate": 0.0003124884462918411, + "loss": 0.0835, + "step": 717 + }, + { + "epoch": 1.27, + "grad_norm": 0.11762493848800659, + "learning_rate": 0.0003120370216984716, + "loss": 0.026, + "step": 718 + }, + { + "epoch": 1.27, + "grad_norm": 0.1867324858903885, + "learning_rate": 0.00031158538145446314, + "loss": 0.0544, + "step": 719 + }, + { + "epoch": 1.27, + "grad_norm": 0.15806153416633606, + "learning_rate": 0.00031113352712978996, + "loss": 0.0406, + "step": 720 + }, + { + "epoch": 1.28, + "grad_norm": 0.2605026662349701, + "learning_rate": 0.00031068146029516997, + "loss": 0.0431, + "step": 721 + }, + { + "epoch": 1.28, + "grad_norm": 0.27978816628456116, + "learning_rate": 0.00031022918252206005, + "loss": 0.0948, + "step": 722 + }, + { + "epoch": 1.28, + "grad_norm": 0.19412288069725037, + "learning_rate": 0.00030977669538265017, + "loss": 0.0305, + "step": 723 + }, + { + "epoch": 1.28, + "grad_norm": 0.42198699712753296, + "learning_rate": 0.0003093240004498585, + "loss": 0.1205, + "step": 724 + }, + { + "epoch": 1.28, + "grad_norm": 0.22501109540462494, + "learning_rate": 0.0003088710992973249, + "loss": 0.0368, + "step": 725 + }, + { + "epoch": 1.28, + "grad_norm": 0.4651114046573639, + "learning_rate": 0.00030841799349940667, + "loss": 0.1044, + "step": 726 + }, + { + "epoch": 1.29, + "grad_norm": 0.4760609567165375, + "learning_rate": 0.00030796468463117216, + "loss": 0.0829, + "step": 727 + }, + { + "epoch": 1.29, + "grad_norm": 0.308748722076416, + "learning_rate": 0.0003075111742683957, + "loss": 0.0382, + "step": 728 + }, + { + "epoch": 1.29, + "grad_norm": 0.21451212465763092, + "learning_rate": 0.0003070574639875521, + "loss": 0.0441, + "step": 729 + }, + { + "epoch": 1.29, + "grad_norm": 0.1944577842950821, + "learning_rate": 0.00030660355536581103, + "loss": 0.0326, + "step": 730 + }, + { + "epoch": 1.29, + "grad_norm": 0.5249868035316467, + "learning_rate": 0.0003061494499810317, + "loss": 0.0857, + "step": 731 + }, + { + "epoch": 1.3, + "grad_norm": 0.2004554718732834, + "learning_rate": 0.00030569514941175725, + "loss": 0.0533, + "step": 732 + }, + { + "epoch": 1.3, + "grad_norm": 0.21708889305591583, + "learning_rate": 0.00030524065523720935, + "loss": 0.0562, + "step": 733 + }, + { + "epoch": 1.3, + "grad_norm": 0.36287248134613037, + "learning_rate": 0.00030478596903728267, + "loss": 0.1, + "step": 734 + }, + { + "epoch": 1.3, + "grad_norm": 0.4405117928981781, + "learning_rate": 0.0003043310923925394, + "loss": 0.0929, + "step": 735 + }, + { + "epoch": 1.3, + "grad_norm": 0.3343874514102936, + "learning_rate": 0.0003038760268842036, + "loss": 0.0549, + "step": 736 + }, + { + "epoch": 1.3, + "grad_norm": 0.2254178822040558, + "learning_rate": 0.00030342077409415606, + "loss": 0.0495, + "step": 737 + }, + { + "epoch": 1.31, + "grad_norm": 0.19972631335258484, + "learning_rate": 0.00030296533560492854, + "loss": 0.0301, + "step": 738 + }, + { + "epoch": 1.31, + "grad_norm": 0.19470427930355072, + "learning_rate": 0.0003025097129996983, + "loss": 0.0485, + "step": 739 + }, + { + "epoch": 1.31, + "grad_norm": 0.34024572372436523, + "learning_rate": 0.0003020539078622824, + "loss": 0.0509, + "step": 740 + }, + { + "epoch": 1.31, + "grad_norm": 0.26424598693847656, + "learning_rate": 0.00030159792177713294, + "loss": 0.0293, + "step": 741 + }, + { + "epoch": 1.31, + "grad_norm": 0.3307158946990967, + "learning_rate": 0.00030114175632933043, + "loss": 0.0302, + "step": 742 + }, + { + "epoch": 1.31, + "grad_norm": 0.448448121547699, + "learning_rate": 0.0003006854131045793, + "loss": 0.0683, + "step": 743 + }, + { + "epoch": 1.32, + "grad_norm": 0.4010024070739746, + "learning_rate": 0.0003002288936892017, + "loss": 0.078, + "step": 744 + }, + { + "epoch": 1.32, + "grad_norm": 0.22475454211235046, + "learning_rate": 0.0002997721996701324, + "loss": 0.0303, + "step": 745 + }, + { + "epoch": 1.32, + "grad_norm": 0.31867673993110657, + "learning_rate": 0.000299315332634913, + "loss": 0.0521, + "step": 746 + }, + { + "epoch": 1.32, + "grad_norm": 0.2040076106786728, + "learning_rate": 0.0002988582941716867, + "loss": 0.0225, + "step": 747 + }, + { + "epoch": 1.32, + "grad_norm": 1.1286782026290894, + "learning_rate": 0.00029840108586919246, + "loss": 0.0833, + "step": 748 + }, + { + "epoch": 1.33, + "grad_norm": 0.6526787877082825, + "learning_rate": 0.00029794370931675963, + "loss": 0.1085, + "step": 749 + }, + { + "epoch": 1.33, + "grad_norm": 0.3272201418876648, + "learning_rate": 0.00029748616610430264, + "loss": 0.0213, + "step": 750 + }, + { + "epoch": 1.33, + "grad_norm": 0.5573351383209229, + "learning_rate": 0.0002970284578223149, + "loss": 0.0478, + "step": 751 + }, + { + "epoch": 1.33, + "grad_norm": 0.31984782218933105, + "learning_rate": 0.00029657058606186393, + "loss": 0.0353, + "step": 752 + }, + { + "epoch": 1.33, + "grad_norm": 1.712653398513794, + "learning_rate": 0.00029611255241458533, + "loss": 0.0787, + "step": 753 + }, + { + "epoch": 1.33, + "grad_norm": 0.5753667950630188, + "learning_rate": 0.00029565435847267766, + "loss": 0.1024, + "step": 754 + }, + { + "epoch": 1.34, + "grad_norm": 0.44150909781455994, + "learning_rate": 0.00029519600582889657, + "loss": 0.0261, + "step": 755 + }, + { + "epoch": 1.34, + "grad_norm": 0.2625367343425751, + "learning_rate": 0.00029473749607654914, + "loss": 0.0685, + "step": 756 + }, + { + "epoch": 1.34, + "grad_norm": 0.48232585191726685, + "learning_rate": 0.00029427883080948905, + "loss": 0.0299, + "step": 757 + }, + { + "epoch": 1.34, + "grad_norm": 0.2749970853328705, + "learning_rate": 0.00029382001162211026, + "loss": 0.022, + "step": 758 + }, + { + "epoch": 1.34, + "grad_norm": 0.2759183645248413, + "learning_rate": 0.00029336104010934186, + "loss": 0.0417, + "step": 759 + }, + { + "epoch": 1.34, + "grad_norm": 0.11506503075361252, + "learning_rate": 0.0002929019178666425, + "loss": 0.0114, + "step": 760 + }, + { + "epoch": 1.35, + "grad_norm": 0.10981517285108566, + "learning_rate": 0.0002924426464899947, + "loss": 0.0132, + "step": 761 + }, + { + "epoch": 1.35, + "grad_norm": 0.2663099467754364, + "learning_rate": 0.0002919832275758994, + "loss": 0.0628, + "step": 762 + }, + { + "epoch": 1.35, + "grad_norm": 0.6475871205329895, + "learning_rate": 0.0002915236627213705, + "loss": 0.0819, + "step": 763 + }, + { + "epoch": 1.35, + "grad_norm": 0.3743927478790283, + "learning_rate": 0.00029106395352392913, + "loss": 0.0526, + "step": 764 + }, + { + "epoch": 1.35, + "grad_norm": 0.2879592776298523, + "learning_rate": 0.0002906041015815983, + "loss": 0.0441, + "step": 765 + }, + { + "epoch": 1.36, + "grad_norm": 0.5118088126182556, + "learning_rate": 0.0002901441084928969, + "loss": 0.0304, + "step": 766 + }, + { + "epoch": 1.36, + "grad_norm": 0.8219065070152283, + "learning_rate": 0.000289683975856835, + "loss": 0.0682, + "step": 767 + }, + { + "epoch": 1.36, + "grad_norm": 0.45978665351867676, + "learning_rate": 0.00028922370527290715, + "loss": 0.0385, + "step": 768 + }, + { + "epoch": 1.36, + "grad_norm": 0.3295181095600128, + "learning_rate": 0.000288763298341088, + "loss": 0.0219, + "step": 769 + }, + { + "epoch": 1.36, + "grad_norm": 0.9101231694221497, + "learning_rate": 0.00028830275666182564, + "loss": 0.1396, + "step": 770 + }, + { + "epoch": 1.36, + "grad_norm": 0.4265996217727661, + "learning_rate": 0.000287842081836037, + "loss": 0.0514, + "step": 771 + }, + { + "epoch": 1.37, + "grad_norm": 0.6179928183555603, + "learning_rate": 0.00028738127546510165, + "loss": 0.0615, + "step": 772 + }, + { + "epoch": 1.37, + "grad_norm": 0.39278754591941833, + "learning_rate": 0.00028692033915085635, + "loss": 0.0422, + "step": 773 + }, + { + "epoch": 1.37, + "grad_norm": 0.4660128653049469, + "learning_rate": 0.00028645927449558986, + "loss": 0.1055, + "step": 774 + }, + { + "epoch": 1.37, + "grad_norm": 0.6555572152137756, + "learning_rate": 0.0002859980831020366, + "loss": 0.0934, + "step": 775 + }, + { + "epoch": 1.37, + "grad_norm": 0.4676145613193512, + "learning_rate": 0.0002855367665733722, + "loss": 0.0624, + "step": 776 + }, + { + "epoch": 1.37, + "grad_norm": 0.6434176564216614, + "learning_rate": 0.0002850753265132066, + "loss": 0.0415, + "step": 777 + }, + { + "epoch": 1.38, + "grad_norm": 0.2195662558078766, + "learning_rate": 0.0002846137645255796, + "loss": 0.0502, + "step": 778 + }, + { + "epoch": 1.38, + "grad_norm": 0.33307918906211853, + "learning_rate": 0.00028415208221495465, + "loss": 0.0692, + "step": 779 + }, + { + "epoch": 1.38, + "grad_norm": 0.16945867240428925, + "learning_rate": 0.0002836902811862136, + "loss": 0.0296, + "step": 780 + }, + { + "epoch": 1.38, + "grad_norm": 0.3181239366531372, + "learning_rate": 0.00028322836304465093, + "loss": 0.056, + "step": 781 + }, + { + "epoch": 1.38, + "grad_norm": 0.2984309494495392, + "learning_rate": 0.000282766329395968, + "loss": 0.0603, + "step": 782 + }, + { + "epoch": 1.39, + "grad_norm": 0.19178460538387299, + "learning_rate": 0.0002823041818462681, + "loss": 0.0315, + "step": 783 + }, + { + "epoch": 1.39, + "grad_norm": 0.3132456839084625, + "learning_rate": 0.0002818419220020502, + "loss": 0.0421, + "step": 784 + }, + { + "epoch": 1.39, + "grad_norm": 0.4850837290287018, + "learning_rate": 0.00028137955147020355, + "loss": 0.0835, + "step": 785 + }, + { + "epoch": 1.39, + "grad_norm": 0.2325926274061203, + "learning_rate": 0.00028091707185800245, + "loss": 0.033, + "step": 786 + }, + { + "epoch": 1.39, + "grad_norm": 0.5203066468238831, + "learning_rate": 0.0002804544847731001, + "loss": 0.0766, + "step": 787 + }, + { + "epoch": 1.39, + "grad_norm": 0.2967028021812439, + "learning_rate": 0.00027999179182352347, + "loss": 0.0287, + "step": 788 + }, + { + "epoch": 1.4, + "grad_norm": 0.42808797955513, + "learning_rate": 0.0002795289946176674, + "loss": 0.039, + "step": 789 + }, + { + "epoch": 1.4, + "grad_norm": 0.36871954798698425, + "learning_rate": 0.00027906609476428937, + "loss": 0.0388, + "step": 790 + }, + { + "epoch": 1.4, + "grad_norm": 0.31911173462867737, + "learning_rate": 0.0002786030938725034, + "loss": 0.0575, + "step": 791 + }, + { + "epoch": 1.4, + "grad_norm": 0.2864239513874054, + "learning_rate": 0.00027813999355177476, + "loss": 0.0711, + "step": 792 + }, + { + "epoch": 1.4, + "grad_norm": 0.11181977391242981, + "learning_rate": 0.0002776767954119147, + "loss": 0.0126, + "step": 793 + }, + { + "epoch": 1.4, + "grad_norm": 0.32950931787490845, + "learning_rate": 0.0002772135010630741, + "loss": 0.025, + "step": 794 + }, + { + "epoch": 1.41, + "grad_norm": 0.3144387900829315, + "learning_rate": 0.0002767501121157386, + "loss": 0.0244, + "step": 795 + }, + { + "epoch": 1.41, + "grad_norm": 0.7986451387405396, + "learning_rate": 0.0002762866301807222, + "loss": 0.013, + "step": 796 + }, + { + "epoch": 1.41, + "grad_norm": 0.11357055604457855, + "learning_rate": 0.0002758230568691627, + "loss": 0.0105, + "step": 797 + }, + { + "epoch": 1.41, + "grad_norm": 0.8880018591880798, + "learning_rate": 0.00027535939379251523, + "loss": 0.1036, + "step": 798 + }, + { + "epoch": 1.41, + "grad_norm": 0.5905174016952515, + "learning_rate": 0.000274895642562547, + "loss": 0.1287, + "step": 799 + }, + { + "epoch": 1.42, + "grad_norm": 0.1973883956670761, + "learning_rate": 0.0002744318047913318, + "loss": 0.0161, + "step": 800 + }, + { + "epoch": 1.42, + "grad_norm": 0.38918519020080566, + "learning_rate": 0.00027396788209124387, + "loss": 0.0428, + "step": 801 + }, + { + "epoch": 1.42, + "grad_norm": 0.2719264328479767, + "learning_rate": 0.0002735038760749531, + "loss": 0.0496, + "step": 802 + }, + { + "epoch": 1.42, + "grad_norm": 0.3014307916164398, + "learning_rate": 0.0002730397883554189, + "loss": 0.0241, + "step": 803 + }, + { + "epoch": 1.42, + "grad_norm": 0.25686872005462646, + "learning_rate": 0.00027257562054588453, + "loss": 0.0672, + "step": 804 + }, + { + "epoch": 1.42, + "grad_norm": 0.30520740151405334, + "learning_rate": 0.00027211137425987175, + "loss": 0.0376, + "step": 805 + }, + { + "epoch": 1.43, + "grad_norm": 0.3014354407787323, + "learning_rate": 0.00027164705111117516, + "loss": 0.0201, + "step": 806 + }, + { + "epoch": 1.43, + "grad_norm": 0.1687713861465454, + "learning_rate": 0.0002711826527138565, + "loss": 0.0328, + "step": 807 + }, + { + "epoch": 1.43, + "grad_norm": 0.31807759404182434, + "learning_rate": 0.00027071818068223906, + "loss": 0.121, + "step": 808 + }, + { + "epoch": 1.43, + "grad_norm": 0.6311066150665283, + "learning_rate": 0.00027025363663090216, + "loss": 0.0745, + "step": 809 + }, + { + "epoch": 1.43, + "grad_norm": 0.12313732504844666, + "learning_rate": 0.0002697890221746754, + "loss": 0.0103, + "step": 810 + }, + { + "epoch": 1.43, + "grad_norm": 0.7472290396690369, + "learning_rate": 0.00026932433892863324, + "loss": 0.0935, + "step": 811 + }, + { + "epoch": 1.44, + "grad_norm": 0.3236583173274994, + "learning_rate": 0.00026885958850808914, + "loss": 0.0592, + "step": 812 + }, + { + "epoch": 1.44, + "grad_norm": 0.6342505216598511, + "learning_rate": 0.00026839477252859007, + "loss": 0.0919, + "step": 813 + }, + { + "epoch": 1.44, + "grad_norm": 0.37357425689697266, + "learning_rate": 0.0002679298926059109, + "loss": 0.0426, + "step": 814 + }, + { + "epoch": 1.44, + "grad_norm": 0.6023468375205994, + "learning_rate": 0.000267464950356049, + "loss": 0.0756, + "step": 815 + }, + { + "epoch": 1.44, + "grad_norm": 0.35651543736457825, + "learning_rate": 0.0002669999473952181, + "loss": 0.0323, + "step": 816 + }, + { + "epoch": 1.45, + "grad_norm": 0.4318545162677765, + "learning_rate": 0.00026653488533984307, + "loss": 0.1178, + "step": 817 + }, + { + "epoch": 1.45, + "grad_norm": 0.31720227003097534, + "learning_rate": 0.00026606976580655415, + "loss": 0.0974, + "step": 818 + }, + { + "epoch": 1.45, + "grad_norm": 0.5617119669914246, + "learning_rate": 0.00026560459041218156, + "loss": 0.1098, + "step": 819 + }, + { + "epoch": 1.45, + "grad_norm": 0.5732232332229614, + "learning_rate": 0.00026513936077374954, + "loss": 0.0949, + "step": 820 + }, + { + "epoch": 1.45, + "grad_norm": 0.33141466975212097, + "learning_rate": 0.00026467407850847105, + "loss": 0.0417, + "step": 821 + }, + { + "epoch": 1.45, + "grad_norm": 0.41724076867103577, + "learning_rate": 0.00026420874523374173, + "loss": 0.0466, + "step": 822 + }, + { + "epoch": 1.46, + "grad_norm": 0.24849863350391388, + "learning_rate": 0.0002637433625671347, + "loss": 0.0536, + "step": 823 + }, + { + "epoch": 1.46, + "grad_norm": 0.3150210380554199, + "learning_rate": 0.00026327793212639486, + "loss": 0.0806, + "step": 824 + }, + { + "epoch": 1.46, + "grad_norm": 0.2865166664123535, + "learning_rate": 0.00026281245552943293, + "loss": 0.0533, + "step": 825 + }, + { + "epoch": 1.46, + "grad_norm": 0.2756612300872803, + "learning_rate": 0.00026234693439432043, + "loss": 0.0504, + "step": 826 + }, + { + "epoch": 1.46, + "grad_norm": 0.9047459363937378, + "learning_rate": 0.0002618813703392833, + "loss": 0.109, + "step": 827 + }, + { + "epoch": 1.46, + "grad_norm": 0.10576523840427399, + "learning_rate": 0.00026141576498269706, + "loss": 0.0231, + "step": 828 + }, + { + "epoch": 1.47, + "grad_norm": 0.4203824996948242, + "learning_rate": 0.00026095011994308056, + "loss": 0.0727, + "step": 829 + }, + { + "epoch": 1.47, + "grad_norm": 0.4648183584213257, + "learning_rate": 0.0002604844368390905, + "loss": 0.1066, + "step": 830 + }, + { + "epoch": 1.47, + "grad_norm": 0.2482835054397583, + "learning_rate": 0.00026001871728951624, + "loss": 0.0237, + "step": 831 + }, + { + "epoch": 1.47, + "grad_norm": 0.4374096393585205, + "learning_rate": 0.00025955296291327356, + "loss": 0.0934, + "step": 832 + }, + { + "epoch": 1.47, + "grad_norm": 0.3488870859146118, + "learning_rate": 0.00025908717532939946, + "loss": 0.0638, + "step": 833 + }, + { + "epoch": 1.48, + "grad_norm": 0.5614967942237854, + "learning_rate": 0.00025862135615704613, + "loss": 0.0827, + "step": 834 + }, + { + "epoch": 1.48, + "grad_norm": 0.30991917848587036, + "learning_rate": 0.0002581555070154759, + "loss": 0.0438, + "step": 835 + }, + { + "epoch": 1.48, + "grad_norm": 0.44601985812187195, + "learning_rate": 0.00025768962952405503, + "loss": 0.0797, + "step": 836 + }, + { + "epoch": 1.48, + "grad_norm": 0.3628086745738983, + "learning_rate": 0.00025722372530224844, + "loss": 0.0366, + "step": 837 + }, + { + "epoch": 1.48, + "grad_norm": 0.2644861936569214, + "learning_rate": 0.000256757795969614, + "loss": 0.0331, + "step": 838 + }, + { + "epoch": 1.48, + "grad_norm": 0.4585146903991699, + "learning_rate": 0.0002562918431457967, + "loss": 0.0635, + "step": 839 + }, + { + "epoch": 1.49, + "grad_norm": 0.4738370478153229, + "learning_rate": 0.0002558258684505233, + "loss": 0.0599, + "step": 840 + }, + { + "epoch": 1.49, + "grad_norm": 0.6536511182785034, + "learning_rate": 0.00025535987350359664, + "loss": 0.077, + "step": 841 + }, + { + "epoch": 1.49, + "grad_norm": 0.43449538946151733, + "learning_rate": 0.00025489385992489, + "loss": 0.0432, + "step": 842 + }, + { + "epoch": 1.49, + "grad_norm": 0.5419031977653503, + "learning_rate": 0.0002544278293343411, + "loss": 0.093, + "step": 843 + }, + { + "epoch": 1.49, + "grad_norm": 0.30555063486099243, + "learning_rate": 0.0002539617833519472, + "loss": 0.0572, + "step": 844 + }, + { + "epoch": 1.49, + "grad_norm": 0.18094651401042938, + "learning_rate": 0.0002534957235977589, + "loss": 0.0353, + "step": 845 + }, + { + "epoch": 1.5, + "grad_norm": 0.34586626291275024, + "learning_rate": 0.00025302965169187467, + "loss": 0.0554, + "step": 846 + }, + { + "epoch": 1.5, + "grad_norm": 0.38212889432907104, + "learning_rate": 0.00025256356925443507, + "loss": 0.0624, + "step": 847 + }, + { + "epoch": 1.5, + "grad_norm": 0.5566253066062927, + "learning_rate": 0.00025209747790561754, + "loss": 0.0603, + "step": 848 + }, + { + "epoch": 1.5, + "grad_norm": 0.2991026043891907, + "learning_rate": 0.0002516313792656304, + "loss": 0.0374, + "step": 849 + }, + { + "epoch": 1.5, + "grad_norm": 0.6026126146316528, + "learning_rate": 0.0002511652749547072, + "loss": 0.1283, + "step": 850 + }, + { + "epoch": 1.51, + "grad_norm": 0.28952938318252563, + "learning_rate": 0.0002506991665931013, + "loss": 0.0708, + "step": 851 + }, + { + "epoch": 1.51, + "grad_norm": 0.3491526246070862, + "learning_rate": 0.00025023305580108027, + "loss": 0.0536, + "step": 852 + }, + { + "epoch": 1.51, + "eval_loss": 0.08201431483030319, + "eval_runtime": 14.6754, + "eval_samples_per_second": 32.503, + "eval_steps_per_second": 8.177, + "step": 852 + }, + { + "epoch": 1.51, + "grad_norm": 0.7052826881408691, + "learning_rate": 0.00024976694419891974, + "loss": 0.059, + "step": 853 + }, + { + "epoch": 1.51, + "grad_norm": 0.43753165006637573, + "learning_rate": 0.0002493008334068987, + "loss": 0.0751, + "step": 854 + }, + { + "epoch": 1.51, + "grad_norm": 0.1675650030374527, + "learning_rate": 0.00024883472504529287, + "loss": 0.0224, + "step": 855 + }, + { + "epoch": 1.51, + "grad_norm": 0.3790821433067322, + "learning_rate": 0.00024836862073436967, + "loss": 0.0707, + "step": 856 + }, + { + "epoch": 1.52, + "grad_norm": 0.3162197470664978, + "learning_rate": 0.0002479025220943825, + "loss": 0.0375, + "step": 857 + }, + { + "epoch": 1.52, + "grad_norm": 0.5708606243133545, + "learning_rate": 0.00024743643074556494, + "loss": 0.0632, + "step": 858 + }, + { + "epoch": 1.52, + "grad_norm": 0.24075205624103546, + "learning_rate": 0.00024697034830812535, + "loss": 0.0452, + "step": 859 + }, + { + "epoch": 1.52, + "grad_norm": 0.371979683637619, + "learning_rate": 0.00024650427640224114, + "loss": 0.0676, + "step": 860 + }, + { + "epoch": 1.52, + "grad_norm": 0.2892770767211914, + "learning_rate": 0.00024603821664805276, + "loss": 0.0592, + "step": 861 + }, + { + "epoch": 1.52, + "grad_norm": 0.14655162394046783, + "learning_rate": 0.00024557217066565896, + "loss": 0.0161, + "step": 862 + }, + { + "epoch": 1.53, + "grad_norm": 0.22819265723228455, + "learning_rate": 0.0002451061400751101, + "loss": 0.0418, + "step": 863 + }, + { + "epoch": 1.53, + "grad_norm": 0.4637010097503662, + "learning_rate": 0.0002446401264964034, + "loss": 0.0555, + "step": 864 + }, + { + "epoch": 1.53, + "grad_norm": 0.30727583169937134, + "learning_rate": 0.00024417413154947677, + "loss": 0.0258, + "step": 865 + }, + { + "epoch": 1.53, + "grad_norm": 0.26563793420791626, + "learning_rate": 0.00024370815685420338, + "loss": 0.0528, + "step": 866 + }, + { + "epoch": 1.53, + "grad_norm": 0.18691249191761017, + "learning_rate": 0.00024324220403038613, + "loss": 0.0432, + "step": 867 + }, + { + "epoch": 1.54, + "grad_norm": 0.3845004439353943, + "learning_rate": 0.00024277627469775163, + "loss": 0.08, + "step": 868 + }, + { + "epoch": 1.54, + "grad_norm": 0.36308273673057556, + "learning_rate": 0.00024231037047594495, + "loss": 0.0569, + "step": 869 + }, + { + "epoch": 1.54, + "grad_norm": 0.43758854269981384, + "learning_rate": 0.00024184449298452414, + "loss": 0.0451, + "step": 870 + }, + { + "epoch": 1.54, + "grad_norm": 0.30163124203681946, + "learning_rate": 0.00024137864384295388, + "loss": 0.0219, + "step": 871 + }, + { + "epoch": 1.54, + "grad_norm": 0.20218737423419952, + "learning_rate": 0.00024091282467060055, + "loss": 0.0277, + "step": 872 + }, + { + "epoch": 1.54, + "grad_norm": 0.2397354692220688, + "learning_rate": 0.00024044703708672648, + "loss": 0.0307, + "step": 873 + }, + { + "epoch": 1.55, + "grad_norm": 0.31938987970352173, + "learning_rate": 0.00023998128271048374, + "loss": 0.0453, + "step": 874 + }, + { + "epoch": 1.55, + "grad_norm": 0.5195713043212891, + "learning_rate": 0.00023951556316090952, + "loss": 0.0852, + "step": 875 + }, + { + "epoch": 1.55, + "grad_norm": 0.5428398251533508, + "learning_rate": 0.00023904988005691953, + "loss": 0.0941, + "step": 876 + }, + { + "epoch": 1.55, + "grad_norm": 0.9265273213386536, + "learning_rate": 0.00023858423501730295, + "loss": 0.0489, + "step": 877 + }, + { + "epoch": 1.55, + "grad_norm": 0.5896188020706177, + "learning_rate": 0.00023811862966071674, + "loss": 0.0601, + "step": 878 + }, + { + "epoch": 1.56, + "grad_norm": 0.6030554175376892, + "learning_rate": 0.0002376530656056796, + "loss": 0.0958, + "step": 879 + }, + { + "epoch": 1.56, + "grad_norm": 0.3220241665840149, + "learning_rate": 0.00023718754447056708, + "loss": 0.0487, + "step": 880 + }, + { + "epoch": 1.56, + "grad_norm": 0.49902886152267456, + "learning_rate": 0.00023672206787360523, + "loss": 0.0457, + "step": 881 + }, + { + "epoch": 1.56, + "grad_norm": 0.3744886517524719, + "learning_rate": 0.00023625663743286534, + "loss": 0.0771, + "step": 882 + }, + { + "epoch": 1.56, + "grad_norm": 0.39943140745162964, + "learning_rate": 0.0002357912547662584, + "loss": 0.0389, + "step": 883 + }, + { + "epoch": 1.56, + "grad_norm": 0.34382057189941406, + "learning_rate": 0.00023532592149152898, + "loss": 0.0405, + "step": 884 + }, + { + "epoch": 1.57, + "grad_norm": 0.22058314085006714, + "learning_rate": 0.00023486063922625042, + "loss": 0.032, + "step": 885 + }, + { + "epoch": 1.57, + "grad_norm": 0.32711130380630493, + "learning_rate": 0.00023439540958781848, + "loss": 0.0589, + "step": 886 + }, + { + "epoch": 1.57, + "grad_norm": 0.42972657084465027, + "learning_rate": 0.0002339302341934459, + "loss": 0.0601, + "step": 887 + }, + { + "epoch": 1.57, + "grad_norm": 0.31368395686149597, + "learning_rate": 0.00023346511466015708, + "loss": 0.0345, + "step": 888 + }, + { + "epoch": 1.57, + "grad_norm": 0.28611400723457336, + "learning_rate": 0.00023300005260478194, + "loss": 0.0432, + "step": 889 + }, + { + "epoch": 1.57, + "grad_norm": 0.5313751101493835, + "learning_rate": 0.00023253504964395097, + "loss": 0.0427, + "step": 890 + }, + { + "epoch": 1.58, + "grad_norm": 0.2192300707101822, + "learning_rate": 0.00023207010739408908, + "loss": 0.0392, + "step": 891 + }, + { + "epoch": 1.58, + "grad_norm": 0.7893845438957214, + "learning_rate": 0.00023160522747141, + "loss": 0.1338, + "step": 892 + }, + { + "epoch": 1.58, + "grad_norm": 0.5475191473960876, + "learning_rate": 0.00023114041149191098, + "loss": 0.1458, + "step": 893 + }, + { + "epoch": 1.58, + "grad_norm": 0.4575919806957245, + "learning_rate": 0.00023067566107136685, + "loss": 0.0593, + "step": 894 + }, + { + "epoch": 1.58, + "grad_norm": 0.47522222995758057, + "learning_rate": 0.00023021097782532457, + "loss": 0.0744, + "step": 895 + }, + { + "epoch": 1.59, + "grad_norm": 0.3471393883228302, + "learning_rate": 0.0002297463633690979, + "loss": 0.0795, + "step": 896 + }, + { + "epoch": 1.59, + "grad_norm": 0.6821273565292358, + "learning_rate": 0.00022928181931776098, + "loss": 0.0692, + "step": 897 + }, + { + "epoch": 1.59, + "grad_norm": 0.4375980496406555, + "learning_rate": 0.00022881734728614347, + "loss": 0.0704, + "step": 898 + }, + { + "epoch": 1.59, + "grad_norm": 0.596495509147644, + "learning_rate": 0.0002283529488888249, + "loss": 0.0744, + "step": 899 + }, + { + "epoch": 1.59, + "grad_norm": 0.36588358879089355, + "learning_rate": 0.00022788862574012824, + "loss": 0.0576, + "step": 900 + }, + { + "epoch": 1.59, + "grad_norm": 0.49629420042037964, + "learning_rate": 0.0002274243794541155, + "loss": 0.0847, + "step": 901 + }, + { + "epoch": 1.6, + "grad_norm": 0.8968174457550049, + "learning_rate": 0.0002269602116445811, + "loss": 0.0521, + "step": 902 + }, + { + "epoch": 1.6, + "grad_norm": 0.260775089263916, + "learning_rate": 0.00022649612392504687, + "loss": 0.0345, + "step": 903 + }, + { + "epoch": 1.6, + "grad_norm": 0.33473265171051025, + "learning_rate": 0.00022603211790875622, + "loss": 0.0483, + "step": 904 + }, + { + "epoch": 1.6, + "grad_norm": 0.451668918132782, + "learning_rate": 0.0002255681952086683, + "loss": 0.0862, + "step": 905 + }, + { + "epoch": 1.6, + "grad_norm": 0.6166467070579529, + "learning_rate": 0.00022510435743745304, + "loss": 0.1038, + "step": 906 + }, + { + "epoch": 1.6, + "grad_norm": 0.361858069896698, + "learning_rate": 0.0002246406062074848, + "loss": 0.073, + "step": 907 + }, + { + "epoch": 1.61, + "grad_norm": 0.42583227157592773, + "learning_rate": 0.00022417694313083735, + "loss": 0.0923, + "step": 908 + }, + { + "epoch": 1.61, + "grad_norm": 0.2566489279270172, + "learning_rate": 0.00022371336981927788, + "loss": 0.0358, + "step": 909 + }, + { + "epoch": 1.61, + "grad_norm": 0.3075582683086395, + "learning_rate": 0.0002232498878842615, + "loss": 0.0912, + "step": 910 + }, + { + "epoch": 1.61, + "grad_norm": 0.1581839770078659, + "learning_rate": 0.00022278649893692584, + "loss": 0.0309, + "step": 911 + }, + { + "epoch": 1.61, + "grad_norm": 0.37088707089424133, + "learning_rate": 0.00022232320458808532, + "loss": 0.074, + "step": 912 + }, + { + "epoch": 1.62, + "grad_norm": 0.40888333320617676, + "learning_rate": 0.00022186000644822522, + "loss": 0.0618, + "step": 913 + }, + { + "epoch": 1.62, + "grad_norm": 0.2161816507577896, + "learning_rate": 0.00022139690612749672, + "loss": 0.0355, + "step": 914 + }, + { + "epoch": 1.62, + "grad_norm": 0.3577941954135895, + "learning_rate": 0.00022093390523571067, + "loss": 0.0499, + "step": 915 + }, + { + "epoch": 1.62, + "grad_norm": 0.3455984592437744, + "learning_rate": 0.0002204710053823326, + "loss": 0.0712, + "step": 916 + }, + { + "epoch": 1.62, + "grad_norm": 0.2223758101463318, + "learning_rate": 0.0002200082081764766, + "loss": 0.0362, + "step": 917 + }, + { + "epoch": 1.62, + "grad_norm": 0.3027271032333374, + "learning_rate": 0.00021954551522689993, + "loss": 0.0579, + "step": 918 + }, + { + "epoch": 1.63, + "grad_norm": 0.12046008557081223, + "learning_rate": 0.00021908292814199764, + "loss": 0.0227, + "step": 919 + }, + { + "epoch": 1.63, + "grad_norm": 0.3685935437679291, + "learning_rate": 0.00021862044852979652, + "loss": 0.0797, + "step": 920 + }, + { + "epoch": 1.63, + "grad_norm": 0.26438644528388977, + "learning_rate": 0.00021815807799794982, + "loss": 0.0567, + "step": 921 + }, + { + "epoch": 1.63, + "grad_norm": 0.44811537861824036, + "learning_rate": 0.00021769581815373192, + "loss": 0.0623, + "step": 922 + }, + { + "epoch": 1.63, + "grad_norm": 0.36297371983528137, + "learning_rate": 0.000217233670604032, + "loss": 0.0512, + "step": 923 + }, + { + "epoch": 1.63, + "grad_norm": 0.30395954847335815, + "learning_rate": 0.00021677163695534913, + "loss": 0.0423, + "step": 924 + }, + { + "epoch": 1.64, + "grad_norm": 0.26092368364334106, + "learning_rate": 0.00021630971881378644, + "loss": 0.0463, + "step": 925 + }, + { + "epoch": 1.64, + "grad_norm": 0.639478325843811, + "learning_rate": 0.0002158479177850453, + "loss": 0.0564, + "step": 926 + }, + { + "epoch": 1.64, + "grad_norm": 0.25447505712509155, + "learning_rate": 0.00021538623547442045, + "loss": 0.0317, + "step": 927 + }, + { + "epoch": 1.64, + "grad_norm": 0.24460607767105103, + "learning_rate": 0.00021492467348679345, + "loss": 0.0375, + "step": 928 + }, + { + "epoch": 1.64, + "grad_norm": 0.43972596526145935, + "learning_rate": 0.00021446323342662785, + "loss": 0.0587, + "step": 929 + }, + { + "epoch": 1.65, + "grad_norm": 0.08744898438453674, + "learning_rate": 0.00021400191689796338, + "loss": 0.0074, + "step": 930 + }, + { + "epoch": 1.65, + "grad_norm": 0.29619458317756653, + "learning_rate": 0.00021354072550441018, + "loss": 0.0422, + "step": 931 + }, + { + "epoch": 1.65, + "grad_norm": 0.3273064196109772, + "learning_rate": 0.00021307966084914372, + "loss": 0.0362, + "step": 932 + }, + { + "epoch": 1.65, + "grad_norm": 0.2981872260570526, + "learning_rate": 0.00021261872453489842, + "loss": 0.0246, + "step": 933 + }, + { + "epoch": 1.65, + "grad_norm": 0.7154226899147034, + "learning_rate": 0.00021215791816396303, + "loss": 0.0856, + "step": 934 + }, + { + "epoch": 1.65, + "grad_norm": 0.4782339930534363, + "learning_rate": 0.00021169724333817443, + "loss": 0.0482, + "step": 935 + }, + { + "epoch": 1.66, + "grad_norm": 0.5048168897628784, + "learning_rate": 0.00021123670165891208, + "loss": 0.0405, + "step": 936 + }, + { + "epoch": 1.66, + "grad_norm": 0.22893092036247253, + "learning_rate": 0.0002107762947270928, + "loss": 0.0181, + "step": 937 + }, + { + "epoch": 1.66, + "grad_norm": 0.6863519549369812, + "learning_rate": 0.00021031602414316506, + "loss": 0.0643, + "step": 938 + }, + { + "epoch": 1.66, + "grad_norm": 0.5500178337097168, + "learning_rate": 0.0002098558915071031, + "loss": 0.0599, + "step": 939 + }, + { + "epoch": 1.66, + "grad_norm": 0.7170897126197815, + "learning_rate": 0.0002093958984184018, + "loss": 0.1167, + "step": 940 + }, + { + "epoch": 1.66, + "grad_norm": 0.48540323972702026, + "learning_rate": 0.00020893604647607088, + "loss": 0.0443, + "step": 941 + }, + { + "epoch": 1.67, + "grad_norm": 0.09318219870328903, + "learning_rate": 0.0002084763372786295, + "loss": 0.0092, + "step": 942 + }, + { + "epoch": 1.67, + "grad_norm": 0.7748222351074219, + "learning_rate": 0.00020801677242410067, + "loss": 0.0912, + "step": 943 + }, + { + "epoch": 1.67, + "grad_norm": 0.2857852280139923, + "learning_rate": 0.00020755735351000537, + "loss": 0.0313, + "step": 944 + }, + { + "epoch": 1.67, + "grad_norm": 0.483059287071228, + "learning_rate": 0.00020709808213335758, + "loss": 0.0768, + "step": 945 + }, + { + "epoch": 1.67, + "grad_norm": 0.2753046751022339, + "learning_rate": 0.0002066389598906582, + "loss": 0.0339, + "step": 946 + }, + { + "epoch": 1.68, + "grad_norm": 0.29203662276268005, + "learning_rate": 0.00020617998837788975, + "loss": 0.0185, + "step": 947 + }, + { + "epoch": 1.68, + "grad_norm": 0.3008381426334381, + "learning_rate": 0.00020572116919051098, + "loss": 0.0574, + "step": 948 + }, + { + "epoch": 1.68, + "grad_norm": 0.6928682923316956, + "learning_rate": 0.0002052625039234509, + "loss": 0.051, + "step": 949 + }, + { + "epoch": 1.68, + "grad_norm": 0.27521032094955444, + "learning_rate": 0.00020480399417110352, + "loss": 0.05, + "step": 950 + }, + { + "epoch": 1.68, + "grad_norm": 0.1467350274324417, + "learning_rate": 0.00020434564152732238, + "loss": 0.0254, + "step": 951 + }, + { + "epoch": 1.68, + "grad_norm": 0.2940123379230499, + "learning_rate": 0.00020388744758541462, + "loss": 0.0521, + "step": 952 + }, + { + "epoch": 1.69, + "grad_norm": 0.2931191325187683, + "learning_rate": 0.00020342941393813613, + "loss": 0.0394, + "step": 953 + }, + { + "epoch": 1.69, + "grad_norm": 0.2518831491470337, + "learning_rate": 0.00020297154217768513, + "loss": 0.041, + "step": 954 + }, + { + "epoch": 1.69, + "grad_norm": 0.48200809955596924, + "learning_rate": 0.00020251383389569743, + "loss": 0.0693, + "step": 955 + }, + { + "epoch": 1.69, + "grad_norm": 0.4188168942928314, + "learning_rate": 0.0002020562906832404, + "loss": 0.0594, + "step": 956 + }, + { + "epoch": 1.69, + "grad_norm": 0.4201320707798004, + "learning_rate": 0.00020159891413080755, + "loss": 0.0737, + "step": 957 + }, + { + "epoch": 1.69, + "grad_norm": 0.4236721396446228, + "learning_rate": 0.00020114170582831342, + "loss": 0.0443, + "step": 958 + }, + { + "epoch": 1.7, + "grad_norm": 0.39196375012397766, + "learning_rate": 0.00020068466736508704, + "loss": 0.0728, + "step": 959 + }, + { + "epoch": 1.7, + "grad_norm": 0.5320992469787598, + "learning_rate": 0.00020022780032986765, + "loss": 0.0416, + "step": 960 + }, + { + "epoch": 1.7, + "grad_norm": 0.12672173976898193, + "learning_rate": 0.00019977110631079836, + "loss": 0.0191, + "step": 961 + }, + { + "epoch": 1.7, + "grad_norm": 0.6431661248207092, + "learning_rate": 0.0001993145868954207, + "loss": 0.038, + "step": 962 + }, + { + "epoch": 1.7, + "grad_norm": 0.29868853092193604, + "learning_rate": 0.00019885824367066955, + "loss": 0.0245, + "step": 963 + }, + { + "epoch": 1.71, + "grad_norm": 0.5283737778663635, + "learning_rate": 0.0001984020782228671, + "loss": 0.0722, + "step": 964 + }, + { + "epoch": 1.71, + "grad_norm": 0.5567461252212524, + "learning_rate": 0.00019794609213771755, + "loss": 0.1026, + "step": 965 + }, + { + "epoch": 1.71, + "grad_norm": 0.9617827534675598, + "learning_rate": 0.00019749028700030181, + "loss": 0.078, + "step": 966 + }, + { + "epoch": 1.71, + "grad_norm": 0.5466052889823914, + "learning_rate": 0.0001970346643950715, + "loss": 0.075, + "step": 967 + }, + { + "epoch": 1.71, + "grad_norm": 0.18108782172203064, + "learning_rate": 0.00019657922590584392, + "loss": 0.024, + "step": 968 + }, + { + "epoch": 1.71, + "grad_norm": 0.4150354862213135, + "learning_rate": 0.00019612397311579647, + "loss": 0.0656, + "step": 969 + }, + { + "epoch": 1.72, + "grad_norm": 0.21237897872924805, + "learning_rate": 0.0001956689076074607, + "loss": 0.0378, + "step": 970 + }, + { + "epoch": 1.72, + "grad_norm": 0.21254923939704895, + "learning_rate": 0.0001952140309627174, + "loss": 0.0173, + "step": 971 + }, + { + "epoch": 1.72, + "grad_norm": 0.2641647756099701, + "learning_rate": 0.0001947593447627907, + "loss": 0.049, + "step": 972 + }, + { + "epoch": 1.72, + "grad_norm": 0.3682314455509186, + "learning_rate": 0.00019430485058824276, + "loss": 0.0485, + "step": 973 + }, + { + "epoch": 1.72, + "grad_norm": 0.2566399574279785, + "learning_rate": 0.00019385055001896835, + "loss": 0.0388, + "step": 974 + }, + { + "epoch": 1.72, + "grad_norm": 0.20328454673290253, + "learning_rate": 0.000193396444634189, + "loss": 0.0259, + "step": 975 + }, + { + "epoch": 1.73, + "grad_norm": 0.5327407717704773, + "learning_rate": 0.00019294253601244792, + "loss": 0.085, + "step": 976 + }, + { + "epoch": 1.73, + "grad_norm": 0.6960484385490417, + "learning_rate": 0.00019248882573160437, + "loss": 0.1077, + "step": 977 + }, + { + "epoch": 1.73, + "grad_norm": 0.5338547229766846, + "learning_rate": 0.00019203531536882785, + "loss": 0.0421, + "step": 978 + }, + { + "epoch": 1.73, + "grad_norm": 0.1970924586057663, + "learning_rate": 0.00019158200650059337, + "loss": 0.024, + "step": 979 + }, + { + "epoch": 1.73, + "grad_norm": 0.4665428698062897, + "learning_rate": 0.0001911289007026751, + "loss": 0.0549, + "step": 980 + }, + { + "epoch": 1.74, + "grad_norm": 0.4011171758174896, + "learning_rate": 0.00019067599955014156, + "loss": 0.0482, + "step": 981 + }, + { + "epoch": 1.74, + "grad_norm": 0.25179675221443176, + "learning_rate": 0.00019022330461734982, + "loss": 0.0327, + "step": 982 + }, + { + "epoch": 1.74, + "grad_norm": 0.2733090817928314, + "learning_rate": 0.00018977081747794, + "loss": 0.0271, + "step": 983 + }, + { + "epoch": 1.74, + "grad_norm": 0.5745018124580383, + "learning_rate": 0.00018931853970483012, + "loss": 0.0344, + "step": 984 + }, + { + "epoch": 1.74, + "grad_norm": 0.12801390886306763, + "learning_rate": 0.00018886647287021007, + "loss": 0.0144, + "step": 985 + }, + { + "epoch": 1.74, + "grad_norm": 0.1290263682603836, + "learning_rate": 0.00018841461854553681, + "loss": 0.0132, + "step": 986 + }, + { + "epoch": 1.75, + "grad_norm": 0.863158643245697, + "learning_rate": 0.00018796297830152853, + "loss": 0.1274, + "step": 987 + }, + { + "epoch": 1.75, + "grad_norm": 0.3602030277252197, + "learning_rate": 0.00018751155370815895, + "loss": 0.0549, + "step": 988 + }, + { + "epoch": 1.75, + "grad_norm": 0.379295289516449, + "learning_rate": 0.00018706034633465257, + "loss": 0.0266, + "step": 989 + }, + { + "epoch": 1.75, + "grad_norm": 0.43907806277275085, + "learning_rate": 0.00018660935774947858, + "loss": 0.0499, + "step": 990 + }, + { + "epoch": 1.75, + "grad_norm": 0.822163462638855, + "learning_rate": 0.00018615858952034548, + "loss": 0.1464, + "step": 991 + }, + { + "epoch": 1.75, + "grad_norm": 0.3563006520271301, + "learning_rate": 0.00018570804321419614, + "loss": 0.0499, + "step": 992 + }, + { + "epoch": 1.76, + "grad_norm": 0.082757368683815, + "learning_rate": 0.00018525772039720167, + "loss": 0.0088, + "step": 993 + }, + { + "epoch": 1.76, + "grad_norm": 0.349202424287796, + "learning_rate": 0.00018480762263475638, + "loss": 0.0325, + "step": 994 + }, + { + "epoch": 1.76, + "eval_loss": 0.0834248885512352, + "eval_runtime": 14.6855, + "eval_samples_per_second": 32.481, + "eval_steps_per_second": 8.171, + "step": 994 + }, + { + "epoch": 1.76, + "grad_norm": 0.2513701319694519, + "learning_rate": 0.0001843577514914725, + "loss": 0.0186, + "step": 995 + }, + { + "epoch": 1.76, + "grad_norm": 0.4057576656341553, + "learning_rate": 0.00018390810853117408, + "loss": 0.0348, + "step": 996 + }, + { + "epoch": 1.76, + "grad_norm": 0.46003130078315735, + "learning_rate": 0.0001834586953168923, + "loss": 0.0689, + "step": 997 + }, + { + "epoch": 1.77, + "grad_norm": 0.41909146308898926, + "learning_rate": 0.00018300951341085946, + "loss": 0.0298, + "step": 998 + }, + { + "epoch": 1.77, + "grad_norm": 0.9829010367393494, + "learning_rate": 0.00018256056437450399, + "loss": 0.2026, + "step": 999 + }, + { + "epoch": 1.77, + "grad_norm": 0.31356698274612427, + "learning_rate": 0.00018211184976844487, + "loss": 0.0263, + "step": 1000 + }, + { + "epoch": 1.77, + "grad_norm": 0.4269973337650299, + "learning_rate": 0.00018166337115248585, + "loss": 0.1063, + "step": 1001 + }, + { + "epoch": 1.77, + "grad_norm": 0.3558803200721741, + "learning_rate": 0.00018121513008561064, + "loss": 0.0389, + "step": 1002 + }, + { + "epoch": 1.77, + "grad_norm": 0.5086562633514404, + "learning_rate": 0.0001807671281259771, + "loss": 0.0593, + "step": 1003 + }, + { + "epoch": 1.78, + "grad_norm": 0.1954115778207779, + "learning_rate": 0.00018031936683091186, + "loss": 0.0327, + "step": 1004 + }, + { + "epoch": 1.78, + "grad_norm": 0.1789095103740692, + "learning_rate": 0.0001798718477569051, + "loss": 0.024, + "step": 1005 + }, + { + "epoch": 1.78, + "grad_norm": 0.38798651099205017, + "learning_rate": 0.0001794245724596048, + "loss": 0.0791, + "step": 1006 + }, + { + "epoch": 1.78, + "grad_norm": 0.34710198640823364, + "learning_rate": 0.00017897754249381165, + "loss": 0.0571, + "step": 1007 + }, + { + "epoch": 1.78, + "grad_norm": 0.2781204283237457, + "learning_rate": 0.00017853075941347363, + "loss": 0.0521, + "step": 1008 + }, + { + "epoch": 1.79, + "grad_norm": 0.2825307548046112, + "learning_rate": 0.00017808422477168023, + "loss": 0.0474, + "step": 1009 + }, + { + "epoch": 1.79, + "grad_norm": 0.23594889044761658, + "learning_rate": 0.0001776379401206577, + "loss": 0.0295, + "step": 1010 + }, + { + "epoch": 1.79, + "grad_norm": 0.37222880125045776, + "learning_rate": 0.00017719190701176286, + "loss": 0.0411, + "step": 1011 + }, + { + "epoch": 1.79, + "grad_norm": 0.25766775012016296, + "learning_rate": 0.00017674612699547846, + "loss": 0.0419, + "step": 1012 + }, + { + "epoch": 1.79, + "grad_norm": 0.27667155861854553, + "learning_rate": 0.00017630060162140737, + "loss": 0.0325, + "step": 1013 + }, + { + "epoch": 1.79, + "grad_norm": 0.49651435017585754, + "learning_rate": 0.00017585533243826712, + "loss": 0.0435, + "step": 1014 + }, + { + "epoch": 1.8, + "grad_norm": 0.7008858323097229, + "learning_rate": 0.00017541032099388499, + "loss": 0.1405, + "step": 1015 + }, + { + "epoch": 1.8, + "grad_norm": 0.17448720335960388, + "learning_rate": 0.0001749655688351921, + "loss": 0.0269, + "step": 1016 + }, + { + "epoch": 1.8, + "grad_norm": 0.2893378734588623, + "learning_rate": 0.0001745210775082182, + "loss": 0.0417, + "step": 1017 + }, + { + "epoch": 1.8, + "grad_norm": 0.18504270911216736, + "learning_rate": 0.0001740768485580866, + "loss": 0.0302, + "step": 1018 + }, + { + "epoch": 1.8, + "grad_norm": 0.2060771882534027, + "learning_rate": 0.00017363288352900818, + "loss": 0.047, + "step": 1019 + }, + { + "epoch": 1.8, + "grad_norm": 0.8185610771179199, + "learning_rate": 0.00017318918396427675, + "loss": 0.1398, + "step": 1020 + }, + { + "epoch": 1.81, + "grad_norm": 0.46132713556289673, + "learning_rate": 0.00017274575140626317, + "loss": 0.0776, + "step": 1021 + }, + { + "epoch": 1.81, + "grad_norm": 0.16016420722007751, + "learning_rate": 0.0001723025873964101, + "loss": 0.0161, + "step": 1022 + }, + { + "epoch": 1.81, + "grad_norm": 0.6459915041923523, + "learning_rate": 0.00017185969347522674, + "loss": 0.0711, + "step": 1023 + }, + { + "epoch": 1.81, + "grad_norm": 0.40434324741363525, + "learning_rate": 0.0001714170711822834, + "loss": 0.0571, + "step": 1024 + }, + { + "epoch": 1.81, + "grad_norm": 0.5824777483940125, + "learning_rate": 0.00017097472205620607, + "loss": 0.1141, + "step": 1025 + }, + { + "epoch": 1.82, + "grad_norm": 0.4143454134464264, + "learning_rate": 0.00017053264763467152, + "loss": 0.0558, + "step": 1026 + }, + { + "epoch": 1.82, + "grad_norm": 0.36720553040504456, + "learning_rate": 0.00017009084945440113, + "loss": 0.0376, + "step": 1027 + }, + { + "epoch": 1.82, + "grad_norm": 0.27180641889572144, + "learning_rate": 0.00016964932905115632, + "loss": 0.054, + "step": 1028 + }, + { + "epoch": 1.82, + "grad_norm": 0.43961653113365173, + "learning_rate": 0.0001692080879597329, + "loss": 0.0773, + "step": 1029 + }, + { + "epoch": 1.82, + "grad_norm": 0.2728005647659302, + "learning_rate": 0.00016876712771395552, + "loss": 0.0142, + "step": 1030 + }, + { + "epoch": 1.82, + "grad_norm": 0.5099291205406189, + "learning_rate": 0.0001683264498466729, + "loss": 0.0404, + "step": 1031 + }, + { + "epoch": 1.83, + "grad_norm": 0.3162379562854767, + "learning_rate": 0.00016788605588975193, + "loss": 0.0332, + "step": 1032 + }, + { + "epoch": 1.83, + "grad_norm": 0.4152194857597351, + "learning_rate": 0.0001674459473740726, + "loss": 0.0352, + "step": 1033 + }, + { + "epoch": 1.83, + "grad_norm": 0.3174980878829956, + "learning_rate": 0.00016700612582952278, + "loss": 0.0777, + "step": 1034 + }, + { + "epoch": 1.83, + "grad_norm": 0.6996863484382629, + "learning_rate": 0.0001665665927849926, + "loss": 0.1145, + "step": 1035 + }, + { + "epoch": 1.83, + "grad_norm": 0.2766638398170471, + "learning_rate": 0.0001661273497683697, + "loss": 0.0179, + "step": 1036 + }, + { + "epoch": 1.83, + "grad_norm": 0.45079368352890015, + "learning_rate": 0.00016568839830653287, + "loss": 0.1081, + "step": 1037 + }, + { + "epoch": 1.84, + "grad_norm": 0.44944706559181213, + "learning_rate": 0.0001652497399253481, + "loss": 0.0964, + "step": 1038 + }, + { + "epoch": 1.84, + "grad_norm": 0.5892651081085205, + "learning_rate": 0.00016481137614966223, + "loss": 0.1138, + "step": 1039 + }, + { + "epoch": 1.84, + "grad_norm": 0.29900479316711426, + "learning_rate": 0.00016437330850329793, + "loss": 0.0429, + "step": 1040 + }, + { + "epoch": 1.84, + "grad_norm": 0.3094378411769867, + "learning_rate": 0.00016393553850904878, + "loss": 0.0577, + "step": 1041 + }, + { + "epoch": 1.84, + "grad_norm": 0.23039738833904266, + "learning_rate": 0.00016349806768867345, + "loss": 0.026, + "step": 1042 + }, + { + "epoch": 1.85, + "grad_norm": 0.3328697979450226, + "learning_rate": 0.00016306089756289063, + "loss": 0.0542, + "step": 1043 + }, + { + "epoch": 1.85, + "grad_norm": 0.3017619252204895, + "learning_rate": 0.0001626240296513739, + "loss": 0.0363, + "step": 1044 + }, + { + "epoch": 1.85, + "grad_norm": 0.15930373966693878, + "learning_rate": 0.0001621874654727461, + "loss": 0.02, + "step": 1045 + }, + { + "epoch": 1.85, + "grad_norm": 0.40952980518341064, + "learning_rate": 0.00016175120654457432, + "loss": 0.0523, + "step": 1046 + }, + { + "epoch": 1.85, + "grad_norm": 0.6540464162826538, + "learning_rate": 0.00016131525438336475, + "loss": 0.0744, + "step": 1047 + }, + { + "epoch": 1.85, + "grad_norm": 0.3518769443035126, + "learning_rate": 0.00016087961050455685, + "loss": 0.05, + "step": 1048 + }, + { + "epoch": 1.86, + "grad_norm": 0.4166756570339203, + "learning_rate": 0.0001604442764225188, + "loss": 0.0681, + "step": 1049 + }, + { + "epoch": 1.86, + "grad_norm": 0.39616283774375916, + "learning_rate": 0.00016000925365054154, + "loss": 0.0416, + "step": 1050 + }, + { + "epoch": 1.86, + "grad_norm": 0.20040427148342133, + "learning_rate": 0.00015957454370083398, + "loss": 0.0284, + "step": 1051 + }, + { + "epoch": 1.86, + "grad_norm": 0.7230433821678162, + "learning_rate": 0.00015914014808451784, + "loss": 0.1035, + "step": 1052 + }, + { + "epoch": 1.86, + "grad_norm": 0.1968737691640854, + "learning_rate": 0.00015870606831162182, + "loss": 0.0281, + "step": 1053 + }, + { + "epoch": 1.86, + "grad_norm": 0.2677242159843445, + "learning_rate": 0.0001582723058910769, + "loss": 0.0566, + "step": 1054 + }, + { + "epoch": 1.87, + "grad_norm": 0.12256369739770889, + "learning_rate": 0.00015783886233071076, + "loss": 0.0192, + "step": 1055 + }, + { + "epoch": 1.87, + "grad_norm": 0.311192125082016, + "learning_rate": 0.00015740573913724276, + "loss": 0.035, + "step": 1056 + }, + { + "epoch": 1.87, + "grad_norm": 0.36169809103012085, + "learning_rate": 0.00015697293781627878, + "loss": 0.0755, + "step": 1057 + }, + { + "epoch": 1.87, + "grad_norm": 0.8104953765869141, + "learning_rate": 0.00015654045987230532, + "loss": 0.0418, + "step": 1058 + }, + { + "epoch": 1.87, + "grad_norm": 0.35273879766464233, + "learning_rate": 0.00015610830680868533, + "loss": 0.0266, + "step": 1059 + }, + { + "epoch": 1.88, + "grad_norm": 0.18313364684581757, + "learning_rate": 0.00015567648012765212, + "loss": 0.0538, + "step": 1060 + }, + { + "epoch": 1.88, + "grad_norm": 0.40563294291496277, + "learning_rate": 0.0001552449813303044, + "loss": 0.046, + "step": 1061 + }, + { + "epoch": 1.88, + "grad_norm": 0.44426023960113525, + "learning_rate": 0.00015481381191660143, + "loss": 0.0938, + "step": 1062 + }, + { + "epoch": 1.88, + "grad_norm": 0.4189196228981018, + "learning_rate": 0.00015438297338535702, + "loss": 0.0344, + "step": 1063 + }, + { + "epoch": 1.88, + "grad_norm": 0.6641749143600464, + "learning_rate": 0.0001539524672342351, + "loss": 0.0729, + "step": 1064 + }, + { + "epoch": 1.88, + "grad_norm": 0.2397107034921646, + "learning_rate": 0.00015352229495974422, + "loss": 0.0493, + "step": 1065 + }, + { + "epoch": 1.89, + "grad_norm": 0.17326873540878296, + "learning_rate": 0.00015309245805723205, + "loss": 0.0131, + "step": 1066 + }, + { + "epoch": 1.89, + "grad_norm": 0.69275963306427, + "learning_rate": 0.00015266295802088064, + "loss": 0.1512, + "step": 1067 + }, + { + "epoch": 1.89, + "grad_norm": 0.3260841369628906, + "learning_rate": 0.00015223379634370115, + "loss": 0.0602, + "step": 1068 + }, + { + "epoch": 1.89, + "grad_norm": 0.45368266105651855, + "learning_rate": 0.00015180497451752826, + "loss": 0.0593, + "step": 1069 + }, + { + "epoch": 1.89, + "grad_norm": 0.5664640069007874, + "learning_rate": 0.0001513764940330155, + "loss": 0.0651, + "step": 1070 + }, + { + "epoch": 1.89, + "grad_norm": 0.21212846040725708, + "learning_rate": 0.00015094835637962975, + "loss": 0.0232, + "step": 1071 + }, + { + "epoch": 1.9, + "grad_norm": 0.364945650100708, + "learning_rate": 0.0001505205630456461, + "loss": 0.0436, + "step": 1072 + }, + { + "epoch": 1.9, + "grad_norm": 0.48835766315460205, + "learning_rate": 0.00015009311551814297, + "loss": 0.0885, + "step": 1073 + }, + { + "epoch": 1.9, + "grad_norm": 0.22198058664798737, + "learning_rate": 0.00014966601528299637, + "loss": 0.026, + "step": 1074 + }, + { + "epoch": 1.9, + "grad_norm": 0.2598209083080292, + "learning_rate": 0.00014923926382487534, + "loss": 0.0306, + "step": 1075 + }, + { + "epoch": 1.9, + "grad_norm": 0.22863651812076569, + "learning_rate": 0.0001488128626272363, + "loss": 0.0476, + "step": 1076 + }, + { + "epoch": 1.91, + "grad_norm": 0.4222748875617981, + "learning_rate": 0.00014838681317231822, + "loss": 0.0837, + "step": 1077 + }, + { + "epoch": 1.91, + "grad_norm": 0.5555634498596191, + "learning_rate": 0.00014796111694113752, + "loss": 0.0747, + "step": 1078 + }, + { + "epoch": 1.91, + "grad_norm": 0.28704702854156494, + "learning_rate": 0.0001475357754134824, + "loss": 0.0388, + "step": 1079 + }, + { + "epoch": 1.91, + "grad_norm": 0.3526531457901001, + "learning_rate": 0.00014711079006790828, + "loss": 0.0396, + "step": 1080 + }, + { + "epoch": 1.91, + "grad_norm": 0.41639888286590576, + "learning_rate": 0.0001466861623817325, + "loss": 0.0954, + "step": 1081 + }, + { + "epoch": 1.91, + "grad_norm": 0.288824200630188, + "learning_rate": 0.0001462618938310288, + "loss": 0.0355, + "step": 1082 + }, + { + "epoch": 1.92, + "grad_norm": 0.257003515958786, + "learning_rate": 0.00014583798589062292, + "loss": 0.0257, + "step": 1083 + }, + { + "epoch": 1.92, + "grad_norm": 0.23509138822555542, + "learning_rate": 0.00014541444003408682, + "loss": 0.0548, + "step": 1084 + }, + { + "epoch": 1.92, + "grad_norm": 0.3425995707511902, + "learning_rate": 0.0001449912577337337, + "loss": 0.0691, + "step": 1085 + }, + { + "epoch": 1.92, + "grad_norm": 0.2606826722621918, + "learning_rate": 0.00014456844046061332, + "loss": 0.029, + "step": 1086 + }, + { + "epoch": 1.92, + "grad_norm": 0.10555114597082138, + "learning_rate": 0.00014414598968450615, + "loss": 0.0166, + "step": 1087 + }, + { + "epoch": 1.92, + "grad_norm": 0.47334909439086914, + "learning_rate": 0.00014372390687391906, + "loss": 0.0438, + "step": 1088 + }, + { + "epoch": 1.93, + "grad_norm": 0.36925116181373596, + "learning_rate": 0.00014330219349607947, + "loss": 0.0163, + "step": 1089 + }, + { + "epoch": 1.93, + "grad_norm": 0.2707056999206543, + "learning_rate": 0.0001428808510169307, + "loss": 0.0709, + "step": 1090 + }, + { + "epoch": 1.93, + "grad_norm": 0.20645679533481598, + "learning_rate": 0.00014245988090112694, + "loss": 0.0351, + "step": 1091 + }, + { + "epoch": 1.93, + "grad_norm": 0.1839297115802765, + "learning_rate": 0.00014203928461202763, + "loss": 0.025, + "step": 1092 + }, + { + "epoch": 1.93, + "grad_norm": 0.6433751583099365, + "learning_rate": 0.0001416190636116932, + "loss": 0.0693, + "step": 1093 + }, + { + "epoch": 1.94, + "grad_norm": 0.44755876064300537, + "learning_rate": 0.00014119921936087907, + "loss": 0.0788, + "step": 1094 + }, + { + "epoch": 1.94, + "grad_norm": 0.3716716766357422, + "learning_rate": 0.00014077975331903118, + "loss": 0.0429, + "step": 1095 + }, + { + "epoch": 1.94, + "grad_norm": 0.7285858392715454, + "learning_rate": 0.00014036066694428096, + "loss": 0.035, + "step": 1096 + }, + { + "epoch": 1.94, + "grad_norm": 0.22279293835163116, + "learning_rate": 0.00013994196169343963, + "loss": 0.012, + "step": 1097 + }, + { + "epoch": 1.94, + "grad_norm": 0.17774836719036102, + "learning_rate": 0.00013952363902199405, + "loss": 0.0238, + "step": 1098 + }, + { + "epoch": 1.94, + "grad_norm": 0.2312237024307251, + "learning_rate": 0.0001391057003841008, + "loss": 0.0546, + "step": 1099 + }, + { + "epoch": 1.95, + "grad_norm": 0.6004791855812073, + "learning_rate": 0.0001386881472325816, + "loss": 0.0625, + "step": 1100 + }, + { + "epoch": 1.95, + "grad_norm": 0.21409562230110168, + "learning_rate": 0.0001382709810189183, + "loss": 0.034, + "step": 1101 + }, + { + "epoch": 1.95, + "grad_norm": 0.295493483543396, + "learning_rate": 0.00013785420319324744, + "loss": 0.0332, + "step": 1102 + }, + { + "epoch": 1.95, + "grad_norm": 0.5428887009620667, + "learning_rate": 0.00013743781520435573, + "loss": 0.0649, + "step": 1103 + }, + { + "epoch": 1.95, + "grad_norm": 0.331990122795105, + "learning_rate": 0.00013702181849967453, + "loss": 0.046, + "step": 1104 + }, + { + "epoch": 1.95, + "grad_norm": 0.4544171392917633, + "learning_rate": 0.00013660621452527504, + "loss": 0.0563, + "step": 1105 + }, + { + "epoch": 1.96, + "grad_norm": 0.35486406087875366, + "learning_rate": 0.0001361910047258635, + "loss": 0.0583, + "step": 1106 + }, + { + "epoch": 1.96, + "grad_norm": 0.24665361642837524, + "learning_rate": 0.00013577619054477575, + "loss": 0.0267, + "step": 1107 + }, + { + "epoch": 1.96, + "grad_norm": 0.07276459783315659, + "learning_rate": 0.00013536177342397243, + "loss": 0.0064, + "step": 1108 + }, + { + "epoch": 1.96, + "grad_norm": 0.4690609872341156, + "learning_rate": 0.00013494775480403384, + "loss": 0.0553, + "step": 1109 + }, + { + "epoch": 1.96, + "grad_norm": 0.4010032117366791, + "learning_rate": 0.00013453413612415512, + "loss": 0.0514, + "step": 1110 + }, + { + "epoch": 1.97, + "grad_norm": 0.3563205301761627, + "learning_rate": 0.00013412091882214112, + "loss": 0.0553, + "step": 1111 + }, + { + "epoch": 1.97, + "grad_norm": 0.6027369499206543, + "learning_rate": 0.00013370810433440167, + "loss": 0.0677, + "step": 1112 + }, + { + "epoch": 1.97, + "grad_norm": 0.5082702040672302, + "learning_rate": 0.00013329569409594605, + "loss": 0.1265, + "step": 1113 + }, + { + "epoch": 1.97, + "grad_norm": 0.5313373804092407, + "learning_rate": 0.00013288368954037834, + "loss": 0.0234, + "step": 1114 + }, + { + "epoch": 1.97, + "grad_norm": 0.33385488390922546, + "learning_rate": 0.00013247209209989242, + "loss": 0.0252, + "step": 1115 + }, + { + "epoch": 1.97, + "grad_norm": 0.36459195613861084, + "learning_rate": 0.00013206090320526704, + "loss": 0.0211, + "step": 1116 + }, + { + "epoch": 1.98, + "grad_norm": 0.5477709770202637, + "learning_rate": 0.00013165012428586096, + "loss": 0.0416, + "step": 1117 + }, + { + "epoch": 1.98, + "grad_norm": 0.3133089542388916, + "learning_rate": 0.0001312397567696074, + "loss": 0.036, + "step": 1118 + }, + { + "epoch": 1.98, + "grad_norm": 0.284045934677124, + "learning_rate": 0.00013082980208300971, + "loss": 0.0249, + "step": 1119 + }, + { + "epoch": 1.98, + "grad_norm": 0.3401576578617096, + "learning_rate": 0.00013042026165113618, + "loss": 0.0281, + "step": 1120 + }, + { + "epoch": 1.98, + "grad_norm": 0.21280981600284576, + "learning_rate": 0.00013001113689761496, + "loss": 0.0186, + "step": 1121 + }, + { + "epoch": 1.98, + "grad_norm": 0.1294207125902176, + "learning_rate": 0.00012960242924462957, + "loss": 0.0156, + "step": 1122 + }, + { + "epoch": 1.99, + "grad_norm": 0.11151756346225739, + "learning_rate": 0.00012919414011291298, + "loss": 0.0111, + "step": 1123 + }, + { + "epoch": 1.99, + "grad_norm": 0.7448397874832153, + "learning_rate": 0.0001287862709217439, + "loss": 0.0898, + "step": 1124 + }, + { + "epoch": 1.99, + "grad_norm": 0.4080904424190521, + "learning_rate": 0.00012837882308894117, + "loss": 0.0323, + "step": 1125 + }, + { + "epoch": 1.99, + "grad_norm": 0.33506283164024353, + "learning_rate": 0.00012797179803085862, + "loss": 0.0309, + "step": 1126 + }, + { + "epoch": 1.99, + "grad_norm": 0.32063427567481995, + "learning_rate": 0.00012756519716238096, + "loss": 0.0978, + "step": 1127 + }, + { + "epoch": 2.0, + "grad_norm": 0.7773024439811707, + "learning_rate": 0.0001271590218969176, + "loss": 0.1017, + "step": 1128 + }, + { + "epoch": 2.0, + "grad_norm": 0.2934909164905548, + "learning_rate": 0.00012675327364639917, + "loss": 0.0192, + "step": 1129 + }, + { + "epoch": 2.0, + "grad_norm": 0.6137431859970093, + "learning_rate": 0.0001263479538212717, + "loss": 0.1013, + "step": 1130 + }, + { + "epoch": 2.0, + "grad_norm": 0.2201017141342163, + "learning_rate": 0.00012594306383049186, + "loss": 0.0129, + "step": 1131 + }, + { + "epoch": 2.0, + "grad_norm": 0.04556626081466675, + "learning_rate": 0.00012553860508152212, + "loss": 0.0066, + "step": 1132 + }, + { + "epoch": 2.0, + "grad_norm": 0.17257572710514069, + "learning_rate": 0.00012513457898032616, + "loss": 0.0133, + "step": 1133 + }, + { + "epoch": 2.01, + "grad_norm": 0.29365074634552, + "learning_rate": 0.0001247309869313633, + "loss": 0.0194, + "step": 1134 + }, + { + "epoch": 2.01, + "grad_norm": 0.032978832721710205, + "learning_rate": 0.00012432783033758447, + "loss": 0.0047, + "step": 1135 + }, + { + "epoch": 2.01, + "grad_norm": 0.05853278562426567, + "learning_rate": 0.0001239251106004265, + "loss": 0.0062, + "step": 1136 + }, + { + "epoch": 2.01, + "eval_loss": 0.08742678165435791, + "eval_runtime": 14.7019, + "eval_samples_per_second": 32.445, + "eval_steps_per_second": 8.162, + "step": 1136 + }, + { + "epoch": 2.01, + "grad_norm": 0.1357499063014984, + "learning_rate": 0.00012352282911980782, + "loss": 0.0081, + "step": 1137 + }, + { + "epoch": 2.01, + "grad_norm": 0.17858386039733887, + "learning_rate": 0.00012312098729412346, + "loss": 0.0148, + "step": 1138 + }, + { + "epoch": 2.02, + "grad_norm": 0.12391608208417892, + "learning_rate": 0.00012271958652023993, + "loss": 0.0082, + "step": 1139 + }, + { + "epoch": 2.02, + "grad_norm": 0.10268434137105942, + "learning_rate": 0.0001223186281934909, + "loss": 0.0068, + "step": 1140 + }, + { + "epoch": 2.02, + "grad_norm": 0.26439937949180603, + "learning_rate": 0.00012191811370767172, + "loss": 0.0149, + "step": 1141 + }, + { + "epoch": 2.02, + "grad_norm": 0.051827941089868546, + "learning_rate": 0.00012151804445503492, + "loss": 0.0066, + "step": 1142 + }, + { + "epoch": 2.02, + "grad_norm": 0.354404091835022, + "learning_rate": 0.00012111842182628555, + "loss": 0.0539, + "step": 1143 + }, + { + "epoch": 2.02, + "grad_norm": 0.036191366612911224, + "learning_rate": 0.00012071924721057579, + "loss": 0.0034, + "step": 1144 + }, + { + "epoch": 2.03, + "grad_norm": 0.10083664208650589, + "learning_rate": 0.00012032052199550083, + "loss": 0.0096, + "step": 1145 + }, + { + "epoch": 2.03, + "grad_norm": 0.1553259789943695, + "learning_rate": 0.00011992224756709343, + "loss": 0.0077, + "step": 1146 + }, + { + "epoch": 2.03, + "grad_norm": 0.3661736845970154, + "learning_rate": 0.00011952442530981921, + "loss": 0.027, + "step": 1147 + }, + { + "epoch": 2.03, + "grad_norm": 0.25327032804489136, + "learning_rate": 0.00011912705660657244, + "loss": 0.0212, + "step": 1148 + }, + { + "epoch": 2.03, + "grad_norm": 0.30773070454597473, + "learning_rate": 0.0001187301428386702, + "loss": 0.0175, + "step": 1149 + }, + { + "epoch": 2.03, + "grad_norm": 0.056706108152866364, + "learning_rate": 0.00011833368538584863, + "loss": 0.0056, + "step": 1150 + }, + { + "epoch": 2.04, + "grad_norm": 0.24290138483047485, + "learning_rate": 0.00011793768562625734, + "loss": 0.0165, + "step": 1151 + }, + { + "epoch": 2.04, + "grad_norm": 0.10750260949134827, + "learning_rate": 0.00011754214493645493, + "loss": 0.0075, + "step": 1152 + }, + { + "epoch": 2.04, + "grad_norm": 0.07075980305671692, + "learning_rate": 0.00011714706469140449, + "loss": 0.0052, + "step": 1153 + }, + { + "epoch": 2.04, + "grad_norm": 0.03554920107126236, + "learning_rate": 0.0001167524462644681, + "loss": 0.0043, + "step": 1154 + }, + { + "epoch": 2.04, + "grad_norm": 0.10947668552398682, + "learning_rate": 0.00011635829102740294, + "loss": 0.0077, + "step": 1155 + }, + { + "epoch": 2.05, + "grad_norm": 0.036122776567935944, + "learning_rate": 0.00011596460035035572, + "loss": 0.0018, + "step": 1156 + }, + { + "epoch": 2.05, + "grad_norm": 0.036581866443157196, + "learning_rate": 0.00011557137560185829, + "loss": 0.0033, + "step": 1157 + }, + { + "epoch": 2.05, + "grad_norm": 0.3237001597881317, + "learning_rate": 0.00011517861814882308, + "loss": 0.0203, + "step": 1158 + }, + { + "epoch": 2.05, + "grad_norm": 0.3374278247356415, + "learning_rate": 0.00011478632935653805, + "loss": 0.0076, + "step": 1159 + }, + { + "epoch": 2.05, + "grad_norm": 0.3109148144721985, + "learning_rate": 0.0001143945105886619, + "loss": 0.0252, + "step": 1160 + }, + { + "epoch": 2.05, + "grad_norm": 0.1825418621301651, + "learning_rate": 0.00011400316320721951, + "loss": 0.0095, + "step": 1161 + }, + { + "epoch": 2.06, + "grad_norm": 0.056046485900878906, + "learning_rate": 0.00011361228857259709, + "loss": 0.0036, + "step": 1162 + }, + { + "epoch": 2.06, + "grad_norm": 0.10362865775823593, + "learning_rate": 0.00011322188804353761, + "loss": 0.0048, + "step": 1163 + }, + { + "epoch": 2.06, + "grad_norm": 0.09634903073310852, + "learning_rate": 0.00011283196297713608, + "loss": 0.0056, + "step": 1164 + }, + { + "epoch": 2.06, + "grad_norm": 0.0892496109008789, + "learning_rate": 0.00011244251472883446, + "loss": 0.0051, + "step": 1165 + }, + { + "epoch": 2.06, + "grad_norm": 0.9342542886734009, + "learning_rate": 0.00011205354465241732, + "loss": 0.0462, + "step": 1166 + }, + { + "epoch": 2.06, + "grad_norm": 0.08937977254390717, + "learning_rate": 0.00011166505410000697, + "loss": 0.0055, + "step": 1167 + }, + { + "epoch": 2.07, + "grad_norm": 0.032398249953985214, + "learning_rate": 0.00011127704442205897, + "loss": 0.0027, + "step": 1168 + }, + { + "epoch": 2.07, + "grad_norm": 0.20088808238506317, + "learning_rate": 0.0001108895169673573, + "loss": 0.0328, + "step": 1169 + }, + { + "epoch": 2.07, + "grad_norm": 0.0885910764336586, + "learning_rate": 0.00011050247308300945, + "loss": 0.005, + "step": 1170 + }, + { + "epoch": 2.07, + "grad_norm": 0.009513720870018005, + "learning_rate": 0.00011011591411444199, + "loss": 0.0009, + "step": 1171 + }, + { + "epoch": 2.07, + "grad_norm": 0.04859737306833267, + "learning_rate": 0.00010972984140539605, + "loss": 0.0024, + "step": 1172 + }, + { + "epoch": 2.08, + "grad_norm": 0.08774285018444061, + "learning_rate": 0.00010934425629792214, + "loss": 0.0048, + "step": 1173 + }, + { + "epoch": 2.08, + "grad_norm": 0.09334032982587814, + "learning_rate": 0.00010895916013237619, + "loss": 0.0022, + "step": 1174 + }, + { + "epoch": 2.08, + "grad_norm": 0.016344185918569565, + "learning_rate": 0.00010857455424741388, + "loss": 0.0014, + "step": 1175 + }, + { + "epoch": 2.08, + "grad_norm": 0.06272843480110168, + "learning_rate": 0.00010819043997998721, + "loss": 0.0018, + "step": 1176 + }, + { + "epoch": 2.08, + "grad_norm": 0.42132094502449036, + "learning_rate": 0.00010780681866533897, + "loss": 0.0221, + "step": 1177 + }, + { + "epoch": 2.08, + "grad_norm": 0.2525283992290497, + "learning_rate": 0.00010742369163699841, + "loss": 0.0114, + "step": 1178 + }, + { + "epoch": 2.09, + "grad_norm": 0.9688707590103149, + "learning_rate": 0.00010704106022677645, + "loss": 0.0166, + "step": 1179 + }, + { + "epoch": 2.09, + "grad_norm": 0.14904755353927612, + "learning_rate": 0.00010665892576476122, + "loss": 0.0044, + "step": 1180 + }, + { + "epoch": 2.09, + "grad_norm": 0.020122570917010307, + "learning_rate": 0.00010627728957931346, + "loss": 0.0017, + "step": 1181 + }, + { + "epoch": 2.09, + "grad_norm": 0.44014814496040344, + "learning_rate": 0.00010589615299706187, + "loss": 0.0078, + "step": 1182 + }, + { + "epoch": 2.09, + "grad_norm": 0.6460086107254028, + "learning_rate": 0.00010551551734289827, + "loss": 0.0398, + "step": 1183 + }, + { + "epoch": 2.09, + "grad_norm": 0.1972631961107254, + "learning_rate": 0.00010513538393997316, + "loss": 0.0286, + "step": 1184 + }, + { + "epoch": 2.1, + "grad_norm": 0.04835314676165581, + "learning_rate": 0.00010475575410969138, + "loss": 0.0026, + "step": 1185 + }, + { + "epoch": 2.1, + "grad_norm": 0.4275042712688446, + "learning_rate": 0.00010437662917170695, + "loss": 0.0136, + "step": 1186 + }, + { + "epoch": 2.1, + "grad_norm": 0.010206771083176136, + "learning_rate": 0.00010399801044391918, + "loss": 0.0007, + "step": 1187 + }, + { + "epoch": 2.1, + "grad_norm": 0.01756274327635765, + "learning_rate": 0.00010361989924246737, + "loss": 0.0011, + "step": 1188 + }, + { + "epoch": 2.1, + "grad_norm": 0.6749821901321411, + "learning_rate": 0.00010324229688172665, + "loss": 0.0332, + "step": 1189 + }, + { + "epoch": 2.11, + "grad_norm": 0.1459697037935257, + "learning_rate": 0.00010286520467430357, + "loss": 0.0052, + "step": 1190 + }, + { + "epoch": 2.11, + "grad_norm": 0.0659116879105568, + "learning_rate": 0.00010248862393103092, + "loss": 0.0037, + "step": 1191 + }, + { + "epoch": 2.11, + "grad_norm": 0.11409203708171844, + "learning_rate": 0.000102112555960964, + "loss": 0.0046, + "step": 1192 + }, + { + "epoch": 2.11, + "grad_norm": 0.19234444200992584, + "learning_rate": 0.00010173700207137529, + "loss": 0.0084, + "step": 1193 + }, + { + "epoch": 2.11, + "grad_norm": 0.01539614424109459, + "learning_rate": 0.00010136196356775024, + "loss": 0.001, + "step": 1194 + }, + { + "epoch": 2.11, + "grad_norm": 0.006582081783562899, + "learning_rate": 0.00010098744175378308, + "loss": 0.0005, + "step": 1195 + }, + { + "epoch": 2.12, + "grad_norm": 0.5718114972114563, + "learning_rate": 0.00010061343793137149, + "loss": 0.0232, + "step": 1196 + }, + { + "epoch": 2.12, + "grad_norm": 0.4466152787208557, + "learning_rate": 0.00010023995340061292, + "loss": 0.0192, + "step": 1197 + }, + { + "epoch": 2.12, + "grad_norm": 0.3126049041748047, + "learning_rate": 9.986698945979946e-05, + "loss": 0.0057, + "step": 1198 + }, + { + "epoch": 2.12, + "grad_norm": 0.7232750058174133, + "learning_rate": 9.94945474054135e-05, + "loss": 0.0372, + "step": 1199 + }, + { + "epoch": 2.12, + "grad_norm": 0.00877452827990055, + "learning_rate": 9.91226285321235e-05, + "loss": 0.0005, + "step": 1200 + }, + { + "epoch": 2.12, + "grad_norm": 0.023644492030143738, + "learning_rate": 9.8751234132779e-05, + "loss": 0.0012, + "step": 1201 + }, + { + "epoch": 2.13, + "grad_norm": 0.008336883969604969, + "learning_rate": 9.838036549840668e-05, + "loss": 0.0005, + "step": 1202 + }, + { + "epoch": 2.13, + "grad_norm": 0.1168123185634613, + "learning_rate": 9.801002391820527e-05, + "loss": 0.0036, + "step": 1203 + }, + { + "epoch": 2.13, + "grad_norm": 0.4848583936691284, + "learning_rate": 9.764021067954146e-05, + "loss": 0.0177, + "step": 1204 + }, + { + "epoch": 2.13, + "grad_norm": 0.32454514503479004, + "learning_rate": 9.727092706794555e-05, + "loss": 0.0056, + "step": 1205 + }, + { + "epoch": 2.13, + "grad_norm": 0.1333390176296234, + "learning_rate": 9.690217436710646e-05, + "loss": 0.0026, + "step": 1206 + }, + { + "epoch": 2.14, + "grad_norm": 0.09174130856990814, + "learning_rate": 9.653395385886787e-05, + "loss": 0.0062, + "step": 1207 + }, + { + "epoch": 2.14, + "grad_norm": 0.01976648159325123, + "learning_rate": 9.616626682322327e-05, + "loss": 0.0008, + "step": 1208 + }, + { + "epoch": 2.14, + "grad_norm": 0.2334078848361969, + "learning_rate": 9.579911453831166e-05, + "loss": 0.0069, + "step": 1209 + }, + { + "epoch": 2.14, + "grad_norm": 0.010424863547086716, + "learning_rate": 9.543249828041342e-05, + "loss": 0.0007, + "step": 1210 + }, + { + "epoch": 2.14, + "grad_norm": 0.06431838870048523, + "learning_rate": 9.506641932394552e-05, + "loss": 0.0021, + "step": 1211 + }, + { + "epoch": 2.14, + "grad_norm": 0.2083590179681778, + "learning_rate": 9.470087894145704e-05, + "loss": 0.0092, + "step": 1212 + }, + { + "epoch": 2.15, + "grad_norm": 0.03698310628533363, + "learning_rate": 9.433587840362501e-05, + "loss": 0.0018, + "step": 1213 + }, + { + "epoch": 2.15, + "grad_norm": 0.836845874786377, + "learning_rate": 9.397141897924974e-05, + "loss": 0.0274, + "step": 1214 + }, + { + "epoch": 2.15, + "grad_norm": 0.007941615767776966, + "learning_rate": 9.360750193525076e-05, + "loss": 0.0004, + "step": 1215 + }, + { + "epoch": 2.15, + "grad_norm": 0.03961505368351936, + "learning_rate": 9.324412853666217e-05, + "loss": 0.0018, + "step": 1216 + }, + { + "epoch": 2.15, + "grad_norm": 0.040892381221055984, + "learning_rate": 9.28813000466281e-05, + "loss": 0.0015, + "step": 1217 + }, + { + "epoch": 2.15, + "grad_norm": 0.05060145631432533, + "learning_rate": 9.25190177263986e-05, + "loss": 0.0025, + "step": 1218 + }, + { + "epoch": 2.16, + "grad_norm": 0.5681605935096741, + "learning_rate": 9.215728283532502e-05, + "loss": 0.0695, + "step": 1219 + }, + { + "epoch": 2.16, + "grad_norm": 0.012955489568412304, + "learning_rate": 9.179609663085595e-05, + "loss": 0.0006, + "step": 1220 + }, + { + "epoch": 2.16, + "grad_norm": 0.6021496653556824, + "learning_rate": 9.143546036853279e-05, + "loss": 0.0213, + "step": 1221 + }, + { + "epoch": 2.16, + "grad_norm": 0.011097903363406658, + "learning_rate": 9.107537530198464e-05, + "loss": 0.0006, + "step": 1222 + }, + { + "epoch": 2.16, + "grad_norm": 0.009882250800728798, + "learning_rate": 9.071584268292515e-05, + "loss": 0.0006, + "step": 1223 + }, + { + "epoch": 2.17, + "grad_norm": 0.020904725417494774, + "learning_rate": 9.035686376114749e-05, + "loss": 0.0009, + "step": 1224 + }, + { + "epoch": 2.17, + "grad_norm": 1.1304357051849365, + "learning_rate": 8.999843978451977e-05, + "loss": 0.0159, + "step": 1225 + }, + { + "epoch": 2.17, + "grad_norm": 0.019047705456614494, + "learning_rate": 8.964057199898148e-05, + "loss": 0.0008, + "step": 1226 + }, + { + "epoch": 2.17, + "grad_norm": 0.05119523033499718, + "learning_rate": 8.928326164853811e-05, + "loss": 0.0018, + "step": 1227 + }, + { + "epoch": 2.17, + "grad_norm": 0.01547545101493597, + "learning_rate": 8.892650997525794e-05, + "loss": 0.0008, + "step": 1228 + }, + { + "epoch": 2.17, + "grad_norm": 0.7677561044692993, + "learning_rate": 8.857031821926711e-05, + "loss": 0.0276, + "step": 1229 + }, + { + "epoch": 2.18, + "grad_norm": 0.07431495934724808, + "learning_rate": 8.821468761874518e-05, + "loss": 0.0029, + "step": 1230 + }, + { + "epoch": 2.18, + "grad_norm": 0.018387913703918457, + "learning_rate": 8.785961940992118e-05, + "loss": 0.0008, + "step": 1231 + }, + { + "epoch": 2.18, + "grad_norm": 0.34750181436538696, + "learning_rate": 8.75051148270691e-05, + "loss": 0.009, + "step": 1232 + }, + { + "epoch": 2.18, + "grad_norm": 0.36946621537208557, + "learning_rate": 8.715117510250378e-05, + "loss": 0.0281, + "step": 1233 + }, + { + "epoch": 2.18, + "grad_norm": 0.03397729992866516, + "learning_rate": 8.67978014665766e-05, + "loss": 0.0012, + "step": 1234 + }, + { + "epoch": 2.18, + "grad_norm": 0.12867552042007446, + "learning_rate": 8.644499514767088e-05, + "loss": 0.0062, + "step": 1235 + }, + { + "epoch": 2.19, + "grad_norm": 0.010095584206283092, + "learning_rate": 8.609275737219793e-05, + "loss": 0.0006, + "step": 1236 + }, + { + "epoch": 2.19, + "grad_norm": 0.028368016704916954, + "learning_rate": 8.57410893645929e-05, + "loss": 0.0014, + "step": 1237 + }, + { + "epoch": 2.19, + "grad_norm": 0.12146025896072388, + "learning_rate": 8.538999234731004e-05, + "loss": 0.005, + "step": 1238 + }, + { + "epoch": 2.19, + "grad_norm": 0.0617150254547596, + "learning_rate": 8.50394675408191e-05, + "loss": 0.0031, + "step": 1239 + }, + { + "epoch": 2.19, + "grad_norm": 0.005746606737375259, + "learning_rate": 8.468951616360038e-05, + "loss": 0.0003, + "step": 1240 + }, + { + "epoch": 2.2, + "grad_norm": 0.4718758761882782, + "learning_rate": 8.434013943214097e-05, + "loss": 0.0496, + "step": 1241 + }, + { + "epoch": 2.2, + "grad_norm": 0.008204291574656963, + "learning_rate": 8.399133856093061e-05, + "loss": 0.0004, + "step": 1242 + }, + { + "epoch": 2.2, + "grad_norm": 0.5281336307525635, + "learning_rate": 8.36431147624569e-05, + "loss": 0.0375, + "step": 1243 + }, + { + "epoch": 2.2, + "grad_norm": 0.006096679251641035, + "learning_rate": 8.329546924720177e-05, + "loss": 0.0004, + "step": 1244 + }, + { + "epoch": 2.2, + "grad_norm": 0.040774155408144, + "learning_rate": 8.294840322363672e-05, + "loss": 0.0015, + "step": 1245 + }, + { + "epoch": 2.2, + "grad_norm": 0.060770515352487564, + "learning_rate": 8.260191789821884e-05, + "loss": 0.0016, + "step": 1246 + }, + { + "epoch": 2.21, + "grad_norm": 0.2820521295070648, + "learning_rate": 8.225601447538689e-05, + "loss": 0.0085, + "step": 1247 + }, + { + "epoch": 2.21, + "grad_norm": 0.04571341350674629, + "learning_rate": 8.191069415755645e-05, + "loss": 0.0018, + "step": 1248 + }, + { + "epoch": 2.21, + "grad_norm": 0.015350689180195332, + "learning_rate": 8.156595814511655e-05, + "loss": 0.0011, + "step": 1249 + }, + { + "epoch": 2.21, + "grad_norm": 0.40809088945388794, + "learning_rate": 8.122180763642475e-05, + "loss": 0.0129, + "step": 1250 + }, + { + "epoch": 2.21, + "grad_norm": 0.030470022931694984, + "learning_rate": 8.087824382780335e-05, + "loss": 0.0014, + "step": 1251 + }, + { + "epoch": 2.21, + "grad_norm": 0.11672550439834595, + "learning_rate": 8.05352679135354e-05, + "loss": 0.0031, + "step": 1252 + }, + { + "epoch": 2.22, + "grad_norm": 0.009694907814264297, + "learning_rate": 8.01928810858601e-05, + "loss": 0.0006, + "step": 1253 + }, + { + "epoch": 2.22, + "grad_norm": 0.03160417824983597, + "learning_rate": 7.985108453496909e-05, + "loss": 0.0015, + "step": 1254 + }, + { + "epoch": 2.22, + "grad_norm": 0.3607199490070343, + "learning_rate": 7.950987944900193e-05, + "loss": 0.0141, + "step": 1255 + }, + { + "epoch": 2.22, + "grad_norm": 0.12325584143400192, + "learning_rate": 7.916926701404217e-05, + "loss": 0.0052, + "step": 1256 + }, + { + "epoch": 2.22, + "grad_norm": 0.03867779299616814, + "learning_rate": 7.882924841411343e-05, + "loss": 0.0016, + "step": 1257 + }, + { + "epoch": 2.23, + "grad_norm": 0.008120791055262089, + "learning_rate": 7.848982483117473e-05, + "loss": 0.0004, + "step": 1258 + }, + { + "epoch": 2.23, + "grad_norm": 0.12468260526657104, + "learning_rate": 7.815099744511708e-05, + "loss": 0.0053, + "step": 1259 + }, + { + "epoch": 2.23, + "grad_norm": 0.006734863854944706, + "learning_rate": 7.78127674337587e-05, + "loss": 0.0005, + "step": 1260 + }, + { + "epoch": 2.23, + "grad_norm": 0.11184979975223541, + "learning_rate": 7.747513597284134e-05, + "loss": 0.0023, + "step": 1261 + }, + { + "epoch": 2.23, + "grad_norm": 0.030304264277219772, + "learning_rate": 7.713810423602619e-05, + "loss": 0.0018, + "step": 1262 + }, + { + "epoch": 2.23, + "grad_norm": 0.4860706925392151, + "learning_rate": 7.680167339488967e-05, + "loss": 0.0358, + "step": 1263 + }, + { + "epoch": 2.24, + "grad_norm": 0.08765775710344315, + "learning_rate": 7.646584461891929e-05, + "loss": 0.0031, + "step": 1264 + }, + { + "epoch": 2.24, + "grad_norm": 0.06726156920194626, + "learning_rate": 7.613061907550975e-05, + "loss": 0.003, + "step": 1265 + }, + { + "epoch": 2.24, + "grad_norm": 0.04926925525069237, + "learning_rate": 7.579599792995872e-05, + "loss": 0.0024, + "step": 1266 + }, + { + "epoch": 2.24, + "grad_norm": 0.31866374611854553, + "learning_rate": 7.546198234546309e-05, + "loss": 0.0263, + "step": 1267 + }, + { + "epoch": 2.24, + "grad_norm": 0.016148075461387634, + "learning_rate": 7.512857348311466e-05, + "loss": 0.0006, + "step": 1268 + }, + { + "epoch": 2.25, + "grad_norm": 0.013526340946555138, + "learning_rate": 7.479577250189606e-05, + "loss": 0.0005, + "step": 1269 + }, + { + "epoch": 2.25, + "grad_norm": 0.28299248218536377, + "learning_rate": 7.446358055867688e-05, + "loss": 0.0102, + "step": 1270 + }, + { + "epoch": 2.25, + "grad_norm": 0.34623104333877563, + "learning_rate": 7.413199880820953e-05, + "loss": 0.0105, + "step": 1271 + }, + { + "epoch": 2.25, + "grad_norm": 0.471078485250473, + "learning_rate": 7.380102840312541e-05, + "loss": 0.0406, + "step": 1272 + }, + { + "epoch": 2.25, + "grad_norm": 0.015806253999471664, + "learning_rate": 7.347067049393091e-05, + "loss": 0.0008, + "step": 1273 + }, + { + "epoch": 2.25, + "grad_norm": 0.2348398119211197, + "learning_rate": 7.314092622900285e-05, + "loss": 0.0119, + "step": 1274 + }, + { + "epoch": 2.26, + "grad_norm": 0.018902868032455444, + "learning_rate": 7.281179675458527e-05, + "loss": 0.0008, + "step": 1275 + }, + { + "epoch": 2.26, + "grad_norm": 0.07969076931476593, + "learning_rate": 7.248328321478512e-05, + "loss": 0.0026, + "step": 1276 + }, + { + "epoch": 2.26, + "grad_norm": 0.09565503150224686, + "learning_rate": 7.215538675156804e-05, + "loss": 0.0018, + "step": 1277 + }, + { + "epoch": 2.26, + "grad_norm": 0.10772550851106644, + "learning_rate": 7.182810850475494e-05, + "loss": 0.0024, + "step": 1278 + }, + { + "epoch": 2.26, + "eval_loss": 0.10833004117012024, + "eval_runtime": 14.7071, + "eval_samples_per_second": 32.433, + "eval_steps_per_second": 8.159, + "step": 1278 + }, + { + "epoch": 2.26, + "grad_norm": 0.16737312078475952, + "learning_rate": 7.15014496120172e-05, + "loss": 0.005, + "step": 1279 + }, + { + "epoch": 2.26, + "grad_norm": 0.013761342503130436, + "learning_rate": 7.11754112088737e-05, + "loss": 0.0006, + "step": 1280 + }, + { + "epoch": 2.27, + "grad_norm": 0.32918259501457214, + "learning_rate": 7.084999442868629e-05, + "loss": 0.007, + "step": 1281 + }, + { + "epoch": 2.27, + "grad_norm": 0.009015708230435848, + "learning_rate": 7.052520040265581e-05, + "loss": 0.0005, + "step": 1282 + }, + { + "epoch": 2.27, + "grad_norm": 0.03732029348611832, + "learning_rate": 7.020103025981839e-05, + "loss": 0.0011, + "step": 1283 + }, + { + "epoch": 2.27, + "grad_norm": 0.010399947874248028, + "learning_rate": 6.987748512704143e-05, + "loss": 0.0005, + "step": 1284 + }, + { + "epoch": 2.27, + "grad_norm": 0.34805890917778015, + "learning_rate": 6.955456612901973e-05, + "loss": 0.014, + "step": 1285 + }, + { + "epoch": 2.28, + "grad_norm": 0.03678420931100845, + "learning_rate": 6.923227438827159e-05, + "loss": 0.0011, + "step": 1286 + }, + { + "epoch": 2.28, + "grad_norm": 0.09479320049285889, + "learning_rate": 6.891061102513479e-05, + "loss": 0.002, + "step": 1287 + }, + { + "epoch": 2.28, + "grad_norm": 0.09461376816034317, + "learning_rate": 6.858957715776265e-05, + "loss": 0.0032, + "step": 1288 + }, + { + "epoch": 2.28, + "grad_norm": 0.042890515178442, + "learning_rate": 6.826917390212056e-05, + "loss": 0.0014, + "step": 1289 + }, + { + "epoch": 2.28, + "grad_norm": 0.5706468224525452, + "learning_rate": 6.79494023719815e-05, + "loss": 0.0081, + "step": 1290 + }, + { + "epoch": 2.28, + "grad_norm": 0.01644587516784668, + "learning_rate": 6.763026367892269e-05, + "loss": 0.0009, + "step": 1291 + }, + { + "epoch": 2.29, + "grad_norm": 0.020033186301589012, + "learning_rate": 6.731175893232141e-05, + "loss": 0.001, + "step": 1292 + }, + { + "epoch": 2.29, + "grad_norm": 0.13267932832241058, + "learning_rate": 6.699388923935118e-05, + "loss": 0.0016, + "step": 1293 + }, + { + "epoch": 2.29, + "grad_norm": 0.01488049328327179, + "learning_rate": 6.667665570497813e-05, + "loss": 0.0004, + "step": 1294 + }, + { + "epoch": 2.29, + "grad_norm": 0.07691626995801926, + "learning_rate": 6.636005943195683e-05, + "loss": 0.0023, + "step": 1295 + }, + { + "epoch": 2.29, + "grad_norm": 0.36601608991622925, + "learning_rate": 6.604410152082683e-05, + "loss": 0.0058, + "step": 1296 + }, + { + "epoch": 2.29, + "grad_norm": 0.053158730268478394, + "learning_rate": 6.57287830699084e-05, + "loss": 0.0013, + "step": 1297 + }, + { + "epoch": 2.3, + "grad_norm": 0.0060308403335511684, + "learning_rate": 6.541410517529906e-05, + "loss": 0.0002, + "step": 1298 + }, + { + "epoch": 2.3, + "grad_norm": 0.06450387835502625, + "learning_rate": 6.510006893086973e-05, + "loss": 0.0016, + "step": 1299 + }, + { + "epoch": 2.3, + "grad_norm": 0.0036570043303072453, + "learning_rate": 6.478667542826064e-05, + "loss": 0.0002, + "step": 1300 + }, + { + "epoch": 2.3, + "grad_norm": 0.010067092254757881, + "learning_rate": 6.447392575687805e-05, + "loss": 0.0003, + "step": 1301 + }, + { + "epoch": 2.3, + "grad_norm": 0.0020321488846093416, + "learning_rate": 6.41618210038899e-05, + "loss": 0.0001, + "step": 1302 + }, + { + "epoch": 2.31, + "grad_norm": 0.07873855531215668, + "learning_rate": 6.38503622542223e-05, + "loss": 0.0023, + "step": 1303 + }, + { + "epoch": 2.31, + "grad_norm": 0.021666008979082108, + "learning_rate": 6.353955059055597e-05, + "loss": 0.0006, + "step": 1304 + }, + { + "epoch": 2.31, + "grad_norm": 0.1410478949546814, + "learning_rate": 6.322938709332196e-05, + "loss": 0.0031, + "step": 1305 + }, + { + "epoch": 2.31, + "grad_norm": 0.01548179890960455, + "learning_rate": 6.291987284069849e-05, + "loss": 0.0006, + "step": 1306 + }, + { + "epoch": 2.31, + "grad_norm": 0.10515931993722916, + "learning_rate": 6.261100890860668e-05, + "loss": 0.0024, + "step": 1307 + }, + { + "epoch": 2.31, + "grad_norm": 0.05864081159234047, + "learning_rate": 6.230279637070704e-05, + "loss": 0.0017, + "step": 1308 + }, + { + "epoch": 2.32, + "grad_norm": 0.007448627147823572, + "learning_rate": 6.199523629839591e-05, + "loss": 0.0003, + "step": 1309 + }, + { + "epoch": 2.32, + "grad_norm": 0.1981428563594818, + "learning_rate": 6.168832976080133e-05, + "loss": 0.0027, + "step": 1310 + }, + { + "epoch": 2.32, + "grad_norm": 0.002334183780476451, + "learning_rate": 6.138207782477976e-05, + "loss": 0.0001, + "step": 1311 + }, + { + "epoch": 2.32, + "grad_norm": 0.12165654450654984, + "learning_rate": 6.107648155491202e-05, + "loss": 0.0029, + "step": 1312 + }, + { + "epoch": 2.32, + "grad_norm": 0.038640353828668594, + "learning_rate": 6.077154201349966e-05, + "loss": 0.0008, + "step": 1313 + }, + { + "epoch": 2.32, + "grad_norm": 0.004505124408751726, + "learning_rate": 6.046726026056154e-05, + "loss": 0.0002, + "step": 1314 + }, + { + "epoch": 2.33, + "grad_norm": 0.3108009696006775, + "learning_rate": 6.01636373538299e-05, + "loss": 0.0064, + "step": 1315 + }, + { + "epoch": 2.33, + "grad_norm": 0.003968897275626659, + "learning_rate": 5.986067434874662e-05, + "loss": 0.0002, + "step": 1316 + }, + { + "epoch": 2.33, + "grad_norm": 0.026421288028359413, + "learning_rate": 5.955837229845965e-05, + "loss": 0.0007, + "step": 1317 + }, + { + "epoch": 2.33, + "grad_norm": 0.007577298209071159, + "learning_rate": 5.925673225381939e-05, + "loss": 0.0003, + "step": 1318 + }, + { + "epoch": 2.33, + "grad_norm": 0.009191282093524933, + "learning_rate": 5.89557552633751e-05, + "loss": 0.0003, + "step": 1319 + }, + { + "epoch": 2.34, + "grad_norm": 0.0959949716925621, + "learning_rate": 5.865544237337117e-05, + "loss": 0.0015, + "step": 1320 + }, + { + "epoch": 2.34, + "grad_norm": 0.014931642450392246, + "learning_rate": 5.835579462774312e-05, + "loss": 0.0005, + "step": 1321 + }, + { + "epoch": 2.34, + "grad_norm": 0.023637857288122177, + "learning_rate": 5.80568130681148e-05, + "loss": 0.0003, + "step": 1322 + }, + { + "epoch": 2.34, + "grad_norm": 0.07706478983163834, + "learning_rate": 5.775849873379393e-05, + "loss": 0.0022, + "step": 1323 + }, + { + "epoch": 2.34, + "grad_norm": 0.3625439703464508, + "learning_rate": 5.746085266176907e-05, + "loss": 0.0146, + "step": 1324 + }, + { + "epoch": 2.34, + "grad_norm": 0.12187057733535767, + "learning_rate": 5.7163875886705824e-05, + "loss": 0.0036, + "step": 1325 + }, + { + "epoch": 2.35, + "grad_norm": 0.0629158765077591, + "learning_rate": 5.686756944094282e-05, + "loss": 0.0018, + "step": 1326 + }, + { + "epoch": 2.35, + "grad_norm": 0.0620671845972538, + "learning_rate": 5.657193435448896e-05, + "loss": 0.0012, + "step": 1327 + }, + { + "epoch": 2.35, + "grad_norm": 0.00476891128346324, + "learning_rate": 5.627697165501927e-05, + "loss": 0.0002, + "step": 1328 + }, + { + "epoch": 2.35, + "grad_norm": 0.019357487559318542, + "learning_rate": 5.598268236787138e-05, + "loss": 0.0005, + "step": 1329 + }, + { + "epoch": 2.35, + "grad_norm": 0.21635344624519348, + "learning_rate": 5.5689067516041994e-05, + "loss": 0.007, + "step": 1330 + }, + { + "epoch": 2.35, + "grad_norm": 0.6375563740730286, + "learning_rate": 5.539612812018344e-05, + "loss": 0.0377, + "step": 1331 + }, + { + "epoch": 2.36, + "grad_norm": 0.0075180609710514545, + "learning_rate": 5.5103865198600085e-05, + "loss": 0.0003, + "step": 1332 + }, + { + "epoch": 2.36, + "grad_norm": 0.030566420406103134, + "learning_rate": 5.481227976724476e-05, + "loss": 0.0006, + "step": 1333 + }, + { + "epoch": 2.36, + "grad_norm": 0.3805221617221832, + "learning_rate": 5.45213728397152e-05, + "loss": 0.0245, + "step": 1334 + }, + { + "epoch": 2.36, + "grad_norm": 0.033048298209905624, + "learning_rate": 5.423114542725049e-05, + "loss": 0.0006, + "step": 1335 + }, + { + "epoch": 2.36, + "grad_norm": 0.18197427690029144, + "learning_rate": 5.3941598538727625e-05, + "loss": 0.0034, + "step": 1336 + }, + { + "epoch": 2.37, + "grad_norm": 0.009043999947607517, + "learning_rate": 5.365273318065811e-05, + "loss": 0.0004, + "step": 1337 + }, + { + "epoch": 2.37, + "grad_norm": 0.036380622535943985, + "learning_rate": 5.3364550357184325e-05, + "loss": 0.0012, + "step": 1338 + }, + { + "epoch": 2.37, + "grad_norm": 0.02875285968184471, + "learning_rate": 5.307705107007593e-05, + "loss": 0.0007, + "step": 1339 + }, + { + "epoch": 2.37, + "grad_norm": 0.006231918465346098, + "learning_rate": 5.2790236318726484e-05, + "loss": 0.0003, + "step": 1340 + }, + { + "epoch": 2.37, + "grad_norm": 0.012466797605156898, + "learning_rate": 5.2504107100150245e-05, + "loss": 0.0006, + "step": 1341 + }, + { + "epoch": 2.37, + "grad_norm": 0.013808784075081348, + "learning_rate": 5.221866440897807e-05, + "loss": 0.0004, + "step": 1342 + }, + { + "epoch": 2.38, + "grad_norm": 0.5687355995178223, + "learning_rate": 5.193390923745475e-05, + "loss": 0.0199, + "step": 1343 + }, + { + "epoch": 2.38, + "grad_norm": 0.0935133546590805, + "learning_rate": 5.1649842575434844e-05, + "loss": 0.0036, + "step": 1344 + }, + { + "epoch": 2.38, + "grad_norm": 0.6903636455535889, + "learning_rate": 5.136646541037956e-05, + "loss": 0.0154, + "step": 1345 + }, + { + "epoch": 2.38, + "grad_norm": 0.07615023851394653, + "learning_rate": 5.108377872735351e-05, + "loss": 0.002, + "step": 1346 + }, + { + "epoch": 2.38, + "grad_norm": 0.017100483179092407, + "learning_rate": 5.0801783509020844e-05, + "loss": 0.0005, + "step": 1347 + }, + { + "epoch": 2.38, + "grad_norm": 0.01030554249882698, + "learning_rate": 5.052048073564228e-05, + "loss": 0.0004, + "step": 1348 + }, + { + "epoch": 2.39, + "grad_norm": 0.33771464228630066, + "learning_rate": 5.023987138507133e-05, + "loss": 0.0053, + "step": 1349 + }, + { + "epoch": 2.39, + "grad_norm": 0.03562993183732033, + "learning_rate": 4.995995643275103e-05, + "loss": 0.0012, + "step": 1350 + }, + { + "epoch": 2.39, + "grad_norm": 0.037580545991659164, + "learning_rate": 4.968073685171082e-05, + "loss": 0.0009, + "step": 1351 + }, + { + "epoch": 2.39, + "grad_norm": 0.018147172406315804, + "learning_rate": 4.940221361256259e-05, + "loss": 0.0004, + "step": 1352 + }, + { + "epoch": 2.39, + "grad_norm": 0.1366257667541504, + "learning_rate": 4.912438768349792e-05, + "loss": 0.0039, + "step": 1353 + }, + { + "epoch": 2.4, + "grad_norm": 0.020096778869628906, + "learning_rate": 4.884726003028428e-05, + "loss": 0.0004, + "step": 1354 + }, + { + "epoch": 2.4, + "grad_norm": 0.005000903271138668, + "learning_rate": 4.8570831616261745e-05, + "loss": 0.0002, + "step": 1355 + }, + { + "epoch": 2.4, + "grad_norm": 0.02229972742497921, + "learning_rate": 4.829510340234e-05, + "loss": 0.0008, + "step": 1356 + }, + { + "epoch": 2.4, + "grad_norm": 0.04677216708660126, + "learning_rate": 4.802007634699437e-05, + "loss": 0.0008, + "step": 1357 + }, + { + "epoch": 2.4, + "grad_norm": 0.008060449734330177, + "learning_rate": 4.7745751406263163e-05, + "loss": 0.0002, + "step": 1358 + }, + { + "epoch": 2.4, + "grad_norm": 0.004165045917034149, + "learning_rate": 4.74721295337438e-05, + "loss": 0.0003, + "step": 1359 + }, + { + "epoch": 2.41, + "grad_norm": 0.2668968439102173, + "learning_rate": 4.719921168058977e-05, + "loss": 0.0055, + "step": 1360 + }, + { + "epoch": 2.41, + "grad_norm": 0.33916279673576355, + "learning_rate": 4.6926998795507406e-05, + "loss": 0.0061, + "step": 1361 + }, + { + "epoch": 2.41, + "grad_norm": 0.023179393261671066, + "learning_rate": 4.6655491824752263e-05, + "loss": 0.0006, + "step": 1362 + }, + { + "epoch": 2.41, + "grad_norm": 0.01882367953658104, + "learning_rate": 4.6384691712126225e-05, + "loss": 0.0007, + "step": 1363 + }, + { + "epoch": 2.41, + "grad_norm": 0.013556770980358124, + "learning_rate": 4.611459939897386e-05, + "loss": 0.0003, + "step": 1364 + }, + { + "epoch": 2.41, + "grad_norm": 0.215042382478714, + "learning_rate": 4.5845215824179335e-05, + "loss": 0.0051, + "step": 1365 + }, + { + "epoch": 2.42, + "grad_norm": 0.02435019426047802, + "learning_rate": 4.557654192416319e-05, + "loss": 0.0004, + "step": 1366 + }, + { + "epoch": 2.42, + "grad_norm": 0.017334023490548134, + "learning_rate": 4.530857863287913e-05, + "loss": 0.0005, + "step": 1367 + }, + { + "epoch": 2.42, + "grad_norm": 0.20091189444065094, + "learning_rate": 4.5041326881810395e-05, + "loss": 0.0069, + "step": 1368 + }, + { + "epoch": 2.42, + "grad_norm": 0.05958046764135361, + "learning_rate": 4.4774787599967004e-05, + "loss": 0.0009, + "step": 1369 + }, + { + "epoch": 2.42, + "grad_norm": 0.08757390081882477, + "learning_rate": 4.450896171388219e-05, + "loss": 0.0025, + "step": 1370 + }, + { + "epoch": 2.43, + "grad_norm": 0.028023116290569305, + "learning_rate": 4.424385014760937e-05, + "loss": 0.0008, + "step": 1371 + }, + { + "epoch": 2.43, + "grad_norm": 0.022646216675639153, + "learning_rate": 4.397945382271909e-05, + "loss": 0.0004, + "step": 1372 + }, + { + "epoch": 2.43, + "grad_norm": 0.04051864892244339, + "learning_rate": 4.37157736582951e-05, + "loss": 0.0009, + "step": 1373 + }, + { + "epoch": 2.43, + "grad_norm": 0.7693873643875122, + "learning_rate": 4.3452810570932115e-05, + "loss": 0.0696, + "step": 1374 + }, + { + "epoch": 2.43, + "grad_norm": 0.0007687499164603651, + "learning_rate": 4.3190565474731904e-05, + "loss": 0.0, + "step": 1375 + }, + { + "epoch": 2.43, + "grad_norm": 0.00293734110891819, + "learning_rate": 4.292903928130054e-05, + "loss": 0.0001, + "step": 1376 + }, + { + "epoch": 2.44, + "grad_norm": 0.6718714237213135, + "learning_rate": 4.266823289974517e-05, + "loss": 0.0105, + "step": 1377 + }, + { + "epoch": 2.44, + "grad_norm": 0.003744264366105199, + "learning_rate": 4.240814723667033e-05, + "loss": 0.0001, + "step": 1378 + }, + { + "epoch": 2.44, + "grad_norm": 0.002130451612174511, + "learning_rate": 4.214878319617568e-05, + "loss": 0.0001, + "step": 1379 + }, + { + "epoch": 2.44, + "grad_norm": 0.02480950951576233, + "learning_rate": 4.189014167985225e-05, + "loss": 0.0004, + "step": 1380 + }, + { + "epoch": 2.44, + "grad_norm": 0.00624463614076376, + "learning_rate": 4.163222358677937e-05, + "loss": 0.0002, + "step": 1381 + }, + { + "epoch": 2.44, + "grad_norm": 0.0020641738083213568, + "learning_rate": 4.137502981352173e-05, + "loss": 0.0001, + "step": 1382 + }, + { + "epoch": 2.45, + "grad_norm": 0.14240607619285583, + "learning_rate": 4.111856125412608e-05, + "loss": 0.0036, + "step": 1383 + }, + { + "epoch": 2.45, + "grad_norm": 0.006745543330907822, + "learning_rate": 4.086281880011833e-05, + "loss": 0.0002, + "step": 1384 + }, + { + "epoch": 2.45, + "grad_norm": 0.015076296404004097, + "learning_rate": 4.060780334050032e-05, + "loss": 0.0005, + "step": 1385 + }, + { + "epoch": 2.45, + "grad_norm": 0.008114277385175228, + "learning_rate": 4.035351576174667e-05, + "loss": 0.0003, + "step": 1386 + }, + { + "epoch": 2.45, + "grad_norm": 0.014272456988692284, + "learning_rate": 4.0099956947801745e-05, + "loss": 0.0003, + "step": 1387 + }, + { + "epoch": 2.46, + "grad_norm": 0.2593598961830139, + "learning_rate": 3.9847127780076626e-05, + "loss": 0.0062, + "step": 1388 + }, + { + "epoch": 2.46, + "grad_norm": 0.011931393295526505, + "learning_rate": 3.959502913744614e-05, + "loss": 0.0004, + "step": 1389 + }, + { + "epoch": 2.46, + "grad_norm": 0.37758180499076843, + "learning_rate": 3.934366189624561e-05, + "loss": 0.0124, + "step": 1390 + }, + { + "epoch": 2.46, + "grad_norm": 0.7305591702461243, + "learning_rate": 3.9093026930267864e-05, + "loss": 0.0191, + "step": 1391 + }, + { + "epoch": 2.46, + "grad_norm": 0.3506815433502197, + "learning_rate": 3.8843125110760186e-05, + "loss": 0.004, + "step": 1392 + }, + { + "epoch": 2.46, + "grad_norm": 1.1179872751235962, + "learning_rate": 3.859395730642151e-05, + "loss": 0.1287, + "step": 1393 + }, + { + "epoch": 2.47, + "grad_norm": 0.07131043821573257, + "learning_rate": 3.8345524383398974e-05, + "loss": 0.0015, + "step": 1394 + }, + { + "epoch": 2.47, + "grad_norm": 0.10362354665994644, + "learning_rate": 3.8097827205285374e-05, + "loss": 0.002, + "step": 1395 + }, + { + "epoch": 2.47, + "grad_norm": 0.035806868225336075, + "learning_rate": 3.7850866633115767e-05, + "loss": 0.0006, + "step": 1396 + }, + { + "epoch": 2.47, + "grad_norm": 0.09917337447404861, + "learning_rate": 3.760464352536469e-05, + "loss": 0.0013, + "step": 1397 + }, + { + "epoch": 2.47, + "grad_norm": 0.013792157173156738, + "learning_rate": 3.735915873794327e-05, + "loss": 0.0005, + "step": 1398 + }, + { + "epoch": 2.48, + "grad_norm": 0.11405621469020844, + "learning_rate": 3.711441312419589e-05, + "loss": 0.0017, + "step": 1399 + }, + { + "epoch": 2.48, + "grad_norm": 0.37025147676467896, + "learning_rate": 3.687040753489765e-05, + "loss": 0.0066, + "step": 1400 + }, + { + "epoch": 2.48, + "grad_norm": 0.005007702391594648, + "learning_rate": 3.662714281825111e-05, + "loss": 0.0001, + "step": 1401 + }, + { + "epoch": 2.48, + "grad_norm": 0.035408273339271545, + "learning_rate": 3.6384619819883335e-05, + "loss": 0.0005, + "step": 1402 + }, + { + "epoch": 2.48, + "grad_norm": 1.066094160079956, + "learning_rate": 3.614283938284332e-05, + "loss": 0.0229, + "step": 1403 + }, + { + "epoch": 2.48, + "grad_norm": 0.06345271319150925, + "learning_rate": 3.590180234759857e-05, + "loss": 0.0012, + "step": 1404 + }, + { + "epoch": 2.49, + "grad_norm": 0.13740839064121246, + "learning_rate": 3.566150955203251e-05, + "loss": 0.003, + "step": 1405 + }, + { + "epoch": 2.49, + "grad_norm": 0.18154090642929077, + "learning_rate": 3.542196183144148e-05, + "loss": 0.0045, + "step": 1406 + }, + { + "epoch": 2.49, + "grad_norm": 1.66633141040802, + "learning_rate": 3.518316001853164e-05, + "loss": 0.0242, + "step": 1407 + }, + { + "epoch": 2.49, + "grad_norm": 0.054164353758096695, + "learning_rate": 3.494510494341657e-05, + "loss": 0.0022, + "step": 1408 + }, + { + "epoch": 2.49, + "grad_norm": 0.011136863380670547, + "learning_rate": 3.470779743361374e-05, + "loss": 0.0005, + "step": 1409 + }, + { + "epoch": 2.49, + "grad_norm": 0.011947316117584705, + "learning_rate": 3.447123831404228e-05, + "loss": 0.0005, + "step": 1410 + }, + { + "epoch": 2.5, + "grad_norm": 0.17515721917152405, + "learning_rate": 3.423542840701957e-05, + "loss": 0.0035, + "step": 1411 + }, + { + "epoch": 2.5, + "grad_norm": 0.014279712922871113, + "learning_rate": 3.4000368532258604e-05, + "loss": 0.0006, + "step": 1412 + }, + { + "epoch": 2.5, + "grad_norm": 0.004492651205509901, + "learning_rate": 3.376605950686532e-05, + "loss": 0.0003, + "step": 1413 + }, + { + "epoch": 2.5, + "grad_norm": 0.012061775662004948, + "learning_rate": 3.3532502145335373e-05, + "loss": 0.0004, + "step": 1414 + }, + { + "epoch": 2.5, + "grad_norm": 0.019419865682721138, + "learning_rate": 3.3299697259551706e-05, + "loss": 0.0007, + "step": 1415 + }, + { + "epoch": 2.51, + "grad_norm": 0.18258291482925415, + "learning_rate": 3.3067645658781425e-05, + "loss": 0.004, + "step": 1416 + }, + { + "epoch": 2.51, + "grad_norm": 0.2847168445587158, + "learning_rate": 3.283634814967307e-05, + "loss": 0.0092, + "step": 1417 + }, + { + "epoch": 2.51, + "grad_norm": 0.4876096248626709, + "learning_rate": 3.260580553625389e-05, + "loss": 0.0131, + "step": 1418 + }, + { + "epoch": 2.51, + "grad_norm": 0.6045449376106262, + "learning_rate": 3.237601861992709e-05, + "loss": 0.0193, + "step": 1419 + }, + { + "epoch": 2.51, + "grad_norm": 0.1586480587720871, + "learning_rate": 3.214698819946879e-05, + "loss": 0.0026, + "step": 1420 + }, + { + "epoch": 2.51, + "eval_loss": 0.12486789375543594, + "eval_runtime": 14.7046, + "eval_samples_per_second": 32.439, + "eval_steps_per_second": 8.161, + "step": 1420 + }, + { + "epoch": 2.51, + "grad_norm": 0.12747687101364136, + "learning_rate": 3.191871507102545e-05, + "loss": 0.0017, + "step": 1421 + }, + { + "epoch": 2.52, + "grad_norm": 0.04522531479597092, + "learning_rate": 3.1691200028111044e-05, + "loss": 0.0009, + "step": 1422 + }, + { + "epoch": 2.52, + "grad_norm": 0.2810252904891968, + "learning_rate": 3.146444386160441e-05, + "loss": 0.0041, + "step": 1423 + }, + { + "epoch": 2.52, + "grad_norm": 0.06912285834550858, + "learning_rate": 3.123844735974646e-05, + "loss": 0.0008, + "step": 1424 + }, + { + "epoch": 2.52, + "grad_norm": 0.3842872679233551, + "learning_rate": 3.1013211308137054e-05, + "loss": 0.0075, + "step": 1425 + }, + { + "epoch": 2.52, + "grad_norm": 0.018251223489642143, + "learning_rate": 3.078873648973304e-05, + "loss": 0.0003, + "step": 1426 + }, + { + "epoch": 2.52, + "grad_norm": 0.005700815003365278, + "learning_rate": 3.0565023684844765e-05, + "loss": 0.0002, + "step": 1427 + }, + { + "epoch": 2.53, + "grad_norm": 1.1778111457824707, + "learning_rate": 3.034207367113387e-05, + "loss": 0.0241, + "step": 1428 + }, + { + "epoch": 2.53, + "grad_norm": 0.5692623853683472, + "learning_rate": 3.0119887223610475e-05, + "loss": 0.0074, + "step": 1429 + }, + { + "epoch": 2.53, + "grad_norm": 0.007092227227985859, + "learning_rate": 2.9898465114630123e-05, + "loss": 0.0003, + "step": 1430 + }, + { + "epoch": 2.53, + "grad_norm": 0.006958132144063711, + "learning_rate": 2.9677808113891675e-05, + "loss": 0.0003, + "step": 1431 + }, + { + "epoch": 2.53, + "grad_norm": 0.2849915027618408, + "learning_rate": 2.945791698843431e-05, + "loss": 0.0114, + "step": 1432 + }, + { + "epoch": 2.54, + "grad_norm": 0.3223225176334381, + "learning_rate": 2.9238792502634782e-05, + "loss": 0.0105, + "step": 1433 + }, + { + "epoch": 2.54, + "grad_norm": 0.3097008466720581, + "learning_rate": 2.902043541820501e-05, + "loss": 0.0067, + "step": 1434 + }, + { + "epoch": 2.54, + "grad_norm": 0.08397488296031952, + "learning_rate": 2.880284649418913e-05, + "loss": 0.0029, + "step": 1435 + }, + { + "epoch": 2.54, + "grad_norm": 0.029154475778341293, + "learning_rate": 2.8586026486961235e-05, + "loss": 0.0008, + "step": 1436 + }, + { + "epoch": 2.54, + "grad_norm": 0.02804793231189251, + "learning_rate": 2.836997615022249e-05, + "loss": 0.0007, + "step": 1437 + }, + { + "epoch": 2.54, + "grad_norm": 0.008161618374288082, + "learning_rate": 2.8154696234998472e-05, + "loss": 0.0003, + "step": 1438 + }, + { + "epoch": 2.55, + "grad_norm": 0.055503591895103455, + "learning_rate": 2.7940187489636697e-05, + "loss": 0.0015, + "step": 1439 + }, + { + "epoch": 2.55, + "grad_norm": 0.02855825237929821, + "learning_rate": 2.7726450659803947e-05, + "loss": 0.0008, + "step": 1440 + }, + { + "epoch": 2.55, + "grad_norm": 0.01375576015561819, + "learning_rate": 2.751348648848373e-05, + "loss": 0.0004, + "step": 1441 + }, + { + "epoch": 2.55, + "grad_norm": 0.0038817201275378466, + "learning_rate": 2.7301295715973757e-05, + "loss": 0.0001, + "step": 1442 + }, + { + "epoch": 2.55, + "grad_norm": 0.023767409846186638, + "learning_rate": 2.708987907988314e-05, + "loss": 0.0007, + "step": 1443 + }, + { + "epoch": 2.55, + "grad_norm": 0.06856090575456619, + "learning_rate": 2.687923731512995e-05, + "loss": 0.0016, + "step": 1444 + }, + { + "epoch": 2.56, + "grad_norm": 0.001726289396174252, + "learning_rate": 2.666937115393886e-05, + "loss": 0.0001, + "step": 1445 + }, + { + "epoch": 2.56, + "grad_norm": 0.011897512711584568, + "learning_rate": 2.646028132583822e-05, + "loss": 0.0003, + "step": 1446 + }, + { + "epoch": 2.56, + "grad_norm": 0.011256224475800991, + "learning_rate": 2.6251968557657908e-05, + "loss": 0.0004, + "step": 1447 + }, + { + "epoch": 2.56, + "grad_norm": 0.09208813309669495, + "learning_rate": 2.6044433573526454e-05, + "loss": 0.0021, + "step": 1448 + }, + { + "epoch": 2.56, + "grad_norm": 0.028126435354351997, + "learning_rate": 2.58376770948687e-05, + "loss": 0.0003, + "step": 1449 + }, + { + "epoch": 2.57, + "grad_norm": 0.02156115137040615, + "learning_rate": 2.5631699840403476e-05, + "loss": 0.0007, + "step": 1450 + }, + { + "epoch": 2.57, + "grad_norm": 0.039257489144802094, + "learning_rate": 2.542650252614062e-05, + "loss": 0.0013, + "step": 1451 + }, + { + "epoch": 2.57, + "grad_norm": 1.2106196880340576, + "learning_rate": 2.5222085865379023e-05, + "loss": 0.0606, + "step": 1452 + }, + { + "epoch": 2.57, + "grad_norm": 0.011918625794351101, + "learning_rate": 2.501845056870375e-05, + "loss": 0.0002, + "step": 1453 + }, + { + "epoch": 2.57, + "grad_norm": 0.35332852602005005, + "learning_rate": 2.4815597343983697e-05, + "loss": 0.0813, + "step": 1454 + }, + { + "epoch": 2.57, + "grad_norm": 0.0004311532247811556, + "learning_rate": 2.4613526896369308e-05, + "loss": 0.0, + "step": 1455 + }, + { + "epoch": 2.58, + "grad_norm": 0.20872049033641815, + "learning_rate": 2.441223992828978e-05, + "loss": 0.0047, + "step": 1456 + }, + { + "epoch": 2.58, + "grad_norm": 1.3243696689605713, + "learning_rate": 2.421173713945099e-05, + "loss": 0.024, + "step": 1457 + }, + { + "epoch": 2.58, + "grad_norm": 0.005421569105237722, + "learning_rate": 2.4012019226832772e-05, + "loss": 0.0002, + "step": 1458 + }, + { + "epoch": 2.58, + "grad_norm": 0.3085567355155945, + "learning_rate": 2.381308688468656e-05, + "loss": 0.0047, + "step": 1459 + }, + { + "epoch": 2.58, + "grad_norm": 0.002236233791336417, + "learning_rate": 2.361494080453319e-05, + "loss": 0.0001, + "step": 1460 + }, + { + "epoch": 2.58, + "grad_norm": 0.5045457482337952, + "learning_rate": 2.3417581675160088e-05, + "loss": 0.0054, + "step": 1461 + }, + { + "epoch": 2.59, + "grad_norm": 0.0775829628109932, + "learning_rate": 2.3221010182619406e-05, + "loss": 0.0014, + "step": 1462 + }, + { + "epoch": 2.59, + "grad_norm": 0.47293341159820557, + "learning_rate": 2.302522701022511e-05, + "loss": 0.0563, + "step": 1463 + }, + { + "epoch": 2.59, + "grad_norm": 0.30130696296691895, + "learning_rate": 2.2830232838550845e-05, + "loss": 0.0057, + "step": 1464 + }, + { + "epoch": 2.59, + "grad_norm": 0.0052946461364626884, + "learning_rate": 2.2636028345427745e-05, + "loss": 0.0002, + "step": 1465 + }, + { + "epoch": 2.59, + "grad_norm": 0.017164453864097595, + "learning_rate": 2.244261420594168e-05, + "loss": 0.0005, + "step": 1466 + }, + { + "epoch": 2.6, + "grad_norm": 0.8137333393096924, + "learning_rate": 2.224999109243131e-05, + "loss": 0.039, + "step": 1467 + }, + { + "epoch": 2.6, + "grad_norm": 0.19935303926467896, + "learning_rate": 2.205815967448546e-05, + "loss": 0.0056, + "step": 1468 + }, + { + "epoch": 2.6, + "grad_norm": 0.3961120545864105, + "learning_rate": 2.1867120618940833e-05, + "loss": 0.0056, + "step": 1469 + }, + { + "epoch": 2.6, + "grad_norm": 0.019902199506759644, + "learning_rate": 2.167687458987991e-05, + "loss": 0.0007, + "step": 1470 + }, + { + "epoch": 2.6, + "grad_norm": 0.009017308242619038, + "learning_rate": 2.1487422248628457e-05, + "loss": 0.0004, + "step": 1471 + }, + { + "epoch": 2.6, + "grad_norm": 0.001593849272467196, + "learning_rate": 2.1298764253753044e-05, + "loss": 0.0001, + "step": 1472 + }, + { + "epoch": 2.61, + "grad_norm": 0.023191401734948158, + "learning_rate": 2.1110901261059245e-05, + "loss": 0.0006, + "step": 1473 + }, + { + "epoch": 2.61, + "grad_norm": 0.03758575767278671, + "learning_rate": 2.0923833923588874e-05, + "loss": 0.0011, + "step": 1474 + }, + { + "epoch": 2.61, + "grad_norm": 0.3719123303890228, + "learning_rate": 2.0737562891618074e-05, + "loss": 0.0061, + "step": 1475 + }, + { + "epoch": 2.61, + "grad_norm": 0.004338722676038742, + "learning_rate": 2.0552088812654885e-05, + "loss": 0.0002, + "step": 1476 + }, + { + "epoch": 2.61, + "grad_norm": 0.004211151506751776, + "learning_rate": 2.0367412331436796e-05, + "loss": 0.0002, + "step": 1477 + }, + { + "epoch": 2.61, + "grad_norm": 0.008191877976059914, + "learning_rate": 2.018353408992901e-05, + "loss": 0.0003, + "step": 1478 + }, + { + "epoch": 2.62, + "grad_norm": 0.4456917941570282, + "learning_rate": 2.0000454727321703e-05, + "loss": 0.0153, + "step": 1479 + }, + { + "epoch": 2.62, + "grad_norm": 0.03597655147314072, + "learning_rate": 1.9818174880028213e-05, + "loss": 0.0013, + "step": 1480 + }, + { + "epoch": 2.62, + "grad_norm": 0.0015153115382418036, + "learning_rate": 1.963669518168243e-05, + "loss": 0.0001, + "step": 1481 + }, + { + "epoch": 2.62, + "grad_norm": 0.008952159434556961, + "learning_rate": 1.9456016263136923e-05, + "loss": 0.0003, + "step": 1482 + }, + { + "epoch": 2.62, + "grad_norm": 0.00541423074901104, + "learning_rate": 1.927613875246059e-05, + "loss": 0.0002, + "step": 1483 + }, + { + "epoch": 2.63, + "grad_norm": 0.09652198851108551, + "learning_rate": 1.909706327493657e-05, + "loss": 0.0016, + "step": 1484 + }, + { + "epoch": 2.63, + "grad_norm": 0.22741255164146423, + "learning_rate": 1.8918790453059914e-05, + "loss": 0.0031, + "step": 1485 + }, + { + "epoch": 2.63, + "grad_norm": 0.040858399122953415, + "learning_rate": 1.8741320906535546e-05, + "loss": 0.0007, + "step": 1486 + }, + { + "epoch": 2.63, + "grad_norm": 0.053088657557964325, + "learning_rate": 1.856465525227602e-05, + "loss": 0.0019, + "step": 1487 + }, + { + "epoch": 2.63, + "grad_norm": 0.008057578466832638, + "learning_rate": 1.8388794104399558e-05, + "loss": 0.0003, + "step": 1488 + }, + { + "epoch": 2.63, + "grad_norm": 0.009569887071847916, + "learning_rate": 1.8213738074227742e-05, + "loss": 0.0004, + "step": 1489 + }, + { + "epoch": 2.64, + "grad_norm": 0.00848764181137085, + "learning_rate": 1.803948777028336e-05, + "loss": 0.0003, + "step": 1490 + }, + { + "epoch": 2.64, + "grad_norm": 0.6634844541549683, + "learning_rate": 1.7866043798288433e-05, + "loss": 0.0438, + "step": 1491 + }, + { + "epoch": 2.64, + "grad_norm": 0.15894575417041779, + "learning_rate": 1.7693406761162016e-05, + "loss": 0.0055, + "step": 1492 + }, + { + "epoch": 2.64, + "grad_norm": 0.013144123367965221, + "learning_rate": 1.752157725901815e-05, + "loss": 0.0005, + "step": 1493 + }, + { + "epoch": 2.64, + "grad_norm": 0.018960585817694664, + "learning_rate": 1.735055588916379e-05, + "loss": 0.0003, + "step": 1494 + }, + { + "epoch": 2.64, + "grad_norm": 0.016689004376530647, + "learning_rate": 1.718034324609663e-05, + "loss": 0.0006, + "step": 1495 + }, + { + "epoch": 2.65, + "grad_norm": 0.006429940462112427, + "learning_rate": 1.701093992150307e-05, + "loss": 0.0003, + "step": 1496 + }, + { + "epoch": 2.65, + "grad_norm": 0.11746788769960403, + "learning_rate": 1.684234650425631e-05, + "loss": 0.0032, + "step": 1497 + }, + { + "epoch": 2.65, + "grad_norm": 0.004738051909953356, + "learning_rate": 1.6674563580414053e-05, + "loss": 0.0002, + "step": 1498 + }, + { + "epoch": 2.65, + "grad_norm": 0.004201785195618868, + "learning_rate": 1.65075917332167e-05, + "loss": 0.0002, + "step": 1499 + }, + { + "epoch": 2.65, + "grad_norm": 0.07351760566234589, + "learning_rate": 1.6341431543085207e-05, + "loss": 0.0017, + "step": 1500 + }, + { + "epoch": 2.66, + "grad_norm": 0.03148205578327179, + "learning_rate": 1.6176083587618935e-05, + "loss": 0.0011, + "step": 1501 + }, + { + "epoch": 2.66, + "grad_norm": 0.07184291630983353, + "learning_rate": 1.6011548441594e-05, + "loss": 0.0018, + "step": 1502 + }, + { + "epoch": 2.66, + "grad_norm": 0.01941969059407711, + "learning_rate": 1.5847826676960914e-05, + "loss": 0.0007, + "step": 1503 + }, + { + "epoch": 2.66, + "grad_norm": 0.004026324022561312, + "learning_rate": 1.5684918862842846e-05, + "loss": 0.0001, + "step": 1504 + }, + { + "epoch": 2.66, + "grad_norm": 0.002735046437010169, + "learning_rate": 1.5522825565533445e-05, + "loss": 0.0002, + "step": 1505 + }, + { + "epoch": 2.66, + "grad_norm": 0.09817846864461899, + "learning_rate": 1.5361547348495013e-05, + "loss": 0.0016, + "step": 1506 + }, + { + "epoch": 2.67, + "grad_norm": 0.022421833127737045, + "learning_rate": 1.5201084772356544e-05, + "loss": 0.0004, + "step": 1507 + }, + { + "epoch": 2.67, + "grad_norm": 0.1519351303577423, + "learning_rate": 1.5041438394911622e-05, + "loss": 0.0019, + "step": 1508 + }, + { + "epoch": 2.67, + "grad_norm": 0.1200493648648262, + "learning_rate": 1.4882608771116719e-05, + "loss": 0.002, + "step": 1509 + }, + { + "epoch": 2.67, + "grad_norm": 0.002202227944508195, + "learning_rate": 1.4724596453089101e-05, + "loss": 0.0001, + "step": 1510 + }, + { + "epoch": 2.67, + "grad_norm": 0.030175110325217247, + "learning_rate": 1.4567401990104867e-05, + "loss": 0.0009, + "step": 1511 + }, + { + "epoch": 2.67, + "grad_norm": 0.03393395245075226, + "learning_rate": 1.441102592859725e-05, + "loss": 0.0014, + "step": 1512 + }, + { + "epoch": 2.68, + "grad_norm": 0.7548046112060547, + "learning_rate": 1.4255468812154477e-05, + "loss": 0.0468, + "step": 1513 + }, + { + "epoch": 2.68, + "grad_norm": 0.015761546790599823, + "learning_rate": 1.4100731181518056e-05, + "loss": 0.0006, + "step": 1514 + }, + { + "epoch": 2.68, + "grad_norm": 0.014691539108753204, + "learning_rate": 1.3946813574580858e-05, + "loss": 0.0006, + "step": 1515 + }, + { + "epoch": 2.68, + "grad_norm": 0.06156891956925392, + "learning_rate": 1.3793716526385058e-05, + "loss": 0.0015, + "step": 1516 + }, + { + "epoch": 2.68, + "grad_norm": 0.011981514282524586, + "learning_rate": 1.36414405691207e-05, + "loss": 0.0002, + "step": 1517 + }, + { + "epoch": 2.69, + "grad_norm": 0.04529397189617157, + "learning_rate": 1.3489986232123302e-05, + "loss": 0.0017, + "step": 1518 + }, + { + "epoch": 2.69, + "grad_norm": 0.0034320768900215626, + "learning_rate": 1.333935404187253e-05, + "loss": 0.0002, + "step": 1519 + }, + { + "epoch": 2.69, + "grad_norm": 0.5339131951332092, + "learning_rate": 1.3189544521990032e-05, + "loss": 0.031, + "step": 1520 + }, + { + "epoch": 2.69, + "grad_norm": 0.21039195358753204, + "learning_rate": 1.3040558193237657e-05, + "loss": 0.002, + "step": 1521 + }, + { + "epoch": 2.69, + "grad_norm": 0.0334392786026001, + "learning_rate": 1.2892395573515819e-05, + "loss": 0.0014, + "step": 1522 + }, + { + "epoch": 2.69, + "grad_norm": 0.009169626981019974, + "learning_rate": 1.2745057177861647e-05, + "loss": 0.0003, + "step": 1523 + }, + { + "epoch": 2.7, + "grad_norm": 0.006436166353523731, + "learning_rate": 1.2598543518446887e-05, + "loss": 0.0004, + "step": 1524 + }, + { + "epoch": 2.7, + "grad_norm": 0.004530397243797779, + "learning_rate": 1.245285510457661e-05, + "loss": 0.0002, + "step": 1525 + }, + { + "epoch": 2.7, + "grad_norm": 0.013779646717011929, + "learning_rate": 1.2307992442687072e-05, + "loss": 0.0005, + "step": 1526 + }, + { + "epoch": 2.7, + "grad_norm": 0.0018188911490142345, + "learning_rate": 1.2163956036344153e-05, + "loss": 0.0001, + "step": 1527 + }, + { + "epoch": 2.7, + "grad_norm": 0.15664638578891754, + "learning_rate": 1.2020746386241565e-05, + "loss": 0.0029, + "step": 1528 + }, + { + "epoch": 2.7, + "grad_norm": 0.017927415668964386, + "learning_rate": 1.1878363990198871e-05, + "loss": 0.0004, + "step": 1529 + }, + { + "epoch": 2.71, + "grad_norm": 0.17458246648311615, + "learning_rate": 1.1736809343160237e-05, + "loss": 0.0035, + "step": 1530 + }, + { + "epoch": 2.71, + "grad_norm": 0.016326846554875374, + "learning_rate": 1.1596082937192276e-05, + "loss": 0.0006, + "step": 1531 + }, + { + "epoch": 2.71, + "grad_norm": 0.75627201795578, + "learning_rate": 1.1456185261482565e-05, + "loss": 0.0268, + "step": 1532 + }, + { + "epoch": 2.71, + "grad_norm": 0.003338505746796727, + "learning_rate": 1.1317116802337906e-05, + "loss": 0.0002, + "step": 1533 + }, + { + "epoch": 2.71, + "grad_norm": 0.003949652425944805, + "learning_rate": 1.1178878043182462e-05, + "loss": 0.0002, + "step": 1534 + }, + { + "epoch": 2.72, + "grad_norm": 0.005014631897211075, + "learning_rate": 1.1041469464556419e-05, + "loss": 0.0003, + "step": 1535 + }, + { + "epoch": 2.72, + "grad_norm": 0.09127765148878098, + "learning_rate": 1.090489154411406e-05, + "loss": 0.0022, + "step": 1536 + }, + { + "epoch": 2.72, + "grad_norm": 0.002308650640770793, + "learning_rate": 1.0769144756622106e-05, + "loss": 0.0001, + "step": 1537 + }, + { + "epoch": 2.72, + "grad_norm": 0.01227644830942154, + "learning_rate": 1.0634229573958155e-05, + "loss": 0.0005, + "step": 1538 + }, + { + "epoch": 2.72, + "grad_norm": 0.07127789407968521, + "learning_rate": 1.0500146465108995e-05, + "loss": 0.0016, + "step": 1539 + }, + { + "epoch": 2.72, + "grad_norm": 0.3965778648853302, + "learning_rate": 1.0366895896169098e-05, + "loss": 0.0119, + "step": 1540 + }, + { + "epoch": 2.73, + "grad_norm": 0.006936135236173868, + "learning_rate": 1.0234478330338775e-05, + "loss": 0.0003, + "step": 1541 + }, + { + "epoch": 2.73, + "grad_norm": 0.1017901748418808, + "learning_rate": 1.0102894227922737e-05, + "loss": 0.0019, + "step": 1542 + }, + { + "epoch": 2.73, + "grad_norm": 0.004074485041201115, + "learning_rate": 9.972144046328429e-06, + "loss": 0.0002, + "step": 1543 + }, + { + "epoch": 2.73, + "grad_norm": 0.5157943367958069, + "learning_rate": 9.842228240064421e-06, + "loss": 0.0767, + "step": 1544 + }, + { + "epoch": 2.73, + "grad_norm": 0.35279086232185364, + "learning_rate": 9.713147260738936e-06, + "loss": 0.0063, + "step": 1545 + }, + { + "epoch": 2.74, + "grad_norm": 0.005003898870199919, + "learning_rate": 9.584901557058156e-06, + "loss": 0.0002, + "step": 1546 + }, + { + "epoch": 2.74, + "grad_norm": 0.001333480584435165, + "learning_rate": 9.457491574824757e-06, + "loss": 0.0001, + "step": 1547 + }, + { + "epoch": 2.74, + "grad_norm": 0.02091939002275467, + "learning_rate": 9.330917756936174e-06, + "loss": 0.0009, + "step": 1548 + }, + { + "epoch": 2.74, + "grad_norm": 0.30123287439346313, + "learning_rate": 9.2051805433834e-06, + "loss": 0.0046, + "step": 1549 + }, + { + "epoch": 2.74, + "grad_norm": 0.17032013833522797, + "learning_rate": 9.080280371249112e-06, + "loss": 0.0036, + "step": 1550 + }, + { + "epoch": 2.74, + "grad_norm": 0.4087766706943512, + "learning_rate": 8.956217674706363e-06, + "loss": 0.0049, + "step": 1551 + }, + { + "epoch": 2.75, + "grad_norm": 0.09622213989496231, + "learning_rate": 8.832992885016988e-06, + "loss": 0.0024, + "step": 1552 + }, + { + "epoch": 2.75, + "grad_norm": 0.016199596226215363, + "learning_rate": 8.710606430530066e-06, + "loss": 0.0005, + "step": 1553 + }, + { + "epoch": 2.75, + "grad_norm": 0.0021451227366924286, + "learning_rate": 8.589058736680643e-06, + "loss": 0.0001, + "step": 1554 + }, + { + "epoch": 2.75, + "grad_norm": 0.005553426221013069, + "learning_rate": 8.46835022598791e-06, + "loss": 0.0002, + "step": 1555 + }, + { + "epoch": 2.75, + "grad_norm": 0.5673149824142456, + "learning_rate": 8.348481318054075e-06, + "loss": 0.0614, + "step": 1556 + }, + { + "epoch": 2.75, + "grad_norm": 0.014100473374128342, + "learning_rate": 8.229452429562661e-06, + "loss": 0.0004, + "step": 1557 + }, + { + "epoch": 2.76, + "grad_norm": 0.026370083913207054, + "learning_rate": 8.111263974277166e-06, + "loss": 0.0006, + "step": 1558 + }, + { + "epoch": 2.76, + "grad_norm": 0.00458921492099762, + "learning_rate": 7.993916363039672e-06, + "loss": 0.0002, + "step": 1559 + }, + { + "epoch": 2.76, + "grad_norm": 0.0019055847078561783, + "learning_rate": 7.877410003769236e-06, + "loss": 0.0001, + "step": 1560 + }, + { + "epoch": 2.76, + "grad_norm": 0.010362006723880768, + "learning_rate": 7.761745301460676e-06, + "loss": 0.0004, + "step": 1561 + }, + { + "epoch": 2.76, + "grad_norm": 0.06622084230184555, + "learning_rate": 7.646922658183092e-06, + "loss": 0.0016, + "step": 1562 + }, + { + "epoch": 2.76, + "eval_loss": 0.12071493268013, + "eval_runtime": 14.7006, + "eval_samples_per_second": 32.448, + "eval_steps_per_second": 8.163, + "step": 1562 + }, + { + "epoch": 2.77, + "grad_norm": 0.21637581288814545, + "learning_rate": 7.532942473078341e-06, + "loss": 0.0077, + "step": 1563 + }, + { + "epoch": 2.77, + "grad_norm": 0.2466088980436325, + "learning_rate": 7.419805142359875e-06, + "loss": 0.005, + "step": 1564 + }, + { + "epoch": 2.77, + "grad_norm": 0.1368948519229889, + "learning_rate": 7.307511059311184e-06, + "loss": 0.0024, + "step": 1565 + }, + { + "epoch": 2.77, + "grad_norm": 0.4194111227989197, + "learning_rate": 7.196060614284544e-06, + "loss": 0.0097, + "step": 1566 + }, + { + "epoch": 2.77, + "grad_norm": 0.0024943724274635315, + "learning_rate": 7.085454194699553e-06, + "loss": 0.0001, + "step": 1567 + }, + { + "epoch": 2.77, + "grad_norm": 0.5519493222236633, + "learning_rate": 6.975692185041848e-06, + "loss": 0.0128, + "step": 1568 + }, + { + "epoch": 2.78, + "grad_norm": 0.17635726928710938, + "learning_rate": 6.866774966861833e-06, + "loss": 0.0047, + "step": 1569 + }, + { + "epoch": 2.78, + "grad_norm": 0.17873595654964447, + "learning_rate": 6.758702918773202e-06, + "loss": 0.0041, + "step": 1570 + }, + { + "epoch": 2.78, + "grad_norm": 0.008024133741855621, + "learning_rate": 6.651476416451696e-06, + "loss": 0.0004, + "step": 1571 + }, + { + "epoch": 2.78, + "grad_norm": 0.018708229064941406, + "learning_rate": 6.545095832633907e-06, + "loss": 0.0008, + "step": 1572 + }, + { + "epoch": 2.78, + "grad_norm": 0.3967719078063965, + "learning_rate": 6.439561537115751e-06, + "loss": 0.0771, + "step": 1573 + }, + { + "epoch": 2.78, + "grad_norm": 0.039222270250320435, + "learning_rate": 6.334873896751414e-06, + "loss": 0.001, + "step": 1574 + }, + { + "epoch": 2.79, + "grad_norm": 0.013372889719903469, + "learning_rate": 6.231033275451908e-06, + "loss": 0.0007, + "step": 1575 + }, + { + "epoch": 2.79, + "grad_norm": 0.008132797665894032, + "learning_rate": 6.12804003418388e-06, + "loss": 0.0003, + "step": 1576 + }, + { + "epoch": 2.79, + "grad_norm": 0.15415289998054504, + "learning_rate": 6.0258945309683565e-06, + "loss": 0.0044, + "step": 1577 + }, + { + "epoch": 2.79, + "grad_norm": 0.5614458918571472, + "learning_rate": 5.9245971208795045e-06, + "loss": 0.0128, + "step": 1578 + }, + { + "epoch": 2.79, + "grad_norm": 0.0058565386570990086, + "learning_rate": 5.824148156043374e-06, + "loss": 0.0002, + "step": 1579 + }, + { + "epoch": 2.8, + "grad_norm": 0.6066858172416687, + "learning_rate": 5.724547985636652e-06, + "loss": 0.0365, + "step": 1580 + }, + { + "epoch": 2.8, + "grad_norm": 0.39940959215164185, + "learning_rate": 5.625796955885526e-06, + "loss": 0.0168, + "step": 1581 + }, + { + "epoch": 2.8, + "grad_norm": 0.10061442852020264, + "learning_rate": 5.527895410064459e-06, + "loss": 0.0013, + "step": 1582 + }, + { + "epoch": 2.8, + "grad_norm": 0.003030581632629037, + "learning_rate": 5.430843688494836e-06, + "loss": 0.0001, + "step": 1583 + }, + { + "epoch": 2.8, + "grad_norm": 0.11822108179330826, + "learning_rate": 5.3346421285440925e-06, + "loss": 0.0033, + "step": 1584 + }, + { + "epoch": 2.8, + "grad_norm": 0.004489241633564234, + "learning_rate": 5.239291064624258e-06, + "loss": 0.0003, + "step": 1585 + }, + { + "epoch": 2.81, + "grad_norm": 1.0766096115112305, + "learning_rate": 5.144790828190887e-06, + "loss": 0.036, + "step": 1586 + }, + { + "epoch": 2.81, + "grad_norm": 0.09894857555627823, + "learning_rate": 5.051141747741989e-06, + "loss": 0.0018, + "step": 1587 + }, + { + "epoch": 2.81, + "grad_norm": 0.49432384967803955, + "learning_rate": 4.958344148816824e-06, + "loss": 0.0061, + "step": 1588 + }, + { + "epoch": 2.81, + "grad_norm": 0.07527617365121841, + "learning_rate": 4.8663983539946885e-06, + "loss": 0.0017, + "step": 1589 + }, + { + "epoch": 2.81, + "grad_norm": 0.0023846172261983156, + "learning_rate": 4.775304682893944e-06, + "loss": 0.0001, + "step": 1590 + }, + { + "epoch": 2.81, + "grad_norm": 0.01218903437256813, + "learning_rate": 4.685063452170735e-06, + "loss": 0.0006, + "step": 1591 + }, + { + "epoch": 2.82, + "grad_norm": 0.029260369017720222, + "learning_rate": 4.595674975518133e-06, + "loss": 0.0013, + "step": 1592 + }, + { + "epoch": 2.82, + "grad_norm": 0.00425742520019412, + "learning_rate": 4.507139563664802e-06, + "loss": 0.0002, + "step": 1593 + }, + { + "epoch": 2.82, + "grad_norm": 0.003876454196870327, + "learning_rate": 4.419457524374032e-06, + "loss": 0.0002, + "step": 1594 + }, + { + "epoch": 2.82, + "grad_norm": 0.1555083692073822, + "learning_rate": 4.332629162442675e-06, + "loss": 0.0028, + "step": 1595 + }, + { + "epoch": 2.82, + "grad_norm": 0.0016881643095985055, + "learning_rate": 4.246654779699988e-06, + "loss": 0.0001, + "step": 1596 + }, + { + "epoch": 2.83, + "grad_norm": 0.00919495802372694, + "learning_rate": 4.161534675006739e-06, + "loss": 0.0005, + "step": 1597 + }, + { + "epoch": 2.83, + "grad_norm": 0.008377696387469769, + "learning_rate": 4.077269144254103e-06, + "loss": 0.0003, + "step": 1598 + }, + { + "epoch": 2.83, + "grad_norm": 1.2848001718521118, + "learning_rate": 3.993858480362572e-06, + "loss": 0.0519, + "step": 1599 + }, + { + "epoch": 2.83, + "grad_norm": 0.07129717618227005, + "learning_rate": 3.9113029732809615e-06, + "loss": 0.0011, + "step": 1600 + }, + { + "epoch": 2.83, + "grad_norm": 0.001333805383183062, + "learning_rate": 3.8296029099854635e-06, + "loss": 0.0001, + "step": 1601 + }, + { + "epoch": 2.83, + "grad_norm": 0.017828090116381645, + "learning_rate": 3.748758574478622e-06, + "loss": 0.0008, + "step": 1602 + }, + { + "epoch": 2.84, + "grad_norm": 0.010987207293510437, + "learning_rate": 3.6687702477883332e-06, + "loss": 0.0004, + "step": 1603 + }, + { + "epoch": 2.84, + "grad_norm": 0.006206910125911236, + "learning_rate": 3.5896382079668166e-06, + "loss": 0.0003, + "step": 1604 + }, + { + "epoch": 2.84, + "grad_norm": 0.001939537120051682, + "learning_rate": 3.511362730089729e-06, + "loss": 0.0001, + "step": 1605 + }, + { + "epoch": 2.84, + "grad_norm": 0.2372162640094757, + "learning_rate": 3.4339440862552194e-06, + "loss": 0.0031, + "step": 1606 + }, + { + "epoch": 2.84, + "grad_norm": 0.0658475011587143, + "learning_rate": 3.3573825455829043e-06, + "loss": 0.002, + "step": 1607 + }, + { + "epoch": 2.84, + "grad_norm": 0.012416253797709942, + "learning_rate": 3.2816783742129762e-06, + "loss": 0.0005, + "step": 1608 + }, + { + "epoch": 2.85, + "grad_norm": 0.001760126673616469, + "learning_rate": 3.206831835305263e-06, + "loss": 0.0001, + "step": 1609 + }, + { + "epoch": 2.85, + "grad_norm": 0.1820244938135147, + "learning_rate": 3.132843189038365e-06, + "loss": 0.0035, + "step": 1610 + }, + { + "epoch": 2.85, + "grad_norm": 0.2944984436035156, + "learning_rate": 3.059712692608657e-06, + "loss": 0.0086, + "step": 1611 + }, + { + "epoch": 2.85, + "grad_norm": 0.013544095680117607, + "learning_rate": 2.9874406002295128e-06, + "loss": 0.0006, + "step": 1612 + }, + { + "epoch": 2.85, + "grad_norm": 0.008425934240221977, + "learning_rate": 2.9160271631303025e-06, + "loss": 0.0003, + "step": 1613 + }, + { + "epoch": 2.86, + "grad_norm": 0.0034127351827919483, + "learning_rate": 2.84547262955559e-06, + "loss": 0.0001, + "step": 1614 + }, + { + "epoch": 2.86, + "grad_norm": 0.03862582519650459, + "learning_rate": 2.775777244764216e-06, + "loss": 0.0007, + "step": 1615 + }, + { + "epoch": 2.86, + "grad_norm": 0.9643748998641968, + "learning_rate": 2.7069412510285773e-06, + "loss": 0.0381, + "step": 1616 + }, + { + "epoch": 2.86, + "grad_norm": 0.027254153043031693, + "learning_rate": 2.6389648876335716e-06, + "loss": 0.0007, + "step": 1617 + }, + { + "epoch": 2.86, + "grad_norm": 0.14486004412174225, + "learning_rate": 2.571848390875986e-06, + "loss": 0.0039, + "step": 1618 + }, + { + "epoch": 2.86, + "grad_norm": 0.002894668374210596, + "learning_rate": 2.5055919940635276e-06, + "loss": 0.0001, + "step": 1619 + }, + { + "epoch": 2.87, + "grad_norm": 0.8268805742263794, + "learning_rate": 2.440195927514044e-06, + "loss": 0.0447, + "step": 1620 + }, + { + "epoch": 2.87, + "grad_norm": 0.17663922905921936, + "learning_rate": 2.375660418554776e-06, + "loss": 0.0076, + "step": 1621 + }, + { + "epoch": 2.87, + "grad_norm": 0.12479265034198761, + "learning_rate": 2.3119856915214677e-06, + "loss": 0.002, + "step": 1622 + }, + { + "epoch": 2.87, + "grad_norm": 0.14911742508411407, + "learning_rate": 2.249171967757674e-06, + "loss": 0.0046, + "step": 1623 + }, + { + "epoch": 2.87, + "grad_norm": 1.0442122220993042, + "learning_rate": 2.1872194656140377e-06, + "loss": 0.0154, + "step": 1624 + }, + { + "epoch": 2.87, + "grad_norm": 0.018565386533737183, + "learning_rate": 2.126128400447347e-06, + "loss": 0.0007, + "step": 1625 + }, + { + "epoch": 2.88, + "grad_norm": 0.7466828227043152, + "learning_rate": 2.0658989846199516e-06, + "loss": 0.0113, + "step": 1626 + }, + { + "epoch": 2.88, + "grad_norm": 0.034939322620630264, + "learning_rate": 2.006531427499014e-06, + "loss": 0.0008, + "step": 1627 + }, + { + "epoch": 2.88, + "grad_norm": 0.028358131647109985, + "learning_rate": 1.948025935455594e-06, + "loss": 0.001, + "step": 1628 + }, + { + "epoch": 2.88, + "grad_norm": 0.043177518993616104, + "learning_rate": 1.8903827118642303e-06, + "loss": 0.0008, + "step": 1629 + }, + { + "epoch": 2.88, + "grad_norm": 0.03869365155696869, + "learning_rate": 1.8336019571019712e-06, + "loss": 0.0022, + "step": 1630 + }, + { + "epoch": 2.89, + "grad_norm": 0.0050878459587693214, + "learning_rate": 1.7776838685478748e-06, + "loss": 0.0002, + "step": 1631 + }, + { + "epoch": 2.89, + "grad_norm": 0.11395423114299774, + "learning_rate": 1.7226286405821746e-06, + "loss": 0.0024, + "step": 1632 + }, + { + "epoch": 2.89, + "grad_norm": 0.5369199514389038, + "learning_rate": 1.6684364645856431e-06, + "loss": 0.0145, + "step": 1633 + }, + { + "epoch": 2.89, + "grad_norm": 0.012085442431271076, + "learning_rate": 1.6151075289390082e-06, + "loss": 0.0003, + "step": 1634 + }, + { + "epoch": 2.89, + "grad_norm": 0.15687036514282227, + "learning_rate": 1.5626420190222034e-06, + "loss": 0.0059, + "step": 1635 + }, + { + "epoch": 2.89, + "grad_norm": 0.02339506521821022, + "learning_rate": 1.5110401172137578e-06, + "loss": 0.0004, + "step": 1636 + }, + { + "epoch": 2.9, + "grad_norm": 0.15162977576255798, + "learning_rate": 1.4603020028901292e-06, + "loss": 0.0026, + "step": 1637 + }, + { + "epoch": 2.9, + "grad_norm": 0.0018832300556823611, + "learning_rate": 1.4104278524251778e-06, + "loss": 0.0001, + "step": 1638 + }, + { + "epoch": 2.9, + "grad_norm": 0.030508514493703842, + "learning_rate": 1.3614178391894438e-06, + "loss": 0.0013, + "step": 1639 + }, + { + "epoch": 2.9, + "grad_norm": 0.00550305750221014, + "learning_rate": 1.3132721335495645e-06, + "loss": 0.0002, + "step": 1640 + }, + { + "epoch": 2.9, + "grad_norm": 0.01754370890557766, + "learning_rate": 1.265990902867803e-06, + "loss": 0.0005, + "step": 1641 + }, + { + "epoch": 2.9, + "grad_norm": 0.00891774520277977, + "learning_rate": 1.2195743115012148e-06, + "loss": 0.0002, + "step": 1642 + }, + { + "epoch": 2.91, + "grad_norm": 0.02182842791080475, + "learning_rate": 1.1740225208013712e-06, + "loss": 0.0007, + "step": 1643 + }, + { + "epoch": 2.91, + "grad_norm": 0.0056608193553984165, + "learning_rate": 1.1293356891136086e-06, + "loss": 0.0002, + "step": 1644 + }, + { + "epoch": 2.91, + "grad_norm": 0.004720357712358236, + "learning_rate": 1.0855139717765028e-06, + "loss": 0.0003, + "step": 1645 + }, + { + "epoch": 2.91, + "grad_norm": 0.005244838539510965, + "learning_rate": 1.0425575211213956e-06, + "loss": 0.0003, + "step": 1646 + }, + { + "epoch": 2.91, + "grad_norm": 0.0327480174601078, + "learning_rate": 1.0004664864717573e-06, + "loss": 0.001, + "step": 1647 + }, + { + "epoch": 2.92, + "grad_norm": 0.8325297832489014, + "learning_rate": 9.592410141427977e-07, + "loss": 0.0526, + "step": 1648 + }, + { + "epoch": 2.92, + "grad_norm": 0.0062986682169139385, + "learning_rate": 9.188812474408837e-07, + "loss": 0.0003, + "step": 1649 + }, + { + "epoch": 2.92, + "grad_norm": 0.01729346066713333, + "learning_rate": 8.793873266630393e-07, + "loss": 0.0006, + "step": 1650 + }, + { + "epoch": 2.92, + "grad_norm": 0.00606880709528923, + "learning_rate": 8.407593890964461e-07, + "loss": 0.0004, + "step": 1651 + }, + { + "epoch": 2.92, + "grad_norm": 0.009202031418681145, + "learning_rate": 8.029975690179992e-07, + "loss": 0.0005, + "step": 1652 + }, + { + "epoch": 2.92, + "grad_norm": 0.14149095118045807, + "learning_rate": 7.661019976939187e-07, + "loss": 0.0014, + "step": 1653 + }, + { + "epoch": 2.93, + "grad_norm": 0.06853083521127701, + "learning_rate": 7.300728033790282e-07, + "loss": 0.0026, + "step": 1654 + }, + { + "epoch": 2.93, + "grad_norm": 0.5628806352615356, + "learning_rate": 6.949101113166712e-07, + "loss": 0.0322, + "step": 1655 + }, + { + "epoch": 2.93, + "grad_norm": 0.004697425756603479, + "learning_rate": 6.606140437379616e-07, + "loss": 0.0002, + "step": 1656 + }, + { + "epoch": 2.93, + "grad_norm": 0.026971176266670227, + "learning_rate": 6.271847198615343e-07, + "loss": 0.0008, + "step": 1657 + }, + { + "epoch": 2.93, + "grad_norm": 0.416694313287735, + "learning_rate": 5.946222558931014e-07, + "loss": 0.0072, + "step": 1658 + }, + { + "epoch": 2.93, + "grad_norm": 0.16609083116054535, + "learning_rate": 5.629267650249792e-07, + "loss": 0.0056, + "step": 1659 + }, + { + "epoch": 2.94, + "grad_norm": 0.020159004256129265, + "learning_rate": 5.320983574358118e-07, + "loss": 0.0006, + "step": 1660 + }, + { + "epoch": 2.94, + "grad_norm": 0.020069165155291557, + "learning_rate": 5.02137140290071e-07, + "loss": 0.0007, + "step": 1661 + }, + { + "epoch": 2.94, + "grad_norm": 0.04708680510520935, + "learning_rate": 4.7304321773775085e-07, + "loss": 0.0024, + "step": 1662 + }, + { + "epoch": 2.94, + "grad_norm": 0.03632630407810211, + "learning_rate": 4.4481669091400725e-07, + "loss": 0.0015, + "step": 1663 + }, + { + "epoch": 2.94, + "grad_norm": 0.006391770206391811, + "learning_rate": 4.1745765793874126e-07, + "loss": 0.0004, + "step": 1664 + }, + { + "epoch": 2.95, + "grad_norm": 0.8153448700904846, + "learning_rate": 3.90966213916405e-07, + "loss": 0.0167, + "step": 1665 + }, + { + "epoch": 2.95, + "grad_norm": 0.08031131327152252, + "learning_rate": 3.6534245093544636e-07, + "loss": 0.0036, + "step": 1666 + }, + { + "epoch": 2.95, + "grad_norm": 0.015597201883792877, + "learning_rate": 3.405864580681983e-07, + "loss": 0.0006, + "step": 1667 + }, + { + "epoch": 2.95, + "grad_norm": 0.024990934878587723, + "learning_rate": 3.1669832137046216e-07, + "loss": 0.0008, + "step": 1668 + }, + { + "epoch": 2.95, + "grad_norm": 0.005498305428773165, + "learning_rate": 2.9367812388123025e-07, + "loss": 0.0002, + "step": 1669 + }, + { + "epoch": 2.95, + "grad_norm": 0.19190239906311035, + "learning_rate": 2.715259456224084e-07, + "loss": 0.0051, + "step": 1670 + }, + { + "epoch": 2.96, + "grad_norm": 0.008078343234956264, + "learning_rate": 2.502418635985382e-07, + "loss": 0.0004, + "step": 1671 + }, + { + "epoch": 2.96, + "grad_norm": 0.02244127169251442, + "learning_rate": 2.2982595179646404e-07, + "loss": 0.0009, + "step": 1672 + }, + { + "epoch": 2.96, + "grad_norm": 0.027221323922276497, + "learning_rate": 2.1027828118519442e-07, + "loss": 0.001, + "step": 1673 + }, + { + "epoch": 2.96, + "grad_norm": 0.05436151847243309, + "learning_rate": 1.91598919715541e-07, + "loss": 0.0012, + "step": 1674 + }, + { + "epoch": 2.96, + "grad_norm": 0.751646876335144, + "learning_rate": 1.7378793232000755e-07, + "loss": 0.0248, + "step": 1675 + }, + { + "epoch": 2.97, + "grad_norm": 0.04363579303026199, + "learning_rate": 1.5684538091240153e-07, + "loss": 0.0015, + "step": 1676 + }, + { + "epoch": 2.97, + "grad_norm": 0.02322092093527317, + "learning_rate": 1.407713243877784e-07, + "loss": 0.001, + "step": 1677 + }, + { + "epoch": 2.97, + "grad_norm": 0.023351455107331276, + "learning_rate": 1.2556581862213646e-07, + "loss": 0.001, + "step": 1678 + }, + { + "epoch": 2.97, + "grad_norm": 0.02784132957458496, + "learning_rate": 1.1122891647222244e-07, + "loss": 0.0007, + "step": 1679 + }, + { + "epoch": 2.97, + "grad_norm": 0.018551329150795937, + "learning_rate": 9.776066777542058e-08, + "loss": 0.0008, + "step": 1680 + }, + { + "epoch": 2.97, + "grad_norm": 0.0073452303186059, + "learning_rate": 8.516111934955828e-08, + "loss": 0.0004, + "step": 1681 + }, + { + "epoch": 2.98, + "grad_norm": 0.022210106253623962, + "learning_rate": 7.343031499262853e-08, + "loss": 0.0005, + "step": 1682 + }, + { + "epoch": 2.98, + "grad_norm": 0.36818450689315796, + "learning_rate": 6.256829548284549e-08, + "loss": 0.0045, + "step": 1683 + }, + { + "epoch": 2.98, + "grad_norm": 0.012365012429654598, + "learning_rate": 5.257509857828358e-08, + "loss": 0.0005, + "step": 1684 + }, + { + "epoch": 2.98, + "grad_norm": 0.007756817154586315, + "learning_rate": 4.345075901693307e-08, + "loss": 0.0003, + "step": 1685 + }, + { + "epoch": 2.98, + "grad_norm": 0.31708094477653503, + "learning_rate": 3.51953085164225e-08, + "loss": 0.01, + "step": 1686 + }, + { + "epoch": 2.98, + "grad_norm": 0.026957141235470772, + "learning_rate": 2.7808775774074147e-08, + "loss": 0.0005, + "step": 1687 + }, + { + "epoch": 2.99, + "grad_norm": 0.7157958149909973, + "learning_rate": 2.1291186466626535e-08, + "loss": 0.0264, + "step": 1688 + }, + { + "epoch": 2.99, + "grad_norm": 0.003837962169200182, + "learning_rate": 1.5642563250289897e-08, + "loss": 0.0002, + "step": 1689 + }, + { + "epoch": 2.99, + "grad_norm": 0.22489942610263824, + "learning_rate": 1.0862925760551923e-08, + "loss": 0.0026, + "step": 1690 + }, + { + "epoch": 2.99, + "grad_norm": 0.4433555603027344, + "learning_rate": 6.952290612205481e-09, + "loss": 0.037, + "step": 1691 + }, + { + "epoch": 2.99, + "grad_norm": 0.0032361983321607113, + "learning_rate": 3.910671399265376e-09, + "loss": 0.0002, + "step": 1692 + }, + { + "epoch": 3.0, + "grad_norm": 0.13408610224723816, + "learning_rate": 1.7380786948850702e-09, + "loss": 0.0032, + "step": 1693 + }, + { + "epoch": 3.0, + "grad_norm": 0.017384354025125504, + "learning_rate": 4.345200513289349e-10, + "loss": 0.0004, + "step": 1694 + }, + { + "epoch": 3.0, + "grad_norm": 0.0010622803820297122, + "learning_rate": 0.0, + "loss": 0.0001, + "step": 1695 + } + ], + "logging_steps": 1, + "max_steps": 1695, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 565, + "total_flos": 1.549439947809751e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}